(****************************************************************)
(* UCalcSpvTreeCART.pas - Copyright (c) 2005 Ricco RAKOTOMALALA *)
(****************************************************************)

{
@abstract(Structure arbre de dcision pour C4.5 (Quinlan, 1993))
@author(Ricco)
@created(29/12/2005)

C4.5 tel qu'il est dcrit dans le livre de Quinlan, il manque apparemment
toute la floppe de subtilits dans les multiples releases du code source sur le net.
}

unit UCalcSpvTreeC45;

interface

USES
    UCalcTreeStructureDefinition,
    UCalcSpvTreeDefinition;

TYPE
    //feuille
    TSplitLeafSpvC45 = class(TSplitLeafSpv)
                       end;

    //split
    TSplitAttributSpvC45 = class(TSplitAttributSpv)
                           protected
                           function    getClassSplitLeaf(): TClassSplitLeaf; override;
                           function    ComputeGoodness(): double; override;
                           function    ComputeAcceptSplit(): boolean; override;
                           end;

    //liste de splits
    TLstSplitAttSpvC45 = class(TLstSplitAttSpv)
                         protected
                         function  getClassSplitAttribut(): TClassSplitAttribut; override;
                         end;

    //noeud de l'arbre
    TMLTreeNodeSpvC45 = class(TMLTreeNodeSpv)
                        private
                        //erreur pessimiste (c'est le nombre d'erreur ici !!!) -- calcule lors de l'assignation de la conclusion
                        FPessimisticErr: double;
                        //calcul de l'cart  l'erreur pour avoir la borne -- taille sommet, contre-exemples, confidence-level
                        //extrait du livre de Quinlan, "Programs for Machine Learning..."
                        function  addErrs(N,CE,CF: double): double;
                        protected
                        procedure AssignConclusion(); override;
                        function  isNoSplitNeeded(): boolean; override;
                        function  getClassLstSplitAttributes(): TClassLstSplitAttributes; override;
                        public
                        //savoir si un noeud est prunable, i.e. a des enfants, et ce sont tous des feuilles
                        function  isPrunable(): boolean;
                        //erreur pessimiste
                        property  pessimistic: double read FPessimisticErr;
                        end;

    //structure d'arbre
    TMLTreeStructureSpvC45 = class(TMLTreeStructureSpv)
                             protected
                             function   getClassMLTreeNode(): TClassMLTreeNode; override;
                             public
                             //ce qui est spcifique  C4.5
                             procedure   PostPruning(); override;
                             end;

implementation

uses
    Math, Contnrs, Sysutils,
    UCompSpvTreeC45, ULogFile;

{ TSplitAttributSpvC45 }

function TSplitAttributSpvC45.ComputeAcceptSplit: boolean;
var j: integer;
    nb: integer;
    minSize: integer;
    ok: boolean;
begin
 minSize:= (self.PrmMethod as TOpPrmSpvTreeC45).SizeMinLeaf;
 //chercher s'il existe au moins deux feuilles de taille suprieure ou gale  min
 ok:= (self.Count >= 2);
 //tous de taille suprieure  deux
 if ok
  then
   begin
    nb:= 0;
    for j:= 0 to pred(self.Count) do
     nb:= nb + ORD(self.SplitLeaf[j].NbExamples >= minSize);
    ok:= (nb >= 2);
   end;
 //
 result:= ok;
end;

function TSplitAttributSpvC45.ComputeGoodness: double;
var gain,ratio: double;
    v,sum: double;
    j: integer;
    leaf: TSplitLeafSpv;
begin
 //gain d'entropie
 gain:= inherited ComputeGoodness();
 if (gain > 0)
  then
   begin
    //effectif total du sommet pre du dcoupage
    sum:= (self.Node as TMLTreeNodeSpvC45).DistClass.TabFreq.Value[0];
    if (sum > 0.0)
     then
      begin
       //calculer l'entropie transversale
       ratio:= 0.0;
       for j:= 0 to pred(self.Count) do
        begin
         leaf:= self.SplitLeaf[j] as TSplitLeafSpvC45;
         v:= leaf.Dist.TabFreq.Value[0] / sum;
         if (v > 0)
          then ratio := ratio + v * log2(v);
        end;
       //modifier le signe
       ratio:= -1.0 * ratio;
       //calculer le gain ratio
       if (ratio > 0.0)
        then gain:= gain / ratio
        else gain:= 0.0;
      end;
   end;
 //renvoyer le rsultat
 result:= gain;
end;

function TSplitAttributSpvC45.getClassSplitLeaf: TClassSplitLeaf;
begin
 result:= TSplitLeafSpvC45;
end;

{ TLstSplitAttSpvC45 }

function TLstSplitAttSpvC45.getClassSplitAttribut: TClassSplitAttribut;
begin
 result:= TSplitAttributSpvC45;
end;

{ TMLTreeNodeSpvC45 }

function TMLTreeNodeSpvC45.addErrs(N, CE, CF: double): double;
Const Val: Array[0..8] Of Real = (0,0.001,0.005,0.01,0.05,0.10,0.20,0.40,1.00);
      Dev: Array[0..8] Of Real = (100,3.09,2.58,2.33,1.65,1.28,0.84,0.25,0.00);
      Coeff: Extended = 0;
Var Val0,Pr: Extended;
    i: Byte;
Begin
 //gestion des exceptions, surtout pour N=0
 If (N>0)
  Then
   Begin
     i:= 0;
     While (CF>Val[i]) Do Inc(i);
     Coeff:= Dev[i-1]+(Dev[i]-Dev[i-1])*(CF-Val[i-1])/(Val[i]-Val[i-1]);
     Coeff:= Coeff*Coeff;
     If (ce<1E-6)
      Then AddErrs:= N*(1-Exp(ln(CF)/N))
      Else
       If (ce<0.9999)
        Then
         Begin
          Val0:= N*(1-Exp(ln(CF)/N));
          addErrs:= Val0+ce*(addErrs(N,1.0,CF)-Val0);
         End
        Else
         If ((ce+0.5)>=N)
          Then AddErrs:= 0.67*(N-ce)
          Else
           Begin
            Pr:= (ce+0.5+Coeff/2+SQRT(Coeff*((ce+0.5)*(1-(ce+0.5)/N)+Coeff/4)))/(N+Coeff);
            AddErrs:= N*Pr-ce;
           End;
   End
  Else addErrs:= 0;//parce qu'il n'y a pas d'individus couverts...,  revoir plus tard du point de vue thorique
End;

procedure TMLTreeNodeSpvC45.AssignConclusion;
var ce: double;
begin
 inherited AssignConclusion();
 //calculer les contre-exemples
 ce:= self.DistClass.TabFreq.Value[0] - self.DistClass.TabFreq.Value[self.NodeConclusion];
 //et la borne haute de l'intervalle de confiance du nombre de contre-exemples
 FPessimisticErr:= ce + self.addErrs(self.DistClass.TabFreq.Value[0],ce,(self.PrmMethod as TOpPrmSpvTreeC45).PLevel);
end;

function TMLTreeNodeSpvC45.getClassLstSplitAttributes: TClassLstSplitAttributes;
begin
 result:= TLstSplitAttSpvC45; 
end;

function TMLTreeNodeSpvC45.isNoSplitNeeded: boolean;
begin
  result:= (self.DistClass.NbExamples < 2);
end;

function TMLTreeNodeSpvC45.isPrunable: boolean;
var ok: boolean;
    j: integer;
begin
 ok:= not(self.isLeaf());
 //il ne rentre mme pas dans la boucle si c'est une feuille
 for j:= 0 to pred(self.getCountSuccessors()) do
  ok:= ok and self.getSuccessor(j).isLeaf();
  //
 result:= ok;
end;

{ TMLTreeStructureSpvC45 }

function TMLTreeStructureSpvC45.getClassMLTreeNode: TClassMLTreeNode;
begin
 result:= TMLTreeNodeSpvC45; 
end;

//comparaison -- pour le tri selon invers selon le niveau
function compareNodeAsDepth(Item1, Item2: Pointer): Integer;
begin
 if (TMLTreeNodeSpvC45(Item1).Depth > TMLTreeNodeSpvC45(Item2).Depth)
  then result:= +1
  else
   begin
    if (TMLTreeNodeSpvC45(Item1).Depth < TMLTreeNodeSpvC45(Item2).Depth)
     then result:= -1
     else result:= 0;
   end;
end;
//-----------------------------------------------------------

procedure TMLTreeStructureSpvC45.PostPruning;
var srtLeaves: TObjectList;
    i: integer;
    leaf,predecessor,successor: TMLTreeNodeSpvC45;
    errRef,errLeaves: double;
begin
 TraceLog.WriteToLogFile(format('[C4.5] number of leaves >> before << pruning = %d',[self.CountLeaves]));
 srtLeaves:= TObjectList.Create(FALSE);
 //rcuprer les feuilles
 for i:= 0 to pred(self.CountLeaves) do
  srtLeaves.Add(self.Leaf[i]);
 //trier selon le niveau
 srtLeaves.Sort(compareNodeAsDepth);
 //lancer le post-pruning -- s'il n'y a qu'une feuille, c'est la racine !
 while (srtLeaves.Count > 1) do
  begin
   //rcuprer le premier noeud, qui est en fait le plus profond
   leaf:= srtLeaves.Items[srtLeaves.Count-1] as TMLTreeNodeSpvC45;
   //si c'est pas la racine
   if (leaf.Predecessor <> NIL)
    then
     begin
      predecessor:= leaf.Predecessor as TMLTreeNodeSpvC45;
      if predecessor.isPrunable()
       then
        begin
         //calculer l'erreur pessimiste
         errRef:= predecessor.pessimistic;
         //pour chaque enfant
         errLeaves:= 0.0;
         for i:= 0 to pred(predecessor.getCountSuccessors()) do
          begin
           successor:= predecessor.getSuccessor(i) as TMLTreeNodeSpvC45;
           errLeaves:= errLeaves + successor.pessimistic;
           //quelle que soit le rsultat, les successeurs seront exclus du calcul au tour suivant
           srtLeaves.Extract(successor);
          end;
         //comparer pour savoir s'il faut laguer ou pas
         if (errRef <= errLeaves)
          then
           begin
            predecessor.pruneFromHere();
            //le prdecesseur devient une feuille -- candidat  la suppression donc
            srtLeaves.Add(predecessor);
            srtLeaves.Sort(compareNodeAsDepth);
           end;
        end
       //else srtLeaves.Extract(leaf);
       else srtLeaves.delete(srtLeaves.Count-1);//c'est plus rapide !!!
     end;
  end;
 srtLeaves.Free();
 TraceLog.WriteToLogFile(format('[C4.5] number of leaves >> after << pruning = %d',[self.CountLeaves]));
end;

end.
