(**********************************************************************)
(* UCalcSpvTreeDefinition.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(**********************************************************************)

{
@abstract(Definition de base d'un arbre de dcision supervis - algo ID3 de base)
@author(Ricco)
@created(12/01/2004)
}
unit UCalcSpvTreeDefinition;

interface

USES
        UCalcStatDes, UDatasetdefinition, UDatasetImplementation,
        UOperatorDefinition,
        UDatasetExamples,
        UCalcRulesDefinition,
        UCalcTreeStructureDefinition,
        UCalcDistribution;

TYPE
        {la feuille supervise}
        TSplitLeafSpv = class(TSplitLeaf)
                        private
                        FDist: TCalcStatDesDiscrete;
                        public
                        constructor create(prmTargetAttributes: TLstAttributes; prmCond: TRuleCondition; nbExamples: integer); override;
                        procedure   BeginUpdate(); override;
                        procedure   AddExample(prmExample: integer); override;
                        procedure   EndUpdate(); override;
                        procedure   Merge(prmOther: TSplitLeaf); override;
                        destructor  Destroy; override;
                        property    Dist: TCalcStatDesDiscrete read FDist;
                        end;

        {le split d'un attribut en supervis}
        TSplitAttributSpv = class(TSplitAttribut)
                            private
                            protected
                            function    getClassSplitLeaf(): TClassSplitLeaf; override;
                            function    ComputeGoodness(): double; override;
                            function    ComputeAcceptSplit(): boolean; override;
                            procedure   OptimizeContinuousLeaves(); override;
                            end;

        {la liste de split en supervis}
        TLstSplitAttSpv = class(TLstSplitAttributes)
                          protected
                          function  getClassSplitAttribut(): TClassSplitAttribut; override;
                          end;

        {un noeud de l'arbre}
        TMLTreeNodeSpv = class(TMLTreeNode)
                         private
                         FDistClass: TCalcStatDesDiscrete;
                         FNodeConclusion: TTypeDiscrete;
                         protected
                         function  isNoSplitNeeded(): boolean; override;
                         function  getClassLstSplitAttributes(): TClassLstSplitAttributes; override;
                         procedure computeLocalInfos(prmExamples: TExamples); override;
                         procedure getLeafInfos(prmLeaf: TSplitLeaf); override;
                         {assigner une conclusion au noeud}
                         procedure AssignConclusion();
                         public
                         function   getHTMLLeafInfos(): string; override;
                         function   getTXTLeafInfos(): string; override;                         
                         destructor destroy; override;
                         property   DistClass: TCalcStatDesDiscrete read FDistClass;
                         property   NodeConclusion: TTypeDiscrete read FNodeConclusion;
                         end;

        {la structure de l'arbre}
        TMLTreeStructureSpv = class(TMLTreeStructure)
                              private
                              {attribut  prdire}
                              FClassAttribute: TAttDiscrete;
                              protected
                              function   getClassMLTreeNode(): TClassMLTreeNode; override;
                              public
                              constructor create(paramMethod: TOperatorParameter; prmTarget,prmInput: TLstAttributes; prmExamples: TExamples); override;
                              {assigner une concusion  tous les sommets}
                              procedure   PostGrowing(); override;
                              {post-pruning -- non utilis dans ID3}
                              procedure   PostPruning(); override;
                              {pointeur sur attribut  prdire}
                              property ClassAttribute: TAttDiscrete read FClassAttribute;
                              end;  

implementation

uses
        Sysutils,
        UCompSpvTreeID3, UConstConfiguration, ULogFile;

{ TSplitLeafSpv }

procedure TSplitLeafSpv.AddExample(prmExample: integer);
begin
 inherited AddExample(prmExample);
 FDist.AddValue(prmExample);
end;

procedure TSplitLeafSpv.BeginUpdate;
begin
 inherited BeginUpdate();
 FDist.BeginUpdate();
end;

constructor TSplitLeafSpv.create(prmTargetAttributes: TLstAttributes;
  prmCond: TRuleCondition; nbExamples: integer);
begin
 inherited Create(prmTargetAttributes,prmCond,nbExamples);
 //et la stat locale, le cast peut planter si mauvaise dfinition mais c'est forc pour viter toute confusion
 FDist:= TCalcStatDesDiscrete.Create(prmTargetAttributes.Attribute[0] as TAttDiscrete,NIL);
end;

destructor TSplitLeafSpv.Destroy;
begin
 FDist.Free;
 inherited;
end;

procedure TSplitLeafSpv.EndUpdate;
begin
 inherited EndUpdate();
 FDist.EndUpdate();
end;

procedure TSplitLeafSpv.Merge(prmOther: TSplitLeaf);
begin
 inherited Merge(prmOther);
 FDist.Merge((prmOther as TSplitLeafSpv).Dist);
end;

{ TSplitAttribitSpv }

function TSplitAttributSpv.ComputeAcceptSplit: boolean;
var prm: TOpPrmSpvTree;
    i,nb: integer;
    ok: boolean;
begin
 prm:= PrmMethod as TOpPrmSpvTree;
 //premier test
 ok:= (self.GoodnessSplit>prm.GoodnessThresold);
 //second test  mettre en oeuvre ?
 if ok
  then
   begin
     nb:= 0;
     for i:= 0 to pred(self.Count) do
      begin
       //modif le 01/05/2004 -- test en >= et non plus >
       if (self.SplitLeaf[i].NbExamples>=prm.SizeAfterSplit)
        then inc(nb);
      end;
     ok:= (nb>=2);// la paramtre  spcifier
   end;
 result:= ok;
end;

function TSplitAttributSpv.ComputeGoodness: double;
var localNode: TMLTreeNodeSpv;
    total,partial,v: double;
    i: integer;
    leaf: TSplitLeafSpv;
begin
 localNode:= self.Node as TMLTreeNodeSpv;
 total:= localNode.DistClass.Shannon();
 partial:= 0.0;
 for i:= 0 to pred(self.Count) do
  begin
   leaf:= self.SplitLeaf[i] as TSplitLeafSpv;
   v:= 1.0*leaf.Dist.TabFreq.Value[0]/(1.0*localNode.DistClass.TabFreq.Value[0]);
   partial:= partial+v*leaf.Dist.Shannon();
  end;
 //le gain d'entropie
 //comme la construction de l'arbre n'est pas ordonn, il est inutile de pondrer par le poids du neoud
 //il en serait tout autrement s'il s'agissait d'un graphe... ou de la mthode Catlett limit en splits...
 result:= -1.0*(total-partial);
end;

function TSplitAttributSpv.getClassSplitLeaf: TClassSplitLeaf;
begin
 result:= TSplitLeafSpv;
end;

{ TLstSplitAttSpv }

function TLstSplitAttSpv.getClassSplitAttribut: TClassSplitAttribut;
begin
 result:= TSplitAttributSpv;
end;

{ TMLTreeNodeSpv }

procedure TMLTreeNodeSpv.AssignConclusion;
var iMax: TTypeDiscrete;
    //vMax,v: integer;
begin
 (*
 iMax:= 0;
 vMax:= -1;
 if (FDistClass.NbExamples>0)
  then
   begin
     for i:= 1 to FDistClass.Attribute.nbValues do
      begin
       v:= FDistClass.TabFreq.Value[i];
       if (v>vMax)
        then
         begin
          iMax:= i;
          vMax:= v;
         end;
      end;
   end;
 *)
 iMax:= FDistClass.TabFreq.getIndexMaxValue();
 //si pas de conclusion alors conclusion alatoire
 if (iMax = 0)
  then iMax:= succ(random(FDistClass.Attribute.nbValues));
 //affecter
 FNodeConclusion:= iMax;
end;

procedure TMLTreeNodeSpv.computeLocalInfos(prmExamples: TExamples);
begin
 inherited ComputeLocalInfos(prmExamples);
 //distribution de la classe sur le sommet
 FDistClass:= TCalcStatDesDiscrete.Create((TreeStructure as TMLTreeStructureSpv).ClassAttribute,prmExamples);
end;

destructor TMLTreeNodeSpv.destroy;
begin
 if assigned(FDistClass)
  then freeandnil(FDistClass);
 inherited destroy;
end;

function TMLTreeNodeSpv.getClassLstSplitAttributes: TClassLstSplitAttributes;
begin
 result:= TLstSplitAttSpv; 
end;

function TMLTreeNodeSpv.getHTMLLeafInfos: string;
var s: string;
begin
 s:= format(' then %s = <b>%s</b>',[FDistClass.Attribute.Name,FDistClass.Attribute.LstValues.getDescription(FNodeConclusion)]);
 s:= s+format(' (%.2f %s of %d examples)',[100.0*FDistClass.TabFreq.Frequence[FNodeConclusion],'%',FDistClass.NbExamples]);
 result:= s;
end;

function TMLTreeNodeSpv.getTXTLeafInfos: string;
var s: string;
begin
 s:= format('Then %s = %s',[FDistClass.Attribute.Name,FDistClass.Attribute.LstValues.getDescription(FNodeConclusion)]);
 s:= s+format(' [%.4f]',[FDistClass.TabFreq.Frequence[FNodeConclusion]]);
 result:= s;
end;

procedure TMLTreeNodeSpv.getLeafInfos(prmLeaf: TSplitLeaf);
var leaf: TSplitLeafSpv;
begin
 inherited getLeafInfos(prmLeaf);
 leaf:= prmLeaf as TSplitLeafSpv;
 FDistClass:= leaf.Dist.Duplicate() as TCalcStatDesDiscrete;
end;

function TMLTreeNodeSpv.isNoSplitNeeded: boolean;
var prm: TOpPrmSpvTree;
begin
 prm:= self.PrmMethod as TOpPrmSpvTree;
 //sommet trop petit pour tre splitt
 //ou profondeur excessive
 result:= (DistClass.NbExamples<prm.SizeBeforeSplit) OR (self.Depth>=prm.MaxDepth)
end;

{ TMLTreeStructureSpv }

constructor TMLTreeStructureSpv.create(paramMethod: TOperatorParameter; prmTarget, prmInput: TLstAttributes;
  prmExamples: TExamples);
begin
 FClassAttribute:= prmTarget.Attribute[0] as TAttDiscrete;
 inherited Create(paramMethod, prmTarget,prmInput,prmExamples);
end;

function TMLTreeStructureSpv.getClassMLTreeNode: TClassMLTreeNode;
begin
 result:= TMLTreeNodeSpv;
end;

procedure TMLTreeStructureSpv.PostGrowing;
var i: integer;
    node: TMLTreeNodeSpv;
begin
 for i:= 0 to pred(self.CountNodes) do
  begin
   node:= self.Node[i] as TMLTreeNodeSpv;
   node.AssignConclusion();
  end;
end;

procedure TSplitAttributSpv.OptimizeContinuousLeaves;
var ls,rs: TSplitLeafSpv;
    exSorted: TExamples;
    i,example,numAtt: integer;
    v,vMax: double;
    m,mMax: TTypeContinue;
    n,vt: double;
begin
 //toujours binaire donc
 ls:= self.SplitLeaf[0] as TSplitLeafSpv;
 rs:= self.SplitLeaf[1] as TSplitLeafSpv;
 //prendre la liste trie pour l'instant,  voir les optimisations plus tard

   //rcuprer celui envoy par le sommet - ouh l c'est pas beau
   numAtt:= Node.InputAttributes.GetIndex(Attribute);
   exSorted:= nil;
   if (numAtt<Node.InputSortedExamples.Count)
    then exSorted:= Node.InputSortedExamples.Items[numAtt] as TExamples;
   if not(assigned(exSorted)) OR (exSorted.Size<>Node.Examples.Size)
    //c'est la cata
    then
     begin
      TraceLog.WriteToLogFile('mauvaise rcupration de la liste trie');
      Raise Exception.Create('mauvaise rcupration de la liste trie');
     end;

 ls.Dist.BeginUpdate();//il n'y a personne sur cette portion pour l'instant
 rs.Dist.RefreshStat(Node.Examples);//il y a tout le monde sur cette portion pour l'instant
 mMax:= self.Attribute.cValue[exSorted.Number[1]];
 vMax:= 0.0;
 vt:= (Node as TMLTreeNodeSpv).DistClass.Shannon();
 n:= 1.0*(Node as TMLTreeNodeSpv).DistClass.TabFreq.Value[0];
 //pour chaque individu
 for i:= 1 to pred(exSorted.Size) do
  begin
   example:= exSorted.Number[i];
   ls.Dist.AddValue(example);
   rs.Dist.RemoveValue(example);
   //pas de tests  effectuer sur les ex-aequos
   if (self.Attribute.cValue[example]<self.Attribute.cValue[exSorted.Number[succ(i)]])
    then
     begin
      m:= 0.5*(self.Attribute.cValue[example]+self.Attribute.cValue[exSorted.Number[succ(i)]]);
      //v:= self.ComputeGoodness();
      //plus rapide, plein de choses ne sont plus recalcules
      v:= -1.0*(vt-((1.0*ls.Dist.TabFreq.Value[0]/n)*ls.Dist.Shannon()+(1.0*rs.Dist.TabFreq.Value[0]/n)*rs.Dist.Shannon()));
      //suite standard
      if (v>vMax)
       then
        begin
         vMax:= v;
         mMax:= m;
        end;
     end;
  end;
 //vider la liste de tri si elle est cre localement
 //fichier log
 //TraceLog.WriteToLogFile(format('discretize %s at %.2f >> split value %.4f',[self.Attribute.Name,mMax,vMax]));
 //rafrachir la liste des enfants, on peut se passer du tri mais pas de cette partie....
 (ls.Condition as TRuleCondContinue).SetNewThresold(mMax);
 (rs.Condition as TRuleCondContinue).SetNewThresold(mMax);
 self.BeginUpdate();
 for i:= 1 to self.Node.Examples.Size do
  begin
   example:= self.Node.Examples.Number[i];
   m:= self.Attribute.cValue[example];
   //si ce n'est toi, c'est donc ton frre
   if ls.Condition.TestValue(m)
    then ls.AddExample(example)
    else rs.AddExample(example);
  end;
 self.EndUpdate();
end;

procedure TMLTreeStructureSpv.PostPruning;
begin
 //none...
end;

end.
