(******************************************************************)
(* UCompSpvNaiveBayes.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(******************************************************************)

{
@abstract(Un apprentissage bayesien naf trs simple, les descripteurs sont dj discrtiss)
@author(Ricco)
@created(12/01/2004)
}
unit UCompSpvNaiveBayes;

interface

USES
        Forms, IniFiles,
        UCompDefinition,
        UCompSpvLDefinition,
        UOperatorDefinition,
        UDatasetDefinition,
        UDatasetImplementation,
        UDatasetExamples,
        UCalcStatDesCrossTab,
        UCalcDistribution,
        UCalcSpvStructScore;

TYPE
        {le gnrateur de composant Supervised}
        TMLGCompNaiveBayes = class(TMLGenCompSpvLearning)
                             protected
                             procedure   GenCompInitializations(); override;
                             public
                             function    GetClassMLComponent: TClassMLComponent; override;
                             end;

        {le composant Spv}
        TMLCompNaiveBayes = class(TMLCompSpvLearning)
                            protected
                            function    getClassOperator: TClassOperator; override;
                            function    GetLogResultDescription(): string; override;
                            end;

        {l'oprateur}
        TOpNaiveBayes = class(TOpSpvLearningDiscrete)
                          protected
                          function    getClassParameter: TClassOperatorParameter; override;
                          function    getClassSpvLearning(): TClassCalcSpvLearning; override;
                          end;

        {paramtrage de l'oprateur}
        TOpPrmNaiveBayes = class(TOpPrmSpvLearning)
                           protected
                           {montrer les distributions conditionnelles}
                           FShowCondDist: boolean;
                           //new -- 29/12/2005 -- introduire la possibilit d'utiliser le laplacien (0 : non ; 1 : oui [dfaut])
                           FUseLaplacian : integer;
                           //new -- 29/12/2005 -- la valeur du lambda, par dfaut 1.0
                           FLambdaLaplacian: double;
                           function    CreateDlgParameters(): TForm; override;
                           procedure   SetDefaultParameters(); override;
                           public
                           function    getHTMLParameters(): string; override;
                           procedure   LoadFromINI(prmSection: string; prmINI: TMemIniFile); override;
                           procedure   SaveToINI(prmSection: string; prmINI: TMemIniFile); override;
                           property    showCondDist: boolean read FShowCondDist write FShowCondDist;
                           property    useLaplacian: integer read FUseLaplacian write FUseLaplacian;
                           property    lambdaLaplacian: double read FLambdaLaplacian write FLambdaLaplacian;
                           end;

        {structure ultra-rapide avec 0 contrle pour les probas de naive bayes}
        TColProba = array of double;
        TTabProba = array of TColProba;
        TStatProba = array of TTabProba;

        {la classe de calcul, un petit baysien naf fera l'affaire, on prend le produit simple !!!}
        TCalcNaiveBayes = class(TCalcSpvLearning)
                          protected
                          {distribution des classe}
                          FDistClass: TTabFrequence;
                          {liste des stats de croisement}
                          FStatCross: TLstCalcStatDesCrossTab;
                          {structure internes rapide de proba conditionnelle}
                          FProbaCond: TStatProba;
                          //new -- 29/12/2005 -- utilisation du laplacien -- si pas utilis, on le pose  zro tout simplement
                          FLambdaLaplacian: double;
                          {construire la structure de proba conditionnelle}
                          procedure   buildStatProba();
                          {crer la liste}
                          procedure   createStructures(); override;
                          {vider la liste}
                          procedure   destroyStructures(); override;
                          {apprentissage proprement dit}
                          function    coreLearning(examples: TExamples): boolean; override;
                          public
                          //rcuprer les paramtres de lambda et de lapacien
                          constructor create(prmOpSpv: TOpPrmSpvLearning; prmClass: TAttribute; prmDescriptors: TLstAttributes; prmAllAttributes: TLstAttributes); override;
                          procedure   getScore(example: integer; var postProba: TTabScore); override;
                          //on le maintient ici car getScore peut tre sujet  des dbordements souvent lorsque le nombre de variables est lev (cf. fichier debugfile.txt)
                          procedure   classification(example: integer; var response: TTypeDiscrete); override;
                          function    getHTMLResults(): string; override;
                          end;




implementation

uses
        Math,
        Classes, SysUtils, ULogFile, UCalcCrossTab, UStringsResources,
        UDlgOpPrmSpvNaiveBayes, UConstConfiguration;

var
        SEUIL_PRECISION_CALC_EXPONENTIELLE : float;

{ TMLGCompNaiveBayes }

procedure TMLGCompNaiveBayes.GenCompInitializations;
begin
 FMLComp:= mlcSpvLearning;
 //FMLNumIcon:= 16;
 //FMLCompName:= str_comp_name_spvl_naive_bayes;
 //FMLBitmapFileName:= 'MLSpvNaiveBayes.bmp';
end;

function TMLGCompNaiveBayes.GetClassMLComponent: TClassMLComponent;
begin
 result:= TMLCompNaiveBayes;
end;

{ TMLCompNaiveBayes }

function TMLCompNaiveBayes.getClassOperator: TClassOperator;
begin
 result:= TOpNaiveBayes;
end;

function TMLCompNaiveBayes.GetLogResultDescription: string;
begin
 result:= format('MLCOMP >> one instance of %s running',[self.ClassName]);
end;

{ TOpNaiveBayes }

function TOpNaiveBayes.getClassParameter: TClassOperatorParameter;
begin
 result:= TOpPrmNaiveBayes;
end;

function TOpNaiveBayes.getClassSpvLearning: TClassCalcSpvLearning;
begin
 result:= TCalcNaiveBayes;
end;

{ TOpPrmNaiveBayes }

function TOpPrmNaiveBayes.CreateDlgParameters: TForm;
begin
 result:= TDlgOpPrmSpvNaiveBayes.CreateFromOpPrm(self);
end;

function TOpPrmNaiveBayes.getHTMLParameters: string;
var s: string;
begin
 s:= '';
 s:= HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_HEADER_GRAY+'<TH colspan=2>Parameters</TH></TR>';
 s:= s+HTML_TABLE_COLOR_DATA_GRAY+format('<TD>Use laplacian</TD><TD align="right">%d</TD></TR>',[FUseLaplacian]);
 s:= s+HTML_TABLE_COLOR_DATA_GRAY+format('<TD>Lambda for laplacian</TD><TD align="right">%.4f</TD></TR>',[FLambdaLaplacian]);
 s:= s+HTML_TABLE_COLOR_DATA_GRAY+format('<TD>Show conditional probabilities</TD><TD align="right">%d</TD></TR>',[ord(FShowCondDist)]);
 s:= s+'</table>';
 //and then...
 result:= s;
end;

procedure TOpPrmNaiveBayes.LoadFromINI(prmSection: string;
  prmINI: TMemIniFile);
begin
 FShowCondDist:= prmINI.ReadBool(prmSection,'show_cond_dist',FShowCondDist);
 FUseLaplacian:= prmINI.ReadInteger(prmSection,'use_laplacian',FUseLaplacian);
 FLambdaLaplacian:= prmINI.ReadFloat(prmSection,'lambda_laplacian',FLambdaLaplacian);
end;

procedure TOpPrmNaiveBayes.SaveToINI(prmSection: string;
  prmINI: TMemIniFile);
begin
 prmINI.WriteBool(prmSection,'show_cond_dist',FShowCondDist);
 prmINI.WriteInteger(prmSection,'use_laplacian',FUseLaplacian);
 prmINI.WriteFloat(prmSection,'lambda_laplacian',FLambdaLaplacian);
end;

procedure TOpPrmNaiveBayes.SetDefaultParameters;
begin
 FShowCondDist:= TRUE;
 //new -- 29/12/2005
 FUseLaplacian:= 1;
 FLambdaLaplacian:= 1.0;
end;

{ TCalcNaiveBayes }

procedure TCalcNaiveBayes.buildStatProba;
var i,j,k: integer;
    stat: TCalcSDCrossTab;
begin
 setLength(FProbaCond,FStatCross.Count);
 //pour chaque descripteur
 for j:= 0 to pred(FStatCross.Count) do
  begin
   stat:= TCalcSDCrossTab(FStatCross.Stat(j));
   //l'indice 0 est inutilis ici - on a le nombre de modalits de l'attribut-classe ici
   setLength(FProbaCond[j],succ(stat.CrossTab.RowCount));
   //pour chaque modalit du descripteur
   for i:= 0 to stat.CrossTab.RowCount do
    begin
     setLength(FProbaCond[j][i],succ(stat.CrossTab.ColCount));
     //remplir cette portion de tableau
     for k:= 1 to stat.CrossTab.ColCount do
      //new -- 29/12/2005 -- lissage avec un laplacien -- lambda = 0.0 si pas utilis
      FprobaCond[j][i][k]:= (1.0*stat.CrossTab.Value[i,k]+FLambdaLaplacian)/
                            (1.0*stat.CrossTab.Value[i,0]+1.0*stat.CrossTab.ColCount*FLambdaLaplacian);
    end;
  end;
end;

procedure TCalcNaiveBayes.getScore(example: integer;
  var postProba: TTabScore);
var s,v: double;
    j: integer;
    i,k: TTypeDiscrete;
    att: TAttribute;
    ok: boolean;
begin
 postProba[0]:= 0.0;
 //pour chaque modalit de l'endogne
 for i:= 1 to ClassAttribute.nbValues do
  begin
   s:= -1.0e308;
   ok:= true;
   //v:= FDistClass.Frequence[i];
   //new -- 29/12/2005 -- lissage avec un laplacien --> paramtre de la mthode
   v:= (1.0*FDistClass.Value[i]+FLambdaLaplacian)/(1.0*FDistClass.Value[0]+1.0*FDistClass.Size*FLambdaLaplacian);
   //en passant par les logs on vite les dbordements de capacit
   if (v>0)
    then
     begin
       //on passe en LOG la proba a priori d'appartenance
       s:= ln(v);
       for j:= 0 to pred(FStatCross.Count) do
        begin
         att:= Descriptors.Attribute[j];
         //la modalit  activer
         k:= att.dValue[example];
         //la valeur du profil ligne
         //stat:= TCalcSDCrossTab(FStatCross.Stat(j));
         //v:= stat.CrossTab.RowFreq[i,k];
         //solution alternative, on gagne 10 malheureuses secondes sur 60s pour covtype !!!
         v:= FProbaCond[j][i][k];
         //la somme des LN
         if (v>0)
          then s:= s+ln(v)
          else
           begin
            ok:= FALSE;
            BREAK;
           end;
        end;
     end;
   //store si computable...
   if ok
    then
     begin
     if (s < SEUIL_PRECISION_CALC_EXPONENTIELLE)
      then
       begin
        postProba[i]:= 0.0;
        TraceLog.WriteToLogFile(format('[NAIVE BAYES] dbordement dans calcul exponentiel = %.4f',[s]));
       end
      else postProba[i]:= exp(s) //on repasse en exponentielle pour avoir des valeurs positives ?
     end
    else postProba[i]:= 0.0;
  end;
end;

procedure TCalcNaiveBayes.classification(example: integer;
  var response: TTypeDiscrete);
var s,smax,v: double;
    j: integer;
    i,iMax,k: TTypeDiscrete;
    //stat: TCalcSDCrossTab;
    att: TAttribute;
    ok: boolean;
begin
 smax:= -1.0e308;
 imax:= 0;
 //pour chaque modalit de l'endogne
 for i:= 1 to ClassAttribute.nbValues do
  begin
   s:= -1.0e308;
   ok:= true;
   //new -- 29/12/2005 -- idem, utilisation du laplacien
   v:= (1.0*FDistClass.Value[i]+FLambdaLaplacian)/(1.0*FDistClass.Value[0]+1.0*FDistClass.Size*FLambdaLaplacian);
   //en passant par les logs on vite les dbordements de capacit
   if (v>0)
    then
     begin
       s:= ln(v);
       for j:= 0 to pred(FStatCross.Count) do
        begin
         att:= Descriptors.Attribute[j];
         //la modalit  activer
         k:= att.dValue[example];
         //la valeur du profil ligne
         //stat:= TCalcSDCrossTab(FStatCross.Stat(j));
         //v:= stat.CrossTab.RowFreq[i,k];
         //solution alternative, on gagne 10 malheureuses secondes sur 60s pour covtype !!!
         v:= FProbaCond[j][i][k];
         //la somme des LN
         if (v>0)
          then s:= s+ln(v)
          else
           begin
            ok:= FALSE;
            BREAK;
           end;
        end;
     end;
   //tester le max
   if ok and (s>smax)
    then
     begin
      smax:= s;
      imax:= i;
     end;
  end;
 //vrification si une conclusion a bien t assigne
 if (iMax=0)
  //conclusion issue de la distribution du classifieur par dfaut
  then iMax:= self.FDistClass.getIndexMaxValue();
 //rponse
 response:= imax;
end;

function TCalcNaiveBayes.coreLearning(examples: TExamples): boolean;
begin
 result:= true;
 try
 FDistClass.Refresh(examples);
 FStatCross.RefreshStat(examples);
 //construire la structure de proba
 buildStatProba();
 except
 result:= false;
 end;
end;

procedure TCalcNaiveBayes.createStructures;
var stat: TCalcSDCrossTab;
    j: integer;
    y,x: TAttribute;
begin
 //distribution de l'endogne
 y:= ClassAttribute;
 FDistClass:= TTabFrequence.CreateFromAtt(y,NIL);
 //les stats croises
 FStatCross:= TLstCalcStatDesCrossTab.Create(NIL,NIL);
 for j:= 0 to pred(Descriptors.Count) do
  begin
   x:= Descriptors.Attribute[j];
   stat:= TCalcSDCrossTab.Create(y,x,nil);
   FStatCross.AddStat(stat);
  end;
end;

procedure TCalcNaiveBayes.destroyStructures;
var i,j: integer;
begin
 for j:= 0 to pred(FStatCross.Count) do
  begin
   for i:= 0 to ClassAttribute.nbValues do
    setLength(FProbaCond[j][i],0);
   setLength(FProbaCond[j],0);
  end;
 Setlength(FProbaCond,0);
 //suite sur les listes
 FDistClass.Free;
 FStatCross.FreeAll;//inutile mais a rassure
 FStatCross.Free;
end;

function TCalcNaiveBayes.getHTMLResults: string;
var s: string;
begin
 s:= format('<P><H4>Prior distribution of class attribute "%s"</H4>',[ClassAttribute.Name]);
 s:= s+FDistClass.getHTMLResult();
 if (self.OpPrmSpv as TOpPrmNaiveBayes).showCondDist
  then
   begin
    s:= s+'<P><H4>Conditionnal distribution : P(descriptor / class attribute)</H4>';
    s:= s+FStatCross.getHTMLResults(ord(vctRow));//pour profil ligne
   end
  else s:= s+'<P><H4>Modifiy option if you want view conditional distribution...</H4></P>';
 //
 result:= s;
end;

constructor TCalcNaiveBayes.create(prmOpSpv: TOpPrmSpvLearning;
  prmClass: TAttribute; prmDescriptors, prmAllAttributes: TLstAttributes);
begin
 inherited;
 FLambdaLaplacian:= (prmOpSpv as TOpPrmNaiveBayes).FLambdaLaplacian;
 //la manipulation du diable -- si pas d'utilisation du laplacien, on passe le lambda  zro -- astuce suprme pour simplifier les critures
 If ((prmOpSpv as TOpPrmNaiveBayes).FUseLaplacian = 0)
  then FLambdaLaplacian:= 0.0;
 //pour vrifications
 TraceLog.WriteToLogFile(format('[NAIVE BAYES] used laplacian = %.2f',[FLambdaLaplacian]));
end;

initialization
 RegisterClass(TMLGCompNaiveBayes);
 //dterminer la prcision pour les exponentielles
 SEUIL_PRECISION_CALC_EXPONENTIELLE:= -1.0*LN(MATH.MaxDouble-1.0);
end.
