(**************************************************************************)
(* UCompSpvLogisticRegression.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(**************************************************************************)

{
@abstract(Rgression logistique, attention l'attribut classe ne peut prendre que deux modalits !!!)
@author(Ricco)
@created(12/01/2004)

Premire bauche, se documenter plus en dtail pour les statistiques indiquant la qualit de la rgression
(globalement, test du chi-2, les diffrents R2, etc.), et les aides  l'interprtation (odds-ratios).

L'algo d'optimisation utilis ici est Levenberg-Marquardt, qui a l'avantage de proposer directement
la matrice hessienne  l'optima, et donc une estimation adquate de la variance.

new -- 15/04/2005 -- intgration de la version Juillet 2004 de la biblio de J. DEBORD -- biblio "d6_xxxxx.pas" -- seuls les parties utiles  l'optimisation ont t intgres 
}
unit UCompSpvLogisticRegression;

interface

USES
        Forms, Contnrs,
        UCompDefinition,
        UCompSpvLDefinition,
        UOperatorDefinition,
        UDatasetDefinition,
        UDatasetImplementation,
        UDatasetExamples,
        UCalcDistribution,
        Stat,
        //new test with biblio ver. 07/2004
        d6_matrices, d6_fmath,
        UCalcSpvStructScore;

TYPE
        {le gnrateur de composant Supervised}
        TMLGCompLogisticReg = class(TMLGenCompSpvLearning)
                              protected
                              procedure   GenCompInitializations(); override;
                              public
                              function    GetClassMLComponent: TClassMLComponent; override;
                              end;

        {le composant Spv}
        TMLCompLogisticReg = class(TMLCompSpvLearning)
                             protected
                             function    getClassOperator: TClassOperator; override;
                             function    GetLogResultDescription(): string; override;
                             end;

        {l'oprateur}
        TOpLogisticReg = class(TOpSpvLearningContinuous)
                          protected
                          function    ConnectClassAtt(prmData: TMLDataset): boolean; override; 
                          function    getClassParameter: TClassOperatorParameter; override;
                          function    getClassSpvLearning(): TClassCalcSpvLearning; override;
                          end;

        {paramtrage de l'oprateur}
        TOpPrmLogisticReg = class(TOpPrmSpvLearning)
                            protected
                            {pas de paramtres pour l'instant, ajouter les options plus tard}
                            function    CreateDlgParameters(): TForm; override;
                            end;

        {le calculateur}
        TCalcLogisticReg = class(TCalcSpvLearning)
                           private
                           {Code d'erreur renvoye par la procdure d'optimisation}
                           FErrCodeOptimization: integer;
                           {la valeur optimise}
                           FMinLogLikelihood: float;
                           {les coefficients de l'quation}
                           FCoef: TVector;
                           {l'inverse de la matrice hessienne}
                           FHInv: TMatrix;
                           {le vecteurt des cart-types}
                           FStdDev: TVector;
                           {les individus  manipuler}
                           FCurExamples: TExamples;
                           {Chi-2 et proba associe}
                           FChi2, FPrChi2: double;
                           {initialiser les coefficients}
                           procedure   initCoefs();
                           {calculer les statistiques, ex. wald etc.}
                           procedure   CalcStats();
                           {la fonction  optimiser}
                           function    Log_Likelihood(X: TVector): float;
                           {lancer l'optimisation}
                           procedure   RunOptimization();
                           protected
                           {crer les structures de calcul}
                           procedure   createStructures(); override;
                           {vider les structures}
                           procedure   destroyStructures(); override;
                           {apprentissage proprement dit}
                           function    coreLearning(examples: TExamples): boolean; override;
                           public
                           procedure   getScore(example: integer; var postProba: TTabScore); override;
                           function    getHTMLResults(): string; override;
                           end;


implementation

USES
        Windows, Sysutils, Classes, Math,
        fmath,
        d6_optim,
        UConstConfiguration, UCalcStatDes, UStringsResources, ULogFile;

CONST
        MAX_ITER_OPTMIZATION_LOGISTIC = 50;
        TOLERANCE_OPTIMIZATION_LOGISTIC = 1.0e-3;


{ TMLGCompLogisticReg }

procedure TMLGCompLogisticReg.GenCompInitializations;
begin
 FMLComp:= mlcSpvLearning;
end;

function TMLGCompLogisticReg.GetClassMLComponent: TClassMLComponent;
begin
 result:= TMLCompLogisticReg;
end;

{ TMLCompLogisticReg }

function TMLCompLogisticReg.getClassOperator: TClassOperator;
begin
 result:= TOpLogisticReg;
end;

function TMLCompLogisticReg.GetLogResultDescription: string;
begin
 result:= format('Logistic regression result generated [%s]',[self.Description]);
end;

{ TOpLogisticReg }

function TOpLogisticReg.ConnectClassAtt(prmData: TMLDataset): boolean;
var ok: boolean;
begin
 //discret ?
 ok:= inherited ConnectClassAtt(prmData);
 //binaire ?
 {$B-}
 ok:= ok and (ClassAttribute.nbValues = 2);
 //retour
 result:= ok;
end;

function TOpLogisticReg.getClassParameter: TClassOperatorParameter;
begin
 result:= TOpPrmLogisticReg;
end;

function TOpLogisticReg.getClassSpvLearning: TClassCalcSpvLearning;
begin
 result:= TCalcLogisticReg;
end;

{ TOpPrmLogisticReg }

function TOpPrmLogisticReg.CreateDlgParameters: TForm;
begin
 result:= NIL;
end;

{ TCalcLogisticReg }

procedure TCalcLogisticReg.CalcStats;
var i: integer;
    a,v,vraisemblance: double;
    tmpVec: TVector;
begin
 //la vraisemblance avec un seul coef
 //si l'quation tait une simple constante, le coef estim directement sera
 v:= StatClassAtt.TabFreq.Frequence[2];
 a:= ln(v/(1.0-v));
 //calculer la vraisemblance de ce modle
 dimVector(tmpVec,Descriptors.Count);
 tmpVec[0]:= a;
 //vraisemblance du modle constitu uniquement de la constante
 vraisemblance:= self.Log_Likelihood(tmpVec);
 
 //>>delVector(tmpVec,Descriptors.Count);
 tmpVec:= nil;

 //le chi-2
 FChi2:= vraisemblance-FMinLogLikelihood;
 //proba associe
 FPrChi2:= fmath.PKHI2(Descriptors.Count,FChi2);
 //l'cart-type
 for i:= 0 to descriptors.Count do
  if (FHInv[i][i]<=0.0)
   then FStdDev[i]:= 0.0
   else FStdDev[i]:= SQRT(2.0*FHInv[i][i]);
end;

procedure TCalcLogisticReg.getScore(example: integer;
  var postProba: TTabScore);
var v,proba: double;
    j: integer;
begin
 //calculer la proba P(Y=2/X)
 //combinaison linaire
 v:= FCoef[0];
 for j:= 1 to descriptors.Count do
  v:= v+FCoef[j]*descriptors.Attribute[pred(j)].cValue[example];
 //la proba
 proba:= 1.0/(1.0+exp(-v));
 //affecation
 postProba[0]:= 1.0;
 postProba[1]:= 1.0-proba;
 postProba[2]:= proba;
end;

function TCalcLogisticReg.coreLearning(examples: TExamples): boolean;
begin
 result:= true;
 try
 FCurExamples:= examples;
 self.initCoefs();
 self.RunOptimization();
 self.CalcStats();
 except
 result:= false;
 end;
end;

procedure TCalcLogisticReg.createStructures;
begin
 dimVector(FCoef,Descriptors.Count);
 dimMatrix(FHInv,Descriptors.Count,Descriptors.Count);
 dimVector(FStdDev,Descriptors.Count);
end;

procedure TCalcLogisticReg.destroyStructures;
begin
 //>
 //delVector(FCoef,Descriptors.Count);
 FCoef:= nil;
 //delMatrix(FHInv,Descriptors.Count,Descriptors.Count);
 FHInv:= nil;
 //delVector(FStdDev,Descriptors.Count);
 FStdDev:= nil;
end;

function TCalcLogisticReg.getHTMLResults: string;
var s: string;
    Wald,pWald: double;
    i: integer;
begin
 s:= '';
 if (FErrCodeOptimization = OPT_NON_CONV)
  then s:= Format('<BLINK>Carreful -- No convergence after %d iterations </BLINK>',[MAX_ITER_OPTMIZATION_LOGISTIC]);

 s:= s+'<P><B>Adjustement quality</B><BR>';
 // complter la premire partie des rsultats
 s:= s+HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+format('<TD>Predicted attribute</TD><TD>%s</TD></TR>',[ClassAttribute.Name]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+format('<TD>Number of examples</TD><TD>%d</TD></TR>',[FCurExamples.Size]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>-2 Log Likelihood</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FMinLogLikelihood]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>Chi-2</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FChi2]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>P(>Chi-2)</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FPrChi2]); 
 s:= s+'</table>';
 //les coefficients
 s:= s+'<P><B>Attributes in the equation</B><BR>';
 s:= s+HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_HEADER_GRAY+
       '<TH width=120>Attribute</TH><TH width=80>Coef.</TH><TH width=80>Std-dev</TH><TH width=80>Wald</TH><TH width=80>Signif</TH></TR>';
 for i:= 0 to descriptors.count do
  begin
   s:= s+HTML_TABLE_COLOR_DATA_GRAY;
   if (i=0)
    then s:= s+'<TD>constant</TD>'
    else s:= s+'<TD>'+descriptors.Attribute[pred(i)].Name+'</TD>';
   //statistique de wald
   Wald:= 0.0;
   pWald:= 0.0;
   TRY
   if (FStdDev[i]>0.0)
    then Wald:= SQR(FCoef[i]/FStdDev[i])
    else Wald:= 0.0;
   if (Wald>0.0)
    then pWald:= fmath.PKHI2(1,Wald)
    else pWald:= 0.0;
   EXCEPT
   END;
   //envoyer la sauce
   s:= s+format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',
                [FCoef[i],FStdDev[i],Wald,pWald]);
   s:= s+'</TR>';
  end;
 s:= s+'</table>';
 //renvoyer le tout
 result:= s;
end;

procedure TCalcLogisticReg.initCoefs;
var i: integer;
begin
 for i:= 0 to descriptors.Count do
  FCoef[i]:= 0.5;
end;

function TCalcLogisticReg.Log_Likelihood(X: TVector): d6_fmath.float;
var i,j: integer;
    //s,v,tmpY: double;
    s,v,tmpY: extended;//pour prvenir les dbordements de capacit...
    example: integer;
begin
 TRY
 s:= 0.0;
 //pour chaque observation
 for i:= 1 to FCurExamples.Size do
  begin
   example:= FCurExamples.Number[i];
   //constante
   v:= X[0];
   //produit scalaire
   for j:= 1 to descriptors.Count do
    v:= v+X[j]*descriptors.Attribute[pred(j)].cValue[example];
   //hypothse implicite, Y est cod y1=1 et y2=2, c'est ce qui utilis dans la structure LSTVALUES
   //proba P(Y=2/X)
   v:= 1.0/(1.0+exp(-v));
   if (v>0.0) and (v<1.0)
    then
     begin
      //log vraisemblance
      //1.0-1.0=0.0 et 2.0-1.0=1.0 au cas o on en douterait
      //en tous les cas, c'est nettement plus rapide qu'un IF
      tmpY:= ClassAttribute.cValue[example]-1.0;
      v:= tmpY*ln(v)+(1.0-tmpY)*ln(1.0-v);
      //somme de la vraisemblance
      s:= s+v;
     end;
  end;
 //pour la minimisation, on passe au ngatif
 result:= -2.0*s;
 EXCEPT
   on e: exception do
   begin
    TraceLog.WriteToLogFile(Format('[LOGREG] error LL function == %s',[e.Message]));
    result:= 0.0001*MATH.MaxDouble;
   end;
 END;
end;

procedure TCalcLogisticReg.RunOptimization;
begin
 FErrCodeOptimization:= d6_optim.Marquardt(Log_Likelihood,d6_optim.NumHessGrad,FCoef,0,descriptors.Count,MAX_ITER_OPTMIZATION_LOGISTIC,TOLERANCE_OPTIMIZATION_LOGISTIC,FMinLogLikelihood,FHInv);
end;

initialization
 Classes.RegisterClass(TMLGCompLogisticReg);
end.
