(**************************************************************************)
(* UCompSpvLogisticRegression.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(**************************************************************************)

{
@abstract(Rgression logistique, attention l'attribut classe ne peut prendre que deux modalits !!!)
@author(Ricco)
@created(12/01/2004)

Premire bauche, se documenter plus en dtail pour les statistiques indiquant la qualit de la rgression
(globalement, test du chi-2, les diffrents R2, etc.), et les aides  l'interprtation (odds-ratios).

L'algo d'optimisation utilis ici est Levenberg-Marquardt, qui a l'avantage de proposer directement
la matrice hessienne  l'optima, et donc une estimation adquate de la variance.

new -- 15/04/2005 -- intgration de la version Juillet 2004 de la biblio de J. DEBORD -- biblio "d6_xxxxx.pas" -- seuls les parties utiles  l'optimisation ont t intgres
new -- 03/05/2006 -- calcul direct du vecteur gradient et de la matrice hessienne (cf. Nakache & Confais, 2005 -- pp. 82 et annexes + doc. rcupre sur internet V. Jain Fv.2006)
new -- 03/05/2006 -- autre astuce norme, on centre et rduit automatiquement les donnes avant les calculs, a vite les dbordements sur l'exponentielle (!!!)

}
unit UCompSpvLogisticRegression;

interface

USES
        Forms, Contnrs,
        UCompDefinition,
        UCompSpvLDefinition,
        UOperatorDefinition,
        UDatasetDefinition,
        UDatasetImplementation,
        UDatasetExamples,
        UCalcDistribution,
        Stat,
        //new test with biblio ver. 07/2004
        d6_matrices, d6_fmath,
        UCalcSpvStructScore,
        UCalcStatDes;

TYPE
        {le gnrateur de composant Supervised}
        TMLGCompLogisticReg = class(TMLGenCompSpvLearning)
                              protected
                              procedure   GenCompInitializations(); override;
                              public
                              function    GetClassMLComponent: TClassMLComponent; override;
                              end;

        {le composant Spv}
        TMLCompLogisticReg = class(TMLCompSpvLearning)
                             protected
                             function    getClassOperator: TClassOperator; override;
                             function    GetLogResultDescription(): string; override;
                             end;

        {l'oprateur}
        TOpLogisticReg = class(TOpSpvLearningContinuous)
                          protected
                          function    ConnectClassAtt(prmData: TMLDataset): boolean; override; 
                          function    getClassParameter: TClassOperatorParameter; override;
                          function    getClassSpvLearning(): TClassCalcSpvLearning; override;
                          end;

        {paramtrage de l'oprateur}
        TOpPrmLogisticReg = class(TOpPrmSpvLearning)
                            protected
                            {pas de paramtres pour l'instant, ajouter les options plus tard}
                            function    CreateDlgParameters(): TForm; override;
                            end;

        {le calculateur}
        TCalcLogisticReg = class(TCalcSpvLearning)
                           private
                           {Code d'erreur renvoye par la procdure d'optimisation}
                           FErrCodeOptimization: integer;
                           {la valeur optimise}
                           FMinLogLikelihood: float;
                           {les coefficients de l'quation}
                           FCoef: TVector;
                           {l'inverse de la matrice hessienne}
                           FHInv: TMatrix;
                           {le vecteurt des cart-types}
                           FStdDev: TVector;
                           {les individus  manipuler}
                           FCurExamples: TExamples;
                           {Chi-2 et proba associe}
                           FChi2, FPrChi2: double;
                           {Statistiques descriptives sur les descripteurs}
                           FStatDes: TLstCalcStatDesContinuous;
                           {tableaux internes pour centrage-rduction}
                           FCentrage,FReduction: array of float;
                           {tableaux des probas individus}
                           FTabPk: array of float;
                           {calcul des paramtres de centrage et rduction}
                           procedure   computePrmCentrageReduction();
                           {rcuprer la valeur corrige d'un individu (example) sur une variable (j)}
                           function    getCorrectedValue(j: integer; example: integer): float;
                           {initialiser les coefficients}
                           procedure   initCoefs();
                           {calculer les statistiques, ex. wald etc.}
                           procedure   CalcStats();
                           {la fonction  optimiser}
                           function    Log_Likelihood(X: TVector): float;
                           {new -- 03/05/2006 -- calcul de la matrice hessienne et du vecteur gradient}
                           procedure   directHessGrad(Func : TFuncNVar; X: TVector; Lbound, Ubound: Integer; G: TVector; H: TMatrix);
                           {lancer l'optimisation}
                           procedure   RunOptimization();
                           protected
                           {crer les structures de calcul}
                           procedure   createStructures(); override;
                           {vider les structures}
                           procedure   destroyStructures(); override;
                           {apprentissage proprement dit}
                           function    coreLearning(examples: TExamples): boolean; override;
                           public
                           procedure   getScore(example: integer; var postProba: TTabScore); override;
                           function    getHTMLResults(): string; override;
                           end;


implementation

USES
        Windows, Sysutils, Classes, Math,
        fmath,
        d6_optim,
        UConstConfiguration, UStringsResources, ULogFile;

CONST
        MAX_ITER_OPTMIZATION_LOGISTIC = 50;
        TOLERANCE_OPTIMIZATION_LOGISTIC = 1.0e-3;

        MIN_VALUE_FOR_STDDEV = 1.0e-6;


{ TMLGCompLogisticReg }

procedure TMLGCompLogisticReg.GenCompInitializations;
begin
 FMLComp:= mlcSpvLearning;
end;

function TMLGCompLogisticReg.GetClassMLComponent: TClassMLComponent;
begin
 result:= TMLCompLogisticReg;
end;

{ TMLCompLogisticReg }

function TMLCompLogisticReg.getClassOperator: TClassOperator;
begin
 result:= TOpLogisticReg;
end;

function TMLCompLogisticReg.GetLogResultDescription: string;
begin
 result:= format('Logistic regression result generated [%s]',[self.Description]);
end;

{ TOpLogisticReg }

function TOpLogisticReg.ConnectClassAtt(prmData: TMLDataset): boolean;
var ok: boolean;
begin
 //discret ?
 ok:= inherited ConnectClassAtt(prmData);
 //binaire ?
 {$B-}
 ok:= ok and (ClassAttribute.nbValues = 2);
 //retour
 result:= ok;
end;

function TOpLogisticReg.getClassParameter: TClassOperatorParameter;
begin
 result:= TOpPrmLogisticReg;
end;

function TOpLogisticReg.getClassSpvLearning: TClassCalcSpvLearning;
begin
 result:= TCalcLogisticReg;
end;

{ TOpPrmLogisticReg }

function TOpPrmLogisticReg.CreateDlgParameters: TForm;
begin
 result:= NIL;
end;

{ TCalcLogisticReg }

procedure TCalcLogisticReg.CalcStats;
var i: integer;
    a,v,vraisemblance: double;
    tmpVec: TVector;
    (*
    H: TMatrix;
    example: integer;
    j1,j2,j: integer;
    value,sum_rapport,rapport,z,z1,z2: extended;
    *)
begin
 //la vraisemblance avec un seul coef
 //si l'quation tait une simple constante, le coef estim directement sera
 v:= StatClassAtt.TabFreq.Frequence[2];
 a:= ln(v/(1.0-v));
 //calculer la vraisemblance de ce modle
 dimVector(tmpVec,Descriptors.Count);
 tmpVec[0]:= a;
 //vraisemblance du modle constitu uniquement de la constante
 vraisemblance:= self.Log_Likelihood(tmpVec);
 //>>delVector(tmpVec,Descriptors.Count);
 tmpVec:= nil;
 //le chi-2
 FChi2:= vraisemblance-FMinLogLikelihood;
 //proba associe
 FPrChi2:= fmath.PKHI2(Descriptors.Count,FChi2);

 //******************************************************************
 //recalcul de la matrice hessienne avec les donnes non-transformes
 //pas ncessaire finalement....

 (*
 //remplissage du tableau des probabilits individuelles Pk
 for i:= 1 to FCurExamples.Size do
  begin
   example:= FCurExamples.Number[i];
   //produit scalaire trs simple
   value:= FCoef[0];
   for j:= 1 to self.Descriptors.Count do
    value:= value + FCoef[j] * self.descriptors.Attribute[pred(j)].cValue[example];
   //fonction logistique
   value:= 1.0 / (1.0 + exp(-1.0 * value));
   //rcupration de la valeur
   FTabPk[i]:= value;
  end;

 dimMatrix(H,self.descriptors.count,self.descriptors.count);

 for j1:= 0 to self.Descriptors.Count do
  for j2:= j1 to self.Descriptors.Count do
   begin
    sum_rapport:= 0.0;
    for i:= 1 to FCurExamples.Size do
     begin
      example:= FCurExamples.Number[i];
      z1:= 1.0;
      if (j1 > 0) then z1:= self.descriptors.Attribute[pred(j1)].cValue[example];
      z2:= 1.0;
      if (j2 > 0) then z2:= self.descriptors.Attribute[pred(j2)].cValue[example];
      rapport:= z1 * z2 * FTabPk[i] * (1.0 - FTabPk[i]);
      //somme
      sum_rapport:= sum_rapport + rapport;
     end;
    //affectation
    H[j1,j2]:= sum_rapport;
    H[j2,j1]:= sum_rapport;
   end;

 InvMat(H,0,self.Descriptors.Count,FHInv);
 *)
 //*********************************************

 //l'cart-type
 // >> on peut le faire sans recalcul, mais attention souci avec l'cart-type de la constante
 for i:= 0 to descriptors.Count do
  if (FHInv[i][i]<=0.0) or (FReduction[i] < MIN_VALUE_FOR_STDDEV)
   then FStdDev[i]:= 0.0
   else FStdDev[i]:= SQRT(1.0*FHInv[i][i]) / FReduction[i];
end;

procedure TCalcLogisticReg.getScore(example: integer;
  var postProba: TTabScore);
var v,proba: double;
    j: integer;
begin
 //new -- 03/05/2006 -- calculer de la proba P(Y=1/X), premire modalit de Y
 //>>on a chang les dignes des coefs. pour cela
 //suite...
 //combinaison linaire
 v:= FCoef[0];
 for j:= 1 to descriptors.Count do
  v:= v+FCoef[j]*descriptors.Attribute[pred(j)].cValue[example];
 //la proba
 proba:= 1.0/(1.0+exp(-v));
 //affecation
 postProba[0]:= 1.0;
 postProba[1]:= proba;//<<  la place de "1.0-proba"
 postProba[2]:= 1.0-proba;//<<  la place de "proba"
end;

function TCalcLogisticReg.coreLearning(examples: TExamples): boolean;
begin
 result:= true;
 try
 FCurExamples:= examples;
 //new -- 03/05/2006 -- calcul des stat. descriptives pour centrage et rduction
 if assigned(FStatDes) then FStatDes.Free;
 FStatDes:= TLstCalcStatDesContinuous.Create(self.Descriptors,examples);
 //initialisation des coefficients
 self.initCoefs();
 //calcul des paramtres de centrage et rduction
 //!\ attention, les coefs. initiaux peuvent tre mis  zro si on dsactive une variable qui s'avre tre une constante
 self.computePrmCentrageReduction();
 //tableau des probas individuelles, on s'en sert comment tampon pour viter les re-calculs
 setLength(FTabPk,succ(examples.Size));
 //optimimisation....
 self.RunOptimization();
 //calcul des statistiques, wald, chi-2, etc.
 self.CalcStats();
 except
 result:= false;
 end;
end;

procedure TCalcLogisticReg.createStructures;
begin
 dimVector(FCoef,Descriptors.Count);
 dimMatrix(FHInv,Descriptors.Count,Descriptors.Count);
 dimVector(FStdDev,Descriptors.Count);
end;

procedure TCalcLogisticReg.destroyStructures;
begin
 //>
 //delVector(FCoef,Descriptors.Count);
 FCoef:= nil;
 //delMatrix(FHInv,Descriptors.Count,Descriptors.Count);
 FHInv:= nil;
 //delVector(FStdDev,Descriptors.Count);
 FStdDev:= nil;
 //new tableaux pour centrage-rduction -- 03/05/2006
 Finalize(FCentrage);
 Finalize(FReduction);
 if assigned(FStatDes) then FStatDes.Free;
 Finalize(FTabPk);
end;

//*** code couleur pour la p-value
function codeCouleur(alpha: double): string;
begin
 if (alpha < SIGNIFICANCE_LEVEL_FOR_TESTS)
  then result:= HTML_BGCOLOR_DATA_RED
  else result:= HTML_BGCOLOR_DATA_GRAY;
end;
//***

function TCalcLogisticReg.getHTMLResults: string;
var s: string;
    Wald,pWald: double;
    i: integer;
begin
 s:= '';
 if (FErrCodeOptimization = OPT_NON_CONV)
  then s:= Format('<BLINK>Carreful -- No convergence after %d iterations </BLINK>',[MAX_ITER_OPTMIZATION_LOGISTIC]);

 s:= s+'<P><H3>Adjustement quality</H3><BR>';
 // complter la premire partie des rsultats
 s:= s+HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+format('<TD>Predicted attribute</TD><TD>%s</TD></TR>',[ClassAttribute.Name]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+format('<TD>Number of examples</TD><TD>%d</TD></TR>',[FCurExamples.Size]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>-2 Log Likelihood</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FMinLogLikelihood]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>Chi-2</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FChi2]);
 s:= s+HTML_TABLE_COLOR_DATA_BLUE+'<TD>P(>Chi-2)</TD>'+format('<TD>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FPrChi2]); 
 s:= s+'</table>';
 //les coefficients
 s:= s+'<P><H3>Attributes in the equation</H3><BR>';
 s:= s+HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_HEADER_GRAY+
       '<TH width=120>Attribute</TH><TH width=80>Coef.</TH><TH width=80>Std-dev</TH><TH width=80>Wald</TH><TH width=80>Signif</TH></TR>';
 for i:= 0 to descriptors.count do
  begin
   s:= s+HTML_TABLE_COLOR_DATA_GRAY;
   if (i=0)
    then s:= s+'<TD>constant</TD>'
    else s:= s+'<TD>'+descriptors.Attribute[pred(i)].Name+'</TD>';
   //statistique de wald
   Wald:= 0.0;
   pWald:= 0.0;
   TRY
   if (FStdDev[i] > 0.0)
    then Wald:= SQR(FCoef[i]/FStdDev[i])
    else Wald:= 0.0;
   if (Wald > 0.0)
    then pWald:= fmath.PKHI2(1,Wald)
    else pWald:= 0.0;
   EXCEPT
   END;
   //envoyer la sauce
   if (i = 0)
    then s:= s + format('<TD align=right>%.6f</TD><TD align=center>-</TD><TD align=center>-</TD><TD align=center>-</TD>',[FCoef[i]])
    else
     s:= s+format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD>'+
                  '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                  '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                  '<TD align=right %s>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',
                  [FCoef[i],FStdDev[i],Wald,codeCouleur(pWald),pWald]);
   s:= s+'</TR>';
  end;
 s:= s+'</table>';

 //odds ratio et intervalle de confiance  5%
 s:= s+'<P><H3>Odds ratios and 95% confidence intervals</H3><BR>';
 s:= s+HTML_HEADER_TABLE_RESULT;
 s:= s+HTML_TABLE_COLOR_HEADER_GRAY+
       '<TH width=120>Attribute</TH><TH width=80>Coef.</TH><TH width=80>Low</TH><TH width=80>High</TH></TR>';
 for i:= 1 to descriptors.count do
  begin
   s:= s+HTML_TABLE_COLOR_DATA_GRAY;
   s:= s+'<TD>'+descriptors.Attribute[pred(i)].Name+'</TD>';
   s:= s+format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>'+
                '<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',
                [exp(FCoef[i]),exp(FCoef[i]-fmath.InvNorm(0.975)*FStdDev[i]),exp(FCoef[i]+fmath.InvNorm(0.975)*FStdDev[i])]);
  end;
 s:= s + '</table>';
 //renvoyer le tout
 result:= s;
end;

procedure TCalcLogisticReg.initCoefs;
var i: integer;
begin
 for i:= 0 to descriptors.Count do
  FCoef[i]:= 0.5;
end;

function TCalcLogisticReg.Log_Likelihood(X: TVector): d6_fmath.float;
var i,j: integer;
    //s,v,tmpY: double;
    s,v,tmpY: extended;//pour prvenir les dbordements de capacit...
    example: integer;
begin
 TRY
 s:= 0.0;
 //pour chaque observation
 for i:= 1 to FCurExamples.Size do
  begin
   example:= FCurExamples.Number[i];
   //**** produit scalaire ****
   //constante
   v:= X[0];
   for j:= 1 to descriptors.Count do
    v:= v + X[j] * self.getCorrectedValue(pred(j),example);
   //hypothse implicite, Y est cod y1=1 et y2=2, c'est ce qui utilis dans la structure LSTVALUES
   //proba P(Y=2/X)
   v:= 1.0 / (1.0 + exp(-1.0 * v));
   if (v > 0.0) and (v < 1.0)
    then
     begin
      //log vraisemblance
      //1.0-1.0=0.0 et 2.0-1.0=1.0 au cas o on en douterait
      //en tous les cas, c'est nettement plus rapide qu'un IF
      tmpY:= ClassAttribute.cValue[example]-1.0;
      v:= tmpY*ln(v)+(1.0-tmpY)*ln(1.0-v);
      //somme de la vraisemblance
      s:= s+v;
     end;
  end;
 //pour la minimisation, on passe au ngatif
 result:= -2.0*s;
 EXCEPT
   on e: exception do
   begin
    TraceLog.WriteToLogFile(Format('[LOGREG] error LL function == %s',[e.Message]));
    result:= 0.0001*MATH.MaxDouble;
   end;
 END;
end;

procedure TCalcLogisticReg.RunOptimization;
var j: integer;
begin
 //optimisation sur les variables centres
 //FErrCodeOptimization:= d6_optim.Marquardt(Log_Likelihood,d6_optim.NumHessGrad,FCoef,0,descriptors.Count,MAX_ITER_OPTMIZATION_LOGISTIC,TOLERANCE_OPTIMIZATION_LOGISTIC,FMinLogLikelihood,FHInv);
 FErrCodeOptimization:= d6_optim.MarquardtObject(Log_Likelihood,directHessGrad,FCoef,0,descriptors.Count,MAX_ITER_OPTMIZATION_LOGISTIC,TOLERANCE_OPTIMIZATION_LOGISTIC,FMinLogLikelihood,FHInv);
 //correction des coefficients
 for j:= 1 to self.Descriptors.Count do
  begin
   //variable considre comme constante ?
   if (FReduction[j] < MIN_VALUE_FOR_STDDEV)
    then FCoef[j]:= 0.0
    else
     begin
      FCoef[j]:= FCoef[j] / FReduction[j];
      FCoef[0]:= FCoef[0] - FCoef[j] * FCentrage[j];
     end;
  end;
 //new -- 03/05/2006 -- corriger les coefs. pour qu'ils fournissent en priorit la proba P(Y=1/X) de la premire modalit donc...
 for j:= 0 to self.Descriptors.Count do
  FCoef[j]:= -1.0 * FCoef[j];
end;

procedure TCalcLogisticReg.directHessGrad(Func: TFuncNVar; X: TVector;
  Lbound, Ubound: Integer; G: TVector; H: TMatrix);
var j,j1,j2,i: integer;
    rapport,value,sum_rapport: extended;
    example: integer;
    y,z,z1,z2: extended;
    //s: string;
begin
 //**************************
 //remplissage du tableau des probabilits individuelles Pk
 for i:= 1 to FCurExamples.Size do
  begin
   example:= FCurExamples.Number[i];
   //produit scalaire trs simple
   value:= X[0];
   for j:= 1 to self.Descriptors.Count do
    value:= value + X[j] * self.getCorrectedValue(pred(j),example);
   //fonction logistique
   value:= 1.0 / (1.0 + exp(-1.0 * value));
   //rcupration de la valeur
   FTabPk[i]:= value;
  end;

 //**************************
 //vecteur gradient -- eq. (4)
 for j:= LBound to UBound do
  begin
   sum_rapport:= 0.0;
   for i:= 1 to FCurExamples.Size do
    begin
     example:= FCurExamples.Number[i];
     //y en 0/1
     y:= ClassAttribute.cValue[example] - 1.0;
     //somme des produits
     z:= 1.0;
     if (j > 0) then z:= self.getCorrectedValue(pred(j),example);
     sum_rapport:= sum_rapport + (y - FTabPk[i]) * z;
    end;
   //et donc le vecteur gradient
   G[j]:= sum_rapport;
  end;
  
 //*********************************
 //matrice hessienne -- eq. suivante
 for j1:= LBound to UBound do
  for j2:= j1 to UBound do
   begin
    sum_rapport:= 0.0;
    for i:= 1 to FCurExamples.Size do
     begin
      example:= FCurExamples.Number[i];
      z1:= 1.0;
      if (j1 > 0) then z1:= self.getCorrectedValue(pred(j1),example);
      z2:= 1.0;
      if (j2 > 0) then z2:= self.getCorrectedValue(pred(j2),example);
      rapport:= z1 * z2 * FTabPk[i] * (1.0 - FTabPk[i]);
      //somme
      sum_rapport:= sum_rapport + rapport; 
     end;
    //affectation
    H[j1,j2]:= sum_rapport;
    H[j2,j1]:= sum_rapport;
   end;

 (*
 s:= 'H = [';
 for i:= LBound to UBound do
  s:= s + format('%.4f, ',[H[1,i]]);
 s:= copy(s,1,length(s)-2) + ']';
 TraceLog.WriteToLogFile('LOGISTIC-HESS --> '+s);
 *)
end;

procedure TCalcLogisticReg.computePrmCentrageReduction;
var j: integer;
begin
 setlength(FCentrage,FStatDes.Count+1);
 setLength(FReduction,FStatDes.Count+1);
 //pour la constante
 FCentrage[0]:= 0;
 FReduction[0]:= 1;
 for j:= 0 to pred(FStatDes.Count) do
  begin
   FCentrage[succ(j)]:= (FStatDes.Stat(j) as TCalcStatDesContinuous).Average;
   FReduction[succ(j)]:= (FStatDes.Stat(j) as TCalcStatDesContinuous).StdDev;
   //!\ si la varible est quasi-constante, le coef. associ est corrig
   if (FReduction[succ(j)] < MIN_VALUE_FOR_STDDEV)
    then FCoef[succ(j)]:= 0.0;
  end;
end;

function TCalcLogisticReg.getCorrectedValue(j, example: integer): float;
begin
 if (FReduction[succ(j)] < MIN_VALUE_FOR_STDDEV)
  then result:= 0
  else result:= (self.Descriptors.Attribute[j].cValue[example] - FCentrage[succ(j)]) / FReduction[succ(j)];
end;

initialization
 Classes.RegisterClass(TMLGCompLogisticReg);
end.
