(****************************************************************************)
(* UCompSpvDiscriminantAnalysis.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(****************************************************************************)

{
@abstract(Analyse discriminante linaire)
@author(Ricco)
@created(12/01/2004)
Trs simple au dpart, on ne s'occupe que de la discrimination linaire et pas de slection de variables. Qqs
indicateurs pourront trea jouts plus tard, ex. Lambda de Wilks, les F pour chaque exogne, etc...  voir galement
l'eventualit de parler des axes factoriels (analyse disc canonique) ???

New -- 08/08/2005 -- Une classe MANOVA a t mis en place, du coup une grande partie des calculs a t transfre
dans cette nouvelle classe: calcul de la matrice de var-covar INTRA, les stats. conditionnelles, etc.
}
unit UCompSpvDiscriminantAnalysis;

interface

USES
        Forms, Contnrs,
        Matrices,
        UCompDefinition,
        UCompSpvLDefinition,
        UOperatorDefinition,
        UDatasetDefinition,
        UDatasetImplementation,
        UDatasetExamples,
        UCalcDistribution,
        UCalcSpvStructScore,
        UCalcStatDesManova;

TYPE
        //numration des stats associes  la LDA
        TEnumStatLDA = (stLda_lambdaWilks,stLda_correlation_ratio,stLda_F,stLda_pValue);

TYPE
        {le gnrateur de composant Supervised}
        TMLGCompLDiscAnalysis = class(TMLGenCompSpvLearning)
                                protected
                                procedure   GenCompInitializations(); override;
                                public
                                function    GetClassMLComponent: TClassMLComponent; override;
                                end;

        {le composant Spv}
        TMLCompLDiscAnalysis = class(TMLCompSpvLearning)
                                protected
                                function    getClassOperator: TClassOperator; override;
                                function    GetLogResultDescription(): string; override;
                                end;

        {l'oprateur}
        TOpLDiscAnalysis = class(TOpSpvLearningContinuous)
                            protected
                            function    getClassParameter: TClassOperatorParameter; override;
                            function    getClassSpvLearning(): TClassCalcSpvLearning; override;
                            end;

        {paramtrage de l'oprateur}
        TOpPrmLDiscAnalysis = class(TOpPrmSpvLearning)
                              protected
                              {pas de paramtres pour l'instant, ajouter les options plus tard}
                              function    CreateDlgParameters(): TForm; override;
                              end;

        {la classe de calcul, un petit baysien naf fera l'affaire, on prend le produit simple !!!}
        TCalcLDiscAnalysis = class(TCalcSpvLearning)
                              private
                              FNbExamples: integer;
                              FCurExamples: TExamples;
                              FMatIntraVCV: PMatrix;
                              //statistiques descriptives conditionnelles et MANOVA
                              FManova: TCalcSDCondManova;
                              //stats  afficher
                              FTabStatCoef: array of array of double;
                              //calculer la distribution des classes
                              procedure CalcPriorDist();
                              procedure CalcVCVIntra();
                              procedure CalcScores();
                              procedure calcStatsCoefs();
                              procedure DelTmpStructures();
                              procedure normalizeProbas(var postProba: TTabScore);
                              protected
                              {distribution des classes}
                              FDistClass: TTabFrequence;
                              {fonction score}
                              FMatScores: PMatrix;
                              {tableau temporaire pour le calcul des probas}
                              FTmpProba: array of extended;
                              {crer la liste}
                              procedure   createStructures(); override;
                              {vider la liste}
                              procedure   destroyStructures(); override;
                              {apprentissage proprement dit}
                              function    coreLearning(examples: TExamples): boolean; override;
                              public
                              procedure   getScore(example: integer; var postProba: TTabScore); override;
                              function    getHTMLResults(): string; override;
                             end;


implementation

USES
        Classes, Sysutils, UCalcStatDes, UCalcMatrixToAttributes,
        UConstConfiguration, UStringsResources, FMath;

{ TMLGCompLDiscAnalysis }

procedure TMLGCompLDiscAnalysis.GenCompInitializations;
begin
 FMLComp:= mlcSpvLearning;
end;

function TMLGCompLDiscAnalysis.GetClassMLComponent: TClassMLComponent;
begin
 result:= TMLCompLDiscAnalysis;
end;

{ TMLCompLDiscAnalysis }

function TMLCompLDiscAnalysis.getClassOperator: TClassOperator;
begin
 result:= TOpLDiscAnalysis;
end;

function TMLCompLDiscAnalysis.GetLogResultDescription: string;
begin
 result:= format('Linear discriminant result generated [%s]',[self.Description]);
end;

{ TOpLDiscAnalysis }

function TOpLDiscAnalysis.getClassParameter: TClassOperatorParameter;
begin
 result:= TOpPrmLDiscAnalysis;
end;

function TOpLDiscAnalysis.getClassSpvLearning: TClassCalcSpvLearning;
begin
 result:= TCalcLDiscAnalysis;
end;

{ TOpPrmLDiscAnalysis }

function TOpPrmLDiscAnalysis.CreateDlgParameters: TForm;
begin
 result:= NIL;
end;

{ TCalcLDiscAnalysis }

procedure TCalcLDiscAnalysis.CalcPriorDist;
begin
 FDistClass.Refresh(FCurExamples);
end;

procedure TCalcLDiscAnalysis.CalcScores;
var err: integer;
    invVCV: PMatrix;
    k,i,j: integer;
    stats: TLstCalcStatDesContinuous;
    c,s: double;
begin
 try
 dimMatrix(invVCV,Descriptors.Count,Descriptors.Count);
 //inversion de la matrice de variance covariance, s'il y a pbm, ce sera ici
 err:= invMat(FMatIntraVCV,1,Descriptors.Count,invVCV);
 if (err=MAT_SINGUL)
  then raise Exception.Create('singular matrix');
 //si on est l c'est que tout est bon
 for k:= 1 to ClassAttribute.nbValues do
  begin
   stats:= FManova.getStatGroup(pred(k));
   //calculer M'VM avec M vecteur des moyennes, V l'inverse de la matrice de var-covar
   //c'est la constante du score
   c:= 0.0;
   for j:= 1 to Descriptors.Count do
    begin
     s:= 0.0;
     for i:= 1 to Descriptors.Count do
      s:= s+TCalcStatDesContinuous(stats.Stat(pred(i))).Average*invVCV^[i]^[j];
     c:= c+s*TCalcStatDesContinuous(stats.Stat(pred(j))).Average;
    end;
   //cf. formule Saporta pp.420
   FMatScores^[0]^[k]:= -0.5*c+ln(FDistClass.Frequence[k]);
   //pour chaque descripteur
   for j:= 1 to Descriptors.Count do
    begin
     c:= 0.0;
     for i:= 1 to Descriptors.Count do
      c:= c+TCalcStatDesContinuous(stats.Stat(pred(i))).Average*invVCV^[i]^[j];
     FMatScores^[j]^[k]:= c;
    end;
  end;
 finally
 delMatrix(invVCV,Descriptors.Count,Descriptors.Count);
 end;
end;

procedure TCalcLDiscAnalysis.CalcVCVIntra;
var i,j: integer;
    v: double;
begin
 //rcuprer la mvcv-intra brute
 DimMatrix(FMatIntraVCV,Descriptors.Count,Descriptors.Count);
 CopyMatrix(FMatIntraVCV,FManova.MatWBrut,1,1,Descriptors.Count,Descriptors.Count);
 //pondrer pour avoir un estimateur sans biais
 v:= 1.0/(1.0*(FCurExamples.Size-ClassAttribute.nbValues));
 for i:= 1 to Descriptors.Count do
  for j:= 1 to Descriptors.Count do
   FMatIntraVCV^[i]^[j]:= FMatIntraVCV^[i]^[j]*v;
end;

procedure TCalcLDiscAnalysis.getScore(example: integer;
  var postProba: TTabScore);
var k,j: integer;
    s,v: double;
begin
 //produit scalaire direct pour chaque modalit de l'endogne et recherche du max.
 for k:= 1 to ClassAttribute.nbValues do
  begin
   //score de la modalit k
   s:= FMatScores^[0]^[k];
   for j:= 1 to Descriptors.Count do
    begin
     v:= Descriptors.Attribute[pred(j)].cValue[example];
     s:= s+FMatScores^[j]^[k]*v;
    end;
   //affectation
   postProba[k]:= s;
  end;
 //new -- 16/08/2005 -- normalisation maison par les rapports d'exponentielles
 //cela permet d'viter les risques de dbordements dues  la fonction exponentielle
 self.normalizeProbas(postProba);
 //la normalisation est ralise par l'appelant
 //postProba.normalize();
end;

function TCalcLDiscAnalysis.coreLearning(examples: TExamples): boolean;
begin
 result:= true;
 try
 FNbExamples:= examples.Size;
 //new -- 08/08/2005 -- calcul MANOVA
 if assigned(FManova) then FManova.Free();
 FManova:= TCalcSDCondManova.create(self.ClassAttribute,self.Descriptors,examples);
 //suite standard
 FCurExamples:= examples;
 CalcPriorDist();
 CalcVCVIntra();
 CalcScores();
 //new -- 08/08/2005 -- calculer les Lambda de Wilks partiels, et les tests statistiques associs
 //si trop long  calculer, une option serait la bienvenue ? --  voir !
 calcStatsCoefs();
 //suite...
 DelTmpStructures();
 except
 result:= false;
 end;
end;

procedure TCalcLDiscAnalysis.createStructures;
begin
 //distribution de l'endogne
 FDistClass:= TTabFrequence.CreateFromAtt(ClassAttribute,NIL);
 //prparer la matrice des scores, la ligne 0 sera utilise ici (pour la constante)
 dimMatrix(FMatScores,Descriptors.Count,ClassAttribute.nbValues);
 //tableau des stats
 setLength(FTabStatCoef,Descriptors.Count,ord(high(TEnumStatLDA))+1);
 //tableau temporaire des probas
 setLength(FTmpProba,succ(ClassAttribute.nbValues));
end;

procedure TCalcLDiscAnalysis.DelTmpStructures;
begin
 if assigned(FMatIntraVCV) then delMatrix(FMatIntraVCV,Descriptors.Count,Descriptors.Count);
end;

procedure TCalcLDiscAnalysis.destroyStructures;
begin
 if (FTmpProba <> nil) then Finalize(FTmpProba);
 if (FTabStatCoef <> nil) then Finalize(FTabStatCoef);
 if assigned(FMatScores)
  then delMatrix(FMatScores,Descriptors.Count,ClassAttribute.nbValues);
 FDistClass.Free;
end;

function codeCouleur(alpha: double): string;
begin
 if (alpha < SIGNIFICANCE_LEVEL_FOR_TESTS)
  then result:= HTML_BGCOLOR_DATA_RED
  else result:= HTML_BGCOLOR_DATA_GREEN; 
end;

function TCalcLDiscAnalysis.getHTMLResults: string;
var s: string;
    k,j: integer;
begin
 s:= s + '<H3>MANOVA</H3>';
 s:= s + FManova.getHTMLStatisticalTests();
 s:= s + '<h3>LDA Summary</h3>';
 s:= s + HTML_HEADER_TABLE_RESULT;
 //1er niveau en-tte
 s:= s + HTML_TABLE_COLOR_HEADER_GRAY+format('<TH>&nbsp;</TH><TH colspan=%d>Classification functions</TH><TH colspan=4>Statistical Evaluation</TH></TR>',[ClassAttribute.nbValues]);
 s:= s + HTML_TABLE_COLOR_HEADER_BLUE+'<TH width=80>Attribute</TH>';
 //2me niveau en-tte
 for k:= 1 to ClassAttribute.nbValues do
  s:= s+format('<TH width=70>%s</TH>',[ClassAttribute.LstValues.getDescription(k)]);
 //stats d'valuation, attention pour le F, l'en-tte intgre les infos sur les DDL
 s:= s + format('<TH width=70 %s>Wilks L.</TH>',[HTML_BGCOLOR_HEADER_GREEN]);
 s:= s + format('<TH width=70 %s>Partial L.</TH>',[HTML_BGCOLOR_HEADER_GREEN]);
 //Saporta, p.424 --> ddl de la loi de Fisher
 s:= s + format('<TH width=70 %s>F(%d,%d)</TH>',[HTML_BGCOLOR_HEADER_GREEN,ClassAttribute.nbValues-1,FNbExamples-ClassAttribute.nbValues-(Descriptors.Count-1)]);
 s:= s + format('<TH width=70 %s>p-value</TH>',[HTML_BGCOLOR_HEADER_GREEN]);
 s:= s+'</TR>';
 //pour chaque attribut
 for j:= 1 to Descriptors.Count do
  begin
   s:= s+HTML_TABLE_COLOR_DATA_BLUE;
   s:= s+format('<TD %s>%s</TD>',[HTML_BGCOLOR_HEADER_GRAY,Descriptors.Attribute[pred(j)].Name]);
   for k:= 1 to ClassAttribute.nbValues do
    s:= s+format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD>',[FMatScores^[j]^[k]]);
   //les stats d'valuation --> (stLda_lambdaWilks,stLda_correlation_ratio,stLda_F,stLda_pValue);
   s:= s + format('<TD align="right" %s>%.6f</TD>',[HTML_BGCOLOR_DATA_GREEN,FTabStatCoef[pred(j),ord(stLda_lambdaWilks)]]);
   s:= s + format('<TD align="right" %s>%.6f</TD>',[HTML_BGCOLOR_DATA_GREEN,FTabStatCoef[pred(j),ord(stLda_correlation_ratio)]]);
   s:= s + format('<TD align="right" %s>%.5f</TD>',[HTML_BGCOLOR_DATA_GREEN,FTabStatCoef[pred(j),ord(stLda_F)]]);
   s:= s + format('<TD align="right" %s>%.6f</TD>',[codeCouleur(FTabStatCoef[pred(j),ord(stLda_pValue)]),FTabStatCoef[pred(j),ord(stLda_pValue)]]);
   s:= s+'</TR>';
  end;
 //la constante
 s:= s+HTML_TABLE_COLOR_DATA_BLUE;
 s:= s+format('<TD %s>%s</TD>',[HTML_BGCOLOR_HEADER_GRAY,'constant']);
 for k:= 1 to ClassAttribute.nbValues do
  s:= s+format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD>',[FMatScores^[0]^[k]]);
 s:= s + format('<TD colspan=4 %s align=center>-</TD>',[HTML_BGCOLOR_HEADER_GRAY]);
 s:= s+'</TR>';
 s:= s+'</table>';
 //and then...
 result:= s;
end;

procedure TCalcLDiscAnalysis.calcStatsCoefs;
var T,W: PMatrix;
    tmpT,tmpW: PMatrix;
    i,j,desc,iM,jM: integer;
    v: double;
    tmpWilks: double;
begin
 T:= FManova.MatT;
 dimMatrix(W,Descriptors.Count,Descriptors.Count);
 copyMatrix(W,FManova.MatWBrut,1,1,Descriptors.Count,Descriptors.Count);
 //pondrer par l'effectif total dans ce cas
 v:= 1.0/(1.0*FCurExamples.Size);
 for i:= 1 to Descriptors.Count do
  for j:= 1 to Descriptors.Count do
   W^[i]^[j]:= W^[i]^[j]*v;
 //crer les matrices intermdiaires
 dimMatrix(tmpT,Descriptors.Count-1,Descriptors.Count-1);
 dimMatrix(tmpW,Descriptors.Count-1,Descriptors.Count-1);
 //***********************
 //pour chaque descripteur
 //***********************
 for desc:= 1 to Descriptors.Count do
  begin
   iM:= 0;
   //recopier uniquement les colonnes adquates
   for i:= 1 to Descriptors.Count do
    begin
     //si c'est la colonne de travail -- pas de calcul
     if (i <> desc)
      then
       begin
        inc(iM);
        jM:= 0;
        for j:= 1 to Descriptors.Count do
         begin
          if (j <> desc)
           then
            begin
             inc(jM);
             tmpT^[iM]^[jM]:= T^[i]^[j];
             tmpW^[iM]^[jM]:= W^[i]^[j];
            end;
         end;
       end;
    end;
   //calculer le lambda de Wilks associ --> !!! la dimension est rduite de 1
   tmpWilks:= FManova.computeWilks(tmpT,tmpW,Descriptors.Count-1);
   FTabStatCoef[pred(desc),ord(stLda_lambdaWilks)]:= tmpWilks;
   if (tmpWilks > 0.0)
    then
     begin
      //rapport entre les lambda de Wilks : avec et sans la variable tudie
      FTabStatCoef[pred(desc),ord(stLda_correlation_ratio)]:= FManova.Wilks / tmpWilks;
      //Saporta, p.424 -- test F pour l'addition (ou le retrait) d'une variable supplmentaire
      //pbm d'approximation j'imagine, un choua de diffrence avec STATISTICA
      FTabStatCoef[pred(desc),ord(stLda_F)]:= (tmpWilks / FManova.Wilks - 1.0) * (1.0 * (FNbExamples-ClassAttribute.nbValues-(Descriptors.Count-1))) / (-1.0 + ClassAttribute.nbValues);
      FTabStatCoef[pred(desc),ord(stLda_pValue)]:= PSnedecor(ClassAttribute.nbValues-1,FNbExamples-ClassAttribute.nbValues-(Descriptors.Count-1),FTabStatCoef[pred(desc),ord(stLda_F)]);
     end;
  end;
 //***********************
 delMatrix(W,Descriptors.Count,Descriptors.Count);
 delMatrix(tmpT,Descriptors.Count-1,Descriptors.Count-1);
 delMatrix(tmpW,Descriptors.Count-1,Descriptors.Count-1);
end;

procedure TCalcLDiscAnalysis.normalizeProbas(var postProba: TTabScore);
var k,j: integer;
begin
 FillChar(FTmpProba[0],succ(ClassAttribute.nbValues) * sizeof(extended),0);
 //pour chaque modalit de la variables  prdire
 for k:= 1 to ClassAttribute.nbValues do
  begin
   FTmpProba[k]:= 1.0;
   for j:= 1 to ClassAttribute.nbValues do
    if (j <> k) then FTmpProba[k]:= FTmpProba[k] + EXP(postProba.value[j]-postProba.value[k]);
   //1.0 auquel on a ajout des valeurs positives ne peut pas tre gal  0 ! pas besoin de vrifier
   FTmpProba[k]:= 1.0 / FTmpProba[k];
   FTmpProba[0]:= FTmpProba[0] + FTmpProba[k]; 
  end;
 //retransmettre
 for k:= 0 to ClassAttribute.nbValues do
  postProba.value[k]:= FTmpProba[k];
end;

initialization
 RegisterClass(TMLGCompLDiscAnalysis);
end.
