(*******************************************************************************)
(* UCalcStatDesVarianceHomogeneity.pas - Copyright (c) 2005 Ricco RAKOTOMALALA *)
(*******************************************************************************)

{
@abstract(Description conditionnelle d'une variable continue  partir d'une variable discrte -- Tests d'homognit de variances)
@author(Ricco)
@created(22/07/2005)

Principales REFS :
------------------
Site NIST --> http://www.itl.nist.gov/div898/handbook/eda/section3/eda35.htm
Et le livre de HOWELL --> http://www.uvm.edu/~dhowell/StatPages/StatHomePage.html

Rfs spcifiques
----------------
Comparaison de 2 variances (loi normale) -- Test de Fisher : http://www.cons-dev.org/elearning/stat/parametrique/5-2/5-2.html
Comparaison de K variances (loi normale) -- Test de Bartlett : http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm

Comparaison de K varianes -- Test de Levene : http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
--------- // ------------ -- Test de Brown and Forsythe : http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm

}
unit UCalcStatDesVarianceHomogeneity;

interface

USES
        UCalcStatDesConditionnalDesc,
        UDatasetExamples,
        UDatasetDefinition;

TYPE

        //classe de base pour la comparaison de variance
        TCalcSDVarianceHomogeneityTest = class(TCalcSDCondDesc)
                                         protected
                                         //indique que le calcul a t ralis correctement
                                         FOkCalcul: boolean;
                                         //calculer la statistique et la p-value
                                         procedure   computeStats(prmExamples: TExamples); virtual; abstract;
                                         //envoyer les infos spcifiques au test
                                         function    getHTMLTestResult(): string; virtual; abstract;
                                         public
                                         procedure   RefreshStat(prmExamples: TExamples); override;
                                         function    getHTMLResult(prmOption: integer = -1): string; override;
                                         end;

        //liste de stats conditionnelles
        TLstCalcSDVarianceHomogeneityTest = class(TLstCalcStatDesConditionnal)
                                            public
                                            //en-tte de la liste
                                            function    getHeaderHTML(): string; override;    
                                            end;    

        //Test de Fisher
        TCalcSDVHFisher = class(TCalcSDVarianceHomogeneityTest)
                          private
                          //Statistique de Fisher
                          FFisherStat: double;
                          //p-value
                          FFisherPValue: double;
                          //degrs de libert
                          FDfSup, FDfInf: integer;
                          protected
                          //rapport de variance donc...
                          procedure   computeStats(prmExamples: TExamples); override;
                          //infos sur les rsultats
                          function    getHTMLTestResult(): string; override;
                          public
                          //stat pour le tri
                          function    getStatForSorting(): double; override;
                          //p-value pour le tri
                          function    getPValueForSorting(): double; override;
                          end;

        //Test de Bartlett
        TCalcSDVHBartlett = class(TCalcSDVarianceHomogeneityTest)
                            private
                            //la stat. de Bartlett
                            FT: double;
                            //degr de libert
                            Fddl: integer;
                            //p-value
                            FPValue: double;
                            //pooled variance
                            FPooledVar: double;
                            protected
                            //rapport de variance donc...
                            procedure   computeStats(prmExamples: TExamples); override;
                            //infos sur les rsultats
                            function    getHTMLTestResult(): string; override;
                            public
                            //stat pour le tri
                            function    getStatForSorting(): double; override;
                            //p-value pour le tri
                            function    getPValueForSorting(): double; override;
                            end;



        //Test de Levene
        TCalcSDVHLevene = class(TCalcSDVarianceHomogeneityTest)
                          private
                          //la statistique
                          FW: double;
                          //ddlNum, ddlDenom
                          FddlNum, FddlDenom: integer;
                          //p-value
                          FPValue: double;
                          protected
                          //calculer le Z
                          function    computeZ(example: integer; group: integer): TTypeContinue; virtual;
                          //cration de la structure
                          procedure   CreateStructureCond(prmExamples: TExamples); override;
                          //calculer la statistique et la p-value -- inoprant ici
                          procedure   computeStats(prmExamples: TExamples); override;
                          //envoyer les infos spcifiques au test
                          function    getHTMLTestResult(): string; override;
                          public
                          procedure   RefreshStat(prmExamples: TExamples); override;
                          end;

        //Test de Brown & Forsythe
        TCalcSDVHBrownForsythe = class(TCalcSDVHLevene)
                                 protected
                                 //calculer le Z
                                 function    computeZ(example: integer; group: integer): TTypeContinue; override;
                                 //envoyer les infos spcifiques au test
                                 function    getHTMLTestResult(): string; override;
                                 end;


implementation

USES
        Sysutils, Math, Contnrs,
        UConstConfiguration, UCalcStatDes,
        UDatasetImplementation,
        FMath, UCalcStatDesContinuousPlus;

CONST
        //taille minimale accepte dans chaque groupe
        VAR_HOMOGENEITY_TEST_MIN_SIZE_OF_GROUP = 4;

{ TCalcSDVarianceHomogeneityTest }

function TCalcSDVarianceHomogeneityTest.getHTMLResult(
  prmOption: integer): string;
var s: string;
    k: integer;
begin
 //description sur une variable
 s:= format('<TD>%s</TD><TD>%s</TD>',[self.Attribute.Name,self.AttDescription.Name]);
 //les stat descriptives conditionelles
 s:= s+'<TD valign=top>'+HTML_HEADER_TABLE_RESULT+HTML_TABLE_COLOR_HEADER_BLUE+
       '<TH width=120>Value</TH><TH width=70>Examples</TH><TH width=70>Average</TH><TH width=70>Std-dev</TH></TR>';
 //pour chaque modalit de la variable de description
 for k:= 0 to pred(self.AttDescription.nbValues) do
  begin
   s:= s+HTML_TABLE_COLOR_DATA_BLUE+
         format('<TH>%s</TH><TD align=right>%d</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td>',
                [self.AttDescription.LstValues.GetDescription(succ(k)),StatCond[k].NbExamples,StatCond[k].Average,StatCond[k].StdDev])+'</tr>';
  end;
 //stat globales (sur tous les individus)
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+
       format('<TH>All</TH><TD align=right>%d</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td>',
              [self.NbExamples,self.StatGlobal.Average,self.StatGlobal.StdDev])+'</tr>';
 s:= s+'</table></td>';
 //les infos spcifiques au test
 if FOkCalcul
  then s:= s+'<TD valign=top>'+HTML_HEADER_TABLE_RESULT+HTML_TABLE_COLOR_HEADER_GREEN+'<TH colspan=2>Test</TH>'+self.getHTMLTestResult()+'</table></TD>'
  else s:= s+'<TD>an error occurs<br>perhaps too few examples on some groups</TD>';
 //renvoyer le tout
 result:= s;
end;

procedure TCalcSDVarianceHomogeneityTest.RefreshStat(
  prmExamples: TExamples);
var k: integer;
begin
 inherited RefreshStat(prmExamples);
 //calculer le nombre d'lments dans chaque groupe
 FOkCalcul:= true;
 for k:= 0 to pred(self.getNbGroups()) do
  FOkCalcul:= FOkCalcul and (self.StatCond[k].NbExamples >= VAR_HOMOGENEITY_TEST_MIN_SIZE_OF_GROUP);
 //calculer les indicateurs si OK
 if FOkCalcul
  then self.computeStats(prmExamples);
end;

{ TCalcSDVHFisher }

procedure TCalcSDVHFisher.computeStats(prmExamples: TExamples);
var st1, st2: TCalcStatDesContinuous;
begin
 //on assume le fait qu'il ne peut y avoir que 2 modalits !
 st1:= self.StatCond[0];
 st2:= self.StatCond[1];
 //le calcul est possible ?
 if (MIN(st1.Variance,st2.Variance) > 0.0)
  then
   begin
     //selon la valeur de la variance
     if (st1.Variance > st2.Variance)
      then
       begin
        //ddl
        FDfSup:= st1.NbExamples - 1;
        FDfInf:= st2.NbExamples - 1;
        //stat
        FFisherStat:= st1.Variance / st2.Variance;
       end
      else
       begin
        //ddl
        FDfSup:= st2.NbExamples - 1;
        FDfInf:= st1.NbExamples - 1;
        //stat
        FFisherStat:= st2.Variance / st1.Variance;
       end;
     //p-value -- facteur 2.0 car le test est bilatral --> H0 : sigma1 = sigma 2 vs. H1 : sigma1 <> sigma2
     FFisherPValue:= 2.0 * PSnedecor(FDfSup,FDfInf,FFisherStat);
   end
  else
   begin
    FFisherStat:= 0.0;
    FFisherPValue:= 1.0;
   end;
end;

function TCalcSDVHFisher.getHTMLTestResult: string;
var s: string;
begin
 s:= HTML_TABLE_COLOR_DATA_GREEN+format('<TD>Fisher</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FFisherStat]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>df</TD><TD align=right>%d/%d</TD></TR>',[FDfSup,FDfInf]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>p-value</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FFisherPValue]);
 //and then...
 result:= s;
end;

function TCalcSDVHFisher.getPValueForSorting: double;
begin
 result:= FFisherPValue;
end;

function TCalcSDVHFisher.getStatForSorting: double;
begin
 result:= FFisherStat; 
end;

{ TLstCalcSDVarianceHomogeneityTest }

function TLstCalcSDVarianceHomogeneityTest.getHeaderHTML: string;
begin
 result:= HTML_TABLE_COLOR_HEADER_GRAY+'<TH>Attribute_Y</TH><TH>Attribute_X</TH><TH>Description</TH><TH>Statistical test</TH></TR>';
end;

{ TCalcSDVHBartlett }

procedure TCalcSDVHBartlett.computeStats(prmExamples: TExamples);
var j: integer;
    K,N: double;
    num,denom: double;
begin
 //initialisations
 FPooledVar:= 0.0;
 FT:= 0.0;
 Fddl:= 0;
 FPValue:= 1.0;
 //calculs possibles ?
 if (self.NbExamples > self.getNbGroups())
  then
   begin
     //nombre d'exemples
     N:= 1.0*self.NbExamples;
     //nombre de modalits
     K:= 1.0*self.getNbGroups();
     //le degr de libert
     Fddl:= self.getNbGroups() - 1;
     //calculer la pooled variance
     FPooledVar:= 0.0;
     for j:= 0 to pred (self.getNbGroups()) do
      FPooledVar:= FPooledVar + (-1.0 + self.StatCond[j].NbExamples) * self.StatCond[j].Variance;
     FPooledVar:= FPooledVar / (N - K);
     //numrateur
     num:= 0.0;
     for j:= 0 to pred(self.getNbGroups()) do
      //scurisons
      if (self.StatCond[j].NbExamples > 1) and (self.StatCond[j].Variance > 0)
       then num:= num + (-1.0 + self.StatCond[j].NbExamples) * LN(self.StatCond[j].Variance);
     //ne peut pas tre ngatif
     num:= MATH.MAX(0.0 , (N - K) * LN(FPooledVar) - num);
     //dnominateur
     denom:= 0.0;
     for j:= 0 to pred(self.getNbGroups()) do
      if (self.StatCond[j].NbExamples > 1)
       //qu'en est-il du degr de liberts dans ce cas ???
       then denom:= denom + (1.0 / (-1.0 + self.StatCond[j].NbExamples))
       //rduire puisqu'on a moins de groupes -- #ToDo1 --  voir s'il existe de la doc. en ce sens, a semble logique en tous cas...
       else dec(Fddl);
     denom:= denom - (1.0 / (N - K));
     denom:= 1.0 + (1.0 / (3.0 * (K - 1.0))) *  denom;
     //non ngatif non plus
     denom:= MATH.MAX(0.0 , denom);
     //le T de Bartlett
     if (denom > 0.0)
      then FT:= num / denom
      else FT:= 0.0;
     //la p-value
     FPValue:= PKHI2(Fddl,FT);
   end;
end;

function TCalcSDVHBartlett.getHTMLTestResult: string;
var s: string;
begin
 s:= HTML_TABLE_COLOR_DATA_GREEN+format('<TD>Pooled var.</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FPooledVar]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>Bartlett''s T</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FT]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>df</TD><TD align=right>%d</TD></TR>',[Fddl]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>p-value</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD></TR>',[FPValue]);
 //and then...
 result:= s;
end;

function TCalcSDVHBartlett.getPValueForSorting: double;
begin
 result:= FPValue;
end;

function TCalcSDVHBartlett.getStatForSorting: double;
begin
 result:= FT;
end;

{ TCalcSDVHLevene }

procedure TCalcSDVHLevene.computeStats(prmExamples: TExamples);
begin
 //none -- court-circuit par "RefreshStat"
end;

function TCalcSDVHLevene.computeZ(example: integer; group: integer): TTypeContinue;
begin
 result:= ABS(self.Attribute.cValue[example] - self.StatCond[group].Average); 
end;

procedure TCalcSDVHLevene.CreateStructureCond(prmExamples: TExamples);
var i: integer;
    FSameAtts: TLstAttributes;
begin
 //stat globale sur l'attribut continu
 FStatCont:= TCalcStatDesContPlus.Create(Attribute,prmExamples);
 //cration d'une liste fictive d'attributs
 //sachant que c'est le mme attribut que l'on ajoute systmatiquement
 //astuce un peu norme mais bon...
 FSameAtts:= TLstAttributes.Create(FALSE,Attribute.Size);
 for i:= 0 to pred(FAttDesc.nbValues) do
  FSameAtts.Add(Attribute);
 //grosse diffrence -- pour avoir toutes les estimations (la MEDIANE entre autres !!!)
 FLstStatCont:= TLstCalcStatDesContinuousPlus.Create(FSameAtts,NIL);
 FSameAtts.Free;
 //puis remplir si demand
 if assigned(prmExamples)
  then self.RefreshStat(prmExamples);
end;

function TCalcSDVHLevene.getHTMLTestResult: string;
var s: string;
begin
 s:= HTML_TABLE_COLOR_DATA_GREEN+format('<TD>Levene''s W</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD></TR>',[FW]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>df</TD><TD align=right>%d/%d</TD></TR>',[FddlNum,FddlDenom]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>p-value</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD></TR>',[FPValue]);
 //and then...
 result:= s;
end;

procedure TCalcSDVHLevene.RefreshStat(prmExamples: TExamples);
var lstEx: TObjectList;
    tmpEx: TExamples;
    zAtt: TAttContinue;
    i,k,example: integer;
    zStat: TCalcSDCondDescANOVA;
    num,denom,denomGroup,zBarreGroup: double;
begin
 //comme l'appel  la mthode de l'anctre est court-circuit, il nous incombe ici de refaire qqs stats
 FNbExamples:= prmExamples.Size;
 //la stat. globale
 FStatCont.RefreshStat(prmExamples);
 //rcuprer les listes par groupe pour calculer la stat
 lstEx:= prmExamples.DispatchExamples(self.AttDescription);
 //tester le nombre d'individus par groupe avant de continuer
 FOkCalcul:= TRUE;
 for k:= 0 to pred(lstEx.Count) do
  FOkCalcul:= FOkCalcul and ((lstEx.Items[k] as TExamples).Size >= VAR_HOMOGENEITY_TEST_MIN_SIZE_OF_GROUP);
 //si tout va bien, on y va
 if FOkCalcul
  then
   begin
     //calculer les stats. par groupe
     for k:= 0 to pred(self.getNbGroups()) do
       (self.StatCond[k] as TCalcStatDesContPlus).RefreshStat(lstEx.Items[k] as TExamples);
     //construire la variable Z
     zAtt:= TAttContinue.Create('_temp_z',self.Attribute.Size);
     for i:= 1 to prmExamples.Size do
      begin
       example:= prmExamples.Number[i];
       //groupe
       k:= pred(self.AttDescription.dValue[example]);
       //z_example_k (groupe k, individu example dans le groupe k) -- notre stratgie est diffrente mais c'est la mme chose
       zAtt.cValue[example]:= self.computeZ(example,k);
      end;
     //calculer les stats. conditionnelles en Z -- tout le bnfice est l !
     zStat:= TCalcSDCondDescANOVA.Create(zAtt,self.AttDescription,prmExamples);

     //********************
     //* construire la stat
     //********************

     //le dnominateur
     denom:= 0.0;
     for k:= 0 to pred(zStat.getNbGroups()) do
      begin
       denomGroup:= 0.0;
       tmpEx:= lstEx.Items[k] as TExamples;
       zBarreGroup:= zStat.StatCond[k].Average;
       for i:= 1 to tmpEx.Size do
        denomGroup:= denomGroup + SQR(zAtt.cValue[tmpEx.Number[i]] - zBarreGroup);
       //additionner
       denom:= denom + denomGroup;
      end;
     //au final
     denom:= MATH.MAX(0 , (-1.0 + self.getNbGroups()) * denom);

     //le numrateur
     num:= 0.0;
     for k:= 0 to pred(zStat.getNbGroups()) do
      num:= num + 1.0 * zStat.StatCond[k].NbExamples * SQR(zStat.StatCond[k].Average - zStat.StatGlobal.Average);
     //au final
     num:= MATH.MAX(0 , (1.0 * zStat.NbExamples - 1.0 * zStat.getNbGroups()) * num);

     //les degree of freedom
     FddlNum:= zStat.getNbGroups() - 1;
     FddlDenom:= zStat.NbExamples - zStat.getNbGroups;

     //les indicteurs stats
     if (denom > 0.0)
      then
       begin
        FW:= num / denom;
        FPValue:= PSnedecor(FddlNum,FddlDenom,FW);
       end
      else
       begin
        FW:= 0.0;
        FPvalue:= 1.0;
       end;

     //reste  vider tout cela
     zStat.Free();
     zAtt.Free();
   end;
 //et vider celui-l (qui a t cr mme si le calcul n'a pas t effectu rellement)
 lstEx.Free();
end;

{ TCalcSDVHBrownForsythe }

function TCalcSDVHBrownForsythe.computeZ(example,
  group: integer): TTypeContinue;
begin
 //la seule ligne de code  changer
 result:= ABS(self.Attribute.cValue[example] - TCalcStatDesContPlus(self.StatCond[group]).Median);
end;

function TCalcSDVHBrownForsythe.getHTMLTestResult: string;
var s: string;
begin
 s:= HTML_TABLE_COLOR_DATA_GREEN+format('<TD>Brown & Forsythe''s W</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD></TR>',[FW]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>df</TD><TD align=right>%d/%d</TD></TR>',[FddlNum,FddlDenom]);
 s:= s+HTML_TABLE_COLOR_DATA_GREEN+format('<TD>p-value</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD></TR>',[FPValue]);
 //and then...
 result:= s;
end;

end.
