(************************************************************************)
(* UCalcStatDesCondDescRuns.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(************************************************************************)

{
@abstract(Conditional description of a continuous variable given a discrete variable -- Runs test)
@author(Ricco)
@created(24/11/2004)

Runs test -- Mood's test (1940) -- Mostly useful for ranking variables according to their "separability".
It is of interest when the conditional distributions are not unimodal!

17/07/2005 -- Wald and Wolfowitz runs test

For the computations, see

(a) A.Mood (1940) -- "The distribution Theory of Runs" -- Annals of Mathematical Statistics -- 11:367-392.
(b) Wald and Wolfowitz test --> Siegel -- pp.58-62

/!\ Using these tests to compare populations nevertheless remains questionable


}

unit UCalcStatDesCondDescRuns;

interface

USES
        UCalcStatDes,
        UDatasetDefinition,
        UDatasetImplementation,
        UDatasetExamples,
        UCalcStatDesConditionnalDesc;

TYPE
        {Conditional descriptive statistic based on a runs test
         (the original note says it was modelled on the "K-W" class)}
        TCalcSDRunsCondDesc = class(TCalcSDCondDesc)
                              private
                              {number of runs found in the sorted examples}
                              FNbRuns: integer;
                              {test statistic and its associated probability (p-value)}
                              FStatRuns, FProbaRuns: double;
                              {count the runs, i.e. fill FNbRuns}
                              procedure   countRuns(sortedExamples: TExamples);
                              protected
                              {compute the statistic and p-value from FNbRuns;
                               virtual and protected so that descendant classes can override it}
                              procedure   computeStats(prmExamples: TExamples); virtual;
                              public
                              {recompute every statistic for the given examples}
                              procedure   RefreshStat(prmExamples: TExamples); override;
                              {build the HTML report fragment}
                              function    getHTMLResult(prmOption: integer = -1): string; override;
                              //sorting criterion based on the test statistic
                              function    getStatForSorting(): double; override;
                              //sorting criterion based on the p-value
                              function    getPValueForSorting(): double; override;
                              //properties
                              property    NbRuns: integer read FNbRuns;
                              property    StatRuns: double read FStatRuns;
                              property    ProbaStatRuns: double read FProbaRuns;
                              end;

        {list of runs-test conditional descriptions}
        TLstCalcStatDesCondRuns = class(TLstCalcStatDesCond)
                                  public
                                  {header row (column titles) of the HTML result table}
                                  function  getHeaderHTML(): string; override;
                                  end;

        {computation class for the Wald and Wolfowitz runs test;
         only the statistic computation differs from the parent class}
        TCalcSDRunsWaldWolfowitz = class(TCalcSDRunsCondDesc)
                                   protected
                                   //compute the Wald & Wolfowitz statistic and its p-value
                                   procedure   computeStats(prmExamples: TExamples); override;
                                   end;

        {list of "bi-runs" descriptions (presumably the Wald-Wolfowitz variant, going by the class name);
         declares no member of its own}
        TLstCalcStatDesCondRunsWW = class(TLstCalcStatDesCondRuns)
                                    end;
        

implementation

USES
        Math, Sysutils, Classes,
        UCalcDistribution, FMath, UConstConfiguration, UCalcRndGenerator;

{ TCalcSDRunsCondDesc }

{Compute the runs-test statistic (FStatRuns) and its p-value (FProbaRuns)
 from the number of runs already stored in FNbRuns.

 With n examples and class frequencies f_k of the description attribute:
   mean     = n * (1 - sum(f_k^2))
   variance = n * (sum(f_k^2) - 2*sum(f_k^3) + sum(f_k^2)^2)
   Z        = (FNbRuns - mean) / sqrt(variance)

 prmExamples: the examples on which the class frequencies are computed.}
procedure TCalcSDRunsCondDesc.computeStats(prmExamples: TExamples);
var n: double;
    dist: TCalcStatDesDiscrete;
    k: integer;
    sum2, sum3: double;
    value: double;
    variance: double;
    avg,std: double;
begin
 //number of examples
 n:= 1.0*prmExamples.Size;
 //sums of squared and cubed frequencies
 sum2:= 0.0;
 sum3:= 0.0;
 //distribution of the description attribute -- table of the e_k
 dist:= TCalcStatDesDiscrete.Create(self.AttDescription,prmExamples);
 try
  for k:= 1 to self.AttDescription.nbValues do
   begin
    value:= dist.TabFreq.Frequence[k];
    sum2:= sum2+power(value,2.0);
    sum3:= sum3+power(value,3.0);
   end;
 finally
  //always release the temporary distribution, even if an exception occurred above
  dist.Free();
 end;
 //mean
 avg:= n * (1.0 - sum2);
 //std-dev. -- the bracketed term is non-negative in theory, but rounding can
 //push it slightly below zero, which would make SQRT fail
 variance:= sum2 - 2.0 * sum3 + sum2*sum2;
 if (variance < 0.0)
  then variance:= 0.0;
 std:= SQRT(n) * SQRT(variance);
 //the statistic; +3.0 is the fallback kept from the original code for a degenerate (zero) std-dev.
 if (std>0)
  then FStatRuns:= (1.0 * FNbRuns - avg) / std
  else FStatRuns:= +3.0;
 //p-value --> one-sided test; FNorm is presumably the standard normal CDF (confirm in its unit),
 //in which case few runs (negative Z) give a small p-value.
 //The original code also held a disabled variant using 1.0-FNorm(FStatRuns).
 FProbaRuns:= FNorm(FStatRuns);
end;

{Count the runs of the description attribute along the sorted examples
 and store the result in FNbRuns.
 A new run starts each time two consecutive examples carry different
 values of the description attribute; the count starts at 1.}
procedure TCalcSDRunsCondDesc.countRuns(sortedExamples: TExamples);
var iPos: integer;
begin
 FNbRuns:= 1;
 //the data are sorted, with ties already in random order, so comparing
 //each example with its predecessor is enough
 for iPos:= 2 to sortedExamples.Size do
  if (self.AttDescription.dValue[sortedExamples.Number[iPos]]<>self.AttDescription.dValue[sortedExamples.Number[pred(iPos)]])
   then FNbRuns:= succ(FNbRuns);
end;

{Build the HTML cells describing this analysis:
 the two attribute names, a nested table of conditional statistics
 (one row per value of the description attribute plus an "All" row),
 and a nested table holding the runs count, the Z statistic and the p-value.
 prmOption is not used by this implementation.}
function TCalcSDRunsCondDesc.getHTMLResult(prmOption: integer): string;
var iValue: integer;
    fmtGroupRow, fmtStatRow: string;
begin
 //row templates reused below
 fmtGroupRow:= '<TH>%s</TH><TD align=right>%d</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td></tr>';
 fmtStatRow:= HTML_TABLE_COLOR_DATA_BLUE+'<TD>%s</TD><TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+'</TD></TR>';

 //*************************
 //names of the two attributes
 //*************************
 result:= format('<TD>%s</TD><TD>%s</TD>',[self.Attribute.Name,self.AttDescription.Name]);

 //*********************************
 //conditional descriptive statistics
 //*********************************
 result:= result+'<TD>'+HTML_HEADER_TABLE_RESULT+HTML_TABLE_COLOR_HEADER_BLUE+
          '<TH width=120>Value</TH><TH width=80>Examples</TH><TH width=80>Average</TH></TR>';
 //one row per value of the description attribute
 //(descriptions are indexed from 1, statCond from 0)
 for iValue:= 1 to self.AttDescription.nbValues do
  result:= result+HTML_TABLE_COLOR_DATA_BLUE+
           format(fmtGroupRow,
                  [self.AttDescription.LstValues.GetDescription(iValue),
                   statCond[pred(iValue)].NbExamples,
                   statCond[pred(iValue)].Average]);
 //global statistics (all examples)
 result:= result+HTML_TABLE_COLOR_DATA_GREEN+
          format('<TH>All</TH><TD align=right>%d</td><td align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</td></tr>',
                 [self.NbExamples,self.StatGlobal.Average]);
 result:= result+'</table></td>';

 //****************
 //statistical test
 //****************
 result:= result+'<TD valign=top>'+
          HTML_HEADER_TABLE_RESULT+HTML_TABLE_COLOR_HEADER_BLUE+
          '<TH>Measure</TH><TH>Value</TH></TR>';
 //number of runs
 result:= result+format(HTML_TABLE_COLOR_DATA_BLUE+'<TD>%s</TD><TD align=right>%d</TD></TR>',['Runs',self.NbRuns]);
 //test statistic
 result:= result+format(fmtStatRow,['Z',self.StatRuns]);
 //p-value, with the cell attribute returned by codeCouleur
 result:= result+format(HTML_TABLE_COLOR_DATA_BLUE+'<TD>%s</TD><TD align=right %s>'+STR_FORMAT_VIEW_STAT_ACCURACY_HIGH+
                        '</TD></TR>',
                        ['p-value',UCalcStatDesConditionnalDesc.codeCouleur(self.ProbaStatRuns),self.ProbaStatRuns]);

 //close the nested table and its cell
 result:= result+'</table>'+'</TD>';
end;

{Sorting key based on the p-value of the runs test.}
function TCalcSDRunsCondDesc.getPValueForSorting: double;
begin
 //same value as the ProbaStatRuns property, which reads this field
 result:= FProbaRuns;
end;

{Sorting key based on the runs-test statistic.}
function TCalcSDRunsCondDesc.getStatForSorting: double;
begin
 //same value as the StatRuns property, which reads this field
 result:= FStatRuns;
end;

{Recompute all statistics for the given examples:
 runs the inherited computation, counts the runs of the description
 attribute along the examples sorted by self.Attribute, then derives
 the test statistic and its p-value.

 prmExamples: the examples to analyse; the list itself is not reordered,
 all sorting is done on a private copy.}
procedure TCalcSDRunsCondDesc.RefreshStat(prmExamples: TExamples);
var lstSorted: TExamples;
begin
 //inherited processing on the examples concerned
 inherited RefreshStat(prmExamples);
 //work on a private copy of the example list
 lstSorted:= TExamples.Create(prmExamples.Size);
 try
  lstSorted.Copy(prmExamples);
  //shuffle everything first so that ties end up in random order
  lstSorted.procRandomizeExamples(seedStandard);
  //then sort on self.Attribute
  lstSorted.QuickSortBy(self.Attribute);
  //count the number of runs
  self.countRuns(lstSorted);
  //compute the associated statistics
  self.computeStats(prmExamples);
 finally
  //release the temporary list even if one of the steps above raised
  lstSorted.Free();
 end;
end;

{ TLstCalcStatDesCondRuns }

{Header row of the result table: one title cell per column.}
function TLstCalcStatDesCondRuns.getHeaderHTML: string;
const
 HEADER_CELLS = '<TH>Attribute_Y</TH><TH>Attribute_X</TH><TH>Description</TH><TH>Statistical test</TH></TR>';
begin
 result:= HTML_TABLE_COLOR_HEADER_GRAY+HEADER_CELLS;
end;


{ TCalcSDRunsWaldWolfowitz }

//See the formulas in Siegel, p.62
{Compute the Wald & Wolfowitz statistic (FStatRuns) and its p-value (FProbaRuns)
 from the number of runs already stored in FNbRuns.

 With group counts pn and m and overall count GN:
   muR  = 2*m*pn/GN + 1
   varR = 2*m*pn*(2*m*pn - GN) / (GN^2 * (GN - 1))
   Z    = (FNbRuns + h - muR) / sqrt(varR), h being a +/-0.5 continuity correction

 When the statistic cannot be computed (GN <= 1 or varR <= 0), FStatRuns is set
 to 0 and FProbaRuns to 1.

 prmExamples: the examples on which the group counts are computed.}
procedure TCalcSDRunsWaldWolfowitz.computeStats(prmExamples: TExamples);
var dist: TCalcStatDesDiscrete;
    pn,m,GN: double;
    h, muR, varR: double;
begin
 //distribution of the description attribute -- table of the e_k
 dist:= TCalcStatDesDiscrete.Create(self.AttDescription,prmExamples);
 try
  //counts of the first two values; index 0 is read as the overall count
  //(presumably the total -- confirm against the TabFreq implementation)
  pn:= dist.TabFreq.Value[1];
  m:= dist.TabFreq.Value[2];
  GN:= dist.TabFreq.Value[0];
 finally
  //always release the temporary distribution
  dist.Free();
 end;
 //mean and variance -- both formulas divide by GN or (GN - 1),
 //so they are only evaluated when GN > 1
 if (GN > 1.0)
  then
   begin
    muR:= 2.0 * m * pn / GN + 1.0;
    varR:= (2.0 * m * pn *(2.0 * m * pn - GN))/(GN * GN * (GN - 1.0));
   end
  else
   begin
    muR:= 0.0;
    varR:= 0.0;
   end;
 //statistic and p-value
 if (varR > 0.0)
  then
   begin
    //continuity correction: move the observed count half a unit towards the mean
    if (FNbRuns < muR)
     then h:= +0.5
     else if (FNbRuns > muR)
           then h:= -0.5
           else h:= 0.0;
    //The statistic may be negative here. The original author notes that the mixing would then be
    //"too perfect" and that this configuration is not of interest.
    FStatRuns:= (1.0*FNbRuns + h - muR) / sqrt(varR);
    //p-value -- beware: this is a one-sided, left-tailed test
    FProbaRuns:= FNorm(FStatRuns);
   end
  else
   begin
    //degenerate case: no usable variance
    FStatRuns :=  0.0;
    FProbaRuns := 1.0;
   end;
end;

end.
