(***********************************************************************)
(* UCalcStatDesCorrelation.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(***********************************************************************)

{
@abstract(Calcul de corrlation linaire (Pearson) entre deux variables)
@author(Ricco)
@created(12/01/2004)
}
unit UCalcStatDesCorrelation;

interface

USES
        UCalcStatDes,
        UDatasetDefinition,
        UDatasetExamples;

TYPE
        {un classe corrlation}
        TCalcSDCorrelation = class(TCalcStatDes)
                             private
                             {la variable X}
                             FXAtt: TAttribute;
                             {sommes sur Y et X}
                             FSumY,FSumX: double;
                             {sommes aux carrs et croiss}
                             FSum2Y,FSum2X,FSumYX: double;
                             {coeff de corrlation}
                             FR: double;
                             {transforme de Fisher}
                             FT: double;
                             {proba de la transforme}
                             FPrT: double;
                             {accder  la premire variable}
                             function    getYAtt(): TAttribute;
                             {calculer les indicateurs stats}
                             procedure   CalcStats();
                             protected
                             procedure   BeginUpdate(); override;
                             procedure   AddValue(prmExample: integer); override;
                             procedure   EndUpdate(); override;
                             public
                             {rfrencer les variables Y et X}
                             constructor Create(prmY,prmX: TAttribute; prmExamples: TExamples = nil);
                             {rapport HTML}
                             function    getHTMLResult(prmOption: integer = -1): string; override;
                             {la premire variable}
                             property    YAtt: TAttribute read getYAtt;
                             {la seconde variable}
                             property    XAtt: TAttribute read FXAtt;
                             {r}
                             property    R: double read FR;
                             {z}
                             property    T: double read FT;
                             {PrZ}
                             property    PrT: double read FPrT;
                             end;

        {classe de classe corrlation}
        TClassCalcSDCorrelation = class of TCalcSDCorrelation; 

        {liste de corrlations}
        TLstCalcSDCorrelation = class(TLstCalcStatDes)
                                public
                                {tri des rsultats}
                                procedure SortStats(); override;
                                {description HTML des rsultats}
                                function    getHeaderHTML(): string; override;
                                end;

        {corrlation de rangs de Spearman}
        TCalcSDSpearmanCorr = class(TCalcSDCorrelation)
                              public
                              //court-circuiter l'adjonction des individus pour raliser un calcul en une passe
                              procedure   RefreshStat(prmExamples: TExamples); override;
                              end;

        {liste de corrlation de rangs}
        TLstCalcSDSpearmanCorr = class(TLstCalcSDCorrelation)
                                 end;

        {tau de Kendall}
        TCalcSDKendallCorr = class(TCalcSDSpearmanCorr)
                             //court-circuiter l'adjonction des individus pour raliser un calcul en une passe
                             procedure   RefreshStat(prmExamples: TExamples); override;
                             end;

        {liste de Kendall's tau}
        TLstCalcSDKendallCorr = class(TLstCalcSDSpearmanCorr)
                                end;   
                                     

implementation

uses
        Sysutils,Classes,Math,
        FMath, UConstConfiguration, UCalcStatDesRankForNonParametricStat,
  ULogFile;

{ TCalcSDCorrelation }

procedure TCalcSDCorrelation.AddValue(prmExample: integer);
var y,x: double;
begin
 y:= YAtt.cValue[prmExample];
 x:= XAtt.cValue[prmExample];
 //y
 FSumY:= FSumY+y;
 FSum2Y:= FSum2Y+y*y;
 //x
 FSumX:= FSumX+x;
 FSum2X:= FSum2X+x*x;
 //crois
 FSumYX:= FSumYX+y*x;
 //nombre
 inc(FNbExamples);
end;

procedure TCalcSDCorrelation.BeginUpdate;
begin
 FNbExamples:= 0;
 FSumY:= 0.0;
 FSumX:= 0.0;
 FSumYX:= 0.0;
 FSum2Y:= 0.0;
 FSum2X:= 0.0;
end;

procedure TCalcSDCorrelation.CalcStats;
var num,denom,mY,mX,sY,sX,n: double;
begin
 FR:= 0.0;
 FT:= 0.0;
 FPrT:= 0.0;
 if (FNbExamples>0)
  then
   begin
    n:= 1.0*FNbExamples;
    mY:= FSumY/n;
    mX:= FSumX/n;
    num:= FSumYX-n*mY*mX;
    sY := FSum2Y-n*mY*mY;
    sX := FSum2X-n*mX*mX;
    denom:= sY*sX;
    if (denom>0)
     then
      begin
       denom:= SQRT(denom);
       FR:= num/denom;
       if (FR>-1.0) and (FR<1.0) and (n>2.0)
        then
         begin
          FT:= FR*sqrt(n-2.0)/sqrt(1.0-FR*FR);
          FPrT:= PStudent(FNbExamples-2,FT);
         end;
      end;
   end;
end;

constructor TCalcSDCorrelation.Create(prmY, prmX: TAttribute;
  prmExamples: TExamples);
begin
 FXAtt:= prmX;
 inherited Create(prmY,prmExamples);
end;

procedure TCalcSDCorrelation.EndUpdate;
begin
 CalcStats();
end;

function TCalcSDCorrelation.getHTMLResult(prmOption: integer): string;
var s: string;
begin
 s:= format('<TD>%s</TD><TD>%s</TD>',[YAtt.Name,XAtt.Name]);
 s:= s+Format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',[R]);
 s:= s+Format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',[R*R]);
 s:= s+Format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',[T]);
 s:= s+Format('<TD align=right>'+STR_FORMAT_VIEW_STAT_ACCURACY+'</TD>',[PrT]);  
 result:= s;
end;

function TCalcSDCorrelation.getYAtt: TAttribute;
begin
 result:= Attribute;
end;

{ TLstCalcSDCorrelation }

function TLstCalcSDCorrelation.getHeaderHTML: string;
begin
 result:=  HTML_TABLE_COLOR_HEADER_GRAY+
           '<TH width=80>Y</TH><TH width=80>X</TH><TH width=60>r</TH><TH width=60>r</TH><TH width=60>t</TH><TH width=60>Pr(>|t|)</TH>';
end;

{trier selon le nom des Y}
function ListSortCompareYAttName(item1,item2: pointer): integer;
var st1,st2: TCalcSDCorrelation;
begin
 st1:= TCalcSDCorrelation(item1);
 st2:= TCalcSDCorrelation(item2);
 if (st1.YAtt.Name<st2.YAtt.Name)
  then result:= -1
  else
   if (st1.YAtt.Name>st2.YAtt.Name)
    then result:= +1
    else result:= 0;
end;

{trier selon le nom des X}
function ListSortCompareXAttName(item1,item2: pointer): integer;
var st1,st2: TCalcSDCorrelation;
begin
 st1:= TCalcSDCorrelation(item1);
 st2:= TCalcSDCorrelation(item2);
 if (st1.XAtt.Name<st2.XAtt.Name)
  then result:= -1
  else
   if (st1.XAtt.Name>st2.XAtt.Name)
    then result:= +1
    else result:= 0;
end;

{trier selon le r}
function ListSortCompareR(item1,item2: pointer): integer;
var st1,st2: TCalcSDCorrelation;
begin
 st1:= TCalcSDCorrelation(item1);
 st2:= TCalcSDCorrelation(item2);
 //tri invers
 if (st1.R<st2.R)
  then result:= +1
  else
   if (st1.R>st2.R)
    then result:= -1
    else result:= 0;
end;

{trier selon la valeur absolue de r}
function ListSortCompareAbsoluteR(item1,item2: pointer): integer;
var st1,st2: TCalcSDCorrelation;
begin
 st1:= TCalcSDCorrelation(item1);
 st2:= TCalcSDCorrelation(item2);
 //tri invers
 if (abs(st1.R)<abs(st2.R))
  then result:= +1
  else
   if (abs(st1.R)>abs(st2.R))
    then result:= -1
    else result:= 0;
end;

procedure TLstCalcSDCorrelation.SortStats;
var funcCompare: TListSortCompare;
begin
 if (CompareMode>=0)
  then
   begin

    case CompareMode of
     0: funcCompare:= ListSortCompareYAttName;
     1: funcCompare:= ListSortCompareXAttName;
     2: funcCompare:= ListSortCompareR;
     3: funcCompare:= ListSortCompareAbsoluteR;
     else
      funcCompare:= NIL;
    end;

    if assigned(funcCompare)
     then LstStat.Sort(funcCompare);
   end;
end;

{ TCalcSDSpearmanCorr }

procedure TCalcSDSpearmanCorr.RefreshStat(prmExamples: TExamples);
var rankX, rankY: TRankComputed;
    correcX, correcY: double;
    i,example: integer;
    D,N,powerN: double;

    //calculer les rangs pour chaque attribut
    function _computeRank(att: TAttribute; lst: TExamples; var rank: TRankComputed): double;
    var lstSorted: TExamples;
    begin
     //trier les individus selon l'attribut de rf.
     lstSorted:= TExamples.Create(lst.Size);
     lstSorted.Copy(lst);
     lstSorted.QuickSortBy(att);
     //construire les rangs
     rank:= TRankComputed.create(att);
     result:= rank.computeRank(lstSorted);
     //
     lstSorted.Free();
    end;

begin
 FNbExamples:= prmExamples.Size;
 //calculer les rangs selon X et Y
 correcX:= _computeRank(self.XAtt,prmExamples,rankX);
 correcY:= _computeRank(self.YAtt,prmExamples,rankY);
 //*** construire la stat de Spearman ***
 //la valeur de D
 D:= 0.0;
 for i:= 1 to prmExamples.Size do
  begin
   example:= prmExamples.Number[i];
   D:= D + power(rankX.getRank(example)-rankY.getRank(example),2.0);
  end;
 //vider les objets
 rankX.Free();
 rankY.Free();
 //**
 //le rho (corrig) -- Siegel, pp.239 (formule 9.7)
 //**
 N:= 1.0*self.NbExamples;
 powerN:= power(N,3.0)-N;
 //
 FR:= powerN - 6.0 * D - (correcX + correcY) / 2.0;
 FR:= FR / SQRT(power(powerN,2.0) - (correcX + correcY) * powerN + (correcX * correcY));
 //la transforme pour le test de significativit
 FT:= FR*SQRT((N - 2.0) / (1.0 - power(FR,2.0)));
 //proba critique associe
 FPrT:= PStudent(FNbExamples-2,FT);
end;

{ TCalcSDKendallCorr }

procedure TCalcSDKendallCorr.RefreshStat(prmExamples: TExamples);
var rankX, rankY: TRankComputedKendallTau;
    lstSrtX, lstSrtY: TExamples;
    correcX, correcY: double;
    i,j,example: integer;
    refRank,curRank: TTypeRankValue;
    agree,disagree: double;
    S,N,T,PowerN: double;

    //calculer les rangs pour chaque attribut -- renvoyer la liste trie
    function _computeRank(att: TAttribute; lst: TExamples; var rank: TRankComputedKendallTau; var lstSorted: TExamples): double;
    begin
     //trier les individus selon l'attribut de rf.
     lstSorted:= TExamples.Create(lst.Size);
     lstSorted.Copy(lst);
     lstSorted.QuickSortBy(att);
     //construire les rangs
     rank:= TRankComputedKendallTau.create(att);
     result:= rank.computeRank(lstSorted);
    end;

begin
 FNbExamples:= prmExamples.Size;
 //calculer les rangs selon X et Y
 correcX:= _computeRank(self.XAtt,prmExamples,rankX,lstSrtX);
 correcY:= _computeRank(self.YAtt,prmExamples,rankY,lstSrtY);
 //la liste lstSrtX sert de rf. et sera utilise, on peut se dbarasser de la liste Y
 lstSrtY.Free();
 //*************************************
 //*** construire la stat de Kendall ***
 //*************************************
 //pour chaque individu class sur X -- c'est clairement en O(n^2) !
 //!\ souci, quelle stratgie lorsqu'il y a des ex-aequos sur les X ?
 //!\ on remarquera que pour l'instant, selon le choix de X ou Y comme rf., les rsultats sont diffrents !!! shocking !
 // ---> rsolu le 19/07/2005 !!! === on compare uniquement en cas de non-ex-aequo sur les X !!!
 S:= 0.0;
 for i:= 1 to pred(lstSrtX.Size) do
  begin
   //on se cale sur le classement en X
   example:= lstSrtX.Number[i];
   //on regarde la concordance des classements sur Y
   refRank:= rankY.getRank(example);
   //calcul accords - dsaccords
   agree:= 0.0;
   disagree:= 0.0;
   //vrification pour l'autre liste
   for j:= succ(i) to lstSrtX.Size do
    begin
     curRank:= rankY.getRank(lstSrtX.Number[j]);
     //new -- 19/07/2005 -- si le rang sur les X est bien diffrent ? (c'est la solution) !!!
     if (rankX.getRank(example) < rankX.getRank(lstSrtX.Number[j]))
      then
       begin
         //incrmenter "agreements" or "disagreements"
         //cf. ex. 9.4a, Siegel -- pp.248
         if (curRank > refRank)
          then agree:= agree + 1.0
          else if (curRank < refRank)
                then disagree:= disagree + 1.0;
       end;
    end;
   //accumuler
   S:= S + agree - disagree;
  end;
 //vider les objets
 lstSrtX.Free();
 rankY.Free();
 rankX.Free();
 //*** la statistique *** Siegel pp.249 (eq. 9.10)
 N:= 1.0*self.NbExamples;
 powerN:= N * (N - 1.0);
 //Tau
 T:= (2.0*S)/(SQRT(powerN-correcX)*SQRT(powerN-correcY));
 //variance
 //VT:= (2.0*(2.0*N+5.0))/(9.0*powerN);
 //passer les rsultats
 FR:= T;
 //vrification
 //TraceLog.WriteToLogFile(format('%s - %s --> S = %.4f ; correcX = %.4f ; correcY = %.4f ; Tau = %.4f',[YAtt.Name, XAtt.Name, S, correcX, correcY, T]));
 //la transforme pour le test de significativit (9.11, p.253 -- Siegel)
 FT:= 3.0*T*SQRT(N*(N-1.0))/SQRT(2.0*(2.0*N+5.0));
 (*
 if (VT > 0.0)
  then FT:= FR/SQRT(VT)
  else FT:= 0.0;
 *)
 //proba critique associe
 FPrT:= PNorm(abs(FT));
end;

end.
