(*********************************************************************)
(* UCompFSUnivariateChi2.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(*********************************************************************)

{

@abstract(Select the input attributes with a univariate chi-square test)
@author(Ricco)
@created(12/05/2004)

A basic test, but very widely used and above all very fast: it is of course a supervised method
(1 discrete TARGET is mandatory) and all INPUT attributes must be discrete.

3 possible settings:
--------------------------
(0) threshold on the chi-square p-value
(1) threshold on Tschuprow's t
(2) the x best attributes according to Tschuprow's t

>> NEW -- 04/02/2005 --
A variant is introduced in the computation: it is now possible to filter according to TSCHUPROW's t or the Symmetrical Uncertainty (Press, 1988)
This affects
(a) the ranking
(b) the choices made among the x best
(c) for now -- 04/02/2005 -- the p-value threshold is still the one of the CHI-2; Kvalseth's formula (1987) must be examined to decide on the statistical test
>> --------------------

In any case it is better for the measure to be normalized between 0 and 1; it makes for better presentations.

}

unit UCompFSUnivariateChi2;

interface

USES
        Forms, Classes, IniFiles,
        UCompDefinition,
        UCompFSDefinition,
        UCompFSInputSelection,
        UOperatorDefinition,
        UCalcStatDesCrossTab;

TYPE
        {Generator: tells the framework which component class to build
         (GetClassMLComponent returns TMLCompFSUnivChi2)}
        TMLGenFSUnivChi2 = class(TMLGenFS)
                           public
                           function    GetClassMLComponent: TClassMLComponent; override;
                           end;

        {Component: binds the input-selection component to its operator class
         (getClassOperator returns TOpFSUnivChi2)}
        TMLCompFSUnivChi2 = class(TMLCompFSInputSelection)
                            protected
                            function    getClassOperator: TClassOperator; override;
                            end;

        {Operator: builds one cross-tabulation statistic per input attribute against
         the target, sorts them, then splits the inputs into kept / removed sets
         according to the active parameter}
        TOpFSUnivChi2 = class(TOpFSInputSelSpvInputDiscrete)
                        private
                        {list of cross-tabulation statistics; created in Create, released in Destroy,
                         rebuilt on every CoreExecute}
                        FLstCrossTab: TLstCalcStatDesCrossTab;
                        {filter on the p-value (kept when p-value <= threshold)}
                        procedure   FilterWithPValue();
                        {filter on the value of the statistic: Tschuprow or others (e.g. Symmetrical Uncertainty)}
                        procedure   FilterWithStatisticValue();
                        {filter by keeping the x best attributes}
                        procedure   FilterWithXBest();
                        {filter at the largest "gap" of the statistic between consecutive ranked attributes}
                        procedure   FilterWithStatisticGap();
                        protected
                        {returns the parameter class used by this operator (TOpPrmFSUnivChi2)}
                        function    getClassParameter: TClassOperatorParameter; override;
                        public
                        constructor Create(AOwner: TObject); override;
                        destructor  Destroy(); override;
                        {inherited summary followed by an HTML table detailing each attribute's statistic and p-value}
                        function    getHTMLResultsSummary(): string; override;
                        {runs the computation and the filtering; returns FALSE if any exception was raised}
                        function    CoreExecute(): boolean; override;
                        end;

        {Parameters of the operator: which statistic is used, which filtering rule is
         active, and the threshold attached to each rule}
        TOpPrmFSUnivChi2 = class(TOpPrmFSInputSelection)
                           private
                           {choice of the statistic to use: 0 -- Tschuprow, 1 -- SU...}
                           FUsedStatistic: integer;
                           {0 - p-value, 1 - statistic, 2 - x best, 3 - the largest gap}
                           FActivePrm: integer;
                           {p-value threshold}
                           FPValue: double;
                           {threshold on the statistic}
                           FStatistic: double;
                           {number of best attributes to keep}
                           FXBest: integer;
                           protected
                           {defaults: statistic 0, p-value rule active, p-value 0.001, statistic 0.30, x best 10}
                           procedure   SetDefaultParameters(); override;
                           {creates the parameter dialog (TDlgOpPrmUnivChi2)}
                           function    CreateDlgParameters(): TForm; override;
                           public
                           {binary persistence: the five fields, in declaration order}
                           procedure   LoadFromStream(prmStream: TStream); override;
                           procedure   SaveToStream(prmStream: TStream); override;
                           {INI persistence: one key per field in section prmSection}
                           procedure   LoadFromINI(prmSection: string; prmINI: TMemIniFile); override;
                           procedure   SaveToINI(prmSection: string; prmINI: TMemIniFile); override;
                           {HTML table describing the current parameter values}
                           function    getHTMLParameters(): string; override;
                           property    UsedStatistic: integer read FUsedStatistic write FUsedStatistic;
                           property    ActivePrm: integer read FActivePrm write FActivePrm;
                           property    PValue: double read FPValue write FPValue;
                           property    Statistic: double read FStatistic write FStatistic;
                           property    XBest: integer read FXBest write FXBest;
                           end;
        


implementation

uses
        sysutils,
        UDatasetDefinition, UCompManageDataset, UDatasetImplementation,
        UDlgOpFSUnivariateChi2, UConstConfiguration, UStringAddBuffered;

{ TMLGenFSUnivChi2 }

function TMLGenFSUnivChi2.GetClassMLComponent: TClassMLComponent;
begin
  // Component class associated with this generator.
  Result := TMLCompFSUnivChi2;
end;

{ TMLCompFSUnivChi2 }

function TMLCompFSUnivChi2.getClassOperator: TClassOperator;
begin
  // Operator class that does the actual work for this component.
  Result := TOpFSUnivChi2;
end;

{ TOpFSUnivChi2 }

function TOpFSUnivChi2.CoreExecute: boolean;
{ Recomputes one cross-tabulation per input attribute against the first target
  attribute, sorts the resulting statistics, then applies the filtering rule
  selected by the ActivePrm parameter.
  Returns TRUE on success, FALSE if any exception was raised on the way. }
var
  iInput: integer;
  target, candidate: TAttribute;
  prm: TOpPrmFSUnivChi2;
begin
  Result := True;
  try
    // Always start from scratch: the attribute selection may have changed.
    FLstCrossTab.FreeAll();

    // One statistic per input attribute, crossed with the target.
    target := WorkData.LstAtts[asTarget].Attribute[0];
    for iInput := 0 to WorkData.LstAtts[asInput].Count - 1 do
    begin
      candidate := WorkData.LstAtts[asInput].Attribute[iInput];
      FLstCrossTab.AddStat(TCalcSDCrossTab.Create(target, candidate, WorkData.Examples));
    end;

    // The list is always sorted on the chosen statistic, whatever the
    // thresholding rule. The compare mode is derived as "2 + statistic index",
    // which (per the original author's note) only holds while the statistics
    // are numbered consecutively.
    prm := PrmOp as TOpPrmFSUnivChi2;
    FLstCrossTab.CompareMode := 2 + prm.UsedStatistic;
    FLstCrossTab.SortStats();

    // Dispatch on the active rule; anything else falls back to the p-value.
    case prm.ActivePrm of
      1: FilterWithStatisticValue();
      2: FilterWithXBest();
      3: FilterWithStatisticGap();
    else
      FilterWithPValue();
    end;
  except
    Result := False;
  end;
end;

constructor TOpFSUnivChi2.Create(AOwner: TObject);
begin
  inherited Create(AOwner);
  // The statistics list starts empty; it is filled on each CoreExecute.
  FLstCrossTab := TLstCalcStatDesCrossTab.Create(nil, nil);
end;

destructor TOpFSUnivChi2.Destroy;
begin
  // Release the statistics list owned by this operator before the ancestor cleans up.
  FLstCrossTab.Free;
  inherited Destroy;
end;

function TOpFSUnivChi2.getClassParameter: TClassOperatorParameter;
begin
  // Parameter class holding the settings of this operator.
  Result := TOpPrmFSUnivChi2;
end;

procedure TOpFSUnivChi2.FilterWithPValue;
{ Keeps every attribute whose p-value (for the chosen statistic) is lower than
  or equal to the PValue parameter; all others go to RemovedFromInput. }
var
  prm: TOpPrmFSUnivChi2;
  measure: TEnumTypeStatFromCrossTab;
  maxPValue: double;
  idx: integer;
  crossTab: TCalcSDCrossTab;
begin
  prm := PrmOp as TOpPrmFSUnivChi2;
  // statistic in use
  measure := TEnumTypeStatFromCrossTab(prm.UsedStatistic);
  // p-value threshold
  maxPValue := prm.PValue;

  for idx := 0 to FLstCrossTab.Count - 1 do
  begin
    crossTab := FLstCrossTab.Stat(idx) as TCalcSDCrossTab;
    if crossTab.pValue(measure) <= maxPValue then
      OutputData.LstAtts[asInput].Add(crossTab.ColAtt)
    else
      RemovedFromInput.AddObject(crossTab.ColAtt.Name, crossTab.ColAtt);
  end;
end;

procedure TOpFSUnivChi2.FilterWithStatisticValue;
{ Keeps every attribute whose statistic is strictly greater than the Statistic
  parameter; all others go to RemovedFromInput. }
var
  prm: TOpPrmFSUnivChi2;
  measure: TEnumTypeStatFromCrossTab;
  minStatistic: double;
  idx: integer;
  crossTab: TCalcSDCrossTab;
begin
  prm := PrmOp as TOpPrmFSUnivChi2;
  // statistic in use
  measure := TEnumTypeStatFromCrossTab(prm.UsedStatistic);
  // threshold value
  minStatistic := prm.Statistic;

  for idx := 0 to FLstCrossTab.Count - 1 do
  begin
    crossTab := FLstCrossTab.Stat(idx) as TCalcSDCrossTab;
    if crossTab.statisticValue(measure) > minStatistic then
      OutputData.LstAtts[asInput].Add(crossTab.ColAtt)
    else
      RemovedFromInput.AddObject(crossTab.ColAtt.Name, crossTab.ColAtt);
  end;
end;

procedure TOpFSUnivChi2.FilterWithXBest;
{ Keeps the first XBest entries of the (already sorted) statistics list and
  sends the remaining ones to RemovedFromInput.
  The requested count is clamped to the range [0, number of statistics]:
  a negative XBest (e.g. read from a hand-edited INI file) previously made the
  exclusion loop start at a negative index. }
var
  keepCount, total, j: integer;
  stat: TCalcSDCrossTab;
begin
  total := self.FLstCrossTab.Count;
  keepCount := (self.PrmOp as TOpPrmFSUnivChi2).XBest;
  // avoid needless surprises: stay inside the list on both sides
  if (keepCount > total) then
    keepCount := total;
  if (keepCount < 0) then
    keepCount := 0;
  // on one side the best ones...
  for j := 0 to pred(keepCount) do
  begin
    stat := self.FLstCrossTab.Stat(j) as TCalcSDCrossTab;
    self.OutputData.LstAtts[asInput].Add(stat.ColAtt);
  end;
  // ...and on the other side the excluded ones
  for j := keepCount to pred(total) do
  begin
    stat := self.FLstCrossTab.Stat(j) as TCalcSDCrossTab;
    self.RemovedFromInput.AddObject(stat.ColAtt.Name, stat.ColAtt);
  end;
end;

function TOpFSUnivChi2.getHTMLResultsSummary: string;
{ Returns the inherited results summary followed by an HTML table giving, for
  each attribute in ranked order: its rank, name, number of values, statistic,
  a histogram of the statistic and its p-value.
  The output buffer is released in a finally section so it is not leaked if
  formatting or a statistic accessor raises. }
var s: string;
    stat: TCalcSDCrossTab;
    j: integer;
    buf: TBufString;
    usedStat: TEnumTypeStatFromCrossTab;
    statValue: extended;
begin
 //statistic in use
 usedStat:= TEnumTypeStatFromCrossTab((self.PrmOp as TOpPrmFSUnivChi2).UsedStatistic);
 //create the output buffer
 buf:= TBufString.Create();
 try
  buf.BeginUpdate();
  s:= inherited getHTMLResultsSummary();
  buf.AddStr(s);
  //append the details of the calculations
  s:= '<H3>Calculations details</H3>';
  s:= s+HTML_HEADER_TABLE_RESULT;
  s:= s+HTML_TABLE_COLOR_HEADER_GRAY;
  s:= s+'<TH>N</TH><TH>Attribute</TH><TH>Values</TH><TH>Statistic</TH><TH>Statistic (Histogram)</TH><TH>p-value</TH>';
  s:= s+'</TR>';
  buf.AddStr(s);
  for j:= 0 to pred(self.FLstCrossTab.Count) do
   begin
    stat:= self.FLstCrossTab.Stat(j) as TCalcSDCrossTab;
    //evaluate the statistic once; it feeds both the numeric column and the histogram
    statValue:= stat.statisticValue(usedStat);
    s:= HTML_TABLE_COLOR_DATA_GRAY+format('<TD>%d</TD><TD>%s</TD><TD align="right">%d</TD><TD align="right">%.6f</TD><TD width="200">%s</TD><TD align="right">%.6f</TD></TR>',
                                          [succ(j),stat.ColAtt.Name,stat.ColAtt.NbValues,statValue,getHtmlHistogram(trunc(70*statValue)),stat.pValue(usedStat)]);
    buf.AddStr(s);
   end;
  s:= '</table>';
  buf.AddStr(s);
  buf.EndUpdate();
  result:= buf.BufS;
 finally
  buf.Free();
 end;
end;

procedure TOpFSUnivChi2.FilterWithStatisticGap();
{ Looks for the largest drop of the statistic between two consecutive entries
  of the statistics list and cuts the list there: entries up to and including
  the one before the drop are kept, the rest go to RemovedFromInput.
  NOTE(review): with fewer than two entries no gap exists, the cut index stays
  at -1 and every entry is excluded -- confirm this is the intended behaviour. }
var
  idx, cutIdx: integer;
  gap, widestGap: double;
  measure: TEnumTypeStatFromCrossTab;
  upper, lower: TCalcSDCrossTab;
begin
  // statistic in use
  measure := TEnumTypeStatFromCrossTab((PrmOp as TOpPrmFSUnivChi2).UsedStatistic);

  // locate the widest gap between neighbours
  cutIdx := -1;
  widestGap := -1.0e308;
  for idx := 0 to FLstCrossTab.Count - 2 do
  begin
    upper := FLstCrossTab.Stat(idx) as TCalcSDCrossTab;
    lower := FLstCrossTab.Stat(idx + 1) as TCalcSDCrossTab;
    gap := upper.statisticValue(measure) - lower.statisticValue(measure);
    if gap > widestGap then
    begin
      widestGap := gap;
      cutIdx := idx;
    end;
  end;

  // selected attributes: everything up to the cut
  for idx := 0 to cutIdx do
  begin
    upper := FLstCrossTab.Stat(idx) as TCalcSDCrossTab;
    OutputData.LstAtts[asInput].Add(upper.ColAtt);
  end;

  // excluded attributes: everything after the cut
  for idx := cutIdx + 1 to FLstCrossTab.Count - 1 do
  begin
    lower := FLstCrossTab.Stat(idx) as TCalcSDCrossTab;
    RemovedFromInput.AddObject(lower.ColAtt.Name, lower.ColAtt);
  end;
end;

{ TOpPrmFSUnivChi2 }

function TOpPrmFSUnivChi2.CreateDlgParameters: TForm;
begin
  // Dialog used to edit these parameters, built from this parameter object.
  Result := TDlgOpPrmUnivChi2.CreateFromOpPrm(Self);
end;

function TOpPrmFSUnivChi2.getHTMLParameters: string;
{ Builds an HTML table listing the measure in use, the index of the active
  filtering rule and the value attached to each rule. }
var
  measureName: string;
begin
  measureName := STR_STAT_NAME_FROM_CROSS_TAB[TEnumTypeStatFromCrossTab(FUsedStatistic)];

  // general settings
  Result := HTML_HEADER_TABLE_RESULT;
  Result := Result + HTML_TABLE_COLOR_HEADER_GRAY + '<TH colspan="2">Parameters</TH></TR>';
  Result := Result + Format('%s<TD>Used measure</TD><TD align="right">%s</TD></TR>',
                            [HTML_TABLE_COLOR_DATA_GRAY, measureName]);
  Result := Result + Format('%s<TD>Active parameter</TD><TD align="right">%d</TD></TR>',
                            [HTML_TABLE_COLOR_DATA_GRAY, FActivePrm]);

  // one row per rule-specific value
  Result := Result + Format('%s<TH>Parameter</TH><TH>Value</TH></TR>',
                            [HTML_TABLE_COLOR_HEADER_GRAY]);
  Result := Result + Format('%s<TD>p-value thresold</TD><TD align="right">%.6f</TD></TR>',
                            [HTML_TABLE_COLOR_DATA_GRAY, FPValue]);
  Result := Result + Format('%s<TD>Statistic thresold</TD><TD align="right">%.2f</TD></TR>',
                            [HTML_TABLE_COLOR_DATA_GRAY, FStatistic]);
  Result := Result + Format('%s<TD>Best attributes</TD><TD align="right">%d</TD></TR>',
                            [HTML_TABLE_COLOR_DATA_GRAY, FXBest]);

  Result := Result + '</table>';
end;

procedure TOpPrmFSUnivChi2.LoadFromINI(prmSection: string; prmINI: TMemIniFile);
{ Reads the five parameters from section prmSection of prmINI.
  A key that is missing leaves the corresponding field at its current value,
  since the current value is passed as the default. }
const
  KEY_USED_STATISTIC = 'used_statistic';
  KEY_ACTIVE_PRM     = 'active_prm';
  KEY_PVALUE         = 'pvalue_thresold';
  KEY_STATISTIC      = 'statistic_thresold';
  KEY_X_BEST         = 'x_best';
begin
  FUsedStatistic := prmINI.ReadInteger(prmSection, KEY_USED_STATISTIC, FUsedStatistic);
  FActivePrm     := prmINI.ReadInteger(prmSection, KEY_ACTIVE_PRM, FActivePrm);
  FPValue        := prmINI.ReadFloat(prmSection, KEY_PVALUE, FPValue);
  FStatistic     := prmINI.ReadFloat(prmSection, KEY_STATISTIC, FStatistic);
  FXBest         := prmINI.ReadInteger(prmSection, KEY_X_BEST, FXBest);
end;

procedure TOpPrmFSUnivChi2.LoadFromStream(prmStream: TStream);
{ Reads the five parameters from prmStream, in the order SaveToStream writes
  them: used statistic, active parameter, p-value, statistic, x best.
  The first read now targets FUsedStatistic; it used to land in FActivePrm,
  so the saved statistic was overwritten by the next read and FUsedStatistic
  was never restored. }
begin
 prmStream.ReadBuffer(FUsedStatistic,sizeof(FUsedStatistic));
 prmStream.ReadBuffer(FActivePrm,sizeof(FActivePrm));
 prmStream.ReadBuffer(FPValue,sizeof(FPValue));
 prmStream.ReadBuffer(FStatistic,sizeof(FStatistic));
 prmStream.ReadBuffer(FXBest,sizeof(FXBest));
end;

procedure TOpPrmFSUnivChi2.SaveToINI(prmSection: string; prmINI: TMemIniFile);
{ Writes the five parameters to section prmSection of prmINI, one key each. }
const
  KEY_USED_STATISTIC = 'used_statistic';
  KEY_ACTIVE_PRM     = 'active_prm';
  KEY_PVALUE         = 'pvalue_thresold';
  KEY_STATISTIC      = 'statistic_thresold';
  KEY_X_BEST         = 'x_best';
begin
  prmINI.WriteInteger(prmSection, KEY_USED_STATISTIC, FUsedStatistic);
  prmINI.WriteInteger(prmSection, KEY_ACTIVE_PRM, FActivePrm);
  prmINI.WriteFloat(prmSection, KEY_PVALUE, FPValue);
  prmINI.WriteFloat(prmSection, KEY_STATISTIC, FStatistic);
  prmINI.WriteInteger(prmSection, KEY_X_BEST, FXBest);
end;

procedure TOpPrmFSUnivChi2.SaveToStream(prmStream: TStream);
{ Writes the raw bytes of the five parameters to prmStream, in this order:
  used statistic, active parameter, p-value, statistic, x best. }
begin
  // which statistic, which rule
  prmStream.WriteBuffer(FUsedStatistic, SizeOf(FUsedStatistic));
  prmStream.WriteBuffer(FActivePrm, SizeOf(FActivePrm));
  // rule-specific values
  prmStream.WriteBuffer(FPValue, SizeOf(FPValue));
  prmStream.WriteBuffer(FStatistic, SizeOf(FStatistic));
  prmStream.WriteBuffer(FXBest, SizeOf(FXBest));
end;

procedure TOpPrmFSUnivChi2.SetDefaultParameters;
{ Resets every parameter to its default value. }
begin
  // first statistic of the list
  FUsedStatistic := 0;
  // the p-value rule is the active one by default
  FActivePrm := 0;
  // default value attached to each rule
  FPValue := 0.001;
  FStatistic := 0.30;
  FXBest := 10;
end;

initialization
 //register the generator class when the unit is initialized
 RegisterClass(TMLGenFSUnivChi2);
end.
