(******************************************************************************)
(* UCompAssociationRuleDefinition.pas - Copyright (c) 2004 Ricco RAKOTOMALALA *)
(******************************************************************************)

{
@abstract(Composant de base des rgles d'association)
@author(Ricco)
@created(12/01/2004)
On aurait pu galement utilis des TCollectionItem etc., a aurait t plus "clean" mais bon ici
a marche et c'est rapide... (enfin, j'imagine..., faudrait tester en dtail...)
}
unit UCompAssociationRuleDefinition;

interface

USES
        Classes, IniFiles,
        UCompManageDataset,
        UDatasetDefinition,
        UDatasetImplementation,
        UOperatorDefinition,
        UDatasetExamples,
        UCalcAssocStructure;

TYPE
        {composant}
        TMLCompAssocRule = class(TMLCompLocalData)
                           end;

        {calculateur de structure d'association}
        TCalcAssocRule = class;
        TClassCalcAssocRule = class of TCalcAssocRule;

        {oprateur}
        TOpAssocRule    = class(TOpLocalData)
                          protected
                          {les attributs de travail -> les inputs}
                          FInputs: TLstAttributes;
                          {calculateur}
                          FCalcAssocRule: TCalcAssocRule;
                          {initialiser le calculateur}
                          procedure   InitializeCalcAssocRule();
                          {gnrateur d'association rule}
                          function    getClassAssocRule(): TClassCalcAssocRule; virtual; abstract;
                          {construire les rgles}
                          procedure   BuildRules();
                          {tester les attributs}
                          function    CheckAttributes(): boolean; override;
                          {tester le max des individus}
                          function    CheckExamples(): boolean; override;
                          public
                          function    CoreExecute(): boolean; override;
                          destructor  Destroy; override;
                          function    getHTMLResultsSummary(): string; override;
                          property    Inputs: TLstAttributes read FInputs;
                          property    CalcAssocRule: TCalcAssocRule read FCalcAssocRule;
                          end;

        {paramtre de l'oprateur}
        TOpPrmAssocRule = class(TOperatorParameter)
                          private
                          {support min en fraction}
                          FMinSupport: double;
                          {confiance min en fraction}
                          FMinConfidence: double;
                          {longeur max. des rgles = card max des itemsets}
                          FMaxRuleLength: integer;
                          {LIFT min des rgles pour tre admis dans la liste des solutions}
                          FMinLift: double;
                          protected
                          procedure   SetDefaultParameters(); override;
                          public
                          procedure   LoadFromStream(prmStream: TStream); override;
                          procedure   SaveToStream(prmStream: TStream); override;
                          procedure   LoadFromINI(prmSection: string; prmINI: TMemIniFile); override;
                          procedure   SaveToINI(prmSection: string; prmINI: TMemIniFile); override;
                          property  MinSupport: double read FMinSupport write FMinSupport;
                          property  MinConfidence: double read FMinConfidence write FMinConfidence;
                          property  MaxRuleLength: integer read FMaxRuleLength write FMaxRuleLength;
                          property  MinLift: double read FMinLift write FMinLift;
                          end;

        {la classe de calcul}
        TCalcAssocRule = class(TObject)
                         protected
                         {paramtre des oprations}
                         FPrmAssocRule: TOpPrmAssocRule;
                         {les inputs}
                         FInputs: TLstAttributes;
                         {liste des items candidats}
                         FItems: TLstAssocItem;
                         {support min en absolu}
                         FAbsMinSupport: integer;
                         {liste de toutes les rgles}
                         FRules: TLstAssocRules;
                         {Nombre de transactions -- observations -- qui ont servi  la construction des rgles}
                         FNbExamples: integer;
                         {tableau de comptage des card d'itemsets}
                         FTabCardItemsets: array of integer;
                         {Structures d'itemsets frquents}
                         FStructureItemsets: TStructureItemsets;
                         {construire les items}
                         procedure   buildItems(prmExamples: TExamples); virtual;
                         {construire les itemsets de card=2, l'ide est de pouvoir filtrer  partir de cette base par la suite -- appele dans builFrequentItemset}
                         procedure   buildFrequentItemsetOfCard2(prmExamples: TExamples); virtual;
                         {construire les itemsets frquents - mthode de base, sans tests mis  part sur la base - surcharge sans inherited}
                         procedure   buildFrequentItemset(prmExamples: TExamples); virtual;
                         {construire les rgles}
                         procedure   buildRulesFromItemset(prmExamples: TExamples; prmClassAssocStructure: TClassAssocRuleStructure); virtual;
                         {initialiser les structures de calcul}
                         procedure   initializeCalcStructure(); virtual;
                         {filtrer les rgles en se basant sur le lift, la base de rgle doit tre au pralable trie pour que la procdure soit valide}
                         procedure   filteringRulesFromLift();
                         {classe de rgles produites}
                         function    classRuleStructure(): TClassAssocRuleStructure; virtual; abstract;
                         function    classLstAssocRules(): TClassLstAssocRules; virtual; abstract;
                         function    getClassCalcRuleFromItemset(): TClassCalcRuleFromItemset; virtual;
                         public
                         //constructor create(prmInputs: TLstAttributes; prmOp: TOpPrmAssocRule); virtual;
                         constructor create(AssocOperator: TOpAssocRule); virtual;
                         destructor  destroy; override;
                         procedure   buildRules(prmExamples: TExamples);
                         function    getHTMLResults(): string; virtual;
                         //new -- 10/03/2006 -- lister les rgles,  voir la surcharge plus loin
                         function    listingRulesHTML(): string; virtual;
                         //
                         property    prmAssocRule: TOpPrmAssocRule read FPrmAssocRule;
                         property    inputs: TLstAttributes read FInputs;
                         property    rules: TLstAssocRules read FRules;
                         end;

implementation

uses
        Windows, Sysutils, UStringAddBuffered, UConstConfiguration, ULogFile;

{ TOpAssocRule }

procedure TOpAssocRule.BuildRules;
begin
 FCalcAssocRule.buildRules(self.WorkData.Examples);
end;

function TOpAssocRule.CheckAttributes: boolean;
var ok: boolean;
begin
 ok:= (self.WorkData.LstAtts[asInput].Count>0) and (self.WorkData.LstAtts[asInput].isAllCategory(caDiscrete));
 //ou bien -- new -- 05/11/2004 -- tous en 0 (faux) et 1 (true)
 ok:= ok OR (self.WorkData.LstAtts[asInput].Count>0) and (self.WorkData.LstAtts[asInput].isAllCategory(caContinue));
 //rcuprer
 if ok  then FInputs:= self.WorkData.LstAtts[asInput];
 //renvoyer
 result:= ok;
end;

function TOpAssocRule.CheckExamples: boolean;
begin
 result:= (self.WorkData.Examples.Size < MAX_EXAMPLES_ASSOC_RULE);
end;

function TOpAssocRule.CoreExecute: boolean;
begin
 if assigned(FCalcAssocRule)
  then FCalcAssocRule.Free;
 //instanciation
 self.InitializeCalcAssocRule();
 //apprentissage
 result:= TRUE;
 TRY
 FCalcAssocRule.buildRules(self.WorkData.Examples);
 EXCEPT
 result:= FALSE;
 END;
end;

destructor TOpAssocRule.Destroy;
begin
  if assigned(FCalcAssocRule)
   then FreeAndNil(FCalcAssocRule);
  inherited;
end;

function TOpAssocRule.getHTMLResultsSummary: string;
begin
 result:= FCalcAssocRule.getHTMLResults();
end;

procedure TOpAssocRule.InitializeCalcAssocRule;
begin
 //FCalcAssocRule:= getClassAssocRule.create(FInputs,prmOp as TOpPrmAssocRule);
 FCalcAssocRule:= getClassAssocRule.create(self);
end;

{ TOpPrmAssocRule }

procedure TOpPrmAssocRule.LoadFromINI(prmSection: string;
  prmINI: TMemIniFile);
begin
 FMinSupport:= prmINI.ReadFloat(prmSection,'min_support',FMinSupport);
 FMinConfidence:= prmINI.ReadFloat(prmSection,'min_confidence',FMinConfidence);
 FMaxRuleLength:= prmINI.ReadInteger(prmSection,'max_rule_length',FMaxRuleLength);
 FMinLift:= prmINI.ReadFloat(prmSection,'min_lift',FMinLift);
end;

procedure TOpPrmAssocRule.LoadFromStream(prmStream: TStream);
begin
 prmStream.ReadBuffer(FMinSupport,sizeof(FMinSupport));
 prmStream.ReadBuffer(FMinConfidence,sizeof(FMinConfidence));
 prmStream.ReadBuffer(FMaxRuleLength,sizeof(FMaxRuleLength));
 prmStream.ReadBuffer(FMinLift,sizeof(FMinLift));
end;

procedure TOpPrmAssocRule.SaveToINI(prmSection: string;
  prmINI: TMemIniFile);
begin
 prmINI.WriteFloat(prmSection,'min_support',FMinSupport);
 prmINI.WriteFloat(prmSection,'min_confidence',FMinConfidence);
 prmINI.WriteInteger(prmSection,'max_rule_length',FMaxRuleLength);
 prmINI.WriteFloat(prmSection,'min_lift',FMinLift);
end;

procedure TOpPrmAssocRule.SaveToStream(prmStream: TStream);
begin
 prmStream.WriteBuffer(FMinSupport,sizeof(FMinSupport));
 prmStream.WriteBuffer(FMinConfidence,sizeof(FMinConfidence));
 prmStream.WriteBuffer(FMaxRuleLength,sizeof(FMaxRuleLength));
 prmStream.WriteBuffer(FMinLift,sizeof(FMinLift));  
end;

procedure TOpPrmAssocRule.SetDefaultParameters;
begin
 FMinSupport:= 0.33;
 FMinConfidence:= 0.75;
 FMaxRuleLength:= 4;
 FMinLift:= 1.1;
end;

{ TCalcAssocRule }

procedure TCalcAssocRule.buildFrequentItemsetOfCard2(
  prmExamples: TExamples);
var lstItemsets: TLstAssocItemSet;
    i,j: integer;
    itemset: TAssocItemSet;
    item1,item2: TAssocItem;
begin
 //les items de cardinal 2
 lstItemsets:= FStructureItemsets.AddLstAssocItemset();
 //tester le croisement
 for i:= 0 to FItems.Count-2 do
  begin
   item1:= FItems.Item[i];
   for j:= succ(i) to pred(FItems.Count) do
   begin
    item2:= FItems.Item[j];
    itemset:= TAssocItemSet.Create(item1,item2);
    //faire passer sur les individus et tester si acceptable
    //itemset.computeSupport(prmExamples);
    //avec les bit array, c'est inutile
    if (itemset.Support<FAbsMinSupport)
     then itemset.Free
     else lstItemsets.AddItemSet(itemset);
   end;
  end;
end;

procedure TCalcAssocRule.buildFrequentItemset(prmExamples: TExamples);
var lstItemsets,newLstItemsets: TLstAssocItemSet;
    itemset,newItemset: TAssocItemSet;
    item1,lastItem: TAssocItem;
    i,j: integer;
    nbFrequent: integer;
    ruleLength: integer;
begin
 nbFrequent:= FItems.Count;
 //appel de la construction de l'tage en dessous des itemsets -- new 26/07/2004
 self.buildFrequentItemsetOfCard2(prmExamples);
 ruleLength:= 2;
 //>>> si ok, on continue jusqu' ce que la liste cre soit vide
 lstItemsets:= FStructureItemSets.getLstItemsets(pred(FStructureItemSets.Count));//se brancher sur la dernire liste d'itemsets cre
 while (lstItemsets.Count>0) and (ruleLength<FPrmAssocRule.MaxRuleLength) do
  begin
   inc(nbFrequent,lstItemsets.Count);
   TraceLog.WriteToLogFile(format('CALC_ASSOC_RULE >> buildFrequentItemset (length = %d) :: %d, current memory allocated : %d',[ruleLength,lstItemsets.Count,AllocMemSize]));
   newLstItemsets:= FStructureItemsets.AddLstAssocItemset();
   //no_passaran:= 0;
   //pour chaque itemset  enrichir
   for j:= 0 to pred(lstItemsets.Count) do
    begin
     itemset:= lstItemsets.getItemset(j);
     //new - on connat le numro du dernier lment de l'itemset
     lastItem:= itemset.Last();
     //pour chaque item candidat
     //for i:= 0 to pred(FItems.Count) do
     for i:= succ(lastItem.Numero) to pred(FItems.Count) do
      begin
       item1:= FItems.Item[i];
       //le test d'inclusion devient inutile
       //if itemset.isItemNotIncluded(item1)
        //then
         //begin
          // ce stade, il est possible avec a priori de limiter la recherche en testant au pralable la possibilit de l'existence de l'itemset
          //nous, on se contente ici de la former et de la tester sur les donnes
          newItemset:= TAssocItemSet.Create(itemset,item1);
          //newItemset.computeSupport(prmExamples);
          if (newItemset.Support<FAbsMinSupport)
           then newItemset.Free
           else newLstItemsets.AddItemSet(newItemset);
         //end
        //else inc(no_passaran);
      end;
    end;
   //TraceLog.WriteToLogFile(format('no passaran = %d',[no_passaran]));
   //vider les tableaux de boolens inutiles sur les itemsets
   lstItemsets.freeBoolArray();
   //pour passer au suivant
   lstItemsets:= newLstItemsets;
   inc(ruleLength);
  end;
 //dernire info pour vrification
 TraceLog.WriteToLogFile(format('CALC_ASSOC_RULE >> buildFrequentItemset (length = %d) :: %d, current memory allocated : %d',[ruleLength,lstItemsets.Count,AllocMemSize]));
 //vider les tableaux de boolens
 lstItemsets.freeBoolArray();
 //supprimer le dernier avec lstitemsets.count = 0 s'il existe
 if (lstItemsets.Count = 0)
  then FStructureItemsets.FreeLast();
 //informer
 TraceLog.WriteToLogFile(format('nombre d_itemsets frquents : %d',[nbFrequent]));
 //rcuprer sous forme de tableau le card des itemsets
 setLength(FTabCardItemsets,FStructureItemsets.Count);
 for j:= 0 to pred(FStructureItemsets.Count) do
  FTabCardItemsets[j]:= FStructureItemsets.getLstItemsets(j).Count;
end;

procedure TCalcAssocRule.buildItems(prmExamples: TExamples);
var j: integer;
    att: TAttribute;
begin
 if assigned(FItems)
  then FItems.Free;
 //construire les items
 FItems:= TLstAssocItem.Create();
 for j:= 0 to pred(FInputs.Count) do
  begin
   att:= FInputs.Attribute[j];
   FItems.buildItems(att,prmExamples);
  end;
 //filtrer les items
 FItems.FilterItemsOnSupport(FAbsMinSupport);
 //*************************************************************
 //R.R. -- 24/10/2005 -- affichage pour valuer les items (TEST)
 //*************************************************************
 (*
 TraceLog.WriteToLogFile('>>> [Begin] TRANSACTIONS <<<');
 for i:= 0 to pred(FNbExamples) do
  begin
   s:= '';
   for j:= 0 to pred(FItems.Count) do
    if FItems.Item[j].BoolArray.Flag[i]
     then s:= s+FItems.Item[j].Description+',';
   if (length(s) > 0) then s:= copy(s,1,pred(length(s)));
   Tracelog.WriteToLogFile(s);
  end;
 TraceLog.WriteToLogFile('>>> TRANSACTIONS [End] <<<');
 *)
end;

procedure TCalcAssocRule.buildRules(prmExamples: TExamples);
//var firstMemAlloc,secondMemAlloc: Integer;
var tps: cardinal;
begin
 //firstMemAlloc:= AllocMemSize;
 FNbExamples:= prmExamples.Size;
 //support min en absolu
 FAbsMinSupport:= ROUND(1.0*FPrmAssocRule.MinSupport*prmExamples.Size);
 tps:= GetTickCount();
 self.buildItems(prmExamples);
 tps:= GetTickCount()-tps;
 TraceLog.WriteToLogFile(format('[ASSOC] :: dure <<buildItems>> = %d ms.',[tps]));
 //lancer les calculs spcifiques aux mthodes
 self.initializeCalcStructure();
 tps:= GetTickCount();
 self.buildFrequentItemset(prmExamples);
 tps:= GetTickCount()-tps;
 TraceLog.WriteToLogFile(format('[ASSOC] :: dure <<buildFrequentItemset>> = %d ms.',[tps]));
 tps:= GetTickCount();
 self.buildRulesFromItemset(prmExamples,self.classRuleStructure());
 tps:= GetTickCount()-tps;
 TraceLog.WriteToLogFile(format('[ASSOC] :: dure <<buildRulesFromItemset>> = %d ms.',[tps]));
 //secondMemAlloc:= AllocMemSize;
 //librer les itemsets ???
 if assigned(FStructureItemsets)
  then FreeAndNil(FStructureItemsets);
 //infos sur la mmoire
 //TraceLog.WriteToLogFile(format('%d mem allocated before buildRules, %d after buildrules and %d after free itemsets',[firstMemAlloc,secondMemAlloc,AllocMemSize]));
end;

procedure TCalcAssocRule.buildRulesFromItemset(prmExamples: TExamples; prmClassAssocStructure: TClassAssocRuleStructure);
var calc: TCalcRuleFromItemset;
    i,j,k: integer;
    lstItemsets: TLstAssocItemSet;
    itemset: TAssocItemset;
    rule: TAssocRuleStructure;
begin
 calc:= self.getClassCalcRuleFromItemset().Create(prmExamples.Size,FPrmAssocRule.MinConfidence,self);
 //pour chaque liste d'itemsets
 for i:= pred(FStructureItemsets.Count) downto 0 do
  begin
   lstItemsets:= FStructureItemsets.getLstItemsets(i);
   for j:= 0 to pred(lstItemsets.Count) do
    begin
     itemset:= lstItemsets.getItemset(j);
     calc.BuildRules(itemset,prmClassAssocStructure);
     //lister les rgles produites
     for k:= 0 to pred(calc.lstRules.Count) do
      begin
       rule:= calc.lstRules.Items[k] as TAssocRuleStructure;
       FRules.addRule(rule);
      end;
    end;
  end;
 //vider la procdure de calcul
 calc.Free;
 //trier les rgles
 FRules.sortRulesOn(RuleCompareOnLift);
 //infos
 TraceLog.WriteToLogFile(format('nombre de rgles gnres avant filtrage : %d',[FRules.Count]));
 //filtrage fond sur le lift
 self.filteringRulesFromLift();
 //infos (bis)
 TraceLog.WriteToLogFile(format('nombre de rgles gnres aprs filtrage sur le lift : %d',[FRules.Count]));
end;

(*
constructor TCalcAssocRule.create(prmInputs: TLstAttributes;
  prmOp: TOpPrmAssocRule);
begin
 inherited Create();
 FPrmAssocRule:= prmOp;
 FInputs:= prmInputs;
 //FRules:= TLstAssocRules.create();
 //modif. 23/05/2004
 FRules:= classLstAssocRules.create();
end;
*)

constructor TCalcAssocRule.create(AssocOperator: TOpAssocRule);
begin
 inherited Create();
 FPrmAssocRule:= AssocOperator.PrmOp as TOpPrmAssocRule;
 FInputs:= AssocOperator.Inputs;
 FRules:= classLstAssocRules.create(TRUE);
end;

destructor TCalcAssocRule.destroy;
begin
 if assigned(FItems)
  then FItems.Free;
 if assigned(FStructureItemsets)
  then FStructureItemsets.Free;
 if assigned(FRules)
  then FRules.Free;
 setLength(FTabCardItemsets,0);
 inherited destroy;
end;

procedure TCalcAssocRule.filteringRulesFromLift();
var i: integer;
    rule: TAssocRuleStructure;
begin
 for i:= pred(FRules.count) downto 0 do
  begin
   rule:= FRules.getRule(i);
   if (rule.Lift<FPrmAssocRule.MinLift)
    then FRules.deleteRule(i)
    //on peut s'arrter puis que la base de rgles est trie
    else BREAK;
  end;
end;

function TCalcAssocRule.getHTMLResults: string;
var buf: TBufString;
    j: integer;
begin
 buf:= TBufString.Create();
 buf.BeginUpdate();
 buf.AddStr('<H3>ITEMS</H3>');
 buf.AddStr(HTML_HEADER_TABLE_RESULT);

 buf.AddStr(HTML_TABLE_COLOR_DATA_GRAY+format('<TD>Transactions</TD><TD align="right">%d</TD></TR>',[FNbExamples]));

 buf.AddStr(HTML_TABLE_COLOR_HEADER_GRAY+'<TH colspan=2>Counting items</TH></TR>');
 buf.AddStr(HTML_TABLE_COLOR_DATA_GRAY+format('<TD>All items</TD><TD align=right>%d</TD></TR>',[FItems.CountFirstItems]));
 buf.AddStr(HTML_TABLE_COLOR_DATA_GRAY+format('<TD>Filtered items</TD><TD align=right>%d</TD></TR>',[FItems.Count]));

 buf.AddStr(HTML_TABLE_COLOR_HEADER_BLUE+'<TH colspan=2>Counting itemsets</TH></TR>');
 for j:= 0 to high(FTabCardItemsets) do
  buf.AddStr(HTML_TABLE_COLOR_DATA_BLUE+format('<TD> card(itemset) = %d</TD><TD align=right>%d</TD></TR>',[j+2,FTabCardItemsets[j]]));

 //nombre de rgles
 buf.AddStr(HTML_TABLE_COLOR_HEADER_GRAY+'<TH colspan=2>Rules</TH></TR>');
 buf.AddStr(HTML_TABLE_COLOR_DATA_RED+format('<TD><B>Number of rules</B></TD><TD align=right><B>%d</B></TD></TR>',[FRules.Count]));
 buf.AddStr('</table>');

 //ajouter les rgles
 buf.AddStr('<H3>RULES</H3>');
 buf.AddStr(self.listingRulesHTML());
 buf.EndUpdate();
 result:= buf.BufS;
 buf.Free;
end;

procedure TCalcAssocRule.initializeCalcStructure;
begin
 FStructureItemsets:= TStructureItemsets.Create();
end;

function TCalcAssocRule.getClassCalcRuleFromItemset: TClassCalcRuleFromItemset;
begin
 result:= TCalcRuleFromItemset;
end;

function TCalcAssocRule.listingRulesHTML: string;
begin
 result:= FRules.DescriptionHTML();
end;

end.
