#changement de dossier
import os
os.chdir("C:/Users/ricco/Desktop/demo/candisc")
#importation des données
import pandas
df = pandas.read_excel("Olive_Oil_Candisc.xlsx",sheet_name="dataset")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLASSE 569 non-null object 1 palmitic 569 non-null int64 2 palmitoleic 569 non-null int64 3 stearic 569 non-null int64 4 oleic 569 non-null int64 5 linoleic 569 non-null int64 6 linolenic 569 non-null int64 7 arachidic 569 non-null int64 8 eicosenoic 569 non-null int64 dtypes: int64(8), object(1) memory usage: 40.1+ KB
#première lignes
df.head()
CLASSE | palmitic | palmitoleic | stearic | oleic | linoleic | linolenic | arachidic | eicosenoic | |
---|---|---|---|---|---|---|---|---|---|
0 | Centre_North | 1400 | 90 | 270 | 7420 | 800 | 0 | 20 | 2 |
1 | Centre_North | 990 | 120 | 250 | 7750 | 870 | 10 | 10 | 2 |
2 | Centre_North | 1020 | 100 | 220 | 7530 | 1030 | 0 | 0 | 3 |
3 | Centre_North | 1098 | 54 | 202 | 7945 | 595 | 42 | 32 | 2 |
4 | Centre_North | 1120 | 80 | 260 | 7750 | 680 | 30 | 80 | 3 |
#classes - distribution
df.CLASSE.value_counts()
South 322 Centre_North 150 Sardinia 97 Name: CLASSE, dtype: int64
#analyse factorielle discriminante
from scientisttools.discriminant_analysis import CANDISC
#instanciation - nombre max de facteurs = min(nb var. descriptives,nb groupes-1)
candisc = CANDISC(n_components=2,
target=['CLASSE'],
features_labels=list(df.columns[1:]),
row_labels=df.index,
priors=None)
#entraînement
candisc.fit(df)
CANDISC(features_labels=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic'], n_components=2, row_labels=RangeIndex(start=0, stop=569, step=1), target=['CLASSE'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
CANDISC(features_labels=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic'], n_components=2, row_labels=RangeIndex(start=0, stop=569, step=1), target=['CLASSE'])
#résumé des résultats
from scientisttools.extractfactor import summaryCANDISC
summaryCANDISC(candisc,to_markdown=True)
Canonical Discriminant Analysis - Results Summary Information | | Total Sample Size | Variables | Classes | DF Total | DF Within Classes | DF Between Classes | |:------|--------------------:|------------:|----------:|-----------:|--------------------:|---------------------:| | value | 569 | 8 | 3 | 568 | 566 | 2 | Class Level information | | n(k) | p(k) | |:------------------|-------:|---------:| | ('South',) | 322 | 0.565905 | | ('Centre_North',) | 150 | 0.26362 | | ('Sardinia',) | 97 | 0.170475 | Importance of components | | LD1 | LD2 | |:------------------------|-------:|--------:| | Variance | 8.472 | 2.303 | | Difference | 6.168 | nan | | % of var. | 78.623 | 21.377 | | Cumulative of % of var. | 78.623 | 100 | Group means: | CLASSE | palmitic | palmitoleic | stearic | oleic | linoleic | linolenic | arachidic | eicosenoic | |:-------------|-----------:|--------------:|----------:|--------:|-----------:|------------:|------------:|-------------:| | Centre_North | 1094.83 | 83.893 | 231.04 | 7791.97 | 727.88 | 21.747 | 37.547 | 1.973 | | Sardinia | 1112.06 | 96.351 | 226.351 | 7266.91 | 1197.36 | 27.01 | 73 | 1.928 | | South | 1332.37 | 154.888 | 228.708 | 7099.53 | 1034.01 | 38.037 | 63.102 | 27.332 | Coefficients of canonical discriminants: | | LD1 | LD2 | |:------------|--------:|--------:| | palmitic | 0.003 | 0.009 | | palmitoleic | 0.013 | 0.018 | | stearic | -0.003 | 0.004 | | oleic | 0.001 | 0.006 | | linoleic | 0.001 | -0.001 | | linolenic | 0.041 | 0.006 | | arachidic | -0.017 | -0.035 | | eicosenoic | 0.163 | 0.01 | | Intercept | -13.065 | -56.917 | Classification functions coefficients: | | Centre_North | Sardinia | South | |:------------|---------------:|-----------:|--------:| | palmitic | 0.007 | -0.034 | 0.007 | | palmitoleic | -0.01 | -0.096 | 0.033 | | stearic | 0.017 | -0.003 | -0.007 | | oleic | 0.009 | -0.02 | 0.002 | | linoleic | -0.006 | -0 | 0.003 | | linolenic | -0.126 | -0.152 | 0.104 | | arachidic | -0.006 | 0.156 | -0.044 | | eicosenoic | -0.523 | -0.567 | 0.415 | | Intercept | -70.09 | 194.655 | -37.181 | Individuals (the 10 first) | | LD1 | LD2 | |---:|-------:|------:| | 0 | -3.245 | 2.894 | | 1 | -3.058 | 2.107 | | 2 | -3.184 | 0.602 | | 3 | -2.746 | 2.63 | | 4 | -3.695 | 0.503 | | 5 | -3.568 | 2.404 | | 6 | -3.185 | 3.136 | | 7 | -3.803 | 3.569 | | 8 | -2.244 | 3.061 | | 9 | -3.767 | 1.808 | Continues variables | | total.1 | between.1 | within.1 | total.2 | between.2 | within.2 | |:------------|----------:|------------:|-----------:|----------:|------------:|-----------:| | palmitic | 0.718 | 0.997 | 0.318 | -0.038 | -0.148 | -0.028 | | palmitoleic | 0.66 | 0.98 | 0.276 | -0.09 | -0.268 | -0.064 | | stearic | -0.007 | -0.161 | -0.002 | 0.049 | 0.996 | 0.027 | | oleic | -0.629 | -0.667 | -0.297 | 0.497 | 0.789 | 0.398 | | linoleic | 0.264 | 0.288 | 0.116 | -0.745 | -0.975 | -0.553 | | linolenic | 0.575 | 0.928 | 0.225 | -0.155 | -0.436 | -0.103 | | arachidic | 0.278 | 0.34 | 0.111 | -0.62 | -0.962 | -0.419 | | eicosenoic | 0.945 | 1 | 0.683 | 0.004 | -0.063 | 0.004 |
#test MANOVA - les nuages sont-ils séparables ?
#cf. le logiciel SAS
print(candisc.manova_)
Multivariate linear model ================================================================= ----------------------------------------------------------------- CLASSE Value Num DF Den DF F Value Pr > F ----------------------------------------------------------------- Wilks' lambda 0.0320 16.0000 1118.0000 320.9837 0.0000 Pillai's trace 1.5917 16.0000 1120.0000 272.8899 0.0000 Hotelling-Lawley trace 10.7752 16.0000 911.1473 375.9385 0.0000 Roy's greatest root 8.4718 8.0000 560.0000 593.0293 0.0000 =================================================================
#autre expression des tests
#cf. TANAGRA
candisc.global_performance_
Stat | Value | p-value | |
---|---|---|---|
0 | Wilks' Lambda | 0.031960 | NaN |
1 | Bartlett -- C(16) | 1936.843029 | 0.000000e+00 |
2 | Rao -- F(16,1118) | 320.983726 | 1.110223e-16 |
#tests de séparabilité en prenant les variables individuellement
#indépendamment les unes des autres
#triés de manière décroissante selon le carré du rapport de corrélation
candisc.univariate_test_statistis_.sort_values(by='R-squared',ascending=False)
Std. Dev. | R-squared | Rsq/(1-Rsq) | F-statistic | Prob (F-statistic) | |
---|---|---|---|---|---|
eicosenoic | 14.092942 | 0.797929 | 3.948758 | 1117.498522 | 2.867939e-197 |
oleic | 405.966522 | 0.526521 | 1.112025 | 314.703134 | 1.288711e-92 |
palmitic | 168.715167 | 0.461491 | 0.856978 | 242.524854 | 8.465810e-77 |
linoleic | 242.802985 | 0.449629 | 0.816955 | 231.198312 | 4.032628e-74 |
palmitoleic | 52.558263 | 0.395095 | 0.653152 | 184.841942 | 1.650063e-62 |
arachidic | 22.033442 | 0.337110 | 0.508547 | 143.918675 | 2.936859e-51 |
linolenic | 12.985725 | 0.312278 | 0.454076 | 128.503464 | 9.724383e-47 |
stearic | 36.795195 | 0.001728 | 0.001731 | 0.489942 | 6.129213e-01 |
#représentation graphique dans le plan
from scientisttools.pyplot import plotCANDISC
import matplotlib.pyplot as plt
fig, axe =plt.subplots(figsize=(8,5))
plotCANDISC(candisc,color=['skyblue','chartreuse','orange'],marker=['.','.','.'],ax=axe)
plt.show()
#coefficients des équations de projection
from scientisttools.extractfactor import get_candisc_coef
coef = get_candisc_coef(candisc,choice="absolute")
coef
LD1 | LD2 | |
---|---|---|
palmitic | 0.002760 | 0.008854 |
palmitoleic | 0.013092 | 0.018422 |
stearic | -0.002794 | 0.004253 |
oleic | 0.000626 | 0.006239 |
linoleic | 0.001131 | -0.001260 |
linolenic | 0.041077 | 0.005822 |
arachidic | -0.017345 | -0.034675 |
eicosenoic | 0.163051 | 0.010059 |
Intercept | -13.064616 | -56.916913 |
#test des facteurs
#c.-à-d. le facteur concerné et les suivants
#présentent-ils des rapports de corrélation nuls ?
candisc.likelihood_test_
statistic | DDL num. | DDL den. | Pr>F | |
---|---|---|---|---|
0 | 320.983726 | 16.0 | 1118.0 | 1.110223e-16 |
1 | 184.272085 | 7.0 | 560.0 | 1.110223e-16 |
#barycentres conditionnels
candisc.gcenter_
LD1 | LD2 | |
---|---|---|
CLASSE | ||
Centre_North | -3.322164 | 1.843764 |
Sardinia | -3.302678 | -2.860687 |
South | 2.542498 | 0.002863 |
#faire apparaître les moyennes conditionnelles dans le graphique
fig, axe =plt.subplots(figsize=(8,5))
plotCANDISC(candisc,color=['skyblue','chartreuse','orange'],marker=['.','.','.'],ax=axe)
plt.scatter(candisc.gcenter_.LD1,candisc.gcenter_.LD2,c=['blue','green','red'],marker='X',s=90)
plt.show()
#distance (au carré) entre les centres de classes
candisc.squared_mdist_
Centre_North | Sardinia | South | |
---|---|---|---|
Centre_North | 0.000000 | 22.132240 | 37.783184 |
Sardinia | 22.132240 | 0.000000 | 42.366007 |
South | 37.783184 | 42.366007 | 0.000000 |
#corrélation (totales) des variables avec les facteurs
candisc.tcorr_
LD1 | LD2 | |
---|---|---|
palmitic | 0.717538 | -0.037627 |
palmitoleic | 0.659883 | -0.089794 |
stearic | -0.007083 | 0.049135 |
oleic | -0.629206 | 0.497264 |
linoleic | 0.264460 | -0.745062 |
linolenic | 0.574829 | -0.154918 |
arachidic | 0.278118 | -0.619875 |
eicosenoic | 0.944513 | 0.003556 |
#cercle des corrélations
#direction des variables par rapport aux facteurs
fig, axe =plt.subplots(figsize=(6,6))
axe.axis([-1.0,+1.0,-1.0,+1.0])
axe.plot([-1.0,+1.0],[0,0],color='gainsboro',linestyle='--')
axe.plot([0,0],[-1.0,+1.0],color='gainsboro',linestyle='--')
axe.add_artist(plt.Circle((0,0),radius=1.0,
facecolor='khaki',edgecolor='silver',fill=False))
plt.scatter(candisc.tcorr_.LD1,candisc.tcorr_.LD2,alpha=0.2)
for i in range(candisc.tcorr_.shape[0]):
axe.text(candisc.tcorr_.LD1[i],candisc.tcorr_.LD2[i],candisc.tcorr_.index[i],
horizontalalignment='center',verticalalignment='center',fontsize=7)
plt.arrow(0,0,candisc.tcorr_.LD1[i]*0.85,candisc.tcorr_.LD2[i]*0.85,width=0.0005,head_width=0.02,color='grey')
plt.show()
#vérification des moyennes conditionnelles
gb = df.groupby('CLASSE')
gb.mean()
palmitic | palmitoleic | stearic | oleic | linoleic | linolenic | arachidic | eicosenoic | |
---|---|---|---|---|---|---|---|---|
CLASSE | ||||||||
Centre_North | 1094.833333 | 83.893333 | 231.040000 | 7791.973333 | 727.880000 | 21.746667 | 37.546667 | 1.973333 |
Sardinia | 1112.061856 | 96.350515 | 226.350515 | 7266.907216 | 1197.360825 | 27.010309 | 73.000000 | 1.927835 |
South | 1332.369565 | 154.888199 | 228.708075 | 7099.531056 | 1034.009317 | 38.037267 | 63.102484 | 27.332298 |
#corrélation (inter-groupes) des variables avec les facteurs
#indique le rôle que joue la variable dans l'écartement des groupes
#pour le facteur étudié
candisc.bcorr_
LD1 | LD2 | |
---|---|---|
palmitic | 0.996819 | -0.148078 |
palmitoleic | 0.979502 | -0.268340 |
stearic | -0.160870 | 0.995704 |
oleic | -0.666629 | 0.789483 |
linoleic | 0.287779 | -0.975225 |
linolenic | 0.927928 | -0.435710 |
arachidic | 0.340070 | -0.961566 |
eicosenoic | 0.999983 | -0.063032 |
#importation
XSupp = pandas.read_excel("Olive_Oil_Candisc.xlsx",sheet_name="supplementaires")
XSupp
palmitic | palmitoleic | stearic | oleic | linoleic | linolenic | arachidic | eicosenoic | |
---|---|---|---|---|---|---|---|---|
0 | 1306 | 127 | 250 | 7554 | 869 | 47 | 68 | 16 |
#calcul des coordonnées factorielles
coordSupp = candisc.transform(XSupp)
coordSupp
LD1 | LD2 | |
---|---|---|
0 | 0.578707 | 2.157809 |
#rajouter dans le graphique
fig, axe =plt.subplots(figsize=(8,5))
plotCANDISC(candisc,color=['skyblue','chartreuse','orange'],marker=['.','.','.'],ax=axe)
plt.scatter(candisc.gcenter_.LD1,candisc.gcenter_.LD2,c=['blue','green','red'],marker='X',s=90)
plt.scatter(coordSupp.LD1,coordSupp.LD2,marker='o',c='black',s=30)
#matéraliser l'écartement par rapport aux centroïdes
candisc.gcenter_.apply(axis=1,
func=lambda x:plt.plot([coordSupp.values[0,0],x.values[0]],[coordSupp.values[0,1],x.values[1]],
c='grey',linestyle=':'))
plt.show()
Calcul de la distance aux barycentres.
#la plus proche au sens du barycentre conditionnel
#rappel - affichage des moyennes conditionnelles
candisc.gcenter_
LD1 | LD2 | |
---|---|---|
CLASSE | ||
Centre_North | -3.322164 | 1.843764 |
Sardinia | -3.302678 | -2.860687 |
South | 2.542498 | 0.002863 |
#distance aux barycentres conditionnels
import numpy
dist2 = candisc.gcenter_.apply(axis=1,func=lambda x:numpy.sum((x.values-coordSupp.values[0,:])**2))
numpy.sqrt(dist2)
CLASSE Centre_North 3.913492 Sardinia 6.344325 South 2.915522 dtype: float64
Conversion en probabilité d'affectation.
#fréquence des classes
candisc.class_level_information_[['p(k)']].sort_index()
p(k) | |
---|---|
CLASSE | |
Centre_North | 0.263620 |
Sardinia | 0.170475 |
South | 0.565905 |
#distance carrée généralisée
distGen = dist2.values - 2.0 * numpy.log(candisc.class_level_information_[['p(k)']].sort_index().values.reshape(-1))
distGen
array([17.98191075, 43.78879377, 9.63892755])
#proba d'affectation - softmax
proba = numpy.exp(-0.5*distGen)/numpy.sum(numpy.exp(-0.5*distGen))
proba
array([1.51947843e-02, 3.78269298e-08, 9.84805178e-01])
Vérification avec la probabilité fournie par "scientisttools".
#proba forunie par scientisttols
#complètement cohérente !!!
candisc.predict_proba(XSupp)
Centre_North | Sardinia | South | |
---|---|---|---|
0 | 0.015195 | 3.782693e-08 | 0.984805 |