In [2]:
# xlwings is used further down to exchange DataFrames with Excel
# (xw.view / xw.load); display the installed version for reference.
import xlwings as xw
xw.__version__
Out[2]:
'0.33.18'
In [3]:
# Load the Iris dataset that ships with scikit-learn.
# as_frame=True is requested so that the returned object carries a pandas
# DataFrame (the four measurements plus the "target" column).
from sklearn.datasets import load_iris

data = load_iris(as_frame=True)
df_iris = data["frame"]

# Preview the first five rows
df_iris.head(n=5)
Out[3]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
In [4]:
# Column dtypes and non-null counts of the Iris frame
df_iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
In [5]:
# View the data in Excel:
# the DataFrame is sent through xlwings,
# and a new workbook is created to hold it
xw.view(df_iris)
In [ ]:
# Load the "AVC" dataset from the workbook currently open in Excel.
# NOTE: this cell depends on external state -- xw.load reads the selected
# cell(s) of the active workbook -- so the right workbook must be open and
# active before running it.
# index=False: the first column is a regular variable, not an index.
df = xw.load(index=False)

# Fail fast if the wrong workbook/range was picked up: the cells below use
# df.avc as the class label and every column before the last as predictors.
assert len(df.columns) >= 2 and df.columns[-1] == "avc", (
    f"expected 'avc' as the last column, got columns {list(df.columns)}"
)

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           200 non-null    float64
 1   hypertension  200 non-null    float64
 2   diabetes      200 non-null    float64
 3   bmi           200 non-null    float64
 4   smoking       200 non-null    float64
 5   avc           200 non-null    object 
dtypes: float64(5), object(1)
memory usage: 9.5+ KB
In [7]:
# First rows of the data loaded from Excel
df.head()
Out[7]:
age hypertension diabetes bmi smoking avc
0 60.0 0.0 1.0 23.8 0.0 no
1 53.0 0.0 0.0 24.7 0.0 no
2 62.0 1.0 1.0 24.3 0.0 yes
3 73.0 1.0 0.0 24.3 1.0 yes
4 52.0 1.0 0.0 24.1 0.0 yes
In [8]:
# Linear discriminant analysis:
# predictors are all columns except the last one, the class label is "avc"
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis().fit(df.iloc[:, :-1], df["avc"])

# Coefficients of the fitted discriminant function (one per predictor)
lda.coef_
Out[8]:
array([[0.10881118, 0.54667029, 1.47561449, 0.08543448, 0.55774535]])
In [9]:
# Intercept (constant term) of the fitted discriminant function
lda.intercept_
Out[9]:
array([-10.52693045])
In [10]:
import pandas

# One-column table of the LDA coefficients, labelled with the predictor names
# (coef_ is 2-D with a single row here, hence the [0]).
df_coefs = pandas.DataFrame(
    {"coefficients": lda.coef_[0]},
    index=df.columns[:-1],
)
df_coefs
Out[10]:
coefficients
age 0.108811
hypertension 0.546670
diabetes 1.475614
bmi 0.085434
smoking 0.557745
In [11]:
# Append the intercept as an extra row labelled "intercept_"
# (.loc with a label that is not yet in the index adds the row in place;
# re-running the cell just overwrites that same row).
df_coefs.loc["intercept_"] = lda.intercept_[0]
df_coefs
Out[11]:
coefficients
age 0.108811
hypertension 0.546670
diabetes 1.475614
bmi 0.085434
smoking 0.557745
intercept_ -10.526930
In [12]:
# Send the coefficient table to a new Excel workbook
# for post-processing
xw.view(df_coefs)
In [13]:
# Decision function evaluated on the predictor columns;
# only the first 10 values are displayed
lda.decision_function(df.iloc[:,:-1])[:10]
Out[13]:
array([-0.48930481, -2.64970651,  0.31770507,  0.59675889, -2.26310808,
       -0.5909943 ,  1.9418004 ,  0.27913436, -1.99762247, -0.29505916])
In [14]:
# Confusion matrix as a sanity check:
# rows are the observed "avc" labels, columns are the labels predicted by the
# model on the same data it was fitted on.
pandas.crosstab(index=df["avc"], columns=lda.predict(df.iloc[:, :-1]))
Out[14]:
col_0 no yes
avc
no 127 13
yes 21 39