In [2]:
# xlwings is used in the cells below to exchange DataFrames with Excel;
# display the installed version for reference
import xlwings as xw
xw.__version__
Out[2]:
'0.33.18'
In [3]:
# Load the Iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris
data = load_iris(as_frame=True)
# as_frame=True exposes the whole table (features + target) under "frame"
df_iris = data["frame"]
df_iris.head()
Out[3]:
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | target | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
In [4]:
# column dtypes and non-null counts
df_iris.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal length (cm) 150 non-null float64 1 sepal width (cm) 150 non-null float64 2 petal length (cm) 150 non-null float64 3 petal width (cm) 150 non-null float64 4 target 150 non-null int64 dtypes: float64(4), int64(1) memory usage: 6.0 KB
In [5]:
# inspect the data in Excel
# the DataFrame is sent through xlwings
# a new workbook is created
xw.view(df_iris)
In [ ]:
# fetch the "AVC" dataset
# from the workbook currently open in Excel
# caution: the first column is not an index
# NOTE(review): no workbook or range is passed, so what gets read depends on
# Excel's current state - confirm the AVC workbook is the active one before running
df = xw.load(index=False)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 200 non-null float64 1 hypertension 200 non-null float64 2 diabetes 200 non-null float64 3 bmi 200 non-null float64 4 smoking 200 non-null float64 5 avc 200 non-null object dtypes: float64(5), object(1) memory usage: 9.5+ KB
In [7]:
# first rows
df.head()
Out[7]:
| age | hypertension | diabetes | bmi | smoking | avc | |
|---|---|---|---|---|---|---|
| 0 | 60.0 | 0.0 | 1.0 | 23.8 | 0.0 | no |
| 1 | 53.0 | 0.0 | 0.0 | 24.7 | 0.0 | no |
| 2 | 62.0 | 1.0 | 1.0 | 24.3 | 0.0 | yes |
| 3 | 73.0 | 1.0 | 0.0 | 24.3 | 1.0 | yes |
| 4 | 52.0 | 1.0 | 0.0 | 24.1 | 0.0 | yes |
In [8]:
# Linear discriminant analysis: every column except the last is a predictor,
# the "avc" column is the class label.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis().fit(df.iloc[:, :-1], df["avc"])
# fitted coefficients
lda.coef_
Out[8]:
array([[0.10881118, 0.54667029, 1.47561449, 0.08543448, 0.55774535]])
In [9]:
# intercept term of the fitted LDA model
lda.intercept_
Out[9]:
array([-10.52693045])
In [10]:
import pandas
# One-column table of the LDA coefficients, labelled with the predictor names
# (all columns of df except the last one).
df_coefs = pandas.DataFrame(
    {"coefficients": lda.coef_[0]},
    index=df.columns[:-1],
)
df_coefs
Out[10]:
| coefficients | |
|---|---|
| age | 0.108811 |
| hypertension | 0.546670 |
| diabetes | 1.475614 |
| bmi | 0.085434 |
| smoking | 0.557745 |
In [11]:
# append the intercept as an extra row labelled 'intercept_'
# (this modifies df_coefs in place)
df_coefs.loc['intercept_'] = lda.intercept_
df_coefs
Out[11]:
| coefficients | |
|---|---|
| age | 0.108811 |
| hypertension | 0.546670 |
| diabetes | 1.475614 |
| bmi | 0.085434 |
| smoking | 0.557745 |
| intercept_ | -10.526930 |
In [12]:
# send it to a new Excel workbook
# for post-processing
xw.view(df_coefs)
In [13]:
# decision-function scores, shown for the first 10 rows only
lda.decision_function(df.iloc[:,:-1])[:10]
Out[13]:
array([-0.48930481, -2.64970651, 0.31770507, 0.59675889, -2.26310808,
-0.5909943 , 1.9418004 , 0.27913436, -1.99762247, -0.29505916])
In [14]:
# Confusion matrix as a sanity check: observed classes in rows,
# predicted classes in columns.
pandas.crosstab(
    index=df["avc"],
    columns=lda.predict(df.iloc[:, :-1]),
)
Out[14]:
| col_0 | no | yes |
|---|---|---|
| avc | ||
| no | 127 | 13 |
| yes | 21 | 39 |