Importation et préparation des données¶

In [ ]:
#dependencies used by this cell
import os
import pandas

#work from the folder that holds the demo files
os.chdir("C:/Users/ricco/Desktop/demo")

#load the workbook (resolved against the working directory set above)
#and print a per-column summary: names, non-null counts, dtypes
df = pandas.read_excel(io="breast.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   clump       699 non-null    int64 
 1   ucellsize   699 non-null    int64 
 2   ucellshape  699 non-null    int64 
 3   mgadhesion  699 non-null    int64 
 4   sepics      699 non-null    int64 
 5   bnuclei     699 non-null    int64 
 6   bchromatin  699 non-null    int64 
 7   normnucl    699 non-null    int64 
 8   mitoses     699 non-null    int64 
 9   target      699 non-null    object
dtypes: int64(9), object(1)
memory usage: 54.7+ KB
In [ ]:
#class distribution: number of rows per value of the target column
df["target"].value_counts()
Out[ ]:
begnin       458
malignant    241
Name: target, dtype: int64
In [ ]:
#train-test partition
from sklearn.model_selection import train_test_split

#60% of the rows go to the training set; the split is stratified on the
#target and seeded (random_state=0) so that it is reproducible
dfTrain, dfTest = train_test_split(
    df,
    train_size=0.6,
    random_state=0,
    stratify=df["target"],
)

#dimensions of both subsets, one per line
print(dfTrain.shape, dfTest.shape, sep="\n")
(419, 10)
(280, 10)

Classification supervisée¶

Régression logistique¶

In [ ]:
#logistic regression
from sklearn.linear_model import LogisticRegression

#predictors = every column but the last one, response = the target column
#(fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression(random_state=0).fit(dfTrain.iloc[:, :-1], dfTrain["target"])

#coefficients (first row of coef_), labelled with the predictor names
pandas.DataFrame({'Coef. LR': lr.coef_[0]}, index=df.columns[:-1])
Out[ ]:
Coef. LR
clump 0.538170
ucellsize 0.148067
ucellshape 0.299107
mgadhesion 0.050552
sepics 0.187089
bnuclei 0.655944
bchromatin 0.239245
normnucl 0.279688
mitoses 0.681851
In [ ]:
#intercept (constant term) of the fitted logistic regression
lr.intercept_
Out[ ]:
array([-10.79086603])
In [ ]:
#test-set performance: score of the logistic regression on the held-out rows
#(predictors = every column but the last one, reference labels = target column)
lr.score(X=dfTest.iloc[:, :-1], y=dfTest["target"])
Out[ ]:
0.95

Analyse discriminante¶

In [ ]:
#linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#same predictors / response layout as above: all columns but the last vs. target
#(fit returns the estimator itself, so construction and fitting are chained)
adl = LinearDiscriminantAnalysis().fit(dfTrain.iloc[:, :-1], dfTrain["target"])

#coefficients (first row of coef_), labelled with the predictor names
pandas.DataFrame({'Coef. ADL': adl.coef_[0]}, index=df.columns[:-1])
Out[ ]:
Coef. ADL
clump 1.017883
ucellsize 0.932632
ucellshape 0.396092
mgadhesion -0.025529
sepics 0.121232
bnuclei 1.549885
bchromatin 0.476761
normnucl 0.656702
mitoses 0.386724
In [ ]:
#intercept (constant term) of the fitted discriminant analysis
adl.intercept_
Out[ ]:
array([-23.51163489])
In [ ]:
#test-set performance: score of the discriminant analysis on the held-out rows
#(predictors = every column but the last one, reference labels = target column)
adl.score(X=dfTest.iloc[:, :-1], y=dfTest["target"])
Out[ ]:
0.9464285714285714
In [ ]:
#comparison of the LDA and logistic-regression coefficients:
#element-wise ratio LR / LDA, one value per predictor
lr.coef_[0]/adl.coef_[0]
Out[ ]:
array([ 0.52871535,  0.15876256,  0.75514554, -1.98018256,  1.54323453,
        0.42322136,  0.50181235,  0.42589748,  1.76314779])

Régression linéaire¶

In [ ]:
#needed to code the target for the regression:
#relative frequency of each class in the training set
freq = dfTrain["target"].value_counts(normalize=True)
print(freq)
begnin       0.656325
malignant    0.343675
Name: target, dtype: float64
In [ ]:
#quantitative coding of the target, driven by the class frequencies:
#  rows labelled 'begnin' -> +freq['malignant']
#  every other row        -> -freq['begnin']
import numpy
zTrain = numpy.where(dfTrain.target=='begnin',freq['malignant'],-1.0*freq['begnin'])

#check: the mean must be zero (up to floating-point error)
#enforced with an assertion instead of relying on reading the displayed value;
#this also fails if the target holds a label other than the two coded above,
#since the coding would then no longer be centred
assert numpy.isclose(numpy.mean(zTrain), 0.0), "recoded target is not centred on zero"
numpy.mean(zTrain)
Out[ ]:
9.53890904212545e-18
In [ ]:
#multiple linear regression on the recoded (numeric) target
from sklearn.linear_model import LinearRegression

#predictors = every column but the last one, response = zTrain
#(fit returns the estimator itself, so construction and fitting are chained)
linreg = LinearRegression().fit(dfTrain.iloc[:, :-1], zTrain)

#coefficients, labelled with the predictor names
pandas.DataFrame({'Coef. LinReg': linreg.coef_}, index=df.columns[:-1])
Out[ ]:
Coef. LinReg
clump -0.032408
ucellsize -0.029694
ucellshape -0.012611
mgadhesion 0.000813
sepics -0.003860
bnuclei -0.049347
bchromatin -0.015180
normnucl -0.020909
mitoses -0.012313
In [ ]:
#intercept (constant term) of the fitted linear regression
linreg.intercept_
Out[ ]:
0.5936269896039255
In [ ]:
#ratio between the LDA coefficients and the linear-regression ones
#the equivalence with LDA is evident: the ratio is the same constant
#for every predictor
numpy.divide(adl.coef_[0], linreg.coef_)
Out[ ]:
array([-31.40791062, -31.40791062, -31.40791062, -31.40791062,
       -31.40791062, -31.40791062, -31.40791062, -31.40791062,
       -31.40791062])
In [ ]:
#prediction on the test set - quantitative target
predLinreg = linreg.predict(X=dfTest.iloc[:, :-1])

#preview of the first ten predicted values
predLinreg[0:10]
Out[ ]:
array([ 0.28667293, -0.57621614, -0.49859713,  0.3057125 , -0.86592343,
        0.318601  ,  0.33426109, -0.42733103,  0.25492959,  0.35148993])
In [ ]:
#prediction - conversion back to a qualitative target
#threshold is 0 because z was coded so that mean(z) = 0
#values >= 0 map to 'begnin', everything else to 'malignant'
classLinreg = numpy.where(numpy.greater_equal(predLinreg, 0), 'begnin', 'malignant')

#preview of the first ten predicted labels
classLinreg[0:10]
Out[ ]:
array(['begnin', 'malignant', 'malignant', 'begnin', 'malignant',
       'begnin', 'begnin', 'malignant', 'begnin', 'begnin'], dtype='<U9')
In [ ]:
#score - test-set performance:
#share of test rows whose predicted label equals the observed target
numpy.mean(dfTest["target"] == classLinreg)
Out[ ]:
0.9535714285714286