# change the working directory to the demo folder
# (relative paths used afterwards are resolved against this directory)
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# load the data from the Excel workbook into a DataFrame
import pandas
df = pandas.read_excel("breast.xlsx")
# print a summary of the frame: column names, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 699 entries, 0 to 698 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 clump 699 non-null int64 1 ucellsize 699 non-null int64 2 ucellshape 699 non-null int64 3 mgadhesion 699 non-null int64 4 sepics 699 non-null int64 5 bnuclei 699 non-null int64 6 bchromatin 699 non-null int64 7 normnucl 699 non-null int64 8 mitoses 699 non-null int64 9 target 699 non-null object dtypes: int64(9), object(1) memory usage: 54.7+ KB
# class distribution: number of rows per value of the 'target' column
df["target"].value_counts()
begnin 458 malignant 241 Name: target, dtype: int64
# train/test partition: 60% of the rows go to the training set,
# stratified on the class so both subsets keep the class proportions
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(
    df,
    train_size=0.6,
    random_state=0,
    stratify=df["target"],
)
# dimensions of the two subsets (training first, then test)
for subset in (dfTrain, dfTest):
    print(subset.shape)
(419, 10) (280, 10)
# logistic regression fitted on the training set
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
# predictors = every column except the last one; class label = 'target' column
XTrain = dfTrain.iloc[:, :-1]
yTrain = dfTrain["target"]
lr.fit(XTrain, yTrain)
# logistic regression coefficients, one row per predictor
pandas.DataFrame({'Coef. LR': lr.coef_[0]}, index=df.columns[:-1])
| | Coef. LR |
---|---|
clump | 0.538170 |
ucellsize | 0.148067 |
ucellshape | 0.299107 |
mgadhesion | 0.050552 |
sepics | 0.187089 |
bnuclei | 0.655944 |
bchromatin | 0.239245 |
normnucl | 0.279688 |
mitoses | 0.681851 |
# intercept (constant term) of the logistic regression
lr.intercept_
array([-10.79086603])
# performance on the test set, as returned by the estimator's score method
XTest = dfTest.iloc[:, :-1]
yTest = dfTest["target"]
lr.score(XTest, yTest)
0.95
# linear discriminant analysis fitted on the same training set
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
adl = LinearDiscriminantAnalysis()
# predictors = every column except the last one; class label = 'target' column
XTrain = dfTrain.iloc[:, :-1]
yTrain = dfTrain["target"]
adl.fit(XTrain, yTrain)
# discriminant analysis coefficients, one row per predictor
pandas.DataFrame({'Coef. ADL': adl.coef_[0]}, index=df.columns[:-1])
| | Coef. ADL |
---|---|
clump | 1.017883 |
ucellsize | 0.932632 |
ucellshape | 0.396092 |
mgadhesion | -0.025529 |
sepics | 0.121232 |
bnuclei | 1.549885 |
bchromatin | 0.476761 |
normnucl | 0.656702 |
mitoses | 0.386724 |
# intercept (constant term) of the linear discriminant analysis
adl.intercept_
array([-23.51163489])
# performance on the test set, as returned by the estimator's score method
XTest = dfTest.iloc[:, :-1]
yTest = dfTest["target"]
adl.score(XTest, yTest)
0.9464285714285714
# compare the LDA and logistic regression coefficients:
# element-wise ratio LR / LDA, one value per predictor
# (the ratios differ from one predictor to the next, so the two
# coefficient vectors are not proportional to each other)
lr.coef_[0]/adl.coef_[0]
array([ 0.52871535, 0.15876256, 0.75514554, -1.98018256, 1.54323453, 0.42322136, 0.50181235, 0.42589748, 1.76314779])
# preparation of the target coding used by the regression:
# relative class frequencies measured on the training set
freq = dfTrain["target"].value_counts(normalize=True)
print(freq)
begnin 0.656325 malignant 0.343675 Name: target, dtype: float64
# quantitative coding of the target, driven by the class frequencies:
# 'begnin' rows receive freq['malignant'], every other row receives -freq['begnin']
import numpy
isBenign = dfTrain["target"] == 'begnin'
zTrain = numpy.where(isBenign, freq['malignant'], -1.0 * freq['begnin'])
# sanity check: the mean of the recoded target must be zero
# (up to floating-point rounding)
zTrain.mean()
9.53890904212545e-18
# multiple linear regression on the recoded (quantitative) target
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
# predictors = every column except the last one; response = recoded target zTrain
XTrain = dfTrain.iloc[:, :-1]
linreg.fit(XTrain, zTrain)
# linear regression coefficients, one row per predictor
pandas.DataFrame({'Coef. LinReg': linreg.coef_}, index=df.columns[:-1])
| | Coef. LinReg |
---|---|
clump | -0.032408 |
ucellsize | -0.029694 |
ucellshape | -0.012611 |
mgadhesion | 0.000813 |
sepics | -0.003860 |
bnuclei | -0.049347 |
bchromatin | -0.015180 |
normnucl | -0.020909 |
mitoses | -0.012313 |
# intercept (constant term) of the linear regression
linreg.intercept_
0.5936269896039255
# element-wise ratio between the LDA and the linear regression coefficients
# the equivalence with LDA is evident: the ratio is the same constant for
# every predictor, i.e. the two coefficient vectors are proportional
adl.coef_[0]/linreg.coef_
array([-31.40791062, -31.40791062, -31.40791062, -31.40791062, -31.40791062, -31.40791062, -31.40791062, -31.40791062, -31.40791062])
# prediction on the test set - quantitative target
XTest = dfTest.iloc[:, :-1]
predLinreg = linreg.predict(XTest)
# first ten predicted values
predLinreg[:10]
array([ 0.28667293, -0.57621614, -0.49859713, 0.3057125 , -0.86592343, 0.318601 , 0.33426109, -0.42733103, 0.25492959, 0.35148993])
# prediction - conversion back to a qualitative target
# the threshold is 0 because z was coded so that mean(z) = 0
isBenignPred = predLinreg >= 0
classLinreg = numpy.where(isBenignPred, 'begnin', 'malignant')
# first ten predicted classes
classLinreg[:10]
array(['begnin', 'malignant', 'malignant', 'begnin', 'malignant', 'begnin', 'begnin', 'malignant', 'begnin', 'begnin'], dtype='<U9')
# score - test-set performance: share of test rows whose predicted
# class matches the observed class
(dfTest["target"] == classLinreg).mean()
0.9535714285714286