Configuration, vérification

# Load the reticulate library (R <-> Python bridge)
library(reticulate)

# Connect to Python: Anaconda "base" environment.
# NOTE(review): use_python() is documented as taking the path to a python
# binary; here it is given the installation directory. It resolved correctly
# (see py_config() output below), but "C:/Users/ricco/anaconda3/python.exe"
# would match the documented usage — confirm on another machine.
reticulate::use_python("C:/Users/ricco/anaconda3")

# Check which Python interpreter / version reticulate actually picked up
reticulate::py_config()
## python:         C:/Users/ricco/anaconda3/python.exe
## libpython:      C:/Users/ricco/anaconda3/python312.dll
## pythonhome:     C:/Users/ricco/anaconda3
## version:        3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/ricco/anaconda3/Lib/site-packages/numpy
## numpy_version:  1.26.4
## 
## NOTE: Python version was forced by use_python() function
# Sanity check: evaluate a Python expression from R (3**3 is Python syntax)
reticulate::py_eval("3**3")
## [1] 27

Traitements sous R

Chargement des données

# Import the dataset (Excel file expected in the current working directory)
library(readxl)
df <- readxl::read_excel("heart_disease_male.xlsx")
# Structure: 209 rows, 6 mixed-type predictors + the "disease" target
str(df)
## tibble [209 × 7] (S3: tbl_df/tbl/data.frame)
##  $ age            : num [1:209] 43 39 39 42 49 50 59 54 59 56 ...
##  $ chest_pain     : chr [1:209] "asympt" "atyp_angina" "non_anginal" "non_anginal" ...
##  $ rest_bpress    : num [1:209] 140 120 160 160 140 140 140 200 130 170 ...
##  $ blood_sugar    : chr [1:209] "f" "f" "t" "f" ...
##  $ max_heart_rate : num [1:209] 135 160 160 146 130 135 119 142 125 122 ...
##  $ exercice_angina: chr [1:209] "yes" "yes" "no" "no" ...
##  $ disease        : chr [1:209] "positive" "negative" "negative" "negative" ...

Analyse factorielle des données mixtes sous R

# Isolate the descriptors: keep the 6 predictor columns,
# dropping the "disease" target (column 7)
X <- df[1:6]
colnames(X)
## [1] "age"             "chest_pain"      "rest_bpress"     "blood_sugar"    
## [5] "max_heart_rate"  "exercice_angina"
# Factor Analysis of Mixed Data (handles numeric + categorical predictors
# together); keep the first 2 components (ncp = 2), defer plots (graph = FALSE)
library(FactoMineR)
res <- FactoMineR::FAMD(X,ncp=2,graph=FALSE)

# Eigenvalues: the 2 retained components carry ~42.9% of total variance
print(res$eig)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1   2.311369               28.89211                          28.89211
## comp 2   1.119654               13.99568                          42.88779
# Coordinates of the individuals on the first two factorial axes
head(res$ind$coord)
##        Dim.1      Dim.2
## 1  0.9535241 -1.0043685
## 2 -1.0365063 -0.9633966
## 3 -0.5403707  3.2596526
## 4 -0.8021856  1.0455921
## 5  0.3646007 -0.2912144
## 6  0.3096903 -0.2311542
# Scatter plot of the individuals projected onto the first factorial plane
plot(res$ind$coord)

# Collect the factorial coordinates into a plain data frame
# (this is the object Python will read back through reticulate)
factCoord <- as.data.frame(res$ind$coord)
head(factCoord)
# Append the class label as a third column
factCoord$disease <- df$disease
head(factCoord)

Régression logistique sous Python

# Fetch the R data frame from the R session via reticulate's `r` object;
# it arrives as a pandas DataFrame
dfR = r.factCoord
print(type(dfR))
## <class 'pandas.core.frame.DataFrame'>
# Column overview: 2 float factorial coordinates + the "disease" label
dfR.info()
## <class 'pandas.core.frame.DataFrame'>
## Index: 209 entries, 1 to 209
## Data columns (total 3 columns):
##  #   Column   Non-Null Count  Dtype  
## ---  ------   --------------  -----  
##  0   Dim.1    209 non-null    float64
##  1   Dim.2    209 non-null    float64
##  2   disease  209 non-null    object 
## dtypes: float64(2), object(1)
## memory usage: 6.5+ KB
# First rows — matches head(factCoord) on the R side
dfR.head()
##       Dim.1     Dim.2   disease
## 1  0.953524 -1.004369  positive
## 2 -1.036506 -0.963397  negative
## 3 -0.540371  3.259653  negative
## 4 -0.802186  1.045592  negative
## 5  0.364601 -0.291214  negative
# Logistic regression (scikit-learn, default hyperparameters)
from sklearn.linear_model import LogisticRegression
modele = LogisticRegression()

# Fit on the 2 factorial coordinates (all columns except the last),
# target = the "disease" column
modele.fit(dfR.iloc[:,:-1],dfR.disease)
## LogisticRegression()
## (HTML representation of the fitted estimator — shown in Jupyter/nbviewer —
## omitted in this text export.)
# Coefficients of the decision function (one per factorial axis)
modele.coef_
## array([[ 0.9742213 , -0.36294673]])
# And the intercept
modele.intercept_
## array([-0.32582659])
# Pack coefficients and intercept into a single numpy vector,
# layout: [coef_Dim1, coef_Dim2, intercept] — R reads this back below
import numpy
coefs = numpy.concatenate((modele.coef_[0],modele.intercept_))
print(coefs)
## [ 0.9742213  -0.36294673 -0.32582659]

Graphique dans R

# Retrieve the coefficient vector from the Python session (reticulate's py$);
# RCoefs = [coef_Dim1, coef_Dim2, intercept] (1-based indexing in R)
RCoefs <- py$coefs
print(RCoefs)
## [1]  0.9742213 -0.3629467 -0.3258266
# Decision boundary coef1*x + coef2*y + intercept = 0, rewritten as
# y = a + b*x : a = -intercept/coef2 (origin), b = -coef1/coef2 (slope)
a <- -RCoefs[3]/RCoefs[2]
b <- -RCoefs[1]/RCoefs[2]
# Plot the individuals in the factorial plane, colored by class.
# Factor levels are sorted alphabetically: "negative" = red, "positive" = blue
y <- factor(factCoord$disease)
plot(factCoord[,1],factCoord[,2],col=c("red","blue")[y])
legend(-3,3,levels(y),fill=c('red','blue'),cex=0.8)
# Overlay the logistic regression decision boundary (abline: intercept, slope)
abline(a,b,lty=6,lwd=3)

Matrice de confusion sous Python

# Confusion matrix computed from the fitted model.
# NOTE(review): evaluated on the training data (resubstitution), so this
# is an optimistic estimate of performance.
import matplotlib.pyplot as plt
from sklearn import metrics
metrics.ConfusionMatrixDisplay.from_estimator(modele,dfR.iloc[:,:-1],dfR.disease)
## <sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay object at 0x000001DAD7D4B8F0>
plt.show()