PLS-LDA à la mode ChatGPT¶
In [30]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
class PLS_LDA:
def __init__(self, n_components=2, scale=True):
self.n_components = n_components
self.scale = scale
self.scaler = None
self.pls = None
self.lda = None
self.lb = None
self.coef_ = None
self.intercept_ = None
#R.R.
self.classes_ = None
def fit(self, X, y):
X = np.asarray(X)
y = np.asarray(y)
n, p = X.shape
# -------------------------
# Standardisation
# -------------------------
if self.scale:
self.scaler = StandardScaler()
Xs = self.scaler.fit_transform(X)
else:
Xs = X.copy()
# -------------------------
# Encodage classes
# -------------------------
self.lb = LabelBinarizer()
Y = self.lb.fit_transform(y)
#R.R.
self.classes_ = self.lb.classes_
if Y.ndim == 1:
Y = np.vstack((1 - Y, Y)).T
# -------------------------
# PLS
# -------------------------
self.pls = PLSRegression(n_components=self.n_components)
self.pls.fit(Xs, Y)
T = self.pls.x_scores_
# -------------------------
# LDA sur scores PLS
# -------------------------
self.lda = LinearDiscriminantAnalysis()
self.lda.fit(T, y)
# -------------------------
# Calcul des coefficients globaux
# -------------------------
# T = Xs W*
W_star = self.pls.x_rotations_
# Fonction discriminante :
# f(x) = (X - mean)/std @ W* @ lda.coef_.T + intercept
B_scaled = W_star @ self.lda.coef_.T # (p Ă— K)
if self.scale:
# Remise aux unités originales
scale = self.scaler.scale_.reshape(-1, 1)
self.coef_ = B_scaled / scale
self.intercept_ = (
self.lda.intercept_
- (self.scaler.mean_ / self.scaler.scale_) @ B_scaled
)
else:
self.coef_ = B_scaled
self.intercept_ = self.lda.intercept_
return self
# -------------------------------------------------
# Coordonnées factorielles PLS
# -------------------------------------------------
def transform(self, X):
X = np.asarray(X)
if self.scale:
X = self.scaler.transform(X)
return self.pls.transform(X)
# -------------------------------------------------
# Scores d'affectation (fonctions discriminantes)
# -------------------------------------------------
def predict_scores(self, X):
X = np.asarray(X)
return X @ self.coef_ + self.intercept_
# -------------------------------------------------
# Prédiction classes
# -------------------------------------------------
def predict(self, X):
scores = self.predict_scores(X)
class_index = np.argmax(scores, axis=1)
return self.lb.classes_[class_index]
# -------------------------------------------------
# VIP
# -------------------------------------------------
def vip(self):
T = self.pls.x_scores_
W = self.pls.x_weights_
Q = self.pls.y_loadings_
p, h = W.shape
# Variance expliquée par composante
SS = np.sum((T @ Q.T) ** 2, axis=0)
total_SS = np.sum(SS)
vip = np.zeros(p)
for j in range(p):
weight = np.array([
(W[j, a] ** 2) * SS[a]
for a in range(h)
])
vip[j] = np.sqrt(p * np.sum(weight) / total_SS)
return vip
Exemple d'utilisation fourni par ChatGPT¶
In [31]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0
)
model = PLS_LDA(n_components=2)
model.fit(X_train, y_train)
# Prédiction
y_pred = model.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
# Scores discriminants
scores = model.predict_scores(X_test)
print("Scores d'affectation :", scores[:5])
# Coordonnées factorielles
print("Scores factoriels :", model.transform(X_test)[:5])
# VIP
print("VIP :", model.vip())
# Coefficients globaux
print("Coefficients :", model.coef_)
print("Intercept :", model.intercept_)
Accuracy : 0.9333333333333333 Scores d'affectation : [[-35.85010696 2.14715841 5.03270374] [-26.7478757 1.31743915 -2.2217795 ] [ 22.18290829 -11.31218282 -34.51661932] [-44.03307765 3.45794658 11.09105713] [ 14.51004254 -8.49955712 -30.13524974]] Scores factoriels : [[ 1.39772157 -0.32878015] [ 0.44079519 -1.84639484] [-2.47277523 2.19641888] [ 2.103773 0.19192841] [-2.24311368 0.31960289]] VIP : [0.72336544 0.73913436 0.86193218 0.86587726] Coefficients : [[-6.6649037 1.16072627 4.85803808] [13.20517396 -4.12909686 -8.12422603] [-4.41178435 0.89711342 3.11007791] [-9.64668113 1.82321276 6.91395769]] Intercept : [ 11.4839823 -1.97457183 -32.85098004]
Application sur notre jeu de données "health"¶
Importation et préparation des données¶
In [32]:
# dossier de nos données
import os
os.chdir("C:/Users/ricco/Desktop/demo")
# importation
import pandas
df = pandas.read_excel("health_dataset.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 IMC 1000 non-null int64 2 PA_systolique 1000 non-null int64 3 Cholesterol 1000 non-null int64 4 Glycemie_jeun 1000 non-null int64 5 Freq_card_repos 1000 non-null int64 6 Taille_cm 1000 non-null int64 7 Score_bien_etre 1000 non-null int64 8 Risque 1000 non-null object dtypes: int64(8), object(1) memory usage: 70.4+ KB
In [33]:
# X et y
X = df.drop(columns="Risque")
y = df.Risque
Appel de la PLS-LDA sur nos données¶
Coefficients pour la prédiction (scores d'affectation)
In [34]:
# appel de la PLS_LDA sur nos données
pls_lda = PLS_LDA(n_components=2)
pls_lda.fit(X,y)
# coefficients
pandas.DataFrame(pls_lda.coef_,index=X.columns,columns=pls_lda.classes_)
Out[34]:
| 1Faible | 2Modere | 3Eleve | |
|---|---|---|---|
| Age | -0.219197 | 0.063533 | 0.291913 |
| IMC | -0.621378 | 0.198533 | 0.802028 |
| PA_systolique | -0.163117 | 0.049618 | 0.213993 |
| Cholesterol | -0.079185 | 0.022826 | 0.105626 |
| Glycemie_jeun | -0.130784 | 0.029979 | 0.185132 |
| Freq_card_repos | 0.010408 | -0.014467 | 0.001972 |
| Taille_cm | -0.009326 | 0.025943 | -0.019716 |
| Score_bien_etre | 0.001332 | -0.000109 | -0.002157 |
In [35]:
# intercept
pls_lda.intercept_
Out[35]:
array([ 71.22668385, -27.36823675, -112.08688265])
Variable importance in projection (VIP). Mais pas en prédiction.
In [36]:
# VIP
pandas.DataFrame({"var":X.columns,"vip":pls_lda.vip()})
Out[36]:
| var | vip | |
|---|---|---|
| 0 | Age | 0.908469 |
| 1 | IMC | 0.937889 |
| 2 | PA_systolique | 0.926170 |
| 3 | Cholesterol | 0.940766 |
| 4 | Glycemie_jeun | 0.993227 |
| 5 | Freq_card_repos | 0.220935 |
| 6 | Taille_cm | 0.379321 |
| 7 | Score_bien_etre | 0.012142 |
Coordonnées des individus dans l'espace factoriel
In [37]:
# coordonnées dans l'espace factoriel
coord = pls_lda.transform(X)
coord[:5,:]
Out[37]:
array([[-1.9481837 , -0.16997513],
[-0.96002189, -0.53064808],
[-1.94562027, 0.82897599],
[-1.73718419, 0.78633768],
[-1.22045979, -0.05027799]])
In [38]:
# nuage de points
import seaborn as sns
sns.scatterplot(x=coord[:,0],y=coord[:,1])
Out[38]:
<Axes: >
In [39]:
# nuage de points conditionnellement aux classes
sns.scatterplot(x=coord[:,0],y=coord[:,1],hue=y)
Out[39]:
<Axes: >
Prédiction, matrice de confusion en resubstitution
In [40]:
# matrice de confusion en resubstitution
pandas.crosstab(y,pls_lda.predict(X))
Out[40]:
| col_0 | 1Faible | 2Modere | 3Eleve |
|---|---|---|---|
| Risque | |||
| 1Faible | 420 | 1 | 0 |
| 2Modere | 5 | 322 | 9 |
| 3Eleve | 0 | 20 | 223 |