PLS-LDA à la mode ChatGPT¶

In [30]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


class PLS_LDA:
    def __init__(self, n_components=2, scale=True):
        self.n_components = n_components
        self.scale = scale

        self.scaler = None
        self.pls = None
        self.lda = None
        self.lb = None

        self.coef_ = None
        self.intercept_ = None

        #R.R.
        self.classes_ = None

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        n, p = X.shape

        # -------------------------
        # Standardisation
        # -------------------------
        if self.scale:
            self.scaler = StandardScaler()
            Xs = self.scaler.fit_transform(X)
        else:
            Xs = X.copy()

        # -------------------------
        # Encodage classes
        # -------------------------
        self.lb = LabelBinarizer()
        Y = self.lb.fit_transform(y)

        #R.R.
        self.classes_ = self.lb.classes_

        if Y.ndim == 1:
            Y = np.vstack((1 - Y, Y)).T

        # -------------------------
        # PLS
        # -------------------------
        self.pls = PLSRegression(n_components=self.n_components)
        self.pls.fit(Xs, Y)

        T = self.pls.x_scores_

        # -------------------------
        # LDA sur scores PLS
        # -------------------------
        self.lda = LinearDiscriminantAnalysis()
        self.lda.fit(T, y)

        # -------------------------
        # Calcul des coefficients globaux
        # -------------------------
        # T = Xs W*
        W_star = self.pls.x_rotations_

        # Fonction discriminante :
        # f(x) = (X - mean)/std @ W* @ lda.coef_.T + intercept
        B_scaled = W_star @ self.lda.coef_.T  # (p Ă— K)

        if self.scale:
            # Remise aux unités originales
            scale = self.scaler.scale_.reshape(-1, 1)
            self.coef_ = B_scaled / scale
            self.intercept_ = (
                self.lda.intercept_
                - (self.scaler.mean_ / self.scaler.scale_) @ B_scaled
            )
        else:
            self.coef_ = B_scaled
            self.intercept_ = self.lda.intercept_

        return self

    # -------------------------------------------------
    # Coordonnées factorielles PLS
    # -------------------------------------------------
    def transform(self, X):
        X = np.asarray(X)

        if self.scale:
            X = self.scaler.transform(X)

        return self.pls.transform(X)

    # -------------------------------------------------
    # Scores d'affectation (fonctions discriminantes)
    # -------------------------------------------------
    def predict_scores(self, X):
        X = np.asarray(X)
        return X @ self.coef_ + self.intercept_

    # -------------------------------------------------
    # Prédiction classes
    # -------------------------------------------------
    def predict(self, X):
        scores = self.predict_scores(X)
        class_index = np.argmax(scores, axis=1)
        return self.lb.classes_[class_index]

    # -------------------------------------------------
    # VIP
    # -------------------------------------------------
    def vip(self):
        T = self.pls.x_scores_
        W = self.pls.x_weights_
        Q = self.pls.y_loadings_

        p, h = W.shape

        # Variance expliquée par composante
        SS = np.sum((T @ Q.T) ** 2, axis=0)

        total_SS = np.sum(SS)

        vip = np.zeros(p)

        for j in range(p):
            weight = np.array([
                (W[j, a] ** 2) * SS[a]
                for a in range(h)
            ])
            vip[j] = np.sqrt(p * np.sum(weight) / total_SS)

        return vip

Exemple d'utilisation fourni par ChatGPT¶

In [31]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

model = PLS_LDA(n_components=2)
model.fit(X_train, y_train)

# Prédiction
y_pred = model.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))

# Scores discriminants
scores = model.predict_scores(X_test)
print("Scores d'affectation :", scores[:5])

# Coordonnées factorielles
print("Scores factoriels :", model.transform(X_test)[:5])

# VIP
print("VIP :", model.vip())

# Coefficients globaux
print("Coefficients :", model.coef_)
print("Intercept :", model.intercept_)
Accuracy : 0.9333333333333333
Scores d'affectation : [[-35.85010696   2.14715841   5.03270374]
 [-26.7478757    1.31743915  -2.2217795 ]
 [ 22.18290829 -11.31218282 -34.51661932]
 [-44.03307765   3.45794658  11.09105713]
 [ 14.51004254  -8.49955712 -30.13524974]]
Scores factoriels : [[ 1.39772157 -0.32878015]
 [ 0.44079519 -1.84639484]
 [-2.47277523  2.19641888]
 [ 2.103773    0.19192841]
 [-2.24311368  0.31960289]]
VIP : [0.72336544 0.73913436 0.86193218 0.86587726]
Coefficients : [[-6.6649037   1.16072627  4.85803808]
 [13.20517396 -4.12909686 -8.12422603]
 [-4.41178435  0.89711342  3.11007791]
 [-9.64668113  1.82321276  6.91395769]]
Intercept : [ 11.4839823   -1.97457183 -32.85098004]

Application sur notre jeu de données "health"¶

Importation et préparation des données¶

In [32]:
# dossier de nos données
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# importation
import pandas
df = pandas.read_excel("health_dataset.xlsx")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              1000 non-null   int64 
 1   IMC              1000 non-null   int64 
 2   PA_systolique    1000 non-null   int64 
 3   Cholesterol      1000 non-null   int64 
 4   Glycemie_jeun    1000 non-null   int64 
 5   Freq_card_repos  1000 non-null   int64 
 6   Taille_cm        1000 non-null   int64 
 7   Score_bien_etre  1000 non-null   int64 
 8   Risque           1000 non-null   object
dtypes: int64(8), object(1)
memory usage: 70.4+ KB
In [33]:
# X et y
X = df.drop(columns="Risque")
y = df.Risque

Appel de la PLS-LDA sur nos données¶

Coefficients pour la prédiction (scores d'affectation)

In [34]:
# appel de la PLS_LDA sur nos données
pls_lda = PLS_LDA(n_components=2)
pls_lda.fit(X,y)

# coefficients
pandas.DataFrame(pls_lda.coef_,index=X.columns,columns=pls_lda.classes_)
Out[34]:
1Faible 2Modere 3Eleve
Age -0.219197 0.063533 0.291913
IMC -0.621378 0.198533 0.802028
PA_systolique -0.163117 0.049618 0.213993
Cholesterol -0.079185 0.022826 0.105626
Glycemie_jeun -0.130784 0.029979 0.185132
Freq_card_repos 0.010408 -0.014467 0.001972
Taille_cm -0.009326 0.025943 -0.019716
Score_bien_etre 0.001332 -0.000109 -0.002157
In [35]:
# intercept
pls_lda.intercept_
Out[35]:
array([  71.22668385,  -27.36823675, -112.08688265])

Variable importance in projection (VIP). Mais pas en prédiction.

In [36]:
# VIP
pandas.DataFrame({"var":X.columns,"vip":pls_lda.vip()})
Out[36]:
var vip
0 Age 0.908469
1 IMC 0.937889
2 PA_systolique 0.926170
3 Cholesterol 0.940766
4 Glycemie_jeun 0.993227
5 Freq_card_repos 0.220935
6 Taille_cm 0.379321
7 Score_bien_etre 0.012142

Coordonnées des individus dans l'espace factoriel

In [37]:
# coordonnées dans l'espace factoriel
coord = pls_lda.transform(X)
coord[:5,:]
Out[37]:
array([[-1.9481837 , -0.16997513],
       [-0.96002189, -0.53064808],
       [-1.94562027,  0.82897599],
       [-1.73718419,  0.78633768],
       [-1.22045979, -0.05027799]])
In [38]:
# nuage de points
import seaborn as sns
sns.scatterplot(x=coord[:,0],y=coord[:,1])
Out[38]:
<Axes: >
No description has been provided for this image
In [39]:
# nuage de points conditionnellement aux classes
sns.scatterplot(x=coord[:,0],y=coord[:,1],hue=y)
Out[39]:
<Axes: >
No description has been provided for this image

Prédiction, matrice de confusion en resubstitution

In [40]:
# matrice de confusion en resubstitution
pandas.crosstab(y,pls_lda.predict(X))
Out[40]:
col_0 1Faible 2Modere 3Eleve
Risque
1Faible 420 1 0
2Modere 5 322 9
3Eleve 0 20 223