Version Sentence Transformers

In [46]:
# Import the library and display the version installed in this kernel.
import sentence_transformers
sentence_transformers.__version__
Out[46]:
'5.1.2'

Chargement du modèle - Quelques essais

In [47]:
# bring the encoder class into scope
from sentence_transformers import SentenceTransformer

# identifier of the pretrained multilingual model to load
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# load the pretrained model
model = SentenceTransformer(MODEL_NAME)
In [48]:
# sanity check: encode a single sentence
phrase = [
    "le chat est sur le canape",
]

# embedding coordinates (one row per input sentence)
embedding = model.encode(phrase)
print(embedding.shape)
(1, 384)
In [49]:
# first 10 coordinates of the embedding
print(embedding[:, 0:10])
[[ 0.3657621  -0.19775355 -0.07118681  0.20523095 -0.12379494  0.13273717
   0.37111837  0.21607073  0.03021704  0.08567287]]
In [50]:
# sentences written in different languages
phrases = [
    "le chat est sur le canape",
    "the cheetah is very fast",
    "the jaguar is sitting on the sofa",
    "il a enfile son slip en peau de lapin",
]

# embedding coordinates, one row per sentence
embeddings = model.encode(phrases)
print(embeddings.shape)
(4, 384)
In [51]:
# pairwise similarities between the sentences (which are written in different languages)
similarites = model.similarity(embeddings,embeddings)
print(similarites)
tensor([[1.0000, 0.0918, 0.3295, 0.2208],
        [0.0918, 1.0000, 0.1986, 0.0611],
        [0.3295, 0.1986, 1.0000, 0.2373],
        [0.2208, 0.0611, 0.2373, 1.0000]])

Chargement du corpus

In [52]:
# Locate the corpus through an explicit, configurable folder instead of
# changing the working directory of the whole kernel with os.chdir.
import os
from pathlib import Path

# The DEMO_DATA_DIR environment variable overrides the folder; the default is
# the location used by the original notebook.
DATA_DIR = Path(os.environ.get("DEMO_DATA_DIR", "C:/Users/ricco/Desktop/demo"))

# load the "clean 10000" sheet of the multilingual corpus
import pandas
df = pandas.read_excel(DATA_DIR / "corpus_multilingue.xlsx", sheet_name="clean 10000")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_id   10000 non-null  int64 
 1   country      10000 non-null  object
 2   source       10000 non-null  object
 3   source_type  10000 non-null  object
 4   d_fr_eco     10000 non-null  int64 
 5   d_fr_lab     10000 non-null  int64 
 6   d_fr_sec     10000 non-null  int64 
 7   d_fr_wel     10000 non-null  int64 
 8   headline     10000 non-null  object
dtypes: int64(5), object(4)
memory usage: 703.3+ KB
In [53]:
# first rows of the corpus
df.head()
Out[53]:
article_id country source source_type d_fr_eco d_fr_lab d_fr_sec d_fr_wel headline
0 54857 UK mirror.co.uk Online 0 0 1 0 Labour's Lord Adonis has apologised for sharin...
1 33456 Hungary mno.hu Online 0 0 0 0 Ezért olyan fontos Orbán Viktornak Farkas Flórián
2 3304 Hungary Magyar Hirlap Print 0 0 0 0 Az iráni atomalku fenntartását szorgalmazza Pe...
3 37988 Poland wyborcza.pl Online 0 0 1 0 Włochy: Nie będziemy wpuszczać statków z imigr...
4 50783 UK dailymail.co.uk Online 0 0 0 0 BRIAN VINER: Ofcom can't censor British TV his...
In [54]:
# convert every headline to lowercase, then store it back in the same column
headline_lower = df['headline'].str.lower()
df['headline'] = headline_lower
df.head()
Out[54]:
article_id country source source_type d_fr_eco d_fr_lab d_fr_sec d_fr_wel headline
0 54857 UK mirror.co.uk Online 0 0 1 0 labour's lord adonis has apologised for sharin...
1 33456 Hungary mno.hu Online 0 0 0 0 ezért olyan fontos orbán viktornak farkas flórián
2 3304 Hungary Magyar Hirlap Print 0 0 0 0 az iráni atomalku fenntartását szorgalmazza pe...
3 37988 Poland wyborcza.pl Online 0 0 1 0 włochy: nie będziemy wpuszczać statków z imigr...
4 50783 UK dailymail.co.uk Online 0 0 0 0 brian viner: ofcom can't censor british tv his...
In [55]:
# number of headlines per country (the original note reads this as the language mix)
df["country"].value_counts()
Out[55]:
country
Germany    3101
UK         1954
Hungary    1595
Spain      1163
Sweden     1003
Poland      762
Romania     422
Name: count, dtype: int64
In [56]:
# "eco" indicator: sum of the d_fr_eco column
df["d_fr_eco"].sum()
Out[56]:
np.int64(1463)
In [57]:
# build the "eco" class label from the 1/0 indicator
eco_labels = {1: "_yes", 0: "no"}
new_eco = df["d_fr_eco"].map(eco_labels)

# class counts
new_eco.value_counts()
Out[57]:
d_fr_eco
no      8537
_yes    1463
Name: count, dtype: int64
In [58]:
# cross-tabulation of the class label against the country
pandas.crosstab(index=new_eco, columns=df["country"])
Out[58]:
country Germany Hungary Poland Romania Spain Sweden UK
d_fr_eco
_yes 359 292 112 70 219 137 274
no 2742 1303 650 352 944 866 1680
In [59]:
# same table as proportions within each country (each column sums to 1)
pandas.crosstab(index=new_eco, columns=df["country"], normalize="columns")
Out[59]:
country Germany Hungary Poland Romania Spain Sweden UK
d_fr_eco
_yes 0.115769 0.183072 0.146982 0.165877 0.188306 0.13659 0.140225
no 0.884231 0.816928 0.853018 0.834123 0.811694 0.86341 0.859775
In [60]:
# attach the class label to the data frame as a new "eco" column
df["eco"] = new_eco
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_id   10000 non-null  int64 
 1   country      10000 non-null  object
 2   source       10000 non-null  object
 3   source_type  10000 non-null  object
 4   d_fr_eco     10000 non-null  int64 
 5   d_fr_lab     10000 non-null  int64 
 6   d_fr_sec     10000 non-null  int64 
 7   d_fr_wel     10000 non-null  int64 
 8   headline     10000 non-null  object
 9   eco          10000 non-null  object
dtypes: int64(5), object(5)
memory usage: 781.4+ KB

Analyse corpus "UK"

In [61]:
# keep only the UK documents, with the label and the text
is_uk = df["country"] == "UK"
dfUK = df.loc[is_uk, ["eco", "headline"]]
dfUK.head()
Out[61]:
eco headline
0 no labour's lord adonis has apologised for sharin...
4 no brian viner: ofcom can't censor british tv his...
11 no british cabinet rebels face down may over brex...
13 _yes ‘there will be no hypocrisy!' macron demands b...
24 no bayer sued over essure contraceptive that alle...
In [62]:
# dimensions (rows, columns) of the UK subset
dfUK.shape
Out[62]:
(1954, 2)
In [63]:
# train/test partition: 70% of the rows go to training, fixed seed
from sklearn.model_selection import train_test_split
dfUKTrain, dfUKTest = train_test_split(
    dfUK,
    train_size=0.7,
    random_state=0,
)

# dimensions of each part
for uk_part in (dfUKTrain, dfUKTest):
    print(uk_part.shape)
(1367, 2)
(587, 2)
In [64]:
# embedding matrix for the training headlines
uk_train_headlines = dfUKTrain["headline"].to_list()
XUKTrain = model.encode(uk_train_headlines)

# dimensions
print(XUKTrain.shape)
(1367, 384)
In [65]:
# fit a logistic regression on the training embeddings
from sklearn.linear_model import LogisticRegression
lrUK = LogisticRegression().fit(XUKTrain, dfUKTrain["eco"])

# sanity check: just display the fitted intercept
print(lrUK.intercept_)
[3.25939647]
In [66]:
# embedding matrix for the test headlines
uk_test_headlines = dfUKTest["headline"].to_list()
XUKTest = model.encode(uk_test_headlines)

# dimensions
print(XUKTest.shape)
(587, 384)
In [67]:
# predicted probability of belonging to the "_yes" class
# predict_proba orders its columns like lrUK.classes_, so the column is looked
# up by label instead of assuming that "_yes" is at index 0
col_yes_uk = list(lrUK.classes_).index("_yes")
predUK = lrUK.predict_proba(XUKTest)[:, col_yes_uk]

# first values
print(predUK[:10])
[0.53009273 0.01913674 0.00677878 0.08790191 0.00606818 0.01276383
 0.0361352  0.13250934 0.0533336  0.12560829]
In [68]:
# ROC AUC score on the UK test set ("_yes" is the positive class)
from sklearn import metrics
uk_test_is_eco = dfUKTest["eco"] == "_yes"
aucUK = metrics.roc_auc_score(uk_test_is_eco, predUK)
print(aucUK)
0.6937256292095002
In [69]:
# code pour calcul de l'intervalle de confiance l'AUC
# par défaut, intervalle à 90% (mais paramétrable)
import numpy as np
from scipy.stats import norm

def auc_ci(y_true, y_score, alpha = 0.1):
    """
    Confidence interval for the ROC AUC, based on the asymptotic variance
    of Hanley & McNeil (1982).

    Parameters
    ----------
    y_true : array-like of 0/1 (booleans are accepted)
        Ground-truth labels; 1 / True marks the positive class.
    y_score : array-like
        Scores or predicted probabilities for the positive class.
    alpha : float, default 0.1
        Risk level; the interval has confidence 1 - alpha (90% by default).

    Returns
    -------
    tuple
        (A, (bb, bh)): the AUC followed by the lower and upper bounds of the
        interval, both clipped to the valid range [0, 1].

    Raises
    ------
    ValueError
        If alpha is not strictly between 0 and 1, if y_true contains values
        other than 0/1, or if one of the two classes is absent.
    """
    # reject meaningless risk levels before doing any work
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha!r}")

    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # number of positives and negatives
    n1 = int(np.sum(y_true == 1))
    n0 = int(np.sum(y_true == 0))

    # the formula below divides by n0 * n1 and assumes strictly binary labels
    if n0 + n1 != y_true.size:
        raise ValueError("y_true must contain only 0/1 (or boolean) labels")
    if n0 == 0 or n1 == 0:
        raise ValueError("y_true must contain at least one positive and one negative")

    # AUC
    A = metrics.roc_auc_score(y_true, y_score)

    # Hanley & McNeil auxiliary quantities
    Q1 = A / (2 - A)
    Q2 = 2 * A**2 / (1 + A)

    # variance
    var = (
        A * (1 - A)
        + (n1 - 1) * (Q1 - A**2)
        + (n0 - 1) * (Q2 - A**2)
    ) / (n0 * n1)

    # guard against a tiny negative value caused by rounding, then take the
    # standard error once
    se = np.sqrt(max(var, 0.0))

    # two-sided quantile of the standard normal distribution
    z = norm.ppf(1.0-alpha/2.0)

    # lower and upper bounds; an AUC cannot leave [0, 1]
    bb = np.clip(A - z * se, 0.0, 1.0)
    bh = np.clip(A + z * se, 0.0, 1.0)

    return A, (bb, bh)
In [70]:
# confidence interval of the AUC for the UK model
ci_uk = auc_ci(dfUKTest["eco"] == "_yes", predUK)
print(ci_uk)
(0.6937256292095002, (np.float64(0.640212774730191), np.float64(0.7472384836888094)))

Analyse multilingue (sans UK)

In [71]:
# everything except the UK documents, with the label and the text
is_not_uk = df["country"] != "UK"
dfNotUK = df.loc[is_not_uk, ["eco", "headline"]]
dfNotUK.head()
Out[71]:
eco headline
1 no ezért olyan fontos orbán viktornak farkas flórián
2 no az iráni atomalku fenntartását szorgalmazza pe...
3 no włochy: nie będziemy wpuszczać statków z imigr...
5 _yes oettinger: a migrációs kérdés nem keverhető ös...
6 no ”vi lärde oss en sorts kulturkonservativ läxa”
In [72]:
# dimensions (rows, columns) of the non-UK subset
dfNotUK.shape
Out[72]:
(8046, 2)
In [73]:
# train/test partition: 70% of the rows go to training, fixed seed
dfNotUKTrain, dfNotUKTest = train_test_split(
    dfNotUK,
    train_size=0.7,
    random_state=0,
)

# dimensions of each part
for not_uk_part in (dfNotUKTrain, dfNotUKTest):
    print(not_uk_part.shape)
(5632, 2)
(2414, 2)
In [74]:
# embedding matrix for the training headlines
# this time on a MULTILINGUAL corpus
not_uk_train_headlines = dfNotUKTrain["headline"].to_list()
XNotUKTrain = model.encode(not_uk_train_headlines)

# dimensions
print(XNotUKTrain.shape)
(5632, 384)
In [75]:
# fit a logistic regression on the training embeddings
# multilingual corpus, UK excluded
lrNotUK = LogisticRegression().fit(XNotUKTrain, dfNotUKTrain["eco"])

# sanity check: just display the fitted intercept
print(lrNotUK.intercept_)
[2.63413597]
In [76]:
# embedding matrix for the test headlines
not_uk_test_headlines = dfNotUKTest["headline"].to_list()
XNotUKTest = model.encode(not_uk_test_headlines)

# dimensions
print(XNotUKTest.shape)
(2414, 384)
In [77]:
# predicted probability of the "_yes" class on the non-UK test set
# predict_proba orders its columns like lrNotUK.classes_, so the column is
# looked up by label instead of assuming that "_yes" is at index 0
col_yes_not_uk = list(lrNotUK.classes_).index("_yes")
predNotUK = lrNotUK.predict_proba(XNotUKTest)[:, col_yes_not_uk]

# ROC AUC score
print(metrics.roc_auc_score(dfNotUKTest.eco=="_yes",predNotUK))
0.6900690769324354

Classifieur multilingue en test sur UK !!!

In [78]:
# apply the multilingual (non-UK) classifier to the UK test embeddings
# predict_proba orders its columns like lrNotUK.classes_, so the column is
# looked up by label instead of assuming that "_yes" is at index 0
col_yes_bis = list(lrNotUK.classes_).index("_yes")
predUK_bis = lrNotUK.predict_proba(XUKTest)[:, col_yes_bis]

# ROC AUC score: higher than the UK-only model on this test split
aucUK_bis = metrics.roc_auc_score(dfUKTest.eco=="_yes",predUK_bis)
print(aucUK_bis)
0.7357541651896491
In [79]:
# confidence interval of the AUC for the multilingual model on the UK test set
ci_uk_bis = auc_ci(dfUKTest["eco"] == "_yes", predUK_bis)
print(ci_uk_bis)
(0.7357541651896491, (np.float64(0.6840229106926714), np.float64(0.7874854196866268)))
In [80]:
# ROC curves on the UK test set
# UK-only model
fpr1, tpr1, _ = metrics.roc_curve(dfUKTest["eco"] == "_yes", predUK)
# multilingual model
fpr2, tpr2, _ = metrics.roc_curve(dfUKTest["eco"] == "_yes", predUK_bis)

# draw both curves and the chance diagonal on explicit figure/axes objects
import matplotlib.pyplot as plt
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr1, tpr1, color='darkorange', lw=lw, label=f"Modele UK (AUC = {round(aucUK,4)})")
ax.plot(fpr2, tpr2, color='seagreen', lw=lw, label=f"Modele Multilingue (AUC = {round(aucUK_bis,4)})")
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# axis ranges, labels, title and legend
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
plt.show()
No description has been provided for this image