Version Sentence Transformers¶
In [46]:
# check which version of the sentence-transformers package is installed
import sentence_transformers
sentence_transformers.__version__
Out[46]:
'5.1.2'
Chargement du modèle - Quelques essais¶
In [47]:
# import the model class
from sentence_transformers import SentenceTransformer
# load the pre-trained model (this global `model` is reused by the later cells)
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
In [48]:
# sanity check: encode a single sentence
phrase = ["le chat est sur le canape"]
# embedding coordinates
embedding = model.encode(phrase)
print(embedding.shape)
(1, 384)
In [49]:
# first 10 values of the embedding
print(embedding[:,:10])
[[ 0.3657621 -0.19775355 -0.07118681 0.20523095 -0.12379494 0.13273717 0.37111837 0.21607073 0.03021704 0.08567287]]
In [50]:
# sentences written in different languages
phrases = ["le chat est sur le canape",
"the cheetah is very fast",
"the jaguar is sitting on the sofa",
"il a enfile son slip en peau de lapin"]
# embedding coordinates for the whole list
embeddings = model.encode(phrases)
print(embeddings.shape)
(4, 384)
In [51]:
# pairwise similarities between the sentences (which are written in different languages)
similarites = model.similarity(embeddings,embeddings)
print(similarites)
tensor([[1.0000, 0.0918, 0.3295, 0.2208],
[0.0918, 1.0000, 0.1986, 0.0611],
[0.3295, 0.1986, 1.0000, 0.2373],
[0.2208, 0.0611, 0.2373, 1.0000]])
Chargement du corpus¶
In [52]:
# working directory: defaults to the original location, but can be overridden
# through the DEMO_DATA_DIR environment variable so that the notebook can run
# on a machine other than the author's
import os
os.chdir(os.environ.get("DEMO_DATA_DIR", "C:/Users/ricco/Desktop/demo"))
# load the corpus from the Excel workbook (sheet "clean 10000")
import pandas
df = pandas.read_excel("corpus_multilingue.xlsx",sheet_name="clean 10000")
# column names, dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 article_id 10000 non-null int64 1 country 10000 non-null object 2 source 10000 non-null object 3 source_type 10000 non-null object 4 d_fr_eco 10000 non-null int64 5 d_fr_lab 10000 non-null int64 6 d_fr_sec 10000 non-null int64 7 d_fr_wel 10000 non-null int64 8 headline 10000 non-null object dtypes: int64(5), object(4) memory usage: 703.3+ KB
In [53]:
# first rows
df.head()
Out[53]:
| article_id | country | source | source_type | d_fr_eco | d_fr_lab | d_fr_sec | d_fr_wel | headline | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 54857 | UK | mirror.co.uk | Online | 0 | 0 | 1 | 0 | Labour's Lord Adonis has apologised for sharin... |
| 1 | 33456 | Hungary | mno.hu | Online | 0 | 0 | 0 | 0 | Ezért olyan fontos Orbán Viktornak Farkas Flórián |
| 2 | 3304 | Hungary | Magyar Hirlap | 0 | 0 | 0 | 0 | Az iráni atomalku fenntartását szorgalmazza Pe... | |
| 3 | 37988 | Poland | wyborcza.pl | Online | 0 | 0 | 1 | 0 | Włochy: Nie będziemy wpuszczać statków z imigr... |
| 4 | 50783 | UK | dailymail.co.uk | Online | 0 | 0 | 0 | 0 | BRIAN VINER: Ofcom can't censor British TV his... |
In [54]:
# convert the headlines to lowercase (overwrites the 'headline' column in place)
# NOTE(review): confirm lowercasing is desirable for the embedding model used later
df['headline'] = df['headline'].str.lower()
df.head()
Out[54]:
| article_id | country | source | source_type | d_fr_eco | d_fr_lab | d_fr_sec | d_fr_wel | headline | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 54857 | UK | mirror.co.uk | Online | 0 | 0 | 1 | 0 | labour's lord adonis has apologised for sharin... |
| 1 | 33456 | Hungary | mno.hu | Online | 0 | 0 | 0 | 0 | ezért olyan fontos orbán viktornak farkas flórián |
| 2 | 3304 | Hungary | Magyar Hirlap | 0 | 0 | 0 | 0 | az iráni atomalku fenntartását szorgalmazza pe... | |
| 3 | 37988 | Poland | wyborcza.pl | Online | 0 | 0 | 1 | 0 | włochy: nie będziemy wpuszczać statków z imigr... |
| 4 | 50783 | UK | dailymail.co.uk | Online | 0 | 0 | 0 | 0 | brian viner: ofcom can't censor british tv his... |
In [55]:
# headline counts per country, used here to inspect which languages are present
df.country.value_counts()
Out[55]:
country Germany 3101 UK 1954 Hungary 1595 Spain 1163 Sweden 1003 Poland 762 Romania 422 Name: count, dtype: int64
In [56]:
# "eco" indicator: sum of the d_fr_eco column
df.d_fr_eco.sum()
Out[56]:
np.int64(1463)
In [57]:
# build the "eco" class labels from the d_fr_eco indicator (1 -> "_yes", 0 -> "no")
# NOTE(review): the leading underscore is presumably there so that "_yes" sorts
# before "no" (and so comes first among a scikit-learn estimator's classes) — confirm
new_eco = df.d_fr_eco.map({1:"_yes",0:"no"})
# class counts
new_eco.value_counts()
Out[57]:
d_fr_eco no 8537 _yes 1463 Name: count, dtype: int64
In [58]:
# cross-tabulation of the eco class against the country
pandas.crosstab(new_eco,df.country)
Out[58]:
| country | Germany | Hungary | Poland | Romania | Spain | Sweden | UK |
|---|---|---|---|---|---|---|---|
| d_fr_eco | |||||||
| _yes | 359 | 292 | 112 | 70 | 219 | 137 | 274 |
| no | 2742 | 1303 | 650 | 352 | 944 | 866 | 1680 |
In [59]:
# class proportions within each country (normalised per column)
pandas.crosstab(new_eco,df.country,normalize="columns")
Out[59]:
| country | Germany | Hungary | Poland | Romania | Spain | Sweden | UK |
|---|---|---|---|---|---|---|---|
| d_fr_eco | |||||||
| _yes | 0.115769 | 0.183072 | 0.146982 | 0.165877 | 0.188306 | 0.13659 | 0.140225 |
| no | 0.884231 | 0.816928 | 0.853018 | 0.834123 | 0.811694 | 0.86341 | 0.859775 |
In [60]:
# add the class labels to the data frame as column "eco"
df["eco"] = new_eco
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 article_id 10000 non-null int64 1 country 10000 non-null object 2 source 10000 non-null object 3 source_type 10000 non-null object 4 d_fr_eco 10000 non-null int64 5 d_fr_lab 10000 non-null int64 6 d_fr_sec 10000 non-null int64 7 d_fr_wel 10000 non-null int64 8 headline 10000 non-null object 9 eco 10000 non-null object dtypes: int64(5), object(5) memory usage: 781.4+ KB
Analyse corpus "UK"¶
In [61]:
# keep only the UK documents, with the class label and the headline
dfUK = df.loc[df.country=="UK",["eco","headline"]]
dfUK.head()
Out[61]:
| eco | headline | |
|---|---|---|
| 0 | no | labour's lord adonis has apologised for sharin... |
| 4 | no | brian viner: ofcom can't censor british tv his... |
| 11 | no | british cabinet rebels face down may over brex... |
| 13 | _yes | ‘there will be no hypocrisy!' macron demands b... |
| 24 | no | bayer sued over essure contraceptive that alle... |
In [62]:
# dimensions of the UK subset
dfUK.shape
Out[62]:
(1954, 2)
In [63]:
# 70/30 train-test split of the UK subset, with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split
dfUKTrain, dfUKTest = train_test_split(dfUK, random_state=0, train_size=0.7)
# report the size of each partition, train first
for part in (dfUKTrain, dfUKTest):
    print(part.shape)
(1367, 2) (587, 2)
In [64]:
# embed the UK training headlines
XUKTrain = model.encode(dfUKTrain["headline"].tolist())
# size of the resulting matrix
print(XUKTrain.shape)
(1367, 384)
In [65]:
# fit a logistic regression on the UK training embeddings
from sklearn.linear_model import LogisticRegression
lrUK = LogisticRegression().fit(XUKTrain, dfUKTrain["eco"])
# quick check that the fit ran: show only the intercept
print(lrUK.intercept_)
[3.25939647]
In [66]:
# embed the UK test headlines
XUKTest = model.encode(dfUKTest["headline"].tolist())
# size of the resulting matrix
print(XUKTest.shape)
(587, 384)
In [67]:
# predicted probability of belonging to the "_yes" class
# the column is looked up in classes_ instead of being hard-coded as 0, so the
# right probabilities are selected whatever order the labels end up in
predUK = lrUK.predict_proba(XUKTest)[:, list(lrUK.classes_).index("_yes")]
# first values
print(predUK[:10])
[0.53009273 0.01913674 0.00677878 0.08790191 0.00606818 0.01276383 0.0361352 0.13250934 0.0533336 0.12560829]
In [68]:
# ROC AUC of the UK model on the UK test set ("_yes" is the positive class)
from sklearn import metrics
aucUK = metrics.roc_auc_score(
    y_true=dfUKTest["eco"].eq("_yes"),
    y_score=predUK,
)
print(aucUK)
0.6937256292095002
In [69]:
# confidence interval for the ROC AUC
# the default is a 90% interval (alpha = 0.1); pass another alpha to change it
import numpy as np
from scipy.stats import norm
def auc_ci(y_true, y_score, alpha = 0.1):
    """
    Two-sided confidence interval for the ROC AUC, based on the asymptotic
    variance of Hanley & McNeil (1982).

    Parameters
    ----------
    y_true : array-like of 0/1 (booleans work too); 1 marks the positive class
    y_score : array-like of scores or predicted probabilities for the positive class
    alpha : float strictly between 0 and 1; the confidence level is 1 - alpha

    Returns
    -------
    (A, (low, high)) : the AUC and the interval bounds. The bounds are clipped
    to [0, 1], since the normal approximation can otherwise overshoot.

    Raises
    ------
    ValueError : if alpha is not in (0, 1), or if y_true does not contain both
    a 0 and a 1 label.

    Note: the AUC itself comes from `metrics.roc_auc_score`, so the name
    `metrics` must already be bound in the enclosing namespace.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha!r}")
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    # number of positives and negatives
    n1 = np.sum(y_true == 1)
    n0 = np.sum(y_true == 0)
    # both classes are required: the variance below divides by n0 * n1
    if n1 == 0 or n0 == 0:
        raise ValueError("y_true must contain both positive (1) and negative (0) labels")
    # AUC
    A = metrics.roc_auc_score(y_true, y_score)
    # Hanley & McNeil quantities
    Q1 = A / (2 - A)
    Q2 = 2 * A**2 / (1 + A)
    # variance
    var = (
        A * (1 - A)
        + (n1 - 1) * (Q1 - A**2)
        + (n0 - 1) * (Q2 - A**2)
    ) / (n0 * n1)
    # half-width: normal quantile times the standard error
    z = norm.ppf(1.0-alpha/2.0)
    half_width = z * np.sqrt(var)
    # lower and upper bounds, kept inside the valid AUC range
    bb = np.clip(A - half_width, 0.0, 1.0)
    bh = np.clip(A + half_width, 0.0, 1.0)
    return A, (bb, bh)
In [70]:
# confidence interval of the AUC for the UK model on the UK test set (default alpha)
print(auc_ci(dfUKTest.eco=="_yes",predUK))
(0.6937256292095002, (np.float64(0.640212774730191), np.float64(0.7472384836888094)))
Analyse multilingue (sans UK)¶
In [71]:
# every country except UK, with the class label and the headline
dfNotUK = df.loc[df.country!="UK",["eco","headline"]]
dfNotUK.head()
Out[71]:
| eco | headline | |
|---|---|---|
| 1 | no | ezért olyan fontos orbán viktornak farkas flórián |
| 2 | no | az iráni atomalku fenntartását szorgalmazza pe... |
| 3 | no | włochy: nie będziemy wpuszczać statków z imigr... |
| 5 | _yes | oettinger: a migrációs kérdés nem keverhető ös... |
| 6 | no | ”vi lärde oss en sorts kulturkonservativ läxa” |
In [72]:
# dimensions of the non-UK subset
dfNotUK.shape
Out[72]:
(8046, 2)
In [73]:
# 70/30 train-test split of the non-UK subset, with a fixed seed for reproducibility
dfNotUKTrain, dfNotUKTest = train_test_split(dfNotUK, random_state=0, train_size=0.7)
# report the size of each partition, train first
for part in (dfNotUKTrain, dfNotUKTest):
    print(part.shape)
(5632, 2) (2414, 2)
In [74]:
# embed the training headlines -- this time from a MULTILINGUAL corpus
XNotUKTrain = model.encode(dfNotUKTrain["headline"].tolist())
# size of the resulting matrix
print(XNotUKTrain.shape)
(5632, 384)
In [75]:
# fit a logistic regression on the multilingual (non-UK) training embeddings
lrNotUK = LogisticRegression().fit(XNotUKTrain, dfNotUKTrain["eco"])
# quick check that the fit ran: show only the intercept
print(lrNotUK.intercept_)
[2.63413597]
In [76]:
# embed the non-UK test headlines
XNotUKTest = model.encode(dfNotUKTest["headline"].tolist())
# size of the resulting matrix
print(XNotUKTest.shape)
(2414, 384)
In [77]:
# predicted probability of belonging to the "_yes" class
# the column is looked up in classes_ instead of being hard-coded as 0, so the
# right probabilities are selected whatever order the labels end up in
predNotUK = lrNotUK.predict_proba(XNotUKTest)[:, list(lrNotUK.classes_).index("_yes")]
# ROC AUC on the non-UK test set
print(metrics.roc_auc_score(dfNotUKTest.eco=="_yes",predNotUK))
0.6900690769324354
Classifieur multilingue en test sur UK !!!¶
In [78]:
# apply the multilingual (non-UK) classifier to the UK test embeddings
# the "_yes" column is looked up in classes_ instead of being hard-coded as 0
predUK_bis = lrNotUK.predict_proba(XUKTest)[:, list(lrNotUK.classes_).index("_yes")]
# ROC AUC on the UK test set, to compare with the model trained on UK data only
aucUK_bis = metrics.roc_auc_score(dfUKTest.eco=="_yes",predUK_bis)
print(aucUK_bis)
0.7357541651896491
In [79]:
# confidence interval of the AUC for the multilingual model on the UK test set (default alpha)
print(auc_ci(dfUKTest.eco=="_yes",predUK_bis))
(0.7357541651896491, (np.float64(0.6840229106926714), np.float64(0.7874854196866268)))
In [80]:
# ROC curves on the UK test set
import matplotlib.pyplot as plt
# UK-only model
fpr1, tpr1, _ = metrics.roc_curve(dfUKTest.eco == "_yes", predUK)
# multilingual model
fpr2, tpr2, _ = metrics.roc_curve(dfUKTest.eco == "_yes", predUK_bis)
# draw both curves on one figure, with the same line width
lw = 2
plt.figure()
for fpr, tpr, colour, legend in (
    (fpr1, tpr1, 'darkorange', f"Modele UK (AUC = {round(aucUK,4)})"),
    (fpr2, tpr2, 'seagreen', f"Modele Multilingue (AUC = {round(aucUK_bis,4)})"),
):
    plt.plot(fpr, tpr, color=colour, lw=lw, label=legend)
# diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# axes, labels, title and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()