Importation et inspection du corpus

Importation

# Load the corpus from the Excel workbook into a DataFrame.
import pandas
df = pandas.read_excel("./categorie_texte_oshumed.xlsx")

# Summary of the DataFrame: columns, non-null counts and dtypes.
df.info()
<class 'pandas.DataFrame'>
RangeIndex: 1996 entries, 0 to 1995
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   categorie  1996 non-null   str  
 1   texte      1996 non-null   str  
dtypes: str(2)
memory usage: 31.3 KB
# First rows of the corpus.
df.head()
categorie texte
0 eye Changing indications for penetrating keratopla...
1 eye Is corneal deposition of antimalarial any indi...
2 eye The management of chalazion: a survey of Ontar...
3 eye Sarcoidosis of the eyelid skin.\n A 64-year-ol...
4 eye Clinical prognostic factors in patients with p...
# Class distribution: number of documents per category.
df.categorie.value_counts()
categorie
eye       998
cardio    998
Name: count, dtype: int64

Termes fréquents (sans distinction de classe)

# Raw documents as a plain Python list (one string per row of the corpus).
docs = df["texte"].tolist()
len(docs)
1996
# Document-term matrix of raw counts, with English stop words removed.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english")
M = cv.fit_transform(docs)
M.shape
(1996, 15077)
import numpy

# Densify the document-term matrix; M is rebound to the dense array.
M = M.toarray()
# Column-wise totals: occurrences of each term over the whole corpus.
freq_words = M.sum(axis=0)
freq_words
array([20, 50,  1, ...,  1,  1,  3], shape=(15077,))
# Make sure the term totals are a flat one-dimensional vector.
freq_words = numpy.ravel(freq_words)
freq_words
array([20, 50,  1, ...,  1,  1,  3], shape=(15077,))
# Terms matching the columns of the document-term matrix.
cv.get_feature_names_out()
array(['00', '000', '00001', ..., 'zosteriform', 'zygosity', 'zymosan'],
      shape=(15077,), dtype=object)
# Pair each term with its corpus-wide frequency in a DataFrame.
freq_df = pandas.DataFrame({"word":cv.get_feature_names_out(),"freq":freq_words})
freq_df.head()
word freq
0 00 20
1 000 50
2 00001 1
3 00002 2
4 00005 1
# Sort by decreasing frequency so the most frequent terms come first.
freq_df = freq_df.sort_values(by="freq",ascending=False)
freq_df.head()
word freq
10223 patients 4323
4486 disease 1073
6333 group 924
11064 pressure 899
3648 coronary 895
# Term -> frequency dictionary for the 20 most frequent terms.
# head(20) takes the first 20 rows of freq_df (at most), so a vocabulary
# with fewer than 20 terms no longer raises IndexError as fixed positional
# indexing did. to_numpy() keeps the counts as numpy integer scalars.
dico = dict(
    zip(
        freq_df["word"].head(20).to_numpy(),
        freq_df["freq"].head(20).to_numpy(),
    )
)

# Check the result.
dico
{'patients': np.int64(4323),
 'disease': np.int64(1073),
 'group': np.int64(924),
 'pressure': np.int64(899),
 'coronary': np.int64(895),
 'treatment': np.int64(880),
 'eyes': np.int64(820),
 'blood': np.int64(770),
 'study': np.int64(694),
 'artery': np.int64(658),
 'patient': np.int64(623),
 'heart': np.int64(618),
 'clinical': np.int64(598),
 'ventricular': np.int64(597),
 'results': np.int64(594),
 'retinal': np.int64(586),
 'years': np.int64(579),
 'mean': np.int64(569),
 '10': np.int64(566),
 'visual': np.int64(558)}
# Word cloud built from the term -> frequency dictionary.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Configure the cloud first, then feed it the frequencies.
wc = WordCloud(background_color="white", width=800, height=600)
wc.generate_from_frequencies(dico)

# Display the cloud as an image, without axis ticks.
plt.imshow(wc)
plt.axis("off")
plt.show()

Termes fréquents par catégorie

# Distinct categories, in reverse of the sorted order returned by unique().
modalites = numpy.unique(df.categorie)[::-1]
modalites
array(['eye', 'cardio'], dtype=object)
# Per-category word clouds, one per entry of modalites.
wcs = []

# The vocabulary does not change between iterations: fetch it once
# instead of calling get_feature_names_out() for every dictionary entry.
feature_names = cv.get_feature_names_out()

# For each category
for m in modalites:
    # Term counts summed over the documents of category m only.
    freq_cond = numpy.sum(M[df.categorie == m,:], axis=0)
    freq_cond = numpy.asarray(freq_cond).ravel()
    # Column indices ordered by decreasing count.
    index = numpy.flip(numpy.argsort(freq_cond))
    # Frequency dictionary for the 10 most frequent terms; slicing also
    # copes with a vocabulary of fewer than 10 terms.
    dico = {feature_names[k]: freq_cond[k] for k in index[:10]}
    # Build the word cloud for this category.
    wc = WordCloud(width=800,height=600,background_color="white").generate_from_frequencies(dico)
    # Keep it for the side-by-side display.
    wcs.append(wc)

# Side-by-side display: one panel per category.
_, axes = plt.subplots(1, 2)
# Draw each cloud in its panel and title it with the category name.
for panel, cloud, label in zip(axes, wcs, modalites):
    panel.imshow(cloud)
    panel.set_title(label)
# Render the figure.
plt.tight_layout()
plt.show()

Topic Modeling Manuel (corpus étiqueté)

# Display the installed BERTopic version.
import bertopic
bertopic.__version__
'0.17.4'

Traitement initial

# Numeric labels: 1 for the "cardio" category, 0 for everything else.
y = df["categorie"].eq("cardio").astype(int)
y
0       0
1       0
2       0
3       0
4       0
       ..
1991    1
1992    1
1993    1
1994    1
1995    1
Name: categorie, Length: 1996, dtype: int64
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

# Prepare our empty sub-models and reduce frequent words while we are at it.
# The embedding, dimensionality-reduction and clustering steps are all
# replaced by the base (empty) classes; only the c-TF-IDF step is configured.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic without actually performing any clustering
tm = BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model
)
# Training: the observed labels y are supplied alongside the documents.
topics, probs = tm.fit_transform(docs, y=y)
# Topics and probabilities:
# - topics just reproduces the supplied labels (0 first, 1 last, see output)
# - probs is not meaningful here (it prints as None)
print("Topics = ",topics[:10],topics[-10:])
print("Probs = ",probs)
Topics =  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Probs =  None
# Summary table of the topics (count, name, representation, example docs).
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 0 998 0_the_of_and_in [the, of, and, in, with, to, patients, eyes, w... [The association of Fuchs's corneal endothelia...
1 1 998 1_of_the_in_and [of, the, in, and, to, with, patients, was, we... [Quantitative angiography after directional co...
# Terms and weights describing topic 0.
tm.get_topic(0)
[('the', np.float64(0.555913088840372)),
 ('of', np.float64(0.5484898920007266)),
 ('and', np.float64(0.492535501927523)),
 ('in', np.float64(0.48886055957188584)),
 ('with', np.float64(0.45758709335971254)),
 ('to', np.float64(0.43236332319396376)),
 ('patients', np.float64(0.4093316748286859)),
 ('eyes', np.float64(0.405202713512738)),
 ('was', np.float64(0.39410873863640256)),
 ('were', np.float64(0.3856734322159167))]
# Terms and weights describing topic 1.
tm.get_topic(1)
[('of', np.float64(0.518553717294377)),
 ('the', np.float64(0.5161396354881208)),
 ('in', np.float64(0.5084482967637018)),
 ('and', np.float64(0.5056673983393246)),
 ('to', np.float64(0.45466845127268807)),
 ('with', np.float64(0.43871513639165255)),
 ('patients', np.float64(0.42218696337810574)),
 ('was', np.float64(0.40961714991464193)),
 ('were', np.float64(0.3798589310635548)),
 ('than', np.float64(0.37547734979005093))]

Affiner la caractérisation avec KeyBERTInspired

# *** import and instantiate the tools ***

# Embedding model (needed to relate keywords to documents).
from sentence_transformers import SentenceTransformer
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

# Re-run the fit, this time with a real embedding model.
# NOTE(review): the original comment asked for 20 keywords per topic, but
# top_n_words is never passed to BERTopic and the displayed topics contain
# 10 terms each -- confirm the intended number.
tm_2 = BERTopic(
        embedding_model=emb_model,# !!! this is the change !!!
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model
)
tm_2.fit_transform(docs, y=y)

# Representation model used to relate keywords to documents.
from bertopic.representation import KeyBERTInspired
kbi_model = KeyBERTInspired()

# Refine the topic descriptions with KeyBERTInspired.
tm_2.update_topics(docs,representation_model=kbi_model)

# Display the updated topic summary.
tm_2.get_topic_info()
Topic Count Name Representation Representative_Docs
0 0 998 0_intraocular_ocular_glaucoma_retinal [intraocular, ocular, glaucoma, retinal, catar... [The association of Fuchs's corneal endothelia...
1 1 998 1_hypertension_coronary_infarction_cardiac [hypertension, coronary, infarction, cardiac, ... [Quantitative angiography after directional co...
# Topic 0 again, now described by the KeyBERTInspired representation.
tm_2.get_topic(0)
[('intraocular', np.float32(0.5502204)),
 ('ocular', np.float32(0.5108341)),
 ('glaucoma', np.float32(0.5063115)),
 ('retinal', np.float32(0.4767272)),
 ('cataract', np.float32(0.47429907)),
 ('corneal', np.float32(0.45763624)),
 ('eye', np.float32(0.3740924)),
 ('macular', np.float32(0.34358108)),
 ('eyes', np.float32(0.3278302)),
 ('optic', np.float32(0.312919))]
# Topic 1 again, now described by the KeyBERTInspired representation.
tm_2.get_topic(1)
[('hypertension', np.float32(0.4635601)),
 ('coronary', np.float32(0.36461848)),
 ('infarction', np.float32(0.34103602)),
 ('cardiac', np.float32(0.3059579)),
 ('angioplasty', np.float32(0.29512602)),
 ('artery', np.float32(0.28911167)),
 ('arterial', np.float32(0.2878757)),
 ('myocardial', np.float32(0.2862712)),
 ('ischemia', np.float32(0.27953634)),
 ('vascular', np.float32(0.25971344))]

Wordcloud KeyBERTInspired

# Per-category word clouds driven by the BERTopic descriptions.
wcs = []

# Topic j is paired with the j-th entry of modalites.
for j,m in enumerate(modalites):
    # (term, weight) pairs describing the topic
    desc = tm_2.get_topic(j)
    # the pairs map directly onto a term -> weight dictionary
    dico = dict(desc)
    # build the word cloud from those weights
    wc = WordCloud(width=800,height=600,background_color="white").generate_from_frequencies(dico)
    # keep it for the side-by-side display
    wcs.append(wc)

# Two panels on one row, one per category.
_, axes = plt.subplots(1, 2)
# Each panel shows its word cloud under the category name.
for panel, cloud, label in zip(axes, wcs, modalites):
    panel.imshow(cloud)
    panel.set_title(label)
# Tighten the layout and render.
plt.tight_layout()
plt.show()

Modélisation supervisée

Matrice bag-of-words TF-IDF

# Document-term matrix with TF-IDF weighting.
# min_df=10 drops the rarest terms (those appearing in fewer than 10 documents).
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(stop_words="english",min_df=10,lowercase=True)
Z = tvec.fit_transform(docs).toarray()
Z.shape
(1996, 2552)
# Ridge logistic regression fitted on the TF-IDF matrix.
# NOTE(review): l1_ratio=0 presumably selects a pure L2 penalty -- confirm
# against the installed scikit-learn version.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(l1_ratio=0,solver="saga")
lr.fit(Z,y)
LogisticRegression(l1_ratio=0, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Fit quality: resubstitution accuracy (scored on the training data itself),
# only meant as a rough indication of how well training went.
lr.score(Z,y)
0.9819639278557114
# Put the coefficients in a DataFrame, one row per term.
coefs = pandas.DataFrame({"var":tvec.get_feature_names_out(),"coef":lr.coef_[0]})
coefs.head()
var coef
0 00 -0.121132
1 000 0.076735
2 0001 -0.117594
3 0005 -0.130952
4 001 0.350922
# Rank the terms by decreasing absolute value of their coefficient.
coefs = coefs.sort_values("coef", key=abs, ascending=False)
coefs.head(10)
var coef
2044 retinal -3.654828
962 eyes -3.300180
1616 ocular -3.138602
613 coronary 3.095162
1096 heart 2.909394
959 eye -2.787287
426 cardiac 2.720079
1057 glaucoma -2.641695
2501 visual -2.579157
400 blood 2.505824

Topic Modeling Supervisé

# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression(l1_ratio=0,solver="saga") # !!! logistic regression !!!
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
# No empty embedder is passed this time, so embeddings are actually used here.
tm_spv = BERTopic(language = "english",
        umap_model = empty_dimensionality_model,
        hdbscan_model = clf, # !!! the trick: the classifier takes the place of the clustering (HDBSCAN) step !!!
        ctfidf_model = ctfidf_model,
        representation_model = KeyBERTInspired() # KeyBERTInspired is plugged in directly
)

# Training, with the observed labels y as the classifier target.
topics, probs = tm_spv.fit_transform(docs, y=y)
# Summary table of the topics.
tm_spv.get_topic_info()
Topic Count Name Representation Representative_Docs
0 0 998 0_corneal_uveitis_cataract_ocular [corneal, uveitis, cataract, ocular, intraocul... [Repair of retinal detachment caused by cytome...
1 1 998 1_vascular_disease_ischemia_angioplasty [vascular, disease, ischemia, angioplasty, art... [Determinants and significance of diltiazem pl...
# Topic 0: role of the terms in how the model identifies the "eye" category.
tm_spv.get_topic(0)
[('corneal', np.float32(0.49348512)),
 ('uveitis', np.float32(0.45535284)),
 ('cataract', np.float32(0.44035614)),
 ('ocular', np.float32(0.42535532)),
 ('intraocular', np.float32(0.42202055)),
 ('glaucoma', np.float32(0.4161051)),
 ('retinal', np.float32(0.41194612)),
 ('eye', np.float32(0.34302366)),
 ('macular', np.float32(0.33214647)),
 ('eyes', np.float32(0.2786855))]
# Comparison with the "manual topic modeling" result,
# which was built from the observed labels.
tm_2.get_topic(0)
[('intraocular', np.float32(0.5502204)),
 ('ocular', np.float32(0.5108341)),
 ('glaucoma', np.float32(0.5063115)),
 ('retinal', np.float32(0.4767272)),
 ('cataract', np.float32(0.47429907)),
 ('corneal', np.float32(0.45763624)),
 ('eye', np.float32(0.3740924)),
 ('macular', np.float32(0.34358108)),
 ('eyes', np.float32(0.3278302)),
 ('optic', np.float32(0.312919))]
# The coefficient shape printed below shows that the logistic regression
# was trained on the 384-dimensional embeddings, i.e. those of
# Sentence Transformers / all-MiniLM-L6-v2.

# Coefficients -- binary target class, hence a single row.
print(f"Dim coefs = {tm_spv.hdbscan_model.coef_.shape}")

# Intercept of the fitted classifier.
print(f"Intercept = {tm_spv.hdbscan_model.intercept_}")
Dim coefs = (1, 384)
Intercept = [0.6422906]