Importation des données

import pandas
# Load the review corpus from the Excel workbook into a DataFrame
df = pandas.read_excel("imdb_reviews_topic.xlsx")
# Print a summary of the columns (names, non-null counts, dtypes)
df.info()
<class 'pandas.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID            200 non-null    int64
 1   label         200 non-null    str  
 2   commentaires  200 non-null    str  
dtypes: int64(1), str(2)
memory usage: 4.8 KB
# first rows of the dataset
df.head(5)
ID label commentaires
0 1 neg This guy has no idea of cinema. Okay, it seems...
1 2 neg This movie was extremely depressing.   The cha...
2 3 neg Now, I'm one to watch movies that got poor rev...
3 4 neg One hour, eight minutes and twelve seconds int...
4 5 neg Another FRIDAY THE 13TH ripoff, even featuring...

Topic Modeling avec BERTopic

Instanciation - Entraînement

# Build the topic model.
# Caveat: UMAP runs with an uncontrolled random_state, so results may
# differ from one run to the next.
# The number of topics can be specified if desired.
# The language setting influences which (Hugging Face) embedding model is used.
from bertopic import BERTopic
tm = BERTopic(language="english")
# Gather the review texts into a plain Python list
docs = df["commentaires"].tolist()
len(docs)
200
# Fit the model on the corpus; the two results are used further down as the
# per-document topic and the per-document membership probability.
# An embedding algorithm can also be imported and supplied explicitly.
topics, probs = tm.fit_transform(docs)
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 5337.91it/s]

Info sur l’embedding utilisé

# vars() lists the attributes of an object
# it shows the backend in use => SentenceTransformer
# and the pretrained model loaded from Hugging Face => all-MiniLM-L6-v2
vars(tm.embedding_model)
{'embedding_model': SentenceTransformer(
   (0): Transformer({'transformer_task': 'feature-extraction', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'last_hidden_state'}}, 'module_output_name': 'token_embeddings', 'architecture': 'BertModel'})
   (1): Pooling({'embedding_dimension': 384, 'pooling_mode': 'mean', 'include_prompt': True})
   (2): Normalize({})
 ),
 'word_embedding_model': None,
 '_hf_model': 'sentence-transformers/all-MiniLM-L6-v2'}

Inspection des topics (à revoir en détail plus bas)

… Parce qu’on se rend compte qu’ils sont pollués par les stop words !!!

# summary table of the topics (size, name, top terms, representative documents)
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 -1 91 -1_the_of_and_to [the, of, and, to, in, is, it, that, this, as] [I was 16 when I first saw the movie, and it h...
1 0 63 0_the_of_to_this [the, of, to, this, and, it, movie, is, in, that] [In the title I write that the story is ludicr...
2 1 31 1_the_and_is_to [the, and, is, to, of, in, her, as, with, she] [Punctuating the opening credits sequence is a...
3 2 15 2_the_and_of_show [the, and, of, show, is, to, that, in, my, it] [Catscratch is the best thing to come out of N...
# terms and weights for topic 0 -- see the c-TF-IDF concept
tm.get_topic(0)
[('the', np.float64(0.10179238251010395)),
 ('of', np.float64(0.06209858424300599)),
 ('to', np.float64(0.06002157972996907)),
 ('this', np.float64(0.056579490649638985)),
 ('and', np.float64(0.05440087703492089)),
 ('it', np.float64(0.0494740146248586)),
 ('movie', np.float64(0.04911095233185278)),
 ('is', np.float64(0.0457445543010888)),
 ('in', np.float64(0.04277849392636411)),
 ('that', np.float64(0.042200278525156655))]
# same for topic 1
tm.get_topic(1)
[('the', np.float64(0.08884123467475584)),
 ('and', np.float64(0.07396684493360615)),
 ('is', np.float64(0.06787816648863378)),
 ('to', np.float64(0.059557249590423376)),
 ('of', np.float64(0.059497880275420745)),
 ('in', np.float64(0.04951346199062108)),
 ('her', np.float64(0.04711280758982641)),
 ('as', np.float64(0.03703647902862474)),
 ('with', np.float64(0.03664817964092528)),
 ('she', np.float64(0.03645810327735422))]

Topics associés aux documents

# Topic assigned to each document, together with its membership probability
# (first 10 documents only)
pandas.DataFrame(
    {"id": df["ID"], "topic": topics, "proba": probs}
).head(10)
id topic proba
0 1 0 0.853942
1 2 1 0.950680
2 3 0 0.848061
3 4 1 0.936802
4 5 -1 0.000000
5 6 2 1.000000
6 7 1 0.919525
7 8 0 0.887973
8 9 0 0.967274
9 10 0 1.000000
# detailed per-document information (topic, topic name, top terms, probability, ...)
tm.get_document_info(docs)
Document Topic Name Representation Representative_Docs Top_n_words Probability Representative_document
0 This guy has no idea of cinema. Okay, it seems... 0 0_the_of_to_this [the, of, to, this, and, it, movie, is, in, that] [In the title I write that the story is ludicr... the - of - to - this - and - it - movie - is -... 0.853942 False
1 This movie was extremely depressing.   The cha... 1 1_the_and_is_to [the, and, is, to, of, in, her, as, with, she] [Punctuating the opening credits sequence is a... the - and - is - to - of - in - her - as - wit... 0.950680 False
2 Now, I'm one to watch movies that got poor rev... 0 0_the_of_to_this [the, of, to, this, and, it, movie, is, in, that] [In the title I write that the story is ludicr... the - of - to - this - and - it - movie - is -... 0.848061 False
3 One hour, eight minutes and twelve seconds int... 1 1_the_and_is_to [the, and, is, to, of, in, her, as, with, she] [Punctuating the opening credits sequence is a... the - and - is - to - of - in - her - as - wit... 0.936802 False
4 Another FRIDAY THE 13TH ripoff, even featuring... -1 -1_the_of_and_to [the, of, and, to, in, is, it, that, this, as] [I was 16 when I first saw the movie, and it h... the - of - and - to - in - is - it - that - th... 0.000000 False
... ... ... ... ... ... ... ... ...
195 after seeing this film for the 3rd time now i ... 0 0_the_of_to_this [the, of, to, this, and, it, movie, is, in, that] [In the title I write that the story is ludicr... the - of - to - this - and - it - movie - is -... 0.935157 False
196 There are few movies that appear to provide en... 0 0_the_of_to_this [the, of, to, this, and, it, movie, is, in, that] [In the title I write that the story is ludicr... the - of - to - this - and - it - movie - is -... 0.876990 False
197 Vincent Price's follow-up to HOUSE OF WAX (195... -1 -1_the_of_and_to [the, of, and, to, in, is, it, that, this, as] [I was 16 when I first saw the movie, and it h... the - of - and - to - in - is - it - that - th... 0.000000 False
198 Let's face it: the final season (#8) was one o... 2 2_the_and_of_show [the, and, of, show, is, to, that, in, my, it] [Catscratch is the best thing to come out of N... the - and - of - show - is - to - that - in - ... 1.000000 False
199 Ardh Satya is one of the finest film ever made... -1 -1_the_of_and_to [the, of, and, to, in, is, it, that, this, as] [I was 16 when I first saw the movie, and it h... the - of - and - to - in - is - it - that - th... 0.000000 False

200 rows × 8 columns

Croisement avec la polarité des documents

# Cross-tabulate the review polarity (label) against the assigned topic
pandas.crosstab(index=df["label"], columns=topics)
col_0 -1 0 1 2
label
neg 42 43 9 6
pos 49 20 22 9

Traitement des documents supplémentaires

# load the additional document(s) from the "additional" sheet of the same workbook
dfSupp = pandas.read_excel("imdb_reviews_topic.xlsx",sheet_name="additional")
dfSupp
commentaires
0 Panic in the Streets is a fairly unknown littl...
# Identify the topic associated with each additional document,
# using the already fitted model
textes_supp = dfSupp["commentaires"].tolist()
top, pr = tm.transform(textes_supp)
print(top, pr)
[np.int64(-1)] [0.]

Fine-Tune Topics (post-traitement pour interprétation)

KeyBERTInspired

# import and instantiate the KeyBERTInspired representation model
from bertopic.representation import KeyBERTInspired
kbi_repr_model = KeyBERTInspired()

# update the topic representations using that model
tm.update_topics(docs,representation_model=kbi_repr_model)

# display the updated topics
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 -1 91 -1_film_acting_films_movie [film, acting, films, movie, very, bad, origin... [I was 16 when I first saw the movie, and it h...
1 0 63 0_film_movie_films_movies [film, movie, films, movies, acting, actors, h... [In the title I write that the story is ludicr...
2 1 31 1_film_movie_films_noir [film, movie, films, noir, scene, woman, chara... [Punctuating the opening credits sequence is a...
3 2 15 2_episodes_shows_scooby_episode [episodes, shows, scooby, episode, shaggy, sea... [Catscratch is the best thing to come out of N...
# terms and scores for topic 0 after the KeyBERTInspired update
tm.get_topic(0)
[('film', np.float32(0.40094614)),
 ('movie', np.float32(0.39515123)),
 ('films', np.float32(0.39073944)),
 ('movies', np.float32(0.3714434)),
 ('acting', np.float32(0.35525686)),
 ('actors', np.float32(0.31051892)),
 ('horror', np.float32(0.30528277)),
 ('characters', np.float32(0.17916778)),
 ('character', np.float32(0.15959851)),
 ('too', np.float32(0.15218914))]
# terms and scores for topic 1 after the KeyBERTInspired update
tm.get_topic(1)
[('film', np.float32(0.40409464)),
 ('movie', np.float32(0.3833954)),
 ('films', np.float32(0.3802221)),
 ('noir', np.float32(0.35741296)),
 ('scene', np.float32(0.3458007)),
 ('woman', np.float32(0.25563133)),
 ('character', np.float32(0.22626133)),
 ('plot', np.float32(0.21965522)),
 ('characters', np.float32(0.21897805)),
 ('wife', np.float32(0.21610656))]

Utilisation des n-grams

# update the topics again, this time allowing n-grams of 1 to 3 words
tm.update_topics(docs,representation_model=kbi_repr_model,n_gram_range=(1,3))
# display the updated topics
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 -1 91 -1_the film_film_films_movie [the film, film, films, movie, the movie, this... [I was 16 when I first saw the movie, and it h...
1 0 63 0_films_film_movie_movies [films, film, movie, movies, the movie, this f... [In the title I write that the story is ludicr...
2 1 31 1_the film_film_this film_films [the film, film, this film, films, movie, scen... [Punctuating the opening credits sequence is a...
3 2 15 2_the show_episodes_this show_shows [the show, episodes, this show, shows, the cha... [Catscratch is the best thing to come out of N...
# terms (now including multi-word expressions) and scores for topic 0
tm.get_topic(0)
[('films', np.float32(0.39660233)),
 ('film', np.float32(0.39083824)),
 ('movie', np.float32(0.38979483)),
 ('movies', np.float32(0.3852972)),
 ('the movie', np.float32(0.38005644)),
 ('this film', np.float32(0.35904634)),
 ('acting', np.float32(0.34009022)),
 ('this movie', np.float32(0.3388986)),
 ('actors', np.float32(0.30643165)),
 ('horror', np.float32(0.2766531))]

Part-of-speech (spaCy) – Catégories lexicales

# Import and instantiate the part-of-speech representation model.
# Each pattern matches a single token of one lexical category, so only
# nouns, verbs and adjectives are kept as candidate terms.
from bertopic.representation import PartOfSpeech
categories = ("NOUN", "VERB", "ADJ")
spacy_repr_model = PartOfSpeech(
    "en_core_web_sm",
    pos_patterns=[[{"POS": categorie}] for categorie in categories],
)

# update the topic representations using that model
tm.update_topics(docs, representation_model=spacy_repr_model)

# display the updated topics
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 -1 91 -1_film_one_movie_have [film, one, movie, have, like, has, more, good... [I was 16 when I first saw the movie, and it h...
1 0 63 0_movie_have_film_like [movie, have, film, like, one, did, movies, go... [In the title I write that the story is ludicr...
2 1 31 1_film_one_have_only [film, one, have, only, movie, like, about, ha... [Punctuating the opening credits sequence is a...
3 2 15 2_show_is_have_like [show, is, have, like, episode, season, one, g... [Catscratch is the best thing to come out of N...
# keywords for topic 0
tm.get_topic(0)
[('movie', np.float64(0.04911095233185278)),
 ('have', np.float64(0.028370742244443192)),
 ('film', np.float64(0.02480238380193781)),
 ('like', np.float64(0.023798011183105056)),
 ('one', np.float64(0.016300222876897634)),
 ('did', np.float64(0.014222837317518243)),
 ('movies', np.float64(0.014095807615209666)),
 ('good', np.float64(0.014051672325150224)),
 ('has', np.float64(0.0133941018577551)),
 ('bad', np.float64(0.01248388726269001))]
# keywords for topic 1
tm.get_topic(1)
[('film', np.float64(0.02964574641659732)),
 ('one', np.float64(0.020868520946990297)),
 ('have', np.float64(0.0163170518784571)),
 ('only', np.float64(0.015571014978517468)),
 ('movie', np.float64(0.014442370108761385)),
 ('like', np.float64(0.012406758333346484)),
 ('about', np.float64(0.011825087090145686)),
 ('has', np.float64(0.011471786510539199)),
 ('more', np.float64(0.011243136239215943)),
 ('story', np.float64(0.009942085493925168))]

Filtrage des termes pour l’interprétation (stop words)

# use NLTK's stop word list
import nltk
# download the stop word corpus
# in case it has not been fetched already
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
# load the English stop words into a list
from nltk.corpus import stopwords
mots_vides = stopwords.words('english')
print(mots_vides)
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she'd", "she'll", "she's", 'should', 'shouldn', "shouldn't", "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', "we'd", "we'll", "we're", 'were', 'weren', "weren't", "we've", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", 'your', "you're", 'yours', 'yourself', 'yourselves', "you've"]
# Complete the stop word list with extra terms to ignore when interpreting
# the topics: cinema vocabulary plus a few generic words ('like', 'good', 'bad').
# Only the terms that are not already present are appended, so re-running
# this cell does not accumulate duplicates in mots_vides.
termes_supp = ['movie','film','story','see','scene','films','like','good','bad']
mots_vides.extend(t for t in termes_supp if t not in mots_vides)
print(mots_vides)
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she'd", "she'll", "she's", 'should', 'shouldn', "shouldn't", "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', "we'd", "we'll", "we're", 'were', 'weren', "weren't", "we've", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", 'your', "you're", 'yours', 'yourself', 'yourselves', "you've", 'movie', 'film', 'story', 'see', 'scene', 'films', 'like', 'good', 'bad']
# use a custom CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# instantiate it so that the stop words are excluded,
# with unigrams and bigrams allowed (n-grams of at most 2 words)
# NOTE(review): no min_df is passed, so scikit-learn's default (min_df=1)
# applies and no minimum document frequency is enforced on the terms.
# presumably BERTopic fits this vectorizer on one aggregated text per topic,
# so a min_df would count topics rather than reviews -- confirm before adding one.
mon_vectorizer = CountVectorizer(stop_words=mots_vides,lowercase=True,ngram_range=(1,2))
# update the topics with this vectorizer
tm.update_topics(docs,vectorizer_model=mon_vectorizer)

# display the updated topics
tm.get_topic_info()
Topic Count Name Representation Representative_Docs
0 -1 91 -1_one_even_great_would [one, even, great, would, people, well, time, ... [I was 16 when I first saw the movie, and it h...
1 0 63 0_would_one_even_movies [would, one, even, movies, time, think, dont, ... [In the title I write that the story is ludicr...
2 1 31 1_one_woman_life_also [one, woman, life, also, man, great, love, cha... [Punctuating the opening credits sequence is a...
3 2 15 2_show_episode_season_one [show, episode, season, one, get, scooby, full... [Catscratch is the best thing to come out of N...
# terms and weights for topic 0 once the stop words are filtered out
tm.get_topic(0)
[('would', np.float64(0.016618210015872603)),
 ('one', np.float64(0.016453236552428727)),
 ('even', np.float64(0.01480983300607646)),
 ('movies', np.float64(0.014190746095391648)),
 ('time', np.float64(0.012085035143167762)),
 ('think', np.float64(0.011861915793560482)),
 ('dont', np.float64(0.011181744743718348)),
 ('characters', np.float64(0.011055136151299827)),
 ('horror', np.float64(0.010585302605585921)),
 ('seen', np.float64(0.010287901894462112))]
# terms and weights for topic 1 once the stop words are filtered out
tm.get_topic(1)
[('one', np.float64(0.02007071578432185)),
 ('woman', np.float64(0.010719423322500924)),
 ('life', np.float64(0.010646018460812593)),
 ('also', np.float64(0.009855377602328354)),
 ('man', np.float64(0.009215956811807627)),
 ('great', np.float64(0.009076533684174777)),
 ('love', np.float64(0.008681932113596968)),
 ('character', np.float64(0.008414582114286093)),
 ('time', np.float64(0.008331953844021328)),
 ('husband', np.float64(0.00830040450350368))]