Chargement et inspection des données

In [ ]:
# Working directory: defaults to the author's original demo folder, but can be
# overridden with the DEMO_DATA_DIR environment variable so the notebook can run
# on another machine without editing this cell.
import os
data_dir = os.environ.get("DEMO_DATA_DIR", "C:/Users/ricco/Desktop/demo")
os.chdir(data_dir)

# Load the data (the workbook path is relative to the working directory set above)
import pandas
full_D = pandas.read_excel("sentences_subjectivity.xlsx")
# Column names, non-null counts and dtypes
full_D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentype   10000 non-null  object
 1   sentence  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB
In [ ]:
# First rows of the dataset
full_D.head()
Out[ ]:
sentype sentence
0 objective the movie begins in the past where a young boy...
1 objective emerging from the human psyche and showing cha...
2 objective spurning her mother's insistence that she get ...
3 objective amitabh can't believe the board of directors a...
4 objective she , among others excentricities , talks to a...
In [ ]:
# Keep only the "sentence" column (the sentences themselves);
# the list selector [['sentence']] yields a one-column DataFrame
phrases = full_D[['sentence']]
phrases.head()
Out[ ]:
sentence
0 the movie begins in the past where a young boy...
1 emerging from the human psyche and showing cha...
2 spurning her mother's insistence that she get ...
3 amitabh can't believe the board of directors a...
4 she , among others excentricities , talks to a...

Clustering avec PyCaret

In [ ]:
# Install if needed
#!pip install pycaret
In [ ]:
# Version used in this tutorial
import pycaret
pycaret.__version__
Out[ ]:
'3.0.0'
In [ ]:
# Create an "experiment"
# Import the module that provides the clustering experiment class
from pycaret import clustering

# Instantiate
ce = clustering.ClusteringExperiment()

# Initialise
# cf. https://pycaret.readthedocs.io/en/latest/api/clustering.html#pycaret.clustering.setup
# session_id pins the experiment's random seed (PyCaret otherwise draws one at
# random on every run), so seed-dependent steps are repeatable; 123 matches the
# random_state passed to k-means further down.
ce.setup(data=phrases,text_features=['sentence'],session_id=123)
Out[ ]:
<pycaret.clustering.oop.ClusteringExperiment at 0x1b7419c5ca0>
In [ ]:
# k-means -- scikit-learn's implementation; random_state is passed as an additional parameter
km = ce.create_model('kmeans',num_clusters=2,random_state=123)
Initiated . . . . . . . . . . . . . . . . . . 11:19:53
Status . . . . . . . . . . . . . . . . . . Loading Dependencies
Estimator . . . . . . . . . . . . . . . . . . Compiling Library
  Silhouette Calinski-Harabasz Davies-Bouldin Homogeneity Rand Index Completeness
0 0.0028 33.9453 14.6118 0 0 0
Processing:   0%|          | 0/3 [00:00<?, ?it/s]
In [ ]:
# Type of the last object in the processing chain (pipeline)
type(km)
Out[ ]:
sklearn.cluster._kmeans.KMeans
In [ ]:
# Access its attributes -- e.g. the within-cluster inertia
km.inertia_
Out[ ]:
9770.411077901583
In [ ]:
# Cluster sizes (distribution plot)
ce.plot_model(km,'distribution')
In [ ]:
# Cluster membership of each sentence
new_D = ce.assign_model(km)
new_D.head()
Out[ ]:
sentence Cluster
0 the movie begins in the past where a young boy... Cluster 0
1 emerging from the human psyche and showing cha... Cluster 0
2 spurning her mother's insistence that she get ... Cluster 1
3 amitabh can't believe the board of directors a... Cluster 0
4 she , among others excentricities , talks to a... Cluster 1
In [ ]:
# Cross-tabulate the reference labels (subjectivity) against the k-means clusters
pandas.crosstab(index=full_D['sentype'], columns=new_D['Cluster'])
Out[ ]:
Cluster Cluster 0 Cluster 1
sentype
objective 2861 2139
subjective 4749 251
In [ ]:
# Display in the first factorial plane of the PCA
ce.plot_model(km,'cluster')

Décortiquer le processus de clustering

In [ ]:
# Description of the processing pipeline
ce.pipeline
Out[ ]:
Pipeline(memory=FastMemory(location=C:\Users\ricco\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=[], transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('text_embedding',
                 TransformerWrapper(include=['sentence'],
                                    transformer=EmbedTextFeatures()))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=FastMemory(location=C:\Users\ricco\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=[], transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('text_embedding',
                 TransformerWrapper(include=['sentence'],
                                    transformer=EmbedTextFeatures()))])
TransformerWrapper(include=[], transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerWrapper(include=[],
                   transformer=SimpleImputer(strategy='most_frequent'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(include=['sentence'], transformer=EmbedTextFeatures())
EmbedTextFeatures()
EmbedTextFeatures()
In [ ]:
# Type of the text_embedding step
type(ce.pipeline['text_embedding'])
Out[ ]:
pycaret.internal.preprocess.transformers.TransformerWrapper
In [ ]:
# Attributes and methods of the text_embedding step
dir(ce.pipeline['text_embedding'])
Out[ ]:
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_exclude',
 '_feature_names_in',
 '_get_param_names',
 '_get_tags',
 '_include',
 '_more_tags',
 '_name_cols',
 '_prepare_df',
 '_reorder_cols',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_train_only',
 '_validate_data',
 '_validate_params',
 'exclude',
 'feature_names_in_',
 'fit',
 'fit_transform',
 'get_params',
 'include',
 'set_output',
 'set_params',
 'target_name_',
 'transform',
 'transformer']
In [ ]:
# Parameters of the TransformerWrapper
ce.pipeline['text_embedding'].get_params()
Out[ ]:
{'exclude': None,
 'include': ['sentence'],
 'transformer__kwargs': None,
 'transformer__method': 'tf-idf',
 'transformer': EmbedTextFeatures()}
In [ ]:
# Type of the wrapped transformer
type(ce.pipeline['text_embedding'].transformer)
Out[ ]:
pycaret.internal.preprocess.transformers.EmbedTextFeatures
In [ ]:
# Attributes and methods of the wrapped transformer
dir(ce.pipeline['text_embedding'].transformer)
Out[ ]:
['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_validate_data',
 '_validate_params',
 'estimators_',
 'fit',
 'fit_transform',
 'get_params',
 'kwargs',
 'method',
 'set_output',
 'set_params',
 'transform']
In [ ]:
# Type of the underlying estimator --> scikit-learn
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
ce.pipeline['text_embedding'].transformer.estimators_
Out[ ]:
{'sentence': TfidfVectorizer()}
In [ ]:
# Vocabulary -> dictionary of the terms (term -> integer index)
ce.pipeline['text_embedding'].transformer.estimators_['sentence'].vocabulary_
Out[ ]:
{'the': 18605,
 'movie': 12180,
 'begins': 1833,
 'in': 9237,
 'past': 13420,
 'where': 20428,
 'young': 20822,
 'boy': 2347,
 'named': 12351,
 'sam': 16029,
 'attempts': 1389,
 'to': 18835,
 'save': 16118,
 'celebi': 3016,
 'from': 7470,
 'hunter': 8964,
 'emerging': 6039,
 'human': 8924,
 'psyche': 14543,
 'and': 918,
 'showing': 16765,
 'characteristics': 3111,
 'of': 12860,
 'abstract': 313,
 'expressionism': 6630,
 'minimalism': 11850,
 'russian': 15926,
 'constructivism': 3981,
 'graffiti': 8015,
 'removal': 15294,
 'has': 8419,
 'secured': 16367,
 'its': 9828,
 'place': 13842,
 'history': 8698,
 'modern': 11980,
 'art': 1224,
 'while': 20438,
 'being': 1852,
 'created': 4301,
 'by': 2681,
 'artists': 1244,
 'who': 20467,
 'are': 1152,
 'unconscious': 19408,
 'their': 18616,
 'artistic': 1241,
 'achievements': 394,
 'spurning': 17508,
 'her': 8590,
 'mother': 12129,
 'insistence': 9524,
 'that': 18604,
 'she': 16651,
 'get': 7768,
 'on': 12927,
 'with': 20591,
 'life': 10738,
 'mary': 11410,
 'is': 9789,
 'thrown': 18726,
 'out': 13074,
 'house': 8884,
 'rejected': 15204,
 'joe': 9990,
 'expelled': 6568,
 'school': 16212,
 'as': 1254,
 'grows': 8156,
 'larger': 10497,
 'child': 3215,
 'amitabh': 860,
 'can': 2785,
 'believe': 1863,
 'board': 2180,
 'directors': 5205,
 'his': 8688,
 'mind': 11835,
 'filled': 7001,
 'revenge': 15533,
 'what': 20414,
 'better': 1961,
 'than': 18597,
 'robbing': 15710,
 'bank': 1633,
 'himself': 8668,
 'ironic': 9764,
 'it': 9819,
 'may': 11498,
 'sound': 17314,
 'among': 865,
 'others': 13063,
 'excentricities': 6486,
 'talks': 18349,
 'small': 17095,
 'rock': 15724,
 'gertrude': 7763,
 'like': 10771,
 'if': 9072,
 'was': 20271,
 'alive': 758,
 'this': 18671,
 'gives': 7837,
 'girls': 7831,
 'fair': 6730,
 'chance': 3085,
 'pulling': 14588,
 'wool': 20651,
 'over': 13120,
 'eyes': 6676,
 'using': 19813,
 'sexiness': 16561,
 'poach': 13952,
 'any': 1035,
 'last': 10513,
 'vestige': 19980,
 'common': 3718,
 'sense': 16448,
 'dons': 5514,
 'might': 11784,
 'have': 8451,
 'had': 8260,
 'styled': 17908,
 'after': 609,
 'vh1': 19986,
 'behind': 1848,
 'music': 12270,
 'mockumentary': 11972,
 'profiles': 14401,
 'rise': 15663,
 'fall': 6748,
 'an': 891,
 'internet': 9637,
 'startup': 17623,
 'called': 2744,
 'icevan': 9026,
 'com': 3656,
 'blue': 2162,
 'not': 12687,
 'only': 12938,
 'predicament': 14205,
 'he': 8474,
 'also': 796,
 'lacks': 10427,
 'ability': 282,
 'outwardly': 13116,
 'express': 6625,
 'emotions': 6054,
 'killer': 10247,
 'clues': 3515,
 'perversion': 13662,
 'biblical': 1976,
 'punishments': 14615,
 'for': 7257,
 'sins': 16912,
 'stoning': 17759,
 'burning': 2635,
 'decapitation': 4702,
 'david': 4635,
 'painter': 13257,
 'block': 2124,
 'takes': 18333,
 'job': 9984,
 'waiter': 20193,
 'some': 17248,
 'inspiration': 9531,
 'women': 20626,
 'craved': 4288,
 'him': 8667,
 'men': 11646,
 'wanted': 20237,
 'be': 1758,
 'set': 16530,
 'island': 9800,
 'off': 12861,
 'coast': 3537,
 'florida': 7184,
 'techno': 18455,
 'rave': 14899,
 'party': 13392,
 'attracts': 1411,
 'diverse': 5407,
 'group': 8146,
 'college': 3620,
 'coeds': 3568,
 'guard': 8171,
 'officer': 12877,
 'lesson': 10681,
 'learned': 10598,
 'never': 12528,
 'mess': 11699,
 'gay': 7679,
 'mafia': 11151,
 'theme': 18624,
 'film': 7007,
 'simultaneously': 16885,
 'addresses': 469,
 'similarities': 16866,
 'between': 1967,
 'two': 19313,
 'factions': 6707,
 'law': 10555,
 'crime': 4341,
 'revealing': 15526,
 'brothers': 2512,
 'they': 18645,
 're': 14911,
 'jewish': 9963,
 'grandmothers': 8033,
 'lesbians': 10676,
 'but': 2661,
 'neglecting': 12477,
 'work': 20660,
 'carping': 2909,
 'at': 1353,
 'mom': 12014,
 'behaving': 1842,
 'badly': 1565,
 'toward': 18958,
 'loyal': 11019,
 'friend': 7449,
 'bobbi': 2193,
 'all': 759,
 'going': 7914,
 'gerry': 7759,
 'estranged': 6375,
 'wife': 20499,
 'margaret': 11326,
 'worried': 20682,
 'daughter': 4629,
 'safety': 15979,
 'finds': 7035,
 'herself': 8615,
 'another': 995,
 'target': 18383,
 'race': 14746,
 'find': 7032,
 'code': 3561,
 'valento': 19849,
 'feels': 6891,
 'heat': 8518,
 'turns': 19275,
 'table': 18297,
 'makes': 11207,
 'dupe': 5751,
 'into': 9670,
 'one': 12930,
 'own': 13201,
 'rubs': 15871,
 'da': 4528,
 'nose': 12680,
 'saigon': 15987,
 '1952': 90,
 'beautiful': 1788,
 'exotic': 6550,
 'mysterious': 12308,
 'city': 3357,
 'caught': 2991,
 'grips': 8117,
 'vietnamese': 20024,
 'war': 20240,
 'liberation': 10719,
 'french': 7426,
 'colonial': 3634,
 'powers': 14148,
 'deep': 4761,
 'northwest': 12673,
 'there': 18636,
 'lone': 10931,
 'ranch': 14834,
 'tucked': 19237,
 'away': 1506,
 'so': 17180,
 'purposefully': 14643,
 'way': 20306,
 'looking': 10949,
 'teenager': 18468,
 'father': 6833,
 'silver': 16860,
 'lead': 10579,
 'says': 16138,
 'rikki': 15643,
 'ortega': 13050,
 'moves': 12179,
 'king': 10270,
 'street': 17812,
 '193': 72,
 'nglio': 12555,
 'east': 5813,
 'side': 16811,
 'these': 18642,
 'games': 7618,
 'chasing': 3144,
 'rejecting': 15205,
 'seducing': 16377,
 'played': 13882,
 'economically': 5842,
 'spiritually': 17430,
 'depressed': 4927,
 'hong': 8785,
 'kong': 10341,
 'without': 20597,
 'much': 12197,
 'gusto': 8230,
 'television': 18487,
 'made': 11138,
 'famous': 6767,
 'biggest': 1990,
 'hits': 8712,
 'happened': 8356,
 'screen': 16281,
 'jordan': 10027,
 'long': 10936,
 'search': 16332,
 'true': 19208,
 'faith': 6739,
 'tries': 19146,
 'protect': 14498,
 'believes': 1866,
 'injustice': 9465,
 'bloody': 2147,
 'magic': 11157,
 'story': 17776,
 'zack': 20844,
 'eleven': 5949,
 'year': 20785,
 'old': 12898,
 'family': 6766,
 'visited': 20099,
 'three': 18704,
 'debt': 4693,
 'collectors': 3618,
 'however': 8902,
 'jane': 9882,
 'wendy': 20392,
 '12': 14,
 'sees': 16398,
 'make': 11204,
 'refuses': 15139,
 'tales': 18341,
 'elegant': 5937,
 'documentary': 5451,
 'sundance': 18051,
 'eloquent': 5983,
 'deeply': 4766,
 'moving': 12187,
 'la': 10407,
 'times': 18797,
 'toyo': 18977,
 'miyatake': 11955,
 'infinite': 9393,
 'shades': 16576,
 'gray': 8059,
 'penetrating': 13551,
 'portrait': 14072,
 'photographer': 13724,
 'truth': 19221,
 'beauty': 1791,
 'world': 20673,
 'impermanence': 9170,
 'straight': 17784,
 'up': 19753,
 'helicopters': 8554,
 'action': 421,
 'will': 20515,
 'take': 18330,
 'audiences': 1416,
 'series': 16505,
 'aerial': 566,
 'adventures': 550,
 'lapp': 10488,
 'woman': 20623,
 'anni': 977,
 'shelter': 16669,
 'both': 2306,
 'them': 18621,
 'farm': 6797,
 'touches': 18940,
 'encroachment': 6110,
 'christianity': 3284,
 'brought': 2513,
 'missionaries': 11924,
 'which': 20435,
 'odds': 12848,
 'mepe': 11667,
 'tribal': 19129,
 'traditional': 18995,
 'roots': 15808,
 'grisly': 8118,
 'murders': 12256,
 'brings': 2469,
 'fbi': 6853,
 'agent': 631,
 'graham': 8017,
 'norton': 12675,
 'retirement': 15500,
 'puts': 14663,
 'atrocious': 1373,
 'fiennes': 6974,
 'driven': 5670,
 'image': 9112,
 'painting': 13260,
 'soon': 17280,
 'team': 18428,
 'suspect': 18169,
 'knowles': 10327,
 'main': 11184,
 'objective': 12783,
 'actually': 442,
 'recover': 15048,
 'prototype': 14506,
 'dna': 5437,
 'testing': 18582,
 'machine': 11118,
 'huxley': 8987,
 'project': 14425,
 'company': 3742,
 'spent': 17399,
 'years': 20790,
 'millions': 11825,
 'dollars': 5482,
 'developing': 5058,
 'persuades': 13654,
 'renowned': 15319,
 'entomologist': 6239,
 'trip': 19162,
 'jungle': 10093,
 'butterfly': 2671,
 'leading': 10583,
 'adventure': 547,
 'transform': 19037,
 'lives': 10866,
 'rare': 14868,
 'gift': 7798,
 'melding': 11615,
 'subjectivity': 17930,
 'biographical': 2016,
 'facts': 6711,
 '225': 152,
 'rton': 15866,
 'sabina': 15946,
 'spielrein': 17408,
 'back': 1539,
 'body': 2202,
 'soul': 17309,
 'seeking': 16389,
 'mental': 11657,
 'escape': 6331,
 'simone': 16872,
 'tune': 19252,
 'happening': 8357,
 'other': 13062,
 'couples': 4213,
 'around': 1202,
 'beatle': 1782,
 'fan': 6769,
 'drama': 5604,
 'about': 294,
 'albert': 718,
 'psychotic': 14562,
 'prisoner': 14335,
 'devoted': 5082,
 'john': 9995,
 'lennon': 10662,
 'beatles': 1783,
 'then': 18628,
 '1974': 109,
 'something': 17254,
 'incredible': 9294,
 'fell': 6901,
 'love': 11002,
 'deathbed': 4678,
 'candice': 2795,
 'klein': 10301,
 'accidentally': 350,
 'asks': 1279,
 'question': 14707,
 'did': 5121,
 'ever': 6433,
 'do': 5438,
 'deserve': 4971,
 'shot': 16744,
 'scenes': 16172,
 'look': 10947,
 'how': 8900,
 'fictional': 6959,
 'kung': 10388,
 'fu': 7496,
 'basically': 1722,
 'making': 11211,
 'before': 1821,
 'investigation': 9723,
 'ends': 6131,
 'we': 20311,
 've': 19902,
 'met': 11709,
 'boyfriends': 2351,
 'drug': 5693,
 'dealer': 4667,
 'alicia': 749,
 'hadley': 8262,
 'dad': 4534,
 'nurses': 12758,
 'doctors': 5446,
 'orderly': 13019,
 'exactly': 6465,
 'good': 7939,
 '38': 187,
 'evil': 6452,
 'mexico': 11735,
 '2002': 139,
 'based': 1716,
 '1800': 50,
 'rainone': 14797,
 'affair': 572,
 'singing': 16899,
 'sensation': 16444,
 'kelly': 10192,
 'mcguire': 11524,
 'whom': 20475,
 'discovered': 5269,
 'near': 12443,
 'demise': 4861,
 'hands': 8331,
 'prot': 14495,
 '233': 158,
 'vincent': 20058,
 'riccola': 15596,
 'juice': 10069,
 'fuels': 7502,
 'roller': 15757,
 'coaster': 3539,
 'ride': 15618,
 'through': 18719,
 'debauchery': 4684,
 'decades': 4700,
 'greed': 8072,
 'sex': 16557,
 'drugs': 5695,
 'roll': 15755,
 'trapped': 19067,
 'lovers': 11010,
 'triangle': 19127,
 'ruthless': 15935,
 'game': 7614,
 'lust': 11082,
 'betrayal': 1952,
 'follow': 7231,
 'hypnotic': 9003,
 'journey': 10042,
 'discover': 5268,
 'self': 16414,
 'decent': 4713,
 'dull': 5724,
 'dek': 4809,
 'loves': 11011,
 'shirley': 16708,
 'humiliates': 8940,
 'proposing': 14474,
 'warning': 20258,
 'national': 12400,
 'since': 16888,
 'architects': 1142,
 'either': 5908,
 'busy': 2660,
 'otherwise': 13064,
 'or': 13001,
 'too': 18886,
 'conservative': 3952,
 'style': 17907,
 'ambivalent': 839,
 'honour': 8795,
 'falls': 6753,
 'numerobis': 12750,
 'goes': 7911,
 'town': 18965,
 'darkness': 4607,
 'tooth': 18893,
 'fairy': 6737,
 'strange': 17793,
 'employing': 6074,
 'home': 8759,
 'movies': 12185,
 'newly': 12541,
 'footage': 7249,
 'effort': 5883,
 'expose': 6615,
 'hungarian': 8954,
 'mutiple': 12296,
 'problems': 14358,
 '1940s': 81,
 'current': 4486,
 'matsumoto': 11467,
 'sawako': 16131,
 'were': 20394,
 'happy': 8363,
 'couple': 4211,
 'meddling': 11572,
 'parents': 13345,
 'chase': 3141,
 'success': 17972,
 'push': 14656,
 'tragic': 19001,
 'choice': 3251,
 'elvis': 5993,
 'teams': 18433,
 'jack': 9840,
 'ossie': 13058,
 'davis': 4639,
 'fellow': 6902,
 'nursing': 12759,
 'resident': 15413,
 'thinks': 18658,
 'president': 14261,
 'kennedy': 10196,
 'valiant': 19853,
 'codgers': 3567,
 'sally': 16015,
 'forth': 7321,
 'battle': 1744,
 'egyptian': 5898,
 'entity': 6238,
 'chosen': 3269,
 'term': 18543,
 'care': 2866,
 'facility': 6703,
 'hunting': 8966,
 'grounds': 8145,
 'everywhere': 6445,
 'plagued': 13850,
 'cats': 2988,
 'when': 20426,
 'meets': 11594,
 'carol': 2897,
 'lonely': 10933,
 'highway': 8653,
 'must': 12279,
 'begin': 1828,
 'avoiding': 1488,
 'private': 14340,
 'detective': 5030,
 'mr': 12190,
 'barlow': 1679,
 'terrifying': 18562,
 'inhuman': 9448,
 'creature': 4310,
 'uncover': 19416,
 'dark': 4601,
 'charlie': 3127,
 'used': 19804,
 'living': 10868,
 'poverty': 14135,
 'seemed': 16392,
 'impossible': 9201,
 'cass': 2943,
 'cary': 2933,
 'comfortable': 3683,
 'bountiful': 2329,
 'until': 19730,
 'doqa': 5530,
 'gracia': 7998,
 'comes': 3679,
 'bring': 2467,
 'normal': 12664,
 'come': 3669,
 'dreams': 5642,
 'those': 18682,
 'lost': 10984,
 'possibilities': 14099,
 'want': 20236,
 'realize': 14945,
 'grit': 8119,
 'determination': 5036,
 'molly': 12012,
 'guides': 8196,
 'epic': 6264,
 'step': 17675,
 'ahead': 662,
 'authorities': 1451,
 '500': 211,
 'miles': 11803,
 'australia': 1439,
 'outback': 13075,
 'rabbit': 14743,
 'proof': 14453,
 'fence': 6915,
 'bisects': 2034,
 'continent': 4022,
 'nevertherless': 12530,
 'feel': 6887,
 'responsible': 15459,
 'flag': 7106,
 'monday': 12028,
 'ceremony': 3055,
 'doctor': 5445,
 'realizes': 14947,
 'virus': 20087,
 'advises': 562,
 'woo': 20640,
 'girl': 7826,
 'somehow': 17251,
 'realizing': 14948,
 'munnabhai': 12247,
 'fallen': 6749,
 'none': 12645,
 'younger': 20823,
 'sister': 16922,
 'komal': 10339,
 'inhabit': 9436,
 'short': 16738,
 'time': 18789,
 'later': 10520,
 'maine': 11185,
 'woods': 20646,
 'overtaken': 13177,
 'blizzard': 2120,
 'vicious': 19998,
 'storm': 17773,
 'more': 12091,
 'ominous': 12922,
 'consequently': 3950,
 'enthusiastic': 6231,
 'road': 15694,
 'roadside': 15697,
 'obstacles': 12815,
 'threaten': 18698,
 'prevent': 14290,
 'boys': 2353,
 'competition': 3767,
 'gets': 7769,
 'trouble': 19196,
 'police': 13997,
 'simon': 16871,
 'represses': 15367,
 'death': 4677,
 'wish': 20579,
 'decides': 4720,
 'help': 8565,
 'chon': 3257,
 'travels': 19086,
 'new': 12531,
 'york': 20814,
 'roy': 15863,
 'bannon': 1643,
 'owen': 13199,
 'wilson': 20531,
 'overcome': 13127,
 'enemy': 6139,
 'adept': 476,
 'technological': 18456,
 'witchery': 20590,
 'curse': 4489,
 'marks': 11358,
 'destiny': 5011,
 'becomes': 1803,
 'less': 10678,
 'ordinary': 13021,
 'encounter': 6102,
 'herb': 8593,
 'mischievous': 11889,
 'malevolent': 11226,
 'geek': 7690,
 'call': 2741,
 'themselves': 18627,
 'starts': 17622,
 'hakimi': 8281,
 'freelance': 7416,
 'scriptwriter': 16301,
 'send': 16437,
 'imelda': 9130,
 'ex': 6462,
 'stormy': 17775,
 'night': 12579,
 'leads': 10584,
 'informants': 9408,
 'turn': 19271,
 'dead': 4659,
 'nick': 12565,
 'unhappy': 19567,
 'henry': 8587,
 'protective': 14501,
 'cop': 4107,
 'second': 16349,
 'part': 13371,
 'aki': 698,
 'kaurism': 10174,
 '228': 154,
 'ki': 10217,
 'finland': 7048,
 'trilogy': 19155,
 'follows': 7235,
 'man': 11249,
 'arrives': 1218,
 'helsinki': 8574,
 'beaten': 1780,
 'severely': 16554,
 'develops': 5061,
 'amnesia': 861,
 'accident': 348,
 'survivors': 18164,
 'start': 17615,
 'dropping': 5683,
 'flies': 7157,
 'edgar': 5849,
 'intent': 9595,
 'laying': 10568,
 'down': 5563,
 'rules': 15894,
 'turning': 19274,
 'coddled': 3560,
 'son': 17261,
 'someone': 17252,
 'bound': 2324,
 'red': 15067,
 'cord': 4127,
 'wanders': 20228,
 'forgotten': 7299,
 'sudden': 17990,
 'fame': 6758,
 'does': 5462,
 'seem': 16391,
 'solve': 17243,
 'everything': 6444,
 'no': 12618,
 'option': 12996,
 'katsuragi': 10169,
 'use': 19803,
 'martial': 11391,
 'arts': 1246,
 'skills': 16972,
 'fight': 6983,
 'muscle': 12265,
 'dome': 5490,
 'drawing': 5623,
 'kids': 10240,
 'writes': 20730,
 'children': 3221,
 'don': 5502,
 'grow': 8152,
 'artist': 1238,
 'hasn': 8423,
 'picked': 13748,
 'brush': 2528,
 'kidnapped': 10232,
 'held': 8549,
 'exchange': 6496,
 'priceless': 14303,
 'diamonds': 5102,
 'leader': 10581,
 'crew': 4332,
 'highly': 8652,
 'skilled': 16969,
 'urban': 19788,
 'thieves': 18650,
 'dmx': 5436,
 'forges': 7287,
 'unlikely': 19626,
 'alliance': 773,
 'taiwanese': 18326,
 'intelligence': 9584,
 'jet': 9953,
 'li': 10708,
 'rescue': 15394,
 'garmento': 7653,
 'tells': 18493,
 'satirical': 16092,
 'wholesale': 20471,
 'garment': 7652,
 'industry': 9354,
 'shady': 16580,
 'deals': 4670,
 'buck': 2542,
 'ruthlessness': 15937,
 'prerequisite': 14248,
 'career': 2869,
 'rudy': 15876,
 'yellowshirt': 20795,
 'investigator': 9726,
 'department': 4900,
 'witnesses': 20602,
 'firsthand': 7069,
 'painful': 13251,
 'legacy': 10630,
 'indian': 9312,
 'existence': 6539,
 'journeying': 10044,
 'vietnam': 20023,
 'pulaski': 14583,
 'tennessee': 18524,
 'danang': 4563,
 'tensely': 18528,
 'unfolds': 19543,
 'cultural': 4462,
 'differences': 5137,
 'separation': 16476,
 'toll': 18861,
 'riveting': 15689,
 'longing': 10941,
 'personal': 13640,
 'each': 5789,
 'weekend': 20352,
 'nothing': 12699,
 'hangover': 8340,
 'robert': 15712,
 'de': 4656,
 'niro': 12612,
 'plays': 13894,
 'therapist': 18634,
 'obsessive': 12811,
 'compulsive': 3810,
 'agoraphobic': 652,
 'left': 10625,
 'apartment': 1046,
 'six': 16937,
 'cyber': 4516,
 'breakers': 2409,
 'got': 7972,
 'dream': 5634,
 'win': 20536,
 'dance': 4564,
 'final': 7021,
 'reaching': 14916,
 'usa': 19802,
 'rival': 15680,
 'ld': 10577,
 'deal': 4666,
 'excessive': 6494,
 'live': 10859,
 'passion': 13410,
 'samia': 16034,
 'graced': 7993,
 'instinctive': 9552,
 'jenny': 9936,
 'thomas': 18673,
 'wants': 20239,
 'become': 1802,
 'professional': 14390,
 'dancer': 4565,
 'genetically': 7718,
 'engineered': 6161,
 'immune': 9155,
 'charisma': 3120,
 'draws': 5627,
 'claire': 3371,
 'brutally': 2532,
 'guns': 8221,
 'harmless': 8393,
 'dissidents': 5355,
 'doubt': 5547,
 'really': 14949,
 'trust': 19216,
 'anyone': 1038,
 'palm': 13279,
 'springs': 17500,
 'white': 20458,
 'backdrop': 1541,
 'fast': 6819,
 'paced': 13219,
 'circuit': 3343,
 'parties': 13383,
 'few': 6951,
 'questionable': 14708,
 'actions': 424,
 'finally': 7024,
 'questioning': 14710,
 'ramu': 14832,
 'stands': 17595,
 'repeats': 15338,
 'wisdom': 20573,
 'sharonna': 16635,
 'freeing': 7414,
 'posh': 14085,
 'guests': 8187,
 'attracting': 1406,
 'lexi': 10703,
 'anna': 973,
 'spanish': 17343,
 'greek': 8074,
 'believing': 1867,
 'parent': 13342,
 'bruce': 2519,
 'raised': 14800,
 'adoptive': 517,
 'identified': 9047,
 'deserters': 4968,
 'returned': 15515,
 'barracks': 1688,
 'court': 4220,
 'martialed': 11392,
 'frances': 7372,
 'break': 2407,
 'buys': 2679,
 'villa': 20047,
 'tuscan': 19279,
 'countryside': 4207,
 'anew': 937,
 'los': 10977,
 'angeles': 940,
 'april': 1109,
 '1992': 126,
 ...}
In [ ]:
# Size of the dictionary (number of distinct terms)
len(ce.pipeline['text_embedding'].transformer.estimators_['sentence'].vocabulary_)
Out[ ]:
20897
In [ ]:
# Number of features used by k-means
km.n_features_in_
Out[ ]:
20897

Variante "tandem analysis" pour le clustering de corpus

In [ ]:
# Variant: route the text features through a PCA before clustering
# Instantiate
ce_pca = clustering.ClusteringExperiment()

# Initialise
# cf. https://pycaret.readthedocs.io/en/latest/api/clustering.html#pycaret.clustering.setup
# k-means is then run in the first factorial plane (2 principal components)
# session_id pins the experiment's random seed (PyCaret otherwise draws one at
# random on every run), so the projection and the clusters built on it are
# repeatable; 123 matches the random_state given to k-means below.
ce_pca.setup(data=phrases,text_features=['sentence'],pca=True,pca_components=2,session_id=123)

# Fit the model
km_pca = ce_pca.create_model('kmeans',num_clusters=2,random_state=123)
Initiated . . . . . . . . . . . . . . . . . . 11:41:38
Status . . . . . . . . . . . . . . . . . . Loading Dependencies
Estimator . . . . . . . . . . . . . . . . . . Compiling Library
  Silhouette Calinski-Harabasz Davies-Bouldin Homogeneity Rand Index Completeness
0 0.3195 4665.4252 1.1121 0 0 0
Processing:   0%|          | 0/3 [00:00<?, ?it/s]
In [ ]:
# Associated pipeline
ce_pca.pipeline
Out[ ]:
Pipeline(memory=FastMemory(location=C:\Users\ricco\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=[], transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('text_embedding',
                 TransformerWrapper(include=['sentence'],
                                    transformer=EmbedTextFeatures())),
                ('pca',
                 TransformerWrapper(exclude=[],
                                    transformer=PCA(n_components=2)))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=FastMemory(location=C:\Users\ricco\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=[], transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('text_embedding',
                 TransformerWrapper(include=['sentence'],
                                    transformer=EmbedTextFeatures())),
                ('pca',
                 TransformerWrapper(exclude=[],
                                    transformer=PCA(n_components=2)))])
TransformerWrapper(include=[], transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerWrapper(include=[],
                   transformer=SimpleImputer(strategy='most_frequent'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(include=['sentence'], transformer=EmbedTextFeatures())
EmbedTextFeatures()
EmbedTextFeatures()
TransformerWrapper(exclude=[], transformer=PCA(n_components=2))
PCA(n_components=2)
PCA(n_components=2)
In [ ]:
# Display in the factorial plane
ce_pca.plot_model(km_pca,'cluster')