Versions¶

In [ ]:
# Show the version of the Python interpreter running this notebook
import sys
print(sys.version)
3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]
In [ ]:
# Show the installed spaCy version
import spacy
print(spacy.__version__)
3.7.2

Analyse de texte¶

In [ ]:
#texte à parser - première strophe de la chanson "Vesoul" de Jacques Brel
#un peu normalisée
texte = "Jacques Brel. Tu as voulu voir Vierzon, et on a vu Vierzon. Tu as voulu voir Vesoul, et on a vu Vesoul. Tu as voulu voir Honfleur, et on a vu Honfleur. Tu as voulu voir Hambourg, et on a vu Hambourg. J'ai voulu voir Anvers, on a revu Hambourg. J'ai voulu voir ta soeur, et on a vu ta mère. Comme toujours."
texte = texte.lower()
print(texte)
jacques brel. tu as voulu voir vierzon, et on a vu vierzon. tu as voulu voir vesoul, et on a vu vesoul. tu as voulu voir honfleur, et on a vu honfleur. tu as voulu voir hambourg, et on a vu hambourg. j'ai voulu voir anvers, on a revu hambourg. j'ai voulu voir ta soeur, et on a vu ta mère. comme toujours.
In [ ]:
# Pretrained "model" (pipeline) used to parse French documents
# Available French "models" are listed at https://spacy.io/models/fr
parser = spacy.load("fr_core_news_lg")

# Run the pipeline on the text (processing + tagging);
# my_doc is the parsed document reused by the cells below
my_doc = parser(texte)
In [ ]:
# Check: the raw text that was parsed
my_doc.text
Out[ ]:
"jacques brel. tu as voulu voir vierzon, et on a vu vierzon. tu as voulu voir vesoul, et on a vu vesoul. tu as voulu voir honfleur, et on a vu honfleur. tu as voulu voir hambourg, et on a vu hambourg. j'ai voulu voir anvers, on a revu hambourg. j'ai voulu voir ta soeur, et on a vu ta mère. comme toujours."
In [ ]:
# For each token produced by tokenization,
# print: text + lemma + part-of-speech category + morphological features
for mot in my_doc:
    print(f"{mot.text} : {mot.lemma_}, {mot.pos_}, {mot.morph}")
jacques : jacques, PROPN, Gender=Masc|Number=Sing
brel : brel, PROPN, Gender=Masc|Number=Sing
. : ., PUNCT, 
tu : tu, PRON, Number=Sing|Person=1
as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
vierzon : vierzon, NOUN, Gender=Masc|Number=Sing
, : ,, PUNCT, 
et : et, CCONJ, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
vierzon : vierzon, PROPN, 
. : ., PUNCT, 
tu : tu, PRON, Number=Sing|Person=1
as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
vesoul : vesoul, NOUN, Gender=Masc|Number=Sing
, : ,, PUNCT, 
et : et, CCONJ, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
vesoul : vesoul, NOUN, Gender=Masc|Number=Sing
. : ., PUNCT, 
tu : tu, PRON, Number=Sing|Person=1
as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
honfleur : honfleur, NOUN, Gender=Masc|Number=Sing
, : ,, PUNCT, 
et : et, CCONJ, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
honfleur : honfleur, NOUN, Gender=Masc|Number=Sing
. : ., PUNCT, 
tu : tu, PRON, Number=Sing|Person=1
as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
hambourg : hambourg, PROPN, Gender=Masc|Number=Sing
, : ,, PUNCT, 
et : et, CCONJ, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
hambourg : hambourg, PROPN, Gender=Masc|Number=Sing
. : ., PUNCT, 
j' : je, PRON, Number=Sing|Person=1
ai : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
anvers : anvers, PROPN, Gender=Masc|Number=Sing
, : ,, PUNCT, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
revu : revoir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
hambourg : hambourg, PROPN, Gender=Masc|Number=Sing
. : ., PUNCT, 
j' : je, PRON, Number=Sing|Person=1
ai : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
voir : voir, VERB, VerbForm=Inf
ta : ton, DET, Gender=Fem|Number=Sing|Poss=Yes
soeur : soeur, NOUN, Gender=Fem|Number=Sing
, : ,, PUNCT, 
et : et, CCONJ, 
on : on, PRON, Number=Sing|Person=3
a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part
ta : ton, DET, Gender=Fem|Number=Sing|Poss=Yes
mère : mère, NOUN, Gender=Fem|Number=Sing
. : ., PUNCT, 
comme : comme, ADP, 
toujours : toujours, ADV, 
. : ., PUNCT, 
In [ ]:
# Filtering: drop punctuation, coordinating conjunctions and determiners
# (every token tagged DET is removed, not only possessive ones).
# The remaining tokens are reduced to their lemma.
texte_clean = []
for mot in my_doc:
    if mot.pos_ in ('PUNCT', 'CCONJ', 'DET'):
        continue
    texte_clean.append(mot.lemma_)
print(" ".join(texte_clean))
jacques brel tu avoir vouloir voir vierzon on avoir voir vierzon tu avoir vouloir voir vesoul on avoir voir vesoul tu avoir vouloir voir honfleur on avoir voir honfleur tu avoir vouloir voir hambourg on avoir voir hambourg je avoir vouloir voir anvers on avoir revoir hambourg je avoir vouloir voir soeur on avoir voir mère comme toujours
In [ ]:
# Vocabulary built after the fact: the distinct lemmas found in texte_clean
import numpy
dico_bis = numpy.unique(texte_clean)
print(dico_bis)
['anvers' 'avoir' 'brel' 'comme' 'hambourg' 'honfleur' 'jacques' 'je'
 'mère' 'on' 'revoir' 'soeur' 'toujours' 'tu' 'vesoul' 'vierzon' 'voir'
 'vouloir']
In [ ]:
# Named-entity detection (real-world objects),
# e.g. persons, companies, locations, organizations, products.
# Each element of my_doc.ents is printed as: entity text, entity label
for mot in my_doc.ents:
    print(mot.text,mot.label_)
jacques brel PER
tu as MISC
vierzon LOC
vierzon LOC
tu as MISC
vesoul LOC
vesoul LOC
tu as MISC
honfleur LOC
honfleur LOC
tu as MISC
hambourg MISC
hambourg LOC
anvers LOC
hambourg LOC
In [ ]:
# displacy is used to visualize the annotated elements of the document
from spacy import displacy

# Rendering of the named entities
displacy.render(my_doc,style='ent')
jacques brel PER . tu as MISC voulu voir vierzon LOC , et on a vu vierzon LOC . tu as MISC voulu voir vesoul LOC , et on a vu vesoul LOC . tu as MISC voulu voir honfleur LOC , et on a vu honfleur LOC . tu as MISC voulu voir hambourg MISC , et on a vu hambourg LOC . j'ai voulu voir anvers LOC , on a revu hambourg LOC . j'ai voulu voir ta soeur, et on a vu ta mère. comme toujours.
In [ ]:
# Glossary lookup: textual description of the 'LOC' label
# NOTE(review): the description comes from spaCy's generic glossary — confirm it
# matches the annotation scheme actually used by the loaded French pipeline
spacy.explain('LOC')
Out[ ]:
'Non-GPE locations, mountain ranges, bodies of water'
In [ ]:
# Sentence structure: rendering of the dependency parse
displacy.render(my_doc, style="dep")
jacques PROPN brel. PROPN tu PRON as AUX voulu VERB voir VERB vierzon, NOUN et CCONJ on PRON a AUX vu VERB vierzon. PROPN tu PRON as AUX voulu VERB voir VERB vesoul, NOUN et CCONJ on PRON a AUX vu VERB vesoul. NOUN tu PRON as AUX voulu VERB voir VERB honfleur, NOUN et CCONJ on PRON a AUX vu VERB honfleur. NOUN tu PRON as AUX voulu VERB voir VERB hambourg, PROPN et CCONJ on PRON a AUX vu VERB hambourg. PROPN j' PRON ai AUX voulu VERB voir VERB anvers, PROPN on PRON a AUX revu VERB hambourg. PROPN j' PRON ai AUX voulu VERB voir VERB ta DET soeur, NOUN et CCONJ on PRON a AUX vu VERB ta DET mère. NOUN comme ADP toujours. ADV flat:name nsubj aux:tense xcomp obj cc nsubj aux:tense conj obj nsubj aux:tense xcomp obj cc nsubj aux:tense conj obj nsubj aux:tense xcomp obj cc nsubj aux:tense conj obj nsubj aux:tense xcomp obj cc nsubj aux:tense conj obj nsubj aux:tense xcomp obj nsubj aux:tense parataxis nsubj nsubj aux:tense xcomp det obj cc nsubj aux:tense conj det obj fixed

Représentation vectorielle des termes¶

In [ ]:
# List the tokens of the document
# together with their (zero-based) position
for i,mot in enumerate(my_doc):
    print(i, mot.text)
0 jacques
1 brel
2 .
3 tu
4 as
5 voulu
6 voir
7 vierzon
8 ,
9 et
10 on
11 a
12 vu
13 vierzon
14 .
15 tu
16 as
17 voulu
18 voir
19 vesoul
20 ,
21 et
22 on
23 a
24 vu
25 vesoul
26 .
27 tu
28 as
29 voulu
30 voir
31 honfleur
32 ,
33 et
34 on
35 a
36 vu
37 honfleur
38 .
39 tu
40 as
41 voulu
42 voir
43 hambourg
44 ,
45 et
46 on
47 a
48 vu
49 hambourg
50 .
51 j'
52 ai
53 voulu
54 voir
55 anvers
56 ,
57 on
58 a
59 revu
60 hambourg
61 .
62 j'
63 ai
64 voulu
65 voir
66 ta
67 soeur
68 ,
69 et
70 on
71 a
72 vu
73 ta
74 mère
75 .
76 comme
77 toujours
78 .
In [ ]:
# Does the token at index 7 ("vierzon") have a vector representation?
# Index 7 is zero-based, i.e. the eighth token of the listing above
my_doc[7].has_vector
Out[ ]:
True
In [ ]:
# Retrieve the vector of the token at index 7
my_doc[7].vector
Out[ ]:
array([-4.0657e-01, -2.0025e-01,  2.8754e-01, -4.2928e-01,  2.7163e-01,
        6.9482e-02, -1.2550e-01,  3.7818e-01,  9.7598e-01, -1.6251e-01,
        1.9031e-01, -7.9347e-01, -4.6508e-01, -1.5558e-01,  1.2947e+00,
       -1.1455e+00,  3.9202e-01, -1.0322e+00, -9.7683e-01,  4.5076e-01,
        7.1663e-01,  7.7132e-01,  4.7204e-01, -2.0041e-01,  7.8794e-01,
       -1.2724e+00,  1.5912e+00, -7.8458e-01,  2.5732e-01, -6.6908e-01,
       -8.5539e-01,  1.2967e+00, -2.6708e-01, -1.9598e-01, -1.4309e-01,
        6.6540e-03, -1.2907e+00,  1.4002e+00, -1.8218e+00,  5.3225e-01,
        2.0713e-01,  2.2242e-01,  1.0662e-01,  2.2235e-01,  7.5361e-01,
        1.1899e+00,  1.5472e-01, -2.3241e-01,  5.6915e-01,  1.5155e+00,
       -3.5420e-01, -5.5860e-01, -1.3645e-02, -4.5693e-01,  1.1369e+00,
        1.3149e+00, -9.6851e-01, -4.0033e-01,  5.4487e-02, -6.6694e-01,
        1.7506e-01, -6.2798e-01,  1.0074e+00, -5.3175e-01, -3.0654e-01,
       -6.5145e-01, -4.2323e-01,  8.6252e-01,  1.3160e+00, -2.0910e-01,
       -1.4855e-01, -5.0082e-02, -4.8467e-01, -8.6203e-02, -1.3226e+00,
       -5.1566e-01,  1.8179e-01,  1.0349e+00, -4.2297e-01,  1.3841e+00,
       -1.5807e+00, -1.4678e-01, -1.3161e-01, -1.6265e+00, -3.7110e-01,
        7.9721e-01, -7.4050e-01, -7.7008e-01,  3.2064e-01,  1.8384e+00,
       -3.8825e-01,  5.7544e-02, -1.4938e+00, -4.4638e-01, -6.5860e-01,
       -5.6837e-01,  4.1797e-01,  2.2528e-01,  3.4527e-01, -3.9242e-01,
       -2.1223e-01, -3.6062e-01, -9.2100e-01, -8.9633e-01, -1.2510e-01,
       -6.2732e-02,  8.1116e-01,  1.1423e-01, -1.2302e+00,  1.9736e-01,
       -1.2433e-01,  3.7199e-01,  2.4325e-01, -1.6955e-01, -2.1964e-01,
        1.8599e+00,  1.1425e-02, -2.0298e-01,  8.9051e-02, -3.0032e-01,
       -2.0283e-01,  2.9174e-01, -2.5473e-01, -2.2293e-01, -8.4426e-01,
        1.3266e+00,  1.6016e+00,  1.2428e+00, -4.0662e-01,  8.2429e-02,
        1.4955e+00, -6.6329e-01,  3.3609e-01,  9.4762e-02,  5.3329e-01,
       -7.0475e-01,  5.5864e-01, -2.7856e-01, -1.9157e+00,  2.3704e-01,
       -4.2899e-01, -5.7577e-01,  1.5515e-02,  2.4383e-01,  4.9577e-01,
        1.1405e+00, -1.6564e-01,  2.8747e-01,  1.7616e+00, -1.6911e-01,
       -6.3712e-01, -1.9791e-01, -8.2447e-01,  5.8574e-01,  6.3516e-01,
        4.5650e-01,  1.6442e-01, -3.8441e-01, -1.7971e-01,  1.2859e+00,
       -1.2575e+00, -1.4858e+00,  2.4661e-01,  4.1250e-01,  9.1271e-01,
        3.2353e-01,  8.6869e-01,  5.4169e-02, -9.4836e-01,  2.6973e-01,
        1.4008e+00,  2.8192e-01,  5.7193e-01, -1.0867e+00,  1.7838e-01,
        5.4392e-01, -4.2186e-01, -3.0026e-01,  9.0635e-02, -5.0220e-01,
        3.3055e-01,  3.0955e-03, -5.8019e-01, -2.9985e-01, -5.4717e-01,
       -8.2679e-01, -7.0257e-02, -8.1559e-01, -1.2090e+00,  2.7562e-01,
       -6.7841e-01,  2.0542e-01,  4.9799e-01, -1.2865e-01,  4.2938e-01,
        1.2731e+00, -1.2410e-01, -3.4725e-01,  5.9236e-01, -4.8761e-01,
       -6.2629e-01,  2.3057e-03, -9.2564e-01,  4.8470e-02, -6.3772e-02,
       -6.7288e-01,  2.7903e-01,  2.8268e-02, -1.4076e-01,  6.8378e-01,
        2.1612e-01,  6.0460e-01,  3.6177e-01, -1.0455e+00,  8.5451e-01,
        2.4390e-01,  1.4620e+00, -2.3945e-01,  1.3431e+00, -9.8238e-01,
        8.8327e-01, -1.7935e-01,  6.6251e-01,  2.0690e-01, -1.0792e-01,
        7.4089e-01,  1.6170e+00, -7.2707e-02, -6.2213e-01,  9.6534e-01,
       -2.9215e-02,  7.9944e-01,  6.5622e-02,  4.6552e-01, -1.1256e-01,
       -1.3510e+00, -6.9111e-01,  9.5096e-01, -9.6088e-01, -2.8245e-01,
       -8.6613e-01, -1.6196e-01, -3.5614e-01, -6.0044e-01, -1.4988e+00,
        6.8974e-01, -6.8883e-01,  1.6172e-01,  4.8205e-01, -1.1944e-01,
        2.3086e-01, -6.7829e-01,  8.8963e-02,  9.0213e-01,  9.4611e-01,
       -7.9906e-02, -4.2308e-01,  1.0481e+00,  1.3916e+00,  6.3237e-01,
        3.5527e-01, -4.3014e-01, -4.9953e-02, -4.2331e-01, -1.0916e+00,
        7.2194e-01, -5.0015e-01,  9.2636e-01,  3.7391e-01,  8.9174e-01,
       -1.0275e+00,  8.1164e-01,  3.2245e-01,  4.8588e-01, -6.6116e-01,
        6.7272e-01,  8.8292e-01,  6.3029e-01, -5.9496e-01, -5.2185e-03,
        4.8353e-01,  7.0208e-01, -5.4277e-01, -6.2411e-01, -1.3636e+00,
       -9.9813e-01, -7.3958e-01, -1.0664e-03, -1.2001e+00,  3.6152e-01,
       -6.2169e-01,  1.8455e-01, -2.1798e+00, -8.7334e-01, -2.0561e-02,
        1.2941e+00,  5.9920e-01, -2.1451e-02, -1.2552e+00, -4.8696e-01],
      dtype=float32)
In [ ]:
# Dimension of the token vector
my_doc[7].vector.shape
Out[ ]:
(300,)
In [ ]:
# Tokens at indices 7 and 13 are the same word
print(my_doc[7].text)
print(my_doc[13].text)
vierzon
vierzon
In [ ]:
# Check that both occurrences share the same representation:
# the sum of squared differences between the two vectors is 0 when they are identical
numpy.sum((my_doc[7].vector-my_doc[13].vector)**2)
Out[ ]:
0.0
In [ ]:
# Word embeddings: map each distinct token text to its vector
coord = dict()

# Walk the document in order; setdefault stores a vector only the first time
# a given text is met, so a repeated word keeps the vector of its first occurrence
for mot in my_doc:
    coord.setdefault(mot.text, mot.vector)

# Coordinates of "vesoul"
print(coord['vesoul'])
[-5.8407e-01  3.4424e-01 -2.2901e-01  9.9353e-02 -1.6188e-01 -2.3497e-01
  8.0822e-01  3.4484e-01 -7.7716e-01  6.5560e-01  2.2229e-01 -9.4003e-02
 -4.8322e-01 -4.5401e-01  1.4715e-01  6.7225e-03  7.6638e-01 -7.5615e-01
 -3.0963e-01 -6.9015e-02  7.1394e-01  1.6666e+00  1.2212e+00 -2.3179e-01
 -2.1714e-01 -1.1320e+00 -7.4422e-01  1.0710e+00  5.6044e-01 -1.2510e+00
  8.6863e-02  2.7075e-01 -2.3794e-01 -8.0670e-01  5.0118e-01 -3.1831e-01
  2.9396e-02 -6.4636e-03  9.8959e-01  6.3312e-02  1.7129e-01 -8.4535e-01
  6.1919e-01  1.2757e+00 -4.2155e-01 -2.2575e-01 -2.0978e-01 -1.8917e+00
  7.6063e-02  1.6763e+00 -5.0706e-01 -8.3366e-01 -2.8649e-01 -2.9436e-01
  1.0061e+00  5.0607e-01 -8.6939e-01  6.5620e-01  4.0167e-01 -9.4017e-01
  8.5270e-01 -4.5480e-01 -7.5253e-01 -8.2199e-02  4.8929e-02  4.7423e-01
 -3.4895e-01  9.0311e-01 -1.0631e-01  1.6204e-02  4.0598e-01 -4.3379e-01
  3.8747e-01 -6.8255e-01 -7.3036e-01 -6.5625e-01  1.1343e+00  2.2641e-01
  9.6053e-02  5.5185e-01 -1.0586e-02  7.1051e-01  1.7653e-01 -9.8917e-01
 -3.4193e-01  1.1521e+00  3.6208e-01 -4.9975e-01 -9.4832e-01  1.1999e+00
 -6.6913e-01 -7.7592e-01 -1.0546e+00 -1.2149e+00 -7.2905e-01  5.2374e-01
  1.2252e+00  7.0492e-01  7.9432e-01 -9.6642e-01 -1.9159e-01 -4.0667e-01
 -3.2944e-01 -1.6421e-01  6.0935e-01  9.0460e-01  7.6646e-01 -8.5255e-01
 -6.4933e-01  2.4853e-01 -6.8751e-01 -3.0451e-01  3.2306e-01 -3.2068e-01
  6.6219e-02  7.0929e-01  1.0140e-01 -5.0990e-01 -4.8441e-01 -1.0660e+00
 -5.4247e-01 -4.4798e-01  9.6521e-01  5.0421e-01 -8.3784e-01  3.4046e-01
  8.2372e-01 -3.9867e-01  4.4751e-01  4.6875e-01  1.3789e+00 -8.0593e-01
  5.2308e-01 -5.4083e-01  7.8277e-01 -1.3942e-01 -6.4330e-01  6.8972e-01
 -2.1223e+00 -1.6389e-01 -4.2503e-01  7.6170e-02 -3.2761e-01  3.2396e-01
 -1.7469e+00  9.3386e-01 -1.3022e+00  1.2676e+00  1.5521e+00 -6.1249e-01
 -2.1223e-01  7.0340e-01 -8.8014e-02 -7.0633e-01  1.3715e+00 -2.4819e-01
 -5.6164e-01 -6.6568e-01  7.4097e-01 -1.5313e-01 -1.6301e+00 -8.9272e-01
 -1.3026e-01  2.7284e-01  8.0913e-01  5.6019e-01  9.5325e-01  1.1896e+00
  2.3602e-01 -1.9886e-01  5.0916e-01  8.7401e-01  5.0070e-01 -9.1593e-01
  8.2782e-01  1.2882e+00 -4.0974e-01  1.6180e+00 -5.8194e-01  1.0008e-01
  8.3763e-01  5.6150e-01 -8.4785e-01 -4.8341e-01  5.1680e-01  8.1683e-01
 -5.9333e-01 -6.7202e-01 -2.5270e-01  5.5265e-01 -5.1497e-01  1.1494e-01
  1.9935e-03 -1.4527e-02  2.4835e-01  4.5724e-02  7.0839e-01  1.0196e-01
  9.9780e-01 -4.6006e-01 -4.3787e-01  5.8046e-01 -5.2978e-01 -9.3563e-01
 -4.0745e-01  1.1533e+00 -3.3321e-01 -1.0931e-01  3.1795e-01 -2.9053e-01
 -3.9995e-01 -2.7213e-01 -5.3465e-01 -7.3220e-01  1.4295e+00  1.7616e-01
  6.4572e-01 -6.0241e-01  7.3246e-01  6.7409e-02  2.5494e-01  1.9218e-01
 -8.8132e-02 -2.2286e-01  6.1991e-02 -2.7595e-01 -1.0547e-01 -3.8189e-01
 -3.4555e-01  1.5986e+00 -1.4851e-01  8.0789e-01  8.0484e-01  2.9581e-01
 -9.1896e-02 -1.1847e+00 -1.0589e-01 -1.0386e+00  8.4735e-01  4.6961e-02
 -4.6145e-01 -2.3212e-01 -1.6656e+00  4.8900e-01 -4.6923e-01  2.6997e-01
 -5.6505e-01  1.2146e-01  3.4244e-01 -9.3093e-01 -1.1812e+00 -3.9865e-01
  6.0374e-02  1.9107e+00  5.9228e-01 -6.1370e-03 -2.6013e-01  4.6038e-01
  6.4547e-01  8.1412e-01  9.2802e-01 -5.2924e-02  5.3487e-01  6.7531e-01
 -3.0926e-01  1.8939e-01 -3.0234e-01  4.0307e-02  3.5449e-01  4.6310e-01
  6.5369e-01  1.1442e+00  8.3256e-01 -1.1165e+00 -2.3187e-01 -7.0950e-01
  3.0229e-01  2.7046e-01 -1.3302e+00 -5.2441e-01 -1.5810e-01 -4.7788e-01
 -3.7892e-01 -5.6443e-01 -9.0476e-02 -2.7887e-01  1.3334e+00  1.5497e+00
 -2.3411e-01  6.1919e-02 -3.8621e-01 -1.1364e-01 -7.4522e-02 -3.0313e-01
  1.3659e+00  7.7729e-01  1.1739e+00  5.4830e-01 -1.1586e-01 -6.0751e-01]
In [ ]:
# Dimension check again, this time through the coord dictionary
print(coord['vierzon'].shape)
(300,)
In [ ]:
# List of the vectorized terms (keys of the coord dictionary)
print(coord.keys())
dict_keys(['jacques', 'brel', '.', 'tu', 'as', 'voulu', 'voir', 'vierzon', ',', 'et', 'on', 'a', 'vu', 'vesoul', 'honfleur', 'hambourg', "j'", 'ai', 'anvers', 'revu', 'ta', 'soeur', 'mère', 'comme', 'toujours'])
In [ ]:
# Cosine similarity function
def similarite_cosinus(v1,v2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        dot(v1, v2) / (||v1|| * ||v2||). When either vector has a zero norm
        the ratio is undefined (0/0); 0.0 is returned instead of NaN.
    """
    denom = numpy.linalg.norm(v1)*numpy.linalg.norm(v2)
    # An all-zero vector has no direction: avoid the 0/0 division, which would
    # otherwise yield NaN together with a RuntimeWarning
    if denom == 0:
        return 0.0
    return numpy.dot(v1,v2)/denom
In [ ]:
# Cosine similarity between "vesoul" and "vierzon"
print(similarite_cosinus(coord['vesoul'], coord['vierzon']))
0.39401367
In [ ]:
# Cosine similarity between "jacques" and "brel"
print(similarite_cosinus(coord['jacques'], coord['brel']))
0.36657745
In [ ]:
# spaCy's built-in similarity between tokens 0 and 1 ("jacques" and "brel");
# the value agrees with similarite_cosinus, so it is indeed a cosine similarity
my_doc[0].similarity(my_doc[1])
Out[ ]:
0.3665774464607239

Représentation d'un document¶

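Un document est composé d'un ensemble de termes : la représentation d'un (sous-)document est la moyenne des vecteurs de ses termes, ce que l'on vérifie plus bas sur « jacques brel ».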

In [ ]:
# Vector of the whole paragraph (the entire verse)
print(my_doc.vector)
[ 8.54071043e-03  2.74834752e-01 -3.50937581e+00  7.40022004e-01
  1.25529134e+00  1.53335011e+00 -2.32128334e+00 -1.02142103e-01
  4.32935268e-01  5.98222077e-01  1.19048464e+00  1.03554368e+00
  5.38600087e-01 -1.89471453e-01  4.41725731e-01 -3.69690716e-01
 -6.61469996e-01 -1.23821807e+00  1.27446759e+00 -1.84450102e+00
  9.00520325e-01 -3.27972978e-01  1.05935395e+00  1.22191370e+00
  2.72004902e-01  1.01664329e+00  8.38955343e-01 -1.98171884e-01
 -4.58234906e-01  2.57503009e+00  3.09241080e+00 -2.28456914e-01
  6.73313998e-03 -2.05305529e+00  6.43110633e-01  5.82398176e-01
 -1.78304088e+00  3.95900726e+00 -7.43371546e-01 -3.74488616e+00
  3.75972748e-01 -2.89105511e+00 -1.81379631e-01  9.35952783e-01
  5.10660946e-01 -1.79199553e+00  1.11294299e-01 -1.47862601e+00
 -1.59714901e+00  1.79036176e+00 -1.86572537e-01 -2.81084323e+00
  9.80933964e-01 -4.33306545e-01  1.77158618e+00 -1.17055932e-02
 -8.33573341e-01  6.40884638e-02 -1.34679961e+00 -9.99486923e-01
  4.90743697e-01  1.31642878e+00  1.95130980e+00  3.19589901e+00
  1.22135615e+00  8.00969601e-02 -2.28863549e+00  8.17215919e-01
 -1.00551176e+00  3.29085684e+00  7.97130644e-01 -8.84275675e-01
 -2.00685096e+00 -1.85397398e+00  1.51758149e-01  1.14009631e+00
  9.98709857e-01 -2.51245826e-01 -3.19628268e-01  9.05652761e-01
 -2.40076232e+00  9.84759390e-01 -1.75243521e+00 -1.42035234e+00
 -1.13195431e+00  6.47265494e-01  1.42559123e+00 -2.11657444e-03
  1.70083117e+00  2.09998822e+00  9.32927787e-01  5.33584177e-01
  8.46255600e-01  3.70527893e-01 -1.40039027e+00 -9.73722219e-01
  2.54122353e+00  3.41843534e+00 -9.14492190e-01  6.30743742e-01
  1.01225090e+00  1.54992670e-01 -9.52577889e-01 -1.19824517e+00
  1.70235336e+00 -2.35534504e-01  1.24899280e+00 -2.44930804e-01
  1.58462620e+00 -1.63999283e+00  1.18683851e+00 -5.23683906e-01
  3.27429265e-01 -7.30383575e-01 -2.77057314e+00 -5.65913737e-01
  4.72364649e-02  1.47599792e+00  7.96574771e-01 -3.28604579e+00
  4.52138841e-01 -1.49302691e-01  1.39449251e+00 -1.57893908e+00
 -1.06467378e+00  4.75724846e-01 -1.35048211e+00 -4.90460396e-01
 -8.10190797e-01  4.43491936e-01  8.75523150e-01  1.18624461e+00
  6.45797849e-01  1.32548594e+00 -1.57959774e-01  1.04500508e+00
  5.47951087e-02 -1.30291057e+00  1.49257600e+00  5.60474277e-01
  3.86743456e-01  4.06284183e-01 -2.81473130e-01 -4.12439138e-01
 -1.08074498e+00 -3.81308287e-01  1.77380955e+00  1.49721012e-01
  2.71170187e+00 -1.28000772e+00  2.45102382e+00 -5.12013547e-02
  7.12734580e-01  1.99196422e+00 -1.63034737e+00  8.92213583e-01
 -5.78251779e-01 -6.12294614e-01  1.14384389e+00 -1.06257570e+00
 -2.72162139e-01  2.54026294e-01 -1.57059789e+00  1.05224922e-01
 -1.79139888e+00  3.60995144e-01 -1.03780162e+00 -1.10274827e+00
  1.69297051e+00  1.58983134e-02  1.17376339e+00  6.33648038e-01
  7.52244473e-01  4.38615590e-01  1.26718378e+00 -1.59162962e+00
 -6.39494896e-01  1.46695077e+00 -3.82820189e-01  4.87739891e-02
  2.37272048e+00  1.41792941e+00  9.45534706e-01  2.04811358e+00
 -1.39721179e+00  1.17254066e+00 -6.83715165e-01  1.20775926e+00
 -1.26009548e+00 -1.91462770e-01 -2.47833920e+00  5.03519058e-01
 -2.50229549e+00 -2.08238840e+00  1.74879706e+00  2.22020835e-01
 -1.33342993e+00 -1.28504992e+00 -1.65212601e-01  2.12433293e-01
 -2.38663864e+00  1.38421428e+00  2.47467399e+00  2.45869309e-01
 -1.42729735e+00  1.45380545e+00 -1.73741490e-01 -2.03180480e+00
 -9.91540134e-01 -1.05076051e+00 -2.34034753e+00 -1.40936628e-01
  3.21699947e-01 -1.81070685e+00  1.18953311e+00 -2.06306219e-01
 -4.85516042e-01 -2.99172783e+00  1.57572865e-01 -1.93550742e+00
  1.10156201e-02  6.39679469e-03  1.38491130e+00 -4.91976917e-01
 -1.10390174e+00 -2.30114564e-01  1.00739360e-01  1.10543616e-01
 -2.66368151e-01  1.13526773e+00 -1.56934166e+00 -4.48390245e-02
  2.14757466e+00  1.41380394e+00  6.66661933e-02 -2.30144429e+00
  1.46004632e-01 -8.06587160e-01 -1.79078186e+00 -2.60955048e+00
  1.28498721e+00  1.44019508e+00  1.33939493e+00 -1.16577947e+00
  2.60619187e+00  3.70054066e-01  1.04145813e+00 -1.44158661e+00
  1.15776682e+00 -1.64530110e+00  2.16309547e+00  1.95918581e-03
  8.25892210e-01  1.09968670e-01 -4.87819731e-01  1.19315052e+00
 -3.10592830e-01 -4.28579211e-01  2.22917989e-01 -5.01658261e-01
  1.09935299e-01 -2.13310242e+00 -1.05589473e+00 -5.55368364e-01
 -1.87726051e-01 -1.29797292e+00  6.10999644e-01  1.83215368e+00
 -4.19958085e-01 -1.47877121e+00 -3.09041351e-01 -1.04420602e+00
 -8.84850323e-01 -5.37264645e-01 -1.01103449e+00 -1.16927850e+00
  8.07683527e-01  9.28857923e-01  5.52378058e-01  3.62249911e-01
  1.57382965e-01 -6.17787428e-02 -3.03287327e-01  1.58902562e+00
 -9.03641045e-01 -1.46209562e+00 -4.57526684e-01  5.89973688e-01
  1.98776269e+00 -2.54537678e+00  6.89070880e-01 -1.73213691e-01
 -1.63254249e+00  8.87211382e-01 -1.06655121e+00 -1.38965654e+00
  1.27103961e+00  9.73660350e-01 -8.45093071e-01 -1.63603500e-01]
In [ ]:
# The document vector lives in the same representation space (same dimension)
print(my_doc.vector.shape)
(300,)
In [ ]:
# Representation of the first sub-document "jacques brel"
# (what we would get when splitting into sentences on the '.' delimiter);
# the slice [0:2] covers tokens 0 and 1
w1 = my_doc[0:2].vector
print(w1)
[ 1.01979303e+00 -2.10774988e-01 -2.03495002e+00  3.46100003e-01
  5.53399980e-01 -9.84100029e-02  5.04965007e-01 -1.13951492e+00
  5.40235043e-01 -1.80635005e-01 -5.31499982e-02 -1.46449506e-01
  1.48181999e+00 -1.74460006e+00  1.06970012e-01 -1.03129995e+00
  1.85123491e+00 -8.48654985e-01 -2.33174992e+00 -8.28119993e-01
 -6.60560012e-01  1.14914984e-01  3.33649993e-01  3.59766006e-01
  7.84144998e-01 -1.49019504e+00 -2.53000021e-01 -8.01829994e-01
  2.78770018e+00 -4.06379986e+00 -4.79430497e-01  2.94939995e-01
  1.20925999e+00 -2.54099965e-01 -5.79618514e-01 -2.09415007e+00
 -1.23400998e+00 -8.96940082e-02 -1.69100165e-02 -1.75228044e-01
  1.57814002e+00 -7.68379986e-01 -8.17414999e-01  2.16540003e+00
  1.84099996e+00  1.52745008e+00 -1.37559009e+00 -1.97045004e+00
  2.80200005e-01  3.40250015e+00 -5.09494960e-01 -7.44077027e-01
 -4.35072511e-01 -4.09240007e-01 -9.60945010e-01  6.17359951e-02
 -1.96344995e+00 -2.07924986e+00  1.46777999e+00 -2.54750013e+00
 -1.22735500e+00 -1.16578496e+00  4.52350006e-02 -1.70254993e+00
  3.94274998e+00 -4.81114984e-01  8.29313040e-01  1.68735003e+00
 -3.51190001e-01 -4.03785020e-01 -9.59834993e-01 -1.20859504e+00
 -7.79184997e-01 -9.71419990e-01 -2.15324998e+00  1.94050014e-01
  3.23765010e-01  7.17572987e-01 -2.48300016e-01  4.37364995e-01
  3.34208667e-01  1.55252492e+00  2.86530018e-01 -1.56719998e-01
 -3.37050009e+00  2.20642000e-01 -1.62084496e+00 -1.75899005e+00
  1.86016500e+00  3.45744991e+00  1.19967997e+00 -7.07269490e-01
  3.76514971e-01  1.01101494e+00  6.44534945e-01  4.57949996e-01
 -1.61155015e-01  9.67384219e-01 -2.24584997e-01  1.82031000e+00
  1.39744997e+00 -5.71130037e-01 -9.71300006e-01 -4.75600004e-01
  2.32474995e+00 -1.42014503e+00  1.17119506e-01  2.71824986e-01
  1.02921009e+00  1.59500003e-01  4.15562510e-01  1.18238997e+00
 -1.15129995e+00 -3.63685012e-01  3.52050006e-01  3.86325002e-01
  1.02406406e+00  1.04363990e+00 -1.28431499e+00 -2.14979982e+00
  6.72260046e-01  1.48233497e+00  3.57834995e-01 -1.51650012e-02
  3.15499902e-02  3.16349983e-01  1.90145993e+00 -5.13345003e-01
  1.16266000e+00 -8.72105002e-01  3.03750038e-02 -1.60745001e+00
  7.75390029e-01 -1.44249976e-01  4.59200025e-01 -9.75449979e-01
  4.28054988e-01  6.44192040e-01 -1.73696005e+00 -1.15984404e+00
  2.92794991e+00  4.20904994e-01 -7.75261521e-01  1.72979999e+00
  2.76244521e-01  5.61219990e-01 -5.47749996e-01  1.09489501e+00
  2.78980017e+00  1.07026005e+00  8.90510023e-01  1.94999576e-03
  6.34559989e-01  1.06867003e+00  1.22264004e+00  2.95364976e-01
 -8.21929991e-01  2.58151501e-01  5.03349960e-01 -2.57000327e-03
  2.30299979e-02 -2.14648515e-01  3.26564968e-01 -1.63024998e+00
  3.78099978e-02 -4.26829994e-01 -6.51844978e-01 -1.23925853e+00
  4.24569994e-01 -4.91954982e-01  4.06481504e-01 -2.18875006e-01
 -6.94779992e-01  7.22979963e-01  1.62361002e+00 -3.65560018e-02
 -5.29915035e-01 -2.71149993e+00 -5.93204975e-01 -2.05325007e+00
  1.38151991e+00 -7.09999800e-02 -3.78019989e-01  8.85535002e-01
 -5.24408460e-01 -3.80490005e-01 -1.80829000e+00 -1.19086552e+00
 -1.63674998e+00  6.25100017e-01 -1.21773899e+00 -1.57907999e+00
  2.94729996e+00  5.16335011e-01  1.46704996e+00  2.22909999e+00
 -8.08169961e-01 -1.34388506e+00  7.74595022e-01 -1.40117300e+00
 -1.74738002e+00 -1.44509995e+00 -1.70933998e+00  1.01620996e+00
 -3.59014988e+00  4.77695018e-01  2.09694996e-01  8.50105047e-01
 -3.52610022e-01  9.28799987e-01 -8.98850024e-01  9.13850069e-02
  1.66445005e+00 -6.70095026e-01  2.85214996e+00 -1.26300502e+00
  1.30885005e+00  1.98839998e+00 -3.59948486e-01 -2.57914998e-02
  4.51515019e-01 -5.92440963e-01 -1.04718995e+00  2.03301001e+00
  8.14639986e-01  8.74464989e-01 -4.25835013e-01  2.34526500e-01
 -3.76450002e-01  7.44850039e-02  6.19099975e-01 -1.22574985e-01
  6.59500003e-01  8.23069990e-01 -3.87654960e-01 -8.83099973e-01
 -1.49570000e+00  5.94694972e-01 -1.08964491e+00 -2.49156505e-01
 -6.54134989e-01  8.49979997e-01  4.75750029e-01 -1.33160496e+00
  3.02715003e-01  1.25730002e+00  1.41860008e+00  1.07823002e+00
 -6.13991469e-02 -2.88854986e-01  1.76905498e-01  9.00340021e-01
 -9.87725019e-01  1.71143997e+00  1.26633000e+00 -1.55230001e-01
  7.26096988e-01  4.25249994e-01  1.13871503e+00 -1.01552501e-01
  3.03698152e-01 -2.07424998e+00 -1.10468495e+00  8.76590014e-01
  1.51839995e+00  1.66681492e+00  1.06517005e+00  2.07234645e+00
 -1.66084504e+00 -1.19000673e-03 -1.67089498e+00 -1.10319996e+00
  1.14716494e+00 -3.28750014e-01 -1.81500006e+00  1.38399506e+00
 -5.49000055e-02  1.79454994e+00 -5.05070508e-01  2.36115003e+00
  5.23214996e-01 -1.64549991e-01 -4.73250151e-02 -8.50265026e-01
  1.36299968e-01  9.36850011e-01  5.73079944e-01 -1.47864997e+00
 -2.06215000e+00 -3.24449956e-01 -2.19420004e+00  3.29998016e-01
 -2.10669994e+00 -2.55329990e+00 -3.61352503e-01  6.49405003e-01
 -5.37189960e-01  7.94017017e-01  1.27259004e+00 -1.10343504e+00]
In [ ]:
# Element-wise mean of the individual vectors of 'jacques' and 'brel'
w2 = numpy.mean([coord['jacques'],coord['brel']],axis=0)
print(w2)
[ 1.01979303e+00 -2.10774988e-01 -2.03495002e+00  3.46100003e-01
  5.53399980e-01 -9.84100029e-02  5.04965007e-01 -1.13951492e+00
  5.40235043e-01 -1.80635005e-01 -5.31499982e-02 -1.46449506e-01
  1.48181999e+00 -1.74460006e+00  1.06970012e-01 -1.03129995e+00
  1.85123491e+00 -8.48654985e-01 -2.33174992e+00 -8.28119993e-01
 -6.60560012e-01  1.14914984e-01  3.33649993e-01  3.59766006e-01
  7.84144998e-01 -1.49019504e+00 -2.53000021e-01 -8.01829994e-01
  2.78770018e+00 -4.06379986e+00 -4.79430497e-01  2.94939995e-01
  1.20925999e+00 -2.54099965e-01 -5.79618514e-01 -2.09415007e+00
 -1.23400998e+00 -8.96940082e-02 -1.69100165e-02 -1.75228044e-01
  1.57814002e+00 -7.68379986e-01 -8.17414999e-01  2.16540003e+00
  1.84099996e+00  1.52745008e+00 -1.37559009e+00 -1.97045004e+00
  2.80200005e-01  3.40250015e+00 -5.09494960e-01 -7.44077027e-01
 -4.35072511e-01 -4.09240007e-01 -9.60945010e-01  6.17359951e-02
 -1.96344995e+00 -2.07924986e+00  1.46777999e+00 -2.54750013e+00
 -1.22735500e+00 -1.16578496e+00  4.52350006e-02 -1.70254993e+00
  3.94274998e+00 -4.81114984e-01  8.29313040e-01  1.68735003e+00
 -3.51190001e-01 -4.03785020e-01 -9.59834993e-01 -1.20859504e+00
 -7.79184997e-01 -9.71419990e-01 -2.15324998e+00  1.94050014e-01
  3.23765010e-01  7.17572987e-01 -2.48300016e-01  4.37364995e-01
  3.34208667e-01  1.55252492e+00  2.86530018e-01 -1.56719998e-01
 -3.37050009e+00  2.20642000e-01 -1.62084496e+00 -1.75899005e+00
  1.86016500e+00  3.45744991e+00  1.19967997e+00 -7.07269490e-01
  3.76514971e-01  1.01101494e+00  6.44534945e-01  4.57949996e-01
 -1.61155015e-01  9.67384219e-01 -2.24584997e-01  1.82031000e+00
  1.39744997e+00 -5.71130037e-01 -9.71300006e-01 -4.75600004e-01
  2.32474995e+00 -1.42014503e+00  1.17119506e-01  2.71824986e-01
  1.02921009e+00  1.59500003e-01  4.15562510e-01  1.18238997e+00
 -1.15129995e+00 -3.63685012e-01  3.52050006e-01  3.86325002e-01
  1.02406406e+00  1.04363990e+00 -1.28431499e+00 -2.14979982e+00
  6.72260046e-01  1.48233497e+00  3.57834995e-01 -1.51650012e-02
  3.15499902e-02  3.16349983e-01  1.90145993e+00 -5.13345003e-01
  1.16266000e+00 -8.72105002e-01  3.03750038e-02 -1.60745001e+00
  7.75390029e-01 -1.44249976e-01  4.59200025e-01 -9.75449979e-01
  4.28054988e-01  6.44192040e-01 -1.73696005e+00 -1.15984404e+00
  2.92794991e+00  4.20904994e-01 -7.75261521e-01  1.72979999e+00
  2.76244521e-01  5.61219990e-01 -5.47749996e-01  1.09489501e+00
  2.78980017e+00  1.07026005e+00  8.90510023e-01  1.94999576e-03
  6.34559989e-01  1.06867003e+00  1.22264004e+00  2.95364976e-01
 -8.21929991e-01  2.58151501e-01  5.03349960e-01 -2.57000327e-03
  2.30299979e-02 -2.14648515e-01  3.26564968e-01 -1.63024998e+00
  3.78099978e-02 -4.26829994e-01 -6.51844978e-01 -1.23925853e+00
  4.24569994e-01 -4.91954982e-01  4.06481504e-01 -2.18875006e-01
 -6.94779992e-01  7.22979963e-01  1.62361002e+00 -3.65560018e-02
 -5.29915035e-01 -2.71149993e+00 -5.93204975e-01 -2.05325007e+00
  1.38151991e+00 -7.09999800e-02 -3.78019989e-01  8.85535002e-01
 -5.24408460e-01 -3.80490005e-01 -1.80829000e+00 -1.19086552e+00
 -1.63674998e+00  6.25100017e-01 -1.21773899e+00 -1.57907999e+00
  2.94729996e+00  5.16335011e-01  1.46704996e+00  2.22909999e+00
 -8.08169961e-01 -1.34388506e+00  7.74595022e-01 -1.40117300e+00
 -1.74738002e+00 -1.44509995e+00 -1.70933998e+00  1.01620996e+00
 -3.59014988e+00  4.77695018e-01  2.09694996e-01  8.50105047e-01
 -3.52610022e-01  9.28799987e-01 -8.98850024e-01  9.13850069e-02
  1.66445005e+00 -6.70095026e-01  2.85214996e+00 -1.26300502e+00
  1.30885005e+00  1.98839998e+00 -3.59948486e-01 -2.57914998e-02
  4.51515019e-01 -5.92440963e-01 -1.04718995e+00  2.03301001e+00
  8.14639986e-01  8.74464989e-01 -4.25835013e-01  2.34526500e-01
 -3.76450002e-01  7.44850039e-02  6.19099975e-01 -1.22574985e-01
  6.59500003e-01  8.23069990e-01 -3.87654960e-01 -8.83099973e-01
 -1.49570000e+00  5.94694972e-01 -1.08964491e+00 -2.49156505e-01
 -6.54134989e-01  8.49979997e-01  4.75750029e-01 -1.33160496e+00
  3.02715003e-01  1.25730002e+00  1.41860008e+00  1.07823002e+00
 -6.13991469e-02 -2.88854986e-01  1.76905498e-01  9.00340021e-01
 -9.87725019e-01  1.71143997e+00  1.26633000e+00 -1.55230001e-01
  7.26096988e-01  4.25249994e-01  1.13871503e+00 -1.01552501e-01
  3.03698152e-01 -2.07424998e+00 -1.10468495e+00  8.76590014e-01
  1.51839995e+00  1.66681492e+00  1.06517005e+00  2.07234645e+00
 -1.66084504e+00 -1.19000673e-03 -1.67089498e+00 -1.10319996e+00
  1.14716494e+00 -3.28750014e-01 -1.81500006e+00  1.38399506e+00
 -5.49000055e-02  1.79454994e+00 -5.05070508e-01  2.36115003e+00
  5.23214996e-01 -1.64549991e-01 -4.73250151e-02 -8.50265026e-01
  1.36299968e-01  9.36850011e-01  5.73079944e-01 -1.47864997e+00
 -2.06215000e+00 -3.24449956e-01 -2.19420004e+00  3.29998016e-01
 -2.10669994e+00 -2.55329990e+00 -3.61352503e-01  6.49405003e-01
 -5.37189960e-01  7.94017017e-01  1.27259004e+00 -1.10343504e+00]
In [ ]:
# Check: the sum of squared differences between w1 and w2 is 0 when both
# vectors are identical, i.e. the span vector equals the mean of its token vectors
print(numpy.sum((w1-w2)**2))
0.0