# Python version
import sys
print(sys.version)
3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:29:11) [MSC v.1935 64 bit (AMD64)]
# spaCy version
import spacy
print(spacy.__version__)
3.7.2
# Text to parse: first stanza of Jacques Brel's song "Vesoul",
# lightly normalised and lowercased as a whole before parsing.
texte = (
    "Jacques Brel. "
    "Tu as voulu voir Vierzon, et on a vu Vierzon. "
    "Tu as voulu voir Vesoul, et on a vu Vesoul. "
    "Tu as voulu voir Honfleur, et on a vu Honfleur. "
    "Tu as voulu voir Hambourg, et on a vu Hambourg. "
    "J'ai voulu voir Anvers, on a revu Hambourg. "
    "J'ai voulu voir ta soeur, et on a vu ta mère. "
    "Comme toujours."
).lower()
print(texte)
jacques brel. tu as voulu voir vierzon, et on a vu vierzon. tu as voulu voir vesoul, et on a vu vesoul. tu as voulu voir honfleur, et on a vu honfleur. tu as voulu voir hambourg, et on a vu hambourg. j'ai voulu voir anvers, on a revu hambourg. j'ai voulu voir ta soeur, et on a vu ta mère. comme toujours.
# Pre-trained "model" (pipeline) used to parse French documents.
# List of the available French "models": https://spacy.io/models/fr
parser = spacy.load("fr_core_news_lg")
# Run the pipeline on the text (processing + tagging)
my_doc = parser(texte)
# Check: the raw text that was parsed
my_doc.text
"jacques brel. tu as voulu voir vierzon, et on a vu vierzon. tu as voulu voir vesoul, et on a vu vesoul. tu as voulu voir honfleur, et on a vu honfleur. tu as voulu voir hambourg, et on a vu hambourg. j'ai voulu voir anvers, on a revu hambourg. j'ai voulu voir ta soeur, et on a vu ta mère. comme toujours."
# For each token obtained after tokenisation, print:
# text : lemma, part-of-speech tag, morphological features
for mot in my_doc:
    print(f"{mot.text} : {mot.lemma_}, {mot.pos_}, {mot.morph}")
jacques : jacques, PROPN, Gender=Masc|Number=Sing brel : brel, PROPN, Gender=Masc|Number=Sing . : ., PUNCT, tu : tu, PRON, Number=Sing|Person=1 as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf vierzon : vierzon, NOUN, Gender=Masc|Number=Sing , : ,, PUNCT, et : et, CCONJ, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part vierzon : vierzon, PROPN, . : ., PUNCT, tu : tu, PRON, Number=Sing|Person=1 as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf vesoul : vesoul, NOUN, Gender=Masc|Number=Sing , : ,, PUNCT, et : et, CCONJ, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part vesoul : vesoul, NOUN, Gender=Masc|Number=Sing . : ., PUNCT, tu : tu, PRON, Number=Sing|Person=1 as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf honfleur : honfleur, NOUN, Gender=Masc|Number=Sing , : ,, PUNCT, et : et, CCONJ, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part honfleur : honfleur, NOUN, Gender=Masc|Number=Sing . 
: ., PUNCT, tu : tu, PRON, Number=Sing|Person=1 as : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf hambourg : hambourg, PROPN, Gender=Masc|Number=Sing , : ,, PUNCT, et : et, CCONJ, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part hambourg : hambourg, PROPN, Gender=Masc|Number=Sing . : ., PUNCT, j' : je, PRON, Number=Sing|Person=1 ai : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf anvers : anvers, PROPN, Gender=Masc|Number=Sing , : ,, PUNCT, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin revu : revoir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part hambourg : hambourg, PROPN, Gender=Masc|Number=Sing . : ., PUNCT, j' : je, PRON, Number=Sing|Person=1 ai : avoir, AUX, Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin voulu : vouloir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part voir : voir, VERB, VerbForm=Inf ta : ton, DET, Gender=Fem|Number=Sing|Poss=Yes soeur : soeur, NOUN, Gender=Fem|Number=Sing , : ,, PUNCT, et : et, CCONJ, on : on, PRON, Number=Sing|Person=3 a : avoir, AUX, Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin vu : voir, VERB, Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part ta : ton, DET, Gender=Fem|Number=Sing|Poss=Yes mère : mère, NOUN, Gender=Fem|Number=Sing . : ., PUNCT, comme : comme, ADP, toujours : toujours, ADV, . : ., PUNCT,
# Filtering: drop every token tagged as punctuation (PUNCT), coordinating
# conjunction (CCONJ) or determiner (DET), and reduce the rest to lemmas.
texte_clean = [
    mot.lemma_
    for mot in my_doc
    if mot.pos_ not in {'PUNCT', 'CCONJ', 'DET'}
]
print(" ".join(texte_clean))
jacques brel tu avoir vouloir voir vierzon on avoir voir vierzon tu avoir vouloir voir vesoul on avoir voir vesoul tu avoir vouloir voir honfleur on avoir voir honfleur tu avoir vouloir voir hambourg on avoir voir hambourg je avoir vouloir voir anvers on avoir revoir hambourg je avoir vouloir voir soeur on avoir voir mère comme toujours
# Vocabulary built after the fact: the distinct lemmas kept by the filtering
import numpy
dico_bis = numpy.unique(texte_clean)
print(dico_bis)
['anvers' 'avoir' 'brel' 'comme' 'hambourg' 'honfleur' 'jacques' 'je' 'mère' 'on' 'revoir' 'soeur' 'toujours' 'tu' 'vesoul' 'vierzon' 'voir' 'vouloir']
# Named-entity detection (real-world objects),
# i.e. companies, locations, organisations, products.
# Prints each detected entity span followed by its label.
for mot in my_doc.ents:
    print(mot.text, mot.label_)
jacques brel PER tu as MISC vierzon LOC vierzon LOC tu as MISC vesoul LOC vesoul LOC tu as MISC honfleur LOC honfleur LOC tu as MISC hambourg MISC hambourg LOC anvers LOC hambourg LOC
# displacy: visualiser used to highlight elements within the document
from spacy import displacy
# Render the named entities
displacy.render(my_doc,style='ent')
# Glossary lookup: description of the 'LOC' label
spacy.explain('LOC')
'Non-GPE locations, mountain ranges, bodies of water'
# Sentence structure (dependency rendering)
displacy.render(my_doc, style="dep")
# List the tokens of the document
# together with their 0-based index.
for i, mot in enumerate(my_doc):
    print(i, mot.text)
0 jacques 1 brel 2 . 3 tu 4 as 5 voulu 6 voir 7 vierzon 8 , 9 et 10 on 11 a 12 vu 13 vierzon 14 . 15 tu 16 as 17 voulu 18 voir 19 vesoul 20 , 21 et 22 on 23 a 24 vu 25 vesoul 26 . 27 tu 28 as 29 voulu 30 voir 31 honfleur 32 , 33 et 34 on 35 a 36 vu 37 honfleur 38 . 39 tu 40 as 41 voulu 42 voir 43 hambourg 44 , 45 et 46 on 47 a 48 vu 49 hambourg 50 . 51 j' 52 ai 53 voulu 54 voir 55 anvers 56 , 57 on 58 a 59 revu 60 hambourg 61 . 62 j' 63 ai 64 voulu 65 voir 66 ta 67 soeur 68 , 69 et 70 on 71 a 72 vu 73 ta 74 mère 75 . 76 comme 77 toujours 78 .
# Does the token at index 7 ("vierzon") have a vector representation?
my_doc[7].has_vector
True
# Retrieve the vector of the token at index 7
my_doc[7].vector
array([-4.0657e-01, -2.0025e-01, 2.8754e-01, -4.2928e-01, 2.7163e-01, 6.9482e-02, -1.2550e-01, 3.7818e-01, 9.7598e-01, -1.6251e-01, 1.9031e-01, -7.9347e-01, -4.6508e-01, -1.5558e-01, 1.2947e+00, -1.1455e+00, 3.9202e-01, -1.0322e+00, -9.7683e-01, 4.5076e-01, 7.1663e-01, 7.7132e-01, 4.7204e-01, -2.0041e-01, 7.8794e-01, -1.2724e+00, 1.5912e+00, -7.8458e-01, 2.5732e-01, -6.6908e-01, -8.5539e-01, 1.2967e+00, -2.6708e-01, -1.9598e-01, -1.4309e-01, 6.6540e-03, -1.2907e+00, 1.4002e+00, -1.8218e+00, 5.3225e-01, 2.0713e-01, 2.2242e-01, 1.0662e-01, 2.2235e-01, 7.5361e-01, 1.1899e+00, 1.5472e-01, -2.3241e-01, 5.6915e-01, 1.5155e+00, -3.5420e-01, -5.5860e-01, -1.3645e-02, -4.5693e-01, 1.1369e+00, 1.3149e+00, -9.6851e-01, -4.0033e-01, 5.4487e-02, -6.6694e-01, 1.7506e-01, -6.2798e-01, 1.0074e+00, -5.3175e-01, -3.0654e-01, -6.5145e-01, -4.2323e-01, 8.6252e-01, 1.3160e+00, -2.0910e-01, -1.4855e-01, -5.0082e-02, -4.8467e-01, -8.6203e-02, -1.3226e+00, -5.1566e-01, 1.8179e-01, 1.0349e+00, -4.2297e-01, 1.3841e+00, -1.5807e+00, -1.4678e-01, -1.3161e-01, -1.6265e+00, -3.7110e-01, 7.9721e-01, -7.4050e-01, -7.7008e-01, 3.2064e-01, 1.8384e+00, -3.8825e-01, 5.7544e-02, -1.4938e+00, -4.4638e-01, -6.5860e-01, -5.6837e-01, 4.1797e-01, 2.2528e-01, 3.4527e-01, -3.9242e-01, -2.1223e-01, -3.6062e-01, -9.2100e-01, -8.9633e-01, -1.2510e-01, -6.2732e-02, 8.1116e-01, 1.1423e-01, -1.2302e+00, 1.9736e-01, -1.2433e-01, 3.7199e-01, 2.4325e-01, -1.6955e-01, -2.1964e-01, 1.8599e+00, 1.1425e-02, -2.0298e-01, 8.9051e-02, -3.0032e-01, -2.0283e-01, 2.9174e-01, -2.5473e-01, -2.2293e-01, -8.4426e-01, 1.3266e+00, 1.6016e+00, 1.2428e+00, -4.0662e-01, 8.2429e-02, 1.4955e+00, -6.6329e-01, 3.3609e-01, 9.4762e-02, 5.3329e-01, -7.0475e-01, 5.5864e-01, -2.7856e-01, -1.9157e+00, 2.3704e-01, -4.2899e-01, -5.7577e-01, 1.5515e-02, 2.4383e-01, 4.9577e-01, 1.1405e+00, -1.6564e-01, 2.8747e-01, 1.7616e+00, -1.6911e-01, -6.3712e-01, -1.9791e-01, -8.2447e-01, 5.8574e-01, 6.3516e-01, 4.5650e-01, 1.6442e-01, -3.8441e-01, -1.7971e-01, 
1.2859e+00, -1.2575e+00, -1.4858e+00, 2.4661e-01, 4.1250e-01, 9.1271e-01, 3.2353e-01, 8.6869e-01, 5.4169e-02, -9.4836e-01, 2.6973e-01, 1.4008e+00, 2.8192e-01, 5.7193e-01, -1.0867e+00, 1.7838e-01, 5.4392e-01, -4.2186e-01, -3.0026e-01, 9.0635e-02, -5.0220e-01, 3.3055e-01, 3.0955e-03, -5.8019e-01, -2.9985e-01, -5.4717e-01, -8.2679e-01, -7.0257e-02, -8.1559e-01, -1.2090e+00, 2.7562e-01, -6.7841e-01, 2.0542e-01, 4.9799e-01, -1.2865e-01, 4.2938e-01, 1.2731e+00, -1.2410e-01, -3.4725e-01, 5.9236e-01, -4.8761e-01, -6.2629e-01, 2.3057e-03, -9.2564e-01, 4.8470e-02, -6.3772e-02, -6.7288e-01, 2.7903e-01, 2.8268e-02, -1.4076e-01, 6.8378e-01, 2.1612e-01, 6.0460e-01, 3.6177e-01, -1.0455e+00, 8.5451e-01, 2.4390e-01, 1.4620e+00, -2.3945e-01, 1.3431e+00, -9.8238e-01, 8.8327e-01, -1.7935e-01, 6.6251e-01, 2.0690e-01, -1.0792e-01, 7.4089e-01, 1.6170e+00, -7.2707e-02, -6.2213e-01, 9.6534e-01, -2.9215e-02, 7.9944e-01, 6.5622e-02, 4.6552e-01, -1.1256e-01, -1.3510e+00, -6.9111e-01, 9.5096e-01, -9.6088e-01, -2.8245e-01, -8.6613e-01, -1.6196e-01, -3.5614e-01, -6.0044e-01, -1.4988e+00, 6.8974e-01, -6.8883e-01, 1.6172e-01, 4.8205e-01, -1.1944e-01, 2.3086e-01, -6.7829e-01, 8.8963e-02, 9.0213e-01, 9.4611e-01, -7.9906e-02, -4.2308e-01, 1.0481e+00, 1.3916e+00, 6.3237e-01, 3.5527e-01, -4.3014e-01, -4.9953e-02, -4.2331e-01, -1.0916e+00, 7.2194e-01, -5.0015e-01, 9.2636e-01, 3.7391e-01, 8.9174e-01, -1.0275e+00, 8.1164e-01, 3.2245e-01, 4.8588e-01, -6.6116e-01, 6.7272e-01, 8.8292e-01, 6.3029e-01, -5.9496e-01, -5.2185e-03, 4.8353e-01, 7.0208e-01, -5.4277e-01, -6.2411e-01, -1.3636e+00, -9.9813e-01, -7.3958e-01, -1.0664e-03, -1.2001e+00, 3.6152e-01, -6.2169e-01, 1.8455e-01, -2.1798e+00, -8.7334e-01, -2.0561e-02, 1.2941e+00, 5.9920e-01, -2.1451e-02, -1.2552e+00, -4.8696e-01], dtype=float32)
# Dimension of that vector
my_doc[7].vector.shape
(300,)
# Tokens at indices 7 and 13 are indeed the same term
print(my_doc[7].text)
print(my_doc[13].text)
vierzon vierzon
# Check that both occurrences share the same representation:
# sum of squared differences between their two vectors
numpy.sum((my_doc[7].vector-my_doc[13].vector)**2)
0.0
# Terms and their coordinates (word embedding):
# maps each distinct token text to that token's vector.
coord = {}
# Load the coordinates; a text already present keeps the vector stored first.
for mot in my_doc:
    if mot.text not in coord:
        coord[mot.text] = mot.vector
# Coordinates of "vesoul"
print(coord['vesoul'])
[-5.8407e-01 3.4424e-01 -2.2901e-01 9.9353e-02 -1.6188e-01 -2.3497e-01 8.0822e-01 3.4484e-01 -7.7716e-01 6.5560e-01 2.2229e-01 -9.4003e-02 -4.8322e-01 -4.5401e-01 1.4715e-01 6.7225e-03 7.6638e-01 -7.5615e-01 -3.0963e-01 -6.9015e-02 7.1394e-01 1.6666e+00 1.2212e+00 -2.3179e-01 -2.1714e-01 -1.1320e+00 -7.4422e-01 1.0710e+00 5.6044e-01 -1.2510e+00 8.6863e-02 2.7075e-01 -2.3794e-01 -8.0670e-01 5.0118e-01 -3.1831e-01 2.9396e-02 -6.4636e-03 9.8959e-01 6.3312e-02 1.7129e-01 -8.4535e-01 6.1919e-01 1.2757e+00 -4.2155e-01 -2.2575e-01 -2.0978e-01 -1.8917e+00 7.6063e-02 1.6763e+00 -5.0706e-01 -8.3366e-01 -2.8649e-01 -2.9436e-01 1.0061e+00 5.0607e-01 -8.6939e-01 6.5620e-01 4.0167e-01 -9.4017e-01 8.5270e-01 -4.5480e-01 -7.5253e-01 -8.2199e-02 4.8929e-02 4.7423e-01 -3.4895e-01 9.0311e-01 -1.0631e-01 1.6204e-02 4.0598e-01 -4.3379e-01 3.8747e-01 -6.8255e-01 -7.3036e-01 -6.5625e-01 1.1343e+00 2.2641e-01 9.6053e-02 5.5185e-01 -1.0586e-02 7.1051e-01 1.7653e-01 -9.8917e-01 -3.4193e-01 1.1521e+00 3.6208e-01 -4.9975e-01 -9.4832e-01 1.1999e+00 -6.6913e-01 -7.7592e-01 -1.0546e+00 -1.2149e+00 -7.2905e-01 5.2374e-01 1.2252e+00 7.0492e-01 7.9432e-01 -9.6642e-01 -1.9159e-01 -4.0667e-01 -3.2944e-01 -1.6421e-01 6.0935e-01 9.0460e-01 7.6646e-01 -8.5255e-01 -6.4933e-01 2.4853e-01 -6.8751e-01 -3.0451e-01 3.2306e-01 -3.2068e-01 6.6219e-02 7.0929e-01 1.0140e-01 -5.0990e-01 -4.8441e-01 -1.0660e+00 -5.4247e-01 -4.4798e-01 9.6521e-01 5.0421e-01 -8.3784e-01 3.4046e-01 8.2372e-01 -3.9867e-01 4.4751e-01 4.6875e-01 1.3789e+00 -8.0593e-01 5.2308e-01 -5.4083e-01 7.8277e-01 -1.3942e-01 -6.4330e-01 6.8972e-01 -2.1223e+00 -1.6389e-01 -4.2503e-01 7.6170e-02 -3.2761e-01 3.2396e-01 -1.7469e+00 9.3386e-01 -1.3022e+00 1.2676e+00 1.5521e+00 -6.1249e-01 -2.1223e-01 7.0340e-01 -8.8014e-02 -7.0633e-01 1.3715e+00 -2.4819e-01 -5.6164e-01 -6.6568e-01 7.4097e-01 -1.5313e-01 -1.6301e+00 -8.9272e-01 -1.3026e-01 2.7284e-01 8.0913e-01 5.6019e-01 9.5325e-01 1.1896e+00 2.3602e-01 -1.9886e-01 5.0916e-01 8.7401e-01 5.0070e-01 
-9.1593e-01 8.2782e-01 1.2882e+00 -4.0974e-01 1.6180e+00 -5.8194e-01 1.0008e-01 8.3763e-01 5.6150e-01 -8.4785e-01 -4.8341e-01 5.1680e-01 8.1683e-01 -5.9333e-01 -6.7202e-01 -2.5270e-01 5.5265e-01 -5.1497e-01 1.1494e-01 1.9935e-03 -1.4527e-02 2.4835e-01 4.5724e-02 7.0839e-01 1.0196e-01 9.9780e-01 -4.6006e-01 -4.3787e-01 5.8046e-01 -5.2978e-01 -9.3563e-01 -4.0745e-01 1.1533e+00 -3.3321e-01 -1.0931e-01 3.1795e-01 -2.9053e-01 -3.9995e-01 -2.7213e-01 -5.3465e-01 -7.3220e-01 1.4295e+00 1.7616e-01 6.4572e-01 -6.0241e-01 7.3246e-01 6.7409e-02 2.5494e-01 1.9218e-01 -8.8132e-02 -2.2286e-01 6.1991e-02 -2.7595e-01 -1.0547e-01 -3.8189e-01 -3.4555e-01 1.5986e+00 -1.4851e-01 8.0789e-01 8.0484e-01 2.9581e-01 -9.1896e-02 -1.1847e+00 -1.0589e-01 -1.0386e+00 8.4735e-01 4.6961e-02 -4.6145e-01 -2.3212e-01 -1.6656e+00 4.8900e-01 -4.6923e-01 2.6997e-01 -5.6505e-01 1.2146e-01 3.4244e-01 -9.3093e-01 -1.1812e+00 -3.9865e-01 6.0374e-02 1.9107e+00 5.9228e-01 -6.1370e-03 -2.6013e-01 4.6038e-01 6.4547e-01 8.1412e-01 9.2802e-01 -5.2924e-02 5.3487e-01 6.7531e-01 -3.0926e-01 1.8939e-01 -3.0234e-01 4.0307e-02 3.5449e-01 4.6310e-01 6.5369e-01 1.1442e+00 8.3256e-01 -1.1165e+00 -2.3187e-01 -7.0950e-01 3.0229e-01 2.7046e-01 -1.3302e+00 -5.2441e-01 -1.5810e-01 -4.7788e-01 -3.7892e-01 -5.6443e-01 -9.0476e-02 -2.7887e-01 1.3334e+00 1.5497e+00 -2.3411e-01 6.1919e-02 -3.8621e-01 -1.1364e-01 -7.4522e-02 -3.0313e-01 1.3659e+00 7.7729e-01 1.1739e+00 5.4830e-01 -1.1586e-01 -6.0751e-01]
# Dimension of the stored vector for "vierzon" - checked once more
print(coord['vierzon'].shape)
(300,)
# List of the terms that were given a vector
print(coord.keys())
dict_keys(['jacques', 'brel', '.', 'tu', 'as', 'voulu', 'voir', 'vierzon', ',', 'et', 'on', 'a', 'vu', 'vesoul', 'honfleur', 'hambourg', "j'", 'ai', 'anvers', 'revu', 'ta', 'soeur', 'mère', 'comme', 'toujours'])
# Cosine similarity helper
def similarite_cosinus(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (1-D array-like accepted by ``numpy.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has a zero norm, since the ratio is undefined in that case.
    """
    denominateur = numpy.linalg.norm(v1) * numpy.linalg.norm(v2)
    # A zero norm would make the ratio 0/0 (NaN plus a RuntimeWarning).
    if denominateur == 0:
        return 0.0
    return numpy.dot(v1, v2) / denominateur
# Cosine similarity between "vesoul" and "vierzon"
print(similarite_cosinus(coord['vesoul'], coord['vierzon']))
0.39401367
# Cosine similarity between "jacques" and "brel"
print(similarite_cosinus(coord['jacques'], coord['brel']))
0.36657745
# spaCy's built-in similarity between tokens 0 and 1 ("jacques", "brel"),
# which is indeed based on a cosine similarity
my_doc[0].similarity(my_doc[1])
0.3665774464607239
Composé d'un ensemble de termes...
# Vector of the whole paragraph (the entire stanza)
print(my_doc.vector)
[ 8.54071043e-03 2.74834752e-01 -3.50937581e+00 7.40022004e-01 1.25529134e+00 1.53335011e+00 -2.32128334e+00 -1.02142103e-01 4.32935268e-01 5.98222077e-01 1.19048464e+00 1.03554368e+00 5.38600087e-01 -1.89471453e-01 4.41725731e-01 -3.69690716e-01 -6.61469996e-01 -1.23821807e+00 1.27446759e+00 -1.84450102e+00 9.00520325e-01 -3.27972978e-01 1.05935395e+00 1.22191370e+00 2.72004902e-01 1.01664329e+00 8.38955343e-01 -1.98171884e-01 -4.58234906e-01 2.57503009e+00 3.09241080e+00 -2.28456914e-01 6.73313998e-03 -2.05305529e+00 6.43110633e-01 5.82398176e-01 -1.78304088e+00 3.95900726e+00 -7.43371546e-01 -3.74488616e+00 3.75972748e-01 -2.89105511e+00 -1.81379631e-01 9.35952783e-01 5.10660946e-01 -1.79199553e+00 1.11294299e-01 -1.47862601e+00 -1.59714901e+00 1.79036176e+00 -1.86572537e-01 -2.81084323e+00 9.80933964e-01 -4.33306545e-01 1.77158618e+00 -1.17055932e-02 -8.33573341e-01 6.40884638e-02 -1.34679961e+00 -9.99486923e-01 4.90743697e-01 1.31642878e+00 1.95130980e+00 3.19589901e+00 1.22135615e+00 8.00969601e-02 -2.28863549e+00 8.17215919e-01 -1.00551176e+00 3.29085684e+00 7.97130644e-01 -8.84275675e-01 -2.00685096e+00 -1.85397398e+00 1.51758149e-01 1.14009631e+00 9.98709857e-01 -2.51245826e-01 -3.19628268e-01 9.05652761e-01 -2.40076232e+00 9.84759390e-01 -1.75243521e+00 -1.42035234e+00 -1.13195431e+00 6.47265494e-01 1.42559123e+00 -2.11657444e-03 1.70083117e+00 2.09998822e+00 9.32927787e-01 5.33584177e-01 8.46255600e-01 3.70527893e-01 -1.40039027e+00 -9.73722219e-01 2.54122353e+00 3.41843534e+00 -9.14492190e-01 6.30743742e-01 1.01225090e+00 1.54992670e-01 -9.52577889e-01 -1.19824517e+00 1.70235336e+00 -2.35534504e-01 1.24899280e+00 -2.44930804e-01 1.58462620e+00 -1.63999283e+00 1.18683851e+00 -5.23683906e-01 3.27429265e-01 -7.30383575e-01 -2.77057314e+00 -5.65913737e-01 4.72364649e-02 1.47599792e+00 7.96574771e-01 -3.28604579e+00 4.52138841e-01 -1.49302691e-01 1.39449251e+00 -1.57893908e+00 -1.06467378e+00 4.75724846e-01 -1.35048211e+00 -4.90460396e-01 -8.10190797e-01 
4.43491936e-01 8.75523150e-01 1.18624461e+00 6.45797849e-01 1.32548594e+00 -1.57959774e-01 1.04500508e+00 5.47951087e-02 -1.30291057e+00 1.49257600e+00 5.60474277e-01 3.86743456e-01 4.06284183e-01 -2.81473130e-01 -4.12439138e-01 -1.08074498e+00 -3.81308287e-01 1.77380955e+00 1.49721012e-01 2.71170187e+00 -1.28000772e+00 2.45102382e+00 -5.12013547e-02 7.12734580e-01 1.99196422e+00 -1.63034737e+00 8.92213583e-01 -5.78251779e-01 -6.12294614e-01 1.14384389e+00 -1.06257570e+00 -2.72162139e-01 2.54026294e-01 -1.57059789e+00 1.05224922e-01 -1.79139888e+00 3.60995144e-01 -1.03780162e+00 -1.10274827e+00 1.69297051e+00 1.58983134e-02 1.17376339e+00 6.33648038e-01 7.52244473e-01 4.38615590e-01 1.26718378e+00 -1.59162962e+00 -6.39494896e-01 1.46695077e+00 -3.82820189e-01 4.87739891e-02 2.37272048e+00 1.41792941e+00 9.45534706e-01 2.04811358e+00 -1.39721179e+00 1.17254066e+00 -6.83715165e-01 1.20775926e+00 -1.26009548e+00 -1.91462770e-01 -2.47833920e+00 5.03519058e-01 -2.50229549e+00 -2.08238840e+00 1.74879706e+00 2.22020835e-01 -1.33342993e+00 -1.28504992e+00 -1.65212601e-01 2.12433293e-01 -2.38663864e+00 1.38421428e+00 2.47467399e+00 2.45869309e-01 -1.42729735e+00 1.45380545e+00 -1.73741490e-01 -2.03180480e+00 -9.91540134e-01 -1.05076051e+00 -2.34034753e+00 -1.40936628e-01 3.21699947e-01 -1.81070685e+00 1.18953311e+00 -2.06306219e-01 -4.85516042e-01 -2.99172783e+00 1.57572865e-01 -1.93550742e+00 1.10156201e-02 6.39679469e-03 1.38491130e+00 -4.91976917e-01 -1.10390174e+00 -2.30114564e-01 1.00739360e-01 1.10543616e-01 -2.66368151e-01 1.13526773e+00 -1.56934166e+00 -4.48390245e-02 2.14757466e+00 1.41380394e+00 6.66661933e-02 -2.30144429e+00 1.46004632e-01 -8.06587160e-01 -1.79078186e+00 -2.60955048e+00 1.28498721e+00 1.44019508e+00 1.33939493e+00 -1.16577947e+00 2.60619187e+00 3.70054066e-01 1.04145813e+00 -1.44158661e+00 1.15776682e+00 -1.64530110e+00 2.16309547e+00 1.95918581e-03 8.25892210e-01 1.09968670e-01 -4.87819731e-01 1.19315052e+00 -3.10592830e-01 -4.28579211e-01 
2.22917989e-01 -5.01658261e-01 1.09935299e-01 -2.13310242e+00 -1.05589473e+00 -5.55368364e-01 -1.87726051e-01 -1.29797292e+00 6.10999644e-01 1.83215368e+00 -4.19958085e-01 -1.47877121e+00 -3.09041351e-01 -1.04420602e+00 -8.84850323e-01 -5.37264645e-01 -1.01103449e+00 -1.16927850e+00 8.07683527e-01 9.28857923e-01 5.52378058e-01 3.62249911e-01 1.57382965e-01 -6.17787428e-02 -3.03287327e-01 1.58902562e+00 -9.03641045e-01 -1.46209562e+00 -4.57526684e-01 5.89973688e-01 1.98776269e+00 -2.54537678e+00 6.89070880e-01 -1.73213691e-01 -1.63254249e+00 8.87211382e-01 -1.06655121e+00 -1.38965654e+00 1.27103961e+00 9.73660350e-01 -8.45093071e-01 -1.63603500e-01]
# Dimension of the document vector, to compare with that of the token vectors
print(my_doc.vector.shape)
(300,)
# Vector of the first sub-document "jacques brel" (tokens 0 and 1),
# i.e. the first piece if the text is split into sentences on '.'
w1 = my_doc[0:2].vector
print(w1)
[ 1.01979303e+00 -2.10774988e-01 -2.03495002e+00 3.46100003e-01 5.53399980e-01 -9.84100029e-02 5.04965007e-01 -1.13951492e+00 5.40235043e-01 -1.80635005e-01 -5.31499982e-02 -1.46449506e-01 1.48181999e+00 -1.74460006e+00 1.06970012e-01 -1.03129995e+00 1.85123491e+00 -8.48654985e-01 -2.33174992e+00 -8.28119993e-01 -6.60560012e-01 1.14914984e-01 3.33649993e-01 3.59766006e-01 7.84144998e-01 -1.49019504e+00 -2.53000021e-01 -8.01829994e-01 2.78770018e+00 -4.06379986e+00 -4.79430497e-01 2.94939995e-01 1.20925999e+00 -2.54099965e-01 -5.79618514e-01 -2.09415007e+00 -1.23400998e+00 -8.96940082e-02 -1.69100165e-02 -1.75228044e-01 1.57814002e+00 -7.68379986e-01 -8.17414999e-01 2.16540003e+00 1.84099996e+00 1.52745008e+00 -1.37559009e+00 -1.97045004e+00 2.80200005e-01 3.40250015e+00 -5.09494960e-01 -7.44077027e-01 -4.35072511e-01 -4.09240007e-01 -9.60945010e-01 6.17359951e-02 -1.96344995e+00 -2.07924986e+00 1.46777999e+00 -2.54750013e+00 -1.22735500e+00 -1.16578496e+00 4.52350006e-02 -1.70254993e+00 3.94274998e+00 -4.81114984e-01 8.29313040e-01 1.68735003e+00 -3.51190001e-01 -4.03785020e-01 -9.59834993e-01 -1.20859504e+00 -7.79184997e-01 -9.71419990e-01 -2.15324998e+00 1.94050014e-01 3.23765010e-01 7.17572987e-01 -2.48300016e-01 4.37364995e-01 3.34208667e-01 1.55252492e+00 2.86530018e-01 -1.56719998e-01 -3.37050009e+00 2.20642000e-01 -1.62084496e+00 -1.75899005e+00 1.86016500e+00 3.45744991e+00 1.19967997e+00 -7.07269490e-01 3.76514971e-01 1.01101494e+00 6.44534945e-01 4.57949996e-01 -1.61155015e-01 9.67384219e-01 -2.24584997e-01 1.82031000e+00 1.39744997e+00 -5.71130037e-01 -9.71300006e-01 -4.75600004e-01 2.32474995e+00 -1.42014503e+00 1.17119506e-01 2.71824986e-01 1.02921009e+00 1.59500003e-01 4.15562510e-01 1.18238997e+00 -1.15129995e+00 -3.63685012e-01 3.52050006e-01 3.86325002e-01 1.02406406e+00 1.04363990e+00 -1.28431499e+00 -2.14979982e+00 6.72260046e-01 1.48233497e+00 3.57834995e-01 -1.51650012e-02 3.15499902e-02 3.16349983e-01 1.90145993e+00 -5.13345003e-01 
1.16266000e+00 -8.72105002e-01 3.03750038e-02 -1.60745001e+00 7.75390029e-01 -1.44249976e-01 4.59200025e-01 -9.75449979e-01 4.28054988e-01 6.44192040e-01 -1.73696005e+00 -1.15984404e+00 2.92794991e+00 4.20904994e-01 -7.75261521e-01 1.72979999e+00 2.76244521e-01 5.61219990e-01 -5.47749996e-01 1.09489501e+00 2.78980017e+00 1.07026005e+00 8.90510023e-01 1.94999576e-03 6.34559989e-01 1.06867003e+00 1.22264004e+00 2.95364976e-01 -8.21929991e-01 2.58151501e-01 5.03349960e-01 -2.57000327e-03 2.30299979e-02 -2.14648515e-01 3.26564968e-01 -1.63024998e+00 3.78099978e-02 -4.26829994e-01 -6.51844978e-01 -1.23925853e+00 4.24569994e-01 -4.91954982e-01 4.06481504e-01 -2.18875006e-01 -6.94779992e-01 7.22979963e-01 1.62361002e+00 -3.65560018e-02 -5.29915035e-01 -2.71149993e+00 -5.93204975e-01 -2.05325007e+00 1.38151991e+00 -7.09999800e-02 -3.78019989e-01 8.85535002e-01 -5.24408460e-01 -3.80490005e-01 -1.80829000e+00 -1.19086552e+00 -1.63674998e+00 6.25100017e-01 -1.21773899e+00 -1.57907999e+00 2.94729996e+00 5.16335011e-01 1.46704996e+00 2.22909999e+00 -8.08169961e-01 -1.34388506e+00 7.74595022e-01 -1.40117300e+00 -1.74738002e+00 -1.44509995e+00 -1.70933998e+00 1.01620996e+00 -3.59014988e+00 4.77695018e-01 2.09694996e-01 8.50105047e-01 -3.52610022e-01 9.28799987e-01 -8.98850024e-01 9.13850069e-02 1.66445005e+00 -6.70095026e-01 2.85214996e+00 -1.26300502e+00 1.30885005e+00 1.98839998e+00 -3.59948486e-01 -2.57914998e-02 4.51515019e-01 -5.92440963e-01 -1.04718995e+00 2.03301001e+00 8.14639986e-01 8.74464989e-01 -4.25835013e-01 2.34526500e-01 -3.76450002e-01 7.44850039e-02 6.19099975e-01 -1.22574985e-01 6.59500003e-01 8.23069990e-01 -3.87654960e-01 -8.83099973e-01 -1.49570000e+00 5.94694972e-01 -1.08964491e+00 -2.49156505e-01 -6.54134989e-01 8.49979997e-01 4.75750029e-01 -1.33160496e+00 3.02715003e-01 1.25730002e+00 1.41860008e+00 1.07823002e+00 -6.13991469e-02 -2.88854986e-01 1.76905498e-01 9.00340021e-01 -9.87725019e-01 1.71143997e+00 1.26633000e+00 -1.55230001e-01 7.26096988e-01 
4.25249994e-01 1.13871503e+00 -1.01552501e-01 3.03698152e-01 -2.07424998e+00 -1.10468495e+00 8.76590014e-01 1.51839995e+00 1.66681492e+00 1.06517005e+00 2.07234645e+00 -1.66084504e+00 -1.19000673e-03 -1.67089498e+00 -1.10319996e+00 1.14716494e+00 -3.28750014e-01 -1.81500006e+00 1.38399506e+00 -5.49000055e-02 1.79454994e+00 -5.05070508e-01 2.36115003e+00 5.23214996e-01 -1.64549991e-01 -4.73250151e-02 -8.50265026e-01 1.36299968e-01 9.36850011e-01 5.73079944e-01 -1.47864997e+00 -2.06215000e+00 -3.24449956e-01 -2.19420004e+00 3.29998016e-01 -2.10669994e+00 -2.55329990e+00 -3.61352503e-01 6.49405003e-01 -5.37189960e-01 7.94017017e-01 1.27259004e+00 -1.10343504e+00]
# Element-wise mean of the individual vectors of 'jacques' and 'brel'
w2 = numpy.mean([coord['jacques'],coord['brel']],axis=0)
print(w2)
[ 1.01979303e+00 -2.10774988e-01 -2.03495002e+00 3.46100003e-01 5.53399980e-01 -9.84100029e-02 5.04965007e-01 -1.13951492e+00 5.40235043e-01 -1.80635005e-01 -5.31499982e-02 -1.46449506e-01 1.48181999e+00 -1.74460006e+00 1.06970012e-01 -1.03129995e+00 1.85123491e+00 -8.48654985e-01 -2.33174992e+00 -8.28119993e-01 -6.60560012e-01 1.14914984e-01 3.33649993e-01 3.59766006e-01 7.84144998e-01 -1.49019504e+00 -2.53000021e-01 -8.01829994e-01 2.78770018e+00 -4.06379986e+00 -4.79430497e-01 2.94939995e-01 1.20925999e+00 -2.54099965e-01 -5.79618514e-01 -2.09415007e+00 -1.23400998e+00 -8.96940082e-02 -1.69100165e-02 -1.75228044e-01 1.57814002e+00 -7.68379986e-01 -8.17414999e-01 2.16540003e+00 1.84099996e+00 1.52745008e+00 -1.37559009e+00 -1.97045004e+00 2.80200005e-01 3.40250015e+00 -5.09494960e-01 -7.44077027e-01 -4.35072511e-01 -4.09240007e-01 -9.60945010e-01 6.17359951e-02 -1.96344995e+00 -2.07924986e+00 1.46777999e+00 -2.54750013e+00 -1.22735500e+00 -1.16578496e+00 4.52350006e-02 -1.70254993e+00 3.94274998e+00 -4.81114984e-01 8.29313040e-01 1.68735003e+00 -3.51190001e-01 -4.03785020e-01 -9.59834993e-01 -1.20859504e+00 -7.79184997e-01 -9.71419990e-01 -2.15324998e+00 1.94050014e-01 3.23765010e-01 7.17572987e-01 -2.48300016e-01 4.37364995e-01 3.34208667e-01 1.55252492e+00 2.86530018e-01 -1.56719998e-01 -3.37050009e+00 2.20642000e-01 -1.62084496e+00 -1.75899005e+00 1.86016500e+00 3.45744991e+00 1.19967997e+00 -7.07269490e-01 3.76514971e-01 1.01101494e+00 6.44534945e-01 4.57949996e-01 -1.61155015e-01 9.67384219e-01 -2.24584997e-01 1.82031000e+00 1.39744997e+00 -5.71130037e-01 -9.71300006e-01 -4.75600004e-01 2.32474995e+00 -1.42014503e+00 1.17119506e-01 2.71824986e-01 1.02921009e+00 1.59500003e-01 4.15562510e-01 1.18238997e+00 -1.15129995e+00 -3.63685012e-01 3.52050006e-01 3.86325002e-01 1.02406406e+00 1.04363990e+00 -1.28431499e+00 -2.14979982e+00 6.72260046e-01 1.48233497e+00 3.57834995e-01 -1.51650012e-02 3.15499902e-02 3.16349983e-01 1.90145993e+00 -5.13345003e-01 
1.16266000e+00 -8.72105002e-01 3.03750038e-02 -1.60745001e+00 7.75390029e-01 -1.44249976e-01 4.59200025e-01 -9.75449979e-01 4.28054988e-01 6.44192040e-01 -1.73696005e+00 -1.15984404e+00 2.92794991e+00 4.20904994e-01 -7.75261521e-01 1.72979999e+00 2.76244521e-01 5.61219990e-01 -5.47749996e-01 1.09489501e+00 2.78980017e+00 1.07026005e+00 8.90510023e-01 1.94999576e-03 6.34559989e-01 1.06867003e+00 1.22264004e+00 2.95364976e-01 -8.21929991e-01 2.58151501e-01 5.03349960e-01 -2.57000327e-03 2.30299979e-02 -2.14648515e-01 3.26564968e-01 -1.63024998e+00 3.78099978e-02 -4.26829994e-01 -6.51844978e-01 -1.23925853e+00 4.24569994e-01 -4.91954982e-01 4.06481504e-01 -2.18875006e-01 -6.94779992e-01 7.22979963e-01 1.62361002e+00 -3.65560018e-02 -5.29915035e-01 -2.71149993e+00 -5.93204975e-01 -2.05325007e+00 1.38151991e+00 -7.09999800e-02 -3.78019989e-01 8.85535002e-01 -5.24408460e-01 -3.80490005e-01 -1.80829000e+00 -1.19086552e+00 -1.63674998e+00 6.25100017e-01 -1.21773899e+00 -1.57907999e+00 2.94729996e+00 5.16335011e-01 1.46704996e+00 2.22909999e+00 -8.08169961e-01 -1.34388506e+00 7.74595022e-01 -1.40117300e+00 -1.74738002e+00 -1.44509995e+00 -1.70933998e+00 1.01620996e+00 -3.59014988e+00 4.77695018e-01 2.09694996e-01 8.50105047e-01 -3.52610022e-01 9.28799987e-01 -8.98850024e-01 9.13850069e-02 1.66445005e+00 -6.70095026e-01 2.85214996e+00 -1.26300502e+00 1.30885005e+00 1.98839998e+00 -3.59948486e-01 -2.57914998e-02 4.51515019e-01 -5.92440963e-01 -1.04718995e+00 2.03301001e+00 8.14639986e-01 8.74464989e-01 -4.25835013e-01 2.34526500e-01 -3.76450002e-01 7.44850039e-02 6.19099975e-01 -1.22574985e-01 6.59500003e-01 8.23069990e-01 -3.87654960e-01 -8.83099973e-01 -1.49570000e+00 5.94694972e-01 -1.08964491e+00 -2.49156505e-01 -6.54134989e-01 8.49979997e-01 4.75750029e-01 -1.33160496e+00 3.02715003e-01 1.25730002e+00 1.41860008e+00 1.07823002e+00 -6.13991469e-02 -2.88854986e-01 1.76905498e-01 9.00340021e-01 -9.87725019e-01 1.71143997e+00 1.26633000e+00 -1.55230001e-01 7.26096988e-01 
4.25249994e-01 1.13871503e+00 -1.01552501e-01 3.03698152e-01 -2.07424998e+00 -1.10468495e+00 8.76590014e-01 1.51839995e+00 1.66681492e+00 1.06517005e+00 2.07234645e+00 -1.66084504e+00 -1.19000673e-03 -1.67089498e+00 -1.10319996e+00 1.14716494e+00 -3.28750014e-01 -1.81500006e+00 1.38399506e+00 -5.49000055e-02 1.79454994e+00 -5.05070508e-01 2.36115003e+00 5.23214996e-01 -1.64549991e-01 -4.73250151e-02 -8.50265026e-01 1.36299968e-01 9.36850011e-01 5.73079944e-01 -1.47864997e+00 -2.06215000e+00 -3.24449956e-01 -2.19420004e+00 3.29998016e-01 -2.10669994e+00 -2.55329990e+00 -3.61352503e-01 6.49405003e-01 -5.37189960e-01 7.94017017e-01 1.27259004e+00 -1.10343504e+00]
# Check: sum of squared differences between w1 and w2
print(numpy.sum((w1-w2)**2))
0.0