#nltk
import nltk
print(nltk.__version__)
3.6.5
#gensim
import gensim
print(gensim.__version__)
4.0.1
#change the default working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#import the data file
import pandas
D = pandas.read_excel("imdb_reviews_1000.xlsx")
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID            2000 non-null   int64
 1   label         2000 non-null   object
 2   commentaires  2000 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB
#first rows
D.head()
| | ID | label | commentaires |
---|---|---|---|
0 | 1 | neg | I don't have much to say about this movie. It ... |
1 | 2 | neg | Race car drivers say that 100 mph seems fast t... |
2 | 3 | neg | I found this film to be quite an oddity. From ... |
3 | 4 | neg | Jane Russell proved to be a delightful musical... |
4 | 5 | neg | This movie makes Canadians and Brits out to be... |
#retrieve the comments as a list
corpus = D['commentaires'].tolist()
print(corpus[0])
I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful. The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made. I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts. Peter, why couldn't you have stopped at BEING THERE?
#convert to lowercase
corpus = [doc.lower() for doc in corpus]
print(corpus[0])
i don't have much to say about this movie. it could have been a wonderful tour-de-force for peter sellers, but it is one of the most tragic misfires in movie history. that it was sellers final movie makes it all the more painful. the terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made. i wish so much that i could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. the truth hurts. peter, why couldn't you have stopped at being there?
#list of punctuation characters
import string
ponctuations = list(string.punctuation)
print(ponctuations)
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
#remove the punctuation
corpus = ["".join([char for char in doc if char not in ponctuations]) for doc in corpus]
print(corpus[0])
i dont have much to say about this movie it could have been a wonderful tourdeforce for peter sellers but it is one of the most tragic misfires in movie history that it was sellers final movie makes it all the more painful the terrible screenplay direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made i wish so much that i could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber but its a lifeless humorless disaster the truth hurts peter why couldnt you have stopped at being there
#the 'punkt' tokenization model is required
#download it if not already done
import nltk
#nltk.download()
#nltk.download('punkt')
#turn the corpus into a list of lists (the documents)
#through tokenization
from nltk.tokenize import word_tokenize
corpus_tk = [word_tokenize(doc) for doc in corpus]
#before tokenization
print(corpus[0])
#after tokenization
print('\n')
print(corpus_tk[0])
i dont have much to say about this movie it could have been a wonderful tourdeforce for peter sellers but it is one of the most tragic misfires in movie history that it was sellers final movie makes it all the more painful the terrible screenplay direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made i wish so much that i could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber but its a lifeless humorless disaster the truth hurts peter why couldnt you have stopped at being there

['i', 'dont', 'have', 'much', 'to', 'say', 'about', 'this', 'movie', 'it', 'could', 'have', 'been', 'a', 'wonderful', 'tourdeforce', 'for', 'peter', 'sellers', 'but', 'it', 'is', 'one', 'of', 'the', 'most', 'tragic', 'misfires', 'in', 'movie', 'history', 'that', 'it', 'was', 'sellers', 'final', 'movie', 'makes', 'it', 'all', 'the', 'more', 'painful', 'the', 'terrible', 'screenplay', 'direction', 'and', 'shockingly', 'wooden', 'performances', 'all', 'come', 'dreadfully', 'together', 'to', 'make', 'this', 'one', 'of', 'the', 'most', 'unwatchably', 'awful', 'movies', 'ever', 'made', 'i', 'wish', 'so', 'much', 'that', 'i', 'could', 'find', 'even', 'a', 'snicker', 'or', 'a', 'chuckle', 'buried', 'somewhere', 'in', 'this', 'pile', 'of', 'putrid', 'blubber', 'but', 'its', 'a', 'lifeless', 'humorless', 'disaster', 'the', 'truth', 'hurts', 'peter', 'why', 'couldnt', 'you', 'have', 'stopped', 'at', 'being', 'there']
#import the library for lemmatization
#if not already done
#nltk.download('wordnet')
#lemmatization
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
corpus_lm = [[lem.lemmatize(mot) for mot in doc] for doc in corpus_tk]
print(corpus_lm[0])
['i', 'dont', 'have', 'much', 'to', 'say', 'about', 'this', 'movie', 'it', 'could', 'have', 'been', 'a', 'wonderful', 'tourdeforce', 'for', 'peter', 'seller', 'but', 'it', 'is', 'one', 'of', 'the', 'most', 'tragic', 'misfire', 'in', 'movie', 'history', 'that', 'it', 'wa', 'seller', 'final', 'movie', 'make', 'it', 'all', 'the', 'more', 'painful', 'the', 'terrible', 'screenplay', 'direction', 'and', 'shockingly', 'wooden', 'performance', 'all', 'come', 'dreadfully', 'together', 'to', 'make', 'this', 'one', 'of', 'the', 'most', 'unwatchably', 'awful', 'movie', 'ever', 'made', 'i', 'wish', 'so', 'much', 'that', 'i', 'could', 'find', 'even', 'a', 'snicker', 'or', 'a', 'chuckle', 'buried', 'somewhere', 'in', 'this', 'pile', 'of', 'putrid', 'blubber', 'but', 'it', 'a', 'lifeless', 'humorless', 'disaster', 'the', 'truth', 'hurt', 'peter', 'why', 'couldnt', 'you', 'have', 'stopped', 'at', 'being', 'there']
#import the stopwords library
#if not already done
#nltk.download('stopwords')
#load the stopwords
from nltk.corpus import stopwords
mots_vides = stopwords.words('english')
print(mots_vides)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
#remove the stopwords
corpus_sw = [[mot for mot in doc if not (mot in mots_vides)] for doc in corpus_lm]
#check - original version
print(corpus_lm[0])
#without the stopwords
print('\n')
print(corpus_sw[0])
['i', 'dont', 'have', 'much', 'to', 'say', 'about', 'this', 'movie', 'it', 'could', 'have', 'been', 'a', 'wonderful', 'tourdeforce', 'for', 'peter', 'seller', 'but', 'it', 'is', 'one', 'of', 'the', 'most', 'tragic', 'misfire', 'in', 'movie', 'history', 'that', 'it', 'wa', 'seller', 'final', 'movie', 'make', 'it', 'all', 'the', 'more', 'painful', 'the', 'terrible', 'screenplay', 'direction', 'and', 'shockingly', 'wooden', 'performance', 'all', 'come', 'dreadfully', 'together', 'to', 'make', 'this', 'one', 'of', 'the', 'most', 'unwatchably', 'awful', 'movie', 'ever', 'made', 'i', 'wish', 'so', 'much', 'that', 'i', 'could', 'find', 'even', 'a', 'snicker', 'or', 'a', 'chuckle', 'buried', 'somewhere', 'in', 'this', 'pile', 'of', 'putrid', 'blubber', 'but', 'it', 'a', 'lifeless', 'humorless', 'disaster', 'the', 'truth', 'hurt', 'peter', 'why', 'couldnt', 'you', 'have', 'stopped', 'at', 'being', 'there']

['dont', 'much', 'say', 'movie', 'could', 'wonderful', 'tourdeforce', 'peter', 'seller', 'one', 'tragic', 'misfire', 'movie', 'history', 'wa', 'seller', 'final', 'movie', 'make', 'painful', 'terrible', 'screenplay', 'direction', 'shockingly', 'wooden', 'performance', 'come', 'dreadfully', 'together', 'make', 'one', 'unwatchably', 'awful', 'movie', 'ever', 'made', 'wish', 'much', 'could', 'find', 'even', 'snicker', 'chuckle', 'buried', 'somewhere', 'pile', 'putrid', 'blubber', 'lifeless', 'humorless', 'disaster', 'truth', 'hurt', 'peter', 'couldnt', 'stopped']
#drop tokens with fewer than 3 characters
corpus_sw = [[mot for mot in doc if len(mot) >= 3] for doc in corpus_sw]
print(corpus_sw[0])
['dont', 'much', 'say', 'movie', 'could', 'wonderful', 'tourdeforce', 'peter', 'seller', 'one', 'tragic', 'misfire', 'movie', 'history', 'seller', 'final', 'movie', 'make', 'painful', 'terrible', 'screenplay', 'direction', 'shockingly', 'wooden', 'performance', 'come', 'dreadfully', 'together', 'make', 'one', 'unwatchably', 'awful', 'movie', 'ever', 'made', 'wish', 'much', 'could', 'find', 'even', 'snicker', 'chuckle', 'buried', 'somewhere', 'pile', 'putrid', 'blubber', 'lifeless', 'humorless', 'disaster', 'truth', 'hurt', 'peter', 'couldnt', 'stopped']
#load the pre-trained word embeddings
from gensim.models import KeyedVectors
trained = KeyedVectors.load_word2vec_format("enwiki_20180420_nolg_100d.txt.bz2",binary=False,unicode_errors='ignore')
#dimensions
print(trained.vectors.shape)
(4530030, 100)
#coordinates of the word 'mcenroe'
print(trained['mcenroe'])
[-7.4420e-01 -4.5610e-01 -1.2600e-02 5.6630e-01 -4.6300e-01 7.3300e-02 6.8820e-01 1.9560e-01 -8.9910e-01 1.0800e-01 7.0610e-01 -8.0840e-01 -1.8190e-01 3.6030e-01 -6.8800e-02 -3.0630e-01 -6.1360e-01 3.6390e-01 -7.3810e-01 -7.9940e-01 4.6100e-02 4.3510e-01 -9.6800e-02 -3.5930e-01 -4.9690e-01 -4.5480e-01 5.3730e-01 -1.0270e-01 1.3370e-01 8.5890e-01 3.1370e-01 -9.6030e-01 -3.6690e-01 8.3000e-03 -1.4760e-01 3.3450e-01 1.2571e+00 -5.9200e-01 8.7860e-01 -4.4370e-01 -1.1520e-01 2.8510e-01 4.1550e-01 4.9250e-01 -4.0670e-01 -3.1810e-01 -1.4320e-01 -7.2440e-01 9.2200e-02 2.7000e-02 1.1894e+00 2.5290e-01 7.8090e-01 -1.1340e-01 5.4830e-01 5.0870e-01 -2.9000e-01 9.5020e-01 2.8940e-01 7.5940e-01 7.1400e-02 8.5000e-01 -6.8700e-02 4.7350e-01 3.1300e-01 3.9880e-01 -6.0740e-01 9.3000e-02 1.2710e-01 -1.6430e-01 2.3380e-01 -1.3960e-01 4.2420e-01 3.6150e-01 -1.5500e-01 1.1000e-03 -4.3480e-01 -1.2919e+00 8.9000e-03 6.4780e-01 1.7650e-01 -2.1630e-01 -2.5750e-01 -2.1000e-02 1.5820e-01 4.2400e-02 -2.6540e-01 1.4170e-01 -1.0390e-01 3.2700e-01 2.9700e-01 -5.0000e-02 -2.8050e-01 -4.8700e-01 1.8690e-01 1.1499e+00 2.3760e-01 7.0400e-02 4.2950e-01 8.1050e-01]
#most similar terms to 'mcenroe'
print(trained.most_similar(['mcenroe']))
[('ENTITY/John_McEnroe', 0.9442253708839417), ('ENTITY/Jimmy_Connors', 0.8725614547729492), ('ENTITY/Jim_Courier', 0.8710089325904846), ('ENTITY/Boris_Becker', 0.8668992519378662), ('ENTITY/Peter_Fleming_(tennis)', 0.8609349131584167), ('ENTITY/Patrick_McEnroe', 0.8583357930183411), ('ENTITY/Pat_Cash', 0.8523910045623779), ('ENTITY/Ivan_Lendl', 0.8495336771011353), ('ENTITY/Stefan_Edberg', 0.842862606048584), ('ENTITY/Mats_Wilander', 0.836287260055542)]
#the emblematic word2vec example
print(trained.most_similar(positive=['king','woman'],negative=['man']))
[('queen', 0.8103642463684082), ('ENTITY/Queen_regnant', 0.7852334976196289), ('ENTITY/Queen_consort', 0.7786893248558044), ('ENTITY/Queen_mother', 0.7416544556617737), ('ENTITY/King', 0.7342149615287781), ('ENTITY/Monarch', 0.7231631278991699), ('monarch', 0.7166460752487183), ('ENTITY/Nana_of_Iberia', 0.7136412858963013), ('ENTITY/Queen_dowager', 0.7117531895637512), ('ENTITY/Theodora_Komnene,_Queen_of_Jerusalem', 0.7023915648460388)]
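The scores returned by most_similar are cosine similarities; they can also be queried directly for a pair of terms (a small additional sketch, output not reproduced here):

#cosine similarity between two words of the vocabulary
print(trained.similarity('king','queen'))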
Representation of a document: the average of the vectors of the terms that compose it. For a document whose tokens recognized by the model are t1, ..., tn, the document vector is the component-wise mean of trained[t1], ..., trained[tn].
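To illustrate the principle on a toy example before the more robust function defined below (a minimal sketch, assuming these three hypothetical tokens belong to the vocabulary of the pre-trained model):

#minimal sketch of the averaging principle (hypothetical token list)
import numpy
tokens = ['movie', 'awful', 'disaster']
#the mean of the three 100-dimensional word vectors is the document vector
vec_moyen = numpy.mean([trained[tk] for tk in tokens], axis=0)
print(vec_moyen.shape)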
#numpy library
import numpy
#function to turn a document into a vector
#from the tokens it contains
#input: the document to process
#       the pre-trained model
#output: the vector representing the document
def my_doc_2_vec(doc,trained):
    #dimension of the representation space
    p = trained.vectors.shape[1]
    #initialize the vector
    vec = numpy.zeros(p)
    #number of tokens found in the model
    nb = 0
    #process each token of the document
    for tk in doc:
        #only handle tokens known to the model
        try:
            values = trained[tk]
            vec = vec + values
            nb = nb + 1.0
        except KeyError:
            pass
    #average the values
    #only if at least one recognized token was found, of course
    if (nb > 0.0):
        vec = vec/nb
    #return the vector
    #if no token was found, the vector contains only zeros
    return vec
#process the documents of the corpus
docsVec = list()
#for each document of the cleaned corpus
for doc in corpus_sw:
    #compute its vector
    vec = my_doc_2_vec(doc,trained)
    #append it to the list
    docsVec.append(vec)
#convert to a numpy matrix
matVec = numpy.array(docsVec)
print(matVec.shape)
(2000, 100)
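As a side note, a more compact variant of my_doc_2_vec is possible (a sketch under the assumption of gensim 4.x, where the vocabulary is exposed through key_to_index); it filters the tokens explicitly instead of relying on try/except:

#alternative version: explicit vocabulary check, then a single numpy average
def my_doc_2_vec_alt(doc,trained):
    #keep only the tokens known to the pre-trained model
    vecs = [trained[tk] for tk in doc if tk in trained.key_to_index]
    #zero vector if no token is recognized, otherwise the mean of the vectors
    if len(vecs) == 0:
        return numpy.zeros(trained.vectors.shape[1])
    return numpy.mean(vecs, axis=0)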
#convert to a data frame
df = pandas.DataFrame(matVec,columns=["v"+str(i+1) for i in range(matVec.shape[1])])
df.head()
| | v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | ... | v91 | v92 | v93 | v94 | v95 | v96 | v97 | v98 | v99 | v100 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.131553 | 0.100132 | 0.165700 | 0.009436 | 0.010170 | -0.003608 | 0.145883 | 0.060662 | -0.397132 | -0.195987 | ... | 0.441032 | -0.390251 | 0.169462 | -0.079419 | -0.191558 | 0.270696 | -0.174706 | -0.143091 | 0.357096 | 0.118587 |
1 | -0.249970 | 0.083869 | 0.091946 | 0.048479 | -0.125262 | -0.049997 | 0.131749 | 0.020954 | -0.309151 | -0.135079 | ... | 0.346065 | -0.278472 | 0.143229 | -0.054096 | -0.244812 | 0.326215 | -0.174979 | -0.187383 | 0.308165 | 0.169554 |
2 | -0.088852 | 0.171857 | 0.105345 | 0.074749 | 0.015463 | -0.009727 | 0.066934 | 0.106400 | -0.379400 | -0.093555 | ... | 0.320787 | -0.299099 | 0.116794 | -0.092579 | -0.259819 | 0.217037 | -0.135245 | -0.183582 | 0.358513 | 0.191394 |
3 | -0.257327 | 0.175456 | 0.078214 | -0.064266 | -0.072026 | 0.016851 | 0.039667 | 0.011454 | -0.383173 | 0.029893 | ... | 0.243410 | -0.186563 | 0.188371 | -0.154981 | -0.249814 | 0.295906 | -0.146443 | -0.096103 | 0.292211 | 0.106881 |
4 | -0.199620 | 0.125612 | 0.011547 | 0.017742 | 0.017988 | 0.014695 | 0.078220 | 0.037230 | -0.346651 | -0.066920 | ... | 0.372772 | -0.303270 | 0.194698 | -0.118251 | -0.239577 | 0.222664 | -0.118425 | -0.125406 | 0.392238 | 0.157714 |
5 rows × 100 columns
#append the class label
df['label'] = D.label
df.head()
| | v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | ... | v92 | v93 | v94 | v95 | v96 | v97 | v98 | v99 | v100 | label |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.131553 | 0.100132 | 0.165700 | 0.009436 | 0.010170 | -0.003608 | 0.145883 | 0.060662 | -0.397132 | -0.195987 | ... | -0.390251 | 0.169462 | -0.079419 | -0.191558 | 0.270696 | -0.174706 | -0.143091 | 0.357096 | 0.118587 | neg |
1 | -0.249970 | 0.083869 | 0.091946 | 0.048479 | -0.125262 | -0.049997 | 0.131749 | 0.020954 | -0.309151 | -0.135079 | ... | -0.278472 | 0.143229 | -0.054096 | -0.244812 | 0.326215 | -0.174979 | -0.187383 | 0.308165 | 0.169554 | neg |
2 | -0.088852 | 0.171857 | 0.105345 | 0.074749 | 0.015463 | -0.009727 | 0.066934 | 0.106400 | -0.379400 | -0.093555 | ... | -0.299099 | 0.116794 | -0.092579 | -0.259819 | 0.217037 | -0.135245 | -0.183582 | 0.358513 | 0.191394 | neg |
3 | -0.257327 | 0.175456 | 0.078214 | -0.064266 | -0.072026 | 0.016851 | 0.039667 | 0.011454 | -0.383173 | 0.029893 | ... | -0.186563 | 0.188371 | -0.154981 | -0.249814 | 0.295906 | -0.146443 | -0.096103 | 0.292211 | 0.106881 | neg |
4 | -0.199620 | 0.125612 | 0.011547 | 0.017742 | 0.017988 | 0.014695 | 0.078220 | 0.037230 | -0.346651 | -0.066920 | ... | -0.303270 | 0.194698 | -0.118251 | -0.239577 | 0.222664 | -0.118425 | -0.125406 | 0.392238 | 0.157714 | neg |
5 rows × 101 columns
#train/test partition
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(df,train_size=0.7,stratify=df.label,random_state=0)
print(dfTrain.shape)
print(dfTest.shape)
(1400, 101)
(600, 101)
#SVM with the default RBF kernel
from sklearn.svm import SVC
clf = SVC(random_state=0)
clf.fit(dfTrain[dfTrain.columns[:-1]],dfTrain.label)
SVC(random_state=0)
#prediction on the test set
pred = clf.predict(dfTest[dfTest.columns[:-1]])
print(pred.shape)
(600,)
#performance evaluation
from sklearn import metrics
print(metrics.classification_report(dfTest.label,pred))
              precision    recall  f1-score   support

         neg       0.79      0.79      0.79       300
         pos       0.79      0.79      0.79       300

    accuracy                           0.79       600
   macro avg       0.79      0.79      0.79       600
weighted avg       0.79      0.79      0.79       600
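To examine the errors more closely, the confusion matrix could also be computed (a suggestion added here, output not shown in the original run):

#confusion matrix on the test set (rows: observed labels, columns: predictions)
print(metrics.confusion_matrix(dfTest.label,pred))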
#document to process
mon_texte = "I really enjoyed this movie"
#convert to lowercase
my_doc = mon_texte.lower()
print(my_doc)
i really enjoyed this movie
#tokenization
my_doc_tk = word_tokenize(my_doc)
print(my_doc_tk)
['i', 'really', 'enjoyed', 'this', 'movie']
#lemmatization
my_doc_lm = [lem.lemmatize(mot) for mot in my_doc_tk]
print(my_doc_lm)
['i', 'really', 'enjoyed', 'this', 'movie']
#remove the stopwords
my_doc_sw = [mot for mot in my_doc_lm if not (mot in mots_vides)]
print(my_doc_sw)
['really', 'enjoyed', 'movie']
#with the pre-trained model
#and with the function defined above
my_vec = my_doc_2_vec(my_doc_sw,trained)
print(my_vec)
[ 0.14133333 0.4578 0.16876667 0.1376 -0.17376666 -0.139 0.1596 -0.00546667 -0.4604 -0.0272 0.69959999 0.0517 0.31383333 -0.12583333 0.25066667 -0.05556667 0.17226667 0.05286667 -0.26963334 -0.3864 0.30213333 -0.01356667 0.27853332 -0.04396667 -0.30666666 -0.3631 0.34976668 -0.21940001 -0.23033333 -0.13483334 -0.1676 -0.40646667 -0.26816667 -0.15953333 -0.20853333 0.22583334 0.37956667 -0.15933333 -0.08013333 -0.23146667 0.19406667 0.34699999 -0.116 0.0288 -0.42423333 -0.45486667 -0.10653333 -0.36789999 -0.29693334 -0.00703333 0.16506667 -0.02273333 0.42640001 -0.29513334 0.14643333 0.12113333 0.36063333 0.12966667 -0.0612 -0.0395 0.16416667 -0.1069 0.11146667 -0.16103334 0.32013335 0.18616667 -0.10243333 0.10156667 0.1852 0.074 0.04903333 -0.71543331 -0.17940001 -0.0819 0.27606667 0.10383334 0.34496667 -0.319 -0.34130001 0.00553333 0.22963333 -0.2596 -0.13866667 0.15226667 0.2449 0.00303333 0.35613332 0.01623333 0.13326667 -0.08446667 0.4282 -0.41099999 0.29809999 -0.30083333 -0.17126667 0.21246666 -0.15586667 -0.1545 0.43966667 0.14073334]
#prediction with the SVM
pred_my_doc = clf.predict(my_vec.reshape(1,-1))
#the review is predicted as positive
print(pred_my_doc)
['pos']
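To conclude, the whole processing chain for a new review (cleaning, vectorization, prediction) can be gathered into a single helper. This is a convenience sketch reusing the objects built above (ponctuations, lem, mots_vides, trained, clf and my_doc_2_vec); it is not part of the original run, but it should reproduce the 'pos' prediction for the example review.

#end-to-end sketch: from the raw text to the predicted label
def predire_commentaire(texte):
    #same cleaning steps as applied to the corpus
    doc = texte.lower()
    doc = "".join([char for char in doc if char not in ponctuations])
    tokens = [lem.lemmatize(mot) for mot in word_tokenize(doc)]
    tokens = [mot for mot in tokens if mot not in mots_vides and len(mot) >= 3]
    #vectorization with the pre-trained embeddings, then SVM prediction
    vec = my_doc_2_vec(tokens,trained)
    return clf.predict(vec.reshape(1,-1))[0]

#example call on the same review as above
print(predire_commentaire("I really enjoyed this movie"))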