#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#load the data
import pandas
D = pandas.read_csv("training.1600000.processed.noemoticon.csv",sep=",",names=['polarity','id','date','query','user','tweet'],encoding="UTF",encoding_errors="ignore")
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   polarity  1600000 non-null  int64
 1   id        1600000 non-null  int64
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   tweet     1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB
#class distribution
print(D.polarity.value_counts())
0    800000
4    800000
Name: polarity, dtype: int64
#keep the two columns of interest
#and use only a subset of the data
DS = D.sample(n=30000,random_state=0)[['polarity','tweet']]
#first rows
DS.head()
|  | polarity | tweet |
|---|---|---|
| 557138 | 0 | wants to compete! i want hard competition! i w... |
| 349381 | 0 | It seems we are stuck on the ground in Amarill... |
| 182051 | 0 | where the f are my pinking shears? rarararrrar... |
| 571236 | 0 | 0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER... |
| 1339637 | 4 | @ reply me pls |
#****************************************
#set up the libraries and tools
#for cleaning the tweets
#****************************************
#regular expressions
import re
#punctuation marks
import string
ponctuations = list(string.punctuation)
print(ponctuations)
#tokenization
#import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
#lemmatization
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
#load the stopwords
#nltk.download('stopwords')
from nltk.corpus import stopwords
mots_vides = stopwords.words('english')
print('\n')
print(mots_vides)
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
#function for cleaning each document
#tweet = body of the tweet = document
#ponctuations : list of punctuation marks
#stopwords : list of stopwords to remove
#lem : lemmatizer applied to the terms
def clean_tweet(tweet,ponctuations,stopwords,lem):
    #harmonize the case
    temp = tweet.lower()
    #strip the apostrophes of English contractions (can't -> cant)
    temp = re.sub("'", "", temp)
    #remove the @ mentions
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    #remove the # hashtags
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    #remove web links (http and https)
    temp = re.sub(r'http\S+', '', temp)
    #remove punctuation
    temp = "".join([char for char in list(temp) if not (char in ponctuations)])
    #remove digits
    temp = re.sub("[0-9]","", temp)
    #tokenization
    temp = word_tokenize(temp)
    #lemmatize the terms
    temp = [lem.lemmatize(mot) for mot in temp]
    #remove the stopwords
    temp = [mot for mot in temp if not mot in stopwords]
    #drop tokens shorter than 3 characters
    temp = [mot for mot in temp if len(mot) >= 3]
    #rebuild the string
    temp = " ".join(mot for mot in temp)
    return temp
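As a quick sanity check, the function can be tried on a single hand-written tweet (the string below is a made-up example, not taken from the corpus): the mention, the URL, the punctuation, the digits and the stopwords should disappear, and the remaining tokens should be lemmatized.
#quick check on a made-up tweet (hypothetical example)
print(clean_tweet("@someone I'm loving these 2 new songs!!! http://t.co/xyz",ponctuations,mots_vides,lem))
#expected to print something like: loving new song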
#apply the cleaning to the corpus
corpus = list(DS.tweet)
corpus = [clean_tweet(doc,ponctuations,mots_vides,lem) for doc in corpus]
#new data frame (polarity recoded: 4 -> 1 for the positives, 0 for the negatives)
DC = pandas.DataFrame({'polarity':(DS.polarity==4).astype('int32'),'tweet':corpus})
#comparison before...
DS.head(10)
|  | polarity | tweet |
|---|---|---|
| 557138 | 0 | wants to compete! i want hard competition! i w... |
| 349381 | 0 | It seems we are stuck on the ground in Amarill... |
| 182051 | 0 | where the f are my pinking shears? rarararrrar... |
| 571236 | 0 | 0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER... |
| 1339637 | 4 | @ reply me pls |
| 758738 | 0 | @bharathy_99: Jazz in India is just Honda stra... |
| 1065984 | 4 | aaaaaaaaaaah, met a boy. he seems nice. im hap... |
| 1373568 | 4 | @jonasbrothers http://twitpic.com/6q1om - Spor... |
| 1092137 | 4 | @saragarth Not bad, bit grumpy cause of exams ... |
| 74877 | 0 | @luke_redroot can't watch it what is it? |
#... and after cleaning
DC.head(10)
|  | polarity | tweet |
|---|---|---|
| 557138 | 0 | want compete want hard competition want rally ... |
| 349381 | 0 | seems stuck ground amarillo put ground stop fl... |
| 182051 | 0 | pinking shear rarararrrarararrbabyproofing cut... |
| 571236 | 0 | meetin hate ppl vlunteer free timegrrr |
| 1339637 | 1 | reply pls |
| 758738 | 0 | jazz india honda strategy prove make affordabl... |
| 1065984 | 1 | aaaaaaaaaaah met boy seems nice happppppy |
| 1373568 | 1 | sport center guy legit quit wooww |
| 1092137 | 1 | bad bit grumpy cause exam generally |
| 74877 | 0 | cant watch |
#any tweets left empty after cleaning?
print(DC.loc[DC.tweet==""].shape[0])
163
#remove the corresponding tweets
DC_ok = DC.loc[DC.tweet != ""]
print(DC_ok.shape)
(29837, 2)
#train-test split
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(DC_ok,train_size=0.7,stratify=DC_ok.polarity,random_state=0)
#check
print(dfTrain.shape)
print(dfTest.shape)
(20885, 2)
(8952, 2)
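The stratify option should keep the 0/1 class proportions nearly identical in both partitions; a small additional check of our own (not part of the original output) makes this visible.
#proportion of positive tweets in each partition (expected to be close to 0.5)
print(dfTrain.polarity.mean())
print(dfTest.polarity.mean())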
#keras version
import keras
print(keras.__version__)
2.7.0
#tokenization with Keras
#num_words = None => no limit on the number of terms to extract
from keras.preprocessing.text import Tokenizer
tk = Tokenizer(num_words=None)
#build the dictionary from the documents
#of the training sample
tk.fit_on_texts(dfTrain.tweet)
#number of documents processed
print(tk.document_count)
20885
#size of the dictionary
dico_size = len(tk.word_counts)
print(dico_size)
21057
#terms with their index - the first 10
print(list(tk.word_index.items())[:10])
[('day', 1), ('good', 2), ('get', 3), ('like', 4), ('got', 5), ('going', 6), ('love', 7), ('today', 8), ('work', 9), ('dont', 10)]
#the last 10
print(list(tk.word_index.items())[-10:])
[('wolfy', 21048), ('quotbeautifulquot', 21049), ('nacher', 21050), ('poolpartey', 21051), ('dandy', 21052), ('warhol', 21053), ('bohemian', 21054), ('mihama', 21055), ('ilovestef', 21056), ('dickhead', 21057)]
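The Tokenizer ranks the terms by decreasing frequency in the training corpus: index 1 ('day') is the most frequent term, while the last indices correspond to rarely seen terms. The raw counts behind this ranking can be inspected directly (a small check of our own).
#occurrence counts of the most frequent and one of the least frequent terms
print(tk.word_counts['day'])
print(tk.word_counts['dickhead'])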
#turn the documents into sequences of token indices
#see the documentation
seqTrain = tk.texts_to_sequences(dfTrain.tweet)
print(seqTrain[:3])
[[7293], [65, 3935, 2780, 7294, 50, 21, 438, 326], [286, 535, 164]]
#corresponding tweets
print(dfTrain.tweet[:3])
981878    najs
951717    watching daft punk intersella right still coup...
611852    couldnt comment yesterday
Name: tweet, dtype: object
#maximum document length
#in the training sample
import numpy
max_length = numpy.max(numpy.array([len(doc) for doc in seqTrain]))
print(max_length)
20
#convert to pad_sequences so that we
#work with a fixed-size structure
#maxlen = max_length + a margin to handle
#possibly longer documents at deployment time
#marge_length
marge_length = 5
from keras.preprocessing.sequence import pad_sequences
padTrain = pad_sequences(seqTrain,maxlen=max_length + marge_length,padding='post')
#the first 3 documents
print(padTrain[:3,:])
[[7293    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [  65 3935 2780 7294   50   21  438  326    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [ 286  535  164    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0]]
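The margin only provides headroom: any document longer than max_length + marge_length is truncated by pad_sequences (by default from the start of the sequence). A minimal illustration on a fabricated sequence of 30 arbitrary indices:
#a 30-token sequence is cut back to 25 values, dropping the first 5 (truncating='pre' by default)
demo_seq = [list(range(1,31))]
print(pad_sequences(demo_seq,maxlen=max_length + marge_length,padding='post'))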
Download location => http://nlp.stanford.edu/data/glove.6B.zip
import numpy
#dictionary structure to store the
#(term, vector) pairs
embeddings_index = {}
#open the file for reading
#representation of dimension 50
f = open("glove.6B.50d.txt","r",encoding="utf-8")
#read line by line and split each line
for line in f:
    word, coefs = line.split(maxsplit=1)
    coefs = numpy.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs
#close the file
f.close()
#check the number of terms
print("Found %s word vectors." % len(embeddings_index))
Found 400000 word vectors.
#coordinates of the first 4 terms
print(list(embeddings_index.items())[:4])
[('the', array([ 4.1800e-01, 2.4968e-01, -4.1242e-01, 1.2170e-01, 3.4527e-01, -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01, 2.7843e-01, -1.4767e-01, -5.5677e-01, 1.4658e-01, -9.5095e-03, 1.1658e-02, 1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01, -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01, -1.8823e+00, -7.6746e-01, 9.9051e-02, -4.2125e-01, -1.9526e-01, 4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01, 5.9213e-04, 7.4449e-03, 1.7778e-01, -1.5897e-01, 1.2041e-02, -5.4223e-02, -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01, 1.8785e-01, 2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01], dtype=float32)), (',', array([ 0.013441, 0.23682 , -0.16899 , 0.40951 , 0.63812 , 0.47709 , -0.42852 , -0.55641 , -0.364 , -0.23938 , 0.13001 , -0.063734, -0.39575 , -0.48162 , 0.23291 , 0.090201, -0.13324 , 0.078639, -0.41634 , -0.15428 , 0.10068 , 0.48891 , 0.31226 , -0.1252 , -0.037512, -1.5179 , 0.12612 , -0.02442 , -0.042961, -0.28351 , 3.5416 , -0.11956 , -0.014533, -0.1499 , 0.21864 , -0.33412 , -0.13872 , 0.31806 , 0.70358 , 0.44858 , -0.080262, 0.63003 , 0.32111 , -0.46765 , 0.22786 , 0.36034 , -0.37818 , -0.56657 , 0.044691, 0.30392 ], dtype=float32)), ('.', array([ 1.5164e-01, 3.0177e-01, -1.6763e-01, 1.7684e-01, 3.1719e-01, 3.3973e-01, -4.3478e-01, -3.1086e-01, -4.4999e-01, -2.9486e-01, 1.6608e-01, 1.1963e-01, -4.1328e-01, -4.2353e-01, 5.9868e-01, 2.8825e-01, -1.1547e-01, -4.1848e-02, -6.7989e-01, -2.5063e-01, 1.8472e-01, 8.6876e-02, 4.6582e-01, 1.5035e-02, 4.3474e-02, -1.4671e+00, -3.0384e-01, -2.3441e-02, 3.0589e-01, -2.1785e-01, 3.7460e+00, 4.2284e-03, -1.8436e-01, -4.6209e-01, 9.8329e-02, -1.1907e-01, 2.3919e-01, 1.1610e-01, 4.1705e-01, 5.6763e-02, -6.3681e-05, 6.8987e-02, 8.7939e-02, -1.0285e-01, -1.3931e-01, 2.2314e-01, -8.0803e-02, -3.5652e-01, 1.6413e-02, 1.0216e-01], dtype=float32)), ('of', array([ 0.70853 , 0.57088 , -0.4716 , 0.18048 , 0.54449 , 0.72603 , 0.18157 , -0.52393 , 0.10381 , -0.17566 , 0.078852 , -0.36216 , -0.11829 , -0.83336 , 0.11917 , -0.16605 , 0.061555 , -0.012719 , -0.56623 , 0.013616 , 0.22851 , -0.14396 , -0.067549 , -0.38157 , -0.23698 , -1.7037 , -0.86692 , -0.26704 , -0.2589 , 0.1767 , 3.8676 , -0.1613 , -0.13273 , -0.68881 , 0.18444 , 0.0052464, -0.33874 , -0.078956 , 0.24185 , 0.36576 , -0.34727 , 0.28483 , 0.075693 , -0.062178 , -0.38988 , 0.22902 , -0.21617 , -0.22562 , -0.093918 , -0.80375 ], dtype=float32))]
#coordinates of "good"
print(embeddings_index['good'])
[-3.5586e-01 5.2130e-01 -6.1070e-01 -3.0131e-01 9.4862e-01 -3.1539e-01 -5.9831e-01 1.2188e-01 -3.1943e-02 5.5695e-01 -1.0621e-01 6.3399e-01 -4.7340e-01 -7.5895e-02 3.8247e-01 8.1569e-02 8.2214e-01 2.2220e-01 -8.3764e-03 -7.6620e-01 -5.6253e-01 6.1759e-01 2.0292e-01 -4.8598e-02 8.7815e-01 -1.6549e+00 -7.7418e-01 1.5435e-01 9.4823e-01 -3.9520e-01 3.7302e+00 8.2855e-01 -1.4104e-01 1.6395e-02 2.1115e-01 -3.6085e-02 -1.5587e-01 8.6583e-01 2.6309e-01 -7.1015e-01 -3.6770e-02 1.8282e-03 -1.7704e-01 2.7032e-01 1.1026e-01 1.4133e-01 -5.7322e-02 2.7207e-01 3.1305e-01 9.2771e-01]
#coordinates of "nice"
print(embeddings_index['nice'])
[ 0.20189 0.80606 -1.1281 -0.59593 0.52756 -0.4769 -0.5264 0.14526 -0.86087 0.56199 -0.43708 -0.16586 -0.23328 -0.21726 0.52114 0.062307 0.55115 -0.18002 -0.32983 -0.94434 -0.62019 0.78764 -0.36133 0.6858 0.3791 -0.87744 -0.76792 1.2885 1.142 -0.73489 2.3932 1.0967 -0.48686 0.5284 0.3927 -0.056427 0.29632 1.0798 0.45157 -0.98115 1.0037 0.15634 0.022584 -0.14832 -0.092933 0.33691 0.42171 -0.21642 1.0139 1.0535 ]
#cosine similarity between "good" and "nice"
import scipy.spatial
print(1.0-scipy.spatial.distance.cosine(embeddings_index['good'],embeddings_index['nice']))
0.7958537936210632
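The same value can be recovered directly from the definition of the cosine similarity (dot product divided by the product of the norms), as a quick cross-check:
#cosine similarity computed "by hand"
v1 = embeddings_index['good']
v2 = embeddings_index['nice']
print(numpy.dot(v1,v2)/(numpy.linalg.norm(v1)*numpy.linalg.norm(v2)))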
#number of tokens
#dictionary size + 1
num_tokens = dico_size + 1
#embedding dimension
#since we chose "glove.6B.50d.txt"
embedding_dim = 50
#number of terms found
#and list of the terms not found
hit = 0
misses = []
#*****************************************
#fill the matrix with the coordinates
#taken from the pre-trained representation
#*****************************************
#initialize with zeros
embedding_matrix = numpy.zeros((num_tokens,embedding_dim))
#fill from the pre-trained vectors,
#provided the term (from our dictionary) is present
#in the pre-trained GloVe representation
for word, i in tk.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        #fill row i of the matrix
        embedding_matrix[i] = embedding_vector
        #counter
        hit = hit + 1
    else:
        misses.append(word)
#control display
print('Dictionary terms found {0}, not found {1}'.format(hit,len(misses)))
Dictionary terms found 13387, not found 7670
7670 unreferenced terms, i.e. 7670 terms present in the tweets but absent from GloVe. The corresponding rows remain at 0 in our matrix! Beware, this is a substantial loss of information!
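One way to quantify this is to count the rows of the embedding matrix that remained entirely at zero (a check of our own; row 0 is excluded because it is not associated with any term):
#proportion of dictionary terms left with a null vector - should be close to 7670 / 21057
nb_null = numpy.sum(numpy.all(embedding_matrix == 0.0,axis=1)) - 1
print(nb_null / dico_size)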
#a few examples of terms missing
#from the pre-trained representation
print(misses[:20])
['hahaha', 'lmao', 'bday', 'youve', 'idk', 'itll', 'hahah', 'quotthe', 'youu', 'werent', 'thanx', 'tweeps', 'hahahaha', 'retweet', 'everyones', 'tshirt', 'shouldnt', 'gunna', 'quoti', 'gtlt']
#libraries
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
#build the initial embedding layer
#from the pre-trained representation
embedding_layer = Embedding(
    input_dim = num_tokens,
    output_dim = embedding_dim,
    input_length = max_length + marge_length,
    embeddings_initializer = keras.initializers.Constant(embedding_matrix), #/!\ very important
    trainable = False #/!\ very important !!!
)
#build the multilayer perceptron
#==> output_dim sets the size of the representation
#space into which the terms are projected
#==> input_dim = dico_size + 1 because the term index
#starts at column 1 (column 0 exists but is not associated with any term)
pmc = Sequential()
pmc.add(embedding_layer)
pmc.add(Flatten())
pmc.add(Dense(units=1,activation="sigmoid"))
#network structure
print(pmc.summary())
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embedding (Embedding)       (None, 25, 50)            1052900
 flatten (Flatten)           (None, 1250)              0
 dense (Dense)               (None, 1)                 1251
=================================================================
Total params: 1,054,151
Trainable params: 1,251
Non-trainable params: 1,052,900
_________________________________________________________________
None
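The parameter counts of the summary can be verified by hand: the frozen embedding layer stores num_tokens x embedding_dim weights, Flatten concatenates 25 vectors of dimension 50 into 1250 values, and the output neuron adds one weight per input plus a bias.
#checking the parameter counts of the summary
print(num_tokens * embedding_dim)                        #1052900 non-trainable weights
print((max_length + marge_length) * embedding_dim)       #1250 values after Flatten
print((max_length + marge_length) * embedding_dim + 1)   #1251 trainable parameters (weights + bias)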
#configure the learning algorithm
pmc.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
#run training - part of the training set is held out for validation
#i.e. for a more realistic monitoring of performance
history = pmc.fit(padTrain,dfTrain.polarity,epochs=10,validation_split=0.2)
Epoch 1/10
523/523 [==============================] - 1s 1ms/step - loss: 0.6552 - accuracy: 0.6096 - val_loss: 0.6298 - val_accuracy: 0.6560
Epoch 2/10
523/523 [==============================] - 1s 962us/step - loss: 0.6232 - accuracy: 0.6527 - val_loss: 0.6257 - val_accuracy: 0.6598
Epoch 3/10
523/523 [==============================] - 1s 971us/step - loss: 0.6148 - accuracy: 0.6618 - val_loss: 0.6249 - val_accuracy: 0.6617
Epoch 4/10
523/523 [==============================] - 1s 967us/step - loss: 0.6102 - accuracy: 0.6639 - val_loss: 0.6273 - val_accuracy: 0.6612
Epoch 5/10
523/523 [==============================] - 1s 977us/step - loss: 0.6084 - accuracy: 0.6670 - val_loss: 0.6279 - val_accuracy: 0.6591
Epoch 6/10
523/523 [==============================] - 0s 931us/step - loss: 0.6067 - accuracy: 0.6668 - val_loss: 0.6289 - val_accuracy: 0.6572
Epoch 7/10
523/523 [==============================] - 0s 927us/step - loss: 0.6059 - accuracy: 0.6699 - val_loss: 0.6308 - val_accuracy: 0.6567
Epoch 8/10
523/523 [==============================] - 0s 893us/step - loss: 0.6053 - accuracy: 0.6706 - val_loss: 0.6328 - val_accuracy: 0.6524
Epoch 9/10
523/523 [==============================] - 1s 1ms/step - loss: 0.6045 - accuracy: 0.6710 - val_loss: 0.6335 - val_accuracy: 0.6538
Epoch 10/10
523/523 [==============================] - 1s 989us/step - loss: 0.6041 - accuracy: 0.6703 - val_loss: 0.6334 - val_accuracy: 0.6524
#accuracy plot
import matplotlib.pyplot as plt
#function to plot the accuracy across epochs
def graphique(history):
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
#call the function
graphique(history)
#term coordinates taken from the neural network
#i.e. the weights of the embedding matrix
terms_coord = pmc.get_weights()[0]
print(terms_coord.shape)
(21058, 50)
#coordinates of "good" in the network
coord_good = terms_coord[tk.word_index['good'],:]
print(coord_good)
[-3.5586e-01 5.2130e-01 -6.1070e-01 -3.0131e-01 9.4862e-01 -3.1539e-01 -5.9831e-01 1.2188e-01 -3.1943e-02 5.5695e-01 -1.0621e-01 6.3399e-01 -4.7340e-01 -7.5895e-02 3.8247e-01 8.1569e-02 8.2214e-01 2.2220e-01 -8.3764e-03 -7.6620e-01 -5.6253e-01 6.1759e-01 2.0292e-01 -4.8598e-02 8.7815e-01 -1.6549e+00 -7.7418e-01 1.5435e-01 9.4823e-01 -3.9520e-01 3.7302e+00 8.2855e-01 -1.4104e-01 1.6395e-02 2.1115e-01 -3.6085e-02 -1.5587e-01 8.6583e-01 2.6309e-01 -7.1015e-01 -3.6770e-02 1.8282e-03 -1.7704e-01 2.7032e-01 1.1026e-01 1.4133e-01 -5.7322e-02 2.7207e-01 3.1305e-01 9.2771e-01]
#pre-trained GloVe coordinates of "good"
print(embeddings_index['good'])
[-3.5586e-01 5.2130e-01 -6.1070e-01 -3.0131e-01 9.4862e-01 -3.1539e-01 -5.9831e-01 1.2188e-01 -3.1943e-02 5.5695e-01 -1.0621e-01 6.3399e-01 -4.7340e-01 -7.5895e-02 3.8247e-01 8.1569e-02 8.2214e-01 2.2220e-01 -8.3764e-03 -7.6620e-01 -5.6253e-01 6.1759e-01 2.0292e-01 -4.8598e-02 8.7815e-01 -1.6549e+00 -7.7418e-01 1.5435e-01 9.4823e-01 -3.9520e-01 3.7302e+00 8.2855e-01 -1.4104e-01 1.6395e-02 2.1115e-01 -3.6085e-02 -1.5587e-01 8.6583e-01 2.6309e-01 -7.1015e-01 -3.6770e-02 1.8282e-03 -1.7704e-01 2.7032e-01 1.1026e-01 1.4133e-01 -5.7322e-02 2.7207e-01 3.1305e-01 9.2771e-01]
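Both vectors are identical, which is expected since the embedding layer was frozen (trainable = False); this can also be checked programmatically:
#the row of the frozen embedding should match the pre-trained GloVe vector
print(numpy.allclose(coord_good,embeddings_index['good']))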
#prepare the test sample
#format as pad_sequences
seqTest = tk.texts_to_sequences(dfTest.tweet)
padTest = pad_sequences(seqTest,maxlen=max_length + marge_length,padding='post')
#first 3 test tweets
print(dfTest.tweet[:3])
#sequence representation
print('\n')
print(padTest[:3,:])
1489257    work kicked butt upshot free personal trainer
1302750    haha well hoping someone would come look flat ...
1580107    welll story summer emily noob hahahh chill day...
Name: tweet, dtype: object

[[    9  1344  1104   268  1705  4391     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0]
 [   38    19   384   174    46    42    59  1667    89    15     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0]
 [15248   466   127  2691  5981   971     1   124  1397   205     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0]]
#evaluate the test performance with evaluate()
print(pmc.evaluate(padTest,dfTest.polarity))
280/280 [==============================] - 0s 649us/step - loss: 0.6285 - accuracy: 0.6547
[0.6285085678100586, 0.654714047908783]
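Beyond the overall accuracy, a confusion matrix gives a more detailed view of the test errors. A possible sketch, thresholding the sigmoid output at 0.5 and using scikit-learn:
#predicted probabilities on the test sample
probas = pmc.predict(padTest)
#convert into 0/1 predictions with a 0.5 threshold
pred = (probas[:,0] > 0.5).astype('int32')
#confusion matrix: rows = observed classes, columns = predicted classes
from sklearn.metrics import confusion_matrix
print(confusion_matrix(dfTest.polarity,pred))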
#a new tweet
my_tweet = "#beatles all you need is love"
#cleaning
my_clean = clean_tweet(my_tweet,ponctuations,mots_vides,lem)
print(my_clean)
need love
#convert to a sequence
my_seq = tk.texts_to_sequences([my_clean])
print(my_seq)
[[24, 7]]
#then pad it
my_pad = pad_sequences(my_seq,maxlen=max_length + marge_length,padding='post')
print(my_pad)
[[24 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
#prediction
print(pmc.predict(my_pad))
[[0.6502417]]
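The sigmoid output is read as the probability of the positive class (polarity = 1); with the usual 0.5 threshold, this tweet would therefore be labelled as positive.
#class prediction with a 0.5 threshold (1 = positive)
print((pmc.predict(my_pad) > 0.5).astype('int32'))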