#change the working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
#load the data
import pandas
D = pandas.read_csv("training.1600000.processed.noemoticon.csv",sep=",",names=['polarity','id','date','query','user','tweet'],encoding="UTF",encoding_errors="ignore")
D.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   polarity  1600000 non-null  int64
 1   id        1600000 non-null  int64
 2   date      1600000 non-null  object
 3   query     1600000 non-null  object
 4   user      1600000 non-null  object
 5   tweet     1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB
#class distribution
print(D.polarity.value_counts())
0    800000
4    800000
Name: polarity, dtype: int64
#keep the two columns of interest
#and work with only a subset of the data
DS = D.sample(n=30000,random_state=0)[['polarity','tweet']]
#first rows
DS.head()
|         | polarity | tweet |
|---------|----------|-------|
| 557138  | 0 | wants to compete! i want hard competition! i w... |
| 349381  | 0 | It seems we are stuck on the ground in Amarill... |
| 182051  | 0 | where the f are my pinking shears? rarararrrar... |
| 571236  | 0 | 0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER... |
| 1339637 | 4 | @ reply me pls |
#****************************************
#set up the libraries and tools
#for cleaning the tweets
#****************************************
#regular expressions
import re
#punctuation marks
import string
ponctuations = list(string.punctuation)
print(ponctuations)
#tokenization
#import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
#lemmatization
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
#load the stopwords
#nltk.download('stopwords')
from nltk.corpus import stopwords
mots_vides = stopwords.words('english')
print('\n')
print(mots_vides)
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
#function to clean a single document
#tweet = body of the tweet = document
#ponctuations : list of punctuation marks
#stopwords : list of stopwords to remove
#lem : lemmatizer applied to the terms
def clean_tweet(tweet,ponctuations,stopwords,lem):
    #harmonize the case
    temp = tweet.lower()
    #strip apostrophes (English contractions)
    temp = re.sub("'", "", temp)
    #remove @ mentions
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    #remove # hashtags
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    #remove web links (http and https)
    temp = re.sub(r'http\S+', '', temp)
    #remove punctuation
    temp = "".join([char for char in list(temp) if not (char in ponctuations)])
    #remove digits
    temp = re.sub("[0-9]","", temp)
    #tokenization
    temp = word_tokenize(temp)
    #lemmatize the terms
    temp = [lem.lemmatize(mot) for mot in temp]
    #remove the stopwords
    temp = [mot for mot in temp if not mot in stopwords]
    #drop tokens shorter than 3 characters
    temp = [mot for mot in temp if len(mot) >= 3]
    #rebuild the string
    temp = " ".join(mot for mot in temp)
    return temp
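#quick sanity check of the cleaning function on a made-up tweet
#(illustrative sketch; the example string is arbitrary, output not shown)
print(clean_tweet("@someone Loving this #NLP stuff!!! it's great, see http://t.co/xyz 123",ponctuations,mots_vides,lem))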
#apply the cleaning to the corpus
corpus = list(DS.tweet)
corpus = [clean_tweet(doc,ponctuations,mots_vides,lem) for doc in corpus]
#new data frame (polarity recoded: 4 -> 1, 0 -> 0)
DC = pandas.DataFrame({'polarity':(DS.polarity==4).astype('int32'),'tweet':corpus})
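#optional check (sketch, output not shown): the recoded target
#should remain balanced, like the original polarity
print(DC.polarity.value_counts())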
#comparison: before cleaning...
DS.head(10)
|         | polarity | tweet |
|---------|----------|-------|
| 557138  | 0 | wants to compete! i want hard competition! i w... |
| 349381  | 0 | It seems we are stuck on the ground in Amarill... |
| 182051  | 0 | where the f are my pinking shears? rarararrrar... |
| 571236  | 0 | 0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER... |
| 1339637 | 4 | @ reply me pls |
| 758738  | 0 | @bharathy_99: Jazz in India is just Honda stra... |
| 1065984 | 4 | aaaaaaaaaaah, met a boy. he seems nice. im hap... |
| 1373568 | 4 | @jonasbrothers http://twitpic.com/6q1om - Spor... |
| 1092137 | 4 | @saragarth Not bad, bit grumpy cause of exams ... |
| 74877   | 0 | @luke_redroot can't watch it what is it? |
#... and after cleaning
DC.head(10)
|         | polarity | tweet |
|---------|----------|-------|
| 557138  | 0 | want compete want hard competition want rally ... |
| 349381  | 0 | seems stuck ground amarillo put ground stop fl... |
| 182051  | 0 | pinking shear rarararrrarararrbabyproofing cut... |
| 571236  | 0 | meetin hate ppl vlunteer free timegrrr |
| 1339637 | 1 | reply pls |
| 758738  | 0 | jazz india honda strategy prove make affordabl... |
| 1065984 | 1 | aaaaaaaaaaah met boy seems nice happppppy |
| 1373568 | 1 | sport center guy legit quit wooww |
| 1092137 | 1 | bad bit grumpy cause exam generally |
| 74877   | 0 | cant watch |
#any empty tweets left after cleaning?
print(DC.loc[DC.tweet==""].shape[0])
163
#remove the corresponding tweets
DC_ok = DC.loc[DC.tweet != ""]
print(DC_ok.shape)
(29837, 2)
#train-test split
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(DC_ok,train_size=0.7,stratify=DC_ok.polarity,random_state=0)
#check the dimensions
print(dfTrain.shape)
print(dfTest.shape)
(20885, 2)
(8952, 2)
#keras version
import keras
print(keras.__version__)
2.7.0
#tokenization with Keras
#num_words = None => no limit on the number of terms to extract
from keras.preprocessing.text import Tokenizer
tk = Tokenizer(num_words=None)
#build the dictionary from the documents
#of the training sample
tk.fit_on_texts(dfTrain.tweet)
#number of documents processed
print(tk.document_count)
20885
#dictionary size
dico_size = len(tk.word_counts)
print(dico_size)
21057
#words and their frequencies
print(list(tk.word_counts.items())[:10])
[('najs', 1), ('watching', 296), ('daft', 3), ('punk', 5), ('intersella', 1), ('right', 341), ('still', 601), ('couple', 53), ('drink', 74), ('couldnt', 86)]
#same list, sorted by
#decreasing frequency
print(sorted(list(tk.word_counts.items()),key=lambda x: -x[1])[:10])
[('day', 1383), ('good', 1192), ('get', 1085), ('like', 1023), ('got', 989), ('going', 889), ('love', 880), ('today', 878), ('work', 870), ('dont', 857)]
#terms and their index - first 10
print(list(tk.word_index.items())[:10])
[('day', 1), ('good', 2), ('get', 3), ('like', 4), ('got', 5), ('going', 6), ('love', 7), ('today', 8), ('work', 9), ('dont', 10)]
#last 10
print(list(tk.word_index.items())[-10:])
[('wolfy', 21048), ('quotbeautifulquot', 21049), ('nacher', 21050), ('poolpartey', 21051), ('dandy', 21052), ('warhol', 21053), ('bohemian', 21054), ('mihama', 21055), ('ilovestef', 21056), ('dickhead', 21057)]
#turn the documents into sequences of token indices
#cf. the documentation
seqTrain = tk.texts_to_sequences(dfTrain.tweet)
print(seqTrain[:3])
[[7293], [65, 3935, 2780, 7294, 50, 21, 438, 326], [286, 535, 164]]
#corresponding tweets
print(dfTrain.tweet[:3])
981878    najs
951717    watching daft punk intersella right still coup...
611852    couldnt comment yesterday
Name: tweet, dtype: object
#check - najs
print(list(tk.word_index.items())[7293-1])
('najs', 7293)
#check - daft
print(list(tk.word_index.items())[3935-1])
('daft', 3935)
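#alternative lookup (sketch, output not shown): the Tokenizer also stores
#the reverse mapping index_word, which maps an index straight back to its term
print(tk.index_word[7293])
print(tk.index_word[3935])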
#maximum document length
#in the training sample
import numpy
max_length = numpy.max(numpy.array([len(doc) for doc in seqTrain]))
print(max_length)
20
#convert to padded sequences so that we
#work with a fixed-size structure
#maxlen = max_length + a margin, to possibly
#handle longer documents at deployment time
#margin
marge_length = 5
from keras.preprocessing.sequence import pad_sequences
padTrain = pad_sequences(seqTrain,maxlen=max_length + marge_length,padding='post')
#first 3 documents
print(padTrain[:3,:])
[[7293 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [65 3935 2780 7294 50 21 438 326 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [286 535 164 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
#build a model
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
#multilayer perceptron
#==> output_dim sets the dimension of the embedding space
#into which the terms are projected
#==> input_dim = dico_size + 1 because the term indices
#start at 1 (index 0 exists but is not associated with any term)
pmc = Sequential()
pmc.add(Embedding(input_dim = dico_size + 1, output_dim = 10, input_length = max_length + marge_length))
pmc.add(Flatten())
pmc.add(Dense(units=1,activation="sigmoid"))
#network structure
print(pmc.summary())
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 25, 10) 210580 flatten_2 (Flatten) (None, 250) 0 dense_2 (Dense) (None, 1) 251 ================================================================= Total params: 210,831 Trainable params: 210,831 Non-trainable params: 0 _________________________________________________________________ None
#configure the learning algorithm
pmc.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
#training - part of the train set is held out for validation
#i.e. for a more realistic monitoring of performance
history = pmc.fit(padTrain,dfTrain.polarity,epochs=10,validation_split=0.2)
Epoch 1/10
523/523 [==============================] - 1s 2ms/step - loss: 0.6783 - accuracy: 0.5897 - val_loss: 0.6361 - val_accuracy: 0.6797
Epoch 2/10
523/523 [==============================] - 1s 2ms/step - loss: 0.5446 - accuracy: 0.7651 - val_loss: 0.5564 - val_accuracy: 0.7194
Epoch 3/10
523/523 [==============================] - 1s 1ms/step - loss: 0.4150 - accuracy: 0.8362 - val_loss: 0.5437 - val_accuracy: 0.7283
Epoch 4/10
523/523 [==============================] - 1s 2ms/step - loss: 0.3198 - accuracy: 0.8860 - val_loss: 0.5529 - val_accuracy: 0.7285
Epoch 5/10
523/523 [==============================] - 1s 2ms/step - loss: 0.2486 - accuracy: 0.9182 - val_loss: 0.5716 - val_accuracy: 0.7237
Epoch 6/10
523/523 [==============================] - 1s 1ms/step - loss: 0.1966 - accuracy: 0.9388 - val_loss: 0.5966 - val_accuracy: 0.7204
Epoch 7/10
523/523 [==============================] - 1s 1ms/step - loss: 0.1586 - accuracy: 0.9526 - val_loss: 0.6241 - val_accuracy: 0.7096
Epoch 8/10
523/523 [==============================] - 1s 2ms/step - loss: 0.1303 - accuracy: 0.9619 - val_loss: 0.6571 - val_accuracy: 0.7039
Epoch 9/10
523/523 [==============================] - 1s 2ms/step - loss: 0.1089 - accuracy: 0.9691 - val_loss: 0.6862 - val_accuracy: 0.7086
Epoch 10/10
523/523 [==============================] - 1s 2ms/step - loss: 0.0920 - accuracy: 0.9740 - val_loss: 0.7232 - val_accuracy: 0.7060
#accuracy plot
import matplotlib.pyplot as plt
#function to plot the accuracy over the epochs
def graphique(history):
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
#call the function
graphique(history)
#term coordinates (embedding matrix) - dimensions
terms_coord = pmc.get_weights()[0]
print(terms_coord.shape)
(21058, 10)
#coordonnées de "good"
coord_good = terms_coord[tk.word_index['good'],:]
print(coord_good)
[ 0.06048839 0.15175326 0.02972553 -0.0928456 -0.05328212 0.16087744 -0.15695451 0.20639941 -0.16128987 0.04626485]
#coordonnées de "nice"
coord_nice = terms_coord[tk.word_index['nice'],:]
print(coord_nice)
[-0.10318595 0.20821479 0.12270731 0.05744137 0.04779329 0.17350547 -0.17394568 0.03638811 -0.07846153 -0.00998036]
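#illustrative sketch (not in the original script, output not shown): cosine similarity
#between the two embedding vectors, to see how close "good" and "nice" are in the learned space
cos_sim = numpy.dot(coord_good,coord_nice)/(numpy.linalg.norm(coord_good)*numpy.linalg.norm(coord_nice))
print(cos_sim)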
#additional layers
from keras.layers import MaxPooling1D, Dropout
#multilayer perceptron
model = Sequential()
model.add(Embedding(input_dim = dico_size + 1, output_dim = 10, input_length = max_length + marge_length))
#2 regularization operations
model.add(MaxPooling1D(pool_size=5,strides=2))
model.add(Dropout(0.7))
#rest of the network
model.add(Flatten())
model.add(Dense(units=1,activation="sigmoid"))
#network structure
print(model.summary())
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, 25, 10) 210580 max_pooling1d (MaxPooling1D (None, 11, 10) 0 ) dropout (Dropout) (None, 11, 10) 0 flatten_3 (Flatten) (None, 110) 0 dense_3 (Dense) (None, 1) 111 ================================================================= Total params: 210,691 Trainable params: 210,691 Non-trainable params: 0 _________________________________________________________________ None
#configure the learning algorithm
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])
#training - part of the train set is held out for validation
#i.e. for a more realistic monitoring of performance
history = model.fit(padTrain,dfTrain.polarity,epochs=10,validation_split=0.2)
Epoch 1/10
523/523 [==============================] - 1s 2ms/step - loss: 0.6871 - accuracy: 0.5576 - val_loss: 0.6741 - val_accuracy: 0.6181
Epoch 2/10
523/523 [==============================] - 1s 2ms/step - loss: 0.6475 - accuracy: 0.6437 - val_loss: 0.6208 - val_accuracy: 0.6897
Epoch 3/10
523/523 [==============================] - 1s 2ms/step - loss: 0.5918 - accuracy: 0.7060 - val_loss: 0.5834 - val_accuracy: 0.7067
Epoch 4/10
523/523 [==============================] - 1s 2ms/step - loss: 0.5489 - accuracy: 0.7410 - val_loss: 0.5631 - val_accuracy: 0.7299
Epoch 5/10
523/523 [==============================] - 1s 2ms/step - loss: 0.5120 - accuracy: 0.7656 - val_loss: 0.5509 - val_accuracy: 0.7364
Epoch 6/10
523/523 [==============================] - 1s 2ms/step - loss: 0.4874 - accuracy: 0.7844 - val_loss: 0.5461 - val_accuracy: 0.7335
Epoch 7/10
523/523 [==============================] - 1s 2ms/step - loss: 0.4591 - accuracy: 0.8039 - val_loss: 0.5454 - val_accuracy: 0.7343
Epoch 8/10
523/523 [==============================] - 1s 2ms/step - loss: 0.4310 - accuracy: 0.8203 - val_loss: 0.5466 - val_accuracy: 0.7290
Epoch 9/10
523/523 [==============================] - 1s 2ms/step - loss: 0.4142 - accuracy: 0.8259 - val_loss: 0.5477 - val_accuracy: 0.7381
Epoch 10/10
523/523 [==============================] - 1s 2ms/step - loss: 0.3927 - accuracy: 0.8338 - val_loss: 0.5500 - val_accuracy: 0.7345
#plot the training history
graphique(history)
#pad the sequences of the test set
seqTest = tk.texts_to_sequences(dfTest.tweet)
padTest = pad_sequences(seqTest,maxlen=max_length + marge_length,padding='post')
#tweets
print(dfTest.tweet[:3])
#check
print('\n')
print(padTest[:3,:])
1489257    work kicked butt upshot free personal trainer
1302750    haha well hoping someone would come look flat ...
1580107    welll story summer emily noob hahahh chill day...
Name: tweet, dtype: object

[[9 1344 1104 268 1705 4391 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [38 19 384 174 46 42 59 1667 89 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [15248 466 127 2691 5981 971 1 124 1397 205 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
#first model
#test set evaluation with evaluate()
print(pmc.evaluate(padTest,dfTest.polarity))
280/280 [==============================] - 0s 626us/step - loss: 0.7470 - accuracy: 0.7014
[0.7469548583030701, 0.7014074921607971]
#second model, with regularization
#test set evaluation with evaluate()
print(model.evaluate(padTest,dfTest.polarity))
280/280 [==============================] - 0s 665us/step - loss: 0.5531 - accuracy: 0.7307
[0.5530531406402588, 0.730674684047699]
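#going further (sketch, output not shown): confusion matrix on the test set,
#using a 0.5 cutoff on the probabilities predicted by the regularized model
from sklearn.metrics import confusion_matrix
pred_class = (model.predict(padTest)[:,0] > 0.5).astype('int32')
print(confusion_matrix(dfTest.polarity,pred_class))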
#a new tweet
my_tweet = "#lmfao i am sexy and i know it"
#cleaning
my_clean = clean_tweet(my_tweet,ponctuations,mots_vides,lem)
print(my_clean)
sexy know
#convert into a sequence
my_seq = tk.texts_to_sequences([my_clean])
print(my_seq)
[[1386, 15]]
#then into a padded sequence
my_pad = pad_sequences(my_seq,maxlen=max_length + marge_length,padding='post')
print(my_pad)
[[1386 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
#prediction
print(model.predict(my_pad))
[[0.87658215]]
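#to turn this probability into a class decision (sketch, 0.5 cutoff): 1 = positive polarity
print(int(model.predict(my_pad)[0,0] > 0.5))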