Base avec 2000 observations : 1000 appréciations négatives et 1000 positives.
#vider la mémoire
rm(list=ls())
#changer de dossier
setwd("C:/Users/ricco/Desktop/demo")
#charger les données
library(readxl)
D <- readxl::read_excel("imdb_reviews_1000_train.xls")
str(D)
## tibble [2,000 x 2] (S3: tbl_df/tbl/data.frame)
## $ label : chr [1:2000] "neg" "neg" "neg" "neg" ...
## $ commentaires: chr [1:2000] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but "| __truncated__ "Race car drivers say that 100 mph seems fast till you've driven 150, and 150 mph seems fast till you've driven "| __truncated__ "I found this film to be quite an oddity. From the very get go I found it extremely hard to like this movie, and"| __truncated__ "Jane Russell proved to be a delightful musical-comedy performer in the similarly titled \"Gentlemen Prefer Blon"| __truncated__ ...
Affichage du premier document.
#1er document
print(D$commentaires[1])
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.<br /><br />The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.<br /><br />I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.<br /><br />Peter, why couldn't you have stopped at BEING THERE?"
#enlever la balise saut de ligne
D$commentaires <- gsub(pattern = "<br />",x=D$commentaires,replacement="")
print(D$commentaires[1])
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.Peter, why couldn't you have stopped at BEING THERE?"
#harmonisation de la casse
D$commentaires <- tolower(D$commentaires)
print(D$commentaires[1])
## [1] "i don't have much to say about this movie. it could have been a wonderful tour-de-force for peter sellers, but it is one of the most tragic misfires in movie history. that it was sellers final movie makes it all the more painful.the terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.i wish so much that i could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. the truth hurts.peter, why couldn't you have stopped at being there?"
Il faut transformer les documents en corpus, type reconnu par “quanteda”.
#quanteda
library(quanteda)
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
#création d'un corpus
mon_corpus <- quanteda::corpus(D,text_field='commentaires')
#print
print(mon_corpus)
## Corpus consisting of 2,000 documents and 1 docvar.
## text1 :
## "i don't have much to say about this movie. it could have bee..."
##
## text2 :
## "race car drivers say that 100 mph seems fast till you've dri..."
##
## text3 :
## "i found this film to be quite an oddity. from the very get g..."
##
## text4 :
## "jane russell proved to be a delightful musical-comedy perfor..."
##
## text5 :
## "this movie makes canadians and brits out to be asinine, moro..."
##
## text6 :
## "i'm a great admirer of lon chaney, but the screen writing of..."
##
## [ reached max_ndoc ... 1,994 more documents ]
#summary - plus détaillé (10 premiers documents ici)
#on voit les informations additionnelles
#dont un identifant de document (Text)
#types est le nombre de tokens uniques
summary(mon_corpus,10)
#quanteda distingue les informations
#relatives aux documents (ici l'etiquette -> "label")
print(head(quanteda::docvars(mon_corpus)))
## label
## 1 neg
## 2 neg
## 3 neg
## 4 neg
## 5 neg
## 6 neg
#tokenisation
#avec un nettoyage à la volée
#pas de nombres
#et pas de ponctuation (d'où la différence avec ci-dessus)
doc_tokenized_A <- quanteda::tokens(mon_corpus,remove_numbers=TRUE,remove_punct=TRUE)
print(doc_tokenized_A)
## Tokens consisting of 2,000 documents and 1 docvar.
## text1 :
## [1] "i" "don't" "have" "much" "to" "say" "about" "this" "movie"
## [10] "it" "could" "have"
## [ ... and 92 more ]
##
## text2 :
## [1] "race" "car" "drivers" "say" "that" "mph" "seems"
## [8] "fast" "till" "you've" "driven" "and"
## [ ... and 236 more ]
##
## text3 :
## [1] "i" "found" "this" "film" "to" "be" "quite" "an"
## [9] "oddity" "from" "the" "very"
## [ ... and 119 more ]
##
## text4 :
## [1] "jane" "russell" "proved" "to"
## [5] "be" "a" "delightful" "musical-comedy"
## [9] "performer" "in" "the" "similarly"
## [ ... and 107 more ]
##
## text5 :
## [1] "this" "movie" "makes" "canadians" "and" "brits"
## [7] "out" "to" "be" "asinine" "moronic" "idiots"
## [ ... and 262 more ]
##
## text6 :
## [1] "i'm" "a" "great" "admirer" "of" "lon" "chaney"
## [8] "but" "the" "screen" "writing" "of"
## [ ... and 195 more ]
##
## [ reached max_ndoc ... 1,994 more documents ]
#voir en détail le 1er document
print(doc_tokenized_A[[1]])
## [1] "i" "don't" "have" "much"
## [5] "to" "say" "about" "this"
## [9] "movie" "it" "could" "have"
## [13] "been" "a" "wonderful" "tour-de-force"
## [17] "for" "peter" "sellers" "but"
## [21] "it" "is" "one" "of"
## [25] "the" "most" "tragic" "misfires"
## [29] "in" "movie" "history" "that"
## [33] "it" "was" "sellers" "final"
## [37] "movie" "makes" "it" "all"
## [41] "the" "more" "painful.the" "terrible"
## [45] "screenplay" "direction" "and" "shockingly"
## [49] "wooden" "performances" "all" "come"
## [53] "dreadfully" "together" "to" "make"
## [57] "this" "one" "of" "the"
## [61] "most" "unwatchably" "awful" "movies"
## [65] "ever" "made.i" "wish" "so"
## [69] "much" "that" "i" "could"
## [73] "find" "even" "a" "snicker"
## [77] "or" "a" "chuckle" "buried"
## [81] "somewhere" "in" "this" "pile"
## [85] "of" "putrid" "blubber" "but"
## [89] "it's" "a" "lifeless" "humorless"
## [93] "disaster" "the" "truth" "hurts.peter"
## [97] "why" "couldn't" "you" "have"
## [101] "stopped" "at" "being" "there"
#pour rappel
print(D$commentaires[1])
## [1] "i don't have much to say about this movie. it could have been a wonderful tour-de-force for peter sellers, but it is one of the most tragic misfires in movie history. that it was sellers final movie makes it all the more painful.the terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.i wish so much that i could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. the truth hurts.peter, why couldn't you have stopped at being there?"
#quelques docs. où bad apparaît
print(head(grep(pattern='bad',D$commentaires),10))
## [1] 8 9 11 15 17 18 19 21 24 27
#ex. document n°8
print(D$commentaires[8])
## [1] "a doctor who is trying to complete the medical dream of transplantation is experimenting secretly on corpses from the hospital with varying success. his final best chance comes when he lovingly wraps his girlfriend's head in his jacket as he rescues it from a burning vehicle.i was looking for cheese and with this premise i believed i found it. it has everything everything that bad movie hunters look for - chest and brain surgery with the surgeons leaving with pristine white scrubs, unique camera angles (i always love watching the rear passenger wheels of cars), cheesy clarinet stripper music, and one of the longest death scenes in movie history. but unfortunately these so-bad-they-are-good moments can't overcome the too-bad-they-stink stretches.jan in the pan annoyed me, with her droning monologues in a hoarse whisper, the somewhat less than evil laughter, and the fact she was kept alive with some columbian home brew coffee and 2 dd batteries.i couldn't even entertain myself with dr bill's horrid overacting and moral self righteousness. usually such ham makes these movies a must see in my opinion, in this case i was bored with it.the best part of the movie in my opinion was the 1960's version of \"body shopping\" and i even found myself nodding off during that.don't spend money on this one - there are better bad movies out there to entertain your sick sense of humor."
#recherche des termes adjacents à "bad"
#2 précédents, 2 suivants
#une forme de contexualisation
print(head(quanteda::kwic(doc_tokenized_A,pattern="bad",window=2),10))
## Keyword-in-context with 10 matches.
## [text8, 65] everything that | bad | movie hunters
## [text8, 219] are better | bad | movies out
## [text11, 223] into the | bad | guys this
## [text15, 95] just as | bad | these two
## [text17, 65] everything that | bad | movie hunters
## [text17, 219] are better | bad | movies out
## [text18, 452] is plain | bad | police wok
## [text18, 520] was so | bad | towards the
## [text21, 44] it very | bad | whichever the
## [text24, 13] of embarrassingly | bad | moments i
Suite du nettoyage, suppression des mots-vides.
#récupérer une liste des mots-vides
#quanteda en propose pour l'anglais
mots_vides <- quanteda::stopwords(language="en")
print(head(mots_vides,20))
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
#en tout
print(paste("==>",length(mots_vides)))
## [1] "==> 175"
#compléter la liste
#avec des mots vides contextuels
mots_vides <- c(mots_vides,c("movie","movies","film","films"))
print(length(mots_vides))
## [1] 179
#retrait des mots-vides parmi les tokens
doc_tokenized_smv <- quanteda::tokens_select(doc_tokenized_A,pattern=mots_vides,selection="remove")
#1er doc. après nettoyage
print(doc_tokenized_smv[[1]])
## [1] "much" "say" "wonderful" "tour-de-force"
## [5] "peter" "sellers" "one" "tragic"
## [9] "misfires" "history" "sellers" "final"
## [13] "makes" "painful.the" "terrible" "screenplay"
## [17] "direction" "shockingly" "wooden" "performances"
## [21] "come" "dreadfully" "together" "make"
## [25] "one" "unwatchably" "awful" "ever"
## [29] "made.i" "wish" "much" "find"
## [33] "even" "snicker" "chuckle" "buried"
## [37] "somewhere" "pile" "putrid" "blubber"
## [41] "lifeless" "humorless" "disaster" "truth"
## [45] "hurts.peter" "stopped"
#alors qu'initialement
print(doc_tokenized_A[[1]])
## [1] "i" "don't" "have" "much"
## [5] "to" "say" "about" "this"
## [9] "movie" "it" "could" "have"
## [13] "been" "a" "wonderful" "tour-de-force"
## [17] "for" "peter" "sellers" "but"
## [21] "it" "is" "one" "of"
## [25] "the" "most" "tragic" "misfires"
## [29] "in" "movie" "history" "that"
## [33] "it" "was" "sellers" "final"
## [37] "movie" "makes" "it" "all"
## [41] "the" "more" "painful.the" "terrible"
## [45] "screenplay" "direction" "and" "shockingly"
## [49] "wooden" "performances" "all" "come"
## [53] "dreadfully" "together" "to" "make"
## [57] "this" "one" "of" "the"
## [61] "most" "unwatchably" "awful" "movies"
## [65] "ever" "made.i" "wish" "so"
## [69] "much" "that" "i" "could"
## [73] "find" "even" "a" "snicker"
## [77] "or" "a" "chuckle" "buried"
## [81] "somewhere" "in" "this" "pile"
## [85] "of" "putrid" "blubber" "but"
## [89] "it's" "a" "lifeless" "humorless"
## [93] "disaster" "the" "truth" "hurts.peter"
## [97] "why" "couldn't" "you" "have"
## [101] "stopped" "at" "being" "there"
Ou encore “document-feature matrix” dans la terminologie “quanteda”. Organisation propice à la mise en oeuvre des algorithmes de machine learning.
#construction à partir des tokens
dfm_smv <- quanteda::dfm(doc_tokenized_smv)
print(dfm_smv)
## Document-feature matrix of: 2,000 documents, 30,259 features (99.68% sparse) and 1 docvar.
## features
## docs much say wonderful tour-de-force peter sellers one tragic misfires
## text1 2 1 1 1 1 2 2 1 1
## text2 0 1 0 0 0 0 0 0 0
## text3 1 0 0 0 0 0 2 0 0
## text4 0 0 0 0 0 0 0 0 0
## text5 1 0 0 0 0 0 1 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs history
## text1 1
## text2 0
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 30,249 more features ]
#nombre de documents
print(quanteda::ndoc(dfm_smv))
## [1] 2000
#nombre de termes (features)
print(quanteda::nfeat(dfm_smv))
## [1] 30259
#identifiant des documents
print(head(quanteda::docnames(dfm_smv)))
## [1] "text1" "text2" "text3" "text4" "text5" "text6"
#liste des termes
print(head(quanteda::featnames(dfm_smv)))
## [1] "much" "say" "wonderful" "tour-de-force"
## [5] "peter" "sellers"
#fréquence des termes
print(quanteda::topfeatures(dfm_smv,10))
## one like just good even really story see can time
## 2042 1623 1370 1126 1018 910 902 897 880 878
#on aurait pu obtenir le même résultat avec un colSums
print(head(sort(colSums(dfm_smv),decreasing=TRUE),10))
## one like just good even really story see can time
## 2042 1623 1370 1126 1018 910 902 897 880 878
La matrice documents-termes (features) est propice aux calculs statistiques. “quanteda” propose un package spécialisé : “quanteda.textstats”.
#librairie pour les statistiques
library(quanteda.textstats)
#frequence globale des termes
#affichage du nombre de documents concernés
freq_terms <- quanteda.textstats::textstat_frequency(dfm_smv)
print(head(freq_terms,10))
## feature frequency rank docfreq group
## 1 one 2042 1 1100 all
## 2 like 1623 2 930 all
## 3 just 1370 3 832 all
## 4 good 1126 4 709 all
## 5 even 1018 5 663 all
## 6 really 910 6 613 all
## 7 story 902 7 617 all
## 8 see 897 8 648 all
## 9 can 880 9 632 all
## 10 time 878 10 636 all
#l'objet est compatible data.frame
#on peut effectuer des recherches simples
print(freq_terms[freq_terms$feature=="bad",])
## feature frequency rank docfreq group
## 16 bad 699 16 460 all
#frequence par groupe (utilisation de label -- recensé dans les docvars)
freq_terms_bg <- quanteda.textstats::textstat_frequency(dfm_smv,groups=label)
print(head(freq_terms_bg,10))
## feature frequency rank docfreq group
## 1 one 940 1 529 neg
## 2 like 841 2 476 neg
## 3 just 793 3 466 neg
## 4 even 608 4 385 neg
## 5 bad 554 5 344 neg
## 6 good 540 6 346 neg
## 7 really 465 7 314 neg
## 8 can 462 8 324 neg
## 9 time 427 9 319 neg
## 10 story 398 10 279 neg
#intéressons-nous de nouveau pour "bad"
print(freq_terms_bg[freq_terms_bg$feature=="bad",])
## feature frequency rank docfreq group
## 5 bad 554 5 344 neg
## 19332 bad 145 78 116 pos
#association entre les termes et l'étiquette (modalité cible : label == 'pos')
#avec l'utilisation keyness
ts_key <- quanteda.textstats::textstat_keyness(dfm_smv,target=(dfm_smv$label=='pos'))
#affichage en triant de manière décroissante
#sur la valeur absolue du khi-2
head(ts_key[order(abs(ts_key$chi2),decreasing=TRUE),],10)
D’où vient ce KHI-2 pour “bad” ? Et pourquoi il est négatif ????
#vérifions pour label = pos vs. autre (i.e. label = neg)
#somme en ligne + consolidation selon la classe
print(tapply(rowSums(dfm_smv),D$label,sum))
## neg pos
## 107687 119050
#matrice pour khi-2
#(neg=not(pos) vs. pos) x (bad vs. not(bad))
m <- matrix(c(554,145,107687-554,119050-145),nrow=2,ncol=2)
print(m)
## [,1] [,2]
## [1,] 554 107133
## [2,] 145 118905
#test d'indépendance du khi-2 sans correction de continuité
print(stats::chisq.test(m,correct=FALSE))
##
## Pearson's Chi-squared test
##
## data: m
## X-squared = 283.65, df = 1, p-value < 2.2e-16
Ok pour le KHI-2, et le signe pour déterminer si la modalité cible est sous-représenté (signe négatif) ou sur-représenté (signe positif).
Même si nous avons déjà constitué la matrice documents-termes dans la section précédente, on reprend à zéro ici pour bien préciser chaque étape.
#données d'apprentissage = données dispo ici
DTrain <- D
#création d'un corpus d'apprentissage
corpTrain <- quanteda::corpus(DTrain,text_field='commentaires')
print(corpTrain)
## Corpus consisting of 2,000 documents and 1 docvar.
## text1 :
## "i don't have much to say about this movie. it could have bee..."
##
## text2 :
## "race car drivers say that 100 mph seems fast till you've dri..."
##
## text3 :
## "i found this film to be quite an oddity. from the very get g..."
##
## text4 :
## "jane russell proved to be a delightful musical-comedy perfor..."
##
## text5 :
## "this movie makes canadians and brits out to be asinine, moro..."
##
## text6 :
## "i'm a great admirer of lon chaney, but the screen writing of..."
##
## [ reached max_ndoc ... 1,994 more documents ]
#fréquences des classes sur l'échantillon d'apprentissage
print(table(DTrain$label))
##
## neg pos
## 1000 1000
#tokenisation
dtokTrain <- quanteda::tokens(corpTrain,remove_numbers=TRUE,remove_punct=TRUE)
print(dtokTrain)
## Tokens consisting of 2,000 documents and 1 docvar.
## text1 :
## [1] "i" "don't" "have" "much" "to" "say" "about" "this" "movie"
## [10] "it" "could" "have"
## [ ... and 92 more ]
##
## text2 :
## [1] "race" "car" "drivers" "say" "that" "mph" "seems"
## [8] "fast" "till" "you've" "driven" "and"
## [ ... and 236 more ]
##
## text3 :
## [1] "i" "found" "this" "film" "to" "be" "quite" "an"
## [9] "oddity" "from" "the" "very"
## [ ... and 119 more ]
##
## text4 :
## [1] "jane" "russell" "proved" "to"
## [5] "be" "a" "delightful" "musical-comedy"
## [9] "performer" "in" "the" "similarly"
## [ ... and 107 more ]
##
## text5 :
## [1] "this" "movie" "makes" "canadians" "and" "brits"
## [7] "out" "to" "be" "asinine" "moronic" "idiots"
## [ ... and 262 more ]
##
## text6 :
## [1] "i'm" "a" "great" "admirer" "of" "lon" "chaney"
## [8] "but" "the" "screen" "writing" "of"
## [ ... and 195 more ]
##
## [ reached max_ndoc ... 1,994 more documents ]
#retrait des mots-vides
dtokTrain <- quanteda::tokens_select(dtokTrain,pattern=mots_vides,selection="remove")
print(dtokTrain)
## Tokens consisting of 2,000 documents and 1 docvar.
## text1 :
## [1] "much" "say" "wonderful" "tour-de-force"
## [5] "peter" "sellers" "one" "tragic"
## [9] "misfires" "history" "sellers" "final"
## [ ... and 34 more ]
##
## text2 :
## [1] "race" "car" "drivers" "say" "mph" "seems" "fast"
## [8] "till" "driven" "mph" "seems" "fast"
## [ ... and 136 more ]
##
## text3 :
## [1] "found" "quite" "oddity" "get" "go" "found"
## [7] "extremely" "hard" "like" "now" "little" "thinking"
## [ ... and 55 more ]
##
## text4 :
## [1] "jane" "russell" "proved" "delightful"
## [5] "musical-comedy" "performer" "similarly" "titled"
## [9] "gentlemen" "prefer" "blondes" "sadly"
## [ ... and 59 more ]
##
## text5 :
## [1] "makes" "canadians" "brits" "asinine" "moronic" "idiots"
## [7] "men" "get" "stoned" "drunk" "yell" "beat"
## [ ... and 125 more ]
##
## text6 :
## [1] "great" "admirer" "lon" "chaney" "screen" "writing" "just"
## [8] "work" "story" "jumps" "around" "oddly"
## [ ... and 89 more ]
##
## [ reached max_ndoc ... 1,994 more documents ]
#matrice documents-features (termes)
#term-frequency
dfmTrainTf <- quanteda::dfm(dtokTrain)
print(head(dfmTrainTf))
## Document-feature matrix of: 6 documents, 30,259 features (99.73% sparse) and 1 docvar.
## features
## docs much say wonderful tour-de-force peter sellers one tragic misfires
## text1 2 1 1 1 1 2 2 1 1
## text2 0 1 0 0 0 0 0 0 0
## text3 1 0 0 0 0 0 2 0 0
## text4 0 0 0 0 0 0 0 0 0
## text5 1 0 0 0 0 0 1 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs history
## text1 1
## text2 0
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_nfeat ... 30,249 more features ]
#passage en pondération binaire
#évite les problèmes de disparité d'échelle entre les termes
#certains sont plus fréquents que d'autres globalement
dfmTrainBin <- quanteda::dfm_weight(dfmTrainTf,scheme="boolean")
print(head(dfmTrainBin))
## Document-feature matrix of: 6 documents, 30,259 features (99.73% sparse) and 1 docvar.
## features
## docs much say wonderful tour-de-force peter sellers one tragic misfires
## text1 1 1 1 1 1 1 1 1 1
## text2 0 1 0 0 0 0 0 0 0
## text3 1 0 0 0 0 0 1 0 0
## text4 0 0 0 0 0 0 0 0 0
## text5 1 0 0 0 0 0 1 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs history
## text1 1
## text2 0
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_nfeat ... 30,249 more features ]
#FILTRAGE sur la fréquence
#i.e. retrait des termes trop peu fréquents (apparaissent dans moins de 10 docs.)
#min_docFreq <=> min_termFreq ici puisque pondération binaire
dfmTrainBinSel <- quanteda::dfm_trim(dfmTrainBin,min_docfreq=10)
print(head(dfmTrainBinSel))
## Document-feature matrix of: 6 documents, 3,132 features (98.20% sparse) and 1 docvar.
## features
## docs much say wonderful peter one tragic history final makes terrible
## text1 1 1 1 1 1 1 1 1 1 1
## text2 0 1 0 0 0 0 0 0 1 0
## text3 1 0 0 0 1 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 1 0 0 0 1 0 0 0 1 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 3,122 more features ]
#convertir en data frame
dataTrain <- quanteda::convert(dfmTrainBinSel,to='data.frame')
print(head(colnames(dataTrain)))
## [1] "doc_id" "much" "say" "wonderful" "peter" "one"
#virer le docId
dataTrain$doc_id <- NULL
#rajouter le label
dataTrain['LABEL'] <- factor(DTrain$label)
print(tail(colnames(dataTrain)))
## [1] "matthau" "winner" "crafted" "flawless" "chilling" "LABEL"
Toute méthode supervisée peut faire l’affaire. Mais vu la forte dimensionalité, mieux vaut utiliser d’emblée une méthode adaptée (aux données plus larges que longues) et fortement régularisées.
#construire un SVM linéaire
library(e1071)
clf <- e1071::svm(LABEL ~ ., data=dataTrain, kernel="linear")
print(clf)
##
## Call:
## svm(formula = LABEL ~ ., data = dataTrain, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 1166
La démarche doit être totalement cohérente avec les manipulations effectuées sur l’échantillon d’apprentissage. Nous devons en particulier utiliser le dictionnaire des termes correspondant.
#chargement de l'échantillon test
DTest <- readxl::read_excel("imdb_reviews_500_test.xls")
str(DTest)
## tibble [1,000 x 2] (S3: tbl_df/tbl/data.frame)
## $ label : chr [1:1000] "neg" "neg" "neg" "neg" ...
## $ commentaires: chr [1:1000] "Oh my god, what a horrible film. The film has all the right people involved, unfortunately it is not worth watc"| __truncated__ "i rate this movie with 3 skulls, only coz the girls knew how to scream, this could've been a better movie, if a"| __truncated__ "As someone who has both read the novel and seen the film, I have a different take on why the film was such a fl"| __truncated__ "This film is basically two hours of Dafoe's character drinking himself - nearly literally - to death. The only "| __truncated__ ...
#fréquence des classes sur l'échantillon test
print(table(DTest$label))
##
## neg pos
## 500 500
#réappliquer le premier nettoyage
#enlever la balise saut de ligne
DTest$commentaires <- gsub(pattern = "<br />",x=DTest$commentaires,replacement="")
print(DTest$commentaires[1])
## [1] "Oh my god, what a horrible film. The film has all the right people involved, unfortunately it is not worth watching. I saw it for free at my local library. If I had paid to watch this I would be even more upset. This film is unwatchable. How could Tarintino be involved with such a slow paced, unexciting film. No wonder it didn't get much distribution, every one involved must have been ashamed. I can make a better film with a Dated Camcorder and my Big toe. Its beyond boring, I really hated it. Tarintino just lost some standing in my eyes. This must be some kind of sick joke. Don't Bother with this film. If some one even hints you should watch it, kill them."
#harmonisation de la casse
DTest$commentaires <- tolower(DTest$commentaires)
print(DTest$commentaires[1])
## [1] "oh my god, what a horrible film. the film has all the right people involved, unfortunately it is not worth watching. i saw it for free at my local library. if i had paid to watch this i would be even more upset. this film is unwatchable. how could tarintino be involved with such a slow paced, unexciting film. no wonder it didn't get much distribution, every one involved must have been ashamed. i can make a better film with a dated camcorder and my big toe. its beyond boring, i really hated it. tarintino just lost some standing in my eyes. this must be some kind of sick joke. don't bother with this film. if some one even hints you should watch it, kill them."
#corpus échantillon test
corpTest <- quanteda::corpus(DTest,text_field='commentaires')
print(corpTest)
## Corpus consisting of 1,000 documents and 1 docvar.
## text1 :
## "oh my god, what a horrible film. the film has all the right ..."
##
## text2 :
## "i rate this movie with 3 skulls, only coz the girls knew how..."
##
## text3 :
## "as someone who has both read the novel and seen the film, i ..."
##
## text4 :
## "this film is basically two hours of dafoe's character drinki..."
##
## text5 :
## "if you didn't know better, you would believe the christian m..."
##
## text6 :
## "this movie is hilarious, not in good way. the fights are awf..."
##
## [ reached max_ndoc ... 994 more documents ]
#tokéniser l'échantillon test
dtokTest <- quanteda::tokens(corpTest,remove_numbers=TRUE,remove_punct=TRUE)
print(dtokTest)
## Tokens consisting of 1,000 documents and 1 docvar.
## text1 :
## [1] "oh" "my" "god" "what" "a" "horrible"
## [7] "film" "the" "film" "has" "all" "the"
## [ ... and 114 more ]
##
## text2 :
## [1] "i" "rate" "this" "movie" "with" "skulls" "only" "coz"
## [9] "the" "girls" "knew" "how"
## [ ... and 127 more ]
##
## text3 :
## [1] "as" "someone" "who" "has" "both" "read" "the"
## [8] "novel" "and" "seen" "the" "film"
## [ ... and 295 more ]
##
## text4 :
## [1] "this" "film" "is" "basically" "two" "hours"
## [7] "of" "dafoe's" "character" "drinking" "himself" "nearly"
## [ ... and 159 more ]
##
## text5 :
## [1] "if" "you" "didn't" "know" "better" "you"
## [7] "would" "believe" "the" "christian" "moral" "majority"
## [ ... and 103 more ]
##
## text6 :
## [1] "this" "movie" "is" "hilarious" "not" "in"
## [7] "good" "way" "the" "fights" "are" "awfully"
## [ ... and 100 more ]
##
## [ reached max_ndoc ... 994 more documents ]
#retrait des mots-vides
dtokTest <- quanteda::tokens_select(dtokTest,pattern=mots_vides,selection="remove")
print(dtokTest)
## Tokens consisting of 1,000 documents and 1 docvar.
## text1 :
## [1] "oh" "god" "horrible" "right"
## [5] "people" "involved" "unfortunately" "worth"
## [9] "watching" "saw" "free" "local"
## [ ... and 46 more ]
##
## text2 :
## [1] "rate" "skulls" "coz" "girls" "knew" "scream"
## [7] "could've" "better" "actors" "better" "twins" "ok"
## [ ... and 51 more ]
##
## text3 :
## [1] "someone" "read" "novel" "seen" "different"
## [6] "take" "flop" "first" "comparisons" "novel"
## [11] "purely" "superficial"
## [ ... and 149 more ]
##
## text4 :
## [1] "basically" "two" "hours" "dafoe's" "character" "drinking"
## [7] "nearly" "literally" "death" "surprise" "enough" "clues"
## [ ... and 75 more ]
##
## text5 :
## [1] "know" "better" "believe" "christian" "moral"
## [6] "majority" "preachy" "testimonial" "sins" "young"
## [11] "questing" "satan"
## [ ... and 48 more ]
##
## text6 :
## [1] "hilarious" "good" "way" "fights" "awfully" "bad"
## [7] "done" "sometimes" "try" "shock" "breaking" "bones"
## [ ... and 53 more ]
##
## [ reached max_ndoc ... 994 more documents ]
#transformer en dfm
dfmTestTf <- dfm(dtokTest)
print(head(dfmTestTf))
## Document-feature matrix of: 6 documents, 20,615 features (99.64% sparse) and 1 docvar.
## features
## docs oh god horrible right people involved unfortunately worth watching saw
## text1 1 1 1 1 1 3 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 1 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 1 0 0 2 0 0
## [ reached max_nfeat ... 20,605 more features ]
#passer en pondération binaire aussi
dfmTestBin <- quanteda::dfm_weight(dfmTestTf,scheme="boolean")
print(head(dfmTestBin))
## Document-feature matrix of: 6 documents, 20,615 features (99.64% sparse) and 1 docvar.
## features
## docs oh god horrible right people involved unfortunately worth watching saw
## text1 1 1 1 1 1 1 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 1 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 1 0 0 1 0 0
## [ reached max_nfeat ... 20,605 more features ]
Argh ! 3132 features sur l’échantillon d’apprentissage présenté au SVM, et ici nous avons 20615 features !
ETAPE TRES IMPORTANTE
#matching avec les features de l'échantillon d'apprentissage
#sinon, gros problème lors de l'appel de predict(.)
dfmTestBinMatched <- quanteda::dfm_match(dfmTestBin,features=featnames(dfmTrainBinSel))
print(dfmTestBinMatched)
## Document-feature matrix of: 1,000 documents, 3,132 features (97.81% sparse) and 1 docvar.
## features
## docs much say wonderful peter one tragic history final makes terrible
## text1 1 0 0 0 1 0 0 0 0 0
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 0 0 0 0 0
## text4 0 0 0 1 1 0 0 0 0 0
## text5 0 0 0 0 1 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 994 more documents, reached max_nfeat ... 3,122 more features ]
#pour rappel dfmTrain
print(head(dfmTrainBinSel))
## Document-feature matrix of: 6 documents, 3,132 features (98.20% sparse) and 1 docvar.
## features
## docs much say wonderful peter one tragic history final makes terrible
## text1 1 1 1 1 1 1 1 1 1 1
## text2 0 1 0 0 0 0 0 0 1 0
## text3 1 0 0 0 1 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 1 0 0 0 1 0 0 0 1 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 3,122 more features ]
#vérification du matching
#sur la matrice document-feature (terme)
print(sum(featnames(dfmTrainBinSel) != featnames(dfmTestBinMatched)))
## [1] 0
#transformer en data frame
dataTest <- quanteda::convert(dfmTestBinMatched,to='data.frame')
print(head(colnames(dataTest)))
## [1] "doc_id" "much" "say" "wonderful" "peter" "one"
#retirer doc_id
dataTest$doc_id <- NULL
#rajouter les labels
dataTest['LABEL'] <- factor(DTest$label)
print(tail(colnames(dataTest)))
## [1] "matthau" "winner" "crafted" "flawless" "chilling" "LABEL"
#vérification de la cohérence encore une fois
#sur le data.frame cette fois-ci
print(sum(colnames(dataTrain) != colnames(dataTest)))
## [1] 0
#ok macthing - on peut réaliser le predict
predLABEL <- predict(clf,newdata=dataTest)
print(table(predLABEL))
## predLABEL
## neg pos
## 535 465
#matrice de confusion
mc <- table(dataTest$LABEL,predLABEL)
print(mc)
## predLABEL
## neg pos
## neg 425 75
## pos 110 390
#accuracy - taux de reconnaissance
acc <- sum(diag(mc))/sum(mc)
print(acc)
## [1] 0.815
#rappel
rappel <- mc['pos','pos']/sum(mc['pos',])
print(rappel)
## [1] 0.78
#précision
precision <- mc['pos','pos']/sum(mc[,'pos'])
print(precision)
## [1] 0.8387097
#F1-Score
F1 <- 2.0*(precision*rappel)/(precision+rappel)
print(F1)
## [1] 0.8082902