#clear the memory
rm(list=ls())
#change the working directory
setwd("C:/Users/ricco/Desktop/demo")
#load the packages of the tidyverse
#dplyr in particular
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#load the data
#with the readxl package
D <- readxl::read_excel("imdb_reviews_1000.xlsx")
str(D)
## tibble [2,000 x 2] (S3: tbl_df/tbl/data.frame)
## $ label : chr [1:2000] "neg" "neg" "neg" "neg" ...
## $ commentaires: chr [1:2000] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but "| __truncated__ "Race car drivers say that 100 mph seems fast till you've driven 150, and 150 mph seems fast till you've driven "| __truncated__ "I found this film to be quite an oddity. From the very get go I found it extremely hard to like this movie, and"| __truncated__ "Jane Russell proved to be a delightful musical-comedy performer in the similarly titled \"Gentlemen Prefer Blon"| __truncated__ ...
#first documents of the dataset
head(D)
#inspect the class distribution
print(table(D$label))
##
## neg pos
## 1000 1000
Presence of the HTML "line break" tag in the comments.
#occurrences of '<br />' in the comments
tmp <- grep("<br />",D$commentaires)
print(length(tmp))
## [1] 1190
#which ones
print(head(tmp))
## [1] 1 2 4 5 8 10
#the first one, for instance
print(D$commentaires[1])
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.<br /><br />The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.<br /><br />I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.<br /><br />Peter, why couldn't you have stopped at BEING THERE?"
#the third one, for instance
print(D$commentaires[3])
## [1] "I found this film to be quite an oddity. From the very get go I found it extremely hard to like this movie, and now after a little thinking about it I can pretty much pinpoint the reason why. Jean-Marc Barr, although I love him to bits (I think Zentropa is one of the best movies ever made) is quite miscast here, and although I can't figure for the life of me who would be better, I am sure someone could have taken his place quite easily and make this film work. Everything else is fine, except for the stabs at weak comedy (A Meet The Parents Joke is not really needed, filmmakers!) and I really like Richard E. Grant as the British Major. It just suffers from one thing.. Jean-Marc."
Presence of digits in the comments.
#presence of digits
#see regular expressions in R
#http://tutoriels-data-mining.blogspot.com/2017/01/les-expression-regulieres-sous-r.html
tmp <- grep('[0-9]',D$commentaires)
print(length(tmp))
## [1] 1139
#which ones
print(head(tmp))
## [1] 2 4 8 9 10 12
#the 2nd message
print(D$commentaires[2])
## [1] "Race car drivers say that 100 mph seems fast till you've driven 150, and 150 mph seems fast till you've driven 250.<br /><br />OK.<br /><br />Andalusian Dog seems breathtakingly bizarre till you've seen Eraserhead, and Eraserhead seems breathtakingly bizarre till you've seen Begotten.<br /><br />And Begotten seems breathtakingly bizarre till you've seen the works of C. Frederic Hobbs. Race fans, there is NOTHING in all the world of film like the works of C. Frederic Hobbs.<br /><br />Alabama's Ghost comes as close as any of his films to having a coherent plot, and it only involves hippies, rock concerts, voodoo, ghosts, vampires, robots, magicians, corrupt multinational corporations, elephants and Mystery Gas. And the Fabulous Woodmobile, cruising the Sunset District in San Francisco, of course.<br /><br />What's really startling is that somebody gave him a LOT of money to make Alabama's Ghost. There's sets, lighting, hundreds of extras, costumes, lots and lots of effects. Somehow that makes Alabama's Ghost SO WRONG. You watch some awful cheeseball like Night of Horror or Plutonium Baby, and at least some part of the weirdness is excusable on the basis that they were obviously making the film off the headroom on their Discover cards. But Alabama's Ghost was made with an actual budget, and that's EVIL. I mean, I've got a script about a tribe of cannibals living in Thunder Bay, Ontario, building a secret temple in the woods out of Twizzlers, and nobody's beating down MY door waving a checkbook - how did this guy get the funds for FOUR of the flakiest movies ever made?"
#the first one
print(D$commentaires[1])
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.<br /><br />The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.<br /><br />I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.<br /><br />Peter, why couldn't you have stopped at BEING THERE?"
#specifically load the tidytext library
library(tidytext)
Numbering the documents makes it easier to understand the associations between terms and documents later on.
#modify the structure of the tibble
#by numbering the documents
DPRIM <- tibble(line=1:nrow(D),commentaires=D$commentaires)
print(head(DPRIM))
## # A tibble: 6 x 2
## line commentaires
## <int> <chr>
## 1 1 "I don't have much to say about this movie. It could have been a wonder~
## 2 2 "Race car drivers say that 100 mph seems fast till you've driven 150, a~
## 3 3 "I found this film to be quite an oddity. From the very get go I found ~
## 4 4 "Jane Russell proved to be a delightful musical-comedy performer in the~
## 5 5 "This movie makes Canadians and Brits out to be asinine, moronic idiots~
## 6 6 "I'm a great admirer of Lon Chaney, but the screen writing of this movi~
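Note that the same numbering can be obtained with dplyr verbs alone; a quick sketch (the name DPRIM_alt is ours):
#sketch: number the documents with mutate() and row_number()
DPRIM_alt <- D %>%
  select(commentaires) %>%
  mutate(line = row_number(), .before = commentaires)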
Transformation of the documents into a "tidy" structure, listing the words of each document. The information about word order is lost. Note, however, that the case has been changed and the punctuation has been removed.
#process the raw text
res_A <- DPRIM %>%
unnest_tokens(output=word,input=commentaires)
#
print(res_A)
## # A tibble: 467,429 x 2
## line word
## <int> <chr>
## 1 1 i
## 2 1 don't
## 3 1 have
## 4 1 much
## 5 1 to
## 6 1 say
## 7 1 about
## 8 1 this
## 9 1 movie
## 10 1 it
## # ... with 467,419 more rows
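By default, unnest_tokens() lower-cases the text and strips the punctuation. Both behaviors can be overridden; a sketch, assuming the strip_punct option is forwarded to the underlying word tokenizer (tokenizers::tokenize_words):
#sketch: keep the original case and the punctuation
res_raw <- DPRIM %>%
  unnest_tokens(output=word,input=commentaires,to_lower=FALSE,strip_punct=FALSE)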
#restricting ourselves to the words of the first document
mots_1 <- res_A %>%
filter(line==1) %>%
select(word)
print(mots_1$word)
## [1] "i" "don't" "have" "much" "to"
## [6] "say" "about" "this" "movie" "it"
## [11] "could" "have" "been" "a" "wonderful"
## [16] "tour" "de" "force" "for" "peter"
## [21] "sellers" "but" "it" "is" "one"
## [26] "of" "the" "most" "tragic" "misfires"
## [31] "in" "movie" "history" "that" "it"
## [36] "was" "sellers" "final" "movie" "makes"
## [41] "it" "all" "the" "more" "painful"
## [46] "br" "br" "the" "terrible" "screenplay"
## [51] "direction" "and" "shockingly" "wooden" "performances"
## [56] "all" "come" "dreadfully" "together" "to"
## [61] "make" "this" "one" "of" "the"
## [66] "most" "unwatchably" "awful" "movies" "ever"
## [71] "made" "br" "br" "i" "wish"
## [76] "so" "much" "that" "i" "could"
## [81] "find" "even" "a" "snicker" "or"
## [86] "a" "chuckle" "buried" "somewhere" "in"
## [91] "this" "pile" "of" "putrid" "blubber"
## [96] "but" "it's" "a" "lifeless" "humorless"
## [101] "disaster" "the" "truth" "hurts" "br"
## [106] "br" "peter" "why" "couldn't" "you"
## [111] "have" "stopped" "at" "being" "there"
#as a reminder, the 1st document was...
print(D$commentaires[1])
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.<br /><br />The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.<br /><br />I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.<br /><br />Peter, why couldn't you have stopped at BEING THERE?"
#within the first document
mots_1 %>%
group_by(word) %>%
count(sort=TRUE)
#count the terms per document
compte_A <- res_A %>%
group_by(line,word) %>%
summarize(freq=n())
## `summarise()` has grouped output by 'line'. You can override using the `.groups` argument.
print(compte_A)
## # A tibble: 281,109 x 3
## # Groups: line [2,000]
## line word freq
## <int> <chr> <int>
## 1 1 a 4
## 2 1 about 1
## 3 1 all 2
## 4 1 and 1
## 5 1 at 1
## 6 1 awful 1
## 7 1 been 1
## 8 1 being 1
## 9 1 blubber 1
## 10 1 br 6
## # ... with 281,099 more rows
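The same counts can be obtained more compactly with count(), which also returns an ungrouped result and thus avoids the `.groups` message; a sketch:
#sketch: equivalent term counting with count()
compte_A_bis <- res_A %>%
  count(line,word,name="freq")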
#dictionary - counts
#sorted by decreasing frequency
dico_A <- res_A %>%
count(word,sort=TRUE)
#display
print(dico_A)
## # A tibble: 26,495 x 2
## word n
## <chr> <int>
## 1 the 26766
## 2 and 12860
## 3 a 12706
## 4 of 11611
## 5 to 10527
## 6 is 8443
## 7 br 8258
## 8 in 7454
## 9 it 6289
## 10 i 6100
## # ... with 26,485 more rows
#the least frequent terms
print(tail(dico_A))
## # A tibble: 6 x 2
## word n
## <chr> <int>
## 1 zorro 1
## 2 zouzou 1
## 3 zucker 1
## 4 zues 1
## 5 zunz 1
## 6 zvonimir 1
English stopwords, shipped with the tidytext package.
#retrieve the stopwords
data("stop_words")
print(stop_words)
## # A tibble: 1,149 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
## 7 accordingly SMART
## 8 across SMART
## 9 actually SMART
## 10 after SMART
## # ... with 1,139 more rows
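stop_words merges three lexicons (SMART, snowball and onix). To rely on a single one, filter on the lexicon column; a sketch:
#sketch: keep only the SMART stopword list
smart_only <- stop_words %>%
  filter(lexicon == "SMART")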
Without the digits, the "line break" tag and the stopwords.
#second version, without the digits and the br tags
res_B <- DPRIM %>%
mutate(text=gsub(x=commentaires,pattern="[0-9]",replacement="")) %>%
mutate(text=gsub(x=text,pattern="<br />",replacement="")) %>%
unnest_tokens(output=word,input=text) %>%
filter(!word %in% stop_words$word) %>%
select(line,word)
#display
print(res_B)
## # A tibble: 172,664 x 2
## line word
## <int> <chr>
## 1 1 movie
## 2 1 wonderful
## 3 1 tour
## 4 1 de
## 5 1 force
## 6 1 peter
## 7 1 sellers
## 8 1 tragic
## 9 1 misfires
## 10 1 movie
## # ... with 172,654 more rows
#check on the first document
print(res_B[res_B$line==1,]$word)
## [1] "movie" "wonderful" "tour" "de" "force"
## [6] "peter" "sellers" "tragic" "misfires" "movie"
## [11] "history" "sellers" "final" "movie" "makes"
## [16] "painful.the" "terrible" "screenplay" "direction" "shockingly"
## [21] "wooden" "performances" "dreadfully" "unwatchably" "awful"
## [26] "movies" "made.i" "snicker" "chuckle" "buried"
## [31] "pile" "putrid" "blubber" "lifeless" "humorless"
## [36] "disaster" "truth" "hurts.peter" "stopped"
#for comparison
DPRIM$commentaires[1]
## [1] "I don't have much to say about this movie. It could have been a wonderful tour-de-force for Peter Sellers, but it is one of the most tragic misfires in movie history. That it was Sellers final movie makes it all the more painful.<br /><br />The terrible screenplay, direction and shockingly wooden performances all come dreadfully together to make this one of the most unwatchably awful movies ever made.<br /><br />I wish so much that I could find even a snicker or a chuckle buried somewhere in this pile of putrid blubber, but it's a lifeless, humorless disaster. The truth hurts.<br /><br />Peter, why couldn't you have stopped at BEING THERE?"
#dictionary
dico_B <- res_B %>%
count(word,sort=TRUE)
print(dico_B)
## # A tibble: 27,759 x 2
## word n
## <chr> <int>
## 1 movie 3421
## 2 film 2991
## 3 time 912
## 4 story 906
## 5 people 747
## 6 bad 706
## 7 characters 560
## 8 movies 547
## 9 films 542
## 10 watch 526
## # ... with 27,749 more rows
With a small "wordcloud" to make things pretty…
#a small wordcloud here
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(words=dico_B$word,freq=dico_B$n,max.words=50,colors=brewer.pal(8,'Dark2'))
A matrix with the documents as rows, the terms as columns, and the frequency of the terms in the documents as values.
#reminder: the tidy structure
print(head(res_B))
## # A tibble: 6 x 2
## line word
## <int> <chr>
## 1 1 movie
## 2 1 wonderful
## 3 1 tour
## 4 1 de
## 5 1 force
## 6 1 peter
#count the terms per document
compte_B <- res_B %>%
group_by(line,word) %>%
summarize(freq=n())
## `summarise()` has grouped output by 'line'. You can override using the `.groups` argument.
print(compte_B)
## # A tibble: 145,871 x 3
## # Groups: line [2,000]
## line word freq
## <int> <chr> <int>
## 1 1 awful 1
## 2 1 blubber 1
## 3 1 buried 1
## 4 1 chuckle 1
## 5 1 de 1
## 6 1 direction 1
## 7 1 disaster 1
## 8 1 dreadfully 1
## 9 1 final 1
## 10 1 force 1
## # ... with 145,861 more rows
#the "tm" library is required
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
#"cast" into a DTM (weighting = frequency)
#other weightings are possible, e.g. TF-IDF
#cf. https://www.rdocumentation.org/packages/tidytext/versions/0.3.2/topics/bind_tf_idf
mdt_B <- compte_B %>%
cast_dtm(document = line, term = word, value = freq)
#display
print(mdt_B)
## <<DocumentTermMatrix (documents: 2000, terms: 27759)>>
## Non-/sparse entries: 145871/55372129
## Sparsity : 100%
## Maximal term length: 36
## Weighting : term frequency (tf)
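As noted above, TF-IDF is another possible weighting. A sketch with tidytext::bind_tf_idf() (the name tfidf_B is ours):
#sketch: TF-IDF weighting computed from the tidy counts
#the resulting tf_idf column could then be cast with value = tf_idf
tfidf_B <- compte_B %>%
  ungroup() %>%
  bind_tf_idf(term=word,document=line,n=freq)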
#in "matrix" format
#easier for us to manipulate
mat_B <- as.matrix(mdt_B)
#class
print(class(mat_B))
## [1] "matrix" "array"
#dimensions
print(dim(mat_B))
## [1] 2000 27759
#term counts for document 1
print(mat_B[1,which(mat_B[1,]>0)])
## awful blubber buried chuckle de direction
## 1 1 1 1 1 1
## disaster dreadfully final force history humorless
## 1 1 1 1 1 1
## hurts.peter lifeless made.i makes misfires movie
## 1 1 1 1 1 3
## movies painful.the performances peter pile putrid
## 1 1 1 1 1 1
## screenplay sellers shockingly snicker stopped terrible
## 1 2 1 1 1 1
## tour tragic truth unwatchably wonderful wooden
## 1 1 1 1 1 1
#let's check
tmp <- compte_B %>%
filter(line==1)
#display
tmpvec <- tmp$freq
names(tmpvec) <- tmp$word
print(tmpvec)
## awful blubber buried chuckle de direction
## 1 1 1 1 1 1
## disaster dreadfully final force history humorless
## 1 1 1 1 1 1
## hurts.peter lifeless made.i makes misfires movie
## 1 1 1 1 1 3
## movies painful.the performances peter pile putrid
## 1 1 1 1 1 1
## screenplay sellers shockingly snicker stopped terrible
## 1 2 1 1 1 1
## tour tragic truth unwatchably wonderful wooden
## 1 1 1 1 1 1
#as a reminder
head(dico_B,20)
Redo the computation from the document-term matrix.
print(head(sort(colSums(mat_B),decreasing=TRUE),20))
## movie film time story people bad characters
## 3421 2991 912 906 747 706 560
## movies films watch life love character plot
## 547 542 526 525 510 498 489
## acting scenes scene real actors watching
## 489 439 424 386 350 331
Number of documents in which each term appears at least once.
#document counts
app_termes <- apply(mat_B,2,function(x){sum(x>0)})
#sorted display
print(head(sort(app_termes,decreasing=TRUE),20))
## movie film time story people bad watch
## 1204 1085 653 619 489 461 434
## acting characters movies plot films life character
## 411 407 392 386 372 369 368
## love scenes scene real makes actors
## 362 314 308 307 284 277
Remove from the matrix the terms that appear in too few or in too many documents.
#condition on the columns
mat_B_filtered <- mat_B[,app_termes > 10 & app_termes <1000]
#dimensions
print(dim(mat_B_filtered))
## [1] 2000 2529
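As an alternative for the rare-term side only, tm provides removeSparseTerms(), which drops the terms whose sparsity exceeds a threshold (it does not handle overly frequent terms); a sketch with an illustrative threshold:
#sketch: drop terms absent from more than 99% of the documents
mdt_B_small <- tm::removeSparseTerms(mdt_B,sparse=0.99)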
A few leads for exploiting the document-term matrix.
Let's not forget that the documents are labeled (positive or negative comments). The idea is to determine whether the terms are more often associated with positive or with negative documents.
We only work with the filtered terms, using binary weighting.
#binary weighting
mat_C <- ifelse(mat_B_filtered>0,1,0)
#convert the matrix into a data.frame
df_C <- as.data.frame(mat_C)
print(class(df_C))
## [1] "data.frame"
#conditional computation
sum_per_class <- aggregate(x=df_C,by=list(D$label),sum)
print(sum_per_class[,1:15])
## Group.1 awful de direction disaster final force history makes movies
## 1 neg 92 13 57 17 34 12 36 119 217
## 2 pos 12 16 53 9 53 24 42 165 175
## performances peter pile screenplay stopped
## 1 43 22 12 21 8
## 2 95 22 0 19 8
#temporary structure
tmp <- as.matrix(sum_per_class[,2:ncol(sum_per_class)])
row.names(tmp) <- sum_per_class$Group.1
#proportion of positives
prop_pos <- tmp['pos',]/colSums(tmp)
print(sort(prop_pos)[1:15])
## pile sucks atrocious rubbish figured redeeming poorly
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.03703704 0.04166667
## clue waste stupidity soap horrid amateurish mess
## 0.05000000 0.05940594 0.06250000 0.07142857 0.07692308 0.07692308 0.08163265
## chick
## 0.08333333
#e.g. the case of 'waste'
print(tmp[,'waste'])
## neg pos
## 95 6
We are not really in a predictive scheme here, but rather a descriptive one. Otherwise, the data preparation process would have to be rethought.
#build a decision tree
library(rpart)
arbre <- rpart(D$label ~ ., data = df_C)
print(arbre)
## n= 2000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2000 1000 neg (0.5000000 0.5000000)
## 2) bad>=0.5 461 116 neg (0.7483731 0.2516269) *
## 3) bad< 0.5 1539 655 pos (0.4256010 0.5743990)
## 6) worst>=0.5 104 16 neg (0.8461538 0.1538462) *
## 7) worst< 0.5 1435 567 pos (0.3951220 0.6048780)
## 14) waste>=0.5 48 3 neg (0.9375000 0.0625000) *
## 15) waste< 0.5 1387 522 pos (0.3763518 0.6236482)
## 30) boring>=0.5 66 13 neg (0.8030303 0.1969697) *
## 31) boring< 0.5 1321 469 pos (0.3550341 0.6449659)
## 62) plot>=0.5 226 102 neg (0.5486726 0.4513274)
## 124) films< 0.5 163 58 neg (0.6441718 0.3558282) *
## 125) films>=0.5 63 19 pos (0.3015873 0.6984127) *
## 63) plot< 0.5 1095 345 pos (0.3150685 0.6849315)
## 126) awful>=0.5 27 4 neg (0.8518519 0.1481481) *
## 127) awful< 0.5 1068 322 pos (0.3014981 0.6985019) *
#nicer display
library(rpart.plot)
rpart.plot(arbre)
#proportion of positive documents for the "worst" terms
print(prop_pos[c('bad','worst','waste','boring')])
## bad worst waste boring
## 0.25162690 0.12371134 0.05940594 0.17699115
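For reference, the resubstitution accuracy of the tree can be computed from its fitted values; a sketch (measured on the training data itself, hence optimistic):
#sketch: resubstitution accuracy of the tree
pred_arbre <- predict(arbre,type="class")
print(mean(pred_arbre == D$label))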
Using the polarities of the terms to infer the polarity of the documents.
#lexicon for sentiment analysis
#term polarities - get_sentiments() from tidytext
#"bing" is one possible source, there are others
polarite_termes <- get_sentiments("bing")
print(polarite_termes)
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
Polarity of a document = aggregation of the polarities of the terms that compose it… provided those terms are listed in the lexicon, of course. No train-test partition is needed here because the class (label) is not used during the "modeling". The approach is unsupervised by nature, so there is no risk of overfitting.
Computation of the polarity of the documents using the sentiment lexicon, step by step.
#listing of the terms per document
res_B %>%
filter(line==1)
#for one document, join with the lexicon
res_B %>%
filter(line==1) %>%
left_join(polarite_termes,by="word")
#remove the NAs, which must not be counted
res_B %>%
filter(line==1) %>%
left_join(polarite_termes,by="word") %>%
filter(!is.na(sentiment))
#tally the polarities
res_B %>%
filter(line==1) %>%
left_join(polarite_termes,by="word") %>%
filter(!is.na(sentiment)) %>%
count(sentiment)
All that remains is to run the operation over all the documents.
#polarity of the documents deduced from the terms
#that compose them
#--> proportion of positive terms
pol_per_doc <- res_B %>%
group_by(line) %>%
left_join(polarite_termes,by="word") %>%
filter(!is.na(sentiment)) %>%
count(sentiment) %>%
summarise(freq_pos=n[sentiment=='positive']/sum(n))
## `summarise()` has grouped output by 'line'. You can override using the `.groups` argument.
#display
print(pol_per_doc)
## # A tibble: 1,916 x 2
## # Groups: line [1,916]
## line freq_pos
## <int> <dbl>
## 1 1 0.125
## 2 2 0.444
## 3 3 0.375
## 4 4 0.375
## 5 5 0.185
## 6 6 0.429
## 7 7 0.321
## 8 8 0.3
## 9 9 0.231
## 10 10 0.1
## # ... with 1,906 more rows
Beware: some documents are not listed (1,916 documents counted in total) because they contain no "positive" term.
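A sketch to identify those missing documents (the name manquants is ours):
#sketch: ids of the documents absent from pol_per_doc
manquants <- setdiff(1:nrow(D),pol_per_doc$line)
print(length(manquants))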
Prediction by comparison with the 0.5 threshold, i.e. a comment is considered positive if at least half of its (listed) terms are positive.
#prediction on this basis
pred_polarite <- rep(0,nrow(D))
#assignment -- taking the unlisted documents into account
pred_polarite[pol_per_doc$line] <- pol_per_doc$freq_pos
#predicted class
class_pred_polarite <- ifelse(pred_polarite >= 0.5,"positive","negative")
print(table(class_pred_polarite))
## class_pred_polarite
## negative positive
## 1250 750
Comparison with the observed classes.
#confusion matrix
mc <-table(D$label,class_pred_polarite)
print(mc)
## class_pred_polarite
## negative positive
## neg 846 154
## pos 404 596
#accuracy
print(sum(diag(mc))/sum(mc))
## [1] 0.721
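From the confusion matrix above we can also read the per-class recall: negative comments are recovered much better (846/1000) than positive ones (596/1000). A sketch:
#sketch: recall per class
print(diag(mc)/rowSums(mc))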