Chargement, inspection et nettoyage du corpus¶

In [ ]:
# Change the working directory.
# NOTE(review): machine-specific absolute path - adapt it before running elsewhere.
import os
os.chdir("C:/Users/ricco/Desktop/demo")

# Load the corpus: one document per line (the trailing "\n" is kept by readlines()).
# The context manager closes the file even if reading fails, and the explicit
# encoding stops the result from depending on the platform default.
# Assumes the file is UTF-8 encoded - TODO confirm for a different corpus.
with open("./corpus/cooking.stackexchange.txt", "r", encoding="utf-8") as f:
    corpus = f.readlines()
In [ ]:
# Number of lines (one document per line) in the corpus
print(len(corpus))
15404
In [ ]:
# First five lines, exactly as read from the file
print(corpus[:5])
['__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?\n', '__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments\n', '__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?\n', '__label__restaurant Michelin Three Star Restaurant; but if the chef is not there\n', '__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?\n']
In [ ]:
# Strip the newline characters from every document of the list
corpus_bis = list(map(lambda ligne: ligne.replace('\n', ''), corpus))
print(corpus_bis[:5])
['__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?', '__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments', '__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?', '__label__restaurant Michelin Three Star Restaurant; but if the chef is not there', '__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?']
In [ ]:
# Convert every document to lowercase
corpus_ter = list(map(str.lower, corpus_bis))
print(corpus_ter[:5])
['__label__sauce __label__cheese how much does potato starch affect a cheese sauce recipe?', '__label__food-safety __label__acidity dangerous pathogens capable of growing in acidic environments', '__label__cast-iron __label__stove how do i cover up the white spots on my cast iron stove?', '__label__restaurant michelin three star restaurant; but if the chef is not there', '__label__knife-skills __label__dicing without knife skills, how can i quickly and accurately dice vegetables?']
In [ ]:
# List of the punctuation characters, one list item per character
import string
ponctuations = [*string.punctuation]

print(ponctuations)
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
In [ ]:
# Keep "_" out of the punctuation list, otherwise the "__label__" prefixes
# would be destroyed when punctuation is stripped from the documents.
# The list is rebuilt with a filter instead of list.remove(): the cell can be
# re-run safely (remove() raises ValueError once "_" is already gone).
ponctuations = [p for p in string.punctuation if p != '_']

print(ponctuations)
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~']
In [ ]:
# Strip the punctuation characters from every document.
# A deletion table applied with str.translate removes them in a single pass,
# instead of testing each character against the list one by one.
# NOTE: "-" is part of the list, so hyphens inside labels are removed as well
# (e.g. "__label__food-safety" becomes "__label__foodsafety").
suppression = str.maketrans('', '', ''.join(ponctuations))
corpus_quater = [doc.translate(suppression) for doc in corpus_ter]
print(corpus_quater[:5])
['__label__sauce __label__cheese how much does potato starch affect a cheese sauce recipe', '__label__foodsafety __label__acidity dangerous pathogens capable of growing in acidic environments', '__label__castiron __label__stove how do i cover up the white spots on my cast iron stove', '__label__restaurant michelin three star restaurant but if the chef is not there', '__label__knifeskills __label__dicing without knife skills how can i quickly and accurately dice vegetables']

Préparation des données, échantillons train-test¶

In [ ]:
# Wrap the cleaned documents in a single-column data frame named "docs"
import pandas
df = pandas.DataFrame({'docs': corpus_quater})
df.head()
Out[ ]:
docs
0 __label__sauce __label__cheese how much does p...
1 __label__foodsafety __label__acidity dangerous...
2 __label__castiron __label__stove how do i cove...
3 __label__restaurant michelin three star restau...
4 __label__knifeskills __label__dicing without k...
In [ ]:
# Train/test partition: 10404 documents for training, the rest for testing;
# the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split
dfTrain, dfTest = train_test_split(df, random_state=0, train_size=10404)

# Sanity check: dimensions of both samples, one per line
print(dfTrain.shape, dfTest.shape, sep='\n')
(10404, 1)
(5000, 1)
In [ ]:
# Save the training sample as plain text: "docs" column only,
# without header row and without the index
dfTrain.to_csv(
    "corpus_train.txt",
    columns=['docs'],
    header=False,
    index=False,
)
In [ ]:
# Save the test sample as plain text: "docs" column only,
# without header row and without the index
dfTest.to_csv(
    "corpus_test.txt",
    columns=['docs'],
    header=False,
    index=False,
)

Première modélisation - Inspection des résultats¶

In [ ]:
# fastText library
import fasttext

# Supervised training with explicit hyperparameters - the model is indeed a neural network
# see https://fasttext.cc/docs/en/python-module.html#train_supervised-parameters
modele_one = fasttext.train_supervised(
    input="corpus_train.txt",
    dim=50,
    lr=1.0,
    epoch=25,
)
In [ ]:
# Number of classes (labels) known to the model
print(len(modele_one.get_labels()))
730
In [ ]:
# List of the labels
# NOTE(review): this displays every label of the model; consider slicing the list to keep the output short
modele_one.get_labels()
Out[ ]:
['__label__baking',
 '__label__foodsafety',
 '__label__substitutions',
 '__label__equipment',
 '__label__bread',
 '__label__chicken',
 '__label__storagemethod',
 '__label__meat',
 '__label__eggs',
 '__label__cake',
 '__label__sauce',
 '__label__freezing',
 '__label__flavor',
 '__label__vegetables',
 '__label__storagelifetime',
 '__label__cheese',
 '__label__chocolate',
 '__label__frying',
 '__label__oven',
 '__label__fruit',
 '__label__coffee',
 '__label__foodscience',
 '__label__temperature',
 '__label__dough',
 '__label__oil',
 '__label__sugar',
 '__label__beef',
 '__label__cookies',
 '__label__soup',
 '__label__pasta',
 '__label__fish',
 '__label__foodpreservation',
 '__label__spices',
 '__label__cleaning',
 '__label__potatoes',
 '__label__salt',
 '__label__milk',
 '__label__rice',
 '__label__pork',
 '__label__butter',
 '__label__flour',
 '__label__pizza',
 '__label__grilling',
 '__label__dessert',
 '__label__cookingtime',
 '__label__tea',
 '__label__steak',
 '__label__slowcooking',
 '__label__refrigerator',
 '__label__microwave',
 '__label__boiling',
 '__label__indiancuisine',
 '__label__cookware',
 '__label__yeast',
 '__label__beans',
 '__label__castiron',
 '__label__tomatoes',
 '__label__onions',
 '__label__canning',
 '__label__roasting',
 '__label__candy',
 '__label__pan',
 '__label__italiancuisine',
 '__label__chilipeppers',
 '__label__pie',
 '__label__storage',
 '__label__barbecue',
 '__label__asiancuisine',
 '__label__sousvide',
 '__label__culinaryuses',
 '__label__stock',
 '__label__garlic',
 '__label__cream',
 '__label__icecream',
 '__label__deepfrying',
 '__label__language',
 '__label__wine',
 '__label__fats',
 '__label__knives',
 '__label__sourdough',
 '__label__ingredientselection',
 '__label__measurements',
 '__label__water',
 '__label__marinade',
 '__label__alcohol',
 '__label__herbs',
 '__label__turkey',
 '__label__drinks',
 '__label__fermentation',
 '__label__yogurt',
 '__label__chinesecuisine',
 '__label__foodidentification',
 '__label__nuts',
 '__label__gelatin',
 '__label__vinegar',
 '__label__thickening',
 '__label__seasoning',
 '__label__pastry',
 '__label__reheating',
 '__label__vegetarian',
 '__label__japanesecuisine',
 '__label__shopping',
 '__label__curry',
 '__label__texture',
 '__label__seafood',
 '__label__mexicancuisine',
 '__label__nutrientcomposition',
 '__label__restaurantmimicry',
 '__label__chemistry',
 '__label__roast',
 '__label__sushi',
 '__label__noodles',
 '__label__sauteing',
 '__label__knifeskills',
 '__label__vegan',
 '__label__steaming',
 '__label__bacon',
 '__label__fryingpan',
 '__label__sausages',
 '__label__smoking',
 '__label__brining',
 '__label__frozen',
 '__label__pickling',
 '__label__hamburgers',
 '__label__stove',
 '__label__mushrooms',
 '__label__lemon',
 '__label__utensils',
 '__label__pancakes',
 '__label__corn',
 '__label__broth',
 '__label__batter',
 '__label__juice',
 '__label__glutenfree',
 '__label__jam',
 '__label__crust',
 '__label__lamb',
 '__label__coconut',
 '__label__moleculargastronomy',
 '__label__chickenbreast',
 '__label__crockpot',
 '__label__spicyhot',
 '__label__cocktails',
 '__label__dairy',
 '__label__salmon',
 '__label__salad',
 '__label__pressurecooker',
 '__label__cutting',
 '__label__oliveoil',
 '__label__honey',
 '__label__fresh',
 '__label__stews',
 '__label__resources',
 '__label__apples',
 '__label__espresso',
 '__label__recipescaling',
 '__label__vanilla',
 '__label__eggwhites',
 '__label__groundbeef',
 '__label__mold',
 '__label__meltingchocolate',
 '__label__pot',
 '__label__brownies',
 '__label__defrosting',
 '__label__nonstick',
 '__label__bananas',
 '__label__cheesecake',
 '__label__seeds',
 '__label__spoilage',
 '__label__mixing',
 '__label__frenchcuisine',
 '__label__thaicuisine',
 '__label__dehydrating',
 '__label__allergy',
 '__label__duck',
 '__label__cheesemaking',
 '__label__raw',
 '__label__icing',
 '__label__carrots',
 '__label__pleaseremovethistag',
 '__label__syrup',
 '__label__heat',
 '__label__ginger',
 '__label__rawmeat',
 '__label__ricecooker',
 '__label__seasoningpans',
 '__label__drying',
 '__label__smell',
 '__label__whippedcream',
 '__label__custard',
 '__label__sandwich',
 '__label__bones',
 '__label__tofu',
 '__label__meringue',
 '__label__ribs',
 '__label__blender',
 '__label__caramelization',
 '__label__chili',
 '__label__middleeasterncuisine',
 '__label__caramel',
 '__label__starter',
 '__label__roux',
 '__label__beer',
 '__label__peeling',
 '__label__conversion',
 '__label__emulsion',
 '__label__basics',
 '__label__gas',
 '__label__acidity',
 '__label__stainlesssteel',
 '__label__color',
 '__label__decorating',
 '__label__stirfry',
 '__label__bakingpowder',
 '__label__citrus',
 '__label__grinding',
 '__label__soy',
 '__label__pumpkin',
 '__label__maintenance',
 '__label__frenchfries',
 '__label__muffins',
 '__label__gellingagents',
 '__label__foodprocessing',
 '__label__cookbook',
 '__label__shrimp',
 '__label__safety',
 '__label__ham',
 '__label__oranges',
 '__label__beverages',
 '__label__breakfast',
 '__label__wok',
 '__label__cutofmeat',
 '__label__chickenstock',
 '__label__cupcakes',
 '__label__almonds',
 '__label__ripe',
 '__label__wheat',
 '__label__popcorn',
 '__label__oats',
 '__label__dutchoven',
 '__label__budgetcooking',
 '__label__spaghetti',
 '__label__pairing',
 '__label__bakingsoda',
 '__label__pizzastone',
 '__label__chips',
 '__label__squash',
 '__label__mayonnaise',
 '__label__avocados',
 '__label__kneading',
 '__label__sharpening',
 '__label__convection',
 '__label__peanuts',
 '__label__fire',
 '__label__eggplant',
 '__label__sweetpotatoes',
 '__label__pudding',
 '__label__saladdressing',
 '__label__frosting',
 '__label__hotsauce',
 '__label__thermometer',
 '__label__cabbage',
 '__label__rising',
 '__label__jerky',
 '__label__vacuum',
 '__label__grains',
 '__label__glass',
 '__label__lasagna',
 '__label__kitchensafety',
 '__label__presentation',
 '__label__americancuisine',
 '__label__omelette',
 '__label__lentils',
 '__label__coloring',
 '__label__organization',
 '__label__extracts',
 '__label__spinach',
 '__label__risotto',
 '__label__thanksgiving',
 '__label__hardboiledeggs',
 '__label__serving',
 '__label__gravy',
 '__label__brownsugar',
 '__label__cinnamon',
 '__label__spanishcuisine',
 '__label__cocoa',
 '__label__liver',
 '__label__history',
 '__label__consistency',
 '__label__infusion',
 '__label__basil',
 '__label__breadcrumbs',
 '__label__caffeine',
 '__label__creamcheese',
 '__label__menuplanning',
 '__label__soaking',
 '__label__dairyfree',
 '__label__cremebrulee',
 '__label__pineapple',
 '__label__bellpeppers',
 '__label__ganache',
 '__label__spongecake',
 '__label__pectin',
 '__label__poaching',
 '__label__tempering',
 '__label__buttermilk',
 '__label__waffle',
 '__label__soymilk',
 '__label__jelly',
 '__label__cornstarch',
 '__label__roastbeef',
 '__label__starch',
 '__label__electricstoves',
 '__label__puffpastry',
 '__label__souffle',
 '__label__meatballs',
 '__label__lowcarb',
 '__label__restaurant',
 '__label__soda',
 '__label__butchering',
 '__label__curing',
 '__label__smoothie',
 '__label__chickpeas',
 '__label__broiler',
 '__label__standmixer',
 '__label__skillet',
 '__label__cuttingboards',
 '__label__quickbread',
 '__label__braising',
 '__label__tenderizing',
 '__label__reduction',
 '__label__thawing',
 '__label__camping',
 '__label__ratio',
 '__label__foodtransport',
 '__label__blueberries',
 '__label__moisture',
 '__label__pepper',
 '__label__polenta',
 '__label__juicing',
 '__label__melting',
 '__label__kefir',
 '__label__potroast',
 '__label__scrambledeggs',
 '__label__greens',
 '__label__mint',
 '__label__koreancuisine',
 '__label__condiments',
 '__label__filling',
 '__label__bechamel',
 '__label__sourdoughstarter',
 '__label__mousse',
 '__label__shortening',
 '__label__botulism',
 '__label__fudge',
 '__label__sourcream',
 '__label__cookingmyth',
 '__label__mozzarella',
 '__label__ceramic',
 '__label__truffles',
 '__label__poultry',
 '__label__vodka',
 '__label__lettuce',
 '__label__turkishcuisine',
 '__label__olive',
 '__label__carbonation',
 '__label__whipper',
 '__label__marshmallow',
 '__label__dumplings',
 '__label__induction',
 '__label__organic',
 '__label__mustard',
 '__label__fondant',
 '__label__propanegrill',
 '__label__kimchi',
 '__label__additives',
 '__label__tortilla',
 '__label__lemonjuice',
 '__label__strawberries',
 '__label__lowfat',
 '__label__carbonsteel',
 '__label__brisket',
 '__label__crepe',
 '__label__teflon',
 '__label__paneer',
 '__label__veal',
 '__label__catering',
 '__label__proofing',
 '__label__aluminumcookware',
 '__label__foodhistory',
 '__label__charcoal',
 '__label__scallops',
 '__label__germancuisine',
 '__label__packaging',
 '__label__foam',
 '__label__toffee',
 '__label__celery',
 '__label__shellfish',
 '__label__sauerkraut',
 '__label__venison',
 '__label__blanching',
 '__label__learning',
 '__label__mango',
 '__label__sorbet',
 '__label__quinoa',
 '__label__bulkcooking',
 '__label__maplesyrup',
 '__label__alfredo',
 '__label__evaporatedmilk',
 '__label__zucchini',
 '__label__calories',
 '__label__charcuterie',
 '__label__spherification',
 '__label__kosher',
 '__label__stickyrice',
 '__label__lobster',
 '__label__flavourpairings',
 '__label__macarons',
 '__label__cremefraiche',
 '__label__cucumbers',
 '__label__professional',
 '__label__seitan',
 '__label__produce',
 '__label__pasteurization',
 '__label__dryaging',
 '__label__porkshoulder',
 '__label__peaches',
 '__label__paella',
 '__label__experimental',
 '__label__nutbutters',
 '__label__tuna',
 '__label__sugarfree',
 '__label__culturaldifference',
 '__label__containers',
 '__label__casserole',
 '__label__applepie',
 '__label__ghee',
 '__label__salami',
 '__label__plating',
 '__label__burnt',
 '__label__skin',
 '__label__outdoorcooking',
 '__label__cod',
 '__label__smokeflavor',
 '__label__lime',
 '__label__tamales',
 '__label__gnocchi',
 '__label__highaltitude',
 '__label__mortar',
 '__label__frenchpress',
 '__label__fondue',
 '__label__beets',
 '__label__meltingsugar',
 '__label__ramen',
 '__label__sashimi',
 '__label__crumbcrust',
 '__label__broccoli',
 '__label__pate',
 '__label__raspberries',
 '__label__chia',
 '__label__doughnuts',
 '__label__biscuits',
 '__label__glaze',
 '__label__hollandaise',
 '__label__rye',
 '__label__children',
 '__label__quiche',
 '__label__toasting',
 '__label__glucosesyrup',
 '__label__porkchops',
 '__label__barbecuesauce',
 '__label__yolk',
 '__label__focaccia',
 '__label__maillard',
 '__label__servingsuggestion',
 '__label__efficiency',
 '__label__shortcuts',
 '__label__offal',
 '__label__cornedbeef',
 '__label__bouillon',
 '__label__plums',
 '__label__russiancuisine',
 '__label__leavening',
 '__label__grating',
 '__label__cauliflower',
 '__label__liqueur',
 '__label__tasting',
 '__label__salmonella',
 '__label__margarine',
 '__label__timing',
 '__label__flatbread',
 '__label__salsa',
 '__label__chickenwings',
 '__label__mash',
 '__label__peanutbutter',
 '__label__mussels',
 '__label__vitalwheatgluten',
 '__label__filtering',
 '__label__mistakes',
 '__label__vietnamesecuisine',
 '__label__sprouting',
 '__label__flowers',
 '__label__ceviche',
 '__label__almondmilk',
 '__label__puree',
 '__label__hominy',
 '__label__rum',
 '__label__msg',
 '__label__eggnoodles',
 '__label__stuffing',
 '__label__englishcuisine',
 '__label__balkancuisine',
 '__label__pestle',
 '__label__greekcuisine',
 '__label__handblender',
 '__label__chopping',
 '__label__okra',
 '__label__hummus',
 '__label__rolls',
 '__label__cherries',
 '__label__crab',
 '__label__kitchen',
 '__label__chestnuts',
 '__label__meatloaf',
 '__label__wood',
 '__label__hotdog',
 '__label__foilcooking',
 '__label__tart',
 '__label__grapes',
 '__label__flambe',
 '__label__masscooking',
 '__label__carbonara',
 '__label__culturedfood',
 '__label__chilling',
 '__label__bagels',
 '__label__gumbo',
 '__label__comparisons',
 '__label__dip',
 '__label__chutney',
 '__label__cilantro',
 '__label__mascarpone',
 '__label__chorizo',
 '__label__root',
 '__label__kebab',
 '__label__ravioli',
 '__label__websites',
 '__label__jewishcuisine',
 '__label__parmesan',
 '__label__watermelon',
 '__label__asparagus',
 '__label__conveniencefoods',
 '__label__toaster',
 '__label__blowtorch',
 '__label__koshersalt',
 '__label__confit',
 '__label__artichokes',
 '__label__measuringscales',
 '__label__milling',
 '__label__cubes',
 '__label__brusselssprouts',
 '__label__jalapeno',
 '__label__masa',
 '__label__chai',
 '__label__wasabi',
 '__label__cedarplank',
 '__label__aging',
 '__label__granola',
 '__label__knifesafety',
 '__label__sifting',
 '__label__untagged',
 '__label__cajuncuisine',
 '__label__melon',
 '__label__hungariancuisine',
 '__label__mutton',
 '__label__cost',
 '__label__griddle',
 '__label__sodium',
 '__label__saffron',
 '__label__rhubarb',
 '__label__pita',
 '__label__lemonade',
 '__label__tartare',
 '__label__flax',
 '__label__flan',
 '__label__chocolatetruffles',
 '__label__rabbit',
 '__label__ketchup',
 '__label__coppercookware',
 '__label__blindbaking',
 '__label__basting',
 '__label__barley',
 '__label__stoneware',
 '__label__peel',
 '__label__kiwifruit',
 '__label__flourtortilla',
 '__label__vitamins',
 '__label__buckwheat',
 '__label__potpie',
 '__label__pressurecanner',
 '__label__biga',
 '__label__fryer',
 '__label__romaniancuisine',
 '__label__pomegranate',
 '__label__bakerpercentage',
 '__label__skewers',
 '__label__kettle',
 '__label__parsley',
 '__label__carpaccio',
 '__label__faq',
 '__label__whiskey',
 '__label__oregano',
 '__label__pulses',
 '__label__dietaryrestriction',
 '__label__rolling',
 '__label__seasonal',
 '__label__pretzels',
 '__label__traditional',
 '__label__honeycomb',
 '__label__legumes',
 '__label__porkbelly',
 '__label__freerange',
 '__label__caribbeancuisine',
 '__label__clottedcream',
 '__label__finnishcuisine',
 '__label__classification',
 '__label__silver',
 '__label__scottishcuisine',
 '__label__coriander',
 '__label__kale',
 '__label__parchment',
 '__label__passover',
 '__label__cremeanglaise',
 '__label__gingerbread',
 '__label__differences',
 '__label__dulcedeleche',
 '__label__yorkshirepuddings',
 '__label__friedeggs',
 '__label__halfandhalf',
 '__label__alcoholcontent',
 '__label__disposal',
 '__label__marinara',
 '__label__elderberries',
 '__label__african',
 '__label__haddock',
 '__label__tumeric',
 '__label__serbiancuisine',
 '__label__feta',
 '__label__fruitleather',
 '__label__concentration',
 '__label__apple',
 '__label__frozenyogurt',
 '__label__grahamcrackers',
 '__label__crudo',
 '__label__squid',
 '__label__software',
 '__label__sardines',
 '__label__tahini',
 '__label__leeks',
 '__label__tortillachips',
 '__label__gazpacho',
 '__label__aluminumfoil',
 '__label__brie',
 '__label__breadpudding',
 '__label__crock',
 '__label__thickness',
 '__label__snail',
 '__label__hotchocolate',
 '__label__mate',
 '__label__sunchokes',
 '__label__dashi',
 '__label__allium',
 '__label__australiancuisine',
 '__label__neapolitanpizza',
 '__label__tamarind',
 '__label__alkalinity',
 '__label__jerk',
 '__label__uht',
 '__label__tzatziki',
 '__label__deepdishpizza',
 '__label__snacks',
 '__label__coldbrew',
 '__label__pancetta',
 '__label__goat',
 '__label__cranberries',
 '__label__taffy',
 '__label__tilapia',
 '__label__polishcuisine',
 '__label__indonesiancuisine',
 '__label__shallots',
 '__label__kangaroo',
 '__label__marrow',
 '__label__thai',
 '__label__separating',
 '__label__cornflake',
 '__label__kohlrabi',
 '__label__science',
 '__label__malt',
 '__label__ricewine',
 '__label__porridge',
 '__label__crumble',
 '__label__parsnip',
 '__label__steamedpudding',
 '__label__jicama',
 '__label__clothing',
 '__label__gammon',
 '__label__cookingsafety',
 '__label__carob',
 '__label__flavorbase',
 '__label__vocabulary',
 '__label__swissroll',
 '__label__crackers',
 '__label__oxtail',
 '__label__frittata',
 '__label__pantry',
 '__label__sweeteners',
 '__label__tortillapress',
 '__label__paprika',
 '__label__kombucha',
 '__label__goose',
 '__label__crawfish',
 '__label__grade',
 '__label__durian',
 '__label__poprocks',
 '__label__bayleaf',
 '__label__straining',
 '__label__babyfood',
 '__label__dicing',
 '__label__sprinkles',
 '__label__guava',
 '__label__cakes']
In [ ]:
# Number of terms in the model's dictionary
print(len(modele_one.get_words()))
7835
In [ ]:
# First 25 terms of the dictionary
modele_one.get_words()[:25]
Out[ ]:
['</s>',
 'to',
 'how',
 'a',
 'the',
 'i',
 'in',
 'is',
 'what',
 'for',
 'can',
 'of',
 'and',
 'do',
 'my',
 'it',
 'why',
 'with',
 'make',
 'does',
 'are',
 'when',
 'from',
 'or',
 'cooking']
In [ ]:
# Vector representation of the word "cooking"
modele_one.get_word_vector('cooking')
Out[ ]:
array([ 2.76140701e-02, -1.63758516e-01,  4.21501666e-01, -1.26866162e-01,
        8.92624021e-01,  1.06193304e+00, -4.18455005e-01, -4.25865769e-01,
        2.64381796e-01,  7.65234465e-03, -2.97671229e-01,  5.07829785e-01,
        4.50594544e-01,  4.75236744e-01,  9.32282209e-01,  1.17248023e+00,
       -6.07037544e-01, -5.00229597e-02,  6.22585535e-01,  6.76936567e-01,
       -6.12060606e-01, -3.08223724e-01,  4.65007842e-01, -6.53350770e-01,
        6.70516968e-01, -9.34424520e-01,  5.86412668e-01, -2.59867549e-01,
        4.46678847e-01, -9.55120981e-01, -1.01869750e+00,  6.36321306e-01,
        4.16292436e-02,  2.22233176e-01,  2.56294161e-01, -4.31494772e-01,
       -1.68193277e-04, -6.96872473e-01,  4.03554589e-02,  5.02427638e-01,
       -3.31594586e-01,  9.03470516e-01, -1.01095594e-01,  6.74550414e-01,
        1.58854210e+00,  1.71230789e-02, -3.15202743e-01,  9.28206444e-01,
        1.42517030e-01, -1.21765709e+00], dtype=float32)
In [ ]:
# Shape of the model's input matrix
modele_one.get_input_matrix().shape
Out[ ]:
(7835, 50)
In [ ]:
# Prediction for a single item
modele_one.predict('is corn a fruit vegetable or a nut')
Out[ ]:
(('__label__vegetables',), array([0.60731041]))
In [ ]:
# The 2 most probable classes
# fastText uses its own notion of recall and precision for multi-label predictions
# see https://fasttext.cc/docs/en/supervised-tutorial.html
modele_one.predict('is corn a fruit vegetable or a nut',k=2)
Out[ ]:
(('__label__vegetables', '__label__fruit'), array([0.60731041, 0.13093224]))
In [ ]:
# Precision and recall on the test file (a single predicted label per document);
# the displayed tuple reads (number of examples, precision, recall)
modele_one.test("corpus_test.txt")
Out[ ]:
(5000, 0.5722, 0.2487177258106581)
In [ ]:
# Precision and recall on the test file when 2 classes are predicted per document
modele_one.test("corpus_test.txt",k=2)
Out[ ]:
(5000, 0.4476, 0.38911588281317916)

Introduction des n-grams (sur les termes)¶

In [ ]:
# Same training setup, now with word bigrams (wordNgrams=2)
modele_bis = fasttext.train_supervised(
    input="corpus_train.txt",
    wordNgrams=2,
    dim=50,
    lr=1.0,
    epoch=25,
)
In [ ]:
# Shape of the input matrix
# It is inflated by the bigrams
# (presumably the dictionary words plus fastText's hashing buckets for the n-grams - TODO confirm)
modele_bis.get_input_matrix().shape
Out[ ]:
(2007835, 50)
In [ ]:
# Test performance of the bigram model (a single predicted label per document)
modele_bis.test("corpus_test.txt")
Out[ ]:
(5000, 0.5944, 0.2583673824219769)

Approche one-vs-all (one-vs-rest)¶

In [ ]:
# One-vs-all loss: as many models as there are labels
# The learning rate had to be lowered for this setting
modele_ter = fasttext.train_supervised(
    input="corpus_train.txt",
    loss='ova',
    dim=50,
    lr=0.33,
    epoch=25,
)
In [ ]:
# Prediction for a single item (2 classes)
modele_ter.predict('is corn a fruit vegetable or a nut',k=2)
Out[ ]:
(('__label__vegetables', '__label__nuts'), array([0.96886617, 0.50001001]))
In [ ]:
# Alternatively, use an assignment threshold instead of a fixed number of classes
# (k=-1 puts no limit on the number of labels; only the threshold filters them)
modele_ter.predict('is corn a fruit vegetable or a nut',k=-1,threshold=0.9)
Out[ ]:
(('__label__vegetables',), array([0.96886617]))
In [ ]:
# Test performance of the one-vs-all model, using the same 0.9 assignment threshold
modele_ter.test("corpus_test.txt",k=-1,threshold=0.9)
Out[ ]:
(5000, 0.6381284221005475, 0.22289837433712945)

Modèles pré-entraînés¶

In [ ]:
# Load a pre-trained model: Amazon review polarity
# (the .ftz file is expected in the current working directory)
amz_modele = fasttext.load_model("amazon_review_polarity.ftz")
In [ ]:
# List of the labels of the pre-trained model
amz_modele.get_labels()
Out[ ]:
['__label__2', '__label__1']
In [ ]:
# Check the predictions - negative sentence
amz_modele.predict('this book is bad and the author is uncultivated')
Out[ ]:
(('__label__1',), array([0.99961752]))
In [ ]:
# Rather positive sentence
amz_modele.predict('i like this book, surprisingly it is underrated')
Out[ ]:
(('__label__2',), array([1.00001001]))