Versions¶
In [1]:
#pandas
import pandas
pandas.__version__
Out[1]:
'2.2.3'
In [2]:
#duckdb
import duckdb
duckdb.__version__
Out[2]:
'1.1.3'
Importation des tables¶
In [3]:
#dossier par défaut
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [4]:
#ratings des utilisateurs
dfRatings = pandas.read_excel("movies_dataset.xlsx",sheet_name="user ratings")
dfRatings.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1190 entries, 0 to 1189 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user 1190 non-null int64 1 title 1190 non-null object 2 rating 1190 non-null int64 dtypes: int64(2), object(1) memory usage: 28.0+ KB
In [5]:
#types de films
dfMovies = pandas.read_excel("movies_dataset.xlsx",sheet_name="movies types")
dfMovies.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 100 non-null object 1 unknown 100 non-null int64 2 Action 100 non-null int64 3 Adventure 100 non-null int64 4 Animation 100 non-null int64 5 Children 100 non-null int64 6 Comedy 100 non-null int64 7 Crime 100 non-null int64 8 Documentary 100 non-null int64 9 Drama 100 non-null int64 10 Fantasy 100 non-null int64 11 FilmNoir 100 non-null int64 12 Horror 100 non-null int64 13 Musical 100 non-null int64 14 Mystery 100 non-null int64 15 Romance 100 non-null int64 16 SciFiction 100 non-null int64 17 Thriller 100 non-null int64 18 War 100 non-null int64 19 Western 100 non-null int64 dtypes: int64(19), object(1) memory usage: 15.8+ KB
Requêtes sur une table¶
In [6]:
#liste des filmes recensés dans "Movies"
#texte de la requête SQL
requete = "SELECT title FROM dfMovies;"
#appel de la réquête, transformée en data frame .df()
res = duckdb.sql(requete).df()
#affichage du type de l'objet obtenu
print(type(res))
<class 'pandas.core.frame.DataFrame'>
In [7]:
#affichage du résultat
print(res)
title 0 U Turn (1997) 1 Chungking Express (1994) 2 Rainmaker, The (1997) 3 I Can t Sleep (J ai pas sommeil) (1994) 4 Good Man in Africa, A (1994) .. ... 95 Scarlet Letter, The (1995) 96 Willy Wonka and the Chocolate Factory (1971) 97 Herbie Rides Again (1974) 98 Dingo (1992) 99 Amityville Curse, The (1990) [100 rows x 1 columns]
In [8]:
#liste des films distincts notés
#texte de la requête
requete = "SELECT DISTINCT title FROM dfRatings;"
#appel et affichage
print(duckdb.sql(requete).df())
title 0 Dead Man Walking (1995) 1 Richard III (1995) 2 Truth About Cats & Dogs, The (1996) 3 Heat (1995) 4 Time to Kill, A (1996) .. ... 637 Kicking and Screaming (1995) 638 Basketball Diaries, The (1995) 639 Little Women (1994) 640 House of the Spirits, The (1993) 641 Substance of Fire, The (1996) [642 rows x 1 columns]
In [9]:
#moyenne des notes par film (pour ceux notés)
#trié de manière décroissante
requete = """
SELECT title, AVG(rating) as mean_rating
FROM dfRatings
GROUP BY title
ORDER BY mean_rating DESC;
"""
print(duckdb.sql(requete))
┌────────────────────────────────────────────────┬─────────────┐ │ title │ mean_rating │ │ varchar │ double │ ├────────────────────────────────────────────────┼─────────────┤ │ Fargo (1996) │ 5.0 │ │ As Good As It Gets (1997) │ 5.0 │ │ Clerks (1994) │ 5.0 │ │ Mystery Science Theater 3000: The Movie (1996) │ 5.0 │ │ Heathers (1989) │ 5.0 │ │ True Romance (1993) │ 5.0 │ │ Dead Poets Society (1989) │ 5.0 │ │ Hunt for Red October, The (1990) │ 5.0 │ │ Body Snatchers (1993) │ 5.0 │ │ For Whom the Bell Tolls (1943) │ 5.0 │ │ · │ · │ │ · │ · │ │ · │ · │ │ Amityville II: The Possession (1982) │ 1.0 │ │ First Wives Club, The (1996) │ 1.0 │ │ Bio-Dome (1996) │ 1.0 │ │ Muriel s Wedding (1994) │ 1.0 │ │ Amityville 3-D (1983) │ 1.0 │ │ Body Parts (1991) │ 1.0 │ │ How to Be a Player (1997) │ 1.0 │ │ Santa Clause, The (1994) │ 1.0 │ │ Lawnmower Man, The (1992) │ 1.0 │ │ Bastard Out of Carolina (1996) │ 1.0 │ ├────────────────────────────────────────────────┴─────────────┤ │ 642 rows (20 shown) 2 columns │ └──────────────────────────────────────────────────────────────┘
In [10]:
#les 3 utilisateurs qui ont le plus noté
requete = """
SELECT user, COUNT(user) as count_user
FROM dfRatings
GROUP BY user
ORDER BY count_user DESC
LIMIT 3;"""
print(duckdb.sql(requete))
┌───────┬────────────┐ │ user │ count_user │ │ int64 │ int64 │ ├───────┼────────────┤ │ 7 │ 400 │ │ 6 │ 208 │ │ 10 │ 184 │ └───────┴────────────┘
In [11]:
#liste des films d'action recensés
requete = "SELECT title FROM dfMovies WHERE Action=1;"
print(duckdb.sql(requete))
┌─────────────────────────────────────────────┐ │ title │ │ varchar │ ├─────────────────────────────────────────────┤ │ U Turn (1997) │ │ Good Man in Africa, A (1994) │ │ Strange Days (1995) │ │ In the Line of Duty 2 (1987) │ │ Braveheart (1995) │ │ Blues Brothers 2000 (1998) │ │ Steel (1997) │ │ Army of Darkness (1993) │ │ Supercop (1992) │ │ 3 Ninjas: High Noon At Mega Mountain (1998) │ │ Lawnmower Man, The (1992) │ │ Target (1995) │ │ GoldenEye (1995) │ │ Jackal, The (1997) │ │ Solo (1996) │ │ Man in the Iron Mask, The (1998) │ │ Johnny Mnemonic (1995) │ ├─────────────────────────────────────────────┤ │ 17 rows │ └─────────────────────────────────────────────┘
Requêtes avec jointures¶
In [12]:
#liste des films notés dont le type est recensé
requete = """
SELECT dfRatings.title
FROM dfRatings INNER JOIN dfMovies ON dfRatings.title = dfMovies.title;"""
print(duckdb.sql(requete))
┌──────────────────────────────────────────────┐ │ title │ │ varchar │ ├──────────────────────────────────────────────┤ │ GoldenEye (1995) │ │ Jerry Maguire (1996) │ │ Sabrina (1995) │ │ Sense and Sensibility (1995) │ │ Liar Liar (1997) │ │ Rainmaker, The (1997) │ │ Liar Liar (1997) │ │ U Turn (1997) │ │ Liar Liar (1997) │ │ Blues Brothers 2000 (1998) │ │ · │ │ · │ │ · │ │ Bananas (1971) │ │ Gandhi (1982) │ │ Casino (1995) │ │ Substance of Fire, The (1996) │ │ Mother (1996) │ │ Mother (1996) │ │ Mother (1996) │ │ Willy Wonka and the Chocolate Factory (1971) │ │ Willy Wonka and the Chocolate Factory (1971) │ │ Willy Wonka and the Chocolate Factory (1971) │ ├──────────────────────────────────────────────┤ │ 80 rows (20 shown) │ └──────────────────────────────────────────────┘
In [13]:
#même requête sans doublon
requete = """
SELECT DISTINCT dfRatings.title
FROM dfRatings INNER JOIN dfMovies ON dfRatings.title = dfMovies.title;"""
print(duckdb.sql(requete))
┌──────────────────────────────────────────────────┐ │ title │ │ varchar │ ├──────────────────────────────────────────────────┤ │ Lawnmower Man, The (1992) │ │ Naked Gun 33 1/3: The Final Insult (1994) │ │ Sling Blade (1996) │ │ Quiet Man, The (1952) │ │ Jackal, The (1997) │ │ Mother (1996) │ │ Platoon (1986) │ │ Cape Fear (1962) │ │ Substance of Fire, The (1996) │ │ Aristocats, The (1970) │ │ · │ │ · │ │ · │ │ Gandhi (1982) │ │ GoldenEye (1995) │ │ Sabrina (1995) │ │ Liar Liar (1997) │ │ Cinderella (1950) │ │ Wings of Desire (1987) │ │ Sense and Sensibility (1995) │ │ Home Alone (1990) │ │ Arsenic and Old Lace (1944) │ │ Blood For Dracula (Andy Warhol s Dracula) (1974) │ ├──────────────────────────────────────────────────┤ │ 37 rows (20 shown) │ └──────────────────────────────────────────────────┘
In [14]:
#note moyenne par film dont le genre est reconnu
requete = """
SELECT dfRatings.title, AVG(dfRatings.rating) as mean_rating
FROM dfRatings INNER JOIN dfMovies ON dfRatings.title = dfMovies.title
GROUP BY dfRatings.title;"""
print(duckdb.sql(requete))
┌────────────────────────────────────────────────┬────────────────────┐ │ title │ mean_rating │ │ varchar │ double │ ├────────────────────────────────────────────────┼────────────────────┤ │ Platoon (1986) │ 5.0 │ │ Cape Fear (1962) │ 3.0 │ │ Substance of Fire, The (1996) │ 4.0 │ │ Lawnmower Man, The (1992) │ 1.0 │ │ Naked Gun 33 1/3: The Final Insult (1994) │ 3.0 │ │ Sling Blade (1996) │ 4.666666666666667 │ │ Quiet Man, The (1952) │ 5.0 │ │ Jackal, The (1997) │ 4.0 │ │ Mother (1996) │ 4.0 │ │ Sense and Sensibility (1995) │ 4.25 │ │ · │ · │ │ · │ · │ │ · │ · │ │ Amityville Curse, The (1990) │ 1.0 │ │ Three Colors: Red (1994) │ 4.5 │ │ Manon of the Spring (Manon des sources) (1986) │ 3.5 │ │ Mrs. Brown (Her Majesty, Mrs. Brown) (1997) │ 4.0 │ │ Strange Days (1995) │ 5.0 │ │ GoldenEye (1995) │ 3.0 │ │ Sabrina (1995) │ 3.6666666666666665 │ │ Liar Liar (1997) │ 2.625 │ │ Cinderella (1950) │ 3.6666666666666665 │ │ Wings of Desire (1987) │ 4.0 │ ├────────────────────────────────────────────────┴────────────────────┤ │ 37 rows (20 shown) 2 columns │ └─────────────────────────────────────────────────────────────────────┘
In [15]:
#note moyenne des films d'action qui ont été notés
requete = """
SELECT dfRatings.title, AVG(dfRatings.rating) as mean_rating, COUNT(dfRatings.rating) as count_rating
FROM dfRatings INNER JOIN dfMovies ON dfRatings.title = dfMovies.title
WHERE dfMovies.Action = 1
GROUP BY dfRatings.title;"""
print(duckdb.sql(requete))
┌────────────────────────────┬─────────────┬──────────────┐ │ title │ mean_rating │ count_rating │ │ varchar │ double │ int64 │ ├────────────────────────────┼─────────────┼──────────────┤ │ Lawnmower Man, The (1992) │ 1.0 │ 2 │ │ Jackal, The (1997) │ 4.0 │ 1 │ │ Braveheart (1995) │ 4.5 │ 4 │ │ Blues Brothers 2000 (1998) │ 5.0 │ 1 │ │ Strange Days (1995) │ 5.0 │ 1 │ │ U Turn (1997) │ 4.0 │ 3 │ │ GoldenEye (1995) │ 3.0 │ 2 │ └────────────────────────────┴─────────────┴──────────────┘
In [16]:
#films d'action avec leurs notes moyennes
#ou pas si elles n'ont pas été notés
requete = """
SELECT dfMovies.title, AVG(dfRatings.rating) AS mean_rating
FROM dfMovies LEFT JOIN dfRatings ON dfMovies.title = dfRatings.title
WHERE Action = 1
GROUP BY dfMovies.title;"""
print(duckdb.sql(requete))
┌─────────────────────────────────────────────┬─────────────┐ │ title │ mean_rating │ │ varchar │ double │ ├─────────────────────────────────────────────┼─────────────┤ │ Lawnmower Man, The (1992) │ 1.0 │ │ Jackal, The (1997) │ 4.0 │ │ Good Man in Africa, A (1994) │ NULL │ │ Solo (1996) │ NULL │ │ Steel (1997) │ NULL │ │ Supercop (1992) │ NULL │ │ 3 Ninjas: High Noon At Mega Mountain (1998) │ NULL │ │ Man in the Iron Mask, The (1998) │ NULL │ │ Johnny Mnemonic (1995) │ NULL │ │ Blues Brothers 2000 (1998) │ 5.0 │ │ Strange Days (1995) │ 5.0 │ │ Target (1995) │ NULL │ │ Braveheart (1995) │ 4.5 │ │ GoldenEye (1995) │ 3.0 │ │ U Turn (1997) │ 4.0 │ │ In the Line of Duty 2 (1987) │ NULL │ │ Army of Darkness (1993) │ NULL │ ├─────────────────────────────────────────────┴─────────────┤ │ 17 rows 2 columns │ └───────────────────────────────────────────────────────────┘
In [17]:
#films d'action avec leurs notes moyennes
#sans les non-notés
requete = """
SELECT dfMovies.title, AVG(dfRatings.rating) AS mean_rating
FROM dfMovies LEFT JOIN dfRatings ON dfMovies.title = dfRatings.title
WHERE Action = 1
GROUP BY dfMovies.title
HAVING mean_rating IS NOT NULL;"""
print(duckdb.sql(requete))
┌────────────────────────────┬─────────────┐ │ title │ mean_rating │ │ varchar │ double │ ├────────────────────────────┼─────────────┤ │ Lawnmower Man, The (1992) │ 1.0 │ │ Jackal, The (1997) │ 4.0 │ │ GoldenEye (1995) │ 3.0 │ │ Braveheart (1995) │ 4.5 │ │ U Turn (1997) │ 4.0 │ │ Blues Brothers 2000 (1998) │ 5.0 │ │ Strange Days (1995) │ 5.0 │ └────────────────────────────┴─────────────┘