Version
In [1]:
#pyspark
import pyspark
pyspark.__version__
Out[1]:
'3.5.4'
Loading the training data
Creating a Spark session
In [2]:
# create a Spark session
from pyspark.sql import SparkSession
# configured locally with 4 cores
# and at most 1 GB of memory per executor
spark = SparkSession.builder \
.master("local[*]") \
.config("spark.executor.memory", "1g") \
.config("spark.executor.cores", "4") \
.getOrCreate()
# type of the object
print(type(spark))
<class 'pyspark.sql.session.SparkSession'>
Loading and inspecting the data
In [3]:
# change the current working directory
import os
os.chdir("C:/Users/ricco/Desktop/demo")
In [4]:
# load the training data
dfTrain = spark.read.options(delimiter=";").csv("breast_train.csv",header=True,inferSchema=True)
# variable types
dfTrain.printSchema()
root
 |-- clump: integer (nullable = true)
 |-- ucellsize: integer (nullable = true)
 |-- ucellshape: integer (nullable = true)
 |-- mgadhesion: integer (nullable = true)
 |-- sepics: integer (nullable = true)
 |-- bnuclei: integer (nullable = true)
 |-- bchromatin: integer (nullable = true)
 |-- normnucl: integer (nullable = true)
 |-- mitoses: integer (nullable = true)
 |-- target: string (nullable = true)
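The schema confirms that the nine descriptors were read as integers and the target as a string. As a quick check of the dimensions of the data set, a minimal sketch (the counts are not shown in the original notebook):

# number of rows and number of columns of the training data frame
print(dfTrain.count(), len(dfTrain.columns))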
In [5]:
# first rows
dfTrain.show(5)
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
|clump|ucellsize|ucellshape|mgadhesion|sepics|bnuclei|bchromatin|normnucl|mitoses|   target|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
|    1|        1|         1|         1|     1|      1|         3|       1|      1|   begnin|
|   10|       10|        10|         7|    10|     10|         8|       2|      1|malignant|
|   10|        6|         3|         6|     4|     10|         7|       8|      4|malignant|
|    3|        1|         1|         1|     2|      1|         2|       1|      2|   begnin|
|    3|        1|         1|         1|     2|      1|         1|       1|      1|   begnin|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
only showing top 5 rows
Preparing the data for modeling
In [6]:
# list of predictor variables
features_col = [col for col in dfTrain.columns if col != "target"]
print(features_col)
['clump', 'ucellsize', 'ucellshape', 'mgadhesion', 'sepics', 'bnuclei', 'bchromatin', 'normnucl', 'mitoses']
In [7]:
# vectorize the predictors
from pyspark.ml.feature import VectorAssembler
# create a single vector column holding the features
assembler = VectorAssembler(inputCols=features_col, outputCol="features")
dfTrain = assembler.transform(dfTrain)
# check
dfTrain.show()
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
|clump|ucellsize|ucellshape|mgadhesion|sepics|bnuclei|bchromatin|normnucl|mitoses|   target|            features|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
|    1|        1|         1|         1|     1|      1|         3|       1|      1|   begnin|[1.0,1.0,1.0,1.0,...|
|   10|       10|        10|         7|    10|     10|         8|       2|      1|malignant|[10.0,10.0,10.0,7...|
|   10|        6|         3|         6|     4|     10|         7|       8|      4|malignant|[10.0,6.0,3.0,6.0...|
|    3|        1|         1|         1|     2|      1|         2|       1|      2|   begnin|[3.0,1.0,1.0,1.0,...|
|    3|        1|         1|         1|     2|      1|         1|       1|      1|   begnin|[3.0,1.0,1.0,1.0,...|
|    5|        1|         1|         2|     2|      2|         3|       1|      1|   begnin|[5.0,1.0,1.0,2.0,...|
|    4|        3|         2|         1|     3|      1|         2|       1|      1|   begnin|[4.0,3.0,2.0,1.0,...|
|    5|        1|         1|         1|     1|      1|         3|       1|      1|   begnin|[5.0,1.0,1.0,1.0,...|
|    4|        1|         1|         1|     2|      1|         3|       2|      1|   begnin|[4.0,1.0,1.0,1.0,...|
|    3|        2|         2|         1|     2|      1|         2|       3|      1|   begnin|[3.0,2.0,2.0,1.0,...|
|    5|       10|        10|         8|     5|      5|         7|      10|      1|malignant|[5.0,10.0,10.0,8....|
|   10|        6|         6|         3|     4|      5|         3|       6|      1|malignant|[10.0,6.0,6.0,3.0...|
|    7|        4|         5|        10|     2|     10|         3|       8|      2|malignant|[7.0,4.0,5.0,10.0...|
|    4|        1|         1|         3|     1|      1|         2|       1|      1|   begnin|[4.0,1.0,1.0,3.0,...|
|    5|        7|         7|         1|     5|      8|         3|       4|      1|   begnin|[5.0,7.0,7.0,1.0,...|
|    3|       10|         8|         7|     6|      9|         9|       3|      8|malignant|[3.0,10.0,8.0,7.0...|
|    1|        3|         3|         2|     2|      1|         7|       2|      1|   begnin|[1.0,3.0,3.0,2.0,...|
|    3|        1|         1|         1|     2|      1|         2|       1|      1|   begnin|[3.0,1.0,1.0,1.0,...|
|    5|        1|         1|         1|     2|      1|         2|       1|      1|   begnin|[5.0,1.0,1.0,1.0,...|
|    8|        7|         4|         4|     5|      3|         5|      10|      1|malignant|[8.0,7.0,4.0,4.0,...|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
only showing top 20 rows
In [8]:
# recode the target variable as 0 [begnin] and 1 [malignant]
from pyspark.sql.functions import when
dfTrain = dfTrain.withColumn("label",when(dfTrain['target']=="malignant",1).otherwise(0))
dfTrain.show(5)
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+-----+
|clump|ucellsize|ucellshape|mgadhesion|sepics|bnuclei|bchromatin|normnucl|mitoses|   target|            features|label|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+-----+
|    1|        1|         1|         1|     1|      1|         3|       1|      1|   begnin|[1.0,1.0,1.0,1.0,...|    0|
|   10|       10|        10|         7|    10|     10|         8|       2|      1|malignant|[10.0,10.0,10.0,7...|    1|
|   10|        6|         3|         6|     4|     10|         7|       8|      4|malignant|[10.0,6.0,3.0,6.0...|    1|
|    3|        1|         1|         1|     2|      1|         2|       1|      2|   begnin|[3.0,1.0,1.0,1.0,...|    0|
|    3|        1|         1|         1|     2|      1|         1|       1|      1|   begnin|[3.0,1.0,1.0,1.0,...|    0|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+-----+
only showing top 5 rows
In [9]:
# keep only the columns we need
dfTrain = dfTrain.select("features","label")
dfTrain.show()
+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,1.0,1.0,1.0,...|    0|
|[10.0,10.0,10.0,7...|    1|
|[10.0,6.0,3.0,6.0...|    1|
|[3.0,1.0,1.0,1.0,...|    0|
|[3.0,1.0,1.0,1.0,...|    0|
|[5.0,1.0,1.0,2.0,...|    0|
|[4.0,3.0,2.0,1.0,...|    0|
|[5.0,1.0,1.0,1.0,...|    0|
|[4.0,1.0,1.0,1.0,...|    0|
|[3.0,2.0,2.0,1.0,...|    0|
|[5.0,10.0,10.0,8....|    1|
|[10.0,6.0,6.0,3.0...|    1|
|[7.0,4.0,5.0,10.0...|    1|
|[4.0,1.0,1.0,3.0,...|    0|
|[5.0,7.0,7.0,1.0,...|    0|
|[3.0,10.0,8.0,7.0...|    1|
|[1.0,3.0,3.0,2.0,...|    0|
|[3.0,1.0,1.0,1.0,...|    0|
|[5.0,1.0,1.0,1.0,...|    0|
|[8.0,7.0,4.0,4.0,...|    1|
+--------------------+-----+
only showing top 20 rows
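As an aside, the same preparation (recoding the target, assembling the features) and the learner of the next section could be chained into a single pyspark.ml Pipeline. A minimal sketch, assuming we start again from the raw data frame as read from the CSV (rawTrain is a hypothetical name) and accept StringIndexer's frequency-based encoding, which on these data would map the majority class begnin to 0.0 and malignant to 1.0, matching the manual recoding above:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
# index the target (indices assigned by descending frequency)
indexer = StringIndexer(inputCol="target", outputCol="label")
# chain target indexing, vector assembly and the logistic regression
pipeline = Pipeline(stages=[indexer, assembler, LogisticRegression(featuresCol="features", labelCol="label")])
# pipelineModel = pipeline.fit(rawTrain)   # rawTrain: hypothetical, the data frame as read from breast_train.csv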
Training the model
In [10]:
# instantiate the learner
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features",labelCol="label")
# fit on the training data
model = lr.fit(dfTrain)
In [11]:
# display the coefficients
print(features_col)
print(model.coefficients)
['clump', 'ucellsize', 'ucellshape', 'mgadhesion', 'sepics', 'bnuclei', 'bchromatin', 'normnucl', 'mitoses']
[0.5648252245008485,-0.12236390251894765,0.3553266381863309,0.31872309303254526,-0.09822500575733924,0.4272988206199519,0.6225731705199887,0.14709100815261644,0.48177882316045145]
In [12]:
# just for a nicer presentation
import pandas
pandas.DataFrame({"variable":features_col,"coef":model.coefficients})
Out[12]:
     variable      coef
0       clump  0.564825
1   ucellsize -0.122364
2  ucellshape  0.355327
3  mgadhesion  0.318723
4      sepics -0.098225
5     bnuclei  0.427299
6  bchromatin  0.622573
7    normnucl  0.147091
8     mitoses  0.481779
In [13]:
#intercept
print(model.intercept)
-9.89132998082611
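As a sanity check, the membership probability of the positive class can be recomputed by hand from these estimates, since logistic regression models P(malignant | x) = 1 / (1 + exp(-(intercept + coefficients . x))). A minimal sketch for the first training observation, assuming numpy is available (the resulting value is not shown in the original notebook):

import numpy
# coefficients and intercept of the fitted model, as numpy structures
w = model.coefficients.toArray()
b = model.intercept
# feature vector of the first training observation
x = dfTrain.first()["features"].toArray()
# logistic (sigmoid) transformation of the linear score
print(1.0 / (1.0 + numpy.exp(-(b + numpy.dot(w, x)))))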
In [14]:
# detailed training information
trainingSummary = model.summary
# e.g. the decrease of the loss function during training
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
print(objective)
objectiveHistory:
0.6560909066391412
0.16360041858262725
0.1400294335855931
0.10944069138261446
0.09837446964362398
0.09141207663279119
0.08724406987566327
0.0853129795025235
0.0849513650548719
0.0848231930168831
0.08473942065478333
0.08470889264349397
0.08469679748875486
0.08469606769019236
0.08469582885581871
0.0846958010983726
0.08469579936321399
0.08469579668250904
0.08469579647313742
0.08469579640284211
0.0846957964016923
0.08469579640120693
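The summary object also exposes other diagnostics computed on the training data, for instance the training AUC (value not shown here):

# area under the ROC curve measured on the training sample
print(trainingSummary.areaUnderROC)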
Evaluation on the test sample
Loading and preparing the test data
In [15]:
# load the test data
dfTest = spark.read.options(delimiter=";").csv("breast_test.csv",header=True,inferSchema=True)
# variable types
dfTest.printSchema()
root
 |-- clump: integer (nullable = true)
 |-- ucellsize: integer (nullable = true)
 |-- ucellshape: integer (nullable = true)
 |-- mgadhesion: integer (nullable = true)
 |-- sepics: integer (nullable = true)
 |-- bnuclei: integer (nullable = true)
 |-- bchromatin: integer (nullable = true)
 |-- normnucl: integer (nullable = true)
 |-- mitoses: integer (nullable = true)
 |-- target: string (nullable = true)
In [16]:
# first rows
dfTest.show(5)
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
|clump|ucellsize|ucellshape|mgadhesion|sepics|bnuclei|bchromatin|normnucl|mitoses|   target|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
|    5|        4|         6|         8|     4|      1|         8|      10|      1|malignant|
|   10|        7|         7|         4|     5|     10|         5|       7|      2|malignant|
|    3|        1|         1|         1|     2|      1|         3|       1|      1|   begnin|
|    1|        1|         2|         1|     3|      4|         1|       1|      1|   begnin|
|   10|        4|         6|         4|     5|     10|         7|       1|      1|malignant|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+
only showing top 5 rows
In [17]:
# vectorize the features (reusing the same assembler)
dfTest = assembler.transform(dfTest)
# first rows
dfTest.show(5)
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
|clump|ucellsize|ucellshape|mgadhesion|sepics|bnuclei|bchromatin|normnucl|mitoses|   target|            features|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
|    5|        4|         6|         8|     4|      1|         8|      10|      1|malignant|[5.0,4.0,6.0,8.0,...|
|   10|        7|         7|         4|     5|     10|         5|       7|      2|malignant|[10.0,7.0,7.0,4.0...|
|    3|        1|         1|         1|     2|      1|         3|       1|      1|   begnin|[3.0,1.0,1.0,1.0,...|
|    1|        1|         2|         1|     3|      4|         1|       1|      1|   begnin|[1.0,1.0,2.0,1.0,...|
|   10|        4|         6|         4|     5|     10|         7|       1|      1|malignant|[10.0,4.0,6.0,4.0...|
+-----+---------+----------+----------+------+-------+----------+--------+-------+---------+--------------------+
only showing top 5 rows
In [18]:
# keep only the 2 columns we need
dfTest = dfTest.select(['features','target'])
dfTest.show(5)
+--------------------+---------+
|            features|   target|
+--------------------+---------+
|[5.0,4.0,6.0,8.0,...|malignant|
|[10.0,7.0,7.0,4.0...|malignant|
|[3.0,1.0,1.0,1.0,...|   begnin|
|[1.0,1.0,2.0,1.0,...|   begnin|
|[10.0,4.0,6.0,4.0...|malignant|
+--------------------+---------+
only showing top 5 rows
Prediction on the test sample
In [19]:
# apply the model to the test data frame
dfPred = model.transform(dfTest)
# results - we get a data frame
# the names of the additional columns are standardized:
# rawPrediction, probability, prediction
dfPred.show()
+--------------------+---------+--------------------+--------------------+----------+
|            features|   target|       rawPrediction|         probability|prediction|
+--------------------+---------+--------------------+--------------------+----------+
|[5.0,4.0,6.0,8.0,...|malignant|[-4.0927581714178...|[0.01641904232948...|       1.0|
|[10.0,7.0,7.0,4.0...|malignant|[-7.5504775193862...|[5.25582576997630...|       1.0|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|
|[1.0,1.0,2.0,1.0,...|   begnin|[5.75352902239815...|[0.99683845855753...|       0.0|
|[10.0,4.0,6.0,4.0...|malignant|[-7.4430640577205...|[5.85145879668262...|       1.0|
|[5.0,1.0,2.0,1.0,...|   begnin|[4.05532640997729...|[0.98296538423663...|       0.0|
|[5.0,1.0,1.0,1.0,...|   begnin|[4.41065304816362...|[0.98799854162688...|       0.0|
|[3.0,1.0,1.0,4.0,...|   begnin|[4.53526821567240...|[0.98938975411268...|       0.0|
|[2.0,1.0,1.0,1.0,...|   begnin|[6.20335372742350...|[0.99798144691852...|       0.0|
|[5.0,2.0,1.0,1.0,...|   begnin|[3.91044378016257...|[0.98036177578005...|       0.0|
|[4.0,4.0,2.0,1.0,...|   begnin|[2.79626923639472...|[0.94247388740125...|       0.0|
|[2.0,1.0,1.0,1.0,...|   begnin|[6.72770189218615...|[0.99880415048780...|       0.0|
|[1.0,1.0,1.0,1.0,...|   begnin|[7.29252711668700...|[0.99931985700224...|       0.0|
|[8.0,10.0,10.0,10...|malignant|[-12.332023494070...|[4.40827146958253...|       1.0|
|[10.0,10.0,10.0,2...|malignant|[-7.0142093020865...|[8.98208876699426...|       1.0|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|
|[1.0,1.0,1.0,1.0,...|   begnin|[6.04738077564702...|[0.99764152966105...|       0.0|
|[3.0,1.0,4.0,1.0,...|   begnin|[5.09689675312631...|[0.99392147872430...|       0.0|
|[3.0,1.0,1.0,1.0,...|   begnin|[5.44207849140797...|[0.99568819907861...|       0.0|
|[3.0,1.0,1.0,1.0,...|   begnin|[5.24612148086008...|[0.99475969493837...|       0.0|
+--------------------+---------+--------------------+--------------------+----------+
only showing top 20 rows
In [20]:
# e.g. detail of the class membership probabilities
dfPred.select("probability").show(5,truncate=False)
+------------------------------------------+
|probability                               |
+------------------------------------------+
|[0.016419042329488043,0.9835809576705119] |
|[5.255825769976308E-4,0.9994744174230024] |
|[0.9927374147093723,0.007262585290627732] |
|[0.9968384585575399,0.0031615414424600585]|
|[5.851458796682625E-4,0.9994148541203317] |
+------------------------------------------+
only showing top 5 rows
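If the probability of the positive class is needed as a plain numeric column (e.g. to build a ROC curve or apply a custom threshold), one possibility is to convert the vector with pyspark.ml.functions.vector_to_array. A minimal sketch, assuming Spark >= 3.0:

from pyspark.ml.functions import vector_to_array
# P(malignant) is the second component of the probability vector
dfPred.select(vector_to_array(dfPred["probability"]).getItem(1).alias("p_malignant")).show(5)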
In [21]:
# turn the numeric prediction into an explicit class label
dfPred = dfPred.withColumn("label_pred",when(dfPred['prediction']==1,"malignant").otherwise("begnin"))
dfPred.show(5)
+--------------------+---------+--------------------+--------------------+----------+----------+
|            features|   target|       rawPrediction|         probability|prediction|label_pred|
+--------------------+---------+--------------------+--------------------+----------+----------+
|[5.0,4.0,6.0,8.0,...|malignant|[-4.0927581714178...|[0.01641904232948...|       1.0| malignant|
|[10.0,7.0,7.0,4.0...|malignant|[-7.5504775193862...|[5.25582576997630...|       1.0| malignant|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|    begnin|
|[1.0,1.0,2.0,1.0,...|   begnin|[5.75352902239815...|[0.99683845855753...|       0.0|    begnin|
|[10.0,4.0,6.0,4.0...|malignant|[-7.4430640577205...|[5.85145879668262...|       1.0| malignant|
+--------------------+---------+--------------------+--------------------+----------+----------+
only showing top 5 rows
Observed vs. predicted classes
In [22]:
# recode the target into a 0/1 label
dfPred = dfPred.withColumn("label",when(dfPred['target']=="malignant",1).otherwise(0))
dfPred.show(5)
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
|            features|   target|       rawPrediction|         probability|prediction|label_pred|label|
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
|[5.0,4.0,6.0,8.0,...|malignant|[-4.0927581714178...|[0.01641904232948...|       1.0| malignant|    1|
|[10.0,7.0,7.0,4.0...|malignant|[-7.5504775193862...|[5.25582576997630...|       1.0| malignant|    1|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|    begnin|    0|
|[1.0,1.0,2.0,1.0,...|   begnin|[5.75352902239815...|[0.99683845855753...|       0.0|    begnin|    0|
|[10.0,4.0,6.0,4.0...|malignant|[-7.4430640577205...|[5.85145879668262...|       1.0| malignant|    1|
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
only showing top 5 rows
In [23]:
# confusion matrix -- careful: sort by "target", otherwise the rows come out in arbitrary order!
dfPred.groupBy("target").pivot("label_pred").count().sort("target").show()
+---------+------+---------+
|   target|begnin|malignant|
+---------+------+---------+
|   begnin|   138|        3|
|malignant|     3|       56|
+---------+------+---------+
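The model therefore misclassifies 6 test observations out of 200. The overall accuracy can also be computed directly from the prediction data frame; a minimal sketch:

# proportion of test observations where the predicted label matches the observed one
n_total = dfPred.count()
n_correct = dfPred.filter(dfPred["label"] == dfPred["prediction"]).count()
print(n_correct / n_total)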
The BinaryClassificationEvaluator class
In [24]:
# dedicated class for model evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label")
In [25]:
# as a reminder - the structure of the prediction data frame
dfPred.show()
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
|            features|   target|       rawPrediction|         probability|prediction|label_pred|label|
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
|[5.0,4.0,6.0,8.0,...|malignant|[-4.0927581714178...|[0.01641904232948...|       1.0| malignant|    1|
|[10.0,7.0,7.0,4.0...|malignant|[-7.5504775193862...|[5.25582576997630...|       1.0| malignant|    1|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|    begnin|    0|
|[1.0,1.0,2.0,1.0,...|   begnin|[5.75352902239815...|[0.99683845855753...|       0.0|    begnin|    0|
|[10.0,4.0,6.0,4.0...|malignant|[-7.4430640577205...|[5.85145879668262...|       1.0| malignant|    1|
|[5.0,1.0,2.0,1.0,...|   begnin|[4.05532640997729...|[0.98296538423663...|       0.0|    begnin|    0|
|[5.0,1.0,1.0,1.0,...|   begnin|[4.41065304816362...|[0.98799854162688...|       0.0|    begnin|    0|
|[3.0,1.0,1.0,4.0,...|   begnin|[4.53526821567240...|[0.98938975411268...|       0.0|    begnin|    0|
|[2.0,1.0,1.0,1.0,...|   begnin|[6.20335372742350...|[0.99798144691852...|       0.0|    begnin|    0|
|[5.0,2.0,1.0,1.0,...|   begnin|[3.91044378016257...|[0.98036177578005...|       0.0|    begnin|    0|
|[4.0,4.0,2.0,1.0,...|   begnin|[2.79626923639472...|[0.94247388740125...|       0.0|    begnin|    0|
|[2.0,1.0,1.0,1.0,...|   begnin|[6.72770189218615...|[0.99880415048780...|       0.0|    begnin|    0|
|[1.0,1.0,1.0,1.0,...|   begnin|[7.29252711668700...|[0.99931985700224...|       0.0|    begnin|    0|
|[8.0,10.0,10.0,10...|malignant|[-12.332023494070...|[4.40827146958253...|       1.0| malignant|    1|
|[10.0,10.0,10.0,2...|malignant|[-7.0142093020865...|[8.98208876699426...|       1.0| malignant|    1|
|[3.0,1.0,1.0,1.0,...|   begnin|[4.91773032664532...|[0.99273741470937...|       0.0|    begnin|    0|
|[1.0,1.0,1.0,1.0,...|   begnin|[6.04738077564702...|[0.99764152966105...|       0.0|    begnin|    0|
|[3.0,1.0,4.0,1.0,...|   begnin|[5.09689675312631...|[0.99392147872430...|       0.0|    begnin|    0|
|[3.0,1.0,1.0,1.0,...|   begnin|[5.44207849140797...|[0.99568819907861...|       0.0|    begnin|    0|
|[3.0,1.0,1.0,1.0,...|   begnin|[5.24612148086008...|[0.99475969493837...|       0.0|    begnin|    0|
+--------------------+---------+--------------------+--------------------+----------+----------+-----+
only showing top 20 rows
In [26]:
# AUC of the model on the test sample
print(evaluator.evaluate(dfPred,{evaluator.metricName:"areaUnderROC"}))
0.9944704892414953
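The evaluator reads the rawPrediction column (its default rawPredictionCol) together with the label column; switching the metric name gives the area under the precision-recall curve instead (value not shown here):

# area under the precision-recall curve on the test sample
print(evaluator.evaluate(dfPred,{evaluator.metricName:"areaUnderPR"}))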
In [27]:
# stop the Spark session
spark.stop()