Importation - Inspection des données¶
In [1]:
# scikit-learn version used to run this notebook
import sklearn
sklearn.__version__
Out[1]:
'1.8.0'
In [2]:
# Move to the folder that holds the demo files.
# The location can be overridden with the DEMO_DIR environment variable so the
# notebook also runs on another machine; when the variable is not set, the
# original author's folder is used, exactly as before.
import os
DEMO_DIR = os.environ.get("DEMO_DIR", "C:/Users/ricco/Desktop/demo")
os.chdir(DEMO_DIR)
In [3]:
# Read the Excel workbook from the current working directory into a DataFrame
import pandas
df = pandas.read_excel(io="dataset_2D_outliers.xlsx")
# Structure overview: number of rows, column names, dtypes, non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 x1 100 non-null float64 1 x2 100 non-null float64 dtypes: float64(2) memory usage: 1.7 KB
In [4]:
# Scatter plot of the observations in the (x1, x2) plane
import seaborn as sns
sns.scatterplot(data=df, x="x1", y="x2")
Out[4]:
<Axes: xlabel='x1', ylabel='x2'>
Isolation Forest¶
In [5]:
# Isolation Forest: fit the ensemble on the data and score every observation
from sklearn.ensemble import IsolationForest
# 500 trees. The forest is built from random sub-samples and random splits, so
# random_state is fixed: without a seed the scores change from one run to the
# next, and a cut-off tuned by hand on them (see the -0.57 threshold used
# further down) would not be reproducible.
isof = IsolationForest(n_estimators=500, random_state=0)
isof.fit(df)
# Anomaly score of each row: the lower (more negative) the score, the more
# abnormal the point.
scores = isof.score_samples(df)
print(scores)
[-0.43728389 -0.55150171 -0.42289236 -0.45110975 -0.47928923 -0.4302279 -0.51380213 -0.52160927 -0.4107846 -0.42829497 -0.4508745 -0.43831545 -0.42389577 -0.44358852 -0.64535837 -0.50844977 -0.4272569 -0.50380055 -0.50201508 -0.46840255 -0.43891585 -0.55495039 -0.44380509 -0.43957735 -0.41281843 -0.43469886 -0.40674242 -0.46257965 -0.45804569 -0.56481615 -0.42289995 -0.42327535 -0.42926826 -0.41515384 -0.44233831 -0.48858484 -0.45665153 -0.66086538 -0.44773115 -0.41696702 -0.43741305 -0.43292019 -0.47831899 -0.46277304 -0.55187263 -0.40887505 -0.42486348 -0.4868043 -0.49465577 -0.45569019 -0.44334118 -0.596103 -0.51778949 -0.47880003 -0.46809729 -0.42249406 -0.49910562 -0.42068103 -0.44502062 -0.43671221 -0.40260279 -0.46059507 -0.41932875 -0.42320257 -0.52124103 -0.46575884 -0.47441931 -0.80198915 -0.42330452 -0.5233635 -0.41443457 -0.51377476 -0.49097946 -0.43420901 -0.4579023 -0.4959781 -0.49272213 -0.51003888 -0.40724706 -0.44712284 -0.41353202 -0.43261831 -0.4347582 -0.4395739 -0.47641239 -0.42667045 -0.401202 -0.41792748 -0.43470576 -0.40726207 -0.40812998 -0.42434901 -0.44181381 -0.4143062 -0.41990002 -0.44015189 -0.56825932 -0.43859455 -0.51368141 -0.5747752 ]
In [6]:
# Same plane, points coloured by their Isolation Forest score (continuous gradient)
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=scores,
    palette="viridis",
)
Out[6]:
<Axes: xlabel='x1', ylabel='x2'>
In [7]:
# Scores sorted in ascending order, plotted against their rank (1..n)
import numpy
sns.lineplot(
    y=numpy.sort(scores),
    x=numpy.arange(1, len(scores) + 1),
)
Out[7]:
<Axes: >
In [8]:
# default threshold value used by the fitted forest
isof.offset_
Out[8]:
-0.5
In [9]:
# scores shifted by the threshold (score_samples minus offset_)
# negative => outlier, positive => inlier
isof.decision_function(df)
Out[9]:
array([ 0.06271611, -0.05150171, 0.07710764, 0.04889025, 0.02071077,
0.0697721 , -0.01380213, -0.02160927, 0.0892154 , 0.07170503,
0.0491255 , 0.06168455, 0.07610423, 0.05641148, -0.14535837,
-0.00844977, 0.0727431 , -0.00380055, -0.00201508, 0.03159745,
0.06108415, -0.05495039, 0.05619491, 0.06042265, 0.08718157,
0.06530114, 0.09325758, 0.03742035, 0.04195431, -0.06481615,
0.07710005, 0.07672465, 0.07073174, 0.08484616, 0.05766169,
0.01141516, 0.04334847, -0.16086538, 0.05226885, 0.08303298,
0.06258695, 0.06707981, 0.02168101, 0.03722696, -0.05187263,
0.09112495, 0.07513652, 0.0131957 , 0.00534423, 0.04430981,
0.05665882, -0.096103 , -0.01778949, 0.02119997, 0.03190271,
0.07750594, 0.00089438, 0.07931897, 0.05497938, 0.06328779,
0.09739721, 0.03940493, 0.08067125, 0.07679743, -0.02124103,
0.03424116, 0.02558069, -0.30198915, 0.07669548, -0.0233635 ,
0.08556543, -0.01377476, 0.00902054, 0.06579099, 0.0420977 ,
0.0040219 , 0.00727787, -0.01003888, 0.09275294, 0.05287716,
0.08646798, 0.06738169, 0.0652418 , 0.0604261 , 0.02358761,
0.07332955, 0.098798 , 0.08207252, 0.06529424, 0.09273793,
0.09187002, 0.07565099, 0.05818619, 0.0856938 , 0.08009998,
0.05984811, -0.06825932, 0.06140545, -0.01368141, -0.0747752 ])
In [10]:
# alternatively, predict() gives the label directly: -1 = outlier, +1 = inlier
y_iso = isof.predict(df)
print(y_iso)
[ 1 -1 1 1 1 1 -1 -1 1 1 1 1 1 1 -1 -1 1 -1 -1 1 1 -1 1 1 1 1 1 1 1 -1 1 1 1 1 1 1 1 -1 1 1 1 1 1 1 -1 1 1 1 1 1 1 -1 -1 1 1 1 1 1 1 1 1 1 1 1 -1 1 1 -1 1 -1 1 -1 1 1 1 1 1 -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -1 1 -1 -1]
In [11]:
# Outliers located in the plane, coloured by predicted label
# (default threshold: the result is not satisfactory)
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=y_iso,
    palette="tab10",
)
Out[11]:
<Axes: xlabel='x1', ylabel='x2'>
In [12]:
# Trial and error -> cut-off moved to -0.57 (tentative value)
y_iso_2 = numpy.where(
    isof.score_samples(df) < -0.57,  # strictly below the cut-off => outlier
    -1,
    1,
)
# Redraw the plane with the new labels
sns.scatterplot(data=df, x="x1", y="x2", hue=y_iso_2, palette="tab10")
Out[12]:
<Axes: xlabel='x1', ylabel='x2'>
Local Outlier Factor¶
In [13]:
# Local Outlier Factor with a neighbourhood of 5 points, fitted on the data
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=5).fit(df)
# Anomaly scores (negative values):
# a large absolute value => outlier
print(lof.negative_outlier_factor_)
[-1.04608274 -1.69257026 -1.11813672 -0.9690844 -1.43343698 -0.96375951 -2.19721377 -1.79444413 -0.94487364 -1.11430717 -1.18364131 -0.9903625 -1.0579283 -1.15025357 -4.2085264 -1.3821889 -0.98014108 -1.15531269 -1.57238353 -1.19450888 -1.15571224 -2.07888153 -1.19004651 -0.96429186 -1.03393334 -1.09894885 -1.01804325 -1.29357982 -1.23629537 -1.28653655 -1.19337467 -0.99497497 -1.09922916 -1.02077956 -1.18729471 -1.51809913 -1.16679406 -2.68036148 -0.95767284 -0.99504931 -1.09818903 -1.14402537 -1.13362099 -1.02112053 -1.77664714 -1.00763071 -1.18361056 -1.4443283 -1.99726922 -1.08809832 -1.02606878 -1.46959988 -1.53287686 -1.23964087 -1.22438247 -0.94209207 -1.17282471 -0.96304352 -1.17864837 -0.9871998 -0.92175695 -1.14890903 -1.01336198 -0.99410605 -1.84745407 -1.2450533 -1.31219323 -9.02703915 -1.06854442 -1.52540007 -1.00912782 -1.41736843 -1.47648814 -0.95450718 -1.14609758 -1.22017984 -1.40514416 -1.75658972 -0.99860746 -1.01482402 -0.90847344 -1.15449827 -0.99405383 -1.16464754 -1.12401074 -0.95904145 -0.93617898 -1.04917665 -1.09697468 -0.97206601 -0.96748459 -1.0258839 -1.14227194 -0.89962347 -0.96305421 -1.03923319 -1.52915086 -1.1424895 -1.15531269 -2.20815446]
In [14]:
# Same plane, points coloured by their LOF score (continuous gradient)
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=lof.negative_outlier_factor_,
    palette="viridis",
)
Out[14]:
<Axes: xlabel='x1', ylabel='x2'>
In [15]:
# threshold value of the fitted LOF model
lof.offset_
Out[15]:
-1.5
In [16]:
# atypical points (outlier labels)
# caution: predict() is not used here, it is reserved for novelty detection
# the score could also have been compared with the threshold
y_lof = lof.fit_predict(df)
print(y_lof)
[ 1 -1 1 1 1 1 -1 -1 1 1 1 1 1 1 -1 1 1 1 -1 1 1 -1 1 1 1 1 1 1 1 1 1 1 1 1 1 -1 1 -1 1 1 1 1 1 1 -1 1 1 1 -1 1 1 1 -1 1 1 1 1 1 1 1 1 1 1 1 -1 1 1 -1 1 -1 1 1 1 1 1 1 1 -1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -1 1 1 -1]
In [17]:
# Outliers located in the plane, coloured by LOF label
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=y_lof,
    palette="tab10",
)
Out[17]:
<Axes: xlabel='x1', ylabel='x2'>
In [18]:
# LOF scores sorted in ascending order, plotted against their rank (1..n)
sns.lineplot(
    y=numpy.sort(lof.negative_outlier_factor_),
    x=numpy.arange(1, len(lof.negative_outlier_factor_) + 1),
)
Out[18]:
<Axes: >
In [19]:
# What if the cut-off is set to -2 (tentative value)?
y_lof_2 = numpy.where(
    lof.negative_outlier_factor_ < -2.0,  # strictly below the cut-off => outlier
    -1,
    1,
)
# Redraw the plane with the new labels
sns.scatterplot(data=df, x="x1", y="x2", hue=y_lof_2, palette="tab10")
Out[19]:
<Axes: xlabel='x1', ylabel='x2'>