Importation - Inspection des données¶

In [1]:
# scikit-learn version used to run this notebook
import sklearn
sklearn.__version__
Out[1]:
'1.8.0'
In [2]:
# switch to the working directory that contains the data file
# the location can be overridden with the DEMO_DIR environment variable so the
# notebook is not tied to a single machine; the original path stays the default
import os
os.chdir(os.environ.get("DEMO_DIR", "C:/Users/ricco/Desktop/demo"))
In [3]:
# load the dataset from the Excel workbook in the working directory
import pandas

data_file = "dataset_2D_outliers.xlsx"
df = pandas.read_excel(data_file)

# column names, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      100 non-null    float64
 1   x2      100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB
In [4]:
# plot the observations in the (x1, x2) plane
import seaborn as sns

sns.scatterplot(data=df, x="x1", y="x2")
Out[4]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image

Isolation Forest¶

In [5]:
# estimator class
from sklearn.ensemble import IsolationForest

# train the isolation forest
# random_state is pinned because the trees are built from random draws: without a
# seed, the scores below (and any threshold tuned by hand from them) would change
# on every Restart & Run All
isof = IsolationForest(n_estimators=500, random_state=0)
isof.fit(df)

# anomaly score of each observation (the lower the score, the more atypical)
scores = isof.score_samples(df)
print(scores)
[-0.43728389 -0.55150171 -0.42289236 -0.45110975 -0.47928923 -0.4302279
 -0.51380213 -0.52160927 -0.4107846  -0.42829497 -0.4508745  -0.43831545
 -0.42389577 -0.44358852 -0.64535837 -0.50844977 -0.4272569  -0.50380055
 -0.50201508 -0.46840255 -0.43891585 -0.55495039 -0.44380509 -0.43957735
 -0.41281843 -0.43469886 -0.40674242 -0.46257965 -0.45804569 -0.56481615
 -0.42289995 -0.42327535 -0.42926826 -0.41515384 -0.44233831 -0.48858484
 -0.45665153 -0.66086538 -0.44773115 -0.41696702 -0.43741305 -0.43292019
 -0.47831899 -0.46277304 -0.55187263 -0.40887505 -0.42486348 -0.4868043
 -0.49465577 -0.45569019 -0.44334118 -0.596103   -0.51778949 -0.47880003
 -0.46809729 -0.42249406 -0.49910562 -0.42068103 -0.44502062 -0.43671221
 -0.40260279 -0.46059507 -0.41932875 -0.42320257 -0.52124103 -0.46575884
 -0.47441931 -0.80198915 -0.42330452 -0.5233635  -0.41443457 -0.51377476
 -0.49097946 -0.43420901 -0.4579023  -0.4959781  -0.49272213 -0.51003888
 -0.40724706 -0.44712284 -0.41353202 -0.43261831 -0.4347582  -0.4395739
 -0.47641239 -0.42667045 -0.401202   -0.41792748 -0.43470576 -0.40726207
 -0.40812998 -0.42434901 -0.44181381 -0.4143062  -0.41990002 -0.44015189
 -0.56825932 -0.43859455 -0.51368141 -0.5747752 ]
In [6]:
# degree of anomaly of each point in the plane, shown as a color gradient
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=scores,
    palette="viridis",
)
Out[6]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image
In [7]:
# sorted isolation-forest scores plotted against their rank (1..n)
import numpy

n_points = scores.shape[0]
sns.lineplot(x=numpy.arange(1, n_points + 1), y=numpy.sort(scores))
Out[7]:
<Axes: >
No description has been provided for this image
In [8]:
# default threshold value stored on the fitted isolation forest
isof.offset_
Out[8]:
-0.5
In [9]:
# scores shifted by the threshold (raw score minus offset_)
# negative => outlier, positive => inlier
isof.decision_function(df)
Out[9]:
array([ 0.06271611, -0.05150171,  0.07710764,  0.04889025,  0.02071077,
        0.0697721 , -0.01380213, -0.02160927,  0.0892154 ,  0.07170503,
        0.0491255 ,  0.06168455,  0.07610423,  0.05641148, -0.14535837,
       -0.00844977,  0.0727431 , -0.00380055, -0.00201508,  0.03159745,
        0.06108415, -0.05495039,  0.05619491,  0.06042265,  0.08718157,
        0.06530114,  0.09325758,  0.03742035,  0.04195431, -0.06481615,
        0.07710005,  0.07672465,  0.07073174,  0.08484616,  0.05766169,
        0.01141516,  0.04334847, -0.16086538,  0.05226885,  0.08303298,
        0.06258695,  0.06707981,  0.02168101,  0.03722696, -0.05187263,
        0.09112495,  0.07513652,  0.0131957 ,  0.00534423,  0.04430981,
        0.05665882, -0.096103  , -0.01778949,  0.02119997,  0.03190271,
        0.07750594,  0.00089438,  0.07931897,  0.05497938,  0.06328779,
        0.09739721,  0.03940493,  0.08067125,  0.07679743, -0.02124103,
        0.03424116,  0.02558069, -0.30198915,  0.07669548, -0.0233635 ,
        0.08556543, -0.01377476,  0.00902054,  0.06579099,  0.0420977 ,
        0.0040219 ,  0.00727787, -0.01003888,  0.09275294,  0.05287716,
        0.08646798,  0.06738169,  0.0652418 ,  0.0604261 ,  0.02358761,
        0.07332955,  0.098798  ,  0.08207252,  0.06529424,  0.09273793,
        0.09187002,  0.07565099,  0.05818619,  0.0856938 ,  0.08009998,
        0.05984811, -0.06825932,  0.06140545, -0.01368141, -0.0747752 ])
In [10]:
# alternatively, predict() => -1 for an outlier, +1 for an inlier
y_iso = isof.predict(df)
print(y_iso)
[ 1 -1  1  1  1  1 -1 -1  1  1  1  1  1  1 -1 -1  1 -1 -1  1  1 -1  1  1
  1  1  1  1  1 -1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1
  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1 -1  1 -1  1 -1
  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
 -1  1 -1 -1]
In [11]:
# location of the flagged outliers in the plane
# labels come from the default threshold, which is not satisfactory here
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=y_iso,
    palette="tab10",
)
Out[11]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image
In [12]:
# threshold chosen by trial and error (tentative value)
iso_threshold = -0.57

# -1 where the raw score falls below the threshold, +1 otherwise
below_threshold = isof.score_samples(df) < iso_threshold
y_iso_2 = numpy.where(below_threshold, -1, 1)

# redraw the plot with the new labels
sns.scatterplot(data=df, x="x1", y="x2", hue=y_iso_2, palette="tab10")
Out[12]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image

Local Outlier Factor¶

In [13]:
# estimator class
from sklearn.neighbors import LocalOutlierFactor

# fit() returns the estimator itself, so construction and fitting are chained
lof = LocalOutlierFactor(n_neighbors=5).fit(df)

# anomaly scores (negative values)
# a large absolute value => outlier
print(lof.negative_outlier_factor_)
[-1.04608274 -1.69257026 -1.11813672 -0.9690844  -1.43343698 -0.96375951
 -2.19721377 -1.79444413 -0.94487364 -1.11430717 -1.18364131 -0.9903625
 -1.0579283  -1.15025357 -4.2085264  -1.3821889  -0.98014108 -1.15531269
 -1.57238353 -1.19450888 -1.15571224 -2.07888153 -1.19004651 -0.96429186
 -1.03393334 -1.09894885 -1.01804325 -1.29357982 -1.23629537 -1.28653655
 -1.19337467 -0.99497497 -1.09922916 -1.02077956 -1.18729471 -1.51809913
 -1.16679406 -2.68036148 -0.95767284 -0.99504931 -1.09818903 -1.14402537
 -1.13362099 -1.02112053 -1.77664714 -1.00763071 -1.18361056 -1.4443283
 -1.99726922 -1.08809832 -1.02606878 -1.46959988 -1.53287686 -1.23964087
 -1.22438247 -0.94209207 -1.17282471 -0.96304352 -1.17864837 -0.9871998
 -0.92175695 -1.14890903 -1.01336198 -0.99410605 -1.84745407 -1.2450533
 -1.31219323 -9.02703915 -1.06854442 -1.52540007 -1.00912782 -1.41736843
 -1.47648814 -0.95450718 -1.14609758 -1.22017984 -1.40514416 -1.75658972
 -0.99860746 -1.01482402 -0.90847344 -1.15449827 -0.99405383 -1.16464754
 -1.12401074 -0.95904145 -0.93617898 -1.04917665 -1.09697468 -0.97206601
 -0.96748459 -1.0258839  -1.14227194 -0.89962347 -0.96305421 -1.03923319
 -1.52915086 -1.1424895  -1.15531269 -2.20815446]
In [14]:
# degree of anomaly of each point in the plane, shown as a color gradient
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=lof.negative_outlier_factor_,
    palette="viridis",
)
Out[14]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image
In [15]:
# threshold value stored on the fitted LOF model
lof.offset_
Out[15]:
-1.5
In [16]:
# atypical points: -1 for an outlier, +1 for an inlier
# note: predict() is not used here because it is reserved for novelty detection
# comparing negative_outlier_factor_ to the threshold would work as well
y_lof = lof.fit_predict(df)
print(y_lof)
[ 1 -1  1  1  1  1 -1 -1  1  1  1  1  1  1 -1  1  1  1 -1  1  1 -1  1  1
  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1  1  1  1  1 -1  1  1  1
 -1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1 -1  1 -1  1  1
  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
 -1  1  1 -1]
In [17]:
# outliers flagged by LOF, shown in the plane
sns.scatterplot(
    data=df,
    x="x1",
    y="x2",
    hue=y_lof,
    palette="tab10",
)
Out[17]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image
In [18]:
# sorted LOF scores plotted against their rank (1..n)
lof_scores_sorted = numpy.sort(lof.negative_outlier_factor_)
sns.lineplot(
    x=numpy.arange(1, lof_scores_sorted.shape[0] + 1),
    y=lof_scores_sorted,
)
Out[18]:
<Axes: >
No description has been provided for this image
In [19]:
# alternative threshold set by hand to -2 (tentative value)
lof_threshold = -2.0

# -1 where the LOF score falls below the threshold, +1 otherwise
below_lof_threshold = lof.negative_outlier_factor_ < lof_threshold
y_lof_2 = numpy.where(below_lof_threshold, -1, 1)

# redraw the plot with the new labels
sns.scatterplot(data=df, x="x1", y="x2", hue=y_lof_2, palette="tab10")
Out[19]:
<Axes: xlabel='x1', ylabel='x2'>
No description has been provided for this image