Environnement et packages¶

In [125]:
# activer l'environnement
using Pkg
Pkg.activate("env_julia_transformers")
  Activating project at `c:\Users\ricco\Desktop\demo\env_julia_transformers`
In [126]:
# liste des packages installés
Pkg.status()
Status `C:\Users\ricco\Desktop\demo\env_julia_transformers\Project.toml`
  [324d7699] CategoricalArrays v1.1.1
  [a93c6f00] DataFrames v1.8.2
  [add582a8] MLJ v0.23.2
  [6ee0df7b] MLJLinearModels v0.10.1
  [21ca0261] Transformers v0.3.1
  [fdbf4ff8] XLSX v0.11.11

Modèle pré-entraîné pour calcul des embeddings¶

Utilisation brute¶

In [127]:
# outils pour chargement du modèle pré-entraîné
using Transformers
using Transformers.TextEncoders
using Transformers.HuggingFace

# Charge le tokenizer + le modèle pré-entraîné "sentence transformers -- all-MiniLM-L6-v2"
# on pouvait aussi décomposer en load_tokenizer et load_model
textencoder, model = hgf"sentence-transformers/all-MiniLM-L6-v2"
(TrfTextEncoder(
├─ TextTokenizer(MatchTokenization(WordPieceTokenization(bert_uncased_tokenizer, WordPiece(vocab_size = 30522, unk = [UNK], max_char = 100)), 5 patterns)),
├─ vocab = Vocab{String, SizedArray}(size = 30522, unk = [UNK], unki = 101),
├─ config = @NamedTuple{startsym::String, endsym::String, padsym::String, trunc::Union{Nothing, Int64}}(("[CLS]", "[SEP]", "[PAD]", 128)),
├─ annotate = annotate_strings,
├─ onehot = lookup_first,
├─ decode = nestedcall(remove_conti_prefix),
├─ textprocess = Pipelines(target[token] := join_text(source); target[token] := nestedcall(cleanup ∘ remove_prefix_space, target.token); target := (target.token)),
└─ process = Pipelines:
  ╰─ target[token] := TextEncodeBase.nestedcall(string_getvalue, source)
  ╰─ target[token] := Transformers.TextEncoders.grouping_sentence(target.token)
  ╰─ target[(token, segment)] := SequenceTemplate{String}([CLS]:<type=1> Input[1]:<type=1> [SEP]:<type=1> (Input[2]:<type=2> [SEP]:<type=2>)...)(target.token)
  ╰─ target[attention_mask] := (NeuralAttentionlib.LengthMask ∘ Transformers.TextEncoders.getlengths(128))(target.token)
  ╰─ target[token] := TextEncodeBase.trunc_or_pad(128, [PAD], head, tail)(target.token)
  ╰─ target[token] := TextEncodeBase.nested2batch(target.token)
  ╰─ target[segment] := TextEncodeBase.trunc_or_pad(128, 1, head, tail)(target.segment)
  ╰─ target[segment] := TextEncodeBase.nested2batch(target.segment)
  ╰─ target[sequence_mask] := identity(target.attention_mask)
  ╰─ target := (target.token, target.segment, target.attention_mask, target.sequence_mask)
), HGFBertModel(Chain(CompositeEmbedding(token = Embed(384, 30522), position = ApplyEmbed(.+, FixedLenPositionEmbed(384, 512)), segment = ApplyEmbed(.+, Embed(384, 2), Transformers.HuggingFace.bert_ones_like)), DropoutLayer<nothing>(LayerNorm(384, ϵ = 1.0e-12))), Transformer<6>(PostNormTransformerBlock(DropoutLayer<nothing>(SelfAttention(MultiheadQKVAttenOp(head = 12, p = nothing), Fork<3>(Dense(W = (384, 384), b = true)), Dense(W = (384, 384), b = true))), LayerNorm(384, ϵ = 1.0e-12), DropoutLayer<nothing>(Chain(Dense(σ = NNlib.gelu_tanh, W = (384, 1536), b = true), Dense(W = (1536, 384), b = true))), LayerNorm(384, ϵ = 1.0e-12))), Branch{(:pooled,) = (:hidden_state,)}(BertPooler(Dense(σ = NNlib.tanh_fast, W = (384, 384), b = true)))))
In [128]:
# essai sur la phrase (même que sous Python)
phrase = ["i check the vector for embedding"]
typeof(phrase)
Vector{String} (alias for Array{String, 1})
In [129]:
# tokénisation
sample = encode(textencoder, phrase)
sample.token
30522x128x1 OneHotArray{30522, 3, Matrix{OneHot{0x0000773a}}}:
[:, :, 1] =
 0  0  0  0  0  0  0  0  0  0  1  1  1  …  1  1  1  1  1  1  1  1  1  1  1  1
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 ⋮              ⋮              ⋮        ⋱              ⋮              ⋮     
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
In [130]:
# obtention des coordonnées internes du modèle
# via couche cachée
model(sample).hidden_state
384×128×1 Array{Float32, 3}:
[:, :, 1] =
  0.095327    0.087373   0.845984   …   0.125315     0.135887   -0.0639195
 -0.198384   -0.846359  -0.881889       1.47837      1.40749     1.65427
 -0.207963   -0.435514  -0.335268       0.472097     0.437874    0.477201
 -0.0213389  -0.321832  -0.642241       0.175378     0.212665    0.199589
  0.175494    0.068748   0.933504       0.594178     0.567057    0.503295
  0.518619   -0.38562   -0.502043   …  -0.982852    -0.970056   -1.02056
 -0.251166    0.566989   0.378429       0.107739     0.189489    0.19574
 -0.116912    0.25914   -0.53065       -0.250755    -0.248923   -0.0435614
 -0.135102   -0.297249   0.322535       0.424965     0.403099    0.420517
 -0.251474   -0.386571   0.468953      -0.24419     -0.243078   -0.292161
  ⋮                                 ⋱   ⋮                       
 -0.360721   -0.415961  -0.898789   …  -0.27447     -0.24457    -0.213064
 -0.0687004   0.515878  -0.0557876     -0.191391    -0.165465   -0.403164
  0.189018   -0.325677   0.584291       0.112856     0.0652386   0.264499
  0.354121    0.343456   0.0356015      0.355765     0.398299    0.47135
 -0.277821   -0.318273  -0.0353911     -0.350908    -0.361668   -0.40573
 -0.0701933   1.15144   -0.0883372  …   0.774171     0.865632    0.744496
  0.0714479   0.184165   0.578313       0.00153449  -0.0256515  -0.117905
 -0.109169   -0.378037   0.120577      -0.976198    -0.938467   -0.792349
 -0.0454283  -0.523763  -0.619906      -0.560275    -0.540927   -0.555701
In [131]:
# autre propriété pooled BERT natif
# bonne dimension mais ne correspond pas du tout
# au "sentence_transformers" de Python !!!
model(sample).pooled
384×1 Matrix{Float32}:
 -0.013239923
 -0.040951025
  0.061973903
  0.0042933896
 -0.029748408
 -0.050784208
  0.04984044
  0.041920543
 -0.10031746
  0.04938916
  ⋮
 -0.023819428
 -0.15426454
  0.0829849
 -0.0025867838
 -0.10589066
  0.023930637
 -0.017439567
 -0.06480052
  0.02757243

Réponse de "Claude.AI" Pour obtenir un résultat équivalent à sentence_transformers en Python avec all-MiniLM-L6-v2, il faut reproduire exactement ce que fait la librairie Python : un forward pass dans le modèle, puis un mean pooling tenant compte de l'attention mask, puis une normalisation L2. C'est précisément le point qui pose problème en général en Julia : si on se contente de model(encode(enc, sentences)), on récupère le champ pooled (sortie du pooler BERT, CLS + tanh), qui n'a rien à voir avec le mean pooling utilisé par sentence_transformers — d'où des vecteurs différents.

Le champ pooled correspond à la sortie classique du pooler BERT : token [CLS] → couche dense → tanh. Cette tête a été conçue à l'origine pour la tâche de Next Sentence Prediction du BERT original — elle n'a jamais été entraînée pour produire des vecteurs où "la distance reflète la similarité sémantique". Le papier fondateur de Sentence-BERT (Reimers & Gurevych, 2019) a justement montré que les embeddings [CLS]/pooled d'un BERT brut donnent de moins bons résultats sur des tâches de similarité sémantique (STS) que la simple moyenne de vecteurs GloVe — c'est précisément ce constat qui a motivé la création de l'architecture sentence-transformers basée sur le mean pooling.

Cas spécifique de all-MiniLM-L6-v2

C'est encore plus net pour ce modèle précis : il a été fine-tuné par contrastive learning (plus d'1 milliard de paires de phrases) en utilisant explicitement le mean pooling + normalisation L2 comme objectif. La tête pooler (dense+tanh) n'a jamais fait partie de ce pipeline d'entraînement — elle existe juste parce que c'est un artefact de l'architecture BERT sous-jacente exposée par Transformers.jl, mais elle n'a reçu aucun gradient lié à la tâche de similarité sémantique. Concrètement, pooled est quasiment du bruit non pertinent pour cette tâche.

Fonction pour post-traitement du vecteur de "Transformers.jl"¶

In [132]:
# fonction pour obtenir des coordonnées similaires à
# "sentence_transformers" sous Python

"""
    sentence_embeddings(textenc, model, sentences) -> Matrix

Compute one L2-normalised, mean-pooled embedding per sentence, reproducing the
post-processing applied by the Python `sentence_transformers` pipeline for this
kind of model (masked mean pooling followed by a Normalize step).

# Arguments
- `textenc`: text encoder passed to `encode`; its output must expose
  `attention_mask.len` (number of non-padded tokens per sentence).
- `model`: callable returning an object with a `hidden_state` array laid out as
  `(hidden_size, seq_len, batch)`.
- `sentences`: the sentences to embed.

# Returns
A `hidden_size × length(sentences)` matrix whose element type follows
`hidden_state`; each column is the embedding of the corresponding sentence.
"""
function sentence_embeddings(textenc, model, sentences::AbstractVector{<:AbstractString})
    sample = encode(textenc, sentences)
    H      = model(sample).hidden_state            # (hidden_size, seq_len, batch)
    lens   = Int.(sample.attention_mask.len)       # real (non-padded) length of each sentence

    hidden_size, _, batch = size(H)
    T   = eltype(H)
    emb = Matrix{T}(undef, hidden_size, batch)

    # Same guard as torch.nn.functional.normalize: never divide by a norm below eps,
    # so a degenerate all-zero vector stays zero instead of turning into NaN.
    min_norm = T(1e-12)

    for b in 1:batch
        L = lens[b]

        # 1) Mean pooling over the non-padded tokens only. The divisor is clamped
        #    to 1 so an (unexpected) empty sequence yields zeros rather than NaN.
        pooled = vec(sum(@view(H[:, 1:L, b]); dims = 2)) ./ max(L, 1)

        # 2) L2 normalisation of the pooled vector.
        emb[:, b] = pooled ./ max(sqrt(sum(abs2, pooled)), min_norm)
    end

    return emb   # (hidden_size, number_of_sentences)
end

# réappliquer pour obtenir les coordonnées
emb = sentence_embeddings(textencoder, model, phrase)
emb
384×1 Matrix{Float32}:
  0.025071865
 -0.07442233
 -0.036995046
 -0.04221145
  0.07871144
  0.028972728
 -0.023024926
 -0.012268753
 -0.007907801
 -0.037465557
  ⋮
 -0.043829016
  0.01390185
  0.009766516
 -0.0034230442
 -0.03548351
 -0.03971127
  0.02962076
  0.046208426
 -0.02786109

Classement via les embeddings de "all-MiniLM-L6-v2"¶

Importation du corpus étiqueté¶

In [133]:
# packages
import DataFrames as DFR
import XLSX

# lecture des données
df = DFR.DataFrame(XLSX.readtable("./reuters_r8.xlsx"))

# premières lignes
println(DFR.first(df,5))
5×2 DataFrame
 Row │ classe  texte                             
     │ String  String                            
─────┼───────────────────────────────────────────
   1 │ trade   asian exporters fear damage from…
   2 │ grain   china daily says vermin eat pct …
   3 │ ship    australian foreign ship ban ends…
   4 │ acq     sumitomo bank aims at quick reco…
   5 │ earn    amatil proposes two for five bon…
In [134]:
# dimensions
size(df)
(7674, 2)

Calcul des embeddings des documents¶

In [135]:
# documents
docs = vec(df.texte)
typeof(docs)
Vector{String} (alias for Array{String, 1})
In [ ]:
# encodage des documents
# attention !!! prend 20 mn sur ma machine !!!
# je l'ai fait avant, je ne le refais pas pour la vidéo
#=
embeddings = sentence_embeddings(textencoder, model, docs)
size(embeddings)
=#
In [ ]:
# sauvegarde de la matrice lors d'une exécution préalable
#=
using Serialization
serialize("matrice_embeddings.jls",embeddings)
=#
In [136]:
# chargement de la matrice
# à partir d'une exécution préalable
using Serialization
embeddings = deserialize("./matrice_embeddings.jls")
println(typeof(embeddings))
println(size(embeddings))
Matrix{Float32}
(384, 7674)

Préparation pour la modélisation prédictive¶

In [137]:
# transposition et transformation en matrice
M = Matrix(embeddings')
size(M)
(7674, 384)
In [138]:
# puis transformation en data frame
X = DFR.DataFrame(M,:auto)
names(X)[1:10]
10-element Vector{String}:
 "x1"
 "x2"
 "x3"
 "x4"
 "x5"
 "x6"
 "x7"
 "x8"
 "x9"
 "x10"
In [139]:
# vérif.
X[1:10,1:10]
10×10 DataFrame
Rowx1x2x3x4x5x6x7x8x9x10
Float32Float32Float32Float32Float32Float32Float32Float32Float32Float32
1-0.1036240.01565260.03887680.004335310.06587490.04253350.0299217-0.0200362-0.1088430.0229752
2-0.02452360.04860710.06550010.004149570.0763108-0.005603930.0008850470.0324044-0.0583756-0.0309842
3-0.0456034-0.02328550.02844340.05103980.04713980.08785920.0151371-0.0722738-0.1560250.0424992
4-0.0185516-0.0324097-0.0179149-0.0283303-0.105191-0.00128882-0.04700070.037133-0.0458648-0.0255888
5-0.105916-0.07252890.02751440.0119789-0.01339190.03842440.08300090.02570490.02711060.0359904
6-0.00921877-0.1007630.0169911-0.0521175-0.04721640.06886160.03215480.04511050.0470212-0.0321568
7-0.104498-0.0634382-0.02705310.00543218-0.00212139-0.02364710.0523030.0357979-0.02356790.0369836
80.0132485-0.150037-0.0574905-0.003815380.007715490.07022220.05226140.06836780.000165996-0.0598484
9-0.00442133-0.01751260.03486690.004627890.0666993-0.0490327-0.06161070.0591907-0.00277557-0.0943933
10-0.07868870.009160310.0264461-0.004683480.0756982-0.00744083-0.06871830.0684668-0.0109314-0.0754226
In [140]:
# variable cible -> encodage en type "factor"
import CategoricalArrays as CA
y = CA.categorical(df.classe)

# liste des modalités
CA.levels(y)
8-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "acq"
 "crude"
 "earn"
 "grain"
 "interest"
 "money_fx"
 "ship"
 "trade"

Index pour TRAIN/TEST¶

In [141]:
# identifiants pour train/test
import MLJ
idTrain, idTest = MLJ.partition(1:DFR.nrow(df),0.6,shuffle=true,stratify=y,rng=42)

# dimension
println(size(idTrain))
println(size(idTest))
(4605,)
(3069,)

Régression Logistique Multinomiale (Lasso)¶

In [142]:
# importer la régression logistique
# à partir du module MLJLinearModels
LogisticClassifier = @MLJ.load MultinomialClassifier pkg=MLJLinearModels
import MLJLinearModels ✔
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\ricco\.julia\packages\MLJModels\AWkxi\src\loading.jl:159
MLJLinearModels.MultinomialClassifier
In [143]:
# instantiate the classifier, requesting an L1 (lasso) penalty
# NOTE(review): the machine summary printed for this cell reports
# lambda = 2.220446049250313e-16, i.e. `gamma = 0.1` does not appear to set the
# L1 strength here. With `penalty = :l1` the strength is presumably carried by
# `lambda` (gamma looks to be the L1 weight for `:en` only) — confirm against
# the MLJLinearModels documentation; switching to `lambda` needs re-tuning.
modele = LogisticClassifier(penalty = :l1, gamma = 0.1)

# machine binding the model to the data structures (features X, target y)
mach = MLJ.machine(modele,X,y)
untrained Machine; caches model-specific representations of data
  model: MultinomialClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @122 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @968 ⏎ AbstractVector{ScientificTypesBase.Multiclass{8}}

Entraînement du modèle¶

In [144]:
# entraînement sur l'échantillon d'apprentissage
# rows = identifiants des individus TRAIN
MLJ.fit!(mach,rows=idTrain)
┌ Info: Training machine(MultinomialClassifier(lambda = 2.220446049250313e-16, …), …).
└ @ MLJBase C:\Users\ricco\.julia\packages\MLJBase\DCbte\src\machines.jl:499
┌ Info: Solver: MLJLinearModels.ProxGrad
│   accel: Bool true
│   max_iter: Int64 1000
│   tol: Float64 0.0001
│   max_inner: Int64 100
│   beta: Float64 0.8
│   gram: Bool false
└ @ MLJLinearModels C:\Users\ricco\.julia\packages\MLJLinearModels\s9vSj\src\mlj\interface.jl:72
┌ Warning: Proximal GD did not converge in 1000 iterations.
└ @ MLJLinearModels C:\Users\ricco\.julia\packages\MLJLinearModels\s9vSj\src\fit\proxgrad.jl:73
trained Machine; caches model-specific representations of data
  model: MultinomialClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @122 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @968 ⏎ AbstractVector{ScientificTypesBase.Multiclass{8}}

Inspection des coefficients¶

In [145]:
# coefficients
fp = MLJ.fitted_params(mach)
fp.coefs
384-element Vector{Pair{Symbol, SubArray{Float64, 1, Matrix{Float64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}}}:
   :x1 => [10.371185610727196, -7.76590126272795, -7.227128135919941, 0.6500188615708384, 1.4501180355954402, 3.5432782665254425, 1.852696732298626, -2.8742681081892982]
   :x2 => [5.451844119625546, 4.296026752097511, -12.997448799269703, 2.437692639517172, -5.505962061082315, 2.9941051216165167, -1.2157870545343874, 4.539529281854829]
   :x3 => [-5.282674425596243, 0.577069294325339, 0.6544926267158487, 1.607373740798359, 1.5708030809247564, -0.6950678765689661, 0.40456743766350706, 1.163436121586802]
   :x4 => [0.0790710470430111, 1.4255216002066347, -1.8004093857934809, 2.5207480300490177, -0.041685148962272024, 2.033280692010929, 1.4264533374489858, -5.642980172130398]
   :x5 => [1.109206031841218, 0.8794228754986335, -3.5816836368327714, 2.5150847391521483, 1.8139530953572787, 1.7967051501778857, -4.883902529311217, 0.35121427393235205]
   :x6 => [-0.2830933759281726, -1.3964462998532547, -2.0810818803000983, -2.1415560403535503, -0.42652048990083286, -2.4443153462484584, 2.605360331472206, 6.167653101325305]
   :x7 => [10.912095013213888, -7.995625898214512, 2.079108074665601, -2.1515496525097966, -2.1996529334353365, -1.9380650775833208, -3.4616685801469647, 4.755359054114365]
   :x8 => [-3.92268732989323, -1.6068823724949646, 7.830227567091169, -0.09105436465146126, 0.4528721816278729, 1.505155454598647, -1.0780931876454694, -3.0895379485517633]
   :x9 => [3.5960598532862824, -2.984710384508376, 11.100390493499932, -1.1530877392695267, 3.142715795009573, -3.360681172399022, -3.444592768922927, -6.896094076528521]
  :x10 => [-7.4852547337050295, -4.903940304726245, 0.8383543997396714, 1.2733389973264853, 4.73399656073897, 3.214419011653537, -0.1956155379205403, 2.5247016067985193]
       ⋮
 :x376 => [4.166398440735918, -2.0457260267507094, 0.8939298190887245, 0.537162928522513, -1.1303391256454762, -2.5479780069189397, 1.2723219468952434, -1.1457699758997282]
 :x377 => [10.457616064814637, -2.974420641185583, -4.576356558787065, 0.7993748054762335, -1.238013077402656, -3.1384674734109286, -0.3089586031651469, 0.9792254837380918]
 :x378 => [-0.6085954280718746, -1.781335717235709, 7.536585627753868, -3.5217077642985495, -0.7380978022001353, 3.630192634319697, -0.4623783081100805, -4.054663241993461]
 :x379 => [-4.523839466608446, -8.19823182418464, 4.207702393265039, 2.658001788507651, -2.650858659321829, 2.5264869588573307, 1.4234134215190515, 4.55732538788269]
 :x380 => [-6.480285511813314, -1.7092339393429068, 0.6717116527981714, -1.4326278360652693, 2.1403764167948722, 0.11914411036252312, 4.950694339176468, 1.7402207680633386]
 :x381 => [-2.4397940047686197, -1.8563447950500536, 4.241839409330676, 0.39819802969146517, 3.7066833340987575, -0.9992775056024837, 0.8468093878399208, -3.8981138556441]
 :x382 => [-1.1177757135942883, -1.6689846685339214, -3.2593454198741987, -0.5285005689819081, 3.9098508853787153, 0.4138043421955925, -0.49350431499305847, 2.7444554583465557]
 :x383 => [-7.005011403672204, 0.02812821837979645, 12.124863866735398, -0.7368495200781617, -3.2922332633168496, 3.205976859521862, -1.8936481414429278, -2.4312266158655436]
 :x384 => [-2.2604058756686918, 1.6415386732122303, 6.551482842205098, -0.012412255876620058, -1.686858852733802, -3.2960450104939403, 2.547660285118923, -3.4849598056326]
In [146]:
# intercept
fp.intercept
8-element Vector{Float64}:
  4.232664236188939
 -0.9195098637225952
  1.9493370782700128
 -1.236858267785365
 -3.5506868552506945
 -0.054243113947955744
  1.7293537881941046
 -2.15005700191133

Evaluation en test¶

In [147]:
# prédiction en test
pred = MLJ.predict_mode(mach,rows=idTest)

# premières valeurs
pred[1:10]
10-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "grain"
 "grain"
 "grain"
 "grain"
 "crude"
 "ship"
 "earn"
 "earn"
 "earn"
 "money_fx"
In [148]:
# matrice de confusion
mc = MLJ.confusion_matrix(pred,y[idTest])
mc
8×8 Matrix{Int64}:
 848    7    50   0   3    1   4    6
   4  135     9   0   1    0   3    1
  59    1  1507   3   3    1   0    1
   0    0     0  14   0    0   0    0
   1    1     0   1  82   12   0    0
   1    1     3   2  18  101   1    5
   2    5     0   0   0    0  47    0
   2    0     0   0   1    2   3  117
In [149]:
# accuracy
MLJ.accuracy(pred,y[idTest])
0.9289670902574129