Environnement et packages¶
# activer l'environnement
using Pkg
Pkg.activate("env_julia_transformers")
Activating project at `c:\Users\ricco\Desktop\demo\env_julia_transformers`
# liste des packages installés
Pkg.status()
Status `C:\Users\ricco\Desktop\demo\env_julia_transformers\Project.toml` [324d7699] CategoricalArrays v1.1.1 [a93c6f00] DataFrames v1.8.2 [add582a8] MLJ v0.23.2 [6ee0df7b] MLJLinearModels v0.10.1 [21ca0261] Transformers v0.3.1 [fdbf4ff8] XLSX v0.11.11
Modèle pré-entraîné pour calcul des embeddings¶
Utilisation brute¶
# outils pour chargement du modèle pré-entraîné
using Transformers
using Transformers.TextEncoders
using Transformers.HuggingFace
# Charge le tokenizer + le modèle pré-entraîné "sentence transformers -- all-MiniLM-L6-v2"
# on pouvait aussi décomposer en load_tokenizer et load_model
textencoder, model = hgf"sentence-transformers/all-MiniLM-L6-v2"
(TrfTextEncoder(
├─ TextTokenizer(MatchTokenization(WordPieceTokenization(bert_uncased_tokenizer, WordPiece(vocab_size = 30522, unk = [UNK], max_char = 100)), 5 patterns)),
├─ vocab = Vocab{String, SizedArray}(size = 30522, unk = [UNK], unki = 101),
├─ config = @NamedTuple{startsym::String, endsym::String, padsym::String, trunc::Union{Nothing, Int64}}(("[CLS]", "[SEP]", "[PAD]", 128)),
├─ annotate = annotate_strings,
├─ onehot = lookup_first,
├─ decode = nestedcall(remove_conti_prefix),
├─ textprocess = Pipelines(target[token] := join_text(source); target[token] := nestedcall(cleanup ∘ remove_prefix_space, target.token); target := (target.token)),
└─ process = Pipelines:
╰─ target[token] := TextEncodeBase.nestedcall(string_getvalue, source)
╰─ target[token] := Transformers.TextEncoders.grouping_sentence(target.token)
╰─ target[(token, segment)] := SequenceTemplate{String}([CLS]:<type=1> Input[1]:<type=1> [SEP]:<type=1> (Input[2]:<type=2> [SEP]:<type=2>)...)(target.token)
╰─ target[attention_mask] := (NeuralAttentionlib.LengthMask ∘ Transformers.TextEncoders.getlengths(128))(target.token)
╰─ target[token] := TextEncodeBase.trunc_or_pad(128, [PAD], head, tail)(target.token)
╰─ target[token] := TextEncodeBase.nested2batch(target.token)
╰─ target[segment] := TextEncodeBase.trunc_or_pad(128, 1, head, tail)(target.segment)
╰─ target[segment] := TextEncodeBase.nested2batch(target.segment)
╰─ target[sequence_mask] := identity(target.attention_mask)
╰─ target := (target.token, target.segment, target.attention_mask, target.sequence_mask)
), HGFBertModel(Chain(CompositeEmbedding(token = Embed(384, 30522), position = ApplyEmbed(.+, FixedLenPositionEmbed(384, 512)), segment = ApplyEmbed(.+, Embed(384, 2), Transformers.HuggingFace.bert_ones_like)), DropoutLayer<nothing>(LayerNorm(384, ϵ = 1.0e-12))), Transformer<6>(PostNormTransformerBlock(DropoutLayer<nothing>(SelfAttention(MultiheadQKVAttenOp(head = 12, p = nothing), Fork<3>(Dense(W = (384, 384), b = true)), Dense(W = (384, 384), b = true))), LayerNorm(384, ϵ = 1.0e-12), DropoutLayer<nothing>(Chain(Dense(σ = NNlib.gelu_tanh, W = (384, 1536), b = true), Dense(W = (1536, 384), b = true))), LayerNorm(384, ϵ = 1.0e-12))), Branch{(:pooled,) = (:hidden_state,)}(BertPooler(Dense(σ = NNlib.tanh_fast, W = (384, 384), b = true)))))
# essai sur la phrase (même que sous Python)
phrase = ["i check the vector for embedding"]
typeof(phrase)
Vector{String} (alias for Array{String, 1})
# tokénisation
sample = encode(textencoder, phrase)
sample.token
30522x128x1 OneHotArray{30522, 3, Matrix{OneHot{0x0000773a}}}:
[:, :, 1] =
0 0 0 0 0 0 0 0 0 0 1 1 1 … 1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
⋮ ⋮ ⋮ ⋱ ⋮ ⋮
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# obtention des coordonnées internes du modèle
# via couche cachée
model(sample).hidden_state
384×128×1 Array{Float32, 3}:
[:, :, 1] =
0.095327 0.087373 0.845984 … 0.125315 0.135887 -0.0639195
-0.198384 -0.846359 -0.881889 1.47837 1.40749 1.65427
-0.207963 -0.435514 -0.335268 0.472097 0.437874 0.477201
-0.0213389 -0.321832 -0.642241 0.175378 0.212665 0.199589
0.175494 0.068748 0.933504 0.594178 0.567057 0.503295
0.518619 -0.38562 -0.502043 … -0.982852 -0.970056 -1.02056
-0.251166 0.566989 0.378429 0.107739 0.189489 0.19574
-0.116912 0.25914 -0.53065 -0.250755 -0.248923 -0.0435614
-0.135102 -0.297249 0.322535 0.424965 0.403099 0.420517
-0.251474 -0.386571 0.468953 -0.24419 -0.243078 -0.292161
⋮ ⋱ ⋮
-0.360721 -0.415961 -0.898789 … -0.27447 -0.24457 -0.213064
-0.0687004 0.515878 -0.0557876 -0.191391 -0.165465 -0.403164
0.189018 -0.325677 0.584291 0.112856 0.0652386 0.264499
0.354121 0.343456 0.0356015 0.355765 0.398299 0.47135
-0.277821 -0.318273 -0.0353911 -0.350908 -0.361668 -0.40573
-0.0701933 1.15144 -0.0883372 … 0.774171 0.865632 0.744496
0.0714479 0.184165 0.578313 0.00153449 -0.0256515 -0.117905
-0.109169 -0.378037 0.120577 -0.976198 -0.938467 -0.792349
-0.0454283 -0.523763 -0.619906 -0.560275 -0.540927 -0.555701
# autre propriété pooled BERT natif
# bonne dimension mais ne correspond pas du tout
# au "sentence_transformers" de Python !!!
model(sample).pooled
384×1 Matrix{Float32}:
-0.013239923
-0.040951025
0.061973903
0.0042933896
-0.029748408
-0.050784208
0.04984044
0.041920543
-0.10031746
0.04938916
⋮
-0.023819428
-0.15426454
0.0829849
-0.0025867838
-0.10589066
0.023930637
-0.017439567
-0.06480052
0.02757243
Réponse de "Claude.AI" Pour obtenir un résultat équivalent à sentence_transformers en Python avec all-MiniLM-L6-v2, il faut reproduire exactement ce que fait la librairie Python : un forward pass dans le modèle, puis un mean pooling tenant compte de l'attention mask, puis une normalisation L2. C'est précisément le point qui pose problème en général en Julia : si on se contente de model(encode(enc, sentences)), on récupère le champ pooled (sortie du pooler BERT, CLS + tanh), qui n'a rien à voir avec le mean pooling utilisé par sentence_transformers — d'où des vecteurs différents.
Le champ pooled correspond à la sortie classique du pooler BERT : token [CLS] → couche dense → tanh. Cette tête a été conçue à l'origine pour la tâche de Next Sentence Prediction du BERT original — elle n'a jamais été entraînée pour produire des vecteurs où "la distance reflète la similarité sémantique". Le papier fondateur de Sentence-BERT (Reimers & Gurevych, 2019) a justement montré que les embeddings [CLS]/pooled d'un BERT brut donnent de moins bons résultats sur des tâches de similarité sémantique (STS) que la simple moyenne de vecteurs GloVe — c'est précisément ce constat qui a motivé la création de l'architecture sentence-transformers basée sur le mean pooling.
Cas spécifique de all-MiniLM-L6-v2
C'est encore plus net pour ce modèle précis : il a été fine-tuné par contrastive learning (plus d'1 milliard de paires de phrases) en utilisant explicitement le mean pooling + normalisation L2 comme objectif. La tête pooler (dense+tanh) n'a jamais fait partie de ce pipeline d'entraînement — elle existe juste parce que c'est un artefact de l'architecture BERT sous-jacente exposée par Transformers.jl, mais elle n'a reçu aucun gradient lié à la tâche de similarité sémantique. Concrètement, pooled est quasiment du bruit non pertinent pour cette tâche.
Fonction pour post-traitement du vecteur de "Transformers.jl"¶
"""
    sentence_embeddings(textenc, model, sentences; batch_size = nothing)

Compute one sentence embedding per element of `sentences`, following the recipe the
Python `sentence_transformers` library applies for this model: forward pass, mean
pooling restricted to the non-padded tokens, then L2 normalisation.

# Arguments
- `textenc`: text encoder handed to `encode(textenc, sentences)`. The encoded sample
  must expose `attention_mask.len`, the number of real (non-padding) tokens of each
  sentence.
- `model`: callable such that `model(sample).hidden_state` is a
  `(hidden_size, seq_len, batch)` array.
- `sentences::Vector{<:AbstractString}`: the texts to embed.

# Keywords
- `batch_size`: `nothing` (default) encodes all sentences in a single forward pass.
  A positive integer processes the sentences in consecutive chunks of at most that
  size and concatenates the results, which bounds the size of the intermediate
  hidden-state array on large corpora.

# Returns
A `(hidden_size, length(sentences))` matrix with the element type of the hidden
state. Each column has unit L2 norm, except columns whose pooled vector is exactly
zero, which are returned as zeros rather than `NaN`.

# Throws
- `ArgumentError` if `batch_size` is given and is not positive.
"""
function sentence_embeddings(
    textenc, model, sentences::Vector{<:AbstractString};
    batch_size::Union{Nothing,Integer} = nothing,
)
    batch_size === nothing && return _embed_batch(textenc, model, sentences)
    batch_size > 0 ||
        throw(ArgumentError("batch_size must be positive, got $batch_size"))
    # Nothing to split (also covers an empty input): keep the single-pass behaviour.
    length(sentences) <= batch_size && return _embed_batch(textenc, model, sentences)
    chunks = Iterators.partition(sentences, batch_size)
    # `collect` turns each chunk (a view) back into a plain Vector for `encode`.
    parts = [_embed_batch(textenc, model, collect(chunk)) for chunk in chunks]
    return reduce(hcat, parts)
end

# Encode one batch, run the model and return its pooled, normalised embeddings.
function _embed_batch(textenc, model, sentences)
    sample = encode(textenc, sentences)
    H = model(sample).hidden_state          # (hidden_size, seq_len, batch)
    lens = Int.(sample.attention_mask.len)  # real length (padding excluded) per sentence
    return _l2_normalize!(_masked_mean_pool(H, lens))
end

# Average the first `lens[b]` token vectors of every batch element `b` of `H`.
function _masked_mean_pool(H, lens)
    hidden_size, _, batch = size(H)
    # Zero-initialised so that a sentence with no real token yields a zero column.
    emb = zeros(eltype(H), hidden_size, batch)
    @views for b in 1:batch
        L = lens[b]
        L > 0 || continue
        emb[:, b] .= vec(sum(H[:, 1:L, b]; dims = 2)) ./ L
    end
    return emb
end

# Scale every column of `emb` to unit L2 norm, in place.
function _l2_normalize!(emb::AbstractMatrix)
    for col in eachcol(emb)
        nrm = sqrt(sum(abs2, col))
        # A zero vector has no direction: leave it untouched instead of producing NaN.
        nrm > 0 && (col ./= nrm)
    end
    return emb
end
# réappliquer pour obtenir les coordonnées
emb = sentence_embeddings(textencoder, model, phrase)
emb
384×1 Matrix{Float32}:
0.025071865
-0.07442233
-0.036995046
-0.04221145
0.07871144
0.028972728
-0.023024926
-0.012268753
-0.007907801
-0.037465557
⋮
-0.043829016
0.01390185
0.009766516
-0.0034230442
-0.03548351
-0.03971127
0.02962076
0.046208426
-0.02786109
Classement via les embeddings de "all-MiniLM-L6-v2"¶
Importation du corpus étiqueté¶
# packages
import DataFrames as DFR
import XLSX
# lecture des données
df = DFR.DataFrame(XLSX.readtable("./reuters_r8.xlsx"))
# premières lignes
println(DFR.first(df,5))
5×2 DataFrame Row │ classe texte │ String String ─────┼─────────────────────────────────────────── 1 │ trade asian exporters fear damage from… 2 │ grain china daily says vermin eat pct … 3 │ ship australian foreign ship ban ends… 4 │ acq sumitomo bank aims at quick reco… 5 │ earn amatil proposes two for five bon…
# dimensions
size(df)
(7674, 2)
Calcul des embeddings des documents¶
# documents
docs = vec(df.texte)
typeof(docs)
Vector{String} (alias for Array{String, 1})
# encodage des documents
# attention !!! prend 20 mn sur ma machine !!!
# je l'ai fait avant, je ne le refais pas pour la vidéo
#=
embeddings = sentence_embeddings(textencoder, model, docs)
size(embeddings)
=#
# sauvegarde de la matrice lors d'une exécution préalable
#=
using Serialization
serialize("matrice_embeddings.jls",embeddings)
=#
# chargement de la matrice
# à partir d'une exécution préalable
using Serialization
embeddings = deserialize("./matrice_embeddings.jls")
println(typeof(embeddings))
println(size(embeddings))
Matrix{Float32}
(384, 7674)
Préparation pour la modélisation prédictive¶
# transposition et transformation en matrice
M = Matrix(embeddings')
size(M)
(7674, 384)
# puis transformation en data frame
X = DFR.DataFrame(M,:auto)
names(X)[1:10]
10-element Vector{String}:
"x1"
"x2"
"x3"
"x4"
"x5"
"x6"
"x7"
"x8"
"x9"
"x10"
# vérif.
X[1:10,1:10]
| Row | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 |
|---|---|---|---|---|---|---|---|---|---|---|
| Float32 | Float32 | Float32 | Float32 | Float32 | Float32 | Float32 | Float32 | Float32 | Float32 | |
| 1 | -0.103624 | 0.0156526 | 0.0388768 | 0.00433531 | 0.0658749 | 0.0425335 | 0.0299217 | -0.0200362 | -0.108843 | 0.0229752 |
| 2 | -0.0245236 | 0.0486071 | 0.0655001 | 0.00414957 | 0.0763108 | -0.00560393 | 0.000885047 | 0.0324044 | -0.0583756 | -0.0309842 |
| 3 | -0.0456034 | -0.0232855 | 0.0284434 | 0.0510398 | 0.0471398 | 0.0878592 | 0.0151371 | -0.0722738 | -0.156025 | 0.0424992 |
| 4 | -0.0185516 | -0.0324097 | -0.0179149 | -0.0283303 | -0.105191 | -0.00128882 | -0.0470007 | 0.037133 | -0.0458648 | -0.0255888 |
| 5 | -0.105916 | -0.0725289 | 0.0275144 | 0.0119789 | -0.0133919 | 0.0384244 | 0.0830009 | 0.0257049 | 0.0271106 | 0.0359904 |
| 6 | -0.00921877 | -0.100763 | 0.0169911 | -0.0521175 | -0.0472164 | 0.0688616 | 0.0321548 | 0.0451105 | 0.0470212 | -0.0321568 |
| 7 | -0.104498 | -0.0634382 | -0.0270531 | 0.00543218 | -0.00212139 | -0.0236471 | 0.052303 | 0.0357979 | -0.0235679 | 0.0369836 |
| 8 | 0.0132485 | -0.150037 | -0.0574905 | -0.00381538 | 0.00771549 | 0.0702222 | 0.0522614 | 0.0683678 | 0.000165996 | -0.0598484 |
| 9 | -0.00442133 | -0.0175126 | 0.0348669 | 0.00462789 | 0.0666993 | -0.0490327 | -0.0616107 | 0.0591907 | -0.00277557 | -0.0943933 |
| 10 | -0.0786887 | 0.00916031 | 0.0264461 | -0.00468348 | 0.0756982 | -0.00744083 | -0.0687183 | 0.0684668 | -0.0109314 | -0.0754226 |
# variable cible -> encodage en type "factor"
import CategoricalArrays as CA
y = CA.categorical(df.classe)
# liste des modalités
CA.levels(y)
8-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"acq"
"crude"
"earn"
"grain"
"interest"
"money_fx"
"ship"
"trade"
Index pour TRAIN/TEST¶
# identifiants pour train/test
import MLJ
idTrain, idTest = MLJ.partition(1:DFR.nrow(df),0.6,shuffle=true,stratify=y,rng=42)
# dimension
println(size(idTrain))
println(size(idTest))
(4605,) (3069,)
Régression Logistique Multinomiale (Lasso)¶
# importer la régression logistique
# à partir du module MLJLinearModels
LogisticClassifier = @MLJ.load MultinomialClassifier pkg=MLJLinearModels
import MLJLinearModels ✔
┌ Info: For silent loading, specify `verbosity=0`. └ @ Main C:\Users\ricco\.julia\packages\MLJModels\AWkxi\src\loading.jl:159
MLJLinearModels.MultinomialClassifier
# Instantiate the multinomial logistic regression with an L1 (lasso) penalty.
# NOTE(review): the machine summary printed right after this cell reports
# `lambda = 2.220446049250313e-16`, i.e. `lambda` keeps its default value here.
# Per the MLJLinearModels documentation, `lambda` is the penalty strength for
# `:l1` / `:l2`, while `gamma` is only used with `penalty = :en` — so this model is
# presumably fitted with almost no regularisation. Confirm, and tune `lambda`
# (not `gamma`) if a real lasso fit is intended.
modele = LogisticClassifier(penalty = :l1, gamma = 0.1)
# Bind the model to the feature table `X` and the categorical target `y`
# (train/test rows are selected later, when fitting and predicting).
mach = MLJ.machine(modele,X,y)
untrained Machine; caches model-specific representations of data
model: MultinomialClassifier(lambda = 2.220446049250313e-16, …)
args:
1: Source @122 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Continuous}}
2: Source @968 ⏎ AbstractVector{ScientificTypesBase.Multiclass{8}}
Entraînement du modèle¶
# entraînement sur l'échantillon d'apprentissage
# rows = identifiants des individus TRAIN
MLJ.fit!(mach,rows=idTrain)
┌ Info: Training machine(MultinomialClassifier(lambda = 2.220446049250313e-16, …), …). └ @ MLJBase C:\Users\ricco\.julia\packages\MLJBase\DCbte\src\machines.jl:499 ┌ Info: Solver: MLJLinearModels.ProxGrad │ accel: Bool true │ max_iter: Int64 1000 │ tol: Float64 0.0001 │ max_inner: Int64 100 │ beta: Float64 0.8 │ gram: Bool false └ @ MLJLinearModels C:\Users\ricco\.julia\packages\MLJLinearModels\s9vSj\src\mlj\interface.jl:72 ┌ Warning: Proximal GD did not converge in 1000 iterations. └ @ MLJLinearModels C:\Users\ricco\.julia\packages\MLJLinearModels\s9vSj\src\fit\proxgrad.jl:73
trained Machine; caches model-specific representations of data
model: MultinomialClassifier(lambda = 2.220446049250313e-16, …)
args:
1: Source @122 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Continuous}}
2: Source @968 ⏎ AbstractVector{ScientificTypesBase.Multiclass{8}}
Inspection des coefficients¶
# coefficients
fp = MLJ.fitted_params(mach)
fp.coefs
384-element Vector{Pair{Symbol, SubArray{Float64, 1, Matrix{Float64}, Tuple{Int64, Base.Slice{Base.OneTo{Int64}}}, true}}}:
:x1 => [10.371185610727196, -7.76590126272795, -7.227128135919941, 0.6500188615708384, 1.4501180355954402, 3.5432782665254425, 1.852696732298626, -2.8742681081892982]
:x2 => [5.451844119625546, 4.296026752097511, -12.997448799269703, 2.437692639517172, -5.505962061082315, 2.9941051216165167, -1.2157870545343874, 4.539529281854829]
:x3 => [-5.282674425596243, 0.577069294325339, 0.6544926267158487, 1.607373740798359, 1.5708030809247564, -0.6950678765689661, 0.40456743766350706, 1.163436121586802]
:x4 => [0.0790710470430111, 1.4255216002066347, -1.8004093857934809, 2.5207480300490177, -0.041685148962272024, 2.033280692010929, 1.4264533374489858, -5.642980172130398]
:x5 => [1.109206031841218, 0.8794228754986335, -3.5816836368327714, 2.5150847391521483, 1.8139530953572787, 1.7967051501778857, -4.883902529311217, 0.35121427393235205]
:x6 => [-0.2830933759281726, -1.3964462998532547, -2.0810818803000983, -2.1415560403535503, -0.42652048990083286, -2.4443153462484584, 2.605360331472206, 6.167653101325305]
:x7 => [10.912095013213888, -7.995625898214512, 2.079108074665601, -2.1515496525097966, -2.1996529334353365, -1.9380650775833208, -3.4616685801469647, 4.755359054114365]
:x8 => [-3.92268732989323, -1.6068823724949646, 7.830227567091169, -0.09105436465146126, 0.4528721816278729, 1.505155454598647, -1.0780931876454694, -3.0895379485517633]
:x9 => [3.5960598532862824, -2.984710384508376, 11.100390493499932, -1.1530877392695267, 3.142715795009573, -3.360681172399022, -3.444592768922927, -6.896094076528521]
:x10 => [-7.4852547337050295, -4.903940304726245, 0.8383543997396714, 1.2733389973264853, 4.73399656073897, 3.214419011653537, -0.1956155379205403, 2.5247016067985193]
⋮
:x376 => [4.166398440735918, -2.0457260267507094, 0.8939298190887245, 0.537162928522513, -1.1303391256454762, -2.5479780069189397, 1.2723219468952434, -1.1457699758997282]
:x377 => [10.457616064814637, -2.974420641185583, -4.576356558787065, 0.7993748054762335, -1.238013077402656, -3.1384674734109286, -0.3089586031651469, 0.9792254837380918]
:x378 => [-0.6085954280718746, -1.781335717235709, 7.536585627753868, -3.5217077642985495, -0.7380978022001353, 3.630192634319697, -0.4623783081100805, -4.054663241993461]
:x379 => [-4.523839466608446, -8.19823182418464, 4.207702393265039, 2.658001788507651, -2.650858659321829, 2.5264869588573307, 1.4234134215190515, 4.55732538788269]
:x380 => [-6.480285511813314, -1.7092339393429068, 0.6717116527981714, -1.4326278360652693, 2.1403764167948722, 0.11914411036252312, 4.950694339176468, 1.7402207680633386]
:x381 => [-2.4397940047686197, -1.8563447950500536, 4.241839409330676, 0.39819802969146517, 3.7066833340987575, -0.9992775056024837, 0.8468093878399208, -3.8981138556441]
:x382 => [-1.1177757135942883, -1.6689846685339214, -3.2593454198741987, -0.5285005689819081, 3.9098508853787153, 0.4138043421955925, -0.49350431499305847, 2.7444554583465557]
:x383 => [-7.005011403672204, 0.02812821837979645, 12.124863866735398, -0.7368495200781617, -3.2922332633168496, 3.205976859521862, -1.8936481414429278, -2.4312266158655436]
:x384 => [-2.2604058756686918, 1.6415386732122303, 6.551482842205098, -0.012412255876620058, -1.686858852733802, -3.2960450104939403, 2.547660285118923, -3.4849598056326]
# intercept
fp.intercept
8-element Vector{Float64}:
4.232664236188939
-0.9195098637225952
1.9493370782700128
-1.236858267785365
-3.5506868552506945
-0.054243113947955744
1.7293537881941046
-2.15005700191133
Evaluation en test¶
# prédiction en test
pred = MLJ.predict_mode(mach,rows=idTest)
# premières valeurs
pred[1:10]
10-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"grain"
"grain"
"grain"
"grain"
"crude"
"ship"
"earn"
"earn"
"earn"
"money_fx"
# matrice de confusion
mc = MLJ.confusion_matrix(pred,y[idTest])
mc
8×8 Matrix{Int64}:
848 7 50 0 3 1 4 6
4 135 9 0 1 0 3 1
59 1 1507 3 3 1 0 1
0 0 0 14 0 0 0 0
1 1 0 1 82 12 0 0
1 1 3 2 18 101 1 5
2 5 0 0 0 0 47 0
2 0 0 0 1 2 3 117
# accuracy
MLJ.accuracy(pred,y[idTest])
0.9289670902574129