Path: blob/main/course/fr/chapter3/section3_tf.ipynb
Kernel: Python 3
Fine-tuning a model with Keras
Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
In [ ]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np

raw_datasets = load_dataset("paws-x", "fr")
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# CamemBERT is RoBERTa-based, so its tokenizer does not produce token_type_ids;
# requesting that column here would raise an error.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
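Before building the model, a quick sanity check can help. This cell is an addition, not part of the original notebook: it pulls one batch from `tf_train_dataset` and prints the tensor shapes. Note that the sequence length varies from batch to batch, because `DataCollatorWithPadding` pads each batch dynamically.
In [ ]:
# Optional sanity check: to_tf_dataset yields (features, labels) tuples,
# so we can unpack one batch directly.
batch, labels = next(iter(tf_train_dataset))
print({name: tensor.shape for name, tensor in batch.items()})
print(labels.shape)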
In [ ]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
In [ ]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# First attempt: the string "adam" uses Keras' default learning rate (1e-3),
# which is far too high for fine-tuning a pretrained transformer.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
)
In [ ]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 3

# The number of training steps is the number of samples in the dataset,
# divided by the batch size, then multiplied by the total number of epochs.
# Note that tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)
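As a quick illustration (an addition, not part of the original notebook): a Keras schedule is just a callable mapping a step number to a learning rate, so we can sample `lr_scheduler` directly to see the linear decay from 5e-5 down to 0 (PolynomialDecay with the default power of 1.0 is linear).
In [ ]:
# Evaluate the schedule at the start, middle, and end of training
for step in [0, num_train_steps // 2, num_train_steps]:
    print(f"step {step}: lr = {float(lr_scheduler(step)):.2e}")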
In [ ]:
import tensorflow as tf

# Re-load the pretrained weights so we don't continue training the model
# that was already fine-tuned above with the too-high default learning rate.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
In [ ]:
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
In [ ]:
preds = model.predict(tf_validation_dataset)["logits"]
In [ ]:
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)
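If you want probabilities rather than raw logits (for instance, to inspect the model's confidence), you can apply a softmax first; taking the argmax of either gives the same class predictions. A small sketch, not part of the original notebook:
In [ ]:
import tensorflow as tf

# Softmax turns the logits into per-class probabilities; argmax is unchanged.
probs = tf.nn.softmax(preds, axis=-1).numpy()
print(probs[:3])  # confidence for the first three validation examples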
In [ ]:
from datasets import load_metric

# PAWS-X is a binary paraphrase task like MRPC, so we reuse the GLUE/MRPC
# metric, which reports accuracy and F1.
metric = load_metric("glue", "mrpc")
metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])
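For reference, the accuracy the metric reports is just the fraction of predictions that match the labels. A minimal manual equivalent (covering only accuracy, since F1 also needs per-class counts; this check should agree with the metric's "accuracy" field):
In [ ]:
import numpy as np

labels = np.array(raw_datasets["validation"]["label"])
print("accuracy:", (class_preds == labels).mean())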