Fine-tuning a model with the Trainer API
Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
In [ ]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the French subset of the PAWS-X paraphrase dataset
raw_datasets = load_dataset("paws-x", "fr")
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    # Tokenize the sentence pairs; padding is deferred to the data collator
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
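As a quick sanity check (an addition to the original cells), we can inspect what the preprocessing produced; the split names and columns below come straight from the objects created above:
In [ ]:
# Inspect the dataset splits and the columns of one tokenized example
print(tokenized_datasets)
print(tokenized_datasets["train"][0].keys())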
In [ ]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")
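TrainingArguments accepts many optional hyperparameters beyond the output directory. Here is a sketch of a more explicit configuration; the parameter names are real TrainingArguments fields, but the values and the `custom_args` name are illustrative choices, not the notebook's:
In [ ]:
# Illustrative only: these values are arbitrary choices, not the notebook's
custom_args = TrainingArguments(
    "test-trainer",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)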
In [ ]:
from transformers import AutoModelForSequenceClassification

# num_labels=2 adds a freshly initialized binary-classification head on top of
# the pretrained body (hence the random-initialization warning at load time)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
In [ ]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
In [ ]:
trainer.train()  # Careful: one epoch takes 12h!
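If a 12-hour epoch is impractical, one option (not in the original notebook) is to do a first run on a random subset. `shuffle()` and `select()` are standard 🤗 Datasets methods; the variable names and subset sizes below are our own illustrative choices:
In [ ]:
# Optional sanity-check run on a small random subset;
# sizes and names here are illustrative, not the notebook's choices
small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
small_eval = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

quick_trainer = Trainer(
    model,
    training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
quick_trainer.train()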
In [ ]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)
In [ ]:
import numpy as np

# Convert the logits into class predictions
preds = np.argmax(predictions.predictions, axis=-1)
In [ ]:
from datasets import load_metric

# PAWS-X is a paraphrase task like MRPC, so we reuse the MRPC metrics (accuracy and F1)
metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)
In [ ]:
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
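As a quick check (not part of the original cells), the function can be exercised on the predictions computed above, since `trainer.predict` returns the logits and labels in the same (logits, labels) layout that compute_metrics expects:
In [ ]:
# Sanity check: should match the metric.compute result above
compute_metrics((predictions.predictions, predictions.label_ids))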
In [ ]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch") model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) trainer = Trainer( model, training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"], data_collator=data_collator, tokenizer=tokenizer, compute_metrics=compute_metrics, )