Path: blob/main/course/videos/token_pipeline_pt.ipynb
5906 views
Kernel: Unknown Kernel
This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
In [ ]:
#@title
# Embed the course video that this notebook accompanies.
from IPython.display import HTML

video_iframe = '<iframe width="560" height="315" src="https://www.youtube.com/embed/0E7ltQB7fM8?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>'
HTML(video_iframe)
Install the Transformers and Datasets libraries to run this notebook.
In [ ]:
! pip install datasets transformers[sentencepiece]
In [ ]:
from transformers import pipeline

# Build a token-classification pipeline without naming a model or an
# aggregation strategy, then run it on a sample sentence.
token_classifier = pipeline(task="token-classification")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
In [ ]:
# Same pipeline as before, this time asking it to apply the "simple"
# aggregation strategy to its predictions.
token_classifier = pipeline(
    task="token-classification",
    aggregation_strategy="simple",
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
In [ ]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# `from_pretrained` needs a real checkpoint identifier (a Hub model id or a local
# path); an empty string cannot be resolved to a model. This is the NER checkpoint
# used in the corresponding Hugging Face course section.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Tokenize a single sentence into PyTorch tensors and run it through the model.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

# Show the shape of the token ids next to the shape of the logits.
print(inputs["input_ids"].shape)
print(outputs.logits.shape)
In [ ]:
import torch

# Turn the logits into probabilities over the last dimension and keep the first
# (and only) sequence of the batch as nested Python lists.
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()

# The argmax must be taken on the logits tensor: `probabilities` is already a
# plain Python list at this point and has no `argmax` method. Softmax does not
# change which index holds the maximum, so the logits give the same predictions.
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)
In [ ]:
# Display the model's mapping from class index to label name; later cells index
# it with the integers in `predictions` to decode them into labels.
model.config.id2label
In [ ]:
results = []

# Tokenize again, this time requesting the character offsets of every token so
# each prediction can be mapped back to a span of `example`.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for position, class_id in enumerate(predictions):
    tag = model.config.id2label[class_id]
    # Tokens labeled "O" are not reported.
    if tag == "O":
        continue

    start, end = offsets[position]
    entity = {
        "entity": tag,
        "score": probabilities[position][class_id],
        "word": tokens[position],
        "start": start,
        "end": end,
    }
    results.append(entity)

print(results)
In [ ]:
import numpy as np

# Group consecutive tokens that belong to the same entity and report one result
# per group, with the character span taken from `offsets` and the text sliced
# out of `example`.
label_map = model.config.id2label
results = []
idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = label_map[pred]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I- prefix to get the entity type.
    label = label[2:]

    # The first token always belongs to the group, whether it is tagged B- or
    # I-, so its span and score are recorded before looking at continuations.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Absorb every following token labeled I-<label>, extending the span.
    while idx < len(predictions) and label_map[predictions[idx]] == f"I-{label}":
        # Score each token with the probability of its own predicted class.
        all_scores.append(probabilities[idx][predictions[idx]])
        end = offsets[idx][1]
        idx += 1

    # The score is the mean of the scores of all the tokens in the grouped entity.
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {"entity_group": label, "score": score, "word": word, "start": start, "end": end}
    )
    # No extra increment here: `idx` already points at the first token that is
    # not part of this entity, and that token must be examined on the next pass.

print(results)
In [ ]: