Path: blob/main/course/videos/debug_notebook.ipynb
This notebook contains the code samples from the video below, which is part of the Hugging Face course.
In [ ]:
#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/rSPyvPw0p9k?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
In [ ]:
! pip install datasets transformers[sentencepiece]
In [ ]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")
In [ ]:
raw_datasets
In [ ]:
raw_datasets["train"][0]["tokens"]
In [ ]:
raw_datasets["train"][0]["ner_tags"]
In [ ]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature
In [ ]:
label_names = ner_feature.feature.names
label_names
In [ ]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)
In [ ]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
In [ ]:
tokenizer.is_fast
In [ ]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()
In [ ]:
inputs.word_ids()
In [ ]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels
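The -100 used for special tokens is not arbitrary: it is the default ignore_index of PyTorch's cross-entropy loss, so those positions are simply skipped when the loss is computed. A quick check of that convention (not part of the original video, shown here only as an illustration):
In [ ]:
import torch
import torch.nn as nn

# -100 is the default ignore_index of nn.CrossEntropyLoss, so tokens
# labeled -100 do not contribute to the loss.
loss_fct = nn.CrossEntropyLoss()
logits = torch.randn(4, 9)                 # 4 tokens, 9 possible NER labels
labels = torch.tensor([-100, 3, 4, -100])  # only the two middle tokens count
print(loss_fct(logits, labels))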
In [ ]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))
In [ ]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
In [ ]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
In [ ]:
features = [tokenized_datasets["train"][i] for i in range(8)]
In [ ]:
# This cell raises an error: the "labels" lists have different lengths,
# and tokenizer.pad does not know how to pad them into a tensor.
batch = tokenizer.pad(features, return_tensors="pt")
In [ ]:
%debug
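The %debug magic opens IPython's post-mortem debugger on the last traceback, letting us walk up the stack frames and inspect the variables that caused the failure. In the course, padding for token classification is handled by a dedicated data collator rather than tokenizer.pad; as a hedged sketch of the fix (the exact cell may differ from the video), it looks like this:
In [ ]:
from transformers import DataCollatorForTokenClassification

# The data collator pads input_ids and attention_mask with the tokenizer,
# and pads the labels with -100 so the extra positions are ignored by the loss.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator(features)
batch["labels"]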