CoCalc -- masked_language

GitHub Repository: keras-team/keras-io
Path: blob/master/examples/nlp/masked_language_modeling.py
³⁵⁰⁷ views
1
"""
2
Title: End-to-end Masked Language Modeling with BERT
3
Author: [Ankur Singh](https://twitter.com/ankur310794)
4
Date created: 2020/09/18
5
Last modified: 2024/03/15
6
Description: Implement a Masked Language Model (MLM) with BERT and fine-tune it on the IMDB Reviews dataset.
7
Accelerator: GPU
8
Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) and made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234)
9
"""
10

11
"""
12
## Introduction
13

14
Masked Language Modeling is a fill-in-the-blank task,
15
where a model uses the context words surrounding a mask token to try to predict what the
16
masked word should be.
17

18
For an input that contains one or more mask tokens,
19
the model will generate the most likely substitution for each.
20

21
Example:
22

23
- Input: "I have watched this [MASK] and it was awesome."
24
- Output: "I have watched this movie and it was awesome."
25

26
Masked language modeling is a great way to train a language
27
model in a self-supervised setting (without human-annotated labels).
28
Such a model can then be fine-tuned to accomplish various supervised
29
NLP tasks.
30

31
This example teaches you how to build a BERT model from scratch,
32
train it with the masked language modeling task,
33
and then fine-tune this model on a sentiment classification task.
34

35
We will use the Keras `TextVectorization` and `MultiHeadAttention` layers
36
to create a BERT Transformer-Encoder network architecture.
37

38
Note: This example should be run with `tf-nightly`.
39
"""
40

41
"""
42
## Setup
43

44
Install `tf-nightly` via `pip install tf-nightly`.
45
"""
46

47
import os
48

49
os.environ["KERAS_BACKEND"] = "torch"  # or jax, or tensorflow
50

51
import keras_hub
52

53
import keras
54
from keras import layers
55
from keras.layers import TextVectorization
56

57
from dataclasses import dataclass
58
import pandas as pd
59
import numpy as np
60
import glob
61
import re
62
from pprint import pprint
63

64
"""
65
## Set-up Configuration
66
"""
67

68

69
@dataclass
70
class Config:
71
    MAX_LEN = 256
72
    BATCH_SIZE = 32
73
    LR = 0.001
74
    VOCAB_SIZE = 30000
75
    EMBED_DIM = 128
76
    NUM_HEAD = 8  # used in bert model
77
    FF_DIM = 128  # used in bert model
78
    NUM_LAYERS = 1
79

80

81
config = Config()
82

83
"""
84
## Load the data
85

86
We will first download the IMDB data and load into a Pandas dataframe.
87
"""
88

89
"""shell
90
curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
91
tar -xf aclImdb_v1.tar.gz
92
"""
93

94

95
def get_text_list_from_files(files):
96
    text_list = []
97
    for name in files:
98
        with open(name) as f:
99
            for line in f:
100
                text_list.append(line)
101
    return text_list
102

103

104
def get_data_from_text_files(folder_name):
105
    pos_files = glob.glob("aclImdb/" + folder_name + "/pos/*.txt")
106
    pos_texts = get_text_list_from_files(pos_files)
107
    neg_files = glob.glob("aclImdb/" + folder_name + "/neg/*.txt")
108
    neg_texts = get_text_list_from_files(neg_files)
109
    df = pd.DataFrame(
110
        {
111
            "review": pos_texts + neg_texts,
112
            "sentiment": [0] * len(pos_texts) + [1] * len(neg_texts),
113
        }
114
    )
115
    df = df.sample(len(df)).reset_index(drop=True)
116
    return df
117

118

119
train_df = get_data_from_text_files("train")
120
test_df = get_data_from_text_files("test")
121

122
all_data = pd.concat([train_df, test_df], ignore_index=True)
123

124
"""
125
## Dataset preparation
126

127
We will use the `TextVectorization` layer to vectorize the text into integer token ids.
128
It transforms a batch of strings into either
129
a sequence of token indices (one sample = 1D array of integer token indices, in order)
130
or a dense representation (one sample = 1D array of float values encoding an unordered set of tokens).
131

132
Below, we define 3 preprocessing functions.
133

134
1.  The `get_vectorize_layer` function builds the `TextVectorization` layer.
135
2.  The `encode` function encodes raw text into integer token ids.
136
3.  The `get_masked_input_and_labels` function will mask input token ids.
137
It masks 15% of all input tokens in each sequence at random.
138
"""
139

140
# For data pre-processing and tf.data.Dataset
141
import tensorflow as tf
142

143

144
def custom_standardization(input_data):
145
    lowercase = tf.strings.lower(input_data)
146
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
147
    return tf.strings.regex_replace(
148
        stripped_html, "[%s]" % re.escape("!#$%&'()*+,-./:;<=>?@\^_`{|}~"), ""
149
    )
150

151

152
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
153
    """Build Text vectorization layer
154

155
    Args:
156
      texts (list): List of string i.e input texts
157
      vocab_size (int): vocab size
158
      max_seq (int): Maximum sequence length.
159
      special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]'].
160

161
    Returns:
162
        layers.Layer: Return TextVectorization Keras Layer
163
    """
164
    vectorize_layer = TextVectorization(
165
        max_tokens=vocab_size,
166
        output_mode="int",
167
        standardize=custom_standardization,
168
        output_sequence_length=max_seq,
169
    )
170
    vectorize_layer.adapt(texts)
171

172
    # Insert mask token in vocabulary
173
    vocab = vectorize_layer.get_vocabulary()
174
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
175
    vectorize_layer.set_vocabulary(vocab)
176
    return vectorize_layer
177

178

179
vectorize_layer = get_vectorize_layer(
180
    all_data.review.values.tolist(),
181
    config.VOCAB_SIZE,
182
    config.MAX_LEN,
183
    special_tokens=["[mask]"],
184
)
185

186
# Get mask token id for masked language model
187
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]
188

189

190
def encode(texts):
191
    encoded_texts = vectorize_layer(texts)
192
    return encoded_texts.numpy()
193

194

195
def get_masked_input_and_labels(encoded_texts):
196
    # 15% BERT masking
197
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
198
    # Do not mask special tokens
199
    inp_mask[encoded_texts <= 2] = False
200
    # Set targets to -1 by default, it means ignore
201
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
202
    # Set labels for masked tokens
203
    labels[inp_mask] = encoded_texts[inp_mask]
204

205
    # Prepare input
206
    encoded_texts_masked = np.copy(encoded_texts)
207
    # Set input to [MASK] which is the last token for the 90% of tokens
208
    # This means leaving 10% unchanged
209
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
210
    encoded_texts_masked[inp_mask_2mask] = (
211
        mask_token_id  # mask token is the last in the dict
212
    )
213

214
    # Set 10% to a random token
215
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
216
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
217
        3, mask_token_id, inp_mask_2random.sum()
218
    )
219

220
    # Prepare sample_weights to pass to .fit() method
221
    sample_weights = np.ones(labels.shape)
222
    sample_weights[labels == -1] = 0
223

224
    # y_labels would be same as encoded_texts i.e input tokens
225
    y_labels = np.copy(encoded_texts)
226

227
    return encoded_texts_masked, y_labels, sample_weights
228

229

230
# We have 25000 examples for training
231
x_train = encode(train_df.review.values)  # encode reviews with vectorizer
232
y_train = train_df.sentiment.values
233
train_classifier_ds = (
234
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
235
    .shuffle(1000)
236
    .batch(config.BATCH_SIZE)
237
)
238

239
# We have 25000 examples for testing
240
x_test = encode(test_df.review.values)
241
y_test = test_df.sentiment.values
242
test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(
243
    config.BATCH_SIZE
244
)
245

246
# Dataset for end to end model input (will be used at the end)
247
test_raw_classifier_ds = test_df
248

249
# Prepare data for masked language model
250
x_all_review = encode(all_data.review.values)
251
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(
252
    x_all_review
253
)
254

255
mlm_ds = tf.data.Dataset.from_tensor_slices(
256
    (x_masked_train, y_masked_labels, sample_weights)
257
)
258
mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE)
259

260
"""
261
## Create BERT model (Pretraining Model) for masked language modeling
262

263
We will create a BERT-like pretraining model architecture
264
using the `MultiHeadAttention` layer.
265
It will take token ids as inputs (including masked tokens)
266
and it will predict the correct ids for the masked input tokens.
267
"""
268

269

270
def bert_module(query, key, value, i):
271
    # Multi headed self-attention
272
    attention_output = layers.MultiHeadAttention(
273
        num_heads=config.NUM_HEAD,
274
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
275
        name="encoder_{}_multiheadattention".format(i),
276
    )(query, key, value)
277
    attention_output = layers.Dropout(0.1, name="encoder_{}_att_dropout".format(i))(
278
        attention_output
279
    )
280
    attention_output = layers.LayerNormalization(
281
        epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i)
282
    )(query + attention_output)
283

284
    # Feed-forward layer
285
    ffn = keras.Sequential(
286
        [
287
            layers.Dense(config.FF_DIM, activation="relu"),
288
            layers.Dense(config.EMBED_DIM),
289
        ],
290
        name="encoder_{}_ffn".format(i),
291
    )
292
    ffn_output = ffn(attention_output)
293
    ffn_output = layers.Dropout(0.1, name="encoder_{}_ffn_dropout".format(i))(
294
        ffn_output
295
    )
296
    sequence_output = layers.LayerNormalization(
297
        epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i)
298
    )(attention_output + ffn_output)
299
    return sequence_output
300

301

302
loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction=None)
303
loss_tracker = keras.metrics.Mean(name="loss")
304

305

306
class MaskedLanguageModel(keras.Model):
307

308
    def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):
309

310
        loss = loss_fn(y, y_pred, sample_weight)
311
        loss_tracker.update_state(loss, sample_weight=sample_weight)
312
        return keras.ops.sum(loss)
313

314
    def compute_metrics(self, x, y, y_pred, sample_weight):
315

316
        # Return a dict mapping metric names to current value
317
        return {"loss": loss_tracker.result()}
318

319
    @property
320
    def metrics(self):
321
        # We list our `Metric` objects here so that `reset_states()` can be
322
        # called automatically at the start of each epoch
323
        # or at the start of `evaluate()`.
324
        # If you don't implement this property, you have to call
325
        # `reset_states()` yourself at the time of your choosing.
326
        return [loss_tracker]
327

328

329
def create_masked_language_bert_model():
330
    inputs = layers.Input((config.MAX_LEN,), dtype="int64")
331

332
    word_embeddings = layers.Embedding(
333
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
334
    )(inputs)
335
    position_embeddings = keras_hub.layers.PositionEmbedding(
336
        sequence_length=config.MAX_LEN
337
    )(word_embeddings)
338
    embeddings = word_embeddings + position_embeddings
339

340
    encoder_output = embeddings
341
    for i in range(config.NUM_LAYERS):
342
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)
343

344
    mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
345
        encoder_output
346
    )
347
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")
348

349
    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
350
    mlm_model.compile(optimizer=optimizer)
351
    return mlm_model
352

353

354
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
355
token2id = {y: x for x, y in id2token.items()}
356

357

358
class MaskedTextGenerator(keras.callbacks.Callback):
359
    def __init__(self, sample_tokens, top_k=5):
360
        self.sample_tokens = sample_tokens
361
        self.k = top_k
362

363
    def decode(self, tokens):
364
        return " ".join([id2token[t] for t in tokens if t != 0])
365

366
    def convert_ids_to_tokens(self, id):
367
        return id2token[id]
368

369
    def on_epoch_end(self, epoch, logs=None):
370
        prediction = self.model.predict(self.sample_tokens)
371

372
        masked_index = np.where(self.sample_tokens == mask_token_id)
373
        masked_index = masked_index[1]
374
        mask_prediction = prediction[0][masked_index]
375

376
        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
377
        values = mask_prediction[0][top_indices]
378

379
        for i in range(len(top_indices)):
380
            p = top_indices[i]
381
            v = values[i]
382
            tokens = np.copy(sample_tokens[0])
383
            tokens[masked_index[0]] = p
384
            result = {
385
                "input_text": self.decode(sample_tokens[0].numpy()),
386
                "prediction": self.decode(tokens),
387
                "probability": v,
388
                "predicted mask token": self.convert_ids_to_tokens(p),
389
            }
390
            pprint(result)
391

392

393
sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
394
generator_callback = MaskedTextGenerator(sample_tokens.numpy())
395

396
bert_masked_model = create_masked_language_bert_model()
397
bert_masked_model.summary()
398

399
"""
400
## Train and Save
401
"""
402

403
bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
404
bert_masked_model.save("bert_mlm_imdb.keras")
405

406
"""
407
## Fine-tune a sentiment classification model
408

409
We will fine-tune our self-supervised model on a downstream task of sentiment classification.
410
To do this, let's create a classifier by adding a pooling layer and a `Dense` layer on top of the
411
pretrained BERT features.
412

413
"""
414

415
# Load pretrained bert model
416
mlm_model = keras.models.load_model(
417
    "bert_mlm_imdb.keras", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
418
)
419
pretrained_bert_model = keras.Model(
420
    mlm_model.input, mlm_model.get_layer("encoder_0_ffn_layernormalization").output
421
)
422

423
# Freeze it
424
pretrained_bert_model.trainable = False
425

426

427
def create_classifier_bert_model():
428
    inputs = layers.Input((config.MAX_LEN,), dtype="int64")
429
    sequence_output = pretrained_bert_model(inputs)
430
    pooled_output = layers.GlobalMaxPooling1D()(sequence_output)
431
    hidden_layer = layers.Dense(64, activation="relu")(pooled_output)
432
    outputs = layers.Dense(1, activation="sigmoid")(hidden_layer)
433
    classifer_model = keras.Model(inputs, outputs, name="classification")
434
    optimizer = keras.optimizers.Adam()
435
    classifer_model.compile(
436
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
437
    )
438
    return classifer_model
439

440

441
classifer_model = create_classifier_bert_model()
442
classifer_model.summary()
443

444
# Train the classifier with frozen BERT stage
445
classifer_model.fit(
446
    train_classifier_ds,
447
    epochs=5,
448
    validation_data=test_classifier_ds,
449
)
450

451
# Unfreeze the BERT model for fine-tuning
452
pretrained_bert_model.trainable = True
453
optimizer = keras.optimizers.Adam()
454
classifer_model.compile(
455
    optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
456
)
457
classifer_model.fit(
458
    train_classifier_ds,
459
    epochs=5,
460
    validation_data=test_classifier_ds,
461
)
462

463
"""
464
## Create an end-to-end model and evaluate it
465

466
When you want to deploy a model, it's best if it already includes its preprocessing
467
pipeline, so that you don't have to reimplement the preprocessing logic in your
468
production environment. Let's create an end-to-end model that incorporates
469
the `TextVectorization` layer inside evaluate method, and let's evaluate. We will pass raw strings as input.
470
"""
471

472

473
# We create a custom Model to override the evaluate method so
474
# that it first pre-process text data
475
class ModelEndtoEnd(keras.Model):
476

477
    def evaluate(self, inputs):
478
        features = encode(inputs.review.values)
479
        labels = inputs.sentiment.values
480
        test_classifier_ds = (
481
            tf.data.Dataset.from_tensor_slices((features, labels))
482
            .shuffle(1000)
483
            .batch(config.BATCH_SIZE)
484
        )
485
        return super().evaluate(test_classifier_ds)
486

487
    # Build the model
488
    def build(self, input_shape):
489
        self.built = True
490

491

492
def get_end_to_end(model):
493
    inputs = classifer_model.inputs[0]
494
    outputs = classifer_model.outputs
495
    end_to_end_model = ModelEndtoEnd(inputs, outputs, name="end_to_end_model")
496
    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
497
    end_to_end_model.compile(
498
        optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"]
499
    )
500
    return end_to_end_model
501

502

503
end_to_end_classification_model = get_end_to_end(classifer_model)
504
# Pass raw text dataframe to the model
505
end_to_end_classification_model.evaluate(test_raw_classifier_ds)
506

507
Product

Resources

Company