Path: blob/master/examples/nlp/text_classification_with_transformer.py
"""1Title: Text classification with Transformer2Author: [Apoorv Nandan](https://twitter.com/NandanApoorv)3Date created: 2020/05/104Last modified: 2024/01/185Description: Implement a Transformer block as a Keras layer and use it for text classification.6Accelerator: GPU7Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT)8"""910"""11## Setup12"""1314import keras15from keras import ops16from keras import layers171819"""20## Implement a Transformer block as a layer21"""222324class TransformerBlock(layers.Layer):25def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):26super().__init__()27self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)28self.ffn = keras.Sequential(29[30layers.Dense(ff_dim, activation="relu"),31layers.Dense(embed_dim),32]33)34self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)35self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)36self.dropout1 = layers.Dropout(rate)37self.dropout2 = layers.Dropout(rate)3839def call(self, inputs):40attn_output = self.att(inputs, inputs)41attn_output = self.dropout1(attn_output)42out1 = self.layernorm1(inputs + attn_output)43ffn_output = self.ffn(out1)44ffn_output = self.dropout2(ffn_output)45return self.layernorm2(out1 + ffn_output)464748"""49## Implement embedding layer5051Two separate embedding layers, one for tokens, one for token index (positions).52"""535455class TokenAndPositionEmbedding(layers.Layer):56def __init__(self, maxlen, vocab_size, embed_dim):57super().__init__()58self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)59self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)6061def call(self, x):62maxlen = ops.shape(x)[-1]63positions = ops.arange(start=0, stop=maxlen, step=1)64positions = self.pos_emb(positions)65x = self.token_emb(x)66return x + positions676869"""70## Download and prepare dataset71"""7273vocab_size = 20000 # Only consider the top 20k words74maxlen = 200 # Only consider the first 200 words of each movie review75(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)76print(len(x_train), "Training sequences")77print(len(x_val), "Validation sequences")78x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)79x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)8081"""82## Create classifier model using transformer layer8384Transformer layer outputs one vector for each time step of our input sequence.85Here, we take the mean across all time steps and86use a feed forward network on top of it to classify text.87"""888990embed_dim = 32 # Embedding size for each token91num_heads = 2 # Number of attention heads92ff_dim = 32 # Hidden layer size in feed forward network inside transformer9394inputs = layers.Input(shape=(maxlen,))95embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)96x = embedding_layer(inputs)97transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)98x = transformer_block(x)99x = layers.GlobalAveragePooling1D()(x)100x = layers.Dropout(0.1)(x)101x = layers.Dense(20, activation="relu")(x)102x = layers.Dropout(0.1)(x)103outputs = layers.Dense(2, activation="softmax")(x)104105model = keras.Model(inputs=inputs, outputs=outputs)106107108"""109## Train and Evaluate110"""111112model.compile(113optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]114)115history = model.fit(116x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)117)118119120