"""
Title: Text classification using Decision Forests and pretrained embeddings
Author: Gitesh Chawda
Date created: 09/05/2022
Last modified: 09/05/2022
Description: Using TensorFlow Decision Forests for text classification.
Accelerator: GPU
"""

"""
## Introduction

[TensorFlow Decision Forests](https://www.tensorflow.org/decision_forests) (TF-DF)
is a collection of state-of-the-art algorithms for Decision Forest models that are
compatible with Keras APIs. The module includes Random Forests, Gradient Boosted Trees,
and CART, and can be used for regression, classification, and ranking tasks.

In this example, we will use Gradient Boosted Trees with pretrained embeddings to
classify disaster-related tweets.

### See also:

- [TF-DF beginner tutorial](https://www.tensorflow.org/decision_forests/tutorials/beginner_colab)
- [TF-DF intermediate tutorial](https://www.tensorflow.org/decision_forests/tutorials/intermediate_colab)
"""

"""
Install TensorFlow Decision Forests using the following command:
`pip install tensorflow_decision_forests`
"""


"""
## Imports
"""

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from tensorflow.keras import layers
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt

"""
## Get the data

The dataset is available on [Kaggle](https://www.kaggle.com/c/nlp-getting-started).

Dataset description:

**Files:**

- train.csv: the training set

**Columns:**

- id: a unique identifier for each tweet
- text: the text of the tweet
- location: the location the tweet was sent from (may be blank)
- keyword: a particular keyword from the tweet (may be blank)
- target: in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)
"""

# Turn the .csv file into a pandas DataFrame
df = pd.read_csv(
    "https://raw.githubusercontent.com/IMvision12/Tweets-Classification-NLP/main/train.csv"
)
print(df.head())

"""
The dataset includes 7613 samples with 5 columns:
"""

print(f"Training dataset shape: {df.shape}")

"""
Shuffling and dropping unnecessary columns:
"""

df_shuffled = df.sample(frac=1, random_state=42)
# Dropping the id, keyword and location columns, as these columns consist mostly of NaN values;
# we will be using only the text and target columns
df_shuffled.drop(["id", "keyword", "location"], axis=1, inplace=True)
df_shuffled.reset_index(inplace=True, drop=True)
print(df_shuffled.head())

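"""
To see how sparse the dropped columns actually are (a minimal sketch, not part of the
original example), count the missing values in the original dataframe:
"""

print(df[["keyword", "location"]].isna().sum())
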
"""
90
Printing information about the shuffled dataframe:
91
"""
92
93
print(df_shuffled.info())
94

"""
Total number of "disaster" and "non-disaster" tweets:
"""

print(
    "Total number of disaster and non-disaster tweets: "
    f"{df_shuffled.target.value_counts()}"
)

"""
Let's preview a few samples:
"""

for index, example in df_shuffled[:5].iterrows():
    print(f"Example #{index}")
    print(f"\tTarget : {example['target']}")
    print(f"\tText : {example['text']}")

"""
Splitting dataset into training and test sets:
"""

test_df = df_shuffled.sample(frac=0.1, random_state=42)
train_df = df_shuffled.drop(test_df.index)
print(f"Using {len(train_df)} samples for training and {len(test_df)} for testing")

"""
Total number of "disaster" and "non-disaster" tweets in the training data:
"""

print(train_df["target"].value_counts())

"""
Total number of "disaster" and "non-disaster" tweets in the test data:
"""

print(test_df["target"].value_counts())

"""
## Convert data to a `tf.data.Dataset`
"""


def create_dataset(dataframe):
    # Build a pipeline of (text, target) pairs, batched and prefetched for performance
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["text"].to_numpy(), dataframe["target"].to_numpy())
    )
    dataset = dataset.batch(100)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


train_ds = create_dataset(train_df)
test_ds = create_dataset(test_df)

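"""
As a quick sanity check (a minimal sketch, not part of the original example), we can
inspect one batch to confirm that the dataset yields raw strings and integer targets:
"""

for text_batch, target_batch in train_ds.take(1):
    # Each batch holds up to 100 scalar string tensors and their integer labels
    print(text_batch.shape, target_batch.shape)
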
"""
149
## Downloading pretrained embeddings
150
151
The Universal Sentence Encoder embeddings encode text into high-dimensional vectors that can be
152
used for text classification, semantic similarity, clustering and other natural language
153
tasks. They're trained on a variety of data sources and a variety of tasks. Their input is
154
variable-length English text and their output is a 512 dimensional vector.
155
156
To learn more about these pretrained embeddings, see
157
[Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder/4).
158
159
"""

sentence_encoder_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

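"""
A quick check (a minimal sketch, not part of the original example) that the encoder
maps a batch of strings to 512-dimensional vectors:
"""

sample_embedding = sentence_encoder_layer(tf.constant(["A sample tweet"]))
print(sample_embedding.shape)  # Expected: (1, 512)
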
"""
166
## Creating our models
167
168
We create two models. In the first model (model_1) raw text will be first encoded via
169
pretrained embeddings and then passed to a Gradient Boosted Tree model for
170
classification. In the second model (model_2) raw text will be directly passed to
171
the Gradient Boosted Trees model.
172
"""
173
174
"""
175
Building model_1
176
"""
177
178
inputs = layers.Input(shape=(), dtype=tf.string)
179
outputs = sentence_encoder_layer(inputs)
180
preprocessor = keras.Model(inputs=inputs, outputs=outputs)
181
model_1 = tfdf.keras.GradientBoostedTreesModel(preprocessing=preprocessor)
182
183
"""
184
Building model_2
185
"""
186
187
model_2 = tfdf.keras.GradientBoostedTreesModel()
188
189
"""
190
## Train the models
191
192
We compile our model by passing the metrics `Accuracy`, `Recall`, `Precision` and
193
`AUC`. When it comes to the loss, TF-DF automatically detects the best loss for the task
194
(Classification or regression). It is printed in the model summary.
195
196
Also, because they're batch-training models rather than mini-batch gradient descent models,
197
TF-DF models do not need a validation dataset to monitor overfitting, or to stop
198
training early. Some algorithms do not use a validation dataset (e.g. Random Forest)
199
while some others do (e.g. Gradient Boosted Trees). If a validation dataset is
200
needed, it will be extracted automatically from the training dataset.
201
"""

# Compiling model_1
model_1.compile(metrics=["Accuracy", "Recall", "Precision", "AUC"])
# We do not specify epochs here, as TF-DF trains on exactly one pass over the dataset
model_1.fit(train_ds)

# Compiling model_2
model_2.compile(metrics=["Accuracy", "Recall", "Precision", "AUC"])
# We do not specify epochs here, as TF-DF trains on exactly one pass over the dataset
model_2.fit(train_ds)

"""
214
Prints training logs of model_1
215
"""
216
217
logs_1 = model_1.make_inspector().training_logs()
218
print(logs_1)
219
220
"""
221
Prints training logs of model_2
222
"""
223
224
logs_2 = model_2.make_inspector().training_logs()
225
print(logs_2)
226
227
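"""
Each log entry records the number of trees and the model's evaluation at that point.
A minimal sketch (not part of the original example) showing how to read the final entry:
"""

final_log = logs_2[-1]
print(f"Trees: {final_log.num_trees}, accuracy: {final_log.evaluation.accuracy:.4f}")
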
"""
228
The model.summary() method prints a variety of information about your decision tree model, including model type, task, input features, and feature importance.
229
"""
230
231
print("model_1 summary: ")
232
print(model_1.summary())
233
print()
234
print("model_2 summary: ")
235
print(model_2.summary())
236
237
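"""
Beyond the printed summary, the inspector also exposes variable importances
programmatically (a minimal sketch, not part of the original example; the available
importance measures depend on the model and its training configuration):
"""

inspector = model_2.make_inspector()
print(inspector.variable_importances())
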
"""
238
## Plotting training metrics
239
"""
240
241
242
def plot_curve(logs):
243
plt.figure(figsize=(12, 4))
244
245
plt.subplot(1, 2, 1)
246
plt.plot([log.num_trees for log in logs], [log.evaluation.accuracy for log in logs])
247
plt.xlabel("Number of trees")
248
plt.ylabel("Accuracy")
249
250
plt.subplot(1, 2, 2)
251
plt.plot([log.num_trees for log in logs], [log.evaluation.loss for log in logs])
252
plt.xlabel("Number of trees")
253
plt.ylabel("Loss")
254
255
plt.show()
256
257
258
plot_curve(logs_1)
259
plot_curve(logs_2)
260
261
"""
262
## Evaluating on test data
263
"""
264
265
results = model_1.evaluate(test_ds, return_dict=True, verbose=0)
266
print("model_1 Evaluation: \n")
267
for name, value in results.items():
268
print(f"{name}: {value:.4f}")
269
270
results = model_2.evaluate(test_ds, return_dict=True, verbose=0)
271
print("model_2 Evaluation: \n")
272
for name, value in results.items():
273
print(f"{name}: {value:.4f}")
274
275
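"""
For a fuller picture than the scalar metrics, a confusion matrix can be computed from
the rounded predictions (a minimal sketch, not part of the original example):
"""

y_true = test_df["target"].to_numpy()
y_pred = tf.squeeze(tf.round(model_1.predict(test_ds, verbose=0)))
print(tf.math.confusion_matrix(y_true, y_pred))
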
"""
276
## Predicting on validation data
277
"""
278
279
test_df.reset_index(inplace=True, drop=True)
280
for index, row in test_df.iterrows():
281
text = tf.expand_dims(row["text"], axis=0)
282
preds = model_1.predict_step(text)
283
preds = tf.squeeze(tf.round(preds))
284
print(f"Text: {row['text']}")
285
print(f"Prediction: {int(preds)}")
286
print(f"Ground Truth : {row['target']}")
287
if index == 10:
288
break
289
290
"""
291
## Concluding remarks
292
293
The TensorFlow Decision Forests package provides powerful models
294
that work especially well with structured data. In our experiments,
295
the Gradient Boosted Tree model with pretrained embeddings achieved 81.6%
296
test accuracy while the plain Gradient Boosted Tree model had 54.4% accuracy.
297
"""
298
299