"""1Title: Memory-efficient embeddings for recommendation systems2Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)3Date created: 2021/02/154Last modified: 2023/11/155Description: Using compositional & mixed-dimension embeddings for memory-efficient recommendation models.6Accelerator: GPU7"""89"""10## Introduction1112This example demonstrates two techniques for building memory-efficient recommendation models13by reducing the size of the embedding tables, without sacrificing model effectiveness:14151. [Quotient-remainder trick](https://arxiv.org/abs/1909.02107), by Hao-Jun Michael Shi et al.,16which reduces the number of embedding vectors to store, yet produces unique embedding17vector for each item without explicit definition.182. [Mixed Dimension embeddings](https://arxiv.org/abs/1909.11810), by Antonio Ginart et al.,19which stores embedding vectors with mixed dimensions, where less popular items have20reduced dimension embeddings.2122We use the [1M version of the Movielens dataset](https://grouplens.org/datasets/movielens/1m/).23The dataset includes around 1 million ratings from 6,000 users on 4,000 movies.24"""2526"""27## Setup28"""2930import os3132os.environ["KERAS_BACKEND"] = "tensorflow"3334from zipfile import ZipFile35from urllib.request import urlretrieve36import numpy as np37import pandas as pd38import tensorflow as tf39import keras40from keras import layers41from keras.layers import StringLookup42import matplotlib.pyplot as plt4344"""45## Prepare the data4647## Download and process data48"""4950urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")51ZipFile("movielens.zip", "r").extractall()5253ratings_data = pd.read_csv(54"ml-1m/ratings.dat",55sep="::",56names=["user_id", "movie_id", "rating", "unix_timestamp"],57)5859ratings_data["movie_id"] = ratings_data["movie_id"].apply(lambda x: f"movie_{x}")60ratings_data["user_id"] = ratings_data["user_id"].apply(lambda x: f"user_{x}")61ratings_data["rating"] = ratings_data["rating"].apply(lambda x: float(x))62del ratings_data["unix_timestamp"]6364print(f"Number of users: {len(ratings_data.user_id.unique())}")65print(f"Number of movies: {len(ratings_data.movie_id.unique())}")66print(f"Number of ratings: {len(ratings_data.index)}")6768"""69## Create train and eval data splits70"""7172random_selection = np.random.rand(len(ratings_data.index)) <= 0.8573train_data = ratings_data[random_selection]74eval_data = ratings_data[~random_selection]7576train_data.to_csv("train_data.csv", index=False, sep="|", header=False)77eval_data.to_csv("eval_data.csv", index=False, sep="|", header=False)78print(f"Train data split: {len(train_data.index)}")79print(f"Eval data split: {len(eval_data.index)}")80print("Train and eval data files are saved.")8182"""83## Define dataset metadata and hyperparameters84"""8586csv_header = list(ratings_data.columns)87user_vocabulary = list(ratings_data.user_id.unique())88movie_vocabulary = list(ratings_data.movie_id.unique())89target_feature_name = "rating"90learning_rate = 0.00191batch_size = 12892num_epochs = 393base_embedding_dim = 649495"""96## Train and evaluate the model97"""9899100def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=True):101return tf.data.experimental.make_csv_dataset(102csv_file_path,103batch_size=batch_size,104column_names=csv_header,105label_name=target_feature_name,106num_epochs=1,107header=False,108field_delim="|",109shuffle=shuffle,110)111112113def run_experiment(model):114# Compile the 
def run_experiment(model):
    # Compile the model.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.MeanAbsoluteError(name="mae")],
    )
    # Read the training data.
    train_dataset = get_dataset_from_csv("train_data.csv", batch_size)
    # Read the test data.
    eval_dataset = get_dataset_from_csv("eval_data.csv", batch_size, shuffle=False)
    # Fit the model with the training data.
    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=eval_dataset,
    )
    return history


"""
## Experiment 1: baseline collaborative filtering model

### Implement embedding encoder
"""


def embedding_encoder(vocabulary, embedding_dim, num_oov_indices=0, name=None):
    return keras.Sequential(
        [
            StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices
            ),
            layers.Embedding(
                input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim
            ),
        ],
        name=f"{name}_embedding" if name else None,
    )


"""
### Implement the baseline model
"""


def create_baseline_model():
    # Receive the user as an input.
    user_input = layers.Input(name="user_id", shape=(), dtype=tf.string)
    # Get user embedding.
    user_embedding = embedding_encoder(
        vocabulary=user_vocabulary, embedding_dim=base_embedding_dim, name="user"
    )(user_input)

    # Receive the movie as an input.
    movie_input = layers.Input(name="movie_id", shape=(), dtype=tf.string)
    # Get movie embedding.
    movie_embedding = embedding_encoder(
        vocabulary=movie_vocabulary, embedding_dim=base_embedding_dim, name="movie"
    )(movie_input)

    # Compute dot product similarity between user and movie embeddings.
    logits = layers.Dot(axes=1, name="dot_similarity")(
        [user_embedding, movie_embedding]
    )
    # Convert to rating scale.
    prediction = keras.activations.sigmoid(logits) * 5
    # Create the model.
    model = keras.Model(
        inputs=[user_input, movie_input], outputs=prediction, name="baseline_model"
    )
    return model


baseline_model = create_baseline_model()
baseline_model.summary()

"""
Notice that the number of trainable parameters is 623,744.
"""

history = run_experiment(baseline_model)

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "eval"], loc="upper left")
plt.show()

"""
## Experiment 2: memory-efficient model
"""

"""
### Implement Quotient-Remainder embedding as a layer

The Quotient-Remainder technique works as follows. For a vocabulary of size `vocabulary_size`
and an embedding size `embedding_dim`, instead of creating a `vocabulary_size X embedding_dim`
embedding table, we create *two* `num_buckets X embedding_dim` embedding tables, where
`num_buckets` is much smaller than `vocabulary_size`.
An embedding for a given item `index` is generated via the following steps:

1. Compute the `quotient_index` as `index // num_buckets`.
2. Compute the `remainder_index` as `index % num_buckets`.
3. Lookup `quotient_embedding` from the first embedding table using `quotient_index`.
4. Lookup `remainder_embedding` from the second embedding table using `remainder_index`.
5. Return `quotient_embedding` * `remainder_embedding`.

This technique not only reduces the number of embedding vectors that need to be stored and trained,
but also generates a *unique* embedding vector for each item of size `embedding_dim`.
Note that `quotient_embedding` and `remainder_embedding` can be combined using other operations,
like `Add` and `Concatenate`.
"""

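"""
To get a feel for the savings, here is a small sketch with illustrative numbers (they are not
tied to the experiments below): with a vocabulary of 4,000 items and `embedding_dim = 64`, a
full embedding table holds 4,000 x 64 = 256,000 weights, while the quotient and remainder
tables with `num_buckets = 64` hold only 2 x 64 x 64 = 8,192 weights.
"""

# Illustrative sizes only; the experiments below use the real MovieLens vocabularies.
illustrative_vocabulary_size = 4000
illustrative_num_buckets = 64
full_table_weights = illustrative_vocabulary_size * base_embedding_dim
qr_tables_weights = 2 * illustrative_num_buckets * base_embedding_dim
print(f"Full embedding table weights: {full_table_weights}")
print(f"Quotient-remainder tables weights: {qr_tables_weights}")
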
class QREmbedding(keras.layers.Layer):
    def __init__(self, vocabulary, embedding_dim, num_buckets, name=None):
        super().__init__(name=name)
        self.num_buckets = num_buckets

        self.index_lookup = StringLookup(
            vocabulary=vocabulary, mask_token=None, num_oov_indices=0
        )
        self.q_embeddings = layers.Embedding(
            num_buckets,
            embedding_dim,
        )
        self.r_embeddings = layers.Embedding(
            num_buckets,
            embedding_dim,
        )

    def call(self, inputs):
        # Get the item index.
        embedding_index = self.index_lookup(inputs)
        # Get the quotient index.
        quotient_index = tf.math.floordiv(embedding_index, self.num_buckets)
        # Get the remainder index.
        remainder_index = tf.math.floormod(embedding_index, self.num_buckets)
        # Lookup the quotient_embedding using the quotient_index.
        quotient_embedding = self.q_embeddings(quotient_index)
        # Lookup the remainder_embedding using the remainder_index.
        remainder_embedding = self.r_embeddings(remainder_index)
        # Use multiplication as a combiner operation.
        return quotient_embedding * remainder_embedding

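"""
As a quick, optional check of the layer's interface, we can embed a tiny made-up vocabulary
and verify the output shape. The item names and bucket count below are hypothetical and only
serve to illustrate how the layer is called.
"""

toy_qr_embedding = QREmbedding(
    vocabulary=["item_a", "item_b", "item_c", "item_d"],
    embedding_dim=base_embedding_dim,
    num_buckets=2,
)
# Expected output shape: (3, 64), one embedding vector per input item.
print(toy_qr_embedding(tf.constant(["item_a", "item_c", "item_d"])).shape)
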
"""
### Implement Mixed Dimension embedding as a layer

In the mixed dimension embedding technique, we train embedding vectors with full dimensions
for the frequently queried items, while training embedding vectors with *reduced dimensions*
for less frequent items, plus a *projection weights matrix* to bring the low-dimension
embeddings up to the full dimension.

More precisely, we define *blocks* of items of similar frequencies. For each block,
a `block_vocab_size X block_embedding_dim` embedding table and a `block_embedding_dim X full_embedding_dim`
projection weights matrix are created. Note that, if `block_embedding_dim` equals `full_embedding_dim`,
the projection weights matrix becomes an *identity* matrix. Embeddings for a given batch of item
`indices` are generated via the following steps:

1. For each block, look up the `block_embedding_dim`-dimensional embedding vectors using `indices`,
and project them to `full_embedding_dim`.
2. If an item index does not belong to a given block, an out-of-vocabulary embedding is returned.
Each block will return a `batch_size X full_embedding_dim` tensor.
3. A mask is applied to the embeddings returned from each block in order to convert the
out-of-vocabulary embeddings to vectors of zeros. That is, for each item in the batch,
a single non-zero embedding vector is returned from all the block embeddings.
4. Embeddings retrieved from the blocks are combined using *sum* to produce the final
`batch_size X full_embedding_dim` tensor.
"""


class MDEmbedding(keras.layers.Layer):
    def __init__(
        self, blocks_vocabulary, blocks_embedding_dims, base_embedding_dim, name=None
    ):
        super().__init__(name=name)
        self.num_blocks = len(blocks_vocabulary)

        # Create a vocabulary-to-block lookup.
        keys = []
        values = []
        for block_idx, block_vocab in enumerate(blocks_vocabulary):
            keys.extend(block_vocab)
            values.extend([block_idx] * len(block_vocab))
        self.vocab_to_block = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1
        )

        self.block_embedding_encoders = []
        self.block_embedding_projectors = []

        # Create block embedding encoders and projectors.
        for idx in range(self.num_blocks):
            vocabulary = blocks_vocabulary[idx]
            embedding_dim = blocks_embedding_dims[idx]
            block_embedding_encoder = embedding_encoder(
                vocabulary, embedding_dim, num_oov_indices=1
            )
            self.block_embedding_encoders.append(block_embedding_encoder)
            if embedding_dim == base_embedding_dim:
                self.block_embedding_projectors.append(layers.Lambda(lambda x: x))
            else:
                self.block_embedding_projectors.append(
                    layers.Dense(units=base_embedding_dim)
                )

    def call(self, inputs):
        # Get the block index for each input item.
        block_indices = self.vocab_to_block.lookup(inputs)
        # Initialize output embeddings to zeros.
        embeddings = tf.zeros(shape=(tf.shape(inputs)[0], base_embedding_dim))
        # Generate embeddings from blocks.
        for idx in range(self.num_blocks):
            # Lookup embeddings from the current block.
            block_embeddings = self.block_embedding_encoders[idx](inputs)
            # Project embeddings to base_embedding_dim.
            block_embeddings = self.block_embedding_projectors[idx](block_embeddings)
            # Create a mask to filter out embeddings of items that do not belong to the current block.
            mask = tf.expand_dims(tf.cast(block_indices == idx, tf.dtypes.float32), 1)
            # Set the embeddings for the items not belonging to the current block to zeros.
            block_embeddings = block_embeddings * mask
            # Add the block embeddings to the final embeddings.
            embeddings += block_embeddings

        return embeddings

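"""
Again as an optional sanity check, we can call the layer on a tiny hypothetical vocabulary
split into two blocks of different embedding dimensions and verify that every item comes out
with the full `base_embedding_dim` width.
"""

toy_md_embedding = MDEmbedding(
    blocks_vocabulary=[["item_a", "item_b"], ["item_c", "item_d", "item_e"]],
    blocks_embedding_dims=[base_embedding_dim, 16],
    base_embedding_dim=base_embedding_dim,
)
# Expected output shape: (3, 64), regardless of which block each item belongs to.
print(toy_md_embedding(tf.constant(["item_a", "item_c", "item_e"])).shape)
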
"""
### Implement the memory-efficient model

In this experiment, we are going to use the **Quotient-Remainder** technique to reduce the
size of the user embeddings, and the **Mixed Dimension** technique to reduce the size of the
movie embeddings.

While in the [paper](https://arxiv.org/abs/1909.11810) an alpha-power rule is used to determine
the dimensions of the embedding of each block, we simply set the number of blocks and the
dimensions of the embeddings of each block based on the histogram visualization of movie popularity.
"""

movie_frequencies = ratings_data["movie_id"].value_counts()
movie_frequencies.hist(bins=10)

"""
You can see that we can group the movies into three blocks, and assign them 64, 32, and 16
embedding dimensions, respectively. Feel free to experiment with a different number of blocks
and dimensions.
"""

sorted_movie_vocabulary = list(movie_frequencies.keys())

movie_blocks_vocabulary = [
    sorted_movie_vocabulary[:400],  # high popularity movies block
    sorted_movie_vocabulary[400:1700],  # normal popularity movies block
    sorted_movie_vocabulary[1700:],  # low popularity movies block
]

movie_blocks_embedding_dims = [64, 32, 16]

user_embedding_num_buckets = len(user_vocabulary) // 50


def create_memory_efficient_model():
    # Take the user as an input.
    user_input = layers.Input(name="user_id", shape=(), dtype="string")
    # Get user embedding.
    user_embedding = QREmbedding(
        vocabulary=user_vocabulary,
        embedding_dim=base_embedding_dim,
        num_buckets=user_embedding_num_buckets,
        name="user_embedding",
    )(user_input)

    # Take the movie as an input.
    movie_input = layers.Input(name="movie_id", shape=(), dtype="string")
    # Get movie embedding.
    movie_embedding = MDEmbedding(
        blocks_vocabulary=movie_blocks_vocabulary,
        blocks_embedding_dims=movie_blocks_embedding_dims,
        base_embedding_dim=base_embedding_dim,
        name="movie_embedding",
    )(movie_input)

    # Compute dot product similarity between user and movie embeddings.
    logits = layers.Dot(axes=1, name="dot_similarity")(
        [user_embedding, movie_embedding]
    )
    # Convert to rating scale.
    prediction = keras.activations.sigmoid(logits) * 5
    # Create the model.
    model = keras.Model(
        inputs=[user_input, movie_input],
        outputs=prediction,
        name="memory_efficient_model",
    )
    return model


memory_efficient_model = create_memory_efficient_model()
memory_efficient_model.summary()

"""
Notice that the number of trainable parameters is 117,968, which is less than one fifth of
the number of parameters in the baseline model.
"""

history = run_experiment(memory_efficient_model)

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "eval"], loc="upper left")
plt.show()
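"""
As a final check, we can compare the sizes of the two models directly. The counts below should
match the trainable parameter counts reported by the model summaries above.
"""

print(f"Baseline model parameters: {baseline_model.count_params():,}")
print(f"Memory-efficient model parameters: {memory_efficient_model.count_params():,}")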