"""
2
Title: Memory-efficient embeddings for recommendation systems
3
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
4
Date created: 2021/02/15
5
Last modified: 2023/11/15
6
Description: Using compositional & mixed-dimension embeddings for memory-efficient recommendation models.
7
Accelerator: GPU
8
"""

"""
## Introduction

This example demonstrates two techniques for building memory-efficient recommendation models
by reducing the size of the embedding tables, without sacrificing model effectiveness:

1. [Quotient-remainder trick](https://arxiv.org/abs/1909.02107), by Hao-Jun Michael Shi et al.,
which reduces the number of embedding vectors to store, yet produces a unique embedding
vector for each item without an explicit definition.
2. [Mixed Dimension embeddings](https://arxiv.org/abs/1909.11810), by Antonio Ginart et al.,
which stores embedding vectors with mixed dimensions, where less popular items have
reduced-dimension embeddings.

We use the [1M version of the MovieLens dataset](https://grouplens.org/datasets/movielens/1m/).
The dataset includes around 1 million ratings from 6,000 users on 4,000 movies.
"""

"""
## Setup
"""

import os

os.environ["KERAS_BACKEND"] = "tensorflow"

from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras import layers
from keras.layers import StringLookup
import matplotlib.pyplot as plt

"""
## Prepare the data

### Download and process data
"""

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
ZipFile("movielens.zip", "r").extractall()

ratings_data = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
)

ratings_data["movie_id"] = ratings_data["movie_id"].apply(lambda x: f"movie_{x}")
ratings_data["user_id"] = ratings_data["user_id"].apply(lambda x: f"user_{x}")
ratings_data["rating"] = ratings_data["rating"].apply(lambda x: float(x))
del ratings_data["unix_timestamp"]

print(f"Number of users: {len(ratings_data.user_id.unique())}")
print(f"Number of movies: {len(ratings_data.movie_id.unique())}")
print(f"Number of ratings: {len(ratings_data.index)}")

"""
## Create train and eval data splits
"""

random_selection = np.random.rand(len(ratings_data.index)) <= 0.85
train_data = ratings_data[random_selection]
eval_data = ratings_data[~random_selection]

train_data.to_csv("train_data.csv", index=False, sep="|", header=False)
eval_data.to_csv("eval_data.csv", index=False, sep="|", header=False)
print(f"Train data split: {len(train_data.index)}")
print(f"Eval data split: {len(eval_data.index)}")
print("Train and eval data files are saved.")

"""
## Define dataset metadata and hyperparameters
"""

csv_header = list(ratings_data.columns)
user_vocabulary = list(ratings_data.user_id.unique())
movie_vocabulary = list(ratings_data.movie_id.unique())
target_feature_name = "rating"
learning_rate = 0.001
batch_size = 128
num_epochs = 3
base_embedding_dim = 64

"""
## Train and evaluate the model
"""


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=True):
    return tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=csv_header,
        label_name=target_feature_name,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    )


def run_experiment(model):
    # Compile the model.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.MeanAbsoluteError(name="mae")],
    )
    # Read the training data.
    train_dataset = get_dataset_from_csv("train_data.csv", batch_size)
    # Read the test data.
    eval_dataset = get_dataset_from_csv("eval_data.csv", batch_size, shuffle=False)
    # Fit the model with the training data.
    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=eval_dataset,
    )
    return history


"""
## Experiment 1: baseline collaborative filtering model

### Implement embedding encoder
"""


def embedding_encoder(vocabulary, embedding_dim, num_oov_indices=0, name=None):
    return keras.Sequential(
        [
            StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices
            ),
            layers.Embedding(
                input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim
            ),
        ],
        name=f"{name}_embedding" if name else None,
    )


"""
### Implement the baseline model
"""


def create_baseline_model():
    # Receive the user as an input.
    user_input = layers.Input(name="user_id", shape=(), dtype=tf.string)
    # Get user embedding.
    user_embedding = embedding_encoder(
        vocabulary=user_vocabulary, embedding_dim=base_embedding_dim, name="user"
    )(user_input)

    # Receive the movie as an input.
    movie_input = layers.Input(name="movie_id", shape=(), dtype=tf.string)
    # Get movie embedding.
    movie_embedding = embedding_encoder(
        vocabulary=movie_vocabulary, embedding_dim=base_embedding_dim, name="movie"
    )(movie_input)

    # Compute dot product similarity between user and movie embeddings.
    logits = layers.Dot(axes=1, name="dot_similarity")(
        [user_embedding, movie_embedding]
    )
    # Convert to rating scale.
    prediction = keras.activations.sigmoid(logits) * 5
    # Create the model.
    model = keras.Model(
        inputs=[user_input, movie_input], outputs=prediction, name="baseline_model"
    )
    return model


baseline_model = create_baseline_model()
baseline_model.summary()

"""
Notice that the number of trainable parameters is 623,744.
"""

history = run_experiment(baseline_model)

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "eval"], loc="upper left")
plt.show()

"""
## Experiment 2: memory-efficient model
"""

"""
### Implement Quotient-Remainder embedding as a layer

The Quotient-Remainder technique works as follows. For a vocabulary and an embedding size
`embedding_dim`, instead of creating a `vocabulary_size X embedding_dim` embedding table,
we create *two* `num_buckets X embedding_dim` embedding tables, where `num_buckets`
is much smaller than `vocabulary_size`.
An embedding for a given item `index` is generated via the following steps:

1. Compute the `quotient_index` as `index // num_buckets`.
2. Compute the `remainder_index` as `index % num_buckets`.
3. Lookup `quotient_embedding` from the first embedding table using `quotient_index`.
4. Lookup `remainder_embedding` from the second embedding table using `remainder_index`.
5. Return `quotient_embedding` * `remainder_embedding`.

This technique not only reduces the number of embedding vectors that need to be stored and trained,
but also generates a *unique* embedding vector of size `embedding_dim` for each item.
Note that `quotient_embedding` and `remainder_embedding` can be combined using other operations,
like `Add` and `Concatenate`. A short numeric sketch of the idea follows.
"""


class QREmbedding(keras.layers.Layer):
    def __init__(self, vocabulary, embedding_dim, num_buckets, name=None):
        super().__init__(name=name)
        self.num_buckets = num_buckets

        self.index_lookup = StringLookup(
            vocabulary=vocabulary, mask_token=None, num_oov_indices=0
        )
        self.q_embeddings = layers.Embedding(
            num_buckets,
            embedding_dim,
        )
        self.r_embeddings = layers.Embedding(
            num_buckets,
            embedding_dim,
        )

    def call(self, inputs):
        # Get the item index.
        embedding_index = self.index_lookup(inputs)
        # Get the quotient index.
        quotient_index = tf.math.floordiv(embedding_index, self.num_buckets)
        # Get the remainder index.
        remainder_index = tf.math.floormod(embedding_index, self.num_buckets)
        # Lookup the quotient_embedding using the quotient_index.
        quotient_embedding = self.q_embeddings(quotient_index)
        # Lookup the remainder_embedding using the remainder_index.
        remainder_embedding = self.r_embeddings(remainder_index)
        # Use multiplication as a combiner operation.
        return quotient_embedding * remainder_embedding


"""
### Implement Mixed Dimension embedding as a layer

In the mixed dimension embedding technique, we train full-dimension embedding vectors
for the frequently queried items, while training *reduced-dimension* embedding vectors
for less frequent items, plus a *projection weights matrix* that brings the low-dimension
embeddings up to the full dimension.

More precisely, we define *blocks* of items of similar frequencies. For each block,
a `block_vocab_size X block_embedding_dim` embedding table and a `block_embedding_dim X full_embedding_dim`
projection weights matrix are created. Note that, if `block_embedding_dim` equals `full_embedding_dim`,
the projection weights matrix becomes an *identity* matrix. Embeddings for a given batch of item
`indices` are generated via the following steps:

1. For each block, lookup the `block_embedding_dim`-dimensional embedding vectors using `indices`, and
project them to the `full_embedding_dim`.
2. If an item index does not belong to a given block, an out-of-vocabulary embedding is returned.
Each block will return a `batch_size X full_embedding_dim` tensor.
3. A mask is applied to the embeddings returned from each block in order to convert the
out-of-vocabulary embeddings to vectors of zeros. That is, for each item in the batch,
a single non-zero embedding vector is returned from all the block embeddings.
4. Embeddings retrieved from the blocks are combined using *sum* to produce the final
`batch_size X full_embedding_dim` tensor.

The masking-and-summing step (3 and 4) is sketched with toy numbers below.
"""


class MDEmbedding(keras.layers.Layer):
    def __init__(
        self, blocks_vocabulary, blocks_embedding_dims, base_embedding_dim, name=None
    ):
        super().__init__(name=name)
        self.num_blocks = len(blocks_vocabulary)
        self.base_embedding_dim = base_embedding_dim

        # Create vocab to block lookup.
        keys = []
        values = []
        for block_idx, block_vocab in enumerate(blocks_vocabulary):
            keys.extend(block_vocab)
            values.extend([block_idx] * len(block_vocab))
        self.vocab_to_block = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1
        )

        self.block_embedding_encoders = []
        self.block_embedding_projectors = []

        # Create block embedding encoders and projectors.
        for idx in range(self.num_blocks):
            vocabulary = blocks_vocabulary[idx]
            embedding_dim = blocks_embedding_dims[idx]
            block_embedding_encoder = embedding_encoder(
                vocabulary, embedding_dim, num_oov_indices=1
            )
            self.block_embedding_encoders.append(block_embedding_encoder)
            if embedding_dim == base_embedding_dim:
                self.block_embedding_projectors.append(layers.Lambda(lambda x: x))
            else:
                self.block_embedding_projectors.append(
                    layers.Dense(units=base_embedding_dim)
                )

    def call(self, inputs):
        # Get block index for each input item.
        block_indices = self.vocab_to_block.lookup(inputs)
        # Initialize output embeddings to zeros.
        embeddings = tf.zeros(shape=(tf.shape(inputs)[0], self.base_embedding_dim))
        # Generate embeddings from blocks.
        for idx in range(self.num_blocks):
            # Lookup embeddings from the current block.
            block_embeddings = self.block_embedding_encoders[idx](inputs)
            # Project embeddings to base_embedding_dim.
            block_embeddings = self.block_embedding_projectors[idx](block_embeddings)
            # Create a mask to filter out embeddings of items that do not belong to the current block.
            mask = tf.expand_dims(tf.cast(block_indices == idx, tf.dtypes.float32), 1)
            # Set the embeddings for the items not belonging to the current block to zeros.
            block_embeddings = block_embeddings * mask
            # Add the block embeddings to the final embeddings.
            embeddings += block_embeddings

        return embeddings


"""
### Implement the memory-efficient model

In this experiment, we are going to use the **Quotient-Remainder** technique to reduce the
size of the user embeddings, and the **Mixed Dimension** technique to reduce the size of the
movie embeddings.

While the [paper](https://arxiv.org/abs/1909.11810) uses an alpha-power rule to determine
the dimensions of the embedding of each block, here we simply set the number of blocks and the
dimensions of the embeddings of each block based on the histogram of movie popularity shown below.
"""

movie_frequencies = ratings_data["movie_id"].value_counts()
movie_frequencies.hist(bins=10)

"""
You can see that we can group the movies into three blocks, and assign them 64, 32, and 16
embedding dimensions, respectively. Feel free to experiment with a different number of blocks
and dimensions; a quick way to inspect the chosen cut-offs is sketched below.
"""

sorted_movie_vocabulary = list(movie_frequencies.keys())

movie_blocks_vocabulary = [
    sorted_movie_vocabulary[:400],  # high popularity movies block
    sorted_movie_vocabulary[400:1700],  # normal popularity movies block
    sorted_movie_vocabulary[1700:],  # low popularity movies block
]

movie_blocks_embedding_dims = [64, 32, 16]

user_embedding_num_buckets = len(user_vocabulary) // 50


def create_memory_efficient_model():
    # Take the user as an input.
    user_input = layers.Input(name="user_id", shape=(), dtype="string")
    # Get user embedding.
    user_embedding = QREmbedding(
        vocabulary=user_vocabulary,
        embedding_dim=base_embedding_dim,
        num_buckets=user_embedding_num_buckets,
        name="user_embedding",
    )(user_input)

    # Take the movie as an input.
    movie_input = layers.Input(name="movie_id", shape=(), dtype="string")
    # Get movie embedding.
    movie_embedding = MDEmbedding(
        blocks_vocabulary=movie_blocks_vocabulary,
        blocks_embedding_dims=movie_blocks_embedding_dims,
        base_embedding_dim=base_embedding_dim,
        name="movie_embedding",
    )(movie_input)

    # Compute dot product similarity between user and movie embeddings.
    logits = layers.Dot(axes=1, name="dot_similarity")(
        [user_embedding, movie_embedding]
    )
    # Convert to rating scale.
    prediction = keras.activations.sigmoid(logits) * 5
    # Create the model.
    model = keras.Model(
        inputs=[user_input, movie_input],
        outputs=prediction,
        name="memory_efficient_model",
    )
    return model


memory_efficient_model = create_memory_efficient_model()
memory_efficient_model.summary()

"""
Notice that the number of trainable parameters is 117,968, which is more than 5x smaller than
the number of parameters in the baseline model (623,744 / 117,968 ≈ 5.3).
"""

history = run_experiment(memory_efficient_model)

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "eval"], loc="upper left")
plt.show()