"""
Title: Deep Learning for Customer Lifetime Value
Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/)
Date created: 2024/11/23
Last modified: 2024/11/27
Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value.
Accelerator: None
"""

"""
## Introduction

A hybrid deep learning architecture combining Transformer encoders and LSTM networks
for predicting customer purchase patterns and lifetime value using transaction history.
While many existing review articles focus on classic parametric models and traditional
machine learning algorithms, this implementation leverages recent advancements in
Transformer-based models for time series prediction. The approach handles
multi-granularity prediction across different temporal scales.
"""

"""
## Setting up Libraries for the Deep Learning Project
"""
import subprocess


def install_packages(packages):
    """
    Install a list of packages using pip.

    Args:
        packages (list): A list of package names to install.
    """
    for package in packages:
        subprocess.run(["pip", "install", package], check=True)


"""
## List of Packages to Install

1. ucimlrepo: For the purposes of this tutorial, we will be using
the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail).
2. keras_hub: Access to the Transformer encoder layer.
"""

packages_to_install = ["ucimlrepo", "keras_hub"]

# Install the packages
install_packages(packages_to_install)

# Core data processing and numerical libraries
import os

os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import pandas as pd
from typing import Dict


# Visualization
import matplotlib.pyplot as plt

# Keras imports
from keras import layers
from keras import Model
from keras import ops
from keras_hub.layers import TransformerEncoder
from keras import regularizers

# UK Retail Dataset
from ucimlrepo import fetch_ucirepo

"""
## Preprocessing the UK Retail dataset
"""


def prepare_time_series_data(data):
    """
    Preprocess retail transaction data for deep learning.

    Args:
        data: Raw transaction data containing InvoiceDate, UnitPrice, etc.
    Returns:
        Processed DataFrame with calculated features
    """
    processed_data = data.copy()

    # Essential datetime handling for temporal ordering
    processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"])

    # Basic business constraints and calculations
    processed_data = processed_data[processed_data["UnitPrice"] > 0]
    processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"]
    processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0)

    # Handle outliers in Amount using statistical thresholds
    q1 = processed_data["Amount"].quantile(0.25)
    q3 = processed_data["Amount"].quantile(0.75)

    # Define bounds - using the 1.5 * IQR rule
    lower_bound = q1 - 1.5 * (q3 - q1)
    upper_bound = q3 + 1.5 * (q3 - q1)

    # Filter outliers
    processed_data = processed_data[
        (processed_data["Amount"] >= lower_bound)
        & (processed_data["Amount"] <= upper_bound)
    ]

    return processed_data

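"""
To make the 1.5 * IQR rule above concrete, here is a tiny, purely illustrative check on
made-up numbers (not part of the dataset): values outside `[q1 - 1.5 * IQR, q3 + 1.5 * IQR]`
are dropped.
"""

# Illustrative only: derive the outlier bounds for a small toy series.
toy_amounts = pd.Series([1.0, 2.0, 2.5, 3.0, 3.5, 4.0, 100.0])
toy_q1, toy_q3 = toy_amounts.quantile(0.25), toy_amounts.quantile(0.75)
toy_iqr = toy_q3 - toy_q1
# Prints the bounds (0.0 and 6.0 here), so the 100.0 entry would be filtered out.
print("Toy bounds:", toy_q1 - 1.5 * toy_iqr, toy_q3 + 1.5 * toy_iqr)
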
# Load Data

online_retail = fetch_ucirepo(id=352)
raw_data = online_retail.data.features
transformed_data = prepare_time_series_data(raw_data)


def prepare_data_for_modeling(
    df: pd.DataFrame,
    input_sequence_length: int = 6,
    output_sequence_length: int = 6,
) -> Dict:
    """
    Transform retail data into sequence-to-sequence format with separate
    temporal and trend components.
    """
    df = df.copy()

    # Daily aggregation
    daily_purchases = (
        df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")])
        .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"})
        .reset_index()
    )

    daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0)

    # Monthly resampling
    monthly_purchases = (
        daily_purchases.set_index("InvoiceDate")
        .groupby("CustomerID")
        .resample("M")
        .agg(
            {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"}
        )
        .reset_index()
    )

    # Add cyclical temporal features
    def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray:
        month = input_window["InvoiceDate"].dt.month
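        # Sine/cosine encoding places the month on a circle so that adjacent
        # months stay close, e.g. December (12) and January (1).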
        month_sin = np.sin(2 * np.pi * month / 12)
        month_cos = np.cos(2 * np.pi * month / 12)
        is_quarter_start = (month % 3 == 1).astype(int)

        temporal_features = np.column_stack(
            [
                month,
                input_window["InvoiceDate"].dt.year,
                month_sin,
                month_cos,
                is_quarter_start,
            ]
        )
        return temporal_features

    # Prepare trend features with lagged values
    def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray:
        lagged_data = pd.DataFrame()
        for i in range(1, lag + 1):
            lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i)
            lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i)
            lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i)

        lagged_data = lagged_data.fillna(0)

        trend_features = np.column_stack(
            [
                input_window["Amount"].values,
                input_window["Quantity"].values,
                input_window["frequency"].values,
                lagged_data.values,
            ]
        )
        return trend_features

    sequence_containers = {
        "temporal_sequences": [],
        "trend_sequences": [],
        "static_features": [],
        "output_sequences": [],
    }

    # Process sequences for each customer
    for customer_id, customer_data in monthly_purchases.groupby("CustomerID"):
        customer_data = customer_data.sort_values("InvoiceDate")
        sequence_ranges = (
            len(customer_data) - input_sequence_length - output_sequence_length + 1
        )

        country = customer_data["Country"].iloc[0]

        for i in range(sequence_ranges):
            input_window = customer_data.iloc[i : i + input_sequence_length]
            output_window = customer_data.iloc[
                i
                + input_sequence_length : i
                + input_sequence_length
                + output_sequence_length
            ]

            if (
                len(input_window) == input_sequence_length
                and len(output_window) == output_sequence_length
            ):
                temporal_features = prepare_temporal_features(input_window)
                trend_features = prepare_trend_features(input_window)

                sequence_containers["temporal_sequences"].append(temporal_features)
                sequence_containers["trend_sequences"].append(trend_features)
                sequence_containers["static_features"].append(country)
                sequence_containers["output_sequences"].append(
                    output_window["Amount"].values
                )

    return {
        "temporal_sequences": (
            np.array(sequence_containers["temporal_sequences"], dtype=np.float32)
        ),
        "trend_sequences": (
            np.array(sequence_containers["trend_sequences"], dtype=np.float32)
        ),
        "static_features": np.array(sequence_containers["static_features"]),
        "output_sequences": (
            np.array(sequence_containers["output_sequences"], dtype=np.float32)
        ),
    }


# Transform the data into input and output sequences, collected in an output dictionary
output = prepare_data_for_modeling(
    df=transformed_data, input_sequence_length=6, output_sequence_length=6
)

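"""
As a quick, optional sanity check, the prepared arrays follow the
`(num_sequences, sequence_length, num_features)` layout described above; the exact first
dimension depends on the dataset download.
"""

# Optional sanity check of the prepared sequence shapes.
for key, value in output.items():
    print(key, value.shape)
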
"""
252
## Scaling and Splitting
253
"""
254
255
256
def robust_scale(data):
257
"""
258
Min-Max scaling function since standard deviation is high.
259
"""
260
data = np.array(data)
261
data_min = np.min(data)
262
data_max = np.max(data)
263
scaled = (data - data_min) / (data_max - data_min)
264
return scaled
265
266
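"""
As a tiny, purely illustrative check (made-up numbers), min-max scaling maps the smallest
value to 0 and the largest to 1:
"""

# Illustrative only: min-max scaling of a small toy array.
print(robust_scale([0.0, 5.0, 10.0]))  # -> approximately [0. 0.5 1.]

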
def create_temporal_splits_with_scaling(
    prepared_data: Dict[str, np.ndarray],
    test_ratio: float = 0.2,
    val_ratio: float = 0.2,
):
    total_sequences = len(prepared_data["trend_sequences"])
    # Calculate split points
    test_size = int(total_sequences * test_ratio)
    val_size = int(total_sequences * val_ratio)
    train_size = total_sequences - (test_size + val_size)

    # Scale trend sequences
    trend_shape = prepared_data["trend_sequences"].shape
    scaled_trends = np.zeros_like(prepared_data["trend_sequences"])

    # Scale each feature independently
    for i in range(trend_shape[-1]):
        scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i])
    # Scale output sequences
    scaled_outputs = robust_scale(prepared_data["output_sequences"])

    # Create splits
    train_data = {
        "trend_sequences": scaled_trends[:train_size],
        "temporal_sequences": prepared_data["temporal_sequences"][:train_size],
        "static_features": prepared_data["static_features"][:train_size],
        "output_sequences": scaled_outputs[:train_size],
    }

    val_data = {
        "trend_sequences": scaled_trends[train_size : train_size + val_size],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size : train_size + val_size
        ],
        "static_features": prepared_data["static_features"][
            train_size : train_size + val_size
        ],
        "output_sequences": scaled_outputs[train_size : train_size + val_size],
    }

    test_data = {
        "trend_sequences": scaled_trends[train_size + val_size :],
        "temporal_sequences": prepared_data["temporal_sequences"][
            train_size + val_size :
        ],
        "static_features": prepared_data["static_features"][train_size + val_size :],
        "output_sequences": scaled_outputs[train_size + val_size :],
    }

    return train_data, val_data, test_data


# Usage
train_data, val_data, test_data = create_temporal_splits_with_scaling(output)

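"""
The split is positional over the prepared sequences: roughly the first 60% go to training,
the next 20% to validation, and the final 20% to test. A quick, optional check of the
resulting sizes:
"""

# Optional check of the train / validation / test split sizes.
print(
    len(train_data["trend_sequences"]),
    len(val_data["trend_sequences"]),
    len(test_data["trend_sequences"]),
)
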
"""
323
## Evaluation
324
"""
325
326
327
def calculate_metrics(y_true, y_pred):
328
"""
329
Calculates RMSE, MAE and R²
330
"""
331
# Convert inputs to "float32"
332
y_true = ops.cast(y_true, dtype="float32")
333
y_pred = ops.cast(y_pred, dtype="float32")
334
335
# RMSE
336
rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))
337
338
# R² (coefficient of determination)
339
ss_res = np.sum(np.square(y_true - y_pred))
340
ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
341
r2 = 1 - (ss_res / ss_tot)
342
343
return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2}
344
345
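"""
As a small, purely illustrative example (made-up numbers), the helper returns all three
scores in a dictionary:
"""

# Illustrative only: the metric helper on tiny made-up arrays.
print(calculate_metrics(np.array([1.0, 2.0, 3.0]), np.array([1.1, 1.9, 3.2])))

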
def plot_lorenz_analysis(y_true, y_pred):
    """
    Plots Lorenz curves to show the distribution of high- and low-value users.
    """
    # Convert to numpy arrays and flatten
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Sort values in descending order (for high-value users analysis)
    true_sorted = np.sort(-y_true)
    pred_sorted = np.sort(-y_pred)

    # Calculate cumulative sums
    true_cumsum = np.cumsum(true_sorted)
    pred_cumsum = np.cumsum(pred_sorted)

    # Normalize to percentages
    true_cumsum_pct = true_cumsum / true_cumsum[-1]
    pred_cumsum_pct = pred_cumsum / pred_cumsum[-1]

    # Generate percentiles for x-axis
    percentiles = np.linspace(0, 1, len(y_true))

    # Calculate Mutual Gini (area between curves)
    mutual_gini = np.abs(
        np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles)
    )

    # Create plot
    plt.figure(figsize=(10, 6))
    plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values")
    plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values")
    plt.xlabel("Cumulative % of Users (Descending Order)")
    plt.ylabel("Cumulative % of LTV")
    plt.title("Lorenz Curves: True vs Predicted Values")
    plt.legend()
    plt.grid(True)
    print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)")
    plt.show()

    return mutual_gini

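"""
The "Mutual Gini" reported above is simply the absolute difference between the areas under
the two Lorenz curves, approximated with the trapezoidal rule (`np.trapz`):

$$\text{Mutual Gini} = \left| \int_0^1 L_{\text{true}}(p)\,dp - \int_0^1 L_{\text{pred}}(p)\,dp \right|$$

Roughly speaking, a value near 0 means the predicted values are concentrated across users to
the same degree as the true values.
"""
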
"""
## Hybrid Transformer / LSTM model architecture

The hybrid nature of this model is particularly significant because it combines the RNN's
ability to handle sequential data with the Transformer's attention mechanisms for capturing
global patterns across countries and seasonality.
"""


def build_hybrid_model(
    input_sequence_length: int,
    output_sequence_length: int,
    num_countries: int,
    d_model: int = 8,
    num_heads: int = 4,
):
    keras.utils.set_random_seed(seed=42)

    # Inputs
    temporal_inputs = layers.Input(
        shape=(input_sequence_length, 5), name="temporal_inputs"
    )
    trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs")
    country_inputs = layers.Input(
        shape=(num_countries,), dtype="int32", name="country_inputs"
    )

    # Process country features
    country_embedding = layers.Embedding(
        input_dim=num_countries,
        output_dim=d_model,
        mask_zero=False,
        name="country_embedding",
    )(
        country_inputs
    )  # Output shape: (batch_size, num_countries, d_model)

    # Flatten the embedding output
    country_embedding = layers.Flatten(name="flatten_country_embedding")(
        country_embedding
    )

    # Repeat the country embedding across timesteps
    country_embedding_repeated = layers.RepeatVector(
        input_sequence_length, name="repeat_country_embedding"
    )(country_embedding)

    # Projection of temporal inputs to match Transformer dimensions
    temporal_projection = layers.Dense(
        d_model, activation="tanh", name="temporal_projection"
    )(temporal_inputs)

    # Combine all features
    combined_features = layers.Concatenate()(
        [temporal_projection, country_embedding_repeated]
    )

    transformer_output = combined_features
    for _ in range(3):
        transformer_output = TransformerEncoder(
            intermediate_dim=16, num_heads=num_heads
        )(transformer_output)

    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)

    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
        transformer_output
    )
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)
    # Concatenate flattened Transformer output with LSTM output
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
        [transformer_flattened, lstm_output]
    )
    # Repeat the merged features to match the output sequence length
    decoder_initial = layers.RepeatVector(
        output_sequence_length, name="repeat_merged_features"
    )(merged_features)

    decoder_lstm = layers.LSTM(
        units=64,
        return_sequences=True,
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    )(decoder_initial)

    # Output Dense layer
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
        decoder_lstm
    )

    model = Model(
        inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss="mse",
        metrics=["mse"],
    )

    return model


# Create the hybrid model
model = build_hybrid_model(
    input_sequence_length=6,
    output_sequence_length=6,
    num_countries=len(np.unique(train_data["static_features"])) + 1,
    d_model=8,
    num_heads=4,
)

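"""
Optionally, inspect the resulting architecture and parameter counts:
"""

# Optional: print a summary of the hybrid model.
model.summary()
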
# Configure StringLookup
label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1)

# Adapt and encode
label_encoder.adapt(train_data["static_features"])

train_static_encoded = label_encoder(train_data["static_features"])
val_static_encoded = label_encoder(val_data["static_features"])
test_static_encoded = label_encoder(test_data["static_features"])

# Convert sequences with proper type casting
x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32)
x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32)
x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32)
x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32)
train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32)
val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32)
test_output = np.asarray(test_data["output_sequences"]).astype(np.float32)
# Training setup
keras.utils.set_random_seed(seed=42)

history = model.fit(
    [x_train_temporal, x_train_seq, train_static_encoded],
    train_outputs,
    validation_data=(
        [x_val_temporal, x_val_seq, val_static_encoded],
        val_data["output_sequences"].astype(np.float32),
    ),
    epochs=20,
    batch_size=30,
)

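"""
Optionally, plot the training and validation loss curves recorded in `history` to check
for overfitting:
"""

# Optional: visualize the training and validation loss.
plt.figure(figsize=(8, 4))
plt.plot(history.history["loss"], label="Training loss")
plt.plot(history.history["val_loss"], label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.grid(True)
plt.show()
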
# Make predictions
predictions = model.predict(
    [
        test_data["temporal_sequences"].astype(np.float32),
        test_data["trend_sequences"].astype(np.float32),
        test_static_encoded,
    ]
)

# Remove the trailing feature dimension from the predictions
predictions = np.squeeze(predictions)

# Calculate basic metrics
hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions)

# Plot Lorenz curves and get Mutual Gini
hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions)

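"""
Optionally, print the aggregate test-set metrics computed above:
"""

# Optional: display the test-set metrics of the hybrid model.
print(hybrid_metrics)
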
"""
552
## Conclusion
553
554
While LSTMs excel at sequence to sequence learning as demonstrated through the work of Sutskever, I., Vinyals,
555
O., & Le, Q. V. (2014) Sequence to sequence learning with neural networks.
556
The hybrid approach here enhances this foundation. The addition of attention mechanisms allows the model to adaptively
557
focus on relevant temporal/geographical patterns while maintaining the LSTM's inherent strengths in sequence learning.
558
This combination has proven especially effective for handling both periodic patterns and special events in time
559
series forecasting from Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021).
560
Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting.
561
"""
562
563