CoCalc -- customer_lifetime

GitHub Repository: keras-team/keras-io
Path: blob/master/examples/structured_data/customer_lifetime_value.py
³⁵⁰⁷ views
1
"""
2
Title: Deep Learning for Customer Lifetime Value
3
Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/)
4
Date created: 2024/11/23
5
Last modified: 2024/11/27
6
Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value.
7
Accelerator: None
8
"""
9

10
"""
11
## Introduction
12

13
A hybrid deep learning architecture combining Transformer encoders and LSTM networks
14
for predicting customer purchase patterns and lifetime value using transaction history.
15
While many existing review articles focus on classic parametric models and traditional machine learning algorithms
16
,this implementation leverages recent advancements in Transformer-based models for time series prediction.
17
The approach handles multi-granularity prediction across different temporal scales.
18

19
"""
20

21
"""
22
## Setting up Libraries for the Deep Learning Project
23
"""
24
import subprocess
25

26

27
def install_packages(packages):
28
    """
29
    Install a list of packages using pip.
30

31
    Args:
32
        packages (list): A list of package names to install.
33
    """
34
    for package in packages:
35
        subprocess.run(["pip", "install", package], check=True)
36

37

38
"""
39
## List of Packages to Install
40

41
1. uciml: For the purpose of the tutorial; we will be using
42
          the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail)
43
2. keras_hub: Access to the transformer encoder layer.
44
"""
45

46
packages_to_install = ["ucimlrepo", "keras_hub"]
47

48
# Install the packages
49
install_packages(packages_to_install)
50

51
# Core data processing and numerical libraries
52
import os
53

54
os.environ["KERAS_BACKEND"] = "jax"
55
import keras
56
import numpy as np
57
import pandas as pd
58
from typing import Dict
59

60

61
# Visualization
62
import matplotlib.pyplot as plt
63

64
# Keras imports
65
from keras import layers
66
from keras import Model
67
from keras import ops
68
from keras_hub.layers import TransformerEncoder
69
from keras import regularizers
70

71
# UK Retail Dataset
72
from ucimlrepo import fetch_ucirepo
73

74
"""
75
## Preprocessing the UK Retail dataset
76
"""
77

78

79
def prepare_time_series_data(data):
80
    """
81
    Preprocess retail transaction data for deep learning.
82

83
    Args:
84
        data: Raw transaction data containing InvoiceDate, UnitPrice, etc.
85
    Returns:
86
        Processed DataFrame with calculated features
87
    """
88
    processed_data = data.copy()
89

90
    # Essential datetime handling for temporal ordering
91
    processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"])
92

93
    # Basic business constraints and calculations
94
    processed_data = processed_data[processed_data["UnitPrice"] > 0]
95
    processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"]
96
    processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0)
97

98
    # Handle outliers in Amount using statistical thresholds
99
    q1 = processed_data["Amount"].quantile(0.25)
100
    q3 = processed_data["Amount"].quantile(0.75)
101

102
    # Define bounds - using 1.5 IQR rule
103
    lower_bound = q1 - 1.5 * (q3 - q1)
104
    upper_bound = q3 + 1.5 * (q3 - q1)
105

106
    # Filter outliers
107
    processed_data = processed_data[
108
        (processed_data["Amount"] >= lower_bound)
109
        & (processed_data["Amount"] <= upper_bound)
110
    ]
111

112
    return processed_data
113

114

115
# Load Data
116

117
online_retail = fetch_ucirepo(id=352)
118
raw_data = online_retail.data.features
119
transformed_data = prepare_time_series_data(raw_data)
120

121

122
def prepare_data_for_modeling(
123
    df: pd.DataFrame,
124
    input_sequence_length: int = 6,
125
    output_sequence_length: int = 6,
126
) -> Dict:
127
    """
128
    Transform retail data into sequence-to-sequence format with separate
129
    temporal and trend components.
130
    """
131
    df = df.copy()
132

133
    # Daily aggregation
134
    daily_purchases = (
135
        df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")])
136
        .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"})
137
        .reset_index()
138
    )
139

140
    daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0)
141

142
    # Monthly resampling
143
    monthly_purchases = (
144
        daily_purchases.set_index("InvoiceDate")
145
        .groupby("CustomerID")
146
        .resample("M")
147
        .agg(
148
            {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"}
149
        )
150
        .reset_index()
151
    )
152

153
    # Add cyclical temporal features
154
    def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray:
155

156
        month = input_window["InvoiceDate"].dt.month
157
        month_sin = np.sin(2 * np.pi * month / 12)
158
        month_cos = np.cos(2 * np.pi * month / 12)
159
        is_quarter_start = (month % 3 == 1).astype(int)
160

161
        temporal_features = np.column_stack(
162
            [
163
                month,
164
                input_window["InvoiceDate"].dt.year,
165
                month_sin,
166
                month_cos,
167
                is_quarter_start,
168
            ]
169
        )
170
        return temporal_features
171

172
    # Prepare trend features with lagged values
173
    def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray:
174

175
        lagged_data = pd.DataFrame()
176
        for i in range(1, lag + 1):
177
            lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i)
178
            lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i)
179
            lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i)
180

181
        lagged_data = lagged_data.fillna(0)
182

183
        trend_features = np.column_stack(
184
            [
185
                input_window["Amount"].values,
186
                input_window["Quantity"].values,
187
                input_window["frequency"].values,
188
                lagged_data.values,
189
            ]
190
        )
191
        return trend_features
192

193
    sequence_containers = {
194
        "temporal_sequences": [],
195
        "trend_sequences": [],
196
        "static_features": [],
197
        "output_sequences": [],
198
    }
199

200
    # Process sequences for each customer
201
    for customer_id, customer_data in monthly_purchases.groupby("CustomerID"):
202
        customer_data = customer_data.sort_values("InvoiceDate")
203
        sequence_ranges = (
204
            len(customer_data) - input_sequence_length - output_sequence_length + 1
205
        )
206

207
        country = customer_data["Country"].iloc[0]
208

209
        for i in range(sequence_ranges):
210
            input_window = customer_data.iloc[i : i + input_sequence_length]
211
            output_window = customer_data.iloc[
212
                i
213
                + input_sequence_length : i
214
                + input_sequence_length
215
                + output_sequence_length
216
            ]
217

218
            if (
219
                len(input_window) == input_sequence_length
220
                and len(output_window) == output_sequence_length
221
            ):
222
                temporal_features = prepare_temporal_features(input_window)
223
                trend_features = prepare_trend_features(input_window)
224

225
                sequence_containers["temporal_sequences"].append(temporal_features)
226
                sequence_containers["trend_sequences"].append(trend_features)
227
                sequence_containers["static_features"].append(country)
228
                sequence_containers["output_sequences"].append(
229
                    output_window["Amount"].values
230
                )
231

232
    return {
233
        "temporal_sequences": (
234
            np.array(sequence_containers["temporal_sequences"], dtype=np.float32)
235
        ),
236
        "trend_sequences": (
237
            np.array(sequence_containers["trend_sequences"], dtype=np.float32)
238
        ),
239
        "static_features": np.array(sequence_containers["static_features"]),
240
        "output_sequences": (
241
            np.array(sequence_containers["output_sequences"], dtype=np.float32)
242
        ),
243
    }
244

245

246
# Transform data with input and output sequences into a Output dictionary
247
output = prepare_data_for_modeling(
248
    df=transformed_data, input_sequence_length=6, output_sequence_length=6
249
)
250

251
"""
252
## Scaling and Splitting
253
"""
254

255

256
def robust_scale(data):
257
    """
258
    Min-Max scaling function since standard deviation is high.
259
    """
260
    data = np.array(data)
261
    data_min = np.min(data)
262
    data_max = np.max(data)
263
    scaled = (data - data_min) / (data_max - data_min)
264
    return scaled
265

266

267
def create_temporal_splits_with_scaling(
268
    prepared_data: Dict[str, np.ndarray],
269
    test_ratio: float = 0.2,
270
    val_ratio: float = 0.2,
271
):
272
    total_sequences = len(prepared_data["trend_sequences"])
273
    # Calculate split points
274
    test_size = int(total_sequences * test_ratio)
275
    val_size = int(total_sequences * val_ratio)
276
    train_size = total_sequences - (test_size + val_size)
277

278
    # Scale trend sequences
279
    trend_shape = prepared_data["trend_sequences"].shape
280
    scaled_trends = np.zeros_like(prepared_data["trend_sequences"])
281

282
    # Scale each feature independently
283
    for i in range(trend_shape[-1]):
284
        scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i])
285
    # Scale output sequences
286
    scaled_outputs = robust_scale(prepared_data["output_sequences"])
287

288
    # Create splits
289
    train_data = {
290
        "trend_sequences": scaled_trends[:train_size],
291
        "temporal_sequences": prepared_data["temporal_sequences"][:train_size],
292
        "static_features": prepared_data["static_features"][:train_size],
293
        "output_sequences": scaled_outputs[:train_size],
294
    }
295

296
    val_data = {
297
        "trend_sequences": scaled_trends[train_size : train_size + val_size],
298
        "temporal_sequences": prepared_data["temporal_sequences"][
299
            train_size : train_size + val_size
300
        ],
301
        "static_features": prepared_data["static_features"][
302
            train_size : train_size + val_size
303
        ],
304
        "output_sequences": scaled_outputs[train_size : train_size + val_size],
305
    }
306

307
    test_data = {
308
        "trend_sequences": scaled_trends[train_size + val_size :],
309
        "temporal_sequences": prepared_data["temporal_sequences"][
310
            train_size + val_size :
311
        ],
312
        "static_features": prepared_data["static_features"][train_size + val_size :],
313
        "output_sequences": scaled_outputs[train_size + val_size :],
314
    }
315

316
    return train_data, val_data, test_data
317

318

319
# Usage
320
train_data, val_data, test_data = create_temporal_splits_with_scaling(output)
321

322
"""
323
## Evaluation
324
"""
325

326

327
def calculate_metrics(y_true, y_pred):
328
    """
329
    Calculates RMSE, MAE and R²
330
    """
331
    # Convert inputs to "float32"
332
    y_true = ops.cast(y_true, dtype="float32")
333
    y_pred = ops.cast(y_pred, dtype="float32")
334

335
    # RMSE
336
    rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))
337

338
    # R² (coefficient of determination)
339
    ss_res = np.sum(np.square(y_true - y_pred))
340
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
341
    r2 = 1 - (ss_res / ss_tot)
342

343
    return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2}
344

345

346
def plot_lorenz_analysis(y_true, y_pred):
347
    """
348
    Plots Lorenz curves to show distribution of high and low value users
349
    """
350
    # Convert to numpy arrays and flatten
351
    y_true = np.array(y_true).flatten()
352
    y_pred = np.array(y_pred).flatten()
353

354
    # Sort values in descending order (for high-value users analysis)
355
    true_sorted = np.sort(-y_true)
356
    pred_sorted = np.sort(-y_pred)
357

358
    # Calculate cumulative sums
359
    true_cumsum = np.cumsum(true_sorted)
360
    pred_cumsum = np.cumsum(pred_sorted)
361

362
    # Normalize to percentages
363
    true_cumsum_pct = true_cumsum / true_cumsum[-1]
364
    pred_cumsum_pct = pred_cumsum / pred_cumsum[-1]
365

366
    # Generate percentiles for x-axis
367
    percentiles = np.linspace(0, 1, len(y_true))
368

369
    # Calculate Mutual Gini (area between curves)
370
    mutual_gini = np.abs(
371
        np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles)
372
    )
373

374
    # Create plot
375
    plt.figure(figsize=(10, 6))
376
    plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values")
377
    plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values")
378
    plt.xlabel("Cumulative % of Users (Descending Order)")
379
    plt.ylabel("Cumulative % of LTV")
380
    plt.title("Lorenz Curves: True vs Predicted Values")
381
    plt.legend()
382
    plt.grid(True)
383
    print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)")
384
    plt.show()
385

386
    return mutual_gini
387

388

389
"""
390
## Hybrid Transformer / LSTM model architecture
391

392
The hybrid nature of this model is particularly significant because it combines RNN's
393
ability to handle sequential data with Transformer's attention mechanisms for capturing
394
global patterns across countries and seasonality.
395
"""
396

397

398
def build_hybrid_model(
399
    input_sequence_length: int,
400
    output_sequence_length: int,
401
    num_countries: int,
402
    d_model: int = 8,
403
    num_heads: int = 4,
404
):
405

406
    keras.utils.set_random_seed(seed=42)
407

408
    # Inputs
409
    temporal_inputs = layers.Input(
410
        shape=(input_sequence_length, 5), name="temporal_inputs"
411
    )
412
    trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs")
413
    country_inputs = layers.Input(
414
        shape=(num_countries,), dtype="int32", name="country_inputs"
415
    )
416

417
    # Process country features
418
    country_embedding = layers.Embedding(
419
        input_dim=num_countries,
420
        output_dim=d_model,
421
        mask_zero=False,
422
        name="country_embedding",
423
    )(
424
        country_inputs
425
    )  # Output shape: (batch_size, 1, d_model)
426

427
    # Flatten the embedding output
428
    country_embedding = layers.Flatten(name="flatten_country_embedding")(
429
        country_embedding
430
    )
431

432
    # Repeat the country embedding across timesteps
433
    country_embedding_repeated = layers.RepeatVector(
434
        input_sequence_length, name="repeat_country_embedding"
435
    )(country_embedding)
436

437
    # Projection of temporal inputs to match Transformer dimensions
438
    temporal_projection = layers.Dense(
439
        d_model, activation="tanh", name="temporal_projection"
440
    )(temporal_inputs)
441

442
    # Combine all features
443
    combined_features = layers.Concatenate()(
444
        [temporal_projection, country_embedding_repeated]
445
    )
446

447
    transformer_output = combined_features
448
    for _ in range(3):
449
        transformer_output = TransformerEncoder(
450
            intermediate_dim=16, num_heads=num_heads
451
        )(transformer_output)
452

453
    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)
454

455
    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
456
        transformer_output
457
    )
458
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)
459
    # Concatenate flattened Transformer output with LSTM output
460
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
461
        [transformer_flattened, lstm_output]
462
    )
463
    # Repeat the merged features to match the output sequence length
464
    decoder_initial = layers.RepeatVector(
465
        output_sequence_length, name="repeat_merged_features"
466
    )(merged_features)
467

468
    decoder_lstm = layers.LSTM(
469
        units=64,
470
        return_sequences=True,
471
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
472
    )(decoder_initial)
473

474
    # Output Dense layer
475
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
476
        decoder_lstm
477
    )
478

479
    model = Model(
480
        inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output
481
    )
482

483
    model.compile(
484
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
485
        loss="mse",
486
        metrics=["mse"],
487
    )
488

489
    return model
490

491

492
# Create the hybrid model
493
model = build_hybrid_model(
494
    input_sequence_length=6,
495
    output_sequence_length=6,
496
    num_countries=len(np.unique(train_data["static_features"])) + 1,
497
    d_model=8,
498
    num_heads=4,
499
)
500

501
# Configure StringLookup
502
label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1)
503

504
# Adapt and encode
505
label_encoder.adapt(train_data["static_features"])
506

507
train_static_encoded = label_encoder(train_data["static_features"])
508
val_static_encoded = label_encoder(val_data["static_features"])
509
test_static_encoded = label_encoder(test_data["static_features"])
510

511
# Convert sequences with proper type casting
512
x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32)
513
x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32)
514
x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32)
515
x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32)
516
train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32)
517
val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32)
518
test_output = np.asarray(test_data["output_sequences"]).astype(np.float32)
519
# Training setup
520
keras.utils.set_random_seed(seed=42)
521

522
history = model.fit(
523
    [x_train_temporal, x_train_seq, train_static_encoded],
524
    train_outputs,
525
    validation_data=(
526
        [x_val_temporal, x_val_seq, val_static_encoded],
527
        val_data["output_sequences"].astype(np.float32),
528
    ),
529
    epochs=20,
530
    batch_size=30,
531
)
532

533
# Make predictions
534
predictions = model.predict(
535
    [
536
        test_data["temporal_sequences"].astype(np.float32),
537
        test_data["trend_sequences"].astype(np.float32),
538
        test_static_encoded,
539
    ]
540
)
541

542
# Calculate the predictions
543
predictions = np.squeeze(predictions)
544

545
# Calculate basic metrics
546
hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions)
547

548
# Plot Lorenz curves and get Mutual Gini
549
hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions)
550

551
"""
552
## Conclusion
553

554
While LSTMs excel at sequence to sequence learning as demonstrated through the work of Sutskever, I., Vinyals,
555
O., & Le, Q. V. (2014) Sequence to sequence learning with neural networks.
556
The hybrid approach here enhances this foundation. The addition of attention mechanisms allows the model to adaptively
557
focus on relevant temporal/geographical patterns while maintaining the LSTM's inherent strengths in sequence learning.
558
This combination has proven especially effective for handling both periodic patterns and special events in time
559
series forecasting from Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021).
560
Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting.
561
"""
562

563
Product

Resources

Company