"""
Title: Classification with Neural Decision Forests
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
Date created: 2021/01/15
Last modified: 2021/01/15
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
Accelerator: GPU
"""

"""
## Introduction

This example provides an implementation of the
[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529)
model introduced by P. Kontschieder et al. for structured data classification.
It demonstrates how to build a stochastic and differentiable decision tree model,
train it end-to-end, and unify decision trees with deep representation learning.

## The dataset

This example uses the
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
provided by the
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
The task is binary classification: predicting whether a person is likely to earn
more than USD 50,000 a year.

The dataset includes 48,842 instances with 14 input features (such as age, work
class, education, and occupation): 6 numerical features and 8 categorical features.
"""

"""
## Setup
"""

import keras
from keras import layers
from keras.layers import StringLookup
from keras import ops

from tensorflow import data as tf_data
import numpy as np
import pandas as pd

import math

"""
## Prepare the data
"""

CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

"""
Remove the first record (because it is not a valid data example) and the trailing
'dot' in the class labels.
"""

test_data = test_data[1:]
test_data.income_bracket = test_data.income_bracket.apply(
    lambda value: value.replace(".", "")
)

"""
We store the training and test data splits locally as CSV files.
"""

train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

"""
## Define dataset metadata

Here, we define the metadata of the dataset that will be useful for reading,
parsing, and encoding the input features.
"""

# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sorted(list(train_data["workclass"].unique())),
    "education": sorted(list(train_data["education"].unique())),
    "marital_status": sorted(list(train_data["marital_status"].unique())),
    "occupation": sorted(list(train_data["occupation"].unique())),
    "relationship": sorted(list(train_data["relationship"].unique())),
    "race": sorted(list(train_data["race"].unique())),
    "gender": sorted(list(train_data["gender"].unique())),
    "native_country": sorted(list(train_data["native_country"].unique())),
}
# A list of the columns to ignore from the dataset.
IGNORE_COLUMN_NAMES = ["fnlwgt"]
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
    for feature_name in CSV_HEADER
]
# The name of the target feature.
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target feature.
TARGET_LABELS = [" <=50K", " >50K"]

"""
## Create `tf_data.Dataset` objects for training and validation

We create an input function to read and parse the files, and to convert features
and labels into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training and validation. We also preprocess the input by mapping the target
label to an index.
"""

target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)

lookup_dict = {}
for feature_name in CATEGORICAL_FEATURE_NAMES:
    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
    # Create a lookup to convert string values to integer indices.
    # Since we are not using a mask token, nor expecting any out-of-vocabulary
    # (OOV) token, we set mask_token to None and num_oov_indices to 0.
    lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
    lookup_dict[feature_name] = lookup
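
"""
As a quick illustration, the target lookup maps the two label strings to the
indices 0 and 1:
"""

print(target_label_lookup([" <=50K", " >50K"]))  # prints a tensor containing [0 1]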


def encode_categorical(batch_x, batch_y):
    # Map each categorical feature from its string value to its integer index.
    for feature_name in CATEGORICAL_FEATURE_NAMES:
        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])

    return batch_x, batch_y


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = (
        tf_data.experimental.make_csv_dataset(
            csv_file_path,
            batch_size=batch_size,
            column_names=CSV_HEADER,
            column_defaults=COLUMN_DEFAULTS,
            label_name=TARGET_FEATURE_NAME,
            num_epochs=1,
            header=False,
            na_value="?",
            shuffle=shuffle,
        )
        .map(lambda features, target: (features, target_label_lookup(target)))
        .map(encode_categorical)
    )

    return dataset.cache()
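
"""
As a sanity check, we can pull one small batch and verify that the categorical
features arrive as integer indices and the labels as 0/1 indices:
"""

example_features, example_labels = next(
    iter(get_dataset_from_csv(train_data_file, batch_size=5))
)
print("workclass:", example_features["workclass"].numpy())
print("labels:", example_labels.numpy())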

"""
## Create model inputs
"""


def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype="float32"
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype="int32"
            )
    return inputs

"""
## Encode input features

Each categorical feature is mapped to a trainable embedding whose dimensionality
is the (floored) square root of its vocabulary size; numerical features are used
as-is.
"""


def encode_inputs(inputs):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Reuse the lookup created earlier for this feature to size the
            # embedding table; the inputs are already integer indices.
            lookup = lookup_dict[feature_name]
            value_index = inputs[feature_name]
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(
                input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
            )
            # Convert the index values to embedding representations.
            encoded_feature = embedding(value_index)
        else:
            # Use the numerical features as-is.
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = ops.expand_dims(encoded_feature, -1)

        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features
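
"""
To make the heuristic concrete, we can print each categorical feature's vocabulary
size and the embedding dimensionality it yields:
"""

for feature_name in CATEGORICAL_FEATURE_NAMES:
    vocab_size = lookup_dict[feature_name].vocabulary_size()
    embedding_dims = int(math.sqrt(vocab_size))
    print(f"{feature_name}: vocabulary size {vocab_size} -> embedding dims {embedding_dims}")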

"""
## Deep Neural Decision Tree

A neural decision tree model has two sets of weights to learn. The first set is `pi`,
which represents the probability distribution of the classes in the tree leaves.
The second set is the weights of the routing layer `decision_fn`, which represents
the probability of going to each leaf. The forward pass of the model works as follows:

1. The model expects input `features` as a single vector encoding all the features of
an instance in the batch. This vector can be generated by a Convolutional Neural
Network (CNN) applied to images, or by dense transformations applied to structured
data features.
2. The model applies a `used_features_mask` to randomly select a subset of the input
features to use.
3. The model then computes the probabilities (`mu`) of the input instances reaching
the tree leaves by iteratively performing *stochastic* routing through the tree
levels, as the toy example below illustrates.
4. Finally, the probabilities of reaching the leaves are weighted by the class
probabilities at the leaves to produce the final `outputs`.
"""


class NeuralDecisionTree(keras.Model):
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.depth = depth
        self.num_leaves = 2**depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indices = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = ops.convert_to_tensor(
            one_hot[sampled_feature_indices], dtype="float32"
        )

        # Initialize the weights of the classes in leaves.
        self.pi = self.add_weight(
            initializer="random_normal",
            shape=[self.num_leaves, self.num_classes],
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = ops.shape(features)[0]

        # Apply the feature mask to the input features.
        features = ops.matmul(
            features, ops.transpose(self.used_features_mask)
        )  # [batch_size, num_used_features]
        # Compute the routing probabilities.
        decisions = ops.expand_dims(
            self.decision_fn(features), axis=2
        )  # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )  # [batch_size, num_leaves, 2]

        mu = ops.ones([batch_size, 1, 1])

        begin_idx = 1
        end_idx = 2
        # Traverse the tree in breadth-first order.
        for level in range(self.depth):
            mu = ops.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2 ** level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = ops.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
        return outputs
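
"""
As a quick shape check, a randomly initialized tree applied to dummy features
should return one probability distribution over the classes per instance:
"""

demo_tree = NeuralDecisionTree(
    depth=3, num_features=8, used_features_rate=0.5, num_classes=2
)
demo_outputs = demo_tree(ops.ones((4, 8)))
print(demo_outputs.shape)  # (4, 2)
print(ops.sum(demo_outputs, axis=1))  # each entry is ~1.0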

"""
## Deep Neural Decision Forest

The neural decision forest model consists of a set of neural decision trees that are
trained simultaneously. The output of the forest model is the average of the outputs
of its trees.
"""


class NeuralDecisionForest(keras.Model):
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = ops.shape(inputs)[0]
        outputs = ops.zeros([batch_size, self.num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs
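
"""
We can verify the averaging behavior the same way: the forest output matches the
element-wise mean of its trees' outputs.
"""

demo_forest = NeuralDecisionForest(
    num_trees=3, depth=2, num_features=8, used_features_rate=0.5, num_classes=2
)
demo_features = ops.ones((4, 8))
demo_forest_outputs = demo_forest(demo_features)
stacked_tree_outputs = ops.stack([tree(demo_features) for tree in demo_forest.ensemble])
print(ops.mean(stacked_tree_outputs, axis=0) - demo_forest_outputs)  # ~0 everywhere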

"""
Finally, let's set up the code that will train and evaluate the model.
"""

learning_rate = 0.01
batch_size = 265
num_epochs = 10


def run_experiment(model):
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    train_dataset = get_dataset_from_csv(
        train_data_file, shuffle=True, batch_size=batch_size
    )

    model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    print("Evaluating the model on the test data...")
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)

    _, accuracy = model.evaluate(test_dataset)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

"""
## Experiment 1: train a decision tree model

In this experiment, we train a single neural decision tree model
that uses all of the input features.
"""

num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tree_model = create_tree_model()
run_experiment(tree_model)

"""
## Experiment 2: train a forest model

In this experiment, we train a neural decision forest with `num_trees` trees,
where each tree uses a randomly selected 50% of the input features. You can control
the number of features used in each tree by setting the `used_features_rate`
variable. In addition, we reduce the depth to 5, down from 10 in the previous
experiment.
"""

num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    forest_model = NeuralDecisionForest(
        num_trees, depth, num_features, used_features_rate, num_classes
    )

    outputs = forest_model(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


forest_model = create_forest_model()

run_experiment(forest_model)