CoCalc -- deep_neural_decision

GitHub Repository: keras-team/keras-io
Path: blob/master/examples/structured_data/deep_neural_decision_forests.py
³⁵⁰⁷ views
1
"""
2
Title: Classification with Neural Decision Forests
3
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
4
Date created: 2021/01/15
5
Last modified: 2021/01/15
6
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
7
Accelerator: GPU
8
"""
9

10
"""
11
## Introduction
12

13
This example provides an implementation of the
14
[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529)
15
model introduced by P. Kontschieder et al. for structured data classification.
16
It demonstrates how to build a stochastic and differentiable decision tree model,
17
train it end-to-end, and unify decision trees with deep representation learning.
18

19
## The dataset
20

21
This example uses the
22
[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income)
23
provided by the
24
[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).
25
The task is binary classification
26
to predict whether a person is likely to be making over USD 50,000 a year.
27

28
The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features
29
and 9 categorical features.
30
"""
31

32
"""
33
## Setup
34
"""
35

36
import keras
37
from keras import layers
38
from keras.layers import StringLookup
39
from keras import ops
40

41

42
from tensorflow import data as tf_data
43
import numpy as np
44
import pandas as pd
45

46
import math
47

48

49
"""
50
## Prepare the data
51
"""
52

53
CSV_HEADER = [
54
    "age",
55
    "workclass",
56
    "fnlwgt",
57
    "education",
58
    "education_num",
59
    "marital_status",
60
    "occupation",
61
    "relationship",
62
    "race",
63
    "gender",
64
    "capital_gain",
65
    "capital_loss",
66
    "hours_per_week",
67
    "native_country",
68
    "income_bracket",
69
]
70

71
train_data_url = (
72
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
73
)
74
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)
75

76
test_data_url = (
77
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
78
)
79
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
80

81
print(f"Train dataset shape: {train_data.shape}")
82
print(f"Test dataset shape: {test_data.shape}")
83

84
"""
85
Remove the first record (because it is not a valid data example) and a trailing
86
'dot' in the class labels.
87
"""
88

89
test_data = test_data[1:]
90
test_data.income_bracket = test_data.income_bracket.apply(
91
    lambda value: value.replace(".", "")
92
)
93

94
"""
95
We store the training and test data splits locally as CSV files.
96
"""
97

98
train_data_file = "train_data.csv"
99
test_data_file = "test_data.csv"
100

101
train_data.to_csv(train_data_file, index=False, header=False)
102
test_data.to_csv(test_data_file, index=False, header=False)
103

104
"""
105
## Define dataset metadata
106

107
Here, we define the metadata of the dataset that will be useful for reading and parsing
108
and encoding input features.
109
"""
110

111
# A list of the numerical feature names.
112
NUMERIC_FEATURE_NAMES = [
113
    "age",
114
    "education_num",
115
    "capital_gain",
116
    "capital_loss",
117
    "hours_per_week",
118
]
119
# A dictionary of the categorical features and their vocabulary.
120
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
121
    "workclass": sorted(list(train_data["workclass"].unique())),
122
    "education": sorted(list(train_data["education"].unique())),
123
    "marital_status": sorted(list(train_data["marital_status"].unique())),
124
    "occupation": sorted(list(train_data["occupation"].unique())),
125
    "relationship": sorted(list(train_data["relationship"].unique())),
126
    "race": sorted(list(train_data["race"].unique())),
127
    "gender": sorted(list(train_data["gender"].unique())),
128
    "native_country": sorted(list(train_data["native_country"].unique())),
129
}
130
# A list of the columns to ignore from the dataset.
131
IGNORE_COLUMN_NAMES = ["fnlwgt"]
132
# A list of the categorical feature names.
133
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
134
# A list of all the input features.
135
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
136
# A list of column default values for each feature.
137
COLUMN_DEFAULTS = [
138
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
139
    for feature_name in CSV_HEADER
140
]
141
# The name of the target feature.
142
TARGET_FEATURE_NAME = "income_bracket"
143
# A list of the labels of the target features.
144
TARGET_LABELS = [" <=50K", " >50K"]
145

146
"""
147
## Create `tf_data.Dataset` objects for training and validation
148

149
We create an input function to read and parse the file, and convert features and labels
150
into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets)
151
for training and validation. We also preprocess the input by mapping the target label
152
to an index.
153
"""
154

155

156
target_label_lookup = StringLookup(
157
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
158
)
159

160

161
lookup_dict = {}
162
for feature_name in CATEGORICAL_FEATURE_NAMES:
163
    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
164
    # Create a lookup to convert a string values to an integer indices.
165
    # Since we are not using a mask token, nor expecting any out of vocabulary
166
    # (oov) token, we set mask_token to None and num_oov_indices to 0.
167
    lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
168
    lookup_dict[feature_name] = lookup
169

170

171
def encode_categorical(batch_x, batch_y):
172
    for feature_name in CATEGORICAL_FEATURE_NAMES:
173
        batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name])
174

175
    return batch_x, batch_y
176

177

178
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
179
    dataset = (
180
        tf_data.experimental.make_csv_dataset(
181
            csv_file_path,
182
            batch_size=batch_size,
183
            column_names=CSV_HEADER,
184
            column_defaults=COLUMN_DEFAULTS,
185
            label_name=TARGET_FEATURE_NAME,
186
            num_epochs=1,
187
            header=False,
188
            na_value="?",
189
            shuffle=shuffle,
190
        )
191
        .map(lambda features, target: (features, target_label_lookup(target)))
192
        .map(encode_categorical)
193
    )
194

195
    return dataset.cache()
196

197

198
"""
199
## Create model inputs
200
"""
201

202

203
def create_model_inputs():
204
    inputs = {}
205
    for feature_name in FEATURE_NAMES:
206
        if feature_name in NUMERIC_FEATURE_NAMES:
207
            inputs[feature_name] = layers.Input(
208
                name=feature_name, shape=(), dtype="float32"
209
            )
210
        else:
211
            inputs[feature_name] = layers.Input(
212
                name=feature_name, shape=(), dtype="int32"
213
            )
214
    return inputs
215

216

217
"""
218
## Encode input features
219
"""
220

221

222
def encode_inputs(inputs):
223
    encoded_features = []
224
    for feature_name in inputs:
225
        if feature_name in CATEGORICAL_FEATURE_NAMES:
226
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
227
            # Create a lookup to convert a string values to an integer indices.
228
            # Since we are not using a mask token, nor expecting any out of vocabulary
229
            # (oov) token, we set mask_token to None and num_oov_indices to 0.
230
            value_index = inputs[feature_name]
231
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
232
            # Create an embedding layer with the specified dimensions.
233
            embedding = layers.Embedding(
234
                input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
235
            )
236
            # Convert the index values to embedding representations.
237
            encoded_feature = embedding(value_index)
238
        else:
239
            # Use the numerical features as-is.
240
            encoded_feature = inputs[feature_name]
241
            if inputs[feature_name].shape[-1] is None:
242
                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
243

244
        encoded_features.append(encoded_feature)
245

246
    encoded_features = layers.concatenate(encoded_features)
247
    return encoded_features
248

249

250
"""
251
## Deep Neural Decision Tree
252

253
A neural decision tree model has two sets of weights to learn. The first set is `pi`,
254
which represents the probability distribution of the classes in the tree leaves.
255
The second set is the weights of the routing layer `decision_fn`, which represents the probability
256
of going to each leave. The forward pass of the model works as follows:
257

258
1. The model expects input `features` as a single vector encoding all the features of an instance
259
in the batch. This vector can be generated from a Convolution Neural Network (CNN) applied to images
260
or dense transformations applied to structured data features.
261
2. The model first applies a `used_features_mask` to randomly select a subset of input features to use.
262
3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves
263
by iteratively performing a *stochastic* routing throughout the tree levels.
264
4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the
265
leaves to produce the final `outputs`.
266
"""
267

268

269
class NeuralDecisionTree(keras.Model):
270
    def __init__(self, depth, num_features, used_features_rate, num_classes):
271
        super().__init__()
272
        self.depth = depth
273
        self.num_leaves = 2**depth
274
        self.num_classes = num_classes
275

276
        # Create a mask for the randomly selected features.
277
        num_used_features = int(num_features * used_features_rate)
278
        one_hot = np.eye(num_features)
279
        sampled_feature_indices = np.random.choice(
280
            np.arange(num_features), num_used_features, replace=False
281
        )
282
        self.used_features_mask = ops.convert_to_tensor(
283
            one_hot[sampled_feature_indices], dtype="float32"
284
        )
285

286
        # Initialize the weights of the classes in leaves.
287
        self.pi = self.add_weight(
288
            initializer="random_normal",
289
            shape=[self.num_leaves, self.num_classes],
290
            dtype="float32",
291
            trainable=True,
292
        )
293

294
        # Initialize the stochastic routing layer.
295
        self.decision_fn = layers.Dense(
296
            units=self.num_leaves, activation="sigmoid", name="decision"
297
        )
298

299
    def call(self, features):
300
        batch_size = ops.shape(features)[0]
301

302
        # Apply the feature mask to the input features.
303
        features = ops.matmul(
304
            features, ops.transpose(self.used_features_mask)
305
        )  # [batch_size, num_used_features]
306
        # Compute the routing probabilities.
307
        decisions = ops.expand_dims(
308
            self.decision_fn(features), axis=2
309
        )  # [batch_size, num_leaves, 1]
310
        # Concatenate the routing probabilities with their complements.
311
        decisions = layers.concatenate(
312
            [decisions, 1 - decisions], axis=2
313
        )  # [batch_size, num_leaves, 2]
314

315
        mu = ops.ones([batch_size, 1, 1])
316

317
        begin_idx = 1
318
        end_idx = 2
319
        # Traverse the tree in breadth-first order.
320
        for level in range(self.depth):
321
            mu = ops.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
322
            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
323
            level_decisions = decisions[
324
                :, begin_idx:end_idx, :
325
            ]  # [batch_size, 2 ** level, 2]
326
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
327
            begin_idx = end_idx
328
            end_idx = begin_idx + 2 ** (level + 1)
329

330
        mu = ops.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
331
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
332
        outputs = ops.matmul(mu, probabilities)  # [batch_size, num_classes]
333
        return outputs
334

335

336
"""
337
## Deep Neural Decision Forest
338

339
The neural decision forest model consists of a set of neural decision trees that are
340
trained simultaneously. The output of the forest model is the average outputs of its trees.
341
"""
342

343

344
class NeuralDecisionForest(keras.Model):
345
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
346
        super().__init__()
347
        self.ensemble = []
348
        # Initialize the ensemble by adding NeuralDecisionTree instances.
349
        # Each tree will have its own randomly selected input features to use.
350
        for _ in range(num_trees):
351
            self.ensemble.append(
352
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
353
            )
354

355
    def call(self, inputs):
356
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
357
        batch_size = ops.shape(inputs)[0]
358
        outputs = ops.zeros([batch_size, num_classes])
359

360
        # Aggregate the outputs of trees in the ensemble.
361
        for tree in self.ensemble:
362
            outputs += tree(inputs)
363
        # Divide the outputs by the ensemble size to get the average.
364
        outputs /= len(self.ensemble)
365
        return outputs
366

367

368
"""
369
Finally, let's set up the code that will train and evaluate the model.
370
"""
371

372
learning_rate = 0.01
373
batch_size = 265
374
num_epochs = 10
375

376

377
def run_experiment(model):
378
    model.compile(
379
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
380
        loss=keras.losses.SparseCategoricalCrossentropy(),
381
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
382
    )
383

384
    print("Start training the model...")
385
    train_dataset = get_dataset_from_csv(
386
        train_data_file, shuffle=True, batch_size=batch_size
387
    )
388

389
    model.fit(train_dataset, epochs=num_epochs)
390
    print("Model training finished")
391

392
    print("Evaluating the model on the test data...")
393
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)
394

395
    _, accuracy = model.evaluate(test_dataset)
396
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
397

398

399
"""
400
## Experiment 1: train a decision tree model
401

402
In this experiment, we train a single neural decision tree model
403
where we use all input features.
404
"""
405

406
num_trees = 10
407
depth = 10
408
used_features_rate = 1.0
409
num_classes = len(TARGET_LABELS)
410

411

412
def create_tree_model():
413
    inputs = create_model_inputs()
414
    features = encode_inputs(inputs)
415
    features = layers.BatchNormalization()(features)
416
    num_features = features.shape[1]
417

418
    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
419

420
    outputs = tree(features)
421
    model = keras.Model(inputs=inputs, outputs=outputs)
422
    return model
423

424

425
tree_model = create_tree_model()
426
run_experiment(tree_model)
427

428

429
"""
430
## Experiment 2: train a forest model
431

432
In this experiment, we train a neural decision forest with `num_trees` trees
433
where each tree uses randomly selected 50% of the input features. You can control the number
434
of features to be used in each tree by setting the `used_features_rate` variable.
435
In addition, we set the depth to 5 instead of 10 compared to the previous experiment.
436
"""
437

438
num_trees = 25
439
depth = 5
440
used_features_rate = 0.5
441

442

443
def create_forest_model():
444
    inputs = create_model_inputs()
445
    features = encode_inputs(inputs)
446
    features = layers.BatchNormalization()(features)
447
    num_features = features.shape[1]
448

449
    forest_model = NeuralDecisionForest(
450
        num_trees, depth, num_features, used_features_rate, num_classes
451
    )
452

453
    outputs = forest_model(features)
454
    model = keras.Model(inputs=inputs, outputs=outputs)
455
    return model
456

457

458
forest_model = create_forest_model()
459

460
run_experiment(forest_model)
461

462
Product

Resources

Company