Classification with Neural Decision Forests
Author: Khalid Salama
Date created: 2021/01/15
Last modified: 2021/01/15
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.
View in Colab •
GitHub source
Introduction
This example provides an implementation of the Deep Neural Decision Forest model introduced by P. Kontschieder et al. for structured data classification. It demonstrates how to build a stochastic and differentiable decision tree model, train it end-to-end, and unify decision trees with deep representation learning.
The dataset
This example uses the United States Census Income Dataset provided by the UC Irvine Machine Learning Repository. The task is binary classification to predict whether a person is likely to be making over USD 50,000 a year.
The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features and 9 categorical features.
Setup
import keras
from keras import layers
from keras.layers import StringLookup
from keras import ops
from tensorflow import data as tf_data
import numpy as np
import pandas as pd
import math
Prepare the data
# Column names for the Adult census CSV files; the files are read with
# `header=None`, so these names are supplied explicitly.
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# Locations of the two splits in the UCI repository.
train_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
test_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

# Download each split into a DataFrame, labelling the columns with CSV_HEADER.
train_data = pd.read_csv(train_data_url, names=CSV_HEADER, header=None)
test_data = pd.read_csv(test_data_url, names=CSV_HEADER, header=None)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")
```
Train dataset shape: (32561, 15)
Test dataset shape: (16282, 15)
</div>
Remove the first record (because it is not a valid data example) and a trailing
'dot' in the class labels.
```python
# Drop the first row of the test split (it is not a valid data example).
# `.copy()` makes the result an independent frame, so the column assignment
# below does not write into a slice of the original DataFrame (the pattern
# that triggers pandas' SettingWithCopyWarning).
test_data = test_data.iloc[1:].copy()

# Strip the 'dot' from the class labels of the test split. `regex=False`
# makes "." a literal character rather than the regex "match anything".
test_data["income_bracket"] = test_data["income_bracket"].str.replace(
    ".", "", regex=False
)
We store the training and test data splits locally as CSV files.
# Write each split to a local CSV file without the index column and without
# a header row.
train_data_file = "train_data.csv"
train_data.to_csv(path_or_buf=train_data_file, header=False, index=False)

test_data_file = "test_data.csv"
test_data.to_csv(path_or_buf=test_data_file, header=False, index=False)
Here, we define the metadata of the dataset that will be useful for reading and parsing the data and for encoding the input features.
# Names of the numerical input features.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Each categorical feature mapped to the sorted list of distinct values that
# appear for it in the training split.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    column: sorted(train_data[column].unique())
    for column in (
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native_country",
    )
}

# Columns that are parsed from the CSV but not used as model features.
IGNORE_COLUMN_NAMES = ["fnlwgt"]

# Names of the categorical features, in vocabulary-dict order.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY)

# All model input features: numerical ones first, then categorical ones.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

# Per-column default values, in CSV_HEADER order: [0.0] for numerical and
# ignored columns, ["NA"] for every other column.
COLUMN_DEFAULTS = [
    [0.0]
    if (column in NUMERIC_FEATURE_NAMES or column in IGNORE_COLUMN_NAMES)
    else ["NA"]
    for column in CSV_HEADER
]

# Name of the label column.
TARGET_FEATURE_NAME = "income_bracket"

# The two class labels as they appear in the data (note the leading space).
TARGET_LABELS = [" <=50K", " >50K"]
Create tf_data.Dataset
objects for training and validation
We create an input function to read and parse the file, and convert features and labels into a `tf_data.Dataset`
for training and validation. We also preprocess the input by mapping the target label and the categorical feature values to integer indices.
# Lookup layer that converts the target label strings in TARGET_LABELS to
# integer indices. No mask token and no out-of-vocabulary slots are reserved.
target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)

# One StringLookup layer per categorical feature, keyed by feature name and
# built from that feature's vocabulary (again with no mask token and no
# out-of-vocabulary slots).
lookup_dict = {}
for feature_name in CATEGORICAL_FEATURE_NAMES:
    vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
    lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
    lookup_dict[feature_name] = lookup
# NOTE: `feature_name`, `vocabulary` and `lookup` stay bound at module level
# after this loop, holding the values for the LAST categorical feature. Use
# `lookup_dict[name]` to get the layer for a specific feature.
def encode_categorical(batch_x, batch_y):
    """Replace each categorical feature in `batch_x` with its lookup output.

    Every feature named in CATEGORICAL_FEATURE_NAMES is passed through its
    layer from `lookup_dict` and written back into `batch_x` under the same
    key. `batch_x` is modified in place; `batch_y` is returned untouched.

    Returns:
        The `(batch_x, batch_y)` pair.
    """
    for name in CATEGORICAL_FEATURE_NAMES:
        to_index = lookup_dict[name]
        batch_x[name] = to_index(batch_x[name])
    return batch_x, batch_y
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a cached, batched dataset of (features, label index) from a CSV.

    The file is read without a header row using CSV_HEADER as column names
    and COLUMN_DEFAULTS as per-column defaults, for a single pass
    (`num_epochs=1`), with "?" given as the NA marker. The label column is
    mapped through `target_label_lookup`, and the categorical features are
    mapped through `encode_categorical`.

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Forwarded to `make_csv_dataset`.
        batch_size: Number of records per batch.

    Returns:
        The resulting dataset with `.cache()` applied.
    """
    raw_batches = tf_data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    )

    def index_target(features, target):
        # Turn the label string into its integer index.
        return features, target_label_lookup(target)

    with_label_index = raw_batches.map(index_target)
    fully_encoded = with_label_index.map(encode_categorical)

    # NOTE(review): cache() sits downstream of the shuffle, so presumably the
    # batch order produced on the first pass is replayed on later passes -
    # confirm against the tf.data documentation if per-epoch reshuffling matters.
    return fully_encoded.cache()
def create_model_inputs():
    """Create one scalar Keras `Input` per feature in FEATURE_NAMES.

    Numerical features get a "float32" input; all remaining (categorical)
    features get an "int32" input.

    Returns:
        A dict mapping each feature name to its `layers.Input`.
    """

    def dtype_for(name):
        return "float32" if name in NUMERIC_FEATURE_NAMES else "int32"

    return {
        name: layers.Input(name=name, shape=(), dtype=dtype_for(name))
        for name in FEATURE_NAMES
    }
def encode_inputs(inputs):
    """Encode the model inputs into one concatenated feature tensor.

    Categorical inputs are passed through an `Embedding` layer whose input
    dimension is the vocabulary size of THAT feature's lookup layer, and whose
    output dimension is the integer square root of that size. Numerical inputs
    are used as-is, with a trailing axis added when their last dimension is
    unknown so that they can be concatenated with the embeddings.

    Args:
        inputs: Dict mapping feature names to Keras input tensors.

    Returns:
        The concatenation of all encoded features.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Size the embedding from this feature's own lookup layer. Reading
            # the module-level `lookup` variable here would silently use the
            # vocabulary of whichever feature was processed last when the
            # lookup layers were built.
            vocabulary_size = lookup_dict[feature_name].vocabulary_size()
            embedding_dims = int(math.sqrt(vocabulary_size))
            embedding = layers.Embedding(
                input_dim=vocabulary_size, output_dim=embedding_dims
            )
            # Convert the index values to embedding representations.
            encoded_feature = embedding(inputs[feature_name])
        else:
            # Use the numerical features as-is.
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features
Deep Neural Decision Tree
A neural decision tree model has two sets of weights to learn. The first set is `pi`, which represents the probability distribution of the classes in the tree leaves. The second set is the weights of the routing layer `decision_fn`, which represents the probability of going to each leaf. The forward pass of the model works as follows:
1. The model expects input `features` as a single vector encoding all the features of an instance in the batch. This vector can be generated from a Convolutional Neural Network (CNN) applied to images or dense transformations applied to structured data features.
2. The model first applies a `used_features_mask` to randomly select a subset of input features to use.
3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves by iteratively performing a stochastic routing throughout the tree levels.
4. Finally, the probabilities of reaching the leaves are combined with the class probabilities at the leaves to produce the final `outputs`.
class NeuralDecisionTree(keras.Model):
    """A single differentiable decision tree.

    Attributes set in `__init__`:
        depth: Number of routing levels.
        num_leaves: `2**depth`.
        num_classes: Number of output classes.
        used_features_mask: float32 tensor made of the identity-matrix rows
            for a random sample (without replacement) of feature indices.
        pi: Trainable weight of shape `[num_leaves, num_classes]`; softmax is
            applied to it in `call`.
        decision_fn: Dense layer with `num_leaves` sigmoid units.
    """

    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.depth = depth
        self.num_leaves = 2**depth
        self.num_classes = num_classes

        # Sample which input features this tree uses, and keep the matching
        # rows of the identity matrix as a selection mask.
        num_used_features = int(num_features * used_features_rate)
        chosen = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        identity = np.eye(num_features)
        self.used_features_mask = ops.convert_to_tensor(
            identity[chosen], dtype="float32"
        )

        # Weights behind the class probabilities in the leaves.
        self.pi = self.add_weight(
            initializer="random_normal",
            shape=[self.num_leaves, self.num_classes],
            dtype="float32",
            trainable=True,
        )

        # Routing layer: one sigmoid unit per column of the decision tensor.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        """Return `mu @ softmax(pi)`, a `[batch_size, num_classes]` tensor.

        Args:
            features: Input tensor; assumed to be `[batch_size, num_features]`
                so the matmul with the transposed mask is valid - TODO confirm.
        """
        batch_size = ops.shape(features)[0]

        # Keep only the sampled features: [batch_size, num_used_features].
        selected = ops.matmul(features, ops.transpose(self.used_features_mask))

        # Routing outputs and their complements, stacked on a new last axis:
        # [batch_size, num_leaves, 2].
        routed = ops.expand_dims(self.decision_fn(selected), axis=2)
        decisions = layers.concatenate([routed, 1 - routed], axis=2)

        # Start with all mass on a single node and split it level by level.
        mu = ops.ones([batch_size, 1, 1])
        for level in range(self.depth):
            # Level `level` uses decision columns [2**level, 2**(level + 1)).
            first = 2**level
            stop = 2 ** (level + 1)
            mu = ops.tile(ops.reshape(mu, [batch_size, -1, 1]), (1, 1, 2))
            mu = mu * decisions[:, first:stop, :]

        mu = ops.reshape(mu, [batch_size, self.num_leaves])
        leaf_class_probabilities = keras.activations.softmax(self.pi)
        return ops.matmul(mu, leaf_class_probabilities)
Deep Neural Decision Forest
The neural decision forest model consists of a set of neural decision trees that are trained simultaneously. The output of the forest model is the average of the outputs of its trees.
class NeuralDecisionForest(keras.Model):
    """An ensemble of `NeuralDecisionTree` models whose outputs are averaged.

    Args:
        num_trees: Number of trees in the ensemble.
        depth: Depth passed to every tree.
        num_features: Number of input features passed to every tree.
        used_features_rate: Feature sampling rate passed to every tree.
        num_classes: Number of output classes.
    """

    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        # Keep the class count on the instance so `call` does not depend on a
        # module-level variable that happens to be named `num_classes`.
        self.num_classes = num_classes
        self.ensemble = []
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        """Return the mean of the tree outputs, `[batch_size, num_classes]`."""
        batch_size = ops.shape(inputs)[0]
        # Accumulator for the sum of the per-tree outputs.
        outputs = ops.zeros([batch_size, self.num_classes])
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Turn the sum into an average over the trees.
        outputs /= len(self.ensemble)
        return outputs
Finally, let's set up the code that will train and evaluate the model.
# Training hyperparameters read by run_experiment().
learning_rate = 0.01
batch_size = 265
num_epochs = 10


def run_experiment(model):
    """Compile, train and evaluate `model`, printing progress and accuracy.

    The model is compiled with Adam, sparse categorical cross-entropy and a
    sparse categorical accuracy metric. It is fitted on the dataset read from
    `train_data_file`, then evaluated on the one read from `test_data_file`.

    Args:
        model: The Keras model to train; it is modified in place.
    """
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    model.fit(
        get_dataset_from_csv(train_data_file, shuffle=True, batch_size=batch_size),
        epochs=num_epochs,
    )
    print("Model training finished")

    print("Evaluating the model on the test data...")
    # evaluate() yields the loss followed by the compiled metric.
    _loss, test_accuracy = model.evaluate(
        get_dataset_from_csv(test_data_file, batch_size=batch_size)
    )
    print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")
Experiment 1: train a decision tree model
In this experiment, we train a single neural decision tree model where we use all input features.
# Settings for the single-tree experiment. `num_trees` is not read by
# create_tree_model() below.
num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    """Build a Keras model that feeds the encoded inputs to one decision tree.

    The encoded features are batch-normalized, and their second dimension is
    used as the tree's `num_features`.

    Returns:
        A `keras.Model` mapping the input dict to the tree's outputs.
    """
    inputs = create_model_inputs()
    normalized = layers.BatchNormalization()(encode_inputs(inputs))
    tree = NeuralDecisionTree(
        depth, normalized.shape[1], used_features_rate, num_classes
    )
    return keras.Model(inputs=inputs, outputs=tree(normalized))


tree_model = create_tree_model()
run_experiment(tree_model)
```
Start training the model...
Epoch 1/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 5s 26ms/step - loss: 0.5308 - sparse_categorical_accuracy: 0.8150
Epoch 2/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3476 - sparse_categorical_accuracy: 0.8429
Epoch 3/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3312 - sparse_categorical_accuracy: 0.8478
Epoch 4/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3247 - sparse_categorical_accuracy: 0.8495
Epoch 5/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3202 - sparse_categorical_accuracy: 0.8512
Epoch 6/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3158 - sparse_categorical_accuracy: 0.8536
Epoch 7/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3116 - sparse_categorical_accuracy: 0.8572
Epoch 8/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3071 - sparse_categorical_accuracy: 0.8608
Epoch 9/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step - loss: 0.3026 - sparse_categorical_accuracy: 0.8630
Epoch 10/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.2975 - sparse_categorical_accuracy: 0.8653
Model training finished
Evaluating the model on the test data...
62/62 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - loss: 0.3279 - sparse_categorical_accuracy: 0.8463
Test accuracy: 85.08%
</div>
---
## Experiment 2: train a forest model
In this experiment, we train a neural decision forest with `num_trees` trees
where each tree uses randomly selected 50% of the input features. You can control the number
of features to be used in each tree by setting the `used_features_rate` variable.
In addition, we set the depth to 5, down from the 10 used in the previous experiment.
```python
# Settings for the forest experiment.
num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    """Build a Keras model that feeds the encoded inputs to a decision forest.

    The encoded features are batch-normalized, and their second dimension is
    used as the forest's `num_features`.

    Returns:
        A `keras.Model` mapping the input dict to the forest's outputs.
    """
    inputs = create_model_inputs()
    normalized = layers.BatchNormalization()(encode_inputs(inputs))
    forest = NeuralDecisionForest(
        num_trees, depth, normalized.shape[1], used_features_rate, num_classes
    )
    return keras.Model(inputs=inputs, outputs=forest(normalized))


forest_model = create_forest_model()
run_experiment(forest_model)
```
Start training the model...
Epoch 1/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 47s 202ms/step - loss: 0.5469 - sparse_categorical_accuracy: 0.7915
Epoch 2/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3459 - sparse_categorical_accuracy: 0.8494
Epoch 3/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3268 - sparse_categorical_accuracy: 0.8523
Epoch 4/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3195 - sparse_categorical_accuracy: 0.8524
Epoch 5/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3149 - sparse_categorical_accuracy: 0.8539
Epoch 6/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3112 - sparse_categorical_accuracy: 0.8556
Epoch 7/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step - loss: 0.3079 - sparse_categorical_accuracy: 0.8566
Epoch 8/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.3050 - sparse_categorical_accuracy: 0.8582
Epoch 9/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.3021 - sparse_categorical_accuracy: 0.8595
Epoch 10/10
123/123 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - loss: 0.2992 - sparse_categorical_accuracy: 0.8617
Model training finished
Evaluating the model on the test data...
62/62 ━━━━━━━━━━━━━━━━━━━━ 5s 39ms/step - loss: 0.3145 - sparse_categorical_accuracy: 0.8503
Test accuracy: 85.55%