CoCalc -- bayesian_neural

GitHub Repository: keras-team/keras-io
Path: blob/master/examples/keras_recipes/bayesian_neural_networks.py
³⁵⁰⁷ views
1
"""
2
Title: Probabilistic Bayesian Neural Networks
3
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
4
Date created: 2021/01/15
5
Last modified: 2021/01/15
6
Description: Building probabilistic Bayesian neural network models with TensorFlow Probability.
7
Accelerator: GPU
8
"""
9

10
"""
11
## Introduction
12

13
Taking a probabilistic approach to deep learning allows to account for *uncertainty*,
14
so that models can assign less levels of confidence to incorrect predictions.
15
Sources of uncertainty can be found in the data, due to measurement error or
16
noise in the labels, or the model, due to insufficient data availability for
17
the model to learn effectively.
18

19

20
This example demonstrates how to build basic probabilistic Bayesian neural networks
21
to account for these two types of uncertainty.
22
We use [TensorFlow Probability](https://www.tensorflow.org/probability) library,
23
which is compatible with Keras API.
24

25
This example requires TensorFlow 2.3 or higher.
26
You can install Tensorflow Probability using the following command:
27

28
```python
29
pip install tensorflow-probability
30
```
31
"""
32

33
"""
34
## The dataset
35

36
We use the [Wine Quality](https://archive.ics.uci.edu/ml/datasets/wine+quality)
37
dataset, which is available in the [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/wine_quality).
38
We use the red wine subset, which contains 4,898 examples.
39
The dataset has 11numerical physicochemical features of the wine, and the task
40
is to predict the wine quality, which is a score between 0 and 10.
41
In this example, we treat this as a regression task.
42

43
You can install TensorFlow Datasets using the following command:
44

45
```python
46
pip install tensorflow-datasets
47
```
48
"""
49

50
"""
51
## Setup
52
"""
53

54
import numpy as np
55
import tensorflow as tf
56
from tensorflow import keras
57
from tensorflow.keras import layers
58
import tensorflow_datasets as tfds
59
import tensorflow_probability as tfp
60

61
"""
62
## Create training and evaluation datasets
63

64
Here, we load the `wine_quality` dataset using `tfds.load()`, and we convert
65
the target feature to float. Then, we shuffle the dataset and split it into
66
training and test sets. We take the first `train_size` examples as the train
67
split, and the rest as the test split.
68
"""
69

70

71
def get_train_and_test_splits(train_size, batch_size=1):
72
    # We prefetch with a buffer the same size as the dataset because th dataset
73
    # is very small and fits into memory.
74
    dataset = (
75
        tfds.load(name="wine_quality", as_supervised=True, split="train")
76
        .map(lambda x, y: (x, tf.cast(y, tf.float32)))
77
        .prefetch(buffer_size=dataset_size)
78
        .cache()
79
    )
80
    # We shuffle with a buffer the same size as the dataset.
81
    train_dataset = (
82
        dataset.take(train_size).shuffle(buffer_size=train_size).batch(batch_size)
83
    )
84
    test_dataset = dataset.skip(train_size).batch(batch_size)
85

86
    return train_dataset, test_dataset
87

88

89
"""
90
## Compile, train, and evaluate the model
91
"""
92

93
hidden_units = [8, 8]
94
learning_rate = 0.001
95

96

97
def run_experiment(model, loss, train_dataset, test_dataset):
98
    model.compile(
99
        optimizer=keras.optimizers.RMSprop(learning_rate=learning_rate),
100
        loss=loss,
101
        metrics=[keras.metrics.RootMeanSquaredError()],
102
    )
103

104
    print("Start training the model...")
105
    model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)
106
    print("Model training finished.")
107
    _, rmse = model.evaluate(train_dataset, verbose=0)
108
    print(f"Train RMSE: {round(rmse, 3)}")
109

110
    print("Evaluating model performance...")
111
    _, rmse = model.evaluate(test_dataset, verbose=0)
112
    print(f"Test RMSE: {round(rmse, 3)}")
113

114

115
"""
116
## Create model inputs
117
"""
118

119
FEATURE_NAMES = [
120
    "fixed acidity",
121
    "volatile acidity",
122
    "citric acid",
123
    "residual sugar",
124
    "chlorides",
125
    "free sulfur dioxide",
126
    "total sulfur dioxide",
127
    "density",
128
    "pH",
129
    "sulphates",
130
    "alcohol",
131
]
132

133

134
def create_model_inputs():
135
    inputs = {}
136
    for feature_name in FEATURE_NAMES:
137
        inputs[feature_name] = layers.Input(
138
            name=feature_name, shape=(1,), dtype=tf.float32
139
        )
140
    return inputs
141

142

143
"""
144
## Experiment 1: standard neural network
145

146
We create a standard deterministic neural network model as a baseline.
147
"""
148

149

150
def create_baseline_model():
151
    inputs = create_model_inputs()
152
    input_values = [value for _, value in sorted(inputs.items())]
153
    features = keras.layers.concatenate(input_values)
154
    features = layers.BatchNormalization()(features)
155

156
    # Create hidden layers with deterministic weights using the Dense layer.
157
    for units in hidden_units:
158
        features = layers.Dense(units, activation="sigmoid")(features)
159
    # The output is deterministic: a single point estimate.
160
    outputs = layers.Dense(units=1)(features)
161

162
    model = keras.Model(inputs=inputs, outputs=outputs)
163
    return model
164

165

166
"""
167
Let's split the wine dataset into training and test sets, with 85% and 15% of
168
the examples, respectively.
169
"""
170

171
dataset_size = 4898
172
batch_size = 256
173
train_size = int(dataset_size * 0.85)
174
train_dataset, test_dataset = get_train_and_test_splits(train_size, batch_size)
175

176
"""
177
Now let's train the baseline model. We use the `MeanSquaredError`
178
as the loss function.
179
"""
180

181
num_epochs = 100
182
mse_loss = keras.losses.MeanSquaredError()
183
baseline_model = create_baseline_model()
184
run_experiment(baseline_model, mse_loss, train_dataset, test_dataset)
185

186
"""
187
We take a sample from the test set use the model to obtain predictions for them.
188
Note that since the baseline model is deterministic, we get a single a
189
*point estimate* prediction for each test example, with no information about the
190
uncertainty of the model nor the prediction.
191
"""
192

193
sample = 10
194
examples, targets = list(test_dataset.unbatch().shuffle(batch_size * 10).batch(sample))[
195
    0
196
]
197

198
predicted = baseline_model(examples).numpy()
199
for idx in range(sample):
200
    print(f"Predicted: {round(float(predicted[idx][0]), 1)} - Actual: {targets[idx]}")
201

202
"""
203
## Experiment 2: Bayesian neural network (BNN)
204

205
The object of the Bayesian approach for modeling neural networks is to capture
206
the *epistemic uncertainty*, which is uncertainty about the model fitness,
207
due to limited training data.
208

209
The idea is that, instead of learning specific weight (and bias) *values* in the
210
neural network, the Bayesian approach learns weight *distributions*
211
- from which we can sample to produce an output for a given input -
212
to encode weight uncertainty.
213

214
Thus, we need to define prior and the posterior distributions of these weights,
215
and the training process is to learn the parameters of these distributions.
216
"""
217

218

219
# Define the prior weight distribution as Normal of mean=0 and stddev=1.
220
# Note that, in this example, the we prior distribution is not trainable,
221
# as we fix its parameters.
222
def prior(kernel_size, bias_size, dtype=None):
223
    n = kernel_size + bias_size
224
    prior_model = keras.Sequential(
225
        [
226
            tfp.layers.DistributionLambda(
227
                lambda t: tfp.distributions.MultivariateNormalDiag(
228
                    loc=tf.zeros(n), scale_diag=tf.ones(n)
229
                )
230
            )
231
        ]
232
    )
233
    return prior_model
234

235

236
# Define variational posterior weight distribution as multivariate Gaussian.
237
# Note that the learnable parameters for this distribution are the means,
238
# variances, and covariances.
239
def posterior(kernel_size, bias_size, dtype=None):
240
    n = kernel_size + bias_size
241
    posterior_model = keras.Sequential(
242
        [
243
            tfp.layers.VariableLayer(
244
                tfp.layers.MultivariateNormalTriL.params_size(n), dtype=dtype
245
            ),
246
            tfp.layers.MultivariateNormalTriL(n),
247
        ]
248
    )
249
    return posterior_model
250

251

252
"""
253
We use the `tfp.layers.DenseVariational` layer instead of the standard
254
`keras.layers.Dense` layer in the neural network model.
255
"""
256

257

258
def create_bnn_model(train_size):
259
    inputs = create_model_inputs()
260
    features = keras.layers.concatenate(list(inputs.values()))
261
    features = layers.BatchNormalization()(features)
262

263
    # Create hidden layers with weight uncertainty using the DenseVariational layer.
264
    for units in hidden_units:
265
        features = tfp.layers.DenseVariational(
266
            units=units,
267
            make_prior_fn=prior,
268
            make_posterior_fn=posterior,
269
            kl_weight=1 / train_size,
270
            activation="sigmoid",
271
        )(features)
272

273
    # The output is deterministic: a single point estimate.
274
    outputs = layers.Dense(units=1)(features)
275
    model = keras.Model(inputs=inputs, outputs=outputs)
276
    return model
277

278

279
"""
280
The epistemic uncertainty can be reduced as we increase the size of the
281
training data. That is, the more data the BNN model sees, the more it is certain
282
about its estimates for the weights (distribution parameters).
283
Let's test this behaviour by training the BNN model on a small subset of
284
the training set, and then on the full training set, to compare the output variances.
285
"""
286

287
"""
288
### Train BNN  with a small training subset.
289
"""
290

291
num_epochs = 500
292
train_sample_size = int(train_size * 0.3)
293
small_train_dataset = train_dataset.unbatch().take(train_sample_size).batch(batch_size)
294

295
bnn_model_small = create_bnn_model(train_sample_size)
296
run_experiment(bnn_model_small, mse_loss, small_train_dataset, test_dataset)
297

298
"""
299
Since we have trained a BNN model, the model produces a different output each time
300
we call it with the same input, since each time a new set of weights are sampled
301
from the distributions to construct the network and produce an output.
302
The less certain the mode weights are, the more variability (wider range) we will
303
see in the outputs of the same inputs.
304
"""
305

306

307
def compute_predictions(model, iterations=100):
308
    predicted = []
309
    for _ in range(iterations):
310
        predicted.append(model(examples).numpy())
311
    predicted = np.concatenate(predicted, axis=1)
312

313
    prediction_mean = np.mean(predicted, axis=1).tolist()
314
    prediction_min = np.min(predicted, axis=1).tolist()
315
    prediction_max = np.max(predicted, axis=1).tolist()
316
    prediction_range = (np.max(predicted, axis=1) - np.min(predicted, axis=1)).tolist()
317

318
    for idx in range(sample):
319
        print(
320
            f"Predictions mean: {round(prediction_mean[idx], 2)}, "
321
            f"min: {round(prediction_min[idx], 2)}, "
322
            f"max: {round(prediction_max[idx], 2)}, "
323
            f"range: {round(prediction_range[idx], 2)} - "
324
            f"Actual: {targets[idx]}"
325
        )
326

327

328
compute_predictions(bnn_model_small)
329

330
"""
331
### Train BNN  with the whole training set.
332
"""
333

334
num_epochs = 500
335
bnn_model_full = create_bnn_model(train_size)
336
run_experiment(bnn_model_full, mse_loss, train_dataset, test_dataset)
337

338
compute_predictions(bnn_model_full)
339

340
"""
341
Notice that the model trained with the full training dataset shows smaller range
342
(uncertainty) in the prediction values for the same inputs, compared to the model
343
trained with a subset of the training dataset.
344
"""
345

346
"""
347
## Experiment 3: probabilistic Bayesian neural network
348

349
So far, the output of the standard and the Bayesian NN models that we built is
350
deterministic, that is, produces a point estimate as a prediction for a given example.
351
We can create a probabilistic NN by letting the model output a distribution.
352
In this case, the model captures the *aleatoric uncertainty* as well,
353
which is due to irreducible noise in the data, or to the stochastic nature of the
354
process generating the data.
355

356
In this example, we model the output as a `IndependentNormal` distribution,
357
with learnable mean and variance parameters. If the task was classification,
358
we would have used `IndependentBernoulli` with binary classes, and `OneHotCategorical`
359
with multiple classes, to model distribution of the model output.
360
"""
361

362

363
def create_probablistic_bnn_model(train_size):
364
    inputs = create_model_inputs()
365
    features = keras.layers.concatenate(list(inputs.values()))
366
    features = layers.BatchNormalization()(features)
367

368
    # Create hidden layers with weight uncertainty using the DenseVariational layer.
369
    for units in hidden_units:
370
        features = tfp.layers.DenseVariational(
371
            units=units,
372
            make_prior_fn=prior,
373
            make_posterior_fn=posterior,
374
            kl_weight=1 / train_size,
375
            activation="sigmoid",
376
        )(features)
377

378
    # Create a probabilisticå output (Normal distribution), and use the `Dense` layer
379
    # to produce the parameters of the distribution.
380
    # We set units=2 to learn both the mean and the variance of the Normal distribution.
381
    distribution_params = layers.Dense(units=2)(features)
382
    outputs = tfp.layers.IndependentNormal(1)(distribution_params)
383

384
    model = keras.Model(inputs=inputs, outputs=outputs)
385
    return model
386

387

388
"""
389
Since the output of the model is a distribution, rather than a point estimate,
390
we use the [negative loglikelihood](https://en.wikipedia.org/wiki/Likelihood_function)
391
as our loss function to compute how likely to see the true data (targets) from the
392
estimated distribution produced by the model.
393
"""
394

395

396
def negative_loglikelihood(targets, estimated_distribution):
397
    return -estimated_distribution.log_prob(targets)
398

399

400
num_epochs = 1000
401
prob_bnn_model = create_probablistic_bnn_model(train_size)
402
run_experiment(prob_bnn_model, negative_loglikelihood, train_dataset, test_dataset)
403

404
"""
405
Now let's produce an output from the model given the test examples.
406
The output is now a distribution, and we can use its mean and variance
407
to compute the confidence intervals (CI) of the prediction.
408
"""
409

410
prediction_distribution = prob_bnn_model(examples)
411
prediction_mean = prediction_distribution.mean().numpy().tolist()
412
prediction_stdv = prediction_distribution.stddev().numpy()
413

414
# The 95% CI is computed as mean ± (1.96 * stdv)
415
upper = (prediction_mean + (1.96 * prediction_stdv)).tolist()
416
lower = (prediction_mean - (1.96 * prediction_stdv)).tolist()
417
prediction_stdv = prediction_stdv.tolist()
418

419
for idx in range(sample):
420
    print(
421
        f"Prediction mean: {round(prediction_mean[idx][0], 2)}, "
422
        f"stddev: {round(prediction_stdv[idx][0], 2)}, "
423
        f"95% CI: [{round(upper[idx][0], 2)} - {round(lower[idx][0], 2)}]"
424
        f" - Actual: {targets[idx]}"
425
    )
426

427
Product

Resources

Company