GitHub Repository: keras-team/keras-io
Path: blob/master/examples/vision/depth_estimation.py
"""
2
Title: Monocular depth estimation
3
Author: [Victor Basu](https://www.linkedin.com/in/victor-basu-520958147)
4
Date created: 2021/08/30
5
Last modified: 2024/08/13
6
Description: Implement a depth estimation model with a convnet.
7
Accelerator: GPU
8
"""
9
"""
## Introduction

_Depth estimation_ is a crucial step towards inferring scene geometry from 2D images.
The goal in _monocular depth estimation_ is to predict the depth value of each pixel, or
to infer depth information, given only a single RGB image as input.
This example shows an approach to building a depth estimation model with a convnet
and simple loss functions.

![depth](https://paperswithcode.com/media/thumbnails/task/task-0000000605-d9849a91.jpg)
"""

"""
24
## Setup
25
"""
26
27
import os
28
29
os.environ["KERAS_BACKEND"] = "tensorflow"
30
31
import sys
32
33
import tensorflow as tf
34
import keras
35
from keras import layers
36
from keras import ops
37
import pandas as pd
38
import numpy as np
39
import cv2
40
import matplotlib.pyplot as plt
41
42
keras.utils.set_random_seed(123)
43
44
"""
45
## Downloading the dataset
46
47
We will be using the dataset **DIODE: A Dense Indoor and Outdoor Depth Dataset** for this
48
tutorial. However, we use the validation set generating training and evaluation subsets
49
for our model. The reason we use the validation set rather than the training set of the original dataset is because
50
the training set consists of 81GB of data, which is challenging to download compared
51
to the validation set which is only 2.6GB.
52
Other datasets that you could use are
53
**[NYU-v2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html)**
54
and **[KITTI](http://www.cvlibs.net/datasets/kitti/)**.
55
"""

annotation_folder = "/dataset/"
if not os.path.exists(os.path.abspath(".") + annotation_folder):
    annotation_zip = keras.utils.get_file(
        "val.tar.gz",
        cache_subdir=os.path.abspath("."),
        origin="http://diode-dataset.s3.amazonaws.com/val.tar.gz",
        extract=True,
    )

"""
67
## Preparing the dataset
68
69
We only use the indoor images to train our depth estimation model.
70
"""
71
72
path = "val/indoors"
73
74
filelist = []
75
76
for root, dirs, files in os.walk(path):
77
for file in files:
78
filelist.append(os.path.join(root, file))
79
80
filelist.sort()
81
data = {
82
"image": [x for x in filelist if x.endswith(".png")],
83
"depth": [x for x in filelist if x.endswith("_depth.npy")],
84
"mask": [x for x in filelist if x.endswith("_depth_mask.npy")],
85
}
86
df = pd.DataFrame(data)
87
88
df = df.sample(frac=1, random_state=42)
89
90
"""
91
## Preparing hyperparameters
92
"""
93
94
HEIGHT = 256
95
WIDTH = 256
96
LR = 0.00001
97
EPOCHS = 30
98
BATCH_SIZE = 32
99
100
"""
101
## Building a data pipeline
102
103
1. The pipeline takes a dataframe containing the path for the RGB images,
104
as well as the depth and depth mask files.
105
2. It reads and resize the RGB images.
106
3. It reads the depth and depth mask files, process them to generate the depth map image and
107
resize it.
108
4. It returns the RGB images and the depth map images for a batch.
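
For intuition, here is a minimal sketch of consuming one batch from the `DataGenerator`
defined below; the shapes assume the `HEIGHT`/`WIDTH` values set above and the default
`n_channels=3` (illustrative only, not executed as part of this example):

```python
loader = DataGenerator(data=df, batch_size=6, dim=(HEIGHT, WIDTH))
images, depth_maps = loader[0]  # one batch of (RGB inputs, log-scaled depth targets)
print(images.shape)      # (6, 256, 256, 3)
print(depth_maps.shape)  # (6, 256, 256, 1)
```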
"""


class DataGenerator(keras.utils.PyDataset):
    def __init__(self, data, batch_size=6, dim=(768, 1024), n_channels=3, shuffle=True):
        """
        Initialization
        """
        super().__init__()
        self.data = data
        self.indices = self.data.index.tolist()
        self.dim = dim
        self.n_channels = n_channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.min_depth = 0.1
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        # Generate one batch of data: take the positions for this batch from the
        # (possibly shuffled) epoch order; the last batch may be smaller.
        start = index * self.batch_size
        end = min((index + 1) * self.batch_size, len(self.indices))
        # Find list of IDs
        batch = [self.indices[k] for k in self.index[start:end]]
        x, y = self.data_generation(batch)

        return x, y

    def on_epoch_end(self):
        """
        Updates indexes after each epoch
        """
        self.index = np.arange(len(self.indices))
        if self.shuffle:
            np.random.shuffle(self.index)

    def load(self, image_path, depth_map, mask):
        """Load input and target image."""

        image_ = cv2.imread(image_path)
        image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2RGB)
        image_ = cv2.resize(image_, self.dim)
        image_ = tf.image.convert_image_dtype(image_, tf.float32)

        depth_map = np.load(depth_map).squeeze()

        mask = np.load(mask)
        mask = mask > 0

        max_depth = min(300, np.percentile(depth_map, 99))
        depth_map = np.clip(depth_map, self.min_depth, max_depth)
        depth_map = np.log(depth_map, where=mask)

        depth_map = np.ma.masked_where(~mask, depth_map)

        depth_map = np.clip(depth_map, 0.1, np.log(max_depth))
        depth_map = cv2.resize(depth_map, self.dim)
        depth_map = np.expand_dims(depth_map, axis=2)
        depth_map = tf.image.convert_image_dtype(depth_map, tf.float32)

        return image_, depth_map

    def data_generation(self, batch):
        # Allocate arrays sized to the actual batch (handles a smaller final batch).
        x = np.empty((len(batch), *self.dim, self.n_channels))
        y = np.empty((len(batch), *self.dim, 1))

        for i, batch_id in enumerate(batch):
            x[i,], y[i,] = self.load(
                self.data["image"][batch_id],
                self.data["depth"][batch_id],
                self.data["mask"][batch_id],
            )
        x, y = x.astype("float32"), y.astype("float32")
        return x, y


"""
191
## Visualizing samples
192
"""
193
194
195
def visualize_depth_map(samples, test=False, model=None):
196
input, target = samples
197
cmap = plt.cm.jet
198
cmap.set_bad(color="black")
199
200
if test:
201
pred = model.predict(input)
202
fig, ax = plt.subplots(6, 3, figsize=(50, 50))
203
for i in range(6):
204
ax[i, 0].imshow((input[i].squeeze()))
205
ax[i, 1].imshow((target[i].squeeze()), cmap=cmap)
206
ax[i, 2].imshow((pred[i].squeeze()), cmap=cmap)
207
208
else:
209
fig, ax = plt.subplots(6, 2, figsize=(50, 50))
210
for i in range(6):
211
ax[i, 0].imshow((input[i].squeeze()))
212
ax[i, 1].imshow((target[i].squeeze()), cmap=cmap)
213
214
215
visualize_samples = next(
216
iter(DataGenerator(data=df, batch_size=6, dim=(HEIGHT, WIDTH)))
217
)
218
visualize_depth_map(visualize_samples)
219
220
"""
221
## 3D point cloud visualization
222
"""
223
224
depth_vis = np.flipud(visualize_samples[1][1].squeeze()) # target
225
img_vis = np.flipud(visualize_samples[0][1].squeeze()) # input
226
227
fig = plt.figure(figsize=(15, 10))
228
ax = plt.axes(projection="3d")
229
230
STEP = 3
231
for x in range(0, img_vis.shape[0], STEP):
232
for y in range(0, img_vis.shape[1], STEP):
233
ax.scatter(
234
[depth_vis[x, y]] * 3,
235
[y] * 3,
236
[x] * 3,
237
c=tuple(img_vis[x, y, :3] / 255),
238
s=3,
239
)
240
ax.view_init(45, 135)
241
242
"""
243
## Building the model
244
245
1. The basic model is from U-Net.
246
2. Addditive skip-connections are implemented in the downscaling block.
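
As a rough orientation, and assuming the 256 x 256 inputs configured above together with the
filter counts set in `DepthEstimationModel` below, the tensor shapes flow approximately like this
(illustrative only):

```
(256, 256, 3) -- 4 x DownscaleBlock (16, 32, 64, 128 filters, 2x2 max-pooling) --> (16, 16, 128)
              -- BottleNeckBlock (256 filters)                                 --> (16, 16, 256)
              -- 4 x UpscaleBlock (upsampling + concatenated skips)            --> (256, 256, 16)
              -- 1x1 Conv2D with tanh activation                               --> (256, 256, 1)
```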
"""


class DownscaleBlock(layers.Layer):
    def __init__(
        self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs
    ):
        super().__init__(**kwargs)
        self.convA = layers.Conv2D(filters, kernel_size, strides, padding)
        self.convB = layers.Conv2D(filters, kernel_size, strides, padding)
        self.reluA = layers.LeakyReLU(negative_slope=0.2)
        self.reluB = layers.LeakyReLU(negative_slope=0.2)
        self.bn2a = layers.BatchNormalization()
        self.bn2b = layers.BatchNormalization()

        self.pool = layers.MaxPool2D((2, 2), (2, 2))

    def call(self, input_tensor):
        d = self.convA(input_tensor)
        x = self.bn2a(d)
        x = self.reluA(x)

        x = self.convB(x)
        x = self.bn2b(x)
        x = self.reluB(x)

        x += d
        p = self.pool(x)
        return x, p


class UpscaleBlock(layers.Layer):
    def __init__(
        self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs
    ):
        super().__init__(**kwargs)
        self.us = layers.UpSampling2D((2, 2))
        self.convA = layers.Conv2D(filters, kernel_size, strides, padding)
        self.convB = layers.Conv2D(filters, kernel_size, strides, padding)
        self.reluA = layers.LeakyReLU(negative_slope=0.2)
        self.reluB = layers.LeakyReLU(negative_slope=0.2)
        self.bn2a = layers.BatchNormalization()
        self.bn2b = layers.BatchNormalization()
        self.conc = layers.Concatenate()

    def call(self, x, skip):
        x = self.us(x)
        concat = self.conc([x, skip])
        x = self.convA(concat)
        x = self.bn2a(x)
        x = self.reluA(x)

        x = self.convB(x)
        x = self.bn2b(x)
        x = self.reluB(x)

        return x


class BottleNeckBlock(layers.Layer):
    def __init__(
        self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs
    ):
        super().__init__(**kwargs)
        self.convA = layers.Conv2D(filters, kernel_size, strides, padding)
        self.convB = layers.Conv2D(filters, kernel_size, strides, padding)
        self.reluA = layers.LeakyReLU(negative_slope=0.2)
        self.reluB = layers.LeakyReLU(negative_slope=0.2)

    def call(self, x):
        x = self.convA(x)
        x = self.reluA(x)
        x = self.convB(x)
        x = self.reluB(x)
        return x


"""
325
## Defining the loss
326
327
We will optimize 3 losses in our mode.
328
1. Structural similarity index(SSIM).
329
2. L1-loss, or Point-wise depth in our case.
330
3. Depth smoothness loss.
331
332
Out of the three loss functions, SSIM contributes the most to improving model performance.
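
As a condensed sketch of what `DepthEstimationModel.calculate_loss` below computes, with the
0.85 / 0.1 / 0.9 weights set in its constructor (illustrative pseudocode, not executed here):

```python
w_x = exp(mean(abs(dx_true)))  # scalar weights from the ground-truth gradients
w_y = exp(mean(abs(dy_true)))
smoothness = mean(abs(dx_pred) * w_x) + mean(abs(dy_pred) * w_y)
ssim = mean(1 - SSIM(target, pred))
l1 = mean(abs(target - pred))
loss = 0.85 * ssim + 0.1 * l1 + 0.9 * smoothness
```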
"""


def image_gradients(image):
    if len(ops.shape(image)) != 4:
        raise ValueError(
            "image_gradients expects a 4D tensor "
            "[batch_size, h, w, d], not {}.".format(ops.shape(image))
        )

    image_shape = ops.shape(image)
    batch_size, height, width, depth = ops.unstack(image_shape)

    dy = image[:, 1:, :, :] - image[:, :-1, :, :]
    dx = image[:, :, 1:, :] - image[:, :, :-1, :]

    # Return tensors with same size as original image by concatenating
    # zeros. Place the gradient [I(x+1,y) - I(x,y)] on the base pixel (x, y).
    shape = ops.stack([batch_size, 1, width, depth])
    dy = ops.concatenate([dy, ops.zeros(shape, dtype=image.dtype)], axis=1)
    dy = ops.reshape(dy, image_shape)

    shape = ops.stack([batch_size, height, 1, depth])
    dx = ops.concatenate([dx, ops.zeros(shape, dtype=image.dtype)], axis=2)
    dx = ops.reshape(dx, image_shape)

    return dy, dx


class DepthEstimationModel(keras.Model):
    def __init__(self):
        super().__init__()
        self.ssim_loss_weight = 0.85
        self.l1_loss_weight = 0.1
        self.edge_loss_weight = 0.9
        self.loss_metric = keras.metrics.Mean(name="loss")
        f = [16, 32, 64, 128, 256]
        self.downscale_blocks = [
            DownscaleBlock(f[0]),
            DownscaleBlock(f[1]),
            DownscaleBlock(f[2]),
            DownscaleBlock(f[3]),
        ]
        self.bottle_neck_block = BottleNeckBlock(f[4])
        self.upscale_blocks = [
            UpscaleBlock(f[3]),
            UpscaleBlock(f[2]),
            UpscaleBlock(f[1]),
            UpscaleBlock(f[0]),
        ]
        self.conv_layer = layers.Conv2D(1, (1, 1), padding="same", activation="tanh")

    def calculate_loss(self, target, pred):
        # Edges
        dy_true, dx_true = image_gradients(target)
        dy_pred, dx_pred = image_gradients(pred)
        weights_x = ops.cast(ops.exp(ops.mean(ops.abs(dx_true))), "float32")
        weights_y = ops.cast(ops.exp(ops.mean(ops.abs(dy_true))), "float32")

        # Depth smoothness
        smoothness_x = dx_pred * weights_x
        smoothness_y = dy_pred * weights_y

        depth_smoothness_loss = ops.mean(ops.abs(smoothness_x)) + ops.mean(
            ops.abs(smoothness_y)
        )

        # Structural similarity (SSIM) index
        ssim_loss = ops.mean(
            1
            - tf.image.ssim(
                target, pred, max_val=WIDTH, filter_size=7, k1=0.01**2, k2=0.03**2
            )
        )
        # Point-wise depth
        l1_loss = ops.mean(ops.abs(target - pred))

        loss = (
            (self.ssim_loss_weight * ssim_loss)
            + (self.l1_loss_weight * l1_loss)
            + (self.edge_loss_weight * depth_smoothness_loss)
        )

        return loss

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch_data):
        input, target = batch_data
        with tf.GradientTape() as tape:
            pred = self(input, training=True)
            loss = self.calculate_loss(target, pred)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.loss_metric.update_state(loss)
        return {
            "loss": self.loss_metric.result(),
        }

    def test_step(self, batch_data):
        input, target = batch_data

        pred = self(input, training=False)
        loss = self.calculate_loss(target, pred)

        self.loss_metric.update_state(loss)
        return {
            "loss": self.loss_metric.result(),
        }

    def call(self, x):
        c1, p1 = self.downscale_blocks[0](x)
        c2, p2 = self.downscale_blocks[1](p1)
        c3, p3 = self.downscale_blocks[2](p2)
        c4, p4 = self.downscale_blocks[3](p3)

        bn = self.bottle_neck_block(p4)

        u1 = self.upscale_blocks[0](bn, c4)
        u2 = self.upscale_blocks[1](u1, c3)
        u3 = self.upscale_blocks[2](u2, c2)
        u4 = self.upscale_blocks[3](u3, c1)

        return self.conv_layer(u4)


"""
463
## Model training
464
"""
465
466
optimizer = keras.optimizers.SGD(
467
learning_rate=LR,
468
nesterov=False,
469
)
470
model = DepthEstimationModel()
471
# Compile the model
472
model.compile(optimizer)
473
474
train_loader = DataGenerator(
475
data=df[:260].reset_index(drop="true"), batch_size=BATCH_SIZE, dim=(HEIGHT, WIDTH)
476
)
477
validation_loader = DataGenerator(
478
data=df[260:].reset_index(drop="true"), batch_size=BATCH_SIZE, dim=(HEIGHT, WIDTH)
479
)
480
model.fit(
481
train_loader,
482
epochs=EPOCHS,
483
validation_data=validation_loader,
484
)
485
486
"""
487
## Visualizing model output
488
489
We visualize the model output over the validation set.
490
The first image is the RGB image, the second image is the ground truth depth map image
491
and the third one is the predicted depth map image.
492
"""
493
494
test_loader = next(
495
iter(
496
DataGenerator(
497
data=df[265:].reset_index(drop="true"), batch_size=6, dim=(HEIGHT, WIDTH)
498
)
499
)
500
)
501
visualize_depth_map(test_loader, test=True, model=model)
502
503
test_loader = next(
504
iter(
505
DataGenerator(
506
data=df[300:].reset_index(drop="true"), batch_size=6, dim=(HEIGHT, WIDTH)
507
)
508
)
509
)
510
visualize_depth_map(test_loader, test=True, model=model)
511
512
"""
513
## Possible improvements
514
515
1. You can improve this model by replacing the encoding part of the U-Net with a
516
pretrained DenseNet or ResNet.
517
2. Loss functions play an important role in solving this problem.
518
Tuning the loss functions may yield significant improvement.
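
A minimal sketch of what option 1 could look like, assuming a `keras.applications.DenseNet121`
backbone and a deliberately simple upsampling decoder without skip connections; this is not the
method used in this example, just one possible starting point:

```python
# Illustrative only: swap the hand-written encoder for a pretrained backbone.
backbone = keras.applications.DenseNet121(
    include_top=False, weights="imagenet", input_shape=(HEIGHT, WIDTH, 3)
)
backbone.trainable = False  # optionally freeze the pretrained weights at first

# With 256 x 256 inputs the backbone output is 8 x 8, so five 2x upsamplings
# restore the full resolution.
x = backbone.output
for filters in [256, 128, 64, 32, 16]:
    x = layers.UpSampling2D((2, 2))(x)
    x = layers.Conv2D(filters, (3, 3), padding="same", activation="relu")(x)
depth_output = layers.Conv2D(1, (1, 1), padding="same", activation="tanh")(x)
pretrained_unet = keras.Model(backbone.input, depth_output)
```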
"""

"""
## References

The following papers go deeper into possible approaches for depth estimation.
1. [Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos](https://arxiv.org/abs/1811.06152v1)
2. [Digging Into Self-Supervised Monocular Depth Estimation](https://openaccess.thecvf.com/content_ICCV_2019/papers/Godard_Digging_Into_Self-Supervised_Monocular_Depth_Estimation_ICCV_2019_paper.pdf)
3. [Deeper Depth Prediction with Fully Convolutional Residual Networks](https://arxiv.org/abs/1606.00373v2)

You can also find helpful implementations on Papers With Code under the depth estimation task.

You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/monocular-depth-estimation)
and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Monocular-Depth-Estimation).
"""