Path: blob/master/C4 - Convolutional Neural Networks/Week 3/Car detection for Autonomous Driving/yad2k/models/keras_yolo.py
"""YOLO_v2 Model Defined in Keras."""1import sys23import numpy as np4import tensorflow as tf5from tensorflow.keras import backend as K6from tensorflow.keras.layers import Lambda7from tensorflow.keras.layers import concatenate8from tensorflow.keras.models import Model910from ..utils import compose11from .keras_darknet19 import (DarknetConv2D, DarknetConv2D_BN_Leaky,12darknet_body)1314sys.path.append('..')1516voc_anchors = np.array(17[[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])1819voc_classes = [20"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",21"chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",22"pottedplant", "sheep", "sofa", "train", "tvmonitor"23]242526def space_to_depth_x2(x):27"""Thin wrapper for Tensorflow space_to_depth with block_size=2."""28# Import currently required to make Lambda work.29# See: https://github.com/fchollet/keras/issues/5088#issuecomment-27385127330import tensorflow as tf31return tf.nn.space_to_depth(x, block_size=2)323334def space_to_depth_x2_output_shape(input_shape):35"""Determine space_to_depth output shape for block_size=2.3637Note: For Lambda with TensorFlow backend, output shape may not be needed.38"""39return (input_shape[0], input_shape[1] // 2, input_shape[2] // 2, 4 *40input_shape[3]) if input_shape[1] else (input_shape[0], None, None,414 * input_shape[3])4243def yolo_body(inputs, num_anchors, num_classes):44"""Create YOLO_V2 model CNN body in Keras."""45darknet = Model(inputs, darknet_body()(inputs))46conv20 = compose(47DarknetConv2D_BN_Leaky(1024, (3, 3)),48DarknetConv2D_BN_Leaky(1024, (3, 3)))(darknet.output)4950conv13 = darknet.layers[43].output51conv21 = DarknetConv2D_BN_Leaky(64, (1, 1))(conv13)52# TODO: Allow Keras Lambda to use func arguments for output_shape?53conv21_reshaped = Lambda(54space_to_depth_x2,55output_shape=space_to_depth_x2_output_shape,56name='space_to_depth')(conv21)5758x = concatenate([conv21_reshaped, conv20])59x = DarknetConv2D_BN_Leaky(1024, (3, 3))(x)60x = DarknetConv2D(num_anchors * (num_classes + 5), (1, 1))(x)61return Model(inputs, x)626364def yolo_head(feats, anchors, num_classes):65"""Convert final layer features to bounding box parameters.6667Parameters68----------69feats : tensor70Final convolutional layer features.71anchors : array-like72Anchor box widths and heights.73num_classes : int74Number of target classes.7576Returns77-------78box_xy : tensor79x, y box predictions adjusted by spatial location in conv layer.80box_wh : tensor81w, h box predictions adjusted by anchors and conv spatial resolution.82box_conf : tensor83Probability estimate for whether each box contains any object.84box_class_pred : tensor85Probability distribution estimate for each box over class labels.86"""87num_anchors = len(anchors)88# Reshape to batch, height, width, num_anchors, box_params.89anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])9091# Static implementation for fixed models.92# TODO: Remove or add option for static implementation.93# _, conv_height, conv_width, _ = K.int_shape(feats)94# conv_dims = K.variable([conv_width, conv_height])9596# Dynamic implementation of conv dims for fully convolutional model.97conv_dims = K.shape(feats)[1:3] # assuming channels last98# In YOLO the height index is the inner most iteration.99conv_height_index = K.arange(0, stop=conv_dims[0])100conv_width_index = K.arange(0, stop=conv_dims[1])101conv_height_index = K.tile(conv_height_index, [conv_dims[1]])102103# TODO: Repeat_elements and tf.split doesn't 


def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tensor
        Probability estimate for whether each box contains any object.
    box_class_pred : tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Static implementation for fixed models.
    # TODO: Remove or add option for static implementation.
    # _, conv_height, conv_width, _ = K.int_shape(feats)
    # conv_dims = K.variable([conv_width, conv_height])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    # In YOLO the height index is the innermost iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    # TODO: Repeat_elements and tf.split don't support dynamic splits.
    # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0)
    conv_width_index = K.tile(
        K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    # Static generation of conv_index:
    # conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)])
    # conv_index = conv_index[:, [1, 0]]  # swap columns for YOLO ordering.
    # conv_index = K.variable(
    #     conv_index.reshape(1, conv_height, conv_width, 1, 2))
    # feats = Reshape(
    #     (conv_dims[0], conv_dims[1], num_anchors, num_classes + 5))(feats)

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs


def yolo_boxes_to_corners(box_xy, box_wh):
    """Convert YOLO box predictions to bounding box corners."""
    box_mins = box_xy - (box_wh / 2.)
    box_maxes = box_xy + (box_wh / 2.)

    return K.concatenate([
        box_mins[..., 1:2],  # y_min
        box_mins[..., 0:1],  # x_min
        box_maxes[..., 1:2],  # y_max
        box_maxes[..., 0:1]  # x_max
    ])
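

# Illustrative decoding sketch (added; not original code). yolo_head turns the
# raw conv features into box centers/sizes, confidences and class
# probabilities in image-relative units, and yolo_boxes_to_corners converts
# centers/sizes to (y_min, x_min, y_max, x_max) corners. model_body is assumed
# to come from the yolo_body sketch above.
#
#     box_xy, box_wh, box_confidence, box_class_probs = yolo_head(
#         model_body.output, voc_anchors, len(voc_classes))
#     corner_boxes = yolo_boxes_to_corners(box_xy, box_wh)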


def yolo_loss(args,
              anchors,
              num_classes,
              rescore_confidence=False,
              print_loss=False):
    """YOLO localization loss function.

    Parameters
    ----------
    yolo_output : tensor
        Final convolutional layer features.

    true_boxes : tensor
        Ground truth boxes tensor with shape [batch, num_true_boxes, 5]
        containing box x_center, y_center, width, height, and class.

    detectors_mask : array
        0/1 mask for detector positions where there is a matching ground truth.

    matching_true_boxes : array
        Corresponding ground truth boxes for positive detector positions.
        Already adjusted for conv height and width.

    anchors : tensor
        Anchor boxes for model.

    num_classes : int
        Number of object classes.

    rescore_confidence : bool, default=False
        If True, set the confidence target to the IOU of the best predicted
        box with the closest matching ground truth box.

    print_loss : bool, default=False
        If True, use tf.Print() to print the loss components.

    Returns
    -------
    mean_loss : float
        Mean localization loss across the minibatch.
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args
    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1
    pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_head(
        yolo_output, anchors, num_classes)

    # Unadjusted box predictions for loss.
    # TODO: Remove extra computation shared with yolo_head.
    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])
    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # TODO: Adjust predictions by image width/height for non-square images?
    # IOUs may be off due to different aspect ratio.

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    true_boxes_shape = K.shape(true_boxes)

    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # TODO: Darknet region training includes extra coordinate loss for early
    # training steps to encourage predictions to match anchor priors.

    # Determine confidence weights from object and no_object weights.
    # NOTE: YOLO does not use binary cross-entropy here.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    if rescore_confidence:
        objects_loss = (object_scale * detectors_mask *
                        K.square(best_ious - pred_confidence))
    else:
        objects_loss = (object_scale * detectors_mask *
                        K.square(1 - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLO does not use categorical cross-entropy loss here.
    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))

    # Coordinate loss for matching detection boxes.
    matching_boxes = matching_true_boxes[..., 0:4]
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)
    if print_loss:
        total_loss = tf.Print(
            total_loss, [
                total_loss, confidence_loss_sum, classification_loss_sum,
                coordinates_loss_sum
            ],
            message='yolo_loss, conf_loss, class_loss, box_coord_loss:')

    return total_loss
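

# Illustrative training sketch (added; an assumption rather than code from
# this repository). Because yolo_loss takes the model output plus the three
# ground-truth tensors, one common pattern is to wrap it in a Lambda layer and
# minimize its scalar output directly. Grid sizes below assume a 416x416 input
# (13x13 feature map) and the 5 VOC anchors; model_body is assumed to come
# from the yolo_body sketch above.
#
#     from tensorflow.keras.layers import Input
#     boxes_input = Input(shape=(None, 5))
#     detectors_mask_input = Input(shape=(13, 13, 5, 1))
#     matching_boxes_input = Input(shape=(13, 13, 5, 5))
#     loss_tensor = Lambda(
#         yolo_loss,
#         output_shape=(1, ),
#         name='yolo_loss',
#         arguments={'anchors': voc_anchors,
#                    'num_classes': len(voc_classes)})([
#                        model_body.output, boxes_input,
#                        detectors_mask_input, matching_boxes_input
#                    ])
#     train_model = Model(
#         [model_body.input, boxes_input, detectors_mask_input,
#          matching_boxes_input], loss_tensor)
#     train_model.compile(optimizer='adam', loss=lambda y_true, y_pred: y_pred)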


def yolo(inputs, anchors, num_classes):
    """Generate a complete YOLO_v2 localization model."""
    num_anchors = len(anchors)
    body = yolo_body(inputs, num_anchors, num_classes)
    outputs = yolo_head(body.output, anchors, num_classes)
    return outputs


def yolo_filter_boxes(boxes, box_confidence, box_class_probs, threshold=.6):
    """Filter YOLO boxes based on object and class confidence."""
    box_scores = box_confidence * box_class_probs
    box_classes = K.argmax(box_scores, axis=-1)
    box_class_scores = K.max(box_scores, axis=-1)
    prediction_mask = box_class_scores >= threshold

    # TODO: Expose tf.boolean_mask to Keras backend?
    boxes = tf.boolean_mask(boxes, prediction_mask)
    scores = tf.boolean_mask(box_class_scores, prediction_mask)
    classes = tf.boolean_mask(box_classes, prediction_mask)
    return boxes, scores, classes


def yolo_eval(yolo_outputs,
              image_shape,
              max_boxes=10,
              score_threshold=.6,
              iou_threshold=.5):
    """Evaluate YOLO model on given input batch and return filtered boxes."""
    box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs
    boxes = yolo_boxes_to_corners(box_xy, box_wh)
    boxes, scores, classes = yolo_filter_boxes(
        boxes, box_confidence, box_class_probs, threshold=score_threshold)

    # Scale boxes back to original image shape.
    height = image_shape[0]
    width = image_shape[1]
    image_dims = K.stack([height, width, height, width])
    image_dims = K.reshape(image_dims, [1, 4])
    boxes = boxes * image_dims

    # TODO: Something must be done about this ugly hack!
    max_boxes_tensor = K.variable(max_boxes, dtype='int32')
    K.get_session().run(tf.variables_initializer([max_boxes_tensor]))
    nms_index = tf.image.non_max_suppression(
        boxes, scores, max_boxes_tensor, iou_threshold=iou_threshold)
    boxes = K.gather(boxes, nms_index)
    scores = K.gather(scores, nms_index)
    classes = K.gather(classes, nms_index)
    return boxes, scores, classes
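

# Illustrative inference sketch (added; not original code). yolo_eval takes
# the four yolo_head output tensors plus the original image shape and returns
# NMS-filtered boxes, scores and classes. The K.get_session() call above
# implies a TF1-style graph/session workflow; the (720., 1280.) image shape
# and feed names below are example assumptions, and model_body is assumed to
# come from the yolo_body sketch above.
#
#     yolo_outputs = yolo_head(
#         model_body.output, voc_anchors, len(voc_classes))
#     boxes, scores, classes = yolo_eval(
#         yolo_outputs, image_shape=(720., 1280.),
#         score_threshold=.6, iou_threshold=.5)
#     # out_boxes, out_scores, out_classes = sess.run(
#     #     [boxes, scores, classes],
#     #     feed_dict={model_body.input: image_data})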


def preprocess_true_boxes(true_boxes, anchors, image_size):
    """Find detector in YOLO where ground truth box should appear.

    Parameters
    ----------
    true_boxes : array
        List of ground truth boxes in form of relative x, y, w, h, class.
        Relative coordinates are in the range [0, 1] indicating a percentage
        of the original image dimensions.
    anchors : array
        List of anchors in form of w, h.
        Anchors are assumed to be in the range [0, conv_size] where conv_size
        is the spatial dimension of the final convolutional features.
    image_size : array-like
        List of image dimensions in form of h, w in pixels.

    Returns
    -------
    detectors_mask : array
        0/1 mask for detectors in [conv_height, conv_width, num_anchors, 1]
        that should be compared with a matching ground truth box.
    matching_true_boxes : array
        Same shape as detectors_mask with the corresponding ground truth box
        adjusted for comparison with predicted parameters at training time.
    """
    height, width = image_size
    num_anchors = len(anchors)
    # Downsampling factor of 5x 2-stride max_pools == 32.
    # TODO: Remove hardcoding of downscaling calculations.
    assert height % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
    assert width % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
    conv_height = height // 32
    conv_width = width // 32
    num_box_params = true_boxes.shape[1]
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, num_box_params),
        dtype=np.float32)

    for box in true_boxes:
        # scale box to convolutional feature spatial dimensions
        box_class = box[4:5]
        box = box[0:4] * np.array(
            [conv_width, conv_height, conv_width, conv_height])
        i = np.floor(box[1]).astype('int')
        j = np.floor(box[0]).astype('int')
        best_iou = 0
        best_anchor = 0
        for k, anchor in enumerate(anchors):
            # Find IOU between box shifted to origin and anchor box.
            box_maxes = box[2:4] / 2.
            box_mins = -box_maxes
            anchor_maxes = (anchor / 2.)
            anchor_mins = -anchor_maxes

            intersect_mins = np.maximum(box_mins, anchor_mins)
            intersect_maxes = np.minimum(box_maxes, anchor_maxes)
            intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            box_area = box[2] * box[3]
            anchor_area = anchor[0] * anchor[1]
            iou = intersect_area / (box_area + anchor_area - intersect_area)
            if iou > best_iou:
                best_iou = iou
                best_anchor = k

        if best_iou > 0:
            detectors_mask[i, j, best_anchor] = 1
            adjusted_box = np.array(
                [
                    box[0] - j, box[1] - i,
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]), box_class
                ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = adjusted_box
    return detectors_mask, matching_true_boxes
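

# Illustrative preprocessing sketch (added; the box values are made-up example
# data). For a 416x416 image the feature map is 13x13, so a relative box
# centered at (0.5, 0.5) lands in grid cell i=6, j=6; detectors_mask marks the
# best-matching anchor at that cell and matching_true_boxes stores the
# adjusted regression targets for it.
#
#     example_boxes = np.array([[0.5, 0.5, 0.2, 0.3, 7]])  # x, y, w, h, class
#     detectors_mask, matching_true_boxes = preprocess_true_boxes(
#         example_boxes, voc_anchors, (416, 416))
#     assert detectors_mask[6, 6].sum() == 1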