📚 The CoCalc Library - books, templates and other resources
License: OTHER
def examples_queue(data_sources, data_fields_to_features, training,
                   data_items_to_decoders=None, data_items_to_decode=None):
  """Construct a queue of training or evaluation examples.

  This function will create a reader from files given by data_sources,
  then enqueue the tf.Examples from these files, shuffling if training
  is true, and finally parse these tf.Examples to tensors.

  The dictionary data_fields_to_features for an image dataset can be this:

  data_fields_to_features = {
    'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
    'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'),
    'image/class/label': tf.FixedLenFeature(
        [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)),
  }

  and for a simple algorithmic dataset with variable-length data it is this:

  data_fields_to_features = {
    'inputs': tf.VarLenFeature(tf.int64),
    'targets': tf.VarLenFeature(tf.int64),
  }

  The data_items_to_decoders dictionary argument can be left as None if there
  is no decoding to be performed. But, e.g. for images, it should be set so
  that the images are decoded from the features, e.g., like this for MNIST:

  data_items_to_decoders = {
    'image': tfexample_decoder.Image(
        image_key = 'image/encoded',
        format_key = 'image/format',
        shape=[28, 28],
        channels=1),
    'label': tfexample_decoder.Tensor('image/class/label'),
  }

  These arguments are compatible with the use of tf.contrib.slim.data module,
  see there for more documentation.

  Args:
    data_sources: a list or tuple of sources from which the data will be read,
      for example [/path/to/train@128, /path/to/train2*, /tmp/.../train3*]
    data_fields_to_features: a dictionary from data fields in the data sources
      to features, such as tf.VarLenFeature(tf.int64), see above for examples.
    training: a Boolean, whether to read for training or evaluation.
    data_items_to_decoders: a dictionary mapping data items (that will be
      in the returned result) to decoders that will decode them using features
      defined in data_fields_to_features; see above for examples. By default
      (if this is None), we grab the tensor from every feature.
    data_items_to_decode: a subset of data items that will be decoded;
      by default (if this is None), we decode all items.

  Returns:
    A dictionary mapping each data_field to a corresponding 1D int64 tensor
    read from the created queue.

  Raises:
    ValueError: if no files are found with the provided data_prefix or no data
      fields were provided.
  """
  with tf.name_scope("examples_queue"):
    # Read serialized examples using slim parallel_reader; shuffle and use
    # multiple readers only when reading for training.
    _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read(
        data_sources, tf.TFRecordReader, shuffle=training,
        num_readers=4 if training else 1)

    # Default: decode every feature straight to its tensor.
    if data_items_to_decoders is None:
      data_items_to_decoders = {
          field: tf.contrib.slim.tfexample_decoder.Tensor(field)
          for field in data_fields_to_features
      }

    decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder(
        data_fields_to_features, data_items_to_decoders)

    if data_items_to_decode is None:
      # Materialize as a list: on Python 3, dict.keys() is a view, and this
      # sequence is both passed as `items=` and zipped against the decoded
      # tensors below, so we need a stable, indexable sequence.
      data_items_to_decode = list(data_items_to_decoders)

    decoded = decoder.decode(example_serialized, items=data_items_to_decode)
    return {field: tensor
            for (field, tensor) in zip(data_items_to_decode, decoded)}


def batch_examples(examples, batch_size, bucket_boundaries=None):
  """Given a queue of examples, create batches of examples with similar lengths.

  We assume that examples is a dictionary with string keys and tensor values,
  possibly coming from a queue, e.g., constructed by examples_queue above.
  Each tensor in examples is assumed to be 1D. We will put tensors of similar
  length into batches together. We return a dictionary with the same keys as
  examples, and with values being batches of size batch_size. If elements have
  different lengths, they are padded with 0s. This function is based on
  tf.contrib.training.bucket_by_sequence_length so see there for details.

  For example, if examples is a queue containing [1, 2, 3] and [4], then
  this function with batch_size=2 will return a batch [[1, 2, 3], [4, 0, 0]].

  Args:
    examples: a dictionary with string keys and 1D tensor values.
    batch_size: a python integer or a scalar int32 tensor.
    bucket_boundaries: a list of integers for the boundaries that will be
      used for bucketing; see tf.contrib.training.bucket_by_sequence_length
      for more details; if None, we create a default set of buckets.

  Returns:
    A dictionary with the same keys as examples and with values being batches
    of examples padded with 0s, i.e., [batch_size x length] tensors.
  """
  # Create default buckets if none were provided.
  if bucket_boundaries is None:
    # NOTE: `range` (not Python-2-only `xrange`) keeps this Python-3
    # compatible; the produced lists are identical.
    # Small buckets -- go in steps of 8 until 64.
    small_buckets = [8 * (i + 1) for i in range(8)]
    # Medium buckets -- go in steps of 32 until 256.
    medium_buckets = [32 * (i + 3) for i in range(6)]
    # Large buckets -- go in steps of 128 until maximum of 1024.
    large_buckets = [128 * (i + 3) for i in range(6)]
    # By default use the above 20 bucket boundaries (21 queues in total).
    bucket_boundaries = small_buckets + medium_buckets + large_buckets
  with tf.name_scope("batch_examples"):
    # The queue to bucket on will be chosen based on maximum length.
    max_length = 0
    for v in examples.values():  # We assume 0-th dimension is the length.
      max_length = tf.maximum(max_length, tf.shape(v)[0])
    (_, outputs) = tf.contrib.training.bucket_by_sequence_length(
        max_length, examples, batch_size, bucket_boundaries,
        capacity=2 * batch_size, dynamic_pad=True)
    return outputs