Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download

📚 The CoCalc Library - books, templates and other resources

132927 views
License: OTHER
1
""" Some people tried to use TextLineReader for the assignment 1
2
but seem to have problems getting it work, so here is a short
3
script demonstrating the use of CSV reader on the heart dataset.
4
Note that the heart dataset is originally in txt so I first
5
converted it to csv to take advantage of the already laid out columns.
6
7
You can download heart.csv in the data folder.
8
Author: Chip Huyen
9
Prepared for the class CS 20SI: "TensorFlow for Deep Learning Research"
10
cs20si.stanford.edu
11
"""
12
import os
13
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
14
15
import sys
16
sys.path.append('..')
17
18
import tensorflow as tf
19
20
DATA_PATH = 'data/heart.csv'
21
BATCH_SIZE = 2
22
N_FEATURES = 9
23
24
def batch_generator(filenames):
    """ filenames is the list of files you want to read from.
    In this case, it contains only heart.csv

    Returns a (data_batch, label_batch) pair of tensors that dequeue
    BATCH_SIZE shuffled examples per sess.run call.
    """
    filename_queue = tf.train.string_input_producer(filenames)
    reader = tf.TextLineReader(skip_header_lines=1)  # skip the first line in the file
    _, value = reader.read(filename_queue)

    # record_defaults are the default values in case some of our columns are empty
    # This is also to tell tensorflow the format of our data (the type of the decode result)
    # for this dataset, out of 9 feature columns,
    # 8 of them are floats (some are integers, but to make our features homogeneous,
    # we consider them floats), and 1 is a string (at position 5)
    # the last column, corresponding to the label, is an integer
    record_defaults = [[1.0] for _ in range(N_FEATURES)]
    record_defaults[4] = ['']
    record_defaults.append([1])

    # parse one CSV line into its 10 column tensors
    content = tf.decode_csv(value, record_defaults=record_defaults)

    # convert the 5th column (Present/Absent) to the binary values 1.0 and 0.0
    content[4] = tf.cond(tf.equal(content[4], tf.constant('Present')),
                         lambda: tf.constant(1.0),
                         lambda: tf.constant(0.0))

    # pack all 9 features into a single tensor
    features = tf.stack(content[:N_FEATURES])

    # assign the last column to label
    label = content[-1]

    # minimum number of elements in the queue after a dequeue, used to ensure
    # that the samples are sufficiently mixed
    # I think 10 times the BATCH_SIZE is sufficient
    min_after_dequeue = 10 * BATCH_SIZE

    # the maximum number of elements in the queue
    capacity = 20 * BATCH_SIZE

    # shuffle the data to generate BATCH_SIZE sample pairs
    data_batch, label_batch = tf.train.shuffle_batch(
        [features, label], batch_size=BATCH_SIZE,
        capacity=capacity, min_after_dequeue=min_after_dequeue)

    return data_batch, label_batch
68
69
def generate_batches(data_batch, label_batch):
    """Run the input pipeline for 10 batches and print each feature batch.

    data_batch, label_batch: the dequeue tensors returned by batch_generator.
    """
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            for _ in range(10):  # generate 10 batches
                features, labels = sess.run([data_batch, label_batch])
                print(features)
        finally:
            # always stop and join the reader threads, even if sess.run raises
            # (e.g. OutOfRangeError), so the process can exit cleanly
            coord.request_stop()
            coord.join(threads)
78
79
def main():
    """Build the heart.csv input pipeline and consume 10 batches from it."""
    data_batch, label_batch = batch_generator([DATA_PATH])
    generate_batches(data_batch, label_batch)

if __name__ == '__main__':
    main()
85
86