CoCalc -- utils.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / stanford-tensorflow-tutorials / examples / utils.py
132923 views
License: OTHER
1
import os
2
import gzip
3
import shutil
4
import struct
5
import urllib
6

7
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
8

9
from matplotlib import pyplot as plt
10
import numpy as np
11
import tensorflow as tf
12

13
def huber_loss(labels, predictions, delta=14.0):
    """
    Compute the Huber loss between labels and predictions.

    With r = |labels - predictions| the loss is
        0.5 * r^2                    where r < delta
        delta * r - 0.5 * delta^2    otherwise

    The branch is selected element-wise, so the inputs may be scalars or
    tensors of matching shape; the result has the shape of the residual.
    """
    residual = tf.abs(labels - predictions)
    quadratic = 0.5 * tf.square(residual)
    linear = delta * residual - 0.5 * tf.square(delta)
    # tf.cond only accepts a scalar predicate and therefore breaks on batched
    # input; tf.where picks the branch per element and still handles scalars.
    return tf.where(residual < delta, quadratic, linear)
18

19
def safe_mkdir(path):
    """
    Create a directory if there isn't one already.

    Missing parent directories are created as well, so nested paths such as
    'data/mnist' work on a fresh checkout. An already existing directory is
    not an error.

    Raises OSError for real failures (e.g. permission denied, or path exists
    but is not a directory) instead of silently ignoring them.
    """
    os.makedirs(path, exist_ok=True)
25

26
def read_birth_life_data(filename):
    """
    Read in birth_life_2010.txt and return:
    data in the form of NumPy array
    n_samples: number of samples

    The file is tab-separated with one header line, which is skipped.
    Column 1 is read as the birth rate and column 2 as the life expectancy;
    data is a float32 array with one (birth, life) row per sample.
    Blank lines are ignored.
    """
    with open(filename, 'r') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        # Strip only the line terminator: slicing off the last character
        # would eat a digit when the final line has no trailing newline.
        rows = [line.rstrip('\r\n').split('\t') for line in f if line.strip()]

    samples = [(float(row[1]), float(row[2])) for row in rows]
    n_samples = len(samples)
    data = np.asarray(samples, dtype=np.float32)
    return data, n_samples
40

41
def download_one_file(download_url,
                    local_dest,
                    expected_byte=None,
                    unzip_and_remove=False):
    """
    Download the file from download_url into local_dest
    if the file doesn't already exist.

    The download is skipped when either local_dest or local_dest without
    its last three characters (the '.gz' suffix) is already present.
    If expected_byte is provided, check if the downloaded file has the
    same number of bytes; on a mismatch a message is printed and the file
    is left in place without being unzipped.
    If unzip_and_remove is True, gunzip the file to local_dest minus '.gz'
    and remove the archive. This happens whether or not expected_byte
    was given.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    unzipped_dest = local_dest[:-3]  # local_dest without the '.gz' suffix
    if os.path.exists(local_dest) or os.path.exists(unzipped_dest):
        print('%s already exists' %local_dest)
        return

    print('Downloading %s' %download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    if expected_byte:
        if os.stat(local_dest).st_size != expected_byte:
            print('The downloaded file has unexpected number of bytes')
            return
        print('Successfully downloaded %s' %local_dest)

    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(unzipped_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)
67

68
def download_mnist(path):
    """
    Download and unzip the dataset mnist if it's not already downloaded
    Download from http://yann.lecun.com/exdb/mnist

    The four archives (train/test images and labels) are stored in path,
    each checked against its expected size in bytes and then unzipped.
    """
    safe_mkdir(path)
    url = 'http://yann.lecun.com/exdb/mnist'
    filenames = ['train-images-idx3-ubyte.gz',
                'train-labels-idx1-ubyte.gz',
                't10k-images-idx3-ubyte.gz',
                't10k-labels-idx1-ubyte.gz']
    expected_bytes = [9912422, 28881, 1648877, 4542]

    for filename, byte in zip(filenames, expected_bytes):
        # URLs always use '/', whereas os.path.join would insert the
        # platform separator (a backslash on Windows).
        download_url = url + '/' + filename
        local_dest = os.path.join(path, filename)
        download_one_file(download_url, local_dest, byte, True)
85

86
def parse_data(path, dataset, flatten):
    """
    Load one split of MNIST from the unzipped idx files found in path.

    dataset is the file-name prefix and must be 'train' or 't10k',
    otherwise NameError is raised.
    Return (imgs, labels):
    imgs: float32 pixel values divided by 255, shaped (num, rows, cols),
          or (num, rows * cols) when flatten is true
    labels: (num, 10) array with a 1 in the column of each sample's label
    """
    if dataset not in ('train', 't10k'):
        raise NameError('dataset must be train or t10k')

    # Labels: 8-byte big-endian header (magic, count), then one byte per label.
    labels_path = os.path.join(path, dataset + '-labels-idx1-ubyte')
    with open(labels_path, 'rb') as fh:
        header = fh.read(8)
        _, n_labels = struct.unpack(">II", header)
        digit_ids = np.fromfile(fh, dtype=np.int8)
        one_hot = np.zeros((n_labels, 10))
        one_hot[np.arange(n_labels), digit_ids] = 1

    # Images: 16-byte big-endian header (magic, count, rows, cols), then pixels.
    images_path = os.path.join(path, dataset + '-images-idx3-ubyte')
    with open(images_path, 'rb') as fh:
        header = fh.read(16)
        _, n_images, height, width = struct.unpack(">IIII", header)
        raw_pixels = np.fromfile(fh, dtype=np.uint8)
        pixels = raw_pixels.reshape(n_images, height, width)
        pixels = pixels.astype(np.float32) / 255.0
        if flatten:
            pixels = pixels.reshape([n_images, -1])

    return pixels, one_hot
106

107
def read_mnist(path, flatten=True, num_train=55000):
    """
    Read in the mnist dataset, given that the data is stored in path
    Return three tuples of numpy arrays
    ((train_imgs, train_labels), (val_imgs, val_labels), (test_imgs, test_labels))

    The 'train' files are shuffled with a random permutation; the first
    num_train samples form the training split and the remainder the
    validation split. The test split comes from the 't10k' files.
    """
    imgs, labels = parse_data(path, 'train', flatten)
    # Random (unseeded) shuffle, so the train/val membership differs per call.
    indices = np.random.permutation(labels.shape[0])
    train_idx, val_idx = indices[:num_train], indices[num_train:]
    train_img, train_labels = imgs[train_idx, :], labels[train_idx, :]
    val_img, val_labels = imgs[val_idx, :], labels[val_idx, :]
    test = parse_data(path, 't10k', flatten)
    return (train_img, train_labels), (val_img, val_labels), test
120

121
def get_mnist_dataset(batch_size):
    """
    Download MNIST into 'data/mnist' if needed and build batched datasets.

    Return (train_data, test_data) as tf.data.Dataset objects batched by
    batch_size. Images are read unflattened; the training data is shuffled
    with a buffer of 10000 before batching. The validation split returned
    by read_mnist is not used here.
    """
    # Fetch and load the data.
    folder = 'data/mnist'
    download_mnist(folder)
    train_split, _, test_split = read_mnist(folder, flatten=False)

    # Training pipeline: slice -> shuffle -> batch.
    train_data = (tf.data.Dataset.from_tensor_slices(train_split)
                  .shuffle(10000)
                  .batch(batch_size))

    # Test pipeline: slice -> batch (no shuffling).
    test_data = tf.data.Dataset.from_tensor_slices(test_split).batch(batch_size)

    return train_data, test_data
136
    
137
def show(image):
    """
    Display a 2D array of pixel data as a grayscale image.

    Draws the image with matplotlib and then calls plt.show().
    """
    plt.imshow(image, cmap='gray')
    plt.show()
143
Product

Resources

Company