📚 The CoCalc Library - books, templates and other resources
License: OTHER
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import Counter
import random
import os
import sys
sys.path.append('..')
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

import utils

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'

def download(file_name, expected_bytes):
    """ Download the dataset text8 if it's not already downloaded """
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path

def read_data(file_path):
    """ Read data into a list of tokens
    There should be 17,005,207 tokens
    """
    with zipfile.ZipFile(file_path) as f:
        # tf.compat.as_str() converts the bytes read from the zip file into a string
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

def build_vocab(words, vocab_size):
    """ Build vocabulary of the vocab_size most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    utils.make_dir('processed')
    with open('processed/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

def convert_words_to_index(words, dictionary):
    """ Replace each word in the dataset with its index in the dictionary """
    return [dictionary[word] if word in dictionary else 0 for word in words]

def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as NumPy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center_batch[index], target_batch[index] = next(iterator)
            except StopIteration:
                # Stop cleanly when the pair generator is exhausted; letting
                # StopIteration escape a generator raises RuntimeError (PEP 479).
                return
        yield center_batch, target_batch

def process_data(vocab_size, batch_size, skip_window):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    dictionary, _ = build_vocab(words, vocab_size)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)

def get_index_vocab(vocab_size):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    return build_vocab(words, vocab_size)
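
As a sanity check, here is a minimal usage sketch. The first snippet is self-contained and uses a made-up toy index list; the second runs the full pipeline, so it assumes the companion utils module is importable and that the ~31 MB text8.zip can be fetched (or already sits in data/). The vocab_size, batch_size, and skip_window values below are illustrative choices, not values fixed by the script.

# Toy run of the pair generator: with a context window of 1, each center
# word is paired only with its immediate neighbours.
for center, target in generate_sample([10, 11, 12, 13], context_window_size=1):
    print(center, target)  # 10 11, 11 10, 11 12, 12 11, 12 13, 13 12

# End-to-end (illustrative parameters): pull one batch of skip-gram pairs.
batch_gen = process_data(vocab_size=10000, batch_size=128, skip_window=5)
centers, targets = next(batch_gen)
print(centers.shape, targets.shape)  # (128,) and (128, 1)

Note that each yielded batch pairs a vector of center-word indices with a [batch_size, 1] column of target indices, the shape expected by candidate-sampling losses such as NCE.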