from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import Counter
import random
import os
import sys
sys.path.append('..')
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

import utils

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'

def download(file_name, expected_bytes):
    """ Download the dataset text8 if it's not already downloaded """
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    utils.make_dir(DATA_FOLDER)  # make sure the data folder exists before downloading into it
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path

def read_data(file_path):
    """ Read data into a list of tokens
    There should be 17,005,207 tokens
    """
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
        # tf.compat.as_str() converts the input into a string
    return words

def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    utils.make_dir('processed')
    with open('processed/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

def convert_words_to_index(words, dictionary):
    """ Replace each word in the dataset with its index in the dictionary """
    return [dictionary[word] if word in dictionary else 0 for word in words]

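# Illustrative sketch (not part of the original script): for a toy corpus the two
# helpers above behave roughly as follows, assuming vocab_size = 3 (note that
# build_vocab also writes processed/vocab_1000.tsv as a side effect):
#   words = ['the', 'cat', 'sat', 'the']
#   dictionary -> {'UNK': 0, 'the': 1, 'cat': 2}   # 'UNK' plus the 2 most frequent words
#   convert_words_to_index(words, dictionary) -> [1, 2, 0, 1]   # 'sat' is out of vocabulary -> 0
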
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

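# Illustrative sketch (not part of the original script): each element is paired with
# its neighbours inside a randomly drawn window. With context_window_size = 1 the
# window size is always 1, so the output is deterministic:
#   list(generate_sample([5, 8, 2], 1)) -> [(5, 8), (8, 5), (8, 2), (2, 8)]
# Larger window sizes yield a random subset of wider (center, target) pairs.
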
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch

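# Illustrative sketch (not part of the original script): each step of the generator
# returns one pair of arrays, e.g. with batch_size = 128:
#   centers, targets = next(batch_gen)
#   centers.shape -> (128,)    int32 center-word indices
#   targets.shape -> (128, 1)  context-word indices, shaped as [batch_size, 1] labels
#                              (the form expected by losses such as tf.nn.nce_loss)
# The loop keeps producing batches until the underlying sample generator is exhausted.
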
def process_data(vocab_size, batch_size, skip_window):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    dictionary, _ = build_vocab(words, vocab_size)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)

def get_index_vocab(vocab_size):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    return build_vocab(words, vocab_size)
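
# Minimal usage sketch (not part of the original script). The hyperparameter values
# below are illustrative placeholders, not prescribed by this module:
if __name__ == '__main__':
    VOCAB_SIZE = 50000
    BATCH_SIZE = 128
    SKIP_WINDOW = 1

    # process_data downloads text8, builds the vocabulary, and returns a generator
    # of (center_batch, target_batch) NumPy array pairs ready for training.
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    centers, targets = next(batch_gen)
    print('centers:', centers.shape, centers.dtype)  # (128,) int32
    print('targets:', targets.shape, targets.dtype)  # (128, 1) float64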