📚 The CoCalc Library - books, templates and other resources
License: OTHER
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import Counter
import random
import os
import sys
sys.path.append('..')
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

import utils

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'

def download(file_name, expected_bytes):
    """ Download the dataset text8 if it's not already downloaded """
    file_path = DATA_FOLDER + file_name
    if os.path.exists(file_path):
        print("Dataset ready")
        return file_path
    file_name, _ = urllib.request.urlretrieve(DOWNLOAD_URL + file_name, file_path)
    file_stat = os.stat(file_path)
    if file_stat.st_size == expected_bytes:
        print('Successfully downloaded the file', file_name)
    else:
        raise Exception('File ' + file_name +
                        ' might be corrupted. You should try downloading it with a browser.')
    return file_path

def read_data(file_path):
    """ Read data into a list of tokens
    There should be 17,005,207 tokens
    """
    with zipfile.ZipFile(file_path) as f:
        # tf.compat.as_str() converts the bytes read from the zip file into a string
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

def build_vocab(words, vocab_size):
    """ Build vocabulary of the vocab_size most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    utils.make_dir('processed')
    with open('processed/vocab_1000.tsv', "w") as f:
        for word, _ in count:
            dictionary[word] = index
            if index < 1000:
                f.write(word + "\n")
            index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

def convert_words_to_index(words, dictionary):
    """ Replace each word in the dataset with its index in the dictionary """
    return [dictionary[word] if word in dictionary else 0 for word in words]

def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as NumPy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center_batch[index], target_batch[index] = next(iterator)
            except StopIteration:
                # Stop cleanly when the pair generator is exhausted; letting
                # StopIteration escape a generator raises RuntimeError (PEP 479).
                return
        yield center_batch, target_batch

def process_data(vocab_size, batch_size, skip_window):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    dictionary, _ = build_vocab(words, vocab_size)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)

def get_index_vocab(vocab_size):
    file_path = download(FILE_NAME, EXPECTED_BYTES)
    words = read_data(file_path)
    return build_vocab(words, vocab_size)
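
As a sanity check, here is a minimal usage sketch. The first snippet is self-contained and uses a made-up toy index list; the second runs the full pipeline, so it assumes the companion utils module is importable and that the ~31 MB text8.zip can be fetched (or already sits in data/). The vocab_size, batch_size, and skip_window values below are illustrative choices, not values fixed by the script.

# Toy run of the pair generator: with a context window of 1, each center
# word is paired only with its immediate neighbours.
for center, target in generate_sample([10, 11, 12, 13], context_window_size=1):
    print(center, target)  # 10 11, 11 10, 11 12, 12 11, 12 13, 13 12

# End-to-end (illustrative parameters): pull one batch of skip-gram pairs.
batch_gen = process_data(vocab_size=10000, batch_size=128, skip_window=5)
centers, targets = next(batch_gen)
print(centers.shape, targets.shape)  # (128,) and (128, 1)

Note that each yielded batch pairs a vector of center-word indices with a [batch_size, 1] column of target indices, the shape expected by candidate-sampling losses such as NCE.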