# 📚 The CoCalc Library - books, templates and other resources
# License: OTHER
from collections import Counter
import random
import os
import sys
sys.path.append('..')
import zipfile

import numpy as np
from six.moves import urllib
import tensorflow as tf

import utils

def read_data(file_path):
    """ Read data into a list of tokens
    There should be 17,005,207 tokens
    """
    # the zip is assumed to contain a single member holding the whole corpus
    with zipfile.ZipFile(file_path) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return words

def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv

    Returns:
        dictionary: word -> index ('UNK' is index 0)
        index_dictionary: index -> word (inverse mapping)
    """
    utils.safe_mkdir(visual_fld)

    # 'UNK' takes slot 0; the remaining vocab_size - 1 slots go to the
    # most frequent words, in descending frequency order.
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))

    dictionary = dict()
    # fix: use a context manager so the file handle is released even if a
    # write fails (original left an open handle on error paths)
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'w') as file:
        for index, (word, _) in enumerate(count):
            dictionary[word] = index
            file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

def convert_words_to_index(words, dictionary):
    """ Replace each word in the dataset with its index in the dictionary.
    Out-of-vocabulary words map to 0 ('UNK').
    """
    # dict.get avoids the double lookup of `x if word in dictionary else 0`
    return [dictionary.get(word, 0) for word in words]

def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model.

    Yields (center, target) index pairs; the context width for each center
    word is drawn uniformly from [1, context_window_size].
    """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center word
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

def most_common_words(visual_fld, num_visualize):
    """ create a list of num_visualize most frequent words to visualize on TensorBoard.
    saved to visualization/vocab_[num_visualize].tsv
    """
    # fix: the original never closed the file it read from and left the
    # output handle's closing to chance on error; use context managers.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'r') as f:
        words = f.readlines()[:num_visualize]
    out_path = os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv')
    with open(out_path, 'w') as file:
        for word in words:
            file.write(word)

def batch_gen(download_url, expected_byte, vocab_size, batch_size,
              skip_window, visual_fld):
    """ Download the corpus (if needed), build the vocabulary, and yield
    (center_batch, target_batch) training batches indefinitely.

    center_batch: int32 array of shape (batch_size,)
    target_batch: array of shape (batch_size, 1)
        NOTE(review): dtype is the NumPy float default, preserved from the
        original — confirm whether the consumer (e.g. an NCE loss) expects
        integer labels before changing it.
    """
    local_dest = 'data/text8.zip'
    utils.download_one_file(download_url, local_dest, expected_byte)
    words = read_data(local_dest)
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(single_gen)
        yield center_batch, target_batch