CoCalc -- 04_word2vec.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / stanford-tensorflow-tutorials / examples / 04_word2vec.py
¹³²⁹²³ views
License: OTHER
1
""" starter code for word2vec skip-gram model with NCE loss
CS 20: "TensorFlow for Deep Learning Research"
cs20.stanford.edu
Chip Huyen ([email protected])
Lecture 04
"""
7

8
import os
9
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
10

11
import numpy as np
12
from tensorflow.contrib.tensorboard.plugins import projector
13
import tensorflow as tf
14

15
import utils
16
import word2vec_utils
17

18
# Model hyperparameters
VOCAB_SIZE = 50000          # rows of the embedding / NCE weight matrices
BATCH_SIZE = 128            # leading dimension of every batch tensor
EMBED_SIZE = 128            # dimension of the word embedding vectors
SKIP_WINDOW = 1             # the context window
NUM_SAMPLED = 64            # number of negative examples to sample
LEARNING_RATE = 1.0         # step size for plain gradient descent
NUM_TRAIN_STEPS = 100000    # iterations of the training loop
VISUAL_FLD = 'visualization'  # folder name handed to word2vec_utils.batch_gen
SKIP_STEP = 5000            # report the average loss every SKIP_STEP steps

# Parameters for downloading data
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016
NUM_VISUALIZE = 3000        # number of tokens to visualize
33

34

35
def word2vec(dataset):
    """Build the graph for the word2vec skip-gram model and train it.

    Args:
        dataset: a dataset exposing ``make_initializable_iterator()`` whose
            elements unpack into ``(center_words, target_words)``.

    Side effects:
        Creates the 'checkpoints' directory, writes the graph to
        'graphs/word2vec_simple' for TensorBoard, and prints the average
        loss once every SKIP_STEP training steps.
    """
    # Step 1: get input, output from the dataset
    with tf.name_scope('data'):
        iterator = dataset.make_initializable_iterator()
        center_words, target_words = iterator.get_next()

    # Step 2 + 3: define weights and embedding lookup.
    # In word2vec, it's actually the weights that we care about.
    with tf.name_scope('embed'):
        embed_matrix = tf.get_variable(
            'embed_matrix',
            shape=[VOCAB_SIZE, EMBED_SIZE],
            initializer=tf.random_uniform_initializer())
        embed = tf.nn.embedding_lookup(embed_matrix, center_words,
                                       name='embedding')

    # Step 4: construct variables for NCE loss and define loss function
    with tf.name_scope('loss'):
        nce_weight = tf.get_variable(
            'nce_weight',
            shape=[VOCAB_SIZE, EMBED_SIZE],
            initializer=tf.truncated_normal_initializer(
                stddev=1.0 / (EMBED_SIZE ** 0.5)))
        nce_bias = tf.get_variable('nce_bias',
                                   initializer=tf.zeros([VOCAB_SIZE]))

        # define loss function to be NCE loss function (mean over the batch)
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                             biases=nce_bias,
                                             labels=target_words,
                                             inputs=embed,
                                             num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE),
                              name='loss')

    # Step 5: define optimizer; minimize() returns the op that applies one
    # gradient-descent update when run.
    with tf.name_scope('optimizer'):
        train_op = tf.train.GradientDescentOptimizer(
            LEARNING_RATE).minimize(loss)

    utils.safe_mkdir('checkpoints')

    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)
        try:
            # accumulates the loss so we can report the average over the
            # last SKIP_STEP steps
            total_loss = 0.0

            for index in range(NUM_TRAIN_STEPS):
                try:
                    loss_batch, _ = sess.run([loss, train_op])
                except tf.errors.OutOfRangeError:
                    # Data exhausted: restart the iterator. This step does
                    # no update and contributes nothing to total_loss.
                    sess.run(iterator.initializer)
                    continue

                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    # index is zero-based, so the step printed here is one
                    # less than the number of steps completed so far.
                    print('Average loss at step {}: {:5.1f}'.format(
                        index, total_loss / SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Close the event file even if training raises.
            writer.close()
88

89
def gen():
    """Yield every item produced by ``word2vec_utils.batch_gen``.

    Zero-argument wrapper: all configuration comes from the module-level
    constants, so the function can be passed as a bare callable (``main``
    hands it to ``tf.data.Dataset.from_generator``).
    """
    yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,
                                        VOCAB_SIZE, BATCH_SIZE,
                                        SKIP_WINDOW, VISUAL_FLD)
92

93
def main():
    """Wrap ``gen`` in a ``tf.data.Dataset`` and train the word2vec model."""
    # Each dataset element is declared as a pair of int32 tensors with
    # shapes [BATCH_SIZE] and [BATCH_SIZE, 1].
    dataset = tf.data.Dataset.from_generator(
        gen,
        (tf.int32, tf.int32),
        (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))
    word2vec(dataset)
98

99
# Run training only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
101

102
Product

Resources

Company