📚 The CoCalc Library - books, templates and other resources

License: OTHER
""" The no frills implementation of word2vec skip-gram model using NCE loss.
Author: Chip Huyen
Prepared for the class CS 20SI: "TensorFlow for Deep Learning Research"
cs20si.stanford.edu
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data

VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128       # dimension of the word embedding vectors
SKIP_WINDOW = 1        # the context window
NUM_SAMPLED = 64       # number of negative examples to sample
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 2000       # how many steps to skip before reporting the loss

def word2vec(batch_gen):
    """ Build the graph for word2vec model and train it """
    # Step 1: define the placeholders for input and output
    with tf.name_scope('data'):
        center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
        target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_words')
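        # Each example pairs one center-word id with one context-word id; nce_loss
        # below expects its labels with shape [batch_size, num_true], which is why
        # target_words carries the extra trailing dimension of 1.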

    # Assemble this part of the graph on the CPU. You can change it to GPU if you have one.
    # Step 2: define weights. In word2vec, it's actually the weights that we care about.

    with tf.name_scope('embedding_matrix'):
        embed_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0),
                                   name='embed_matrix')

    # Step 3: define the inference
    with tf.name_scope('loss'):
        embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')
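        # embedding_lookup selects the rows of embed_matrix indexed by center_words,
        # giving a [BATCH_SIZE, EMBED_SIZE] batch of center-word vectors.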

        # Step 4: construct variables for NCE loss
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                      stddev=1.0 / (EMBED_SIZE ** 0.5)),
                                 name='nce_weight')
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')

        # define loss function to be NCE loss function
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                             biases=nce_bias,
                                             labels=target_words,
                                             inputs=embed,
                                             num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE), name='loss')
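        # tf.nn.nce_loss draws NUM_SAMPLED negative classes and trains the model to
        # tell the true target word apart from the sampled noise words, so a full
        # VOCAB_SIZE softmax never has to be computed at training time.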

    # Step 5: define optimizer
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
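    # minimize() adds the ops that compute gradients of the loss with respect to
    # embed_matrix, nce_weight and nce_bias, and apply one gradient-descent step.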

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        total_loss = 0.0  # we use this to calculate the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            loss_batch, _ = sess.run([loss, optimizer],
                                     feed_dict={center_words: centers, target_words: targets})
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
        writer.close()
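    # The graph written by the FileWriter above can be inspected with TensorBoard:
    #   tensorboard --logdir=./graphs/no_frills/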

def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)

if __name__ == '__main__':
    main()
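
The process_data helper lives in a separate course module that is not shown here; this script assumes it returns a generator yielding (centers, targets) batches as integer arrays of shape [BATCH_SIZE] and [BATCH_SIZE, 1]. As a rough sketch under that assumption, a hypothetical stand-in generator like the one below can be used to smoke-test the graph without downloading the corpus:

def fake_batch_gen(vocab_size=VOCAB_SIZE, batch_size=BATCH_SIZE):
    # Hypothetical stand-in for process_data: yields uniformly random word ids,
    # only useful for checking that the graph builds and the training loop runs.
    while True:
        centers = np.random.randint(0, vocab_size, size=batch_size)
        targets = np.random.randint(0, vocab_size, size=(batch_size, 1))
        yield centers, targets

# word2vec(fake_batch_gen())   # train on random ids instead of the real corpus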