📚 The CoCalc Library - books, templates and other resources

License: OTHER

""" The no frills implementation of word2vec skip-gram model using NCE loss.
Author: Chip Huyen
Prepared for the class CS 20SI: "TensorFlow for Deep Learning Research"
cs20si.stanford.edu
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data

VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128  # dimension of the word embedding vectors
SKIP_WINDOW = 1  # the context window
NUM_SAMPLED = 64  # number of negative examples to sample
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 20000
SKIP_STEP = 2000  # how many steps to skip before reporting the loss

def word2vec(batch_gen):
    """ Build the graph for word2vec model and train it """
    # Step 1: define the placeholders for input and output
    # center_words have to be int to work on embedding lookup

    # TO DO
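    # One possible way to fill this in (a sketch, not the official solution):
    # center_words holds the word indices of the batch; target_words holds the
    # context word for each center word, shaped [BATCH_SIZE, 1] because
    # tf.nn.nce_loss expects labels with a trailing num_true dimension.
    center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_words')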

    # Step 2: define weights. In word2vec, it's actually the weights that we care about
    # vocab size x embed size
    # initialized to random uniform -1 to 1

    # TO DO
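    # A possible sketch: the embedding matrix is the parameter we ultimately
    # keep; each row is the EMBED_SIZE-dimensional vector of one vocabulary word.
    embed_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0),
                               name='embed_matrix')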

    # Step 3: define the inference
    # get the embed of input words using tf.nn.embedding_lookup
    # embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

    # TO DO
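    # Following the hint above (assumes center_words and embed_matrix from the
    # sketches in Steps 1 and 2):
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')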

    # Step 4: construct variables for NCE loss
    # tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)
    # nce_weight (vocab size x embed size), initialized to truncated_normal stddev=1.0 / (EMBED_SIZE ** 0.5)
    # bias: vocab size, initialized to 0

    # TO DO
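    # A sketch that follows the shapes and initializers described above:
    nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                 stddev=1.0 / (EMBED_SIZE ** 0.5)),
                             name='nce_weight')
    nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')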

    # define loss function to be NCE loss function
    # tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)
    # need to get the mean across the batch
    # note: you should use embedding of center words for inputs, not center words themselves

    # TO DO
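    # One way to write it (assumes embed, nce_weight, nce_bias and target_words
    # from the sketches above); tf.reduce_mean takes the mean across the batch:
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                         biases=nce_bias,
                                         labels=target_words,
                                         inputs=embed,
                                         num_sampled=NUM_SAMPLED,
                                         num_classes=VOCAB_SIZE),
                          name='loss')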

    # Step 5: define optimizer

    # TO DO
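    # A plain gradient-descent optimizer is one reasonable choice here (a sketch;
    # other optimizers such as Adam would also work):
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)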

    with tf.Session() as sess:
        # TO DO: initialize variables
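        # For example (a sketch): run the op that initializes every variable
        # defined above before any training step.
        sess.run(tf.global_variables_initializer())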

        total_loss = 0.0  # we use this to calculate the average loss in the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            # TO DO: create feed_dict, run optimizer, fetch loss_batch
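            # A possible training step (assumes the placeholders, loss and
            # optimizer from the sketches above):
            feed_dict = {center_words: centers, target_words: targets}
            loss_batch, _ = sess.run([loss, optimizer], feed_dict=feed_dict)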
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
        writer.close()

def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)

if __name__ == '__main__':
    main()