""" starter code for word2vec skip-gram model with NCE loss1Eager execution2CS 20: "TensorFlow for Deep Learning Research"3cs20.stanford.edu4Chip Huyen ([email protected]) & Akshay Agrawal ([email protected])5Lecture 046"""78import os9os.environ['TF_CPP_MIN_LOG_LEVEL']='2'1011import numpy as np12import tensorflow as tf13import tensorflow.contrib.eager as tfe1415import utils16import word2vec_utils1718tfe.enable_eager_execution()1920# Model hyperparameters21VOCAB_SIZE = 5000022BATCH_SIZE = 12823EMBED_SIZE = 128 # dimension of the word embedding vectors24SKIP_WINDOW = 1 # the context window25NUM_SAMPLED = 64 # number of negative examples to sample26LEARNING_RATE = 1.027NUM_TRAIN_STEPS = 10000028VISUAL_FLD = 'visualization'29SKIP_STEP = 50003031# Parameters for downloading data32DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'33EXPECTED_BYTES = 313440163435class Word2Vec(object):36def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):37self.vocab_size = vocab_size38self.num_sampled = num_sampled39self.embed_matrix = tfe.Variable(tf.random_uniform(40[vocab_size, embed_size]))41self.nce_weight = tfe.Variable(tf.truncated_normal(42[vocab_size, embed_size],43stddev=1.0 / (embed_size ** 0.5)))44self.nce_bias = tfe.Variable(tf.zeros([vocab_size]))4546def compute_loss(self, center_words, target_words):47"""Computes the forward pass of word2vec with the NCE loss."""48embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)49loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,50biases=self.nce_bias,51labels=target_words,52inputs=embed,53num_sampled=self.num_sampled,54num_classes=self.vocab_size))55return loss565758def gen():59yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,60VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW,61VISUAL_FLD)6263def main():64dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),65(tf.TensorShape([BATCH_SIZE]),66tf.TensorShape([BATCH_SIZE, 1])))67optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)68model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)69grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)70total_loss = 0.0 # for average loss in the last SKIP_STEP steps71num_train_steps = 072while num_train_steps < NUM_TRAIN_STEPS:73for center_words, target_words in tfe.Iterator(dataset):74if num_train_steps >= NUM_TRAIN_STEPS:75break76loss_batch, grads = grad_fn(center_words, target_words)77total_loss += loss_batch78optimizer.apply_gradients(grads)79if (num_train_steps + 1) % SKIP_STEP == 0:80print('Average loss at step {}: {:5.1f}'.format(81num_train_steps, total_loss / SKIP_STEP))82total_loss = 0.083num_train_steps += 1848586if __name__ == '__main__':87main()888990