📚 The CoCalc Library - books, templates and other resources
License: OTHER
""" word2vec with NCE loss and code to visualize the embeddings on TensorBoard1Author: Chip Huyen2Prepared for the class CS 20SI: "TensorFlow for Deep Learning Research"3cs20si.stanford.edu4"""56from __future__ import absolute_import7from __future__ import division8from __future__ import print_function910import os11os.environ['TF_CPP_MIN_LOG_LEVEL']='2'1213import numpy as np14from tensorflow.contrib.tensorboard.plugins import projector15import tensorflow as tf1617from process_data import process_data18import utils1920VOCAB_SIZE = 5000021BATCH_SIZE = 12822EMBED_SIZE = 128 # dimension of the word embedding vectors23SKIP_WINDOW = 1 # the context window24NUM_SAMPLED = 64 # Number of negative examples to sample.25LEARNING_RATE = 1.026NUM_TRAIN_STEPS = 10000027WEIGHTS_FLD = 'processed/'28SKIP_STEP = 20002930class SkipGramModel:31""" Build the graph for word2vec model """32def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):33self.vocab_size = vocab_size34self.embed_size = embed_size35self.batch_size = batch_size36self.num_sampled = num_sampled37self.lr = learning_rate38self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')3940def _create_placeholders(self):41""" Step 1: define the placeholders for input and output """42with tf.name_scope("data"):43self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')44self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')4546def _create_embedding(self):47""" Step 2: define weights. In word2vec, it's actually the weights that we care about """48# Assemble this part of the graph on the CPU. You can change it to GPU if you have GPU49with tf.device('/cpu:0'):50with tf.name_scope("embed"):51self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size,52self.embed_size], -1.0, 1.0),53name='embed_matrix')5455def _create_loss(self):56""" Step 3 + 4: define the model + the loss function """57with tf.device('/cpu:0'):58with tf.name_scope("loss"):59# Step 3: define the inference60embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')6162# Step 4: define loss function63# construct variables for NCE loss64nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],65stddev=1.0 / (self.embed_size ** 0.5)),66name='nce_weight')67nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')6869# define loss function to be NCE loss function70self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,71biases=nce_bias,72labels=self.target_words,73inputs=embed,74num_sampled=self.num_sampled,75num_classes=self.vocab_size), name='loss')76def _create_optimizer(self):77""" Step 5: define optimizer """78with tf.device('/cpu:0'):79self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss,80global_step=self.global_step)8182def _create_summaries(self):83with tf.name_scope("summaries"):84tf.summary.scalar("loss", self.loss)85tf.summary.histogram("histogram loss", self.loss)86# because you have several summaries, we should merge them all87# into one op to make it easier to manage88self.summary_op = tf.summary.merge_all()8990def build_graph(self):91""" Build the graph for our model """92self._create_placeholders()93self._create_embedding()94self._create_loss()95self._create_optimizer()96self._create_summaries()9798def train_model(model, batch_gen, num_train_steps, weights_fld):99saver = tf.train.Saver() # defaults to saving all variables - in this case embed_matrix, nce_weight, 
nce_bias100101initial_step = 0102utils.make_dir('checkpoints')103with tf.Session() as sess:104sess.run(tf.global_variables_initializer())105ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))106# if that checkpoint exists, restore from checkpoint107if ckpt and ckpt.model_checkpoint_path:108saver.restore(sess, ckpt.model_checkpoint_path)109110total_loss = 0.0 # we use this to calculate late average loss in the last SKIP_STEP steps111writer = tf.summary.FileWriter('improved_graph/lr' + str(LEARNING_RATE), sess.graph)112initial_step = model.global_step.eval()113for index in range(initial_step, initial_step + num_train_steps):114centers, targets = next(batch_gen)115feed_dict={model.center_words: centers, model.target_words: targets}116loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],117feed_dict=feed_dict)118writer.add_summary(summary, global_step=index)119total_loss += loss_batch120if (index + 1) % SKIP_STEP == 0:121print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))122total_loss = 0.0123saver.save(sess, 'checkpoints/skip-gram', index)124125####################126# code to visualize the embeddings. uncomment the below to visualize embeddings127# run "'tensorboard --logdir='processed'" to see the embeddings128# final_embed_matrix = sess.run(model.embed_matrix)129130# # it has to variable. constants don't work here. you can't reuse model.embed_matrix131# embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')132# sess.run(embedding_var.initializer)133134# config = projector.ProjectorConfig()135# summary_writer = tf.summary.FileWriter('processed')136137# # add embedding to the config file138# embedding = config.embeddings.add()139# embedding.tensor_name = embedding_var.name140141# # link this tensor to its metadata file, in this case the first 500 words of vocab142# embedding.metadata_path = 'processed/vocab_1000.tsv'143144# # saves a configuration file that TensorBoard will read during startup.145# projector.visualize_embeddings(summary_writer, config)146# saver_embed = tf.train.Saver([embedding_var])147# saver_embed.save(sess, 'processed/model3.ckpt', 1)148149def main():150model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)151model.build_graph()152batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)153train_model(model, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)154155if __name__ == '__main__':156main()157158
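
As a quick sanity check on the trained vectors, you can also rank rows of the embedding matrix by cosine similarity with NumPy instead of (or before) opening the TensorBoard projector. The sketch below is not part of the original script: it assumes you have pulled `final_embed_matrix = sess.run(model.embed_matrix)` out of the session (as in the commented-out visualization code) and that you have some `index_to_word` mapping from your own preprocessing, which `process_data` does not expose here.

# Minimal nearest-neighbor check on the trained embeddings (a sketch, not part of
# the original script). Assumes `final_embed_matrix` is the NumPy array returned by
# sess.run(model.embed_matrix); `index_to_word` is a hypothetical index-to-word list.
def nearest_neighbors(embed_matrix, word_index, top_k=8):
    """ Return the indices of the top_k rows most cosine-similar to word_index. """
    norms = np.linalg.norm(embed_matrix, axis=1, keepdims=True)
    normalized = embed_matrix / np.maximum(norms, 1e-12)   # unit-length rows
    similarities = normalized.dot(normalized[word_index])  # cosine similarity to the query row
    order = np.argsort(-similarities)                      # most similar first
    return [i for i in order if i != word_index][:top_k]

# hypothetical usage after training:
# neighbors = nearest_neighbors(final_embed_matrix, word_index=42)
# print([index_to_word[i] for i in neighbors])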