Path: blob/master/C5 - Sequence Models/Week 2/Word Vector Representation/w2v_utils.py
"""Utility functions for the "Word Vector Representation" assignment:
downloading and tokenizing the text8 corpus, building a word-index dataset,
a nearest-neighbour similarity callback for a Keras word2vec model, GloVe
loading, and small numpy building blocks (relu, softmax, parameter
initialization)."""

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

window_size = 3
vector_dim = 300
epochs = 1000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


class SimilarityCallback:
    # Note: relies on module-level globals (`reverse_dictionary`, `vocab_size`,
    # `validation_model`) that the accompanying training script is expected
    # to define before run_sim is called.
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        # Score the validation word against every word in the vocabulary.
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim


def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map


def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)

    return s
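
# A minimal usage sketch (an addition, not part of the original utilities):
# the vectors returned by read_glove_vecs are typically compared with cosine
# similarity. The GloVe filename below is hypothetical.
#
#     words, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')
#     u = word_to_vec_map['father']
#     v = word_to_vec_map['mother']
#     print(cosine_similarity(u, v))

def cosine_similarity(u, v):
    """Cosine of the angle between u and v: dot(u, v) / (||u|| * ||v||)."""
    dot = np.dot(u, v)
    norm_u = np.sqrt(np.sum(u ** 2))
    norm_v = np.sqrt(np.sum(v ** 2))
    return dot / (norm_u * norm_v)
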
def initialize_parameters(vocab_size, n_h):
    """
    Arguments:
    vocab_size -- size of the vocabulary (input and output layer dimension)
    n_h -- number of hidden units

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- bias vector of shape (vocab_size, 1)
    """

    np.random.seed(3)
    parameters = {}

    # Xavier-style scaling: divide by the square root of each layer's fan-in.
    parameters['W1'] = np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size)
    parameters['b1'] = np.zeros((n_h, 1))
    parameters['W2'] = np.random.randn(vocab_size, n_h) / np.sqrt(n_h)
    parameters['b2'] = np.zeros((vocab_size, 1))

    return parameters


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))  # subtract the max for numerical stability
    return e_x / e_x.sum()
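
# Minimal end-to-end sketch (an illustrative addition, assuming this module is
# run as a script): build the text8 dataset, initialize the two-layer
# parameters, and run one forward pass through relu and softmax. The hidden
# size n_h = 50 is an arbitrary choice for the example.
if __name__ == '__main__':
    data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=10000)
    vocab_size = len(dictionary)
    parameters = initialize_parameters(vocab_size, n_h=50)

    # One-hot encode the first word of the corpus.
    x = np.zeros((vocab_size, 1))
    x[data[0]] = 1

    # Forward pass: hidden layer, then a softmax distribution over the vocabulary.
    h = relu(np.dot(parameters['W1'], x) + parameters['b1'])
    z = np.dot(parameters['W2'], h) + parameters['b2']
    probs = softmax(z)
    print('Predicted word:', reverse_dictionary[int(np.argmax(probs))])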