GitHub Repository: amanchadha/coursera-deep-learning-specialization
Path: blob/master/C5 - Sequence Models/Week 2/Word Vector Representation/w2v_utils.py
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

window_size = 3
vector_dim = 300
epochs = 1000

valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary

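
# Example use (a sketch, not part of the original file): build the integer-encoded
# text8 corpus plus the word<->index lookups. With vocabulary_size=10000, only the
# 9,999 most frequent words keep their own index; every other word maps to
# index 0 ('UNK').
#
#   data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=10000)
#   print(data[:7])                                   # first 7 word indices
#   print([reverse_dictionary[i] for i in data[:7]])  # the corresponding words
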
class SimilarityCallback:
    # Note: vocab_size, reverse_dictionary and validation_model are expected to be
    # defined by the training script before run_sim() is called; this file does
    # not create them.
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        # Score the validation word against every word in the vocabulary, one
        # pair at a time, using the externally defined validation model.
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim

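
# A minimal sketch (an assumption, not part of the original assignment code) of
# the kind of validation_model that _get_sim() above expects: two single-word-index
# inputs and one cosine-similarity output. The helper name build_validation_model
# is hypothetical; the real training script defines its own model.
def build_validation_model(vocab_size, vector_dim):
    embedding = Embedding(vocab_size, vector_dim, name='embedding')
    input_target = Input((1,))
    input_context = Input((1,))
    target = Reshape((vector_dim, 1))(embedding(input_target))
    context = Reshape((vector_dim, 1))(embedding(input_context))
    # Dot with normalize=True yields the cosine similarity of the two embeddings.
    similarity = tf.keras.layers.Dot(axes=1, normalize=True)([target, context])
    return Model(inputs=[input_target, input_context], outputs=similarity)
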
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map

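
# Example use (a sketch, not from the original file): load pretrained GloVe vectors
# and compare two words with cosine similarity. The file name 'glove.6B.50d.txt' is
# an assumption -- any whitespace-separated "word v1 v2 ..." file works.
#
#   words, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')
#   u, v = word_to_vec_map['france'], word_to_vec_map['paris']
#   cos = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
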
def relu(x):
    """
    Compute the relu of x

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x)
    """
    s = np.maximum(0, x)

    return s


def initialize_parameters(vocab_size, n_h):
    """
    Arguments:
    vocab_size -- size of the vocabulary (input/output dimension)
    n_h -- number of units in the hidden layer

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- bias vector of shape (vocab_size, 1)
    """
    np.random.seed(3)
    parameters = {}

    parameters['W1'] = np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size)
    parameters['b1'] = np.zeros((n_h, 1))
    parameters['W2'] = np.random.randn(vocab_size, n_h) / np.sqrt(n_h)
    parameters['b2'] = np.zeros((vocab_size, 1))

    return parameters


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
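
# Quick sanity checks (a sketch, not part of the original file): these helpers
# operate on plain numpy arrays.
#
#   relu(np.array([-2.0, 0.0, 3.0]))    # -> array([0., 0., 3.])
#   softmax(np.array([1.0, 2.0, 3.0]))  # -> approx. [0.09, 0.2447, 0.6652]
#   params = initialize_parameters(vocab_size=10, n_h=4)
#   params['W1'].shape                  # -> (4, 10)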