# Source: y33-j3T/Coursera-Deep-Learning
# Path: blob/master/Natural Language Processing with Probabilistic Models/Week 4 - Word Embeddings with Neural Networks/utils2.py

import numpy as np
from scipy import linalg
from collections import defaultdict


def sigmoid(z):
    # Logistic sigmoid, applied elementwise to scalars or NumPy arrays
    return 1.0 / (1.0 + np.exp(-z))
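
# Sanity-check sketch (toy input, not part of the original file):
#   >>> float(sigmoid(0))
#   0.5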


def get_idx(words, word2Ind):
    # Collect the vocabulary index of each word, preserving order and duplicates
    idx = []
    for word in words:
        idx.append(word2Ind[word])
    return idx
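
# Usage sketch (toy vocabulary invented for illustration):
#   >>> get_idx(['i', 'am', 'happy'], {'am': 0, 'happy': 1, 'i': 2})
#   [2, 0, 1]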


def pack_idx_with_frequency(context_words, word2Ind):
    # Count how often each word occurs in the context window
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    # Pair each context word's index with that word's frequency
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed
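
# Usage sketch (toy vocabulary invented for illustration); a repeated word
# yields one tuple per occurrence, each carrying the full count:
#   >>> pack_idx_with_frequency(['i', 'am', 'i'], {'am': 0, 'i': 1})
#   [(1, 2), (0, 1), (1, 2)]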


def get_vectors(data, word2Ind, V, C):
    # Yield (context, center) training pairs indefinitely, wrapping around at
    # the end of the data. x is the length-V bag-of-words distribution of the
    # C words on each side of the center word; y is the one-hot center word.
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[(i - C):i] + data[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / num_ctx_words
        yield x, y
        i += 1
        if i >= len(data):
            print('i is being set to 0')
            i = 0
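
# Usage sketch (toy corpus invented for illustration; get_dict is defined
# later in this file):
#   >>> data = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
#   >>> word2Ind, Ind2word = get_dict(data)
#   >>> x, y = next(get_vectors(data, word2Ind, len(word2Ind), C=2))
#   >>> int(np.argmax(y)) == word2Ind['happy']  # first center word is data[C]
#   True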


def get_batches(data, word2Ind, V, C, batch_size):
    # Accumulate examples from get_vectors until a full batch is ready,
    # then yield it with examples as columns and start a fresh batch
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
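
# Usage sketch (toy corpus invented for illustration); each yield is a pair
# of (V, batch_size) arrays:
#   >>> data = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
#   >>> word2Ind, Ind2word = get_dict(data)
#   >>> x, y = next(get_batches(data, word2Ind, len(word2Ind), 2, 4))
#   >>> x.shape, y.shape
#   ((5, 4), (5, 4))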


def compute_pca(data, n_components=2):
    """
    Input:
        data: 2D NumPy array of dimension (m, n) where each row is a word vector
        n_components: number of principal components to keep
    Output:
        X_reduced: the data projected onto the first n_components principal
            components, of dimension (m, n_components)
    """
    m, n = data.shape

    # Mean-center the data (as a copy, so the caller's array is untouched)
    data = data - data.mean(axis=0)
    # Covariance matrix of the features
    R = np.cov(data, rowvar=False)
    # Eigendecomposition of the covariance matrix; use 'eigh' rather than
    # 'eig' since R is symmetric -- the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # Indices that sort the eigenvalues in decreasing order
    idx = np.argsort(evals)[::-1]
    # Reorder eigenvectors and eigenvalues with the same indices
    evecs = evecs[:, idx]
    evals = evals[idx]
    # Keep only the first n_components eigenvectors
    evecs = evecs[:, :n_components]
    # Project the centered data onto the principal components
    return np.dot(evecs.T, data.T).T
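
# Usage sketch (random toy input invented for illustration):
#   >>> X = np.random.rand(10, 5)
#   >>> compute_pca(X, n_components=2).shape
#   (10, 2)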


def get_dict(data):
    """
    Input:
        data: a list of word tokens
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index back to its word
    """
    # Build a sorted vocabulary and number its words consecutively
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word
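
# Usage sketch (toy corpus invented for illustration); indices follow
# sorted word order:
#   >>> word2Ind, Ind2word = get_dict(['i', 'am', 'happy'])
#   >>> word2Ind
#   {'am': 0, 'happy': 1, 'i': 2}
#   >>> Ind2word[2]
#   'i'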