Path: blob/master/Natural Language Processing with Probabilistic Models/Week 4 - Word Embeddings with Neural Networks/utils2.py
import numpy as np
from scipy import linalg
from collections import defaultdict


def sigmoid(z):
    # logistic sigmoid: squashes z into the interval (0, 1)
    return 1.0 / (1.0 + np.exp(-z))


def get_idx(words, word2Ind):
    # map each word to its vocabulary index
    idx = []
    for word in words:
        idx = idx + [word2Ind[word]]
    return idx


def pack_idx_with_frequency(context_words, word2Ind):
    # return a list of (vocabulary index, count) pairs, one per context word
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed


def get_vectors(data, word2Ind, V, C):
    # infinite generator of CBOW training pairs: y is the one-hot vector of
    # the center word, x is the relative-frequency vector of the C words on
    # each side of it; wraps back to the start of the corpus when exhausted
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[(i - C):i] + data[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / num_ctx_words
        yield x, y
        i += 1
        if i >= len(data):
            print('i is being set to 0')
            i = 0


def get_batches(data, word2Ind, V, C, batch_size):
    # collect batch_size (x, y) pairs, then yield them stacked column-wise
    # as a pair of (V, batch_size) arrays
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []


def compute_pca(data, n_components=2):
    """
    Input:
        data: array of dimension (m, n) where each row is a word vector
        n_components: number of principal components to keep
    Output:
        X_reduced: data projected onto the first n_components principal
        components, of dimension (m, n_components)
    """

    m, n = data.shape

    ### START CODE HERE ###
    # mean-center the data (copy, so the caller's array is not modified)
    data = data - data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric --
    # the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort eigenvalues in decreasing order and reorder the eigenvectors
    # with the same indices
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # keep the first n_components eigenvectors
    evecs = evecs[:, :n_components]
    ### END CODE HERE ###
    return np.dot(evecs.T, data.T).T


def get_dict(data):
    """
    Input:
        data: the corpus as a list of tokens
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index to its word
    """
    # words = nltk.word_tokenize(data)
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word
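A minimal usage sketch (not part of the original course file), assuming a tiny tokenized corpus: it builds the vocabulary with get_dict, draws one CBOW training batch with get_batches, and projects random stand-in embeddings to 2D with compute_pca. The corpus, seed, window size, and batch size below are illustrative only.

if __name__ == "__main__":
    # toy corpus; any list of tokens works (hypothetical example data)
    corpus = "i like learning nlp because i like words".split()
    word2Ind, Ind2word = get_dict(corpus)
    V = len(word2Ind)   # vocabulary size
    C = 2               # context half-window

    # one training batch: columns of x are context-frequency vectors,
    # columns of y are one-hot center words
    x, y = next(get_batches(corpus, word2Ind, V, C, batch_size=4))
    print(x.shape, y.shape)   # (V, 4) (V, 4)

    # random stand-in embeddings, just to exercise compute_pca
    rng = np.random.default_rng(0)
    embeddings = rng.standard_normal((V, 10))
    print(compute_pca(embeddings, n_components=2).shape)   # (V, 2)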