Path: blob/master/Natural Language Processing with Probabilistic Models/Week 4 - Word Embeddings with Neural Networks/utils2.py
import numpy as np
from scipy import linalg
from collections import defaultdict


def sigmoid(z):
    # logistic sigmoid: squashes z into the interval (0, 1)
    return 1.0 / (1.0 + np.exp(-z))


def get_idx(words, word2Ind):
    # map each word to its vocabulary index
    idx = []
    for word in words:
        idx = idx + [word2Ind[word]]
    return idx


def pack_idx_with_frequency(context_words, word2Ind):
    # return a list of (vocabulary index, count) pairs, one per context word
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed


def get_vectors(data, word2Ind, V, C):
    # infinite generator of CBOW training pairs: y is the one-hot vector of
    # the center word, x is the relative-frequency vector of the C words on
    # each side of it; wraps back to the start of the corpus when exhausted
    i = C
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[(i - C):i] + data[(i + 1):(i + C + 1)]
        num_ctx_words = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / num_ctx_words
        yield x, y
        i += 1
        if i >= len(data):
            print('i is being set to 0')
            i = 0


def get_batches(data, word2Ind, V, C, batch_size):
    # collect batch_size (x, y) pairs, then yield them stacked column-wise
    # as a pair of (V, batch_size) arrays
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) == batch_size:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []


def compute_pca(data, n_components=2):
    """
    Input:
        data: array of dimension (m, n) where each row is a word vector
        n_components: number of principal components to keep
    Output:
        X_reduced: data projected onto the first n_components principal
        components, of dimension (m, n_components)
    """

    m, n = data.shape

    ### START CODE HERE ###
    # mean-center the data (copy, so the caller's array is not modified)
    data = data - data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric --
    # the performance gain is substantial
    evals, evecs = linalg.eigh(R)
    # sort eigenvalues in decreasing order and reorder the eigenvectors
    # with the same indices
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    # keep the first n_components eigenvectors
    evecs = evecs[:, :n_components]
    ### END CODE HERE ###
    return np.dot(evecs.T, data.T).T


def get_dict(data):
    """
    Input:
        data: the corpus as a list of tokens
    Output:
        word2Ind: dictionary mapping each word to its index
        Ind2word: dictionary mapping each index to its word
    """
    # words = nltk.word_tokenize(data)
    words = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    for idx, k in enumerate(words):
        word2Ind[k] = idx
        Ind2word[idx] = k
    return word2Ind, Ind2word
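A minimal usage sketch (not part of the original course file), assuming a tiny tokenized corpus: it builds the vocabulary with get_dict, draws one CBOW training batch with get_batches, and projects random stand-in embeddings to 2D with compute_pca. The corpus, seed, window size, and batch size below are illustrative only.

if __name__ == "__main__":
    # toy corpus; any list of tokens works (hypothetical example data)
    corpus = "i like learning nlp because i like words".split()
    word2Ind, Ind2word = get_dict(corpus)
    V = len(word2Ind)   # vocabulary size
    C = 2               # context half-window

    # one training batch: columns of x are context-frequency vectors,
    # columns of y are one-hot center words
    x, y = next(get_batches(corpus, word2Ind, V, C, batch_size=4))
    print(x.shape, y.shape)   # (V, 4) (V, 4)

    # random stand-in embeddings, just to exercise compute_pca
    rng = np.random.default_rng(0)
    embeddings = rng.standard_normal((V, 10))
    print(compute_pca(embeddings, n_components=2).shape)   # (V, 2)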