CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
y33-j3T

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Sequence Models/Week 3 - LSTMs and Named Entity Recognition/utils.py
Views: 13373
1
def get_vocab(vocab_path, tags_path):
    """Load the token vocabulary and the tag map from two text files.

    Both files are read as one entry per line, and each entry is mapped to
    its zero-based line number. If an entry appears on several lines, the
    index of its last occurrence is kept.

    Args:
        vocab_path: Path to the vocabulary file (one token per line).
        tags_path: Path to the tags file (one tag per line).

    Returns:
        A ``(vocab, tag_map)`` tuple of dicts: ``vocab`` maps token -> index
        and ``tag_map`` maps tag -> index. ``vocab`` also contains the key
        ``'<PAD>'``, mapped to the number of distinct tokens read from the
        vocabulary file.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    # The encoding is given explicitly so that non-ASCII tokens are decoded
    # the same way on every platform instead of depending on the locale.
    with open(vocab_path, encoding="utf-8") as f:
        vocab = {token: idx for idx, token in enumerate(f.read().splitlines())}

    # The padding token takes the first index after the tokens from the file.
    vocab['<PAD>'] = len(vocab)

    # Tags are loaded so that label strings can be mapped to their indices.
    with open(tags_path, encoding="utf-8") as f:
        tag_map = {tag: idx for idx, tag in enumerate(f.read().splitlines())}

    return vocab, tag_map
14
15
def get_params(vocab, tag_map, sentences_file, labels_file):
    """Convert sentence and label files into lists of integer indices.

    Each line of ``sentences_file`` is split on single spaces and every token
    is replaced by its index in ``vocab``; tokens not present in ``vocab`` are
    replaced by ``vocab['UNK']``. Each line of ``labels_file`` is split on
    single spaces and every label is replaced by its index in ``tag_map``.

    Args:
        vocab: Dict mapping token -> index. It must contain the key ``'UNK'``
            if any token in ``sentences_file`` is missing from it.
        tag_map: Dict mapping label -> index.
        sentences_file: Path to a file with one space-separated sentence per
            line.
        labels_file: Path to a file with one space-separated label sequence
            per line.

    Returns:
        A ``(sentences, labels, size)`` tuple where ``sentences`` and
        ``labels`` are lists of lists of indices (one inner list per line)
        and ``size`` is ``len(sentences)``.

    Raises:
        KeyError: If an out-of-vocabulary token is met and ``vocab`` has no
            ``'UNK'`` entry, or if a label is missing from ``tag_map``.
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    sentences = []
    labels = []

    # The encoding is given explicitly so that non-ASCII tokens are decoded
    # the same way on every platform instead of depending on the locale.
    with open(sentences_file, encoding="utf-8") as f:
        for line in f.read().splitlines():
            # 'UNK' is looked up lazily: it is only required when an
            # out-of-vocabulary token actually occurs.
            indices = [vocab[token] if token in vocab else vocab['UNK']
                       for token in line.split(' ')]
            sentences.append(indices)

    with open(labels_file, encoding="utf-8") as f:
        for line in f.read().splitlines():
            # Every label must be present in tag_map; there is no fallback.
            labels.append([tag_map[label] for label in line.split(' ')])

    return sentences, labels, len(sentences)
34
35