Path: blob/master/Natural Language Processing with Sequence Models/Week 3 - LSTMs and Named Entity Recognition/utils.py
14483 views
def _read_lines(path):
    """Return the lines of the text file at *path*, without line endings."""
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # vocabulary/sentence files may contain non-ASCII tokens.
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()


def get_vocab(vocab_path, tags_path):
    """Load the word vocabulary and the tag map from two text files.

    Each file holds one entry per line; an entry's index is its 0-based
    line number. If an entry occurs more than once, the last occurrence's
    index wins.

    Args:
        vocab_path: Path to the vocabulary file (one token per line).
        tags_path: Path to the tags file (one tag per line).

    Returns:
        A ``(vocab, tag_map)`` tuple of dicts mapping token -> index and
        tag -> index. ``vocab`` additionally contains ``'<PAD>'``, mapped
        to the number of distinct tokens read from the vocabulary file.
    """
    vocab = {word: index for index, word in enumerate(_read_lines(vocab_path))}
    # The padding token is appended after all file entries, so it takes the
    # next free index (assuming the file has no duplicate lines).
    vocab['<PAD>'] = len(vocab)

    # Tags are needed to map label strings to their indices.
    tag_map = {tag: index for index, tag in enumerate(_read_lines(tags_path))}

    return vocab, tag_map


def get_params(vocab, tag_map, sentences_file, labels_file):
    """Convert sentence and label files into lists of index sequences.

    Each line of both files is split on single spaces. Tokens missing from
    ``vocab`` are replaced by the index of ``'UNK'``; labels are mapped
    through ``tag_map`` as-is (no offset is applied).

    Args:
        vocab: Mapping token -> index; must contain ``'UNK'`` if any token
            in ``sentences_file`` is not in the vocabulary.
        tag_map: Mapping tag -> index.
        sentences_file: Path to a file with one space-separated sentence
            per line.
        labels_file: Path to a file with one space-separated label
            sequence per line.

    Returns:
        A ``(sentences, labels, size)`` tuple: two lists of lists of int
        indices, and the number of sentences read.

    Raises:
        KeyError: If an unknown token is met and ``'UNK'`` is not in
            ``vocab``, or if a label is not in ``tag_map``.
    """
    sentences = []
    for line in _read_lines(sentences_file):
        # 'UNK' is looked up lazily so a vocabulary without it still works
        # as long as every token is known.
        sentences.append([
            vocab[token] if token in vocab else vocab['UNK']
            for token in line.split(' ')
        ])

    labels = []
    for line in _read_lines(labels_file):
        labels.append([tag_map[label] for label in line.split(' ')])

    return sentences, labels, len(sentences)