""" A neural chatbot using sequence to sequence model with
attentional decoder.

This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/

Sequence to sequence model by Cho et al. (2014)

Created by Chip Huyen ([email protected])
CS20: "TensorFlow for Deep Learning Research"
cs20.stanford.edu

This file contains the code to do the pre-processing for the
Cornell Movie-Dialogs Corpus.

See readme.md for instructions on how to run the starter code.
"""
import os
import random
import re

import numpy as np

import config

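# A short map of the pipeline below: prepare_raw_data() turns the raw corpus
# into train/test question and answer files, process_data() builds the
# vocabularies and rewrites each line as space-separated token ids, and
# load_data() / get_batch() are used at training time to bucket the id files
# and to assemble padded, batch-major batches with decoder masks.
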
def get_lines():
    id2line = {}
    file_path = os.path.join(config.DATA_PATH, config.LINE_FILE)
    print(config.LINE_FILE)
    with open(file_path, 'r', errors='ignore') as f:
        # lines = f.readlines()
        # for line in lines:
        i = 0
        try:
            for line in f:
                parts = line.split(' +++$+++ ')
                if len(parts) == 5:
                    if parts[4][-1] == '\n':
                        parts[4] = parts[4][:-1]
                    id2line[parts[0]] = parts[4]
                i += 1
        except UnicodeDecodeError:
            print(i, line)
    return id2line

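# Illustrative sketch of what get_lines() consumes and produces, assuming the
# standard Cornell Movie-Dialogs layout of movie_lines.txt (five fields
# separated by ' +++$+++ ': line id, character id, movie id, character name,
# utterance). A raw line such as
#
#   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
#
# would yield the entry id2line['L1045'] == 'They do not!'.
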
def get_convos():
    """ Get conversations from the raw data """
    file_path = os.path.join(config.DATA_PATH, config.CONVO_FILE)
    convos = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            parts = line.split(' +++$+++ ')
            if len(parts) == 4:
                convo = []
                for line in parts[3][1:-2].split(', '):
                    convo.append(line[1:-1])
                convos.append(convo)

    return convos

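# Illustrative sketch of the parsing in get_convos(), assuming the standard
# layout of movie_conversations.txt (four ' +++$+++ ' separated fields, the
# last one a string-formatted list of line ids). A raw line such as
#
#   u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
#
# would be parsed into the conversation ['L194', 'L195', 'L196', 'L197'].
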
def question_answers(id2line, convos):
    """ Divide the dataset into two sets: questions and answers. """
    questions, answers = [], []
    for convo in convos:
        for index, line in enumerate(convo[:-1]):
            questions.append(id2line[convo[index]])
            answers.append(id2line[convo[index + 1]])
    assert len(questions) == len(answers)
    return questions, answers

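# Sketch of the pairing done by question_answers(): a conversation
# ['L194', 'L195', 'L196'] contributes the (question, answer) pairs
# (id2line['L194'], id2line['L195']) and (id2line['L195'], id2line['L196']),
# i.e. every utterance is treated as the answer to the utterance before it.
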
def prepare_dataset(questions, answers):
    # create path to store all the train & test encoder & decoder
    make_dir(config.PROCESSED_PATH)

    # randomly sample question indices to hold out as the test set
    # (a set makes the membership test below O(1))
    test_ids = set(random.sample(range(len(questions)), config.TESTSET_SIZE))

    filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
    files = []
    for filename in filenames:
        files.append(open(os.path.join(config.PROCESSED_PATH, filename), 'w'))

    for i in range(len(questions)):
        if i in test_ids:
            files[2].write(questions[i] + '\n')
            files[3].write(answers[i] + '\n')
        else:
            files[0].write(questions[i] + '\n')
            files[1].write(answers[i] + '\n')

    for file in files:
        file.close()

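# prepare_dataset() leaves four plain-text files in config.PROCESSED_PATH:
# train.enc / train.dec with the question/answer pairs kept for training, and
# test.enc / test.dec with the config.TESTSET_SIZE randomly held-out pairs;
# line i of an *.enc file is answered by line i of the matching *.dec file.
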
def make_dir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass

def basic_tokenizer(line, normalize_digits=True):
    """ A basic tokenizer to tokenize text into tokens.
    Feel free to change this to suit your needs. """
    line = re.sub('<u>', '', line)
    line = re.sub('</u>', '', line)
    line = re.sub(r'\[', '', line)
    line = re.sub(r'\]', '', line)
    words = []
    _WORD_SPLIT = re.compile("([.,!?\"'-<>:;)(])")
    _DIGIT_RE = re.compile(r"\d")
    for fragment in line.strip().lower().split():
        for token in re.split(_WORD_SPLIT, fragment):
            if not token:
                continue
            if normalize_digits:
                token = re.sub(_DIGIT_RE, '#', token)
            words.append(token)
    return words

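# A quick sketch of what basic_tokenizer() returns (no change to the code above
# is assumed):
#
#   basic_tokenizer("Hello, world!") -> ['hello', ',', 'world', '!']
#
# Note that the unescaped '-' inside _WORD_SPLIT makes "'-<" a character range,
# so digit characters are split into separate tokens before being rewritten as
# '#' when normalize_digits is True.
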
def build_vocab(filename, normalize_digits=True):
    in_path = os.path.join(config.PROCESSED_PATH, filename)
    out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-3:]))

    vocab = {}
    with open(in_path, 'r') as f:
        for line in f.readlines():
            for token in basic_tokenizer(line):
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1

    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    with open(out_path, 'w') as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write(r'<\s>' + '\n')
        index = 4
        for word in sorted_vocab:
            if vocab[word] < config.THRESHOLD:
                break
            f.write(word + '\n')
            index += 1
        with open('config.py', 'a') as cf:
            if filename[-3:] == 'enc':
                cf.write('ENC_VOCAB = ' + str(index) + '\n')
            else:
                cf.write('DEC_VOCAB = ' + str(index) + '\n')

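# The vocab.enc / vocab.dec files written by build_vocab() are plain text with
# one token per line, most frequent first, and the four reserved symbols at the
# top, e.g.:
#
#   <pad>
#   <unk>
#   <s>
#   <\s>
#   ... corpus tokens with count >= config.THRESHOLD, in decreasing frequency ...
#
# The final count (4 reserved symbols + kept tokens) is appended to config.py
# as ENC_VOCAB or DEC_VOCAB.
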
def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        words = f.read().splitlines()
    return words, {words[i]: i for i in range(len(words))}

def sentence2id(vocab, line):
    return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]

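# Sketch of sentence2id() with a toy vocabulary (a hypothetical token->id
# mapping, not the real one built from the corpus):
#
#   vocab = {'<pad>': 0, '<unk>': 1, '<s>': 2, '<\s>': 3, 'hello': 4, ',': 5}
#   sentence2id(vocab, 'Hello, stranger!') -> [4, 5, 1, 1]
#
# 'stranger' and '!' are not in the toy vocabulary, so both map to '<unk>'.
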
def token2id(data, mode):
    """ Convert all the tokens in the data into their corresponding
    index in the vocabulary. """
    vocab_path = 'vocab.' + mode
    in_path = data + '.' + mode
    out_path = data + '_ids.' + mode

    _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
    in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'r')
    out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'w')

    lines = in_file.read().splitlines()
    for line in lines:
        if mode == 'dec':  # we only care about '<s>' and '<\s>' in the decoder
            ids = [vocab['<s>']]
        else:
            ids = []
        ids.extend(sentence2id(vocab, line))
        # ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
        if mode == 'dec':
            ids.append(vocab[r'<\s>'])
        out_file.write(' '.join(str(id_) for id_ in ids) + '\n')

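# Sketch of the id files produced by token2id(): with the vocab files above,
# '<s>' maps to id 2 and '<\s>' to id 3, so a line of train_ids.dec looks like
#
#   2 87 6 451 3
#
# (the middle ids are made up for illustration), while the *_ids.enc files
# contain only the sentence ids with no start/end symbols.
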
def prepare_raw_data():
    print('Preparing raw data into train set and test set ...')
    id2line = get_lines()
    convos = get_convos()
    questions, answers = question_answers(id2line, convos)
    prepare_dataset(questions, answers)

def process_data():
    print('Preparing data to be model-ready ...')
    build_vocab('train.enc')
    build_vocab('train.dec')
    token2id('train', 'enc')
    token2id('train', 'dec')
    token2id('test', 'enc')
    token2id('test', 'dec')

def load_data(enc_filename, dec_filename, max_training_size=None):
    encode_file = open(os.path.join(config.PROCESSED_PATH, enc_filename), 'r')
    decode_file = open(os.path.join(config.PROCESSED_PATH, dec_filename), 'r')
    encode, decode = encode_file.readline(), decode_file.readline()
    data_buckets = [[] for _ in config.BUCKETS]
    i = 0
    while encode and decode:
        if (i + 1) % 10000 == 0:
            print("Bucketing conversation number", i)
        encode_ids = [int(id_) for id_ in encode.split()]
        decode_ids = [int(id_) for id_ in decode.split()]
        for bucket_id, (encode_max_size, decode_max_size) in enumerate(config.BUCKETS):
            if len(encode_ids) <= encode_max_size and len(decode_ids) <= decode_max_size:
                data_buckets[bucket_id].append([encode_ids, decode_ids])
                break
        encode, decode = encode_file.readline(), decode_file.readline()
        i += 1
    return data_buckets

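# A small sketch of the bucketing above, assuming (hypothetically) that
# config.BUCKETS = [(8, 10), (12, 14), (16, 19)]: a pair with 7 encoder ids and
# 9 decoder ids lands in bucket 0, a 10/9 pair falls through to bucket 1, and a
# pair longer than (16, 19) in either direction matches no bucket and is
# silently dropped.
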
def _pad_input(input_, size):
    return input_ + [config.PAD_ID] * (size - len(input_))

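# Sketch, assuming config.PAD_ID is the index of '<pad>' (0 in the vocab files
# written above): _pad_input([45, 12, 9], 6) -> [45, 12, 9, 0, 0, 0]
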
def _reshape_batch(inputs, size, batch_size):
    """ Create batch-major inputs. Batch inputs are just re-indexed inputs
    """
    batch_inputs = []
    for length_id in range(size):
        batch_inputs.append(np.array([inputs[batch_id][length_id]
                                      for batch_id in range(batch_size)], dtype=np.int32))
    return batch_inputs


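# Sketch of the re-indexing done by _reshape_batch(): two padded inputs of
# length 3, inputs = [[1, 2, 3], [4, 5, 6]], become the time-major list
# [array([1, 4]), array([2, 5]), array([3, 6])], i.e. one int32 array per
# timestep, each of shape (batch_size,).
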
def get_batch(data_bucket, bucket_id, batch_size=1):
    """ Return one batch to feed into the model """
    # only pad to the max length of the bucket
    encoder_size, decoder_size = config.BUCKETS[bucket_id]
    encoder_inputs, decoder_inputs = [], []

    for _ in range(batch_size):
        encoder_input, decoder_input = random.choice(data_bucket)
        # pad both encoder and decoder, reverse the encoder
        encoder_inputs.append(list(reversed(_pad_input(encoder_input, encoder_size))))
        decoder_inputs.append(_pad_input(decoder_input, decoder_size))

    # now we create batch-major vectors from the data selected above.
    batch_encoder_inputs = _reshape_batch(encoder_inputs, encoder_size, batch_size)
    batch_decoder_inputs = _reshape_batch(decoder_inputs, decoder_size, batch_size)

    # create decoder_masks to be 0 for decoder positions that are padding.
    batch_masks = []
    for length_id in range(decoder_size):
        batch_mask = np.ones(batch_size, dtype=np.float32)
        for batch_id in range(batch_size):
            # we set mask to 0 if the corresponding target is a PAD symbol.
            # the corresponding target is decoder_input shifted forward by 1.
            if length_id < decoder_size - 1:
                target = decoder_inputs[batch_id][length_id + 1]
            if length_id == decoder_size - 1 or target == config.PAD_ID:
                batch_mask[batch_id] = 0.0
        batch_masks.append(batch_mask)
    return batch_encoder_inputs, batch_decoder_inputs, batch_masks

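# Shape sketch for get_batch(), assuming (hypothetically) that
# config.BUCKETS[bucket_id] == (8, 10) and batch_size == 64: it returns a list
# of 8 encoder arrays and a list of 10 decoder arrays, each of shape (64,),
# plus 10 float32 masks of shape (64,) that zero out positions whose target
# (the next decoder symbol) is padding.
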
if __name__ == '__main__':
    prepare_raw_data()
    process_data()