""" A neural chatbot using a sequence to sequence model with
attentional decoder.

This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/

Sequence to sequence model by Cho et al. (2014)

Created by Chip Huyen as the starter code for assignment 3,
class CS 20SI: "TensorFlow for Deep Learning Research"
cs20si.stanford.edu

This file contains the code to do the pre-processing for the
Cornell Movie-Dialogs Corpus.

See readme.md for instructions on how to run the starter code.
"""
from __future__ import print_function

import os
import random
import re

import numpy as np

import config

def get_lines():
    """ Map each line id in the corpus line file to its utterance text. """
    id2line = {}
    file_path = os.path.join(config.DATA_PATH, config.LINE_FILE)
    with open(file_path, 'rb') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.split(' +++$+++ ')
            if len(parts) == 5:
                if parts[4][-1] == '\n':
                    parts[4] = parts[4][:-1]
                id2line[parts[0]] = parts[4]
    return id2line
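
# Illustrative record from the line file (assuming config.LINE_FILE points at
# movie_lines.txt of the Cornell Movie-Dialogs Corpus):
#   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
# which get_lines() turns into id2line['L1045'] = 'They do not!'.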

def get_convos():
    """ Get conversations from the raw data. """
    file_path = os.path.join(config.DATA_PATH, config.CONVO_FILE)
    convos = []
    with open(file_path, 'rb') as f:
        for line in f.readlines():
            parts = line.split(' +++$+++ ')
            if len(parts) == 4:
                convo = []
                for line in parts[3][1:-2].split(', '):
                    convo.append(line[1:-1])
                convos.append(convo)

    return convos
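
# Illustrative record from the conversation file (assuming config.CONVO_FILE
# points at movie_conversations.txt):
#   u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
# which get_convos() turns into the list ['L194', 'L195', 'L196', 'L197'].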

def question_answers(id2line, convos):
    """ Divide the dataset into two sets: questions and answers. """
    questions, answers = [], []
    for convo in convos:
        for index, line in enumerate(convo[:-1]):
            questions.append(id2line[convo[index]])
            answers.append(id2line[convo[index + 1]])
    assert len(questions) == len(answers)
    return questions, answers
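
# Each conversation of n lines yields n - 1 (question, answer) pairs, e.g. the
# convo ['L194', 'L195', 'L196'] produces the pairs
# (id2line['L194'], id2line['L195']) and (id2line['L195'], id2line['L196']).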

def prepare_dataset(questions, answers):
    # create path to store all the train & test encoder & decoder files
    make_dir(config.PROCESSED_PATH)

    # randomly sample question/answer pairs to create the test set
    test_ids = random.sample([i for i in range(len(questions))], config.TESTSET_SIZE)

    filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
    files = []
    for filename in filenames:
        files.append(open(os.path.join(config.PROCESSED_PATH, filename), 'wb'))

    for i in range(len(questions)):
        if i in test_ids:
            files[2].write(questions[i] + '\n')
            files[3].write(answers[i] + '\n')
        else:
            files[0].write(questions[i] + '\n')
            files[1].write(answers[i] + '\n')

    for file in files:
        file.close()
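
# After prepare_dataset() runs, config.PROCESSED_PATH should contain four
# plain-text files with one utterance per line:
#   train.enc / train.dec -- questions and their matching answers for training
#   test.enc  / test.dec  -- the config.TESTSET_SIZE randomly held-out pairs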

def make_dir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass

def basic_tokenizer(line, normalize_digits=True):
    """ A basic tokenizer to tokenize text into tokens.
    Feel free to change this to suit your needs. """
    line = re.sub('<u>', '', line)
    line = re.sub('</u>', '', line)
    line = re.sub(r'\[', '', line)
    line = re.sub(r'\]', '', line)
    words = []
    _WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])")
    _DIGIT_RE = re.compile(r"\d")
    for fragment in line.strip().lower().split():
        for token in re.split(_WORD_SPLIT, fragment):
            if not token:
                continue
            if normalize_digits:
                token = re.sub(_DIGIT_RE, b'#', token)
            words.append(token)
    return words
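
# Example of the tokenizer's behaviour (illustrative):
#   basic_tokenizer("They do not!")  ->  ['they', 'do', 'not', '!']
# Punctuation is split off into its own tokens and, with normalize_digits=True,
# every digit is replaced by '#'.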

def build_vocab(filename, normalize_digits=True):
    in_path = os.path.join(config.PROCESSED_PATH, filename)
    out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-3:]))

    vocab = {}
    with open(in_path, 'rb') as f:
        for line in f.readlines():
            for token in basic_tokenizer(line):
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1

    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    with open(out_path, 'wb') as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write('</s>' + '\n')
        index = 4
        for word in sorted_vocab:
            if vocab[word] < config.THRESHOLD:
                # record the final vocabulary size in config.py so the model
                # can read it back as ENC_VOCAB / DEC_VOCAB
                with open('config.py', 'ab') as cf:
                    if filename[-3:] == 'enc':
                        cf.write('ENC_VOCAB = ' + str(index) + '\n')
                    else:
                        cf.write('DEC_VOCAB = ' + str(index) + '\n')
                break
            f.write(word + '\n')
            index += 1
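
# The resulting vocab.enc / vocab.dec files start with the four special tokens
# ('<pad>', '<unk>', '<s>', '</s>') followed by real tokens in decreasing
# frequency; anything seen fewer than config.THRESHOLD times is dropped, and
# the cut-off vocabulary size is appended to config.py as ENC_VOCAB / DEC_VOCAB.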

def load_vocab(vocab_path):
    with open(vocab_path, 'rb') as f:
        words = f.read().splitlines()
    return words, {words[i]: i for i in range(len(words))}

def sentence2id(vocab, line):
    return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]
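
# Example with hypothetical ids: if vocab maps 'they' -> 7 and 'do' -> 12 but
# has never seen 'not', sentence2id(vocab, 'They do not!') starts [7, 12, 1, ...],
# where 1 is the id of '<unk>' (out-of-vocabulary tokens fall back to it).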

def token2id(data, mode):
    """ Convert all the tokens in the data into their corresponding
    index in the vocabulary. """
    vocab_path = 'vocab.' + mode
    in_path = data + '.' + mode
    out_path = data + '_ids.' + mode

    _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
    in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'rb')
    out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'wb')

    lines = in_file.read().splitlines()
    for line in lines:
        if mode == 'dec':  # we only care about '<s>' and '</s>' in the decoder
            ids = [vocab['<s>']]
        else:
            ids = []
        ids.extend(sentence2id(vocab, line))
        # ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
        if mode == 'dec':
            ids.append(vocab['</s>'])
        out_file.write(' '.join(str(id_) for id_ in ids) + '\n')
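
# Illustrative output line (the word ids are hypothetical): for the reply
# "They do not!" a line in train_ids.dec might read "2 7 12 44 31 3", where 2
# and 3 are the ids of '<s>' and '</s>' from the vocab file; encoder ('.enc')
# lines carry no start/end markers.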

def prepare_raw_data():
    print('Preparing raw data into train set and test set ...')
    id2line = get_lines()
    convos = get_convos()
    questions, answers = question_answers(id2line, convos)
    prepare_dataset(questions, answers)

def process_data():
    print('Preparing data to be model-ready ...')
    build_vocab('train.enc')
    build_vocab('train.dec')
    token2id('train', 'enc')
    token2id('train', 'dec')
    token2id('test', 'enc')
    token2id('test', 'dec')

def load_data(enc_filename, dec_filename, max_training_size=None):
    encode_file = open(os.path.join(config.PROCESSED_PATH, enc_filename), 'rb')
    decode_file = open(os.path.join(config.PROCESSED_PATH, dec_filename), 'rb')
    encode, decode = encode_file.readline(), decode_file.readline()
    data_buckets = [[] for _ in config.BUCKETS]
    i = 0
    while encode and decode:
        if (i + 1) % 10000 == 0:
            print("Bucketing conversation number", i)
        encode_ids = [int(id_) for id_ in encode.split()]
        decode_ids = [int(id_) for id_ in decode.split()]
        for bucket_id, (encode_max_size, decode_max_size) in enumerate(config.BUCKETS):
            if len(encode_ids) <= encode_max_size and len(decode_ids) <= decode_max_size:
                data_buckets[bucket_id].append([encode_ids, decode_ids])
                break
        encode, decode = encode_file.readline(), decode_file.readline()
        i += 1
    return data_buckets
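
# Each (question, answer) id pair goes into the first bucket it fits into; e.g.
# with a hypothetical config.BUCKETS = [(8, 10), (12, 14), (16, 19)], a pair
# with 9 encoder ids and 11 decoder ids lands in data_buckets[1]. Pairs longer
# than every bucket are silently dropped.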

def _pad_input(input_, size):
    return input_ + [config.PAD_ID] * (size - len(input_))
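
# Example, assuming config.PAD_ID is the id of '<pad>' (0 in the vocab files):
#   _pad_input([5, 9, 4], 6)  ->  [5, 9, 4, 0, 0, 0]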

def _reshape_batch(inputs, size, batch_size):
    """ Re-index a batch of padded sequences so that there is one array per
    time step (each of length batch_size), the layout the seq2seq model is fed.
    """
    batch_inputs = []
    for length_id in range(size):
        batch_inputs.append(np.array([inputs[batch_id][length_id]
                                      for batch_id in range(batch_size)], dtype=np.int32))
    return batch_inputs
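
# Example of the re-indexing (illustrative):
#   _reshape_batch([[1, 2, 3], [4, 5, 6]], size=3, batch_size=2)
#   -> [array([1, 4]), array([2, 5]), array([3, 6])]  (one int32 array per position)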

def get_batch(data_bucket, bucket_id, batch_size=1):
    """ Return one batch to feed into the model. """
    # only pad to the max length of the bucket
    encoder_size, decoder_size = config.BUCKETS[bucket_id]
    encoder_inputs, decoder_inputs = [], []

    for _ in range(batch_size):
        encoder_input, decoder_input = random.choice(data_bucket)
        # pad both encoder and decoder, reverse the encoder
        encoder_inputs.append(list(reversed(_pad_input(encoder_input, encoder_size))))
        decoder_inputs.append(_pad_input(decoder_input, decoder_size))

    # now we re-index the data selected above into one array per time step.
    batch_encoder_inputs = _reshape_batch(encoder_inputs, encoder_size, batch_size)
    batch_decoder_inputs = _reshape_batch(decoder_inputs, decoder_size, batch_size)

    # create decoder_masks to be 0 for decoder steps that are padding.
    batch_masks = []
    for length_id in range(decoder_size):
        batch_mask = np.ones(batch_size, dtype=np.float32)
        for batch_id in range(batch_size):
            # we set mask to 0 if the corresponding target is a PAD symbol.
            # the corresponding target is decoder_input shifted by 1 forward.
            if length_id < decoder_size - 1:
                target = decoder_inputs[batch_id][length_id + 1]
            if length_id == decoder_size - 1 or target == config.PAD_ID:
                batch_mask[batch_id] = 0.0
        batch_masks.append(batch_mask)
    return batch_encoder_inputs, batch_decoder_inputs, batch_masks
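
# Sketch of how these helpers fit together (the filenames follow token2id's
# naming; bucket index and batch size are illustrative):
#   buckets = load_data('train_ids.enc', 'train_ids.dec')
#   enc_batch, dec_batch, masks = get_batch(buckets[0], bucket_id=0, batch_size=64)
# enc_batch, dec_batch and masks are lists of length encoder_size, decoder_size
# and decoder_size respectively, each entry an array of shape (batch_size,).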

if __name__ == '__main__':
    prepare_raw_data()
    process_data()