""" A neural chatbot using a sequence to sequence model with
attentional decoder.

This is based on Google Translate Tensorflow model
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/

Sequence to sequence model by Cho et al. (2014)

Created by Chip Huyen as the starter code for assignment 3,
class CS 20SI: "TensorFlow for Deep Learning Research"
cs20si.stanford.edu

This file contains the code to do the pre-processing for the
Cornell Movie-Dialogs Corpus.

See readme.md for instructions on how to run the starter code.
"""
from __future__ import print_function

import os
import random
import re

import numpy as np

import config

def get_lines():
    """ Map each line id in the corpus line file to its utterance text. """
    id2line = {}
    file_path = os.path.join(config.DATA_PATH, config.LINE_FILE)
    with open(file_path, 'rb') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.split(' +++$+++ ')
            if len(parts) == 5:
                if parts[4][-1] == '\n':
                    parts[4] = parts[4][:-1]
                id2line[parts[0]] = parts[4]
    return id2line
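
# Illustrative record from the line file (assuming config.LINE_FILE points at
# movie_lines.txt of the Cornell Movie-Dialogs Corpus):
#   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
# which get_lines() turns into id2line['L1045'] = 'They do not!'.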

def get_convos():
    """ Get conversations from the raw data. """
    file_path = os.path.join(config.DATA_PATH, config.CONVO_FILE)
    convos = []
    with open(file_path, 'rb') as f:
        for line in f.readlines():
            parts = line.split(' +++$+++ ')
            if len(parts) == 4:
                convo = []
                for line in parts[3][1:-2].split(', '):
                    convo.append(line[1:-1])
                convos.append(convo)

    return convos
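
# Illustrative record from the conversation file (assuming config.CONVO_FILE
# points at movie_conversations.txt):
#   u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
# which get_convos() turns into the list ['L194', 'L195', 'L196', 'L197'].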

def question_answers(id2line, convos):
    """ Divide the dataset into two sets: questions and answers. """
    questions, answers = [], []
    for convo in convos:
        for index, line in enumerate(convo[:-1]):
            questions.append(id2line[convo[index]])
            answers.append(id2line[convo[index + 1]])
    assert len(questions) == len(answers)
    return questions, answers
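
# Each conversation of n lines yields n - 1 (question, answer) pairs, e.g. the
# convo ['L194', 'L195', 'L196'] produces the pairs
# (id2line['L194'], id2line['L195']) and (id2line['L195'], id2line['L196']).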

def prepare_dataset(questions, answers):
    # create path to store all the train & test encoder & decoder files
    make_dir(config.PROCESSED_PATH)

    # randomly sample question/answer pairs to create the test set
    test_ids = random.sample([i for i in range(len(questions))], config.TESTSET_SIZE)

    filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
    files = []
    for filename in filenames:
        files.append(open(os.path.join(config.PROCESSED_PATH, filename), 'wb'))

    for i in range(len(questions)):
        if i in test_ids:
            files[2].write(questions[i] + '\n')
            files[3].write(answers[i] + '\n')
        else:
            files[0].write(questions[i] + '\n')
            files[1].write(answers[i] + '\n')

    for file in files:
        file.close()
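
# After prepare_dataset() runs, config.PROCESSED_PATH should contain four
# plain-text files with one utterance per line:
#   train.enc / train.dec -- questions and their matching answers for training
#   test.enc  / test.dec  -- the config.TESTSET_SIZE randomly held-out pairs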

def make_dir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass

def basic_tokenizer(line, normalize_digits=True):
    """ A basic tokenizer to tokenize text into tokens.
    Feel free to change this to suit your needs. """
    line = re.sub('<u>', '', line)
    line = re.sub('</u>', '', line)
    line = re.sub(r'\[', '', line)
    line = re.sub(r'\]', '', line)
    words = []
    _WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])")
    _DIGIT_RE = re.compile(r"\d")
    for fragment in line.strip().lower().split():
        for token in re.split(_WORD_SPLIT, fragment):
            if not token:
                continue
            if normalize_digits:
                token = re.sub(_DIGIT_RE, b'#', token)
            words.append(token)
    return words
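
# Example of the tokenizer's behaviour (illustrative):
#   basic_tokenizer("They do not!")  ->  ['they', 'do', 'not', '!']
# Punctuation is split off into its own tokens and, with normalize_digits=True,
# every digit is replaced by '#'.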

def build_vocab(filename, normalize_digits=True):
    in_path = os.path.join(config.PROCESSED_PATH, filename)
    out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-3:]))

    vocab = {}
    with open(in_path, 'rb') as f:
        for line in f.readlines():
            for token in basic_tokenizer(line):
                if token not in vocab:
                    vocab[token] = 0
                vocab[token] += 1

    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
    with open(out_path, 'wb') as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write('</s>' + '\n')
        index = 4
        for word in sorted_vocab:
            if vocab[word] < config.THRESHOLD:
                # record the final vocabulary size in config.py so the model
                # can read it back as ENC_VOCAB / DEC_VOCAB
                with open('config.py', 'ab') as cf:
                    if filename[-3:] == 'enc':
                        cf.write('ENC_VOCAB = ' + str(index) + '\n')
                    else:
                        cf.write('DEC_VOCAB = ' + str(index) + '\n')
                break
            f.write(word + '\n')
            index += 1
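
# The resulting vocab.enc / vocab.dec files start with the four special tokens
# ('<pad>', '<unk>', '<s>', '</s>') followed by real tokens in decreasing
# frequency; anything seen fewer than config.THRESHOLD times is dropped, and
# the cut-off vocabulary size is appended to config.py as ENC_VOCAB / DEC_VOCAB.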

def load_vocab(vocab_path):
    with open(vocab_path, 'rb') as f:
        words = f.read().splitlines()
    return words, {words[i]: i for i in range(len(words))}

def sentence2id(vocab, line):
    return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]
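
# Example with hypothetical ids: if vocab maps 'they' -> 7 and 'do' -> 12 but
# has never seen 'not', sentence2id(vocab, 'They do not!') starts [7, 12, 1, ...],
# where 1 is the id of '<unk>' (out-of-vocabulary tokens fall back to it).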

def token2id(data, mode):
    """ Convert all the tokens in the data into their corresponding
    index in the vocabulary. """
    vocab_path = 'vocab.' + mode
    in_path = data + '.' + mode
    out_path = data + '_ids.' + mode

    _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
    in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'rb')
    out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'wb')

    lines = in_file.read().splitlines()
    for line in lines:
        if mode == 'dec':  # we only care about '<s>' and '</s>' in the decoder
            ids = [vocab['<s>']]
        else:
            ids = []
        ids.extend(sentence2id(vocab, line))
        # ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
        if mode == 'dec':
            ids.append(vocab['</s>'])
        out_file.write(' '.join(str(id_) for id_ in ids) + '\n')
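
# Illustrative output line (the word ids are hypothetical): for the reply
# "They do not!" a line in train_ids.dec might read "2 7 12 44 31 3", where 2
# and 3 are the ids of '<s>' and '</s>' from the vocab file; encoder ('.enc')
# lines carry no start/end markers.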

def prepare_raw_data():
    print('Preparing raw data into train set and test set ...')
    id2line = get_lines()
    convos = get_convos()
    questions, answers = question_answers(id2line, convos)
    prepare_dataset(questions, answers)

def process_data():
    print('Preparing data to be model-ready ...')
    build_vocab('train.enc')
    build_vocab('train.dec')
    token2id('train', 'enc')
    token2id('train', 'dec')
    token2id('test', 'enc')
    token2id('test', 'dec')

def load_data(enc_filename, dec_filename, max_training_size=None):
    encode_file = open(os.path.join(config.PROCESSED_PATH, enc_filename), 'rb')
    decode_file = open(os.path.join(config.PROCESSED_PATH, dec_filename), 'rb')
    encode, decode = encode_file.readline(), decode_file.readline()
    data_buckets = [[] for _ in config.BUCKETS]
    i = 0
    while encode and decode:
        if (i + 1) % 10000 == 0:
            print("Bucketing conversation number", i)
        encode_ids = [int(id_) for id_ in encode.split()]
        decode_ids = [int(id_) for id_ in decode.split()]
        for bucket_id, (encode_max_size, decode_max_size) in enumerate(config.BUCKETS):
            if len(encode_ids) <= encode_max_size and len(decode_ids) <= decode_max_size:
                data_buckets[bucket_id].append([encode_ids, decode_ids])
                break
        encode, decode = encode_file.readline(), decode_file.readline()
        i += 1
    return data_buckets
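
# Each (question, answer) id pair goes into the first bucket it fits into; e.g.
# with a hypothetical config.BUCKETS = [(8, 10), (12, 14), (16, 19)], a pair
# with 9 encoder ids and 11 decoder ids lands in data_buckets[1]. Pairs longer
# than every bucket are silently dropped.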

def _pad_input(input_, size):
    return input_ + [config.PAD_ID] * (size - len(input_))
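
# Example, assuming config.PAD_ID is the id of '<pad>' (0 in the vocab files):
#   _pad_input([5, 9, 4], 6)  ->  [5, 9, 4, 0, 0, 0]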

def _reshape_batch(inputs, size, batch_size):
    """ Re-index a batch of padded sequences so that there is one array per
    time step (each of length batch_size), the layout the seq2seq model is fed.
    """
    batch_inputs = []
    for length_id in range(size):
        batch_inputs.append(np.array([inputs[batch_id][length_id]
                                      for batch_id in range(batch_size)], dtype=np.int32))
    return batch_inputs
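
# Example of the re-indexing (illustrative):
#   _reshape_batch([[1, 2, 3], [4, 5, 6]], size=3, batch_size=2)
#   -> [array([1, 4]), array([2, 5]), array([3, 6])]  (one int32 array per position)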

def get_batch(data_bucket, bucket_id, batch_size=1):
    """ Return one batch to feed into the model. """
    # only pad to the max length of the bucket
    encoder_size, decoder_size = config.BUCKETS[bucket_id]
    encoder_inputs, decoder_inputs = [], []

    for _ in range(batch_size):
        encoder_input, decoder_input = random.choice(data_bucket)
        # pad both encoder and decoder, reverse the encoder
        encoder_inputs.append(list(reversed(_pad_input(encoder_input, encoder_size))))
        decoder_inputs.append(_pad_input(decoder_input, decoder_size))

    # now we re-index the data selected above into one array per time step.
    batch_encoder_inputs = _reshape_batch(encoder_inputs, encoder_size, batch_size)
    batch_decoder_inputs = _reshape_batch(decoder_inputs, decoder_size, batch_size)

    # create decoder_masks to be 0 for decoder steps that are padding.
    batch_masks = []
    for length_id in range(decoder_size):
        batch_mask = np.ones(batch_size, dtype=np.float32)
        for batch_id in range(batch_size):
            # we set mask to 0 if the corresponding target is a PAD symbol.
            # the corresponding target is decoder_input shifted by 1 forward.
            if length_id < decoder_size - 1:
                target = decoder_inputs[batch_id][length_id + 1]
            if length_id == decoder_size - 1 or target == config.PAD_ID:
                batch_mask[batch_id] = 0.0
        batch_masks.append(batch_mask)
    return batch_encoder_inputs, batch_decoder_inputs, batch_masks
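
# Sketch of how these helpers fit together (the filenames follow token2id's
# naming; bucket index and batch size are illustrative):
#   buckets = load_data('train_ids.enc', 'train_ids.dec')
#   enc_batch, dec_batch, masks = get_batch(buckets[0], bucket_id=0, batch_size=64)
# enc_batch, dec_batch and masks are lists of length encoder_size, decoder_size
# and decoder_size respectively, each entry an array of shape (batch_size,).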

if __name__ == '__main__':
    prepare_raw_data()
    process_data()