CoCalc -- data.py

📚 The CoCalc Library - books, templates and other resources
cocalc-examples / stanford-tensorflow-tutorials / assignments / chatbot / data.py
¹³²⁹²⁸ views
License: OTHER
1
""" A neural chatbot using sequence to sequence model with
2
attentional decoder. 
3

4
This is based on Google Translate Tensorflow model 
5
https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/
6

7
Sequence to sequence model by Cho et al.(2014)
8

9
Created by Chip Huyen ([email protected])
10
CS20: "TensorFlow for Deep Learning Research"
11
cs20.stanford.edu
12

13
This file contains the code to do the pre-processing for the
14
Cornell Movie-Dialogs Corpus.
15

16
See readme.md for instruction on how to run the starter code.
17
"""
18
import os
19
import random
20
import re
21

22
import numpy as np
23

24
import config
25

26
def get_lines():
27
    id2line = {}
28
    file_path = os.path.join(config.DATA_PATH, config.LINE_FILE)
29
    print(config.LINE_FILE)
30
    with open(file_path, 'r', errors='ignore') as f:
31
        # lines = f.readlines()
32
        # for line in lines:
33
        i = 0
34
        try:
35
            for line in f:
36
                parts = line.split(' +++$+++ ')
37
                if len(parts) == 5:
38
                    if parts[4][-1] == '\n':
39
                        parts[4] = parts[4][:-1]
40
                    id2line[parts[0]] = parts[4]
41
                i += 1
42
        except UnicodeDecodeError:
43
            print(i, line)
44
    return id2line
45

46
def get_convos():
47
    """ Get conversations from the raw data """
48
    file_path = os.path.join(config.DATA_PATH, config.CONVO_FILE)
49
    convos = []
50
    with open(file_path, 'r') as f:
51
        for line in f.readlines():
52
            parts = line.split(' +++$+++ ')
53
            if len(parts) == 4:
54
                convo = []
55
                for line in parts[3][1:-2].split(', '):
56
                    convo.append(line[1:-1])
57
                convos.append(convo)
58

59
    return convos
60

61
def question_answers(id2line, convos):
62
    """ Divide the dataset into two sets: questions and answers. """
63
    questions, answers = [], []
64
    for convo in convos:
65
        for index, line in enumerate(convo[:-1]):
66
            questions.append(id2line[convo[index]])
67
            answers.append(id2line[convo[index + 1]])
68
    assert len(questions) == len(answers)
69
    return questions, answers
70

71
def prepare_dataset(questions, answers):
72
    # create path to store all the train & test encoder & decoder
73
    make_dir(config.PROCESSED_PATH)
74
    
75
    # random convos to create the test set
76
    test_ids = random.sample([i for i in range(len(questions))],config.TESTSET_SIZE)
77
    
78
    filenames = ['train.enc', 'train.dec', 'test.enc', 'test.dec']
79
    files = []
80
    for filename in filenames:
81
        files.append(open(os.path.join(config.PROCESSED_PATH, filename),'w'))
82

83
    for i in range(len(questions)):
84
        if i in test_ids:
85
            files[2].write(questions[i] + '\n')
86
            files[3].write(answers[i] + '\n')
87
        else:
88
            files[0].write(questions[i] + '\n')
89
            files[1].write(answers[i] + '\n')
90

91
    for file in files:
92
        file.close()
93

94
def make_dir(path):
95
    """ Create a directory if there isn't one already. """
96
    try:
97
        os.mkdir(path)
98
    except OSError:
99
        pass
100

101
def basic_tokenizer(line, normalize_digits=True):
102
    """ A basic tokenizer to tokenize text into tokens.
103
    Feel free to change this to suit your need. """
104
    line = re.sub('<u>', '', line)
105
    line = re.sub('</u>', '', line)
106
    line = re.sub('\[', '', line)
107
    line = re.sub('\]', '', line)
108
    words = []
109
    _WORD_SPLIT = re.compile("([.,!?\"'-<>:;)(])")
110
    _DIGIT_RE = re.compile(r"\d")
111
    for fragment in line.strip().lower().split():
112
        for token in re.split(_WORD_SPLIT, fragment):
113
            if not token:
114
                continue
115
            if normalize_digits:
116
                token = re.sub(_DIGIT_RE, '#', token)
117
            words.append(token)
118
    return words
119

120
def build_vocab(filename, normalize_digits=True):
121
    in_path = os.path.join(config.PROCESSED_PATH, filename)
122
    out_path = os.path.join(config.PROCESSED_PATH, 'vocab.{}'.format(filename[-3:]))
123

124
    vocab = {}
125
    with open(in_path, 'r') as f:
126
        for line in f.readlines():
127
            for token in basic_tokenizer(line):
128
                if not token in vocab:
129
                    vocab[token] = 0
130
                vocab[token] += 1
131

132
    sorted_vocab = sorted(vocab, key=vocab.get, reverse=True)
133
    with open(out_path, 'w') as f:
134
        f.write('<pad>' + '\n')
135
        f.write('<unk>' + '\n')
136
        f.write('<s>' + '\n')
137
        f.write('<\s>' + '\n') 
138
        index = 4
139
        for word in sorted_vocab:
140
            if vocab[word] < config.THRESHOLD:
141
                break
142
            f.write(word + '\n')
143
            index += 1
144
        with open('config.py', 'a') as cf:
145
            if filename[-3:] == 'enc':
146
                cf.write('ENC_VOCAB = ' + str(index) + '\n')
147
            else:
148
                cf.write('DEC_VOCAB = ' + str(index) + '\n')
149

150
def load_vocab(vocab_path):
151
    with open(vocab_path, 'r') as f:
152
        words = f.read().splitlines()
153
    return words, {words[i]: i for i in range(len(words))}
154

155
def sentence2id(vocab, line):
156
    return [vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)]
157

158
def token2id(data, mode):
159
    """ Convert all the tokens in the data into their corresponding
160
    index in the vocabulary. """
161
    vocab_path = 'vocab.' + mode
162
    in_path = data + '.' + mode
163
    out_path = data + '_ids.' + mode
164

165
    _, vocab = load_vocab(os.path.join(config.PROCESSED_PATH, vocab_path))
166
    in_file = open(os.path.join(config.PROCESSED_PATH, in_path), 'r')
167
    out_file = open(os.path.join(config.PROCESSED_PATH, out_path), 'w')
168
    
169
    lines = in_file.read().splitlines()
170
    for line in lines:
171
        if mode == 'dec': # we only care about '<s>' and </s> in encoder
172
            ids = [vocab['<s>']]
173
        else:
174
            ids = []
175
        ids.extend(sentence2id(vocab, line))
176
        # ids.extend([vocab.get(token, vocab['<unk>']) for token in basic_tokenizer(line)])
177
        if mode == 'dec':
178
            ids.append(vocab['<\s>'])
179
        out_file.write(' '.join(str(id_) for id_ in ids) + '\n')
180

181
def prepare_raw_data():
182
    print('Preparing raw data into train set and test set ...')
183
    id2line = get_lines()
184
    convos = get_convos()
185
    questions, answers = question_answers(id2line, convos)
186
    prepare_dataset(questions, answers)
187

188
def process_data():
189
    print('Preparing data to be model-ready ...')
190
    build_vocab('train.enc')
191
    build_vocab('train.dec')
192
    token2id('train', 'enc')
193
    token2id('train', 'dec')
194
    token2id('test', 'enc')
195
    token2id('test', 'dec')
196

197
def load_data(enc_filename, dec_filename, max_training_size=None):
198
    encode_file = open(os.path.join(config.PROCESSED_PATH, enc_filename), 'r')
199
    decode_file = open(os.path.join(config.PROCESSED_PATH, dec_filename), 'r')
200
    encode, decode = encode_file.readline(), decode_file.readline()
201
    data_buckets = [[] for _ in config.BUCKETS]
202
    i = 0
203
    while encode and decode:
204
        if (i + 1) % 10000 == 0:
205
            print("Bucketing conversation number", i)
206
        encode_ids = [int(id_) for id_ in encode.split()]
207
        decode_ids = [int(id_) for id_ in decode.split()]
208
        for bucket_id, (encode_max_size, decode_max_size) in enumerate(config.BUCKETS):
209
            if len(encode_ids) <= encode_max_size and len(decode_ids) <= decode_max_size:
210
                data_buckets[bucket_id].append([encode_ids, decode_ids])
211
                break
212
        encode, decode = encode_file.readline(), decode_file.readline()
213
        i += 1
214
    return data_buckets
215

216
def _pad_input(input_, size):
217
    return input_ + [config.PAD_ID] * (size - len(input_))
218

219
def _reshape_batch(inputs, size, batch_size):
220
    """ Create batch-major inputs. Batch inputs are just re-indexed inputs
221
    """
222
    batch_inputs = []
223
    for length_id in range(size):
224
        batch_inputs.append(np.array([inputs[batch_id][length_id]
225
                                    for batch_id in range(batch_size)], dtype=np.int32))
226
    return batch_inputs
227

228

229
def get_batch(data_bucket, bucket_id, batch_size=1):
230
    """ Return one batch to feed into the model """
231
    # only pad to the max length of the bucket
232
    encoder_size, decoder_size = config.BUCKETS[bucket_id]
233
    encoder_inputs, decoder_inputs = [], []
234

235
    for _ in range(batch_size):
236
        encoder_input, decoder_input = random.choice(data_bucket)
237
        # pad both encoder and decoder, reverse the encoder
238
        encoder_inputs.append(list(reversed(_pad_input(encoder_input, encoder_size))))
239
        decoder_inputs.append(_pad_input(decoder_input, decoder_size))
240

241
    # now we create batch-major vectors from the data selected above.
242
    batch_encoder_inputs = _reshape_batch(encoder_inputs, encoder_size, batch_size)
243
    batch_decoder_inputs = _reshape_batch(decoder_inputs, decoder_size, batch_size)
244

245
    # create decoder_masks to be 0 for decoders that are padding.
246
    batch_masks = []
247
    for length_id in range(decoder_size):
248
        batch_mask = np.ones(batch_size, dtype=np.float32)
249
        for batch_id in range(batch_size):
250
            # we set mask to 0 if the corresponding target is a PAD symbol.
251
            # the corresponding decoder is decoder_input shifted by 1 forward.
252
            if length_id < decoder_size - 1:
253
                target = decoder_inputs[batch_id][length_id + 1]
254
            if length_id == decoder_size - 1 or target == config.PAD_ID:
255
                batch_mask[batch_id] = 0.0
256
        batch_masks.append(batch_mask)
257
    return batch_encoder_inputs, batch_decoder_inputs, batch_masks
258

259
if __name__ == '__main__':
260
    prepare_raw_data()
261
    process_data()
262
Product

Resources

Company