cocalc-examples / data-science-ipython-notebooks / deep-learning / theano-tutorial / rnn_tutorial / lstm_text.py
License: OTHER
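
"""Character-level LSTM language model in Theano.

Trains a single-layer LSTM on character sequences (loaded with Fuel) and
periodically samples text from the model during training.
"""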

from __future__ import print_function

import cPickle as pkl
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding


# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# don't forget to change the paths and gunzip train.txt.gz
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'


def sequence_categorical_crossentropy(prediction, targets, mask):
    # flatten the (time, batch, n_classes) predictions to one row per
    # symbol, compute the per-symbol cross-entropy, and zero out the
    # contribution of padded positions via the mask
    prediction_flat = prediction.reshape(((prediction.shape[0] *
                                           prediction.shape[1]),
                                          prediction.shape[2]), ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)
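

# Illustrative sketch (added for clarity, not part of the original
# tutorial): what the masked cost above computes, in plain numpy.
# `prediction` has shape (time, batch, n_classes); `targets` and `mask`
# have shape (time, batch). Padded positions contribute nothing.
def numpy_sequence_crossentropy(prediction, targets, mask):
    steps, batch, n_classes = prediction.shape
    pred_flat = prediction.reshape(steps * batch, n_classes)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    # negative log-likelihood of the target symbol at every position
    nll = -numpy.log(pred_flat[numpy.arange(steps * batch), targets_flat])
    return numpy.sum(nll * mask_flat)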


def gauss_weight(ndim_in, ndim_out=None, sd=.005):
    # Gaussian-initialized weight matrix with standard deviation `sd`
    if ndim_out is None:
        ndim_out = ndim_in
    W = numpy.random.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute the class-membership probabilities in symbolic form
        # (a numerically stable softmax over the last axis)
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute the prediction as the class whose probability is maximal
        # (the last axis indexes the classes)
        self.y_pred = T.argmax(self.p_y_given_x, axis=-1)

        # parameters of the model
        self.params = [self.W, self.b]


def index_dot(indices, w):
    # row lookup: equivalent to a dot product with one-hot-coded indices
    return w[indices.flatten()]


class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # input-to-hidden weights of the four LSTM gates
        self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True)
        self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True)
        self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True)
        self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True)

        # hidden-to-hidden (recurrent) weights
        self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True)

        # biases
        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        # initial hidden state and cell state, one row per batch element
        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # self.output is the sequence of hidden states, with shape
        # (sequence_length, batch_size, n_h)
        self.output = rval[0]

    def _step(self, m_, x_, h_, c_):

        # input gate
        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        # forget gate
        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        # output gate
        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        # candidate cell state
        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c
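

# For reference (comment added for clarity): each scan step above is the
# standard LSTM recurrence, with index_dot acting as the input-to-hidden
# product for integer-coded (one-hot) character inputs:
#
#   i_t = sigmoid(W_i[x_t] + U_i h_{t-1} + b_i)      (input gate)
#   f_t = sigmoid(W_f[x_t] + U_f h_{t-1} + b_f)      (forget gate)
#   o_t = sigmoid(W_o[x_t] + U_o h_{t-1} + b_o)      (output gate)
#   c_t = f_t * c_{t-1} + i_t * tanh(W_c[x_t] + U_c h_{t-1} + b_c)
#   h_t = o_t * tanh(c_t)
#
# The mask m_t leaves h and c unchanged at padded positions, so padding
# never affects the state that is carried forward.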


def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    with open(DICT_FILE, 'rb') as f:
        dictionary = pkl.load(f)
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # the same for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Seed the random number generator for reproducibility
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    # predict each character from the hidden state after the previous one
    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print('epoch:', epoch)

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text after every 40 minibatches
            if iteration % 40 == 0:
                try:
                    # feed the model a seed string, then repeatedly sample
                    # the next character from its output distribution
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print('LSTM: "' + sentence + '"')
                except ValueError:
                    print('Something went wrong during sentence generation.')

            # Evaluate on the validation set every 40 minibatches
            if iteration % 40 == 0:
                print('epoch:', epoch, ' minibatch:', iteration)
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print('Average validation CE per sentence:',
                      numpy.mean(val_scores))

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()
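

# Example usage (the default paths above are the tutorial author's;
# substitute your own): download and gunzip the files listed at the top,
# point TRAIN_FILE, VAL_FILE and DICT_FILE at them, then run
#
#     python lstm_text.py
#
# or call train_model directly with other settings, e.g.
#
#     train_model(batch_size=50, n_h=100, n_epochs=10)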