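"""Character-level language modelling with an LSTM.

A single-layer LSTM is built directly in Theano and trained with plain SGD
on character sequences streamed and padded by Fuel. Every 40 minibatches
the script samples some text from the model and reports the average
validation cross-entropy.
"""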
from __future__ import print_function

try:
    import cPickle as pkl  # Python 2
except ImportError:
    import pickle as pkl  # Python 3 folded cPickle into pickle
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding

# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# Don't forget to change the paths below and to gunzip train.txt.gz.
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'
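
# The dictionary pickle holds a dict mapping characters to integer codes;
# Fuel's TextFile reads the text files one line at a time, so each line is
# one training example.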


def sequence_categorical_crossentropy(prediction, targets, mask):
    """Sum of the per-step cross-entropies over a padded batch of sequences.

    `prediction` has shape (time, batch, n_classes); `targets` and `mask`
    have shape (time, batch). The mask zeros out the cost at padded steps.
    """
    prediction_flat = prediction.reshape(
        ((prediction.shape[0] * prediction.shape[1]), prediction.shape[2]),
        ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)


def gauss_weight(rng, ndim_in, ndim_out=None, sd=.005):
    """Sample an (ndim_in, ndim_out) weight matrix from a zero-mean
    Gaussian with standard deviation `sd`, drawn from the shared
    RandomState `rng` so that initialization is reproducible."""
    if ndim_out is None:
        ndim_out = ndim_in
    W = rng.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)


class LogisticRegression(object):
    """Multi-class logistic regression class.

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture; here a (time, batch, n_in) tensor of
                      LSTM hidden states

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute the class-membership probabilities in symbolic form: a
        # softmax over the last (class) axis, with the usual max
        # subtraction for numerical stability
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute the prediction as the class whose probability is maximal,
        # taking the argmax over the class axis
        self.y_pred = T.argmax(self.p_y_given_x, axis=-1)

        # parameters of the model
        self.params = [self.W, self.b]


def index_dot(indices, w):
    # Indexing the rows of w is equivalent to a dot product with one-hot
    # vectors, so this doubles as an embedding lookup for integer inputs.
    return w[indices.flatten()]


class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # Initialize the parameters: input-to-gate weights W_*, recurrent
        # weights U_* and biases b_* for the input (i), forget (f),
        # cell (c) and output (o) gates.
        self.W_i = theano.shared(gauss_weight(rng, n_in, n_h), 'W_i',
                                 borrow=True)
        self.W_f = theano.shared(gauss_weight(rng, n_in, n_h), 'W_f',
                                 borrow=True)
        self.W_c = theano.shared(gauss_weight(rng, n_in, n_h), 'W_c',
                                 borrow=True)
        self.W_o = theano.shared(gauss_weight(rng, n_in, n_h), 'W_o',
                                 borrow=True)

        self.U_i = theano.shared(gauss_weight(rng, n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(rng, n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(rng, n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(rng, n_h), 'U_o', borrow=True)

        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        # initial hidden state and cell state, one row per batch element
        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # rval is the pair (hidden states, cell states) produced by scan;
        # self.output holds the hidden states, shaped (time, batch, n_h)
        self.output = rval[0]

    def _step(self, m_, x_, h_, c_):
        # One LSTM step: m_ is the mask for this time step, x_ the integer
        # character indices, h_ and c_ the previous hidden and cell states.

        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        # where the mask is 0 (padding), carry the previous state through
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c
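
# For reference, one scan step above computes the standard LSTM updates
#     i_t = sigmoid(W_i[x_t] + U_i h_{t-1} + b_i)    (input gate)
#     f_t = sigmoid(W_f[x_t] + U_f h_{t-1} + b_f)    (forget gate)
#     o_t = sigmoid(W_o[x_t] + U_o h_{t-1} + b_o)    (output gate)
#     c_t = f_t * c_{t-1} + i_t * tanh(W_c[x_t] + U_c h_{t-1} + b_c)
#     h_t = o_t * tanh(c_t)
# where W_*[x_t] denotes the row lookup performed by index_dot.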


def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the dictionary that maps characters to integer codes and add
    # '~' as the unknown-character token.
    with open(DICT_FILE, 'rb') as f:
        dictionary = pkl.load(f)
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())
    vocab_size = len(dictionary)  # 111 for the original data files

    # Load the datasets with Fuel
    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)
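
    # After Padding, each batch is a pair (x, mask): x holds the padded
    # integer sequences and mask marks the positions that contain real
    # characters rather than padding.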

    # the same treatment for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize the data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Seed the random number generator for reproducible weight
    # initialization
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask,
                                n_in=vocab_size, n_h=n_h)

    # The hidden state at step t predicts the character at step t + 1, so
    # the last state is dropped and the targets are x shifted by one step.
    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=vocab_size)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each one, so the updates list is built by
    # looping over all (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)
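
    # update_model performs one SGD step and returns the minibatch cost;
    # evaluate_model returns the same cost without touching the parameters.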

    # Define and compile a function for generating a sequence step by step:
    # given one character index and the previous (h, c) state, it returns
    # the next-character distribution and the new state. The mask is fixed
    # to ones because generation never involves padding.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])
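
    # single_step takes the previous state as flat length-n_h vectors and
    # returns arrays with a leading axis of size 1 (a dummy batch), which
    # is why the sampling loop below flattens h_p and c_p on every call.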

    start_time = time.time()

    iteration = 0

    for epoch in range(n_epochs):
        print('epoch:', epoch)

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            # Fuel delivers (batch, time) arrays; the model expects
            # (time, batch), hence the transposes.
            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text after every 40 minibatches
            if iteration % 40 == 0:
                try:
                    # start from a uniform distribution over the vocabulary
                    # and an all-zero LSTM state
                    prediction = numpy.ones(vocab_size,
                                            dtype=config.floatX) / vocab_size
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    # feed the seed text through the network one character
                    # at a time
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    # then sample 450 more characters from the model
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print('LSTM: "' + sentence + '"')
                except ValueError:
                    # multinomial occasionally raises ValueError because
                    # the float32 probabilities can sum to slightly more
                    # than 1
                    print('Something went wrong during sentence generation.')

            # ... and report validation performance at the same interval
            if iteration % 40 == 0:
                print('epoch:', epoch, ' minibatch:', iteration)
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print('Average validation CE per sentence:',
                      numpy.mean(val_scores))

    end_time = time.time()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()
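
# A possible way to run this on a GPU in single precision (assuming a
# CUDA-enabled Theano installation; the script name is a placeholder):
#     THEANO_FLAGS='floatX=float32,device=gpu' python lstm_tutorial.py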