cocalc-examples / data-science-ipython-notebooks / deep-learning / theano-tutorial / rnn_tutorial / lstm_text.py
License: OTHER
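
"""Character-level LSTM language model in Theano.

Trains a single-layer LSTM on character sequences (loaded with Fuel) and
periodically samples text from the model during training.
"""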

from __future__ import print_function

import cPickle as pkl
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding


# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# don't forget to change the paths and gunzip train.txt.gz
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'


def sequence_categorical_crossentropy(prediction, targets, mask):
    # flatten the (time, batch, n_classes) predictions to one row per
    # symbol, compute the per-symbol cross-entropy, and zero out the
    # contribution of padded positions via the mask
    prediction_flat = prediction.reshape(((prediction.shape[0] *
                                           prediction.shape[1]),
                                          prediction.shape[2]), ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)
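

# Illustrative sketch (added for clarity, not part of the original
# tutorial): what the masked cost above computes, in plain numpy.
# `prediction` has shape (time, batch, n_classes); `targets` and `mask`
# have shape (time, batch). Padded positions contribute nothing.
def numpy_sequence_crossentropy(prediction, targets, mask):
    steps, batch, n_classes = prediction.shape
    pred_flat = prediction.reshape(steps * batch, n_classes)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    # negative log-likelihood of the target symbol at every position
    nll = -numpy.log(pred_flat[numpy.arange(steps * batch), targets_flat])
    return numpy.sum(nll * mask_flat)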


def gauss_weight(ndim_in, ndim_out=None, sd=.005):
    # Gaussian-initialized weight matrix with standard deviation `sd`
    if ndim_out is None:
        ndim_out = ndim_in
    W = numpy.random.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute the class-membership probabilities in symbolic form
        # (a numerically stable softmax over the last axis)
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute the prediction as the class whose probability is maximal
        # (the last axis indexes the classes)
        self.y_pred = T.argmax(self.p_y_given_x, axis=-1)

        # parameters of the model
        self.params = [self.W, self.b]


def index_dot(indices, w):
    # row lookup: equivalent to a dot product with one-hot-coded indices
    return w[indices.flatten()]


class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # input-to-hidden weights of the four LSTM gates
        self.W_i = theano.shared(gauss_weight(n_in, n_h), 'W_i', borrow=True)
        self.W_f = theano.shared(gauss_weight(n_in, n_h), 'W_f', borrow=True)
        self.W_c = theano.shared(gauss_weight(n_in, n_h), 'W_c', borrow=True)
        self.W_o = theano.shared(gauss_weight(n_in, n_h), 'W_o', borrow=True)

        # hidden-to-hidden (recurrent) weights
        self.U_i = theano.shared(gauss_weight(n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(n_h), 'U_o', borrow=True)

        # biases
        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        # initial hidden state and cell state, one row per batch element
        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # self.output is the sequence of hidden states, with shape
        # (sequence_length, batch_size, n_h)
        self.output = rval[0]

    def _step(self, m_, x_, h_, c_):

        # input gate
        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        # forget gate
        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        # output gate
        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        # candidate cell state
        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c
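

# For reference (comment added for clarity): each scan step above is the
# standard LSTM recurrence, with index_dot acting as the input-to-hidden
# product for integer-coded (one-hot) character inputs:
#
#   i_t = sigmoid(W_i[x_t] + U_i h_{t-1} + b_i)      (input gate)
#   f_t = sigmoid(W_f[x_t] + U_f h_{t-1} + b_f)      (forget gate)
#   o_t = sigmoid(W_o[x_t] + U_o h_{t-1} + b_o)      (output gate)
#   c_t = f_t * c_{t-1} + i_t * tanh(W_c[x_t] + U_c h_{t-1} + b_c)
#   h_t = o_t * tanh(c_t)
#
# The mask m_t leaves h and c unchanged at padded positions, so padding
# never affects the state that is carried forward.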


def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    with open(DICT_FILE, 'rb') as f:
        dictionary = pkl.load(f)
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # the same for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Seed the random number generator for reproducibility
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    # predict each character from the hidden state after the previous one
    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print('epoch:', epoch)

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text after every 40 minibatches
            if iteration % 40 == 0:
                try:
                    # feed the model a seed string, then repeatedly sample
                    # the next character from its output distribution
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print('LSTM: "' + sentence + '"')
                except ValueError:
                    print('Something went wrong during sentence generation.')

            # Evaluate on the validation set every 40 minibatches
            if iteration % 40 == 0:
                print('epoch:', epoch, ' minibatch:', iteration)
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print('Average validation CE per sentence:',
                      numpy.mean(val_scores))

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()
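

# Example usage (the default paths above are the tutorial author's;
# substitute your own): download and gunzip the files listed at the top,
# point TRAIN_FILE, VAL_FILE and DICT_FILE at them, then run
#
#     python lstm_text.py
#
# or call train_model directly with other settings, e.g.
#
#     train_model(batch_size=50, n_h=100, n_epochs=10)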