Final Exam

Machine Learning 2015-2


After solving all the questions in the exam, save your notebook with the name username.ipynb and submit it to: https://www.dropbox.com/request/KN8GwdAIi0Hl2jk2mg2E


The following code implements a simple one-neuron neural network:

In [226]:
import numpy as np
import pylab as pl
%matplotlib inline

def sigmoid(x):
    # logistic activation
    return 1.0/(1.0 + np.exp(-x))

def predict(w, x):
    # single sigmoid neuron: w[:2] are the input weights, w[2] is the bias
    return sigmoid(np.dot(w[:2], np.array(x).T) + w[2])

X = [[0, 0],
 [0, 1],
 [1, 0],
 [1, 1]]
Y = [0, 1, 1, 0]
w = np.array([-10, -10, 5])

predict(w, X)
Out[226]:
array([  9.93307149e-01,   6.69285092e-03,   6.69285092e-03,
         3.05902227e-07])
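As a quick check of the cell above: for $x=(0,0)$ the input weights do not contribute, so the prediction is $\sigma(w_{2})=\sigma(5)\approx0.9933$, which matches the first entry of the output.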

1. (1.0)

Find a weight vector such that the neural network calculates the NOR function:

$$f(x,y)=\neg(x\vee y)$$

Use the following function to test your answer:

In [231]:
def test_prediction(X, Y, w):
    epsilon = 0.001
    for i, x in enumerate(X):
        print predict(w, x)
        if np.abs(predict(w, x) - Y[i]) > epsilon:
            raise Exception("Prediction error")
    return True

X = [[0, 0],
 [0, 1],
 [1, 0],
 [1, 1]]
Y = [1, 0, 0, 0]
w = np.array([ -15, -15, 7])
test_prediction(X, Y, w)
0.999088948806
0.000335350130466
0.000335350130466
1.02618796306e-10
Out[231]:
True
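These weights work because the sigmoid saturates: for $(0,0)$ the activation is $7$ and $\sigma(7)\approx0.9991$, while any active input drives the activation to $-8$ or $-23$, where the sigmoid is essentially $0$; this is exactly the NOR truth table, as the printed values confirm.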

2. (1.0)

The following function calculates the loss of the neural network on a single example:

In [127]:
def loss(w, x, y):
    return ((predict(w, x) - y) ** 2) / 2

Write a function that calculates the gradient of the loss with respect to the weights:

$$ \frac{\partial E}{\partial w} $$
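Applying the chain rule to $E=\frac{1}{2}(f(w,x)-y)^{2}$ with $f(w,x)=\sigma(w_{0}x_{0}+w_{1}x_{1}+w_{2})$, and writing $r=f(w,x)$ and $\tilde{x}=(x_{0},x_{1},1)$, the gradient is

$$\frac{\partial E}{\partial w}=(r-y)\,r\,(1-r)\,\tilde{x}$$

which is what de_dw below computes componentwise.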
In [244]:
def de_dw(w, x, y):
    delta = np.zeros(len(w))
    r = predict(w, x)
    out_error = y - r
    # componentwise gradient; the bias component (delta[2]) omits the input factor
    delta[0] = -out_error*r*(1-r)*x[0]
    delta[1] = -out_error*r*(1-r)*x[1]
    delta[2] = -out_error*r*(1-r)
    return delta

X = np.array([0, 0])
Y = [1]
w = np.array([0, 0, 0])
de_dw(w,X,Y)
Out[244]:
array([-0.   , -0.   , -0.125])
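This matches a hand calculation: with $w=0$ the prediction is $r=\sigma(0)=0.5$, the error term is $y-r=0.5$, and the bias component is $-0.5\cdot0.5\cdot0.5=-0.125$, while the other two components vanish because $x=(0,0)$.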

Use the following functions to test your code:

In [243]:
def num_de_dw(w, x, y, epsilon):
    deltas = np.identity(len(w)) * epsilon
    de = np.zeros(len(w))
    
    for i in range(len(w)):
        de[i] = (loss(w + deltas[i, :], x, y) - loss(w - deltas[i, :], x, y)) / (2 * epsilon)
    return de

def test_de_dw():
    num_tests = 100
    epsilon = 0.01
    for i in range(num_tests):
        tw = np.random.randn(3)
        tx = np.random.randn(2)
        ty = np.random.randn(1)
        if np.linalg.norm(de_dw(tw, tx,ty) - num_de_dw(tw, tx, ty, epsilon)) > epsilon:
            raise Exception("de_dw test failed!")

test_de_dw()
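The test compares the analytic gradient with a central finite difference, which approximates each component with $O(\epsilon^{2})$ error:

$$\frac{\partial E}{\partial w_{i}}\approx\frac{E(w+\epsilon e_{i})-E(w-\epsilon e_{i})}{2\epsilon}$$

where $e_{i}$ is the $i$-th standard basis vector.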

Now, we can use the gradient function to train the neural network using gradient descent
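Each epoch of train applies the batch gradient descent update

$$w\leftarrow w-\eta\sum_{(x_{i},y_{i})\in D}\frac{\partial E(w,x_{i},y_{i})}{\partial w}$$

where $\eta$ is the learning rate (eta in the code).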

In [205]:
def evaluate(w, X, Y):
    result = 0
    for i, x in enumerate(X):
        result += loss(w, x, Y[i])
    return result

def train(X, Y, epochs, eta, w_ini):
    losses = []
    w = np.array(w_ini, dtype=float)
    for epoch in range(epochs):
        # accumulate the gradient over the whole training set
        delta = np.zeros(len(w))
        for i, x in enumerate(X):
            delta += de_dw(w, x, Y[i])
        w = w - eta * delta
        losses.append(evaluate(w, X, Y))
    return w, losses
In [245]:
X = [[0, 0],
     [0, 1],
     [1, 0],
     [1, 1]]
Y = [0, 0, 0, 1]
w, losses = train(X, Y, 50, 1, [0, 0, 0])
pl.plot(losses)
print w
print predict(w, X)
[ 1.82593763  1.82593763 -2.89634667]
[ 0.05233446  0.2553253   0.2553253   0.68038216]

3. (1.0)

Now we will modify the loss function to include a regularization term: $$ E(w,D)=\frac{1}{2}\sum_{(x_{i},y_{i})\in D}(f(w,x_{i})-y_{i})^{2}+\frac{1}{2}\beta\left\Vert w\right\Vert _{2}^{2}$$

where $f(w,x_{i})$ is the prediction calculated by the neural network.
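Since $\frac{\partial}{\partial w}\frac{1}{2}\beta\left\Vert w\right\Vert _{2}^{2}=\beta w$, the per-example gradient gains a $\beta w$ term on top of the gradient from question 2:

$$\frac{\partial E}{\partial w}=(r-y)\,r\,(1-r)\,\tilde{x}+\beta w$$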

To accomplish this you must modify the following functions:

In [261]:
def reg_loss(w, beta, x, y):
    return (((predict(w, x) - y) ** 2) / 2) + 0.5 * beta * (np.linalg.norm(w) ** 2)

def reg_de_dw(w, beta, x, y):
    delta = np.zeros(len(w))
    r = predict(w, x)
    out_error = y - r
    # squared-error gradient plus the beta * w term from the regularizer
    delta[0] = -out_error * r * (1 - r) * x[0] + beta * w[0]
    delta[1] = -out_error * r * (1 - r) * x[1] + beta * w[1]
    delta[2] = -out_error * r * (1 - r) + beta * w[2]
    return delta

X = [0, 0]
Y = [1]
w = np.array([-20, -20, 20])
print reg_loss(w, 1, X, Y)
print reg_de_dw(w, 1, X, Y)
[ 600.]
[-20. -20.  20.]
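Again this can be checked by hand: $\sigma(20)\approx1$, so the squared-error term is negligible and the loss is dominated by the penalty $\frac{1}{2}\beta\left\Vert w\right\Vert ^{2}=\frac{1}{2}(400+400+400)=600$; likewise the gradient reduces to $\beta w=(-20,-20,20)$.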

You can use the following functions to test your code:

In [260]:
def reg_num_de_dw(w, beta, x, y, epsilon):
    deltas = np.identity(len(w)) * epsilon
    de = np.zeros(len(w))
    for i in range(len(w)):
        de[i] = (reg_loss(w + deltas[i, :], beta, x, y) - reg_loss(w - deltas[i, :], beta, x, y)) / (2 * epsilon)
    return de

def reg_test_de_dw():
    num_tests = 100
    epsilon = 0.01
    beta = 1
    for i in range(num_tests):
        tw = np.random.randn(3)
        tx = np.random.randn(2)
        ty = np.random.randn(1)
        if np.linalg.norm(reg_de_dw(tw, beta, tx, ty) - reg_num_de_dw(tw, beta, tx, ty, epsilon)) > epsilon:
            raise Exception("reg_de_dw test failed!")

reg_test_de_dw()

4. (1.0)

Now train the neural network using regularization:

In [255]:
def reg_evaluate(w, beta, X, Y):
    result = 0
    for i, x in enumerate(X):
        result += reg_loss(w, beta, x, Y[i])
    return result

def reg_train(X, Y, epochs, eta, w_ini, beta):
    losses = []
    w = np.array(w_ini, dtype=float)
    for epoch in range(epochs):
        delta = np.zeros(len(w))
        for i, x in enumerate(X):
            delta += reg_de_dw(w, beta, x, Y[i])
        w = w - eta * delta
        losses.append(reg_evaluate(w, beta, X, Y))
    return w, losses
In [259]:
X = [[0, 0],
     [0, 1],
     [1, 0],
     [1, 1]]
Y = [0, 0, 0, 1]
wr, losses1 = reg_train(X, Y, 50, 1, [0, 0, 0], 1)
pl.plot(losses1)
print wr
print predict(wr, X)

What is the effect of regularization? Discuss.

5. (1.0)

Here, we will build a kernel version of the previous neural network, i.e., a neural network able to work in a feature space induced by a kernel. To do this, we will express the weight vector as a linear combination of the feature-space images $\phi(x_{i})$ of the vectors in a set $X$:

$$ w=\sum_{x_{i}\in X}\alpha_{i}\phi(x_{i}) $$
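With this representation neither $w$ nor $\phi$ is ever needed explicitly, since both the neuron's activation and the norm of $w$ reduce to kernel evaluations:

$$w\cdot\phi(x)=\sum_{x_{i}\in X}\alpha_{i}k(x_{i},x),\qquad\left\Vert w\right\Vert _{2}^{2}=\sum_{x_{i},x_{j}\in X}\alpha_{i}\alpha_{j}k(x_{i},x_{j})$$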

Now, implement this by modifying the following functions:

In [49]:
def k_predict(alpha, X, kernel, x):
    # w . phi(x) = sum_i alpha_i k(x_i, x), so phi is never needed explicitly
    activation = 0
    for i, xs in enumerate(X):
        activation += alpha[i] * kernel(xs, x)
    return sigmoid(activation)

def k_loss(alpha, X, beta, kernel, x, y):
    # the penalty uses ||w||^2 = sum_ij alpha_i alpha_j k(x_i, x_j)
    a = np.array(alpha)
    K = np.array([[kernel(xi, xj) for xj in X] for xi in X])
    return ((k_predict(alpha, X, kernel, x) - y) ** 2) / 2 + 0.5 * beta * a.dot(K).dot(a)

Test your functions with the following code:

In [42]:
alpha = [1, 0.5, -0.3, -0.4]
Xs = [[0.1, -0.5],
     [0.5, 1.0],
     [-1.0, 0.5],
     [1.0, 1.0]]

def k1(x, y):
    return np.dot(x, y)

def k2(x, y):
    return (np.dot(x, y) + 1) ** 2

X = [[0, 0],
     [0, 1],
     [1, 0],
     [1, 1]]
Y = [1, 0, 0, 0]

for i, x in enumerate(X):
    print k_predict(alpha, Xs, k1, x), k_loss(alpha, Xs, 1, k1, x, Y[i])
print "--------"
for i, x in enumerate(X):
    print k_predict(alpha, Xs, k2, x), k_loss(alpha, Xs, 1, k2, x, Y[i])

6. (optional, extra credit)

Train the kernel neural network using gradient descent.
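By the chain rule and the expansion above, with $r$ denoting the kernel prediction for $x$, the gradient of the squared error with respect to each coefficient is

$$\frac{\partial E}{\partial\alpha_{j}}=(r-y)\,r\,(1-r)\,k(x_{j},x)$$

which de_dalpha_k below computes for every $x_{j}\in X_{s}$.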

In [50]:
def loss_k(alpha, Xs, kernel, x, y):
    return ((k_predict(alpha, Xs, kernel, x) - y) ** 2) / 2

def de_dalpha_k(alpha, Xs, kernel, x, y):
    # dE/dalpha_j = -(y - r) * r * (1 - r) * k(x_j, x)
    r = k_predict(alpha, Xs, kernel, x)
    out_error = y - r
    delta = np.zeros(len(alpha))
    for j, xs in enumerate(Xs):
        delta[j] = -out_error * r * (1 - r) * kernel(xs, x)
    return delta

def evaluate_k(alpha, Xs, kernel, X, Y):
    result = 0
    for i, x in enumerate(X):
        result += loss_k(alpha, Xs, kernel, x, Y[i])
    return result

def train_k(X, Y, epochs, eta, alpha_ini, Xs, kernel):
    losses = []
    alpha = np.array(alpha_ini, dtype=float)
    for epoch in range(epochs):
        delta = np.zeros(len(alpha))
        for i, x in enumerate(X):
            delta += de_dalpha_k(alpha, Xs, kernel, x, Y[i])
        alpha = alpha - eta * delta
        losses.append(evaluate_k(alpha, Xs, kernel, X, Y))
    return alpha, losses

X = [[0, 0],
     [0, 1],
     [1, 0],
     [1, 1]]
Y = [0, 0, 1, 0]
alpha_tr, losses = train_k(X, Y, 50, 0.1, np.zeros(len(Xs)), Xs, k1)
pl.plot(losses)
print alpha_tr
print [k_predict(alpha_tr, Xs, k1, x) for x in X]
In [55]:
def GaussianKernel(v1, v2, sigma):
    # RBF kernel; note that k(x, x) = 1 for any x
    return np.exp(-np.linalg.norm(v1 - v2, 2)**2 / (2. * sigma**2))

X = np.array([[0, 0],
             [0, 1],
             [1, 0],
             [1, 1]])

for i in X:
    print GaussianKernel(i, i, 1)
1.0
1.0
1.0
1.0
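As the printed values show, $k(x,x)=1$ for any $x$ under the Gaussian kernel. A possible follow-up, sketched under the assumption that the corrected train_k above is used (the wrapper k_rbf and the choice sigma=1.0 are illustrative, not part of the exam):

In [ ]:
# hypothetical wrapper giving the Gaussian kernel the two-argument
# signature that train_k and k_predict expect
def k_rbf(v1, v2):
    return GaussianKernel(np.array(v1), np.array(v2), 1.0)

alpha_rbf, losses_rbf = train_k(X, Y, 50, 0.1, np.zeros(len(Xs)), Xs, k_rbf)
pl.plot(losses_rbf)
print [k_predict(alpha_rbf, Xs, k_rbf, x) for x in X]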