How to deal with overfitting in Tensorflow? - python-2.7

I'm currently trying to train a image classification convolutional neural network. I'm using an architecture similar to that in the TensorFlow tutorial. After training, I can get a quite high training accuracy and a very low cross entropy. But the test accuracy is always only a little bit higher than random guessing. The neural network seems to suffer from overfitting. In the training process, I have applied stochastic gradient descent and droupout to try to avoid overfitting. But it just doesn't seem to work.
Here is part of my code.
batch_image = np.ndarray(shape=(100,9216), dtype='float')
batch_class = np.ndarray(shape=(100,10), dtype='float')
# first convolutinal layer
w_conv1 = weight_variable([5, 5, 3, 64])
b_conv1 = bias_variable([64])
x_image = tf.reshape(x, [-1, 48, 64, 3])
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
norm1 = tf.nn.lrn(tf.to_float(h_pool1, name='ToFloat'), 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
# second convolutional layer
w_conv2 = weight_variable([5, 5, 64, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(norm1, w_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
norm2 = tf.nn.lrn(tf.to_float(h_pool2, name='ToFloat'), 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
# densely connected layer
w_fc1 = weight_variable([12*16*64, 512])
b_fc1 = bias_variable([512])
h_pool2_flat = tf.reshape(norm2, [-1, 12*16*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
#densely connected layer
w_fc2 = weight_variable([512, 256])
b_fc2 = bias_variable([256])
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, w_fc2) + b_fc2)
# dropout
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc2, keep_prob)
# readout layer
w_fc3 = weight_variable([256, 10])
b_fc3 = bias_variable([10])
y_prob = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc3) + b_fc3)
# train and evaluate the model
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_prob + 0.000000001))
train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_prob, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess.run(tf.initialize_all_variables())
for i in range(100):
rand_idx = np.random.randint(17778, size=(100))
k = 0
for j in rand_idx:
batch_image[k] = images[j]
batch_class[k] = np.zeros(shape=(10))
batch_class[k, classes[j, 0]] = 1.0
k+=1
train_step.run(feed_dict={x:batch_image, y_:batch_class, keep_prob:0.5})
train_accuracy = accuracy.eval(feed_dict={x:batch_image, y_:batch_class, keep_prob:1.0})
train_ce = cross_entropy.eval(feed_dict={x:batch_image, y_:batch_class, keep_prob:1.0})
I am wondering is there any mistake in my code or do I have to apply any other strategies to get a better test accuracy.
Thank you!

You can try below strategies to avoid overfitting.
shuffle the input data
Use early stopping for the Loss function with some Patience level.
L1 & L2 Regularization
Add Dropout
Batch Normalization.
If pixels are not normalized, dividing the pixels values with 255 also helps.
Perform Image Data Agumentation.
May be hyper parameter tuning grid search.
Hope it helps! Happy Coding.
Thank You!

Related

How to get accurate predictions from a neural network

I created below neural network for the truth table for the 3-input logic AND gate, but the expected output for the [1,1,0] is not correct.Output should be 0. But it predicts as 0.9 that means approximately 1. So the output is not correct. So what I need to know is how to make the output prediction more accurate.Please guide me.
import numpy as np
class NeuralNetwork():
def __init__(self):
self.X = np.array([[0, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 1],
[1, 0, 0],
[1, 0, 1],
[1, 1, 1]])
self.y = np.array([[0],
[0],
[0],
[0],
[0],
[0],
[1]])
np.random.seed(1)
# randomly initialize our weights with mean 0
self.syn0 = 2 * np.random.random((3, 4)) - 1
self.syn1 = 2 * np.random.random((4, 1)) - 1
def nonlin(self,x, deriv=False):
if (deriv == True):
return x * (1 - x)
return 1 / (1 + np.exp(-x))
def train(self,steps):
for j in xrange(steps):
# Feed forward through layers 0, 1, and 2
l0 = self.X
l1 = self.nonlin(np.dot(l0, self.syn0))
l2 = self.nonlin(np.dot(l1, self.syn1))
# how much did we miss the target value?
l2_error = self.y - l2
if (j % 10000) == 0:
print "Error:" + str(np.mean(np.abs(l2_error)))
# in what direction is the target value?
# were we really sure? if so, don't change too much.
l2_delta = l2_error * self.nonlin(l2, deriv=True)
# how much did each l1 value contribute to the l2 error (according to the weights)?
l1_error = l2_delta.dot(self.syn1.T)
# in what direction is the target l1?
# were we really sure? if so, don't change too much.
l1_delta = l1_error * self.nonlin(l1, deriv=True)
self.syn1 += l1.T.dot(l2_delta)
self.syn0 += l0.T.dot(l1_delta)
print("Output after training:")
print(l2)
def predict(self,newInput):
# Multiply the input with weights and find its sigmoid activation for all layers
layer0 = newInput
print("predict -> layer 0 : "+str(layer0))
layer1 = self.nonlin(np.dot(layer0, self.syn0))
print("predict -> layer 1 : "+str(layer1))
layer2 = self.nonlin(np.dot(layer1, self.syn1))
print("predicted output is : "+str(layer2))
if __name__ == '__main__':
ann=NeuralNetwork()
ann.train(100000)
ann.predict([1,1,0])
Output:
Error:0.48402933124
Error:0.00603525276229
Error:0.00407346660344
Error:0.00325224335386
Error:0.00277628698655
Error:0.00245737222701
Error:0.00222508289674
Error:0.00204641406194
Error:0.00190360175536
Error:0.00178613765229
Output after training:
[[ 1.36893057e-04]
[ 5.80758383e-05]
[ 1.19857670e-03]
[ 1.85443483e-03]
[ 2.13949603e-03]
[ 2.19360982e-03]
[ 9.95769492e-01]]
predict -> layer 0 : [1, 1, 0]
predict -> layer 1 : [ 0.00998162 0.91479567 0.00690524 0.05241988]
predicted output is : [ 0.99515547]
Actually, it does produce correct output -- the model is ambiguous. Your input data fits A*B; the value of the third input never affects the given output, so your model has no way to know that it's supposed to matter in case 110. In terms of pure information theory, you don't have the input to force the result you want.
Seems like this is happening for every input you miss in the AND gate. For example try replacing [0, 1, 1] input with [1, 1, 0] and then try to predict [0, 1, 1] it predicts the final value close to 1. I tried including biases and learning rate but nothing seem to work.
Like Prune mentioned it might be because the BackPropagation Network is not able to work with the incomplete model.
To train your network to the fullest and get optimal weights, provide all the possible inputs i.e 8 inputs to the AND gate. Then you can always get the correct predictions because you already trained the network with those inputs, which might not make sense with predictions in this case. May be predictions on a small dataset do not work that great.
This is just my guess because almost all the networks I used for predictions used to have fairly bigger datasets.

TensorFlow CNN with 3D input is too slow to train

Problem
I am building a TensorFlow CNN in Python 2.7 to classify 40x40x40 3D images into one of the two categories. The training data is stored in a HDF5 file as a (5000,40,40,40,1) array (5000 training images, 1 color channel). However, when I am training the network, each iteration of a batch of 32 images takes about a minute to complete. Through Activity Monitor I see that there are about 6GB data written to the disk in each iteration. The HDF5 itself is only about 500MB. What is happening?
This is the code I used to load data:
f = h5py.File(file_name, 'r')
images = f.get('key_name')
images = np.array(images)
images = images.astype(np.float32)
images = np.multiply(images, 1.0 / 100.0)
I also tried to directly use a HDF5 object for each iteration, instead of loading all the data into memory at once. But the problem remains:
f = h5py.File(file_name, 'r')
images = np.array(f.get('key_name')[:batch_size])
CNN Structure
The CNN I built has two 3D convolution layer, one flatten layer, one fully connected layer and one output layer. Here is the complete code for the structure:
num_channels = 1
filter_size_conv1 = 5
filter_size_conv2 = 5
num_filters_conv1 = 32
num_filters_conv2 = 64
fc_layer_size = 64
x = tf.placeholder(tf.float32, shape=[None, img_size, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)
def create_weights(shape):
return tf.Variable(tf.truncated_normal(shape, stddev=0.05))
def create_biases(size):
return tf.Variable(tf.constant(0.05,shape=[size]))
def create_convolutional_layer(input,
num_input_channels,
conv_filter_size,
num_filters):
weights = create_weights(shape=[
conv_filter_size, conv_filter_size, conv_filter_size, num_input_channels, num_filters])
biases = create_biases(num_filters)
layer = tf.nn.conv3d(input=input,
filter=weights,
strides=[1, 1, 1, 1, 1],
padding='SAME')
layer += biases
layer = tf.nn.max_pool3d(input=layer,
ksize=[1, 4, 4, 4, 1],
strides=[1, 4, 4, 4, 1],
padding='SAME')
layer = tf.nn.relu(layer)
return layer
def create_flatten_layer(layer):
layer_shape = layer.get_shape()
num_features = layer_shape[1:5].num_elements()
layer = tf.reshape(layer, [-1, num_features])
return layer
def create_fc_layer(input,
num_inputs,
num_outputs,
use_relu=True):
weights = create_weights(shape=[num_inputs, num_outputs])
biases = create_biases(num_outputs)
layer = tf.matmul(input, weights) + biases
if use_relu:
layer = tf.nn.relu(layer)
return layer
layer_conv1 = create_convolutional_layer(input=x,
num_input_channels=num_channels,
conv_filter_size=filter_size_conv1,
num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
num_input_channels=num_filters_conv1,
conv_filter_size=filter_size_conv2,
num_filters=num_filters_conv2)
layer_flat = create_flatten_layer(layer_conv2)
layer_fc1 = create_fc_layer(input=layer_flat,
num_inputs=layer_flat.get_shape()[1:5].num_elements(),
num_outputs=fc_layer_size,
use_relu=True)
layer_fc2 = create_fc_layer(input=layer_fc1,
num_inputs=fc_layer_size,
num_outputs=num_classes,
use_relu=False)
Any help would be appreciated. Thank you so much!
I actually found the same problem here. I'm using conv3d and it takes like forever to finish. Probably related to the optimization of conv3d

keras autoencoder vs PCA

I am playing with a toy example to understand PCA vs keras autoencoder
I have the following code for understanding PCA:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
pca = decomposition.PCA(n_components=3)
pca.fit(X)
pca.explained_variance_ratio_
array([ 0.92461621, 0.05301557, 0.01718514])
pca.components_
array([[ 0.36158968, -0.08226889, 0.85657211, 0.35884393],
[ 0.65653988, 0.72971237, -0.1757674 , -0.07470647],
[-0.58099728, 0.59641809, 0.07252408, 0.54906091]])
I have done a few readings and play codes with keras including this one.
However, the reference code feels too high a leap for my level of understanding.
Does someone have a short auto-encoder code which can show me
(1) how to pull the first 3 components from auto-encoder
(2) how to understand what amount of variance the auto-encoder captures
(3) how the auto-encoder components compare against PCA components
First of all, the aim of an autoencoder is to learn a representation (encoding) for a set of data, typically for the purpose of dimensionality reduction. So, the target output of the autoencoder is the autoencoder input itself.
It is shown in [1] that If there is one linear hidden layer and the mean squared error criterion is used to train the network, then the k hidden units learn to project the input in the span of the first k principal components of the data.
And in [2] you can see that If the hidden layer is nonlinear, the autoencoder behaves differently from PCA, with the ability to capture multi-modal aspects of the input distribution.
Autoencoders are data-specific, which means that they will only be able to compress data similar to what they have been trained on. So, the usefulness of features that have been learned by hidden layers could be used for evaluating the efficacy of the method.
For this reason, one way to evaluate an autoencoder efficacy in dimensionality reduction is cutting the output of the middle hidden layer and compare the accuracy/performance of your desired algorithm by this reduced data rather than using original data.
Generally, PCA is a linear method, while autoencoders are usually non-linear. Mathematically, it is hard to compare them together, but intuitively I provide an example of dimensionality reduction on MNIST dataset using Autoencoder for your better understanding. The code is here:
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import np_utils
import numpy as np
num_train = 60000
num_test = 10000
height, width, depth = 28, 28, 1 # MNIST images are 28x28
num_classes = 10 # there are 10 classes (1 per digit)
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(num_train, height * width)
X_test = X_test.reshape(num_test, height * width)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255 # Normalise data to [0, 1] range
X_test /= 255 # Normalise data to [0, 1] range
Y_train = np_utils.to_categorical(y_train, num_classes) # One-hot encode the labels
Y_test = np_utils.to_categorical(y_test, num_classes) # One-hot encode the labels
input_img = Input(shape=(height * width,))
x = Dense(height * width, activation='relu')(input_img)
encoded = Dense(height * width//2, activation='relu')(x)
encoded = Dense(height * width//8, activation='relu')(encoded)
y = Dense(height * width//256, activation='relu')(x)
decoded = Dense(height * width//8, activation='relu')(y)
decoded = Dense(height * width//2, activation='relu')(decoded)
z = Dense(height * width, activation='sigmoid')(decoded)
model = Model(input_img, z)
model.compile(optimizer='adadelta', loss='mse') # reporting the accuracy
model.fit(X_train, X_train,
epochs=10,
batch_size=128,
shuffle=True,
validation_data=(X_test, X_test))
mid = Model(input_img, y)
reduced_representation =mid.predict(X_test)
out = Dense(num_classes, activation='softmax')(y)
reduced = Model(input_img, out)
reduced.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
reduced.fit(X_train, Y_train,
epochs=10,
batch_size=128,
shuffle=True,
validation_data=(X_test, Y_test))
scores = reduced.evaluate(X_test, Y_test, verbose=1)
print("Accuracy: ", scores[1])
It produces a $y\in \mathbb{R}^{3}$ ( almost like what you get by decomposition.PCA(n_components=3) ). For example, here you see the outputs of layer y for a digit 5 instance in dataset:
class y_1 y_2 y_3
5 87.38 0.00 20.79
As you see in the above code, when we connect layer y to a softmax dense layer:
mid = Model(input_img, y)
reduced_representation =mid.predict(X_test)
the new model mid give us a good classification accuracy about 95%. So, it would be reasonable to say that y, is an efficiently extracted feature vector for the dataset.
References:
[1]: Bourlard, Hervé, and Yves Kamp. "Auto-association by multilayer perceptrons and singular value decomposition." Biological cybernetics 59.4 (1988): 291-294.
[2]: Japkowicz, Nathalie, Stephen Jose Hanson, and Mark A. Gluck. "Nonlinear autoassociation is not equivalent to PCA." Neural computation 12.3 (2000): 531-545.
The earlier answer cover the whole thing, however I am doing the analysis on the Iris data - my code comes with a slightly modificiation from this post which dives further into the topic. As it was request, lets load the data
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
iris = load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
Let's do a regular PCA
from sklearn import decomposition
pca = decomposition.PCA()
pca_transformed = pca.fit_transform(X_scaled)
plot3clusters(pca_transformed[:,:2], 'PCA', 'PC')
A very simple AE model with linear layers, as the earlier answer pointed out with ... the first reference, one linear hidden layer and the mean squared error criterion is used to train the network, then the k hidden units learn to project the input in the span of the first k principal components of the data.
from keras.layers import Input, Dense
from keras.models import Model
import matplotlib.pyplot as plt
#create an AE and fit it with our data using 3 neurons in the dense layer using keras' functional API
input_dim = X_scaled.shape[1]
encoding_dim = 2
input_img = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='linear')(input_img)
decoded = Dense(input_dim, activation='linear')(encoded)
autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
print(autoencoder.summary())
history = autoencoder.fit(X_scaled, X_scaled,
epochs=1000,
batch_size=16,
shuffle=True,
validation_split=0.1,
verbose = 0)
# use our encoded layer to encode the training input
encoder = Model(input_img, encoded)
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))
encoded_data = encoder.predict(X_scaled)
plot3clusters(encoded_data[:,:2], 'Linear AE', 'AE')
You can look into the loss if you want
#plot our loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()
The function to plot the data
def plot3clusters(X, title, vtitle):
import matplotlib.pyplot as plt
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
plt.scatter(X[y == i, 0], X[y == i, 1], color=color, alpha=1., lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title(title)
plt.xlabel(vtitle + "1")
plt.ylabel(vtitle + "2")
return(plt.show())
Regarding explaining the variability, using non-linear hidden function, leads to other approximation similar to ICA / TSNE and others. Where the idea of variance explanation is not there, still one can look into the convergence.

Tensorflow RNN training won't execute?

I am currently trying to train this RNN network, but seem to be running into weird errors, which I am not able to decode.
The input to my rnn network is digital sampled audio files. As the audio file can be of different length, will the vector of the sampled audio also have different lengths.
The output or the target of the neural network is to recreate a 14 dimensional vector, containing certain information of the audio files. I've already know the target, by manually calculating it, but need to make it work with a neural network.
I am currently using tensorflow as framework.
My network setup looks like this:
def last_relevant(output):
max_length = int(output.get_shape()[1])
relevant = tf.reduce_sum(tf.mul(output, tf.expand_dims(tf.one_hot(length, max_length), -1)), 1)
return relevant
def length(sequence): ##Zero padding to fit the max lenght... Question whether that is a good idea.
used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
length = tf.reduce_sum(used, reduction_indices=1)
length = tf.cast(length, tf.int32)
return length
def cost(output, target):
# Compute cross entropy for each frame.
cross_entropy = target * tf.log(output)
cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
mask = tf.sign(tf.reduce_max(tf.abs(target), reduction_indices=2))
cross_entropy *= mask
# Average over actual sequence lengths.
cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
cross_entropy /= tf.reduce_sum(mask, reduction_indices=1)
return tf.reduce_mean(cross_entropy)
#----------------------------------------------------------------------#
#----------------------------Main--------------------------------------#
### Tensorflow neural network setup
batch_size = None
sequence_length_max = max_length
input_dimension=1
data = tf.placeholder(tf.float32,[batch_size,sequence_length_max,input_dimension])
target = tf.placeholder(tf.float32,[None,14])
num_hidden = 24 ## Hidden layer
cell = tf.nn.rnn_cell.LSTMCell(num_hidden,state_is_tuple=True) ## Long short term memory
output, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32,sequence_length = length(data)) ## Creates the Rnn skeleton
last = last_relevant(output)#tf.gather(val, int(val.get_shape()[0]) - 1) ## Appedning as last
weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
cross_entropy = cost(output,target)# How far am I from correct value?
optimizer = tf.train.AdamOptimizer() ## TensorflowOptimizer
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
## Training ##
init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op)
batch_size = 1000
no_of_batches = int(len(train_data)/batch_size)
epoch = 5000
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_data[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out})
print "Epoch - ",str(i)
incorrect = sess.run(error,{data: test_data, target: test_output})
print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()
The error seem to be the usage of the function last_relevant, which should take the output, and feed it back.
This is the error message:
TypeError: Expected binary or unicode string, got <function length at 0x7f846594dde8>
Anyway to tell what could be wrong here?
I tried to build your code in my local.
There is a fundamental mistake in the code which is that you call tf.one_hot but what you pass don't really fit with what is expected:
Read documentation here:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.one_hot.md
tf.one_hot(indices, depth, on_value=None, off_value=None, axis=None, dtype=None, name=None)
However, you are passing a function pointer ("length" is a function in your code, I recommend naming your function in a meaningful manner by refraining yourself from using common keywords) instead of the first parameter.
For a wild guide, you can put your indices as first param (instead of my placeholder empty list) and it will be fixed
relevant = tf.reduce_sum(
tf.mul(output, tf.expand_dims(tf.one_hot([], max_length), -1)), 1)

How efficient/intelligent is Theano in computing gradients?

Suppose I have an artificial neural networks with 5 hidden layers. For the moment, forget about the details of the neural network model such as biases, the activation functions used, type of data and so on ... . Of course, the activation functions are differentiable.
With symbolic differentiation, the following computes the gradients of the objective function with respect to the layers' weights:
w1_grad = T.grad(lost, [w1])
w2_grad = T.grad(lost, [w2])
w3_grad = T.grad(lost, [w3])
w4_grad = T.grad(lost, [w4])
w5_grad = T.grad(lost, [w5])
w_output_grad = T.grad(lost, [w_output])
This way, to compute the gradients w.r.t w1 the gradients w.r.t w2, w3, w4 and w5 must first be computed. Similarly to compute the gradients w.r.t w2 the gradients w.r.t w3, w4 and w5 must be computed first.
However, I could the following code also computes the gradients w.r.t to each weight matrix:
w1_grad, w2_grad, w3_grad, w4_grad, w5_grad, w_output_grad = T.grad(lost, [w1, w2, w3, w4, w5, w_output])
I was wondering, is there any difference between these two methods in terms of performance? Is Theano intelligent enough to avoid re-computing the gradients using the second method? By intelligent I mean to compute w3_grad, Theano should [preferably] use the pre-computed gradients of w_output_grad, w5_grad and w4_grad instead of computing them again.
Well it turns out Theano does not take the previously-computed gradients to compute the gradients in lower layers of a computational graph. Here's a dummy example of a neural network with 3 hidden layers and an output layer. However, it's not going to be a big deal at all since computing the gradients is a once-in-a-life-time operation unless you have to compute the gradient on each iteration. Theano returns a symbolic expression for the derivatives as a computational graph and you can simply use it as a function from that point on. From that point on we simply use the function derived by Theano to compute numerical values and update the weights using those.
import theano.tensor as T
import time
import numpy as np
class neuralNet(object):
def __init__(self, examples, num_features, num_classes):
self.w = shared(np.random.random((16384, 5000)).astype(T.config.floatX), borrow = True, name = 'w')
self.w2 = shared(np.random.random((5000, 3000)).astype(T.config.floatX), borrow = True, name = 'w2')
self.w3 = shared(np.random.random((3000, 512)).astype(T.config.floatX), borrow = True, name = 'w3')
self.w4 = shared(np.random.random((512, 40)).astype(T.config.floatX), borrow = True, name = 'w4')
self.b = shared(np.ones(5000, dtype=T.config.floatX), borrow = True, name = 'b')
self.b2 = shared(np.ones(3000, dtype=T.config.floatX), borrow = True, name = 'b2')
self.b3 = shared(np.ones(512, dtype=T.config.floatX), borrow = True, name = 'b3')
self.b4 = shared(np.ones(40, dtype=T.config.floatX), borrow = True, name = 'b4')
self.x = examples
L1 = T.nnet.sigmoid(T.dot(self.x, self.w) + self.b)
L2 = T.nnet.sigmoid(T.dot(L1, self.w2) + self.b2)
L3 = T.nnet.sigmoid(T.dot(L2, self.w3) + self.b3)
L4 = T.dot(L3, self.w4) + self.b4
self.forwardProp = T.nnet.softmax(L4)
self.predict = T.argmax(self.forwardProp, axis = 1)
def loss(self, y):
return -T.mean(T.log(self.forwardProp)[T.arange(y.shape[0]), y])
x = T.matrix('x')
y = T.ivector('y')
nnet = neuralNet(x)
loss = nnet.loss(y)
diffrentiationTime = []
for i in range(100):
t1 = time.time()
gw, gw2, gw3, gw4, gb, gb2, gb3, gb4 = T.grad(loss, [nnet.w, nnet.w2, logReg.w3, nnet.w4, nnet.b, nnet.b2, nnet.b3, nnet.b4])
diffrentiationTime.append(time.time() - t1)
print 'Efficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime))
diffrentiationTime = []
for i in range(100):
t1 = time.time()
gw = T.grad(loss, [nnet.w])
gw2 = T.grad(loss, [nnet.w2])
gw3 = T.grad(loss, [nnet.w3])
gw4 = T.grad(loss, [nnet.w4])
gb = T.grad(loss, [nnet.b])
gb2 = T.grad(loss, [nnet.b2])
gb3 = T.grad(loss, [nnet.b3])
gb4 = T.grad(loss, [nnet.b4])
diffrentiationTime.append(time.time() - t1)
print 'Inefficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime))
This will print out the followings:
Efficient Method: Took 0.061056 seconds with std 0.013217
Inefficient Method: Took 0.305081 seconds with std 0.026024
This shows that Theano uses a dynamic-programming approach to compute gradients for the efficient method.