Related
I'm new in deep learning. In first step I create and train a model in python with keras and freezed by this code:
def export_model(MODEL_NAME, input_node_name, output_node_name):
tf.train.write_graph(K.get_session().graph_def, 'out', \
MODEL_NAME + '_graph.pbtxt')
tf.train.Saver().save(K.get_session(), 'out/' + MODEL_NAME + '.chkp')
freeze_graph.freeze_graph('out/' + MODEL_NAME + '_graph.pbtxt', None, \
False, 'out/' + MODEL_NAME + '.chkp', output_node_name, \
"save/restore_all", "save/Const:0", \
'out/frozen_' + MODEL_NAME + '.pb', True, "")
input_graph_def = tf.GraphDef()
with tf.gfile.Open('out/frozen_' + MODEL_NAME + '.pb', "rb") as f:
input_graph_def.ParseFromString(f.read())
output_graph_def = optimize_for_inference_lib.optimize_for_inference(
input_graph_def, [input_node_name], [output_node_name],
tf.float32.as_datatype_enum)
with tf.gfile.FastGFile('out/opt_' + MODEL_NAME + '.pb', "wb") as f:
f.write(output_graph_def.SerializeToString())
it's output :
checkpoint
Model.chkp.data-00000-of-00001
Model.chkp.index
Model.chkp.meta
Model_graph.pbtxt
frozen_Model.pb
opt_Model.pb
when I want to read the net in opencv c++ by readNetFromTensorflow :
String weights = "frozen_Model.pb";
String pbtxt = "Model_graph.pbtxt";
dnn::Net cvNet = cv::dnn::readNetFromTensorflow(weights, pbtxt);
This will make error :
OpenCV(4.0.0-pre) Error: Unspecified error (FAILED: ReadProtoFromBinaryFile(param_file, param). Failed to parse GraphDef file: frozen_Model.pb) in cv::dnn::ReadTFNetParamsFromBinaryFileOrDie, file D:\LIBS\OpenCV-4.00\modules\dnn\src\tensorflow\tf_io.cpp, line 44
and
OpenCV(4.0.0-pre) Error: Assertion failed (const_layers.insert(std::make_pair(name, li)).second) in cv::dnn::experimental_dnn_v4::`anonymous-namespace'::addConstNodes, file D:\LIBS\OpenCV-4.00\modules\dnn\src\tensorflow\tf_importer.cpp, line 555
How to fix this error?
Amin, may I ask you to try to save a graph in a testing mode:
K.backend.set_learning_phase(0) # <--- This setting makes all the following layers work in test mode
model = Sequential(name = MODEL_NAME)
model.add(Conv2D(filters = 128, kernel_size = (5, 5), activation = 'relu',name = 'FirstLayerConv2D_No1',input_shape = (Width, Height, image_channel)))
...
model.add(Dropout(0.25))
model.add(Dense(100, activation = 'softmax', name = 'endNode'))
# Create a graph definition (with no weights)
sess = K.backend.get_session()
sess.as_default()
tf.train.write_graph(sess.graph.as_graph_def(), "", 'graph_def.pb', as_text=False)
Then freeze your checkpoint files with a newly created graph_def.pb by a freeze_graph.py script (do not forget to use --input_binary flag).
Part of the code :
Create model, train and export_model
train_batch = gen.flow_from_directory(path + 'Train', target_size = (Width, Height), shuffle = False, color_mode = color_mode,
batch_size = batch_size_train, class_mode = 'categorical')
.
.
X_train, Y_train = next(train_batch)
.
.
X_train = X_train.reshape(X_train.shape).astype('float32')
.
.
model = Sequential(name = MODEL_NAME)
model.add(Conv2D(filters = 128, kernel_size = (5, 5), activation = 'relu',name = 'FirstLayerConv2D_No1',input_shape = (Width, Height, image_channel)))
model.add(Conv2D(filters = 128, kernel_size = (3, 3), activation = 'relu'))
model.add(MaxPool2D(pool_size = (2, 2)))
model.add(BatchNormalization())
.
.
.
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(200, activation = 'tanh'))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(Dense(100, activation = 'softmax', name = 'endNode'))
model.compile(loss = 'categorical_crossentropy',
optimizer = SGD(lr = 0.01, momentum = 0.9), metrics = ['accuracy'])
history = model.fit(X_train, Y_train, batch_size = batch_size_fit, epochs = epoch, shuffle = True,
verbose = 1, validation_split = .1, validation_data = (X_test, Y_test))
export_model(MODEL_NAME, "FirstLayerConv2D_No1/Relu", "endNode/Softmax")
When you're writing the graph in python you need to do the following steps:
with tf.Session(graph=tf.Graph()) as sess:
# 1. Load saved model
saved_model = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], SAVED_MODEL_PATH)
# 2. Convert variables to constants
inference_graph_def = tf.graph_util.convert_variables_to_constants(sess, saved_model.graph_def, OUTPUT_NODE_NAMES)
# 3. Optimize for inference
optimized_graph_def = optimize_for_inference_lib.optimize_for_inference(inference_graph_def,
INPUT_NODE_NAMES,
OUTPUT_NODE_NAMES,
tf.float32.as_datatype_enum)
# 4. Save .pb file
tf.train.write_graph(optimized_graph_def, MODEL_DIR, 'model_name.pb', as_text=False)
# 5. Transform graph
transforms = [
'strip_unused_nodes(type=float, shape=\"1,128,128,3\")',
'remove_nodes(op=PlaceholderWithDefault)',
'remove_device',
'sort_by_execution_order'
]
transformed_graph_def = TransformGraph(optimized_graph_def,
INPUT_NODE_NAMES,
OUTPUT_NODE_NAMES,
transforms)
# 6. Remove constant nodes and attributes
for i in reversed(range(len(transformed_graph_def.node))):
if transformed_graph_def.node[i].op == "Const":
del transformed_graph_def.node[i]
for attr in ['T', 'data_format', 'Tshape', 'N', 'Tidx', 'Tdim',
'use_cudnn_on_gpu', 'Index', 'Tperm', 'is_training', 'Tpaddings']:
if attr in transformed_graph_def.node[i].attr:
del transformed_graph_def.node[i].attr[attr]
# 7. Save .pbtxt file
tf.train.write_graph(transformed_graph_def, MODEL_DIR, 'model_name.pbtxt', as_text=True)
Plus, if you have special nodes as Flatten, you need to remove and rename some nodes manually.
More info here.
I'm new to TF and ML.
Details about data: Features(x) - (70 x 70 x 70) tensor for each sample, y - a float for each sample.
TFRecords created with the following code:
def convert_to_tf_records():
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _float64_feature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
tfrecords_filename = 'A-100-h2-h2o.tfrecords'
writer = tf.python_io.TFRecordWriter(tfrecords_filename)
# Get data from db for now.
db = connect('results-60-70.db')
data = db.select(selection='Ti')
i = 0
for row in data:
desc = np.array(json.loads(row.descriptor), dtype=np.float32)
print(desc.shape)
be = float(row.binding_energy) * 23 # Convert to Kcal/mol ?
desc = desc.flatten()
desc = desc.tostring()
example = tf.train.Example(features=tf.train.Features(feature={'voxel_grid': _bytes_feature(desc), 'binding_energy': _float64_feature(be)}))
writer.write(example.SerializeToString())
i += 1
if i >= 10:
break
Input function:
def my_input_function(fname, perform_shuffle=False, repeat_count=None):
def _parse_elements(example):
features = tf.parse_single_example(example, features={'voxel_grid': tf.FixedLenFeature([], tf.string), 'binding_energy': tf.FixedLenFeature([], tf.float32)})
vg = tf.decode_raw(features['voxel_grid'], tf.float32)
vg = tf.reshape(vg, [70, 70, 70])
vg = tf.convert_to_tensor(vg, dtype=tf.float32)
vg = {'voxel_grid': vg}
e = tf.cast(features['binding_energy'], tf.float32)
return vg, e
def input_function():
dataset = tf.data.TFRecordDataset(fname).map(_parse_elements)
dataset = dataset.repeat(repeat_count)
dataset = dataset.batch(5)
dataset = dataset.prefetch(1)
if perform_shuffle:
dataset.shuffle(20)
iterator = dataset.make_one_shot_iterator()
batch_features, batch_labels = iterator.get_next()
return batch_features, batch_labels
return input_function
Model function:
def my_model_function(features, labels, mode):
if mode == tf.estimator.ModeKeys.PREDICT:
tf.logging.info("my_model_fn: PREDICT, {}".format(mode))
elif mode == tf.estimator.ModeKeys.EVAL:
tf.logging.info("my_model_fn: EVAL, {}".format(mode))
elif mode == tf.estimator.ModeKeys.TRAIN:
tf.logging.info("my_model_fn: TRAIN, {}".format(mode))
feature_columns = [tf.feature_column.numeric_column('voxel_grid', shape=(70, 70, 70), dtype=tf.float32)]
# Create the layer of input
input_layer = tf.feature_column.input_layer(features, feature_columns)
input_layer = tf.reshape(input_layer, [-1, 70, 70, 70, 1])
# Convolution layers
conv1 = tf.layers.conv3d(inputs=input_layer, strides=(2, 2, 2), filters=32, kernel_size=(7, 7, 7))
conv2 = tf.layers.conv3d(inputs=conv1, strides=(2, 2, 2), filters=32, kernel_size=(7, 7, 7))
pool3 = tf.layers.max_pooling3d(inputs=conv2, pool_size=[2, 2, 2], strides=2)
flat = tf.layers.flatten(pool3)
dense1 = tf.layers.dense(inputs=flat, units=10, activation=tf.nn.relu)
dense2 = tf.layers.dense(inputs=dense1, units=10, activation=tf.nn.relu)
output = tf.layers.dense(inputs=dense2, units=1)
predictions = {'binding_energy': output}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate loss
loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
# Add evaluation metrics
eval_metric_ops = {"mse": tf.metrics.mean_squared_error(labels=labels, predictions=predictions['binding_energy'])}
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
When calling model.train using
model = tf.estimator.Estimator(model_fn=my_model_function, model_dir='./model_dir')
model.train(input_fn=my_input_function('A-100-h2-h2o.tfrecords'), steps=100)
I get the following error.
TypeError: Failed to convert object of type to Tensor.
Found it!
changing
# Calculate loss
loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)
to
# Calculate loss
loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions['binding_energy'])
solves the issue.
I'm trying to make speech recognition system with tensorflow.
Input data is an numpy array of size 50000 X 1.
Output data (mapping data) is an numpy array of size 400 X 1.
Input and mapping data is passed in batches of 2 in a list.
I've used this tutorial to design the neural network. Following is the code snippet:
For RNN:
input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating one backward cell
bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating bidirectional RNN
val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
For feeding data:
feed = {g['input_data'] : trb[0], g['target'] : trb[1], g['dropout'] : 0.6}
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
accuracy += accuracy_
When I ran the code, I got this error:
Traceback (most recent call last):
File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 205, in <module>
tr_losses, te_losses = train_network(g)
File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 177, in train_network
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1102, in _run
raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Tensor Tensor("Const:0", shape=(), dtype=float32) may not be fed.
Process finished with exit code 1
Earlier, I was facing this issue with tf.sparse_placeholder, then after some browsing, I changed input type to tf.placeholder and made related changes. Now I'm clueless on where I'm making the error.
Please suggest something as how should I feed data.
Entire code:
import tensorflow as tf
# for taking MFCC and label input
import numpy as np
import rnn_input_data_1
import sound_constants
# input constants
# Training Parameters
num_input = 10 # mfcc data input
training_data_size = 8 # determines number of files in training and testing module
testing_data_size = num_input - training_data_size
# Network Parameters
learning_rate = 0.0001 # for large training set, it can be set 0.001
num_hidden = 200 # number of hidden layers
num_classes = 28 # total alphabet classes (a-z) + extra symbols (', ' ')
epoch = 1 # number of iterations
batch_size = 2 # number of batches
mfcc_coeffs, text_data = rnn_input_data_1.mfcc_and_text_encoding()
class DataGenerator:
def __init__(self, data_size):
self.ptr = 0
self.epochs = 0
self.data_size = data_size
def next_batch(self):
self.ptr += batch_size
if self.ptr > self.data_size:
self.epochs += 1
self.ptr = 0
return mfcc_coeffs[self.ptr-batch_size : self.ptr], text_data[self.ptr-batch_size : self.ptr]
def reset_graph():
if 'sess' in globals() and sess:
sess.close()
tf.reset_default_graph()
def struct_network():
print ('Inside struct network !!')
reset_graph()
input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
keep_prob = tf.constant(1.0)
fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating one backward cell
bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating bidirectional RNN
val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
# adding dropouts
val = tf.nn.dropout(val, keep_prob)
val = tf.transpose(val, [1, 0, 2])
last = tf.gather(val, int(val.get_shape()[0]) - 1)
# creating bidirectional RNN
print ('BiRNN created !!')
print ('Last Size: ', last.get_shape())
weight = tf.Variable(tf.truncated_normal([num_hidden * 2, sound_constants.MAX_ROW_SIZE_IN_TXT]))
bias = tf.Variable(tf.constant(0.1, shape=[sound_constants.MAX_ROW_SIZE_IN_TXT]))
# mapping to 28 output classes
logits = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax(logits)
prediction = tf.reshape(prediction, shape = [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
# getting probability distribution
mat1 = tf.cast(tf.argmax(prediction,1),tf.float32)
correct = tf.equal(prediction, target)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
logits = tf.reshape(logits, shape=[batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target))
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
# returning components as dictionary elements
return {'input_data' : input_data,
'target' : target,
'dropout': keep_prob,
'loss': loss,
'ts': train_step,
'preds': prediction,
'accuracy': accuracy
}
def train_network(graph):
# initialize tensorflow session and all variables
# tf_gpu_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True)
# tf_gpu_config.gpu_options.allow_growth = True
# with tf.Session(config = tf_gpu_config) as sess:
with tf.Session() as sess:
train_instance = DataGenerator(training_data_size)
test_instance = DataGenerator(testing_data_size)
print ('Training data size: ', train_instance.data_size)
print ('Testing data size: ', test_instance.data_size)
sess.run(tf.global_variables_initializer())
print ('Starting session...')
step, accuracy = 0, 0
tr_losses, te_losses = [], []
current_epoch = 0
while current_epoch < epoch:
step += 1
trb = train_instance.next_batch()
feed = {g['input_data'] : trb[0], g['target'] : trb[1], g['dropout'] : 0.6}
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
accuracy += accuracy_
if train_instance.epochs > current_epoch:
current_epoch += 1
tr_losses.append(accuracy / step)
step, accuracy = 0, 0
#eval test set
te_epoch = test_instance.epochs
while test_instance.epochs == te_epoch:
step += 1
print ('Testing round ', step)
trc = test_instance.next_batch()
feed = {g['input_data']: trc[0], g['target']: trc[1]}
accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
accuracy += accuracy_
te_losses.append(accuracy / step)
step, accuracy = 0,0
print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
return tr_losses, te_losses
g = struct_network()
tr_losses, te_losses = train_network(g)
You defined keep_prob as a tf.constant, but then trying to feed the value into it. Replace keep_prob = tf.constant(1.0) with keep_prob = tf.placeholder(tf.float32,[]) or keep_prob = tf.placeholder_with_default(1.0,[])
I am trying to change gradients in tensorflow and after that trying to update with applyGradient() function. This is my code and it doesnt work
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
for j in range(n_rounds):
sample = np.random.randint(row, size=int(batch_size))
batch_xs = temp[sample][:]
batch_ys = output[sample][:]
vars_with_grads = sess.run(optimizer.compute_gradients(cross_entropy), feed_dict={x: batch_xs, y_: batch_ys})
noiseAddedGradient = []
print(vars_with_grads)
for var in vars_with_grads:
gaussianNoise = [np.random.normal(MEAN_FOR_AUTOENCODER, SCALE_FOR_AUTOENCODER, var[0].shape) for i in range(int(batch_size))]
totalGaussianNoise = [sum(x) for x in zip(*gaussianNoise)]
averageGaussianNoise = [x1 / float(batch_size) for x1 in totalGaussianNoise]
averageGaussianNoiseList = np.array(averageGaussianNoise).flatten().reshape(var[0].shape)
noiseAddedGradient.append((tf.Variable(np.add(var[0], averageGaussianNoiseList), dtype=np.float32), var[1]))
appliedGradient = sess.run(optimizer.apply_gradients(noiseAddedGradient))
it returns error function:
Traceback (most recent call last):
File "/home/Downloads/objectPerturbation.py", line 214, in <module>
appliedGradient = sess.run(optimizer.apply_gradients(noiseAddedGradient))
File "/home/anaconda2/envs/tensorflow/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 384, in apply_gradients
p = _get_processor(v)
File "/home/anaconda2/envs/tensorflow/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 98, in _get_processor
if v.op.type == "ReadVariableOp":
AttributeError: 'numpy.ndarray' object has no attribute 'op'
Can you please help me?
Try chaging it like this. The gradients should be computed in the graph.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
grads = optimizer.compute_gradients(cross_entropy)
grad_placeholder = [(tf.placeholder("float", shape=grad[1].get_shape()), grad[1] for grad in grads]
apply_placeholder_op = opt.apply_gradients(grad_placeholder)
#added in case you don't do this
sess.run(tf.initialize_all_variables())
for j in range(n_rounds):
sample = np.random.randint(row, size=int(batch_size))
batch_xs = temp[sample][:]
batch_ys = output[sample][:]
vars_with_grads = sess.run(grads, feed_dict={x: batch_xs, y_: batch_ys})
#Add gaussian noise to gradients
feed_dict = {}
for i in range(len(grad_placeholder)):
feed_dict[grad_placeholder[i][0]] = add_gaussian_noise_fn(grad_vals[i])
sess.run(apply_placeholder_op, feed_dict=feed_dict)
#separate function to make it more general to do whatever you want with grads
def add_gaussian_noise_fn(x):
return x + np.random.normal(size=x.shape)
Idea is similar to this previous post:
Efficiently grab gradients from TensorFlow?
I'm in high school and I'm trying to do a project involving neural networks. I am using Ubuntu and trying to do reinforcement learning with tensorflow, but I consistently get lots of underrun warnings when I train a neural network. They take the form of ALSA lib pcm.c:7963:(snd_pcm_recover) underrun occurred. This message is printed to the screen more and more frequently as training progresses. Eventually, I get a ResourceExhaustedError and the program terminates. Here is the full error message:
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape[320000,512]
Traceback (most recent call last):
File "./train.py", line 121, in <module>
loss, _ = model.train(minibatch, gamma, sess) # Train the model based on the batch, the discount factor, and the tensorflow session.
File "/home/perrin/neural/dqn.py", line 174, in train
return sess.run([self.loss, self.optimize], feed_dict=self.feed_dict) # Runs the training. This is where the underrun errors happen
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 766, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 964, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1014, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1034, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[320000,512]
[[Node: gradients/fully_connected/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](dropout/mul, gradients/fully_connected/BiasAdd_grad/tuple/control_dependency)]]
Caused by op u'gradients/fully_connected/MatMul_grad/MatMul_1', defined at:
File "./train.py", line 72, in <module>
model = AC_Net([None, 201, 201, 3], 5, trainer) # This creates the neural network using the imported AC_Net class.
File "/home/perrin/neural/dqn.py", line 128, in __init__
self.optimize = trainer.minimize(self.loss) # This tells the trainer to adjust the weights in such a way as to minimize the loss. This is what actually
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 269, in minimize
grad_loss=grad_loss)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 335, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients_impl.py", line 482, in gradients
in_grads = grad_fn(op, *out_grads)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_grad.py", line 731, in _MatMulGrad
math_ops.matmul(op.inputs[0], grad, transpose_a=True))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1729, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1442, in _mat_mul
transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
self._traceback = _extract_stack()
...which was originally created as op u'fully_connected/MatMul', defined at:
File "./train.py", line 72, in <module>
model = AC_Net([None, 201, 201, 3], 5, trainer) # This creates the neural network using the imported AC_Net class.
File "/home/perrin/neural/dqn.py", line 63, in __init__
net = slim.fully_connected(net, 512, activation_fn=tf.nn.elu, scope='fully_connected') # Feeds the input through a fully connected layer
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 177, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1350, in fully_connected
outputs = standard_ops.matmul(inputs, weights)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1729, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1442, in _mat_mul
transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[320000,512]
[[Node: gradients/fully_connected/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](dropout/mul, gradients/fully_connected/BiasAdd_grad/tuple/control_dependency)]]
I researched these problems but didn't get a clear idea of how I could fix them. I am pretty new to programming so I don't know much about how buffers and data reading/writing works. I am very perplexed by these errors. Does anyone know what parts of my code might be causing this and how to fix it? Thanks for taking the time to consider this question!
Here is my code for defining the neural network (based on this tutorial):
#! /usr/bin/python
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
# The neural network
class AC_Net:
# This defines the actual neural network.
# output_size: the number of outputs of the policy
# trainer: the tensorflow training optimizer used by the network
def __init__(self, input_shape, output_size, trainer):
with tf.name_scope('input'):
self.input = tf.placeholder(shape=list(input_shape), dtype=tf.float32, name='input')
net = tf.image.per_image_standardization(self.input[0])
net = tf.expand_dims(net, [0])
with tf.name_scope('convolution'):
net = slim.conv2d(net, 32, [8, 8], activation_fn=tf.nn.elu, scope='conv')
net = slim.max_pool2d(net, [2, 2], scope='pool')
net = slim.flatten(net)
net = tf.nn.dropout(net, .5)
net = slim.fully_connected(net, 512, activation_fn=tf.nn.elu, scope='fully_connected')
net = tf.nn.dropout(net, .5)
with tf.name_scope('LSTM'):
cell = tf.nn.rnn_cell.BasicLSTMCell(256, state_is_tuple=True, activation=tf.nn.elu)
with tf.name_scope('state_in'):
state_in = cell.zero_state(tf.shape(net)[0], tf.float32)
net = tf.expand_dims(net, [0])
step_size = tf.shape(self.input)[:1]
output, state = tf.nn.dynamic_rnn(cell, net, initial_state=state_in, sequence_length=step_size, time_major=False, scope='LSTM')
out = tf.reshape(output, [-1, 256])
out = tf.nn.dropout(out, .5)
self.policy = slim.fully_connected(out, output_size, activation_fn=tf.nn.softmax, scope='policy')
self.value = slim.fully_connected(out, 1, activation_fn=None, scope='value')
# Defines the loss functions
with tf.name_scope('loss_function'):
self.target_values = tf.placeholder(dtype=tf.float32, name='target_values') # The target value is the discounted reward.
self.actions = tf.placeholder(dtype=tf.int32, name='actions') # This is the network's policy.
# The advantage is the difference between what the network thought the value of an action was, and what it actually was.
# It is computed as R - V(s), where R is the discounted reward and V(s) is the value of being in state s.
self.advantages = tf.placeholder(dtype=tf.float32, name='advantages')
with tf.name_scope('entropy'):
entropy = -tf.reduce_sum(tf.log(self.policy + 1e-10) * self.policy)
with tf.name_scope('responsible_actions'):
actions_onehot = tf.one_hot(self.actions, output_size, dtype=tf.float32)
responsible_actions = tf.reduce_sum(self.policy * actions_onehot, [1]) # This returns only the actions that were selected.
with tf.name_scope('loss'):
with tf.name_scope('value_loss'):
self.value_loss = tf.reduce_sum(tf.square(self.target_values - tf.reshape(self.value, [-1])))
with tf.name_scope('policy_loss'):
self.policy_loss = -tf.reduce_sum(tf.log(responsible_actions + 1e-10) * self.advantages)
with tf.name_scope('total_loss'):
self.loss = self.value_loss + self.policy_loss - entropy * .01
tf.summary.scalar('loss', self.loss)
with tf.name_scope('gradient_clipping'):
tvars = tf.trainable_variables()
grads = tf.gradients(self.loss, tvars)
grads, _ = tf.clip_by_global_norm(grads, 20.)
self.optimize = trainer.apply_gradients(zip(grads, tvars))
def predict(self, inputs, sess):
return sess.run([self.policy, self.value], feed_dict={self.input:inputs})
def train(self, train_batch, gamma, sess):
inputs = train_batch[:, 0]
actions = train_batch[:, 1]
rewards = train_batch[:, 2]
values = train_batch[:, 4]
discounted_rewards = rewards[::-1]
for i, j in enumerate(discounted_rewards):
if i > 0:
discounted_rewards[i] += discounted_rewards[i - 1] * gamma
discounted_rewards = np.array(discounted_rewards, np.float32)[::-1]
advantages = discounted_rewards - values
self.feed_dict = {
self.input:np.vstack(inputs),
self.target_values:discounted_rewards,
self.actions:actions,
self.advantages:advantages
}
return sess.run([self.loss, self.optimize], feed_dict=self.feed_dict)
Here is my code for training the neural network:
#! /usr/bin/python
import game_env, move_right, move_right_with_obs, random, inspect, os
import tensorflow as tf
import numpy as np
from dqn import AC_Net
def process_outputs(x):
a = [int(x > 2), int(x%2 == 0 and x > 0)*2-int(x > 0)]
return a
environment = game_env # The environment to use
env_name = str(inspect.getmodule(environment).__name__) # The name of the environment
ep_length = 2000
num_episodes = 20
total_steps = ep_length * num_episodes # The total number of steps
model_path = '/home/perrin/neural/nn/' + env_name
learning_rate = 1e-4 # The learning rate
trainer = tf.train.AdamOptimizer(learning_rate=learning_rate) # The gradient descent optimizer used
first_epsilon = 0.6 # The initial chance of random action
final_epsilon = 0.01 # The final chance of random action
gamma = 0.9
anneal_steps = 35000 # The number of steps it takes to go from initial to random
count = 0 # Keeps track of the number of steps we've run
experience_buffer = [] # Stores the agent's experiences in a list
buffer_size = 10000 # How large the experience buffer can be
train_step = 256 # How often to train the model
batches_per_train = 10
save_step = 500 # How often to save the trained model
batch_size = 256 # How many experiences to train on at once
env_size = 500 # How many pixels tall and wide the environment should be.
load_model = True # Whether or not to load a pretrained model
train = True # Whether or not to train the model
test = False # Whether or not to test the model
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = AC_Net([None, 201, 201, 3], 5, trainer)
env = environment.Env(env_size)
action = [0, 0]
state, _ = env.step(True, action)
saver = tf.train.Saver() # This saves the model
epsilon = first_epsilon
tf.global_variables_initializer().run()
if load_model:
ckpt = tf.train.get_checkpoint_state(model_path)
saver.restore(sess, ckpt.model_checkpoint_path)
print 'Model loaded'
prev_out = None
while count <= total_steps and train:
if random.random() < epsilon or count == 0:
if prev_out is not None:
out = prev_out
if random.randint(0, 100) == 100 or prev_out is None:
out = np.random.rand(5)
out = np.array([val/np.sum(out) for val in out])
_, value = model.predict(state, sess)
prev_out = out
else:
out, value = model.predict(state, sess)
out = out[0]
act = np.random.choice(out, p=out)
act = np.argmax(out == act)
act1 = process_outputs(act)
action[act1[0]] = act1[1]
_, reward = env.step(True, action)
new_state = env.get_state()
experience_buffer.append((state, act, reward, new_state, value[0, 0]))
state = new_state
if len(experience_buffer) > buffer_size:
experience_buffer.pop(0)
if count % train_step == 0 and count > 0:
print "Training model"
for i in range(batches_per_train):
# Get a random sample of experiences and train the model based on it.
x = random.randint(0, len(experience_buffer)-batch_size)
minibatch = np.array(experience_buffer[x:x+batch_size])
loss, _ = model.train(minibatch, gamma, sess)
print "Loss for batch", str(i+1) + ":", loss
if count % save_step == 0 and count > 0:
saver.save(sess, model_path+'/model-'+str(count)+'.ckpt')
print "Model saved"
if count % ep_length == 0 and count > 0:
print "Starting new episode"
env = environment.Env(env_size)
if epsilon > final_epsilon:
epsilon -= (first_epsilon - final_epsilon)/anneal_steps
count += 1
while count <= total_steps and test:
out, _ = model.predict(state, sess)
out = out[0]
act = np.random.choice(out, p=out)
act = np.argmax(out == act)
act1 = process_outputs(act)
action[act1[0]] = act1[1]
state, reward = env.step(True, action)
new_state = env.get_state()
count += 1
# Write log files to create tensorboard visualizations
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('/home/perrin/neural/summaries', sess.graph)
if train:
summary = sess.run(merged, feed_dict=model.feed_dict)
writer.add_summary(summary)
writer.flush()
You are running out of memory. It's possible that your network requires more memory than you have to run, so the first step to tracking down excessive memory usage is to figure out what is using so much memory.
Here's one approach that uses timeline and statssummarizer:
https://gist.github.com/yaroslavvb/08afccbe087171881ceafc0c98abca05
This will print out several tables, one of the tables is the tensors sorted by top memory usage. You should check that you don't have something unusually large in there.
You can also see memory timeline using Chrome visualizer, as detailed here
A more advanced technique is to plot a timeline of memory allocations/deallocations, as done in this issue
Theoretically your memory usage shouldn't grow between steps if you aren't creating new stateful ops (Variables), but I found that global memory allocation can grow if sizes of your tensors change between steps.
A work-around is to periodically save your parameters to checkpoint and restart your script.