Repeated "Init failure" errors with AttributeError on "__dict__" - python-2.7

I have a codebase written in Python 2 that uses an old version of PyMC, probably 2.x.
When I run
python run.py
I get the following output and error:
Init failure
Init failure
Init failure
Init failure
Init failure
Init failure
Init failure
Init failure
No previous MCMC data found.
Traceback (most recent call last):
  File "run.py", line 106, in <module>
    M=run_MCMC(ms)
  File "run.py", line 94, in run_MCMC
    mcmc = pm.MCMC(model, db=db, name=name)
  File "/home/divyadeep/miniconda3/envs/detrital/lib/python2.7/site-packages/pymc/MCMC.py", line 90, in __init__
    **kwds)
  File "/home/divyadeep/miniconda3/envs/detrital/lib/python2.7/site-packages/pymc/Model.py", line 191, in __init__
    Model.__init__(self, input, name, verbose)
  File "/home/divyadeep/miniconda3/envs/detrital/lib/python2.7/site-packages/pymc/Model.py", line 92, in __init__
    ObjectContainer.__init__(self, input)
  File "/home/divyadeep/miniconda3/envs/detrital/lib/python2.7/site-packages/pymc/Container.py", line 605, in __init__
    input_to_file = input.__dict__
AttributeError: 'NoneType' object has no attribute '__dict__'
I have tried commenting out some of the 'init' code in the program, but I am still not able to run it.
run.py is as follows:
def InitExhumation(settings):
    """Initialize piece-wise linear exhumation model"""
    #Check that erosion and age break priors are meaningful
    if (settings.erate_prior[0] >= settings.erate_prior[1]):
        print "\nInvalid range for erate_prior."
        sys.exit()
    if (settings.abr_prior[0] >= settings.abr_prior[1]):
        print "\nInvalid range for abr_prior."
        sys.exit()
    #Create erosion rate parameters (e1, e2, ...)
    e = []
    for i in range(1, settings.breaks+2):
        e.append(pm.Uniform("e%i" % i, settings.erate_prior[0], settings.erate_prior[1]))
    #Create age break parameters (abr1, ...)
    abr_i = settings.abr_prior[0]
    abr = []
    for i in range(1, settings.breaks+1):
        abr_i = pm.Uniform("abr%i" % i, abr_i, settings.abr_prior[1])
        abr.append(abr_i)
    return e, abr

def ExhumationModel(settings):
    """Set up the exhumation model"""
    #Check that error rate priors are meaningful
    if (settings.error_prior[0] >= settings.error_prior[1]):
        print "\nInvalid range for error_prior."
        sys.exit()
    err = pm.Uniform('RelErr', settings.error_prior[0], settings.error_prior[1])
    #Closure elevation priors
    hc_parms = {'AFT': [3.7, 0.8, 6.0, 2.9], 'AHe': [2.2, 0.5, 3.7, 1.6]}
    e, abr = InitExhumation(settings)
    nodes = [err, e, abr]
    hc = {}
    for sample in settings.samples:
        parms = e[:]
        h_mu = np.mean(sample.catchment.z)
        if sample.tc_type not in hc.keys():
            hc[sample.tc_type] = pm.TruncatedNormal("hc_%s" % sample.tc_type,
                                                    h_mu-hc_parms[sample.tc_type][0],
                                                    1/hc_parms[sample.tc_type][1]**2,
                                                    h_mu-hc_parms[sample.tc_type][2],
                                                    h_mu-hc_parms[sample.tc_type][3])
            nodes.append(hc[sample.tc_type])
        parms.append(hc[sample.tc_type])
        parms.extend(abr)
        if isinstance(sample, DetritalSample):
            idx_i = pm.Categorical("Index_" + sample.sample_name, p=sample.catchment.bins['w'], size=len(sample.dt_ages))
            nodes.extend([idx_i])
            exp_i = pm.Lambda("ExpAge_" + sample.sample_name, lambda parm=parms, idx=idx_i: ba.h2a(sample.catchment.bins['h'][idx], parm))
            value = sample.dt_ages
        else:
            idx_i = None
            exp_i = pm.Lambda("ExpAge_" + sample.sample_name, lambda parm=parms: ba.h2a(sample.br_elevation, parm), plot=False)
            value = sample.br_ages
        obs_i = pm.Normal("ObsAge_" + sample.sample_name, mu=exp_i, tau=1./(err*exp_i)**2, value=value, observed=True)
        sim_i = pm.Lambda("SimAge_" + sample.sample_name, lambda ta=exp_i, err=err: pm.rnormal(mu=ta, tau=1./(err*ta)**2))
        nodes.extend([exp_i, obs_i, sim_i])
    return nodes

def run_MCMC(settings):
    """Run MCMC algorithm"""
    burn = settings.iterations/2
    thin = (settings.iterations-burn) / settings.finalChainSize
    name = "%s" % settings.model_name + "_%ibrk" % settings.breaks
    attempt = 0
    model = None
    while attempt < 5000:
        try:
            model = ExhumationModel(settings)
            break
        except pm.ZeroProbability, ValueError:
            attempt += 1
            #print "Init failure %i" % attemp
            print "Init failure "
    try:
        #The following creates text files for the chains rather than hdf5
        db = pm.database.txt.load(name + '.txt')
        #db = pm.database.hdf5.load(name + '.hdf5')
        print "\nExisting MCMC data loaded.\n"
    except AttributeError:
        print "\nNo previous MCMC data found.\n"
        db = 'txt'
    mcmc = pm.MCMC(model, db=db, name=name)
    #mcmc.use_step_method(pm.AdaptiveMetropolis, M.parm)
    if settings.iterations > 1:
        mcmc.sample(settings.iterations, burn=burn, thin=thin)
    return mcmc

if __name__ == '__main__':
    sys.path[0:0] = './'  # Puts current directory at the start of path
    import model_setup as ms
    if len(sys.argv) > 1: ms.iterations = int(sys.argv[1])
    M = run_MCMC(ms)
    #import pdb; pdb.set_trace()
    #Output and diagnostics
    try:
        ba.statistics(M, ms.samples)
    except TypeError:
        print "\nCannot compute stats without resampling (PyMC bug?).\n"
    ps.chains(M, ms.finalChainSize, ms.iterations, ms.samples, ms.output_format)
    ps.summary(M, ms.samples, ms.output_format)
    ps.ks_gof(M, ms.samples, ms.output_format)
    ps.histograms(ms.samples, ms.show_histogram, ms.output_format)
    ps.discrepancy(M, ms.samples, ms.output_format)
    ## ps.unorthodox_ks(M, ms.output_format)
    ## try:
    ##     ps.catchment(M.catchment_dem, format=ms.output_format)
    ## except KeyError:
    ##     print "\nUnable to generate catchment plot."
    M.db.close()
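For reference, the final AttributeError means pm.MCMC received model=None: with the structure above, if ExhumationModel never returns successfully inside the retry loop, model is still None when pm.MCMC(model, ...) runs and it fails on input.__dict__. A minimal, hypothetical guard (not part of the original script) placed just before the mcmc = pm.MCMC(...) line would make that failure explicit:

    # hypothetical guard, not in the original run.py
    if model is None:
        print "Model could not be initialized after %i attempts; check the priors." % attempt
        sys.exit(1)

Note also that except pm.ZeroProbability, ValueError: is Python 2 syntax that catches only pm.ZeroProbability and binds the exception to the name ValueError; catching both exception types would require except (pm.ZeroProbability, ValueError):.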

Related

Tensorflow 1.0 Seq2Seq Decoder function

I'm trying to make a Seq2Seq Regression example for time-series analysis and I've used the Seq2Seq library as presented at the Dev Summit, which is currently the code on the Tensorflow GitHub branch r1.0.
I have difficulties understanding how the decoder function works for Seq2Seq, specifically for the "cell_output".
I understand that the num_decoder_symbols is the number of classes/words to decode at each time step. I have it working at a point where I can do training. However, I don't get why I can't just substitute the number of features (num_features) instead of num_decoder_symbols. Basically, I want to be able to run the decoder without teacher forcing, in other words pass the output of the previous time step as the input to the next time step.
with ops.name_scope(name, "simple_decoder_fn_inference",
                    [time, cell_state, cell_input, cell_output,
                     context_state]):
    if cell_input is not None:
        raise ValueError("Expected cell_input to be None, but saw: %s" %
                         cell_input)
    if cell_output is None:
        # invariant that this is time == 0
        next_input_id = array_ops.ones([batch_size,], dtype=dtype) * (
            start_of_sequence_id)
        done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
        cell_state = encoder_state
        cell_output = array_ops.zeros([num_decoder_symbols],
                                      dtype=dtypes.float32)
Here is a link to the original code: https://github.com/tensorflow/tensorflow/blob/r1.0/tensorflow/contrib/seq2seq/python/ops/decoder_fn.py
Why don't I need to pass batch_size for the cell output?
cell_output = array_ops.zeros([batch_size, num_decoder_symbols],
dtype=dtypes.float32)
When trying to use this code to create my own regression Seq2Seq example, the output at each step is a real-valued vector of dimension num_features rather than an array of class probabilities. As I understood it, I thought I could simply replace num_decoder_symbols with num_features, like below:
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
    """
    Again same as in simple_decoder_fn_inference but for regression on sequences with a fixed length
    """
    with ops.name_scope(name, "simple_decoder_fn_inference", [time, cell_state, cell_input, cell_output, context_state]):
        if cell_input is not None:
            raise ValueError("Expected cell_input to be None, but saw: %s" % cell_input)
        if cell_output is None:
            # invariant that this is time == 0
            next_input = array_ops.ones([batch_size, num_features], dtype=dtype)
            done = array_ops.zeros([batch_size], dtype=dtypes.bool)
            cell_state = encoder_state
            cell_output = array_ops.zeros([num_features], dtype=dtypes.float32)
        else:
            cell_output = output_fn(cell_output)
            done = math_ops.equal(0, 1)  # hardcoded hack just to properly define done
            next_input = cell_output
        # if time > maxlen, return all true vector
        done = control_flow_ops.cond(math_ops.greater(time, maximum_length),
                                     lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
                                     lambda: done)
        return (done, cell_state, next_input, cell_output, context_state)
return decoder_fn
But, I get the following error:
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/contrib/seq2seq/python/ops/seq2seq.py", line 212, in dynamic_rnn_decoder
swap_memory=swap_memory, scope=scope)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 1036, in raw_rnn
swap_memory=swap_memory)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2605, in while_loop
result = context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2438, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2388, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 980, in body
(next_output, cell_state) = cell(current_input, state)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 327, in __call__
input_size = inputs.get_shape().with_rank(2)[1]
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 635, in with_rank
raise ValueError("Shape %s must have rank %d" % (self, rank))
ValueError: Shape (100,) must have rank 2
As a result, I passed in the batch_size like this in order to get a Shape of rank 2:
cell_output = array_ops.zeros([batch_size, num_features],
dtype=dtypes.float32)
But then I get the following error, where the shape has rank 3 while rank 2 is expected:
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/contrib/seq2seq/python/ops/seq2seq.py", line 212, in dynamic_rnn_decoder
swap_memory=swap_memory, scope=scope)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 1036, in raw_rnn
swap_memory=swap_memory)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2605, in while_loop
result = context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2438, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2388, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 980, in body
(next_output, cell_state) = cell(current_input, state)
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 327, in __call__
input_size = inputs.get_shape().with_rank(2)[1]
File "/opt/DL/tensorflow/lib/python2.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 635, in with_rank
raise ValueError("Shape %s must have rank %d" % (self, rank))
ValueError: Shape (10, 10, 100) must have rank 2
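Not a fix, but the constraint behind both errors: the RNN cell invoked inside raw_rnn only accepts a rank-2 input of shape [batch_size, input_depth], so whatever decoder_fn hands back as next_input (and the initial input at time 0) must have exactly that shape; a rank-1 or rank-3 tensor fails at the with_rank(2) check shown in the tracebacks. A small standalone illustration (TF 1.0-era API, hypothetical sizes):

import tensorflow as tf

batch_size, num_features = 10, 100
cell = tf.contrib.rnn.LSTMCell(128)
state = cell.zero_state(batch_size, tf.float32)

ok_input = tf.zeros([batch_size, num_features])          # rank 2: accepted by the cell
out, new_state = cell(ok_input, state)

# bad_input = tf.zeros([batch_size, 10, num_features])   # rank 3: would raise a ValueError
# cell(bad_input, state)                                  # about the shape needing rank 2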

Buffer underrun and ResourceExhausted errors with tensorflow

I'm in high school and I'm trying to do a project involving neural networks. I am using Ubuntu and trying to do reinforcement learning with tensorflow, but I consistently get lots of underrun warnings when I train a neural network. They take the form of ALSA lib pcm.c:7963:(snd_pcm_recover) underrun occurred. This message is printed to the screen more and more frequently as training progresses. Eventually, I get a ResourceExhaustedError and the program terminates. Here is the full error message:
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape[320000,512]
Traceback (most recent call last):
File "./train.py", line 121, in <module>
loss, _ = model.train(minibatch, gamma, sess) # Train the model based on the batch, the discount factor, and the tensorflow session.
File "/home/perrin/neural/dqn.py", line 174, in train
return sess.run([self.loss, self.optimize], feed_dict=self.feed_dict) # Runs the training. This is where the underrun errors happen
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 766, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 964, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1014, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1034, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[320000,512]
[[Node: gradients/fully_connected/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](dropout/mul, gradients/fully_connected/BiasAdd_grad/tuple/control_dependency)]]
Caused by op u'gradients/fully_connected/MatMul_grad/MatMul_1', defined at:
File "./train.py", line 72, in <module>
model = AC_Net([None, 201, 201, 3], 5, trainer) # This creates the neural network using the imported AC_Net class.
File "/home/perrin/neural/dqn.py", line 128, in __init__
self.optimize = trainer.minimize(self.loss) # This tells the trainer to adjust the weights in such a way as to minimize the loss. This is what actually
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 269, in minimize
grad_loss=grad_loss)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 335, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients_impl.py", line 482, in gradients
in_grads = grad_fn(op, *out_grads)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_grad.py", line 731, in _MatMulGrad
math_ops.matmul(op.inputs[0], grad, transpose_a=True))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1729, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1442, in _mat_mul
transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
self._traceback = _extract_stack()
...which was originally created as op u'fully_connected/MatMul', defined at:
File "./train.py", line 72, in <module>
model = AC_Net([None, 201, 201, 3], 5, trainer) # This creates the neural network using the imported AC_Net class.
File "/home/perrin/neural/dqn.py", line 63, in __init__
net = slim.fully_connected(net, 512, activation_fn=tf.nn.elu, scope='fully_connected') # Feeds the input through a fully connected layer
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 177, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1350, in fully_connected
outputs = standard_ops.matmul(inputs, weights)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1729, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1442, in _mat_mul
transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[320000,512]
[[Node: gradients/fully_connected/MatMul_grad/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=true, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](dropout/mul, gradients/fully_connected/BiasAdd_grad/tuple/control_dependency)]]
I researched these problems but didn't get a clear idea of how I could fix them. I am pretty new to programming so I don't know much about how buffers and data reading/writing work. I am very perplexed by these errors. Does anyone know what parts of my code might be causing this and how to fix it? Thanks for taking the time to consider this question!
Here is my code for defining the neural network (based on this tutorial):
#! /usr/bin/python
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
# The neural network
class AC_Net:
# This defines the actual neural network.
# output_size: the number of outputs of the policy
# trainer: the tensorflow training optimizer used by the network
def __init__(self, input_shape, output_size, trainer):
with tf.name_scope('input'):
self.input = tf.placeholder(shape=list(input_shape), dtype=tf.float32, name='input')
net = tf.image.per_image_standardization(self.input[0])
net = tf.expand_dims(net, [0])
with tf.name_scope('convolution'):
net = slim.conv2d(net, 32, [8, 8], activation_fn=tf.nn.elu, scope='conv')
net = slim.max_pool2d(net, [2, 2], scope='pool')
net = slim.flatten(net)
net = tf.nn.dropout(net, .5)
net = slim.fully_connected(net, 512, activation_fn=tf.nn.elu, scope='fully_connected')
net = tf.nn.dropout(net, .5)
with tf.name_scope('LSTM'):
cell = tf.nn.rnn_cell.BasicLSTMCell(256, state_is_tuple=True, activation=tf.nn.elu)
with tf.name_scope('state_in'):
state_in = cell.zero_state(tf.shape(net)[0], tf.float32)
net = tf.expand_dims(net, [0])
step_size = tf.shape(self.input)[:1]
output, state = tf.nn.dynamic_rnn(cell, net, initial_state=state_in, sequence_length=step_size, time_major=False, scope='LSTM')
out = tf.reshape(output, [-1, 256])
out = tf.nn.dropout(out, .5)
self.policy = slim.fully_connected(out, output_size, activation_fn=tf.nn.softmax, scope='policy')
self.value = slim.fully_connected(out, 1, activation_fn=None, scope='value')
# Defines the loss functions
with tf.name_scope('loss_function'):
self.target_values = tf.placeholder(dtype=tf.float32, name='target_values') # The target value is the discounted reward.
self.actions = tf.placeholder(dtype=tf.int32, name='actions') # This is the network's policy.
# The advantage is the difference between what the network thought the value of an action was, and what it actually was.
# It is computed as R - V(s), where R is the discounted reward and V(s) is the value of being in state s.
self.advantages = tf.placeholder(dtype=tf.float32, name='advantages')
with tf.name_scope('entropy'):
entropy = -tf.reduce_sum(tf.log(self.policy + 1e-10) * self.policy)
with tf.name_scope('responsible_actions'):
actions_onehot = tf.one_hot(self.actions, output_size, dtype=tf.float32)
responsible_actions = tf.reduce_sum(self.policy * actions_onehot, [1]) # This returns only the actions that were selected.
with tf.name_scope('loss'):
with tf.name_scope('value_loss'):
self.value_loss = tf.reduce_sum(tf.square(self.target_values - tf.reshape(self.value, [-1])))
with tf.name_scope('policy_loss'):
self.policy_loss = -tf.reduce_sum(tf.log(responsible_actions + 1e-10) * self.advantages)
with tf.name_scope('total_loss'):
self.loss = self.value_loss + self.policy_loss - entropy * .01
tf.summary.scalar('loss', self.loss)
with tf.name_scope('gradient_clipping'):
tvars = tf.trainable_variables()
grads = tf.gradients(self.loss, tvars)
grads, _ = tf.clip_by_global_norm(grads, 20.)
self.optimize = trainer.apply_gradients(zip(grads, tvars))
def predict(self, inputs, sess):
return sess.run([self.policy, self.value], feed_dict={self.input:inputs})
def train(self, train_batch, gamma, sess):
inputs = train_batch[:, 0]
actions = train_batch[:, 1]
rewards = train_batch[:, 2]
values = train_batch[:, 4]
discounted_rewards = rewards[::-1]
for i, j in enumerate(discounted_rewards):
if i > 0:
discounted_rewards[i] += discounted_rewards[i - 1] * gamma
discounted_rewards = np.array(discounted_rewards, np.float32)[::-1]
advantages = discounted_rewards - values
self.feed_dict = {
self.input:np.vstack(inputs),
self.target_values:discounted_rewards,
self.actions:actions,
self.advantages:advantages
}
return sess.run([self.loss, self.optimize], feed_dict=self.feed_dict)
Here is my code for training the neural network:
#! /usr/bin/python
import game_env, move_right, move_right_with_obs, random, inspect, os
import tensorflow as tf
import numpy as np
from dqn import AC_Net
def process_outputs(x):
a = [int(x > 2), int(x%2 == 0 and x > 0)*2-int(x > 0)]
return a
environment = game_env # The environment to use
env_name = str(inspect.getmodule(environment).__name__) # The name of the environment
ep_length = 2000
num_episodes = 20
total_steps = ep_length * num_episodes # The total number of steps
model_path = '/home/perrin/neural/nn/' + env_name
learning_rate = 1e-4 # The learning rate
trainer = tf.train.AdamOptimizer(learning_rate=learning_rate) # The gradient descent optimizer used
first_epsilon = 0.6 # The initial chance of random action
final_epsilon = 0.01 # The final chance of random action
gamma = 0.9
anneal_steps = 35000 # The number of steps it takes to go from initial to random
count = 0 # Keeps track of the number of steps we've run
experience_buffer = [] # Stores the agent's experiences in a list
buffer_size = 10000 # How large the experience buffer can be
train_step = 256 # How often to train the model
batches_per_train = 10
save_step = 500 # How often to save the trained model
batch_size = 256 # How many experiences to train on at once
env_size = 500 # How many pixels tall and wide the environment should be.
load_model = True # Whether or not to load a pretrained model
train = True # Whether or not to train the model
test = False # Whether or not to test the model
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = AC_Net([None, 201, 201, 3], 5, trainer)
env = environment.Env(env_size)
action = [0, 0]
state, _ = env.step(True, action)
saver = tf.train.Saver() # This saves the model
epsilon = first_epsilon
tf.global_variables_initializer().run()
if load_model:
ckpt = tf.train.get_checkpoint_state(model_path)
saver.restore(sess, ckpt.model_checkpoint_path)
print 'Model loaded'
prev_out = None
while count <= total_steps and train:
if random.random() < epsilon or count == 0:
if prev_out is not None:
out = prev_out
if random.randint(0, 100) == 100 or prev_out is None:
out = np.random.rand(5)
out = np.array([val/np.sum(out) for val in out])
_, value = model.predict(state, sess)
prev_out = out
else:
out, value = model.predict(state, sess)
out = out[0]
act = np.random.choice(out, p=out)
act = np.argmax(out == act)
act1 = process_outputs(act)
action[act1[0]] = act1[1]
_, reward = env.step(True, action)
new_state = env.get_state()
experience_buffer.append((state, act, reward, new_state, value[0, 0]))
state = new_state
if len(experience_buffer) > buffer_size:
experience_buffer.pop(0)
if count % train_step == 0 and count > 0:
print "Training model"
for i in range(batches_per_train):
# Get a random sample of experiences and train the model based on it.
x = random.randint(0, len(experience_buffer)-batch_size)
minibatch = np.array(experience_buffer[x:x+batch_size])
loss, _ = model.train(minibatch, gamma, sess)
print "Loss for batch", str(i+1) + ":", loss
if count % save_step == 0 and count > 0:
saver.save(sess, model_path+'/model-'+str(count)+'.ckpt')
print "Model saved"
if count % ep_length == 0 and count > 0:
print "Starting new episode"
env = environment.Env(env_size)
if epsilon > final_epsilon:
epsilon -= (first_epsilon - final_epsilon)/anneal_steps
count += 1
while count <= total_steps and test:
out, _ = model.predict(state, sess)
out = out[0]
act = np.random.choice(out, p=out)
act = np.argmax(out == act)
act1 = process_outputs(act)
action[act1[0]] = act1[1]
state, reward = env.step(True, action)
new_state = env.get_state()
count += 1
# Write log files to create tensorboard visualizations
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('/home/perrin/neural/summaries', sess.graph)
if train:
summary = sess.run(merged, feed_dict=model.feed_dict)
writer.add_summary(summary)
writer.flush()
You are running out of memory. It's possible that your network simply requires more memory than you have available, so the first step in tracking down excessive memory usage is to figure out what is using so much memory.
Here's one approach that uses timeline and statssummarizer:
https://gist.github.com/yaroslavvb/08afccbe087171881ceafc0c98abca05
This will print out several tables; one of them lists the tensors sorted by peak memory usage. You should check that you don't have something unusually large in there.
You can also see the memory timeline using the Chrome trace visualizer, as detailed here
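For reference, a minimal sketch of capturing such a Chrome-trace timeline in TF 1.x (the matmul below is just a stand-in for the real training step):

import tensorflow as tf
from tensorflow.python.client import timeline

x = tf.random_normal([1000, 1000])
y = tf.matmul(x, x)  # stand-in for sess.run([self.loss, self.optimize], ...)

with tf.Session() as sess:
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    sess.run(y, options=run_options, run_metadata=run_metadata)
    # Write a Chrome trace: open chrome://tracing and load timeline.json
    tl = timeline.Timeline(run_metadata.step_stats)
    with open('timeline.json', 'w') as f:
        f.write(tl.generate_chrome_trace_format())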
A more advanced technique is to plot a timeline of memory allocations/deallocations, as done in this issue
Theoretically your memory usage shouldn't grow between steps if you aren't creating new stateful ops (Variables), but I found that global memory allocation can grow if sizes of your tensors change between steps.
A work-around is to periodically save your parameters to checkpoint and restart your script.

Variable scopes in Tensorflow

I am having problems making effective use of variable scopes. I want to define some variables for the weights, biases and inner state of a simple recurrent network. I call get_saver() once after defining the default graph, and then iterate over a batch of samples using tf.scan.
import tensorflow as tf
import math
import numpy as np
INPUTS = 10
HIDDEN_1 = 2
BATCH_SIZE = 3
def batch_vm2(m, x):
[input_size, output_size] = m.get_shape().as_list()
input_shape = tf.shape(x)
batch_rank = input_shape.get_shape()[0].value - 1
batch_shape = input_shape[:batch_rank]
output_shape = tf.concat(0, [batch_shape, [output_size]])
x = tf.reshape(x, [-1, input_size])
y = tf.matmul(x, m)
y = tf.reshape(y, output_shape)
return y
def get_saver():
with tf.variable_scope('h1') as scope:
weights = tf.get_variable('W', shape=[INPUTS, HIDDEN_1], initializer=tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(float(INPUTS))))
biases = tf.get_variable('bias', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0))
state = tf.get_variable('state', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0), trainable=False)
saver = tf.train.Saver([weights, biases, state])
return saver
def load(sess, saver, checkpoint_dir = None):
print("loading a session")
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
else:
raise Exception("no checkpoint found")
return
def iterate_state(prev_state_tuple, input):
with tf.variable_scope('h1') as scope:
scope.reuse_variables()
weights = tf.get_variable('W', shape=[INPUTS, HIDDEN_1], initializer=tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(float(INPUTS))))
biases = tf.get_variable('bias', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0))
state = tf.get_variable('state', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0), trainable=False)
print("input: ",input.get_shape())
matmuladd = batch_vm2(weights, input) + biases
matmulpri = tf.Print(matmuladd,[matmuladd], message=" malmul -> ")
#matmulvec = tf.reshape(matmuladd, [HIDDEN_1])
#state = tf.get_variable('state', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0))
print("prev state: ",prev_state_tuple.get_shape())
unpacked_state, unpacked_out = tf.split(0,2,prev_state_tuple)
prev_state = unpacked_state
state = state.assign( 4.2*(0.9* prev_state + 0.1*matmuladd) )
#output = tf.nn.relu(state)
output = tf.nn.tanh(state)
state = tf.Print(state, [state], message=" state -> ")
output = tf.Print(output, [output], message=" output -> ")
#output = matmulpri
print(" state: ", state.get_shape())
print(" output: ", output.get_shape())
concat_result = tf.concat(0,[state, output])
print (" concat return: ", concat_result.get_shape())
return concat_result
def data_iter():
while True:
idxs = np.random.rand(BATCH_SIZE, INPUTS)
yield idxs
with tf.Graph().as_default():
inputs = tf.placeholder(tf.float32, shape=(BATCH_SIZE, INPUTS))
saver = get_saver()
initial_state = tf.zeros([HIDDEN_1],
name='initial_state')
initial_out = tf.zeros([HIDDEN_1],
name='initial_out')
#concat_tensor = tf.concat(0,[initial_state, initial_out])
concat_tensor = tf.concat(0,[initial_state, initial_out])
print(" init state: ",initial_state.get_shape())
print(" init out: ",initial_out.get_shape())
print(" concat: ",concat_tensor.get_shape())
scanout = tf.scan(iterate_state, inputs, initializer=concat_tensor, name='state_scan')
print ("scanout shape: ", scanout.get_shape())
state, output = tf.split(1,2,scanout, name='split_scan_output')
print(" end state: ",state.get_shape())
print(" end out: ",output.get_shape())
#output,state,diagnostic = create_graph(inputs, state, prev_state)
sess = tf.Session()
# Run the Op to initialize the variables.
sess.run(tf.initialize_all_variables())
if False:
load(sess, saver)
iter_ = data_iter()
for i in xrange(0, 5):
print ("iteration: ",i)
input_data = iter_.next()
out,st,so = sess.run([output,state,scanout], feed_dict={ inputs: input_data})
saver.save(sess, 'my-model', global_step=1+i)
print("input vec: ", input_data)
print("state vec: ", st)
print("output vec: ", out)
print(" end state (runtime): ",st.shape)
print(" end out (runtime): ",out.shape)
print(" end scanout (runtime): ",so.shape)
My hope was that the variables retrieved via get_variable inside the scan op would be the same ones defined in the get_saver() call. However, if I run this sample code I get the following output and errors:
(' init state: ', TensorShape([Dimension(2)]))
(' init out: ', TensorShape([Dimension(2)]))
(' concat: ', TensorShape([Dimension(4)]))
Traceback (most recent call last):
File "cycles_in_graphs_with_scan.py", line 88, in <module>
scanout = tf.scan(iterate_state, inputs, initializer=concat_tensor, name='state_scan')
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/functional_ops.py", line 345, in scan
back_prop=back_prop, swap_memory=swap_memory)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1873, in while_loop
result = context.BuildLoop(cond, body, loop_vars)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1749, in BuildLoop
body_result = body(*vars_for_body_with_tensor_arrays)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/functional_ops.py", line 339, in compute
a = fn(a, elems_ta.read(i))
File "cycles_in_graphs_with_scan.py", line 47, in iterate_state
weights = tf.get_variable('W', shape=[INPUTS, HIDDEN_1], initializer=tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(float(INPUTS))))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 732, in get_variable
partitioner=partitioner, validate_shape=validate_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 596, in get_variable
partitioner=partitioner, validate_shape=validate_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 161, in get_variable
caching_device=caching_device, validate_shape=validate_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 454, in _get_single_variable
" Did you mean to set reuse=None in VarScope?" % name)
ValueError: Variable state_scan/h1/W does not exist, disallowed. Did you mean to set reuse=None in VarScope?
Any idea what I am doing wrong in this example?
if False:
    load(sess, saver)
These two lines lead to uninitialized variables.
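Separately, a minimal standalone reproduction of why the lookup itself fails may help: tf.get_variable resolves names against every enclosing variable scope, so the "h1/W" created in get_saver() is a different variable from the "state_scan/h1/W" requested inside the scope opened around the scan body (the scope behaviour is the same in TF 0.x and 1.x; scope names here are only illustrative):

import tensorflow as tf

with tf.variable_scope('h1'):
    w = tf.get_variable('W', shape=[2, 2])        # creates a variable named "h1/W"

with tf.variable_scope('state_scan'):             # stands in for the scope opened around the scan body
    with tf.variable_scope('h1', reuse=True):
        try:
            tf.get_variable('W', shape=[2, 2])    # asks for "state_scan/h1/W", which was never created
        except ValueError as e:
            print(e)                              # "Variable state_scan/h1/W does not exist ..."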

python - ZeroDivisionError

I created a script which copies data to a specific location. What I tried to do is show the progress via a progress bar, using this package: https://pypi.python.org/pypi/progressbar2
Here is my code:
src = raw_input("Enter source disk location: ")
src = os.path.abspath(src)
dst = raw_input("Enter first destination to copy: ")
dst = os.path.abspath(dst)
dest = raw_input("Enter second destination to move : ")
dest = os.path.abspath(dest)
for dir, dirs, files in os.walk(src):
    if any(f.endswith('.mdi') for f in files):
        dirs[:] = []  # do not recurse into subdirectories
        continue  # ignore this directory
    files = [os.path.join(dir, f) for f in files]
    progress, progress_maxval = 0, len(files)
    pbar = ProgressBar(widgets=['Progress ', Percentage(), Bar(), ' ', ETA(), ], maxval=progress_maxval).start()
    debug_status = ''
    for list in files:
        part1 = os.path.dirname(list)
        part2 = os.path.dirname(os.path.dirname(part1))
        part3 = os.path.split(part1)[1]
        path_miss1 = os.path.join(dst, "missing_mdi")
        # ---------first location-------------------#
        path_miss = os.path.join(path_miss1, part3)
        # ---------second location-------------------#
        path_missing = os.path.join(dest, "missing_mdi")
        try:
            # ---------first location-------------------#
            if not os.path.exists(path_miss):
                os.makedirs(path_miss)
            else:
                pass
            if os.path.exists(path_miss):
                distutils.dir_util.copy_tree(part1, path_miss)
            else:
                debug_status += "missing_file\n"
                pass
            if (get_size(path_miss)) == 0:
                os.rmdir(path_miss)
            else:
                pass
            # ---------second location-------------------#
            if not os.path.exists(path_missing):
                os.makedirs(path_missing)
            else:
                pass
            if os.path.exists(path_missing):
                shutil.move(part1, path_missing)
            else:
                debug_status += "missing_file\n"
            if (get_size(path_missing)) == 0:
                os.rmdir(path_missing)
            else:
                pass
        except Exception:
            pass
        finally:
            progress += 1
            pbar.update(progress)
    pbar.finish()
    print debug_status
When I tried to execute it I got an error; my traceback is below:
Traceback (most recent call last):
File "<string>", line 254, in run_nodebug
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\CopyClass.py", in <module>
pbar = ProgressBar(widgets=['Progress ', Percentage(), Bar(), ' ', ETA(),],maxval=progress_maxval).start()
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", in start
self.update(0)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 283, in update
self.fd.write(self._format_line() + '\r')
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 243, in _format_line
widgets = ''.join(self._format_widgets())
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 223, in _format_widgets
widget = format_updatable(widget, self)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\widgets.py", in format_updatable
if hasattr(updatable, 'update'): return updatable.update(pbar)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\widgets.py", in update
return '%3d%%' % pbar.percentage()
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 208, in percentage
return self.currval * 100.0 / self.maxval
ZeroDivisionError: float division by zero
I know that there is a problem with "maxval=progress_maxval": when it is zero, the progress bar ends up dividing by zero.
My question is: how should I change this? Should I create an exception to ignore the zero case? How do I do it?
Inside the ProgressBar it divides by max_value when computing the percentage, using the usual rule of three:
max_value -> 100%
progress_value -> x
which gives
x = (100 * progress_value) / max_value
so the division fails when max_value is 0. As a solution, use 1 instead of 0 for max_value.
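A minimal sketch of that suggestion applied to the loop above (assuming the same ProgressBar API as in the question; the empty files list stands in for a directory that yields no files):

from progressbar import ProgressBar, Percentage, Bar, ETA

files = []                                    # stand-in for a directory with no files
progress_maxval = max(len(files), 1)          # never let maxval drop to 0
pbar = ProgressBar(widgets=['Progress ', Percentage(), Bar(), ' ', ETA()],
                   maxval=progress_maxval).start()
for progress, _ in enumerate(files, 1):
    pbar.update(progress)
pbar.finish()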

Cannot Pool.map() a function because of UnpickleableError?

So I am trying to multiprocess the function f, which is triggered by a button press in tkinter.
def f(x):
    global doom, results, info
    doom = doom + 1
    if check(x) == True:
        results.add(x)
        info.append(get_column_number(x))
    j.step(1)
    texx = "1/"+doom
    s.configure(text=texx)
    root.update()
The function is called within a function like so:
def dojob():
    index = ['URLS'...]
    pool = Pool(processes=4)
    s.configure(text="Shifting Workload to cores..")
    root.update()
    pool.map(f, index)
The button is inside root window.
I get the following error:
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Python27\lib\threading.py", line 808, in __bootstrap_inner
self.run()
File "C:\Python27\lib\threading.py", line 761, in run
self.__target(*self.__args, **self.__kwargs)
File "C:\Python27\lib\multiprocessing\pool.py", line 342, in _handle_tasks
put(task)
UnpickleableError: Cannot pickle <type 'tkapp'> objects
I do not even know what pickling is. Help?
Here is the complete code:
from Tkinter import *
from ttk import *
from tkMessageBox import showinfo
from multiprocessing import Pool
import random
emails = set()
import urllib2
import urllib2 as urllib
########
CONSTANT_PAGECOUNT = 20
######
def f(x):
global doom,emails,info
doom = doom + 1
if check(x) == True:
print "",
emails.add(x)
info.append(get_column_number(x))
j.step(1)
texx = "Sk1nn1n "+str(doom)+'/'+str(CONSTANT_PAGECOUNT)+""
s.configure(text=texx)
root.update()
return 0
def f(x):
print ""
def showFile(site,info):
top = Toplevel()
top.title('Sites')
x = Text(top)
x.pack()
i=0
for site_url in site:
x.insert(END,site_url)
i=i+1
def get_column_number(url):
return True
def check(url):
return True
def getgoogleurl(search,siteurl=False,startr=0):
if siteurl==False:
return 'http://www.google.com/search?q='+urllib2.quote(search)+'&start='+str(startr)+'&oq='+urllib2.quote(search)
else:
return 'http://www.google.com/search?q=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)+'&oq=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)
def getgooglelinks(search,siteurl=False,startr=0):
#google returns 403 without user agent
headers = {'User-agent':'Mozilla/11.0'}
req = urllib2.Request(getgoogleurl(search,siteurl,startr),None,headers)
site = urllib2.urlopen(req)
data = site.read()
site.close()
#no beatifulsoup because google html is generated with javascript
start = data.find('<div id="res">')
end = data.find('<div id="foot">')
if data[start:end]=='':
#error, no links to find
return False
else:
links =[]
data = data[start:end]
start = 0
end = 0
while start>-1 and end>-1:
#get only results of the provided site
if siteurl==False:
start = data.find('<a href="/url?q=')
else:
start = data.find('<a href="/url?q='+str(siteurl))
data = data[start+len('<a href="/url?q='):]
end = data.find('&sa=U&ei=')
if start>-1 and end>-1:
link = urllib2.unquote(data[0:end])
data = data[end:len(data)]
if link.find('http')==0:
links.append(link)
return links
def rip(results=15,accuracy=16):
global e
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
linklist = []
counter = 0
doom = 0
while counter < results:
links = getgooglelinks(keyword,startr=counter)
for link in links:
if len(linklist) > CONSTANT_PAGECOUNT:
s.configure(text="Proccessing..")
root.update()
return linklist
else:
doom = doom + 1
linklist.append(link)
texx = str(doom)+"/"+str(CONSTANT_PAGECOUNT)
s.configure(text=texx)
root.update()
root.update()
counter = counter+accuracy
return linklist
def flip():
global e
emails = set()
info = []
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
s.configure(text="Generating index..")
root.update()
doom = -1
index = rip(CONSTANT_PAGECOUNT,10)
if 1:
try:
pool = Pool(processes=4)
#s.configure(text="Shifting Workload to cores..")
#root.update()
pool.map(f, index)
pool.close()
except:
print "The errors there.."
j.config(value=CONSTANT_PAGECOUNT)
if len(emails) > 0:
filepath='relavant_list_'+str(random.randint(1,9999))+'.emList.txt'
#print len(emails),
#print "emails found."
ggg = open(filepath,'a+')
for x in emails:
ggg.write(x+"\n")
showinfo(
str(len(emails))+" key word related sites found!",
" sites are saved in "+str(filepath)
)
showFile(emails,info)
s.configure(text=filepath)
else:
s.configure(text='No related sites found : (')
if __name__ == '__main__':
### CONSTANTS
version = '1.0'
### END CONSTANTS
root = Tk()
root.title('Program v'+version)
s = Style()
s.theme_use('default')
#print s.theme_names()
s.configure("black.Horizontal.TProgressbar", foreground='blue', background='blue')
j = Progressbar(root, style="black.Horizontal.TProgressbar", orient="vertical", length=200, mode="determinate", maximum=CONSTANT_PAGECOUNT, value=0)
j.pack(side='right',fill='y')
f = Frame(root)
x = Frame(f)
e = Entry(x,width=51)
s = Label(x,width=50,anchor='center',text='Waiting for task..')
Button(f,text='Generate List!',width=50,command=flip).pack(fill='both',expand=True)
s.pack(side='bottom',fill='y',expand=True)
e.pack(side='top',fill='both',expand=True)
x.pack(side='top',fill='y',expand=True)
f.pack(side='left',expand=True,fill="both")
root.mainloop()
You are leaking a tkinter object, most likely because you are trying to update the interface from another process in the last lines of f().
Update based on the complete code:
You have a name collision between your function f() and a variable f in your __main__, which gets assigned a Frame of your main window and causes the tkapp pickle error. Rename the function to def myfunc() or something. You also need to call pool.join() after pool.close().
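A minimal, self-contained sketch of those two changes (hypothetical function name and URLs; the real body would do the page checking):

from multiprocessing import Pool

def scan_url(x):              # renamed from f() so it no longer collides with f = Frame(root)
    return len(x)             # stand-in for the real work

if __name__ == '__main__':
    index = ['http://example.com/a', 'http://example.com/b']
    pool = Pool(processes=4)
    print pool.map(scan_url, index)
    pool.close()
    pool.join()               # wait for the workers after closing the pool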