I need to calculate more than one accuracy at the same time, concurrently.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
This piece of code is the same as the MNIST example in the TensorFlow tutorial, but instead of having:
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
I have two placeholders, because I already calculated and stored the values.
W = tf.placeholder(tf.float32, [784, 10])
b = tf.placeholder(tf.float32, [10])
I want to fill the network with the values I already have and then calculate the accuracy, and this has to happen for each network I load.
So if I load 20 networks, I want to calculate the accuracy of each one in parallel. Is there a way, with session run, to execute the same operation with different inputs?
You have multiple options to make things happen in parallel:
Parallelize using multiple Python threads / subprocesses. (See Python's "multiprocessing" library.)
Batch up the operations into single, larger operations, e.g. similar to the image operations that operate on a batch of images simultaneously (https://www.tensorflow.org/api_docs/python/image/resizing#resize_bilinear); a batched sketch follows the sample code below.
Make a single graph that has the 20 network accuracy calculations.
I think the last one is the easiest, so I've included a bit of sample code below to get you started:
import tensorflow as tf

def construct_accuracy_calculation(i):
    W = tf.placeholder(tf.float32, [784, 10], name=("%d_W" % i))
    b = tf.placeholder(tf.float32, [10], name=("%d_b" % i))
    # ... (build the model here so that y is the prediction and y_ the labels)
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return (W, b, accuracy)

def main():
    accuracy_computations = []
    feed_dict = {}
    for i in xrange(NUM_NETWORKS):
        (W, b) = load_network(i)
        (W_op, b_op, accuracy) = construct_accuracy_calculation(i)
        feed_dict[W_op] = W
        feed_dict[b_op] = b
        accuracy_computations.append(accuracy)
    # sess = ...
    accuracy_values = sess.run(accuracy_computations, feed_dict=feed_dict)

if __name__ == "__main__":
    main()
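For completeness, here is what option 2 (batching the networks into one larger op) might look like. This is only a sketch, assuming TF 1.x batched-matmul semantics; NUM_NETWORKS, x, and y_ are illustrative names standing in for your own test data, not part of the code above:

import tensorflow as tf

NUM_NETWORKS = 20  # illustrative

# Stack all loaded parameters along a leading "network" axis
# (e.g. with np.stack on the feed side).
W_all = tf.placeholder(tf.float32, [NUM_NETWORKS, 784, 10])
b_all = tf.placeholder(tf.float32, [NUM_NETWORKS, 10])
x = tf.placeholder(tf.float32, [None, 784])   # test images
y_ = tf.placeholder(tf.float32, [None, 10])   # test labels

# Tile the input so one batched matmul evaluates every network at once:
# [NUM_NETWORKS, batch, 784] x [NUM_NETWORKS, 784, 10] -> [NUM_NETWORKS, batch, 10]
x_tiled = tf.tile(tf.expand_dims(x, 0), [NUM_NETWORKS, 1, 1])
logits = tf.matmul(x_tiled, W_all) + tf.expand_dims(b_all, 1)

# argmax over the class axis; broadcasting compares [NUM_NETWORKS, batch] with [batch]
correct = tf.equal(tf.argmax(logits, 2), tf.argmax(y_, 1))
accuracies = tf.reduce_mean(tf.cast(correct, tf.float32), axis=1)  # one accuracy per network

A single sess.run(accuracies, feed_dict=...) then returns all 20 accuracies from one op.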
One approach to parallelizing TF computations is to execute run calls in parallel using threads (TF is incompatible with multiprocessing). It's a bit more complicated than the other approaches because you have to handle parallelism yourself on the Python side.
Here's an example that runs the same matmul op in the same session in different Python threads with different fed inputs; it runs about 4x faster with 4 threads compared to 1 thread:
import os, sys, queue, threading, time
import tensorflow as tf
import numpy as np

def p(s):
    # helper function for printing from multiple threads
    # need to append \n or results get intermixed in notebook
    print(s + "\n", flush=True, end="")

num_threads = 4
data_size = 32  # number of data points to enqueue
work_per_thread = data_size / num_threads
timeout = 10  # grace period for dequeueing

input_queue = queue.Queue(data_size)
output_queue = queue.Queue(data_size)
dtype = np.float32

# use matrix-vector matmul since it's compute intensive and uses a single core
# see issue #6752
n = 16 * 1024
with tf.device("/cpu:0"):
    x = tf.placeholder(dtype)
    matrix = tf.Variable(tf.ones((n, n)))
    vector = tf.Variable(tf.ones((n, 1)))
    y = tf.matmul(matrix, vector)[0, 0] + x

# turn off graph-rewriting optimizations
sess = tf.Session(config=tf.ConfigProto(
    graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))))
sess.run(tf.global_variables_initializer())

done = False

def runner(runner_id):
    p("Starting runner %s" % (runner_id,))
    count = 0
    while not done:
        try:
            x_val = input_queue.get(timeout=1)
        except queue.Empty:
            # retry on empty queue
            continue
        p("Start computing %d on %d" % (x_val, runner_id))
        out = sess.run(y, {x: x_val})
        count += 1
        output_queue.put(out)
        if count >= work_per_thread:
            break
    else:
        p("Stopping runner " + str(runner_id))

threads = []
print("Creating threads.")
for i in range(num_threads):
    t = threading.Thread(target=runner, args=(i,))
    threads.append(t)
for i in range(data_size):
    input_queue.put(i, timeout=timeout)

# start threads
p("Launching runners.")
start_time = time.time()
for t in threads:
    t.start()

p("Reading results.")
for i in range(data_size):
    try:
        p("Main thread: obtained %.2f" % (output_queue.get(timeout=timeout),))
    except queue.Empty:
        print("No results after %d, terminating computation." % (timeout,))
        break
else:
    p("Computed successfully.")

done = True
p("Waiting for threads to finish.")
for t in threads:
    t.join()
print("Done in %.2f seconds" % (time.time() - start_time))
I am trying to construct a batch of (wav_file, label) pairs.
The wav file labels and paths are listed in dev.csv.
The code below is not working:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 5
global record_defaults
record_defaults = [['/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def read_record(filename_queue, num_records):
    reader = tf.TextLineReader()
    key, value = reader.read_up_to(filename_queue, num_records)
    wav_filename, duration, transcript = tf.decode_csv(value, record_defaults, field_delim=",")
    wav_reader = tf.WholeFileReader()
    wav_key, wav_value = wav_reader.read_up_to(tf.train.string_input_producer(wav_filename, shuffle=False, capacity=num_records), num_records)
    return [wav_key, transcript]  # throws errors
    # return [wav_key, wav_value]  # works
    # return [wav_filename, duration, transcript]  # works

data_queue = tf.train.string_input_producer(tf.train.match_filenames_once('dev.csv'), shuffle=False)
batch_data = [read_record(data_queue, batch_size) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, batch_size=batch_size, capacity=capacity, enqueue_many=True)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    print(coord)
    threads = tf.train.start_queue_runners(coord=coord)
    print("threads num: " + str(threads))
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            print("line:", step, feat)
    except tf.errors.OutOfRangeError:
        print(' training for 1 epochs, %d steps', step)
    finally:
        coord.request_stop()
        coord.join(threads)
It throws the errors below; how can I fix it?
The dev.csv content is as below:
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav,8.26,qi shi nian dai mo wo wai chu qiu xue
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_119.wav,6.9,chen yun tong shi yao qiu gan bu men ren zhen xue xi
I tried to rewrite your code like this (see below). These are my observations:
The error is no longer thrown, and the values are returned.
An obvious discrepancy is that the batch size for the transcript is double that specified, so it is 4 instead of 2. It doubles for some reason; there is no such problem for the audio binary.
shapes=[tf.TensorShape(()), tf.TensorShape(batch_size,)] is based on an error I saw which mentioned that I have to specify this using TensorShape. I didn't find the documentation of much help, but it is mentioned there:
shapes: (Optional.) A list of fully-defined TensorShape objects with the same length as dtypes, or None.
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.DEBUG)

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 2
record_defaults = [['D:/male.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def readbatch(data_queue):
    reader = tf.TextLineReader()
    _, rows = reader.read_up_to(data_queue, batch_size)
    wav_filename, duration, transcript = tf.decode_csv(rows, record_defaults, field_delim=",")
    audioreader = tf.WholeFileReader()
    _, audio = audioreader.read(tf.train.string_input_producer(wav_filename))
    return [audio, transcript]

data_queue = tf.train.string_input_producer(
    tf.train.match_filenames_once('D:/Book1.csv'), shuffle=False)
batch_data = [readbatch(data_queue) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, shapes=[tf.TensorShape(()), tf.TensorShape(batch_size,)], capacity=capacity, batch_size=batch_size, enqueue_many=False)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            audio = feat[0][0]
            print('Size of audio is ' + str(audio.size))
            script = feat[0][1]
            print('Size of script is ' + str(script.size))
    except tf.errors.OutOfRangeError:
        print(' training for 1 epochs, %d steps', step)
    finally:
        coord.request_stop()
        coord.join(threads)
A sample dataset proves that there is an extra pair; presumably read_up_to already returns batch_size transcripts per graph element and batch_join then stacks batch_size such elements, while each WholeFileReader.read returns only a single audio file:
[[array([b'Text2', b'Text1'], dtype=object), array([[b'Translation-1', b'Translation-2'],
       [b'Translation-1', b'Translation-2']], dtype=object)]]
Really struggling with this one... Forgive the longish post.
I have an experiment that on each trial displays some stimulus, collects a response, and then moves on to the next trial.
I would like to incorporate an optimizer that runs in between trials, and it therefore must have a specific time window, designated by me, in which to run, or it should be terminated. If it's terminated, I would like to get back the last set of parameters it tried so that I can use them later.
Generally speaking, here's the order of events I'd like to happen:
In between trials:
Display stimulus ("+") for some number of seconds.
While this is happening, run the optimizer.
If the time for displaying the "+" has elapsed and the optimizer has not finished, terminate the optimizer, return the most recent set of parameters it tried, and move on.
Here is some of the relevant code I'm working with so far:
do_bns() is the objective function. In it I include NLL['par'] = par or q.put(par)
from scipy.optimize import minimize
from multiprocessing import Process, Manager, Queue
from psychopy import core  # for clock, and other functionality

clock = core.Clock()

def optim(par, NLL, q):
    a = minimize(do_bns, (par), method='L-BFGS-B', args=(NLL, q),
                 bounds=[(0.2, 1.5), (0.01, 0.8), (0.001, 0.3), (0.1, 0.4), (0.1, 1), (0.001, 0.1)],
                 options={"disp": False, 'maxiter': 1, 'maxfun': 1, "eps": 0.0002}, tol=0.00002)

if __name__ == '__main__':
    print('starting optim')
    max_time = 1.57
    with Manager() as manager:
        par = manager.list([1, 0.1, 0.1, 0.1, 0.1, 0.1])
        NLL = manager.dict()
        q = Queue()
        p = Process(target=optim, args=(par, NLL, q))
        p.start()
        start = clock.getTime()
        while clock.getTime() - start < max_time:
            p.join(timeout=0)
            if not p.is_alive():
                break
        if p.is_alive():
            res = q.get()
            p.terminate()
            stop = clock.getTime()
            print(NLL['a'])
            print('killed after: ' + str(stop - start))
        else:
            res = q.get()
            stop = clock.getTime()
            print('terminated successfully after: ' + str(stop - start))
            print(NLL)
            print(res)
This code, on its own, seems to sort of do what I want. For example, the res = q.get() right above the p.terminate() actually takes something like 200 ms, so it will not terminate exactly at max_time if max_time < ~1.5 s.
If I wrap this code in a while-loop that checks to see if it's time to stop presenting the stimulus:
stim_start = clock.getTime()
stim_end = 5
print('showing stim')
textStim.setAutoDraw(True)
win.flip()
while clock.getTime() - stim_start < stim_end:
    # insert the code above
print('out of loop')
I get weird behavior such as multiple iterations of the whole code from the beginning...
showing stim
starting optim
showing stim
out of loop
showing stim
out of loop
[1.0, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001]
killed after: 2.81303440395
Note the multiple 'showing stim's' and 'out of loop's.
I'm open to any solution that accomplishes my goal :|
Help and thank you!
Ben
General remark
Your solution would give me nightmares! I don't see a reason to use multiprocessing here, and I'm not even sure how you grab those updated results before termination. Maybe you have your reasons for this approach, but I highly recommend something else (which has a limitation).
Callback-based approach
The general idea I would pursue is the following:
fire up your optimizer with some additional time-limit information and some callback enforcing it
the callback is called in each iteration of the optimizer
if the time limit is reached: raise a customized exception
The limits:
as the callback is only called once in each iteration, there is only a limited sequence of points in time at which the optimizer can be stopped
the potential overshoot is highly dependent on the iteration time of your problem! (numerical differentiation, huge data, slow function evaluation; all of this matters)
if not exceeding some given time is of highest priority, this approach might not be right, or you would need some kind of safeguarded extrapolation to reason about whether one more iteration is possible in time
or: combine your kind of killing off workers with my approach of updating intermediate results through a callback
Example code (a bit hacky):
import time
import numpy as np
import scipy.sparse as sp
import scipy.optimize as opt

np.random.seed(1)

""" Fake task: sparse NNLS """
M, N, D = 2500, 2500, 0.1
A = sp.random(M, N, D)
b = np.random.random(size=M)

""" Optimization-setup """
class TimeOut(Exception):
    """Raise for my specific kind of exception"""

def opt_func(x, A, b):
    return 0.5 * np.linalg.norm(A.dot(x) - b)**2

def opt_grad(x, A, b):
    Ax = A.dot(x) - b
    grad = A.T.dot(Ax)
    return grad

def callback(x):
    time_now = time.time()  # probably not the right tool in general!
    callback.result[0] = np.copy(x)  # mutate the shared list; better safe than sorry -> copy
    if time_now - callback.time_start >= callback.time_max:
        raise TimeOut("Time out")

def optimize(x0, A, b, time_max):
    result = [np.copy(x0)]  # hack: mutable type
    time_start = time.time()
    try:
        """ Add additional info to callback (only takes x as param!) """
        callback.time_start = time_start
        callback.time_max = time_max
        callback.result = result
        res = opt.minimize(opt_func, x0, jac=opt_grad,
                           bounds=[(0, np.inf) for i in range(len(x0))],  # NNLS
                           args=(A, b), callback=callback, options={'disp': False})
    except TimeOut:
        print('time out')
        return result[0], opt_func(result[0], A, b)
    return res.x, res.fun

print('experiment 1')
start_time = time.perf_counter()
x, res = optimize(np.zeros(len(b)), A, b, 0.1)  # 0.1 seconds max!
end_time = time.perf_counter()
print(res)
print('used secs: ', end_time - start_time)

print('experiment 2')
start_time = time.perf_counter()
x_, res_ = optimize(np.zeros(len(b)), A, b, 5)  # 5 seconds max!
end_time = time.perf_counter()
print(res_)
print('used secs: ', end_time - start_time)
Example output:
experiment 1
time out
422.392771467
used secs: 0.10226839151517493
experiment 2
72.8470708728
used secs: 0.3943936788825996
I have a list of lists like so:
import numpy as np
import random
import time
import itertools
N = 1000
x = np.random.random((N, N))
y = np.zeros((N, N))
z = np.random.random((N, N))
list_of_lists = [[x, y], [y, z], [z, x]]
and for each sublist I want to calculate the number of non-zeros, the mean, and the standard deviation.
I have done that like so:
distribution = []
alb_mean = []
alb_std = []
start = time.time()
for i in range(len(list_of_lists)):
    one_mean = []
    non_zero_l = []
    one_list = list_of_lists[i]
    for n in one_list:
        # count non-zeros
        non_zero_count = np.count_nonzero(n)
        non_zero_l.append(non_zero_count)
        # assign nans
        n = n.astype(float)
        n[n == 0.0] = np.nan
        # flatten the matrix
        n = np.array(n.flatten())
        one_mean.append(n)
    # append means and stds
    distribution.append(sum(non_zero_l))
    alb_mean.append(np.nanmean(one_mean))
    alb_std.append(np.nanstd(one_mean))
end = time.time()
print("Loop took {} seconds".format(end - start))
which takes 0.23 seconds.
I tried to make this faster with a second option:
distribution = []
alb_mean = []
alb_std = []
start = time.time()
for i in range(len(list_of_lists)):
    for_mean = []
    # get one list
    one_list = list_of_lists[i]
    # flatten the list
    chain = itertools.chain(*one_list)
    flat = list(chain)
    # count non-zeros
    non_zero_count = np.count_nonzero(flat)
    distribution.append(non_zero_count)
    # remove zeros
    remove_zero = np.setdiff1d(flat, [0.0])
    alb_mean.append(np.nanmean(remove_zero))
    alb_std.append(np.nanstd(remove_zero))
end = time.time()
print("Loop took {} seconds".format(end - start))
which is actually slower and takes 0.88 seconds.
The sheer number of loops has me thinking there is a better way to do this. I have tried numba, but it doesn't seem to like appending in a function.
Version #1
Well, in your sample with the loopy solution, you are looping with two loops: one with 3 iterations and another with 2 iterations. So, it's already close to being vectorized; the only bottlenecks are the append steps.
Going fully vectorized, here's one approach -
a = np.array(list_of_lists, dtype=float)
zm = a != 0
avgs = np.einsum('ijkl,ijkl->i', zm, a) / zm.sum(axis=(1, 2, 3)).astype(float)
a[~zm] = np.nan
stds = np.nanstd(a, axis=(1, 2, 3))
Using the same setup as in the question, here's what I get on timings -
Loop took 0.150925159454 seconds
Proposed solution took 0.121352910995 seconds
Version #2
We could compute the std using the average, thus re-using avgs for a further boost.
Thus, a modified version would be:
a = np.asarray(list_of_lists)
zm = a != 0
N = zm.sum(axis=(1, 2, 3)).astype(float)
avgs = np.einsum('ijkl,ijkl->i', zm, a) / N
diffs = (a - avgs[:, None, None, None])**2
stds = np.sqrt(np.einsum('ijkl,ijkl->i', zm, diffs) / N)
Updated timings -
Loop took 0.155035018921 seconds
Proposed solution took 0.0648851394653 seconds
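As a quick sanity check (a sketch reusing the arrays computed in the question and the answer above), the vectorized results can be compared against the loop-based reference:

print(np.allclose(alb_mean, avgs))                           # means agree
print(np.allclose(alb_std, stds))                            # stds agree
print(np.array_equal(distribution, zm.sum(axis=(1, 2, 3))))  # non-zero counts agree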
I was trying to plot a loss curve, but it always looks abnormal (just like a circle; I really don't know how to describe it in English properly). I have found many topics about questions like this and just can't solve it. My TensorFlow version is 0.10.0.
import tensorflow as tf
from tensorflow.core.util.event_pb2 import SessionLog
import os

# initialize variables/model parameters
# define the training loop operations

def inputs():
    # read/generate input training data X and expected outputs Y
    weight_age = [[84,46],[73,20],[65,52],[70,30],[76,57],[69,25],[63,28],[72,36],[79,57],[75,44],[27,24],
                  [89,31],[65,52],[57,23],[59,60],[69,48],[60,34],[79,51],[75,50],[82,34],[59,46],[67,23],
                  [85,37],[55,40],[63,30]]
    blood_fat_content = [354,190,405,263,451,302,288,385,402,365,209,290,346,
                         254,395,434,220,374,308,220,311,181,274,303,244]
    return tf.to_float(weight_age), tf.to_float(blood_fat_content)

def inference(X):
    # compute inference model over data X and return the result
    return tf.matmul(X, W) + b

def loss(X, Y):
    # compute loss over training data X and expected outputs Y
    Y_predicted = inference(X)
    return tf.reduce_sum(tf.squared_difference(Y, Y_predicted))

def train(total_loss):
    # train / adjust model parameters according to computed total loss
    learning_rate = 1e-7
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

def evaluate(sess, X, Y):
    # evaluate the resulting trained model
    print(sess.run(inference([[80., 25.]])))
    print(sess.run(inference([[60., 25.]])))

g1 = tf.Graph()
with tf.Session(graph=g1) as sess:
    W = tf.Variable(tf.zeros([2, 1]), name="weights")
    b = tf.Variable(0., name="bias")
    tf.initialize_all_variables().run()
    X, Y = inputs()
    print(sess.run(W))
    total_loss = loss(X, Y)
    train_op = train(total_loss)
    tf.scalar_summary("loss", total_loss)
    summaries = tf.merge_all_summaries()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    summary_writer = tf.train.SummaryWriter('linear', g1)
    summary_writer.add_session_log(session_log=SessionLog(status=SessionLog.START), global_step=1)
    # actual training loop
    training_steps = 100
    tolerance = 100
    total_loss_last = 0
    initial_step = 0
    # Create a saver.
    saver = tf.train.Saver()
    # verify if we don't have a checkpoint saved already
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('my_model'))
    if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        initial_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
        # summary_writer.add_session_log(SessionLog(status=SessionLog.START), global_step=initial_step)
    for step in range(initial_step, training_steps):
        sess.run([train_op])
        if step % 20 == 0:
            saver.save(sess, 'my-model', global_step=step)
        gap = abs(sess.run(total_loss) - total_loss_last)
        total_loss_last = sess.run(total_loss)
        summary_writer.add_summary(sess.run(summaries), step)
        # for debugging and learning purposes, see how the loss gets decremented thru training steps
        if step % 10 == 0:
            print("loss: ", sess.run([total_loss]))
            print("step: ", step)
        if gap < tolerance:
            break
    # evaluation...
    evaluate(sess, X, Y)
    coord.request_stop()
    coord.join(threads)
    saver.save(sess, 'my-model', global_step=training_steps)
    summary_writer.flush()
    sess.close()
I am trying to run 2 functions doing completely independent transformations on a single RDD in parallel using PySpark. What are some methods to do the same?
from multiprocessing import Process
from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext

def doXTransforms(sampleRDD):
    ...  # (X transforms)

def doYTransforms(sampleRDD):
    ...  # (Y transforms)

if __name__ == "__main__":
    sc = SparkContext(appName="parallelTransforms")
    sqlContext = SQLContext(sc)
    hive_context = HiveContext(sc)
    rows_rdd = hive_context.sql("select * from tables.X_table")
    p1 = Process(target=doXTransforms, args=(rows_rdd,))
    p1.start()
    p2 = Process(target=doYTransforms, args=(rows_rdd,))
    p2.start()
    p1.join()
    p2.join()
    sc.stop()
This does not work, and I now understand that this will not work.
But is there any alternative way to make it work? Specifically, are there any Python/Spark-specific solutions?
Just use threads and make sure that the cluster has enough resources to process both tasks at the same time.
from threading import Thread
import time

def process(rdd, f):
    def delay(x):
        time.sleep(1)
        return f(x)
    return rdd.map(delay).sum()

rdd = sc.parallelize(range(100), int(sc.defaultParallelism / 2))

t1 = Thread(target=process, args=(rdd, lambda x: x * 2))
t2 = Thread(target=process, args=(rdd, lambda x: x + 1))
t1.start(); t2.start()
Arguably this is not that often useful in practice, but otherwise it should work just fine.
You can further use in-application scheduling with the FAIR scheduler and scheduler pools for better control over the execution strategy.
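As a minimal sketch of scheduler pools (assuming spark.scheduler.mode is set to FAIR in the Spark config; the pool names here are arbitrary):

from threading import Thread

def process_in_pool(rdd, f, pool):
    # all jobs submitted from this thread run in the given scheduler pool
    sc.setLocalProperty("spark.scheduler.pool", pool)
    return rdd.map(f).sum()

t1 = Thread(target=process_in_pool, args=(rdd, lambda x: x * 2, "pool1"))
t2 = Thread(target=process_in_pool, args=(rdd, lambda x: x + 1, "pool2"))
t1.start(); t2.start()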
You can also try pyspark-asyncactions (disclaimer: the author of this answer is also the author of the package), which provides a set of wrappers around the Spark API and concurrent.futures:
import asyncactions
import concurrent.futures
f1 = rdd.filter(lambda x: x % 3 == 0).countAsync()
f2 = rdd.filter(lambda x: x % 11 == 0).countAsync()
[x.result() for x in concurrent.futures.as_completed([f1, f2])]