How to run independent transformations in parallel using PySpark? - python-2.7

I am trying to run 2 functions doing completely independent transformations on a single RDD in parallel using PySpark. What are some methods to do the same?
def doXTransforms(sampleRDD):
(X transforms)
def doYTransforms(sampleRDD):
(Y Transforms)
if __name__ == "__main__":
sc = SparkContext(appName="parallelTransforms")
sqlContext = SQLContext(sc)
hive_context = HiveContext(sc)
rows_rdd = hive_context.sql("select * from tables.X_table")
p1 = Process(target=doXTransforms , args=(rows_rdd,))
p1.start()
p2 = Process(target=doYTransforms, args=(rows_rdd,))
p2.start()
p1.join()
p2.join()
sc.stop()
This does not work and I now understand this will not work.
But is there any alternative way to make this work? Specifically are there any python-spark specific solutions?

Just use threads and make sure that cluster have enough resources to process both tasks at the same time.
from threading import Thread
import time
def process(rdd, f):
def delay(x):
time.sleep(1)
return f(x)
return rdd.map(delay).sum()
rdd = sc.parallelize(range(100), int(sc.defaultParallelism / 2))
t1 = Thread(target=process, args=(rdd, lambda x: x * 2))
t2 = Thread(target=process, args=(rdd, lambda x: x + 1))
t1.start(); t2.start()
Arguably this is not that often useful in practice but otherwise should work just fine.
You can further use in-application scheduling with FAIR scheduler and scheduler pools for a better control over execution strategy.
You can also try pyspark-asyncactions (disclaimer - the author of this answer is also the author of the package) which provides a set of wrappers around Spark API and concurrent.futures:
import asyncactions
import concurrent.futures
f1 = rdd.filter(lambda x: x % 3 == 0).countAsync()
f2 = rdd.filter(lambda x: x % 11 == 0).countAsync()
[x.result() for x in concurrent.futures.as_completed([f1, f2])]

Related

How to use pandas.Series.str.contains with tqdm progress map?

I'm trying to add a new column to a dataframe (dfA) based on values from another dataframe (dfB):
s = dfA['value'].tolist()
dfB['value'] = dfB['text_bod'].str.contains('|'.join(s))
Can progress_map be used with this setup?
dfB['value] = 'dfB['text_bod].progress_map(func)'
Or is there some other way tqdm can be implemented?
Alternative method using FlashText:
from flashtext import KeywordProcessor
s = dfA['value'].tolist()
processor = KeywordProcessor()
processor.add_keywords_from_list(s)
dfB['value'] = dfB['text_bod'].progress_map(lambda x: processor.extract_keywords(x))
Not aware of a str.contains way, but you can use progress_map with a callback that does the exact same thing, but with re.search:
import re
dfB['value'] = dfB['text_bod'].progress_map(
lambda x: bool(re.search('|'.join(s), x))
)
As a function, you can use
def extract(x, p):
m = p.search(x)
if m:
return m.groups(0)
return np.nan
p = re.compile('|'.join(s))
dfB['value'] = dfB['text_bod'].progress_map(lambda x: extract(x, p))
This should allow you greater flexibility than a lambda.

Error using multiprocessing library: "got multiple values for keyword argument 'x' "

I am trying to parallelize a penalized linear model using the multiprocessing library in python.
I created a function that solves my model:
from __future__ import division
import numpy as np
from cvxpy import *
def lm_lasso_solver(x, y, lambda1):
n = x.shape[0]
m = x.shape[1]
lambda1_param = Parameter(sign="positive")
betas_var = Variable(m)
response = dict(model='lm', penalization='l')
response["parameters"] = {"lambda_vector": lambda1}
lasso_penalization = lambda1_param * norm(betas_var, 1)
lm_penalization = 0.5 * sum_squares(y - x * betas_var)
objective = Minimize(lm_penalization + lasso_penalization)
problem = Problem(objective)
lambda1_param.value = lambda1
try:
problem.solve(solver=ECOS)
except:
try:
problem.solve(solver=CVXOPT)
except:
problem.solve(solver=SCS)
beta_sol = np.asarray(betas_var.value).flatten()
response["solution"] = beta_sol
return response
In this function x is a matrix of predictors and y is the response variable. lambda1 is the parameter that must be optimized, and so, is the parameter that I want to parallelize. I saved this script in a python file called "ms.py"
Then I created another python file called "parallelization.py" and in that file I defined the following:
import multiprocessing as mp
import ms
import functools
def myFunction(x, y, lambda1):
pool = mp.Pool(processes=mp.cpu_count())
results = pool.map(functools.partial(ms.lm_lasso_solver, x=x, y=y), lambda1)
return results
So the idea was now, on the python interpreter, execute:
from sklearn.datasets import load_boston
boston = load_boston()
x = boston.data
y = boston.target
runfile('parallelization.py')
lambda_vector = np.array([1,2,3])
myFunction(x, y, lambda_vector)
But when I do this, I get the following error message:
The problem is on the line:
results = pool.map(functools.partial(ms.lm_lasso_solver, x=x, y=y), lambda1)
You are calling the functools.partial() method with keyworded arguments whereas in your lm_lasso_solver method, you don't define them as keyworded arguments. You should call it with x and y as positional arguments as follows:
results = pool.map(functools.partial(ms.lm_lasso_solver, x, y), lambda1)
or simply use the apply_async() method the pool object :
results = pool.apply_async(ms.lm_lasso_solver, args=[x, y, lambda1])

How do I terminate an async scipy.optimize based on time?

Really struggling with this one... Forgive the longish post.
I have an experiment that on each trial displays some stimulus, collects a response, and then moves on to the next trial.
I would like to incorporate an optimizer that runs in between trials and therefore must have a specific time-window designated by me to run, or it should be terminated. If it's terminated, I would like to return the last set of parameters it tried so that I can use it later.
Generally speaking, here's the order of events I'd like to happen:
In between trials:
Display stimulus ("+") for some number of seconds.
While this is happening, run the optimizer.
If the time for displaying the "+" has elapsed and the optimizer has
not finished, terminate the optimizer, return the most recent set of parameters it tried, and move on.
Here is some of the relevant code I'm working with so far:
do_bns() is the objective function. In it I include NLL['par'] = par or q.put(par)
from scipy.optimize import minimize
from multiprocessing import Process, Manager, Queue
from psychopy import core #for clock, and other functionality
clock = core.Clock()
def optim(par, NLL, q)::
a = minimize(do_bns, (par), method='L-BFGS-B', args=(NLL, q),
bounds=[(0.2, 1.5), (0.01, 0.8), (0.001, 0.3), (0.1, 0.4), (0.1, 1), (0.001, 0.1)],
options={"disp": False, 'maxiter': 1, 'maxfun': 1, "eps": 0.0002}, tol=0.00002)
if __name__ == '__main__':
print('starting optim')
max_time = 1.57
with Manager() as manager:
par = manager.list([1, 0.1, 0.1, 0.1, 0.1, 0.1])
NLL = manager.dict()
q = Queue()
p = Process(target=optim, args=(par, NLL, q))
p.start()
start = clock.getTime()
while clock.getTime() - start < max_time:
p.join(timeout=0)
if not p.is_alive():
break
if p.is_alive():
res = q.get()
p.terminate()
stop = clock.getTime()
print(NLL['a'])
print('killed after: ' + str(stop - start))
else:
res = q.get()
stop = clock.getTime()
print('terminated successfully after: ' + str(stop - start))
print(NLL)
print(res)
This code, on its own, seems to sort of do what I want. For example, the res = q.get() right above the p.terminate() actually takes something like 200ms so it will not terminate exactly at max_time if max_time < ~1.5s
If I wrap this code in a while-loop that checks to see if it's time to stop presenting the stimulus:
stim_start = clock.getTime()
stim_end = 5
print('showing stim')
textStim.setAutoDraw(True)
win.flip()
while clock.getTime() - stim_start < stim_end:
# insert the code above
print('out of loop')
I get weird behavior such as multiple iterations of the whole code from the beginning...
showing stim
starting optim
showing stim
out of loop
showing stim
out of loop
[1.0, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001]
killed after: 2.81303440395
Note the multiple 'showing stim's' and 'out of loop's.
I'm open to any solution that accomplishes my goal :|
Help and thank you!
Ben
General remark
Your solution would give me nightmares! I don't see a reason to use multiprocessing here and i'm not even sure how you grab those updated results before termination. Maybe you got your reason for this approach, but i highly recommend something else (which has a limitation).
Callback-based approach
The general idea i would pursue is the following:
fire up your optimizer with some additional time-limit information and some callback enforcing this
the callback is called in each iteration of this optimizer
if time-limit reached: raise a customized Exception
The limits:
as the callback is only called once in each iteration, there is some limited sequence of points in time where the optimizer might get stopped
the potential difference is highly dependent on iteration-time for your problem! (numerical-differentiation, huge-data, slow function eval; all this matters)
if not exceeding some given time is of highest priority, this approach might be not right or you would need some kind of safeguarded interpolation to reason if one more iteration is possible in time
or: combine your kind of killing off workers with my approach of updating intermediate-results through some callback
Example code (bit hacky):
import time
import numpy as np
import scipy.sparse as sp
import scipy.optimize as opt
np.random.seed(1)
""" Fake task: sparse NNLS """
M, N, D = 2500, 2500, 0.1
A = sp.random(M, N, D)
b = np.random.random(size=M)
""" Optimization-setup """
class TimeOut(Exception):
"""Raise for my specific kind of exception"""
def opt_func(x, A, b):
return 0.5 * np.linalg.norm(A.dot(x) - b)**2
def opt_grad(x, A, b):
Ax = A.dot(x) - b
grad = A.T.dot(Ax)
return grad
def callback(x):
time_now = time.time() # probably not the right tool in general!
callback.result = [np.copy(x)] # better safe than sorry -> copy
if time_now - callback.time_start >= callback.time_max:
raise TimeOut("Time out")
def optimize(x0, A, b, time_max):
result = [np.copy(x0)] # hack: mutable type
time_start = time.time()
try:
""" Add additional info to callback (only takes x as param!) """
callback.time_start = time_start
callback.time_max = time_max
callback.result = result
res = opt.minimize(opt_func, x0, jac=opt_grad,
bounds=[(0, np.inf) for i in range(len(x0))], # NNLS
args=(A, b), callback=callback, options={'disp': False})
except TimeOut:
print('time out')
return result[0], opt_func(result[0], A, b)
return res.x, res.fun
print('experiment 1')
start_time = time.perf_counter()
x, res = optimize(np.zeros(len(b)), A, b, 0.1) # 0.1 seconds max!
end_time = time.perf_counter()
print(res)
print('used secs: ', end_time - start_time)
print('experiment 2')
start_time = time.perf_counter()
x_, res_ = optimize(np.zeros(len(b)), A, b, 5) # 5 seconds max!
end_time = time.perf_counter()
print(res_)
print('used secs: ', end_time - start_time)
Example output:
experiment 1
time out
422.392771467
used secs: 0.10226839151517493
experiment 2
72.8470708728
used secs: 0.3943936788825996

Parallel processing of a 3d matrix in Python

Hi I need to speed up this code
import numpy as np
matrix3d=np.empty([10,10,1000])
matrix3d[:]=np.random.randint(10)
matrix3d_1=np.empty([10,10,1000])
x=10
y=1
for z in range(0,1000):
matrix3d_1[:,:,z]=func(matrix3d[:,:,z],x,y)
def func(matrix,x,y):
return matrix*x+y
I have tried using multiprocessig using Pool.map() but it did not work.
from functools import partial
import multiprocessing as mp
pool=mp.Pool(processes=2)
args=partial(func,x,y)
matrix3d_2=np.empty([10,10,1000])
matrix3d_2=pool.map(args,matrix3d)
pool.close()
If I compare the two matrix matrix3d_1==matrix3d_2 the results is false.
How can this be fixed?
Parallel processing of a 3d matrix
The python map method as well as the pool.map methode can only take one input object. See for example https://stackoverflow.com/a/10973817/4045774
To reduce the inputs to one input we can use for example functools. The input which remains have to be on the last place.
from functools import partial
import numpy as np
import multiprocessing as mp
def main():
matrix3d=np.empty([10,10,1000])
matrix3d[:]=np.random.randint(10)
matrix3d_1=np.empty([10,10,1000])
x=10
y=1
pool=mp.Pool(processes=4)
func_p=partial(func,x,y)
#parallel map returns a list
res=pool.map(func_p,(matrix3d[:,:,z] for z in xrange(0,matrix3d.shape[2])))
#copy the data to array
for i in xrange(0,matrix3d.shape[2]):
matrix3d_1[:,:,i]=res[i]
def func(x,y,matrix):
return matrix*x+y
Parallel version using numba
This version will scale well over all cores and is at least 200 times faster than simple multiprocessing shown above. I have modified the code you linked to a bit, to get rid of any other dependencies than numpy.
import numpy
from numba import njit, prange
nb_meanInterp = njit("float32[:,:](float32[:,:],int64,int64)")(meanInterp)
resample_3d_nb = njit("float32[:,:,:](float32[:,:,:],int64,int64)",parallel=True)(resample_3d)
def resample_3d(matrix_3d,x,y):
matrix3d_res=numpy.empty((x,y,matrix_3d.shape[2]),dtype=numpy.float32)
for z in prange(0,matrix_3d.shape[2]):
matrix3d_res[:,:,z]=nb_meanInterp(matrix_3d[:,:,z],x,y)
return matrix3d_res
def meanInterp(data, m, n):
newData = numpy.zeros((m,n),dtype=numpy.float32)
mOrig, nOrig = data.shape
hBoundariesOrig, vBoundariesOrig = numpy.linspace(0,1,mOrig+1),
numpy.linspace(0,1,nOrig+1)
hBoundaries, vBoundaries = numpy.linspace(0,1,m+1), numpy.linspace(0,1,n+1)
for iOrig in range(mOrig):
for jOrig in range(nOrig):
for i in range(m):
if hBoundaries[i+1] <= hBoundariesOrig[iOrig]: continue
if hBoundaries[i] >= hBoundariesOrig[iOrig+1]: break
for j in range(n):
if vBoundaries[j+1] <= vBoundariesOrig[jOrig]: continue
if vBoundaries[j] >= vBoundariesOrig[jOrig+1]: break
#boxCoords = ((hBoundaries[i], vBoundaries[j]),(hBoundaries[i+1], vBoundaries[j+1]))
#origBoxCoords = ((hBoundariesOrig[iOrig], vBoundariesOrig[jOrig]),(hBoundariesOrig[iOrig+1], vBoundariesOrig[jOrig+1]))
#area=overlap(boxCoords, origBoxCoords)
#hopefully this is equivivalent (not tested)-----
T_x=(hBoundaries[i],hBoundaries[i+1],hBoundariesOrig[iOrig],hBoundariesOrig[iOrig+1])
T_y=(vBoundaries[j],vBoundaries[j+1],vBoundariesOrig[jOrig],vBoundariesOrig[jOrig+1])
tx=(T_x[1]-T_x[0]+T_x[3]-T_x[2])-(max(T_x)-min(T_x))
ty=(T_y[1]-T_y[0]+T_y[3]-T_y[2])-(max(T_y)-min(T_y))
area=tx*ty
#------------------------
newData[i][j] += area * data[iOrig][jOrig] / (hBoundaries[1] * vBoundaries[1])
return newData

Run parallel op with different inputs and same placeholder

I have the necessity to calculate more then one accuracy in the same time, concurrently.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
The piece of code is the same of the mnist example in the tutorial of TensorFlow but instead of having:
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
I have two placeolder because I already calculated and stored them.
W = tf.placeholder(tf.float32, [784, 10])
b = tf.placeholder(tf.float32, [10])
I want to fill the network with the values I aready have and then calculate the accuracy and this have to happen for each network I loaded.
So if I load 20 networks I want to calculate in parallel the accuracy for each one. There is a way with the session run to execute the same operation with different input?
You have multiple options to make things happen in parallel:
Parallelize using multiple python threads / subprocesses. (See Python's "multiprocessing" library.)
Batch up the operations into single larger operations. (e.g. Similar to the image operations that operate on a batch of images simultaneously https://www.tensorflow.org/api_docs/python/image/resizing#resize_bilinear.)
Make a single graph that has the 20 network accuracy calculations.
I think the last one is the easiest, so I've included a bit of sample code below to get you started:
import tensorflow as tf
def construct_accuracy_calculation(i):
W = tf.placeholder(tf.float32, [784, 10], name=("%d_W" % i))
b = tf.placeholder(tf.float32, [10], name=("%d_b" % i))
# ...
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
return (W, b, accuracy)
def main():
accuracy_computations = []
feed_dict={}
for i in xrange(NUM_NETWORKS):
(W, b) = load_network(i)
(W_op, b_op, accuracy) = construct_accuracy_calculation(i)
feed_dict[W_op] = W
feed_dict[b_op] = b
accuracy_computations.append(accuracy)
# sess = ...
accuracy_values = sess.run(accuracy_computations, feed_dict=feed_dict)
if __name__ == "__main__":
main()
One approach to parallelizing TF computations is to execute run calls in parallel using threads (TF is incompatible with multiprocessing). It's a bit more complicated than other approaches because you have to handle parallelism yourself on the Python side.
Here's an example that runs same matmul op in same session in different Python threads with different fed inputs and runs about 4x faster with 4 threads compared to 1 thread
import os, sys, queue, threading, time
import tensorflow as tf
import numpy as np
def p(s):
# helper function for printing from multiple threads
# need to append \n or results get intermixed in notebook
print(s+"\n", flush=True, end="")
num_threads = 4
data_size = 32 # number of data points to enqueue
work_per_thread = data_size/num_threads
timeout = 10 # grace period for dequeing
input_queue = queue.Queue(data_size)
output_queue = queue.Queue(data_size)
dtype = np.float32
# use matrix vector matmul since it's compute intensive and uses single core
# see issue #6752
n = 16*1024
with tf.device("/cpu:0"):
x = tf.placeholder(dtype)
matrix = tf.Variable(tf.ones((n, n)))
vector = tf.Variable(tf.ones((n, 1)))
y = tf.matmul(matrix, vector)[0, 0] + x
# turn off graph-rewriting optimizations
sess = tf.Session(config=tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))))
sess.run(tf.global_variables_initializer())
done = False
def runner(runner_id):
p("Starting runner %s" % (runner_id,))
count = 0
while not done:
try:
x_val = input_queue.get(timeout=1)
except queue.Empty:
# retry on empty queue
continue
p("Start computing %d on %d" %(x_val, runner_id))
out = sess.run(y, {x: x_val})
count+=1
output_queue.put(out)
if count>=work_per_thread:
break
else:
p("Stopping runner "+str(runner_id))
threads = []
print("Creating threads.")
for i in range(num_threads):
t = threading.Thread(target=runner, args=(i,))
threads.append(t)
for i in range(data_size):
input_queue.put(i, timeout=timeout)
# start threads
p("Launching runners.")
start_time = time.time()
for t in threads:
t.start()
p("Reading results.")
for i in range(data_size):
try:
p("Main thread: obtained %.2f" % (output_queue.get(timeout=timeout),))
except queue.Empty:
print("No results after %d, terminating computation."%(timeout,))
break
else:
p("Computed successfully.")
done = True
p("Waiting for threads to finish.")
for t in threads:
t.join()
print("Done in %.2f seconds" %(time.time() - start_time))