Python multiprocessing: parsing, editing, and writing long series of csv files - python-2.7

I have a very long series of similar csv files (14 GB altogether). I need to open each file, replace certain characters, and write the fixed version to a new file. I want to use the processing power of my multicore computer. I tried with mp.Pool and with mp.Process/mp.Queue. The pool version works, but the queue approach produces this error:
IOError: [Errno 22] invalid mode ('r') or filename: '<multiprocessing.queues.Queue object at 0x0000000002775A90>'
This is a simplified version of my Pool code:
import os
import pandas as pd
import multiprocessing as mp

def fixer(a_file):
    lines = []
    opened_file = open(a_file)
    for each_line in opened_file:
        lines.append(each_line.replace('mad', 'rational'))
    opened_file.close()
    df = pd.DataFrame(lines)
    #some pandas magics here
    df.to_csv(a_file[:-4] + '_fixed.csv')

if __name__ == "__main__":
    my_path = os.getcwd()
    my_files = list(os.walk(my_path))[0][2]  #I just get the list of file names here
    processors = mp.cpu_count()

    pool = mp.Pool(processes=processors)  # I set as many processes as my computer has processors.
    pool.map(fixer, my_files)
And this is the one for the Queue approach:
import os
import pandas as pd
import multiprocessing as mp

def fixer(a_file):
    lines = []
    opened_file = open(a_file)
    for each_line in opened_file:
        lines.append(each_line.replace('mad', 'rational'))
    opened_file.close()
    df = pd.DataFrame(lines)
    #some pandas magics here
    df.to_csv(a_file[:-4] + '_fixed.csv')

if __name__ == "__main__":
    my_path = os.getcwd()
    my_files = list(os.walk(my_path))[0][2]  #I just get the list of file names here
    processors = mp.cpu_count()

    queue = mp.Queue()
    for each_file in my_files:
        queue.put(each_file)

    processes = [mp.Process(target=fixer, args=(queue,)) for core in range(processors)]
    for process in processes:
        process.start()
    for process in processes:
        process.join()
I would appreciate an example that makes the Queue version work. In a second processing step, before the files are written, I need the workers to retrieve an intermediate result and do some calculations. This is why I need the queues.

The problem in the Queue script was that I was not getting the next element from the Queue, but passing the whole Queue to the fixer function. This is solved by assigning the value of queue.get() to a variable inside fixer:
import os
import pandas as pd
import multiprocessing as mp

def fixer(a_queue):
    a_file = a_queue.get()
    lines = []
    opened_file = open(a_file)
    for each_line in opened_file:
        lines.append(each_line.replace('mad', 'rational'))
    opened_file.close()
    df = pd.DataFrame(lines)
    #some pandas magics here
    df.to_csv(a_file[:-4] + '_fixed.csv')

if __name__ == "__main__":
    my_path = os.getcwd()
    my_files = list(os.walk(my_path))[0][2]  #I just get the list of file names here
    processors = mp.cpu_count()

    queue = mp.Queue()
    for each_file in my_files:
        queue.put(each_file)

    processes = [mp.Process(target=fixer, args=(queue,)) for core in range(processors)]
    for process in processes:
        process.start()
    for process in processes:
        process.join()
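Note that each process above calls a_queue.get() exactly once, so only cpu_count() files are fixed per run. A minimal sketch of a worker that keeps pulling file names until it reaches a sentinel (the None sentinel and the worker name are illustrative choices, not from the original code):

import os
import multiprocessing as mp
import pandas as pd

def worker(a_queue):
    # keep pulling file names until the sentinel (None) is received
    for a_file in iter(a_queue.get, None):
        lines = []
        with open(a_file) as opened_file:
            for each_line in opened_file:
                lines.append(each_line.replace('mad', 'rational'))
        df = pd.DataFrame(lines)
        #some pandas magics here
        df.to_csv(a_file[:-4] + '_fixed.csv')

if __name__ == "__main__":
    my_files = list(os.walk(os.getcwd()))[0][2]
    processors = mp.cpu_count()

    queue = mp.Queue()
    for each_file in my_files:
        queue.put(each_file)
    for _ in range(processors):
        queue.put(None)  # one sentinel per worker so every process exits its loop

    processes = [mp.Process(target=worker, args=(queue,)) for _ in range(processors)]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

The same pattern extends to the second processing step: add another mp.Queue that the workers put their intermediate results on, and consume it before the final files are written.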

Related

Parallel processing in Python for image batching

I would like to run two functions in parallel: one for image batching (streaming all 25 images for processing) and another for processing the batched images. They need to run at the same time.
So I have a main function for batching images, BatchStreaming(self), and one for processing them, BatchProcessing(self, b_num). BatchStreaming is working well. After streaming 25 images, batch processing needs to start. So there are two parallel tasks:
(1) The while loop in BatchStreaming needs to continue with the next batch of images.
(2) At the same time, the current batch of images needs to be processed.
I am not sure whether I should use a process or a thread. I prefer a process because I would like to utilize all CPU cores (Python threads effectively run on only one CPU core).
I then have two issues:
(1) The process has to join back to the main program before it can proceed, but I need to continue with the next batch of images.
(2) In the following program, when BatchProcessing(self, b_num) is called, I get this exception:
Caught Main Exception
(<class 'TypeError'>, TypeError("'module' object is not callable",), <traceback object at 0x7f98635dcfc8>)
What could be the issue?
The code is as follows.
import multiprocessing as MultiProcess
import time
import vid_streamv3 as vs
import cv2
import sys
import numpy as np
import os

BATCHSIZE = 25
CHANNEL = 3
HEIGHT = 480
WIDTH = 640
ORGHEIGHT = 1080
ORGWIDTH = 1920

class ProcessPipeline:
    def __init__(self):
        #Current Cam
        self.camProcess = None
        self.cam_queue = MultiProcess.Queue(maxsize=100)
        self.stopbit = None
        self.camlink = 'rtsp://root:pass#192.168.0.90/axis-media/media.amp?camera=1' #Add your RTSP cam link
        self.framerate = 25
        self.fullsize_batch1 = np.zeros((BATCHSIZE, ORGHEIGHT, ORGWIDTH, CHANNEL), dtype=np.uint8)
        self.fullsize_batch2 = np.zeros((BATCHSIZE, ORGHEIGHT, ORGWIDTH, CHANNEL), dtype=np.uint8)
        self.batch1_is_processed = False

    def BatchStreaming(self):
        #get all cams
        time.sleep(3)
        self.stopbit = MultiProcess.Event()
        self.camProcess = vs.StreamCapture(self.camlink,
                                           self.stopbit,
                                           self.cam_queue,
                                           self.framerate)
        self.camProcess.start()
        count = 0
        try:
            while True:
                if not self.cam_queue.empty():
                    cmd, val = self.cam_queue.get()
                    if cmd == vs.StreamCommands.FRAME:
                        if val is not None:
                            print('streaming starts ')
                            if(self.batch1_is_processed == False):
                                self.fullsize_batch1[count] = val
                            else:
                                self.fullsize_batch2[count] = val
                            count = count + 1
                            if(count >= 25):
                                if(self.batch1_is_processed == False):  #to start process for inference and post processing for batch 1
                                    self.batch1_is_processed = True
                                    print('batch 1 process')
                                    p = MultiProcess(target=self.BatchProcessing, args=(1,))
                                else:  #to start process for inference and post processing for batch 2
                                    self.batch1_is_processed = False
                                    print('batch 2 process')
                                    p = MultiProcess(target=self.BatchProcessing, args=(2,))
                                p.start()
                                print('BatchProcessing start')
                                p.join()
                                print('BatchProcessing join')
                                count = 0
                            cv2.imshow('Cam: ' + self.camlink, val)
                            cv2.waitKey(1)
        except KeyboardInterrupt:
            print('Caught Keyboard interrupt')
        except:
            e = sys.exc_info()
            print('Caught Main Exception')
            print(e)
        self.StopStreaming()
        cv2.destroyAllWindows()

    def StopStreaming(self):
        print('in stopCamStream')
        if self.stopbit is not None:
            self.stopbit.set()
            while not self.cam_queue.empty():
                try:
                    _ = self.cam_queue.get()
                except:
                    break
            self.cam_queue.close()
            print("before camProcess.join()")
            self.camProcess.join()
            print("after camProcess.join()")

    def BatchProcessing(self, b_num):
        print('module name:', __name__)
        if hasattr(os, 'getppid'):  # only available on Unix
            print('parent process:', os.getppid())
        print('process id:', os.getpid())

if __name__ == "__main__":
    mc = ProcessPipeline()
    mc.BatchStreaming()
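As an aside on the TypeError itself: with import multiprocessing as MultiProcess, the call MultiProcess(target=..., args=(1,)) invokes the module object rather than a class, which matches the "'module' object is not callable" message; the callable is the Process class inside that module. A minimal standalone sketch (work is just a stand-in for BatchProcessing):

import multiprocessing as MultiProcess

def work(b_num):
    print('processing batch', b_num)

if __name__ == "__main__":
    # MultiProcess is the module; MultiProcess.Process is the class to call
    p = MultiProcess.Process(target=work, args=(1,))
    p.start()
    p.join()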
I used Event signalling as shown below. That is more straightforward for my application: when the batching loop has enough images, it signals the batch-processing side.
#event_tut.py
import random, time
from threading import Event, Thread

event = Event()

def waiter(event, nloops):
    count = 0
    while count < 10:
        print("%s. Waiting for the flag to be set." % (count + 1))
        event.wait()    # Blocks until the flag becomes true.
        print("Wait complete at:", time.ctime())
        event.clear()   # Resets the flag.
        print('wait exit')
        count = count + 1

def setter(event, nloops):
    for i in range(nloops):
        time.sleep(random.randrange(2, 5))  # Sleeps for some time.
        event.set()

threads = []
nloops = 10

threads.append(Thread(target=waiter, args=(event, nloops)))
threads[-1].start()
threads.append(Thread(target=setter, args=(event, nloops)))
threads[-1].start()

for thread in threads:
    thread.join()
print("All done.")

multiprocessing Queue deadlock when spawn multi threads in one process

I created two processes: one spawns multiple threads and is responsible for writing data to a Queue, while the other reads data from the Queue. It frequently deadlocks when traffic is high, less often otherwise; the sleep in the run method of the write module affects how often (see the comment in the code). Let me put my code below:
Environment: Python 2.7
main.py
from multiprocessing import Process, Queue
from write import write
from read import read

if __name__ == "__main__":
    record_queue = Queue()
    table_queue = Queue()
    pw = Process(target=write, args=[record_queue, table_queue])
    pr = Process(target=read, args=[record_queue, table_queue])
    pw.start()
    pr.start()
    pw.join()
    pr.join()
write.py
from concurrent.futures import ThreadPoolExecutor, as_completed

def write(record_queue, table_queue):
    thread_num = 3
    pool = ThreadPoolExecutor(thread_num)
    futures = [pool.submit(run, record_queue, table_queue) for _ in range(thread_num)]
    results = [r.result() for r in as_completed(futures)]

def run(record_queue, table_queue):
    while True:
        if table_queue.empty():
            break
        table = table_queue.get()
        # adding this code below reduces deadlock opportunity.
        #import time
        #import random
        #time.sleep(random.randint(1, 3))
        process_with_table(record_queue, table_queue, table)

def process_with_table(record_queue, table_queue, table):
    #for short
    for item in [x for x in range(1000)]:
        record_queue.put(item)
read.py
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import Queue

def read(record_queue, table_queue):
    count = 0
    while True:
        item = record_queue.get()
        count += 1
        print("item: ", item)
        if count == 4:
            break
I googled it and there are similar questions on SO, but I can't see how they compare to my code, so can anyone help with this? Thanks.
I seem to have found a solution: change the run method in the write module to:
import time
import multiprocessing.queues  # needed for the Empty exception below

def run(record_queue, table_queue):
    while True:
        try:
            if table_queue.empty():
                break
            table = table_queue.get(timeout=3)
            process_with_table(record_queue, table_queue, table)
        except multiprocessing.queues.Empty:
            time.sleep(0.1)
and I no longer see any deadlock or blocking on the get method.
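The timeout works because empty() and get() are two separate operations: with several threads draining the same table_queue, every thread can see empty() return False for the last item, and the losers then block forever in a plain get(). A sketch of run() that avoids the race by using get_nowait() instead of polling empty() (it reuses process_with_table from write.py above):

from Queue import Empty  # on Python 2.7, multiprocessing queues raise Queue.Empty

def run(record_queue, table_queue):
    while True:
        try:
            table = table_queue.get_nowait()  # never blocks; raises Empty when drained
        except Empty:
            break
        process_with_table(record_queue, table_queue, table)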

How to get python to read all images in a directory one by one

My experience with Python is very limited, so I don't fully understand what the code does in this instance. This is part of the code from the TensorFlow for Poets lab.
import os, sys
import tensorflow as tf
import numpy as np
from PIL import Image

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# change this as you see fit
image_path = sys.argv[1]

# Read in the image_data
image_data = tf.gfile.FastGFile(image_path, 'rb').read()
image = Image.open(image_path)
image_array = image.convert('RGB')

# Loads label file, strips off carriage return
label_lines = [line.rstrip() for line
               in tf.gfile.GFile("retrained_labels.txt")]

# Unpersists graph from file
with tf.gfile.FastGFile("retrained_graph.pb", 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
    _ = tf.import_graph_def(graph_def, name='')

with tf.Session() as sess:
    # Feed the image_data as input to the graph and get first prediction
    softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
    predictions = sess.run(softmax_tensor, {'DecodeJpeg:0': image_array})

    # Sort to show labels of first prediction in order of confidence
    top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
    for node_id in top_k:
        human_string = label_lines[node_id]
        score = predictions[0][node_id]
        print('%s (score = %.5f)' % (human_string, score))

    filename = "results.txt"
    with open(filename, 'a+') as f:
        f.write('\n**%s**\n' % (image_path))
        for node_id in top_k:
            human_string = label_lines[node_id]
            score = predictions[0][node_id]
            f.write('%s (score = %.5f)\n' % (human_string, score))
I want the above code to read in a directory instead of a single image and then process them all and output the scores to the results.txt file.
Currently I can call this like so:
python this_file.py /root/images/1.jpg
How would I get this code to take the following input and process it?
python this_file.py /root/images/
Use os.listdir to list all files in the directory. Qualify it with a filter as well. Join the resulting files to their directory. Read them from the list with a for loop.
python this_file.py /root/images/
image_path = sys.argv[1]
image_paths = [os.path.join(image_path,img) for img in os.listdir(image_path) if '.jpg' in img]
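A rough sketch of the loop over that list, reusing sess, softmax_tensor, and label_lines from the script above (the graph/session setup stays outside the loop so the model is only loaded once):

for image_path in image_paths:
    # classify one image and append its scores to results.txt
    image_array = Image.open(image_path).convert('RGB')
    predictions = sess.run(softmax_tensor, {'DecodeJpeg:0': image_array})
    top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
    with open("results.txt", 'a+') as f:
        f.write('\n**%s**\n' % image_path)
        for node_id in top_k:
            f.write('%s (score = %.5f)\n' % (label_lines[node_id],
                                             predictions[0][node_id]))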
I also recommend re-examining your training function and strategy. It is good practice to abstract your entire network with tf variable placeholders as far as you can. In addition, it would be much more efficient to implement batching, and possibly convert your dataset to TFRecords.

Google Dataflow seems to drop 1000th record

I have set up a small test using Google Dataflow (apache-beam). The use case for the experiment is to take a (csv) file and write a selected column to a (txt) file.
The code for the experiment is listed below:
from __future__ import absolute_import
import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class EmitColDoFn(beam.DoFn):
    first = True
    header = ""

    def __init__(self, i):
        super(EmitColDoFn, self).__init__()
        self.line_count = Metrics.counter(self.__class__, 'lines')
        self.i = i

    def process(self, element):
        if self.first:
            self.header = element
            self.first = False
        else:
            self.line_count.inc()
            cols = re.split(',', element)
            return (cols[self.i],)

def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='/users/sms/python_beam/data/MOCK_DATA (4).csv',
                        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default="/users/sms/python_beam/data/",
                        # required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    column = (lines
              | 'email col' >> (beam.ParDo(EmitColDoFn(3)))
              | "col file" >> WriteToText(known_args.output, ".txt", shard_name_template="SS_Col"))

    result = p.run()
    result.wait_until_finish()

    if (not hasattr(result, 'has_job')    # direct runner
            or result.has_job):           # not just a template creation
        lines_filter = MetricsFilter().with_name('lines')
        query_result = result.metrics().query(lines_filter)
        if query_result['counters']:
            lines_counter = query_result['counters'][0]
            print "Lines committed", lines_counter.committed

run()
The last few lines of sample 1 are shown below:
990,Corabel,Feldbau,cfeldbaurh#deliciousdays.com,Female,84.102.162.190,DJ
991,Kiley,Rottcher,krottcherri#stanford.edu,Male,91.97.155.28,CA
992,Glenda,Clist,gclistrj#state.gov,Female,24.98.253.127,UA
993,Ingunna,Maher,imaherrk#army.mil,Female,159.31.127.19,PL
994,Megan,Giacopetti,mgiacopettirl#instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm#xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn#jalbum.net,Female,115.142.222.106,PL
Running this produces the expected output of:
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 996
Process finished with exit code 0
Now for the strange results. In the next run, the number of lines is increased to 1000.
994,Megan,Giacopetti,mgiacopettirl#instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm#xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn#jalbum.net,Female,115.142.222.106,PL
997,Shannen,Gaisford,sgaisfordr7#rediff.com,Female,167.255.222.92,RU
998,Lorianna,Slyne,lslyner8#cbc.ca,Female,54.169.60.13,CN
999,Franklin,Yaakov,fyaakovr9#latimes.com,Male,122.1.92.236,CN
1000,Wilhelmine,Cariss,wcarissra#creativecommons.org,Female,237.48.113.255,PL
But this time the output is:
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 999
Process finished with exit code 0
Inspection of the output file shows that the last line was NOT processed.
bdutnallrm#xrea.com
jcaddanrn#jalbum.net
sgaisfordr7#rediff.com
lslyner8#cbc.ca
fyaakovr9#latimes.com
Any ideas what is going on here?
EmitColDoFn skips the first line it sees, assuming there is one instance of it for each file. When you have more than 1000 lines, the DirectRunner creates two bundles: 1000 lines in the first one and 1 line in the second. In a Beam application, the input might be split into multiple bundles for processing in parallel, and there is no correlation between the number of files and the number of bundles. The same application can process terabytes of data spread across many files.
ReadFromText has an option 'skip_header_lines', which you can set to 1 in order to skip the header line in each of your input files.
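For reference, a rough sketch of how that could look in the pipeline above; the simplified DoFn is illustrative, only the skip_header_lines option itself comes from the answer:

# EmitColDoFn no longer needs the per-instance 'first' flag that made it
# swallow the first element of every bundle
class EmitColDoFn(beam.DoFn):
    def __init__(self, i):
        super(EmitColDoFn, self).__init__()
        self.line_count = Metrics.counter(self.__class__, 'lines')
        self.i = i

    def process(self, element):
        self.line_count.inc()
        cols = re.split(',', element)
        return (cols[self.i],)

# drop the header at read time instead of inside the DoFn
lines = p | 'read' >> ReadFromText(known_args.input, skip_header_lines=1)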

Getting too many deadlock errors while updating MSSQL table with pyodbc in parallel with multiprocessing

I am trying to open pickle files that have data within them, then update an MSSQL table with that data. It was taking forever: 10 days to update 1,000,000 rows. So I wrote a script for more parallelism. The more processes I run it with, the more errors I get, like this:
(<class 'pyodbc.Error'>, Error('40001', '[40001] [Microsoft][ODBC SQL Server Dri
ver][SQL Server]Transaction (Process ID 93) was deadlocked on lock resources wit
h another process and has been chosen as the deadlock victim. Rerun the transact
ion. (1205) (SQLExecDirectW)'), <traceback object at 0x0000000002791808>)
As you can see in my code, I keep retrying the update until it succeeds, and even sleep for a second here:
while True:
    try:
        updated = cursor.execute(update, 'Yes', fileName+'.'+ext, dt, size, uniqueID)
        break
    except:
        time.sleep(1)
        print sys.exc_info()
Is this because the multiprocessing module on Windows uses spawn instead of os.fork?
Is there a way to do this that will provide more of a speed-up?
I was told that the table can handle way more transactions than this...
#!C:/Python/python.exe -u
import pyodbc, re, pickle, os, glob, sys, time
from multiprocessing import Lock, Process, Queue, current_process

def UpDater(pickleQueue):
    for pi in iter(pickleQueue.get, 'STOP'):
        name = current_process().name
        f = pi

        cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=database.windows.net;DATABASE=DB;UID=user;PWD=pwd')
        cursor = cnxn.cursor()
        update = ("""UPDATE DocumentList
                     SET Downloaded=?, DownLoadedAs=?,DownLoadedWhen=?,DownLoadedSizeKB=?
                     WHERE DocNumberSequence=?""")

        r = re.compile('\d+')

        pkl_file = open(pi, 'rb')
        meta = pickle.load(pkl_file)
        fileName = meta[0][0]
        pl = r.findall(fileName)
        l = int(len(pl)-1)
        ext = meta[0][1]
        url = meta[0][2]
        uniqueID = pl[l]
        dt = meta[0][4]
        size = meta[0][5]

        while True:
            try:
                updated = cursor.execute(update, 'Yes', fileName+'.'+ext, dt, size, uniqueID)
                break
            except:
                time.sleep(1)
                print sys.exc_info()

        print uniqueID

        cnxn.commit()
        pkl_file.close()
        os.remove(fileName+'.pkl')
        cnxn.close()

if __name__ == '__main__':
    os.chdir('Pickles')
    pickles = glob.glob("*.pkl")
    pickleQueue = Queue()
    processes = []

    for item in pickles:
        pickleQueue.put(item)

    workers = int(sys.argv[1])
    for x in xrange(workers):
        p = Process(target=UpDater, args=(pickleQueue,))
        p.start()
        processes.append(p)
        pickleQueue.put('STOP')

    for p in processes:
        p.join()
I am using Windows 7 and the Python 2.7 Anaconda distribution.
EDIT
The answer below, using row locks, stopped the error from happening. However, the updates were still slow. It turns out an old-fashioned index on the primary key was needed for a 100x speed-up.
A few things to try. Using sleeps is a bad idea. First, could you try row-level locking?
update = ("""UPDATE DocumentList WITH (ROWLOCK)
SET Downloaded=?, DownLoadedAs=?,DownLoadedWhen=?,DownLoadedSizeKB=?
WHERE DocNumberSequence=? """)
Another option would be to wrap each in a transaction:
update = ("""
BEGIN TRANSACTION my_trans;
UPDATE DocumentList
SET Downloaded=?, DownLoadedAs=?,DownLoadedWhen=?,DownLoadedSizeKB=?
WHERE DocNumberSequence=?;
END TRANSACTION my_trans;
""")
Would either of these solutions work for you?
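In case it helps, a minimal sketch of how the ROWLOCK variant could slot into the worker's retry loop, retrying only on the 1205 deadlock (SQLSTATE '40001', as in the traceback above) instead of swallowing every exception; the helper name and back-off values are illustrative:

import time
import pyodbc

UPDATE_SQL = """UPDATE DocumentList WITH (ROWLOCK)
                SET Downloaded=?, DownLoadedAs=?, DownLoadedWhen=?, DownLoadedSizeKB=?
                WHERE DocNumberSequence=?"""

def update_row(cursor, params, retries=5):
    # retry only when SQL Server reports a deadlock (SQLSTATE '40001')
    for attempt in range(retries):
        try:
            cursor.execute(UPDATE_SQL, params)
            return
        except pyodbc.Error as e:
            if e.args and e.args[0] == '40001' and attempt < retries - 1:
                time.sleep(0.1 * (attempt + 1))  # brief back-off before retrying
            else:
                raise

Inside UpDater this would be called as update_row(cursor, ('Yes', fileName + '.' + ext, dt, size, uniqueID)) in place of the bare while True retry loop.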