concurrent.futures in Python 2

I have a function pre_raw() and pandas data train_raw.values. In Python 3 I can write:
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(pre_raw, train_raw.values)
How do I write this in Python 2?
Thanks.

import multiprocessing

procs = []
for val in train_raw.values:  # one process per row of train_raw.values
    p = multiprocessing.Process(target=pre_raw, args=(val,))
    procs.append(p)
    p.start()
for p in procs:
    p.join()
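
If you would rather keep the concurrent.futures API itself, there is also the futures backport on PyPI (pip install futures), which provides concurrent.futures on Python 2, so the Python 3 snippet runs almost unchanged. A minimal sketch, with stand-ins for the question's pre_raw and train_raw.values:

import concurrent.futures

def pre_raw(row):  # stand-in for the question's function
    print(row)

rows = [1, 2, 3]   # stand-in for train_raw.values

# the with-block waits for all submitted work on exit;
# list() forces the lazy map iterator so worker exceptions surface here
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    list(executor.map(pre_raw, rows))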


Cloud datastore client changes type from int to float

I was writing a script in Python using the google-cloud-datastore module to upload data from my CSV to datastore. The script seems to work fine, but there is a problem I'm stuck with: the integer values from my CSV are being stored as floating point numbers. Is that the default way data is sent to datastore, or am I doing something wrong?
Here's my code:
import sys
import getopt

import pandas as pd
from google.cloud import datastore

def write_dict_chunks(data, SIZE=100):
    log_count = 0
    datastore_client = datastore.Client()
    task_key = datastore_client.key(kind)
    for i in xrange(0, len(data), SIZE):
        entities = []
        for each_entry in data[i : i+SIZE]:
            nan_check = lambda v: v if str(v) != 'nan' else None
            string_check = lambda v: v.decode('utf-8') if isinstance(v, str) else v
            write_row = {k: nan_check(string_check(v)) for k, v in each_entry.iteritems()}
            entity = datastore.Entity(key=task_key)
            entity.update(write_row)
            entities.append(entity)
        datastore_client.put_multi(entities)
        log_count += len(entities)
        print 'Wrote {} entities to datastore'.format(log_count)

try:
    opts, args = getopt.getopt(sys.argv[1:], "ho:v", ["kind=", "filepath="])
    if len(args) > 0:
        for each in args:
            print 'Unrecognized argument: ' + each
        sys.exit(2)
except getopt.GetoptError as err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    print 'Usage: python parse_csv.py --kind=kind_name --filepath=path_to_csv'

kind = None
filepath = None
for option, argument in opts:
    if option in '--kind':
        kind = argument
    elif option in '--filepath':
        filepath = argument

df = pd.read_csv(filepath)
df = df.to_dict(orient='records')
write_dict_chunks(df)
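
Not covered in this thread, but worth checking first: the floats may come from pandas rather than datastore. read_csv promotes an integer column to float64 as soon as it contains a missing value, so the dicts handed to put_multi already hold floats. A minimal sketch with a hypothetical two-column CSV:

import io
import pandas as pd

# the blank cell in row 2 makes pandas promote "score" to float64
csv_data = u"id,score\n1,10\n2,\n3,30\n"
df = pd.read_csv(io.StringIO(csv_data))
print(df.dtypes)                     # id: int64, score: float64
print(df.to_dict(orient='records')) # score comes out as 10.0, nan, 30.0

# one option: fill (or drop) the NaNs, then cast back to int before uploading
df['score'] = df['score'].fillna(0).astype(int)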

Type error: for p in permutations(a): TypeError: 'int' object is not iterable

Here is my code:
t = (int(input()))
from itertools import permutations
for i in range(0, t):
    a = input()
    sum = 0
    count = 0
    for p in permutations(a):
        sum = sum + int(''.join(p))
        count = count + 1
    print(str(count) + " " + str(sum))
It works fine in Python 3.x but not in Python 2.7.
t = (int(input()))
from itertools import permutations
for i in range(0, t):
    a = raw_input()
    sum = 0
    count = 0
    for p in permutations(a):
        sum = sum + int(''.join(p))
        count = count + 1
    print(str(count) + " " + str(sum))
Change input to raw_input in line 4. In Python 2.x, input() evaluates whatever is typed (so digits become an int), while raw_input() always returns a str; permutations needs an iterable, which the int is not.
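
A short demonstration of why the type matters here; the literals below stand in for what the two input functions return when the user types 123:

from itertools import permutations

a = 123        # what Python 2's input() yields for the keystrokes 123
try:
    list(permutations(a))
except TypeError as e:
    print(e)   # 'int' object is not iterable

a = '123'      # what raw_input() yields
print([''.join(p) for p in permutations(a)])
# ['123', '132', '213', '231', '312', '321']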

How to get direct return value instead of <MagicMock name='mock.xx' id='yy'>

This is my testing code:
import mock
import unittest

def check_method_return(input):
    return_value = input.ops.list()
    if not return_value:
        return False
    return return_value

def check_method_len(input):
    return_value = input.ops.list()
    if len(return_value) < 1:
        return False
    return return_value

class TestMockReturnValue(unittest.TestCase):
    def test_mock_return(self):
        fake_input = mock.MagicMock()
        fake_input().ops.list.return_value = []
        result = check_method_return(fake_input)
        self.assertFalse(result)

    def test_mock_len(self):
        fake_input = mock.MagicMock()
        fake_input().ops.list.return_value = []
        result = check_method_len(fake_input)
        self.assertFalse(result)

if __name__ == '__main__':
    test_empty = []
    if not test_empty:
        print("empty list equals to False")
    unittest.main()
The run result output is:
empty list equals to False
.F
======================================================================
FAIL: test_mock_return (__main__.TestMockReturnValue)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "test_mock.py", line 31, in test_mock_return
    self.assertFalse(result)
AssertionError: <MagicMock name='mock.ops.list()' id='140459969939728'> is not false
----------------------------------------------------------------------
Ran 2 tests in 0.005s

FAILED (failures=1)
An empty list is falsy in an if test, so check_method_return should behave exactly like check_method_len against a real object.
So, my question is:
Is there a way to make the unit test pass for check_method_return?
Here is the solution. The difference between the two setups is which mock gets configured: fake_input.ops.list hangs off fake_input itself, while fake_input().ops.list hangs off the child mock that fake_input returns when called. The code under test calls input.ops.list() without ever calling input itself, so only the first form takes effect:
# this mocks away input.ops.list()
fake_input.ops.list.return_value = []
# this configures fake_input().ops.list(), which the code under test never touches
fake_input().ops.list.return_value = []
To show the difference between the two ways of setting the mock's return value, this interactive session may help:
[gliang@www ~]$ ipython
Python 2.6.6 (r266:84292, Jul 23 2015, 15:22:56)
IPython 0.13.2 -- An enhanced Interactive Python.

In [1]: import unittest

In [2]: import mock
In [4]: fake_input = mock.MagicMock()

In [5]: fake_input().ops.list.return_value = []

In [6]: print fake_input().ops.list.return_value
[]

In [7]: print fake_input.ops.list.return_value
<MagicMock name='mock.ops.list()' id='15160848'>

In [8]: fake_input2 = mock.MagicMock()

In [9]: fake_input2.ops.list.return_value = []

In [10]: print fake_input2.ops.list.return_value
[]

In [11]: quit()
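
Putting it together, a sketch of the passing test with the corrected mock setup (same check_method_return as in the question):

import mock
import unittest

def check_method_return(input):
    return_value = input.ops.list()
    if not return_value:
        return False
    return return_value

class TestFixed(unittest.TestCase):
    def test_mock_return(self):
        fake_input = mock.MagicMock()
        # configure ops.list on the mock itself, not on its return value
        fake_input.ops.list.return_value = []
        self.assertFalse(check_method_return(fake_input))

if __name__ == '__main__':
    unittest.main()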

Unpickling data from Python 2 with unicode strings in Python 3

I have data from Python 2.7 that I pickled like this:
#!/usr/bin/env python2
# coding=utf-8
import datetime
import pickle

data = {1: datetime.date(2014, 3, 18),
        'string-key': u'ünicode-string'}
pickle.dump(data, open('file.pickle', 'wb'))
The only way I found to load this in Python 3.4 is:
data = pickle.load(open('file.pickle', "rb"), encoding='bytes')
Now my unicode strings are fine, but the dict keys are bytes. print(repr(data)) gives:
{1: datetime.date(2014, 3, 18), b'string-key': 'ünicode-string'}
Does anybody have an idea how to get around rewriting my code as data[b'string-key'], or alternatively converting all existing files?
This is not a real answer, only a workaround: it rewrites the pickled data so Python 3 can read it natively. It works under Python 3.4 (but not 3.3):
#!/usr/bin/env python3
import glob
import pickle

def bytes_to_unicode(ob):
    t = type(ob)
    if t in (list, tuple):
        l = [str(i, 'utf-8') if type(i) is bytes else i for i in ob]
        l = [bytes_to_unicode(i) if type(i) in (list, tuple, dict) else i for i in l]
        ro = tuple(l) if t is tuple else l
    elif t is dict:
        byte_keys = [i for i in ob if type(i) is bytes]
        for bk in byte_keys:
            v = ob[bk]
            del(ob[bk])
            ob[str(bk, 'utf-8')] = v
        for k in ob:
            if type(ob[k]) is bytes:
                ob[k] = str(ob[k], 'utf-8')
            elif type(ob[k]) in (list, tuple, dict):
                ob[k] = bytes_to_unicode(ob[k])
        ro = ob
    else:
        ro = ob
        print("unprocessed object: {0} {1}".format(t, ob))
    return ro

for fn in glob.glob('*.pickle'):
    data = pickle.load(open(fn, "rb"), encoding='bytes')
    ndata = bytes_to_unicode(data)
    pickle.dump(ndata, open(fn + '3', "wb"))
The Python docs say:
The pickle serialization format is guaranteed to be backwards compatible across Python releases.
I didn't find a way to pickle.load Python-2.7 pickled data in Python 3.3 -- not even data that contained only ints and dates.
Have a look at the implementation.
You can subclass the Unpickler and override the byte-string deserialization to produce strings.
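
A further option, not from the answers above and offered as a hedged suggestion: for data like the example here, where the keys are ASCII str and the real text was pickled as unicode, loading with encoding='latin1' is often enough. It decodes every Python 2 str byte-for-byte into a Python 3 str (so genuinely non-ASCII byte strings would come out mangled), and the datetime constructors accept the latin-1 decoded state:

import pickle

with open('file.pickle', 'rb') as f:
    data = pickle.load(f, encoding='latin1')

print(repr(data))
# should give: {1: datetime.date(2014, 3, 18), 'string-key': 'ünicode-string'}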

Multiprocessing, how to run processes in parallel without creating zombies?

I'd like to run the processes in parallel, so I commented out p.join() in the __main__ section.
What are the consequences of not having a .join(), or better yet, should I be using a different approach for parallel multiprocessing?
import multiprocessing

def worker(num):
    x = 0
    for i in range(10000):
        x += 1
    print x, num

if __name__ == '__main__':
    for i in range(4):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()
        # p.join()
Join the processes after starting them.
if __name__ == '__main__':
    procs = []
    for i in range(4):
        p = multiprocessing.Process(target=worker, args=(i,))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
If you run multiple similar tasks, you can use multiprocessing.Pool.
if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pool.map(worker, range(4))
    pool.close()
    pool.join()
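
On the zombie question itself: a child that has exited but was never joined stays a zombie until the parent reaps it. multiprocessing reaps lazily, and calling active_children() has the documented side effect of joining any processes that have already finished. A sketch that starts the workers in parallel and then polls until they have all been reaped:

import multiprocessing
import time

def worker(num):
    x = 0
    for i in range(10000):
        x += 1
    print('%d %d' % (x, num))

if __name__ == '__main__':
    for i in range(4):
        multiprocessing.Process(target=worker, args=(i,)).start()
    # each active_children() call joins (reaps) the children that finished
    while multiprocessing.active_children():
        time.sleep(0.1)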