Map Reduce: Why does this code give the correct output for max but not min? - mapreduce

I am testing some map reduce code in Ubuntu, python 2 with commmand:
cat testfile2 | ./mapper.py | sort | ./reducer.py
I get the correct output for max but not min, ever time I get the value 1 for min as though it hasn't changed from it's original value. Every single value 'value' is less than 1 so on the first iteration through the for loop min should change to the first value and then update the min on further iterations. Am I loosing my mind or is there a silly mistake in the code? Please help!
#!/usr/bin/python
import sys
def reducer():
max = 0
min = 1
old_tuple = ('foo', 'bar')
i = 0
for line in sys.stdin:
data = line.strip().split("\t")
if len(data) != 3:
continue
city, year, value = data
new_tuple = (city, year)
if old_tuple != new_tuple:
if i != 0:
print "{0}\t{1}\t{2}\t{3}".format(old_tuple[0], old_tuple[1], max, min)
max = 0
min = 1
i += 1
old_tuple = new_tuple
if min > value:
min = value
if max < value:
max = value
if old_tuple != ('foo', 'bar'):
print "{0}\t{1}\t{2}\t{3}".format(old_tuple[0], old_tuple[1], max, min)
if __name__ == '__main__':
reducer()
The output I get looks like so
Alert 2009 0.215236752 1
Winnipeg 2017 0.032557214 1

Firstly I was using min and max as variable names that are keywords. After changing
min => minimum
max => maximum
the output was still incorrect. The same problem actually. It was only after attempting to get the min and max in a more Pythonic way that I got it working. I'm new to Python so this may still be not the best most Pythonic way but the code below is at least getting the min and max values as needed.
#!/usr/bin/python
import sys
import math
def reducer():
list_ = []
old_tuple = ('foo', 'bar')
i = 0
for line in sys.stdin:
data = line.strip().split("\t")
if len(data) != 3:
continue
city, year, value = data
new_tuple = (city, year)
if old_tuple != new_tuple:
if i != 0:
print "{0}\t{1}\t{2}\t{3}".format(old_tuple[0], old_tuple[1], max(list_), min(list_))
list_ = []
i += 1
list_.append(value)
old_tuple = new_tuple
if old_tuple != ('foo', 'bar'):
print "{0}\t{1}\t{2}\t{3}".format(old_tuple[0], old_tuple[1], max(list_), min(list_))
if __name__ == '__main__':
reducer()

Related

Is there a reverse way to find number of people with given 0.5 probability that two people will have same birthday but no using mathematical formula?

I'm doing birthday paradox, and want to know how many people can meet 0.5 probability that two people have same birthday by using python.
I have tried no using mathematical formula to find probability with given the number of people by using random and randint in python
import random
def random_birthdays():
bdays = []
bdays = [random.randint(1, 365) for i in range(23)]
bdays.sort()
for x in range(len(bdays)):
while x < len(bdays)-1:
print x
if bdays[x] == bdays[x+1]:
#print(bdays[x])
return True
x+=1
return False
count = sum(random_birthdays() for _ in range(1000))
print('In a sample of 1000 classes each with 23 pupils, there were', count, 'classes with individuals with the same birthday')
I expect some hints or codes that can help me through this.
Well, problem with your code you check only consecutive birthday equality. Better check it using sets
Along the line
import random
def sim(n):
"""simulate birthdays for n people"""
a = set([random.randint(1, 365) for _ in range(n)])
if len(a) == n:
return False
return True
print(sim(23))
print(sim(23))
print(sim(23))
print(sim(23))
print(sim(23))
Function above will return true if there are same day birthday for n people, false otherwise.
Call it 1000000 times for n = 20, 21, ...., 25 and count how many Trues vs Falses are there
Running code
nt = 0
nf = 0
n = 23
for k in range(0, 1000000):
if sim(n):
nt += 1
else:
nf += 1
print((nt, nf))
for n = 23 and n = 22 produced
(506245, 493755)
(475290, 524710)

Filling Value of a Pandas Data Frame From a Large DB Query (Python)

I am running a snippet of code that queries a database and then fills in a pandas dataframe with a value of 1 if that tuple is present in the query. it does this by running the query then iterates over the tuples and fills in the dataframe. However, the query returns almost 8 million rows of data.
My question is if anyone knows how to speed up a process like this. Here is the code below:
user_age = pd.read_sql_query(sql_age, datastore, index_col=['userid']).age.astype(np.int, copy=False)
x = pd.DataFrame(0, index=user_age.index, columns=range(366), dtype=np.int8)
for r in pd.read_sql_query(sql_active, datastore, chunksize=50000):
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
Thank you in advance!
You could save some time by replacing the Python loop
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
with a NumPy array assignment using "advanced integer indexing":
x[npidx[r['userid']], r['day']] = 1
On a 80000-row DataFrame, using_numpy (below) is about 6x faster:
In [7]: %timeit orig()
1 loop, best of 3: 984 ms per loop
In [8]: %timeit using_numpy()
10 loops, best of 3: 162 ms per loop
import numpy as np
import pandas as pd
def mock_read_sql_query():
np.random.seed(2016)
for arr in np.array_split(index, N//M):
size = len(arr)
df = pd.DataFrame({'userid':arr , 'day':np.random.randint(366, size=size)})
df = df[['userid', 'day']]
yield df
N, M = 8*10**4, 5*10**2
index = np.arange(N)
np.random.shuffle(index)
columns = range(366)
def using_numpy():
npidx = np.empty_like(index)
npidx[index] = np.arange(len(index))
x = np.zeros((len(index), len(columns)), dtype=np.int8)
for r in mock_read_sql_query():
x[npidx[r['userid']], r['day']] = 1
x = pd.DataFrame(x, columns=columns, index=index)
return x
def orig():
x = pd.DataFrame(0, index=index, columns=columns, dtype=np.int8)
for r in mock_read_sql_query():
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
return x
expected = orig()
result = using_numpy()
expected_index, expected_col = np.where(expected)
result_index, result_col = np.where(result)
assert np.equal(expected_index, result_index).all()
assert np.equal(expected_col, result_col).all()

Appending individual lists created from a list comprehension using values from input()

I created a list comprehension to provide me the following:
listoflists = [[] for i in range(252*5)]
I then simplified the list in variable newlists to contain only the number of lists in range(weeks) which is a dynamic variable.
I want to append each individual list in the following loop for a specified range with the append process moving through each list after its reached a specified length. The values are generated from an input function. For instance, if the first list in newlists exceeds a length of 5 I want the values following the 5th loop to then append to the next list and so on. The code I currently have is:
p = 0
singlist = []
listoflists = [[] for i in range(252*5)]
newlists= [listoflists[i] for i in range(weeks)]
while p<(int(people)*weeks): #fix appending process
for i in range(int(people)*weeks):
weekly =input("Put your hours: ")
singlist.append(int(weekly))
p += 1
if weekly.isalpha() == True:
print("Not a valid amount of time")
for i in range(0,weeks):
while len(newlists[i])<int(people):
newlists[i].append(singlist[i])
This code however appends the same values to all lists in range weeks. What is the most efficient way to fix this? Thank you!
if singlist = [10,15,20,25]
desire output for newlists is: [[10,15],[20,25]]
How I've structured the program:
import sys
import numpy as np
import pandas as pd
from datetime import tzinfo,timedelta,datetime
import matplotlib.pyplot as plt
import itertools as it
from itertools import count,islice
team = []
y = 0
while y == 0:
try:
people = input("How many people are on your engagement? ")
if people.isdigit() == True:
y += 1
except:
print("Not a number try again")
z= 0
while z<int(people):
for i in range(int(people)):
names = input("Name: ")
if names.isalpha() == False:
team.append(names)
z+=1
elif names.isdigit() == True:
print("Not a name try again")
ties = [] # fix looping for more than one person
e = 0
while e<int(people):
for i in range(int(people)):
title = input("What is their title: ")
if title.isdigit() == True:
print("Not a title try again")
else:
ties.append(title)
e+=1
values = [] #fix looping for more than one person
t= 0
while t <int(people):
for i in range(int(people)):
charge = input("How much are you charging for them: ")
if charge.isalpha() == True:
print("Not a valid rate")
else:
values.append(int(charge))
t +=1
weeks = int(input("How many weeks are you including: "))
days = []
x = 0
while x<weeks: #include a parameter for dates of a 7 day difference to only be permitted
try:
for i in range(int(weeks)):
dates = input("Input the dates (mm/dd/yy): ")
dt_start = datetime.strptime(dates,'%m/%d/%y')
days.append(dates)
x+=1
except:
print("Incorrect format")
p = 0
singlist = []
listoflists = [[] for i in range(252*5)]
newlists= [listoflists[i] for i in range(weeks)]
while p<(int(people)*weeks): #fix appending process
for i in range(int(people)*weeks):
weekly =input("Put your hours: ")
singlist.append(int(weekly))
p += 1
if weekly.isalpha() == True:
print("Not a valid amount of time")
def func(items,n):
items = iter(items)
for i in it.count():
out = it.islice(items,weeks*i,weeks*i+n)
if not out:
break
output = list(func(singlist,weeks))
# items = [1,2,3,...n]
# output = [[1,2],[3,4],..], n = 2 elements each
items_ = iter(items)
outiter = iter(lambda: [next(items_) for i in range(n)],[])
outlist = list(outiter)
You can do the same thing using while loop in place of count() and [a:b] slice operation on list instead of islice(). But using iterators is very efficient.

Pandas Series Resampling: How do I get moves based on certain previous changes?

import pandas as pd
import numpy as np
import datetime as dt
# Create Column names
col_names = ['930', '931', '932', '933', '934', '935']
# Create Index datetimes
idx_names = pd.date_range(start = dt.datetime(2011, 1, 1), periods = 10, freq= 'D')
# Create dataframe with previously created column names and index datetimes
df1 = pd.DataFrame(np.random.randn(10, 6), columns=col_names, index=idx_names)
# Change the column names from strings to datetimes.time() object
df1.columns = [dt.datetime.strptime(x, '%H%M').time() for x in df1.columns]
# This step and the next step changes the dataframe into a chronological timeseries
df2 = df1.T.unstack()
df2.index = [dt.datetime.combine(x[0], x[1]) for x in df2.index.tolist()]
# Show the series
df2
Question: What is the most pythonic/pandas-thonic way to create a specific list? This list would say 'Every time the difference between 9:32 and 9:34 is between 0 and .50, what is the difference between 9:34 and the next day's 9:34.
I was doing this with the numbers in a dataframe format (dates along the x-axis and times along the y-axis) and I would say something like (below is pseudo-code, above is not pseudo-code):
# Create a column with wrong answers and right answers
df['Today 934 minus yesterday 934'] = df[934] - df[934].shift(1)
# Boolean mask were condition 1 (diff > 0) and condition 2 (diff < .5) are true
mask = (df[934].shift(1) - df[932].shift(1) > 0) & (df[934].shift(1) - df[932].shift(1) < .5)
# Apply the boolean mask to the dataframe. This is will remove all the answers
# I dont want from the df['Today 934 minus yesterday 934'] column
df2 = df[mask]
# Only the answers I want:
answers = df['Today 934 minus yesterday 934']
My attempt, basically a filled in version of your pseudo-code. Someone else may have a cleaner approach.
mask1 = (df2.index.hour == 9) & (df2.index.minute == 34)
mask2 = (df2.index.hour == 9) & (df2.index.minute == 32)
diff_934 = df2[mask1] - df2[mask1].shift(-1)
diff_934 = diff_934[diff_934.index.minute == 34]
diff_932 = df2[mask1|mask2] - df2[mask1|mask2].shift(-1)
diff_932 = diff_932[diff_932.index.minute == 34]
diff_932 = diff_932[(diff_932 > 0) & (diff_932 < .5)]
answer = diff_934.reindex(diff_932.index)
In [116]: answer
Out[116]:
2011-01-02 09:34:00 -0.874153
2011-01-08 09:34:00 0.186254
dtype: float64

Raster plot not showing symbols using plt.scatter function

I have to do raster plot, of 1 neuron, with 10 trials of data and time is 4500 ms.
import numpy as np
import matplotlib.pyplot as plt
#Plotting 1 neuron with 12 trials of info
maatriks = []
for i in range(1,14):
if i<10:
string = 'C:\\Users\\latel\\Desktop\\kool\\Neuro\\prax3\\data\\lgn\\plain\\neuron_01_stimulus_0'+str(i)+'.csv'
else:
string = 'C:\\Users\\latel\\Desktop\\kool\\Neuro\\prax3\\data\\lgn\\plain\\neuron_01_stimulus_'+str(i)+'.csv'
data_in = np.genfromtxt(string,dtype = 'int', delimiter = ',' or '\n')
maatriks.append(data_in)
data = np.array(maatriks)
print data.shape
spikes = np.array(data[8])
print spikes.shape
nonzeros = 0
for i,item in enumerate(spikes):
nonzeros += np.count_nonzero(item)
plt.scatter(item, i*np.ones(item.shape), marker = '|')
print nonzeros
plt.ylim(-1,len(spikes))
plt.xlim(0,len(spikes[0]))
plt.xlabel("Time is seconds")
plt.ylabel("Trial number")
plt.tight_layout()
plt.show()
This outputs me(the prints) :
(13L, 10L, 4501L)
(10L, 4501L)
55
But the plot is empty , i cannot understand why the plot is empty. There should be 55 lines in my opinion ...
Edit: Got it working. Added this code.
for row in spikes:
for i in range(len(row)):
if (row[i] == 1):
row[i] = i
Because the data was only 0 or 1.
Anyone know how to do it shorter ?