Cumsum entire table and reset at zero - python-2.7

I have the following DataFrame:
d = pd.DataFrame({'one' : [0,1,1,1,0,1], 'two' : [0,0,1,0,1,1]})
d
   one  two
0    0    0
1    1    0
2    1    1
3    1    0
4    0    1
5    1    1
I want a cumulative sum that resets at every zero. The desired output is:
pd.DataFrame({'one' : [0,1,2,3,0,1], 'two' : [0,0,1,0,1,2]})
   one  two
0    0    0
1    1    0
2    2    1
3    3    0
4    0    1
5    1    2
I have tried using groupby, but it does not work across the entire table.

df2 = d.apply(lambda x: x.groupby((~x.astype(bool)).cumsum()).cumsum())
print(df2)
Output:
   one  two
0    0    0
1    1    0
2    2    1
3    3    0
4    0    1
5    1    2
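The groupby trick works because (~x.astype(bool)).cumsum() builds a grouping key that increments at every zero, so each run between zeros becomes its own group. A minimal sketch tracing the keys for column one:
import pandas as pd
s = pd.Series([0, 1, 1, 1, 0, 1])
keys = (~s.astype(bool)).cumsum()          # zeros start a new group
print(keys.tolist())                       # [1, 1, 1, 1, 2, 2]
print(s.groupby(keys).cumsum().tolist())   # [0, 1, 2, 3, 0, 1]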

pandas
def cum_reset_pd(df):
    csum = df.cumsum()
    return (csum - csum.where(df == 0).ffill()).astype(df.dtypes)
cum_reset_pd(d)
   one  two
0    0    0
1    1    0
2    2    1
3    3    0
4    0    1
5    1    2
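To see why the subtraction resets the count, here is a trace of the intermediates for column one (note this assumes each column starts with a zero, as in d; otherwise the ffill leaves leading NaNs):
# df['one']                   -> [0, 1, 1, 1, 0, 1]
# csum = df.cumsum()          -> [0, 1, 2, 3, 3, 4]
# csum.where(df == 0)         -> [0, NaN, NaN, NaN, 3, NaN]
# csum.where(df == 0).ffill() -> [0, 0, 0, 0, 3, 3]
# csum - ffilled              -> [0, 1, 2, 3, 0, 1]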
numpy
import numpy as np

def cum_reset_np(df):
    v = df.values
    z = np.zeros_like(v)
    j, i = np.where(v.T)
    r = np.arange(1, i.size + 1)
    p = np.where(
        np.append(False, (np.diff(i) != 1) | (np.diff(j) != 0))
    )[0]
    b = np.append(0, np.append(p, r.size))
    z[i, j] = r - b[:-1].repeat(np.diff(b))
    return pd.DataFrame(z, df.index, df.columns)
cum_reset_np(d)
   one  two
0    0    0
1    1    0
2    2    1
3    3    0
4    0    1
5    1    2
Why go through this trouble?
Because it's quicker!
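A minimal timing sketch to check the claim yourself; the frame size and repetition count here are arbitrary choices, and absolute numbers will vary by machine:
import timeit
big = pd.concat([d] * 10000, ignore_index=True)
print(timeit.timeit(lambda: cum_reset_pd(big), number=10))
print(timeit.timeit(lambda: cum_reset_np(big), number=10))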

This one is without using Pandas, but using NumPy and list comprehensions:
import numpy as np

d = {'one': [0,1,1,1,0,1], 'two': [0,0,1,0,1,1]}
out = {}
for key in d.keys():
    l = d[key]
    indices = np.argwhere(np.array(l) == 0).flatten()
    indices = np.append(indices, len(l))
    out[key] = np.concatenate([np.cumsum(l[indices[n-1]:indices[n]])
                               for n in range(1, indices.shape[0])]).ravel()
print(out)
First, I find all occurrences of 0 (the positions at which to split the lists), then I calculate the cumsum of the resulting sublists and insert them into a new dict.
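For the sample data, the printed dict comes out as something like the following (note this approach assumes every list starts with a zero, since the slices begin at the first zero found):
# {'one': array([0, 1, 2, 3, 0, 1]), 'two': array([0, 0, 1, 0, 1, 2])}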

This should do it:
import pandas as pd

d = {'one' : [0,1,1,1,0,1], 'two' : [0,0,1,0,1,1]}
one = d['one']
two = d['two']
i = 0
new_one = []
for item in one:
    if item == 0:
        i = 0
    else:
        i += item
    new_one.append(i)
j = 0
new_two = []
for item in two:
    if item == 0:
        j = 0
    else:
        j += item
    new_two.append(j)
d['one'], d['two'] = new_one, new_two
df = pd.DataFrame(d)
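The two loops are identical except for the column they walk, so a small helper removes the duplication. A minimal sketch under the same assumptions (reset on zero, otherwise accumulate); reset_cumsum is a name introduced here for illustration:
import pandas as pd

def reset_cumsum(values):
    # running total that restarts whenever a zero is seen
    total, out = 0, []
    for v in values:
        total = 0 if v == 0 else total + v
        out.append(total)
    return out

d = {'one': [0,1,1,1,0,1], 'two': [0,0,1,0,1,1]}
df = pd.DataFrame({k: reset_cumsum(v) for k, v in d.items()})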

Related

Creating a boolean pandas DataFrame from a list within a list

[(['Piano'], 'Beethoven - opus22 4.mid'), (['Piano'], 'Borodin - ps7.mid'), (['Piano'], 'Chopin - op18.mid'), ([None, 'Guitar', 'StringInstrument', 'Acoustic Bass'], 'Cyndi Lauper - True Colors.mid'), (['Piano', 'Fretless Bass', 'StringInstrument', None], 'Frank Mills - Musicbox Dancer.mid'), (['Piano', 'Acoustic Bass', None, 'Baritone Saxophone'], 'George Benson - On Broadway.mid'), (['Piano'], 'Grieg - Voeglein.mid'), (['Piano'], 'Mozart - 333 3.mid'), ([None, 'Pan Flute', 'Piano', 'Piccolo', 'Violin'], 'The Corrs - Dreams.mid'), (['Piano', None, 'Fretless Bass'], 'ABBA - Money Money Money.mid')]
The above list pairs each song with the instruments used in that song. I want to make a boolean pandas DataFrame from these songs, with the None "instrument" removed. The image below gives an example:
(image: desired DataFrame)
I tried making a DataFrame for every single instrument and merging them; however, this did not produce the desired DataFrame.
Try:
import pandas as pd
lst = [
    (["Piano"], "Beethoven - opus22 4.mid"),
    (["Piano"], "Borodin - ps7.mid"),
    (["Piano"], "Chopin - op18.mid"),
    (
        [None, "Guitar", "StringInstrument", "Acoustic Bass"],
        "Cyndi Lauper - True Colors.mid",
    ),
    (
        ["Piano", "Fretless Bass", "StringInstrument", None],
        "Frank Mills - Musicbox Dancer.mid",
    ),
    (
        ["Piano", "Acoustic Bass", None, "Baritone Saxophone"],
        "George Benson - On Broadway.mid",
    ),
    (["Piano"], "Grieg - Voeglein.mid"),
    (["Piano"], "Mozart - 333 3.mid"),
    (
        [None, "Pan Flute", "Piano", "Piccolo", "Violin"],
        "The Corrs - Dreams.mid",
    ),
    (["Piano", None, "Fretless Bass"], "ABBA - Money Money Money.mid"),
]

all_data = []
for instruments, title in lst:
    d = {"title": title}
    for i in instruments:
        if i is not None:
            d[i] = 1
    all_data.append(d)
df = pd.DataFrame(all_data).fillna(0).set_index("title").astype(int)
df.index.name = None
print(df)
Prints:
Piano Guitar StringInstrument Acoustic Bass Fretless Bass Baritone Saxophone Pan Flute Piccolo Violin
Beethoven - opus22 4.mid 1 0 0 0 0 0 0 0 0
Borodin - ps7.mid 1 0 0 0 0 0 0 0 0
Chopin - op18.mid 1 0 0 0 0 0 0 0 0
Cyndi Lauper - True Colors.mid 0 1 1 1 0 0 0 0 0
Frank Mills - Musicbox Dancer.mid 1 0 1 0 1 0 0 0 0
George Benson - On Broadway.mid 1 0 0 1 0 1 0 0 0
Grieg - Voeglein.mid 1 0 0 0 0 0 0 0 0
Mozart - 333 3.mid 1 0 0 0 0 0 0 0 0
The Corrs - Dreams.mid 1 0 0 0 0 0 1 1 1
ABBA - Money Money Money.mid 1 0 0 0 1 0 0 0 0
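For comparison, a hedged alternative sketch using Series.explode (available in pandas 0.25 and later) and crosstab; it yields the same 0/1 table, though rows and columns come out sorted:
s = pd.Series([inst for inst, _ in lst], index=[t for _, t in lst]).explode().dropna()
df2 = pd.crosstab(s.index, s).clip(upper=1)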

Pandas calculating column based on inter-dependent lagged values

I have a DataFrame that looks like the following; the rightmost two columns are my desired columns:
   Open  Close  open_to_close  close_to_next_open  open_desired  close_desired
0     0      0              0                   3             0              0
1     0      0              4                   8             3              7
2     0      0              1                   1            15             16
The calculations are as follows:
open_desired = close_desired (prior row) + close_to_next_open (prior row)
close_desired = open_desired + open_to_close
How do I implement this in a loop, carrying it through to the last row?
df = pd.DataFrame({'open': [0,0,0], 'close': [0,0,0], 'open_to_close': [0,4,1], 'close_to_next_open': [3,8,1]})
df['close_desired'] = 0
df['open_desired'] = 0
# First step: open_desired in the current row depends on close_desired in the previous row
df['open_desired'] = df['close_desired'].shift() + df['close_to_next_open'].shift()
# Second step: close_desired in the current row depends on open_desired in the same row
df['close_desired'] = df['open_desired'] + df['open_to_close']
df.fillna(0, inplace=True)
The only way I can think of doing this is with iterrows():
for row, v in df.iterrows():
    if row > 0:
        df.loc[row, 'open_desired'] = df.shift(1).loc[row, 'close_desired'] + df.shift(1).loc[row, 'close_to_next_open']
        df.loc[row, 'close_desired'] = df.loc[row, 'open_desired'] + df.loc[row, 'open_to_close']
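Because close_desired is just open_desired plus open_to_close, the recurrence telescopes into a running sum: open_desired on each row equals the cumulative sum of (open_to_close + close_to_next_open) over all prior rows. That allows a vectorized version with no explicit loop; a minimal sketch:
step = df['open_to_close'] + df['close_to_next_open']
df['open_desired'] = step.cumsum().shift().fillna(0)
df['close_desired'] = df['open_desired'] + df['open_to_close']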

Function I defined is not cleaning my list properly

Here is my minimal working example:
list1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20] #len = 21
list2 = [1,1,1,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,1,1,0] #len = 21
list3 = [0,0,1,0,1,1,0,1,0,1,0,1,1,1,0,1,0,1,1,1,1] #len = 21
list4 = [1,0,0,1,1,0,0,0,0,1,0,1,1,1,1,0,1,0,1,0,1] #len = 21
I have four lists, and I want to "clean" list1 using the following rule: if any of list2[i], list3[i], or list4[i] is equal to zero, then eliminate item i from list1. So basically I only keep the elements of list1 at positions where the other lists all have ones.
here is the function I wrote to solve this
def clean(list1, list2, list3, list4):
    for i in range(len(list2)):
        if (list2[i] == 0 or list3[i] == 0 or list4[i] == 0):
            list1.pop(i)
    return list1
However, it doesn't work. If you apply it, it gives the error:
Traceback (most recent call last): line 68, in clean: list1.pop(i)
IndexError: pop index out of range
What am I doing wrong? Also, I was told pandas is really good at dealing with data. Is there a way I can do it with pandas? Each of these lists is actually a column (after removing the heading) of a CSV file.
EDIT
For example, at the end I would like to get: list1 = [4, 9, 12, 18]
I think the main problem is that at each iteration, when I pop out an element, the indices of all its successors change! The overall length of the list also shrinks, so the index passed to pop() ends up too large. Hopefully there is another strategy or function that I can use.
This is definitely a job for pandas:
import pandas as pd
df = pd.DataFrame({
    'l1': list1,
    'l2': list2,
    'l3': list3,
    'l4': list4,
})
no_zeroes = df.loc[(df['l2'] != 0) & (df['l3'] != 0) & (df['l4'] != 0)]
Here df.loc[...] takes the full DataFrame, then filters it by the criteria provided. In this example, your criteria are that you only keep the rows where l2, l3, and l4 are not zero (!= 0).
Gives you a pandas dataframe:
    l1  l2  l3  l4
4    4   1   1   1
9    9   1   1   1
12  12   1   1   1
18  18   1   1   1
Or if you need just list1:
list1 = no_zeroes['l1'].tolist()
If you want the criterion to be that all other columns are 1, then use:
all_ones = df.loc[(df['l2'] == 1) & (df['l3'] == 1) & (df['l4'] == 1)]
Note that I'm creating new DataFrames for no_zeroes and all_ones, and that the original DataFrame stays intact if you want to further manipulate the data.
Update:
Per Divakar's answer (far more elegant than my original answer), much the same can be done in pandas:
df = pd.DataFrame([list1, list2, list3, list4])
list1 = df.loc[0, (df[1:] != 0).all()].astype(int).tolist()
Here's one approach with NumPy -
import numpy as np
mask = (np.asarray(list2)==1) & (np.asarray(list3)==1) & (np.asarray(list4)==1)
out = np.asarray(list1)[mask].tolist()
Here's another way with NumPy that stacks those lists into rows to form a 2D array and thus simplifies things quite a bit -
arr = np.vstack((list1, list2, list3, list4))
out = arr[0,(arr[1:] == 1).all(0)].tolist()
Sample run -
In [165]: arr = np.vstack((list1, list2, list3, list4))
In [166]: print arr
[[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
[ 1 1 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0]
[ 0 0 1 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1]
[ 1 0 0 1 1 0 0 0 0 1 0 1 1 1 1 0 1 0 1 0 1]]
In [167]: arr[0,(arr[1:] == 1).all(0)].tolist()
Out[167]: [4, 9, 12, 18]
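For completeness, the same filter in plain Python with zip and a list comprehension, no third-party libraries needed:
list1 = [a for a, b, c, d in zip(list1, list2, list3, list4)
         if b == 1 and c == 1 and d == 1]
# [4, 9, 12, 18]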

field_delim="\t" doesn't work properly in tf.decode_csv(csv_row, record_defaults=listoflists,field_delim="\t") in tensorflow

I have a tab-separated CSV file.
I use the following code fragment:
data = tf.decode_csv(csv_row, record_defaults=listoflists, field_delim="\t")
but it raises the following error:
tensorflow.python.framework.errors.InvalidArgumentError: Expect 5 fields but have 1 in record 0
But when I make the file comma-separated or space-separated, it works correctly:
1. Comma separated:
data = tf.decode_csv(csv_row, record_defaults=listoflists)
2. Space separated:
data = tf.decode_csv(csv_row, record_defaults=listoflists, field_delim=" ")
The full code:
from __future__ import print_function
import tensorflow as tf

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

filename = "test.csv"

# set up text reader
file_length = file_len(filename)
filename_queue = tf.train.string_input_producer([filename])
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)

# set up CSV decoding
listoflists = []
for i in range(0, 5):
    listoflists.append(list([0]))
data = tf.decode_csv(csv_row, record_defaults=listoflists, field_delim="\t")

print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    # start populating the filename queue
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in range(file_length):
        # retrieve a single instance
        example = sess.run(data)
        print(example)
    coord.request_stop()
    coord.join(threads)
print("\ndone loading")
Sample Data
Tab separated:
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
Comma separated:
1,0,1,1,1
1,0,1,1,1
1,0,1,1,1
1,0,1,1,1
1,0,1,1,1
1,0,1,1,1
Space separated:
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
1 0 0 0 0
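One sanity check worth running before blaming decode_csv: confirm the file actually contains tab characters, since many editors silently convert tabs to runs of spaces, which would produce exactly the "Expect 5 fields but have 1" error. A minimal sketch:
with open("test.csv") as f:
    first = f.readline()
print(repr(first))   # fields should be joined by '\t', not spaces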

How to count the values of columns row wise based on conditions in Pandas

I have the pandas DataFrame df below:
df = pd.DataFrame({'id': [1,2,3], 'v': ['r','r','i'], 'w': ['r','r','i'], 'x': ['r','i','i']})
df
   id  v  w  x
0   1  r  r  r
1   2  r  r  i
2   3  i  i  i
The values in the columns are r and i. I want to count the occurrences of r and i row-wise and generate two more column headers, r and i, with the counts of r and i as values for each row. The final result I am expecting is given below:
   id  v  w  x  r  i
0   1  r  r  r  3  0
1   2  r  r  i  2  1
2   3  i  i  i  0  3
Method 1
In [15]:
import numpy as np

def count(df):
    df['i'] = np.sum(df == 'i')
    df['r'] = np.sum(df == 'r')
    return df
In [16]:
df.apply(count, axis=1)
Out[16]:
   id  v  w  x  i  r
0   1  r  r  r  0  3
1   2  r  r  i  1  2
2   3  i  i  i  3  0
Method 2
In [9]:
count = df.apply(lambda x: x.value_counts(), axis=1)[['i', 'r']]
count
Out[9]:
     i    r
0  NaN    3
1    1    2
2    3  NaN
In [10]:
pd.concat([df, count.fillna(0)], axis=1)
Out[10]:
   id  v  w  x  i  r
0   1  r  r  r  0  3
1   2  r  r  i  1  2
2   3  i  i  i  3  0
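A vectorized alternative sketch that avoids the per-row apply entirely: compare the letter columns directly and sum the booleans across each row (the column list is assumed from the example):
cols = ['v', 'w', 'x']
df['r'] = (df[cols] == 'r').sum(axis=1)
df['i'] = (df[cols] == 'i').sum(axis=1)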