calculates the average of identical columns of several dataframes - list

I am trying to write a function that calculates the average of identical columns of different dataframes stored in a list:
def mean(dfs):
# declare an empty dataframe
df_mean = pd.DataFrame()
# assign the first column from each raw data framework to df
for i in range(len(dfs)):
dfs[i].set_index(['Time'], inplace=True)
for j in dfs[0].columns:
for i in range(len(dfs)):
df_mean[j] = pd.concat([df_mean,dfs[i][j]], axis=1).mean(axis=1)
return df_mean
dfs = []
l1 = [[1,6,2,6,7],[2,3,2,6,8],[3,3,2,8,8],[4,5,2,6,8],[5,3,9,6,8]]
l2 = [[1,7,2,5,7],[2,3,0,6,8],[3,3,3,6,8],[4,3,7,6,8],[5,3,0,6,8]]
dfs.append(pd.DataFrame(l1, columns=['Time','25','50','75','100']))
dfs.append(pd.DataFrame(l2, columns=['Time','25','50','75','100']))
mean(dfs)
However, I got out only the mean of the first column right!

Option 1
Use python's sum, which well default to reducing the list based on the individual object's __add__ method. Then just divide by the length of the list.
sum(dfs) / len(dfs)
Time 25 50 75 100
0 1.0 6.5 2.0 5.5 7.0
1 2.0 3.0 1.0 6.0 8.0
2 3.0 3.0 2.5 7.0 8.0
3 4.0 4.0 4.5 6.0 8.0
4 5.0 3.0 4.5 6.0 8.0
Option 2
Reconstruct while using numpy's mean function
pd.DataFrame(
np.mean([d.values for d in dfs], 0),
dfs[0].index, dfs[0].columns)
Time 25 50 75 100
0 1.0 6.5 2.0 5.5 7.0
1 2.0 3.0 1.0 6.0 8.0
2 3.0 3.0 2.5 7.0 8.0
3 4.0 4.0 4.5 6.0 8.0
4 5.0 3.0 4.5 6.0 8.0

Use concat on Time indexed list of dataframes, and groupby larger dataframe on Time and take mean
In [275]: pd.concat([d.set_index('Time') for d in dfs]).groupby(level='Time').mean()
Out[275]:
25 50 75 100
Time
1 6.5 2.0 5.5 7.0
2 3.0 1.0 6.0 8.0
3 3.0 2.5 7.0 8.0
4 4.0 4.5 6.0 8.0
5 3.0 4.5 6.0 8.0
Or, since Time column is anyway common for both, atleast in this usecase
In [289]: pd.concat(dfs).groupby(level=0).mean()
Out[289]:
Time 25 50 75 100
0 1.0 6.5 2.0 5.5 7.0
1 2.0 3.0 1.0 6.0 8.0
2 3.0 3.0 2.5 7.0 8.0
3 4.0 4.0 4.5 6.0 8.0
4 5.0 3.0 4.5 6.0 8.0
Details
In [276]: dfs
Out[276]:
[ Time 25 50 75 100
0 1 6 2 6 7
1 2 3 2 6 8
2 3 3 2 8 8
3 4 5 2 6 8
4 5 3 9 6 8, Time 25 50 75 100
0 1 7 2 5 7
1 2 3 0 6 8
2 3 3 3 6 8
3 4 3 7 6 8
4 5 3 0 6 8]
In [277]: pd.concat([d.set_index('Time') for d in dfs])
Out[277]:
25 50 75 100
Time
1 6 2 6 7
2 3 2 6 8
3 3 2 8 8
4 5 2 6 8
5 3 9 6 8
1 7 2 5 7
2 3 0 6 8
3 3 3 6 8
4 3 7 6 8
5 3 0 6 8

Related

Pandas column sort order

I am using rolling().agg and adding columns to a dataframe.
def add_mean_std_cols(df):
res = df.rolling(5).agg(['mean','std'])
res.columns = res.columns.map('_'.join)
final = res.join(df).sort_index(axis=1)
return final
np.random.seed(20)
df = pd.DataFrame(np.random.randint(0,9,size=(10, 6)), columns=list('ABCDEF'))
print
print df
print
df.columns = ['A', 'A/B','AB', 'AC', 'C/B', 'D']
print add_mean_std_cols(df)
The issue is the output column name order:
A A/B A/B_mean A/B_std AB AB_mean AB_std AC AC_mean AC_std A_mean A_std C/B C/B_mean C/B_std D D_mean D_std
0 3 4 NaN NaN 6 NaN NaN 7 NaN NaN NaN NaN 2 NaN NaN 0 NaN NaN
1 6 8 NaN NaN 5 NaN NaN 3 NaN NaN NaN NaN 0 NaN NaN 6 NaN NaN
2 6 0 NaN NaN 5 NaN NaN 7 NaN NaN NaN NaN 5 NaN NaN 2 NaN NaN
3 6 3 NaN NaN 3 NaN NaN 0 NaN NaN NaN NaN 6 NaN NaN 2 NaN NaN
4 3 1 3.2 3.114482 8 5.4 1.816590 0 3.4 3.507136 4.8 1.643168 2 3.0 2.449490 7 3.4 2.966479
5 6 6 3.6 3.361547 8 5.8 2.167948 2 2.4 2.880972 5.4 1.341641 1 2.8 2.588436 3 4.0 2.345208
6 2 6 3.2 2.774887 4 5.6 2.302173 6 3.0 3.316625 4.6 1.949359 4 3.6 2.073644 8 4.4 2.880972
7 6 2 3.6 2.302173 3 5.2 2.588436 1 1.8 2.489980 4.6 1.949359 5 3.6 2.073644 2 4.4 2.880972
8 1 8 4.6 2.966479 2 5.0 2.828427 4 2.6 2.408319 3.6 2.302173 4 3.2 1.643168 8 5.6 2.880972
9 6 0 4.4 3.286335 3 4.0 2.345208 4 3.4 1.949359 4.2 2.489980 0 2.8 2.167948 5 5.2 2.774887
For some reason it is sorting A/B and AB before A_mean A_std.
The order that I would prefer is:
A A_mean A_std ...
From playing it seems that '_' is sorted last.
Any suggestions on how to achieve the desired order?
Thanks!
In [60]: res = df.rolling(5).agg(['mean','std'])
In [61]: res.columns = res.columns.map('_'.join)
In [62]: cols = np.concatenate(list(zip(df.columns, res.columns[0::2], res.columns[1::2])))
In [63]: res.join(df).loc[:, cols]
Out[63]:
A A_mean A_std A/B A/B_mean A/B_std AB AB_mean AB_std AC AC_mean AC_std C/B C/B_mean C/B_std D D_mean \
0 3 NaN NaN 4 NaN NaN 6 NaN NaN 7 NaN NaN 2 NaN NaN 0 NaN
1 6 NaN NaN 8 NaN NaN 5 NaN NaN 3 NaN NaN 0 NaN NaN 6 NaN
2 6 NaN NaN 0 NaN NaN 5 NaN NaN 7 NaN NaN 5 NaN NaN 2 NaN
3 6 NaN NaN 3 NaN NaN 3 NaN NaN 0 NaN NaN 6 NaN NaN 2 NaN
4 3 4.8 1.643168 1 3.2 3.114482 8 5.4 1.816590 0 3.4 3.507136 2 3.0 2.449490 7 3.4
5 6 5.4 1.341641 6 3.6 3.361547 8 5.8 2.167948 2 2.4 2.880972 1 2.8 2.588436 3 4.0
6 2 4.6 1.949359 6 3.2 2.774887 4 5.6 2.302173 6 3.0 3.316625 4 3.6 2.073644 8 4.4
7 6 4.6 1.949359 2 3.6 2.302173 3 5.2 2.588436 1 1.8 2.489980 5 3.6 2.073644 2 4.4
8 1 3.6 2.302173 8 4.6 2.966479 2 5.0 2.828427 4 2.6 2.408319 4 3.2 1.643168 8 5.6
9 6 4.2 2.489980 0 4.4 3.286335 3 4.0 2.345208 4 3.4 1.949359 0 2.8 2.167948 5 5.2
D_std
0 NaN
1 NaN
2 NaN
3 NaN
4 2.966479
5 2.345208
6 2.880972
7 2.880972
8 2.880972
9 2.774887
You can join by MultiIndex and then sort_index:
def add_mean_std_cols(df):
res = df.rolling(5).agg(['mean','std'])
df.columns = [df.columns, [''] * len(df.columns)]
final = res.join(df).sort_index(axis=1)
final.columns = final.columns.map('_'.join).str.strip('_')
return final
print (add_mean_std_cols(df))
A A_mean A_std A/B A/B_mean A/B_std AB AB_mean AB_std AC \
0 3 NaN NaN 4 NaN NaN 6 NaN NaN 7
1 6 NaN NaN 8 NaN NaN 5 NaN NaN 3
2 6 NaN NaN 0 NaN NaN 5 NaN NaN 7
3 6 NaN NaN 3 NaN NaN 3 NaN NaN 0
4 3 4.8 1.643168 1 3.2 3.114482 8 5.4 1.816590 0
5 6 5.4 1.341641 6 3.6 3.361547 8 5.8 2.167948 2
6 2 4.6 1.949359 6 3.2 2.774887 4 5.6 2.302173 6
7 6 4.6 1.949359 2 3.6 2.302173 3 5.2 2.588436 1
8 1 3.6 2.302173 8 4.6 2.966479 2 5.0 2.828427 4
9 6 4.2 2.489980 0 4.4 3.286335 3 4.0 2.345208 4
AC_mean AC_std C/B C/B_mean C/B_std D D_mean D_std
0 NaN NaN 2 NaN NaN 0 NaN NaN
1 NaN NaN 0 NaN NaN 6 NaN NaN
2 NaN NaN 5 NaN NaN 2 NaN NaN
3 NaN NaN 6 NaN NaN 2 NaN NaN
4 3.4 3.507136 2 3.0 2.449490 7 3.4 2.966479
5 2.4 2.880972 1 2.8 2.588436 3 4.0 2.345208
6 3.0 3.316625 4 3.6 2.073644 8 4.4 2.880972
7 1.8 2.489980 5 3.6 2.073644 2 4.4 2.880972
8 2.6 2.408319 4 3.2 1.643168 8 5.6 2.880972
9 3.4 1.949359 0 2.8 2.167948 5 5.2 2.774887

pandas using melt to create lookup table

I have a dataframe df of size 24x13 which appears as (I have displayed truncated version of 24x13 array which represents 12 months and 24 hours):
HE 1 2 3 4
0 1 1.8 2.5 3.5 8.5
1 2 2.6 2.9 4.3 8.7
2 3 4.4 2.3 5.3 4.3
3 4 2.6 2.1 4.2 5.3
How do I change this to look up table for each combination of hour and month and display the value in third column as follows:
Hour Month Value
1 1 1.8
1 2 2.5
1 3 3.5
I am trying the following and variation of it but this is not working:
pd.melt(df, id_vars=range(1,24), value_vars=range(1,12))
Edit 1:
df.columns
Index([u'HE', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='object')
df.shape
(24, 13)
df.set_index('HE').stack().reset_index()
Output:
HE level_1 0
0 1 1 1.8
1 1 2 2.5
2 1 3 3.5
3 1 4 8.5
4 2 1 2.6
OR using melt
df.melt(id_vars='HE').sort_values(by=['HE','variable']
Output:
HE variable value
0 1 1 1.8
4 1 2 2.5
8 1 3 3.5
12 1 4 8.5
1 2 1 2.6

Broadcasting pandas dataframe to two dimensional matrix

I have a dataframe of size 12x24 which appears as (I have displayed truncated version):
HE 1 2 3 4 5
0 1 1.8 2.5 3.5 8.5
1 2 2.6 2.9 4.3 8.7
2 3 4.4 2.3 5.3 4.3
3 4 2.6 2.1 4.2 5.3
The column names are number 1 through 12 (representing months) and rows are numbered 1 through 24 (representing hours).
I have another DateTable dataframe which has data as follows:
Date Month Hour
2001-01-01 1 1
2001-02-01 2 4
2001-01-05 1 3
2011-01-31 3 2
2012-01-01 1 5
I want to broadcast the values from 12x24 array into the DateTable to get the following:
Date Month Hour Values
2001-01-01 1 1 3.5
2001-02-01 2 4 5.3
2001-01-05 1 3 2.5
2011-01-31 3 2 2.6
2012-01-01 1 5 1.8
I envision creating some kind of multiindex from the 12x24 table and using against the DateTable but not quite sure about the syntax as I am struggling with syntax.

How to resample dates with Pandas item by item?

My objective is to add rows in pandas in order to replace missing data with previous data and resample dates at the same time. Example :
This is what I have :
date wins losses
2015-12-19 11 5
2015-12-20 17 8
2015-12-20 10 6
2015-12-21 15 1
2015-12-25 11 5
2015-12-26 6 10
2015-12-27 10 6
2015-12-28 4 12
2015-12-29 8 11
And this is what I want :
wins losses
date
2015-12-19 11.0 5.0
2015-12-20 10.0 6.0
2015-12-21 15.0 1.0
2015-12-22 15.0 1.0
2015-12-23 15.0 1.0
2015-12-24 15.0 1.0
2015-12-25 11.0 5.0
2015-12-26 6.0 10.0
2015-12-27 10.0 6.0
2015-12-28 4.0 12.0
2015-12-29 8.0 11.0
And this is my code :
resamp = df.set_index('date').resample('D', how='last', fill_method='ffill')
It works !
But I want to do the same thing with 22 million lines (pandas), with different dates, and different IDs..
This dataframe contains two productID (1 and 2). I want to do the same previous exercice and keep the time serie data of every productID..
createdAt productId popularity
2015-12-01 1 5
2015-12-02 1 8
2015-12-04 1 6
2015-12-07 1 9
2015-12-01 2 5
2015-12-03 2 10
2015-12-04 2 6
2015-12-07 2 12
2015-12-09 2 11
This is my code :
df['date'] = pd.to_datetime(df['createdAt'])
df.set_index('date').resample('D', how='last', fill_method='ffill')
This is what I have if I use the same code ! I don't want a groupby with my dates.
createdAt productId popularity
date
2015-12-01 2015-12-01 2 5
2015-12-02 2015-12-02 2 5
2015-12-03 2015-12-03 2 10
2015-12-04 2015-12-04 2 6
2015-12-05 2015-12-05 2 6
2015-12-06 2015-12-06 2 6
2015-12-07 2015-12-07 2 12
2015-12-08 2015-12-08 2 12
2015-12-09 2015-12-09 2 11
This is what I want !
createdAt productId popularity
2015-12-01 1 5
2015-12-02 1 8
2015-12-03 1 8
2015-12-04 1 6
2015-12-05 1 6
2015-12-06 1 6
2015-12-07 1 9
2015-12-01 2 5
2015-12-02 2 5
2015-12-03 2 10
2015-12-04 2 6
2015-12-05 2 6
2015-12-06 2 6
2015-12-07 2 12
2015-12-08 2 12
2015-12-09 2 11
What to do ?
Thank you
Try this, it should works :)
print df.set_index('date').groupby('productId', group_keys=False).apply(lambda
df: df.resample('D').ffill()).reset_index()
This produces what you said you wanted.
print df.groupby('productId', group_keys=False).apply(lambda df: df.resample('D').ffill()).reset_index()
createdAt productId popularity
0 2015-12-01 1 5
1 2015-12-02 1 8
2 2015-12-03 1 8
3 2015-12-04 1 6
4 2015-12-05 1 6
5 2015-12-06 1 6
6 2015-12-07 1 9
7 2015-12-01 2 5
8 2015-12-02 2 5
9 2015-12-03 2 10
10 2015-12-04 2 6
11 2015-12-05 2 6
12 2015-12-06 2 6
13 2015-12-07 2 12
14 2015-12-08 2 12
15 2015-12-09 2 11

Merging DataFrames within a loop [duplicate]

This question already has answers here:
Pandas Merging 101
(8 answers)
Closed 4 years ago.
I have a folder with numerous csv files which look like this:
csv1
2006 Percent Land_Use
0 13 5.379564 Developed
1 8 25.781580 Grass/Pasture
2 4 54.265050 Crop
3 15 0.363983 Water
4 16 6.244104 Wetlands
5 6 4.691764 Forest
6 1 3.031494 Alfalfa
7 11 0.137424 Shrubland
8 5 0.003671 Vetch
9 3 0.055412 Barren
10 7 0.009531 Grass
11 12 0.036423 Tree
csv2
2007 Percent Land_Use
0 13 2.742430 Developed
1 4 56.007242 Crop
2 8 24.227963 Grass/Pasture
3 16 8.839979 Wetlands
4 6 6.181062 Forest
5 1 1.446668 Alfalfa
6 15 0.366116 Water
7 3 0.127760 Barren
8 11 0.034426 Shrubland
9 7 0.000827 Grass
10 12 0.025528 Tree
csv3
2008 Percent Land_Use
0 13 1.863809 Developed
1 8 31.455578 Grass/Pasture
2 4 57.896856 Crop
3 16 2.693929 Wetlands
4 6 4.417966 Forest
5 1 1.239176 Alfalfa
6 7 0.130849 Grass
7 15 0.266536 Water
8 11 0.004571 Shrubland
9 3 0.030731 Barren
and I want to merge them all together into one DataFrame on Land_Use
I am reading in the files like this:
pth = (r'G:\')
for f in os.listdir(pth):
df=pd.read_csv(os.path.join(pth,f)
but I can't figure out how to merge all the individual dataframes after that. I figured out how to concat them but that isn't what I want. The type of merge I want is outer.
If I were to use a pathway to each csv file I would merge them like this, but I do NOT want to set a pathway to each file as there are many of them:
one=pd.read_csv(r'G:\one.csv')
two=pd.read_csv(r'G:\two.csv')
three=pd.read_csv(r'G:\three.csv')
merge=pd.merge(one,two, on=['Land_Use'], how='outer')
mergetwo=pd.merge(merge,three,on=['Land_Use'], how='outer')
I think you can use in python 3:
import functools
dfs = [df1,df2,df3]
df = functools.reduce(lambda left,right: pd.merge(left,right,on='Land_Use',how='outer'),dfs)
print (df)
2006 Percent_x Land_Use 2007 Percent_y 2008 Percent
0 13 5.379564 Developed 13.0 2.742430 13.0 1.863809
1 8 25.781580 Grass/Pasture 8.0 24.227963 8.0 31.455578
2 4 54.265050 Crop 4.0 56.007242 4.0 57.896856
3 15 0.363983 Water 15.0 0.366116 15.0 0.266536
4 16 6.244104 Wetlands 16.0 8.839979 16.0 2.693929
5 6 4.691764 Forest 6.0 6.181062 6.0 4.417966
6 1 3.031494 Alfalfa 1.0 1.446668 1.0 1.239176
7 11 0.137424 Shrubland 11.0 0.034426 11.0 0.004571
8 5 0.003671 Vetch NaN NaN NaN NaN
9 3 0.055412 Barren 3.0 0.127760 3.0 0.030731
10 7 0.009531 Grass 7.0 0.000827 7.0 0.130849
11 12 0.036423 Tree 12.0 0.025528 NaN NaN
In python 2:
df = reduce(lambda left,right: pd.merge(left,right,on='Land_Use',how='outer'),dfs)
Working solution with glob:
import pandas as pd
import functools
import glob
pth = 'a/*.csv'
files = glob.glob(pth)
dfs = [pd.read_csv(f, sep=';') for f in files]
df = functools.reduce(lambda left,right: pd.merge(left,right,on='Land_Use', how='outer'),dfs)
print (df)
2006 Percent_x Land_Use 2008 Percent_y 2007 Percent
0 13 5.379564 Developed 13.0 1.863809 13.0 2.742430
1 8 25.781580 Grass/Pasture 8.0 31.455578 8.0 24.227963
2 4 54.265050 Crop 4.0 57.896856 4.0 56.007242
3 15 0.363983 Water 15.0 0.266536 15.0 0.366116
4 16 6.244104 Wetlands 16.0 2.693929 16.0 8.839979
5 6 4.691764 Forest 6.0 4.417966 6.0 6.181062
6 1 3.031494 Alfalfa 1.0 1.239176 1.0 1.446668
7 11 0.137424 Shrubland 11.0 0.004571 11.0 0.034426
8 5 0.003671 Vetch NaN NaN NaN NaN
9 3 0.055412 Barren 3.0 0.030731 3.0 0.127760
10 7 0.009531 Grass 7.0 0.130849 7.0 0.000827
11 12 0.036423 Tree NaN NaN 12.0 0.025528
I am not allowed to comment, so I am unsure of what you exactly want.
You can try using one.merge(two, on=['Land_Use'], how='outer').merge(three,on=['Land_Use'], how='outer'). Let me know if you wanted something else.
If you have many dataframes, you can try using the reduce function. First create a list containing all the dataframes dataframes = [one, two, three, four, ... , twenty] You can add them into the list by using list comprehensions or by appending them into the list in your loop.
Then if you want to combine them based on Land_Use, you can use df_final = reduce(lambda left,right: pd.merge(left,right,on=['Land_Use'], how='outer'), dataframes)
Note: The reduce function is in the functools package in python 3+