Concatenate two data frames to a data frame of square matrix - python-2.7

I have two pandas DataFrames whose shapes are "n x n" and "m x n" (m < n). For example:
df1 = pd.DataFrame([[0,1,0,1],[1,0,0,1],[0,0,0,1],[1,1,1,0]])
df2 = pd.DataFrame([[1,1,1,0],[1,1,0,1]])
I'd like to get the dataframe of a square matrix by concatenating above dataframes:
df3 = foo(df1, df2)
print df3.values
This should print like the following matrix.
[[0,1,0,1,1,1],
[1,0,0,1,1,1],
[0,0,0,1,1,0],
[1,1,1,0,0,1],
[1,1,1,0,0,0],
[1,1,0,1,0,0]]
The concatenation logic is as follows:
the upper-left part of the square matrix comes from df1
the upper-right part of it comes from the transpose of df2
the bottom-left part of it comes from df2
all elements of the rest of it (the bottom-right part) are zero.
How do I implement the above logic (foo method)?

Here is a sample of foo:
def foo(_df1, _df2):
    """Assemble the square block matrix ``[[df1, df2.T], [df2, 0]]``.

    ``_df1`` supplies the upper-left block and ``_df2`` the bottom-left block;
    the transpose of ``_df2`` becomes the upper-right block and every
    remaining (bottom-right) cell is zero.  Blocks are combined by position,
    so the inputs' own index/column labels are discarded and the result is
    labelled 0..k-1 on both axes.  Neither input frame is modified.

    As in the original recipe, any NaN already present in the inputs is also
    replaced by 0 by the final fill.
    """
    # join/concat align on labels, so give every axis positional labels first.
    df1 = _df1.reset_index(drop=True)
    df2 = _df2.reset_index(drop=True)
    df2.columns = range(df2.shape[1])
    df2_transpose = df2.transpose().reset_index(drop=True)

    # Upper half: df1 with the transposed df2 appended as extra columns.
    df_upper = df1.join(df2_transpose, rsuffix="_")
    df_upper.columns = range(df_upper.shape[1])

    # Lower half: df2 has no values for the extra columns, so concat leaves
    # NaN there; those NaNs are the bottom-right block and become zeros.
    df = pd.concat([df_upper, df2], ignore_index=True).fillna(0)

    # The NaN detour upcasts the extra columns to float; cast them back to
    # the dtype they had in the upper half (lossless: they only gained zeros).
    extra_columns = df_upper.columns[df2.shape[1]:]
    return df.astype(df_upper.dtypes.loc[extra_columns].to_dict())

The foo function:
def foo(df1_data, df2_data):
    """Return the square block matrix ``[[df1, df2.T], [df2, 0]]`` as a DataFrame.

    ``df1_data`` is the upper-left block and ``df2_data`` the bottom-left
    block.  The blocks are combined purely by position using the underlying
    arrays, so the frames' labels play no role and the result has default
    0..k-1 labels on both axes.

    The zero block is created with the dtype of ``df2_data``'s values, so
    integer inputs give an integer result and non-integer inputs keep their
    values instead of being truncated by a forced ``int`` cast.
    """
    upper_left = df1_data.values
    lower_left = df2_data.values
    extra = lower_left.shape[0]  # number of rows/columns added to df1
    lower_right = np.zeros((extra, extra), dtype=lower_left.dtype)
    square = np.block([[upper_left, lower_left.T],
                       [lower_left, lower_right]])
    return pd.DataFrame(square)
df3 = foo(df1,df2)

Related

Looping over a list of pandas data frames and create a new data frame

I have a list of data frames:
data_frames = [sort,sort1,sort2]
I'd like to iterate over them and store some stats in a new df. I feel like this is something trivial but the function below returns an empty data frame df_concat = df_stats(data_frames). What am I missing? Will appreciate your help.
Create an example data set:
import pandas as pd
data = {'number': [23,56,89], 'PVs': [23456, 34456, 6789]}
sort = pd.DataFrame.from_dict(data)
data1 = {'number': [28,52,12], 'PVs': [3423456, 2334456, 36789]}
sort1 = pd.DataFrame.from_dict(data1)
data2 = {'number': [123,5,86], 'PVs': [2345655, 934456, 16789]}
sort2 = pd.DataFrame.from_dict(data2)
The function to iterate over data frames:
# Attempts to summarise every frame in data_frames (row count of `number`,
# plus formatted sum / mean / median of `PVs`) into one DataFrame.
# NOTE(review): this is the version reported above as returning an empty
# DataFrame.  `df` starts with no rows and every statement assigns a scalar
# to a whole column, which presumably creates the column without adding a
# row; each loop pass also targets the same four columns rather than a new
# row, so nothing is accumulated per input frame.
def df_stats(data_frames):
df = pd.DataFrame()
for data in data_frames:
df['Number'] = data.number.count()
df["Total PVs"] = '{0:,.0f}'.format(data.PVs.sum())
df["Average"] = '{0:,.0f}'.format(data.PVs.mean())
df["Median"] = '{0:,.0f}'.format(data.PVs.median())
return df
We can use pd.concat + groupby rather than a for loop:
pd.concat(data_frames,keys=[1,2,3]).groupby(level=0).agg({'number':'count','PVs':['sum','mean','median']})
Out[1117]:
number PVs
count sum mean median
1 3 64701 2.156700e+04 23456
2 3 5794701 1.931567e+06 2334456
3 3 3296900 1.098967e+06 934456
Also, if you want to keep using your function, you can fix it like this:
df = pd.DataFrame()
for i,data in enumerate(data_frames):
df.at[i,'Number'] = data.number.count()
df.at[i,"Total PVs"] = '{0:,.0f}'.format(data.PVs.sum())
df.at[i,"Average"] = '{0:,.0f}'.format(data.PVs.mean())
df.at[i,"Median"] = '{0:,.0f}'.format(data.PVs.median())
df
Out[1121]:
Number Total PVs Average Median
0 3.0 64,701 21,567 23,456
1 3.0 5,794,701 1,931,567 2,334,456
2 3.0 3,296,900 1,098,967 934,456
Try this:
''' Example DataFrames '''
data1 = pd.DataFrame({'number': [23,56,89], 'PVs': [23456, 34456, 6789]},
columns=['number', 'PVs'])
data2 = pd.DataFrame({'number': [28,52,12], 'PVs': [3423456, 2334456, 36789]}, columns=['number', 'PVs'])
data3 = pd.DataFrame({'number': [123,5,86], 'PVs': [2345655, 934456, 16789]},
columns=['number', 'PVs'])
''' The function returning the stats '''
def df_stats(dataFrame):
    """Return a one-row DataFrame of summary statistics for ``dataFrame``.

    Columns of the result:
      * ``Number``    - count of non-null values in the ``number`` column
      * ``Total PVs`` - sum of ``PVs``, formatted with thousands separators
      * ``Average``   - mean of ``PVs``, formatted the same way
      * ``Median``    - median of ``PVs``, formatted the same way

    The three ``PVs`` statistics are strings, not numbers.  The row is built
    in one step and carries the default index label 0; callers are expected
    to concatenate the per-frame results with ``ignore_index=True``.
    """
    pvs = dataFrame['PVs']
    summary_row = [
        dataFrame['number'].count(),
        '{0:,.0f}'.format(pvs.sum()),
        '{0:,.0f}'.format(pvs.mean()),
        '{0:,.0f}'.format(pvs.median()),
    ]
    return pd.DataFrame([summary_row],
                        columns=['Number', 'Total PVs', 'Average', 'Median'])
''' Create a list of DataFrames to iterate over '''
data_frames = [data1, data2, data3]
''' Create an emmpty DataFrame so you can include it in pd.concat() '''
result = pd.DataFrame()
''' Iterate over DataFrame list and concatenate'''
for dataFrame in data_frames:
tempDF = df_stats(dataFrame)
result = pd.concat([result,tempDF], ignore_index=True)
result.head(3)
The output is:
Number Total PVs Average Median
0 3 64,701 21,567 23,456
1 3 5,794,701 1,931,567 2,334,456
2 3 3,296,900 1,098,967 934,456
The function below works:
dict_df ={'df1':sort1,'df':sort,'df2':sort2}
def df_stats(dict_df):
    """Summarise each DataFrame in ``dict_df`` as one row of a new DataFrame.

    ``dict_df`` maps a name to a DataFrame that has ``number`` and ``PVs``
    columns.  The result is indexed by those names (in the dict's iteration
    order) and has the columns ``Number`` (non-null count of ``number``) and
    ``Total PVs`` / ``Average`` / ``Median`` (sum, mean and median of ``PVs``
    rendered as strings with thousands separators).

    An empty mapping yields an empty frame that still has the four columns.
    """
    columns = ['Number', 'Total PVs', 'Average', 'Median']
    names = list(dict_df)
    # Collect complete rows first and build the frame once, instead of
    # writing individual cells into a pre-allocated frame.
    rows = []
    for name in names:
        data = dict_df[name]
        rows.append([
            data.number.count(),
            '{0:,.0f}'.format(data.PVs.sum()),
            '{0:,.0f}'.format(data.PVs.mean()),
            '{0:,.0f}'.format(data.PVs.median()),
        ])
    return pd.DataFrame(rows, index=names, columns=columns)
Output:
Number Total PVs Average Median
df2 3 3,296,900 1,098,967 934,456
df1 3 5,794,701 1,931,567 2,334,456
df 3 64,701 21,567 23,456

Average of median of a column in a list of dataframes

I am looking for the best way to take the average of median of a column in a list of data frames (same column name).
Let's say I have a list of dataframes, list_df. I can write the following for loop to get the required output, but I am more interested in whether the for loop can be eliminated.
med_arr = []
list_df = [df1, df2, df3]
for df in list_df:
med_arr.append(np.median(df['col_name']))
np.mean(med_arr)
Consider the sample data
np.random.seed([3,1415])
df1 = pd.DataFrame(dict(col_name=np.random.randint(10, size=10)))
df2 = pd.DataFrame(dict(col_name=np.random.randint(10, size=10)))
df3 = pd.DataFrame(dict(col_name=np.random.randint(10, size=10)))
list_df = [df1, df2, df3]
Option 1
pandas
pd.concat([d['col_name'] for d in list_df], axis=1).median().mean()
3.8333333333333335
Option 2
numpy
np.median([d['col_name'].values for d in list_df], 1).mean()
3.8333333333333335
This could be done as a list comprehension:
list_df = [ df1, df2, df3 ]
med_arr = [ np.median( df['col_name'] ) for df in list_df ]
np.mean(med_arr)

Pandas append list to list of column names

I'm looking for a way to append a list of column names to existing column names in a DataFrame in pandas and then reorder them by col_start + col_add.
The DataFrame already contains the columns from col_start.
Something like:
import pandas as pd
df = pd.read_csv(file.csv)
col_start = ["col_a", "col_b", "col_c"]
col_add = ["Col_d", "Col_e", "Col_f"]
df = pd.concat([df,pd.DataFrame(columns = list(col_add))]) #Add columns
df = df[[col_start.extend(col_add)]] #Rearrange columns
Also, is there a way to capitalize the first letter for each item in col_start, analogous to title() or capitalize()?
Your code is nearly there, a couple things:
df = pd.concat([df,pd.DataFrame(columns = list(col_add))])
can be simplified to just this as col_add is already a list:
df = pd.concat([df,pd.DataFrame(columns = col_add)])
Also you can also just add 2 lists together so:
df = df[[col_start.extend(col_add)]]
becomes
df = df[col_start+col_add]
And to capitalise the first letter in your list just do:
In [184]:
col_start = ["col_a", "col_b", "col_c"]
col_start = [x.title() for x in col_start]
col_start
Out[184]:
['Col_A', 'Col_B', 'Col_C']
EDIT
To avoid the KeyError on the capitalised column names, you need to capitalise after calling concat, the columns have a vectorised str title method:
In [187]:
df = pd.DataFrame(columns = col_start + col_add)
df
Out[187]:
Empty DataFrame
Columns: [col_a, col_b, col_c, Col_d, Col_e, Col_f]
Index: []
In [188]:
df.columns = df.columns.str.title()
df.columns
Out[188]:
Index(['Col_A', 'Col_B', 'Col_C', 'Col_D', 'Col_E', 'Col_F'], dtype='object')
Here is what you want to do:
import pandas as pd
#Here you have a first dataframe
d1 = pd.DataFrame([[1,2,3],[4,5,6]], columns=['col1','col2','col3'])
#a second one
d2 = pd.DataFrame([[8,7,3,8],[4,8,6,8]], columns=['col4','col5','col6', 'col7'])
#Here we can make a dataframe with d1 and d2
d = pd.concat((d1,d2), axis=1)
#We want a different order from the columns ?
d = d[col_start + col_add]
If you want to capitalize values from a column 'col', you can do
d['col'] = d['col'].str.capitalize()
PS: Update Pandas if ".str.capitalize()" doesn't work.
Or, what you can do :
df['col'] = df['col'].map(lambda x:x.capitalize())

Printing Results from Loops

I currently have a piece of code that works in two segments. The first segment opens the existing text file from a specific path on my local drive and then arranges, based on certain indices, into a list of sub list. In the second segment I take the sub-lists I have created and group them on a similar index to simplify them (starts at def merge_subs). I am getting no error code but I am not receiving a result when I try to print the variable answer. Am I not correctly looping the original list of sub-lists? Ultimately I would like to have a variable that contains the final product from these loops so that I may write the contents of it to a new text file. Here is the code I am working with:
from itertools import groupby, chain
from operator import itemgetter
with open("somepathname") as g:
    # Read every line while the file is open; parsing happens afterwards.
    lines = g.readlines()

# Turn each line into a [JD, minTime, maxTime] sub-list, taken from the
# whitespace-separated fields 2, 5 and 7, and keep ALL of them.  Rebinding
# the list on every pass would leave only the last line's values.
newLists = []
for line in lines:
    matrix = line.split()
    JD = matrix[2]
    minTime = matrix[5]
    maxTime = matrix[7]
    newLists.append([JD, minTime, maxTime])
L = newLists
def merge_subs(L):
    """Merge sub-lists that share the same first element.

    ``L`` is an iterable of sequences whose first item is a date key.  For
    every distinct key the remaining items of all matching sub-lists are
    concatenated in input order.  Returns a list of ``[date, item, ...]``
    lists sorted by date; an empty input gives an empty list.
    """
    dates = {}
    for sub in L:
        dates.setdefault(sub[0], []).extend(sub[1:])
    # Return the merged rows so the caller can print or write them out.
    return [[date] + dates[date] for date in sorted(dates)]
new code
# Ask for a file name via askopenfilename and store the opened file object on
# the instance as self.lines.
# NOTE(review): askopenfilename and root are defined outside this snippet
# (presumably tkinter's file dialog and the root window) - confirm.
# The file is not closed here; simplify() later calls self.lines.readlines().
def openfile(self):
filename = askopenfilename(parent=root)
self.lines = open(filename)
def simplify(self):
    """Parse the opened file into rows and merge the rows that share a date.

    Reads every line from ``self.lines``, keeps the whitespace-separated
    fields 2, 5 and 7 of each line as ``[JD, minTime, maxTime]`` and stores
    the list of all such rows in ``self.newLists``.  Rows with the same JD
    are then merged into ``[JD, min, max, min, max, ...]``.

    Prints the parsed rows and each merged row, and returns the merged rows
    sorted by JD.
    """
    rows = []
    for line in self.lines.readlines():
        matrix = line.split()
        rows.append([matrix[2], matrix[5], matrix[7]])
    # Keep every row; rebinding per line would leave only the last one and
    # make the grouping below iterate over the characters of strings.
    self.newLists = rows
    print(self.newLists)

    dates = {}
    for sub in self.newLists:
        dates.setdefault(sub[0], []).extend(sub[1:])

    answer = [[date] + dates[date] for date in sorted(dates)]
    for merged in answer:
        # Print the row itself: list.append() returns None.
        print(merged)
    return answer
enter code here
enter code here

Getting a 2-D list and adding each column

I'm trying to create a function where a user inputs the values for a 3x4 2-D list, then the function prints the grid of numbers, along with an extra row at the bottom with the sum of each column. At the moment, I can't even get it to take any inputs without getting errors. Here's what I've got so far, without the addition of the columns at the end.
def testMatrixFunctions():
    """Prompt for the three rows of a 3-by-4 matrix and print each parsed row.

    Every row is typed as whitespace-separated numbers.  The text is split
    and converted to a list of floats, and that converted list is what gets
    stored and printed (``str.split`` and ``float`` return new objects, so
    their results have to be kept).  Raises ``ValueError`` if an entry is not
    a valid number.
    """
    rows = []
    for index in range(3):
        text = raw_input("Enter a 3-by-4 matrix row for row %d: " % index)
        rows.append([float(c) for c in text.split()])
    for row in rows:
        print(row)
You can use numpy to get the column sum
import sys
import numpy
ROWS = 3
COLS = 4
def main():
    """Read a ROWS-by-COLS matrix from the console and print it with column sums.

    Each row is typed as whitespace-separated numeric literals.  The matrix
    rows are printed one per line, followed by a list holding the sum of each
    of the first COLS columns.
    """
    # Local import: only this function needs it.
    import ast

    matrixData = []
    # Get input and process it
    for i in range(ROWS):
        prompt = 'Enter a %d-by-%d matrix row for row %d : ' % (ROWS, COLS, i)
        thisRow = raw_input(prompt).split()
        # NOTE: this used to call eval() on the typed text, which executes
        # arbitrary expressions.  literal_eval only accepts literals, so
        # plain ints/floats parse as before and anything else is rejected.
        matrixData.append([ast.literal_eval(element) for element in thisRow])

    numpyMatrix = numpy.array(matrixData)
    colSums = [numpy.sum(numpyMatrix[:, index]) for index in range(COLS)]

    # Print the matrix
    for row in matrixData:
        print(row)
    print(colSums)
if __name__ == '__main__':
main()
sys.exit(0)