This is my dataframe:
import matplotlib.pyplot as plt
import pandas as pd
mydic = {'time': {'Type1': 15, 'Type2': 47, 'Type3': 23, 'Type4': 45}, 'count': {'Type1': 26, 'Type2': 39, 'Type3': 34, 'Type4': 67}}
df = pd.DataFrame.from_dict(mydic, orient='index')
df.head()
df:
Type4 Type1 Type3 Type2
count 67 26 34 39
time 45 15 23 47
I need to join some columns according to a dictionary: sum some rows but calculate average for others. I can do sum() OR mean() but cannot figure out how to do both in one go without creating different data frames for 'count' and 'time'. Help, please?
My code:
def merge(df):
types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
for t in types:
# df[t] = df[types[t]].sum(axis=1)
df[t] = df[types[t]].mean(axis=1)
df.drop(columns_to_drop, axis=1, inplace=True) # Drop merged columns
merge(df)
df.head()
It seems you need select rows by loc for mean and for sum:
def merge(df):
types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
for t in types:
df.loc['count', t] = df.loc['count', types[t]].sum()
df.loc['time', t] = df.loc['time', types[t]].mean()
df.drop(columns_to_drop, axis=1, inplace=True) # Drop merged columns
return df
df1 = merge(df)
print (df1)
Type1&2 Type3&4
count 65.0 101.0
time 31.0 34.0
Or maybe try this ?
d = dict(Type1='Type12', Type2='Type12', Type3='Type34', Type4='Type34')
df1=df.T.groupby(d).agg({'count':'sum','time':'mean'}).T
df1
Out[1004]:
Type12 Type34
count 65 101
time 31 34
Related
Here's my code:
import pandas as pd
import numpy as np
input = {'name': ['Andy', 'Alex', 'Amy', "Olivia" ],
'rating': ['A', 'A', 'B', "B" ],
'score': [100, 60, 70, 95]}
df = pd.DataFrame(input)
df['valid1']=np.where((df['score']==100) & (df['rating']=='A'),'true','false')
The code above works fine to set a new column 'valid1' data as 'true' for score is 100 and 'rating' is A.
If the condition comes from a dict variable as
c = {'score':'100', 'rating':'A'}
How can I use the condition defined in c to get the same result 'valid' column value? I tried the following code
for key,value in c.iteritems():
df['valid2']=np.where((df[key]==value),'true','false')
got an error:
TypeError: Invalid type comparison
I'd define c as a pd.Series so that when you compare it to a dataframe, it automatically compares agains each row while matching columns with series indices. Note that I made sure 100 was an integer and not a string.
c = pd.Series({'score':100, 'rating':'A'})
i = df.columns.intersection(c.index)
df.assign(valid1=df[i].eq(c).all(1))
name rating score valid1
0 Andy A 100 True
1 Alex A 60 False
2 Amy B 70 False
3 Olivia B 95 False
You can use the same series and still use numpy to speed things up
c = pd.Series({'score':100, 'rating':'A'})
i = df.columns.intersection(c.index)
v = np.column_stack(df[c].values for c in i)
df.assign(valid1=(v == c.loc[i].values).all(1))
name rating score valid1
0 Andy A 100 True
1 Alex A 60 False
2 Amy B 70 False
3 Olivia B 95 False
I am using the following code to create a data frame from a list:
test_list = ['a','b','c','d']
df_test = pd.DataFrame.from_records(test_list, columns=['my_letters'])
df_test
The above code works fine. Then I tried the same approach for another list:
import pandas as pd
q_list = ['112354401', '116115526', '114909312', '122425491', '131957025', '111373473']
df1 = pd.DataFrame.from_records(q_list, columns=['q_data'])
df1
But it gave me the following errors this time:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-24-99e7b8e32a52> in <module>()
1 import pandas as pd
2 q_list = ['112354401', '116115526', '114909312', '122425491', '131957025', '111373473']
----> 3 df1 = pd.DataFrame.from_records(q_list, columns=['q_data'])
4 df1
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in from_records(cls, data, index, exclude, columns, coerce_float, nrows)
1021 else:
1022 arrays, arr_columns = _to_arrays(data, columns,
-> 1023 coerce_float=coerce_float)
1024
1025 arr_columns = _ensure_index(arr_columns)
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5550 data = lmap(tuple, data)
5551 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5552 dtype=dtype)
5553
5554
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5607 content = list(lib.to_object_array(data).T)
5608 return _convert_object_array(content, columns, dtype=dtype,
-> 5609 coerce_float=coerce_float)
5610
5611
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5666 # caller's responsibility to check for this...
5667 raise AssertionError('%d columns passed, passed data had %s '
-> 5668 'columns' % (len(columns), len(content)))
5669
5670 # provide soft conversion of object dtypes
AssertionError: 1 columns passed, passed data had 9 columns
Why would the same approach work for one list but not another? Any idea what might be wrong here? Thanks a lot!
DataFrame.from_records treats string as a character list. so it needs as many columns as length of string.
You could simply use the DataFrame constructor.
In [3]: pd.DataFrame(q_list, columns=['q_data'])
Out[3]:
q_data
0 112354401
1 116115526
2 114909312
3 122425491
4 131957025
5 111373473
In[20]: test_list = [['a','b','c'], ['AA','BB','CC']]
In[21]: pd.DataFrame(test_list, columns=['col_A', 'col_B', 'col_C'])
Out[21]:
col_A col_B col_C
0 a b c
1 AA BB CC
In[22]: pd.DataFrame(test_list, index=['col_low', 'col_up']).T
Out[22]:
col_low col_up
0 a AA
1 b BB
2 c CC
If you want to create a DataFrame from multiple lists you can simply zip the lists. This returns a 'zip' object. So you convert back to a list.
mydf = pd.DataFrame(list(zip(lstA, lstB)), columns = ['My List A', 'My List B'])
just using concat method
test_list = ['a','b','c','d']
pd.concat(test_list )
You could also take the help of numpy.
import numpy as np
df1 = pd.DataFrame(np.array(q_list),columns=['q_data'])
I am reading in a CSV file with the general schema of
,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
I am running into problems where fields are not existing such as in object 0 where it is lacking an IBU. I would like to be able to insert a value such as 0.0 that would work as a float for values that require floats and an empty string for ones that require strings.
My code is along the lines of
import csv
import numpy as np
def dataset(path, filter_field, filter_value):
with open(path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
if filter_field:
for row in filter(lambda row: row[filter_field]==filter_value, reader):
yield row
def main(path):
data = [(row["ibu"], float(row["ibu"])) for row in dataset(path, "style", "American Pale Lager")]
As of right now my code would throw an error sine there are empty values in the "ibu" column for object 0.
How should one go about solving this problem?
You can do the following:
add a default dictionary input that you can use for missing values
and also to update upon certain conditions such as when ibu is empty
this is your implementation changed to correct for what you need. If I were you I would use pandas ...
import csv, copy
def dataset(path, filter_field, filter_value, default={'brewery_id':-1, 'style': 'unkown style', ' ': -1, 'name': 'unkown name', 'abi':0.0, 'id': -1, 'ounces':-1, 'ibu':0.0}):
with open(path, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row is None:
break
if row[filter_field].strip() != filter_value:
continue
default_row = copy.copy(default)
default_row.update(row)
# you might want to add conditions
if default_row["ibu"] == "":
default_row["ibu"] = default["ibu"]
yield default_row
data = [(row["ibu"], float(row["ibu"])) for row in dataset('test.csv', "style", "American Pale Lager")]
print data
>> [(0.0, 0.0)]
Why don't you use
import pandas as pd
df = pd.read_csv(data_file)
The following is the result:
In [13]: df
Out[13]:
Unnamed: 0 abv ibu id name style \
0 14 0.061 60.0 1979 Bitter Bitch American Pale Ale (APA)
1 0 0.050 NaN 1436 Pub Beer American Pale Lager
brewery_id ounces
0 177 12.0
1 408 12.0
Simulating your file with a text string:
In [48]: txt=b""" ,abv,ibu,id,name,style,brewery_id,ounces
...: 14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
...: 0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
...: """
I can load it with numpy genfromtxt.
In [49]: data=np.genfromtxt(txt.splitlines(),delimiter=',',dtype=None,skip_heade
...: r=1,filling_values=0)
In [50]: data
Out[50]:
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8'), ('f3', '<i4'), ('f4', 'S12'), ('f5', 'S23'), ('f6', '<i4'), ('f7', '<f8')])
In [51]:
I had to skip the header line because it is incomplete (a blank for the 1st field). The result is a structured array - a mix of ints, floats and strings (bytestrings in Py3).
After correcting the header line, and using names=True, I get
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('abv', '<f8'), ('ibu', '<f8'), ('id', '<i4'), ('name', 'S12'), ('style', 'S23'), ('brewery_id', '<i4'), ('ounces', '<f8')])
genfromtxt is the most powerful csv reader in numpy. See it's docs for more parameters. The pandas reader is faster and more flexible - but of course produces a data frame, not array.
I am running a snippet of code that queries a database and then fills in a pandas dataframe with a value of 1 if that tuple is present in the query. it does this by running the query then iterates over the tuples and fills in the dataframe. However, the query returns almost 8 million rows of data.
My question is if anyone knows how to speed up a process like this. Here is the code below:
user_age = pd.read_sql_query(sql_age, datastore, index_col=['userid']).age.astype(np.int, copy=False)
x = pd.DataFrame(0, index=user_age.index, columns=range(366), dtype=np.int8)
for r in pd.read_sql_query(sql_active, datastore, chunksize=50000):
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
Thank you in advance!
You could save some time by replacing the Python loop
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
with a NumPy array assignment using "advanced integer indexing":
x[npidx[r['userid']], r['day']] = 1
On a 80000-row DataFrame, using_numpy (below) is about 6x faster:
In [7]: %timeit orig()
1 loop, best of 3: 984 ms per loop
In [8]: %timeit using_numpy()
10 loops, best of 3: 162 ms per loop
import numpy as np
import pandas as pd
def mock_read_sql_query():
np.random.seed(2016)
for arr in np.array_split(index, N//M):
size = len(arr)
df = pd.DataFrame({'userid':arr , 'day':np.random.randint(366, size=size)})
df = df[['userid', 'day']]
yield df
N, M = 8*10**4, 5*10**2
index = np.arange(N)
np.random.shuffle(index)
columns = range(366)
def using_numpy():
npidx = np.empty_like(index)
npidx[index] = np.arange(len(index))
x = np.zeros((len(index), len(columns)), dtype=np.int8)
for r in mock_read_sql_query():
x[npidx[r['userid']], r['day']] = 1
x = pd.DataFrame(x, columns=columns, index=index)
return x
def orig():
x = pd.DataFrame(0, index=index, columns=columns, dtype=np.int8)
for r in mock_read_sql_query():
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
return x
expected = orig()
result = using_numpy()
expected_index, expected_col = np.where(expected)
result_index, result_col = np.where(result)
assert np.equal(expected_index, result_index).all()
assert np.equal(expected_col, result_col).all()
Maintainer note: This question as-is is obsolete, since the bokeh.charts API was deprecated and removed years ago. But see the answer below for how to create grouped bar charts with the stable bokeh.plotting API in newer versions of Bokeh
I want to create a simple bar chart (like the one in the oficial example page)
I tried executing the code in this old answer Plotting Bar Charts with Bokeh
but it show the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-ba53ce344126> in <module>()
11
12 bar = Bar(xyvalues, cat, title="Stacked bars",
---> 13 xlabel="category", ylabel="language")
14
15 output_file("stacked_bar.html")
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builders/bar_builder.pyc in Bar(data, label, values, color, stack, group, agg, xscale, yscale, xgrid, ygrid, continuous_range, **kw)
318 kw['y_range'] = y_range
319
--> 320 chart = create_and_build(BarBuilder, data, **kw)
321
322 # hide x labels if there is a single value, implying stacking only
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in create_and_build(builder_class, *data, **kws)
60 # create the new builder
61 builder_kws = {k: v for k, v in kws.items() if k in builder_props}
---> 62 builder = builder_class(*data, **builder_kws)
63
64 # create a chart to return, since there isn't one already
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in __init__(self, *args, **kws)
280
281 # handle input attrs and ensure attrs have access to data
--> 282 attributes = self._setup_attrs(data, kws)
283
284 # remove inputs handled by dimensions and chart attributes
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in _setup_attrs(self, data, kws)
331 attributes[attr_name].iterable = custom_palette
332
--> 333 attributes[attr_name].setup(data=source, columns=attr)
334
335 else:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in setup(self, data, columns)
193
194 if columns is not None and self.data is not None:
--> 195 self.set_columns(columns)
196
197 if self.columns is not None and self.data is not None:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in set_columns(self, columns)
185 # assume this is now the iterable at this point
186 self.iterable = columns
--> 187 self._setup_default()
188
189 def setup(self, data=None, columns=None):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_default(self)
142 def _setup_default(self):
143 """Stores the first value of iterable into `default` property."""
--> 144 self.default = next(self._setup_iterable())
145
146 def _setup_iterable(self):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_iterable(self)
320
321 def _setup_iterable(self):
--> 322 return iter(self.items)
323
324 def get_levels(self, columns):
TypeError: 'NoneType' object is not iterable
The oficial example did work
URL: http://docs.bokeh.org/en/0.11.0/docs/user_guide/charts.html#userguide-charts-data-types
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
p = Bar(df, label='yr', values='mpg', agg='median', group='origin',
title="Median MPG by YR, grouped by ORIGIN", legend='top_right')
output_file("bar.html")
show(p)
BUT, I don't want to use pandas, I want to use a simple python dictionary like this:
my_simple_dict = {
'Group 1': [22,33,44,55],
'Group 2': [44,66,0,24],
'Group 3': [2,99,33,51]
}
How cant I achive a Bar chart that shows the tree groups (Group 1, Group 2, Group 3) with the x-axis going from 1 to 4?
NOTE: I am working with python 2.7
The question and other answers are obsolete, as bokeh.charts was deprecated and removed several years ago. However. support for grouped and stacked bar charts using the stable bokeh.plotting API has improved greatly since then:
https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html
Here is a full example:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
years = ['2015', '2016', '2017']
data = {'fruits' : fruits,
'2015' : [2, 1, 4, 3, 2, 4],
'2016' : [5, 3, 3, 2, 4, 6],
'2017' : [3, 2, 4, 4, 5, 3]}
# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015), ... ]
x = [ (fruit, year) for fruit in fruits for year in years ]
counts = sum(zip(data['2015'], data['2016'], data['2017']), ()) # like an hstack
source = ColumnDataSource(data=dict(x=x, counts=counts))
p = figure(x_range=FactorRange(*x), plot_height=250, title="Fruit Counts by Year",
toolbar_location=None, tools="")
p.vbar(x='x', top='counts', width=0.9, source=source)
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
show(p)
For now the solution I found is changing the dict structure
from bokeh.charts import Bar, output_file, show, hplot
import pandas as pd
my_simple_dict = {
'Group 1': [22,33,44,55],
'Group 2': [44,66,0,24],
'Group 3': [2,99,33,51]
}
my_data_transformed_dict = {}
my_data_transformed_dict['x-axis'] = []
my_data_transformed_dict['value'] = []
my_data_transformed_dict['group-name'] = []
for group, group_list in my_simple_dict.iteritems():
x_axis = 0
for item in group_list:
x_axis += 1
my_data_transformed_dict['x-axis'].append(x_axis)
my_data_transformed_dict['value'].append(item)
my_data_transformed_dict['group-name'].append(group)
my_bar = Bar(my_data_transformed_dict, values='value',label='x-axis',group='group-name',legend='top_right')
output_file("grouped_bar.html")
show(my_bar)
If someone knows a better way please tell me