pandas set_index not working as expected for multiple columns - django

df = im.pd.DataFrame([
    ['abc bank', 'Delhi', 'person a'],
    ['abc bank', 'Delhi', 'person b'],
    ['abc bank', 'Bombay', 'person c'],
    ['abc bank', 'Bombay', 'person d'],
    ['abc bank', 'Surat', 'person c'],
    ['abc bank', 'Surat', 'person d'],
    ['cde bank', 'Delhi', 'person z'],
    ['cde bank', 'Delhi', 'person y'],
    ['cde bank', 'Bombay', 'person x']
    ],
    columns=['corporation', 'city', 'managers'])
print('DataFrame with default index\n', df)
Then when we do this:
df = df.set_index(['corporation', 'city'])
print('\nDataFrame with MultiIndex\n',df)
The output we get is:
DataFrame with MultiIndex
                     managers
corporation city
abc bank    Delhi    person a
            Delhi    person b
            Bombay   person c
            Bombay   person d
            Surat    person c
            Surat    person d
cde bank    Delhi    person z
            Delhi    person y
            Bombay   person x
What I want is:
                     managers
corporation city
abc bank    Delhi    person a
                     person b
            Bombay   person c
                     person d
            Surat    person c
                     person d
cde bank    Delhi    person z
                     person y
            Bombay   person x
So set_index is grouping the first column, 'corporation', but not the second column, 'city'. How can I get this?
Context: I'm doing this to convert the pandas df to HTML with rowspans for the first 2 columns

You can start by sorting the two columns that will form the MultiIndex, then use pandas.DataFrame.loc with pandas.DataFrame.duplicated to replace duplicated entries with empty strings:
df = df.sort_values(['rollno', 'name']).reset_index(drop=True)
df.loc[df.duplicated(['rollno', 'name']), 'name'] = ''
df.loc[df.duplicated('rollno'), 'rollno'] = ''
out = df.set_index(['rollno', 'name'])
# Output :
print(out)
             physics  botony
rollno name
21     Amol       72      67
                  78      69
       Kiku       74      56
22     Ajit       54      76
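Applied to the corporation/city frame from the question, the same pattern would be (just a sketch of the idea above; note that sort_values reorders the cities alphabetically within each bank):
df = df.sort_values(['corporation', 'city']).reset_index(drop=True)
# blank the city where the (corporation, city) pair repeats, then the corporation where it repeats
df.loc[df.duplicated(['corporation', 'city']), 'city'] = ''
df.loc[df.duplicated('corporation'), 'corporation'] = ''
out = df.set_index(['corporation', 'city'])
print(out)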

Related

Pandas: join columns sum some rows and mean others

This is my dataframe:
import matplotlib.pyplot as plt
import pandas as pd
mydic = {'time': {'Type1': 15, 'Type2': 47, 'Type3': 23, 'Type4': 45}, 'count': {'Type1': 26, 'Type2': 39, 'Type3': 34, 'Type4': 67}}
df = pd.DataFrame.from_dict(mydic, orient='index')
df.head()
df:
       Type4  Type1  Type3  Type2
count     67     26     34     39
time      45     15     23     47
I need to join some columns according to a dictionary: sum some rows but calculate average for others. I can do sum() OR mean() but cannot figure out how to do both in one go without creating different data frames for 'count' and 'time'. Help, please?
My code:
def merge(df):
    types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
    columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
    for t in types:
        # df[t] = df[types[t]].sum(axis=1)
        df[t] = df[types[t]].mean(axis=1)
    df.drop(columns_to_drop, axis=1, inplace=True)  # Drop merged columns

merge(df)
df.head()
It seems you need to select rows by loc for the mean and for the sum:
def merge(df):
    types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
    columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
    for t in types:
        df.loc['count', t] = df.loc['count', types[t]].sum()
        df.loc['time', t] = df.loc['time', types[t]].mean()
    df.drop(columns_to_drop, axis=1, inplace=True)  # Drop merged columns
    return df

df1 = merge(df)
print (df1)
       Type1&2  Type3&4
count     65.0    101.0
time      31.0     34.0
Or maybe try this?
d = dict(Type1='Type12', Type2='Type12', Type3='Type34', Type4='Type34')
df1 = df.T.groupby(d).agg({'count': 'sum', 'time': 'mean'}).T
df1
Out[1004]:
       Type12  Type34
count      65     101
time       31      34

How to create a column in pandas dataframe using conditions defined in dict

Here's my code:
import pandas as pd
import numpy as np
input = {'name': ['Andy', 'Alex', 'Amy', 'Olivia'],
         'rating': ['A', 'A', 'B', 'B'],
         'score': [100, 60, 70, 95]}
df = pd.DataFrame(input)
df['valid1']=np.where((df['score']==100) & (df['rating']=='A'),'true','false')
The code above works fine to set the new column 'valid1' to 'true' where score is 100 and rating is 'A'.
If the condition comes from a dict variable as
c = {'score':'100', 'rating':'A'}
How can I use the condition defined in c to get the same 'valid' column value? I tried the following code:
for key, value in c.iteritems():
    df['valid2'] = np.where((df[key] == value), 'true', 'false')
got an error:
TypeError: Invalid type comparison
I'd define c as a pd.Series so that when you compare it to a dataframe, it automatically compares against each row while matching columns with series indices. Note that I made sure 100 was an integer and not a string.
c = pd.Series({'score':100, 'rating':'A'})
i = df.columns.intersection(c.index)
df.assign(valid1=df[i].eq(c).all(1))
     name rating  score  valid1
0    Andy      A    100    True
1    Alex      A     60   False
2     Amy      B     70   False
3  Olivia      B     95   False
You can use the same series and still use numpy to speed things up:
c = pd.Series({'score':100, 'rating':'A'})
i = df.columns.intersection(c.index)
v = np.column_stack([df[col].values for col in i])
df.assign(valid1=(v == c.loc[i].values).all(1))
     name rating  score  valid1
0    Andy      A    100    True
1    Alex      A     60   False
2     Amy      B     70   False
3  Olivia      B     95   False
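If you would rather keep the plain dict and a loop, a minimal sketch (assuming the dict already holds the right dtypes, e.g. 100 as an int) is to AND the per-column comparisons into one mask instead of overwriting 'valid2' on each iteration:
c = {'score': 100, 'rating': 'A'}
mask = pd.Series(True, index=df.index)
for key, value in c.items():
    mask &= df[key] == value  # combine the conditions instead of overwriting the column
df['valid2'] = np.where(mask, 'true', 'false')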

Bokeh - Grouped Bar chart

Maintainer note: This question as-is is obsolete, since the bokeh.charts API was deprecated and removed years ago. But see the answer below for how to create grouped bar charts with the stable bokeh.plotting API in newer versions of Bokeh.
I want to create a simple bar chart (like the one in the official example page).
I tried executing the code in this old answer Plotting Bar Charts with Bokeh
but it shows the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-ba53ce344126> in <module>()
11
12 bar = Bar(xyvalues, cat, title="Stacked bars",
---> 13 xlabel="category", ylabel="language")
14
15 output_file("stacked_bar.html")
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builders/bar_builder.pyc in Bar(data, label, values, color, stack, group, agg, xscale, yscale, xgrid, ygrid, continuous_range, **kw)
318 kw['y_range'] = y_range
319
--> 320 chart = create_and_build(BarBuilder, data, **kw)
321
322 # hide x labels if there is a single value, implying stacking only
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in create_and_build(builder_class, *data, **kws)
60 # create the new builder
61 builder_kws = {k: v for k, v in kws.items() if k in builder_props}
---> 62 builder = builder_class(*data, **builder_kws)
63
64 # create a chart to return, since there isn't one already
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in __init__(self, *args, **kws)
280
281 # handle input attrs and ensure attrs have access to data
--> 282 attributes = self._setup_attrs(data, kws)
283
284 # remove inputs handled by dimensions and chart attributes
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in _setup_attrs(self, data, kws)
331 attributes[attr_name].iterable = custom_palette
332
--> 333 attributes[attr_name].setup(data=source, columns=attr)
334
335 else:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in setup(self, data, columns)
193
194 if columns is not None and self.data is not None:
--> 195 self.set_columns(columns)
196
197 if self.columns is not None and self.data is not None:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in set_columns(self, columns)
185 # assume this is now the iterable at this point
186 self.iterable = columns
--> 187 self._setup_default()
188
189 def setup(self, data=None, columns=None):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_default(self)
142 def _setup_default(self):
143 """Stores the first value of iterable into `default` property."""
--> 144 self.default = next(self._setup_iterable())
145
146 def _setup_iterable(self):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_iterable(self)
320
321 def _setup_iterable(self):
--> 322 return iter(self.items)
323
324 def get_levels(self, columns):
TypeError: 'NoneType' object is not iterable
The official example did work
URL: http://docs.bokeh.org/en/0.11.0/docs/user_guide/charts.html#userguide-charts-data-types
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
p = Bar(df, label='yr', values='mpg', agg='median', group='origin',
        title="Median MPG by YR, grouped by ORIGIN", legend='top_right')
output_file("bar.html")
show(p)
BUT, I don't want to use pandas, I want to use a simple python dictionary like this:
my_simple_dict = {
    'Group 1': [22,33,44,55],
    'Group 2': [44,66,0,24],
    'Group 3': [2,99,33,51]
}
How can I achieve a Bar chart that shows the three groups (Group 1, Group 2, Group 3) with the x-axis going from 1 to 4?
NOTE: I am working with python 2.7
The question and other answers are obsolete, as bokeh.charts was deprecated and removed several years ago. However, support for grouped and stacked bar charts using the stable bokeh.plotting API has improved greatly since then:
https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html
Here is a full example:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
years = ['2015', '2016', '2017']
data = {'fruits' : fruits,
        '2015'   : [2, 1, 4, 3, 2, 4],
        '2016'   : [5, 3, 3, 2, 4, 6],
        '2017'   : [3, 2, 4, 4, 5, 3]}
# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015), ... ]
x = [ (fruit, year) for fruit in fruits for year in years ]
counts = sum(zip(data['2015'], data['2016'], data['2017']), ()) # like an hstack
source = ColumnDataSource(data=dict(x=x, counts=counts))
p = figure(x_range=FactorRange(*x), plot_height=250, title="Fruit Counts by Year",
           toolbar_location=None, tools="")
p.vbar(x='x', top='counts', width=0.9, source=source)
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
show(p)
For now, the solution I found is changing the dict structure:
from bokeh.charts import Bar, output_file, show, hplot
import pandas as pd
my_simple_dict = {
    'Group 1': [22,33,44,55],
    'Group 2': [44,66,0,24],
    'Group 3': [2,99,33,51]
}
my_data_transformed_dict = {}
my_data_transformed_dict['x-axis'] = []
my_data_transformed_dict['value'] = []
my_data_transformed_dict['group-name'] = []
for group, group_list in my_simple_dict.iteritems():
    x_axis = 0
    for item in group_list:
        x_axis += 1
        my_data_transformed_dict['x-axis'].append(x_axis)
        my_data_transformed_dict['value'].append(item)
        my_data_transformed_dict['group-name'].append(group)
my_bar = Bar(my_data_transformed_dict, values='value',label='x-axis',group='group-name',legend='top_right')
output_file("grouped_bar.html")
show(my_bar)
If someone knows a better way please tell me
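For reference, a minimal sketch of the same chart built straight from my_simple_dict with the bokeh.plotting API used in the answer above, no pandas involved (the 1-4 positions and the group names become the categorical factors; this assumes a Bokeh version where figure still accepts plot_height, as in that example):
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

my_simple_dict = {
    'Group 1': [22, 33, 44, 55],
    'Group 2': [44, 66, 0, 24],
    'Group 3': [2, 99, 33, 51]
}

groups = sorted(my_simple_dict)
# one (position, group) factor per bar, so the x-axis runs 1..4 with the three groups side by side
x = [(str(i), g) for i in range(1, 5) for g in groups]
counts = [my_simple_dict[g][i - 1] for i in range(1, 5) for g in groups]

source = ColumnDataSource(data=dict(x=x, counts=counts))
p = figure(x_range=FactorRange(*x), plot_height=250, title="Grouped bars from a dict",
           toolbar_location=None, tools="")
p.vbar(x='x', top='counts', width=0.9, source=source)
p.y_range.start = 0
p.xgrid.grid_line_color = None
show(p)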

Django use ORM to get range of a particular element

I have a model called MyModel which has some dummy data as follows:
item date value
------------------------------
ab 8/10/12 1
ab 7/10/12 2
ab 6/10/12 3
abc 8/10/12 4
abc 7/10/12 5
abc 7/10/12 6
ab 7/10/12 7
ab 7/10/12 8
ab 7/10/12 9
ab 7/10/12 10
abc 7/10/12 11
abc 7/10/12 12
abc 7/10/12 13
I would like to query this model in such a way that I get an output that gives me the ranges of the items that occur in sequence, something like the following:
[{'item': 'ab', 'values': '1-3'},
{'item': 'abc', 'values': '4-6'},
{'item': 'ab', 'values': '7-10'},
{'item': 'abc', 'values': '11-13'}]
How would I be able to do this using the Django ORM?
Pretty certain you can't do that with the ORM...you'll need to write your own python code to do that.
counts = []
for model in MyModel.objects.all().order_by('value'):
    if not counts or last_item != model.item:
        counts.append({'item': model.item, 'values': [model.value]})
        last_item = model.item
    elif model.item == last_item:
        counts[-1]['values'].append(model.value)

for count in counts:
    count['values'] = '%s-%s' % (count['values'][0], count['values'][-1])
Edit:
counts = []
for model in MyModel.objects.all().order_by('value'):
    if not counts or last_item != model.item:
        counts.append({'item': model.item, 'first': model.value, 'last': model.value})
        last_item = model.item
    elif model.item == last_item:
        counts[-1]['last'] = model.value
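To get the same 'values' strings as the first version, this edit would still need one more pass over counts, for example:
for count in counts:
    count['values'] = '%s-%s' % (count.pop('first'), count.pop('last'))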

Assigning different values in Python Pandas based on criteria

I am trying to clean up the data. For the first name variable, I would like to 1) assign a missing value (NaN) to entries that have only one character, 2) assign a missing value if the entry contains only two characters AND one of them is a symbol (e.g. "." or "?"), and 3) convert "wm" to the string "william".
I tried the following and other codes, but none seems to work:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
def CleanUp():
    data = pd.read_csv("C:\sample.csv")
    frame2 = DataFrame(data)
    frame2.columns = ["First Name", "Ethnicity"]
    # Convert weird values to missing value
    for Name in frame2["First_Name"]:
        if len(Name) == 1:
            Name == np.nan
        if (len(Name) == 2) and (Name.str.contain(".|?|:", na=False)):
            Name == np.nan
        if Name == "wm":
            Name == "william"
    print frame2["First_Name"]
You're looking for df.replace
make up some data:
np.random.seed(3)
n=6
df = pd.DataFrame({'Name' : np.random.choice(['wm','bob','harry','chickens'], size=n),
                   'timeStamp' : np.random.randint(1000, size=n)})
print df
       Name  timeStamp
0     harry        256
1        wm        789
2       bob        659
3  chickens        714
4        wm        875
5        wm        681
run the replace:
df.Name = df.Name.replace('wm','william')
print df
       Name  timeStamp
0     harry        256
1   william        789
2       bob        659
3  chickens        714
4   william        875
5   william        681
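df.replace covers the 'wm' case; for the question's other two rules, a rough sketch (assuming the column is named 'Name' as in the example data above) could combine str.len and str.contains with loc:
s = df['Name'].astype(str)
# rule 1: single-character entries become missing
df.loc[s.str.len() == 1, 'Name'] = np.nan
# rule 2: two-character entries containing a symbol such as '.', '?' or ':' become missing
df.loc[(s.str.len() == 2) & s.str.contains(r'[.?:]'), 'Name'] = np.nan
# rule 3: replace 'wm' with 'william'
df['Name'] = df['Name'].replace('wm', 'william')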