Maintainer note: This question as-is is obsolete, since the bokeh.charts API was deprecated and removed years ago. But see the answer below for how to create grouped bar charts with the stable bokeh.plotting API in newer versions of Bokeh
I want to create a simple bar chart (like the one in the official example page)
I tried executing the code in this old answer Plotting Bar Charts with Bokeh
but it shows the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-ba53ce344126> in <module>()
11
12 bar = Bar(xyvalues, cat, title="Stacked bars",
---> 13 xlabel="category", ylabel="language")
14
15 output_file("stacked_bar.html")
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builders/bar_builder.pyc in Bar(data, label, values, color, stack, group, agg, xscale, yscale, xgrid, ygrid, continuous_range, **kw)
318 kw['y_range'] = y_range
319
--> 320 chart = create_and_build(BarBuilder, data, **kw)
321
322 # hide x labels if there is a single value, implying stacking only
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in create_and_build(builder_class, *data, **kws)
60 # create the new builder
61 builder_kws = {k: v for k, v in kws.items() if k in builder_props}
---> 62 builder = builder_class(*data, **builder_kws)
63
64 # create a chart to return, since there isn't one already
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in __init__(self, *args, **kws)
280
281 # handle input attrs and ensure attrs have access to data
--> 282 attributes = self._setup_attrs(data, kws)
283
284 # remove inputs handled by dimensions and chart attributes
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in _setup_attrs(self, data, kws)
331 attributes[attr_name].iterable = custom_palette
332
--> 333 attributes[attr_name].setup(data=source, columns=attr)
334
335 else:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in setup(self, data, columns)
193
194 if columns is not None and self.data is not None:
--> 195 self.set_columns(columns)
196
197 if self.columns is not None and self.data is not None:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in set_columns(self, columns)
185 # assume this is now the iterable at this point
186 self.iterable = columns
--> 187 self._setup_default()
188
189 def setup(self, data=None, columns=None):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_default(self)
142 def _setup_default(self):
143 """Stores the first value of iterable into `default` property."""
--> 144 self.default = next(self._setup_iterable())
145
146 def _setup_iterable(self):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_iterable(self)
320
321 def _setup_iterable(self):
--> 322 return iter(self.items)
323
324 def get_levels(self, columns):
TypeError: 'NoneType' object is not iterable
The official example did work
URL: http://docs.bokeh.org/en/0.11.0/docs/user_guide/charts.html#userguide-charts-data-types
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
# Grouped bar chart built with the bokeh.charts API (since deprecated and removed):
# one x-axis label per 'yr', bars grouped by 'origin', bar height = median of 'mpg'.
p = Bar(df, label='yr', values='mpg', agg='median', group='origin',
title="Median MPG by YR, grouped by ORIGIN", legend='top_right')
# Write the chart to bar.html, then display it.
output_file("bar.html")
show(p)
BUT, I don't want to use pandas, I want to use a simple python dictionary like this:
# Each key is a group name; each list holds that group's four values,
# one per x-axis position.
my_simple_dict = {
'Group 1': [22,33,44,55],
'Group 2': [44,66,0,24],
'Group 3': [2,99,33,51]
}
How can I achieve a bar chart that shows the three groups (Group 1, Group 2, Group 3) with the x-axis going from 1 to 4?
NOTE: I am working with python 2.7
The question and other answers are obsolete, as bokeh.charts was deprecated and removed several years ago. However, support for grouped and stacked bar charts using the stable bokeh.plotting API has improved greatly since then:
https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html
Here is a full example:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
years = ['2015', '2016', '2017']

data = {'fruits': fruits,
        '2015': [2, 1, 4, 3, 2, 4],
        '2016': [5, 3, 3, 2, 4, 6],
        '2017': [3, 2, 4, 4, 5, 3]}

# Nested categorical factors, one (fruit, year) pair per bar:
# [("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015"), ...]
x = [(fruit, year) for fruit in fruits for year in years]

# Interleave the per-year columns so counts[i] belongs to x[i] (like an hstack).
# Driven by `years` so adding a year only requires touching `years` and `data`.
counts = sum(zip(*(data[year] for year in years)), ())

source = ColumnDataSource(data=dict(x=x, counts=counts))

# `height` is used instead of `plot_height`, which was removed in Bokeh 3.0.
p = figure(x_range=FactorRange(*x), height=250, title="Fruit Counts by Year",
           toolbar_location=None, tools="")

p.vbar(x='x', top='counts', width=0.9, source=source)

# Cosmetics: bars start at zero, pad the categorical range, tilt the labels,
# and hide the vertical grid lines.
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)
For now the solution I found is changing the dict structure
from bokeh.charts import Bar, output_file, show, hplot
import pandas as pd
my_simple_dict = {
    'Group 1': [22, 33, 44, 55],
    'Group 2': [44, 66, 0, 24],
    'Group 3': [2, 99, 33, 51],
}

# Reshape the {group: [values]} mapping into three parallel columns
# (one entry per bar), which is the layout passed to Bar below.
my_data_transformed_dict = {'x-axis': [], 'value': [], 'group-name': []}

# .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
for group, group_list in my_simple_dict.items():
    # x-axis positions are 1-based.
    for x_axis, item in enumerate(group_list, 1):
        my_data_transformed_dict['x-axis'].append(x_axis)
        my_data_transformed_dict['value'].append(item)
        my_data_transformed_dict['group-name'].append(group)

# NOTE: Bar comes from bokeh.charts, which has since been deprecated and removed.
my_bar = Bar(my_data_transformed_dict, values='value', label='x-axis',
             group='group-name', legend='top_right')
output_file("grouped_bar.html")
show(my_bar)
If someone knows a better way please tell me
Related
Can anyone please help me move forward with my modeling? I have no idea where the .lower attribute mentioned in the error is being called, or how to fix it. I would appreciate any help.
HERE IS THE ONLY PART WHERE I APPLIED .LOWER
# Single shared lemmatizer instance (the statement was pasted twice on one line,
# which is a syntax error).
wordnet_lemmatizer = WordNetLemmatizer()
def create_tokens(df2):
    """Add text-preprocessing columns to ``df2``, derived from ``df2['Movie']``.

    Columns written, each built from the previous one:
      'low'           - lower-cased text
      'stopwords_out' - text with the words found in ``stops`` removed
      'tokenized'     - list of tokens from ``nltk.word_tokenize``
      'eng_only'      - tokens for which ``str.isalpha()`` is true
      'lemmatized'    - tokens passed through ``wordnet_lemmatizer.lemmatize``

    ``df2`` is modified in place; nothing is returned.
    """
    df2['low'] = df2['Movie'].str.lower()
    df2['stopwords_out'] = df2['low'].apply(
        lambda text: " ".join([word for word in text.split() if word not in stops]))
    # Apply column-wise: only one column is needed, so a row-wise
    # DataFrame.apply(axis=1) is unnecessary.
    df2['tokenized'] = df2['stopwords_out'].apply(nltk.word_tokenize)
    df2['eng_only'] = df2['tokenized'].apply(
        lambda tokens: [word for word in tokens if word.isalpha()])
    df2['lemmatized'] = df2['eng_only'].apply(
        lambda tokens: [wordnet_lemmatizer.lemmatize(word) for word in tokens])
HERE IS WHEN I HAVE CHANGED MY LEMMATIZED COLUMN TO LIST
# One entry per row of df2: the content of that row's 'lemmatized' cell.
a = df2.lemmatized.to_list()
# Flatten the per-row sequences into a single list of tokens.
b = (list(itertools.chain.from_iterable(a)))
# Tally the tokens with Counter.
bow = Counter (b)
HERE IS WHEN I TRY TO CREATE TF IDF AND WHERE THE ERROR APPEARS
# NOTE(review): max_df=1 is an int; scikit-learn documents int values as an
# absolute document count rather than a proportion - confirm 1.0 was not intended.
cv = CountVectorizer(min_df=0, max_df=1)
# This is the call that raises the AttributeError shown below: each element of
# df2.lemmatized is a list of tokens, and the vectorizer's preprocessing step
# calls .lower() on each document.
tf = cv.fit_transform(df2.lemmatized)
THE ERROR
AttributeError Traceback (most recent call last)
C:\AppData\Local\Temp/ipykernel_24552/1530549768.py in
2
3 cv = CountVectorizer(min_df=0, max_df=1)
----> 4 tf = cv.fit_transform(df2.lemmatized)
5
6 print(df2.lemmatized)
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self.count_vocab(raw_documents,
1203 self.fixed_vocabulary)
1204
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
~\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'list' object has no attribute 'lower'
print(df2.lemmatized)
This is my dataframe:
import matplotlib.pyplot as plt
import pandas as pd
# Nested dict: outer keys ('time', 'count') map to {type name: value}.
mydic = {'time': {'Type1': 15, 'Type2': 47, 'Type3': 23, 'Type4': 45}, 'count': {'Type1': 26, 'Type2': 39, 'Type3': 34, 'Type4': 67}}
# orient='index' makes the outer keys the row labels and the inner keys the columns.
df = pd.DataFrame.from_dict(mydic, orient='index')
df.head()
df:
Type4 Type1 Type3 Type2
count 67 26 34 39
time 45 15 23 47
I need to join some columns according to a dictionary: sum some rows but calculate average for others. I can do sum() OR mean() but cannot figure out how to do both in one go without creating different data frames for 'count' and 'time'. Help, please?
My code:
def merge(df):
    """Collapse the Type1..Type4 columns of ``df`` into 'Type1&2' and 'Type3&4'.

    Every row of a merged column is the mean of its source columns; the
    sum variant is kept as a comment because only one of the two can be
    active at a time in this version. ``df`` is modified in place and
    nothing is returned.
    """
    types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
    columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
    for merged_name, source_columns in types.items():
        # Alternative aggregation:
        # df[merged_name] = df[source_columns].sum(axis=1)
        df[merged_name] = df[source_columns].mean(axis=1)
    # Drop the source columns only after every merged column has been computed.
    df.drop(columns_to_drop, axis=1, inplace=True)
merge(df)
df.head()
It seems you need select rows by loc for mean and for sum:
def merge(df):
    """Collapse the Type1..Type4 columns into 'Type1&2' and 'Type3&4'.

    The 'count' row of each merged column is the sum of its source
    columns and the 'time' row is their mean. ``df`` is modified in
    place and the same object is returned.
    """
    types = {'Type1&2': ['Type1', 'Type2'], 'Type3&4': ['Type3', 'Type4']}
    columns_to_drop = ['Type1', 'Type2', 'Type3', 'Type4']
    for merged_name, source_columns in types.items():
        # Select a single row with .loc so each row gets its own aggregation.
        df.loc['count', merged_name] = df.loc['count', source_columns].sum()
        df.loc['time', merged_name] = df.loc['time', source_columns].mean()
    # Drop the source columns only after every merged column has been computed.
    df.drop(columns_to_drop, axis=1, inplace=True)
    return df
df1 = merge(df)
print (df1)
Type1&2 Type3&4
count 65.0 101.0
time 31.0 34.0
Or maybe try this ?
# Map every original column name to the label of the group it is merged into.
d = dict(Type1='Type12', Type2='Type12', Type3='Type34', Type4='Type34')
# Transpose so the Type* labels form the index, group them through the mapping,
# aggregate 'count' with sum and 'time' with mean, then transpose back.
df1=df.T.groupby(d).agg({'count':'sum','time':'mean'}).T
df1
Out[1004]:
Type12 Type34
count 65 101
time 31 34
import numpy as np
import pandas as pd
# Sample data: integer values plus two boolean columns derived from them.
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
'value': values,
'even': values % 2 == 0,
'above_three': values > 3
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
# Find second largest value in each group
if True:
def second_largest(xs):
# This is the line the traceback below points at: the object passed in by
# groupby(...).apply() is a Series, and it has no attribute 'sort'.
# NOTE(review): Series.sort_values(ascending=False) is presumably the
# replacement to use here - confirm against the pandas documentation.
sorted_xs = xs.sort(inplace=False, ascending=False)
# Position 1 of the descending order is the second largest value.
return sorted_xs.iloc[1]
# Group the rows by the boolean 'even' column and apply the helper to each
# group's 'value' column.
grouped_data = example_df.groupby('even')
print grouped_data['value'].apply(second_largest)
The traceback says the following
AttributeError
Traceback (most recent call last) <ipython-input-94-251c7e3ea488> in
()
14
15 grouped_data = example_df.groupby('even')
---> 16 print grouped_data['value'].apply(second_largest)
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in
apply(self, func, *args, **kwargs)
714 # ignore SettingWithCopy here in case the user mutates
715 with option_context('mode.chained_assignment', None):
--> 716 return self._python_apply_general(f)
717
718 def _python_apply_general(self, f):
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in
_python_apply_general(self, f)
718 def _python_apply_general(self, f):
719 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720 self.axis)
721
722 return self._wrap_applied_output(
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in
apply(self, f, data, axis) 1800 # group might be
modified 1801 group_axes = _get_axes(group)
-> 1802 res = f(group) 1803 if not _is_indexed_like(res, group_axes): 1804 mutated = True
in second_largest(xs)
9 if True:
10 def second_largest(xs):
---> 11 sorted_xs = xs.sort(inplace=False, ascending=False)
12 print sorted_xs
13 return sorted_xs.iloc[1]
C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\generic.pyc in
getattr(self, name) 3079 if name in self._info_axis: 3080 return self[name]
-> 3081 return object.getattribute(self, name) 3082 3083 def setattr(self, name, value):
AttributeError: 'Series' object has no attribute 'sort'
I am reading in a CSV file with the general schema of
,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
I am running into problems where fields are not existing such as in object 0 where it is lacking an IBU. I would like to be able to insert a value such as 0.0 that would work as a float for values that require floats and an empty string for ones that require strings.
My code is along the lines of
import csv
import numpy as np
def dataset(path, filter_field, filter_value):
    """Yield the rows of the CSV file at ``path`` that match a filter.

    Rows are the dicts produced by ``csv.DictReader``. A row is yielded
    when ``row[filter_field] == filter_value`` (exact comparison, no
    whitespace stripping). If ``filter_field`` is falsy the file is still
    opened but nothing is yielded.

    Raises:
        KeyError: if ``filter_field`` is not a column of the file.
    """
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        if not filter_field:
            return
        for row in reader:
            if row[filter_field] == filter_value:
                yield row
def main(path):
    """Build (raw, float) 'ibu' pairs for the "American Pale Lager" rows of ``path``.

    NOTE: ``float(row["ibu"])`` raises ValueError for any matching row whose
    'ibu' field is empty - this is the failure the surrounding text asks
    about. The list is built but not returned.
    """
    data = [(row["ibu"], float(row["ibu"]))
            for row in dataset(path, "style", "American Pale Lager")]
As of right now my code would throw an error since there are empty values in the "ibu" column for object 0.
How should one go about solving this problem?
You can do the following:
add a default dictionary input that you can use for missing values
and also to update upon certain conditions such as when ibu is empty
this is your implementation changed to correct for what you need. If I were you I would use pandas ...
import csv, copy
# Fallback values used when the caller does not pass ``default``.
# NOTE(review): 'abi' looks like a typo for the CSV's 'abv' column and ' ' for
# the unnamed first column; both are kept unchanged so yielded rows keep the
# same keys - confirm before renaming.
_DEFAULT_ROW = {'brewery_id': -1, 'style': 'unkown style', ' ': -1,
                'name': 'unkown name', 'abi': 0.0, 'id': -1, 'ounces': -1,
                'ibu': 0.0}


def dataset(path, filter_field, filter_value, default=None):
    """Yield matching CSV rows with defaults filled in for missing values.

    Args:
        path: CSV file to read with ``csv.DictReader``.
        filter_field: column to filter on; its value is stripped of
            surrounding whitespace before comparison.
        filter_value: value the stripped field must equal.
        default: mapping of fallback values copied into every yielded
            row before the CSV values are applied on top. ``None`` (the
            default) selects the built-in fallback row.

    Yields:
        A fresh dict per matching row. 'ibu' is replaced by
        ``default['ibu']`` when the CSV value is missing, empty or only
        whitespace; every other field keeps the raw CSV string.

    Raises:
        KeyError: if ``filter_field`` is not a column of the file.
    """
    if default is None:
        default = _DEFAULT_ROW
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # DictReader fills fields missing from a short row with None.
            if (row[filter_field] or "").strip() != filter_value:
                continue
            # Copy so the fallback mapping itself is never mutated.
            default_row = copy.copy(default)
            default_row.update(row)
            ibu = default_row["ibu"]
            if ibu is None or (isinstance(ibu, str) and not ibu.strip()):
                default_row["ibu"] = default["ibu"]
            yield default_row
data = [(row["ibu"], float(row["ibu"])) for row in dataset('test.csv', "style", "American Pale Lager")]
print data
>> [(0.0, 0.0)]
Why don't you use
import pandas as pd
df = pd.read_csv(data_file)
The following is the result:
In [13]: df
Out[13]:
Unnamed: 0 abv ibu id name style \
0 14 0.061 60.0 1979 Bitter Bitch American Pale Ale (APA)
1 0 0.050 NaN 1436 Pub Beer American Pale Lager
brewery_id ounces
0 177 12.0
1 408 12.0
Simulating your file with a text string:
In [48]: txt=b""" ,abv,ibu,id,name,style,brewery_id,ounces
...: 14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
...: 0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
...: """
I can load it with numpy genfromtxt.
In [49]: data=np.genfromtxt(txt.splitlines(),delimiter=',',dtype=None,skip_heade
...: r=1,filling_values=0)
In [50]: data
Out[50]:
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8'), ('f3', '<i4'), ('f4', 'S12'), ('f5', 'S23'), ('f6', '<i4'), ('f7', '<f8')])
In [51]:
I had to skip the header line because it is incomplete (a blank for the 1st field). The result is a structured array - a mix of ints, floats and strings (bytestrings in Py3).
After correcting the header line, and using names=True, I get
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('abv', '<f8'), ('ibu', '<f8'), ('id', '<i4'), ('name', 'S12'), ('style', 'S23'), ('brewery_id', '<i4'), ('ounces', '<f8')])
genfromtxt is the most powerful csv reader in numpy. See its docs for more parameters. The pandas reader is faster and more flexible - but of course produces a data frame, not an array.
I am using the sklearn's GradientBoostingRegression method. So after fitting it with 2000 estimators, I wanted to add more estimators to it. Since it is taking too long to rerun the entire fitting process, I used the set_params() method. Note that it is a multi-target problem, meaning, I have 3 targets to fit. So I am using the following code to add more estimators.
'''parameters: models (list of length 3 in our case )
train_X, train_y [n_samples x 3], test
n_estimators : previous + 500 (default) [additional estimators]
warm_start : True (default)
'''
def addMoreEstimators(train_X, train_y, test, models, n_estimators = 500, warm_start=True):
params = {'n_estimators':n_estimators, 'warm_start':warm_start}
gbm_pred= pd.DataFrame()
for (i,stars),clf in zip(enumerate(['*','**','***']), models):
clf.set_params(**params)
%time clf.fit(train_X.todense(),train_y[stars])
%time gbm_pred[stars] = clf.predict(test.todense())
gbm_pred = gbm_pred.as_matrix()
gbm_dict ={'model': gbm, 'prediction': gbm_pred}
return gbm_dict
Note: the models parameter is a list of 3 fitted models for the 3 targets.
When I ran it for the first time using 2500 (originally I had 2000 estimators), it ran fine and gave me an output.
When, I am running the same function using 3000 estimators, I am getting an AttributeError (see the traceback of the error below). Here the models contained the 3 fitted models. Below is the traceback of the error: (it's kinda long)
AttributeError Traceback (most recent call last)
<ipython-input-104-9418ada3b36f> in <module>()
7 test = val_X_tfidf[:,shortened_col_index],
8 models = models,
----> 9 n_estimators = 3000)
10
11 reduced_features_gbm_pred_3000_2_lr_1_msp_2 = reduced_features_gbm_model_3000_2_lr_1_msp_2['prediction']
<ipython-input-103-e15a4fb70b50> in addMoreEstimators(train_X, train_y, test, models, n_estimators, warm_start)
15
16 clf.set_params(**params)
---> 17 get_ipython().magic(u'time clf.fit(train_X.todense(),train_y[stars])')
18 print 'starting prediction'
19
//anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
2305 magic_name, _, magic_arg_s = arg_s.partition(' ')
2306 magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2307 return self.run_line_magic(magic_name, magic_arg_s)
2308
2309 #-------------------------------------------------------------------------
//anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
2226 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
2227 with self.builtin_trap:
-> 2228 result = fn(*args,**kwargs)
2229 return result
2230
//anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
//anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
//anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1160 if mode=='eval':
1161 st = clock2()
-> 1162 out = eval(code, glob, local_ns)
1163 end = clock2()
1164 else:
<timed eval> in <module>()
//anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in fit(self, X, y, sample_weight, monitor)
973 self.estimators_.shape[0]))
974 begin_at_stage = self.estimators_.shape[0]
--> 975 y_pred = self._decision_function(X)
976 self._resize_state()
977
//anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _decision_function(self, X)
1080 # not doing input validation.
1081 score = self._init_decision_function(X)
-> 1082 predict_stages(self.estimators_, X, self.learning_rate, score)
1083 return score
1084
sklearn/ensemble/_gradient_boosting.pyx in sklearn.ensemble._gradient_boosting.predict_stages (sklearn/ensemble/_gradient_boosting.c:2502)()
AttributeError: 'int' object has no attribute 'tree_'
Sorry for the long traceback, but I think it wouldn't be possible to provide me with meaningful feedback without it.
Again, why am I getting this feedback ?
Any help would be greatly appreciated.
Thanks
Edit
Below is the code that generates the models that were one of the inputs to the above function.
from sklearn import ensemble
def updated_runGBM(train_X, train_y, test,
n_estimators =100,
max_depth = 1,
min_samples_split=1,
learning_rate=0.01,
loss= 'ls',
warm_start=True):
'''train_X : n_samples x m_features
train_y : n_samples x k_targets (multiple targets allowed)
test : n_samples x m_features
warm_start : True (originally the default is False, but I want to add trees)
'''
params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
'learning_rate': learning_rate, 'loss': loss,'warm_start':warm_start}
gbm1 = ensemble.GradientBoostingRegressor(**params)
gbm2 = ensemble.GradientBoostingRegressor(**params)
gbm3 = ensemble.GradientBoostingRegressor(**params)
gbm = [gbm1,gbm2,gbm3]
gbm_pred= pd.DataFrame()
for (i,stars),clf in zip(enumerate(['*','**','***']), gbm):
%time clf.fit(train_X.todense(),train_y[stars])
%time gbm_pred[stars] = clf.predict(test.todense())
gbm_pred = gbm_pred.as_matrix()
gbm_pred = np.clip(gbm_pred,0,np.inf)
gbm_dict ={'model': gbm, 'prediction': gbm_pred}
return gbm_dict
NOTE In the code above, I have removed some of the print statements to reduce clutter.
These are the two functions I am using, nothing else (apart from the code to split up the data).