Lottery analysis with combinations

I'm trying to count duos and trios, but it seems my list is constructed a little differently. Is there a solution for this? I have exported the lottery numbers to CSV and, after some modification, imported them back. Here is the code:
import csv
import pandas as pd
from itertools import combinations
from collections import Counter

results = []
with open('numerot.csv', newline='') as inputfile:
    for row in csv.reader(inputfile):
        results.append(row[0])
results = [i.replace(';', ',') for i in results]
print(results)

duos = Counter()
trios = Counter()
for draw in results:
    duos.update(combinations(draw, 2))
    trios.update(combinations(draw, 3))

print('Top 5 duos')
for x in duos.most_common(5):
    print(f'{x[0]}: {x[1]}')
print()
print('Top 5 trios')
for x in trios.most_common(5):
    print(f'{x[0]}: {x[1]}')
and the result:
['1;3;7;17;34;35;39;33', '2;4;6;30;32;33;35;11', '9;12;13;25;26;36;40;29', '2;7;11;22;30;33;35;23', '7;12;13;17;18;24;31;10', '2;5;8;11;15;19;29;38', '7;12;14;17;21;28;35;19', '19;20;23;27;29;32;34;8', '4;17;22;25;27;29;30;1', '1;7;15;19;20;35;38;32', '6;16;17;21;23;30;36;40', '1;6;8;9;15;30;40;17', '9;15;17;18;19;22;30;11', '2;6;13;14;29;30;38;40', '1;13;24;29;31;37;39;32', '8;10;15;19;32;38;40;34', '7;16;19;26;27;30;31;14', '4;5;7;9;19;24;37;40', '13;18;19;27;30;33;40;32', '2;7;12;17;27;38;39;25', '3;6;9;14;16;25;33;2', '3;9;24;25;30;32;34;13', '9;22;28;31;32;34;40;20', '3;4;14;19;20;28;33;7', '17;21;25;27;28;37;40;1', '1;7;16;23;25;32;33;9', '2;18;24;31;33;38;39;23', '15;18;22;25;29;32;34;23', '19;20;22;32;35;36;39;4', '2;8;22;28;32;34;39;14', '9;11;12;13;24;28;34;5', '8;22;27;28;31;34;39;30', '4;8;25;28;29;31;38;21', '4;15;18;23;35;36;40;22', '3;6;12;24;27;28;30;19', '2;7;12;24;26;37;39;29', '1;5;10;12;17;21;22;40', '4;5;10;14;25;26;29;19', '4;5;9;10;15;31;40;1', '1;16;24;27;28;36;40;38', '2;17;19;20;21;30;38;32', '10;12;13;20;24;33;36;8', '3;8;23;28;36;38;39;30', '1;10;13;30;34;35;39;6', '4;7;10;15;16;29;36;23', '7;12;15;22;27;28;31;20', '5;10;19;21;26;32;38;6', '4;13;16;17;18;21;26;20', '4;6;10;15;27;29;40;36', '4;15;26;31;33;34;38;19', '9;13;16;24;25;35;36;10', '1;13;19;28;30;33;35;7', '2;3;7;13;14;22;27;32', '5;6;7;14;27;37;38;40', '10;13;17;19;21;25;30;31']
Top 5 duos
(';', ';'): 1155
(';', '3'): 732
('1', ';'): 603
(';', '2'): 561
('2', ';'): 524
Top 5 trios
(';', ';', ';'): 1925
(';', ';', '3'): 1621
(';', '2', ';'): 1310
('1', ';', ';'): 1276
I have tried changing the CSV file and importing it with pandas, but with no success.
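The root cause: each draw is still a single string, so combinations(draw, 2) iterates over its characters (including the separators), which is why the top pairs are things like (';', ';'). A minimal sketch of a fix, splitting each row into number tokens before counting:
from itertools import combinations
from collections import Counter

duos = Counter()
trios = Counter()
for draw in results:
    # split '1;3;7;...' into ['1', '3', '7', ...]; if the ';' -> ','
    # replace has already run, split on ',' instead
    numbers = sorted(draw.split(';'), key=int)
    duos.update(combinations(numbers, 2))
    trios.update(combinations(numbers, 3))

print('Top 5 duos')
for combo, count in duos.most_common(5):
    print(f'{combo}: {count}')
Sorting each draw numerically first puts every pair and trio into a canonical order, so the same combination is always counted under one key.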


Making the list from 2 previous lists

I need advice on how to make a third list from a couple of lists.
The first one is like this (about 20000 rows):
LIST1
field1 field2 field3 field4 field5 field6 field7
---------------------------------------------------------------------------
1167 28669 001f.ce5d.cb4d Gi0/0/1.10 1 Vi2.156 PTA
848 32350 c83a.350d.f368 Gi0/0/1.10 1 Vi2.601 PTA
1771 43465 c46e.1f7a.4763 Gi0/0/1.10 1 Vi2.959 PTA
1390 24116 dc9f.db01.c6e8 Gi0/0/1.10 1 Vi2.32 PTA
712 23579 d850.e6d5.cb1c Gi0/0/1.10 1 Vi2.436 PTA
1239 28354 2828.5dd4.bc65 Gi0/0/1.10 1 Vi2.78 PTA
204 27816 e03f.491d.9978 Gi0/0/1.10 1 Vi2.341 PTA
383 28368 60e3.278c.7199 Gi0/0/1.10 1 Vi2.114 PTA
671 54657 c46e.1f81.a3d3 Gi0/0/1.10 1 Vi2.224 PTA
The second one is like this (about 20000 rows):
LIST2
field1 field2 field3 field4 field5
---------------------------------------------------------------------
Vi2.1 0001799 PPPoE 00:00:08 10.100.146.30
Vi2.2 0010129 PPPoE 00:00:08 10.100.148.108
Vi2.4 0010173 PPPoE 00:00:08 10.100.150.56
Vi2.5 0011093 PPPoE 00:00:08 10.100.146.143
Vi2.6 0003301 PPPoE 00:43:48 10.100.150.107
Vi2.7 0010101 PPPoE 00:00:08 10.100.147.133
Vi2.8 0001859 PPPoE 00:00:08 10.100.145.223
Vi2.9 0010049 PPPoE 06:45:08 10.100.147.138
Vi2.10 0003515 PPPoE 00:00:28 10.100.146.173
Vi2.11 0001747 PPPoE 00:00:18 10.100.146.37
Vi2.12 0011060 PPPoE 04:40:28 10.100.149.165
Vi2.13 0001335 PPPoE 00:00:08 10.239.152.165
Vi2.14 0010154 PPPoE 00:00:08 10.100.148.68
I need to create a third list, with the fields ordered like this:
field6(list1) field2(list1) field3(list1) field2(list2) field5(list2)
Note that field6 in list1 is the same as field1 in list2.
I understand that I need to take every row from list1, split it into a list of fields, take field6, and look that value up in list2. After that, I gather all the needed fields into a new row. I'm very, very new to parsing; please give me a couple of examples of how to deal with this (I think typical) task!
Clarifying: I'm receiving these rows via Python 3's telnetlib, like this:
import telnetlib
HOST = '2.22.22.22'
password = "user"
user = "user"
tn = telnetlib.Telnet(HOST)
tn.read_until(b"Username: ")
tn.write(user.encode('ascii') + b"\n")
tn.read_until(b"Password: ")
tn.write(password.encode('ascii') + b"\n")
tn.write(b"term len 0 \n")
tn.write(b"show pppoe session | exclude 7878.7878.7878 \n")
tn.write(b"\n exit\n")
mystring = tn.read_all().decode('ascii').replace('\r\n', '\n')
print(mystring)
temp_list = mystring.splitlines()
print(temp_list)
mylist = ["\n".join(s for s in temp_list if 'PTA' in s and 'Vi2' in s)]
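One straightforward way to do the join is to index list2 by its first field and then walk list1 once. A minimal sketch, assuming both command outputs have already been split into data lines with the header/separator rows filtered out (the variable names list1_lines and list2_lines are illustrative):
# list1_lines / list2_lines: the data rows from the two "show" commands
list1_rows = [line.split() for line in list1_lines]   # whitespace-separated fields
list2_rows = [line.split() for line in list2_lines]

# index list2 by field1 (e.g. 'Vi2.156') for O(1) lookups
by_session = {row[0]: row for row in list2_rows}

merged = []
for row in list1_rows:
    session = row[5]                      # field6 of list1 == field1 of list2
    match = by_session.get(session)
    if match:
        # field6(l1), field2(l1), field3(l1), field2(l2), field5(l2)
        merged.append([session, row[1], row[2], match[1], match[4]])

for row in merged:
    print(' '.join(row))
Building the dict first avoids rescanning all ~20000 rows of list2 for every row of list1.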
You might want to try numpy. The loadtxt method loads tabular data from a text file and transforms it into an array (not a native Python list). With this handy method, you can easily achieve the result.
>>> import numpy as np
>>> t1 = "/home/ziya/Projects/python/test/list1.txt"
>>> t2 = "/home/ziya/Projects/python/test/list1.txt"
>>> d1 = np.loadtxt(t1, skiprows=2, dtype='str')
>>> d2 = np.loadtxt(t2, skiprows=2, dtype='str')
>>> d1_field1 = [i[0] for i in d1]
>>> d2_field1 = [i[1] for i in d1]
>>> d3_field1 = [i[2] for i in d1]
>>> d4_field2 = [i[3] for i in d2]
>>> d5_field2 = [i[4] for i in d2]
>>> d6_field2 = [i[5] for i in d2]
>>> new_list = []
>>> new_list.append(d1_field1)
>>> new_list.append(d2_field1)
>>> new_list.append(d3_field1)
>>> new_list.append(d4_field2)
>>> new_list.append(d5_field2)
>>> new_list.append(d6_field2)
>>> new_list
[['1167', '848', '1771', '1390', '712', '1239', '204', '383', '671'], ['28669', '32350', '43465', '24116', '23579', '28354', '27816', '28368', '54657'], ['001f.ce5d.cb4d', 'c83a.350d.f368', 'c46e.1f7a.4763', 'dc9f.db01.c6e8', 'd850.e6d5.cb1c', '2828.5dd4.bc65', 'e03f.491d.9978', '60e3.278c.7199', 'c46e.1f81.a3d3'], ['Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10', 'Gi0/0/1.10'], ['1', '1', '1', '1', '1', '1', '1', '1', '1'], ['Vi2.156', 'Vi2.601', 'Vi2.959', 'Vi2.32', 'Vi2.436', 'Vi2.78', 'Vi2.341', 'Vi2.114', 'Vi2.224']]
>>> np.array(new_list).transpose()
array([['1167', '28669', '001f.ce5d.cb4d', 'Gi0/0/1.10', '1', 'Vi2.156'],
['848', '32350', 'c83a.350d.f368', 'Gi0/0/1.10', '1', 'Vi2.601'],
['1771', '43465', 'c46e.1f7a.4763', 'Gi0/0/1.10', '1', 'Vi2.959'],
['1390', '24116', 'dc9f.db01.c6e8', 'Gi0/0/1.10', '1', 'Vi2.32'],
['712', '23579', 'd850.e6d5.cb1c', 'Gi0/0/1.10', '1', 'Vi2.436'],
['1239', '28354', '2828.5dd4.bc65', 'Gi0/0/1.10', '1', 'Vi2.78'],
['204', '27816', 'e03f.491d.9978', 'Gi0/0/1.10', '1', 'Vi2.341'],
['383', '28368', '60e3.278c.7199', 'Gi0/0/1.10', '1', 'Vi2.114'],
['671', '54657', 'c46e.1f81.a3d3', 'Gi0/0/1.10', '1', 'Vi2.224']],
dtype='|S14')

Comparing metrics of Keras with metrics of sklearn.classification_report

I am struggling with different metrics while evaluating neural networks.
My investigations showed that Keras (version 1.2.2) calculates different values for specific metrics (using its evaluate function) compared to sklearn's classification_report.
Specifically, the values for 'precision' differ (i.e. Keras 'precision' != sklearn 'precision'), as do those for 'recall' (Keras 'recall' != sklearn 'recall').
For the following working example the differences seem random, but when evaluating bigger networks, Keras 'precision' (almost) equals sklearn's 'recall', whereas the two 'recall' metrics differ clearly.
I appreciate your help!
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils # numpy utils for to_categorical()
from keras import backend as K # abstract backend API (in order to generate compatible code for Theano and Tf)
from sklearn.metrics import classification_report
batch_size = 128
nb_classes = 10
nb_epoch = 30
# input image dimensions
img_rows, img_cols = 28, 28
# number of convolutional filters to use
nb_filters = 32
# size of pooling area for max pooling
pool_size = (2, 2)
# convolution kernel size
kernel_size = (3, 3)
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()
if K.image_dim_ordering() == 'th':
    X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
    X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
    X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255 # range [0,1]
X_test /= 255 # range [0,1]
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes) # necessary for use of categorical_crossentropy
Y_test = np_utils.to_categorical(y_test, nb_classes) # necessary for use of categorical_crossentropy
# create model
model = Sequential()
model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                        border_mode='valid',
                        input_shape=input_shape))
model.add(Activation('relu'))
model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1]))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
# configure model
model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy', 'precision', 'recall'])
# train model
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=1, validation_data=(X_test, Y_test))
# evaluate model with keras
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
print('Test precision:', score[2])
print('Test recall:', score[3])
# evaluate model with sklearn
predictions_last_epoch = model.predict(X_test, batch_size=batch_size, verbose=1)
target_names = ['class 0', 'class 1', 'class 2', 'class 3', 'class 4',
                'class 5', 'class 6', 'class 7', 'class 8', 'class 9']
predicted_classes = np.argmax(predictions_last_epoch, axis=1)
print('\n')
print(classification_report(y_test, predicted_classes,
                            target_names=target_names, digits=6))
EDIT
The output of the script given above:
Test score: 0.0271549037314
Test accuracy: 0.9916
Test precision: 0.992290322304
Test recall: 0.9908
9728/10000 [============================>.] - ETA: 0s
precision recall f1-score support
class 0 0.987867 0.996939 0.992382 980
class 1 0.993860 0.998238 0.996044 1135
class 2 0.990329 0.992248 0.991288 1032
class 3 0.991115 0.994059 0.992585 1010
class 4 0.994882 0.989817 0.992343 982
class 5 0.991041 0.992152 0.991597 892
class 6 0.993678 0.984342 0.988988 958
class 7 0.992180 0.987354 0.989761 1028
class 8 0.989754 0.991786 0.990769 974
class 9 0.991054 0.988107 0.989578 1009
avg / total 0.991607 0.991600 0.991597 10000
For another model:
val/test loss: 0.231304548573
val/test categorical_accuracy: **0.978500002956**
val/test precision: *0.995103668976*
val/test recall: 0.941900001907
val/test fbeta_score: 0.967675107574
val/test mean_squared_error: 0.0064611148566
10000/10000 [==============================] - 0s
precision recall f1-score support
class 0 0.989605 0.971429 0.980433 980
class 1 0.985153 0.993833 0.989474 1135
class 2 0.988154 0.969961 0.978973 1032
class 3 0.981373 0.991089 0.986207 1010
class 4 0.968907 0.983707 0.976251 982
class 5 0.997633 0.945067 0.970639 892
class 6 0.995690 0.964509 0.979852 958
class 7 0.987230 0.977626 0.982405 1028
class 8 0.945205 0.991786 0.967936 974
class 9 0.951429 0.990089 0.970374 1009
avg / total *0.978964* **0.978500** 0.978522 10000
Definition of desired metrics (for model.compile):
metrics=['categorical_accuracy', 'precision', 'recall', 'fbeta_score', 'mean_squared_error']
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=metrics)
Output of model.metrics_names:
['loss', 'categorical_accuracy', 'precision', 'recall', 'fbeta_score', 'mean_squared_error']
Yes, they differ because sklearn's classification report gives you the weighted average based on the support.
Experiment with:
from sklearn.metrics import classification_report
y_true = [0, 1,2,1]
y_pred = [0, 0,2,0]
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names))
Gives you:
precision recall f1-score support
class 0 0.33 1.00 0.50 1
class 1 0.00 0.00 0.00 2
class 2 1.00 1.00 1.00 1
avg / total 0.33 0.50 0.38 **4**
However, the plain per-class (macro) average would be (1 + 0 + 0.33)/3 ≈ 0.443, whereas, as the support column suggests, sklearn returns the support-weighted average (1*1 + 0*2 + 0.33*1)/4 = 0.3325.
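You can check both averaging schemes directly with sklearn's precision_score and its average parameter:
from sklearn.metrics import precision_score

y_true = [0, 1, 2, 1]
y_pred = [0, 0, 2, 0]

# unweighted mean of the per-class precisions: (0.33 + 0 + 1)/3
print(precision_score(y_true, y_pred, average='macro'))     # ~0.444
# support-weighted mean, matching the report's avg/total row
print(precision_score(y_true, y_pred, average='weighted'))  # ~0.333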

Reading Data from CSV and fill Empty Values Python

I am reading in a CSV file with the general schema of
,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
I am running into problems where fields don't exist, such as in row 0, which lacks an IBU value. I would like to be able to insert a value such as 0.0 for fields that require floats, and an empty string for ones that require strings.
My code is along the lines of
import csv
import numpy as np

def dataset(path, filter_field, filter_value):
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        if filter_field:
            for row in filter(lambda row: row[filter_field] == filter_value, reader):
                yield row

def main(path):
    data = [(row["ibu"], float(row["ibu"])) for row in dataset(path, "style", "American Pale Lager")]
As of right now, my code would throw an error, since there are empty values in the "ibu" column for row 0.
How should one go about solving this problem?
You can do the following: add a default dictionary argument that you can use for missing values, and update it upon certain conditions, such as when ibu is empty. Below is your implementation, changed to correct for what you need. (If I were you, though, I would use pandas.)
import csv, copy

def dataset(path, filter_field, filter_value,
            default={'brewery_id': -1, 'style': 'unknown style', ' ': -1,
                     'name': 'unknown name', 'abv': 0.0, 'id': -1,
                     'ounces': -1, 'ibu': 0.0}):
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row is None:
                break
            if row[filter_field].strip() != filter_value:
                continue
            default_row = copy.copy(default)
            default_row.update(row)
            # you might want to add conditions
            if default_row["ibu"] == "":
                default_row["ibu"] = default["ibu"]
            yield default_row
data = [(row["ibu"], float(row["ibu"])) for row in dataset('test.csv', "style", "American Pale Lager")]
print data
>> [(0.0, 0.0)]
Why don't you use
import pandas as pd
df = pd.read_csv(data_file)
The following is the result:
In [13]: df
Out[13]:
Unnamed: 0 abv ibu id name style \
0 14 0.061 60.0 1979 Bitter Bitch American Pale Ale (APA)
1 0 0.050 NaN 1436 Pub Beer American Pale Lager
brewery_id ounces
0 177 12.0
1 408 12.0
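From there, the missing values can be filled in directly; a small sketch (data_file stands for the CSV path, and the fill values mirror the ones asked for in the question):
import pandas as pd

df = pd.read_csv(data_file)
df['ibu'] = df['ibu'].fillna(0.0)   # float default for the missing IBU
df = df.fillna('')                  # empty string for any remaining gaps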
Simulating your file with a text string:
In [48]: txt=b""" ,abv,ibu,id,name,style,brewery_id,ounces
...: 14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
...: 0 , 0.05,, 1436, Pub Beer, American Pale Lager, 408, 12.0
...: """
I can load it with numpy genfromtxt.
In [49]: data = np.genfromtxt(txt.splitlines(), delimiter=',', dtype=None,
    ...:                      skip_header=1, filling_values=0)
In [50]: data
Out[50]:
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8'), ('f3', '<i4'), ('f4', 'S12'), ('f5', 'S23'), ('f6', '<i4'), ('f7', '<f8')])
I had to skip the header line because it is incomplete (a blank for the 1st field). The result is a structured array - a mix of ints, floats and strings (bytestrings in Py3).
After correcting the header line, and using names=True, I get
array([ (14, 0.061, 60., 1979, b'Bitter Bitch', b'American Pale Ale (APA)', 177, 12.),
( 0, 0.05 , 0., 1436, b' Pub Beer', b' American Pale Lager', 408, 12.)],
dtype=[('f0', '<i4'), ('abv', '<f8'), ('ibu', '<f8'), ('id', '<i4'), ('name', 'S12'), ('style', 'S23'), ('brewery_id', '<i4'), ('ounces', '<f8')])
genfromtxt is the most powerful csv reader in numpy. See its docs for more parameters. The pandas reader is faster and more flexible - but of course it produces a data frame, not an array.
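For completeness, a sketch of the names=True variant described above (here the blank first header field is given an explicit name, which is one way to "correct" the header line):
txt2 = txt.replace(b' ,abv', b'f0,abv')   # name the first column
data = np.genfromtxt(txt2.splitlines(), delimiter=',', dtype=None,
                     names=True, filling_values=0)
print(data['ibu'])   # fields are now addressable by name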

Bokeh - Grouped Bar chart

Maintainer note: This question as-is is obsolete, since the bokeh.charts API was deprecated and removed years ago. But see the answer below for how to create grouped bar charts with the stable bokeh.plotting API in newer versions of Bokeh
I want to create a simple bar chart (like the one on the official example page).
I tried executing the code in this old answer, Plotting Bar Charts with Bokeh,
but it shows the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-ba53ce344126> in <module>()
11
12 bar = Bar(xyvalues, cat, title="Stacked bars",
---> 13 xlabel="category", ylabel="language")
14
15 output_file("stacked_bar.html")
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builders/bar_builder.pyc in Bar(data, label, values, color, stack, group, agg, xscale, yscale, xgrid, ygrid, continuous_range, **kw)
318 kw['y_range'] = y_range
319
--> 320 chart = create_and_build(BarBuilder, data, **kw)
321
322 # hide x labels if there is a single value, implying stacking only
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in create_and_build(builder_class, *data, **kws)
60 # create the new builder
61 builder_kws = {k: v for k, v in kws.items() if k in builder_props}
---> 62 builder = builder_class(*data, **builder_kws)
63
64 # create a chart to return, since there isn't one already
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in __init__(self, *args, **kws)
280
281 # handle input attrs and ensure attrs have access to data
--> 282 attributes = self._setup_attrs(data, kws)
283
284 # remove inputs handled by dimensions and chart attributes
/usr/local/lib/python2.7/dist-packages/bokeh/charts/builder.pyc in _setup_attrs(self, data, kws)
331 attributes[attr_name].iterable = custom_palette
332
--> 333 attributes[attr_name].setup(data=source, columns=attr)
334
335 else:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in setup(self, data, columns)
193
194 if columns is not None and self.data is not None:
--> 195 self.set_columns(columns)
196
197 if self.columns is not None and self.data is not None:
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in set_columns(self, columns)
185 # assume this is now the iterable at this point
186 self.iterable = columns
--> 187 self._setup_default()
188
189 def setup(self, data=None, columns=None):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_default(self)
142 def _setup_default(self):
143 """Stores the first value of iterable into `default` property."""
--> 144 self.default = next(self._setup_iterable())
145
146 def _setup_iterable(self):
/usr/local/lib/python2.7/dist-packages/bokeh/charts/attributes.pyc in _setup_iterable(self)
320
321 def _setup_iterable(self):
--> 322 return iter(self.items)
323
324 def get_levels(self, columns):
TypeError: 'NoneType' object is not iterable
The official example did work. URL: http://docs.bokeh.org/en/0.11.0/docs/user_guide/charts.html#userguide-charts-data-types
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
p = Bar(df, label='yr', values='mpg', agg='median', group='origin',
        title="Median MPG by YR, grouped by ORIGIN", legend='top_right')
output_file("bar.html")
show(p)
BUT, I don't want to use pandas; I want to use a simple Python dictionary, like this:
my_simple_dict = {
    'Group 1': [22, 33, 44, 55],
    'Group 2': [44, 66, 0, 24],
    'Group 3': [2, 99, 33, 51]
}
How can I achieve a bar chart that shows the three groups (Group 1, Group 2, Group 3) with the x-axis going from 1 to 4?
NOTE: I am working with Python 2.7.
The question and other answers are obsolete, as bokeh.charts was deprecated and removed several years ago. However, support for grouped and stacked bar charts using the stable bokeh.plotting API has improved greatly since then:
https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html
Here is a full example:
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
years = ['2015', '2016', '2017']
data = {'fruits' : fruits,
        '2015'   : [2, 1, 4, 3, 2, 4],
        '2016'   : [5, 3, 3, 2, 4, 6],
        '2017'   : [3, 2, 4, 4, 5, 3]}
# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015), ... ]
x = [ (fruit, year) for fruit in fruits for year in years ]
counts = sum(zip(data['2015'], data['2016'], data['2017']), ()) # like an hstack
source = ColumnDataSource(data=dict(x=x, counts=counts))
p = figure(x_range=FactorRange(*x), plot_height=250, title="Fruit Counts by Year",
           toolbar_location=None, tools="")
p.vbar(x='x', top='counts', width=0.9, source=source)
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
show(p)
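Note the pattern: each x value is a (fruit, year) tuple, and passing those tuples to FactorRange produces a nested categorical axis, which is what makes the bars render in visual groups.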
For now, the solution I found is changing the dict structure:
from bokeh.charts import Bar, output_file, show, hplot
import pandas as pd
my_simple_dict = {
    'Group 1': [22, 33, 44, 55],
    'Group 2': [44, 66, 0, 24],
    'Group 3': [2, 99, 33, 51]
}
my_data_transformed_dict = {}
my_data_transformed_dict['x-axis'] = []
my_data_transformed_dict['value'] = []
my_data_transformed_dict['group-name'] = []
for group, group_list in my_simple_dict.iteritems():
    x_axis = 0
    for item in group_list:
        x_axis += 1
        my_data_transformed_dict['x-axis'].append(x_axis)
        my_data_transformed_dict['value'].append(item)
        my_data_transformed_dict['group-name'].append(group)
my_bar = Bar(my_data_transformed_dict, values='value', label='x-axis',
             group='group-name', legend='top_right')
output_file("grouped_bar.html")
show(my_bar)
If someone knows a better way, please tell me.
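One better way, for reference: the same dictionary can be fed to the modern bokeh.plotting approach from the answer above. A sketch (written against Python 3 and a recent Bokeh, so treat it as an untested adaptation rather than a drop-in for 2.7):
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

my_simple_dict = {
    'Group 1': [22, 33, 44, 55],
    'Group 2': [44, 66, 0, 24],
    'Group 3': [2, 99, 33, 51]
}
groups = sorted(my_simple_dict)

# nested factors: ('1', 'Group 1'), ('1', 'Group 2'), ... so x runs 1..4
x = [(str(i + 1), g) for i in range(4) for g in groups]
counts = [my_simple_dict[g][i] for i in range(4) for g in groups]

source = ColumnDataSource(data=dict(x=x, counts=counts))
p = figure(x_range=FactorRange(*x), title="Groups 1-3 by position")
p.vbar(x='x', top='counts', width=0.9, source=source)
show(p)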

Assigning different values in Python Pandas based on criteria

I am trying to clean up the data. For the first-name variable, I would like to 1) assign a missing value (NaN) to entries that have one character only, 2) assign a missing value if an entry contains only two characters AND one of them is a symbol (i.e. "." or "?"), and 3) convert "wm" to the string "william".
I tried the following and other code, but none of it seems to work:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
def CleanUp():
    data = pd.read_csv("C:\sample.csv")
    frame2 = DataFrame(data)
    frame2.columns = ["First Name", "Ethnicity"]

    # Convert weird values to missing value
    for Name in frame2["First_Name"]:
        if len(Name) == 1:
            Name == np.nan
        if (len(Name) == 2) and (Name.str.contain(".|?|:", na=False)):
            Name == np.nan
        if Name == "wm":
            Name == "william"
    print frame2["First_Name"]
You're looking for df.replace.
Make up some data:
np.random.seed(3)
n = 6
df = pd.DataFrame({'Name': np.random.choice(['wm', 'bob', 'harry', 'chickens'], size=n),
                   'timeStamp': np.random.randint(1000, size=n)})
print df
Name timeStamp
0 harry 256
1 wm 789
2 bob 659
3 chickens 714
4 wm 875
5 wm 681
run the replace:
df.Name = df.Name.replace('wm','william')
print df
Name timeStamp
0 harry 256
1 william 789
2 bob 659
3 chickens 714
4 william 875
5 william 681
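That covers case 3. For cases 1 and 2 (one-character names, and two-character names containing a symbol), a sketch using boolean masks instead of a row loop (the exact symbol set in the regex is an assumption based on the question):
import numpy as np

# case 1: single-character names -> NaN
df.loc[df.Name.str.len() == 1, 'Name'] = np.nan

# case 2: exactly two characters, one of them '.', '?' or ':' -> NaN
mask = (df.Name.str.len() == 2) & df.Name.str.contains(r'[.?:]', na=False)
df.loc[mask, 'Name'] = np.nan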