pandas how to split twice a given field - python-2.7

I would like to count the top subject in a Column. Some fields have commas or dot I would like to create a new row with them.
import pandas as pd
from pandas import DataFrame, Series
sbj = DataFrame(["Africa, Business", "Oceania",
"Business.Biology.Pharmacology.Therapeutics",
"French Litterature, Philosophy, Arts", "Biology,Business", ""
])
sbj
I would like to split into a new any field that has a '.' or '.'
sbj_top = sbj[0].apply(lambda x: pd.value_counts(x.split(",")) if not pd.isnull(x) else pd.value_counts('---'.split(","))).sum(axis = 0)
sbj_top
I'm getting an error (AttributeError) here while try to re-split('.') it
sbj_top = sbj_top.apply(lambda x: pd.value_counts(x.split(".")) if not pd.isnull(x) else pd.value_counts('---'.split(","))).sum(axis = 0)
sbj_top
My desired output
sbj_top.sort(ascending=False)
plt.title("Distribution of the top 10 subjects")
plt.ylabel("Frequency")
sbj_top.head(10).plot(kind='bar', color="#348ABD")

You can use Counter together with chain from itertools. Note that I first replace periods with commas before parsing.
from collections import Counter
import itertools
from string import whitespace
trimmed_list = [i.replace('.', ',').split(',') for i in sbj[0].tolist() if i != ""]
item_list = [item.strip(whitespace) for item in itertools.chain(*trimmed_list)]
item_count = Counter(item_list)
>>> item_count.most_common()
[('Business', 3),
('Biology', 2),
('Oceania', 1),
('Pharmacology', 1),
('Philosophy', 1),
('Africa', 1),
('French Litterature', 1),
('Therapeutics', 1),
('Arts', 1)]
if you need the output in the form of a DataFrame:
df = pd.DataFrame(item_list, columns=['subject'])
>>> df
subject
0 Africa
1 Business
2 Oceania
3 Business
4 Biology
5 Pharmacology
6 Therapeutics
7 French Litterature
8 Philosophy
9 Arts
10 Biology
11 Business

Related

Using regex on spaCy PhraseMatcher [duplicate]

I am creating a spaCy regular expression matches for matching number and extracting it pandas data frame.
Question: Panda picks up from number but overwrites value instead of appending. How to solve it?
(original code credit: yarongon)
from __future__ import unicode_literals
import spacy
import re
import pandas as pd
from datetime import date
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
doc = nlp("This is a sample number: 11. This is second sample number: 1145.")
NUM_PATTERN = re.compile(r"\d+")
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
Number = doc.char_span(start, end)
print Number
pandas_attributes = [Number,]
df = pd.DataFrame(pandas_attributes,
columns=['Number'])
print df
Output:
11
1145
Number
0 1145
Expected output:
Number
o 11
1 1145
Edit 1:
I am trying multiple pattern match on single text.
from __future__ import unicode_literals
import spacy
import re
import pandas as pd
from datetime import date
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
doc = nlp("This is a sample-number: 11. This is second sample number: 1145.")
NUM_PATTERN = re.compile(r"\d+")
HYPH_PATTERN = re.compile('\w+(?:-)\w+')
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
Number = doc.char_span(start, end)
print Number
for match in re.finditer(HYPH_PATTERN, doc.text):
start, end = match.span()
Hyph_word = doc.char_span(start, end)
print Hyph_word
pandas_attributes = [Number,Hyph_word]
df = pd.DataFrame(pandas_attributes,
columns=['Number','Hyphenword'])
print df
Current output.
Output:
11
1145
sample-number
AssertionError: 2 columns passed, passed data had 3 columns
Expected output:
Number Hyphen_word
11 sample-number
1145
edit 2: output
Number Hyphenword
0 (11) (1145)
1 (sample, -, number) Non
Expected output:
Number Hyphenword
0 11 sample-word
1 1145 Non
You need append values to list in loop:
L = []
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
L.append(doc.char_span(start, end))
and then use DataFrame constructor:
df = pd.DataFrame(L,columns=['Number'])
You can also append tuples with multiple values:
Sample:
L = []
for x in range(3):
Number = x + 1
Val = x + 4
L.append((Number, Val))
print (L)
[(1, 4), (2, 5), (3, 6)]
df = pd.DataFrame(L,columns=['Number', 'Val'])
print (df)
Number Val
0 1 4
1 2 5
2 3 6
I believe you can use double append:
PATTERNS = [NUM_PATTERN, HYPH_PATTERN]
pandas_attributes = []
for pat in PATTERNS:
L = []
for match in re.finditer(pat, doc.text):
start, end = match.span()
L.append(doc.char_span(start, end))
pandas_attributes.append(L)
df = pd.DataFrame(pandas_attributes,
index=['Number','Hyphenword']).T

Regular expression SpaCy

I am creating a spaCy regular expression matches for matching number and extracting it pandas data frame.
Question: Panda picks up from number but overwrites value instead of appending. How to solve it?
(original code credit: yarongon)
from __future__ import unicode_literals
import spacy
import re
import pandas as pd
from datetime import date
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
doc = nlp("This is a sample number: 11. This is second sample number: 1145.")
NUM_PATTERN = re.compile(r"\d+")
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
Number = doc.char_span(start, end)
print Number
pandas_attributes = [Number,]
df = pd.DataFrame(pandas_attributes,
columns=['Number'])
print df
Output:
11
1145
Number
0 1145
Expected output:
Number
o 11
1 1145
Edit 1:
I am trying multiple pattern match on single text.
from __future__ import unicode_literals
import spacy
import re
import pandas as pd
from datetime import date
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
doc = nlp("This is a sample-number: 11. This is second sample number: 1145.")
NUM_PATTERN = re.compile(r"\d+")
HYPH_PATTERN = re.compile('\w+(?:-)\w+')
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
Number = doc.char_span(start, end)
print Number
for match in re.finditer(HYPH_PATTERN, doc.text):
start, end = match.span()
Hyph_word = doc.char_span(start, end)
print Hyph_word
pandas_attributes = [Number,Hyph_word]
df = pd.DataFrame(pandas_attributes,
columns=['Number','Hyphenword'])
print df
Current output.
Output:
11
1145
sample-number
AssertionError: 2 columns passed, passed data had 3 columns
Expected output:
Number Hyphen_word
11 sample-number
1145
edit 2: output
Number Hyphenword
0 (11) (1145)
1 (sample, -, number) Non
Expected output:
Number Hyphenword
0 11 sample-word
1 1145 Non
You need append values to list in loop:
L = []
for match in re.finditer(NUM_PATTERN, doc.text):
start, end = match.span()
L.append(doc.char_span(start, end))
and then use DataFrame constructor:
df = pd.DataFrame(L,columns=['Number'])
You can also append tuples with multiple values:
Sample:
L = []
for x in range(3):
Number = x + 1
Val = x + 4
L.append((Number, Val))
print (L)
[(1, 4), (2, 5), (3, 6)]
df = pd.DataFrame(L,columns=['Number', 'Val'])
print (df)
Number Val
0 1 4
1 2 5
2 3 6
I believe you can use double append:
PATTERNS = [NUM_PATTERN, HYPH_PATTERN]
pandas_attributes = []
for pat in PATTERNS:
L = []
for match in re.finditer(pat, doc.text):
start, end = match.span()
L.append(doc.char_span(start, end))
pandas_attributes.append(L)
df = pd.DataFrame(pandas_attributes,
index=['Number','Hyphenword']).T

Python: create a pandas data frame from a list

I am using the following code to create a data frame from a list:
test_list = ['a','b','c','d']
df_test = pd.DataFrame.from_records(test_list, columns=['my_letters'])
df_test
The above code works fine. Then I tried the same approach for another list:
import pandas as pd
q_list = ['112354401', '116115526', '114909312', '122425491', '131957025', '111373473']
df1 = pd.DataFrame.from_records(q_list, columns=['q_data'])
df1
But it gave me the following errors this time:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-24-99e7b8e32a52> in <module>()
1 import pandas as pd
2 q_list = ['112354401', '116115526', '114909312', '122425491', '131957025', '111373473']
----> 3 df1 = pd.DataFrame.from_records(q_list, columns=['q_data'])
4 df1
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in from_records(cls, data, index, exclude, columns, coerce_float, nrows)
1021 else:
1022 arrays, arr_columns = _to_arrays(data, columns,
-> 1023 coerce_float=coerce_float)
1024
1025 arr_columns = _ensure_index(arr_columns)
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
5550 data = lmap(tuple, data)
5551 return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5552 dtype=dtype)
5553
5554
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
5607 content = list(lib.to_object_array(data).T)
5608 return _convert_object_array(content, columns, dtype=dtype,
-> 5609 coerce_float=coerce_float)
5610
5611
/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
5666 # caller's responsibility to check for this...
5667 raise AssertionError('%d columns passed, passed data had %s '
-> 5668 'columns' % (len(columns), len(content)))
5669
5670 # provide soft conversion of object dtypes
AssertionError: 1 columns passed, passed data had 9 columns
Why would the same approach work for one list but not another? Any idea what might be wrong here? Thanks a lot!
DataFrame.from_records treats string as a character list. so it needs as many columns as length of string.
You could simply use the DataFrame constructor.
In [3]: pd.DataFrame(q_list, columns=['q_data'])
Out[3]:
q_data
0 112354401
1 116115526
2 114909312
3 122425491
4 131957025
5 111373473
In[20]: test_list = [['a','b','c'], ['AA','BB','CC']]
In[21]: pd.DataFrame(test_list, columns=['col_A', 'col_B', 'col_C'])
Out[21]:
col_A col_B col_C
0 a b c
1 AA BB CC
In[22]: pd.DataFrame(test_list, index=['col_low', 'col_up']).T
Out[22]:
col_low col_up
0 a AA
1 b BB
2 c CC
If you want to create a DataFrame from multiple lists you can simply zip the lists. This returns a 'zip' object. So you convert back to a list.
mydf = pd.DataFrame(list(zip(lstA, lstB)), columns = ['My List A', 'My List B'])
just using concat method
test_list = ['a','b','c','d']
pd.concat(test_list )
You could also take the help of numpy.
import numpy as np
df1 = pd.DataFrame(np.array(q_list),columns=['q_data'])

Python Side-by-side box plots on same figure

I am trying to generate a box plot in Python 2.7 for each categorical value in column E from the Pandas dataframe below
A B C D E
0 0.647366 0.317832 0.875353 0.993592 1
1 0.504790 0.041806 0.113889 0.445370 2
2 0.769335 0.120647 0.749565 0.935732 3
3 0.215003 0.497402 0.795033 0.246890 1
4 0.841577 0.211128 0.248779 0.250432 1
5 0.045797 0.710889 0.257784 0.207661 4
6 0.229536 0.094308 0.464018 0.402725 3
7 0.067887 0.591637 0.949509 0.858394 2
8 0.827660 0.348025 0.507488 0.343006 3
9 0.559795 0.820231 0.461300 0.921024 1
I would be willing to do this with Matplotlib or any other plotting library. So far the above code can plot all the categories combined on one plot. Here is the code to generate the above data and produce the plot:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# Data
df = pd.DataFrame(np.random.rand(10,4),columns=list('ABCD'))
df['E'] = [1,2,3,1,1,4,3,2,3,1]
# Boxplot
bp = ax.boxplot(df.iloc[:,:-1].values, widths=0.2)
plt.show()
In this example, the categories are 1,2,3,4. I would like to plot separate boxplots side-by-side on the same figure, for only categories 1 and 2 and show the category names in the legend.
Is there a way to do this?
Additional Information:
The output should look similar to the 3rd figure from here - replace "Yes","No" by "1","2".
Starting with this:
import numpy
import pandas
from matplotlib import pyplot
import seaborn
seaborn.set(style="ticks")
# Data
df = pandas.DataFrame(numpy.random.rand(10,4), columns=list('ABCD'))
df['E'] = [1, 2, 3, 1, 1, 4, 3, 2, 3, 1]
You've got a couple of options. If separate axes are ok,
fig, axes = pyplot.subplots(ncols=4, figsize=(12, 5), sharey=True)
df.query("E in [1, 2]").boxplot(by='E', return_type='axes', ax=axes)
If you want 1 axes, I think seaborn will be easier. You just need to clean up your data.
ax = (
df.set_index('E', append=True) # set E as part of the index
.stack() # pull A - D into rows
.to_frame() # convert to a dataframe
.reset_index() # make the index into reg. columns
.rename(columns={'level_2': 'quantity', 0: 'value'}) # rename columns
.drop('level_0', axis='columns') # drop junk columns
.pipe((seaborn.boxplot, 'data'), x='E', y='value', hue='quantity', order=[1, 2])
)
seaborn.despine(trim=True)
The cool thing about seaborn is that tweaking the parameters slightly can achieve a lot in terms of the plot's layout. If we switch our hue and x variables, we get:
ax = (
df.set_index('E', append=True) # set E as part of the index
.stack() # pull A - D into rows
.to_frame() # convert to a dataframe
.reset_index() # make the index into reg. columns
.rename(columns={'level_2': 'quantity', 0: 'value'}) # rename columns
.drop('level_0', axis='columns') # drop junk columns
.pipe((seaborn.boxplot, 'data'), x='quantity', y='value', hue='E', hue_order=[1, 2])
)
seaborn.despine(trim=True)
If you're curious, the resulting dataframe looks something like this:
E quantity value
0 1 A 0.935433
1 1 B 0.862290
2 1 C 0.197243
3 1 D 0.977969
4 2 A 0.675037
5 2 B 0.494440
6 2 C 0.492762
7 2 D 0.531296
8 3 A 0.119273
9 3 B 0.303639
10 3 C 0.911700
11 3 D 0.807861
An addition to #Paul_H answer.
Side-by-side boxplots on the single matplotlib.axes.Axes, no seaborn:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.rand(10,4), columns=list('ABCD'))
df['E'] = [1, 2, 1, 1, 1, 2, 1, 2, 2, 1]
mask_e = df['E'] == 1
# prepare data
data_to_plot = [df[mask_e]['A'], df[~mask_e]['A'],
df[mask_e]['B'], df[~mask_e]['B'],
df[mask_e]['C'], df[~mask_e]['C'],
df[mask_e]['D'], df[~mask_e]['D']]
# Positions defaults to range(1, N+1) where N is the number of boxplot to be drawn.
# we will move them a little, to visually group them
plt.figure(figsize=(10, 6))
box = plt.boxplot(data_to_plot,
positions=[1, 1.6, 2.5, 3.1, 4, 4.6, 5.5, 6.1],
labels=['A1','A0','B1','B0','C1','C0','D1','D0'])

Pandas Series Resampling: How do I get moves based on certain previous changes?

import pandas as pd
import numpy as np
import datetime as dt
# Create Column names
col_names = ['930', '931', '932', '933', '934', '935']
# Create Index datetimes
idx_names = pd.date_range(start = dt.datetime(2011, 1, 1), periods = 10, freq= 'D')
# Create dataframe with previously created column names and index datetimes
df1 = pd.DataFrame(np.random.randn(10, 6), columns=col_names, index=idx_names)
# Change the column names from strings to datetimes.time() object
df1.columns = [dt.datetime.strptime(x, '%H%M').time() for x in df1.columns]
# This step and the next step changes the dataframe into a chronological timeseries
df2 = df1.T.unstack()
df2.index = [dt.datetime.combine(x[0], x[1]) for x in df2.index.tolist()]
# Show the series
df2
Question: What is the most pythonic/pandas-thonic way to create a specific list? This list would say 'Every time the difference between 9:32 and 9:34 is between 0 and .50, what is the difference between 9:34 and the next day's 9:34.
I was doing this with the numbers in a dataframe format (dates along the x-axis and times along the y-axis) and I would say something like (below is pseudo-code, above is not pseudo-code):
# Create a column with wrong answers and right answers
df['Today 934 minus yesterday 934'] = df[934] - df[934].shift(1)
# Boolean mask were condition 1 (diff > 0) and condition 2 (diff < .5) are true
mask = (df[934].shift(1) - df[932].shift(1) > 0) & (df[934].shift(1) - df[932].shift(1) < .5)
# Apply the boolean mask to the dataframe. This is will remove all the answers
# I dont want from the df['Today 934 minus yesterday 934'] column
df2 = df[mask]
# Only the answers I want:
answers = df['Today 934 minus yesterday 934']
My attempt, basically a filled in version of your pseudo-code. Someone else may have a cleaner approach.
mask1 = (df2.index.hour == 9) & (df2.index.minute == 34)
mask2 = (df2.index.hour == 9) & (df2.index.minute == 32)
diff_934 = df2[mask1] - df2[mask1].shift(-1)
diff_934 = diff_934[diff_934.index.minute == 34]
diff_932 = df2[mask1|mask2] - df2[mask1|mask2].shift(-1)
diff_932 = diff_932[diff_932.index.minute == 34]
diff_932 = diff_932[(diff_932 > 0) & (diff_932 < .5)]
answer = diff_934.reindex(diff_932.index)
In [116]: answer
Out[116]:
2011-01-02 09:34:00 -0.874153
2011-01-08 09:34:00 0.186254
dtype: float64