Reference a table column by its column header in Python - python-2.7

Is there a Pythonic way to refer to columns of 2D lists by name?
I import a lot of tables from the web, so I made a general-purpose function that creates 2-dimensional lists out of various HTML tables. So far so good. But the next step is often to parse the table row by row.
# Sample table.
# In real life I would do something like: table = HTML_table('url', 'table id')
table = [
    ['Column A', 'Column B', 'Column C'],
    ['One', 'Two', 3],
    ['Four', 'Five', 6],
]
# Current code:
iA = table[0].index('Column A')
iC = table[0].index('Column C')
for row in table[1:]:
    process_row(row[iA], row[iC])
# Desired code:
for row in table[1:]:
    process_row(row['Column A'], row['Column C'])
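One way to get close to the desired code without any extra libraries is to zip each data row against the header row (a minimal sketch; process_row is the hypothetical helper from the question):
# Sketch: build a dict per row, keyed by the header row.
header = table[0]
for raw_row in table[1:]:
    row = dict(zip(header, raw_row))
    process_row(row['Column A'], row['Column C'])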

I think you'll really like the pandas module! http://pandas.pydata.org/
Put your list into a DataFrame
This could also be done directly from html, csv, etc.
import pandas as pd

df = pd.DataFrame(table[1:], columns=table[0]).astype(str)
Access columns
df['Column A']
Access first row by index
df.iloc[0]
Process row by row
df.apply(lambda x: '_'.join(x), axis=1)

for index, row in df.iterrows():
    process_row(row['Column A'], row['Column C'])
Process a column
df['Column C'].astype(int).sum()
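Since the tables come from the web in the first place, pandas can also build the DataFrame straight from the page. A rough sketch (the URL and the attrs filter are placeholders, not from the original answer):
import pandas as pd

# read_html returns a list of DataFrames, one per matching <table> element.
tables = pd.read_html('http://example.com/page', attrs={'id': 'some-table-id'}, header=0)
df = tables[0]
print(df['Column A'])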

Wouldn't an OrderedDict with column names as keys and lists of row values as values be a better approach for your problem? I would go with something like:
table = {
    'Column A': [1, 4],
    'Column B': [2, 5],
    'Column C': [3, 6]
}

# And you would parse column by column...
for col, rows in table.iteritems():
    # do something
    pass
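If the data arrives in the row-oriented form from the question, a small pivot produces this column-oriented shape (a sketch; raw_table stands for the question's original header-plus-rows list):
from collections import OrderedDict

header, data_rows = raw_table[0], raw_table[1:]
table = OrderedDict((name, [row[i] for row in data_rows])
                    for i, name in enumerate(header))
# table['Column C'] -> [3, 6]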

My QueryList is simple to use.
ql.filter(portfolio='123')
ql.group_by(['portfolio', 'ticker'])
from collections import defaultdict, OrderedDict


class QueryList(list):
    """filter and/or group_by a list of objects."""

    def group_by(self, attrs) -> dict:
        """Like a database group_by function.

        args:
            attrs: str or list.

        Returns:
            {value_of_the_group: list_of_matching_objects, ...}
            When attrs is a list, each key is a tuple.
            Ex:
                {'AMZN': QueryList(),
                 'MSFT': QueryList(),
                 ...
                }
                -- or --
                {('Momentum', 'FB'): QueryList(),
                 ...,
                }
        """
        result = defaultdict(QueryList)
        if isinstance(attrs, str):
            for item in self:
                result[getattr(item, attrs)].append(item)
        else:
            for item in self:
                result[tuple(getattr(item, x) for x in attrs)].append(item)
        return result

    def filter(self, **kwargs):
        """Returns the subset of the QueryList that has matching attributes.

        args:
            kwargs: Attribute name/value pairs.

        Example:
            foo.filter(portfolio='123', account='ABC').
        """
        ordered_kwargs = OrderedDict(kwargs)
        match = tuple(ordered_kwargs.values())

        def is_match(item):
            return tuple(getattr(item, y) for y in ordered_kwargs.keys()) == match

        result = QueryList([x for x in self if is_match(x)])
        return result

    def scalar(self, default=None, attr=None):
        """Returns the first item in this QueryList.

        args:
            default: The value to return if there is less than one item,
                or if the attr is not found.
            attr: Returns getattr(item, attr) if not None.
        """
        item, = self[0:1] or [default]
        if attr is None:
            result = item
        else:
            result = getattr(item, attr, default)
        return result
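For context, a rough usage sketch (the Position record type and the sample values are made up for illustration; they are not part of the original answer):
from collections import namedtuple

# Hypothetical record type, just to have objects with attributes to query.
Position = namedtuple('Position', ['portfolio', 'ticker', 'qty'])

ql = QueryList([
    Position('123', 'MSFT', 100),
    Position('123', 'AMZN', 50),
    Position('456', 'MSFT', 75),
])

print(ql.filter(portfolio='123', ticker='MSFT'))  # the single matching Position
print(ql.group_by('ticker').keys())               # keys grouped by ticker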
I tried pandas. I wanted to like it, I really did. But ultimately it is too complicated for my needs.
For example:
df[(df['portfolio'] == '123') & (df['ticker'] == 'MSFT')]
is not as simple as
ql.filter(portfolio='123', ticker='MSFT')
Furthermore, creating a QueryList is simpler than creating a df.
That's because you tend to use custom classes with a QueryList, so the data-conversion code naturally lives in the custom class, separate from the rest of the logic. Data conversion for a DataFrame, by contrast, would normally be done inline with the rest of the code.

Related

Django postgres Json field with numeric keys

I have a model with a postgres JSON field:
class MyModel(models.Model):
    data = JSONField(null=True)
then, I do:
m1 = MyModel.objects.create(data={'10':'2017-12-1'})
m2 = MyModel.objects.create(data={'10':'2018-5-1'})
I want to query all the MyModel objects whose key '10' starts with '2017', so I want to write:
MyModel.objects.filter(data__10__startswith='2017')
The problem is that the 10 is interpreted as an integer, so in the generated query it is treated as a list index rather than a key.
Is there any way to solve this (other than writing raw queries)?
This is the generated query:
SELECT "systools_mymodel"."id", "systools_mymodel"."data" FROM "systools_mymodel" WHERE ("systools_mymodel"."data" ->> 10)::text LIKE '2017%' LIMIT 21;
And I want the 10 to be quoted (which would give me the right answer).
Thanks!
A very hackish solution (use at your own risk; tested under Django 2.0.5, voids warranty...):
# patch_jsonb.py
from django.contrib.postgres.fields.jsonb import KeyTransform


def as_sql(self, compiler, connection):
    key_transforms = [self.key_name]
    previous = self.lhs
    while isinstance(previous, KeyTransform):
        key_transforms.insert(0, previous.key_name)
        previous = previous.lhs
    lhs, params = compiler.compile(previous)
    if len(key_transforms) > 1:
        return "(%s %s %%s)" % (lhs, self.nested_operator), [key_transforms] + params
    try:
        int(self.key_name)
    except ValueError:
        if self.key_name.startswith("K") and self.key_name[1:].isnumeric():
            lookup = "'%s'" % self.key_name[1:]
        else:
            lookup = "'%s'" % self.key_name
    else:
        lookup = "%s" % self.key_name
    return "(%s %s %s)" % (lhs, self.operator, lookup), params


def patch():
    KeyTransform.as_sql = as_sql
Usage:
Add this to the bottom of your settings.py:
import patch_jsonb
patch_jsonb.patch()
Instead of __123__ lookups use __K123__ lookups - the uppercase K will be stripped by this patch:
MyModel.objects.filter(data__K10__startswith='2017')
And consider avoiding using numbers as jsonb object keys...
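An alternative that avoids monkeypatching (a sketch of mine, not part of the original answer) is to expose the key through an annotation, so the key name is passed as a bound string parameter instead of being parsed as an integer index:
from django.db.models import TextField
from django.db.models.expressions import RawSQL

# %s is bound to the string '10', so postgres evaluates data ->> '10'
# (object key lookup) rather than data ->> 10 (array index lookup).
qs = MyModel.objects.annotate(
    key_10=RawSQL("data ->> %s", ('10',), output_field=TextField())
).filter(key_10__startswith='2017')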

Using python groupby or defaultdict effectively?

I have a csv with name, role, and years of experience. I want to create a list of tuples that aggregates (name, role1, total_exp_inthisRole) for all the employees.
So far I am able to use defaultdict to do the below:
import csv, urllib2
from collections import defaultdict

response = urllib2.urlopen(url)
cr = csv.reader(response)
parsed = ((row[0], row[1], int(row[2])) for row in cr)

employees = []
for item in parsed:
    employees.append(tuple(item))

employeeExp = defaultdict(int)
for x, y, z in employees:  # variable unpacking
    employeeExp[x] += z

employeeExp.items()
output: [('Ken', 15), ('Buckky', 5), ('Tina', 10)]
But how do I also use the second column to achieve the result I want? Should I try to solve this by grouping on multiple keys, or is a simpler way possible? Thanks all in advance.
You can simply pass a tuple of name and role to your defaultdict, instead of only one item:
for x, y, z in employees:
    employeeExp[(x, y)] += z
For your second expected output ([('Ken', ('engineer', 5), ('sr. engineer', 6)), ...])
you need to aggregate the result of the aforementioned snippet one more time, but this time using a defaultdict with a list:
d = defaultdict(list)
for (name, rol), total_exp_inthisRole in employeeExp.items():
    d[name].append((rol, total_exp_inthisRole))
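To finish reshaping d into the flat list of tuples from the question, something along these lines should work (a self-contained sketch with made-up sample rows):
from collections import defaultdict

# Made-up (name, role, years) rows standing in for the parsed csv.
employees = [('Ken', 'engineer', 5), ('Ken', 'sr. engineer', 6), ('Tina', 'manager', 10)]

employeeExp = defaultdict(int)
for name, role, years in employees:
    employeeExp[(name, role)] += years

d = defaultdict(list)
for (name, role), total in employeeExp.items():
    d[name].append((role, total))

result = [(name,) + tuple(roles) for name, roles in d.items()]
# e.g. [('Ken', ('engineer', 5), ('sr. engineer', 6)), ('Tina', ('manager', 10))]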

Dictvectorizer for list as one feature in Python Pandas and Scikit-learn

I have been trying to solve this for days, and although I have found a similar problem here How can i vectorize list using sklearn DictVectorizer, the solution is overly simplified.
I would like to fit some features into a logistic regression model to predict 'chinese' or 'non-chinese'. I have a raw_name from which I extract two features: 1) just the last name, and 2) a list of two-letter substrings of the last name; for example, 'Chan' will give ['ch', 'ha', 'an']. But it seems DictVectorizer doesn't take a list type as part of the dictionary. From the link above, I tried to create a function list_to_dict, which successfully returns some dict elements,
{'substring=co': True, 'substring=or': True, 'substring=rn': True, 'substring=ns': True}
but I have no idea how to incorporate that into the my_dict = ... before applying the DictVectorizer.
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import nltk
import re
import random
from random import randint
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

lr = LogisticRegression()
dv = DictVectorizer()

# Get csv file into data frame
data = pd.read_csv("V2-1_2000Records_Processed_SEP2015.csv", header=0, encoding="utf-8")
df = DataFrame(data)

# Pandas data frame shuffling
df_shuffled = df.iloc[np.random.permutation(len(df))]
df_shuffled.reset_index(drop=True)

# Assign X and y variables
X = df.raw_name.values
y = df.chineseScan.values

# Feature extraction functions
def feature_full_last_name(nameString):
    try:
        last_name = nameString.rsplit(None, 1)[-1]
        if len(last_name) > 1:  # not accept name with only 1 character
            return last_name
        else:
            return None
    except:
        return None

def feature_twoLetters(nameString):
    placeHolder = []
    try:
        for i in range(0, len(nameString)):
            x = nameString[i:i+2]
            if len(x) == 2:
                placeHolder.append(x)
        return placeHolder
    except:
        return []

def list_to_dict(substring_list):
    try:
        substring_dict = {}
        for i in substring_list:
            substring_dict['substring=' + str(i)] = True
        return substring_dict
    except:
        return None

list_example = ['co', 'or', 'rn', 'ns']
print list_to_dict(list_example)

# Transform format of X variables, and spit out a numpy array for all features
my_dict = [{'two-letter-substrings': feature_twoLetters(feature_full_last_name(i)),
            'last-name': feature_full_last_name(i), 'dummy': 1} for i in X]
print my_dict[3]
Output:
{'substring=co': True, 'substring=or': True, 'substring=rn': True, 'substring=ns': True}
{'dummy': 1, 'two-letter-substrings': [u'co', u'or', u'rn', u'ns'], 'last-name': u'corns'}
Sample data:
Raw_name         chineseScan
Jack Anderson    non-chinese
Po Lee           chinese
If I have understood correctly you want a way to encode list values in order to have a feature dictionary that DictVectorizer could use. (One year too late but) something like this can be used depending on the case:
my_dict_list = []
for i in X:
    # create a new feature dictionary
    feat_dict = {}
    # add the features that are straightforward
    feat_dict['last-name'] = feature_full_last_name(i)
    feat_dict['dummy'] = 1
    # for the features that have a list of values, iterate over the values and
    # create a custom feature for each value
    for two_letters in feature_twoLetters(feature_full_last_name(i)):
        # make sure the naming is unique enough so that no other feature
        # unrelated to this will have the same name/key
        feat_dict['two-letter-substrings-' + two_letters] = True
    # save it to the feature dictionary list that will be used in DictVectorizer
    my_dict_list.append(feat_dict)

print my_dict_list

from sklearn.feature_extraction import DictVectorizer
dict_vect = DictVectorizer(sparse=False)
transformed_x = dict_vect.fit_transform(my_dict_list)
print transformed_x
Output:
[{'dummy': 1, u'two-letter-substrings-er': True, 'last-name': u'Anderson', u'two-letter-substrings-on': True, u'two-letter-substrings-de': True, u'two-letter-substrings-An': True, u'two-letter-substrings-rs': True, u'two-letter-substrings-nd': True, u'two-letter-substrings-so': True}, {'dummy': 1, u'two-letter-substrings-ee': True, u'two-letter-substrings-Le': True, 'last-name': u'Lee'}]
[[ 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1.]
[ 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.]]
Another thing you could do (but I don't recommend) if you don't want to create as many features as the values in your lists is something like this:
# sorting the values would be a good idea
feat_dict[frozenset(feature_twoLetters(feature_full_last_name(i)))] = True
# or
feat_dict[" ".join(feature_twoLetters(feature_full_last_name(i)))] = True
but the first one means you can't have any duplicate values, and probably neither makes a good feature, especially if you need fine-tuned and detailed ones. Also, they reduce the chance of two rows sharing the same combination of two-letter substrings, so the classification probably won't do well.
Output:
[{'dummy': 1, 'last-name': u'Anderson', frozenset([u'on', u'rs', u'de', u'nd', u'An', u'so', u'er']): True}, {'dummy': 1, 'last-name': u'Lee', frozenset([u'ee', u'Le']): True}]
[{'dummy': 1, 'last-name': u'Anderson', u'An nd de er rs so on': True}, {'dummy': 1, u'Le ee': True, 'last-name': u'Lee'}]
[[ 1. 0. 1. 1. 0.]
[ 0. 1. 1. 0. 1.]]

Removing features with low variance using scikit-learn

scikit-learn provides various methods to remove descriptors; a basic method for this purpose is provided by the tutorial below:
http://scikit-learn.org/stable/modules/feature_selection.html
but the tutorial does not provide any way to keep track of which features were removed or kept.
The code below has been taken from the tutorial.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
The example code above depicts only two descriptors (shape (6, 2)), but in my case I have a huge data frame with a shape of (51 rows, 9000 columns). After finding a suitable model, I want to keep track of the useful and useless features, because I can save computational time when computing features for the test data set by calculating only the useful ones.
For example, when you perform machine learning modeling with WEKA 6.0, it provides remarkable flexibility for feature selection, and after removing the useless features you can get a list of the discarded features along with the useful ones.
Thanks
Then, what you can do, if I'm not wrong, is:
In the case of VarianceThreshold, you can call the method fit instead of fit_transform. This will fit the data, and the resulting variances will be stored in vt.variances_ (assuming vt is your object).
Having a threshold, you can extract the features of the transformation as fit_transform would do:
X[:, vt.variances_ > threshold]
Or get the indexes as:
idx = np.where(vt.variances_ > threshold)[0]
Or as a mask
mask = vt.variances_ > threshold
PS: default threshold is 0
EDIT:
A more straightforward way to do this is to use the get_support method of the VarianceThreshold class. From the documentation:
get_support([indices]) Get a mask, or integer index, of the features selected
You should call this method after fit or fit_transform.
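A minimal sketch of that approach on a pandas DataFrame (df and the threshold are placeholders):
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.0)  # placeholder threshold
vt.fit(df)                             # df assumed to be an all-numeric DataFrame
kept_columns = df.columns[vt.get_support()]
dropped_columns = df.columns[~vt.get_support()]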
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Just make a convenience function; this one wraps the VarianceThreshold
# transformer but you can pass it a pandas dataframe and get one in return
def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    removed_features = []  # so the name is defined even if something fails below
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column)
                       for column
                       in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx]
                         for idx, _
                         in enumerate(remaining_columns)
                         if idx
                         in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        print("Found {0} low-variance columns."
              .format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed,
                                  columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index,
                              column=skip_columns[idx],
                              value=skipped_values[:, idx])
            print("Successfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")

    return dframe, removed_features
This worked for me. If you want to see exactly which columns remain after thresholding, you may use this method:
from sklearn.feature_selection import VarianceThreshold
threshold_n=0.95
sel = VarianceThreshold(threshold=(threshold_n* (1 - threshold_n) ))
sel_var=sel.fit_transform(data)
data[data.columns[sel.get_support(indices=True)]]
When testing features I wrote this simple function that tells me which variables remained in the data frame after the VarianceThreshold is applied.
from sklearn.feature_selection import VarianceThreshold
from itertools import compress

def fs_variance(df, threshold: float = 0.1):
    """
    Return a list of selected variables based on the threshold.
    """
    # The list of columns in the data frame
    features = list(df.columns)

    # Initialize and fit the method
    vt = VarianceThreshold(threshold=threshold)
    _ = vt.fit(df)

    # Get the column names which pass the threshold
    feat_select = list(compress(features, vt.get_support()))

    return feat_select
which returns a list of column names which are selected. For example: ['col_2','col_14', 'col_17'].
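A quick usage sketch (df and the threshold value are placeholders):
# Keep only the columns that pass the variance threshold.
selected = fs_variance(df, threshold=0.1)
df_reduced = df[selected]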

The queryset's `count` is wrong after `extra`

When I use extra in a certain way on a Django queryset (call it qs), the result of qs.count() is different than len(qs.all()). To reproduce:
Make an empty Django project and app, then add a trivial model:
class Baz(models.Model):
    pass
Now make a few objects:
>>> Baz(id=1).save()
>>> Baz(id=2).save()
>>> Baz(id=3).save()
>>> Baz(id=4).save()
Using the extra method to select only some of them produces the expected count:
>>> Baz.objects.extra(where=['id > 2']).count()
2
>>> Baz.objects.extra(where=['-id < -2']).count()
2
But add a select clause to the extra and refer to it in the where clause, and the count is suddenly wrong, even though the result of all() is correct:
>>> Baz.objects.extra(select={'negid': '0 - id'}, where=['"negid" < -2']).all()
[<Baz: Baz object>, <Baz: Baz object>] # As expected
>>> Baz.objects.extra(select={'negid': '0 - id'}, where=['"negid" < -2']).count()
0 # Should be 2
I think the problem has to do with django.db.models.sql.query.BaseQuery.get_count(). It checks whether the BaseQuery's select or aggregate_select attributes have been set; if so, it uses a subquery. But django.db.models.sql.query.BaseQuery.add_extra adds only to the BaseQuery's extra attribute, not select or aggregate_select.
How can I fix the problem? I know I could just use len(qs.all()), but it would be nice to be able to pass the extra'ed queryset to other parts of the code, and those parts may call count() without knowing that it's broken.
Redefining get_count() and monkeypatching appears to fix the problem:
def get_count(self):
"""
Performs a COUNT() query using the current filter constraints.
"""
obj = self.clone()
if len(self.select) > 1 or self.aggregate_select or self.extra:
# If a select clause exists, then the query has already started to
# specify the columns that are to be returned.
# In this case, we need to use a subquery to evaluate the count.
from django.db.models.sql.subqueries import AggregateQuery
subquery = obj
subquery.clear_ordering(True)
subquery.clear_limits()
obj = AggregateQuery(obj.model, obj.connection)
obj.add_subquery(subquery)
obj.add_count_column()
number = obj.get_aggregation()[None]
# Apply offset and limit constraints manually, since using LIMIT/OFFSET
# in SQL (in variants that provide them) doesn't change the COUNT
# output.
number = max(0, number - self.low_mark)
if self.high_mark is not None:
number = min(number, self.high_mark - self.low_mark)
return number
django.db.models.sql.query.BaseQuery.get_count = quuux.get_count
Testing:
>>> Baz.objects.extra(select={'negid': '0 - id'}, where=['"negid" < -2']).count()
2
Updated to work with Django 1.2.1:
def basequery_get_count(self, using):
"""
Performs a COUNT() query using the current filter constraints.
"""
obj = self.clone()
if len(self.select) > 1 or self.aggregate_select or self.extra:
# If a select clause exists, then the query has already started to
# specify the columns that are to be returned.
# In this case, we need to use a subquery to evaluate the count.
from django.db.models.sql.subqueries import AggregateQuery
subquery = obj
subquery.clear_ordering(True)
subquery.clear_limits()
obj = AggregateQuery(obj.model)
obj.add_subquery(subquery, using=using)
obj.add_count_column()
number = obj.get_aggregation(using=using)[None]
# Apply offset and limit constraints manually, since using LIMIT/OFFSET
# in SQL (in variants that provide them) doesn't change the COUNT
# output.
number = max(0, number - self.low_mark)
if self.high_mark is not None:
number = min(number, self.high_mark - self.low_mark)
return number
models.sql.query.Query.get_count = basequery_get_count
I'm not sure if this fix will have other unintended consequences, however.