How to improve my feature selection for a NB classifier? - python-2.7

I have read that improving feature selection will reduce the training time of my classifier and also improve accuracy but I am not sure how can I reduce the number of features. Should I count them and after select the first 3000 for example ?
This is my code :
def save_object(obj, filename):
with open(filename, 'wb') as output:
pickle.dump(obj,output,pickle.HIGHEST_PROTOCOL)
print "saved"
ujson.dumps({"output" : "obj"})
with open('neg5000.csv','rb') as f:
reader = csv.reader(f)
neg_tweets = list(reader)
print "list ready"
with open('pos5000.csv','rb') as f:
reader = csv.reader(f)
pos_tweets = list(reader)
print "list ready"
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
tweets.append((words_filtered, sentiment))
def get_words_in_tweets(tweets):
all_words = []
for (words, sentiment) in tweets:
all_words.extend(words)
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = list(wordlist.keys())[:3000]
#word_features = wordlist.keys()
return word_features
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
#def extract_features(words):
# return dict([(word, True) for word in words])
word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
save_object(word_features, 'wordf.save')
print 'features done'
print datetime.datetime.now()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'training done'
print datetime.datetime.now()
save_object(classifier, 'classifier.save')
tweet = 'I love this car'
print classifier.classify(extract_features(tweet.split()))

There's a number of ways to approach feature selection for the supervised classification problem (which is what Naive Bayes does). I suggest heading over to scikit-learn manual and just trying everything listed there, since the choice of particular method is dependends on the data you have.
The easiest way to do this is to switch to the scikit-learn implementation of Naive Bayes and the use a Pipeline to chain the feature selection and classifier training. See this tutorial for code examples.
Here's a version of your code using scikit-learn with SelectKBest feature selection:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
def read_input(path):
with open(path) as handle:
lines = (line.rsplit(",", 1) for line in handle)
return [text for text, label in lines]
# Assuming each line in ``neg5000.csv`` and ``pos5000.csv`` is a
# UTF-8-encoded tweet.
neg_tweets = read_input("neg5000.csv")
pos_tweets = read_input("pos5000.csv")
X = np.append(neg_tweets, pos_tweets)
y = np.append(np.full(len(neg_tweets), -1, dtype=int),
np.full(len(pos_tweets), 1, dtype=int))
p = Pipeline([
("vectorizer", CountVectorizer()),
("selector", SelectPercentile(percentile=20)),
("nb", MultinomialNB())
])
p.fit(X, y)
print(p.predict(["I love this car"]))

Related

Showing key error

I am trying to use a recomendation engine to predict thr top selling product,it is showing key error,i am doing it with python2 anaconda jupyter notebook.hw i can over come from this error
import pandas as pd
import numpy as np
import operator
SMOOTHING_WINDOW_FUNCTION = np.hamming
SMOOTHING_WINDOW_SIZE = 7
def train():
df = pd.read_csv('C:\\Users\SHIVAPRASAD\Desktop\sample-cart-add-data
(1).csv')
df.sort_values(by=['id', 'age'], inplace=True)
trends = pd.pivot_table(df, values='count', index=['id', 'age'])
trend_snap = {}
for i in np.unique(df['id']):
trend = np.array(trends[i])
smoothed = smooth(trend, SMOOTHING_WINDOW_SIZE,
SMOOTHING_WINDOW_FUNCTION)
nsmoothed = standardize(smoothed)
slopes = nsmoothed[1:] - nsmoothed[:-1]
# I blend in the previous slope as well, to stabalize things a bit
# give a boost to things that have been trending for more than1day[![key error][1]][1]
if len(slopes) > 1:
trend_snap[i] = slopes[-1] + slopes[-2] * 0.5
return sorted(trend_snap.items(), key=operator.itemgetter(1),
reverse=True)
def smooth(series, window_size, window):
ext = np.r_[2 * series[0] - series[window_size-1::-1],
series,
2 * series[-1] - series[-1:-window_size:-1]]
weights = window(window_size)
smoothed = np.convolve(weights / weights.sum(), ext, mode='same')
return smoothed[window_size:-window_size+1]
def standardize(series):
iqr = np.percentile(series, 75) - np.percentile(series, 25)
return (series - np.median(series)) / iqr
trending = train()
print "Top 5 trending products:"
for i, s in trending[:5]:
print "Product %s (score: %2.2f)" % (i, s)
insted of
trend = np.array(trends[i]) use trend = np.array(trends.loc[i])

Python 2.7 using nltk built-in dataset to do sentiment analysis for KNN

As I am new to programming, I wish to know that is it possible to use the nltk built-in movie review dataset to do sentiment analysis by using KNN to determine the polarity of data? Is there any way to do so?
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]
def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[500:1500]
testing_set = featuresets[:1500]
classifier = nltk.DecisionTreeClassifier.train(training_set)
print "Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100 , "%"
string = raw_input("Enter the string: ")
print (classifier.classify(find_features(word_tokenize(string))))
As I am trying to convert the code above from decision-tree to KNN

Performance difference serializing pandas frames python 2.x / 3.x

I was experiencing some performance differences between python 2.7 and 3.5 when serializing pandas frames to CSV.
So did a quick search on google and found this benchmark:
https://gist.github.com/GitRay/4001b4962eb9f3e09a9d456ee5a30aae
And modified it a bit for my needs:
import pandas as pd
from time import time
import platform
def timeit(func, n=5):
start = time()
for i in range(n):
func()
end = time()
return (end - start) / n
def csvdumps(s):
s.to_csv('foo')
return 'foo'
def csvloads(fn):
return pd.read_csv(fn)
def hdfdumps(s):
s.to_hdf('foo', 'bar', mode='w')
return ('foo', 'bar')
def hdfloads(path):
return pd.read_hdf('foo', 'bar')
df = pd.DataFrame({'text': [str(i % 1000) for i in range(1000000)],
'numbers': range(1000000)})
keys = ['csv', 'hdfstore']
d = {'csv': [csvloads, csvdumps],
'hdfstore': [hdfloads, hdfdumps]}
result = dict()
for name, (loads, dumps) in d.items():
text = dumps(df.text)
numbers = dumps(df.numbers)
result[name] = {'text': {'dumps': timeit(lambda: dumps(df.text)),
'loads': timeit(lambda: loads(text))},
'numbers': {'dumps': timeit(lambda: dumps(df.numbers)),
'loads': timeit(lambda: loads(numbers))}}
########
# Plot #
########
# Much of this was taken from
# http://nbviewer.ipython.org/gist/mwaskom/886b4e5cb55fed35213d
# by Michael Waskom
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", font_scale=1.3)
w, h = 7, 7
f, (left, right) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(w*2, h), squeeze=True)
df = pd.DataFrame({'loads': [result[key]['text']['loads'] for key in keys],
'dumps': [result[key]['text']['dumps'] for key in keys],
'storage': keys})
df = pd.melt(df, "storage", value_name="duration", var_name="operation")
sns.barplot("duration", "storage", "operation", data=df, ax=left)
left.set(xlabel="Duration (s)", ylabel="")
sns.despine(bottom=True)
left.set_title('Cost to Serialize Text')
left.legend(loc="lower center", ncol=2, frameon=True, title="operation")
df = pd.DataFrame({'loads': [result[key]['numbers']['loads'] for key in keys],
'dumps': [result[key]['numbers']['dumps'] for key in keys],
'storage': keys})
df = pd.melt(df, "storage", value_name="duration", var_name="operation")
sns.barplot("duration", "storage", "operation", data=df, ax=right)
right.set(xlabel="Duration (s)", ylabel="")
sns.despine(bottom=True)
right.set_title('Cost to Serialize Numerical Data')
right.legend(loc="lower center", ncol=2, frameon=True, title="operation")
plt.savefig('serialize_py'+'.'.join(platform.python_version_tuple())+'.png')
As you can see in the results serializing in python 3 is much slower :
python 2.7 python 3.5 diff
load 0.3504s 0.329005s +06.50%
dump 1.2784s 3.333152s -61.65%
Does anybody know why?

Feature selection in scikit learn for multiple variables and thousands+ features

I am trying to perform feature selection for logistic regression classifier. Originally there are 4 variables: name, location, gender, and label = ethnicity. The three variables, namely the name, give rise to tens of thousands of more "features", for example, name "John Snow" will give rise to 2-letter substrings like 'jo', 'oh', 'hn'... etc. The feature set undergoes DictVectorization.
I am trying to follow this tutorial (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html) but I am not sure if I am doing it right since the tutorial is using a small number of features while mine has tens of thousands after vectorization. And also the plt.show() shows a blank figure.
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
import random
import time
from random import randint
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
try:
full_name = nameString
if len(full_name) > 1: # not accept name with only 1 character
return full_name
else: return '?'
except: return '?'
def feature_avg_wordLength(nameString):
try:
space = 0
for i in nameString:
if i == ' ':
space += 1
length = float(len(nameString) - space)
name_entity = float(space + 1)
avg = round(float(length/name_entity), 0)
return avg
except:
return 0
def feature_name_entity(nameString2):
space = 0
try:
for i in nameString2:
if i == ' ':
space += 1
return space+1
except: return 0
def feature_gender(genString):
try:
gender = genString
if len(gender) >= 1:
return gender
else: return '?'
except: return '?'
def feature_noNeighborLoc(locString):
try:
x = re.sub(r'^[^, ]*', '', locString) # remove everything before and include first ','
y = x[2:] # remove subsequent ',' and ' '
return y
except: return '?'
def list_to_dict(substring_list):
try:
substring_dict = {}
for i in substring_list:
substring_dict['substring='+str(i)] = True
return substring_dict
except: return '?'
# Transform format of X variables, and spit out a numpy array for all features
my_dict13 = [{'name-entity': feature_name_entity(feature_full_name(i))} for i in X2]
my_dict14 = [{'avg-length': feature_avg_wordLength(feature_full_name(i))} for i in X]
my_dict15 = [{'gender': feature_full_name(i)} for i in X3]
my_dict16 = [{'location': feature_noNeighborLoc(feature_full_name(i))} for i in X4]
my_dict17 = [{'dummy1': 1} for i in X]
my_dict18 = [{'dummy2': random.randint(0,2)} for i in X]
all_dict = []
for i in range(0, len(my_dict)):
temp_dict = dict(my_dict13[i].items() + my_dict14[i].items()
+ my_dict15[i].items() + my_dict16[i].items() + my_dict17[i].items() + my_dict18[i].items()
)
all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
half_cut = int(len(df)/2.0)*-1
X_train = newX[:half_cut]
X_test = newX[half_cut:]
y_train = y[:half_cut]
y_test = y[half_cut:]
# Fitting X and y into model, using training data
lr = LogisticRegression()
lr.fit(X_train, y_train)
dv = DictVectorizer()
# Feature selection
plt.figure(1)
plt.clf()
X_indices = np.arange(X_train.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
label=r'Univariate score ($-Log(p_{value})$)', color='g')
plt.show()
Warning:
E:\Program Files Extra\Python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [[0 0 0 ..., 0 0 0]] are constant.
It looks like the way you split your data into training and testing sets is not working:
# Separate the training and testing data sets
X_train = newX[:half_cut]
X_test = newX[half_cut:]
If you already use sklearn, it is much more convenient to use the builtin splitting routine for this:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)

python plotting moving average python error

i made some code where i need to make a plot where my data is persed to moving average
import numpy as np
import csv
import datetime
import matplotlib.pyplot as plt
#Open Data/File
data = open('iphonevsandroid.csv', 'r')
reader = csv.reader(data, delimiter=',')
#Define lists
iphone_data = []
android_data = []
dateTime = []
stringdates = []
#iphone_data_average = []
#android_data_average = []
for row in reader:
first_date_row = row[0]
first_date = row[0][:-13]
if row[1] != 'iphone':
iphone_data.append(int(row[1]))
if row[2] != 'android':
android_data.append(int(row[2]))
if row[0] != 'week':
stringdates.append(row[0][:-13])
for item in stringdates:
dateTime.append(datetime.datetime.strptime(item, '%Y-%m-%d'))
def movingaverage(values,window):
weigths = np.repeat(1.0, window)/window
#including valid will REQUIRE there to be enough datapoints.
#for example, if you take out valid, it will start # point one,
#not having any prior points, so itll be 1+0+0 = 1 /3 = .3333
smas = np.convolve(values, weigths, 'valid')
return smas # as a numpy array
movingaverage(iphone_data,3)
movingaverage(android_data,3)
plt.ylabel('Indsæt y label')
plt.xlabel('Indsæt x label')
plt.plot(dateTime,movingaverage(iphone_data,3)+2)
plt.plot(dateTime,movingaverage(android_data,3)+2)
plt.show()
My problem is that i get this error: ValueError: x and y must have same first dimension.
I know its because of the len of the values,
if i print the len of:
print len(dateTime)
print len(movingaverage(iphone_data,3))
print len(movingaverage(android_data,3))
i get:
528
526
526
How do i get dateTime to 526???
smas = np.convolve(values, weigths, 'valid')
should be
smas = np.convolve(values, weigths, 'same')
and if you don't want the border values, then you will have to remove them yourself, that is for odd window lengths:
smas = np.convolve(values, weigths, 'valid')[(window-1)/2:-(window-1)/2]
Note that you would also have to remove these values from android_data and iphone_data.