I am trying to learn how to find the optimal hyperparameters for a decision tree classifier using the GridSearchCV() method from scikit-learn.
It works fine when I specify options for just one parameter, as in the following:
print(__doc__)
# Code source: Gael Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
from sklearn import datasets
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# define classifier
dt = DecisionTreeClassifier()
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
# define parameter values that should be searched
min_samples_split_options = range(2, 4)
# create a parameter grid: map the parameter names to the values that should be searched
param_grid_dt = dict(min_samples_split= min_samples_split_options) # for DT
# instantiate the grid
grid = GridSearchCV(dt, param_grid_dt, cv=10, scoring='accuracy')
# fit the grid with param
grid.fit(X, y)
# view complete results
grid.grid_scores_
'''# examine results from first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score'''
# examine the best model
print '*******Final results*********'
print grid.best_score_
print grid.best_params_
print grid.best_estimator_
Result:
None
*******Final results*********
0.68
{'min_samples_split': 3}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=3, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
But when I add another parameter's options into the mix, it gives me an "invalid parameter" error, as follows:
print(__doc__)
# Code source: Gael Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
from sklearn import datasets
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# define classifier
dt = DecisionTreeClassifier()
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
# define parameter values that should be searched
max_depth_options = range(10, 251) # for DT
min_samples_split_options = range(2, 4)
# create a parameter grid: map the parameter names to the values that should be searched
param_grid_dt = dict(max_depth=max_depth_options, min_sample_split=min_samples_split_options) # for DT
# instantiate the grid
grid = GridSearchCV(dt, param_grid_dt, cv=10, scoring='accuracy')
# fit the grid with param
grid.fit(X, y)
'''# view complete results
grid.grid_scores_
# examine results from first tuple
print grid.grid_scores_[0].parameters
print grid.grid_scores_[0].cv_validation_scores
print grid.grid_scores_[0].mean_validation_score
# examine the best model
print '*******Final results*********'
print grid.best_score_
print grid.best_params_
print grid.best_estimator_'''
Result:
None
Traceback (most recent call last):
File "C:\Users\KubiK\Desktop\GridSearch_ex6.py", line 31, in <module>
grid.fit(X, y)
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\grid_search.py", line 804, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 180, in __init__
self.results = batch()
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\cross_validation.py", line 1520, in _fit_and_score
estimator.set_params(**parameters)
File "C:\Users\KubiK\Anaconda2\lib\site-packages\sklearn\base.py", line 270, in set_params
(key, self.__class__.__name__))
ValueError: Invalid parameter min_sample_split for estimator DecisionTreeClassifier. Check the list of available parameters with `estimator.get_params().keys()`.
[Finished in 0.3s]
There's a typo in your code: it should be min_samples_split, not min_sample_split.
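With the key spelled correctly, the same grid runs as expected; only the dictionary needs to change:
# note the extra "s": min_samples_split
param_grid_dt = dict(max_depth=max_depth_options,
                     min_samples_split=min_samples_split_options)
grid = GridSearchCV(dt, param_grid_dt, cv=10, scoring='accuracy')
grid.fit(X, y)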
I'm trying to execute the code from https://github.com/lucfra/RFHO, more specifically from RFHO starting example.ipynb. The only thing I want to change is to run it in forward mode instead of reverse mode. This is the changed code:
import tensorflow as tf
import rfho as rf
from rfho.datasets import load_mnist

mnist = load_mnist(partitions=(.05, .01))  # 5% of data in training set, 1% in validation
# remaining in test set (change these percentages and see the effect on regularization hyperparameter)

x, y = tf.placeholder(tf.float32, name='x'), tf.placeholder(tf.float32, name='y')
# define the model (here use a linear model from rfho.models)
model = rf.LinearModel(x, mnist.train.dim_data, mnist.train.dim_target)

# vectorize the model, and build the state vector (augment by 1 since we are
# going to optimize the weights with momentum)
s, out, w_matrix = rf.vectorize_model(model.var_list, model.inp[-1], model.Ws[0],
                                      augment=0)
# (this function will print also some tensorflow infos and warnings about variables
# collections... we'll solve this)

# define error
error = tf.reduce_mean(rf.cross_entropy_loss(labels=y, logits=out), name='error')

constraints = []

# define training error by error + L2 weights penalty
rho = tf.Variable(0., name='rho')  # regularization hyperparameter
training_error = error + rho*tf.reduce_sum(tf.pow(w_matrix, 2))
constraints.append(rf.positivity(rho))  # regularization coefficient should be positive

accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), tf.argmax(y, 1)),
                                  "float"), name='accuracy')

# define learning rates and momentum factor as variables, to be optimized
eta = tf.Variable(.01, name='eta')
#mu = tf.Variable(.5, name='mu')

# now define the training dynamics (similar to tf.train.Optimizer)
optimizer = rf.GradientDescentOptimizer.create(s, eta, loss=training_error)

# add constraints for learning rate and momentum factor
constraints += optimizer.get_natural_hyperparameter_constraints()

# we want to optimize the weights w.r.t. training_error
# and hyperparameters w.r.t. validation error (that in this case is
# error evaluated on the validation set)
# we are going to use ReverseMode
hyper_dict = {error: [rho, eta]}
hyper_opt = rf.HyperOptimizer(optimizer, hyper_dict, method=rf.ForwardHG)

# define helper for stochastic descent
ev_data = rf.ExampleVisiting(mnist.train, batch_size=2**8, epochs=200)
tr_suppl = ev_data.create_supplier(x, y)
val_supplier = mnist.validation.create_supplier(x, y)
test_supplier = mnist.test.create_supplier(x, y)

# Run all for some hyper-iterations and print progresses
def run(hyper_iterations):
    with tf.Session().as_default() as ss:
        ev_data.generate_visiting_scheme()  # needed for remembering the example visited in forward pass
        for hyper_step in range(hyper_iterations):
            hyper_opt.initialize()  # initializes all variables or reset weights to initial state
            hyper_opt.run(ev_data.T, train_feed_dict_supplier=tr_suppl,
                          val_feed_dict_suppliers=val_supplier,
                          hyper_constraints_ops=constraints)
            #
            # print('Concluded hyper-iteration', hyper_step)
            # print('Test accuracy:', ss.run(accuracy, feed_dict=test_supplier()))
            # print('Validation error:', ss.run(error, feed_dict=val_supplier()))

saver = rf.Saver('Staring example', collect_data=False)
with saver.record(rf.Records.tensors('error', fd=('x', 'y', mnist.validation), rec_name='valid'),
                  rf.Records.tensors('error', fd=('x', 'y', mnist.test), rec_name='test'),
                  rf.Records.tensors('accuracy', fd=('x', 'y', mnist.validation), rec_name='valid'),
                  rf.Records.tensors('accuracy', fd=('x', 'y', mnist.test), rec_name='test'),
                  rf.Records.hyperparameters(),
                  rf.Records.hypergradients(),
                  ):  # a context to print some statistics.
    # If you execute again any cell containing the model construction,
    # restart the notebook or reset tensorflow graph in order to prevent errors
    # due to tensor namings
    run(20)  # this will take some time... run it for less hyper-iterations for a quicker look
The problem is that I get a TypeError: 'function' object is not subscriptable after the first iteration:
Traceback (most recent call last):
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydev_run_in_console.py", line 52, in run_file
pydev_imports.execfile(file, globals, locals) # execute the script
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/examples/simply_example.py", line 80, in <module>
run(20) # this will take some time... run it for less hyper-iterations for a quicker look
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/examples/simply_example.py", line 63, in run
hyper_constraints_ops=constraints)
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/save_and_load.py", line 624, in _saver_wrapped
res = f(*args, **kwargs)
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/hyper_gradients.py", line 689, in run
hyper_batch_step=self.hyper_batch_step.eval())
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/hyper_gradients.py", line 581, in run_all
return self.hyper_gradients(val_feed_dict_suppliers, hyper_batch_step)
File "/Users/repierau/Documents/FSHO/RFHO-master/rfho/hyper_gradients.py", line 551, in hyper_gradients
val_sup_lst.append(val_feed_dict_supplier[k])
TypeError: 'function' object is not subscriptable
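There is no confirmed fix here, but the traceback itself gives a hint: run_all ends up indexing val_feed_dict_suppliers by key (val_sup_lst.append(val_feed_dict_supplier[k])), which suggests that with ForwardHG it expects a dictionary keyed by the validation objective rather than a bare supplier function. A hypothetical, untested change along those lines:
hyper_opt.run(ev_data.T, train_feed_dict_supplier=tr_suppl,
              val_feed_dict_suppliers={error: val_supplier},
              hyper_constraints_ops=constraints)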
I have been trying to run the code below, which I got from here, and even though I have changed almost nothing other than the image size (350, 350 instead of 150, 150), I still cannot get it to work. I am getting the filter error above (in the title), which I understand in principle, but I don't think I am doing anything wrong, so I don't understand why it occurs. It basically says that I cannot have more nodes than inputs, correct?
I was able to eventually hack my way to a solution by changing this line:
model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(3, IMG_WIDTH, IMG_HEIGHT)))
with this:
model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
but I would still like to understand why this worked.
Here is the code below along with the error I am getting. Would appreciate some help (I am using Python Anaconda 2.7.11).
# IMPORT LIBRARIES --------------------------------------------------------------------------------#
import glob
import tensorflow
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from settings import RAW_DATA_ROOT
# GLOBAL VARIABLES --------------------------------------------------------------------------------#
TRAIN_PATH = RAW_DATA_ROOT + "/train/"
TEST_PATH = RAW_DATA_ROOT + "/test/"
IMG_WIDTH, IMG_HEIGHT = 350, 350
NB_TRAIN_SAMPLES = len(glob.glob(TRAIN_PATH + "*"))
NB_VALIDATION_SAMPLES = len(glob.glob(TEST_PATH + "*"))
NB_EPOCH = 50
# FUNCTIONS ---------------------------------------------------------------------------------------#
def baseline_model():
    """
    The Keras library provides wrapper classes to allow you to use neural network models developed
    with Keras in scikit-learn. The code snippet below is used to construct a simple stack of 3
    convolution layers with a ReLU activation, followed by max-pooling layers. This is very
    similar to the architectures that Yann LeCun advocated in the 1990s for image classification
    (with the exception of ReLU).
    :return: The training model.
    """
    model = Sequential()
    model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(3, IMG_WIDTH, IMG_HEIGHT)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(32, 5, 5, border_mode='valid'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Convolution2D(64, 5, 5, border_mode='valid'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    # Add a fully connected layer that converts our 3D feature maps to 1D feature vectors
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Activation('relu'))
    # Use a dropout layer to reduce over-fitting, by preventing a layer from seeing twice the exact
    # same pattern (works by switching off a node once in a while in different epochs...). This
    # will also serve as our output layer.
    model.add(Dropout(0.5))
    model.add(Dense(8))
    model.add(Activation('softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def train_model(model):
    """
    Simple script that uses the baseline model and returns a trained model.
    :param model: model
    :return: model
    """
    # Define the augmentation configuration we will use for training
    TRAIN_DATAGEN = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    # Build the train generator
    TRAIN_GENERATOR = TRAIN_DATAGEN.flow_from_directory(
        TRAIN_PATH,
        target_size=(IMG_WIDTH, IMG_HEIGHT),
        batch_size=32,
        class_mode='categorical')
    TEST_DATAGEN = ImageDataGenerator(rescale=1. / 255)
    # Build the validation generator
    TEST_GENERATOR = TEST_DATAGEN.flow_from_directory(
        TEST_PATH,
        target_size=(IMG_WIDTH, IMG_HEIGHT),
        batch_size=32,
        class_mode='categorical')
    # Train model
    model.fit_generator(
        TRAIN_GENERATOR,
        samples_per_epoch=NB_TRAIN_SAMPLES,
        nb_epoch=NB_EPOCH,
        validation_data=TEST_GENERATOR,
        nb_val_samples=NB_VALIDATION_SAMPLES)
    # Always save your weights after training or during training
    model.save_weights('first_try.h5')
# END OF FILE -------------------------------------------------------------------------------------#
and the error:
Using TensorFlow backend.
Training set: 0 files.
Test set: 0 files.
Traceback (most recent call last):
File "/Users/christoshadjinikolis/GitHub_repos/datareplyuk/ODSC_Facial_Sentiment_Analysis/src/model/__init__.py", line 79, in <module>
model = baseline_model()
File "/Users/christoshadjinikolis/GitHub_repos/datareplyuk/ODSC_Facial_Sentiment_Analysis/src/model/training_module.py", line 31, in baseline_model
model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(3, IMG_WIDTH, IMG_HEIGHT)))
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/models.py", line 276, in add
layer.create_input_layer(batch_input_shape, input_dtype)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 370, in create_input_layer
self(x)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 514, in __call__
self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 572, in add_inbound_node
Node.create_node(self, inbound_layers, node_indices, tensor_indices)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/engine/topology.py", line 149, in create_node
output_tensors = to_list(outbound_layer.call(input_tensors[0], mask=input_masks[0]))
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/layers/convolutional.py", line 466, in call
filter_shape=self.W_shape)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py", line 1579, in conv2d
x = tf.nn.conv2d(x, kernel, strides, padding=padding)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 394, in conv2d
data_format=data_format, name=name)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
op_def=op_def)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2319, in create_op
set_shapes_for_outputs(ret)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1711, in set_shapes_for_outputs
shapes = shape_func(op)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/common_shapes.py", line 246, in conv2d_shape
padding)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/common_shapes.py", line 184, in get2d_conv_output_size
(row_stride, col_stride), padding_type)
File "/Users/christoshadjinikolis/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/common_shapes.py", line 149, in get_conv_output_size
"Filter: %r Input: %r" % (filter_size, input_size))
ValueError: Filter must not be larger than the input: Filter: (5, 5) Input: (3, 350)
The problem is that the expected order of input_shape changes depending on the backend you are using (TensorFlow or Theano).
The best solution I found was to define this ordering in the file ~/.keras/keras.json.
That way you can use the Theano ordering with the TensorFlow backend, or the Theano ordering with the Theano backend.
Create the .keras directory in your home folder and create keras.json inside it: mkdir ~/.keras && touch ~/.keras/keras.json
{
    "image_dim_ordering": "th",
    "epsilon": 1e-07,
    "floatx": "float32",
    "backend": "tensorflow"
}
Just encountered the same problem myself when I was following a tutorial. As pointed out by @Yao Zhang, the error is caused by the order in the input_shape. There are multiple ways to solve the problem.
Option 1: Change the order in input_shape
The line of your code
model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(3, IMG_WIDTH, IMG_HEIGHT)))
should be changed to
model.add(Convolution2D(32, 5, 5, border_mode='valid', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
which should be fine then.
Option 2: Specify the dim_ordering in your layers (see the sketch below)
Option 3: Modify the keras configuration file by changing 'tf' to 'th' in your ~/.keras/keras.json
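For option 2, a minimal sketch of what that could look like, assuming the dim_ordering keyword accepted by Keras 1.x convolution layers (adjust if your Keras version differs):
model.add(Convolution2D(32, 5, 5, border_mode='valid', dim_ordering='th',
                        input_shape=(3, IMG_WIDTH, IMG_HEIGHT)))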
My code works fine on a friend's computer, and I do not understand why it does not work in the Anaconda environment on my computer. Please help. Is it a problem with the Python installation or with the code?
Both the iris_train and iris_test data files look like this:
Sepal Length  Sepal Width  Petal Length  Petal Width  classType    classNames
5.1           3.5          1.4           0.2          Iris-setosa  Iris-setosa
4.9           3            1.4           0.2          Iris-setosa  Iris-virginica
4.7           3.2          1.3           0.2          Iris-setosa  Iris-versicolor
I receive the following error when I run the code:
File "/Applications/anaconda2/lib/python2.7/site- packages/spyderlib/widgets/externalshell/sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "/Applications/anaconda2/lib/python2.7/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 81, in execfile
builtins.execfile(filename, *where)
File "/Users/nadiastraton/Documents/workspacePython/02450Toolbox_Python/Thesis/Scripts/ Script_Results/KNN/KNN1.py", line 30, in <module>
classNames = list({name[0] for name in data_train['classNames']})
File "/Users/nadiastraton/Documents/workspacePython/02450Toolbox_Python/Thesis/Scripts/ Script_Results/KNN/KNN1.py", line 30, in <setcomp>
classNames = list({name[0] for name in data_train['classNames']})
TypeError: 'float' object has no attribute '__getitem__'
I receive the following errors when I try to run the file ('KNN1') from the terminal:
File "/Applications/anaconda2/lib/python2.7/site-packages/pylab.py", line 1, in <module>
from matplotlib.pylab import *
File "/Applications/anaconda2/lib/python2.7/site- packages/matplotlib/__init__.py", line 1131, in <module>
rcParams = rc_params()
File "/Applications/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py", line 975, in rc_params
return rc_params_from_file(fname, fail_on_error)
File "/Applications/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py", line 1100, in rc_params_from_file
config_from_file = _rc_params_in_file(fname, fail_on_error)
File "/Applications/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py", line 1018, in _rc_params_in_file
with _open_file_or_url(fname) as fd:
File "/Applications/anaconda2/lib/python2.7/contextlib.py", line 17, in __enter__
return self.gen.next()
File "/Applications/anaconda2/lib/python2.7/site-packages/matplotlib/__init__.py", line 1000, in _open_file_or_url
encoding = locale.getdefaultlocale()[1]
File "/Applications/anaconda2/lib/python2.7/locale.py", line 543, in getdefaultlocale
return _parse_localename(localename)
File "/Applications/anaconda2/lib/python2.7/locale.py", line 475, in _parse_localename
raise ValueError, 'unknown locale: %s' % localename
ValueError: unknown locale: UTF-8
My code is:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from pylab import figure, ylabel, xlabel,hold, xticks,yticks, show, title, plot, imshow, colorbar
import pandas as pd
data_train =pd.read_excel('iris_train.xlsx')
data_test =pd.read_excel('iris_test.xlsx')
X_train =data_train.as_matrix(columns=['Sepal Length', 'Sepal Width'])
X_test =data_test.as_matrix(columns=['Sepal Length', 'Sepal Width'])
y_train =data_train.as_matrix(columns=['classType'])
y_test =data_test.as_matrix(columns=['classType'])
# attributeNames = [name[0] for name in data['attributeNames'].squeeze()]
classNames = list({name[0] for name in data_train['classNames']})
N, M = X_train.shape
C = len(classNames)
# K-nearest neighbors
K=2
# Distance metric (corresponds to 2nd norm, euclidean distance).
# You can set dist=1 to obtain manhattan distance (cityblock distance).
dist=2
# Fit classifier and classify the test points
knn = KNeighborsClassifier(n_neighbors=K, p=dist);
knn.fit(X_train, y_train);
y_est = knn.predict(X_test);
# Plot the training data points (color-coded) and test data points.
figure(1);
hold(True);
styles = ['.b', '.r', '.g']
# import ipdb; ipdb.set_trace()
for c in range(C):
    class_mask = (y_train==c)
    plot(X_train[class_mask], X_train[class_mask], styles[c])
# Plot the classification results
styles = ['ob', 'or', 'og']
for c in range(C):
    class_mask = (y_est==c)
    plot(X_test[class_mask,0], X_test[class_mask,1], styles[c], markersize=10)
    plot(X_test[class_mask,0], X_test[class_mask,1], 'kx', markersize=8)
title('Synthetic data classification - KNN');
# Compute and plot confusion matrix
cm = confusion_matrix(y_test.ravel(), y_est);
accuracy = 100*cm.diagonal().sum()/cm.sum(); error_rate = 100-accuracy;
figure(2);
imshow(cm, cmap='binary', interpolation='None');
colorbar()
xticks(range(C));
yticks(range(C));
xlabel('Predicted class'); ylabel('Actual class');
title('Confusion matrix (Accuracy: {0}%, Error Rate: {1}%)'.format(accuracy, error_rate));
show()
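Two separate issues seem to be mixed together here, so it is probably both the environment and the data rather than the Python installation as such: the TypeError suggests that some cells in the classNames column are empty and come in as NaN (a float), on which name[0] fails, while the "unknown locale: UTF-8" error is a known macOS/Anaconda environment quirk that has nothing to do with the code. A hedged sketch of possible workarounds (untested against your files):
import os
# Set a valid locale before pylab/matplotlib is imported, to avoid
# "ValueError: unknown locale: UTF-8" on macOS (environment issue, not a code bug).
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'

import pandas as pd
data_train = pd.read_excel('iris_train.xlsx')

# Skip NaN cells so name[0] is only applied to actual strings
# (hypothetical guard; decide how missing values should really be handled).
classNames = list({name[0] for name in data_train['classNames'] if pd.notnull(name)})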
I trained an instance of scikit-learn's TfidfVectorizer and I want to persist it to disk. I saved the IDF matrix (the idf_ attribute) to disk as a numpy array and I saved the vocabulary (vocabulary_) to disk as a JSON object (I'm avoiding pickle, for security and other reasons). I'm trying to do this:
import json
from idf import idf # numpy array with the pre-computed IDFs
from sklearn.feature_extraction.text import TfidfVectorizer
# dirty trick so I can plug my pre-computed IDFs
# necessary because "vectorizer.idf_ = idf" doesn't work,
# it returns "AttributeError: can't set attribute."
class MyVectorizer(TfidfVectorizer):
    TfidfVectorizer.idf_ = idf

# instantiate vectorizer
vectorizer = MyVectorizer(lowercase = False,
                          min_df = 2,
                          norm = 'l2',
                          smooth_idf = True)
# plug vocabulary
vocabulary = json.load(open('vocabulary.json', mode = 'rb'))
vectorizer.vocabulary_ = vocabulary
# test it
vectorizer.transform(['foo bar'])
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 1314, in transform
return self._tfidf.transform(X, copy=False)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 1014, in transform
check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/sklearn/utils/validation.py", line 627, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.utils.validation.NotFittedError: idf vector is not fitted
So, what am I doing wrong? I'm failing to fool the vectorizer object: somehow it knows that I'm cheating (i.e., passing it pre-computed data and not training it with actual text). I inspected the attributes of the vectorizer object but I can't find anything like 'istrained', 'isfitted', etc. So, how do I fool the vectorizer?
Ok, I think I got it: the vectorizer instance has an attribute _tfidf, which in turn must have an attribute _idf_diag. The transform method calls a check_is_fitted function that checks whether that _idf_diag exists. (I had missed it because it's an attribute of an attribute.) So, I inspected the TfidfVectorizer source code to see how _idf_diag is created. Then I just added it to the _tfidf attribute:
import scipy.sparse as sp
# ... code ...
vectorizer._tfidf._idf_diag = sp.spdiags(idf,
                                         diags = 0,
                                         m = len(idf),
                                         n = len(idf))
And now the vectorization works.
I am using scikit-learn to train an SVM based on data where each observation (X) is a list of words. The tags for each observation (Y) are floating point values. I have tried following the example given in the scikit-learn documentation (http://scikit-learn.org/stable/modules/svm.html) for multi-class classification.
Here is my code:
from __future__ import division
from sklearn import svm
import os.path
import numpy
import re

'''
The stanford-postagger was included to see how it tags the words and to see if it would help in getting just the names
of the ingredients. Turns out it's pointless.
'''
#from nltk.tag.stanford import POSTagger

mainDirectory = './nyu/PROJECTS/Epicurious/DATA/ingredients'
#st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger','/usr/share/stanford-postagger/stanford-postagger.jar')

'''
This is where we read each line of the file and then run a regex match on it to get all the words before
the first tab. (these are the names of the ingredients. Some of them may have adjectives like fresh, peeled, cut etc.
Not sure what to do about them yet.)
'''
def getFileDetails(_filename, _fileDescriptor):
    rankingRegexMatch = re.match('([0-9](?:\_)[0-9]?)', _filename)
    if len(rankingRegexMatch.group(0)) == 2:
        ranking = float(rankingRegexMatch.group(0)[0])
    else:
        ranking = float(rankingRegexMatch.group(0)[0]+'.'+rankingRegexMatch.group(0)[2])
    _keywords = []
    for line in _fileDescriptor:
        m = re.match('(\w+\s*\w*)(?=\t[0-9])', line)
        if m:
            _keywords.append(m.group(0))
    return [_keywords, ranking]

'''
Open each file in the directory and pass the name and file descriptor to getFileDetails
'''
def this_is_it(files):
    _allKeywords = []
    _allRankings = []
    for eachFile in files:
        fullFilePath = mainDirectory + '/' + eachFile
        f = open(fullFilePath)
        XandYForThisFile = getFileDetails(eachFile, f)
        _allKeywords.append(XandYForThisFile[0])
        _allRankings.append(XandYForThisFile[1])
    #_allKeywords = numpy.array(_allKeywords,dtype=object)
    svm_learning(_allKeywords, _allRankings)

def svm_learning(x, y):
    clf = svm.SVC()
    clf.fit(x, y)

'''
This just prints the directory path and then calls the callback x on files
'''
def print_files(x, dir_path, files):
    print dir_path
    x(files)

'''
code starts here
'''
os.path.walk(mainDirectory, print_files, this_is_it)
When the svm_learning(x,y) method is called, it throws me an error:
Traceback (most recent call last):
File "scan for files.py", line 72, in <module>
os.path.walk(mainDirectory, print_files, this_is_it)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/posixpath.py", line 238, in walk
func(arg, top, names)
File "scan for files.py", line 68, in print_files
x(files)
File "scan for files.py", line 56, in this_is_it
svm_learning(_allKeywords,_allRankings)
File "scan for files.py", line 62, in svm_learning
clf.fit(x,y)
File "/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/svm/base.py", line 135, in fit
X = atleast2d_or_csr(X, dtype=np.float64, order='C')
File "/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/utils/validation.py", line 116, in atleast2d_or_csr
"tocsr")
File "/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/utils/validation.py", line 96, in _atleast2d_or_sparse
X = array2d(X, dtype=dtype, order=order, copy=copy)
File "/Library/Python/2.7/site-packages/scikit_learn-0.14_git-py2.7-macosx-10.8-intel.egg/sklearn/utils/validation.py", line 80, in array2d
X_2d = np.asarray(np.atleast_2d(X), dtype=dtype, order=order)
File "/Library/Python/2.7/site-packages/numpy-1.8.0.dev_bbcfcf6_20130307-py2.7-macosx-10.8-intel.egg/numpy/core/numeric.py", line 331, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
Can anyone help? I am new to scikit and could not find any help in the documentation.
You should take a look at: Text feature extraction. You are going to want to use either a TfidfVectorizer, a CountVectorizer, or a HashingVectorizer (if your data is very large). These components take your text in and output feature matrices that are acceptable to classifiers. Be advised that they work on lists of strings, with one string per example, so if you have a list of lists of strings (you have already tokenized), you may need to either join() the tokens to get a list of strings or skip your own tokenization.
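A minimal sketch of that idea, using TfidfVectorizer with toy stand-ins for the _allKeywords/_allRankings lists from the question (the data values here are purely illustrative):
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

# toy stand-ins for _allKeywords (token lists) and _allRankings (float tags)
token_lists = [['fresh', 'basil'], ['peeled', 'garlic', 'clove'], ['red', 'onion']]
rankings = [3.5, 4.0, 2.5]

# join the tokens so there is one string per example
docs = [' '.join(tokens) for tokens in token_lists]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)  # sparse feature matrix a classifier can consume

clf = svm.SVC()
clf.fit(X, rankings)
If the float tags are genuinely continuous rather than a small set of discrete values, svm.SVR is the regression counterpart to use instead of svm.SVC.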