How to load custom datasets in Nervana neon - python-2.7

If anyone is familiar with Nervana's neon, can you please give me an example of how to load a custom dataset in neon?

Here is an example. You can also check their docs. You'll see a reference later to a "DataSet" used in __iter__, but that is just a helper containing some functions to generate items. The key is to make sure you create contiguous X, y pairs, set them on a backend tensor, and yield. Hope that helps.
import numpy as np
from data import DataSet
from operator import mul
from neon.data import NervanaDataIterator


class CustomLoader(NervanaDataIterator):
    def __init__(self, in_data, img_shape, n_classes):
        # Load the numpy data into some variables. We divide the image by 255
        # to normalize the values between 0 and 1.
        self.shape = img_shape  # shape of the input data (e.g. for images, (C, H, W))

        # 1. assign some required and useful attributes
        self.start = 0  # start at zero
        self.ndata = in_data.shape[0]  # number of images in X (hint: use X.shape)
        self.nfeatures = reduce(mul, img_shape, 1)  # number of features in X (hint: use X.shape)

        # number of minibatches per epoch
        # to calculate this, use the batch size, which is stored in self.be.bsz
        self.nbatches = self.ndata / self.be.bsz

        # 2. allocate memory on the GPU for a minibatch's worth of data
        # (e.g. use `self.be` to access the backend; see the backend documentation).
        # to get the minibatch size, use self.be.bsz
        # hint: X should have shape (# features, mini-batch size)
        # hint: use some of the attributes previously defined above
        self.dev_X = self.be.zeros((self.nfeatures, self.be.bsz))
        self.dev_Y = self.be.zeros((n_classes, self.be.bsz))
        self.data_loader = DataSet(in_data, self.be.bsz)
        self.data_loader.start()

    def reset(self):
        self.data_loader.stop()
        self.start = 0
        self.data_loader.start()

    def __iter__(self):
        # 3. loop through minibatches in the dataset
        for index in xrange(self.nbatches):
            # 3a. grab the right slice from the numpy arrays
            inputs, targets, _ = self.data_loader.batch()
            inputs = inputs.ravel()

            # The X and Y arrays are in shape (batch_size, num_features),
            # but the iterator needs to return data with shape (num_features, batch_size),
            # so here we transpose the data and then store it as a contiguous array.
            # numpy arrays need to be contiguous before being loaded onto the GPU.
            inputs = np.ascontiguousarray(inputs.T / 255.0)
            targets = np.ascontiguousarray(targets.T)

            # here we test your implementation:
            # your slice has to have the same shape as the GPU tensors you allocated
            assert inputs.shape == self.dev_X.shape, \
                "inputs has shape {}, but dev_X is {}".format(inputs.shape, self.dev_X.shape)
            assert targets.shape == self.dev_Y.shape, \
                "targets has shape {}, but dev_Y is {}".format(targets.shape, self.dev_Y.shape)

            # 3b. transfer from numpy arrays to device:
            # use the GPU memory buffers allocated previously
            # and call the tensor's set() function.
            self.dev_X.set(inputs)
            self.dev_Y.set(targets)

            # 3c. yield a tuple of the device tensors:
            # X should be of shape (num_features, batch_size),
            # Y should be of shape (n_classes, batch_size)
            yield (self.dev_X, self.dev_Y)
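For completeness, here is a rough usage sketch, not from the original answer: the loader reads the global backend via self.be, so generate a backend before instantiating it, then pass the iterator to model.fit like any other neon dataset. The data file "train.npy", the shapes, and the layer stack below are hypothetical placeholders; adjust them to your own data.

import numpy as np
from neon.backends import gen_backend
from neon.models import Model
from neon.layers import Affine, GeneralizedCost
from neon.transforms import Rectlin, Softmax, CrossEntropyMulti
from neon.optimizers import GradientDescentMomentum
from neon.initializers import Gaussian
from neon.callbacks.callbacks import Callbacks

# the iterator uses self.be.bsz, so the backend must exist first
be = gen_backend(backend='cpu', batch_size=128)

train_data = np.load('train.npy')  # hypothetical array of shape (N, C, H, W)
train_set = CustomLoader(train_data, img_shape=(3, 32, 32), n_classes=10)

layers = [Affine(nout=100, init=Gaussian(scale=0.01), activation=Rectlin()),
          Affine(nout=10, init=Gaussian(scale=0.01), activation=Softmax())]
model = Model(layers=layers)
cost = GeneralizedCost(costfunc=CrossEntropyMulti())
opt = GradientDescentMomentum(learning_rate=0.01, momentum_coef=0.9)
callbacks = Callbacks(model)  # default progress callbacks; exact signature varies by neon version

model.fit(train_set, cost=cost, optimizer=opt, num_epochs=5, callbacks=callbacks)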

Related

Trying to use Caffe2 to add two blobs together that contain matrices

I'm trying to add the values of two blobs together. Each blob contains a 2x2 matrix.
workspace.FeedBlob("X", np.random.randn(2, 2).astype(np.float32))
workspace.FeedBlob("Y", np.random.randn(2, 2).astype(np.float32))
net = core.Net('net')
sum_stuff = net.Add([X, Y])
What exactly did not work? The following example will create two 2x2 matrices and add them together:
from caffe2.python import workspace, model_helper, core
import numpy as np
# create 2x2 matrices with random integer from 0 to 9
# and feed them to the workspace
workspace.FeedBlob("X", np.random.randint(0,9,size=(2,2)).astype(np.int))
workspace.FeedBlob("Y", np.random.randint(0,9,size=(2,2)).astype(np.int))
# define a network which adds the two blobs together and
# stores the result as Sum
net = core.Net('net')
sum_stuff = net.Add(["X", "Y"], "Sum")
# run the network
workspace.CreateNet(net)
workspace.RunNet(net.Proto().name)
# get the values from the workspace
X = workspace.FetchBlob("X")
Y = workspace.FetchBlob("Y")
Sum = workspace.FetchBlob("Sum")
# print the result to check if correct
print("First matrix:\n{0}".format(X))
print("Second matrix:\n{0}".format(Y))
print("Sum of the two matrices:\n{0}".format(Sum))

Keras ImageDataGenerator: random transform

I'm interested in augmenting my dataset with random image transformations. I'm using Keras ImageDataGenerator, and I'm getting the following error when trying to apply random_transform to a single image:
--> x = apply_transform(x, transform_matrix, img_channel_axis, fill_mode, cval)
>>> RuntimeError: affine matrix has wrong number of rows.
I found the source code for the ImageDataGenerator here. However, I'm not sure how to debug the runtime error. Below is the code I have:
import numpy as np
from keras.preprocessing.image import img_to_array, load_img
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input

image_path = './figures/zebra.jpg'

# data augmentation
train_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

print "\nloading image..."
image = load_img(image_path, target_size=(299, 299))
image = img_to_array(image)
image = np.expand_dims(image, axis=0)  # 1 x input_shape
image = preprocess_input(image)
train_datagen.fit(image)
image = train_datagen.random_transform(image)
The error occurs at the last line when calling random_transform.
The problem is that random_transform expects a 3D-array.
See the docstring:
def random_transform(self, x, seed=None):
    """Randomly augment a single image tensor.
    # Arguments
        x: 3D tensor, single image.
        seed: random seed.
    # Returns
        A randomly transformed version of the input (same shape).
    """
So you'll need to call it before np.expand_dims, as in the sketch below.
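For example, reordering the last few lines of the question's script accordingly would look roughly like this (train_datagen.fit is dropped here because it is only needed for featurewise statistics, which this generator configuration does not use):

# Sketch of the fix: random_transform operates on a single 3D image array,
# so call it before np.expand_dims adds the batch dimension.
import numpy as np
from keras.preprocessing.image import img_to_array, load_img
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input

train_datagen = ImageDataGenerator(rotation_range=40, width_shift_range=0.2,
                                   height_shift_range=0.2, shear_range=0.2,
                                   zoom_range=0.2, horizontal_flip=True,
                                   fill_mode='nearest')

image = load_img('./figures/zebra.jpg', target_size=(299, 299))
image = img_to_array(image)                    # shape (299, 299, 3)
image = train_datagen.random_transform(image)  # works on the 3D array
image = np.expand_dims(image, axis=0)          # now 1 x input_shape
image = preprocess_input(image)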

ValueError: Tensor must be from the same graph as Tensor

I am trying to build a graph in TensorFlow, but I am encountering the following error:
ValueError: Tensor("transformation_0/output/output:0", shape=(), dtype=float32) must be from the same graph as Tensor("variables/total_output:0", shape=(), dtype=float32_ref)
Here is the code:
import tensorflow as tf

# Explicitly create a Graph object
graph = tf.Graph()

with graph.as_default():
    with tf.name_scope("variables"):
        # Variable to keep track of how many times the graph has been run
        global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
        # Variable that keeps track of the sum of all output values over time:
        total_output = tf.Variable(0.0, dtype=tf.float32, trainable=False, name="total_output")

    # Primary transformation Operations
    with tf.name_scope("transformation"):
        # Separate input layer
        with tf.name_scope("input"):
            # Create input placeholder - takes in a Vector
            a = tf.placeholder(tf.float32, shape=[None], name="input_placeholder_a")
        # Separate middle layer
        with tf.name_scope("intermediate_layer"):
            b = tf.reduce_prod(a, name="product_b")
            c = tf.reduce_sum(a, name="sum_c")
        # Separate output layer
        with tf.name_scope("output"):
            output = tf.add(b, c, name="output")

    with tf.name_scope("update"):
        # Increments the total_output Variable by the latest output
        update_total = total_output.assign_add(output)
        # Increments the above `global_step` Variable, should be run whenever the graph is run
        increment_step = global_step.assign_add(1)

    # Summary Operations
    with tf.name_scope("summaries"):
        avg = tf.div(update_total, tf.cast(increment_step, tf.float32), name="average")
        # Creates summaries for output node
        tf.scalar_summary(b'Output', output, name="output_summary")
        tf.scalar_summary(b'Sum of outputs over time', update_total, name="total_summary")
        tf.scalar_summary(b'Average of outputs over time', avg, name="average_summary")

    # Global Variables and Operations
    with tf.name_scope("global_ops"):
        # Initialization Op
        init = tf.initialize_all_variables()
        # Merge all summaries into one Operation
        merged_summaries = tf.merge_all_summaries()

# Start a Session, using the explicitly created Graph
sess = tf.Session(graph=graph)

# Open a SummaryWriter to save summaries
writer = tf.train.SummaryWriter('./improved_graph', graph)

# Initialize Variables
sess.run(init)


def run_graph(input_tensor):
    """
    Helper function; runs the graph with given input tensor and saves summaries
    """
    feed_dict = {a: input_tensor}
    _, step, summary = sess.run([output, increment_step, merged_summaries],
                                feed_dict=feed_dict)
    writer.add_summary(summary, global_step=step)


# Run the graph with various inputs
run_graph([2, 8])
run_graph([3, 1, 3, 3])
run_graph([8])
run_graph([1, 2, 3])
run_graph([11, 4])
run_graph([4, 1])
run_graph([7, 3, 1])
run_graph([6, 3])
run_graph([0, 2])
run_graph([4, 5, 6])

# Write the summaries to disk
writer.flush()

# Close the SummaryWriter
writer.close()

# Close the session
sess.close()
Have you tried:
1) changing
graph = tf.Graph()
with graph.as_default():
to:
with tf.Session() as sess:
2) and removing:
sess = tf.Session(graph=graph)
I was having the same error and those changes solved it.
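A rough sketch of what the top of the script might look like after those two changes (everything is built on the default graph inside a single session; the rest of the graph code from the question would follow unchanged and is omitted here):

# Hedged sketch of suggestion 1): drop the explicit tf.Graph() and the
# separate tf.Session(graph=graph); build on the default graph inside
# one session instead. Uses the same old-style TF API as the question.
import tensorflow as tf

with tf.Session() as sess:
    with tf.name_scope("variables"):
        global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
        total_output = tf.Variable(0.0, dtype=tf.float32, trainable=False, name="total_output")

    with tf.name_scope("transformation"):
        a = tf.placeholder(tf.float32, shape=[None], name="input_placeholder_a")
        output = tf.add(tf.reduce_prod(a, name="product_b"),
                        tf.reduce_sum(a, name="sum_c"), name="output")

    update_total = total_output.assign_add(output)
    increment_step = global_step.assign_add(1)

    sess.run(tf.initialize_all_variables())
    _, step = sess.run([output, increment_step], feed_dict={a: [2, 8]})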
Try this: delete shape=[None]
a = tf.placeholder(tf.float32, name="input_placeholder_a")

Understanding Deep Learning model accuracy

I need help understanding the accuracy and the dataset output format for a deep learning model.
I did some deep learning training based on this site: https://machinelearningmastery.com/deep-learning-with-python2/
I did the examples for the pima-indian-diabetes dataset and the iris flower dataset. I trained my computer on the pima-indian-diabetes dataset using the script from this: http://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
Then I trained my computer on the iris-flower dataset using the script below.
# import packages
import numpy
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.callbacks import ModelCheckpoint

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# load dataset
dataframe = read_csv("iris_2.csv", header=None)
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

### one-hot encoder ###
dummy_y = np_utils.to_categorical(encoded_Y)

# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(4, input_dim=4, init='normal', activation='relu'))
    model.add(Dense(3, init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_json = model.to_json()
    with open("iris.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights('iris.h5')
    return model

estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=1000, batch_size=6, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
Everything worked fine until I decided to try another dataset from this link: https://archive.ics.uci.edu/ml/datasets/Glass+Identification
At first I trained this new dataset using the pima-indian-diabetes dataset example script and changed the values for the X and Y variables to this
dataset = numpy.loadtxt("glass.csv", delimiter=",")
X = dataset[:,0:10]
Y = dataset[:,10]
and also changed the neuron layers to this
model = Sequential()
model.add(Dense(10, input_dim=10, init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))
The result produced an accuracy of 32.71%.
Then I changed the output column of this dataset, which is originally integers (1~7), to strings (a~g), and used the example script for the iris-flower dataset with some modifications:
import numpy
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

seed = 7
numpy.random.seed(seed)

dataframe = read_csv("glass.csv", header=None)
dataset = dataframe.values
X = dataset[:,0:10].astype(float)
Y = dataset[:,10]

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

def create_baseline():
    model = Sequential()
    model.add(Dense(10, input_dim=10, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_json = model.to_json()
    with open("glass.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights('glass.h5')
    return model

estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=1000, batch_size=10, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
I did not use the 'dummy_y' variable, following this tutorial: http://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
I checked that the dataset uses letters as the output, and I thought that maybe I could reuse that script to train the new glass dataset that I had modified.
This time the results look like this:
Baseline : 68.42% (3.03%)
From the article, that 68% and 3% are the mean and standard deviation of the model accuracy.
My first question is: when do I use integers or letters as the output column? And is this kind of accuracy result common when we tamper with the dataset, like changing the output from integers to strings/letters?
My second question is: how do I know how many neurons I have to put in each layer? Is it related to which backend I use when compiling the model (TensorFlow or Theano)?
Thank you in advance.
First question
It doesn't matter, as you can see here:
Y = range(10)
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
print encoded_Y
Y = ['a', 'b', 'c', 'd', 'e', 'f','g','h','i','j']
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
print encoded_Y
results:
[0 1 2 3 4 5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9]
Which means that your classifier sees exactly the same labels.
Second question
There is no absolutely correct answer for this question, but for sure it does not depend on your backend.
You should try and experiment with different numbers of neurons, numbers of layers, types of layers, and all the other network parameters in order to understand what the best architecture for your problem is.
With experience you will develop both a good intuition for which parameters work better for which types of problems, and a good method for experimentation.
The best rule of thumb I've heard (assuming you have the dataset required to sustain such a strategy) is: "Make your network as large as you can until it overfits, add regularization until it does not overfit, repeat".
Let me answer in parts. First, if your output includes values in [0, 5], it is impossible to obtain them with the sigmoid activation; the sigmoid function has a range of [0, 1]. You could use a linear activation (i.e. no activation), but I think that is a bad approach, because your problem is not to estimate a continuous value.
Second, the question you should ask yourself is not so much about the type of data you are using (in the sense of how you store the information). Is it a string? Is it an int? Is it a float? It does not matter; what matters is what kind of problem you are trying to solve.
In this case, the problem should not be treated as a regression (estimating a continuous value). Your outputs are numbers, but they are categorical. You really want to classify between:
Type of glass: (class attribute).
For a classification problem, the following configuration is normally used:
The class is encoded with one-hot encoding. It is nothing more than a vector of 0's with a single 1 in the position of the corresponding class.
For instance: class 3 (counting from 0) with 6 classes -> [0, 0, 0, 1, 0, 0] (as many entries as classes you have).
As you can see now, we don't have a single output; your model must have as many outputs as your Y has classes (6 classes). That way the last layer should have as many neurons as classes: Dense(classes, ...).
You are also interested in the output being the probability of belonging to each class, that is: p(y = class_0), ..., p(y = class_n). For this, the softmax activation is used, which ensures that the sum of all the probabilities is 1.
You have to change the loss to categorical_crossentropy so that it can work together with softmax, and use the metric categorical_accuracy.
seed = 7
numpy.random.seed(seed)

dataframe = read_csv("glass.csv", header=None)
dataset = dataframe.values
X = dataset[:,0:10].astype(float)
Y = dataset[:,10]

encoder = LabelEncoder()
encoder.fit(Y)

from keras.utils import to_categorical
encoded_Y = to_categorical(encoder.transform(Y))

def create_baseline():
    model = Sequential()
    model.add(Dense(10, input_dim=10, init='normal', activation='relu'))
    model.add(Dense(encoded_Y.shape[1], init='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    model_json = model.to_json()
    with open("glass.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights('glass.h5')
    return model

model = create_baseline()
model.fit(X, encoded_Y, epochs=1000, batch_size=100)
The number of neurons does not depend on the backend you use.
But it is true that you will never get exactly the same results; that's because there are enough stochastic processes within a network: initialization, dropout (if you use it), batch order, etc.
What is known is that increasing the number of neurons per dense layer makes the model more complex, and therefore it has more potential to represent your problem, but it is harder to train and more expensive in both time and computation. So you always have to look for a balance.
At the moment there is no clear evidence on whether it is better to:
expand the number of neurons per layer, or
add more layers.
There are models that use one architecture and others the other.
Using this architecture you get the following result:
Epoch 1000/1000
214/214 [==============================] - 0s 17us/step - loss: 0.0777 - categorical_accuracy: 0.9953

Removing features with low variance using scikit-learn

scikit-learn provides various methods to remove descriptors; a basic method for this purpose is given in the tutorial below:
http://scikit-learn.org/stable/modules/feature_selection.html
but the tutorial does not provide any method or way to keep track of which features were removed and which were kept.
The code below has been taken from the tutorial.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
The example code above depicts only two descriptors (shape (6, 2)), but in my case I have a huge data frame with a shape of (51 rows, 9000 columns). After finding a suitable model I want to keep track of the useful and useless features, because I can save computational time when computing the features of the test dataset by calculating only the useful ones.
For example, when you perform machine learning modeling with WEKA 6.0, it provides remarkable flexibility over feature selection, and after removing the useless features you can get a list of the discarded features along with the useful ones.
thanks
Then, what you can do, if I'm not wrong, is:
In the case of VarianceThreshold, you can call the method fit instead of fit_transform. This will fit the data, and the resulting variances will be stored in vt.variances_ (assuming vt is your object).
Having a threshold, you can extract the features of the transformation as fit_transform would do:
X[:, vt.variances_ > threshold]
Or get the indexes as:
idx = np.where(vt.variances_ > threshold)[0]
Or as a mask
mask = vt.variances_ > threshold
PS: default threshold is 0
EDIT:
A more straightforward way to do this is by using the get_support method of the VarianceThreshold class. From the documentation:
get_support([indices]) Get a mask, or integer index, of the features selected
You should call this method after fit or fit_transform.
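A minimal sketch using the toy X from the tutorial above (fit first, then read the mask or the indices of the kept features):

# Minimal get_support sketch on the tutorial's toy data.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0],
              [0, 1, 1], [0, 1, 0], [0, 1, 1]])

vt = VarianceThreshold(threshold=.8 * (1 - .8))
vt.fit(X)

mask = vt.get_support()                  # boolean mask over the columns
kept_idx = vt.get_support(indices=True)  # integer indices of kept features
removed_idx = np.where(~mask)[0]         # indices of the discarded features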
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Just make a convenience function; this one wraps the VarianceThreshold
# transformer but you can pass it a pandas dataframe and get one in return
def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column)
                       for column
                       in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx]
                         for idx, _
                         in enumerate(remaining_columns)
                         if idx
                         in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        print("Found {0} low-variance columns."
              .format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed,
                                  columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index,
                              column=skip_columns[idx],
                              value=skipped_values[:, idx])
            print("Successfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")
        pass

    return dframe, removed_features
This worked for me. If you want to see exactly which columns remain after thresholding, you may use this method:
from sklearn.feature_selection import VarianceThreshold
threshold_n=0.95
sel = VarianceThreshold(threshold=(threshold_n* (1 - threshold_n) ))
sel_var=sel.fit_transform(data)
data[data.columns[sel.get_support(indices=True)]]
When testing features I wrote this simple function that tells me which variables remained in the data frame after the VarianceThreshold is applied.
from sklearn.feature_selection import VarianceThreshold
from itertools import compress

def fs_variance(df, threshold: float = 0.1):
    """
    Return a list of selected variables based on the threshold.
    """
    # The list of columns in the data frame
    features = list(df.columns)

    # Initialize and fit the method
    vt = VarianceThreshold(threshold=threshold)
    _ = vt.fit(df)

    # Get the column names which pass the threshold
    feat_select = list(compress(features, vt.get_support()))

    return feat_select
which returns a list of the selected column names, for example: ['col_2', 'col_14', 'col_17'].