Cloud-ML Job No such file or directory - google-cloud-ml

I have submitted a training job to Cloud ML, but it can't find the CSV file, even though the file is in the bucket. This is the code:
# Use scikit-learn to grid search the batch size and epochs
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier


def create_model():
    """Build and compile the Keras model that KerasClassifier wraps.

    Returns:
        A compiled Sequential model: Dense(12, relu) on 11 inputs followed
        by a single sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(12, input_dim=11, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return model


# Seed NumPy's global random number generator.
seed = 7
numpy.random.seed(seed)

# Load the dataset; columns 0-10 are the inputs, column 11 is the target.
FIL = "gs://bubbly-hexagon-112008-ml/dataset/mixed.csv"
dataset = numpy.loadtxt(FIL, delimiter=",")
X = dataset[:, 0:11]
Y = dataset[:, 11]

# Grid-search every (batch_size, nb_epoch) combination.
model = KerasClassifier(build_fn=create_model, verbose=1)
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100, 500, 1000]
param_grid = dict(batch_size=batch_size, nb_epoch=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X, Y)

# Report the best combination, then the score of every combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
After submitting the job, I get this error:
Traceback (most recent call last): File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main "__main__", fname, loader, pkg_name) File "/usr/lib/python2.7/runpy.py", line 72, in _run_code exec code in
run_globals File "/root/.local/lib/python2.7/
site-packages/trainer/task.py", line 18, in <module> dataset = numpy.loadtxt(FIL, delimiter=",") File "/root/.local/lib/python2.7/
site-packages/numpy/lib/npyio.py", line 803, in loadtxt fh = iter(open(fname, 'U')) IOError: [Errno 2] No such file or directory:
'gs://bubbly-hexagon-112008-ml/dataset/mixed.csv'
-The file is in the specified bucket and its permission includes cloud ml as reader.
-I also used gcloud beta ml init-project to initialize the project.
-I also created a new bucket and put the file in there, but got the same error.
-My bucket is in the same region as my submitted job.
Thanks

file_io from tensorflow works great:
from io import BytesIO  # needed by the numpy example below

from tensorflow.python.lib.io import file_io
import numpy as np
import json

# To read a numpy array:
# file_io.FileIO is opened in binary mode and its bytes are wrapped in a
# BytesIO before being handed to np.load.
with file_io.FileIO(path_npx, 'rb') as f:
    np_arr = np.load(BytesIO(f.read()))
print(np_arr)

# To read a json file:
with file_io.FileIO(path_json, 'r') as f:
    print(json.loads(f.read()))

You can't read directly from GCS like that; you need to use some sort of I/O library.
from io import BytesIO
import tensorflow as tf
import numpy as np
from tensorflow.python.lib.io import file_io

FIL = "gs://bubbly-hexagon-112008-ml/dataset/mixed.csv"
# Fetch the object's bytes through TensorFlow's file_io, then expose them to
# numpy as an in-memory file.
f = BytesIO(file_io.read_file_to_string(FIL, binary_mode=True))
# The file is comma-separated text, so it must be parsed with loadtxt;
# np.load only understands .npy/.npz/pickled data.
data = np.loadtxt(f, delimiter=",")

I don't think you can read gcs files directly with numpy.

Related

Error while loading .h5 model in Flask using keras

I have built a horse/human detector using a Keras CNN on Google Colab; the model worked and loaded perfectly on Colab. Now I am building a Flask application, and while loading the .h5 model file I was getting this error:
TypeError: __init__() got an unexpected keyword argument 'ragged'
I reinstalled Keras 2.3.1 using pip, and now I am getting a library error:
NameError: name 'six' is not defined
my App.py
# Import necessary libraries
from flask import Flask, render_template, request
import numpy as np
import os
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import load_model

# Load the model once, at import time
model = load_model("predictor.h5" )
print("## model loaded")


def pred_human_horse(model, horse_or_human):
    """Classify an image file as "Horse" or "Human".

    Args:
        model: loaded Keras model used for the prediction.
        horse_or_human: path of the image file to classify.

    Returns:
        "Horse" when the arg-max of the prediction is 0, otherwise "Human".
    """
    test_image = load_img(horse_or_human, target_size=(150, 150))  # resize
    print("## Got Image for predicton")
    test_image = img_to_array(test_image) / 255  # numpy array between 0-1
    test_image = np.expand_dims(test_image, axis=0)  # 4 dimension
    result = model.predict(test_image).round(3)  # rounding off
    pred = np.argmax(result)
    print("## Raw results = ", result)
    print("## class = ", pred)
    if pred == 0:
        return "Horse"
    else:
        return "Human"


# Create flask app
app = Flask(__name__)


# Routes must be registered with the @app.route decorator; as a plain
# "#app.route" comment the views were never attached to the app.
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the landing page."""
    return render_template("index.html")


@app.route("/predict", methods=["GET", "POST"])
def predict():
    """Save the uploaded image, classify it and render the result page."""
    if request.method == "POST":
        # get input image file
        file = request.files["image"]
        filename = file.filename
        print("## File recieved", filename)
        # save the file
        file_path = os.path.join("static/user_uploaded", filename)
        file.save(file_path)
        print("## Prediction...")
        # pred_human_horse needs the model as well as the image path.
        pred = pred_human_horse(model=model, horse_or_human=file_path)
        return render_template("predict.html", pred_output=pred, user_image=file_path)
    # A view must not return None; send non-POST requests to the upload page.
    return render_template("index.html")


if __name__ == "__main__":
    app.run(threaded=False)
Error I am getting
runfile('F:/INTERNSHIP/Crisp-Metric-MAY21/Human-horse-prediction/app.py', wdir='F:/INTERNSHIP/Crisp-Metric-MAY21/Human-horse-prediction')
Traceback (most recent call last):
File "<ipython-input-26-df590f092cb6>", line 1, in <module>
runfile('F:/INTERNSHIP/Crisp-Metric-MAY21/Human-horse-prediction/app.py', wdir='F:/INTERNSHIP/Crisp-Metric-MAY21/Human-horse-prediction')
File "C:\Users\DANIA NIAZI\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\DANIA NIAZI\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "F:/INTERNSHIP/Crisp-Metric-MAY21/Human-horse-prediction/app.py", line 13, in <module>
model = load_model("predictor.h5" )
File "C:\Users\DANIA NIAZI\Anaconda3\lib\site-packages\keras\engine\saving.py", line 492, in load_wrapper
File "C:\Users\DANIA NIAZI\Anaconda3\lib\site-packages\keras\engine\saving.py", line 582, in load_model
File "C:\Users\DANIA NIAZI\Anaconda3\lib\site-packages\keras\utils\io_utils.py", line 211, in is_supported_type
NameError: name 'six' is not defined
Maybe you should try installing the six package which will be installed when installing Django. Anyway you can install it using:
pip install six

Extract images in .jpg format from binary using unpickle (python)

I am trying to extract images from CIFAR-10 data binary file, i.e. data_batch_1.bin as .jpg.
But while doing unpickle I am getting an error.
My code is:
from PIL import Image
import numpy


def unpickle(file):
    """Open *file* in binary mode and return the object pickle.load reads."""
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo)
    return dict


def save_as_image(img_flat):
    """
    Displays a data blob as an image (despite the name, nothing is written
    to disk: the image is only shown with Image.show()).
    """
    # consecutive 1024 entries store color channels of 32x32 image
    img_R = img_flat[0:1024].reshape((32, 32))
    img_G = img_flat[1024:2048].reshape((32, 32))
    img_B = img_flat[2048:3072].reshape((32, 32))
    # Stack the three planes along a new last axis -> (32, 32, 3).
    img = numpy.dstack((img_R, img_G, img_B))
    im = Image.fromarray(img)
    im.show()


abc = unpickle("/home/ubuntu/visit/cifar-10-batches-bin/data_batch_1.bin")
#print(abc)
data = abc["data"]
save_as_image(data[0])
I am getting an error as follows:
Traceback (most recent call last):
File "load.py", line 24, in <module>
abc = unpickle("/home/ubuntu/visit/cifar-10-batches-bin/data_batch_1.bin")
File "load.py", line 7, in unpickle
dict = pickle.load(fo)
File "/usr/lib/python2.7/pickle.py", line 1378, in load
return Unpickler(file).load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
KeyError: '\x06'
What could be the cause of this issue?

python luigi localTarget pickle

I am running on Windows 7, Python 2.7 via Anaconda 4.3.17, Luigi 2.4.0, Pandas 0.18, sklearn version 0.18. Per below, I am trying to have a luigi.LocalTarget output be a pickle to store a few different objects (using firstJob) and then read from that pickle in a dependent job (secondJob). firstJob completes successfully if I run the following from the command line:
"python -m luigi --module luigiPickle firstJob --date 2017-06-07 --local-scheduler"
However, if I try running secondJob i.e.,
"python -m luigi --module luigiPickle secondJob --date 2017-06-07 --local-scheduler"
I get
Traceback (most recent call last):
File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", l
ine 191, in run
new_deps = self._run_get_new_deps()
File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", l
ine 129, in _run_get_new_deps
task_gen = self.task.run()
File "luigiPickle.py", line 41, in run
ret2 = pickle.load(inFile)
File "C:\Anaconda2\lib\pickle.py", line 1384, in load
return Unpickler(file).load()
File "C:\Anaconda2\lib\pickle.py", line 864, in load
dispatch[key](self)
File "C:\Anaconda2\lib\pickle.py", line 1096, in load_global
klass = self.find_class(module, name)
File "C:\Anaconda2\lib\pickle.py", line 1130, in find_class
__import__(module)
ImportError: No module named frame
It appears that luigi is having trouble reading the pickle due to not recognizing the pandas.DataFrame() object (perhaps a scope issue?).
import luigi
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression


class firstJob(luigi.Task):
    """Pickle a dict of two DataFrames and a LinearRegression to a LocalTarget."""

    date = luigi.DateParameter()

    def requires(self):
        return None

    def output(self):
        return luigi.LocalTarget('%s_first.pickle' % self.date)

    def run(self):
        ret = {}
        ret['a'] = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
        ret['b'] = pd.DataFrame({'a': [3, 4], 'd': [0, 0]})
        ret['c'] = LinearRegression()
        # The context manager closes the target even if pickling fails.
        with self.output().open('wb') as outFile:
            pickle.dump(ret, outFile, protocol=pickle.HIGHEST_PROTOCOL)


class secondJob(luigi.Task):
    """Read back the pickle written by firstJob for the same date."""

    date = luigi.DateParameter()

    def requires(self):
        return firstJob(self.date)

    def output(self):
        return luigi.LocalTarget('%s_second.pickle' % self.date)

    def run(self):
        with self.input().open('rb') as inFile:
            ret2 = pickle.load(inFile)


if __name__ == '__main__':
    luigi.run()
The luigi open command doesn't work with the b flag for binary- it strips it out of the options string. (not sure why). Better to just use standard open with the path attribute:
open(self.input().path, 'rb') and open(self.output().path, 'wb').
d6tflow solves this, see example for sklearn model pickle which answers this question. Plus you don't need to write all that boilerplate code.
import d6tflow
import luigi  # secondJob below declares a luigi.DateParameter


class firstJob(d6tflow.tasks.TaskPickle):

    def run(self):
        # your code
        self.save(ret)


# NOTE(review): TaskClass is a placeholder in this sketch; presumably a
# d6tflow task base class such as d6tflow.tasks.TaskPickle - TODO confirm.
class secondJob(TaskClass):
    date = luigi.DateParameter()

    def requires(self):
        # NOTE(review): firstJob above declares no `date` parameter; add one
        # there if the date should be forwarded - TODO confirm.
        return firstJob(self.date)

    def run(self):
        inFile = self.input().load()
        # use inFile


d6tflow.run([secondJob])

tensorflow SKCompat is not compatible with cross_val_score

I'm trying to use a tensorflow classifier with some tools from scikit learn, namely model_selection.cross_val_score. When I run the following code (adapted from this example from the tensorflow docs), I get a TypeError (see full traceback below).
From what I can tell the problem is that cross_val_score tries to clone the estimator by performing what amounts to estimator.__class__(**estimator.get_params(deep=True)). For some reason, SKCompat.get_params returns {}, the init method on the class requires one argument (as shown in the example code) so the operation blows up.
Am I doing something wrong? Or is this a bug with tensorflow?
Failing example
"""Example of DNNClassifier for Iris plant dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from sklearn import metrics
from sklearn import model_selection
import tensorflow as tf


def main(unused_argv):
    """Cross-validate an SKCompat-wrapped DNNClassifier on the iris dataset."""
    # Load dataset.
    iris = tf.contrib.learn.datasets.load_dataset('iris')

    # Build 3 layer DNN with 10, 20, 10 units respectively.
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
        iris.data)
    classifier = tf.contrib.learn.SKCompat(
        tf.contrib.learn.DNNClassifier(
            feature_columns=feature_columns,
            hidden_units=[10, 20, 10],
            n_classes=3
        )
    )

    # Fit and predict.
    scores = model_selection.cross_val_score(classifier, iris.data, iris.target,
                                             scoring='accuracy')
    print('Accuracy: {0:f}'.format(scores.mean()))


if __name__ == '__main__':
    tf.app.run()
Traceback
Traceback (most recent call last):
File "iris.py", line 49, in <module>
tf.app.run()
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "iris.py", line 44, in main
scoring='accuracy')
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 140, in cross_val_score
for train, test in cv_iter)
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 603, in dispatch_one_batch
tasks = BatchedCalls(itertools.islice(iterator, batch_size))
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 127, in __init__
self.items = list(iterator_slice)
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 140, in <genexpr>
for train, test in cv_iter)
File "/Users/Matt/.virtualenvs/numerai/lib/python2.7/site-packages/sklearn/base.py", line 70, in clone
new_object = klass(**new_object_params)
TypeError: __init__() takes exactly 2 arguments (1 given)
Versions
python: 2.7.3
tensorflow: 1.1.0
scikit-learn: 0.18.1

backtest with local data in zipline

I am using zipline to backtest with the local data, but it seems unsuccessful.
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory


class BuyApple(TradingAlgorithm):
    """Algorithm that orders one share of 'AAPL' on every handle_data call."""

    def handle_data(self, data):
        self.order('AAPL', 1)


if __name__ == '__main__':
    # Read the CSV with pandas defaults and run the algorithm on it.
    data = pd.read_csv('AAPL.csv')
    simple_algo = BuyApple()
    results = simple_algo.run(data)
Above is my code. When I run this script, I get this message:
[2015-04-03 01:41:53.712035] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:41:53.632300, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
Traceback (most recent call last):
File "bollinger.py", line 31, in <module>
results = simple_algo.run(data)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/algorithm.py", line 372, in run
source = DataFrameSource(source)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/sources/data_frame_source.py", line 42, in __init__
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
Then I changed my code to the following:
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory


class BuyApple(TradingAlgorithm):
    """Algorithm that orders one share of 'AAPL' on every handle_data call."""

    def handle_data(self, data):
        self.order('AAPL', 1)


if __name__ == '__main__':
    start = datetime(2000, 1, 9, 14, 30, 0, 0, pytz.utc)
    end = datetime(2001, 1, 10, 21, 0, 0, 0, pytz.utc)
    # Parse the first column as dates and use it as the index.
    data = pd.read_csv('AAPL.csv', parse_dates=True, index_col=0)
    sim_params = factory.create_simulation_parameters(
        start=start, end=end, capital_base=10000)
    sim_params.data_frequency = '1d'
    sim_params.emission_rate = '1d'
    # NOTE(review): sim_params is built above but is not passed to BuyApple()
    # or to run() below.
    simple_algo = BuyApple()
    results = simple_algo.run(data)
The
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
is gone. But my terminal still shows this message:
[2015-04-03 01:44:28.141657] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:44:28.028243, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
How to solve this problem? Thanks.
# Convert the index values to datetimes with pd.to_datetime.
data.index=pd.to_datetime(data.index)
# Localize the index to UTC with tz_localize.
data.index=data.index.tz_localize(pytz.utc)
The following code works for me. It is a version of the tutorial example "My first Algorithm" (http://www.zipline.io/tutorial/). Data must be in ascending order by date. Run it as a normal Python program (python yourfilename.py):
import pytz
from datetime import datetime
from zipline.algorithm import TradingAlgorithm
from zipline.api import order, record, symbol
import pandas as pd

# Load data manually csv
#Date,Open,High,Low,Close,Volume,Adj Close
#1984-09-07,26.5,26.87,26.25,26.5,2981600,3.02
#...


def parse(x):
    """Parse a 'YYYY-MM-DD' string into a UTC-localized datetime."""
    return pytz.utc.localize(datetime.strptime(x, '%Y-%m-%d'))


data = pd.read_csv('aapl.csv', parse_dates=['Date'], index_col=0, date_parser=parse)


# Define algorithm
def initialize(context):
    """No set-up is needed for this algorithm."""
    pass


def handle_data(context, data):
    """Order 10 units of 'Close' and record its value under the name AAPL."""
    order('Close', 10)
    record(AAPL=data['Close'])


# Create algorithm object passing in initialize and
# handle_data functions
algo_obj = TradingAlgorithm(initialize=initialize,
                            handle_data=handle_data)

# Run algorithm
perf_manual = algo_obj.run(data)
# Print
perf_manual.to_csv('output.csv'