backtest with local data in zipline - python-2.7

I am using zipline to backtest with the local data, but it seems unsuccessful.
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory
class BuyApple(TradingAlgorithm):
def handle_data(self, data):
self.order('AAPL', 1)
if __name__ == '__main__':
data = pd.read_csv('AAPL.csv')
simple_algo = BuyApple()
results = simple_algo.run(data)
Above is my code. When I run this script, I get the message:
[2015-04-03 01:41:53.712035] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:41:53.632300, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
Traceback (most recent call last):
File "bollinger.py", line 31, in <module>
results = simple_algo.run(data)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/algorithm.py", line 372, in run
source = DataFrameSource(source)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/sources/data_frame_source.py", line 42, in __init__
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
Then I change my code to below:
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory
class BuyApple(TradingAlgorithm):
def handle_data(self, data):
self.order('AAPL', 1)
if __name__ == '__main__':
start = datetime(2000, 1, 9, 14, 30, 0, 0, pytz.utc)
end = datetime(2001, 1, 10, 21, 0, 0, 0, pytz.utc)
data = pd.read_csv('AAPL.csv', parse_dates=True, index_col=0)
sim_params = factory.create_simulation_parameters(
start=start, end=end, capital_base=10000)
sim_params.data_frequency = '1d'
sim_params.emission_rate = '1d'
simple_algo = BuyApple()
results = simple_algo.run(data)
The
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
is gone. But my terminal keeps showing this message:
[2015-04-03 01:44:28.141657] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:44:28.028243, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
How to solve this problem? Thanks.

data.index=pd.to_datetime(data.index)
data.index=data.index.tz_localize(pytz.utc)

The following code works for me. It is a version of the tutorial example "My first Algorithm" (http://www.zipline.io/tutorial/). Data must be in ascending order by date. Run it as a normal Python program (python yourfilename.py):
import pytz
from datetime import datetime
from zipline.algorithm import TradingAlgorithm
from zipline.api import order, record, symbol
import pandas as pd

# Load data manually from a CSV file laid out like this:
#Date,Open,High,Low,Close,Volume,Adj Close
#1984-09-07,26.5,26.87,26.25,26.5,2981600,3.02
#...
# Each date string is parsed and given the UTC timezone, so the frame ends
# up with a timezone-aware datetime index built from the 'Date' column.
parse = lambda x: pytz.utc.localize(datetime.strptime(x, '%Y-%m-%d'))
data = pd.read_csv('aapl.csv', parse_dates=['Date'], index_col=0,
                   date_parser=parse)


# Define algorithm
def initialize(context):
    """Set up the algorithm; nothing needs initialising here."""
    pass


def handle_data(context, data):
    """Order 10 units on every bar and record the 'Close' entry.

    The CSV column name 'Close' is what gets passed to ``order`` and looked
    up in ``data``; the recorded series is stored under the name ``AAPL``.
    """
    order('Close', 10)
    record(AAPL=data['Close'])


# Create algorithm object passing in initialize and
# handle_data functions
algo_obj = TradingAlgorithm(initialize=initialize,
                            handle_data=handle_data)

# Run algorithm
perf_manual = algo_obj.run(data)

# Write the performance frame to disk (the closing parenthesis was missing).
perf_manual.to_csv('output.csv')

Related

error while running python 2.7 code for deepwalk

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from deepwalk import graph
from deepwalk import walks as serialized_walks
from walks import WalksCorpus
from gensim.models import Word2Vec
from deepwalk.skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
import psutil
from multiprocessing import cpu_count
p = psutil.Process(os.getpid())
try:
p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
try:
p.cpu_affinity(list(range(cpu_count())))
except AttributeError:
pass
logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
def debug(type_, value, tb):
    """Exception hook that opens a post-mortem debugger when attached to a terminal.

    When running inside an interactive interpreter (``sys.ps1`` exists) or
    when stderr is not a tty, the exception is handed to the default hook
    ``sys.__excepthook__``. Otherwise the traceback is printed and ``pdb``
    is started in post-mortem mode.

    Args:
        type_: exception class.
        value: exception instance.
        tb: traceback object.
    """
    use_default_hook = hasattr(sys, 'ps1') or not sys.stderr.isatty()
    if use_default_hook:
        sys.__excepthook__(type_, value, tb)
        return

    # Imported lazily: only needed on the debugging path.
    import pdb
    import traceback

    traceback.print_exception(type_, value, tb)
    print(u"\n")
    pdb.pm()
def process(args):
if args.format == "adjlist":
G = graph.load_adjacencylist(args.input, undirected=args.undirected)
elif args.format == "edgelist":
G = graph.load_edgelist(args.input, undirected=args.undirected)
elif args.format == "mat":
G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
else:
raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * args.number_walks
print("Number of walks: {}".format(num_walks))
data_size = num_walks * args.walk_length
print("Data size (walks*length): {}".format(data_size))
if data_size < args.max_memory_data_size:
print("Walking...")
walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
print("Training...")
model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
else:
print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
print("Walking...")
walks_filebase = args.output + ".walks"
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
num_workers=args.workers)
print("Counting vertex frequency...")
if not args.vertex_freq_degree:
vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
else:
# use degree distribution for frequency in tree
vertex_counts = G.degree(nodes=G.iterkeys())
print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=args.representation_size,
window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
model.wv.save_word2vec_format(args.output)
def _str2bool(text):
    """Convert a command-line string such as 'true'/'false' to a bool."""
    # Local import: the file's top-level argparse import does not include it.
    from argparse import ArgumentTypeError

    lowered = text.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with the previous ``type=bool``.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise ArgumentTypeError("expected a boolean value, got %r" % text)


def main():
    """Parse the deepwalk command line, configure logging and run ``process``.

    Exits through ``parser.error`` (status 2) when ``--log`` does not name a
    level of the ``logging`` module. Installs ``debug`` as ``sys.excepthook``
    when ``--debug`` is given.
    """
    parser = ArgumentParser("deepwalk",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')

    parser.add_argument("--debug", dest="debug", action='store_true', default=False,
                        help="drop a debugger if an exception is raised.")

    parser.add_argument('--format', default='adjlist',
                        help='File format of input file')

    parser.add_argument('--input', nargs='?', required=True,
                        help='Input graph file')

    parser.add_argument("-l", "--log", dest="log", default="INFO",
                        help="log verbosity level")

    parser.add_argument('--matfile-variable-name', default='network',
                        help='variable name of adjacency matrix inside a .mat file.')

    parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
                        help='Size to start dumping walks to disk, instead of keeping them in memory.')

    parser.add_argument('--number-walks', default=10, type=int,
                        help='Number of random walks to start at each node')

    parser.add_argument('--output', required=True,
                        help='Output representation file')

    parser.add_argument('--representation-size', default=64, type=int,
                        help='Number of latent dimensions to learn for each node.')

    parser.add_argument('--seed', default=0, type=int,
                        help='Seed for random walk generator.')

    # ``type=bool`` would turn every non-empty string (including "False")
    # into True, so the value is parsed explicitly instead.
    parser.add_argument('--undirected', default=True, type=_str2bool,
                        help='Treat graph as undirected.')

    parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
                        help='Use vertex degree to estimate the frequency of nodes '
                             'in the random walks. This option is faster than '
                             'calculating the vocabulary.')

    parser.add_argument('--walk-length', default=40, type=int,
                        help='Length of the random walk started at each node')

    parser.add_argument('--window-size', default=5, type=int,
                        help='Window size of skipgram model.')

    parser.add_argument('--workers', default=1, type=int,
                        help='Number of parallel processes.')

    args = parser.parse_args()

    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        # An unknown name used to reach setLevel(None) and crash there.
        parser.error("invalid log level: %r" % args.log)
    logging.basicConfig(format=LOGFORMAT)
    logger.setLevel(numeric_level)

    if args.debug:
        sys.excepthook = debug

    process(args)
if __name__ == "__main__":
sys.exit(main())
Error:
Traceback (most recent call last):
File "main.py", line 165, in <module>
sys.exit(main())
File "main.py", line 162, in main
process(args)
File "main.py", line 93, in process
walks_corpus = serialized_walks.WalksCorpus(walk_files)
AttributeError: 'module' object has no attribute 'WalksCorpus'
Why do I get this error?
It looks as though you are importing WalksCorpus on its own from walks, with from walks import WalksCorpus. Then, when you try to use WalksCorpus, you are looking for it within serialized_walks, which I assume does not have WalksCorpus in it.
Try changing this line.
walks_corpus = serialized_walks.WalksCorpus(walk_files)
To:
walks_corpus = WalksCorpus(walk_files)

python luigi localTarget pickle

I am running on Windows 7, Python 2.7 via Anaconda 4.3.17, Luigi 2.4.0, Pandas 0.18, sklearn version 0.18. Per below, I am trying to have a luigi.LocalTarget output be a pickle to store a few different objects (using firstJob) and then read from that pickle in a dependent job (secondJob). firstJob completes successfully if I run the following from the command line:
"python -m luigi --module luigiPickle firstJob --date 2017-06-07 --local-scheduler"
However, if I try running secondJob i.e.,
"python -m luigi --module luigiPickle secondJob --date 2017-06-07 --local-scheduler"
I get
Traceback (most recent call last):
File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", line 191, in run
new_deps = self._run_get_new_deps()
File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", line 129, in _run_get_new_deps
task_gen = self.task.run()
File "luigiPickle.py", line 41, in run
ret2 = pickle.load(inFile)
File "C:\Anaconda2\lib\pickle.py", line 1384, in load
return Unpickler(file).load()
File "C:\Anaconda2\lib\pickle.py", line 864, in load
dispatch[key](self)
File "C:\Anaconda2\lib\pickle.py", line 1096, in load_global
klass = self.find_class(module, name)
File "C:\Anaconda2\lib\pickle.py", line 1130, in find_class
__import__(module)
ImportError: No module named frame
It appears that luigi is having trouble reading the pickle due to not recognizing the pandas.DataFrame() object (perhaps a scope issue?).
import luigi
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
class firstJob(luigi.Task):
date = luigi.DateParameter()
def requires(self):
return None
def output(self):
return luigi.LocalTarget('%s_first.pickle' % self.date)
def run(self):
ret = {}
ret['a'] = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
ret['b'] = pd.DataFrame({'a': [3, 4], 'd': [0, 0]})
ret['c'] = LinearRegression()
outFile = self.output().open('wb')
pickle.dump(ret, outFile, protocol=pickle.HIGHEST_PROTOCOL)
outFile.close()
class secondJob(luigi.Task):
date = luigi.DateParameter()
def requires(self):
return firstJob(self.date)
def output(self):
return luigi.LocalTarget('%s_second.pickle' % self.date)
def run(self):
inFile = self.input().open('rb')
ret2 = pickle.load(inFile)
inFile.close()
if __name__ == '__main__':
luigi.run()
The luigi open command doesn't work with the b flag for binary: it strips it out of the options string (not sure why). It is better to just use the standard open with the path attribute:
open(self.input().path, 'rb') and open(self.output().path, 'wb').
d6tflow solves this, see example for sklearn model pickle which answers this question. Plus you don't need to write all that boilerplate code.
import d6tflow
class firstJob(d6tflow.tasks.TaskPickle):
def run(self):
# your code
self.save(ret)
class secondJob(TaskClass):
date = luigi.DateParameter()
def requires(self):
return firstJob(self.date)
def run(self):
inFile = self.input().load()
# use inFile
d6tflow.run([secondJob])

Cloud-ML Job No such file or directory

I have submitted a training job to Cloud ML, but it can't find the CSV file, even though it is there in the bucket. This is the code:
# Use scikit-learn to grid search the batch size and epochs
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
def create_model():
model = Sequential()
model.add(Dense(12, input_dim=11, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
return model
seed = 7
numpy.random.seed(seed)
FIL = "gs://bubbly-hexagon-112008-ml/dataset/mixed.csv"
dataset = numpy.loadtxt(FIL, delimiter=",")
X = dataset[:,0:11]
Y = dataset[:,11]
model = KerasClassifier(build_fn=create_model, verbose=1)
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100, 500, 1000]
param_grid = dict(batch_size=batch_size, nb_epoch=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X, Y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
After submitting the job I get this error:
Traceback (most recent call last):
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 18, in <module>
dataset = numpy.loadtxt(FIL, delimiter=",")
File "/root/.local/lib/python2.7/site-packages/numpy/lib/npyio.py", line 803, in loadtxt
fh = iter(open(fname, 'U'))
IOError: [Errno 2] No such file or directory: 'gs://bubbly-hexagon-112008-ml/dataset/mixed.csv'
-The file is in the specified bucket and its permission includes cloud ml as reader.
-I also used gcloud beta ml init-project to initialize the project.
-And i created a new bucket and put the file in there, but got the same error.
-My bucket is in the same region as my submitted job.
Thanks
file_io from tensorflow works great:
# BytesIO is needed by the numpy example below, which wraps the bytes read
# through file_io in a file-like object.
from io import BytesIO

from tensorflow.python.lib.io import file_io
import numpy as np
import json
To read a numpy array:
with file_io.FileIO(path_npx, 'rb') as f:
np_arr = np.load( BytesIO(f.read()) )
print(np_arr)
To read a json file:
with file_io.FileIO(path_json, 'r') as f:
print(json.loads(f.read()))
You can't read directly from GCS like that; you need to use some sort of I/O library.
from io import BytesIO
import tensorflow as tf
import numpy as np
from tensorflow.python.lib.io import file_io

FIL = "gs://bubbly-hexagon-112008-ml/dataset/mixed.csv"

# Read the object's raw bytes through TensorFlow's file_io and wrap them in
# an in-memory file so numpy can consume them.
f = BytesIO(file_io.read_file_to_string(FIL, binary_mode=True))

# The file is comma-separated text, so it is parsed with loadtxt (as in the
# question); np.load only understands .npy/.npz/pickle data.
data = np.loadtxt(f, delimiter=",")
I don't think you can read gcs files directly with numpy.

Python3 pickle serialization with Cmd

I am new to Python and as my first project I am attempting to convert a Python2 script to Python3.
The script is failing when it attempts to serialize a class using pickle.
It seems as though it is failing as I am trying to save a class which uses the Cmd CLI.
This code works using Python2.
Can anyone tell me what is wrong with the script and how I fix it?
import sys
import cmd
try:
import pickle as pickle
except:
import pickle
import os.path
def main():
app = Labyrinth()
turnfile = "turn0.lwot"
app.Save(turnfile)
class CLI(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
class Labyrinth(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
def Save(self, fname):
with open(fname, 'wb') as f:
pickle.dump(self,f, 2)
f.close()
print ("Save Successful!")
sys.exit()
if __name__ == '__main__':
main()
Not all objects are picklable. In particular, file objects are problematic because you can't generally restore their state later. cmd.Cmd holds stdin and stdout file objects and that should make them unpicklable. I was quite surprised that it worked in python 2, but it didn't really... Even though the stdin and stdout pickled, the unpickled object you get back later doesn't work, as in this example:
>>> import sys
>>> import pickle
>>> sys.stdout.write('foo\n')
foo
>>> serialized = pickle.dumps(sys.stdout, 2)
>>> stdout = pickle.loads(serialized)
>>> stdout.write('bar\n')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: I/O operation on closed file
>>>
So, even though this bit of code didn't fail, the object shouldn't be usable later. You can add a few special methods to an object that let you fix objects so they can be serialized. Here, I've stripped the bad attributes on save and added them back on restore. Now you can pickle, unpickle and it actually works when you are done.
import sys
import cmd
try:
    import cPickle as pickle
except ImportError:
    # cPickle only exists on Python 2; fall back to the plain module.
    import pickle
import os.path


def main():
    """Create a Labyrinth and save it to the first turn file."""
    app = Labyrinth()
    turnfile = "turn0.lwot"
    app.Save(turnfile)


class CLI(cmd.Cmd):
    """Plain ``cmd.Cmd`` subclass."""

    def __init__(self):
        cmd.Cmd.__init__(self)


class Labyrinth(cmd.Cmd):
    """``cmd.Cmd`` application that can be pickled.

    ``cmd.Cmd`` stores the ``stdin``/``stdout`` file objects on the instance;
    they are dropped when pickling and re-attached from ``sys`` on load.
    """

    def __init__(self):
        cmd.Cmd.__init__(self)

    def Save(self, fname):
        """Pickle this object to ``fname``, report success and exit."""
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(fname, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        print("Save Successful!")
        sys.exit()

    def __getstate__(self):
        """Return the instance attributes without ``stdin``/``stdout``."""
        # stdin/out are unpicklable. We'll get new ones on load
        return {k: v for k, v in self.__dict__.items()
                if k not in ('stdin', 'stdout')}

    def __setstate__(self, state):
        """Restore attributes and attach the current ``sys.stdin``/``sys.stdout``.

        ``state`` may be a dict or an iterable of ``(key, value)`` pairs;
        ``dict.update`` accepts both.
        """
        self.__dict__.update(state)
        self.stdin = sys.stdin
        self.stdout = sys.stdout


if __name__ == '__main__':
    main()
Playing with the protocol doesn't help. The full error message (which you should have included) is:
1027:~/mypy$ python3 stack41334887.py
Traceback (most recent call last):
File "stack41334887.py", line 33, in <module>
main()
File "stack41334887.py", line 14, in main
app.Save(turnfile)
File "stack41334887.py", line 27, in Save
pickle.dump(self,f, 3, fix_imports=True)
TypeError: cannot serialize '_io.TextIOWrapper' object
Python3 made some major changes in the io system. This TextIOWrapper is, I think new to Py3.
https://docs.python.org/3.1/library/io.html#io.TextIOWrapper
The question "Can I use multiprocessing.Pool in a method of a class?" also had problems serializing a TextIOWrapper.
=========
So, inspired by @tdelaney, I checked the stdin for my PY3 session:
In [1212]: sys.stdin
Out[1212]: <_io.TextIOWrapper name='<stdin>' mode='r' encoding='UTF-8'>
So that's the thing that can't be serialized.

Attribute error while implementing filter in django manager

I have a manager in my django project and am implementing a filter as below.
I start the django shell and get this error:
>>> from django.http import HttpRequest
>>> r=HttpRequest()
>>> r.session=()
>>> from movierating.models import *
>>> RatingModel.statManager.RatingTimeLine(r)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/data/dashapp/movierating/managers.py", line 27, in RatingTimeLine
(data,display,err)=self.getdatacache(request)
File "/data/dashapp/movierating/managers.py", line 11, in getdatacache
filterDict=request.session.get('RatingFilter',{})
AttributeError: 'tuple' object has no attribute 'get'
>>> from django.http import HttpRequest
>>> r=HttpRequest()
>>> r.session()
Traceback (most recent call last):
File "<console>", line 1, in <module>
AttributeError: 'HttpRequest' object has no attribute 'session'
The file managers.py looks like this:
import json,datetime
from django.db import models
from django.db.models import *
from pandas import *
from urllib import urlencode
import hashlib
from django.core.cache import cache
from web.extras import *
class RatingManager(models.Manager):
    """Manager that aggregates ratings per day and prepares chart data."""

    def getdatacache(self,request):
        """Return ``(data, display, err)`` for the filter stored in the session.

        ``data`` is a DataFrame with one row per date holding
        ``RatingCounts``, ``RatingSum`` and the derived ``AverageRating``.
        ``display`` and ``err`` are passed through from ``normalizeFilter``,
        which is defined outside this block (presumably in ``web.extras``).
        The frame is cached under a key derived from the applied filter.
        """
        filterDict=request.session.get('RatingFilter',{})
        (fapply,display,err)=normalizeFilter(self.model,filterDict)
        # Sort the items so the same filter always yields the same cache key,
        # whatever the dict's iteration order.
        cache_id=urlencode(sorted(fapply.items()))
        cache_id=hashlib.md5('RatingFilter'+cache_id).hexdigest()
        # Actually consult the cache; before, the value was written but
        # never read back.
        data=cache.get(cache_id)
        if data is None:
            res=self.model.objects.order_by('date').filter(**fapply).values('date').annotate(
                RatingCounts = Count('rating'),
                RatingSum = Sum('rating'),
            )
            # Naming the columns keeps them present when the queryset is
            # empty, so the division below cannot fail with a KeyError.
            data=DataFrame(list(res),columns=['date','RatingCounts','RatingSum'])
            data['AverageRating']=data['RatingSum']/data['RatingCounts']
            cache.set(cache_id,data)
        return (data,display,err)

    def RatingTimeLine(self,request):
        """Build the JSON payload for the average-rating-per-day chart.

        Returns a dict ``{'data': <json string>}``. The JSON is produced with
        ``SeriesEncoder``, which is defined outside this block.
        """
        jsondata={}
        jsondata['chartconfig']={}
        jsondata['chartconfig']['title']="Average Movie Rating per Day"
        jsondata['chartconfig']['names']=['AverageRating']
        (data,display,err)=self.getdatacache(request)
        jsondata['chartconfig']['errors']="<br/>".join(err)
        jsondata['chartconfig']['subtitle']="<br/>".join(display)
        # The frame's column is 'date' (from .values('date')); 'data' does
        # not exist and raised a KeyError.
        jsondata['series']=data[['date','AverageRating']].values.tolist()
        data=json.dumps(jsondata,cls = SeriesEncoder)
        return {'data':data}
I have this model in my models.py:
class RatingModel(models.Model):
movie=models.ForeignKey(MovieModel)
user=models.ForeignKey(userModel)
rating=models.IntegerField()
date=models.DateField()
objects=models.Manager() #default manager
statManager=RatingManager() #new manager class
# FilterMapping={
#'movie':'movie__name', #django relational
# }
def __unicode__(self):
return "id: %d rating for movie is %s" %(self.id,self.movie.name) #relationships
class Meta:
app_label = 'movierating'
db_table = 'rating'
What could be the possible error in this line?
filterDict=request.session.get('RatingFilter',{})
In your third line you say r.session=() - that is, you're assigning an empty tuple to r.session. Your code later tries to call get() on this tuple, which isn't a supported operation.
I'm not sure what you're trying to do here. request.session is normally filled in by the session middleware. If you're trying to test your code programmatically, you might want to look into the Django testing framework, which supports sessions.