#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from deepwalk import graph
from deepwalk import walks as serialized_walks
from walks import WalksCorpus
from gensim.models import Word2Vec
from deepwalk.skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
import psutil
from multiprocessing import cpu_count
p = psutil.Process(os.getpid())
try:
p.set_cpu_affinity(list(range(cpu_count())))
except AttributeError:
try:
p.cpu_affinity(list(range(cpu_count())))
except AttributeError:
pass
logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
def debug(type_, value, tb):
if hasattr(sys, 'ps1') or not sys.stderr.isatty():
sys.__excepthook__(type_, value, tb)
else:
import traceback
import pdb
traceback.print_exception(type_, value, tb)
print(u"\n")
pdb.pm()
def process(args):
if args.format == "adjlist":
G = graph.load_adjacencylist(args.input, undirected=args.undirected)
elif args.format == "edgelist":
G = graph.load_edgelist(args.input, undirected=args.undirected)
elif args.format == "mat":
G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
else:
raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * args.number_walks
print("Number of walks: {}".format(num_walks))
data_size = num_walks * args.walk_length
print("Data size (walks*length): {}".format(data_size))
if data_size < args.max_memory_data_size:
print("Walking...")
walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
print("Training...")
model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
else:
print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
print("Walking...")
walks_filebase = args.output + ".walks"
walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
num_workers=args.workers)
print("Counting vertex frequency...")
if not args.vertex_freq_degree:
vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
else:
# use degree distribution for frequency in tree
vertex_counts = G.degree(nodes=G.iterkeys())
print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=args.representation_size,
window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
model.wv.save_word2vec_format(args.output)
def main():
parser = ArgumentParser("deepwalk",
formatter_class=ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument("--debug", dest="debug", action='store_true', default=False,
help="drop a debugger if an exception is raised.")
parser.add_argument('--format', default='adjlist',
help='File format of input file')
parser.add_argument('--input', nargs='?', required=True,
help='Input graph file')
parser.add_argument("-l", "--log", dest="log", default="INFO",
help="log verbosity level")
parser.add_argument('--matfile-variable-name', default='network',
help='variable name of adjacency matrix inside a .mat file.')
parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
help='Size to start dumping walks to disk, instead of keeping them in memory.')
parser.add_argument('--number-walks', default=10, type=int,
help='Number of random walks to start at each node')
parser.add_argument('--output', required=True,
help='Output representation file')
parser.add_argument('--representation-size', default=64, type=int,
help='Number of latent dimensions to learn for each node.')
parser.add_argument('--seed', default=0, type=int,
help='Seed for random walk generator.')
parser.add_argument('--undirected', default=True, type=bool,
help='Treat graph as undirected.')
parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
help='Use vertex degree to estimate the frequency of nodes '
'in the random walks. This option is faster than '
'calculating the vocabulary.')
parser.add_argument('--walk-length', default=40, type=int,
help='Length of the random walk started at each node')
parser.add_argument('--window-size', default=5, type=int,
help='Window size of skipgram model.')
parser.add_argument('--workers', default=1, type=int,
help='Number of parallel processes.')
args = parser.parse_args()
numeric_level = getattr(logging, args.log.upper(), None)
logging.basicConfig(format=LOGFORMAT)
logger.setLevel(numeric_level)
if args.debug:
sys.excepthook = debug
process(args)
if __name__ == "__main__":
sys.exit(main())
Error:
Traceback (most recent call last):
  File "main.py", line 165, in <module>
    sys.exit(main())
  File "main.py", line 162, in main
    process(args)
  File "main.py", line 93, in process
    walks_corpus = serialized_walks.WalksCorpus(walk_files)
AttributeError: 'module' object has no attribute 'WalksCorpus'
Why do I get this error?
It looks as though you are importing WalksCorpus on its own from walks with from walks import WalksCorpus. Then, when you try to use WalksCorpus, you look it up inside serialized_walks (the deepwalk.walks module), which apparently does not have a WalksCorpus attribute in the version you have installed.
Try changing this line.
walks_corpus = serialized_walks.WalksCorpus(walk_files)
To:
walks_corpus = WalksCorpus(walk_files)
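For context, here is a minimal sketch of that part of process() with the one-line change applied; it assumes the WalksCorpus class imported at the top of the script (from walks import WalksCorpus) is the one you intend to use:
print("Training...")
walks_corpus = WalksCorpus(walk_files)  # was: serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                 size=args.representation_size, window=args.window_size,
                 min_count=0, trim_rule=None, workers=args.workers)
Alternatively, if the deepwalk version you have installed does define WalksCorpus inside deepwalk.walks (newer releases do), you could keep serialized_walks.WalksCorpus(walk_files) and drop the separate from walks import WalksCorpus line instead.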
I'm not able to run a Python pipeline through the Airflow BeamRunPythonPipelineOperator. Below is my complete code:
DAG FILE
import os
from datetime import datetime, timedelta
from airflow.utils.dates import days_ago
from airflow import DAG
from airflow.providers.google.cloud.operators.dataflow import DataflowConfiguration
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator
default_args = {
"owner": "<...>",
"start_date": days_ago(1),
'dataflow_default_options': {
"project": "<...>",
}
}
dag = DAG(
dag_id="word_count",
default_args=default_args,
schedule_interval="@once"
)
start_python_pipeline_dataflow_runner = BeamRunPythonPipelineOperator(
task_id="start_python_pipeline_dataflow_runner",
runner="DataflowRunner",
py_file="gs://<...>/word_count.py",
pipeline_options={
'input':"gs://<...>/kinglear.txt",
'output':"gs://<...>/output.txt",
'temp_location':"gs://<...>/temp/",
'staging_location':"gs://<...>/temp/",
},
py_options=[],
py_requirements=['apache-beam[gcp]==2.26.0'],
py_interpreter='python3',
py_system_site_packages=False,
dataflow_config=DataflowConfiguration(
job_name='{{task.task_id}}', project_id="<...>", location="us-central1"
),
dag=dag,
)
Python File (word_count.py)
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://<...>/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<...>/output.txt',
help='Output file to write results to.')
argv = [
'--project=<...>',
'--region=us-central1',
'--runner=DataflowRunner',
'--staging_location=gs://<...>/temp/',
'--temp_location=gs://<...>/temp/',
'--template_location=gs://<...>/templates/word_count_template'
]
known_args, pipeline_args = parser.parse_known_args(argv)
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(argv=argv,options=pipeline_options) as p:
# Read the text file[pattern] into a PCollection.
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
Below is the screenshot of the composer:
I am not able to see the Dataflow job in the console, nor the word-count results in the bucket. Could anyone suggest the right approach or point out what I am missing?
Your DAG is OK; the problem is in the Beam Python file. The Dataflow args are being sent through argv, which is why the job is never submitted: you are passing argv straight to beam.Pipeline. The better approach is to extend pipeline_args instead.
Following is the fixed code:
word_count.py :
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import os
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://dataflow-samples/shakespeare/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<bucket>/newoutput',
help='Output file to write results to.')
#argv = [
# '--project=<...>',
# '--region=us-central1',
# '--runner=DataflowRunner',
# '--staging_location=gs://<...>/temp/',
# '--temp_location=gs://<...>/temp/',
# '--template_location=gs://<...>/templates/word_count_template'
# ]
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
'--runner=DataflowRunner',
'--project=<project-name>',
'--region=<region>',
'--staging_location=gs://<bucket>/',
'--temp_location=gs://<bucket>/temp',
'--job_name=your-wordcount-job',
])
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(options=pipeline_options) as p:
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
I am running on Windows 7, Python 2.7 via Anaconda 4.3.17, Luigi 2.4.0, Pandas 0.18, sklearn version 0.18. Per below, I am trying to have a luigi.LocalTarget output be a pickle to store a few different objects (using firstJob) and then read from that pickle in a dependent job (secondJob). firstJob completes successfully if I run the following from the command line:
"python -m luigi --module luigiPickle firstJob --date 2017-06-07 --local-scheduler"
However, if I try running secondJob i.e.,
"python -m luigi --module luigiPickle secondJob --date 2017-06-07 --local-scheduler"
I get
Traceback (most recent call last):
  File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", line 191, in run
    new_deps = self._run_get_new_deps()
  File "C:\Anaconda2\lib\site-packages\luigi-2.4.0-py2.7.egg\luigi\worker.py", line 129, in _run_get_new_deps
    task_gen = self.task.run()
  File "luigiPickle.py", line 41, in run
    ret2 = pickle.load(inFile)
  File "C:\Anaconda2\lib\pickle.py", line 1384, in load
    return Unpickler(file).load()
  File "C:\Anaconda2\lib\pickle.py", line 864, in load
    dispatch[key](self)
  File "C:\Anaconda2\lib\pickle.py", line 1096, in load_global
    klass = self.find_class(module, name)
  File "C:\Anaconda2\lib\pickle.py", line 1130, in find_class
    __import__(module)
ImportError: No module named frame
It appears that luigi is having trouble reading the pickle due to not recognizing the pandas.DataFrame() object (perhaps a scope issue?).
import luigi
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
class firstJob(luigi.Task):
date = luigi.DateParameter()
def requires(self):
return None
def output(self):
return luigi.LocalTarget('%s_first.pickle' % self.date)
def run(self):
ret = {}
ret['a'] = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
ret['b'] = pd.DataFrame({'a': [3, 4], 'd': [0, 0]})
ret['c'] = LinearRegression()
outFile = self.output().open('wb')
pickle.dump(ret, outFile, protocol=pickle.HIGHEST_PROTOCOL)
outFile.close()
class secondJob(luigi.Task):
date = luigi.DateParameter()
def requires(self):
return firstJob(self.date)
def output(self):
return luigi.LocalTarget('%s_second.pickle' % self.date)
def run(self):
inFile = self.input().open('rb')
ret2 = pickle.load(inFile)
inFile.close()
if __name__ == '__main__':
luigi.run()
The luigi open() method doesn't work with the b flag for binary mode; it strips it out of the options string (I'm not sure why). It's better to just use the standard open() with the target's path attribute:
open(self.input().path, 'rb') and open(self.output().path, 'wb').
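As a minimal sketch (reusing the classes from the question), secondJob.run() would then look something like this:
def run(self):
    # Read the upstream pickle via the target's filesystem path instead of LocalTarget.open()
    with open(self.input().path, 'rb') as inFile:
        ret2 = pickle.load(inFile)
    # ... work with ret2 here ...
    # Write this task's own output the same way so luigi sees the target as complete
    with open(self.output().path, 'wb') as outFile:
        pickle.dump(ret2, outFile, protocol=pickle.HIGHEST_PROTOCOL)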
d6tflow solves this; see its example of pickling an sklearn model, which answers this question. Plus, you don't need to write all that boilerplate code.
import d6tflow
class firstJob(d6tflow.tasks.TaskPickle):
def run(self):
# your code
self.save(ret)
class secondJob(TaskClass):
date = luigi.DateParameter()
def requires(self):
return firstJob(self.date)
def run(self):
inFile = self.input().load()
# use inFile
d6tflow.run([secondJob])
I am new to Python and as my first project I am attempting to convert a Python2 script to Python3.
The script is failing when it attempts to serialize a class using pickle.
It seems to be failing because I am trying to save a class which uses the Cmd CLI.
This code works using Python2.
Can anyone tell me what is wrong with the script and how I fix it?
import sys
import cmd
try:
import pickle as pickle
except:
import pickle
import os.path
def main():
app = Labyrinth()
turnfile = "turn0.lwot"
app.Save(turnfile)
class CLI(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
class Labyrinth(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
def Save(self, fname):
with open(fname, 'wb') as f:
pickle.dump(self,f, 2)
f.close()
print ("Save Successful!")
sys.exit()
if __name__ == '__main__':
main()
Not all objects are picklable. In particular, file objects are problematic because you can't generally restore their state later. cmd.Cmd holds stdin and stdout file objects, and that should make it unpicklable. I was quite surprised that it worked in Python 2, but it didn't really: even though stdin and stdout pickled, the unpickled object you get back later doesn't work, as in this example:
>>> import sys
>>> import pickle
>>> sys.stdout.write('foo\n')
foo
>>> serialized = pickle.dumps(sys.stdout, 2)
>>> stdout = pickle.loads(serialized)
>>> stdout.write('bar\n')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: I/O operation on closed file
>>>
So, even though this bit of code didn't fail, the object wouldn't be usable later. You can add a couple of special methods to an object that let you fix it up so it can be serialized. Here, I strip the problematic attributes in __getstate__ on save and add them back in __setstate__ on restore. Now you can pickle and unpickle, and the object actually works when you are done.
import sys
import cmd
try:
import cPickle as pickle
except:
import pickle
import os.path
def main():
app = Labyrinth()
turnfile = "turn0.lwot"
app.Save(turnfile)
class CLI(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
class Labyrinth(cmd.Cmd):
def __init__(self):
cmd.Cmd.__init__(self)
def Save(self, fname):
with open(fname, 'wb') as f:
pickle.dump(self,f, pickle.HIGHEST_PROTOCOL)
f.close()
print ("Save Successful!")
sys.exit()
def __getstate__(self):
# stdin/out are unpicklable. We'll get new ones on load
return tuple(((k,v) for k,v in self.__dict__.items()
if k not in ('stdin', 'stdout')))
def __setstate__(self, state):
self.__dict__.update(state)
self.stdin = sys.stdin
self.stdout = sys.stdout
if __name__ == '__main__':
main()
Playing with the protocol doesn't help. The full error message (which you should have included) is:
1027:~/mypy$ python3 stack41334887.py
Traceback (most recent call last):
File "stack41334887.py", line 33, in <module>
main()
File "stack41334887.py", line 14, in main
app.Save(turnfile)
File "stack41334887.py", line 27, in Save
pickle.dump(self,f, 3, fix_imports=True)
TypeError: cannot serialize '_io.TextIOWrapper' object
Python 3 made some major changes to the io system. This TextIOWrapper is, I think, new to Py3.
https://docs.python.org/3.1/library/io.html#io.TextIOWrapper
The question Can I use multiprocessing.Pool in a method of a class? also ran into problems serializing a TextIOWrapper.
=========
So, inspired by @tdelaney, I checked the stdin of my Py3 session:
In [1212]: sys.stdin
Out[1212]: <_io.TextIOWrapper name='<stdin>' mode='r' encoding='UTF-8'>
So that's the thing that can't be serialized.
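For what it's worth, you can reproduce the failure directly by trying to pickle one of those streams (the exact error wording varies a little between Python 3 versions):
>>> import sys, pickle
>>> pickle.dumps(sys.stdin)
Traceback (most recent call last):
  ...
TypeError: cannot serialize '_io.TextIOWrapper' object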
Is there any way in Pandas to capture the warning produced by setting error_bad_lines = False and warn_bad_lines = True? For instance the following script:
import pandas as pd
from StringIO import StringIO
data = StringIO("""a,b,c
1,2,3
4,5,6
6,7,8,9
1,2,5
3,4,5""")
pd.read_csv(data, warn_bad_lines=True, error_bad_lines=False)
produces the warning:
Skipping line 4: expected 3 fields, saw 4
I'd like to store this output to a string so that I can eventually write it to a log file to keep track of records that are being skipped.
I tried using the warnings module, but it doesn't appear that this "warning" is raised in the traditional sense.
I don't think this is implemented in pandas.
source1, source2
My solutions:
1. Pre- or post-processing
import pandas as pd
import csv
df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)
#compare length of rows by recommended value:
RECOMMENDED = 3
with open('data.csv') as csv_file:
reader = csv.reader(csv_file, delimiter=',')
for row in reader:
if (len(row) != RECOMMENDED):
print ("Length of row is: %r" % len(row) )
print row
#compare length of rows by length of columns in df
lencols = len(df.columns)
print lencols
with open('data.csv') as csv_file:
reader = csv.reader(csv_file, delimiter=',')
for row in reader:
if (len(row) != lencols):
print ("Length of row is: %r" % len(row) )
print row
2. Replace sys.stdout
import pandas as pd
import os
import sys
class RedirectStdStreams(object):
def __init__(self, stdout=None, stderr=None):
self._stdout = stdout or sys.stdout
self._stderr = stderr or sys.stderr
def __enter__(self):
self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
self.old_stdout.flush(); self.old_stderr.flush()
sys.stdout, sys.stderr = self._stdout, self._stderr
def __exit__(self, exc_type, exc_value, traceback):
self._stdout.flush(); self._stderr.flush()
sys.stdout = self.old_stdout
sys.stderr = self.old_stderr
if __name__ == '__main__':
devnull = open('log.txt', 'w')
#replaces sys.stdout, sys.stderr, see http://stackoverflow.com/a/6796752/2901002
with RedirectStdStreams(stdout=devnull, stderr=devnull):
df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)
I can't help you with versions older than Python 3, but I've had very good success with the following:
import pandas as pd
from contextlib import redirect_stderr
import io
# Redirect stderr to something we can report on.
f = io.StringIO()
with redirect_stderr(f):
df = pd.read_csv(
new_file_name, header=None, error_bad_lines=False, warn_bad_lines=True, dtype=header_types
)
if f.getvalue():
logger.warning("Had parsing errors: {}".format(f.getvalue()))
I searched for this issue a number of times and kept being pointed to this question. Hope it helps someone else later on.
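A side note for anyone reading this with a newer pandas: error_bad_lines/warn_bad_lines were later replaced by on_bad_lines, which with the Python engine can be a callable, so you can collect the bad rows directly instead of capturing stderr. A rough sketch, assuming pandas 1.4 or newer:
import io
import pandas as pd

bad_rows = []

def log_bad_line(fields):
    # Called once per malformed row with the list of parsed fields;
    # returning None tells pandas to skip the row.
    bad_rows.append(fields)
    return None

data = io.StringIO("a,b,c\n1,2,3\n4,5,6\n6,7,8,9\n1,2,5\n3,4,5")
df = pd.read_csv(data, on_bad_lines=log_bad_line, engine="python")
print(bad_rows)  # [['6', '7', '8', '9']]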
I am using zipline to backtest with local data, but it doesn't seem to work.
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory
class BuyApple(TradingAlgorithm):
def handle_data(self, data):
self.order('AAPL', 1)
if __name__ == '__main__':
data = pd.read_csv('AAPL.csv')
simple_algo = BuyApple()
results = simple_algo.run(data)
Above is my code. When I run this script, I get the message:
[2015-04-03 01:41:53.712035] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:41:53.632300, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
Traceback (most recent call last):
File "bollinger.py", line 31, in <module>
results = simple_algo.run(data)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/algorithm.py", line 372, in run
source = DataFrameSource(source)
File "/home/xinzhou/.local/lib/python2.7/site-packages/zipline-0.7.0-py2.7.egg/zipline/sources/data_frame_source.py", line 42, in __init__
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
Then I changed my code to the following:
from datetime import datetime
import pytz
import pandas as pd
from zipline.algorithm import TradingAlgorithm
import zipline.utils.factory as factory
class BuyApple(TradingAlgorithm):
def handle_data(self, data):
self.order('AAPL', 1)
if __name__ == '__main__':
start = datetime(2000, 1, 9, 14, 30, 0, 0, pytz.utc)
end = datetime(2001, 1, 10, 21, 0, 0, 0, pytz.utc)
data = pd.read_csv('AAPL.csv', parse_dates=True, index_col=0)
sim_params = factory.create_simulation_parameters(
start=start, end=end, capital_base=10000)
sim_params.data_frequency = '1d'
sim_params.emission_rate = '1d'
simple_algo = BuyApple()
results = simple_algo.run(data)
The
assert isinstance(data.index, pd.tseries.index.DatetimeIndex)
AssertionError
is gone, but my terminal keeps printing this message:
[2015-04-03 01:44:28.141657] WARNING: Loader: No benchmark data found for date range.
start_date=2015-04-03 00:00:00+00:00, end_date=2015-04-03 01:44:28.028243, url=http://ichart.finance.yahoo.com/table.csv?a=3&c=2015&b=3&e=3&d=3&g=d&f=2015&s=%5EGSPC
How do I solve this problem? Thanks.
The DataFrameSource in zipline asserts that the data has a DatetimeIndex, and zipline works with UTC timestamps internally, so convert the index to datetimes and localize it to UTC before running the algorithm:
data.index = pd.to_datetime(data.index)
data.index = data.index.tz_localize(pytz.utc)
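Placed in context in your second script, a minimal sketch (same names as in the question):
data = pd.read_csv('AAPL.csv', parse_dates=True, index_col=0)
data.index = pd.to_datetime(data.index)
data.index = data.index.tz_localize(pytz.utc)

simple_algo = BuyApple()
results = simple_algo.run(data)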
The following code works for me. It is a version of the tutorial example "My first Algorithm" (http://www.zipline.io/tutorial/). The data must be in ascending order by date. Run it as a normal Python program (python yourfilename.py):
import pytz
from datetime import datetime
from zipline.algorithm import TradingAlgorithm
from zipline.api import order, record, symbol
import pandas as pd
# Load data manually csv
#Date,Open,High,Low,Close,Volume,Adj Close
#1984-09-07,26.5,26.87,26.25,26.5,2981600,3.02
#...
parse = lambda x: pytz.utc.localize(datetime.strptime(x, '%Y-%m-%d'))
data=pd.read_csv('aapl.csv', parse_dates=['Date'], index_col=0,date_parser=parse)
# Define algorithm
def initialize(context):
pass
def handle_data(context, data):
order('Close',10)
record(AAPL=data['Close'])
# Create algorithm object passing in initialize and
# handle_data functions
algo_obj = TradingAlgorithm(initialize=initialize,
handle_data=handle_data)
# Run algorithm
perf_manual = algo_obj.run(data)
# Print
perf_manual.to_csv('output.csv')