Best way to download all historic Ethereum ERC721 transfers - blockchain

I'd like to download all the transfer events of tokens under a given contract address.
I know Etherscan provides an API endpoint for this; however, it is limited to the latest 10,000 transfers (even when paginating): https://docs.etherscan.io/api-endpoints/accounts#get-a-list-of-erc721-token-transfer-events-by-address
Is there a third party who can provide this data, or is my only option to get it directly from a node (Infura, Geth, etc.)?
Thanks!

Limited to 10k transfers per contract? I know the OpenSea events API can filter by contract address + token ID, and you can set before and after timestamps. But I don't know how far back they go.
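If you do end up pulling the events straight from a node (Infura, Geth), as the question mentions, here is only a rough sketch using web3.py (assuming web3.py v6; the endpoint URL, chunk size, and the CryptoKitties contract address from the docs example below are placeholders, and many providers cap the block range or response size per eth_getLogs call):
from web3 import Web3

# Placeholder Infura endpoint and example contract address
w3 = Web3(Web3.HTTPProvider("https://mainnet.infura.io/v3/YOUR_PROJECT_ID"))
contract = Web3.to_checksum_address("0x06012c8cf97bead5deae237070f9587f8e7a266d")

# topic0 of the standard ERC-721 Transfer(address,address,uint256) event
transfer_topic = "0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef"

logs = []
chunk = 10_000  # blocks per request; tune to what your provider allows
latest = w3.eth.block_number
# Scanning from block 0 is slow; start at the contract's deployment block if you know it.
for start in range(0, latest + 1, chunk):
    end = min(start + chunk - 1, latest)
    logs += w3.eth.get_logs({
        "fromBlock": start,
        "toBlock": end,
        "address": contract,
        "topics": [transfer_topic],
    })

print(len(logs), "Transfer events")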

This should work. You need to insert your API key (API_KEY) and the wallet address you want to explore in the main function.
The output will be written to a CSV file named data.csv.
from requests import get
import pandas as pd

pd.options.display.max_columns = 60   # 0 by default
pd.options.display.width = 10000      # 80 by default
pd.options.display.max_rows = 3000

API_KEY = ""

'''
https://api.etherscan.io/api
   ?module=account
   &action=balance
   &address=0xde0b295669a9fd93d5f28d9ec85e40f4cb697bae
   &tag=latest
   &apikey=YourApiKeyToken
'''
BASE_URL = "https://api.etherscan.io/api"
ETH_VALUE = 10 ** 18  # wei per ETH


def make_api_url(module, action, address, **kwargs):
    url = f"{BASE_URL}?module={module}&action={action}&address={address}&apikey={API_KEY}"
    for key, value in kwargs.items():
        url += f"&{key}={value}"
    return url


class Collector:
    def __init__(self, start_block):
        self.start_block = start_block

    def get_erc721_transactions(self, address):
        '''
        https://api.etherscan.io/api
           ?module=account
           &action=tokennfttx
           &contractaddress=0x06012c8cf97bead5deae237070f9587f8e7a266d
           &address=0x6975be450864c02b4613023c2152ee0743572325
           &page=1
           &offset=100
           &startblock=0
           &endblock=27025780
           &sort=asc
           &apikey=YourApiKeyToken
        '''
        get_transaction_url = make_api_url("account",
                                           "tokennfttx",
                                           address,
                                           startblock=self.start_block,
                                           endblock=999999999999999999,
                                           sort='asc')
        response = get(get_transaction_url)
        data = response.json()
        temp_df = pd.json_normalize(data['result'], sep="_")
        if temp_df.empty:
            # No transfers returned for this address.
            return temp_df
        temp_df['gasCost'] = temp_df.gasUsed.astype(float) * temp_df.gasPrice.astype(float)
        print(temp_df.tail())
        print(self.start_block)
        temp_df['type'] = 'erc721'
        return temp_df

    def aggregate_data(self, address):
        data = pd.DataFrame()
        self.start_block = 0
        while True:
            df = self.get_erc721_transactions(address)
            if df.shape[0] == 0:
                print('There are no ERC721 transactions')
                break
            # Stop once a request no longer advances past the last block seen.
            if self.start_block == df.blockNumber.iloc[-1]:
                break
            data = pd.concat([data, df])
            # Each call returns at most 10,000 rows, so continue from the last
            # block of the previous batch (rows from that block may repeat).
            self.start_block = df.blockNumber.iloc[-1]
        data.to_csv("data.csv")


if __name__ == '__main__':
    # Insert the wallet address you want to check
    address = "0x4c8CFE078a5B989CeA4B330197246ceD82764c63"
    Collector(0).aggregate_data(address)

Related

Automating Date Range while extracting

I am using the script below to extract data from Google Analytics. At the moment I extract data for the last week, and I want to automate the date range so that I don't have to change date_ranges every week.
I also want to avoid sampling of the data by GA. Please guide me on the correct way to automate this.
__author__ = 'test@gmail.com (test)'

import argparse
import sys
import csv
import string
import datetime
import json
import time

from apiclient.errors import HttpError
from apiclient import sample_tools
from oauth2client.client import AccessTokenRefreshError

cam_name = sys.argv[1:]


class SampledDataError(Exception): pass


def main(argv):
    # Authenticate and construct service.
    service, flags = sample_tools.init(
        argv[0], 'analytics', 'v3', __doc__, __file__,
        scope='https://www.googleapis.com/auth/analytics.readonly')

    # Try to make a request to the API. Print the results or handle errors.
    try:
        profile_id = profile_ids[profile]
        if not profile_id:
            print('Could not find a valid profile for this user.')
        else:
            metrics = argv[1]
            dimensions = argv[2]
            reportName = argv[3]
            sort = argv[4]
            filters = argv[5]
            for start_date, end_date in date_ranges:
                limit = ga_query(service, profile_id, 0,
                                 start_date, end_date, metrics, dimensions, sort, filters).get('totalResults')
                for pag_index in range(0, limit, 10000):
                    results = ga_query(service, profile_id, pag_index,
                                       start_date, end_date, metrics, dimensions, sort, filters)
                    # if results.get('containsSampledData'):
                    #     raise SampledDataError
                    print_results(results, pag_index, start_date, end_date, reportName)
    except TypeError as error:
        # Handle errors in constructing a query.
        print('There was an error in constructing your query : %s' % error)
    except HttpError as error:
        # Handle API errors.
        print('Arg, there was an API error : %s : %s' %
              (error.resp.status, error._get_reason()))
    except AccessTokenRefreshError:
        # Handle Auth errors.
        print('The credentials have been revoked or expired, please re-run '
              'the application to re-authorize')
    except SampledDataError:
        # Force an error if ever a query returns data that is sampled!
        print('Error: Query contains sampled data!')


def ga_query(service, profile_id, pag_index, start_date, end_date, metrics, dimensions, sort, filters):
    return service.data().ga().get(
        ids='ga:' + profile_id,
        start_date=start_date,
        end_date=end_date,
        metrics=metrics,
        dimensions=dimensions,
        sort=sort,
        filters=filters,
        samplingLevel='HIGHER_PRECISION',
        start_index=str(pag_index + 1),
        max_results=str(pag_index + 10000)).execute()


def print_results(results, pag_index, start_date, end_date, reportName):
    """Prints out the results.

    This prints out the profile name, the column headers, and all the rows of
    data.

    Args:
      results: The response returned from the Core Reporting API.
    """
    # Write the header for a new report.
    if pag_index == 0:
        if (start_date, end_date) == date_ranges[0]:
            print('Profile Name: %s' % results.get('profileInfo').get('profileName'))
            columnHeaders = results.get('columnHeaders')
            cleanHeaders = [str(h['name']) for h in columnHeaders]
            writer.writerow(cleanHeaders)
        print(reportName, 'Now pulling data from %s to %s.' % (start_date, end_date))

    # Print data table.
    if results.get('rows', []):
        for row in results.get('rows'):
            for i in range(len(row)):
                old, new = row[i], str()
                for s in old:
                    new += s if s in string.printable else ''
                row[i] = new
            writer.writerow(row)
    else:
        print('No Rows Found')

    limit = results.get('totalResults')
    print(pag_index, 'of about', int(round(limit, -4)), 'rows.')
    return None


# Uncomment this line & replace with 'profile name': 'id' to query a single profile.
# Delete or comment out this line to loop over multiple profiles.

# Brands
profile_ids = {'abc-Mobile': '12345',
               'abc-Desktop': '23456',
               'pqr-Mobile': '34567',
               'pqr-Desktop': '45678',
               'xyz-Mobile': '56789',
               'xyz-Desktop': '67890'}

date_ranges = [
    ('2017-01-24', '2017-01-24'),
    ('2017-01-25', '2017-01-25'),
    ('2017-01-26', '2017-01-26'),
    ('2017-01-27', '2017-01-27'),
    ('2017-01-28', '2017-01-28'),
    ('2017-01-29', '2017-01-29'),
    ('2017-01-30', '2017-01-30')
]

for profile in sorted(profile_ids):
    print("Sequence 1", profile)
    with open('qwerty.json') as json_data:
        d = json.load(json_data)
        for getThisReport in d["Reports"]:
            print("Sequence 2", getThisReport["ReportName"])
            reportName = getThisReport["ReportName"]
            metrics = getThisReport["Metrics"]
            dimensions = getThisReport["Dimensions"]
            sort = getThisReport["sort"]
            filters = getThisReport["filter"]

            path = 'C:\\Projects\\DataExport\\test\\'  # replace with the folder the CSV files should be written to
            today = time.strftime('%Y%m%d')
            filename = profile + '_' + reportName + '_' + today + '.csv'  # output: <profile>_<reportName>_<YYYYMMDD>.csv

            with open(path + filename, 'wt') as f:
                writer = csv.writer(f, delimiter='|', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
                args = [sys.argv, metrics, dimensions, reportName, sort, filters]
                if __name__ == '__main__': main(args)

    print("Profile done. Next profile...")

print("All profiles done.")
The Core Reporting API supports some interesting things as far as dates go.
All Analytics data requests must specify a date range. If you do not include start-date and end-date parameters in the request, the server returns an error. Date values can be for a specific date by using the pattern YYYY-MM-DD, or relative by using today, yesterday, or the NdaysAgo pattern. Values must match [0-9]{4}-[0-9]{2}-[0-9]{2}|today|yesterday|[0-9]+(daysAgo).
So you can do something like
start_date = '7daysAgo'
end_date = 'today'
Just remember that data hasn't finished processing for 24-48 hours, so your data for today, yesterday, and the day before may not be 100% accurate.
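If you would rather keep explicit YYYY-MM-DD pairs like the date_ranges list in the question, here is a minimal sketch (variable names are just illustrative) that builds the last seven fully processed days with datetime:
import datetime

# Skip today and yesterday because GA data may still be processing.
last_complete_day = datetime.date.today() - datetime.timedelta(days=2)
date_ranges = []
for offset in range(6, -1, -1):  # 7 single-day ranges, oldest first
    day = (last_complete_day - datetime.timedelta(days=offset)).isoformat()
    date_ranges.append((day, day))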

get() in Google Datastore doesn't work as intended

I'm building a basic blog from the Web Development course by Steve Hoffman on Udacity. This is my code -
import os
import webapp2
import jinja2
from google.appengine.ext import db

template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir), autoescape=True)

def datetimeformat(value, format='%H:%M / %d-%m-%Y'):
    return value.strftime(format)

jinja_env.filters['datetimeformat'] = datetimeformat

def render_str(template, **params):
    t = jinja_env.get_template(template)
    return t.render(params)

class Entries(db.Model):
    title = db.StringProperty(required=True)
    body = db.TextProperty(required=True)
    created = db.DateTimeProperty(auto_now_add=True)

class MainPage(webapp2.RequestHandler):
    def get(self):
        entries = db.GqlQuery('select * from Entries order by created desc limit 10')
        self.response.write(render_str('mainpage.html', entries=entries))

class NewPost(webapp2.RequestHandler):
    def get(self):
        self.response.write(render_str('newpost.html', error=""))

    def post(self):
        title = self.request.get('title')
        body = self.request.get('body')
        if title and body:
            e = Entries(title=title, body=body)
            length = db.GqlQuery('select * from Entries order by created desc').count()
            e.put()
            self.redirect('/newpost/' + str(length + 1))
        else:
            self.response.write(render_str('newpost.html', error="Please type in a title and some content"))

class Permalink(webapp2.RequestHandler):
    def get(self, id):
        e = db.GqlQuery('select * from Entries order by created desc').get()
        self.response.write(render_str('permalink.html', id=id, entry=e))

app = webapp2.WSGIApplication([('/', MainPage),
                               ('/newpost', NewPost),
                               ('/newpost/(\d+)', Permalink)
                               ], debug=True)
In the class Permalink, I'm using the get() method on the query that returns all records in descending order of creation, so it should return the most recently added record. But when I add a new record, permalink.html (just a page that shows the title, body, and creation date of the new entry) shows the SECOND most recently added one. For example, I already had three records, and when I added a fourth, permalink.html showed me the details of the third record instead of the fourth. Am I doing something wrong?
I don't think my question is a duplicate of this - Read delay in App Engine Datastore after put(). That question is about the read delay of put(), while I'm using get(). The accepted answer also states that get() doesn't cause any delay.
This is because of eventual consistency used by default for GQL queries.
You need to read:
https://cloud.google.com/appengine/docs/python/datastore/data-consistency
https://cloud.google.com/appengine/docs/python/datastore/structuring_for_strong_consistency
https://cloud.google.com/datastore/docs/articles/balancing-strong-and-eventual-consistency-with-google-cloud-datastore/
Search and read on SO and other sources about strong and eventual consistency in Google Cloud Datastore.
You can specify read_policy=STRONG_CONSISTENCY for your query but it has associated costs that you should be aware of and take into account.
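As an illustration only (this is not something the answer above spells out), a common way to sidestep the issue for the permalink case is to skip the query entirely and fetch the new entity by its key, since a get by key is strongly consistent. A sketch reusing the Entries model and render_str helper from the question:
# Sketch only: redirect to the id of the key returned by put(), then fetch
# by id in the permalink handler instead of querying.
class NewPost(webapp2.RequestHandler):
    def post(self):
        title = self.request.get('title')
        body = self.request.get('body')
        if title and body:
            e = Entries(title=title, body=body)
            key = e.put()  # put() returns the new entity's key
            self.redirect('/newpost/%d' % key.id())
        else:
            self.response.write(render_str('newpost.html',
                                           error="Please type in a title and some content"))

class Permalink(webapp2.RequestHandler):
    def get(self, id):
        e = Entries.get_by_id(int(id))  # lookup by key, not a query
        self.response.write(render_str('permalink.html', id=id, entry=e))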

Query between two numbers

Django Version 1.9.5
What I'm essentially after is this query:
SELECT *
FROM "ipaddress_setup"
WHERE '167837954' BETWEEN "start_ipaddress" AND "end_ipaddress"
In the query there may be additional WHERE conditions; here is an example of what I've got so far:
from django.core.paginator import Paginator
from database.models import IpaddressSetup
from django.db.models import Q
import ipaddress

class ServiceSearch:
    def __init__(self, request, get):
        self.request = request
        self.get = get

    def search(self):
        args = ()
        context = {}

        if 'name' in self.get and self.get['name'] is not None:
            context['name__icontains'] = self.get['name']

        if 'pool' in self.get and self.get['pool'] is not None:
            try:
                ip = ipaddress.ip_address(self.get['pool'])
                args = (Q(start_ipaddress__gte=int(ip)) | Q(end_ipaddress__lte=int(ip)),)
            except ValueError:
                pass

        if 'ipaddress_type' in self.get and self.get['ipaddress_type'] is not None:
            context['ipaddress_type__exact'] = self.get['ipaddress_type']

        if 'assigned' in self.get and self.get['assigned'] is not None:
            context['assigned__exact'] = self.get['assigned']

        if 'status' in self.get and self.get['status'] is not None:
            context['status__exact'] = self.get['status']

        result = IpaddressSetup.objects.all().filter(*args, **context).order_by('name')
        return Paginator(result, self.request.user.max_search)
This is used in a search feature for finding IP addresses in an allotted pool. I store the IPs as integers, with the starting and ending addresses in two columns named start_ipaddress and end_ipaddress.
As you can see, I also allow searching by pool name, IP type (IPv4/IPv6), assigned (public/reserved), and status (enabled/disabled).
The only problem I am having right now is getting the BETWEEN query working on start_ipaddress / end_ipaddress.
I've tried GTE/LTE queries, but they return other IP pools that also fall within the searched params; I'm after a concrete way of finding the pool an IP falls inside.
With my search params, my hope is to return only 1 record instead of the 3 I'm currently getting.
I'd be happy to supply any other details.
In the line
args = (Q(start_ipaddress__gte=int(ip)) | Q(end_ipaddress__lte=int(ip)),)
you are ORing the subqueries:
start_ip >= ip OR end_ip <= ip
That will match almost every pool except the one you want. AND the conditions instead, and flip the comparisons so they describe a pool that contains the address:
args = (Q(start_ipaddress__lte=int(ip)) & Q(end_ipaddress__gte=int(ip)),)
This
args = (Q(start_ipaddress__gte=int(ip)) | Q(end_ipaddress__lte=int(ip)),)
means OR, but you want AND:
... WHERE "start_ipaddress" <= 167837954 AND "end_ipaddress" >= 167837954
or, in Django's lookup syntax:
context['start_ipaddress__lte'] = int(ip)
context['end_ipaddress__gte'] = int(ip)
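To make the intended lookup concrete, here is a short sketch of the corrected filter (model and field names as in the question; the literal address is only an example):
from django.db.models import Q

ip_int = 167837954  # e.g. int(ipaddress.ip_address('10.1.1.2'))

# SELECT ... WHERE start_ipaddress <= 167837954 AND end_ipaddress >= 167837954
pools = IpaddressSetup.objects.filter(
    Q(start_ipaddress__lte=ip_int) & Q(end_ipaddress__gte=ip_int)
)

# Equivalent without Q objects, since keyword lookups are ANDed anyway:
pools = IpaddressSetup.objects.filter(
    start_ipaddress__lte=ip_int,
    end_ipaddress__gte=ip_int,
)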

How to increase number of search terms in Tweepy for Twitter API access?

I found that I can only include around 20 search terms with the following code before it gives me an error saying there are too many search terms. Is there any way to get around this?
import tweepy
searchTerms = '1' or '2' or '3' # to say ... '99' or '100'
tweets = tweepy.Cursor(api.search, q=searchTerms, since=startSince, until=endUntil).items()
Per the Twitter docs, the q parameter is limited.
A UTF-8, URL-encoded search query of 500 characters maximum, including operators. Queries may additionally be limited by complexity.
If you want to build complex search-term logic, you could use a Stream and a Listener and essentially do your own filtering. Here's a simple example of a Listener; it pulls some commonly used fields out of the status JSON in on_status.
import json

from tweepy import API, StreamListener  # tweepy 3.x

class SListener(StreamListener):
    def __init__(self, api=None, fprefix='streamer'):
        self.api = api or API()
        self.counter = 0
        self.fprefix = fprefix

    def on_data(self, data):
        if 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif 'warning' in data:
            warning = json.loads(data)['warning']
            print(warning['message'])
            return False
        else:
            # A normal tweet payload: hand the raw JSON to on_status below.
            self.on_status(data)
        return True

    def on_status(self, status):
        status_obj = json.loads(status)
        username = status_obj["user"]["screen_name"]
        userID = status_obj["user"]["id"]
        user_loc = status_obj["user"]["location"]
        tweet_date_time = status_obj["created_at"]
        tweetID = status_obj["id"]
        tweet = status_obj["text"]

        searchTerms = ['1', '2', '3']  # to say ... '99' or '100'
        if any(query in tweet for query in searchTerms):
            print(tweet)  # or do something with it
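And a rough sketch of hooking the listener up to a stream (assuming tweepy 3.x; the credentials are placeholders). The streaming track parameter accepts a list of phrases, which is how you get past the 500-character search query, and the finer filtering happens in on_status:
import tweepy

# Placeholder credentials
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)

listener = SListener(api)
stream = tweepy.Stream(auth, listener)

# track takes a list of phrases (several hundred on the standard tier)
stream.filter(track=['1', '2', '3'])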

Automating pulling csv files off google Trends

pyGTrends does not seem to work; it gives errors in Python.
pyGoogleTrendsCsvDownloader seems to work and logs in, but after 1-3 requests (per day!) it complains about an exhausted quota, even though a manual download with the same login/IP works flawlessly.
Bottom line: neither works. Searching through Stack Overflow turns up many questions from people trying to pull CSVs from Google, but no workable solution that I could find...
Thanks in advance to whoever is able to help. How should the code be changed? Do you know of another solution that works?
Here's the code of pyGoogleTrendsCsvDownloader.py
import httplib
import urllib
import urllib2
import re
import csv
import lxml.etree as etree
import lxml.html as html
import traceback
import gzip
import random
import time
import sys

from cookielib import Cookie, CookieJar
from StringIO import StringIO


class pyGoogleTrendsCsvDownloader(object):
    '''
    Google Trends Downloader
    Recommended usage:
    from pyGoogleTrendsCsvDownloader import pyGoogleTrendsCsvDownloader
    r = pyGoogleTrendsCsvDownloader(username, password)
    r.get_csv(cat='0-958', geo='US-ME-500')
    '''
    def __init__(self, username, password):
        '''
        Provide login and password to be used to connect to Google Trends
        All immutable system variables are also defined here
        '''
        # The amount of time (in secs) that the script should wait before making a request.
        # This can be used to throttle the downloading speed to avoid hitting servers too hard.
        # It is further randomized.
        self.download_delay = 0.25

        self.service = "trendspro"
        self.url_service = "http://www.google.com/trends/"
        self.url_download = self.url_service + "trendsReport?"

        self.login_params = {}
        # These headers are necessary, otherwise Google will flag the request at your account level
        self.headers = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'),
                        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
                        ("Accept-Language", "en-gb,en;q=0.5"),
                        ("Accept-Encoding", "gzip, deflate"),
                        ("Connection", "keep-alive")]
        self.url_login = 'https://accounts.google.com/ServiceLogin?service=' + self.service + '&passive=1209600&continue=' + self.url_service + '&followup=' + self.url_service
        self.url_authenticate = 'https://accounts.google.com/accounts/ServiceLoginAuth'
        self.header_dictionary = {}

        self._authenticate(username, password)

    def _authenticate(self, username, password):
        '''
        Authenticate to Google:
        1 - make a GET request to the Login webpage so we can get the login form
        2 - make a POST request with email, password and login form input values
        '''
        # Make sure we get CSV results in English
        ck = Cookie(version=0, name='I4SUserLocale', value='en_US', port=None, port_specified=False,
                    domain='www.google.com', domain_specified=False, domain_initial_dot=False,
                    path='/trends', path_specified=True, secure=False, expires=None, discard=False,
                    comment=None, comment_url=None, rest=None)

        self.cj = CookieJar()
        self.cj.set_cookie(ck)
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        self.opener.addheaders = self.headers

        # Get all of the login form input values
        find_inputs = etree.XPath("//form[@id='gaia_loginform']//input")
        try:
            resp = self.opener.open(self.url_login)
            if resp.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(resp.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = resp.read()

            xmlTree = etree.fromstring(data, parser=html.HTMLParser(recover=True, remove_comments=True))
            for input in find_inputs(xmlTree):
                name = input.get('name')
                if name:
                    name = name.encode('utf8')
                    value = input.get('value', '').encode('utf8')
                    self.login_params[name] = value
        except:
            print("Exception while parsing: %s\n" % traceback.format_exc())

        self.login_params["Email"] = username
        self.login_params["Passwd"] = password

        params = urllib.urlencode(self.login_params)
        self.opener.open(self.url_authenticate, params)

    def get_csv(self, throttle=False, **kwargs):
        '''
        Download CSV reports
        '''
        # Randomized download delay
        if throttle:
            r = random.uniform(0.5 * self.download_delay, 1.5 * self.download_delay)
            time.sleep(r)

        params = {
            'export': 1
        }
        params.update(kwargs)
        params = urllib.urlencode(params)

        r = self.opener.open(self.url_download + params)

        # Make sure everything is working ;)
        if not r.info().has_key('Content-Disposition'):
            print "You've exceeded your quota. Continue tomorrow..."
            sys.exit(0)

        if r.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(r.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
        else:
            data = r.read()

        myFile = open('trends_%s.csv' % '_'.join(['%s-%s' % (key, value) for (key, value) in kwargs.items()]), 'w')
        myFile.write(data)
        myFile.close()
Although I don't know Python, I may have a solution. I am currently doing the same thing in C#, and though I didn't get the .csv file, I created a custom URL through code, then downloaded that HTML and saved it to a text file (also through code). In this HTML (at line 12) is all the information needed to create the graph that is used on Google Trends. However, it has a lot of unnecessary text that needs to be cut down, but either way you end up with the same result: the Google Trends data. I posted a more detailed answer to my question here:
Downloading .csv file from Google Trends
There is an alternative module named pytrends (https://pypi.org/project/pytrends/). It is really cool, and I would recommend it.
Example usage:
import pandas as pd
from pytrends.request import TrendReq

pytrend = TrendReq()

# The term that you want to search
pytrend.build_payload(kw_list=["Eminem is the Rap God"])

# Find which regions have searched the term
df = pytrend.interest_by_region()
df.to_csv(r"path\Eminem_InterestbyRegion.csv")  # replace "path" with your output folder
Potentially, if you have a list of terms to search, you could use a for loop to automate the insights as you wish; a sketch follows below.
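A rough sketch of such a loop, using the same pytrends calls as above (the term list and file names are just placeholders):
from pytrends.request import TrendReq

pytrend = TrendReq()

# Placeholder list of search terms; replace with your own.
terms = ["Eminem", "Rap God", "Slim Shady"]

for term in terms:
    pytrend.build_payload(kw_list=[term])
    df = pytrend.interest_by_region()
    # One CSV per term, e.g. "Eminem_InterestbyRegion.csv"
    df.to_csv("%s_InterestbyRegion.csv" % term.replace(" ", "_"))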