How to fetch old data from twitter - python-2.7

I am using tweepy api to fetch old tweets
Here is my code which only fetching around most recent 3200 tweets from twitter.
Code :
import tweepy #https://github.com/tweepy/tweepy
import pymssql
#Twitter API credentials
access_key = ''
access_secret = ''
consumer_key = ''
consumer_secret = ''
#connect to local database
conn = pymssql.connect('(local)','','','mydbname')
x = conn.cursor()
def get_all_tweets(sc):
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = sc,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
print "getting tweets before %s" % (oldest)
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = sc,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print "...%s tweets downloaded so far" % (len(alltweets))
print len(alltweets)
#save tweets to database
for tweet in alltweets:
insert_post_details(tweet)
#insert tweets data into database
def insert_post_details(tweet):
try:
qry="INSERT INTO MPTTwitterPosts(Id,Created_At,Favorite_Count ,Retweet_Count,Text) VALUES(%s,%s,%s,%s,%s)"
x.execute(qry, (str(tweet.id), tweet.created_at,tweet.favorite_count,tweet.retweet_count,str(tweet.text.encode("utf-8","ignore"))))
conn.commit()
except Exception,e:
print str(e)
if __name__ == '__main__':
#pass in the username of the account you want to download
get_all_tweets("Google")
If I tried to get older tweets with max_id,it is giving blank list of tweets
so How can I get old tweets, any suggestion would be greatly helpfull.
Thanks

Related

Automating Date Range while extracting

The below script I am using to extract data from Google Analytics. Here I am extracting data for last one week. I want to automate the date range so that i don't have to change date_range every week.
I also want to avoid sampling of data by GA. Please guide my the correct way to automate in details.
author = 'test#gmail.com (test)'
import argparse
import sys
import csv
import string
import datetime
import json
import time
from apiclient.errors import HttpError
from apiclient import sample_tools
from oauth2client.client import AccessTokenRefreshError
cam_name = sys.argv[1:]
class SampledDataError(Exception): pass
def main(argv):
# Authenticate and construct service.
service, flags = sample_tools.init(
argv[0], 'analytics', 'v3', __doc__, __file__,
scope='https://www.googleapis.com/analytics.readonly')
# Try to make a request to the API. Print the results or handle errors.
try:
profile_id = profile_ids[profile]
if not profile_id:
print ('Could not find a valid profile for this user.')
else:
metrics = argv[1]
dimensions = argv[2]
reportName = argv[3]
sort = argv[4]
filters = argv[5]
for start_date, end_date in date_ranges:
limit = ga_query(service, profile_id, 0,
start_date, end_date, metrics, dimensions, sort, filters).get('totalResults')
for pag_index in range(0, limit, 10000):
results = ga_query(service, profile_id, pag_index,
start_date, end_date, metrics, dimensions, sort, filters)
# if results.get('containsSampledData'):
# raise SampledDataError
print_results(results, pag_index, start_date, end_date, reportName)
except TypeError as error:
# Handle errors in constructing a query.
print ('There was an error in constructing your query : %s' % error)
except HttpError as error:
# Handle API errors.
print ('Arg, there was an API error : %s : %s' %
(error.resp.status, error._get_reason()))
except AccessTokenRefreshError:
# Handle Auth errors.
print ('The credentials have been revoked or expired, please re-run '
'the application to re-authorize')
except SampledDataError:
# force an error if ever a query returns data that is sampled!
print ('Error: Query contains sampled data!')
def ga_query(service, profile_id, pag_index, start_date, end_date, metrics, dimensions, sort, filters):
return service.data().ga().get(
ids='ga:' + profile_id,
start_date=start_date,
end_date=end_date,
metrics=metrics,
dimensions=dimensions,
sort=sort,
filters=filters,
samplingLevel='HIGHER_PRECISION',
start_index=str(pag_index+1),
max_results=str(pag_index+10000)).execute()
def print_results(results, pag_index, start_date, end_date, reportName):
"""Prints out the results.
This prints out the profile name, the column headers, and all the rows of
data.
Args:
results: The response returned from the Core Reporting API.
"""
# New write header
if pag_index == 0:
if (start_date, end_date) == date_ranges[0]:
print ('Profile Name: %s' % results.get('profileInfo').get('profileName'))
columnHeaders = results.get('columnHeaders')
cleanHeaders = [str(h['name']) for h in columnHeaders]
writer.writerow(cleanHeaders)
print (reportName,'Now pulling data from %s to %s.' %(start_date, end_date))
# Print data table.
if results.get('rows', []):
for row in results.get('rows'):
for i in range(len(row)):
old, new = row[i], str()
for s in old:
new += s if s in string.printable else ''
row[i] = new
writer.writerow(row)
else:
print ('No Rows Found')
limit = results.get('totalResults')
print (pag_index, 'of about', int(round(limit, -4)), 'rows.')
return None
# Uncomment this line & replace with 'profile name': 'id' to query a single profile
# Delete or comment out this line to loop over multiple profiles.
#Brands
profile_ids = {'abc-Mobile': '12345',
'abc-Desktop': '23456',
'pqr-Mobile': '34567',
'pqr-Desktop': '45678',
'xyz-Mobile': '56789',
'xyz-Desktop': '67890'}
date_ranges = [
('2017-01-24','2017-01-24'),
('2017-01-25','2017-01-25'),
('2017-01-26','2017-01-26'),
('2017-01-27','2017-01-27'),
('2017-01-28','2017-01-28'),
('2017-01-29','2017-01-29'),
('2017-01-30','2017-01-30')
]
for profile in sorted(profile_ids):
print("Sequence 1",profile)
with open('qwerty.json') as json_data:
d = json.load(json_data)
for getThisReport in d["Reports"]:
print("Sequence 2",getThisReport["ReportName"])
reportName = getThisReport["ReportName"]
metrics = getThisReport["Metrics"]
dimensions = getThisReport["Dimensions"]
sort = getThisReport["sort"]
filters = getThisReport["filter"]
path = 'C:\\Projects\\DataExport\\test\\' #replace with path to your folder where csv file with data will be written
today = time.strftime('%Y%m%d')
filename = profile+'_'+reportName+'_'+today+'.csv' #replace with your filename. Note %s is a placeholder variable and the profile name you specified on row 162 will be written here
with open(path + filename, 'wt') as f:
writer = csv.writer(f,delimiter = '|', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
args = [sys.argv,metrics,dimensions,reportName,sort,filters]
if __name__ == '__main__': main(args)
print ( "Profile done. Next profile...")
print ("All profiles done.")
The Core Reporting API supports some interesting things as far as dates goes.
All Analytics data requests must specify a date range. If you do not include start-date and end-date parameters in the request, the server returns an error. Date values can be for a specific date by using the pattern YYYY-MM-DD or relative by using today, yesterday, or the NdaysAgo pattern. Values must match [0-9]{4}-[0-9]{2}-[0-9]{2}|today|yesterday|[0-9]+(daysAgo).
so doing something like
start_date = '7daysAgo'
end_date = 'today'
Just remember that data hasn't completed processing for 24 - 48 hours so your data for today, yesterday and the day before that may not be 100% accurate.

extracting tweet from Twitter API using Python

I want to get every tweet of the HousingWire on Twitter (https://twitter.com/HousingWire). I understood how to authenticate into the twitter account but how I can get the tweet of HousingWire?
I know how to stream the data based on the keywords,but I want to stream the HousingWire tweet. how I can do that?
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
ckey=''
csecret=''
atoken=''
asecret=''
class listener(StreamListener):
def on_data(self,data):
try:
print data
#tweet=data.split(',"text":"')[1].split('","source')[0]
#print tweet
#savethis=str(time.time())+'::'+tweet
savefile=open('tweetdb.txt','a')
savefile.write(data)
savefile.write('\n')
savefile.close()
return True
except BaseException,e:
print 'failed on data',str(e)
time.sleep(5)
def on_error(self,status):
print status
auth=OAuthHandler(ckey,csecret)
auth.set_access_token(atoken,asecret)
twitterStream=Stream(auth,listener())
twitterStream.filter(track=["stock"])
You can use the below Python script to grab the last 3,240 tweets from HousingWire (Twitter only allows access to that many tweets from a user - no way to grab the complete history). Usage: Simply put their twitter screen name in the script.
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import csv
#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
def get_all_tweets(screen_name):
#Twitter only allows access to a users most recent 3240 tweets with this method
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
print "getting tweets before %s" % (oldest)
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print "...%s tweets downloaded so far" % (len(alltweets))
#transform the tweepy tweets into a 2D array that will populate the csv
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]
#write the csv
with open('%s_tweets.csv' % screen_name, 'wb') as f:
writer = csv.writer(f)
writer.writerow(["id","created_at","text"])
writer.writerows(outtweets)
pass
if __name__ == '__main__':
#pass in the username of the account you want to download
get_all_tweets("J_tsar")

How to get user tweets on Twitter on particular topic using Tweepy package in Python

Suppose user 'ABCD' has tweeted on topic, how to find the comments for that tweet ?
I am using tweepy to get twitter data.
It will also be good, If it possible using any other python packages.
Code for the following is:
Import tweepy
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)` # This is authentication process.
new_tweets = api.user_timeline(screen_name ='ABCD',count=20) # Code to download 20 timelines for user 'ABCD'
Till here i can get 20 timelines from user ABCD.
Q: How to get tweets for each timeline.
Thanks.

fetching various data corresponding to a tweet

I am trying to fetch data from twitter for processing. Please see the code I want various data corresponding to a particular tweet corresponding to a given topic. I am able to fetch data (created_at, text, username, user_id). It shows error when i try to fetch(location, followers_count, friends_count, retweet_count).
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
ckey = '***********************'
csecret = '************************'
atoken ='*************************'
asecret = '**********************'
class listener(StreamListener):
def on_data(self,data):
try:
all_data = json.loads(data)
tweet = all_data["text"]
username = all_data["user"]["screen_name"]
timestamp = all_data["created_at"]
user_id = all_data["id_str"]
location = all_data["location"]
followers_count = all_data["followers_count"]
friends_count = all_data["friends_count"]
retweet_count = all_data["retweet_count"]
saveThis = str(time.time())+'::'+timestamp+'::'+username+'::'+user_id+'::'+tweet+'::'+followers_count+'::'+friends_count+'::'+retweet_count+'::'+location
saveFile = open('clean2.txt','a')
saveFile.write(saveThis)
saveFile.write('\n')
saveFile.close
return True
except BaseException, e:
print 'failed on data,',str(e)
time.sleep(5)
def on_error(self, status):
print status
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["tweepy"])#topic
The reason it fails on all_data["location"] is that tweets don't have such a property: https://dev.twitter.com/overview/api/tweets
same with friends_count, followers_count - they are properties of users, not tweets.
The code should not be failing on all_date["retweet_count"] as tweets have such a property.
P.S. please include the error message (even if you skip the full error trackback) when reporting errors. makes it's easier to help you, otherwise one has to guess what the error might be.

Extraction of the all tweets of particular user with timestamp

I am trying to extract the all tweets and timestamp of particular person. I am new in python and tweepy. I have the working code from internet search, but my desire is to print only all tweets of particular user.
import tweepy
# Authentication details. To obtain these visit dev.twitter.com
consumer_key = 'nWGEdfoaBt7d6wWhiAw5Tw'
consumer_secret = 'qM4QfDPqG9JQp6n0fqTCMrj6LJjES6vu2IzqpZLc'
access_token = '2284416938-JbD4F32m9xQPMxKoh6UikpCLoJm8F6xy8wDPS9P'
access_token_secret = 'XvJZQWa6zz5vHcHkUcYBacQKZJE9pcxbpxUUgNo9rN4AG'
if __name__ == '__main__':
# Create authentication token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
print 'Getting statistics for #BarackObama:'
# Get information about the user
data = api.get_user('BarackObama')
print 'Followers: ' + str(data.followers_count)
print 'Tweets: ' + str(data.statuses_count)
print 'Favouries: ' + str(data.favourites_count)
print 'Friends: ' + str(data.friends_count)
print 'Appears on ' + str(data.listed_count) + ' lists'
print(data)
print(data) or print(status) only gives certain tweets(not all) along with other unwanted information in JSON format.
I found extracting only tweets from home and own timeline by use of following code
statuses = tweepy.Cursor(api.user_timeline).items(2)
data = [s.text.encode('utf8') for s in statuses]
print data
Anyway I got the way to download tweet at single window by passing screenname or twitter id, it works for multiple user at a same time as well