I want to get every tweet of the HousingWire on Twitter (https://twitter.com/HousingWire). I understood how to authenticate into the twitter account but how I can get the tweet of HousingWire?
I know how to stream the data based on the keywords,but I want to stream the HousingWire tweet. how I can do that?
import time
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
ckey=''
csecret=''
atoken=''
asecret=''
class listener(StreamListener):
def on_data(self,data):
try:
print data
#tweet=data.split(',"text":"')[1].split('","source')[0]
#print tweet
#savethis=str(time.time())+'::'+tweet
savefile=open('tweetdb.txt','a')
savefile.write(data)
savefile.write('\n')
savefile.close()
return True
except BaseException,e:
print 'failed on data',str(e)
time.sleep(5)
def on_error(self,status):
print status
auth=OAuthHandler(ckey,csecret)
auth.set_access_token(atoken,asecret)
twitterStream=Stream(auth,listener())
twitterStream.filter(track=["stock"])
You can use the below Python script to grab the last 3,240 tweets from HousingWire (Twitter only allows access to that many tweets from a user - no way to grab the complete history). Usage: Simply put their twitter screen name in the script.
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import csv
#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
def get_all_tweets(screen_name):
#Twitter only allows access to a users most recent 3240 tweets with this method
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
print "getting tweets before %s" % (oldest)
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print "...%s tweets downloaded so far" % (len(alltweets))
#transform the tweepy tweets into a 2D array that will populate the csv
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]
#write the csv
with open('%s_tweets.csv' % screen_name, 'wb') as f:
writer = csv.writer(f)
writer.writerow(["id","created_at","text"])
writer.writerows(outtweets)
pass
if __name__ == '__main__':
#pass in the username of the account you want to download
get_all_tweets("J_tsar")
Related
I am using tweepy api to fetch old tweets
Here is my code which only fetching around most recent 3200 tweets from twitter.
Code :
import tweepy #https://github.com/tweepy/tweepy
import pymssql
#Twitter API credentials
access_key = ''
access_secret = ''
consumer_key = ''
consumer_secret = ''
#connect to local database
conn = pymssql.connect('(local)','','','mydbname')
x = conn.cursor()
def get_all_tweets(sc):
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = sc,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
print "getting tweets before %s" % (oldest)
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = sc,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print "...%s tweets downloaded so far" % (len(alltweets))
print len(alltweets)
#save tweets to database
for tweet in alltweets:
insert_post_details(tweet)
#insert tweets data into database
def insert_post_details(tweet):
try:
qry="INSERT INTO MPTTwitterPosts(Id,Created_At,Favorite_Count ,Retweet_Count,Text) VALUES(%s,%s,%s,%s,%s)"
x.execute(qry, (str(tweet.id), tweet.created_at,tweet.favorite_count,tweet.retweet_count,str(tweet.text.encode("utf-8","ignore"))))
conn.commit()
except Exception,e:
print str(e)
if __name__ == '__main__':
#pass in the username of the account you want to download
get_all_tweets("Google")
If I tried to get older tweets with max_id,it is giving blank list of tweets
so How can I get old tweets, any suggestion would be greatly helpfull.
Thanks
I am trying to fetch data from twitter for processing. Please see the code I want various data corresponding to a particular tweet corresponding to a given topic. I am able to fetch data (created_at, text, username, user_id). It shows error when i try to fetch(location, followers_count, friends_count, retweet_count).
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
ckey = '***********************'
csecret = '************************'
atoken ='*************************'
asecret = '**********************'
class listener(StreamListener):
def on_data(self,data):
try:
all_data = json.loads(data)
tweet = all_data["text"]
username = all_data["user"]["screen_name"]
timestamp = all_data["created_at"]
user_id = all_data["id_str"]
location = all_data["location"]
followers_count = all_data["followers_count"]
friends_count = all_data["friends_count"]
retweet_count = all_data["retweet_count"]
saveThis = str(time.time())+'::'+timestamp+'::'+username+'::'+user_id+'::'+tweet+'::'+followers_count+'::'+friends_count+'::'+retweet_count+'::'+location
saveFile = open('clean2.txt','a')
saveFile.write(saveThis)
saveFile.write('\n')
saveFile.close
return True
except BaseException, e:
print 'failed on data,',str(e)
time.sleep(5)
def on_error(self, status):
print status
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["tweepy"])#topic
The reason it fails on all_data["location"] is that tweets don't have such a property: https://dev.twitter.com/overview/api/tweets
same with friends_count, followers_count - they are properties of users, not tweets.
The code should not be failing on all_date["retweet_count"] as tweets have such a property.
P.S. please include the error message (even if you skip the full error trackback) when reporting errors. makes it's easier to help you, otherwise one has to guess what the error might be.
I am trying to extract the all tweets and timestamp of particular person. I am new in python and tweepy. I have the working code from internet search, but my desire is to print only all tweets of particular user.
import tweepy
# Authentication details. To obtain these visit dev.twitter.com
consumer_key = 'nWGEdfoaBt7d6wWhiAw5Tw'
consumer_secret = 'qM4QfDPqG9JQp6n0fqTCMrj6LJjES6vu2IzqpZLc'
access_token = '2284416938-JbD4F32m9xQPMxKoh6UikpCLoJm8F6xy8wDPS9P'
access_token_secret = 'XvJZQWa6zz5vHcHkUcYBacQKZJE9pcxbpxUUgNo9rN4AG'
if __name__ == '__main__':
# Create authentication token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
print 'Getting statistics for #BarackObama:'
# Get information about the user
data = api.get_user('BarackObama')
print 'Followers: ' + str(data.followers_count)
print 'Tweets: ' + str(data.statuses_count)
print 'Favouries: ' + str(data.favourites_count)
print 'Friends: ' + str(data.friends_count)
print 'Appears on ' + str(data.listed_count) + ' lists'
print(data)
print(data) or print(status) only gives certain tweets(not all) along with other unwanted information in JSON format.
I found extracting only tweets from home and own timeline by use of following code
statuses = tweepy.Cursor(api.user_timeline).items(2)
data = [s.text.encode('utf8') for s in statuses]
print data
Anyway I got the way to download tweet at single window by passing screenname or twitter id, it works for multiple user at a same time as well
I would like to write a program that can pull tweets from Twitter based on keywords, usernames and a date range. I have used Tweepy to write a program to extract tweets from the Streaming API. Currently, it brings tweets based on keywords OR usernames.
I would like to change the program to include a filtering critera: keywords AND usernames AND date_range. How should I modify the program ?
Python Version Used: 2.7
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
# Removed my authentication parameters because of confidentiality
consumer_key=
consumer_secret=
access_token=
access_token_secret=
class StdOutListener(StreamListener):
def on_data(self, data):
saveFile= open('save_tweets.txt','a')
saveFile.write(data)
saveFile.write('\n')
return True
def on_error(self, status):
print(status)
def on_limit(self, track):
return
def on_timeout(self):
print ('time out')
return
def on_disconnect(self, notice):
print (notice)
return
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
track_list=['art','gallery']
follow_list=['70100659','12804422']
stream.filter(track=track_list, follow=follow_list)
pyGTrends does not seem to work. Giving errors in Python.
pyGoogleTrendsCsvDownloader seems to work, logs in, but after getting 1-3 requests (per day!) complains about exhausted quota, even though manual download with the same login/IP works flawlessly.
Bottom line: neither work. Searching through stackoverflow: many questions from people trying to pull csv's from Google, but no workable solution I could find...
Thank you in advance: whoever will be able to help. How should the code be changed? Do you know of another solution that works?
Here's the code of pyGoogleTrendsCsvDownloader.py
import httplib
import urllib
import urllib2
import re
import csv
import lxml.etree as etree
import lxml.html as html
import traceback
import gzip
import random
import time
import sys
from cookielib import Cookie, CookieJar
from StringIO import StringIO
class pyGoogleTrendsCsvDownloader(object):
'''
Google Trends Downloader
Recommended usage:
from pyGoogleTrendsCsvDownloader import pyGoogleTrendsCsvDownloader
r = pyGoogleTrendsCsvDownloader(username, password)
r.get_csv(cat='0-958', geo='US-ME-500')
'''
def __init__(self, username, password):
'''
Provide login and password to be used to connect to Google Trends
All immutable system variables are also defined here
'''
# The amount of time (in secs) that the script should wait before making a request.
# This can be used to throttle the downloading speed to avoid hitting servers too hard.
# It is further randomized.
self.download_delay = 0.25
self.service = "trendspro"
self.url_service = "http://www.google.com/trends/"
self.url_download = self.url_service + "trendsReport?"
self.login_params = {}
# These headers are necessary, otherwise Google will flag the request at your account level
self.headers = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
("Accept-Language", "en-gb,en;q=0.5"),
("Accept-Encoding", "gzip, deflate"),
("Connection", "keep-alive")]
self.url_login = 'https://accounts.google.com/ServiceLogin?service='+self.service+'&passive=1209600&continue='+self.url_service+'&followup='+self.url_service
self.url_authenticate = 'https://accounts.google.com/accounts/ServiceLoginAuth'
self.header_dictionary = {}
self._authenticate(username, password)
def _authenticate(self, username, password):
'''
Authenticate to Google:
1 - make a GET request to the Login webpage so we can get the login form
2 - make a POST request with email, password and login form input values
'''
# Make sure we get CSV results in English
ck = Cookie(version=0, name='I4SUserLocale', value='en_US', port=None, port_specified=False, domain='www.google.com', domain_specified=False,domain_initial_dot=False, path='/trends', path_specified=True, secure=False, expires=None, discard=False, comment=None, comment_url=None, rest=None)
self.cj = CookieJar()
self.cj.set_cookie(ck)
self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
self.opener.addheaders = self.headers
# Get all of the login form input values
find_inputs = etree.XPath("//form[#id='gaia_loginform']//input")
try:
#
resp = self.opener.open(self.url_login)
if resp.info().get('Content-Encoding') == 'gzip':
buf = StringIO( resp.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
else:
data = resp.read()
xmlTree = etree.fromstring(data, parser=html.HTMLParser(recover=True, remove_comments=True))
for input in find_inputs(xmlTree):
name = input.get('name')
if name:
name = name.encode('utf8')
value = input.get('value', '').encode('utf8')
self.login_params[name] = value
except:
print("Exception while parsing: %s\n" % traceback.format_exc())
self.login_params["Email"] = username
self.login_params["Passwd"] = password
params = urllib.urlencode(self.login_params)
self.opener.open(self.url_authenticate, params)
def get_csv(self, throttle=False, **kwargs):
'''
Download CSV reports
'''
# Randomized download delay
if throttle:
r = random.uniform(0.5 * self.download_delay, 1.5 * self.download_delay)
time.sleep(r)
params = {
'export': 1
}
params.update(kwargs)
params = urllib.urlencode(params)
r = self.opener.open(self.url_download + params)
# Make sure everything is working ;)
if not r.info().has_key('Content-Disposition'):
print "You've exceeded your quota. Continue tomorrow..."
sys.exit(0)
if r.info().get('Content-Encoding') == 'gzip':
buf = StringIO( r.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
else:
data = r.read()
myFile = open('trends_%s.csv' % '_'.join(['%s-%s' % (key, value) for (key, value) in kwargs.items()]), 'w')
myFile.write(data)
myFile.close()
Although I don't know python, I may have a solution. I am currently doing the same thing in C# and though I didn't get the .csv file, I got created a custom URL through code and then downloaded that HTML and saved to a text file (also through code). In this HTML (at line 12) is all the information needed to create the graph that is used on Google Trends. However, this has alot of unnecessary text within it that needs to be cut down. But either way, you end up with the same result. The Google Trends data. I posted a more detailed answer to my question here:
Downloading .csv file from Google Trends
There is an alternative module named pytrends - https://pypi.org/project/pytrends/ It is really cool. I would recommend this.
Example usage:
import numpy as np
import pandas as pd
from pytrends.request import TrendReq
pytrend = TrendReq()
#It is the term that you want to search
pytrend.build_payload(kw_list=["Eminem is the Rap God"])
# Find which region has searched the term
df = pytrend.interest_by_region()
df.to_csv("path\Eminem_InterestbyRegion.csv")
Potentially if you have a list of terms to search you could make use of "for loop" to automate the insights as per your wish.