How do I fix this syntax error in a tweepy script? - python-2.7

Hello, I'm trying to learn programming for a project.
I've been working on a simple script (using tweepy) to download tweets from a search for a keyword into a .csv file. However, I keep getting a syntax error on multiple lines (from 28 to 38) and I don't know what the problem is at this point. Can somebody tell me what's wrong?
Here is the code I've been working on...
# -*- coding: utf-8 -*-
#import modules
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import unicodecsv
from unidecode import unidecode
import csv
from textblob import TextBlob
ckey = "XXXXXXXXXXXXXXXXXX"
csecret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
atoken = "XXXXXXXXXXXX-XXXXXXXXXXXXXXXXXXX"
asecret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret, 'access_token_key':atoken, 'access_token_secret':asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)
fName= raw_input("Nombre del Archivo: ")+'.csv'
for tweet in tweepy.Cursor(api.search, q=('dulceveneno'), since='2014-09-16', until='2017-07-25').items(5):
    tweet_info = [tweet.author.name.encode('utf8')
                  tweet.author.screen_name.encode('utf8')
                  tweet.created_at
                  tweet.text.encode('utf8')
                  tweet.retweeted
                  tweet.favorited
                  tweet.user.location.encode('utf8')
                  tweet.user.time_zone
                  tweet.geo
                  tweet.entities.get('hashtags')]
    with open(fName, 'wb') as file:
        writer = unicodecsv.writer(file, delimiter = ',', quotechar = '"')
        # Write header row.
        writer.writerow(["Nombre",
                         "UserName",
                         "Fecha",
                         "Tweet",
                         "Retweet?"
                         "Favs"
                         "Ubicación",
                         "Horario",
                         "Geo",
                         "Hashtags"])
        # Write data to CSV.
        writer.writerow(tweet_info)
        # Show progress.
        print("DONE!" % q)
The problem is in the tweet_info part, I guess...

You haven't posted the actual error, but I can see that in tweet_info you do not have any commas separating the elements of the list.
It should be:
tweet_info = [tweet.author.name.encode('utf8'),
              tweet.author.screen_name.encode('utf8'),
              tweet.created_at,
              tweet.text.encode('utf8'),
              tweet.retweeted,
              tweet.favorited,
              tweet.user.location.encode('utf8'),
              tweet.user.time_zone,
              tweet.geo,
              tweet.entities.get('hashtags')]
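Putting the commas back is the main fix. As a side note, the original script also reopens fName in 'wb' mode inside the Cursor loop, which truncates the file on every iteration, so only the last tweet would survive. A minimal sketch of the whole loop with the commas fixed and the file opened once before the loop (placeholder keys, not the exact original script):
# -*- coding: utf-8 -*-
# Minimal sketch only; the keys below are placeholders.
import tweepy
import unicodecsv

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth)

query = 'dulceveneno'  # keyword taken from the question's Cursor call
fName = raw_input("Nombre del Archivo: ") + '.csv'

# Open the file once so every tweet appends a row instead of overwriting the file.
with open(fName, 'wb') as f:
    writer = unicodecsv.writer(f, delimiter=',', quotechar='"')
    # Header row, written once.
    writer.writerow(["Nombre", "UserName", "Fecha", "Tweet", "Retweet?",
                     "Favs", "Ubicación", "Horario", "Geo", "Hashtags"])
    for tweet in tweepy.Cursor(api.search, q=query,
                               since='2014-09-16', until='2017-07-25').items(5):
        # One row per tweet, with commas separating every element.
        writer.writerow([tweet.author.name.encode('utf8'),
                         tweet.author.screen_name.encode('utf8'),
                         tweet.created_at,
                         tweet.text.encode('utf8'),
                         tweet.retweeted,
                         tweet.favorited,
                         tweet.user.location.encode('utf8'),
                         tweet.user.time_zone,
                         tweet.geo,
                         tweet.entities.get('hashtags')])

print("DONE! %s" % query)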

Related

Blank list error when taking data from a web site

I have a problem with this code: it returns an empty list.
I searched online but found no answer that works for me.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import re
site = "http://www.hurriyet.com.tr"
regex = "<span class='news-title'>(.+?)</span>"
comp = re.compile(regex)
print(comp)
print(regex)
htmlkod = urllib.urlopen(site).read()
titles = re.findall(regex, htmlkod)
print(titles)
i=1
for title in titles:
    print str(i), title.decode("iso8859-9")
    print(title)
    i += 1
I expect it to return the news titles, but it returns an empty list ("[]").
I recommend using BeautifulSoup instead of regex, like this:
from urllib import urlopen
from bs4 import BeautifulSoup
site = "http://www.hurriyet.com.tr"
openurl = urlopen(site)
soup = BeautifulSoup(openurl, "html.parser")
getTitle = soup.findAll('span', attrs={'class': 'news-title'})
for title in getTitle:
    print title.text
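If you also want the numbered output the original loop was trying to print, a small variation on the same idea (just a sketch, reusing the same URL and class name) could be:
# Sketch: print the titles with a running number, as the original loop attempted.
from urllib import urlopen
from bs4 import BeautifulSoup

site = "http://www.hurriyet.com.tr"
soup = BeautifulSoup(urlopen(site), "html.parser")
for i, span in enumerate(soup.findAll('span', attrs={'class': 'news-title'}), start=1):
    # get_text() returns unicode, so no manual iso8859-9 decoding is needed.
    print i, span.get_text().strip()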

AWS Lambda - Generate CSV In Memory and send it as an attachment to an Email

I'm trying to write an AWS Lambda service using Python 2.7 that will generate an in-memory CSV file and email it as an attachment. I feel like I'm close with this script based on what I've learned, but I'm not quite there.
# Import smtplib for the actual sending function
import smtplib
import sys
import csv
import cStringIO
from os.path import basename
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
# Import the email modules we'll need
server = smtplib.SMTP('smtp.postmarkapp.com', 587)
server.starttls()
server.login('.....','.....')
list = []
row1 = ["One","Two","Three"]
list.append(row1)
msg = MIMEMultipart()
msg['To'] = "daniel@mydomain.com"
msg['From'] = "noreply@mydomain.com"
msg['Subject'] = "DG Test subject"
msg.attach(MIMEText("Test Message"))
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, lineterminator='\n')
writer.writerow(["1","2","3"])
for row in list:
    writer.writerow(row)
print(csv_buffer.getvalue())
msg.attach(csv_buffer)
try:
    response = server.sendmail(msg['From'], ["daniel@mydomain.com"], msg.as_string())
    server.quit()
except AttributeError as error:
    print(error)
else:
    print(response)
This gives me the following error:
1,2,3
One,Two,Three
'cStringIO.StringO' object has no attribute 'get_content_maintype'
Basically it comes down to not being sure how to use the csv_buffer object. I'm assuming I just need to add that attribute to the object somehow, but I'm not quite sure how. If I try to add any additional arguments to the .attach() line, it complains that I have too many arguments.
Thanks!
I figured it out, thanks to stitching together a few SO posts.
import cStringIO
import csv
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, delimiter=',', quoting=csv.QUOTE_ALL)
writer.writerow(["1","2","3"])
for row in list:
    writer.writerow(row)
print(csv_buffer.getvalue())
# new lines
csv_file = MIMEText(csv_buffer.getvalue())
# add_header modifies csv_file in place (it returns None), so no assignment is needed
csv_file.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(csv_file)
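The key point is that only MIME objects can be attached, not a raw cStringIO buffer. An equivalent sketch using the MIMEApplication class that was already imported in the question (the filename report.csv is just a placeholder) would be:
# Sketch: attach the in-memory CSV as a separate MIME part with a Content-Disposition header.
import csv
import cStringIO
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

msg = MIMEMultipart()
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, delimiter=',', quoting=csv.QUOTE_ALL)
writer.writerow(["1", "2", "3"])

part = MIMEApplication(csv_buffer.getvalue(), Name="report.csv")
part.add_header('Content-Disposition', 'attachment', filename="report.csv")
msg.attach(part)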

Download all csv files from URL

I want to download all the csv files from this page; any idea how I can do this?
from bs4 import BeautifulSoup
import requests
url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)
for link in soup.findAll("a"):
    print link.get("href")
Something like this should work:
from bs4 import BeautifulSoup
from time import sleep
import requests
if __name__ == '__main__':
    url = requests.get('http://www.football-data.co.uk/englandm.php').text
    soup = BeautifulSoup(url)
    for link in soup.findAll("a"):
        current_link = link.get("href")
        if current_link.endswith('csv'):
            print('Found CSV: ' + current_link)
            print('Downloading %s' % current_link)
            sleep(10)
            response = requests.get('http://www.football-data.co.uk/%s' % current_link, stream=True)
            fn = current_link.split('/')[0] + '_' + current_link.split('/')[1] + '_' + current_link.split('/')[2]
            with open(fn, "wb") as handle:
                for data in response.iter_content():
                    handle.write(data)
You just need to filter the hrefs, which you can do with a CSS selector, a[href$=.csv], which will find the hrefs ending in .csv; then join each one to the base URL, request it, and finally write the content:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
from os.path import basename
base = "http://www.football-data.co.uk/"
url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)
for link in (urljoin(base, a["href"]) for a in soup.select("a[href$=.csv]")):
    with open(basename(link), "w") as f:
        f.writelines(requests.get(link))
Which will give you five files, E0.csv, E1.csv, E2.csv, E3.csv, E4.csv with all the data inside.

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it under __init__.py, but that looks and sounds wrong. Here is my code in __init__.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup
urls= "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")
for i in soup.findAll("table"):
    try:
        momo = 1
        a = len(i.contents[3].contents)
        while momo <= a:
            foo = Betting(matches=i.contents[3].contents[momo].findAll("td")[2].text)
            momo += 2
            foo.save()
            print i.contents[3].contents[3].findAll("td")[0].text
    except:
        momo = 1
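One common place for this kind of one-off population script is a custom Django management command rather than __init__.py. A minimal sketch, assuming the Betting model lives in an app called scraper (the app name and file path are placeholders):
# scraper/management/commands/scrape_matches.py  (app name and path are assumptions)
import urllib

from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from scraper.models import Betting  # adjust to the real app


class Command(BaseCommand):
    help = "Populate the Betting table with tomorrow's matches from betyetu.co.ke"

    def handle(self, *args, **options):
        htmltext = urllib.urlopen("https://sms.betyetu.co.ke/tomorrow.html").read()
        soup = BeautifulSoup(htmltext, "html.parser")
        for table in soup.findAll("table"):
            try:
                rows = table.contents[3].contents
                # Every second entry (indices 1, 3, 5, ...) holds a match row,
                # mirroring the momo += 2 loop in the question.
                for cell in rows[1::2]:
                    Betting(matches=cell.findAll("td")[2].text).save()
            except (IndexError, AttributeError):
                continue
The command would then run with python manage.py scrape_matches, which keeps the scrape out of import time and still gives it full access to the Django ORM.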

How to crawl the latest articles on a specific site using a specific keyword?

I am trying Python code for crawling article links on specific sites based on a keyword, like the name of the article, but I don't get the appropriate links.
import sys
import requests
from bs4 import BeautifulSoup
import urllib.request
from urlparse import urlparse
def extract_article_links(url,data):
    req = urllib.request.Request(url,data)
    response = urllib.request.urlopen(req)
    responseData = response.read()
    #r = requests.get(url)
    soup = BeautifulSoup(responseData.content)
    links = soup.find_all('a')
    for link in links:
        try:
            #if 'http' in link:
            print ("<a href='%s'>%s</a>" % (link.get('href'),link.text))
        except Exception as e :
            print (e)
    responseData = soup.find_all("div",{"class:info"})
    print responseData
    for item in responseData:
        print (item.contents[0].text)
        print (item.contents[1].text)
if __name__ == "__main__":
    from sys import argv
    if (len(argv)<2):
        print "Insufficient arguments..!!"
        sys.exit(1)
    url = sys.argv[1]
    values = {'s':'article','submit':'search'}
    data = urlparse.urlencode(values)
    data = data.encode('utf-8')
    extract_article_links(url,data)
Try lxml: analyze the HTML, locate the elements you are looking for, and then you can do this easily with XPath:
from lxml import html
print map(lambda link: link, html.fromstring(source).xpath('//a/@href'))
Of course, you need to modify the XPath according to the attribute you are looking for.
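Since source is not defined in that snippet, a self-contained sketch of the same idea (fetching the page with requests, which the question already imports) might look like:
# Sketch: fetch a page and list every href on it with lxml's xpath.
import requests
from lxml import html

source = requests.get('http://www.hindustantimes.com/').content  # URL from the question
tree = html.fromstring(source)
for link in tree.xpath('//a/@href'):
    print link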
Try this:
import requests
from bs4 import BeautifulSoup
def extract_article_links(url,data):
    soup = BeautifulSoup(requests.get('http://www.hindustantimes.com/Search/search.aspx?q={}&op=All&pt=all&auth=all'.format(data)).content)
    responseData = soup.find("ul",{'class':'searchNews'})
    _a, _li = responseData.find_all('a'), responseData.find_all('li')
    for i,j in zip(_a,_li):
        print '='*40,'\nLink: ',i['href'], '\nTitle: ',i.contents[0], '\nContent: \n\t', j.p.get_text(),'\n'
if __name__ == "__main__":
url = "http://www.hindustantimes.com/"
extract_article_links(url,'article')