Scrapy: re.match not working to find URLs in a string using regex

I am trying to crawl many URLs in the same domain. I join the URLs into a single string and want to search that string with a regex to find the URLs, but re.match() always returns None. I have tested my regex and it works. This is my code:
# -*- coding: UTF-8 -*-
import scrapy
import codecs
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
from hurriyet.items import HurriyetItem

class hurriyet_spider(CrawlSpider):
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/gundem/']
    rules = (Rule(SgmlLinkExtractor(allow=('\/gundem(\/\S*)?.asp$')), 'parse', follow=True),)

    def parse(self, response):
        image = HurriyetItem()
        text = response.xpath("//a/@href").extract()
        print text
        urls = ''.join(text)
        page_links = re.match("(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))", urls, re.M)
        image['title'] = response.xpath("//h1[@class = 'title selectionShareable'] | //h1[@itemprop = 'name']/text()").extract()
        image['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        image['body2'] = response.xpath("//div[@class = 'ctx_content']").extract()
        print page_links
        return image, text
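As an aside, two details of the standard re module likely explain the None result in the snippet above: re.match() only matches at the very beginning of the string (re.search() or re.findall() scan the whole string), and \b written inside a non-raw string literal is a backspace character rather than a word-boundary anchor. A quick illustration:

import re

urls = "see http://www.hurriyet.com.tr/gundem/ for examples"
print re.match(r"http\S+", urls)           # None: the string does not start with the pattern
print re.search(r"http\S+", urls).group()  # finds the URL anywhere in the string
print re.findall(r"http\S+", urls)         # all matches, returned as a list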

There is no need to use the re module; Scrapy selectors have a built-in feature for regex filtering:
def parse(self, response):
    ...
    page_links = response.xpath("//a/@href").re('your_regex_expression')
    ...
With that said, I suggest you play with this approach in the Scrapy shell first to make sure your regex is indeed working, because I wouldn't expect anyone to try to debug a mile-long regex - it's basically a write-only language :)
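For example, a quick check in the Scrapy shell (a sketch; the pattern below is only a simple placeholder, not the original expression) could look like this:

$ scrapy shell "http://www.hurriyet.com.tr/gundem/"
>>> links = response.xpath("//a/@href").re(r'/gundem/\S*')
>>> len(links)      # how many hrefs matched the pattern
>>> links[:5]       # eyeball a few of them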

Related

blank list error when taking data from web site

I have a problem: my code returns an empty list. I searched the web but found no answer that fits my case.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import re

site = "http://www.hurriyet.com.tr"
regex = "<span class='news-title'>(.+?)</span>"
comp = re.compile(regex)
print(comp)
print(regex)
htmlkod = urllib.urlopen(site).read()
titles = re.findall(regex, htmlkod)
print(titles)
i = 1
for title in titles:
    print str(i), title.decode("iso8859-9")
    print(title)
    i += 1
I expect it to return the news titles, but it gives me "[]", an empty list.
I recommend using BeautifulSoup instead of a regex, like this:
from urllib import urlopen
from bs4 import BeautifulSoup

site = "http://www.hurriyet.com.tr"
openurl = urlopen(site)
soup = BeautifulSoup(openurl, "html.parser")
getTitle = soup.findAll('span', attrs={'class': 'news-title'})
for title in getTitle:
    print title.text
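If you are on Python 3, a roughly equivalent sketch (assuming requests and bs4 are installed, and that the page still marks titles with the news-title class) would be:

import requests
from bs4 import BeautifulSoup

site = "http://www.hurriyet.com.tr"
soup = BeautifulSoup(requests.get(site).text, "html.parser")
for title in soup.find_all("span", class_="news-title"):
    print(title.get_text(strip=True))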

AWS Lambda - Generate CSV In Memory and send it as an attachment to an Email

I'm trying to write an AWS Lambda function in Python 2.7 that generates an in-memory CSV file and emails it as an attachment. I feel like I'm close with this script based on what I've learned, but I'm not quite there.
# Import smtplib for the actual sending function
import smtplib
import sys
import csv
import cStringIO
from os.path import basename
# Import the email modules we'll need
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

server = smtplib.SMTP('smtp.postmarkapp.com', 587)
server.starttls()
server.login('.....', '.....')

list = []
row1 = ["One", "Two", "Three"]
list.append(row1)

msg = MIMEMultipart()
msg['To'] = "daniel@mydomain.com"
msg['From'] = "noreply@mydomain.com"
msg['Subject'] = "DG Test subject"
msg.attach(MIMEText("Test Message"))

csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, lineterminator='\n')
writer.writerow(["1", "2", "3"])
for row in list:
    writer.writerow(row)

print(csv_buffer.getvalue())
msg.attach(csv_buffer)

try:
    response = server.sendmail(msg['From'], ["daniel@mydomain.com"], msg.as_string())
    server.quit()
except AttributeError as error:
    print(error)
else:
    print(response)
This gives me the following error:
1,2,3
One,Two,Three
'cStringIO.StringO' object has no attribute 'get_content_maintype'
Basically it comes down to not being sure how to use the csv_buffer object. I assume I just need to give the object that attribute somehow, but I'm not quite sure how. If I try to add any additional arguments to the .attach() call, it complains that I have too many arguments.
Thanks!
I figured it out, thanks to stitching together a few SO posts.
import cStringIO
import csv

csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, delimiter=',', quoting=csv.QUOTE_ALL)
writer.writerow(["1", "2", "3"])
for row in list:
    writer.writerow(row)
print(csv_buffer.getvalue())

# new lines: wrap the CSV text in a MIME part so msg.attach() accepts it
csv_file = MIMEText(csv_buffer.getvalue())
# add_header modifies csv_file in place (it returns None), so no assignment is needed
csv_file.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(csv_file)
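Since MIMEApplication is already imported in the original script, a roughly equivalent sketch (reusing the same msg and csv_buffer as above) is to attach the CSV as a generic application part instead of a text part:

from email.mime.application import MIMEApplication

part = MIMEApplication(csv_buffer.getvalue(), Name="csv_file.csv")
part.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(part)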

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it in __init__.py, but that looks and sounds wrong. Here is my code from __init__.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup

urls = "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")
for i in soup.findAll("table"):
    try:
        momo = 1
        a = len(i.contents[3].contents)
        while momo <= a:
            foo = Betting(matches=i.contents[3].contents[momo].findAll("td")[2].text)
            momo += 2
            foo.save()
        print i.contents[3].contents[3].findAll("td")[0].text
    except:
        momo = 1
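One common way to do this (a sketch only; the app name myapp and the file path are assumptions, not taken from the question) is to move the scraper into a custom management command, which runs inside Django's environment and therefore has normal access to the ORM:

# myapp/management/commands/scrape_matches.py  (assumed location)
import urllib

from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from myapp.models import Betting  # assumed import path


class Command(BaseCommand):
    help = "Scrape tomorrow's matches and store them in the Betting table"

    def handle(self, *args, **options):
        html = urllib.urlopen("https://sms.betyetu.co.ke/tomorrow.html").read()
        soup = BeautifulSoup(html, "html.parser")
        for table in soup.findAll("table"):
            cells = table.contents[3].contents
            # same stride the original while loop used (every second child)
            for cell in cells[1::2]:
                if not hasattr(cell, "findAll"):  # skip bare text nodes
                    continue
                tds = cell.findAll("td")
                if len(tds) > 2:
                    Betting.objects.create(matches=tds[2].text)

It can then be run manually or from cron with python manage.py scrape_matches, instead of executing as a side effect of importing the package.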

How to crawl the latest articles on a specific site using a specific set of keywords?

I am trying to write Python code for crawling article links on specific sites based on a keyword, such as the name of the article, but I am not getting the appropriate links.
import sys
import requests
from bs4 import BeautifulSoup
import urllib.request
from urlparse import urlparse

def extract_article_links(url, data):
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    responseData = response.read()
    #r = requests.get(url)
    soup = BeautifulSoup(responseData.content)
    links = soup.find_all('a')
    for link in links:
        try:
            #if 'http' in link:
            print ("<a href='%s'>%s</a>" % (link.get('href'), link.text))
        except Exception as e:
            print (e)
    responseData = soup.find_all("div", {"class:info"})
    print responseData
    for item in responseData:
        print (item.contents[0].text)
        print (item.contents[1].text)

if __name__ == "__main__":
    from sys import argv
    if (len(argv) < 2):
        print "Insufficient arguments..!!"
        sys.exit(1)
    url = sys.argv[1]
    values = {'s': 'article', 'submit': 'search'}
    data = urlparse.urlencode(values)
    data = data.encode('utf-8')
    extract_article_links(url, data)
Try lxml: analyze the HTML, locate the elements you are looking for, and then you can do this easily with XPath:
from lxml import html
print map(lambda link: link, html.fromstring(source).xpath('//a/@href'))
Of course, you need to modify the XPath according to the attribute you are looking for.
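A slightly fuller sketch along those lines (assuming requests is available; the search parameters are copied from the question's values dict and may not match what the target site actually expects):

import requests
from lxml import html

def article_links(url, keyword):
    # fetch the search page and return every href found in the document
    source = requests.get(url, params={'s': keyword, 'submit': 'search'}).content
    return html.fromstring(source).xpath('//a/@href')

for link in article_links("http://www.hindustantimes.com/", "article"):
    print link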
Try this:
import requests
from bs4 import BeautifulSoup

def extract_article_links(url, data):
    soup = BeautifulSoup(requests.get('http://www.hindustantimes.com/Search/search.aspx?q={}&op=All&pt=all&auth=all'.format(data)).content)
    responseData = soup.find("ul", {'class': 'searchNews'})
    _a, _li = responseData.find_all('a'), responseData.find_all('li')
    for i, j in zip(_a, _li):
        print '='*40, '\nLink: ', i['href'], '\nTitle: ', i.contents[0], '\nContent: \n\t', j.p.get_text(), '\n'

if __name__ == "__main__":
    url = "http://www.hindustantimes.com/"
    extract_article_links(url, 'article')

Cannot login again after resuming crawl. Cookies are not sticky after resuming scrapy

I have a CrawlSpider; the code is below. I use Tor through tsocks.
When I start my spider, everything works fine: using init_request I can log in to the site and crawl with sticky cookies.
But the problem occurs when I stop and resume the spider: the cookies stop being sticky.
Here is the output from Scrapy.
=======================INIT_REQUEST================
2013-01-30 03:03:58+0300 [my] INFO: Spider opened
2013-01-30 03:03:58+0300 [my] INFO: Resuming crawl (675 requests scheduled)
............ and here the crawling began
So... callback=self.login_url in init_request is not fired!
I thought the Scrapy engine did not want to send the login-page request again, so before resuming I changed login_page (I can log in from any page on the site) to a different URL that is not included in restrict_xpaths.
The result: after resuming, I cannot log in and the previous cookies are lost.
Does anyone have any ideas?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join, Identity
from beles_com_ua.items import Product
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.markup import remove_entities
from django.utils.html import strip_tags
from datetime import datetime
from scrapy import log
import re
from scrapy.http import Request, FormRequest

class ProductLoader(XPathItemLoader):
    .... some code is here ...

class MySpider(CrawlSpider):
    name = 'my'
    login_page = 'http://test.com/index.php?section=6&type=12'
    allowed_domains = ['test.com']
    start_urls = [
        'http://test.com/index.php?section=142',
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.',), restrict_xpaths=('...my xpath...')), callback='parse_item', follow=True),
    )

    def start_requests(self):
        return self.init_request()

    def init_request(self):
        print '=======================INIT_REQUEST================'
        return [Request(self.login_page, callback=self.login_url)]

    def login_url(self, response):
        print '=======================LOGIN======================='
        """Generate a login request."""
        return FormRequest.from_response(response,
            formdata={'login': 'mylogin', 'pswd': 'mypass'},
            callback=self.after_login)

    def after_login(self, response):
        print '=======================AFTER_LOGIN ...======================='
        if "images/info_enter.png" in response.body:
            print "==============Bad times :(==============="
        else:
            print "=========Successfully logged in.========="
            for url in self.start_urls:
                yield self.make_requests_from_url(url)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        entry = hxs.select("//div[@class='price']/text()").extract()
        l = ProductLoader(Product(), hxs)
        if entry:
            name = hxs.select("//div[@class='header_box']/text()").extract()[0]
            l.add_value('name', name)
        ... some code is here ...
        return l.load_item()
init_request() is only available when you subclass from InitSpider, not CrawlSpider.
You need to subclass your spider from InitSpider, like this:
class WorkingSpider(InitSpider):
    login_page = 'http://www.example.org/login.php'

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)
But remember that you can't define rules in InitSpider, as they are only available in CrawlSpider; you need to extract the links manually.
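A minimal sketch of that manual approach (placeholder URLs, credentials and selectors; this assumes the same old scrapy.contrib API used elsewhere in this question):

import urlparse

from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector

class WorkingSpider(InitSpider):
    name = 'working'
    allowed_domains = ['example.org']
    login_page = 'http://www.example.org/login.php'
    start_urls = ['http://www.example.org/index.php']

    def init_request(self):
        # called once, before the normal crawl starts
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        return FormRequest.from_response(
            response,
            formdata={'login': 'mylogin', 'pswd': 'mypass'},
            callback=self.check_login)

    def check_login(self, response):
        # hand control back to InitSpider so start_urls get scheduled
        return self.initialized()

    def parse(self, response):
        # no Rules available here, so extract and follow links by hand
        hxs = HtmlXPathSelector(response)
        for href in hxs.select("//a/@href").extract():
            yield Request(urlparse.urljoin(response.url, href), callback=self.parse)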