Run CrawlerProcess in Scrapy with Splash - python-2.7

I have a Scrapy + Splash spider to crawl data. Now I want to run the spider from a script, so I use CrawlerProcess. My file looks like this:
import scrapy
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess

class ProvinceSpider(scrapy.Spider):
    name = 'province'

    def start_requests(self):
        url = "https://e.vnexpress.net/covid-19/vaccine"
        yield SplashRequest(url=url, callback=self.parse)

    def parse(self, response):
        provinces = response.xpath("//div[@id='total_vaccine_province']/ul[@data-weight]")
        for province in provinces:
            yield {
                'province_name': province.xpath(".//li[1]/text()").get(),
                'province_population': province.xpath(".//li[2]/text()").get(),
                'province_expected_distribution': province.xpath(".//li[3]/text()").get(),
                'province_actual_distribution': province.xpath(".//li[4]/text()").get(),
                'province_distribution_percentage': province.xpath(".//li[5]/div/div/span/text()").get(),
            }

process = CrawlerProcess(settings={
    "FEEDS": {
        "province.json": {"format": "json"},
    },
})

process.crawl(ProvinceSpider)
process.start()  # the script will block here until the crawling is finished
But when I run
python3 province.py
it doesn't connect to the Splash server and therefore can't crawl any data. Any idea which part I got wrong? Thanks in advance.

It turns out the issue you experienced has already been covered by the following answer: Answer
A quick breakdown (if you're not interested in the details):
Go to settings.py and add a USER_AGENT; in my case I left it as:
USER_AGENT = 'testit (http://www.yourdomain.com)'
Then run your crawler and it should work. Why? Your Scrapy requests were being blocked by the site.
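One note on the script in the question (an addition, not from the original answer): because it passes its own settings dict to CrawlerProcess, a USER_AGENT added to settings.py only takes effect if the project settings are actually loaded. A minimal sketch of setting it directly in the script instead, reusing the placeholder value above:

process = CrawlerProcess(settings={
    # Placeholder user agent, same as the settings.py example above.
    "USER_AGENT": "testit (http://www.yourdomain.com)",
    "FEEDS": {
        "province.json": {"format": "json"},
    },
})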
Output:
2021-12-26 13:15:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://e.vnexpress.net/covid-19/vaccine>
{'province_name': 'HCMC', 'province_population': '7.2M', 'province_expected_distribution': '13.8M', 'province_actual_distribution': '14.6M', 'province_distribution_percentage': '100%'}
2021-12-26 13:15:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://e.vnexpress.net/covid-19/vaccine>
{'province_name': 'Hanoi', 'province_population': '6.2M', 'province_expected_distribution': '11.4M', 'province_actual_distribution': '12.3M', 'province_distribution_percentage': '99,2%'}
2021-12-26 13:15:32 [scrapy.core.scraper] DEBUG: Scraped from <200 https://e.vnexpress.net/covid-19/vaccine>
{'province_name': 'Dong Nai', 'province_population': '2.4M', 'province_expected_distribution': '4.3M', 'province_actual_distribution': '5M', 'province_distribution_percentage': '100%'}
...
...
Here are my custom settings:
BOT_NAME = 'testing'
SPIDER_MODULES = ['testing.spiders']
NEWSPIDER_MODULE = 'testing.spiders'
SPLASH_URL = 'http://localhost:8050'
USER_AGENT = 'testing (http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'
}
SPIDER_MIDDLEWARES = {
    'testing.middlewares.TestingSpiderMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
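An additional note (not part of the original answer): if these options live in the project's settings.py, one way to make sure they are applied when the spider is started from a standalone script, including the Splash middlewares and SPLASH_URL, is to load the project settings explicitly and only override the feed:

# Sketch: load settings.py so the scrapy-splash middlewares apply when running from a script.
# Run the script from inside the Scrapy project so scrapy.cfg can be found.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("FEEDS", {"province.json": {"format": "json"}})

process = CrawlerProcess(settings)
process.crawl(ProvinceSpider)
process.start()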

Related

Flask cache throws error - KeyError: <flask_caching.Cache object at 0x7ff585e47358>

I'm trying to add caching to my Python Flask application.
I did what the Flask-Caching pages suggest, so I have this in the app module:
from flask import Flask
from flask_caching import Cache
from flask_cors import CORS
# (blueprint imports and the Database import are omitted here)

config = {
    # "DEBUG": True,
    "env": 'dev',
    "secret_key": 'my secret stuff',
    "CACHE_TYPE": "simple",
    "CACHE_DEFAULT_TIMEOUT": 300
}

app = Flask(__name__)
app.config.from_mapping(config)
cors = CORS(app, resources={"/api/*": {"origins": "*"}})
cache = Cache(app)
cache.init_app(app)

@app.before_first_request
def init_resources():
    Database.initialize()

app.register_blueprint(auth_api_blueprint, url_prefix="/api/auth")
app.register_blueprint(user_api_blueprint, url_prefix="/api/user")
app.register_blueprint(year_api_blueprint, url_prefix="/api/year")
app.register_blueprint(notice_api_blueprint, url_prefix="/api/notice")
app.register_blueprint(event_api_blueprint, url_prefix="/api/event")
app.register_blueprint(admins_api_blueprint, url_prefix="/api/admin")
app.register_blueprint(guardian_api_blueprint, url_prefix="/api/guardian")
app.register_blueprint(employee_api_blueprint, url_prefix="/api/employee")
app.register_blueprint(student_api_blueprint, url_prefix="/api/student")
app.register_blueprint(teacher_api_blueprint, url_prefix="/api/teacher")

if __name__ == '__main__':
    with app.app_context():
        cache.clear()
    # app.run(port=8080) - port does not work here, it is still default 5000
    app.run()
Then I applied the cached decorator to the method like this:
from flask import Blueprint, jsonify

from common.database import Database
from common.decorators import requires_login

year_api_blueprint = Blueprint('api/year', __name__)
from src.app import cache

@year_api_blueprint.route('/all')
@cache.cached(timeout=500, key_prefix="years")
# @requires_login - this needs to be public
def get_all_years():
    data = Database.find("years", {})
    if data is not None:
        return jsonify([year for year in data])
Everything seems to work fine, and the endpoint above is no longer called many times (just once).
However, I am getting this error every time the cached years are used:
127.0.0.1 - - [20/Oct/2020 17:36:32] "OPTIONS /api/year/all HTTP/1.1" 200 -
Exception possibly due to cache backend.
Traceback (most recent call last):
File "/home/smoczyna/Python-Projects/SchoolMateAPI/venv/lib/python3.6/site-packages/flask_caching/__init__.py", line 435, in decorated_function
rv = self.cache.get(cache_key)
File "/home/smoczyna/Python-Projects/SchoolMateAPI/venv/lib/python3.6/site-packages/flask_caching/__init__.py", line 244, in cache
return app.extensions["cache"][self]
KeyError: <flask_caching.Cache object at 0x7ff585e47358>
I've seen a similar post here, but I don't understand the solution and don't know how to apply it. I have nothing more than this about the cache in the app.
So my question is: what is possibly missing or misconfigured here?
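For comparison only (not a confirmed fix for this question), the Flask-Caching docs describe a deferred-initialization pattern where the Cache object is created once without an app and bound with a single init_app() call, which avoids importing a cache that was bound inside another module's app setup. A minimal sketch with illustrative module names:

# extensions.py (hypothetical module): create the cache with no app bound yet
from flask_caching import Cache

cache = Cache(config={"CACHE_TYPE": "simple", "CACHE_DEFAULT_TIMEOUT": 300})

# app.py: bind the cache to the application exactly once
from flask import Flask
# from extensions import cache  # hypothetical import path

app = Flask(__name__)
cache.init_app(app)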

Spider not found in scrapyd.schedule

I'm trying to start a scrapyd job from Django.
The scrapyd code looks like this:
unique_id = str(uuid4())  # create a unique ID.
settings = {
    'unique_id': unique_id,  # unique ID for each record for DB
    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
task = scrapyd.schedule('scrap_lowongan', 'josbid', settings=settings)
However, I'm getting:
scrapyd_api.exceptions.ScrapydResponseError: spider 'josbid' not found
My folder structure is something like this:
Bitalisy>
    Bitalisy
    Scraping>
        views.py (Schedule scrapyd from here)
    scrap_lowongan> (scrapy Project)
        scrap_lowongan>
            spider>
                jobsid.py
            settings.py
            pipelines.py
        scrapyd.conf
        scrapy.cfg
Note that I'm using scrapyd.conf because I have two scrapy projects. The scrapyd.conf:
[scrapyd]
http_port = 6801
Thank you
I have found that you must add:
scrapyd = ScrapydAPI('http://localhost:6801')
After restarting scrapyd, it works like a charm. Read more in the documentation here.
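Putting the pieces together, a minimal sketch of scheduling the spider through python-scrapyd-api against the non-default port from scrapyd.conf (project and spider names taken from the question above):

from uuid import uuid4
from scrapyd_api import ScrapydAPI

# Point the client at the port configured in scrapyd.conf (6801 instead of the default 6800).
scrapyd = ScrapydAPI('http://localhost:6801')

unique_id = str(uuid4())  # unique ID for each record for the DB
settings = {
    'unique_id': unique_id,
    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
}
task = scrapyd.schedule('scrap_lowongan', 'josbid', settings=settings)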

selenium with chromedriver on centOS7 for spidering

I'm trying to make a crawler for my server.
I found the chilkat library's CKSpider, but it does not support JS rendering,
so I'm trying to use the Selenium webdriver with Chrome.
I'm running CentOS 7 with Python 2.7.
I want to spider all pages under one base domain.
Example
BaseDomain = example.com
then find all pages, something like
example.com/event/.../../...
example.com/games/.../...
example.com/../.../..
...
My crawler code:
from selenium import webdriver
import time

options = webdriver.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
chrome_driver_binary = "/root/chromedriver"
options.add_argument("--headless")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
options.add_argument("lang=ko-KR,ko,en-US,en")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_driver_binary, chrome_options=options)

host = "https://example.com"

def Crawler(Url):
    driver.get(Url)
    driver.implicitly_wait(3)
    # Do Something
    time.sleep(3)
    # Crawl next

Crawler(host)
driver.quit()
How can I crawl the next pages? Is there another way to do this in Selenium,
or do I need another library for that?
Thanks for any tips or advice.
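For what it's worth, one common approach (a sketch, not from the original post) is to collect same-domain links from each rendered page and walk them breadth-first with a visited set, reusing the driver created above:

# Rough sketch: breadth-first crawl limited to one base domain (Python 2.7).
from urlparse import urlparse  # urllib.parse on Python 3
import collections

def crawl_domain(start_url):
    base_netloc = urlparse(start_url).netloc
    queue = collections.deque([start_url])
    visited = set()
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        driver.get(url)
        driver.implicitly_wait(3)
        # ... scrape whatever you need from the rendered page here ...
        for a in driver.find_elements_by_tag_name("a"):
            href = a.get_attribute("href")  # Selenium returns absolute URLs
            if href and urlparse(href).netloc == base_netloc:
                queue.append(href.split("#")[0])

crawl_domain("https://example.com")
driver.quit()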

scrapy shell enable javascript

I am trying to get the response.body of https://www.wickedfire.com/ in scrapy shell,
but the response.body tells me:
<html><title>You are being redirected...</title>\n<noscript>Javascript is required. Please enable javascript before you are allowed to see this page...
How do I activate the JavaScript? Or is there something else I can do?
Thank you in advance
UPDATE:
I've installed scrapy-splash with pip install scrapy-splash
and I put these settings in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPLASH_URL = 'http://localhost:8050/'
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
It did give me an error:
NameError: Module 'scrapy_splash' doesn't define any object named 'SplashCoockiesMiddleware'
(note the misspelling 'SplashCoockies...' in that error, which suggests the entry was misspelled in my settings at the time). I commented that line out after the error, and then it passed.
And my script looks like this, but it doesn't work:
...
from scrapy_splash import SplashRequest, SplashFormRequest
from scrapy.http import FormRequest
...
    # inside the spider class:
    start_urls = ['https://www.wickedfire.com/login.php?do=login']
    payload = {'vb_login_username': '', 'vb_login_password': ''}

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 1})

    def parse(self, response):
        # url = "https://www.wickedfire.com/login.php?do=login"
        r = SplashFormRequest(response, formdata=self.payload, callback=self.after_login)
        return r

    def after_login(self, response):
        print response.body + "THIS IS THE BODY"
        if "incorrect" in response.body:
            self.logger.error("Login failed")
            return
        else:
            results = FormRequest.from_response(response,
                                                formdata={'query': 'bitter'},
                                                callback=self.parse_page)
            return results
...
This is the error that I get:
[scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://wickedfire.com/ via http://localhost:8050/render.html> (failed 1 times): 502 Bad Gateway
[scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://wickedfire.com/ via http://localhost:8050/render.html> (failed 2 times): 502 Bad Gateway
[scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET https://wickedfire.com/ via http://localhost:8050/render.html> (failed 3 times): 502 Bad Gateway
[scrapy.core.engine] DEBUG: Crawled (502) <GET https://wickedfire.com/ via http://localhost:8050/render.html> (referer: None) ['partial']
[scrapy.spidermiddlewares.httperror] INFO: Ignoring response <502 https://wickedfire.com/>: HTTP status code is not handled or not allowed
I also tried scrapy-splash with scrapy shell using this Guide.
I just want to log in to the page, enter a keyword to be searched, and get the results. That is my end goal.
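An added note: the repeated 502 Bad Gateway responses from http://localhost:8050/render.html suggest the failure is happening on the Splash side rather than in the spider. One hedged way to narrow it down is to hit Splash's render.html endpoint directly and see what comes back (endpoint and parameters as documented by Splash; target URL taken from the question):

# Quick sanity check that the Splash container is up and can render the target page.
import requests

resp = requests.get(
    "http://localhost:8050/render.html",
    params={"url": "https://www.wickedfire.com/", "wait": 2, "timeout": 30},
)
print(resp.status_code)   # anything other than 200 points at Splash or the target site
print(resp.text[:300])    # first part of the rendered HTML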

Scraping aspx with Python mechanize - getting search results

I've been trying to scrape Congressional financial disclosure reports using mechanize; the form submits successfully, but I can't locate any of the search results. My script is below:
br = Browser()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('http://clerk.house.gov/public_disc/financial-search.aspx')
br.select_form(name='aspnetForm')
br.set_all_readonly(False)
br['filing_year'] = ['2008']
response = br.submit(name='search_btn')
html = response.read()
I'm new to scraping, and would appreciate any corrections/advice on this. Thanks!
This is an alternative solution that involves a real browser, with the help of the selenium tool.
from selenium import webdriver
from selenium.webdriver.support.select import Select
# initialize webdriver instance and visit url
url = "http://clerk.house.gov/public_disc/financial-search.aspx"
browser = webdriver.Firefox()
browser.get(url)
# find select tag and select 2008
select = Select(browser.find_element_by_id('ctl00_cphMain_txbFiling_year'))
select.select_by_value('2008')
# find "search" button and click it
button = browser.find_element_by_id('ctl00_cphMain_btnSearch')
button.click()
# display results
table = browser.find_element_by_id('search_results')
for row in table.find_elements_by_tag_name('tr')[1:-1]:
    print [cell.text for cell in row.find_elements_by_tag_name('td')]
# close the browser
browser.close()
Prints:
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Amendment']
[u'ABERCROMBIE, HON.NEIL', u'HI01', u'2008', u'FD Original']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
[u'ACKERMAN, HON.GARY L.', u'NY05', u'2008', u'FD Amendment']
...
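One caveat on the snippet above (an addition, not part of the original answer): the results table is read immediately after the click, so on a slow connection it might not be present yet. A hedged sketch of adding an explicit wait before reading the rows:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# After button.click(), wait up to 10 seconds for the results table used above to appear.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, 'search_results'))
)
table = browser.find_element_by_id('search_results')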