I'm trying to log into Quora with Scrapy, but I haven't succeeded: the request comes back with a 400 or 500 status code, which I suspect is down to my formdata.
I captured the form data with Chrome's developer tools:
General
Request URL:https://www.quora.com/webnode2/server_call_POST?__instart__
Request Method:POST
Status Code:200
Remote Address:103.243.14.60:443
Form Data
json:{"args":[],"kwargs":{"email":"1liusai253#163.com","password":"XXXX","passwordless":1}}
formkey:750febacf08976a47c82f3e10af83305
postkey:dab46d0df2014d1568ead6b2fbad7297
window_id:dep3300-2420196009402604566
referring_controller:index
referring_action:index
_lm_transaction_id:0.2598935768985011
_lm_window_id:dep3300-2420196009402604566
__vcon_json:["Vn03YsuKFZvHV9"]
__vcon_method:do_login
__e2e_action_id:ee1qmp1iit
js_init:{}
Below is my code, a normal Scrapy flow. I think the problem lies in the formdata. Can someone help with this?
import scrapy
import re


class QuestionsSpider(scrapy.Spider):
    name = 'questions'
    domain = 'https://www.quora.com'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.quora.com",
        "Connection": "Keep-Alive",
        "content-type": "application/x-www-form-urlencoded"
    }

    def __init__(self, login_url=None):
        self.login_url = 'https://www.quora.com/webnode2/server_call_POST?__instart__'  # Here is the login URL of Quora

    def start_requests(self):
        body = response.body
        formkey_patt = re.compile(r'.*?"formkey".*?"(.*?)".*?', re.S)
        formkey = re.findall(formkey_patt, body)[0]
        postkey_patt = re.compile('.*?"postkey".*?"(.*?)".*?', re.S)
        postkey = re.findall(postkey_patt, body)[0]
        window_id_patt = re.compile('.*?window_id.*?"(.*?)".*?', re.S)
        window_id = re.findall(window_id_patt, body)[0]
        referring_controller = 'index'
        referring_action = 'index'
        __vcon_method = 'do_login'
        yield scrapy.Request(
            url=self.domain,
            headers=self.headers,
            meta={'cookiejar': 1},
            callback=self.start_login
        )

    def start_login(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            url=self.login_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={"json": {"args": [], "kwargs": {"email": "xxxx", "password": "xxx"}},
                      "formkey": formkey,
                      "postkey": postkey,
                      "window_id": window_id,
                      "referring_controller": referring_controller,
                      "referring_action": referring_action,
                      "__vcon_method": __vcon_method,
                      "__e2e_action_id": "ee1qmp1iit"
                      },
            callback=self.after_login
        )

    def after_login(self, response):
        print response.body
You are neither setting nor sending formkey, postkey, window_id, etc.: the values you extract in start_requests are not in scope inside start_login, so you should grab them from the response you are handling there. That being said, you need to use FormRequest.from_response().
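For illustration, here is a minimal sketch of that idea, extracting the keys inside the callback (where the response is actually available) and serializing the nested json field with json.dumps, since every formdata value must be a string. The field names are taken from the captured form data above; the regexes and credentials are assumptions, and I have not verified that Quora accepts this login.

import json
import re

import scrapy


# Inside QuestionsSpider, replacing start_login from the question:
def start_login(self, response):
    # Pull the per-session tokens out of the page we just downloaded.
    body = response.text
    formkey = re.search(r'"formkey"\s*:\s*"(.*?)"', body).group(1)
    postkey = re.search(r'"postkey"\s*:\s*"(.*?)"', body).group(1)
    window_id = re.search(r'"window_id"\s*:\s*"(.*?)"', body).group(1)

    # The nested payload is serialized with json.dumps rather than passed as a dict.
    payload = json.dumps({"args": [],
                          "kwargs": {"email": "xxxx", "password": "xxx",
                                     "passwordless": 1}})

    yield scrapy.FormRequest.from_response(
        response,
        url=self.login_url,
        headers=self.headers,
        meta={'cookiejar': response.meta['cookiejar']},
        formdata={
            "json": payload,
            "formkey": formkey,
            "postkey": postkey,
            "window_id": window_id,
            "referring_controller": "index",
            "referring_action": "index",
            "__vcon_method": "do_login",
        },
        callback=self.after_login,
    )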
There is a site that I connect to, but I need to log in 4 times with different usernames and passwords.
Is there any way I can do this by looping through the usernames and passwords in a payload?
This is the first time I am doing this and I am not really sure how to go about it.
The code works fine if I post just one username and password.
I'm using Python 2.7 with BeautifulSoup and requests.
Here is my code.
import requests
import zipfile, StringIO
from bs4 import BeautifulSoup

# Here we add the login details to be submitted to the login form.
payload = [
    {'USERNAME': 'xxxxxx', 'PASSWORD': 'xxxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxxx', 'PASSWORD': 'xxxxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxx', 'PASSWORD': 'xxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxxx', 'PASSWORD': 'xxxxxx', 'option': 'login'},
]

# Possibly need headers later.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

base_url = "https://service.rl360.com/scripts/customer.cgi/SC/servicing/"

with requests.Session() as s:
    p = s.post('https://service.rl360.com/scripts/customer.cgi?option=login', data=payload)

    # Get the download page to scrape.
    r = s.get('https://service.rl360.com/scripts/customer.cgi/SC/servicing/downloads.php?Folder=DataDownloads&SortField=ExpiryDays&SortOrder=Ascending', stream=True)
    content = r.text
    soup = BeautifulSoup(content, 'lxml')

    # Now I get the most recent download URL.
    download_url = soup.find_all("a", {'class': 'tabletd'})[-1]['href']

    # Now we join the base URL with the download URL.
    download_docs = s.get(base_url + download_url, stream=True)

    print "Checking Content"
    content_type = download_docs.headers['content-type']
    print content_type

    print "Checking Filename"
    content_name = download_docs.headers['content-disposition']
    print content_name

    print "Checking Download Size"
    content_size = download_docs.headers['content-length']
    print content_size

    # This is where we extract and download the specified xml files.
    z = zipfile.ZipFile(StringIO.StringIO(download_docs.content))
    print "---------------------------------"
    print "Downloading........."

    # Now we save the files to the specified location.
    z.extractall('C:\Temp')
    print "Download Complete"
Just use a for loop. You may need to adjust your download directory if files will be overwritten.
payloads = [
    {'USERNAME': 'xxxxxx1', 'PASSWORD': 'xxxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxxx2', 'PASSWORD': 'xxxxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxx3', 'PASSWORD': 'xxxxx', 'option': 'login'},
    {'USERNAME': 'xxxxxx4', 'PASSWORD': 'xxxxxx', 'option': 'login'},
]

....

for payload in payloads:
    with requests.Session() as s:
        p = s.post('https://service.rl360.com/scripts/customer.cgi?option=login', data=payload)
        ...
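If the downloads would otherwise collide, one option is to extract each user's files into their own subdirectory. A minimal sketch of that loop, reusing the session, post, and extract steps from the question (the per-user folder naming is just an illustration):

import os

for i, payload in enumerate(payloads, start=1):
    with requests.Session() as s:
        s.post('https://service.rl360.com/scripts/customer.cgi?option=login', data=payload)

        # ... same scraping / download code as above, ending with a ZipFile `z` ...

        # Extract into a separate folder per login so nothing is overwritten.
        target_dir = os.path.join('C:\\Temp', 'user_%d' % i)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        z.extractall(target_dir)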
I'm trying to set up image downloading from web pages using the Scrapy framework and django-item. I think I have done everything as in the docs,
but after calling scrapy crawl I get a log looking like this:
Scrapy log
I can't find any information there about what went wrong, but the images field is empty and the directory does not contain any images.
This is my model
class Event(models.Model):
    title = models.CharField(max_length=100, blank=False)
    description = models.TextField(blank=True, null=True)
    event_location = models.CharField(max_length=100, blank=True, null=True)
    image_urls = models.CharField(max_length=200, blank=True, null=True)
    images = models.CharField(max_length=100, blank=True, null=True)
    url = models.URLField(max_length=200)

    def __unicode__(self):
        return self.title
and this is how I go from the spider to the image pipeline:
def parse_from_details_page(self, response):
    "Some code"
    item_event = item_loader.load_item()
    # this is to create the image_urls list (there is always only one image_url)
    item_event['image_urls'] = [item_event['image_urls'], ]
    return item_event
and finally this is my settings.py for the Scrapy project:
import sys
import os
import django
DJANGO_PROJECT_PATH = os.path.join(os.path.dirname((os.path.abspath(__file__))), 'MyScrapy')
#sys.path.insert(0, DJANGO_PROJECT_PATH)
#sys.path.append(DJANGO_PROJECT_PATH)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "MyScrapy.settings")
#os.environ["DJANGO_SETTINGS_MODULE"] = "MyScrapy.settings"
django.setup()
BOT_NAME = 'EventScraper'
SPIDER_MODULES = ['EventScraper.spiders']
NEWSPIDER_MODULE = 'EventScraper.spiders'
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 100,
    'EventScraper.pipelines.EventscraperPipeline': 200,
}
#MEDIA STORAGE URL
IMAGES_STORE = os.path.join(DJANGO_PROJECT_PATH, "IMAGES")
#IMAGES (used to be sure that it takes good fields)
FILES_URLS_FIELD = 'image_urls'
FILES_RESULT_FIELD = 'images'
Thank you in advance for your help
EDIT:
I used a custom image pipeline from the docs, which looks like this:
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            import ipdb; ipdb.set_trace()
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        import ipdb; ipdb.set_trace()
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
In get_media_requests it creates a request to my URL, but in item_completed the results param contains something like this: [(False, <twisted.python.failure.Failure scrapy.pipelines.files.FileException: >)]
I still don't know how to fix it.
Is it possible that the problem is caused by referencing the address over https?
I faced the exact same issue with Scrapy.
My solution:
Add headers to the request you're yielding in the get_media_requests function. I added a user agent and a host along with some other headers. Here's my list of headers:
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Proxy-Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Host': 'images.finishline.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
Open the exact image URL in your browser (the URL from which you're downloading the image) and check your browser's network tab for the list of headers. Make sure the headers you set on the request mentioned above match those.
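For reference, a minimal sketch of how those headers could be attached in get_media_requests (the headers dict is the one above; which headers your target server actually requires is an assumption you should verify against your own network tab):

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # Pass the browser-like headers along with each image request.
            yield scrapy.Request(image_url, headers=headers)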
Hope it works.
I am trying to build a basic LinkedIn scraper for a research project and am running into challenges when I try to scrape through the levels of the directory. I am a beginner; whenever I run the code below, IDLE returns an error before shutting down. See the code and error below:
Code:
import requests
from bs4 import BeautifulSoup
from urllib2 import urlopen
from pprint import pprint as pp

PROFILE_URL = "linkedin.com"

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

# use this to gather all of the individual links from the second directory page
def get_second_links(pre_section_link):
    response = requests.get(pre_section_link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column = soup.find("ul", attrs={'class': 'column dual-column'})
    second_links = [li.a["href"] for li in column.findAll("li")]
    return second_links

# use this to gather all of the individual links from the third directory page
def get_third_links(section_link):
    response = requests.get(section_link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column = soup.find("ul", attrs={'class': 'column dual-column'})
    third_links = [li.a["href"] for li in column.findAll("li")]
    return third_links

# use this to build the individual profile links
def get_profile_link(link):
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column2 = soup.find("ul", attrs={'class': 'column dual-column'})
    profile_links = [PROFILE_URL + li.a["href"] for li in column2.findAll("li")]
    return profile_links

if __name__ == "__main__":
    sub_directory = get_second_links("https://www.linkedin.com/directory/people-a-1/")
    sub_directory = map(get_third_links, sub_directory)
    profiles = get_third_links(sub_directory)
    profiles = map(get_profile_link, profiles)
    profiles = [item for sublist in fourth_links for item in sublist]
    pp(profiles)
The error I keep getting:
Error Page
You need to add https to PROFILE_URL:
PROFILE_URL = "https://linkedin.com"
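If the directory pages ever return absolute hrefs as well as relative ones, a slightly more robust variation (a hedged sketch, not something from the original code) is to join each href with urljoin instead of plain string concatenation:

from urlparse import urljoin  # Python 2; on Python 3 this lives in urllib.parse

PROFILE_URL = "https://www.linkedin.com"

def get_profile_link(link):
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column2 = soup.find("ul", attrs={'class': 'column dual-column'})
    # urljoin handles both relative hrefs ("/in/...") and already-absolute ones.
    return [urljoin(PROFILE_URL, li.a["href"]) for li in column2.findAll("li")]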
I'm using a simple REST client to test. I'm sending a simple JPEG and have tried the following Content-Type(s):
Content-Type: image/jpeg
Content-Type: multipart/form-data
Also note that CSRF token authentication is turned off to allow outside third-party REST connections.
(The image is attached via the REST client.)
Checked Wireshark, and the packet is set up according to the above parameters.
Django's request object has several relevant attributes:
request.body
request.FILES
After the POST is received by the Django server, the request object always stores all of the data/payload in request.body. Shouldn't an image or any attached files be going into request.FILES? Is something set up incorrectly with the content type or the POST?
Very simple code, just trying to print to the log. Everything in the POST keeps going to request.body:
def testPost(request):
    print request.body
    print request.FILES
    return HttpResponse()
Wireshark packet:
Hypertext Transfer Protocol
POST /testPost/ HTTP/1.1\r\n
Host: MYURL.com:8000\r\n
Connection: keep-alive\r\n
Content-Length: 8318\r\n
Origin: chrome-extension://aejoelaoggembcahagimdiliamlcdmfm\r\n
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36\r\n
Content-Type: image/jpeg\r\n
Accept: */*\r\n
Accept-Encoding: gzip,deflate,sdch\r\n
Accept-Language: en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4\r\n
Cookie: ******; csrftoken=**********\r\n
\r\n
[Full request URI: http://MYURL.com:8000/testPost/]
[HTTP request 1/1]
JPEG File Interchange Format
Here is how I handle file uploads, which in this case happen to be images. One of the issues I fought with for a while was that request.FILES could come in with multiple keys, and I always wanted the last one.
Note: request.FILES will only contain data if:
the request method is POST, and
the form that posted the request has the attribute enctype="multipart/form-data".
See the Django file-uploads documentation for more details.
The Model: First there is a model with an ImageField in it, in models.py:
photos_dir = settings.MEDIA_ROOT + "/photos" + "/%Y/%m/%d/"


class Photo(models.Model):
    image = models.ImageField(upload_to=photos_dir, null=True, blank=True, default=None)
    filename = models.CharField(max_length=60, blank=True, null=True)
The View: in views.py, handle the POST:
from django.core.files.images import ImageFile

def upload_image(request):
    file_key = None
    for file_key in sorted(request.FILES):
        pass
    wrapped_file = ImageFile(request.FILES[file_key])
    filename = wrapped_file.name

    # new photo table-row
    photo = Photo()
    photo.filename = filename
    photo.image = request.FILES[file_key]
    try:
        photo.save()
    except OSError:
        print "Deal with this situation"

    # do your stuff here.
    return HttpResponse("boo", "text/html")
The Standalone Poster: Some Python code to simulate a client posting to your Django view.
Reference: I actually used the poster.encode lib to send multipart test data to my Django view.
from poster.streaminghttp import register_openers
from poster.encode import multipart_encode
import urllib2

server = "http://localhost/"
headers = {}

# Register the streaming http handlers with urllib2
register_openers()

img = "path/to/image/image.png"
data = {'media': open(img),
        'additionalattr': 111,
        }

datagen, headers = multipart_encode(data)
headers['Connection'] = 'keep-alive'

request = urllib2.Request('%s/upload_image/' % (server), datagen, headers)
print urllib2.urlopen(request).read()
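Worth noting for the original question: the Wireshark capture shows the client sending Content-Type: image/jpeg with the raw bytes as the body, not a multipart form, so Django leaves everything in request.body and request.FILES stays empty. If switching the client to multipart isn't an option, here is a hedged sketch of reading the raw body instead (the save path is illustrative, not from the original code):

from django.http import HttpResponse

def testPost(request):
    if request.method == 'POST' and request.META.get('CONTENT_TYPE', '').startswith('image/'):
        # Raw (non-multipart) upload: the JPEG bytes are the whole request body.
        with open('/tmp/upload.jpg', 'wb') as f:
            f.write(request.body)
        return HttpResponse("saved")
    return HttpResponse()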
When I make a request through curl, passing the username and API key through the URL like this:
curl --dump-header - -H "Content-Type: application/json" -X POST --data '{"question": "Is a test yo?", "pub_date": "2011-05-22T00:46:38"}' "http://localhost:8000/polls/api/v1/poll/?username=federico&api_key=10a2d3586e63078ef39f9da8f9aa9209715ed282"
I have no problem (other than the server complaining it's a bad request because I'm not sending the FK data, but the db is updated anyway).
However, when I try to do the same thing by sending the username and API key through the header, I get a 401 Unauthorized error and nothing happens.
What am I missing here?
# resources
class PollResource(ModelResource):
    choices = fields.ToManyField('polls.api.ChoiceResource', 'choice_set', full=True)

    class Meta:
        queryset = Poll.objects.all()
        resource_name = 'poll'
        allowed_methods = ['get', 'post', 'put']
        list_allowed_methods = ['get', 'post', 'put', 'delete']
        authentication = ApiKeyAuthentication()
        authorization = DjangoAuthorization()


class ChoiceResource(ModelResource):
    poll = fields.ForeignKey(PollResource, 'poll')

    class Meta:
        queryset = Choice.objects.all()
        resource_name = 'choice'
        list_allowed_methods = ['get', 'post', 'put', 'delete']
// js
// backbone-tastypie config
Backbone.Tastypie.csrfToken = $("#secret-token")[0].value;
Backbone.Tastypie.apiKey = {
    username: USER,
    key: API_KEY
};

// model
var Poll = Backbone.Model.extend({
    urlRoot: '/polls/api/v1/poll/'
});
Request from Backbone with ApiKey in HTTP_Authorization header:
Request URL:http://localhost:8000/polls/api/v1/poll/
Request Method:POST
Status Code:401 UNAUTHORIZED
Request Headers
Accept:application/json, text/javascript, */*; q=0.01
Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3
Accept-Encoding:gzip,deflate,sdch
Accept-Language:en-US,en;q=0.8
Authorization:ApiKey federico:10a2d3586e63078ef39f9da8f9aa9209715ed282
Connection:keep-alive
Content-Length:109
Content-Type:application/json
Cookie:djdt=hide; sessionid=96ca6e066bab30f241819b22cc85693b; csrftoken=PYMw9nrqh3TOqse3GM3ojU5iSOV2QMUA
Host:localhost:8000
Origin:http://localhost:8000
Referer:http://localhost:8000/index/
User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17
X-CSRFToken:PYMw9nrqh3TOqse3GM3ojU5iSOV2QMUA
X-Requested-With:XMLHttpRequest
Request Payload
{"csrfmiddlewaretoken":"PYMw9nrqh3TOqse3GM3ojU5iSOV2QMUA","question":"What is love?","pub_date":"07/02/2013"}
Response Headers
Content-Type:text/html; charset=utf-8
Date:Thu, 07 Feb 2013 21:57:01 GMT
Server:WSGIServer/0.1 Python/2.7.1
Vary:Cookie
Edit: I've been trying to debug this and apparently it's some issue with the url...
This is my project's url.py
urlpatterns = patterns('',
    url(r'^admin/', include(admin.site.urls)),
    url(r'^index/$', 'polls.views.index', name='index'),
    url(r'^polls/', include('polls.urls')),
)
and this is the app's url.py
v1_api = Api(api_name='v1')
v1_api.register(PollResource())
v1_api.register(ChoiceResource())
urlpatterns = patterns('',
    url(r'api/', include(v1_api.urls)),
)
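To narrow down whether the 401 comes from the header handling itself rather than from the Backbone client, one thing that can help is replaying the exact same Authorization header outside the browser. A small sketch using requests and the values already shown above (a diagnostic, not a fix):

import requests

# Same credentials and endpoint as in the curl example above.
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'ApiKey federico:10a2d3586e63078ef39f9da8f9aa9209715ed282',
}
payload = '{"question": "Is a test yo?", "pub_date": "2011-05-22T00:46:38"}'

r = requests.post('http://localhost:8000/polls/api/v1/poll/',
                  data=payload, headers=headers)
print r.status_code, r.content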