Django redirect and modify GET parameters - django

I am implementing magic tokens and would like clean URLs. As a consequence, I would like to remove the token from the URL upon a successful user authentication. This is my attempt:
def authenticate_via_token(get_response):
def middleware(request):
if request.session.get('authenticated', None):
pass
else:
token = request.GET.get('token', None)
if token:
mt = MagicToken.fetch_by_token(token)
if mt:
request.session['authenticated'] = mt.email
if not request.GET._mutable:
request.GET._mutable = True
request.GET['token'] = None
request.GET._mutable = False
else:
print("invalid token")
response = get_response(request)
return response
return middleware
IE, I would like to send /products/product-detail/3?token=piyMlVMrmYblRwHcgwPEee --> /products/product-detail/3
It's possible that there may be additional GET parameters and I would like to keep them. Any input would be appreciated!

This is the solution I ended up going for:
from django.urls import resolve, reverse
import urllib
def drop_get_param(request, param):
'helpful for redirecting while dropping a specific parameter'
resolution = resolve(request.path_info) #simulate resolving the request
new_params = request.GET.copy() # copy the parameters
del new_params[param] # drop the specified parameter
reversed = reverse(resolution.url_name, kwargs=resolution.kwargs) # create a base url
if new_params: #append the remaining parameters
reversed += '?' + urllib.parse.urlencode(new_params)
return reversed

Related

Check url status without opening it

At Now when url is opened (without a slash - example.com/blog), a slash is automatically added at the end (there are 301 redirects). The question is, can I somehow do it so that the check first goes to see if the page exists (without a slash - example.com/blog). If so, open it. If not, then check whether the page exists with a slash (only without 301 - example.com/blog/). If so, then redirect 301, and if not, then throw 404.
Now just if there is no page (example.com/blog), then a slash is added to the end first (example.com/blog/), 301 redirects go and only then a 404 error is thrown. In this case, the 404 error must be thrown immediately, without a 301 redirect.
The dispatch was rewritten as follows.
def is_normal_slash_count(url):
temp_url = url
slash_count = 0
while temp_url.endswith('/'):
slash_count += 1
temp_url = temp_url[:-1]
return (slash_count == 1, slash_count)
def replace_bad_slash(url, slash_count):
if slash_count == 2:
return url.replace('//', '/')
return url.replace('/'*(slash_count-1), '')
def normalize_url(url):
if len(url) > 1:
if not url.endswith('/'):
return url + '/'
# replace the url like /contacts//// to /contacts/
good_slash, slash_count = is_normal_slash_count(url)
if not good_slash:
url = replace_bad_slash(url, slash_count)
return url
def is_bad_url(url):
if len(url) > 1:
good_slash, slash_count = is_normal_slash_count(url)
if not good_slash:
return True
return False
class RedirectMixinView:
def dispatch(self, *args, **kwargs):
url = self.request.path
redirect_setting = RedirectSettings.objects.filter(url_from=url).first()
if redirect_setting:
return redirect(redirect_setting.url_to, permanent=True)
if is_bad_url(url):
return redirect(normalize_url(url), permanent=True)
return super(RedirectMixinView, self).dispatch(*args, **kwargs)
Is this realistic?
I think in the direction of writing middleware.
Updated
projects.urls
url(r'^page/', include('pages.urls')),
pages.urls
url(r'^$', PageView.as_view(), name='page'),
test
try:
resolve('/page/')
except:
raise Http404
return redirect('/page/')
I'm tried /page/, /page, page/, page, http://127.0.0.1:8000/page/, http://127.0.0.1:8000/page
You need to remove RedirectMixinView from LandingView.
Comment out the middleware CommonMiddleware.
Add RedirectMiddleware to the list of middleware (preferably in the top).
Create RedirectMiddleware
The code is written jointly with #dirkgroten (most of his contribution).
import re
from django.http import HttpResponsePermanentRedirect
class RedirectMiddleware(object):
response_redirect_class = HttpResponsePermanentRedirect
def __init__(self, get_response):
self.get_response = get_response
def __call__(self, request):
response = self.get_response(request)
path = re.sub("/+", "/", request.path)
if response.status_code == 404:
if not path.endswith('/'):
request.path = path # to force using the cleaned path
else:
request.path = path[:-1] # to force using the cleaned path
try:
full_path = request.get_full_path(force_append_slash=True) # add the slash, keeping query parameters
r = resolve(full_path)
new_response = r.func(request, args=r.args, kwargs=r.kwargs)
if new_response.status_code == 200:
return redirect(full_path)
except Resolver404:
pass # this will fall through to `return response`
# Add the Content-Length header to non-streaming responses if not
# already set.
if not response.streaming and not response.has_header('Content-Length'):
response['Content-Length'] = str(len(response.content))
return response
Add to ngnx config of project
if ($request_uri ~* "\/\/") {
rewrite ^/(.*) $scheme://$host/$1 permanent;
}
# merge_slashes off;
It does what you need, and also removes duplicate slashes if this page exists.
First make sure you set APPEND_SLASH to False in your settings.py. This will disable the automatic 301 redirects to the URLs with slash.
Then use resolve() to check if the URL with slash exists before redirecting. Do this in a Middleware class where you handle the case that the response status code is 404.
from django.urls import resolve
try:
resolve(url_with_slash)
except Resolver404:
raise Http404
return redirect(url_with_slash)
Note that resolve(url) will not raise an exception when there is a path matching the url, even if the view might afterwards still raise a 404. This is the case for example if you have a DetailView for an object where the object's pk is in the URL. Say you have /objects/<pk>/ as the path to show your objects, then the url /objects/4/ will always match even if object with pk=4 does not exist. The view will still raise a 404 after the redirect.
So if you really want to also catch those 404's, you could actually call the view function yourself to check the response:
try:
r = resolve(url_with_slash)
response = r.func(request, args=r.args, kwargs=r.kwargs)
if response.status_code == 200:
return redirect(url_with_slash)
except Resolver404:
pass

Query parameters in PUT call of APIClient

I have an API endpoint to which I want to make a PUT call which needs both a body and query parameters. I use Django's test client to call my endpoint in a test case (docs).
I read in the documentation that for a GET call, query parameters are introduced using the argument data. I also read that for a PUT call, the argument data represents the body. I miss documentation how to add query parameters in a PUT call.
In particular, this test case fails:
data = ['image_1', 'image_2']
url = reverse('images')
response = self.client.put(url,
data=data,
content_type='application/json',
params={'width': 100, 'height': 200})
And this test case passes:
data = ['image_1', 'image_2']
url = reverse('images') + '?width=100&height=200'
response = self.client.put(url,
data=data,
content_type='application/json')
In other words: is this manually URL building really necessary?
Assuming you're using rest_framework's APITestClient, I found this:
def get(self, path, data=None, secure=False, **extra):
"""Construct a GET request."""
data = {} if data is None else data
r = {
'QUERY_STRING': urlencode(data, doseq=True),
}
r.update(extra)
return self.generic('GET', path, secure=secure, **r)
whereas the put is:
def put(self, path, data='', content_type='application/octet-stream',
secure=False, **extra):
"""Construct a PUT request."""
return self.generic('PUT', path, data, content_type,
secure=secure, **extra)
and the interesting part (an excerpt from the self.generic code):
# If QUERY_STRING is absent or empty, we want to extract it from the URL.
if not r.get('QUERY_STRING'):
# WSGI requires latin-1 encoded strings. See get_path_info().
query_string = force_bytes(parsed[4]).decode('iso-8859-1')
r['QUERY_STRING'] = query_string
return self.request(**r)
so you could probably try to create that dict with QUERY_STRING and pass it to put's kwargs, I'm not sure how worthy effort-wise that is though.
I just specify what #henriquesalvaro answer more detail.
You can pass query-parameters in PUT or POST method like below.
# tests.py
def test_xxxxx(self):
url = 'xxxxx'
res = self.client.put(url,**{'QUERY_STRING': 'a=10&b=20'})
# views.py
class TestViewSet(.....):
def ...(self, request, *args, **kwargs):
print(request.query_params.get('a'))
print(request.query_params.get('b'))

Include parameters in return login OAuth2

I am using a third party library to retrieve a token through social networks, which uses python-social-auth-oauth and django-toolkit.
Beyond the normal parameters, I would like to add the list of groups that the user is checked.
Current return:
{"scope":"write read
groups","token_type":"Bearer","expires_in":36000,"refresh_token":"xxx","access_token":"xxx"}
make a custom class, which in the end includes the list of groups.
settings.py
OAUTH2_PROVIDER = {
'OAUTH2_VALIDATOR_CLASS': 'apps.userTest.validator.CustomOAuth2Validator'
}
apps.userTest.validator.CustomOAuth2Validator.py
from datetime import timedelta
from django.conf import settings
from django.utils import timezone
from oauth2_provider.models import AccessToken, RefreshToken
from oauth2_provider.oauth2_validators import OAuth2Validator
from oauth2_provider.settings import oauth2_settings
class CustomOAuth2Validator(OAuth2Validator):
def save_bearer_token(self, token, request, *args, **kwargs):
"""
It's messy. It is 90% code from parent function. I didn't find a way to reduce it.
I tried and I failed :'(
Sin Count += 1
Save access and refresh token, If refresh token is issued, remove old refresh tokens as
in rfc:`6`
"""
if request.refresh_token:
# remove used refresh token
# Copied as is from parent. I don't know why they're even caring to delete this! - Dheerendra
try:
RefreshToken.objects.get(token=request.refresh_token).revoke()
except RefreshToken.DoesNotExist:
assert () # TODO though being here would be very strange, at least log the error
expires = timezone.now() + timedelta(seconds=oauth2_settings.ACCESS_TOKEN_EXPIRE_SECONDS)
token['expires_in'] = oauth2_settings.ACCESS_TOKEN_EXPIRE_SECONDS
if request.response_type == 'token':
expires = timezone.now() + timedelta(seconds=settings.IMPLICIT_ACCESS_TOKEN_EXPIRES_SECONDS)
token['expires_in'] = settings.IMPLICIT_ACCESS_TOKEN_EXPIRES_SECONDS
if request.grant_type == 'client_credentials':
request.user = None
access_token = AccessToken(
user=request.user,
scope=token['scope'],
expires=expires,
token=token['access_token'],
application=request.client)
access_token.save()
if 'refresh_token' in token:
refresh_token = RefreshToken(
user=request.user,
token=token['refresh_token'],
application=request.client,
)
if request.grant_type == 'authorization_code':
refresh_tokens = RefreshToken.objects.all().filter(user=request.user,
application=request.client).order_by('-id')
if len(refresh_tokens) > 0:
refresh_token = refresh_tokens[0]
# Delete the old access_token
refresh_token.access_token.delete()
if len(refresh_tokens) > 1:
# Enforce 1 token pair. Delete all old refresh_tokens
RefreshToken.objects.exclude(pk=refresh_token.id).delete()
refresh_token.access_token = access_token
refresh_token.save()
token['refresh_token'] = refresh_token.token
token['groups'] = request.user.group_list

How to work with a very large "allowed_domains" attribute in scrapy?

The following is my scrapy code:
def get_host_regex(self, spider):
"""Override this method to implement a different offsite policy"""
allowed_domains = getattr(spider, 'allowed_domains', None)
if not allowed_domains:
return re.compile('') # allow all by default
regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None)
return re.compile(regex)
def spider_opened(self, spider):
self.host_regex = self.get_host_regex(spider)
self.domains_seen = set()
Because the allowed_domains is very big, it throws this exception:
regex = r'^(.*.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None)
How do I solve this problem?
You can build your own OffsiteMiddleware variation, with a different implementation checking requests to domains not in the spider's allowed_domains.
For example, add this in a middlewares.py file,
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.utils.httpobj import urlparse_cached
class SimpleOffsiteMiddleware(OffsiteMiddleware):
def spider_opened(self, spider):
# don't build a regex, just use the list as-is
self.allowed_hosts = getattr(spider, 'allowed_domains', [])
self.domains_seen = set()
def should_follow(self, request, spider):
if self.allowed_hosts:
host = urlparse_cached(request).hostname or ''
# does 'www.example.com' end with 'example.com'?
# test this for all allowed domains
return any([host.endswith(h) for h in self.allowed_hosts])
else:
return True
and change your settings to disable the default OffsiteMiddleware, and add yours:
SPIDER_MIDDLEWARES = {
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
'myproject.middlewares.SimpleOffsiteMiddleware': 500,
}
Warning: this middleware is not tested. This is a very naive implementation, definitely not very efficient (testing string inclusion for each of 50'000 possible domains for each and every request).
You could use another backend to store the list and test a hostname value, like sqlite for example.

Scrapy get request url in parse

How can I get the request url in Scrapy's parse() function? I have a lot of urls in start_urls and some of them redirect my spider to homepage and as result I have an empty item. So I need something like item['start_url'] = request.url to store these urls. I'm using the BaseSpider.
The 'response' variable that's passed to parse() has the info you want. You shouldn't need to override anything.
eg. (EDITED)
def parse(self, response):
print "URL: " + response.request.url
The request object is accessible from the response object, therefore you can do the following:
def parse(self, response):
item['start_url'] = response.request.url
Instead of storing requested URL's somewhere and also scrapy processed URL's are not in same sequence as provided in start_urls.
By using below,
response.request.meta['redirect_urls']
will give you the list of redirect happened like ['http://requested_url','https://redirected_url','https://final_redirected_url']
To access first URL from above list, you can use
response.request.meta['redirect_urls'][0]
For more, see doc.scrapy.org mentioned as :
RedirectMiddleware
This middleware handles redirection of requests based on response status.
The urls which the request goes through (while being redirected) can be found in the redirect_urls Request.meta key.
Hope this helps you
You need to override BaseSpider's make_requests_from_url(url) function to assign the start_url to the item and then use the Request.meta special keys to pass that item to the parse function
from scrapy.http import Request
# override method
def make_requests_from_url(self, url):
item = MyItem()
# assign url
item['start_url'] = url
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
return request
def parse(self, response):
# access and do something with the item in parse
item = response.meta['item']
item['other_url'] = response.url
return item
Hope that helps.
Python 3.5
Scrapy 1.5.0
from scrapy.http import Request
# override method
def start_requests(self):
for url in self.start_urls:
item = {'start_url': url}
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
yield request
# use meta variable
def parse(self, response):
url = response.meta['item']['start_url']