Change user agent with Selenium WebDriver and Python (python-2.7)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import os

xpaths = {'video': "//video[@id='video']"}

profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36")
driver = webdriver.Firefox(profile)

mydriver = webdriver.Firefox()
baseurl = "XXXX"
mydriver.get(baseurl)
It's not changing the user agent. I want the user agent to be Chrome's. I don't know what's wrong...
Also, here's what I'd like it to do: go to the website; if it redirects to another URL, go back to the main page and keep doing that until it finds the element with id 'video'.
I have not implemented this yet because I have no idea how to...
The website I'm trying to automate has a video that only appears sometimes. What I'd like this to do is keep visiting the website until it finds the element with id 'video', then click it and wait.
Help is appreciated :)

You are navigating to your application URL with the wrong Firefox instance, mydriver. Using the instance that was created with the required profile setting (driver in your case) should do the trick.
Below is the correct code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import os

xpaths = {'video': "//video[@id='video']"}

profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36")
driver = webdriver.Firefox(profile)

# the line below is not required
# mydriver = webdriver.Firefox()

baseurl = "XXXX"

# navigate to the url with 'driver' instead of 'mydriver'
driver.get(baseurl)
If you change your baseurl to "http://whatsmyuseragent.com/", you will be able to see right away whether the user agent change is reflected correctly.
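For the second part of the question (keep reloading the page until the video element shows up, then click it), a minimal sketch using an explicit wait could look like this, reusing driver and baseurl from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

found = False
while not found:
    driver.get(baseurl)
    try:
        # wait up to 10 seconds for the video element on this page load
        video = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "video"))
        )
        video.click()
        found = True
    except TimeoutException:
        # not there this time; load the page again
        pass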
Hope this helps!

Related

403 Response when I use requests to make a post request

My core code is as follows:
import requests
url='https://www.xxxx.top' #for example
data=dict()
session = requests.session()
session.get(url)
token = session.cookies.get('csrftoken')
data['csrfmiddlewaretoken'] = token
res = session.post(url=url, data=data, headers=session.headers, cookies=session.cookies)
print(res)
# <Response [403]>
The variable url is my own website, which is based on Django. I know I can use @csrf_exempt to disable CSRF, but I don't want to do that.
However, it returns a 403 response when I use requests to make a POST request. I wish someone could tell me what is wrong with my approach.
I have solved the problem. In this case, just add a Referer to the headers:
import requests

url = 'https://www.xxxx.top'  # for example
data = dict()
session = requests.session()
session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36',
    'Referer': url,
}
session.get(url)
token = session.cookies.get('csrftoken')
data['csrfmiddlewaretoken'] = token
res = session.post(url=url, data=data, headers=session.headers, cookies=session.cookies)
print(res)
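For context: Django's CSRF protection performs strict Referer checking on HTTPS requests, so a POST over HTTPS without a Referer header is rejected with 403 even when the csrfmiddlewaretoken and cookie are correct. The header can also be supplied per request instead of on the session; a minimal sketch:

# equivalent: pass the Referer only on the POST instead of setting it on the session
res = session.post(url, data=data, headers={'Referer': url})
print(res.status_code)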

Test SameSite and Secure cookies in Django Test client response

I have a Django 3.1.7 API.
Until now (before Django 3.1), I was adding the SameSite and Secure cookie attributes to the responses through a custom middleware, depending on the user agent, with automated tests covering it.
Now that Django 3.1 can set those cookie attributes itself, I removed the custom middleware but still want to test for the presence of the SameSite and Secure attributes in the responses.
So I added the following settings in settings.py, as the Django docs describe:
CSRF_COOKIE_SECURE = True
SESSION_COOKIE_SECURE = True
CSRF_COOKIE_SAMESITE = 'None'
SESSION_COOKIE_SAMESITE = 'None'
But when I look at the content of the responses in my tests, I no longer get any SameSite or Secure cookie attributes. I printed the content of the cookies, and they are not there.
Why?
Here are my tests:
from django.test import Client

agent_string = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.2227.0 Safari/537.36"
test_client = Client()
res = test_client.get("/", HTTP_USER_AGENT=agent_string)
print(res.cookies.items())
I also tried with the DRF test client just in case, with the same result:
from rest_framework.test import APIClient

agent_string = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.2227.0 Safari/537.36"
test_client = APIClient()
res = test_client.get("/", HTTP_USER_AGENT=agent_string)
print(res.cookies.items())
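A hedged sketch of how the SameSite and Secure attributes can be read once a view actually sets a cookie; the "/login/" endpoint and the credentials below are placeholders for a view that creates a session, not part of the question's project:

from django.test import Client

# the attributes live on the cookie morsels of the response
test_client = Client()
res = test_client.post("/login/", {"username": "u", "password": "p"}, secure=True)
morsel = res.cookies.get("sessionid")
if morsel is not None:
    print(morsel["samesite"], morsel["secure"])  # expected: 'None', True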

BS4 error 'NoneType' object has no attribute 'find_all'. Cannot parse html data

import requests
from bs4 import BeautifulSoup as bs

session = requests.session()

def get_sizes_in_stock():
    global session
    endpoint = 'https://www.jimmyjazz.com/mens/footwear/nike-air-max-270/AH8050-100?color=White'
    response = session.get(endpoint)
    soup = bs(response.text, 'html.parser')
    div = soup.find('div', {'class': 'box_wrapper'})
    all_sizes = div.find_all('a')
    sizes_in_stock = []
    for size in all_sizes:
        if 'piunavailable' not in size['class']:
            size_id = size['id']
            sizes_in_stock.append(size_id.split('_')[1])
    return sizes_in_stock

print(get_sizes_in_stock())
Try adding the headers parameter; change:
response = session.get(endpoint)
to:
response = session.get(endpoint, headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'})
import requests
from bs4 import BeautifulSoup as bs

session = requests.session()

def get_sizes_in_stock():
    global session
    endpoint = "https://www.sneakers76.com/en/nike/5111-nike-af1-type-ci0054-001-.html"
    response = session.get(endpoint, headers={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36'})
    soup = bs(response.text, "html.parser")
    var = soup.find("var", {"blockwishlist_viewwishlist": "View your wishlist"})
    all_sizes = var.find_all("var combinations")
    sizes_in_stock = []
    for size in all_sizes:
        if "0" not in size["quantity"]:
            size_id = size["attributes"]
            sizes_in_stock.append(size_id)
    return sizes_in_stock

print(get_sizes_in_stock())

Persistent session in mechanize (Python) / Navigate to another site after login check

I need to log in to a site at one URL (e.g. 'www.targetsite.com/login') and then navigate to another URL to scrape data (e.g. 'www.targetsite.com/data'). This is because the site automatically redirects you to the home page after you log in, no matter which URL you used to access the site to begin with.
I'm using the mechanize Python library (old, I know, but it has some functions I'll need later on and is a good learning experience).
The problem I'm facing is that the cookie jar doesn't seem to be working the way I thought it would:
import mechanize
import Cookie
import cookielib
from bs4 import BeautifulSoup

cj = cookielib.LWPCookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)

### browser emulation
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

### login
login_url = "https://targetsite.org/login"
br.open(login_url)
br.select_form(action="https://targetsite.org/wp-login.php?wpe-login=true")
br.form['log'] = 'login'
br.form['pwd'] = 'password'
br.submit()

target_url = "https://targetsite.com/data"
br.open(target_url)
soup = BeautifulSoup(br.response().read())
body_tag = soup.body
all_paragraphs = soup.find_all('p')
print(body_tag.text)
Weirdly, the site doesn't seem to be registering my logged-in state and keeps redirecting my mechanize browser back to the login screen. Any idea what's going on?
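One thing worth checking, as a hedged debugging sketch: in the code above the login goes to targetsite.org while the data URL is on targetsite.com, and cookies set for one domain are not sent to the other, so if the placeholders mirror the real URLs that alone would explain the redirect. Capturing the submit response and dumping the cookie jar shows whether the login actually stuck:

# confirm where the login submit landed and which cookies were stored
resp = br.submit()
print(resp.geturl())  # ending up back on /login suggests the form fields or credentials were rejected
for cookie in cj:
    print(cookie.domain + " " + cookie.name)  # cookies are only resent to matching domains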

Scraping using Mechanize and BS4

I am trying to scrape articles from the Wall Street Journal. This involves logging in using mechanize and scraping using BeautifulSoup. I was hoping someone could take a look at my code and explain to me why it's not working.
I am using Python 2.7 on a 2012 MacBook Pro running the latest software. I'm new to Python, so explain it to me like I'm 5. Any advice would be deeply appreciated. Thanks in advance.
from bs4 import BeautifulSoup
import cookielib
import mechanize
#Browser
br = mechanize.Browser()
#Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# User-Agent
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.set_debug_http(True) # Print HTTP headers.
# Want more debugging messages?
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
# The site we will navigate into, handling its session
br.open('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=id-wsj')
# Select the first (index zero) form
br.select_form(nr=0)
# User credentials
br.form['username'] = 'username'
br.form['password'] = 'password'
# Login
br.submit()
#br.open("http://online.wsj.com/home-page")
br.open("http://online.wsj.com/news/articles/SB10001424052702304626304579506924089231470?mod=WSJ_hp_LEFTTopStories&mg=reno64-wsj&url=http%3A%2F%2Fonline.wsj.com%2Farticle%2FSB10001424052702304626304579506924089231470.html%3Fmod%3DWSJ_hp_LEFTTopStories&cb=logged0.9458705162058179")
soup = BeautifulSoup(br.response().read())
title = soup.find('h1')
print title
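A hedged debugging sketch: select_form(nr=0) and the 'username'/'password' control names in the code above are guesses, so listing the forms and their controls on the login page confirms whether the credentials are going into the right fields before submitting:

# list every form and its control names on the login page before calling select_form
br.open('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=id-wsj')
for form in br.forms():
    print form.name, [control.name for control in form.controls]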