Download file to AWS Lambda /tmp file directory using Chromedriver - amazon-web-services

I am trying to automate the download of a file to the /tmp directory in AWS Lambda using Chromedriver from this website https://registry.verra.org/app/search/VCS/All%20Projects. The steps are to 1) click on the 'Search' button, 2) wait for the results to load, 3) click on the 'Excel' logo to download the file.
I've referenced and tried the code provided in these 2 questions to change the download directory of Chromedriver but the file is not downloaded in the /tmp path.
AWS Lambda download a file using Chromedriver
prefs = {
"profile.default_content_settings.popups": 0,
"download.default_directory": r"/tmp",
"directory_upgrade": True
}
options.add_experimental_option("prefs", prefs)
Unable to change the default download location of chrome in AWS lambda to /tmp using Selenium
options = webdriver.ChromeOptions()
prefs = {"browser.downloads.dir": "//tmp//", "download.default_directory": "//tmp//", "directory_upgrade": True}
options.add_experimental_option("prefs", prefs)
For reference, here is my code which works fine in local but not in AWS Lambda.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import requests
import requests.auth
import json
import csv
def _enable_headless_downloads(driver, download_dir):
    """Allow headless Chrome to save downloads into ``download_dir``.

    Headless Chrome does not honour the ``download.default_directory``
    preference on its own; downloads have to be enabled explicitly through
    the DevTools ``Page.setDownloadBehavior`` command.
    """
    params = {"behavior": "allow", "downloadPath": download_dir}
    if hasattr(driver, "execute_cdp_cmd"):
        # Newer Selenium releases expose DevTools commands directly.
        driver.execute_cdp_cmd("Page.setDownloadBehavior", params)
        return
    # Older Selenium releases: register ChromeDriver's send_command endpoint
    # by hand and send the same DevTools command through it.
    driver.command_executor._commands["send_command"] = (
        "POST",
        "/session/$sessionId/chromium/send_command",
    )
    driver.execute(
        "send_command",
        {"cmd": "Page.setDownloadBehavior", "params": params},
    )


def _wait_for_download(directory, already_present, timeout=60, poll_interval=1):
    """Poll ``directory`` until a new, completed file shows up or ``timeout`` expires.

    Returns the sorted list of new file names (empty if nothing arrived in time).
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        names = os.listdir(directory)
        # Chrome writes partial downloads with a ".crdownload" suffix.
        still_downloading = any(name.endswith('.crdownload') for name in names)
        new_files = sorted(
            name for name in names
            if name not in already_present and not name.startswith('.')
        )
        if new_files and not still_downloading:
            return new_files
        time.sleep(poll_interval)
    return []


def lambda_handler(event, context):
    """Download the Verra project list as an Excel file into /tmp.

    Opens the Verra registry search page in headless Chrome, clicks
    'Search', waits for the result grid, clicks the Excel export button,
    waits for the download to finish and prints the contents of /tmp.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda context object (unused).

    Returns:
        dict with ``statusCode`` 200 and a fixed ``body`` message.
    """
    download_dir = '/tmp'
    search_btn_xpath = '/html/body/apx-root/div/div/apx-search-page/div/apx-search-container/div/div[2]/div/div[1]/apx-search-selection-criteria/div/form/div[2]/div/button[1]'
    pager_xpath = '/html/body/apx-root/div/div/apx-search-page/div/apx-search-container/div/div[2]/div/div[2]/apx-project-search-results/div/div/kendo-grid/kendo-pager/kendo-pager-numeric-buttons/ul/li[1]/a'
    download_btn_xpath = '/html/body/apx-root/div/div/apx-search-page/div/apx-search-container/div/div[2]/div/div[2]/apx-project-search-results/div/apx-search-results-header/div/button[1]'

    # change directory to /tmp folder
    os.chdir(download_dir)

    # get dataset from website
    options = Options()
    options.binary_location = '/opt/headless-chromium'
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--single-process')
    options.add_argument('--disable-dev-shm-usage')

    # set download settings (used by non-headless Chrome; headless needs the
    # DevTools command sent by _enable_headless_downloads below)
    prefs = {
        "profile.default_content_settings.popups": 0,
        "download.default_directory": download_dir,
        "directory_upgrade": True,
    }
    options.add_experimental_option("prefs", prefs)

    # open Chrome webdriver
    driver = webdriver.Chrome('/opt/chromedriver', options=options)
    try:
        _enable_headless_downloads(driver, download_dir)
        driver.maximize_window()
        driver.get('https://registry.verra.org/app/search/VCS/All%20Projects')

        # wait up to 60 seconds for the search button to appear
        print("Waiting for website to load...")
        search_btn = WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, search_btn_xpath))
        )
        print("Website loaded!")

        # click on search button to load results
        search_btn.click()

        # wait up to 100 seconds for results - detected via the pager's first page link
        WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.XPATH, pager_xpath))
        )
        print("Results loaded!")

        # wait up to 100 seconds for the download button to appear
        download_btn = WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.XPATH, download_btn_xpath))
        )

        # Snapshot /tmp first so the new file can be told apart from old ones.
        already_present = set(os.listdir(download_dir))

        # Click through JavaScript because the element may not be clickable natively.
        driver.execute_script("arguments[0].click();", download_btn)

        # wait up to 60 seconds for the file to finish downloading
        _wait_for_download(download_dir, already_present, timeout=60)

        # check if file is downloaded to /tmp directory
        tmp_files = os.listdir(download_dir)
        print("list", tmp_files)
    finally:
        # Lambda reuses warm containers; always shut Chrome down so browser
        # processes do not pile up across invocations.
        driver.quit()

    response = {
        "statusCode": 200,
        "body": "Selenium Headless Chrome Initialized"
    }
    return response
Could this be a versioning issue? Because my code only works if the runtime settings is for Python 3.7.
Selenium version: selenium/python/lib/python3.7/site-packages selenium==3.8.0\ (runtime Python 3.7)
Chromedriver version: https://chromedriver.storage.googleapis.com/2.37/chromedriver_linux64.zip\ (only runtime Python 3.9 works, changing to Python 3.7 does not work)
Headless Chrome: https://github.com/adieuadieu/serverless-chrome/releases/download/v1.0.0-41/stable-headless-chromium-amazonlinux-2017-03.zip

Related

Python 2.7 Selenium unable to extract data

I am trying to extract data, but it returns this error:
NoSuchElementException: Message: u'Unable to locate element: {"method":"xpath","selector":"//*[@id=\'searchpopbox\']"}' ; Stacktrace:
at FirefoxDriver.findElementInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8444)
at FirefoxDriver.findElement (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8453)
at DelayedCommand.executeInternal_/h (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10456)
at DelayedCommand.executeInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10461)
at DelayedCommand.execute/< (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10401)
My code is as below and I am trying to get the list from the link
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Firefox profile that downloads without showing the download manager window.
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
browser = webdriver.Firefox(profile)

url = 'https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax'
browser.get(url)
# Give the AJAX-driven page time to render before querying the DOM.
time.sleep(15)
# XPath attribute tests use "@"; "#id" is not valid XPath.
a = browser.find_element_by_xpath("//*[@id='searchpopbox']")
# print(...) with a single argument behaves the same on Python 2 and 3.
print(a)
I am seeking your help to get the right xpath for the url.
This gets all the listing for that table.
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver

# ChromeDriverManager().install() returns the path of the driver binary it installs.
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax")
# Give the AJAX-driven page time to render before querying the DOM.
time.sleep(15)
# XPath attribute tests use "@"; "#id" is not valid XPath.
a = driver.find_element_by_xpath("//*[@id='searchpopbox']")
print(a.text)
Or without chromedrivermanager same thing applies to firefox
.Chrome(executable_path='absolutepathofchromedriver.exe')

How to handle pop up window dialog to download file automatically with firefox profile in python selenium on Linux (Ubuntu) system

I am trying to download the file automatically from system file download dialog by setting preference in firefox profile in my python selenium code , but my code is not working.
Browser : Firefox 72.0
Selenium Version : 3.14
OS : linux Ubuntu
Filetype to download: *.enc (encrypted file type)
Path of firefox in linux : /usr/bin/firefox
Code :
# Build a Firefox profile intended to save downloads without showing a dialog.
profile = webdriver.FirefoxProfile()
# folderList=2 makes Firefox use the custom directory given in browser.download.dir.
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", "/home/user/Downloads/tests")
# Comma-separated MIME types to save without asking; the list has to contain
# the Content-Type the server actually sends for the file.
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-uuencoded,application/octet-stream")
# Start Firefox with the customised profile.
self.driver = webdriver.Firefox(firefox_profile=profile)
Hi @Sum, I have resolved this; my problem was a different Content-Type.
Use this example to resolve your problem , and to understand your Content-Type: https://stackoverflow.com/a/36356422/12911814
In my case the Content-Type was "application/force-download" not "application/pdf"
# Disable the built-in PDF viewer so PDFs are downloaded instead of rendered in a tab.
profile.set_preference("pdfjs.disabled", True)
# folderList=2 makes Firefox use the custom directory given in browser.download.dir.
profile.set_preference("browser.download.folderList",2)
profile.set_preference("browser.download.manager.useWindow", False)
# "<path>" is a placeholder for the target download directory.
profile.set_preference("browser.download.dir", "<path>")
# Include every Content-Type the server may send; here the file arrived as
# "application/force-download" rather than "application/pdf".
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf, application/force-download")
These settings worked for me. Hope it might help you.
I have the same problem with Firefox 72.0, but with pdf files. This is the code:
# Firefox profile that tries to force PDF files to be saved without a prompt.
fp = webdriver.FirefoxProfile()
# Disable the built-in PDF viewer.
fp.set_preference("pdfjs.disabled", True)
# folderList=2 makes Firefox use the custom directory given in browser.download.dir.
fp.set_preference("browser.download.folderList", 2)
# "/path" is a placeholder for the target download directory.
fp.set_preference("browser.download.dir", "/path")
fp.set_preference("browser.download.downloadDir", "/path")
fp.set_preference("browser.download.defaultFolder", "/path")
# NOTE(review): none of the MIME lists below contains plain "application/pdf"
# (or "application/force-download"); verify the Content-Type the server really
# sends, since an unlisted type presumably still triggers the download prompt.
fp.set_preference("plugin.disable_full_page_plugin_for_types", "application/x-pdf, application/acrobat, applications/vnd.pdf, text/pdf, text/x-pdf, application/vnd.cups-pdf")
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-pdf, application/acrobat, applications/vnd.pdf, text/pdf, text/x-pdf, application/vnd.cups-pdf")
fp.set_preference("browser.helperApps.neverAsk.openFile", "application/x-pdf, application/acrobat, applications/vnd.pdf, text/pdf, text/x-pdf, application/vnd.cups-pdf")
driver = webdriver.Firefox(firefox_profile=fp)
I have tried all possible preferences, but it always triggers the download alert.
Try this; it works like a charm:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import time
import pyautogui
# Open the PDF in Firefox, click its download button and confirm the native
# save dialog by sending an Enter key press through pyautogui.
driver = None
try:
    driver = webdriver.Firefox()
    driver.implicitly_wait(30)
    driver.maximize_window()
    driver.get("https://www.citysdk.eu/wp-content/uploads/2013/09/DELIVERABLE_WP4_TA_SRS_0.21.pdf")
    WebDriverWait(driver, 10).until(lambda d: d.execute_script('return document.readyState') == 'complete')
    # Click the OK button and close
    time.sleep(5)
    webelem = driver.find_element_by_id('download')
    webelem.click()
    # Give the native download dialog time to appear before pressing Enter.
    time.sleep(5)
    print('press enter')
    pyautogui.press('enter')
except Exception as err:
    print('ERROR: %s\n' % str(err))
finally:
    # Quit only if the browser actually started; otherwise a start-up failure
    # would be followed by a NameError that hides the real error.
    if driver is not None:
        driver.quit()
The correct MIME type for .enc is "text/x-uuencoded"
Updated as below in code and it's working :
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/x-uuencoded")

Python request download a file and save to a specific directory

Hello sorry if this question has been asked before.
But I have tried a lot of methods that provided.
Basically, I want to download the file from a website, which is I will show my coding below. The code works perfectly, but the problem is the file was auto download in our download folder path directory.
My concern is to download the file and save it to a specific folder.
I'm aware we can change the browser setting, but this is a server that different users access remotely, so the file is automatically downloaded to each user's own temporary /users/adam_01/download/ folder.
I want it to save in server disk which is, C://ExcelFile/
Below are my script and some of the data have been changing because it is confidential.
import pandas as pd
import html5lib
import time
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime
import urllib.request
import os

# Log in with a session, fetch the CSV export and write it to disk.
with requests.Session() as c:
    # NOTE(review): `proxies` is merged into the login form data below but is
    # never passed to requests as a proxy setting - confirm the intent.
    proxies = {"http": "http://:911"}
    url = 'https://......./login.jsp'
    USERNAME = 'mwirzonw'
    PASSWORD = 'Fiqr123'
    c.get(url,verify= False)
    csrftoken = ''
    login_data = dict(proxies,atl_token = csrftoken, os_username=USERNAME, os_password=PASSWORD, next='/')
    c.post(url, data=login_data, headers={"referer" : "https://.....com"})
    page = c.get('https://........s...../SearchRequest-96010.csv')
    location = 'C:/Users/..../Downloads/'
    # The file name is relative and `location` is not used, so the CSV is
    # written to the current working directory.
    with open('asdsad906010.csv', 'wb') as output:
        output.write(page.content )
    print("Done!")
Thank you. Please feel free to ask if any of the information given is confusing.
Regards,
Fiqri
It seems that from your script you are writing the file to asdsad906010.csv. You should be able to change the output directory as follows.
# Set the output directory to your desired location
output_directory = 'C:/ExcelFile/'
# Make sure the directory exists; open() does not create missing folders
os.makedirs(output_directory, exist_ok=True)
# Create a file path by joining the directory name with the desired file name
file_path = os.path.join(output_directory, 'asdsad906010.csv')
# Write the downloaded bytes to that path
with open(file_path, 'wb') as output:
    output.write(page.content)

Selenium - ChromeDriver "unknown error: Chrome failed to start"

I'm pretty new in selenium and getting an error with ChromeWebDriver.
I'm using: Chrome 36, ChromeWebDriver 2.10, Windows 7
Here's my code:
from selenium import webdriver
webD = webdriver.Chrome();
But I get the response
unknown error: Chrome failed to start
How can I fix this?
You may need to download the chrome executable driver from http://chromedriver.storage.googleapis.com/index.html and set the executable path accordingly.
Sample Python Code :
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the downloaded chromedriver executable.
chromedriver = "./chromedriver"
# NOTE(review): this environment variable looks like the Java-style property
# name; the path passed to webdriver.Chrome() below is what is actually used
# here - confirm whether the variable is needed at all.
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
#driver = webdriver.Firefox()
try:
    driver.get("http://www.python.org")
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(driver.title)
    assert "Python" in driver.title
finally:
    # Always close the browser, even when the assertion fails.
    driver.quit()
For more information and end to end script follow
Reference

Where are python logs default stored when ran through IPython notebook?

In an IPython notebook cell I wrote:
import logging

logging.basicConfig(level=logging.DEBUG)

# File handler first: only a file name is given, so the log file is opened
# relative to the current working directory of the process.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.FileHandler('model.log')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Attach the handler to this module's logger.
logger = logging.getLogger(__name__)
logger.addHandler(handler)
Notice that I am supplying a file name, but not a path.
Where could I find that log? (ran a 'find' and couldn't locate it...)
There are multiple ways to set the IPython working directory. If you don't set any of them in your IPython profile/config, environment or notebook, the log should be in your working directory. Also try running `ipython locate` to print the default IPython directory path; the log may be there.
What about giving it an absolute file path to see if it works at all?
Other than that the call to logging.basicConfig doesn't seem to do anything inside an IPython notebook:
# In:
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
logger.debug('root debug test')
There's no output.
As per the docs, the logging.basicConfig doesn't do anything if the root logger already has handlers configured for it. This seems to be the case, IPython apparently already has the root logger set up. We can confirm it:
# In:
import logging
logger = logging.getLogger()
logger.handlers
# Out:
[<logging.StreamHandler at 0x106fa19d0>]
So we can try setting the root logger level manually:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.debug('root debug test')
which yields a formatted output in the notebook:
Now onto setting up the file logger:
# In:
import logging

# Root logger: lower its threshold so DEBUG and above get through.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# File handler that records INFO and above in model.log.
handler = logging.FileHandler('model.log')
handler.setLevel(logging.INFO)

# Module-level logger that writes through the file handler.
logger = logging.getLogger(__name__)
logger.addHandler(handler)

# Emit one record to exercise the setup.
logger.info('test info my')
which results in writing the output both to the notebook and the model.log file, which for me is located in a directory I started IPython and notebook from.
Mind that repeated calls to this piece of code without restarting the IPython kernel will result in creating and attaching yet another handler to the logger on every run and the number of messages being logged to the file with each log call will grow.
Declare the path of the log file in the basicConfig like this :
# `filename` must name a file, not a directory; a path ending in "/" cannot
# be opened as a log file.
log_file_path = "/your/path/model.log"
# filemode 'w' truncates the log file each time logging is configured.
logging.basicConfig(level = logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    filename = log_file_path,
                    filemode = 'w')
You can then start logging and why not add a different log format to the console if you want :
# define a Handler which writes DEBUG messages or higher to sys.stderr
# (StreamHandler() with no argument uses sys.stderr)
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
# set a format which is simpler for console use
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
# tell the handler to use this format
console.setFormatter(formatter)
# add the handler to the root logger
logging.getLogger().addHandler(console)
# fetch the root logger for use in the rest of the code
logger = logging.getLogger()
et voilà.