ThreadPoolExecutor fails when run with manage.py - django

# test.py
# python 3.4.5
import time
from concurrent.futures import ThreadPoolExecutor
def a():
time.sleep(1)
print("success")
executor = ThreadPoolExecutor(1)
executor.submit(a).result()
The above snippet works when run like
$ python test.py
success
But fails when run like
$ python manage.py shell < test.py
Traceback (most recent call last):
File "manage.py", line 22, in <module>
execute_from_command_line(sys.argv)
File "/var/www/cgi-bin/tracking/lib64/python3.4/site-packages/django/core/management/__init__.py", line 363, in execute_from_command_line
utility.execute()
File "/var/www/cgi-bin/tracking/lib64/python3.4/site-packages/django/core/management/__init__.py", line 355, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/var/www/cgi-bin/tracking/lib64/python3.4/site-packages/django/core/management/base.py", line 283, in run_from_argv
self.execute(*args, **cmd_options)
File "/var/www/cgi-bin/tracking/lib64/python3.4/site-packages/django/core/management/base.py", line 330, in execute
output = self.handle(*args, **options)
File "/var/www/cgi-bin/tracking/lib64/python3.4/site-packages/django/core/management/commands/shell.py", line 101, in handle
exec(sys.stdin.read())
File "<string>", line 11, in <module>
File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 395, in result
return self.__get_result()
File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 354, in __get_result
raise self._exception
File "/usr/lib64/python3.4/concurrent/futures/thread.py", line 54, in run
result = self.fn(*self.args, **self.kwargs)
File "<string>", line 7, in a
NameError: name 'time' is not defined
Which is really strange to me. What is it about running the script with the manage.py shell command that results in the time module being undefined in the function a?

Checking in the Django implementation (django/core/management/commands/shell.py line 83):
# Execute stdin if it has anything to read and exit.
# Not supported on Windows due to select.select() limitations.
if sys.platform != 'win32' and select.select([sys.stdin], [], [], 0)[0]:
exec(sys.stdin.read())
return
The exec() call is not given an explicit globals dictionary, so the code read from stdin runs with the locals() of the handle() method (in shell.py) as its local namespace. That means "import time" and the ThreadPoolExecutor import bind those names in the locals() dictionary of handle(). Later, when a() runs, Python looks for time in the locals() of a() and then in the globals() dictionary, but never in the locals() of handle(), so it raises a NameError. You can reproduce this with the following snippet:
command = """
import time
def b():
time.sleep(1)
b()
"""
def a():
exec(command)
a()
Then try replacing exec(command) with exec(command, globals()); with an explicit globals dictionary the NameError goes away.

I think it's not working because you did not set the environment variable DJANGO_SETTINGS_MODULE to your settings and call django.setup(), or add the project path with sys.path.append('path/').
(I'm not sure about this.)
But these 2 options can work like a charm:
Either you import the module time inside the function:
from concurrent.futures import ThreadPoolExecutor
def a():
import time
time.sleep(1)
print("success")
executor = ThreadPoolExecutor(1)
executor.submit(a).result()
or just import time at the beginning like you did, and use the module as a global one:
from concurrent.futures import ThreadPoolExecutor
import time
def a():
global time
time.sleep(1)
print("success")
executor = ThreadPoolExecutor(1)
executor.submit(a).result()

Related

Running Scrapy with a task queue

I built a web crawler with Scrapy and Django and put the CrawlerRunner code into a task queue. Locally everything works fine, but the tasks fail when they run on the server. I suspect that multiple threads are causing the problem.
This is the task code; I'm using huey for the tasks:
from huey import crontab
from huey.contrib.djhuey import db_periodic_task, on_startup
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from apps.core.tasks import CRONTAB_PERIODS
from apps.scrapers.crawler1 import Crawler1
from apps.scrapers.crawler2 import Crawler2
from apps.scrapers.crawler3 import Crawler3
#on_startup(name="scrape_all__on_startup")
#db_periodic_task(crontab(**CRONTAB_PERIODS["every_10_minutes"]))
def scrape_all():
configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings=settings)
runner.crawl(Crawler1)
runner.crawl(Crawler2)
runner.crawl(Crawler3)
defer = runner.join()
defer.addBoth(lambda _: reactor.stop())
reactor.run()
This is the first error I get from sentry.io (it is truncated):
Unhandled Error
Traceback (most recent call last):
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/base.py", line 501, in fireEvent
DeferredList(beforeResults).addCallback(self._continueFiring)
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/defer.py", line 532, in addCallback
return self.addCallbacks(callback, callbackArgs=args, callbackKeywords=kwargs)
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/defer.py", line 512, in addCallbacks
self._runCallbacks()
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/defer.py", line 892, in _runCallbacks
current.result = callback( # type: ignore[misc]
--- <exception caught here> ---
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/base.py", line 513, in _continueFiring
callable(*args, **kwargs)
File "/home/deployer/env/lib/python3.10/site-packages/twisted/internet/base.py", line 1314, in _reallyStartRunning
self._handle...
The task is set to run every 10 minutes; on the second run I get this error from sentry.io:
ReactorNotRestartable: null
File "huey/api.py", line 379, in _execute
task_value = task.execute()
File "huey/api.py", line 772, in execute
return func(*args, **kwargs)
File "huey/contrib/djhuey/__init__.py", line 135, in inner
return fn(*args, **kwargs)
File "apps/series/tasks.py", line 31, in scrape_all
reactor.run()
File "twisted/internet/base.py", line 1317, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "twisted/internet/base.py", line 1299, in startRunning
ReactorBase.startRunning(cast(ReactorBase, self))
File "twisted/internet/base.py", line 843, in startRunning
raise error.ReactorNotRestartable()
My assumption is that the Twisted reactor did not shut down cleanly after the first run, so when huey tries to start a reactor again 10 minutes later, it fails.
I'm not proficient with multithreading, but I assume the task runner and Twisted are running on different threads and cannot communicate with each other.
Any advice?

Import Python file which contains pySpark functions into Django app

I'm trying to import a Python file, "load_model.py", which contains my custom PySpark API, into the views.py of my Django app, but I get an error and I can't figure out how to solve it.
I import the file "load_model.py" with a simple:
import load_model as lm
My load_model.py contains the following code (this is just part of the code):
import findspark
# findspark.init('/home/student/spark-2.1.1-bin-hadoop2.7')
findspark.init('/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7')
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row
from collections import OrderedDict
spark = SparkSession.builder.appName('RForest_Regression').getOrCreate()
sc = spark.sparkContext
model = RandomForestRegressionModel.load('model/')
def predict(df):
predictions = model.transform(df)
return int(predictions.select('prediction').collect()[0].prediction)
# etc... ... ...
When I launch python manage.py runserver from my command line, I get this error log:
19/07/20 07:22:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Traceback (most recent call last):
File "manage.py", line 21, in <module>
main()
File "manage.py", line 17, in main
execute_from_command_line(sys.argv)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/__init__.py", line 381, in execute_from_command_line
utility.execute()
File "/anaconda3/lib/python3.7/site-packages/django/core/management/__init__.py", line 375, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/base.py", line 323, in run_from_argv
self.execute(*args, **cmd_options)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/commands/runserver.py", line 60, in execute
super().execute(*args, **options)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/base.py", line 364, in execute
output = self.handle(*args, **options)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/commands/runserver.py", line 95, in handle
self.run(**options)
File "/anaconda3/lib/python3.7/site-packages/django/core/management/commands/runserver.py", line 102, in run
autoreload.run_with_reloader(self.inner_run, **options)
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 585, in run_with_reloader
start_django(reloader, main_func, *args, **kwargs)
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 570, in start_django
reloader.run(django_main_thread)
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 288, in run
self.run_loop()
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 294, in run_loop
next(ticker)
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 334, in tick
for filepath, mtime in self.snapshot_files():
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 350, in snapshot_files
for file in self.watched_files():
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 249, in watched_files
yield from iter_all_python_module_files()
File "/anaconda3/lib/python3.7/site-packages/django/utils/autoreload.py", line 101, in iter_all_python_module_files
modules_view = sorted(list(sys.modules.items()), key=lambda i: i[0])
RuntimeError: dictionary changed size during iteration
Exception ignored in: <function JavaWrapper.__del__ at 0x11d2de6a8>
Traceback (most recent call last):
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 41, in __del__
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 2000, in detach
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1298, in _detach
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 628, in _garbage_collect_object
File "/anaconda3/lib/python3.7/logging/__init__.py", line 1370, in debug
File "/anaconda3/lib/python3.7/logging/__init__.py", line 1626, in isEnabledFor
TypeError: 'NoneType' object is not callable
Exception ignored in: <function GatewayConnection.__init__.<locals>.<lambda> at 0x11da84d90>
Traceback (most recent call last):
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1061, in <lambda>
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 640, in _garbage_collect_connection
File "/Users/fabiomagarelli/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 487, in quiet_shutdown
File "/anaconda3/lib/python3.7/logging/__init__.py", line 1370, in debug
File "/anaconda3/lib/python3.7/logging/__init__.py", line 1626, in isEnabledFor
TypeError: 'NoneType' object is not callable
pySpark is installed on my computer, I was using it on my jupyter notebook for fitting the model so I don't think the problem is that pyspark is not installed. Any suggestions?
So I found some tutorials on how to deploy a pySpark ML model using databricks, tensor flow etc. All too complex solutions for my limited pySpark knowledge and a project deadline in 4 weeks.
However, I found a workaround which consists in "deploying" the ML model on a Flask App then call it from my Django App (my project app). I think this may be very useful for someone facing my same problem. Not the best practice maybe but working! That's why I'm going to explain each step:
1. Create a Flask Application
in the command line (in your virtual env if you have one), type: pip install flask.
make a new folder (i call it 'static') and place in it the model folder which is obtained by saving the pySpark model (it contains other folders: data, metadata...)
create a new folder for your flask app (can be in the parent folder of your django app) and create a file in it named main.py (you can use whatever name but for the code I'm gonna post, this is the name I used).
in main.py, copy paste this:
from flask import Flask, request
import findspark
findspark.init('/home/student/spark-2.1.1-bin-hadoop2.7')
# various pySpark imports here...
app = Flask(__name__)
spark = SparkSession.builder.appName('RForest_Regression').getOrCreate()
sc = spark.sparkContext
# I'm using a RandomForest ML model, change it as appropriate
model = RandomForestRegressionModel.load('static/model/')
# define here all your functions to make a prediction (eventual arguments cleaning...)
#app.route('/predict')
# this is the function called when the page: '127.0.0.1:5000/predict' is requested.
# you can pass arguments in here by calling: '127.0.0.1:5000/predict?data=...'
numbers = request.args.get('data') # numbers = '...'
makePredictions(numbers)
def makePredictions(n):
# your function here
Now, in the Django app, open views.py
and add the function that requests the predictions from the Flask app:
# Send a request to the flask App where the model is hosted
def getPredictions(request):
try:
data_to_predict = request.GET['data']
url = 'http://127.0.0.1:5000/predict?data=%s' % data_to_predict
response = get(url)
return JsonResponse(response.text, safe=False)
except:
print('ERROR getPredictions: no pySpark module or Flask App not running or wrong arguments')
then call the getPredictions function from javascript in your django app (I haven't done it yet so I don't have a snippet but so far is working, I tested it passing custom arguments).
You have to remember to run the Flask app and the Django app together in order to make it work:
cd into your Flask app folder (where you have the main.py file) then type: export FLASK_APP=main.py and flask run
then cd into your django app (where you have the manage.py file) then type: python manage.py runserver
I hope this will be useful to someone and that my explanation is not too messy. I will appreciate any comments, suggestions and requests. :)

Django command throws TypeError: handle() got an unexpected keyword argument

I'm using Django 1.10.4 and Python 3.5.2. When I try to run a Django command via python manage.py my_command I get the following error:
Traceback (most recent call last):
File "manage.py", line 22, in <module>
execute_from_command_line(sys.argv)
File "path_to_envs/envs/env_name/lib/python3.5/site-packages/django/core/management/__init__.py", line 367, in execute_from_command_line
utility.execute()
File "path_to_envs/envs/env_name/lib/python3.5/site-packages/django/core/management/__init__.py", line 359, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "path_to_envs/envs/env_name/lib/python3.5/site-packages/django/core/management/base.py", line 294, in run_from_argv
self.execute(*args, **cmd_options)
File "path_to_envs/envs/env_name/lib/python3.5/site-packages/django/core/management/base.py", line 345, in execute
output = self.handle(*args, **options)
TypeError: handle() got an unexpected keyword argument 'verbosity'
I can run a local django server and interact with the admin pages. The app that contains that command is in my settings.py file.
Below is the contents of the django command:
from django.core.management import BaseCommand
from my_module import MyClass
class Command(BaseCommand):
def handle(self):
my_class = MyClass()
my_class.my_method()
At the time of error, the options dictionary contains {'verbosity': 1, 'no_color': False, 'settings': None, 'pythonpath': None, 'traceback': False}. Depending on the random ordering of the dictionary no_color, traceback, and the others will throw the same TypeError. After a day of googling I still can't figure out what the issue is. Has anyone seen this before?
After lots of googling and pulling my hair out, the issue was an incorrect number of arguments to handle().
This:
def handle(self):
Should be:
def handle(self, *args, **options):
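If your command needs no arguments and you are on an older Django release (NoArgsCommand was deprecated in 1.8 and is no longer available in 1.10), you can try a subclass of BaseCommand: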
NoArgsCommand.handle_noargs(**options)

How to deal with httplib.BadStatusLine: ''

I'm scraping some data off the web using Python, BeautifulSoup and Selenium. I am also using PyVirtualDisplay so that I do not need a display.
It works perfectly from my laptop, but when I run it from a server I get the following error:
httplib.BadStatusLine: ''
I got this the second time it scraped a page. It now does it all the time. What is the issue?
EDIT
Code Added:
import requests, bs4
import csv
import re
import datetime
import time
import os
from contextlib import closing
from selenium import webdriver
from selenium.webdriver import Firefox # pip install selenium
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1500, 1200))
display.start()
url_base = "https://www.seek.com.au/jobs?page="
# open web browser and login
binary = FirefoxBinary('/home/firefox/firefox/firefox')
driver = webdriver.Firefox(firefox_binary=binary)
overlap = False
page = 0
while not overlap:
page += 1
driver.get(url_base+str(page))
...
And here is the traceback:
Traceback (most recent call last):
File "manage.py", line 22, in <module>
execute_from_command_line(sys.argv)
File "/var/www/matt/env/local/lib/python2.7/site-packages/django/core/management/__init__.py", line 367, in execute_from_command_line
utility.execute()
File "/var/www/matt/env/local/lib/python2.7/site-packages/django/core/management/__init__.py", line 359, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/var/www/matt/env/local/lib/python2.7/site-packages/django/core/management/base.py", line 294, in run_from_argv
self.execute(*args, **cmd_options)
File "/var/www/matt/env/local/lib/python2.7/site-packages/django/core/management/base.py", line 345, in execute
output = self.handle(*args, **options)
File "/var/www/matt/matt/management/commands/mattv3.py", line 109, in handle
driver.get(url_base+str(page))
File "/var/www/matt/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 245, in get
self.execute(Command.GET, {'url': url})
File "/var/www/matt/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 231, in execute
response = self.command_executor.execute(driver_command, params)
File "/var/www/matt/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "/var/www/matt/env/local/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 426, in _request
resp = self._conn.getresponse()
File "/usr/lib/python2.7/httplib.py", line 1136, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
I was running this on a very small server (512MB, 20GB SSD). I've increased it and it is running fine. If someone could explain the issue to me I would love to understand.

django management command: using LabelCommand

I want to pass an argument to this management command. I run the code from the command line as:
python manage.py example1 amita
where example1 is the name of my file and amita is the argument. On running this I get an error. Here is the traceback:
Traceback (most recent call last):
File "manage.py", line 79, in <module>
execute_manager(settings)
File "/usr/lib/python2.7/dist-packages/django/core/management/__init__.py", line 438, in execute_manager
utility.execute()
File "/usr/lib/python2.7/dist-packages/django/core/management/__init__.py", line 379, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/usr/lib/python2.7/dist-packages/django/core/management/__init__.py", line 261, in fetch_command
klass = load_command_class(app_name, subcommand)
File "/usr/lib/python2.7/dist-packages/django/core/management/__init__.py", line 68, in load_command_class
return module.Command()
AttributeError: 'module' object has no attribute 'Command'
the code for example1.py is below
from django.core.management.base import LabelCommand
from django.core.management.base import BaseCommand
def hello(name):
print name
def hello1(name):
print name
class LabelCommand(BaseCommand):
"""
A management command which takes one or more arbitrary arguments
(labels) on the command line, and does something with each of
them.
Rather than implementing ``handle()``, subclasses must implement
``handle_label()``, which will be called once for each label.
If the arguments should be names of installed applications, use
``AppCommand`` instead.
"""
args = '<label label ...>'
label = 'label'
def handle(self, *labels, **options):
if not labels:
raise CommandError('Enter at least one %s.' % self.label)
output = []
for label in labels:
label_output = self.handle_label(label, **options)
if label_output:
output.append(label_output)
return '\n'.join(output)
def handle_label(self, label, **options):
"""
Perform the command's actions for ``label``, which will be the
string as given on the command line.
"""
hello(label)
hello1(label)
raise NotImplementedError()
Django already has a LabelCommand class that you should use:
from django.core.management.base import LabelCommand
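Then you just have to override the handle_label method in your own subclass. That subclass must be named Command, because Django instantiates module.Command (see the last frame of your traceback); your module defines a class called LabelCommand instead, which is why the AttributeError is raised.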