Python interpreter hangs - experimenting with threads and queues - python-2.7

The following code consistently hangs on exit:
import threading
import Queue as queue
import time
import sys

class WorkItem(threading.Thread):
    def __init__(self):
        self.P1 = 20
        self.P2 = 40
        threading.Thread.__init__(self)

    def run(self):
        print "P1 = %d" % (self.P1)
        print "P2 = %d" % (self.P2)

class WorkQueue(object):
    def __init__(self, queueLimit=5):
        self.WorkQueue = queue.Queue(queueLimit)
        self.dispatcherThread = threading.Thread(target=self.DequeueWorker)
        self.dispatcherThread.start()
        self.QueueStopEvent = threading.Event()
        self.QueueStopEvent.clear()

    def DequeueWorker(self):
        print "DequeueWorker Enter .."
        while not self.QueueStopEvent.isSet():
            workItem = self.WorkQueue.get(True)
            workItem.start()

    def DispatchToQueue(self, workItem):
        self.WorkQueue.put(workItem, True)

    def Stop(self):
        self.QueueStopEvent.set()
        self.queue = None

def main():
    q = WorkQueue()
    for i in range(1, 20):
        t = WorkItem()
        q.DispatchToQueue(t)
    time.sleep(10)
    q.Stop()

if __name__ == "__main__":
    main()
I can see that DequeueWorker is the thread that is still running and pending, and I'm trying to understand why, since I do signal the Stop event. I was expecting the loop to exit.
>>> $w
=> Frame id=0, function=DequeueWorker
Frame id=1, function=run
Frame id=2, function=__bootstrap_inner
Frame id=3, function=__bootstrap
Help appreciated !!

You're calling get with block set to True, which means it will block until an item is actually available in the queue. In your code, once the work queue is exhausted, the next get will indefinitely block since it is waiting for an additional work item that will never come (and not letting the next iteration of the while loop execute, so the status of QueueStopEvent doesn't get checked anymore). Try modifying your DequeueWorker method to this:
def DequeueWorker(self):
    print "DequeueWorker Enter .."
    while not self.QueueStopEvent.isSet():
        try:
            workItem = self.WorkQueue.get(True, timeout=3)
            workItem.start()
        except queue.Empty:
            continue
Now when get is called after the work queue is exhausted, it will timeout (after 3 seconds in this case, I chose that arbitrarily) and raise the queue.Empty exception. In this case, we're simply going to let the loop continue to the next iteration where the loop will break itself when QueueStopEvent eventually gets set.
Other options would be to invoke get with block set to False or to use the get_nowait method inside that try/except block:
workItem = self.WorkQueue.get(False)
workItem = self.WorkQueue.get_nowait()
Although that creates a very tight while loop when the queue is empty.
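Another option, shown here only as a rough sketch of a common pattern (the _SENTINEL name is mine, not from the original code), is to keep the blocking get but push a sentinel object into the queue at shutdown so the worker wakes up and exits:
_SENTINEL = object()  # placeholder object used only to signal shutdown

def DequeueWorker(self):
    print "DequeueWorker Enter .."
    while True:
        workItem = self.WorkQueue.get(True)   # blocks until a work item or the sentinel arrives
        if workItem is _SENTINEL:
            break
        workItem.start()

def Stop(self):
    self.WorkQueue.put(_SENTINEL, True)       # wakes the blocked get() so the worker can exit
    self.dispatcherThread.join()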

Related

psutil's cpu_percent always returns 0.0

I would like my Flask application to report how much CPU and memory it is currently using as a percentage:
import psutil
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/test", methods=["GET"])
def healthz():
    return jsonify(msg="OK"), 200

@app.route("/stats", methods=["GET"])
def stats():
    p = psutil.Process()
    json_body = {
        "cpu_percent": p.cpu_percent(interval=None),
        "cpu_times": p.cpu_times(),
        "mem_info": p.memory_info(),
        "mem_percent": p.memory_percent()
    }
    return jsonify(json_body), 200

def main():
    app.run(host="0.0.0.0", port=8000, debug=False)

if __name__ == '__main__':
    main()
While sending a lot of requests to /test, /stats always returns 0.0 for cpu_percent:
$ while true; do curl http://127.0.0.1:8000/test &>/dev/null; done &
$ curl http://127.0.0.1:8000/stats
{
  "cpu_percent": 0.0,
  "cpu_times": [
    4.97,
    1.28,
    0.0,
    0.0
  ],
  "mem_info": [
    19652608,
    243068928,
    4292608,
    4096,
    0,
    14675968,
    0
  ],
  "mem_percent": 1.8873787935409003
}
However, if I manually check using ipython:
import psutil
p = psutil.Process(10993)
p.cpu_percent()
This correctly returns a value greater than 0.0.
Simply define p = psutil.Process() globally (outside of the stats() function). cpu_percent() keeps track of CPU times since the last call, and that is how it is able to determine the percentage.
The first call will always return 0.0, because calculating a percentage requires comparing two values over time, so some time has to pass between calls.
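As a rough sketch of that suggestion applied to the app in the question (only the /stats route is shown; the priming call is optional but avoids 0.0 on the very first request):
import psutil
from flask import Flask, jsonify

app = Flask(__name__)
p = psutil.Process()          # created once at module level, so it keeps state between calls
p.cpu_percent(interval=None)  # prime it; the first call always returns 0.0

@app.route("/stats", methods=["GET"])
def stats():
    return jsonify(cpu_percent=p.cpu_percent(interval=None)), 200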
As Giampaolo pointed out, the Process instance needs to be at global scope, because it tracks state from the previous call in order to work out the percentage.
Do be aware though that the CPU percentage can jump around quite a lot from one moment to the next, and when the time period it is calculated over keeps changing it can be quite confusing. It is perhaps better to use a background thread which works out the average CPU percentage over set time ranges.
Some code I happened to have handy may be of interest:
from __future__ import print_function

import os
import time
import atexit
import threading

try:
    import Queue as queue
except ImportError:
    import queue

import psutil

_running = False
_queue = queue.Queue()
_lock = threading.Lock()

# Rolling history of total CPU percentage, one sample per second (30 minutes worth).
_cpu_percentage = 1800 * [0.0]
_processes = {}

def _monitor():
    global _cpu_percentage
    global _processes

    while True:
        marker = time.time()

        total = 0.0
        pids = psutil.pids()

        processes = {}

        for pid in pids:
            # Reuse existing Process objects so cpu_percent() is measured
            # against the previous call rather than always returning 0.0.
            process = _processes.get(pid)
            if process is None:
                process = psutil.Process(pid)
            processes[pid] = process
            total += process.cpu_percent()

        _processes = processes

        _cpu_percentage.insert(0, total)
        _cpu_percentage = _cpu_percentage[:1800]

        # Wait out the remainder of the one second interval, but return
        # immediately (ending the thread) if something is pushed onto the queue.
        duration = max(0.0, 1.0 - (time.time() - marker))

        try:
            return _queue.get(timeout=duration)
        except queue.Empty:
            pass

_thread = threading.Thread(target=_monitor)
_thread.setDaemon(True)

def _exiting():
    try:
        _queue.put(True)
    except Exception:
        pass
    _thread.join()

def start_monitor():
    global _running

    _lock.acquire()

    if not _running:
        prefix = 'monitor (pid=%d):' % os.getpid()
        print('%s Starting CPU monitor.' % prefix)

        _running = True

        _thread.start()
        atexit.register(_exiting)

    _lock.release()

def cpu_averages():
    # Snapshot the full history so the longer averages have enough samples.
    values = list(_cpu_percentage)

    averages = {}

    def average(secs):
        return min(100.0, sum(values[:secs]) / secs)

    averages['cpu.average.1s'] = average(1)
    averages['cpu.average.5s'] = average(5)
    averages['cpu.average.15s'] = average(15)
    averages['cpu.average.30s'] = average(30)
    averages['cpu.average.1m'] = average(60)
    averages['cpu.average.5m'] = average(300)
    averages['cpu.average.15m'] = average(900)
    averages['cpu.average.30m'] = average(1800)

    return averages
I had other stuff in this which I deleted, so hopefully what remains is still in a usable state.
To use it, save it to a file called monitor.py, then import the module in your main script and start the monitoring loop.
import monitor
monitor.start_monitor()
Then on each request call:
monitor.cpu_averages()
and extract the value for the time period you think makes sense.
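For example, a rough sketch of wiring that into the /stats endpoint from the question (assuming the code above really is saved as monitor.py):
import monitor
from flask import Flask, jsonify

app = Flask(__name__)
monitor.start_monitor()

@app.route("/stats", methods=["GET"])
def stats():
    averages = monitor.cpu_averages()
    return jsonify(cpu_average_15s=averages['cpu.average.15s']), 200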
Graham's solution seems to work, but I found a much simpler solution: tell cpu_percent the interval to measure over. In this example it measures the last second:
psutil.cpu_percent(interval=1)

How to call a function after finishing recursive asynchronous jobs in Python?

I use scrapy for scraping this site.
I want to save all the sub-categories in an array, then get the corresponding pages (pagination)
As a first step I have:
def start_requests(self):
    yield Request(start_urls[i], callback=self.get_sous_cat)
get_sous_cat is a function which gets all the sub-categories of a site, then asynchronously starts jobs to explore the sub-sub-categories recursively.
def get_sous_cat(self, response):
    # Put all the categories in an array
    catList = response.css('div.categoryRefinementsSection')
    if (catList):
        for category in catList.css('a::attr(href)').extract():
            category = 'https://www.amazon.fr' + category
            print category
            self.arrayCategories.append(category)
            yield Request(category, callback=self.get_sous_cat)
When all the respective requests have been sent, I need to call this termination function:
def pagination(self, response):
    for i in range(0, len(self.arrayCategories)):
        # Do something with each sub-category
I tried this
def start_requests(self):
    yield Request(start_urls[i], callback=self.get_sous_cat)
    for subCat in range(0, len(self.arrayCategories)):
        yield Request(self.arrayCategories[subCat], callback=self.pagination)
Well done, this is a good question! Two small things:
a) use a set instead of an array. This way you won't have duplicates
b) the site structure will change once a month/year, while you will likely crawl more frequently. Break the spider into two: 1. one that creates the list of category URLs and runs monthly, and 2. one that takes as start_urls the file generated by the first.
Now, if you really want to do it the way you do it now, hook the spider_idle signal (see here: Scrapy: How to manually insert a request from a spider_idle event callback? ). This gets called when there are no further urls to do and allows you to inject more. Set a flag or reset your list at that point so that the second time the spider is idle (after it crawled everything), it doesn't re-inject the same category urls for ever.
If, as it seems in your case, you don't want to do some fancy processing on the urls but just crawl categories before other URLs, this is what Request priority property is for (http://doc.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-request-subclasses). Just set it to e.g. 1 for your category URLs and then it will follow those links before it processes any non-category links. This is more efficient since it won't load those category pages twice as your current implementation would do.
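As a rough sketch of that priority idea applied to the code in the question (scrapy Requests accept a priority argument; higher values are scheduled earlier):
def get_sous_cat(self, response):
    catList = response.css('div.categoryRefinementsSection')
    for category in catList.css('a::attr(href)').extract():
        category = 'https://www.amazon.fr' + category
        self.arrayCategories.append(category)
        # priority=1 makes category pages jump ahead of the default (priority=0) requests
        yield Request(category, callback=self.get_sous_cat, priority=1)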
This is not "recursion", it's asynchronous jobs. What you need is a global counter (protected by a Lock) and if 0, do your completion :
from threading import Lock

class JobCounter(object):
    def __init__(self, completion_callback, *args, **kwargs):
        self.c = 0
        self.l = Lock()
        self.completion = (completion_callback, args, kwargs)

    def __iadd__(self, n):
        done = False
        with self.l:
            self.c += n
            if self.c <= 0:
                done = True
        if done:
            f, args, kwargs = self.completion
            f(*args, **kwargs)
        return self  # augmented assignment needs the instance back

    def __isub__(self, n):
        return self.__iadd__(-n)
each time you launch a job, do counter += 1
each time a job finishes, do counter -= 1
NOTE : this does the completion in the thread of the last calling job. If you want to do it in a particular thread, use a Condition instead of a Lock, and do notify() instead of the call.
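A minimal usage sketch, with made-up job and callback names just for illustration:
import threading
import time

def all_jobs_done():
    print('all jobs finished, running completion step')

counter = JobCounter(all_jobs_done)

def job(counter, seconds):
    try:
        time.sleep(seconds)   # stand-in for the real asynchronous work
    finally:
        counter -= 1          # may fire the completion callback if this was the last job

for seconds in (1, 2, 3):
    counter += 1              # register the job before launching it
    threading.Thread(target=job, args=(counter, seconds)).start()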

How to write unittests for events in a database or something hard to test for?

I have a process where I'm querying a database in chunks of 100000, and I want to retry up to 5 times if it is a timeout error, otherwise send out an alert. How could I write a unittest for this? I'm not exactly sure how to simulate the environment so I can double check the logic. It's also difficult to test the alert (it's an email), other than just sending it and seeing if it arrives. How would you write a test for this? Here is my code:
def _query_data(self, sql_query: str, retries: int = 5):
    cursor = self._return_cursor(self.canvas_conn)
    for attempt in range(retries):
        try:
            cursor.execute(sql_query)
        except (ppg2.ProgrammingError, ppg2.OperationalError) as e:
            if 'Operation timed out' in str(e):
                print('Retrying, attempt {}'.format(attempt))
                sleep(3)
                continue
            elif self.historical_connect:
                msg = 'Connected on last run, no longer available.'
                self.set_for_alert(msg)
                self.historical_connect = False
                raise ppg2.Error from e
        else:
            return self._iterate_results(cursor)
The error gets caught in the process that calls it.
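One way to simulate the timeout path without a real database is to hand the code a fake cursor built with unittest.mock whose execute raises the error a couple of times before succeeding. A rough sketch, where QueryRunner, mymodule and the patched sleep location are assumptions about code not shown in the question:
import unittest
from unittest import mock

import psycopg2 as ppg2

# Assumed import path for the class that defines _query_data; adjust to the real module.
from mymodule import QueryRunner

class QueryDataRetryTest(unittest.TestCase):

    @mock.patch('mymodule.sleep')                       # skip the real 3 second waits
    def test_retries_on_timeout_then_succeeds(self, mock_sleep):
        runner = QueryRunner()
        fake_cursor = mock.Mock()
        # First two calls raise a timeout, third call succeeds.
        fake_cursor.execute.side_effect = [
            ppg2.OperationalError('Operation timed out'),
            ppg2.OperationalError('Operation timed out'),
            None,
        ]
        with mock.patch.object(runner, '_return_cursor', return_value=fake_cursor), \
             mock.patch.object(runner, '_iterate_results', return_value=['row']):
            result = runner._query_data('SELECT 1')

        self.assertEqual(result, ['row'])
        self.assertEqual(fake_cursor.execute.call_count, 3)

if __name__ == '__main__':
    unittest.main()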

While True loop cannot be used with multiprocessing.Pool in Python

I am using a multiprocessing pool in one of my programs and I need the program to have an infinite loop, which means run forever... But after I applied the while True: loop to the pool program, the program seems to hang forever and do nothing...
I came up with a very simple Python example, as shown below:
from multiprocessing import Pool

def market_update(var):
    return var*var

while True:
    alist = [1,2,3,4,5,6,7,8,9,10,11,12]
    print alist

if __name__ == '__main__':
    pool = Pool()
    result = pool.map(market_update, alist)
    print result
Can anyone explain why, and whether there is any workaround?
Edited code, as suggested by Dano (the original while True loop runs at module level and never ends, so execution never reaches the if __name__ == '__main__' block and the pool is never created):
from multiprocessing import Pool

def market_update(var):
    return var*var

if __name__ == '__main__':
    pool = Pool()
    while True:
        alist = [1,2,3,4,5,6,7,8,9,10,11,12]
        print alist
        result = pool.map(market_update, alist)
        print result

Python timed scheduler

Okay, so I'm working on a scheduler, and I was thinking of something like timeOut(3, print, 'hello'), which would print hello every three seconds. I have tried some methods but they all failed. Also, using time.sleep for this wouldn't quite work, because I need to run other tasks as well, not just one.
Edit:
I found out how to do what I needed. Sorry for being confusing, but this did the trick for what I needed; thanks for answering, everyone.
import time

class test:
    def __init__(self):
        self.objectives = set()

    class Objective:
        pass

    def interval(self, timeout, function, *data):
        newObjective = self.Objective()
        newObjective.Class = self
        newObjective.timeout = time.time() + timeout
        newObjective.timer = timeout
        newObjective.function = function
        newObjective.repeate = True
        newObjective.data = data
        self.objectives.add(newObjective)
        return True

    def runObjectives(self):
        timeNow = time.time()
        # Iterate over a copy so objectives can be removed while looping.
        for objective in list(self.objectives):
            timeout = objective.timer
            if objective.timeout <= timeNow:
                objective.function(*objective.data)
                if objective.repeate:
                    # Reschedule the next run instead of recursing back into main().
                    objective.timeout = timeNow + timeout
                else:
                    self.objectives.remove(objective)
                    print('removed')

    def main(self):
        while True:
            self.runObjectives()
The standard library includes a module called sched for scheduling. It can be adapted to work in a variety of environments using the delayfunc constructor parameter. Using it, your example would likely read:
def event():
    scheduler.enter(3, 0, event, ())  # reschedule
    print('hello')
Now it depends on how you run the other tasks. Are you using an event loop? It probably has a similar scheduling mechanism (at least twisted has callLater and GObject has timeout_add). If all else fails, you can spawn a new thread and execute a sched.scheduler with time.sleep there.
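For reference, a minimal self-contained version of that sched idea might look like this (just a sketch; the three-second period and the names are illustrative):
import sched
import time

scheduler = sched.scheduler(time.time, time.sleep)

def event():
    scheduler.enter(3, 0, event, ())  # reschedule itself 3 seconds from now
    print('hello')

scheduler.enter(3, 0, event, ())      # first run after 3 seconds
scheduler.run()                       # blocks, running events as they come due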