I'm trying to write a python script that evaluates a vector Complex numbers so I can do things like plot it via matplotlib. Python code below with comment where code breaks:
import sys
import gdb
import matplotlib.pyplot as plt
class Plotter (gdb.Command):
""" Plots vector"""
# _iterator pulled directly from
# /usr/share/gdb/python/libstdcxx/v6/printers.py
class _iterator:
def __init__ (self, start, finish):
self.item = start
self.finish = finish
self.count = 0
def __iter__(self):
return self
def next(self):
if self.item == self.finish:
raise StopIteration
count = self.count
self.count = self.count + 1
elt = self.item.dereference()
self.item = self.item + 1
return ('[%d]' % count, elt)
def __init__(self):
super(Plotter, self).__init__("plot_test", gdb.COMMAND_OBSCURE)
def invoke(self, arg, from_tty):
frame = gdb.selected_frame()
try:
val = gdb.Frame.read_var(frame, arg)
if str(val.type).find("vector") != -1:
print "Plot vector:", str(val.type)
if (str(val.type).find("complex") != -1):
self.plot_complex_vector(val)
else:
self.plot_vector(val)
else:
print "Not a vector..."
except:
print "No such variable:", arg
return
def plot_complex_vector(self, val):
try:
it = self._iterator(val['_M_impl']['_M_start'],
val['_M_impl']['_M_finish'])
vector = []
while(True):
x = it.next()
vector.append(complex(x[1])) # doesn't work...
return
except StopIteration:
pass
except:
print sys.exc_info()[0]
print vector
plt.plot(vector)
plt.show()
# works...
def plot_vector(self, val):
try:
it = self._iterator(val['_M_impl']['_M_start'],
val['_M_impl']['_M_finish'])
vector = []
while(True):
x = it.next()
vector.append(float(x[1]))
except StopIteration:
pass
except:
print sys.exc_info()[0]
print vector
plt.plot(vector)
plt.show()
Plotter()
So the question is, how do I access the real/imaginary parts of a a std::complex value?
It looks like doing a
print x[1]
Will print values like : {_M_value = 0 + 1 * I}
Update: It looks like I can do a little string editing before doing a typecast:
while(True):
x = it.next()
s = str(x[1]['_M_value'])
# convert string to complex format that python understands.
c = complex(s.replace('I','j').replace(' ','').replace('*',''))
vector.append(c) # Works now...
But... is there a better way to do this?
try:
it = self._iterator(val['_M_impl']['_M_start'],
val['_M_impl']['_M_finish'])
vector = []
while(True):
x = it.next()
vector.append(complex(x[1])) # doesn't work...
return
except StopIteration:
pass
except:
print sys.exc_info()[0]
Is not how iterators are meant to be used in python. Use
try:
it = self._iterator(val['_M_impl']['_M_start'],
val['_M_impl']['_M_finish'])
vector = [complex(x[1]) for x in it]
except Exception as e:
print e, sys.exc_info()[0]
If you really still want to wrap it in a try...except block.
Edit: Try complex(x.real() + x.imag()). What does print x.type.fields() show?
Related
I'm working on a concordance dictionary that reads a data file and records every unique word and the word's line number in a AVL tree. The problem is that my find method is not finding the Entry's within the tree so it adds every word instead of every unique word.
I'm also having trouble making my program keep a list of the line numbers within each entry. I'm using an entry class to keep the key(word) and the list of line numbers. Thank you for any help.
I'm writing in Python 2.7 and have included all my program so far.
My Main Program:
import string #NEW
from time import clock
import sys #for BST recursion limit
from dictionary import Entry
sys.setrecursionlimit(3000)#for BST
from avl import AVL
def main():
"""Calls on necessary functions to fill the dictionary, and process the keys"""
start = clock() #times runtime
stopWordDict = AVL()#Empty Dictionary
stopWordDict = fillStopWordDict(stopWordDict)
keyList = []
wordConcordanceDict = AVL()#Empty Dictionary
wordConcordanceDict = fillWordDict(stopWordDict,wordConcordanceDict, keyList)
print str(wordConcordanceDict) #wordconcorddict made here.
keyList.sort()
print keyList
writeWordConDict(wordConcordanceDict, keyList)
end = clock() #gets runtime
runTime = end - start
print("Done. Runtime was:",runTime,"seconds.")
def fillStopWordDict(stopWordDict):
"""fills chain dict with all of the stop words"""
fileNew=open('stop_words.txt', "r")
for word in fileNew:
word=word.lower().strip() #strip will strip \n from word
if stopWordDict.find(word) == None:
stopWordDict.add(word)
fileNew.close()
return stopWordDict
def fillWordDict(stopWordDict,wordConcordanceDict, keyList):
"""opens hw5data.txt and calls on processLine function"""
lineCounter = 1
fileNew=open('hw5data.txt', "r")
for line in fileNew:
processLine(lineCounter, line, stopWordDict,wordConcordanceDict, keyList)
lineCounter+=1 #changes to next line of file
fileNew.close()
return wordConcordanceDict
def processLine(lineCounter, line, stopWordDict,wordConcordanceDict, keyList):
"""process each line into the wordConcordanceDict"""
line=line.split() #splits line into list of words
for word in line:
word=word.lower().strip(string.punctuation)#strips punctuation
if stopWordDict.find(word) == None:
wordEntry = Entry(word, None)
if wordConcordanceDict.find(wordEntry) == None:
lineList = wordEntry.value
lineList.append(lineCounter)
wordEntry.value = lineList
wordConcordanceDict.add(wordEntry)
keyList.append(word)
else:
wordEntry = wordConcordance.find(wordEntry)
lineList = wordEntry.value
lineList.append(lineCounter)
wordEntry.value = lineList
wordConcordanceDict.add(wordEntry)
return wordConcordanceDict
def writeWordConDict(wordConcordanceDict, keyList):
"""takes in wordConcordanceDict and list of its keys. Then prints the key value pairs to the screen"""
fileNew=open("ProgProj5Concordance.txt", 'w')
# listOfWords = wordConcordanceDict.inorder()
for key in keyList:
wordEntry = wordConcordanceDict.find(key) #makes the values into a string
lineList = wordEntry.value
line=str(key + ":" + lineList + "\n")
fileNew.write(line)
fileNew.close()
main()
MY ENTRY CLASS:
"""
File: bst.py
BST class for binary search trees.
"""
from queue import LinkedQueue
from binarytree import BinaryTree
class BST(object):
def __init__(self):
self._tree = BinaryTree.THE_EMPTY_TREE
self._size = 0
def isEmpty(self):
return len(self) == 0
def __len__(self):
return self._size
def __str__(self):
return str(self._tree)
def __iter__(self):
return iter(self.inorder())
def find(self, target):
"""Returns data if target is found or None otherwise."""
def findHelper(tree):
if tree.isEmpty():
return None
elif target == tree.getRoot():
return tree.getRoot()
elif target < tree.getRoot():
return findHelper(tree.getLeft())
else:
return findHelper(tree.getRight())
return findHelper(self._tree)
def add(self, newItem):
"""Adds newItem to the tree."""
# Helper function to search for item's position
def addHelper(tree):
currentItem = tree.getRoot()
left = tree.getLeft()
right = tree.getRight()
# New item is less, go left until spot is found
if newItem < currentItem:
if left.isEmpty():
tree.setLeft(BinaryTree(newItem))
else:
addHelper(left)
# New item is greater or equal,
# go right until spot is found
elif right.isEmpty():
tree.setRight(BinaryTree(newItem))
else:
addHelper(right)
# End of addHelper
# Tree is empty, so new item goes at the root
if self.isEmpty():
self._tree = BinaryTree(newItem)
# Otherwise, search for the item's spot
else:
addHelper(self._tree)
self._size += 1
def inorder(self):
"""Returns a list containing the results of
an inorder traversal."""
lyst = []
self._tree.inorder(lyst)
return lyst
def preorder(self):
"""Returns a list containing the results of
a preorder traversal."""
# Exercise
pass
def postorder(self):
"""Returns a list containing the results of
a postorder traversal."""
# Exercise
pass
def levelorder(self):
"""Returns a list containing the results of
a levelorder traversal."""
# Exercise
pass
def remove(self, item):
# Exercise
pass
def main():
tree = BST()
print "Adding D B A C F E G"
tree.add("D")
tree.add("B")
tree.add("A")
tree.add("C")
tree.add("F")
tree.add("E")
tree.add("G")
print tree.find("A")
print tree.find("Z")
print "\nString:\n" + str(tree)
print "Iterator (inorder traversal): "
iterator = iter(tree)
while True:
try:
print iterator.next(),
except Exception, e:
print e
break
# Use a for loop instead
print "\nfor loop (inorder traversal): "
for item in tree:
print item,
if __name__ == "__main__":
main()
AND FINALLY THE BINARY TREE AVL CLASS:
from binarytree import *
class BinaryTreeAVL(BinaryTree):
def __init__(self, item, balance = 'EQ'):
BinaryTree.__init__(self, item)
self._balance = balance
def getBalance(self):
return self._balance
def setBalance(self, newBalance):
self._balance = newBalance
def __str__(self):
"""Returns a string representation of the tree
rotated 90 degrees to the left."""
def strHelper(tree, level):
result = ""
if not tree.isEmpty():
result += strHelper(tree.getRight(), level + 1)
result += "| " * level
result += str(tree.getRoot())+ " : " + tree.getBalance() + "\n"
result += strHelper(tree.getLeft(), level + 1)
return result
return strHelper(self, 0)
I am unexpectedly getting IOError: bad message length error when trying to share pyodbc connection across multiple processes, especially when N is more than 4 (no. of cores). Sometimes I also get cPickle.UnpicklingError: invalid load key, '#'., pyodbc.ProgrammingError: ('24000', '[24000] [FreeTDS][SQL Server]Invalid cursor state (0) (SQLExecDirectW)') as errors.
# Import custom python packages
import multiprocessing
import multiprocessing.managers as mm
import pathos.multiprocessing as mp
import pyodbc, datetime, time
class MyConn(object):
def __init__(self):
self.conn = None
self.cursor = None
def connect_to_db(self):
self.conn = pyodbc.connect("DSN=cpmeast;UID=dntcore;PWD=dntcorevs2")
self.cursor = self.conn.cursor()
def run_qry(self, data):
print 'Running query', data
self.cursor.execute("WAITFOR DELAY '00:00:01';select GETDATE(), '"+str(data)+"';")
l = self.cursor.fetchall()
_l = []
for i in l:
_l.append(list(i))
print 'Result for query', data, _l
return _l
class MyManagerClass(object):
def __init__(self):
self.result = multiprocessing.Manager().list()
def read_data(self, *args):
conn = args[0][0]
data = args[0][1]
l = conn.run_qry(data)
self.result.append(l)
class MyManager(mm.BaseManager):
pass # Pass is really enough. Nothing needs to be done here.
def main():
time_start = time.time()
MyManager.register("MyConn", MyConn)
manager = MyManager()
manager.start()
a = manager.MyConn()
a.connect_to_db()
dbm = MyManagerClass()
pool = mp.ProcessingPool(4)
jobs = []
N = 5
for i in range(N):
jobs.append((a, str(i)))
for i in pool.imap(dbm.read_data, jobs):
print 'result'
pool.close()
pool.join()
print 'Result', dbm.result
print 'Closed'
time_stop = time.time()
msg = 'runtime: {0}'.format(str(datetime.timedelta
(seconds=time_stop-time_start)))
print msg
if __name__ == '__main__':
main()
I used code::blocks (CB) or visual studio (VS) for C++ programs with eigen library. However, when it comes to debugging, I cannot see contents of arrays, matrices etc. I checked following posts:
Using GDB with Eigen C++ library
I am not a C++ expert but I could get that I need something called as printer.
https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug/gdb/printers.py has the source code. However I do not know how to use this source code to debug with gdb with eigen library in CB or VS. Any ideas how to do this?
Update:
vsoftco mentioned a webpage https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug and that has python printers for gdb for CB and VS. If anyone knows how to use them to see contents of arrays of eigen library, please comment.
The Eigen::Matrix class is not an aggregate, so you cannot just see its content with a debugger. However, you should be able to step in with a debugger, and can use cout or other methods to display the content.
The link you mentioned is a python plugin for gdb, to allow gdb to print the content of Eigen types. But as you use VS (which has its internal debugger and doesn't use gdb), there is no reason why it would work in your case.
You can try switching to MinGW and g++/gdb, or can check this link How can I use GDB from inside Visual Studio C++ (Express) to debug my GCC Makefile projects? for some advice about installing gdb under VS.
PS: it seems that a solution for VS also exists,
https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug
The python printers for gdb works for me. Notice that the printers.py script is written in Python 2.7 and your gdb is probably running python 3.5 or later... Use the 2to3 converter or simply copy this to a new file called printers3.py:
# -*- coding: utf-8 -*-
# This file is part of Eigen, a lightweight C++ template library
# for linear algebra.
#
# Copyright (C) 2009 Benjamin Schindler <bschindler#inf.ethz.ch>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Pretty printers for Eigen::Matrix
# This is still pretty basic as the python extension to gdb is still pretty basic.
# It cannot handle complex eigen types and it doesn't support any of the other eigen types
# Such as quaternion or some other type.
# This code supports fixed size as well as dynamic size matrices
# To use it:
#
# * Create a directory and put the file as well as an empty __init__.py in
# that directory.
# * Create a ~/.gdbinit file, that contains the following:
# python
# import sys
# sys.path.insert(0, '/path/to/eigen/printer/directory')
# from printers3 import register_eigen_printers
# register_eigen_printers (None)
# end
import gdb
import re
import itertools
class EigenMatrixPrinter:
"Print Eigen Matrix or Array of some kind"
def __init__(self, variety, val):
"Extract all the necessary information"
# Save the variety (presumably "Matrix" or "Array") for later usage
self.variety = variety
# The gdb extension does not support value template arguments - need to extract them by hand
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
self.type = type.unqualified().strip_typedefs()
tag = self.type.tag
regex = re.compile('\<.*\>')
m = regex.findall(tag)[0][1:-1]
template_params = m.split(',')
template_params = [x.replace(" ", "") for x in template_params]
if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1':
self.rows = val['m_storage']['m_rows']
else:
self.rows = int(template_params[1])
if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1':
self.cols = val['m_storage']['m_cols']
else:
self.cols = int(template_params[2])
self.options = 0 # default value
if len(template_params) > 3:
self.options = template_params[3];
self.rowMajor = (int(self.options) & 0x1)
self.innerType = self.type.template_argument(0)
self.val = val
# Fixed size matrices have a struct as their storage, so we need to walk through this
self.data = self.val['m_storage']['m_data']
if self.data.type.code == gdb.TYPE_CODE_STRUCT:
self.data = self.data['array']
self.data = self.data.cast(self.innerType.pointer())
class _iterator:
def __init__ (self, rows, cols, dataPtr, rowMajor):
self.rows = rows
self.cols = cols
self.dataPtr = dataPtr
self.currentRow = 0
self.currentCol = 0
self.rowMajor = rowMajor
def __iter__ (self):
return self
def __next__(self):
row = self.currentRow
col = self.currentCol
if self.rowMajor == 0:
if self.currentCol >= self.cols:
raise StopIteration
self.currentRow = self.currentRow + 1
if self.currentRow >= self.rows:
self.currentRow = 0
self.currentCol = self.currentCol + 1
else:
if self.currentRow >= self.rows:
raise StopIteration
self.currentCol = self.currentCol + 1
if self.currentCol >= self.cols:
self.currentCol = 0
self.currentRow = self.currentRow + 1
item = self.dataPtr.dereference()
self.dataPtr = self.dataPtr + 1
if (self.cols == 1): #if it's a column vector
return ('[%d]' % (row,), item)
elif (self.rows == 1): #if it's a row vector
return ('[%d]' % (col,), item)
return ('[%d,%d]' % (row, col), item)
def children(self):
return self._iterator(self.rows, self.cols, self.data, self.rowMajor)
def to_string(self):
return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else "ColMajor", self.data)
class EigenQuaternionPrinter:
"Print an Eigen Quaternion"
def __init__(self, val):
"Extract all the necessary information"
# The gdb extension does not support value template arguments - need to extract them by hand
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
self.type = type.unqualified().strip_typedefs()
self.innerType = self.type.template_argument(0)
self.val = val
# Quaternions have a struct as their storage, so we need to walk through this
self.data = self.val['m_coeffs']['m_storage']['m_data']['array']
self.data = self.data.cast(self.innerType.pointer())
class _iterator:
def __init__ (self, dataPtr):
self.dataPtr = dataPtr
self.currentElement = 0
self.elementNames = ['x', 'y', 'z', 'w']
def __iter__ (self):
return self
def __next__(self):
element = self.currentElement
if self.currentElement >= 4: #there are 4 elements in a quanternion
raise StopIteration
self.currentElement = self.currentElement + 1
item = self.dataPtr.dereference()
self.dataPtr = self.dataPtr + 1
return ('[%s]' % (self.elementNames[element],), item)
def children(self):
return self._iterator(self.data)
def to_string(self):
return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data)
def build_eigen_dictionary ():
pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val)
pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val)
pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val)
def register_eigen_printers(obj):
"Register eigen pretty-printers with objfile Obj"
if obj == None:
obj = gdb
obj.pretty_printers.append(lookup_function)
def lookup_function(val):
"Look-up and return a pretty-printer that can print va."
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
type = type.unqualified().strip_typedefs()
typename = type.tag
if typename == None:
return None
for function in pretty_printers_dict:
if function.search(typename):
return pretty_printers_dict[function](val)
return None
pretty_printers_dict = {}
build_eigen_dictionary ()
I have got the following error when I'm trying to execute example in pdfquery.
File "C:\workspace-php\test\pdfminer\pdfqueries\pdfquery.py", line 187, in init
doc.initialize()
AttributeError: 'QPDFDocument' object has no attribute 'initialize'
I'm trying to solve this but still i dont get any solution for that.it would be appreciated if some one can help me to solve this.
class PDFQuery(object):
def __init__(self, file,
merge_tags=('LTChar', 'LTAnno'),
round_floats=True,
round_digits=3,
input_text_formatter=None,
normalize_spaces=True,
resort=True,
parse_tree_cacher=None,
):
# store input
self.merge_tags = merge_tags
self.round_floats = round_floats
self.round_digits = round_digits
self.resort = resort
# set up input text formatting function, if any
if input_text_formatter:
self.input_text_formatter = input_text_formatter
elif normalize_spaces:
r = re.compile(r'\s+')
self.input_text_formatter = lambda s: re.sub(r, ' ', s)
else:
self.input_text_formatter = None
# open doc
if not hasattr(file, 'read'):
try:
file = open(file, 'rb')
except TypeError:
raise TypeError("File must be file object or filepath string.")
parser = PDFParser(file)
if hasattr(QPDFDocument, 'set_parser'):
# pdfminer < 20131022
doc = QPDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
else:
# pdfminer >= 20131022
doc = QPDFDocument(parser)
parser.set_document(doc)
doc.initialize()
self.doc = doc
self.parser = parser
self.tree = None
self.pq = None
self.file = file
if parse_tree_cacher:
self._parse_tree_cacher = parse_tree_cacher
self._parse_tree_cacher.set_hash_key(self.file)
else:
self._parse_tree_cacher = DummyCache()
# set up layout parsing
rsrcmgr = PDFResourceManager()
laparams = LAParams(all_texts=True, detect_vertical=True)
self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
# caches
self._pages = []
self._pages_iter = None
self._elements = []
def load(self, *page_numbers):
"""
Load etree and pyquery object for entire document, or given page numbers (ints or lists).
After this is called, objects are available at pdf.tree and pdf.pq.
>>> pdf.load()
>>> pdf.tree
<lxml.etree._ElementTree object at ...>
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
>>> pdf.load(1)
>>> pdf.pq('LTPage')
[<LTPage>]
>>> pdf.load(0,1)
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
"""
self.tree = self.get_tree(*_flatten(page_numbers))
self.pq = self.get_pyquery(self.tree)
def extract(self, searches, tree=None, as_dict=True):
"""
>>> foo = pdf.extract( [ ['pages', 'LTPage'] ])
>>> foo
{'pages': [<LTPage>, <LTPage>]}
>>> pdf.extract( [ ['bar', ':in_bbox("100,100,400,400")'] ], foo['pages'][0])
{'bar': [<LTTextLineHorizontal>, <LTTextBoxHorizontal>,...
"""
if self.tree is None or self.pq is None:
self.load()
pq = PyQuery(tree, css_translator=PDFQueryTranslator()) if tree is not None else self.pq
if tree is None:
pq = self.pq
else:
pq = PyQuery(tree, css_translator=PDFQueryTranslator())
results = []
formatter = None
parent = pq
for search in searches:
if len(search) < 3:
search = list(search) + [formatter]
key, search, tmp_formatter = search
if key == 'with_formatter':
if isinstance(search, basestring): # is a pyquery method name, e.g. 'text'
formatter = lambda o, search=search: getattr(o, search)()
elif hasattr(search, '__call__') or not search: # is a method, or None to end formatting
formatter = search
else:
raise TypeError("Formatter should be either a pyquery method name or a callable function.")
elif key == 'with_parent':
parent = pq(search) if search else pq
else:
try:
result = parent("*").filter(search) if hasattr(search, '__call__') else parent(search)
except cssselect.SelectorSyntaxError, e:
raise cssselect.SelectorSyntaxError( "Error applying selector '%s': %s" % (search, e) )
if tmp_formatter:
result = tmp_formatter(result)
results += result if type(result) == tuple else [[key, result]]
if as_dict:
results = dict(results)
return results
# tree building stuff
def get_pyquery(self, tree=None, page_numbers=[]):
"""
Wrap given tree in pyquery and return.
If no tree supplied, will generate one from given page_numbers, or all page numbers.
"""
if tree is None:
if not page_numbers and self.tree is not None:
tree = self.tree
else:
tree = self.get_tree(page_numbers)
if hasattr(tree, 'getroot'):
tree = tree.getroot()
return PyQuery(tree, css_translator=PDFQueryTranslator())
def get_tree(self, *page_numbers):
"""
Return lxml.etree.ElementTree for entire document, or page numbers given if any.
"""
cache_key = "_".join(map(str, _flatten(page_numbers)))
tree = self._parse_tree_cacher.get(cache_key)
if tree is None:
# set up root
root = parser.makeelement("pdfxml")
if self.doc.info: #not all PDFs seem to have this info section
for k, v in self.doc.info[0].items():
root.set(k, unicode(v))
# add pages
if page_numbers:
pages = [[n, self.get_layout(self.get_page(n))] for n in _flatten(page_numbers)]
else:
pages = enumerate(self.get_layouts())
for n, page in pages:
page = self._xmlize(page)
page.set('page_index', unicode(n))
page.set('page_label', self.doc.get_page_number(n))
root.append(page)
self._clean_text(root)
# wrap root in ElementTree
tree = etree.ElementTree(root)
self._parse_tree_cacher.set(cache_key, tree)
return tree
def _clean_text(self, branch):
"""
Remove text from node if same text exists in its children.
Apply string formatter if set.
"""
if branch.text and self.input_text_formatter:
branch.text = self.input_text_formatter(branch.text)
try:
for child in branch:
self._clean_text(child)
if branch.text and branch.text.find(child.text) >= 0:
branch.text = branch.text.replace(child.text, '', 1)
except TypeError: # not an iterable node
pass
def _xmlize(self, node, root=None):
# collect attributes of current node
tags = self._getattrs(node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', 'linewidth', 'pts', 'index','name','matrix','word_margin' )
if type(node) == LTImage:
tags.update( self._getattrs(node, 'colorspace','bits','imagemask','srcsize','stream','name','pts','linewidth') )
elif type(node) == LTChar:
tags.update( self._getattrs(node, 'fontname','adv','upright','size') )
elif type(node) == LTPage:
tags.update( self._getattrs(node, 'pageid','rotate') )
# create node
branch = parser.makeelement(node.__class__.__name__, tags)
branch.layout = node
self._elements += [branch] # make sure layout keeps state
if root is None:
root = branch
# add text
if hasattr(node, 'get_text'):
branch.text = node.get_text()
# add children if node is an iterable
if hasattr(node, '__iter__'):
last = None
for child in node:
child = self._xmlize(child, root)
if self.merge_tags and child.tag in self.merge_tags:
if branch.text and child.text in branch.text:
continue
elif last is not None and last.tag in self.merge_tags:
last.text += child.text
last.set('_obj_id', last.get('_obj_id')+","+child.get('_obj_id'))
continue
# sort children by bounding boxes
if self.resort:
_append_sorted(root, child, _comp_bbox)
else:
branch.append(child)
last = child
return branch
def _getattrs(self, obj, *attrs):
""" Return dictionary of given attrs on given object, if they exist, processing through filter_value(). """
return dict( (attr, unicode(self._filter_value(getattr(obj, attr)))) for attr in attrs if hasattr(obj, attr))
def _filter_value(self, val):
if self.round_floats:
if type(val) == float:
val = round(val, self.round_digits)
elif hasattr(val, '__iter__'):
val = [self._filter_value(item) for item in val]
return val
# page access stuff
def get_page(self, page_number):
""" Get PDFPage object -- 0-indexed."""
return self._cached_pages(target_page=page_number)
def get_layout(self, page):
""" Get PDFMiner Layout object for given page object or page number. """
if type(page) == int:
page = self.get_page(page)
self.interpreter.process_page(page)
return self.device.get_result()
def get_layouts(self):
""" Get list of PDFMiner Layout objects for each page. """
return (self.get_layout(page) for page in self._cached_pages())
def _cached_pages(self, target_page=-1):
"""
Get a page or all pages from page generator, caching results.
This is necessary because PDFMiner searches recursively for pages,
so we won't know how many there are until we parse the whole document,
which we don't want to do until we need to.
"""
try:
# pdfminer < 20131022
self._pages_iter = self._pages_iter or self.doc.get_pages()
except AttributeError:
# pdfminer >= 20131022
self._pages_iter = self._pages_iter or PDFPage.create_pages(self.doc)
if target_page >= 0:
while len(self._pages) <= target_page:
next = self._pages_iter.next()
if not next:
return None
next.page_number = 0
self._pages += [next]
try:
return self._pages[target_page]
except IndexError:
return None
self._pages += list(self._pages_iter)
return self._pages
if __name__ == "__main__":
import doctest
pdf = PDFQuery("../examples/sample.pdf")
doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS)
add.elif to line 18 then add a for loop before the list and that should fix it if there is any problems contact me for support
So I am trying to multi process function F. Which is accessed by a button press with tkinter.
def f(x):
global doom,results,info
doom = doom + 1
if check(x) == True:
results.add(x)
info.append(get_column_number(x))
j.step(1)
texx = "1/"+doom
s.configure(text=texx)
root.update()
The function is called within a function like so:
def dojob():
index = ['URLS'...]
pool = Pool(processes=4)
s.configure(text="Shifting Workload to cores..")
root.update()
pool.map(f, index)
The button is inside root window.
I get the following error:
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Python27\lib\threading.py", line 808, in __bootstrap_inner
self.run()
File "C:\Python27\lib\threading.py", line 761, in run
self.__target(*self.__args, **self.__kwargs)
File "C:\Python27\lib\multiprocessing\pool.py", line 342, in _handle_tasks
put(task)
UnpickleableError: Cannot pickle <type 'tkapp'> objects
I do not even know what a pickle does? Help?
Here is the complete code:
from Tkinter import *
from ttk import *
from tkMessageBox import showinfo
from multiprocessing import Pool
import random
emails = set()
import urllib2
import urllib2 as urllib
########
CONSTANT_PAGECOUNT = 20
######
def f(x):
global doom,emails,info
doom = doom + 1
if check(x) == True:
print "",
emails.add(x)
info.append(get_column_number(x))
j.step(1)
texx = "Sk1nn1n "+str(doom)+'/'+str(CONSTANT_PAGECOUNT)+""
s.configure(text=texx)
root.update()
return 0
def f(x):
print ""
def showFile(site,info):
top = Toplevel()
top.title('Sites')
x = Text(top)
x.pack()
i=0
for site_url in site:
x.insert(END,site_url)
i=i+1
def get_column_number(url):
return True
def check(url):
return True
def getgoogleurl(search,siteurl=False,startr=0):
if siteurl==False:
return 'http://www.google.com/search?q='+urllib2.quote(search)+'&start='+str(startr)+'&oq='+urllib2.quote(search)
else:
return 'http://www.google.com/search?q=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)+'&oq=site:'+urllib2.quote(siteurl)+'%20'+urllib2.quote(search)
def getgooglelinks(search,siteurl=False,startr=0):
#google returns 403 without user agent
headers = {'User-agent':'Mozilla/11.0'}
req = urllib2.Request(getgoogleurl(search,siteurl,startr),None,headers)
site = urllib2.urlopen(req)
data = site.read()
site.close()
#no beatifulsoup because google html is generated with javascript
start = data.find('<div id="res">')
end = data.find('<div id="foot">')
if data[start:end]=='':
#error, no links to find
return False
else:
links =[]
data = data[start:end]
start = 0
end = 0
while start>-1 and end>-1:
#get only results of the provided site
if siteurl==False:
start = data.find('<a href="/url?q=')
else:
start = data.find('<a href="/url?q='+str(siteurl))
data = data[start+len('<a href="/url?q='):]
end = data.find('&sa=U&ei=')
if start>-1 and end>-1:
link = urllib2.unquote(data[0:end])
data = data[end:len(data)]
if link.find('http')==0:
links.append(link)
return links
def rip(results=15,accuracy=16):
global e
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
linklist = []
counter = 0
doom = 0
while counter < results:
links = getgooglelinks(keyword,startr=counter)
for link in links:
if len(linklist) > CONSTANT_PAGECOUNT:
s.configure(text="Proccessing..")
root.update()
return linklist
else:
doom = doom + 1
linklist.append(link)
texx = str(doom)+"/"+str(CONSTANT_PAGECOUNT)
s.configure(text=texx)
root.update()
root.update()
counter = counter+accuracy
return linklist
def flip():
global e
emails = set()
info = []
keyword = ''+str(e.get())
if keyword.strip()=="":
s.configure(text="Please enter a keyword")
root.update()
return 0
s.configure(text="Generating index..")
root.update()
doom = -1
index = rip(CONSTANT_PAGECOUNT,10)
if 1:
try:
pool = Pool(processes=4)
#s.configure(text="Shifting Workload to cores..")
#root.update()
pool.map(f, index)
pool.close()
except:
print "The errors there.."
j.config(value=CONSTANT_PAGECOUNT)
if len(emails) > 0:
filepath='relavant_list_'+str(random.randint(1,9999))+'.emList.txt'
#print len(emails),
#print "emails found."
ggg = open(filepath,'a+')
for x in emails:
ggg.write(x+"\n")
showinfo(
str(len(emails))+" key word related sites found!",
" sites are saved in "+str(filepath)
)
showFile(emails,info)
s.configure(text=filepath)
else:
s.configure(text='No related sites found : (')
if __name__ == '__main__':
### CONSTANTS
version = '1.0'
### END CONSTANTS
root = Tk()
root.title('Program v'+version)
s = Style()
s.theme_use('default')
#print s.theme_names()
s.configure("black.Horizontal.TProgressbar", foreground='blue', background='blue')
j = Progressbar(root, style="black.Horizontal.TProgressbar", orient="vertical", length=200, mode="determinate", maximum=CONSTANT_PAGECOUNT, value=0)
j.pack(side='right',fill='y')
f = Frame(root)
x = Frame(f)
e = Entry(x,width=51)
s = Label(x,width=50,anchor='center',text='Waiting for task..')
Button(f,text='Generate List!',width=50,command=flip).pack(fill='both',expand=True)
s.pack(side='bottom',fill='y',expand=True)
e.pack(side='top',fill='both',expand=True)
x.pack(side='top',fill='y',expand=True)
f.pack(side='left',expand=True,fill="both")
root.mainloop()
You are leaking a tkinter object. Most likely because you are trying to update the interface from another process with the last line of f()
Update based on code
You have a name collision between your function f() and a variable f in your __main__ which gets assigned to your main window and causes the tkapp pickle error. Rename the function to def myfunc() or something. Also need to call pool.join() after pool.close()