Extensible Hashing with unique keys - python-2.7

I have a database that consists of tuples like so
'The Abyss,1989,LaserDisc,Science Fiction,James Cameron,James Cameron,USA,20th Century Fox,$0.00'
I want to concatenate the movie title with the year to make the unique key for each bucket, but I am unsure how to do it. I think it would be beneficial to use extensible hashing for this.
I would like to be able to search for movies by format (DVD or VHS) as well as by year. The buckets would be organized by decade and by movie format (DVD, VHS).
Right now I just have a simple add, remove, and get functionality
class HTable(object):
    """Fixed-capacity hash table backed by a dict of ``slot index -> value``.

    Integers are stored in slot ``value % maximum``; any other value is first
    reduced to an integer with :meth:`string2int`.  Values that land in the
    same slot overwrite each other (no chaining or probing is performed).
    """

    def __init__(self, table=None, maximum=100):
        """Create a table.

        table   -- optional dict of ``slot index -> value`` to start from;
                   a fresh dict is created when omitted.
        maximum -- number of slots, i.e. the modulus used to compute indexes.
        """
        # The former default was a shared list that could never satisfy the
        # dict check below; None gives every instance its own empty dict.
        if table is None:
            table = {}
        assert isinstance(table, dict), "table must be a dict"
        self.table = table
        self.max = maximum

    @staticmethod
    def _is_integer(data):
        """Return True for int (and Python 2 long) values, but not bool."""
        import numbers
        return isinstance(data, numbers.Integral) and not isinstance(data, bool)

    def _index(self, data):
        """Return the slot index for data (integers directly, others hashed)."""
        if not self._is_integer(data):
            data = self.string2int(data)
        return data % self.max

    def lookup(self, data):
        """Return ``(index, stored_value)`` for data's slot.

        Returns None when the slot is empty or when data cannot be hashed.
        The stored value is returned as-is; it is not compared with data.
        """
        try:
            index = self._index(data)
        except (TypeError, ZeroDivisionError):
            return None
        value = self.table.get(index)
        if value is None:
            return None
        return (index, value)

    def append(self, data):
        """Store data in its slot and return the underlying dict.

        data may be an int, a string, or any iterable of characters.
        Raises AssertionError when the table is already full and IOError when
        an integer is not smaller than the table's maximum.
        """
        assert len(self.table) < self.max, "hash table is full"
        if self._is_integer(data) and data >= self.max:
            raise IOError("Value to large to append into hash table. Max limit reached.")
        # The slot index is always below self.max, and assigning into a dict
        # cannot fail, so no further range check or fallback is needed.
        self.table[self._index(data)] = data
        return self.table

    def string2int(self, STRING):
        """Convert a string into a 'hash' integer (sum of its code points)."""
        return sum(ord(char) for char in STRING)

    def isEmpty(self):
        """Return True if the table holds no values, False otherwise."""
        return len(self.table) == 0

    def isFull(self):
        """Return True if every slot is occupied, False otherwise."""
        return len(self.table) == self.max

    def remove(self, key):
        """Remove an entry by slot index or by stored value.

        key is first tried as a slot index (anything ``int()`` accepts); if
        no such slot exists, the first slot whose value equals key is
        removed.  Returns 1 on success and False when nothing matched.
        """
        try:
            index = int(key)
        except (TypeError, ValueError):
            index = None
        if index is not None and index in self.table:
            del self.table[index]
            return 1
        # Fall back to removing by value; snapshot the items so the dict is
        # not modified while it is being iterated.
        for slot, value in list(self.table.items()):
            if value == key:
                del self.table[slot]
                return 1
        return False

    def get(self, key):
        """Return the value stored at slot index key, or None if absent."""
        try:
            return self.table[int(key)]
        except (KeyError, TypeError, ValueError):
            return None

    def output(self):
        """Return the underlying dict of the hash table."""
        return self.table

Related

(In Python 3.8) filling in lists to maintain them similar in length and average?

I need to allocate some values in 3 individual lists.
The values are generated on the fly but all included in the 0-6 range.
The point is that these values should be put in the three lists so that the average of each list does not differ so much from the others. The lists also need to be similar in length.
So the goal would be to progressively fill these lists to maintain, as much as possible, a uniform average value and size for all of them.
As I didn't find any built-in function to do this, I have implemented code which keeps track of the lists' lengths and tries to keep their average values as close as possible. You can play with it and improve it to better fit your case.
class Data:
    """Three lists that are filled so their lengths and averages stay close."""

    def __init__(self):
        """Init the three lists."""
        self.a = []
        self.b = []
        self.c = []

    @staticmethod
    def get_average(data: list):
        """Get average value of a list (0 for an empty list)."""
        try:
            return sum(data) / len(data)
        except ZeroDivisionError:
            return 0

    def _lists(self):
        """Return the three lists in priority order (a, then b, then c)."""
        return (self.a, self.b, self.c)

    def get_shortest(self):
        """Return list with the shortest length (first one wins on ties)."""
        return min(self._lists(), key=len)

    def get_smallest(self):
        """Return list with the smallest average value (first one wins on ties)."""
        return min(self._lists(), key=self.get_average)

    def get_highest(self):
        """Return list with the highest average value (first one wins on ties)."""
        return max(self._lists(), key=self.get_average)

    def add_number(self, num):
        """Add number to one of the lists.

        The number goes to the shortest list when lengths have drifted apart
        by two or more. Otherwise it goes to the list with the smallest
        average if that raises the average, else to the list with the highest
        average if that lowers it, else to the shortest list.
        """
        shortest = self.get_shortest()
        smallest = self.get_smallest()
        highest = self.get_highest()

        # Lists must not differ by more than two elements
        if len(smallest) - len(shortest) >= 2 or len(highest) - len(shortest) >= 2:
            shortest.append(num)
            return

        # Test if the number uppers the smallest average
        initial_avg = self.get_average(smallest)
        smallest.append(num)
        if self.get_average(smallest) > initial_avg:
            return
        smallest.pop()

        # Test if the number lowers the highest average
        initial_avg = self.get_average(highest)
        highest.append(num)
        if self.get_average(highest) < initial_avg:
            return
        highest.pop()

        # Last resort
        shortest.append(num)
# Interactive driver: keep reading integers and distributing them until the
# user types 'e' or anything that is not an integer.
d = Data()
value = input("Add number: ")
while True:
    if value == 'e':
        break
    try:
        # The module-level name `number` is kept as-is because other code in
        # this file may read it.
        number = int(value)
    except ValueError:
        break
    d.add_number(number)
    print(f"List a: {d.a}, avg. {d.get_average(d.a)}")
    print(f"List b: {d.b}, avg. {d.get_average(d.b)}")
    print(f"List c: {d.c}, avg. {d.get_average(d.c)}")
    value = input("Add number:")

Why doesn't my find function compare the nodes correctly?

I'm working on a concordance dictionary that reads a data file and records every unique word and the word's line numbers in an AVL tree. The problem is that my find method is not finding the Entry objects within the tree, so it adds every word instead of every unique word.
I'm also having trouble making my program keep a list of the line numbers within each entry. I'm using an entry class to keep the key(word) and the list of line numbers. Thank you for any help.
I'm writing in Python 2.7 and have included all my program so far.
My Main Program:
import string #NEW
from time import clock
import sys #for BST recursion limit
from dictionary import Entry
sys.setrecursionlimit(3000)#for BST
from avl import AVL
def main():
    """Calls on necessary functions to fill the dictionary, and process the keys.

    Builds the stop-word tree, builds the concordance tree from the data
    file, prints both the tree and the sorted key list, writes the
    concordance file, and reports the elapsed time.
    """
    start = clock()  # times runtime
    stopWordDict = fillStopWordDict(AVL())
    keyList = []
    wordConcordanceDict = fillWordDict(stopWordDict, AVL(), keyList)
    print(str(wordConcordanceDict))  # wordconcorddict made here.
    keyList.sort()
    print(keyList)
    writeWordConDict(wordConcordanceDict, keyList)
    runTime = clock() - start  # gets runtime
    # Format into one string: under Python 2.7 `print(a, b, c)` prints a
    # tuple instead of the three values.
    print("Done. Runtime was: %s seconds." % runTime)
def fillStopWordDict(stopWordDict):
    """Fill stopWordDict with the stop words read from 'stop_words.txt'.

    Each line is lower-cased and stripped of surrounding whitespace; a word
    is added only when ``stopWordDict.find`` does not already return it.
    Returns the same stopWordDict object.
    """
    # `with` closes the file even if find()/add() raises.
    with open('stop_words.txt', "r") as fileNew:
        for word in fileNew:
            word = word.lower().strip()  # strip will strip \n from word
            if stopWordDict.find(word) is None:
                stopWordDict.add(word)
    return stopWordDict
def fillWordDict(stopWordDict, wordConcordanceDict, keyList):
    """Open 'hw5data.txt' and feed every line to processLine.

    Lines are numbered starting at 1. Returns wordConcordanceDict.
    """
    # `with` closes the file even if processing a line raises; enumerate
    # replaces the hand-maintained line counter.
    with open('hw5data.txt', "r") as fileNew:
        for lineCounter, line in enumerate(fileNew, 1):
            processLine(lineCounter, line, stopWordDict, wordConcordanceDict, keyList)
    return wordConcordanceDict
def processLine(lineCounter, line, stopWordDict, wordConcordanceDict, keyList):
    """Record every non-stop word of line in wordConcordanceDict.

    lineCounter         -- number of the line being processed
    line                -- raw text of the line
    stopWordDict        -- tree of words to ignore
    wordConcordanceDict -- tree of Entry objects (word -> list of line numbers)
    keyList             -- list collecting each word the first time it is seen

    Returns wordConcordanceDict.
    """
    for word in line.split():  # splits line into list of words
        word = word.lower().strip(string.punctuation)  # strips punctuation
        if not word:
            # Token was punctuation only (e.g. "--"); nothing to record.
            continue
        if stopWordDict.find(word) is not None:
            continue
        existing = wordConcordanceDict.find(Entry(word, None))
        if existing is None:
            # First occurrence: start the line list with this line.
            # Assumes Entry(key, value) exposes the second argument as
            # `.value` -- TODO confirm against dictionary.Entry.
            wordConcordanceDict.add(Entry(word, [lineCounter]))
            keyList.append(word)
        else:
            # Already in the tree: extend the stored entry in place instead
            # of adding a duplicate node.
            existing.value.append(lineCounter)
    # NOTE(review): find() can only match if Entry defines comparison on the
    # word (e.g. __eq__/__lt__); verify in dictionary.Entry.
    return wordConcordanceDict
def writeWordConDict(wordConcordanceDict, keyList):
    """Write one "word:line numbers" row per key to ProgProj5Concordance.txt.

    wordConcordanceDict -- tree of Entry objects holding the line lists
    keyList             -- words to write, in output order
    """
    # `with` closes the file even if a lookup fails part-way through.
    with open("ProgProj5Concordance.txt", 'w') as fileNew:
        # listOfWords = wordConcordanceDict.inorder()
        for key in keyList:
            # The tree stores Entry objects, so look up with an Entry probe
            # rather than the bare word.
            wordEntry = wordConcordanceDict.find(Entry(key, None))
            lineList = wordEntry.value
            # Line numbers are ints; convert before joining (str + list
            # raises TypeError).
            numbers = " ".join(str(number) for number in lineList)
            fileNew.write(key + ":" + numbers + "\n")
# Run the concordance only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
MY ENTRY CLASS:
"""
File: bst.py
BST class for binary search trees.
"""
from queue import LinkedQueue
from binarytree import BinaryTree
class BST(object):
    """Binary search tree built on top of BinaryTree nodes."""

    def __init__(self):
        self._tree = BinaryTree.THE_EMPTY_TREE
        self._size = 0

    def isEmpty(self):
        """Return True when the tree holds no items."""
        return len(self) == 0

    def __len__(self):
        """Return the number of items added so far."""
        return self._size

    def __str__(self):
        """Return the string form of the underlying tree."""
        return str(self._tree)

    def __iter__(self):
        """Iterate over the items in inorder sequence."""
        return iter(self.inorder())

    def find(self, target):
        """Returns data if target is found or None otherwise."""
        # Walk down from the root instead of recursing.
        node = self._tree
        while not node.isEmpty():
            item = node.getRoot()
            if target == item:
                return item
            if target < item:
                node = node.getLeft()
            else:
                node = node.getRight()
        return None

    def add(self, newItem):
        """Adds newItem to the tree."""
        if self.isEmpty():
            # Tree is empty, so new item goes at the root
            self._tree = BinaryTree(newItem)
        else:
            # Otherwise, descend until a free spot is found
            node = self._tree
            while True:
                if newItem < node.getRoot():
                    # New item is less, go left
                    child = node.getLeft()
                    if child.isEmpty():
                        node.setLeft(BinaryTree(newItem))
                        break
                else:
                    # New item is greater or equal, go right
                    child = node.getRight()
                    if child.isEmpty():
                        node.setRight(BinaryTree(newItem))
                        break
                node = child
        self._size += 1

    def inorder(self):
        """Returns a list containing the results of
        an inorder traversal."""
        items = []
        self._tree.inorder(items)
        return items

    def preorder(self):
        """Returns a list containing the results of
        a preorder traversal."""
        # Exercise
        pass

    def postorder(self):
        """Returns a list containing the results of
        a postorder traversal."""
        # Exercise
        pass

    def levelorder(self):
        """Returns a list containing the results of
        a levelorder traversal."""
        # Exercise
        pass

    def remove(self, item):
        # Exercise
        pass
def main():
tree = BST()
print "Adding D B A C F E G"
tree.add("D")
tree.add("B")
tree.add("A")
tree.add("C")
tree.add("F")
tree.add("E")
tree.add("G")
print tree.find("A")
print tree.find("Z")
print "\nString:\n" + str(tree)
print "Iterator (inorder traversal): "
iterator = iter(tree)
while True:
try:
print iterator.next(),
except Exception, e:
print e
break
# Use a for loop instead
print "\nfor loop (inorder traversal): "
for item in tree:
print item,
if __name__ == "__main__":
main()
AND FINALLY THE BINARY TREE AVL CLASS:
from binarytree import *
class BinaryTreeAVL(BinaryTree):
    """BinaryTree that additionally stores a balance tag ('EQ' by default)."""

    def __init__(self, item, balance = 'EQ'):
        BinaryTree.__init__(self, item)
        self._balance = balance

    def getBalance(self):
        """Return the stored balance tag."""
        return self._balance

    def setBalance(self, newBalance):
        """Replace the stored balance tag."""
        self._balance = newBalance

    def __str__(self):
        """Returns a string representation of the tree
        rotated 90 degrees to the left."""
        rows = []

        def collect(tree, level):
            # Right subtree first so it ends up on top of the rotated picture.
            if tree.isEmpty():
                return
            collect(tree.getRight(), level + 1)
            rows.append("| " * level + str(tree.getRoot()) + " : " + tree.getBalance() + "\n")
            collect(tree.getLeft(), level + 1)

        collect(self, 0)
        return "".join(rows)

Display tagged list as indented tree grid

I am writing a logger which records the level of the entries.
To make it simple, let's say it logs entries like <level> <message>.
I am now trying to write a log viewer which formats the logfile "nicely" as an indented tree grid.
For example, if the raw log file contains:
0 entry1
0 entry2
1 entry3
2 entry4
3 entry5
2 entry6
0 entry7
It should output:
entry1
entry2
└entry3
├entry4
│└entry5
└entry6
entry7
My first steps were
Converting the list into a tree
Recursively print the tree
This worked with one single exception: I cannot figure out how I can pass the information that - referring to the example - before entry5 comes the │ sign to display that the previous level continues after the sub-levels.
So any hint, how to come from the list to the desired output is welcome.
Finally got it:
class LogViewer(LogFile):
    """
    Formats raw log file contents nicely
    and thus makes it human-readable
    """

    # Set by display(): True when the entries are shown in reversed order,
    # which flips the corner symbol used for a last child.
    __down = False

    class EntryTreeNode():
        """
        A minimal entry wrapper: one parsed log line plus its child entries
        """

        def __init__(self, string):
            """
            Constructor

            Splits the raw line on LogEntry.colsep() into exactly six
            columns; raises Exception('Invalid entry: ...') otherwise.
            """
            lst = string.split(LogEntry.colsep())
            if len(lst) != 6:
                raise Exception('Invalid entry: ' + string)
            self.DATE = datetime.strptime(lst[0], LogEntry.timeformat())
            self.ERRLVL = ErrLvlType(lst[1])
            self.USER = lst[2]
            self.CALLER = lst[3]
            self.OFFSET = int(lst[4])
            self.MSG = lst[5]
            # The nesting level used to build the tree
            self.tag = self.OFFSET
            self.children = []
            # Text printed before and after the tree drawing symbols
            self.pre = '[' + datetime.strftime(self.DATE, LogEntry.timeformat()) + ']\t' \
                + str(self.ERRLVL) + '\t' \
                + str(self.USER) + '\t'
            self.post = str(self.CALLER) + ' \t' + str(self.MSG)

        def __repr__(self):
            return str(self.tag)

    def __init__(self, path):
        """
        Constructor
        """
        super().__init__(path)

    # The three symbol accessors must be properties: __print_tree reads them
    # as plain attributes and concatenates them into the output line.
    @property
    def __sym_last(self):
        """
        Returns the symbol for a last entry
        """
        return '┌' if self.__down else '└'

    @property
    def __sym_mid(self):
        """
        Returns the symbol for a middle entry
        """
        return '├'

    @property
    def __sym_follow(self):
        """
        Returns the symbol for a following entry
        """
        return '│'

    def __mktree(self, lst):
        """
        Converts a log entry list into a tree

        Entries are consumed from the END of lst (lst is emptied).
        Returns the list of root nodes (tag 0); an entry that is reached at
        top level with a non-zero tag is dropped.
        """
        roots = []

        def children(root, lst):
            # Collect consecutive entries exactly one level below root.
            result = []
            while lst:
                curr = lst.pop()
                if curr.tag == root.tag + 1:
                    curr.children = children(curr, lst)
                    result.append(curr)
                else:
                    # Not a direct child: put it back for an outer level.
                    lst.append(curr)
                    break
            return result

        while lst:
            curr = lst.pop()
            if curr.tag == 0:
                curr.children = children(curr, lst)
                roots.append(curr)
        return roots

    def __print_tree(self, root, offset='', prefix='', last=True):
        """
        Prints a log entry tree

        offset -- drawing inherited from the ancestors
        prefix -- connector symbol placed directly before this entry
        last   -- whether this entry is the last child of its parent
        """
        print(root.pre + offset + prefix + root.post)
        # A finished branch leaves a blank column; an open one keeps a
        # vertical line so deeper levels show that the parent level continues.
        if last:
            offset += ' '
        else:
            offset += self.__sym_follow
        final_index = len(root.children) - 1
        for index, child in enumerate(root.children):
            is_last = index == final_index
            symbol = self.__sym_last if is_last else self.__sym_mid
            self.__print_tree(child, offset, symbol, is_last)

    def display(self, reverse=False):
        """
        Displays the log file nicely
        """
        self.__down = reverse
        entries = reversed(self.dump()) if reverse else self.dump()
        entries = [self.EntryTreeNode(e) for e in entries]
        tree = self.__mktree(entries)
        for root in tree:
            self.__print_tree(root)

What is the efficient way to sort custom fields in django models?

I load all the leads, iterate the queryset and populate the custom fields.
The custom fields are dependent on other model.
Then I sort the leads by these custom fields and show the result.
This method is very slow.
How can I optimize and increase speed?
The models are as follows
Lead Model
class Lead(LeadModel):
    def most_recent_mailing_date(self):
        """ Return the latest mailing date before today, or "No Mailing History" """
        today = datetime.date.today()
        past_dates = [history.mailing_date
                      for history in self.mailinghistory_set.all()
                      if history.mailing_date < today]
        if not past_dates:
            return "No Mailing History"
        # The date closest to today among the past ones is the largest.
        return max(past_dates)

    def next_mailing_date(self):
        """ Return the earliest mailing date after today, or "No Future Mailings" """
        today = datetime.date.today()
        future_dates = [history.mailing_date
                        for history in self.mailinghistory_set.all()
                        if history.mailing_date > today]
        if not future_dates:
            return "No Future Mailings"
        # The date closest to today among the future ones is the smallest.
        return min(future_dates)
Mailing History Model
class MailingHistory(models.Model):
    # One mailing sent (or scheduled) for a lead; read by Lead through
    # `mailinghistory_set`.
    lead = models.ForeignKey(Lead)
    returned_envelope = models.BooleanField()
    # Nullable: code that compares this with today's date must expect None.
    mailing_date = models.DateField(blank=True, null=True)
Leads to list function
def leads_to_list(queryset):
    """Convert leads into plain dicts enriched with mailing-date fields.

    For every item the result contains the instance attributes (without
    '_state') plus:
      recent_mailing_date / next_mailing_date         -- values for display
      recent_mailing_date_key / next_mailing_date_key -- sortable values,
          with NONE_DATE substituted when no date is available.

    Returns a list of dicts, one per item, in queryset order.
    """
    holder = []
    for item in queryset:
        recent_mailing_date = item.most_recent_mailing_date()
        next_mailing_date = item.next_mailing_date()

        # Copy the attribute dict: editing item.__dict__ directly would
        # delete '_state' from the instance itself and add keys to it.
        item_dict = dict(item.__dict__)
        item_dict.pop('_state', None)

        # The two methods return a message string when there is no date;
        # fall back to NONE_DATE so the sort key is always comparable.
        if isinstance(recent_mailing_date, datetime.date):
            recent_mailing_date_key = recent_mailing_date
        else:
            recent_mailing_date_key = NONE_DATE
        if isinstance(next_mailing_date, datetime.date):
            next_mailing_date_key = next_mailing_date
        else:
            next_mailing_date_key = NONE_DATE

        item_dict['recent_mailing_date'] = recent_mailing_date
        item_dict['recent_mailing_date_key'] = recent_mailing_date_key
        item_dict['next_mailing_date'] = next_mailing_date
        item_dict['next_mailing_date_key'] = next_mailing_date_key
        holder.append(item_dict)
    return holder
Sorting Logic
# Code to be optimized #
# Fragment of a larger function (note the `return`): loads every lead,
# converts the leads to dicts, sorts them in Python and returns JSON.
leads = Lead.objects.all()
# NOTE(review): the helper shown earlier is named leads_to_list; confirm
# that queryset_to_list exists or is the intended name.
leads = queryset_to_list(leads) # Important for serialization. json.dumps
sort_key = 'recent_mailing_date_key'
sort_reverse = True
# Sorting happens in memory, after all rows have been fetched.
leads = sorted(leads,key=itemgetter(sort_key),reverse = sort_reverse)
return json.dumps(leads)
You can use filter, limit and order by in query.
Be careful with len of queryset. The function len counts objects of a list. It is more efficient to use count (function that runs a query to count).
I hope I've helped.
class Lead(LeadModel):
    def most_recent_mailing_date(self):
        """ Return the most recent mailing date before today, or "No Mailing History".

        The filtering, ordering and LIMIT 1 are done by the database.
        """
        today = datetime.date.today()
        mailingHistories = list(
            self.mailinghistory_set
            .filter(mailing_date__lt=today)
            .order_by('-mailing_date', '-id')[:1]
        )
        if mailingHistories:
            # Previously indexed `mostRecentHistory`, which is never defined.
            return mailingHistories[0].mailing_date
        return "No Mailing History"

    def next_mailing_date(self):
        """ Return the next mailing date after today, or "No Future Mailings".

        The filtering, ordering and LIMIT 1 are done by the database.
        """
        today = datetime.date.today()
        mailingHistories = list(
            self.mailinghistory_set
            .filter(mailing_date__gt=today)
            .order_by('mailing_date', 'id')[:1]
        )
        if mailingHistories:
            # Previously indexed `mostRecentHistory`, which is never defined.
            return mailingHistories[0].mailing_date
        return "No Future Mailings"

How to solve "AttributeError: 'QPDFDocument' object has no attribute 'initialize' " in python

I have got the following error when I'm trying to execute example in pdfquery.
File "C:\workspace-php\test\pdfminer\pdfqueries\pdfquery.py", line 187, in init
doc.initialize()
AttributeError: 'QPDFDocument' object has no attribute 'initialize'
I have been trying to solve this but still haven't found a solution. It would be appreciated if someone could help me solve this.
class PDFQuery(object):
    """Load a PDF, convert its layout into an lxml tree and query it with PyQuery."""

    def __init__(self, file,
                 merge_tags=('LTChar', 'LTAnno'),
                 round_floats=True,
                 round_digits=3,
                 input_text_formatter=None,
                 normalize_spaces=True,
                 resort=True,
                 parse_tree_cacher=None,
                 ):
        """Open the document and prepare the layout machinery.

        file                 -- open file object or path of the PDF
        merge_tags           -- layout tags whose consecutive nodes are merged
        round_floats         -- round float attribute values to round_digits
        round_digits         -- number of digits used when rounding
        input_text_formatter -- callable applied to node text, if given
        normalize_spaces     -- collapse whitespace runs when no formatter is given
        resort               -- insert children sorted by bounding box
        parse_tree_cacher    -- optional cache object for parsed trees

        Raises TypeError when file is neither a file object nor a path.
        """
        # store input
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # set up input text formatting function, if any
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif normalize_spaces:
            r = re.compile(r'\s+')
            self.input_text_formatter = lambda s: re.sub(r, ' ', s)
        else:
            self.input_text_formatter = None

        # open doc
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")

        parser = PDFParser(file)
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser)
            parser.set_document(doc)
        # Some pdfminer document classes have no initialize() method (this is
        # the reported AttributeError); call it only when it exists.
        if hasattr(doc, 'initialize'):
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        if parse_tree_cacher:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)
        else:
            self._parse_tree_cacher = DummyCache()

        # set up layout parsing
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, detect_vertical=True)
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

        # caches
        self._pages = []
        self._pages_iter = None
        self._elements = []

    def load(self, *page_numbers):
        """
        Load etree and pyquery object for entire document, or given page numbers (ints or lists).
        After this is called, objects are available at pdf.tree and pdf.pq.

        >>> pdf.load()
        >>> pdf.tree
        <lxml.etree._ElementTree object at ...>
        >>> pdf.pq('LTPage')
        [<LTPage>, <LTPage>]
        >>> pdf.load(1)
        >>> pdf.pq('LTPage')
        [<LTPage>]
        >>> pdf.load(0,1)
        >>> pdf.pq('LTPage')
        [<LTPage>, <LTPage>]
        """
        self.tree = self.get_tree(*_flatten(page_numbers))
        self.pq = self.get_pyquery(self.tree)

    def extract(self, searches, tree=None, as_dict=True):
        """
        Run a list of [key, selector(, formatter)] searches and collect the results.

        The special keys 'with_formatter' and 'with_parent' change the
        formatter / parent selection used by the searches that follow.
        Returns a dict when as_dict is true, otherwise a list of pairs.

        >>> foo = pdf.extract( [ ['pages', 'LTPage'] ])
        >>> foo
        {'pages': [<LTPage>, <LTPage>]}
        >>> pdf.extract( [ ['bar', ':in_bbox("100,100,400,400")'] ], foo['pages'][0])
        {'bar': [<LTTextLineHorizontal>, <LTTextBoxHorizontal>,...
        """
        if self.tree is None or self.pq is None:
            self.load()
        # Build the PyQuery wrapper once (it used to be constructed twice
        # when a tree was supplied).
        if tree is None:
            pq = self.pq
        else:
            pq = PyQuery(tree, css_translator=PDFQueryTranslator())
        results = []
        formatter = None
        parent = pq
        for search in searches:
            if len(search) < 3:
                # Pad with the formatter currently in effect.
                search = list(search) + [formatter]
            key, search, tmp_formatter = search
            if key == 'with_formatter':
                if isinstance(search, basestring):  # is a pyquery method name, e.g. 'text'
                    formatter = lambda o, search=search: getattr(o, search)()
                elif hasattr(search, '__call__') or not search:  # is a method, or None to end formatting
                    formatter = search
                else:
                    raise TypeError("Formatter should be either a pyquery method name or a callable function.")
            elif key == 'with_parent':
                parent = pq(search) if search else pq
            else:
                try:
                    result = parent("*").filter(search) if hasattr(search, '__call__') else parent(search)
                except cssselect.SelectorSyntaxError as e:
                    raise cssselect.SelectorSyntaxError("Error applying selector '%s': %s" % (search, e))
                if tmp_formatter:
                    result = tmp_formatter(result)
                # A formatter may return a tuple of ready-made pairs.
                results += result if type(result) == tuple else [[key, result]]
        if as_dict:
            results = dict(results)
        return results

    # tree building stuff
    def get_pyquery(self, tree=None, page_numbers=[]):
        """
        Wrap given tree in pyquery and return.
        If no tree supplied, will generate one from given page_numbers, or all page numbers.
        """
        # page_numbers is only read, never mutated, so the list default is
        # left untouched to keep the signature unchanged.
        if tree is None:
            if not page_numbers and self.tree is not None:
                tree = self.tree
            else:
                tree = self.get_tree(page_numbers)
        if hasattr(tree, 'getroot'):
            tree = tree.getroot()
        return PyQuery(tree, css_translator=PDFQueryTranslator())

    def get_tree(self, *page_numbers):
        """
        Return lxml.etree.ElementTree for entire document, or page numbers given if any.
        """
        # Flatten once. Testing the flattened list (rather than the raw
        # argument tuple) means get_tree([]) builds the whole document instead
        # of an empty tree.
        wanted = list(_flatten(page_numbers))
        cache_key = "_".join(map(str, wanted))
        tree = self._parse_tree_cacher.get(cache_key)
        if tree is None:
            # set up root
            # NOTE(review): `parser` here is not self.parser; presumably a
            # module-level lxml parser defined elsewhere in the file -- confirm.
            root = parser.makeelement("pdfxml")
            if self.doc.info:  # not all PDFs seem to have this info section
                for k, v in self.doc.info[0].items():
                    root.set(k, unicode(v))
            # add pages
            if wanted:
                pages = [[n, self.get_layout(self.get_page(n))] for n in wanted]
            else:
                pages = enumerate(self.get_layouts())
            for n, page in pages:
                page = self._xmlize(page)
                page.set('page_index', unicode(n))
                page.set('page_label', self.doc.get_page_number(n))
                root.append(page)
            self._clean_text(root)
            # wrap root in ElementTree
            tree = etree.ElementTree(root)
            self._parse_tree_cacher.set(cache_key, tree)
        return tree

    def _clean_text(self, branch):
        """
        Remove text from node if same text exists in its children.
        Apply string formatter if set.
        """
        if branch.text and self.input_text_formatter:
            branch.text = self.input_text_formatter(branch.text)
        # NOTE(review): a child whose text is None makes find() raise
        # TypeError, which the handler below swallows and which ends the loop
        # for the remaining children too -- confirm this is intended.
        try:
            for child in branch:
                self._clean_text(child)
                if branch.text and branch.text.find(child.text) >= 0:
                    branch.text = branch.text.replace(child.text, '', 1)
        except TypeError:  # not an iterable node
            pass

    def _xmlize(self, node, root=None):
        """Convert a layout node (and, recursively, its children) into an element."""
        # collect attributes of current node
        tags = self._getattrs(node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', 'linewidth', 'pts', 'index', 'name', 'matrix', 'word_margin')
        if type(node) == LTImage:
            tags.update(self._getattrs(node, 'colorspace', 'bits', 'imagemask', 'srcsize', 'stream', 'name', 'pts', 'linewidth'))
        elif type(node) == LTChar:
            tags.update(self._getattrs(node, 'fontname', 'adv', 'upright', 'size'))
        elif type(node) == LTPage:
            tags.update(self._getattrs(node, 'pageid', 'rotate'))
        # create node
        branch = parser.makeelement(node.__class__.__name__, tags)
        branch.layout = node
        self._elements.append(branch)  # make sure layout keeps state
        if root is None:
            root = branch
        # add text
        if hasattr(node, 'get_text'):
            branch.text = node.get_text()
        # add children if node is an iterable
        if hasattr(node, '__iter__'):
            last = None
            for child in node:
                child = self._xmlize(child, root)
                if self.merge_tags and child.tag in self.merge_tags:
                    if branch.text and child.text in branch.text:
                        # Text already present on the parent: drop the child.
                        continue
                    elif last is not None and last.tag in self.merge_tags:
                        # Fold this child into the previous mergeable sibling.
                        last.text += child.text
                        last.set('_obj_id', last.get('_obj_id') + "," + child.get('_obj_id'))
                        continue
                # sort children by bounding boxes
                if self.resort:
                    _append_sorted(root, child, _comp_bbox)
                else:
                    branch.append(child)
                last = child
        return branch

    def _getattrs(self, obj, *attrs):
        """ Return dictionary of given attrs on given object, if they exist, processing through filter_value(). """
        return dict((attr, unicode(self._filter_value(getattr(obj, attr)))) for attr in attrs if hasattr(obj, attr))

    def _filter_value(self, val):
        """Round floats (recursing into iterables) when round_floats is set."""
        if self.round_floats:
            if type(val) == float:
                val = round(val, self.round_digits)
            elif hasattr(val, '__iter__'):
                val = [self._filter_value(item) for item in val]
        return val

    # page access stuff
    def get_page(self, page_number):
        """ Get PDFPage object -- 0-indexed."""
        return self._cached_pages(target_page=page_number)

    def get_layout(self, page):
        """ Get PDFMiner Layout object for given page object or page number. """
        if type(page) == int:
            page = self.get_page(page)
        self.interpreter.process_page(page)
        return self.device.get_result()

    def get_layouts(self):
        """ Get list of PDFMiner Layout objects for each page. """
        return (self.get_layout(page) for page in self._cached_pages())

    def _cached_pages(self, target_page=-1):
        """
        Get a page or all pages from page generator, caching results.
        This is necessary because PDFMiner searches recursively for pages,
        so we won't know how many there are until we parse the whole document,
        which we don't want to do until we need to.

        Returns the requested page (None when it does not exist), or the list
        of all pages when target_page is negative.
        """
        try:
            # pdfminer < 20131022
            self._pages_iter = self._pages_iter or self.doc.get_pages()
        except AttributeError:
            # pdfminer >= 20131022
            self._pages_iter = self._pages_iter or PDFPage.create_pages(self.doc)

        if target_page >= 0:
            while len(self._pages) <= target_page:
                # next(..., None) turns exhaustion into the documented None
                # result instead of letting StopIteration escape; the local
                # name no longer shadows the builtin.
                next_page = next(self._pages_iter, None)
                if not next_page:
                    return None
                next_page.page_number = 0
                self._pages.append(next_page)
            try:
                return self._pages[target_page]
            except IndexError:
                return None
        self._pages += list(self._pages_iter)
        return self._pages
if __name__ == "__main__":
    # Run the docstring examples against the bundled sample document; the
    # examples refer to it through the name `pdf`.
    import doctest
    pdf = PDFQuery("../examples/sample.pdf")
    doctest_globals = {'pdf': pdf}
    doctest.testmod(extraglobs=doctest_globals, optionflags=doctest.ELLIPSIS)
Add an `elif` at line 18, then add a `for` loop before the list; that should fix it. If there are any problems, contact me for support.