I am running below code to remove duplicates from Linked List. But my code only prints linked List before removing duplicates. Once removeDup method is called, it does not print anything. Below is my code. Please tell me what am I missing.
class Node:
def __init__(self, data):
self.data = data
self.next = None
class LinkedList:
def __init__(self):
self.head = None
def insert(self, data):
node = Node(data)
node.next=self.head
self.head = node
def printl(self):
current = self.head
while current:
print current.data
current= current.next
def removeDups(self):
current = self.head
while current.next is not None:
if second.data == current.data:
current.next = current.next.next
else:
current=current.next
l= LinkedList()
l.insert(15)
l.insert(14)
l.insert(16)
l.insert(15)
l.insert(15)
l.insert(14)
l.insert(18)
l.insert(159)
l.insert(12)
l.insert(10)
l.insert(15)
l.insert(14)
l.printl()
print "==============="
l.removeDups()
l.printl()
Your logic for removing the duplicated items you find is not right. It causes you to cut out all the items between the first occurrence of a value and a point past its last occurrence. For your example list, that results in a single item, 14 being printed after the deduplication runs (it cuts from just after the first value to the end, though it makes some smaller cuts along the way too).
Here's a fixed version of your removeDups method.
def removeDups(self):
current = second = self.head
while current is not None:
while second.next is not None: # check second.next here rather than second
if second.next.data == current.data: # check second.next.data, not second.data
second.next = second.next.next # cut second.next out of the list
else:
second = second.next # put this line in an else, to avoid skipping items
current = second = current.next
The main change is that second points to the node before the second node we're actually interested in checking. We do all our work on second.next. We need to keep the reference to second so we can easily cut second.next out of the list. Doing it this way requires that we don't advance second if we've cut out a node, so the second = second.next line needs to be in an else clause.
Since current and second always start with the same value after each update to current, I changed the logic to assign both of them in a single statement. It would work fine the original way, I just think this way looks nicer.
I think it is confusing to use the "second" variable.
def removeDups(self):
current = self.head
while current: #First loop
while current.next and current.data == current.next.data: #Second loop
current.next = current.next.next #Deletion
current = current.next
You start at the head of the list and for each node in your list until you hit the None at the end (while current) you enter another loop. That loops checks to make sure there is a next node (while current.next) and if that next node has the same data as the current node (current.data == current.next.data). Each time this second loop is true, it means we have a duplicate. The next line (current.next = current.next.next) is what does the actual deletion. It also conveniently updates current.next to the next node in the list that we want to compare so that the second loop can immediately check again to see if we have another duplicate. Once that second loop has found and deleted all the duplicates for that particular node, we will drop down to the next line (current = current.next), update our current node to the next one and start checking that node for duplicates.
We can use a list or dictionary to check whether the item inserted is already there or not
class Node:
def __init__(self,data):
self.data=data
self.next=None
class LinkedList:
def __init__(self):
self.head=None
def append(self,data):
new_Node=Node(data)
if self.head is None:
self.head=new_Node
return
last_node=self.head
while last_node.next:
last_node=last_node.next
last_node.next=new_Node
def printing(self):
current_node=self.head
while current_node:
print(current_node.data)
current_node=current_node.next
def remove_dup(self):
curr=self.head
glist=[] #list to store the values
while curr:
if curr.data in glist: #checking the value already exists in list
prev.next=curr.next
else:
glist.append(curr.data)
prev=curr
curr=curr.next
llist=LinkedList()
llist.append(1)
llist.append(6)
llist.append(1)
llist.append(4)
llist.append(2)
llist.append(2)
llist.append(4)
llist.remove_dup()
llist.printing()
This is how removeDuplicates function should be written:
class node:
def __init__(self):
self.data = None
self.next = None
class Linked_List:
def __init__(self):
self.head = None
def get_head(self):
return self.head
def insert(self, data):
if self.head == None:
self.head = node()
self.head.data = data
else:
new_node = node()
new_node.data = data
new_node.next = None
temp = self.head
while(temp.next):
temp=temp.next
temp.next = new_node
def printlist(self):
temp = self.head
while temp!=None:
print(temp.data, end=" ")
temp= temp.next
def removeDuplicates(head):
current = head
element_list = []
prev = None
while (current is not None):
if current.data not in element_list:
element_list.append(current.data)
prev = current
current = current.next
else:
prev.next = current.next
current = current.next
if __name__ == '__main__':
values_list = [[5,2,2,4], [2,2,2,2,2]]
t = len(values_list)
for index in range(t):
list1 = Linked_List()
for i in values_list[index]:
list1.insert(i)
print('Input:')
list1.printlist()
print()
removeDuplicates(list1.head)
print('Output')
list1.printlist()
print('')
It removes duplicate nodes from sorted/unsorted singly linked list
you can use an additional data structure to hold the unique values(e.g. a list)
class Node:
def __init__(self, data):
self.data = data
self.next = None
class LinkedList:
def __init__(self):
self.head = None
def insert_node(self, data):
new_node = Node(data)
if self.head is None:
self.head = new_node
else:
new_node.next = self.head
self.head = new_node
def remove_dups(self):
uniques = []
prev = None
curr = self.head
while curr is not None:
if curr.data in uniques:
prev.next = curr.next
else:
uniques.append(curr.data)
prev = curr
curr = curr.next
def print_list(self):
output = ""
tmp = self.head
while tmp is not None:
output += str(tmp.data) + " "
tmp = tmp.next
print(output)
`
Related
I did see this post here [java] and am attempting a similar solution, but the noted thread did not fully answer my question)
I need to work with singly linked lists and want to try and perform an insertBefore() method. I understand that doubly-linked lists have a previous attribute, while singly linked lists do not, so I understand this might be better accomplished using doubly-linked lists, but this was the requirements of the assignment and I'm trying to solve things.
So far below: I've got my Node class setup along with my SinglyLinkedList class. I've also got my insertBefore() method, which is my goal and where I'm getting stuck.
You'll see in my if statement, I'm hoping to compare node.next.value to my targetNode (note that targetNode is a value) -- why is my node.next.value throwing me the following error? if node.next.value == targetNode: AttributeError: 'NoneType' object has no attribute 'value'
# this is our node object
class Node(object):
def __init__(self, value, next=None):
self.value = value
self.next = next
# this is our singly linked list object
class SinglyLinkedList(object):
def __init__(self):
self.head = None
self.tail = None
def insertBefore(self, targetNode, value):
# create new node
newNode = Node(value)
# find target node to insert
node = self.head
if node == None:
print 'There aren\'t any nodes to insert before!'
else:
found = None
# search nodes
while node:
if node.next.value == targetNode:
found = True
print node.value + ' <--this was node before target'
beforeInsert = node
afterInsert = node.next
beforeInsert.next = newNode
newNode.next = afterInsert # sets new node's next to target node
node = node.next # continues through while loop
else:
node = node.next
if found != True:
print 'Your target node of {} was not found in the list!'.format(targetNode)
Please note: I was able to get this to work for an insertAfter() method (not included in the snippet above), but am struggling to match up the node.next with the targetNode object.
As you suggest yourself, the problem seems to be that you create a new node using your target node, node.next == Node(targetNode) as this will never be true.
I assume that targetNode is a Node object, in which case you should simply use node.next == targetNode.
Another problem with your code is that you do not check if the first node is the target node. Thus, you would not be able to insert a node before the head Node using the insertBefore function.
The following code is a rewrite of your insertBefore function which inserts a new node with the specified value before a given node.
def insertBefore(self, targetValue, value):
# create new node
newNode = Node(value)
# find target node to insert
node = self.head
if node == None:
print 'There aren\'t any nodes to insert before!'
else:
# search nodes
if node.value == targetValue:
newNode.next = self.head
self.head = newNode
while node.next is not None:
if node.next.value == targetValue:
print ">>> ",node.value,' <--this was node before target'
newNode.next = node.next
node.next = newNode
return
else:
node = node.next
print 'Your target node of {} was not found in the list!'.format(targetNode.value)
If you want targetNode to be the value of the target node, you will need to compare node.next.value with targetNode. Note that this would insert the value before the first occurrence of the target value. Thus, you might want to turn you list into a set instead.
Here's how I was able to find a solution to my question:
def insertBefore(self, targetValue, value):
# create new node
newNode = Node(value)
# find target node to insert
node = self.head
if node == None:
print 'There aren\'t any nodes to insert before!'
else:
found = False
# search nodes
while node:
if node.next == None:
break
if node.next.value == targetValue:
found = True
newNode.next = node.next
node.next = newNode
break
else:
node = node.next
if found != True:
print 'Your target node of {} was not found in the list!'.format(targetValue)
Note, as Jonas pointed out in his comments, the above mentioned solution only works if there are no duplicates in the node list.
This is my solution of your problem.
def addBefore(self, nextData, data):
newNode = Node(data, None)
if self.head.data == nextData:
newNode.next = self.head
self.head = newNode
return
ptr = self.head
while ptr is not None:
if ptr.data == nextData:
newNode.next = ptr
parPtr.next = newNode
break
else:
parPtr = ptr
ptr = ptr.next
I'm working on a concordance dictionary that reads a data file and records every unique word and the word's line number in a AVL tree. The problem is that my find method is not finding the Entry's within the tree so it adds every word instead of every unique word.
I'm also having trouble making my program keep a list of the line numbers within each entry. I'm using an entry class to keep the key(word) and the list of line numbers. Thank you for any help.
I'm writing in Python 2.7 and have included all my program so far.
My Main Program:
import string #NEW
from time import clock
import sys #for BST recursion limit
from dictionary import Entry
sys.setrecursionlimit(3000)#for BST
from avl import AVL
def main():
"""Calls on necessary functions to fill the dictionary, and process the keys"""
start = clock() #times runtime
stopWordDict = AVL()#Empty Dictionary
stopWordDict = fillStopWordDict(stopWordDict)
keyList = []
wordConcordanceDict = AVL()#Empty Dictionary
wordConcordanceDict = fillWordDict(stopWordDict,wordConcordanceDict, keyList)
print str(wordConcordanceDict) #wordconcorddict made here.
keyList.sort()
print keyList
writeWordConDict(wordConcordanceDict, keyList)
end = clock() #gets runtime
runTime = end - start
print("Done. Runtime was:",runTime,"seconds.")
def fillStopWordDict(stopWordDict):
"""fills chain dict with all of the stop words"""
fileNew=open('stop_words.txt', "r")
for word in fileNew:
word=word.lower().strip() #strip will strip \n from word
if stopWordDict.find(word) == None:
stopWordDict.add(word)
fileNew.close()
return stopWordDict
def fillWordDict(stopWordDict,wordConcordanceDict, keyList):
"""opens hw5data.txt and calls on processLine function"""
lineCounter = 1
fileNew=open('hw5data.txt', "r")
for line in fileNew:
processLine(lineCounter, line, stopWordDict,wordConcordanceDict, keyList)
lineCounter+=1 #changes to next line of file
fileNew.close()
return wordConcordanceDict
def processLine(lineCounter, line, stopWordDict,wordConcordanceDict, keyList):
"""process each line into the wordConcordanceDict"""
line=line.split() #splits line into list of words
for word in line:
word=word.lower().strip(string.punctuation)#strips punctuation
if stopWordDict.find(word) == None:
wordEntry = Entry(word, None)
if wordConcordanceDict.find(wordEntry) == None:
lineList = wordEntry.value
lineList.append(lineCounter)
wordEntry.value = lineList
wordConcordanceDict.add(wordEntry)
keyList.append(word)
else:
wordEntry = wordConcordance.find(wordEntry)
lineList = wordEntry.value
lineList.append(lineCounter)
wordEntry.value = lineList
wordConcordanceDict.add(wordEntry)
return wordConcordanceDict
def writeWordConDict(wordConcordanceDict, keyList):
"""takes in wordConcordanceDict and list of its keys. Then prints the key value pairs to the screen"""
fileNew=open("ProgProj5Concordance.txt", 'w')
# listOfWords = wordConcordanceDict.inorder()
for key in keyList:
wordEntry = wordConcordanceDict.find(key) #makes the values into a string
lineList = wordEntry.value
line=str(key + ":" + lineList + "\n")
fileNew.write(line)
fileNew.close()
main()
MY ENTRY CLASS:
"""
File: bst.py
BST class for binary search trees.
"""
from queue import LinkedQueue
from binarytree import BinaryTree
class BST(object):
def __init__(self):
self._tree = BinaryTree.THE_EMPTY_TREE
self._size = 0
def isEmpty(self):
return len(self) == 0
def __len__(self):
return self._size
def __str__(self):
return str(self._tree)
def __iter__(self):
return iter(self.inorder())
def find(self, target):
"""Returns data if target is found or None otherwise."""
def findHelper(tree):
if tree.isEmpty():
return None
elif target == tree.getRoot():
return tree.getRoot()
elif target < tree.getRoot():
return findHelper(tree.getLeft())
else:
return findHelper(tree.getRight())
return findHelper(self._tree)
def add(self, newItem):
"""Adds newItem to the tree."""
# Helper function to search for item's position
def addHelper(tree):
currentItem = tree.getRoot()
left = tree.getLeft()
right = tree.getRight()
# New item is less, go left until spot is found
if newItem < currentItem:
if left.isEmpty():
tree.setLeft(BinaryTree(newItem))
else:
addHelper(left)
# New item is greater or equal,
# go right until spot is found
elif right.isEmpty():
tree.setRight(BinaryTree(newItem))
else:
addHelper(right)
# End of addHelper
# Tree is empty, so new item goes at the root
if self.isEmpty():
self._tree = BinaryTree(newItem)
# Otherwise, search for the item's spot
else:
addHelper(self._tree)
self._size += 1
def inorder(self):
"""Returns a list containing the results of
an inorder traversal."""
lyst = []
self._tree.inorder(lyst)
return lyst
def preorder(self):
"""Returns a list containing the results of
a preorder traversal."""
# Exercise
pass
def postorder(self):
"""Returns a list containing the results of
a postorder traversal."""
# Exercise
pass
def levelorder(self):
"""Returns a list containing the results of
a levelorder traversal."""
# Exercise
pass
def remove(self, item):
# Exercise
pass
def main():
tree = BST()
print "Adding D B A C F E G"
tree.add("D")
tree.add("B")
tree.add("A")
tree.add("C")
tree.add("F")
tree.add("E")
tree.add("G")
print tree.find("A")
print tree.find("Z")
print "\nString:\n" + str(tree)
print "Iterator (inorder traversal): "
iterator = iter(tree)
while True:
try:
print iterator.next(),
except Exception, e:
print e
break
# Use a for loop instead
print "\nfor loop (inorder traversal): "
for item in tree:
print item,
if __name__ == "__main__":
main()
AND FINALLY THE BINARY TREE AVL CLASS:
from binarytree import *
class BinaryTreeAVL(BinaryTree):
def __init__(self, item, balance = 'EQ'):
BinaryTree.__init__(self, item)
self._balance = balance
def getBalance(self):
return self._balance
def setBalance(self, newBalance):
self._balance = newBalance
def __str__(self):
"""Returns a string representation of the tree
rotated 90 degrees to the left."""
def strHelper(tree, level):
result = ""
if not tree.isEmpty():
result += strHelper(tree.getRight(), level + 1)
result += "| " * level
result += str(tree.getRoot())+ " : " + tree.getBalance() + "\n"
result += strHelper(tree.getLeft(), level + 1)
return result
return strHelper(self, 0)
I am writing a logger which records the level of the entries.
To make it simple, let's say it logs entries like <level> <message>.
I am now trying to write a log viewer which formats the logfile "nicely" as an indented tree grid.
For example is the raw log file contains:
0 entry1
0 entry2
1 entry3
2 entry4
3 entry5
2 entry6
0 entry7
It should output:
entry1
entry2
└entry3
├entry4
│└entry5
└entry6
entry7
My first steps were
Converting the list into a tree
Recursively print the tree
This worked with one single exception: I cannot figure out how I can pass the information that - referring to the example - before entry5 comes the │ sign to display that the previous level continues after the sub-levels.
So any hint, how to come from the list to the desired output is welcome.
Finally got it:
class LogViewer(LogFile):
"""
Formats raw log file contents nicely
and thus makes it human-readable
"""
__down = False
class EntryTreeNode():
"""
A minimal entry wrapper
"""
def __init__(self, string):
"""
Constructor
"""
lst = string.split(LogEntry.colsep())
if len(lst) != 6:
raise Exception('Invalid entry: ' + string)
else:
self.DATE = datetime.strptime(lst[0], LogEntry.timeformat())
self.ERRLVL = ErrLvlType(lst[1])
self.USER = lst[2]
self.CALLER = lst[3]
self.OFFSET = int(lst[4])
self.MSG = lst[5]
self.tag = self.OFFSET
self.children = []
self.pre = '[' + datetime.strftime(self.DATE, LogEntry.timeformat()) + ']\t' \
+ str(self.ERRLVL) + '\t' \
+ str(self.USER) + '\t'
self.post = str(self.CALLER) + ' \t' + str(self.MSG)
def __repr__(self):
return str(self.tag)
def __init__(self, path):
"""
Constructor
"""
super().__init__(path)
#property
def __sym_last(self):
"""
Returns the symbol for a last entry
"""
return '┌' if self.__down else '└'
#property
def __sym_mid(self):
"""
Returns the symbol for a middle entry
"""
return '├'
#property
def __sym_follow(self):
"""
Returns the symbol for a following entry
"""
return '│'
def __mktree(self, lst):
"""
Converts a log entry list into a tree
"""
roots = []
def children(root, lst):
result = []
while lst:
curr = lst.pop()
if curr.tag == root.tag + 1:
curr.children = children(curr, lst)
result.append(curr)
else:
lst.append(curr)
break
return result
while lst:
curr = lst.pop()
if curr.tag == 0:
curr.children = children(curr, lst)
roots.append(curr)
return roots
def __print_tree(self, root, offset='', prefix='', last=True):
"""
Prints a log entry tree
"""
print(root.pre + offset + prefix + root.post)
if last:
offset += ' '
else:
offset += self.__sym_follow
for i in range(0, len(root.children)):
if i == len(root.children)-1:
prefix = self.__sym_last
last = True
else:
prefix = self.__sym_mid
last = False
self.__print_tree(root.children[i], offset, prefix, last)
def display(self, reverse=False):
"""
Displays the log file nicely
"""
self.__down = reverse
entries = reversed(self.dump()) if reverse else self.dump()
entries = [self.EntryTreeNode(e) for e in entries]
tree = self.__mktree(entries)
for root in tree:
self.__print_tree(root)
I've been trying to figure out how to recursively call a function in python for the last few days to no avail. I'm building a tree structure to store objects and have issues with not only traversing the tree using a generator, but also with making recursive calls to my find function.
Here is my code.
class Node:
def __init__(self, data, children=list()):
self.data = data
self.children = children
def __eq__(self, node):
return self.data == node.data
def __str__(self):
return self.data
def __repr__(self):
return self.data
def write_xtl(self, node, out_file, level=0):
gen2 = self.traverse(node)
for child in gen2:
out_file.write(child.data)
def traverse(self, node, path=list()):
yield self
for n in self.children:
for m in traverse(n, path):
yield m
def find(self, node):
if self == node:
return self
else:
for child in self.children:
return child.find(node)
def add(self, node, value):
entry_point = self.find(node)
if entry_point:
#print ("Found %s in %s") % (value.data.rstrip(), node.data.rstrip())
#print ("\tentry_point is %s") % (entry_point.data)
entry_point.children.append(value)
else:
print ("Could not find %s") % (value)
Here's my test file:
from xtensiltree import tree
root = tree.Node("root\n")
header = tree.Node("header\n")
orderHeader = tree.Node("orderHeader\n")
date = tree.Node("date\n")
notes = tree.Node("notes\n")
address = tree.Node("address\n")
contacts = tree.Node("contacts\n")
root.add(root, header)
root.add(header, orderHeader)
root.add(orderHeader, date)
root.add(orderHeader, notes)
root.add(orderHeader, address)
root.add(address, contacts)
outfile = open("ooutput.xtl", "w")
root.write_xtl(root, outfile)
outfile.close()
Thank you in advance.
Here is some example how find method should look like.
def find(self, node):
if self == node:
return self
elif self.children != []:
for child in self.children:
found = child.find(node)
if found:
return found
return None
Basicly, it means: if the current node is what we are looking for then return it. If not and the current node has children search them. If it's been found in one of children it will be returned, otherwise return None.
My problem was with my init() method.
def init(self, data, children=list()):
The list in my tree was declared global. This caused every element of my tree to have the same children.
In turn, any recursive functions would go on forever.
Thanks for the help Tiero.
I have got the following error when I'm trying to execute example in pdfquery.
File "C:\workspace-php\test\pdfminer\pdfqueries\pdfquery.py", line 187, in init
doc.initialize()
AttributeError: 'QPDFDocument' object has no attribute 'initialize'
I'm trying to solve this but still i dont get any solution for that.it would be appreciated if some one can help me to solve this.
class PDFQuery(object):
def __init__(self, file,
merge_tags=('LTChar', 'LTAnno'),
round_floats=True,
round_digits=3,
input_text_formatter=None,
normalize_spaces=True,
resort=True,
parse_tree_cacher=None,
):
# store input
self.merge_tags = merge_tags
self.round_floats = round_floats
self.round_digits = round_digits
self.resort = resort
# set up input text formatting function, if any
if input_text_formatter:
self.input_text_formatter = input_text_formatter
elif normalize_spaces:
r = re.compile(r'\s+')
self.input_text_formatter = lambda s: re.sub(r, ' ', s)
else:
self.input_text_formatter = None
# open doc
if not hasattr(file, 'read'):
try:
file = open(file, 'rb')
except TypeError:
raise TypeError("File must be file object or filepath string.")
parser = PDFParser(file)
if hasattr(QPDFDocument, 'set_parser'):
# pdfminer < 20131022
doc = QPDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
else:
# pdfminer >= 20131022
doc = QPDFDocument(parser)
parser.set_document(doc)
doc.initialize()
self.doc = doc
self.parser = parser
self.tree = None
self.pq = None
self.file = file
if parse_tree_cacher:
self._parse_tree_cacher = parse_tree_cacher
self._parse_tree_cacher.set_hash_key(self.file)
else:
self._parse_tree_cacher = DummyCache()
# set up layout parsing
rsrcmgr = PDFResourceManager()
laparams = LAParams(all_texts=True, detect_vertical=True)
self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
# caches
self._pages = []
self._pages_iter = None
self._elements = []
def load(self, *page_numbers):
"""
Load etree and pyquery object for entire document, or given page numbers (ints or lists).
After this is called, objects are available at pdf.tree and pdf.pq.
>>> pdf.load()
>>> pdf.tree
<lxml.etree._ElementTree object at ...>
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
>>> pdf.load(1)
>>> pdf.pq('LTPage')
[<LTPage>]
>>> pdf.load(0,1)
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
"""
self.tree = self.get_tree(*_flatten(page_numbers))
self.pq = self.get_pyquery(self.tree)
def extract(self, searches, tree=None, as_dict=True):
"""
>>> foo = pdf.extract( [ ['pages', 'LTPage'] ])
>>> foo
{'pages': [<LTPage>, <LTPage>]}
>>> pdf.extract( [ ['bar', ':in_bbox("100,100,400,400")'] ], foo['pages'][0])
{'bar': [<LTTextLineHorizontal>, <LTTextBoxHorizontal>,...
"""
if self.tree is None or self.pq is None:
self.load()
pq = PyQuery(tree, css_translator=PDFQueryTranslator()) if tree is not None else self.pq
if tree is None:
pq = self.pq
else:
pq = PyQuery(tree, css_translator=PDFQueryTranslator())
results = []
formatter = None
parent = pq
for search in searches:
if len(search) < 3:
search = list(search) + [formatter]
key, search, tmp_formatter = search
if key == 'with_formatter':
if isinstance(search, basestring): # is a pyquery method name, e.g. 'text'
formatter = lambda o, search=search: getattr(o, search)()
elif hasattr(search, '__call__') or not search: # is a method, or None to end formatting
formatter = search
else:
raise TypeError("Formatter should be either a pyquery method name or a callable function.")
elif key == 'with_parent':
parent = pq(search) if search else pq
else:
try:
result = parent("*").filter(search) if hasattr(search, '__call__') else parent(search)
except cssselect.SelectorSyntaxError, e:
raise cssselect.SelectorSyntaxError( "Error applying selector '%s': %s" % (search, e) )
if tmp_formatter:
result = tmp_formatter(result)
results += result if type(result) == tuple else [[key, result]]
if as_dict:
results = dict(results)
return results
# tree building stuff
def get_pyquery(self, tree=None, page_numbers=[]):
"""
Wrap given tree in pyquery and return.
If no tree supplied, will generate one from given page_numbers, or all page numbers.
"""
if tree is None:
if not page_numbers and self.tree is not None:
tree = self.tree
else:
tree = self.get_tree(page_numbers)
if hasattr(tree, 'getroot'):
tree = tree.getroot()
return PyQuery(tree, css_translator=PDFQueryTranslator())
def get_tree(self, *page_numbers):
"""
Return lxml.etree.ElementTree for entire document, or page numbers given if any.
"""
cache_key = "_".join(map(str, _flatten(page_numbers)))
tree = self._parse_tree_cacher.get(cache_key)
if tree is None:
# set up root
root = parser.makeelement("pdfxml")
if self.doc.info: #not all PDFs seem to have this info section
for k, v in self.doc.info[0].items():
root.set(k, unicode(v))
# add pages
if page_numbers:
pages = [[n, self.get_layout(self.get_page(n))] for n in _flatten(page_numbers)]
else:
pages = enumerate(self.get_layouts())
for n, page in pages:
page = self._xmlize(page)
page.set('page_index', unicode(n))
page.set('page_label', self.doc.get_page_number(n))
root.append(page)
self._clean_text(root)
# wrap root in ElementTree
tree = etree.ElementTree(root)
self._parse_tree_cacher.set(cache_key, tree)
return tree
def _clean_text(self, branch):
"""
Remove text from node if same text exists in its children.
Apply string formatter if set.
"""
if branch.text and self.input_text_formatter:
branch.text = self.input_text_formatter(branch.text)
try:
for child in branch:
self._clean_text(child)
if branch.text and branch.text.find(child.text) >= 0:
branch.text = branch.text.replace(child.text, '', 1)
except TypeError: # not an iterable node
pass
def _xmlize(self, node, root=None):
# collect attributes of current node
tags = self._getattrs(node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', 'linewidth', 'pts', 'index','name','matrix','word_margin' )
if type(node) == LTImage:
tags.update( self._getattrs(node, 'colorspace','bits','imagemask','srcsize','stream','name','pts','linewidth') )
elif type(node) == LTChar:
tags.update( self._getattrs(node, 'fontname','adv','upright','size') )
elif type(node) == LTPage:
tags.update( self._getattrs(node, 'pageid','rotate') )
# create node
branch = parser.makeelement(node.__class__.__name__, tags)
branch.layout = node
self._elements += [branch] # make sure layout keeps state
if root is None:
root = branch
# add text
if hasattr(node, 'get_text'):
branch.text = node.get_text()
# add children if node is an iterable
if hasattr(node, '__iter__'):
last = None
for child in node:
child = self._xmlize(child, root)
if self.merge_tags and child.tag in self.merge_tags:
if branch.text and child.text in branch.text:
continue
elif last is not None and last.tag in self.merge_tags:
last.text += child.text
last.set('_obj_id', last.get('_obj_id')+","+child.get('_obj_id'))
continue
# sort children by bounding boxes
if self.resort:
_append_sorted(root, child, _comp_bbox)
else:
branch.append(child)
last = child
return branch
def _getattrs(self, obj, *attrs):
""" Return dictionary of given attrs on given object, if they exist, processing through filter_value(). """
return dict( (attr, unicode(self._filter_value(getattr(obj, attr)))) for attr in attrs if hasattr(obj, attr))
def _filter_value(self, val):
if self.round_floats:
if type(val) == float:
val = round(val, self.round_digits)
elif hasattr(val, '__iter__'):
val = [self._filter_value(item) for item in val]
return val
# page access stuff
def get_page(self, page_number):
""" Get PDFPage object -- 0-indexed."""
return self._cached_pages(target_page=page_number)
def get_layout(self, page):
""" Get PDFMiner Layout object for given page object or page number. """
if type(page) == int:
page = self.get_page(page)
self.interpreter.process_page(page)
return self.device.get_result()
def get_layouts(self):
""" Get list of PDFMiner Layout objects for each page. """
return (self.get_layout(page) for page in self._cached_pages())
def _cached_pages(self, target_page=-1):
"""
Get a page or all pages from page generator, caching results.
This is necessary because PDFMiner searches recursively for pages,
so we won't know how many there are until we parse the whole document,
which we don't want to do until we need to.
"""
try:
# pdfminer < 20131022
self._pages_iter = self._pages_iter or self.doc.get_pages()
except AttributeError:
# pdfminer >= 20131022
self._pages_iter = self._pages_iter or PDFPage.create_pages(self.doc)
if target_page >= 0:
while len(self._pages) <= target_page:
next = self._pages_iter.next()
if not next:
return None
next.page_number = 0
self._pages += [next]
try:
return self._pages[target_page]
except IndexError:
return None
self._pages += list(self._pages_iter)
return self._pages
if __name__ == "__main__":
import doctest
pdf = PDFQuery("../examples/sample.pdf")
doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS)
add.elif to line 18 then add a for loop before the list and that should fix it if there is any problems contact me for support