why my function doesn't remove objects from final_list? - python-2.7

hey guys:)im sorry for all the code but i feel its necessary for u to see everything..
i tried everything... i hidden prints in the code, debugging for ten times, triple checked the built in methods, and still, the .crawl() method dosnt remove any object from the final_list.
the object of my assignment is to built two classes:
Web_page : holds data of a web page.(the pages come in the form of html files saved in a folder on my desktop. Crawler: compare between pages and hold a list of the uniqe pages---> final_list
import re
import os
def remove_html_tags(s):
tag = False
quote = False
out = ""
for c in s:
if c == '<' and not quote:
tag = True
elif c == '>' and not quote:
tag = False
elif (c == '"' or c == "'") and tag:
quote = not quote
elif not tag:
out = out + c
return out
def lev(s1, s2):
return lev_iter(s1, s2, dict())
def lev_iter(s1, s2, mem):
(i,j) = (len(s1), len(s2))
if (i,j) in mem:
return mem[(i,j)]
s1_low = s1.lower()
s2_low = s2.lower()
if len(s1_low) == 0 or len(s2_low) == 0:
return max(len(s1_low), len(s2_low))
d1 = lev_iter(s1_low[:-1], s2_low, mem) + 1
d2 = lev_iter(s1_low, s2_low[:-1], mem) + 1
last = 0 if s1_low[-1] == s2_low[-1] else 1
d3 = lev_iter(s1_low[:-1], s2_low[:-1], mem) + last
result = min(d1, d2, d3)
mem[(i,j)] = result
return result
def merge_spaces(content):
return re.sub('\s+', ' ', content).strip()
""" A Class that holds data on a Web page """
class WebPage:
def __init__(self, filename):
self.filename = filename
def process(self):
f = open(self.filename,'r')
LINE_lst = f.readlines()
self.info = {}
for i in range(len(LINE_lst)):
LINE_lst[i] = LINE_lst[i].strip(' \n\t')
LINE_lst[i] = remove_html_tags(LINE_lst[i])
lines = LINE_lst[:]
for line in lines:
if len(line) == 0:
LINE_lst.remove(line)
self.body = ' '.join(LINE_lst[1:])
self.title = LINE_lst[0]
f.close()
def __str__(self):
return self.title + '\n' + self.body
def __repr__(self):
return self.title
def __eq__(self,other):
n = lev(self.body,other.body)
k = len(self.body)
m = len(other.body)
return float(n)/max(k,m) <= 0.15
def __lt__(self,other):
return self.title < other.title
""" A Class that crawls the web """
class Crawler:
def __init__(self, directory):
self.folder = directory
def crawl(self):
pages = [f for f in os.listdir(self.folder) if f.endswith('.html')]
final_list = []
for i in range(len(pages)):
pages[i] = WebPage(self.folder + '\\' + pages[i])
pages[i].process()
for k in range(len(final_list)+1):
if k == len(final_list):
final_list.append(pages[i])
elif pages[i] == final_list[k]:
if pages[i] < final_list[k]:
final_list.append(pages[i])
final_list.remove(final_list[k])
break
print final_list
self.pages = final_list
everything works fine besides this freaking line final_list.remove(final_list[k]). help please? whats wrong here?

I'm not sure why your code doesn't work, it's difficult to test it because I don't know what kind of input should end up calling remove().
I suggest following these steps:
Make sure that remove() is called at some point.
remove() relies on your __eq__() method to find the item to remove, so make sure that __eq__() isn't the culprit.
As a side note, you will probably want to replace this:
self.folder + '\\' + pages[i]
with:
import os.path
# ...
os.path.join(self.folder, page[i])
This simple change should make your script work on all operating systems, rather than on Windows only. (GNU/Linux, Mac OS and other Unix-like OS use “/” as path separator.)
Please also consider replacing loops of this form:
for i in range(len(sequence)):
# Do something with sequence[i]
with:
for item in sequence:
# Do something with item
If you need the item index, use enumerate():
for i, item in enumerate(sequence):

Related

PYTHON: Searching two files for common lines and collecting their contents into set

I have a task to compare two comma separated files. If the first two columns exist in both files, then I have to collect the remaining columns into set from both the files in my results.
If I have the following two files:
a.txt
1,2,3,4
2,4,7,5
3,8,6,7
4,9,5,6
3,8,7,2
b.txt
1,2,4,6
2,3,6,5
3,8,9,2
4,9,6,9
3,5,2,3
6,2,7,3
I want to get the results:
1,2(3,4,4,6)
3,8(6,7,7,2,9,2)
4,9(5,6,6,9)
Is there a more efficient way to implement it? especially as the files maybe large and not fit in the available memory of my computer.
The following is my implement.
KEYNOTFOUND = '<KEYNOTFOUND>'
class dict_cls(object):
#staticmethod
def dict_diff(first, second):
diff = {}
for key in first.keys():
if (not second.has_key(key)):
diff[key] = (first[key], KEYNOTFOUND)
elif (first[key] != second[key]):
diff[key] = (first[key], second[key])
for key in second.keys():
if (not first.has_key(key)):
diff[key] = (KEYNOTFOUND, second[key])
return diff
if __name__ == '__main__':
dict1 = {(1,2):(3,4),(2,4):(7,5),(3,8):(6,7),(4,9):(5,6),(3,8):(7,2)}
dict2 = {(1,2):(4,6),(2,3):(6,5),(3,8):(9,2),(4,9):(6,9),(3,5):(2,3),(6,2):(7,3)}
print dict_cls.dict_diff(dict1, dict2)
import datetime
class FindCommKey(object):
def __init__(self):
self.combine = {}
self.counter = {}
self.result = {}
def find_common_key(self, target_file):
with open(target_file, 'r+') as file_handler:
for line in file_handler:
print(line, end='')
__line = list(map(int, line.strip().split(',')))
key, value = tuple(__line[:2]), __line[2:]
if key in self.combine:
self.combine[key] = self.combine[key] + value
else:
self.combine[key] = value
if key in self.counter:
self.counter[key] = self.counter[key] + 1
else:
self.counter[key] = 1
for k1, v1 in self.counter.items():
if v1 >= 2:
self.result[k1] = self.combine[k1]
print()
return self.result
if __name__ == '__main__':
files = ['ds1.txt', 'ds2.txt']
print("Started at: {}{}".format(datetime.datetime.now(), '\n'))
print('Initial data:')
fck = FindCommKey()
for f in files:
fck.find_common_key(f)
print("Write to dic finished at: {}{}".format(datetime.datetime.now(), '\n'))
print('Result set:')
for k, v in fck.result.items():
print(','.join(map(str, k)), tuple(v))
print("{}Finished at: {}".format('\n', datetime.datetime.now()))

How to debug with gdb with eigen math library

I used code::blocks (CB) or visual studio (VS) for C++ programs with eigen library. However, when it comes to debugging, I cannot see contents of arrays, matrices etc. I checked following posts:
Using GDB with Eigen C++ library
I am not a C++ expert but I could get that I need something called as printer.
https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug/gdb/printers.py has the source code. However I do not know how to use this source code to debug with gdb with eigen library in CB or VS. Any ideas how to do this?
Update:
vsoftco mentioned a webpage https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug and that has python printers for gdb for CB and VS. If anyone knows how to use them to see contents of arrays of eigen library, please comment.
The Eigen::Matrix class is not an aggregate, so you cannot just see its content with a debugger. However, you should be able to step in with a debugger, and can use cout or other methods to display the content.
The link you mentioned is a python plugin for gdb, to allow gdb to print the content of Eigen types. But as you use VS (which has its internal debugger and doesn't use gdb), there is no reason why it would work in your case.
You can try switching to MinGW and g++/gdb, or can check this link How can I use GDB from inside Visual Studio C++ (Express) to debug my GCC Makefile projects? for some advice about installing gdb under VS.
PS: it seems that a solution for VS also exists,
https://android.googlesource.com/platform/external/eigen/+/b015e75e8c7ba1ab4ddb91e9372a57e76f3fd159/debug
The python printers for gdb works for me. Notice that the printers.py script is written in Python 2.7 and your gdb is probably running python 3.5 or later... Use the 2to3 converter or simply copy this to a new file called printers3.py:
# -*- coding: utf-8 -*-
# This file is part of Eigen, a lightweight C++ template library
# for linear algebra.
#
# Copyright (C) 2009 Benjamin Schindler <bschindler#inf.ethz.ch>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Pretty printers for Eigen::Matrix
# This is still pretty basic as the python extension to gdb is still pretty basic.
# It cannot handle complex eigen types and it doesn't support any of the other eigen types
# Such as quaternion or some other type.
# This code supports fixed size as well as dynamic size matrices
# To use it:
#
# * Create a directory and put the file as well as an empty __init__.py in
# that directory.
# * Create a ~/.gdbinit file, that contains the following:
# python
# import sys
# sys.path.insert(0, '/path/to/eigen/printer/directory')
# from printers3 import register_eigen_printers
# register_eigen_printers (None)
# end
import gdb
import re
import itertools
class EigenMatrixPrinter:
"Print Eigen Matrix or Array of some kind"
def __init__(self, variety, val):
"Extract all the necessary information"
# Save the variety (presumably "Matrix" or "Array") for later usage
self.variety = variety
# The gdb extension does not support value template arguments - need to extract them by hand
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
self.type = type.unqualified().strip_typedefs()
tag = self.type.tag
regex = re.compile('\<.*\>')
m = regex.findall(tag)[0][1:-1]
template_params = m.split(',')
template_params = [x.replace(" ", "") for x in template_params]
if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1':
self.rows = val['m_storage']['m_rows']
else:
self.rows = int(template_params[1])
if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1':
self.cols = val['m_storage']['m_cols']
else:
self.cols = int(template_params[2])
self.options = 0 # default value
if len(template_params) > 3:
self.options = template_params[3];
self.rowMajor = (int(self.options) & 0x1)
self.innerType = self.type.template_argument(0)
self.val = val
# Fixed size matrices have a struct as their storage, so we need to walk through this
self.data = self.val['m_storage']['m_data']
if self.data.type.code == gdb.TYPE_CODE_STRUCT:
self.data = self.data['array']
self.data = self.data.cast(self.innerType.pointer())
class _iterator:
def __init__ (self, rows, cols, dataPtr, rowMajor):
self.rows = rows
self.cols = cols
self.dataPtr = dataPtr
self.currentRow = 0
self.currentCol = 0
self.rowMajor = rowMajor
def __iter__ (self):
return self
def __next__(self):
row = self.currentRow
col = self.currentCol
if self.rowMajor == 0:
if self.currentCol >= self.cols:
raise StopIteration
self.currentRow = self.currentRow + 1
if self.currentRow >= self.rows:
self.currentRow = 0
self.currentCol = self.currentCol + 1
else:
if self.currentRow >= self.rows:
raise StopIteration
self.currentCol = self.currentCol + 1
if self.currentCol >= self.cols:
self.currentCol = 0
self.currentRow = self.currentRow + 1
item = self.dataPtr.dereference()
self.dataPtr = self.dataPtr + 1
if (self.cols == 1): #if it's a column vector
return ('[%d]' % (row,), item)
elif (self.rows == 1): #if it's a row vector
return ('[%d]' % (col,), item)
return ('[%d,%d]' % (row, col), item)
def children(self):
return self._iterator(self.rows, self.cols, self.data, self.rowMajor)
def to_string(self):
return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else "ColMajor", self.data)
class EigenQuaternionPrinter:
"Print an Eigen Quaternion"
def __init__(self, val):
"Extract all the necessary information"
# The gdb extension does not support value template arguments - need to extract them by hand
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
self.type = type.unqualified().strip_typedefs()
self.innerType = self.type.template_argument(0)
self.val = val
# Quaternions have a struct as their storage, so we need to walk through this
self.data = self.val['m_coeffs']['m_storage']['m_data']['array']
self.data = self.data.cast(self.innerType.pointer())
class _iterator:
def __init__ (self, dataPtr):
self.dataPtr = dataPtr
self.currentElement = 0
self.elementNames = ['x', 'y', 'z', 'w']
def __iter__ (self):
return self
def __next__(self):
element = self.currentElement
if self.currentElement >= 4: #there are 4 elements in a quanternion
raise StopIteration
self.currentElement = self.currentElement + 1
item = self.dataPtr.dereference()
self.dataPtr = self.dataPtr + 1
return ('[%s]' % (self.elementNames[element],), item)
def children(self):
return self._iterator(self.data)
def to_string(self):
return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data)
def build_eigen_dictionary ():
pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val)
pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val)
pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val)
def register_eigen_printers(obj):
"Register eigen pretty-printers with objfile Obj"
if obj == None:
obj = gdb
obj.pretty_printers.append(lookup_function)
def lookup_function(val):
"Look-up and return a pretty-printer that can print va."
type = val.type
if type.code == gdb.TYPE_CODE_REF:
type = type.target()
type = type.unqualified().strip_typedefs()
typename = type.tag
if typename == None:
return None
for function in pretty_printers_dict:
if function.search(typename):
return pretty_printers_dict[function](val)
return None
pretty_printers_dict = {}
build_eigen_dictionary ()

setting an attribute at create retrieves None value - Python

So I have an Article class that models the articles in a store. When I create a new article, I want it to have an EAN 13 code. So I initialize the article with a 12 digits code and use the check_ean13() funtion to retrieve the control digit. It works but seems like in any moment, when the object is created, rewrite the ean13 attribute and replaces it for None. Any ideas?
Main
if __name__ == "__main__":
# create article
art1 = Article("123456789087", "Article 1", 145.6, 200.0)
print art1
print art1.get_ean13()
class Article
class Article:
def __init__(self, cod, art_name, cost, listprice):
self.ean13 = self.set_ean13(cod)
self.art_name = art_name
self.cost = cost
self.listprice = listprice
self.commission = None
self.promotion=[]
def get_ean13(self):
return self.ean13
def set_ean13(self,cod):
cd = self.check_ean13(cod)
ean13 = cod + str(cd)
self.ean13=ean13
def check_ean13(self, code):
checksum = 0
for i, digit in enumerate(reversed(code)):
checksum += int(digit) * 3 if (i % 2 == 0) else int(digit)
return (10 - (checksum % 10)) % 10
output:
None - Article 1 list price: 400.0
None
self.ean13 = self.set_ean13(cod)
set_ean13 doesn't return anything, so you're effectively doing self.ean13 = None here. Just call the method without assigning the result.
self.set_ean13(cod)

Display tagged list as indented tree grid

I am writing a logger which records the level of the entries.
To make it simple, let's say it logs entries like <level> <message>.
I am now trying to write a log viewer which formats the logfile "nicely" as an indented tree grid.
For example is the raw log file contains:
0 entry1
0 entry2
1 entry3
2 entry4
3 entry5
2 entry6
0 entry7
It should output:
entry1
entry2
└entry3
├entry4
│└entry5
└entry6
entry7
My first steps were
Converting the list into a tree
Recursively print the tree
This worked with one single exception: I cannot figure out how I can pass the information that - referring to the example - before entry5 comes the │ sign to display that the previous level continues after the sub-levels.
So any hint, how to come from the list to the desired output is welcome.
Finally got it:
class LogViewer(LogFile):
"""
Formats raw log file contents nicely
and thus makes it human-readable
"""
__down = False
class EntryTreeNode():
"""
A minimal entry wrapper
"""
def __init__(self, string):
"""
Constructor
"""
lst = string.split(LogEntry.colsep())
if len(lst) != 6:
raise Exception('Invalid entry: ' + string)
else:
self.DATE = datetime.strptime(lst[0], LogEntry.timeformat())
self.ERRLVL = ErrLvlType(lst[1])
self.USER = lst[2]
self.CALLER = lst[3]
self.OFFSET = int(lst[4])
self.MSG = lst[5]
self.tag = self.OFFSET
self.children = []
self.pre = '[' + datetime.strftime(self.DATE, LogEntry.timeformat()) + ']\t' \
+ str(self.ERRLVL) + '\t' \
+ str(self.USER) + '\t'
self.post = str(self.CALLER) + ' \t' + str(self.MSG)
def __repr__(self):
return str(self.tag)
def __init__(self, path):
"""
Constructor
"""
super().__init__(path)
#property
def __sym_last(self):
"""
Returns the symbol for a last entry
"""
return '┌' if self.__down else '└'
#property
def __sym_mid(self):
"""
Returns the symbol for a middle entry
"""
return '├'
#property
def __sym_follow(self):
"""
Returns the symbol for a following entry
"""
return '│'
def __mktree(self, lst):
"""
Converts a log entry list into a tree
"""
roots = []
def children(root, lst):
result = []
while lst:
curr = lst.pop()
if curr.tag == root.tag + 1:
curr.children = children(curr, lst)
result.append(curr)
else:
lst.append(curr)
break
return result
while lst:
curr = lst.pop()
if curr.tag == 0:
curr.children = children(curr, lst)
roots.append(curr)
return roots
def __print_tree(self, root, offset='', prefix='', last=True):
"""
Prints a log entry tree
"""
print(root.pre + offset + prefix + root.post)
if last:
offset += ' '
else:
offset += self.__sym_follow
for i in range(0, len(root.children)):
if i == len(root.children)-1:
prefix = self.__sym_last
last = True
else:
prefix = self.__sym_mid
last = False
self.__print_tree(root.children[i], offset, prefix, last)
def display(self, reverse=False):
"""
Displays the log file nicely
"""
self.__down = reverse
entries = reversed(self.dump()) if reverse else self.dump()
entries = [self.EntryTreeNode(e) for e in entries]
tree = self.__mktree(entries)
for root in tree:
self.__print_tree(root)

How to solve "AttributeError: 'QPDFDocument' object has no attribute 'initialize' " in python

I have got the following error when I'm trying to execute example in pdfquery.
File "C:\workspace-php\test\pdfminer\pdfqueries\pdfquery.py", line 187, in init
doc.initialize()
AttributeError: 'QPDFDocument' object has no attribute 'initialize'
I'm trying to solve this but still i dont get any solution for that.it would be appreciated if some one can help me to solve this.
class PDFQuery(object):
def __init__(self, file,
merge_tags=('LTChar', 'LTAnno'),
round_floats=True,
round_digits=3,
input_text_formatter=None,
normalize_spaces=True,
resort=True,
parse_tree_cacher=None,
):
# store input
self.merge_tags = merge_tags
self.round_floats = round_floats
self.round_digits = round_digits
self.resort = resort
# set up input text formatting function, if any
if input_text_formatter:
self.input_text_formatter = input_text_formatter
elif normalize_spaces:
r = re.compile(r'\s+')
self.input_text_formatter = lambda s: re.sub(r, ' ', s)
else:
self.input_text_formatter = None
# open doc
if not hasattr(file, 'read'):
try:
file = open(file, 'rb')
except TypeError:
raise TypeError("File must be file object or filepath string.")
parser = PDFParser(file)
if hasattr(QPDFDocument, 'set_parser'):
# pdfminer < 20131022
doc = QPDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
else:
# pdfminer >= 20131022
doc = QPDFDocument(parser)
parser.set_document(doc)
doc.initialize()
self.doc = doc
self.parser = parser
self.tree = None
self.pq = None
self.file = file
if parse_tree_cacher:
self._parse_tree_cacher = parse_tree_cacher
self._parse_tree_cacher.set_hash_key(self.file)
else:
self._parse_tree_cacher = DummyCache()
# set up layout parsing
rsrcmgr = PDFResourceManager()
laparams = LAParams(all_texts=True, detect_vertical=True)
self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
# caches
self._pages = []
self._pages_iter = None
self._elements = []
def load(self, *page_numbers):
"""
Load etree and pyquery object for entire document, or given page numbers (ints or lists).
After this is called, objects are available at pdf.tree and pdf.pq.
>>> pdf.load()
>>> pdf.tree
<lxml.etree._ElementTree object at ...>
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
>>> pdf.load(1)
>>> pdf.pq('LTPage')
[<LTPage>]
>>> pdf.load(0,1)
>>> pdf.pq('LTPage')
[<LTPage>, <LTPage>]
"""
self.tree = self.get_tree(*_flatten(page_numbers))
self.pq = self.get_pyquery(self.tree)
def extract(self, searches, tree=None, as_dict=True):
"""
>>> foo = pdf.extract( [ ['pages', 'LTPage'] ])
>>> foo
{'pages': [<LTPage>, <LTPage>]}
>>> pdf.extract( [ ['bar', ':in_bbox("100,100,400,400")'] ], foo['pages'][0])
{'bar': [<LTTextLineHorizontal>, <LTTextBoxHorizontal>,...
"""
if self.tree is None or self.pq is None:
self.load()
pq = PyQuery(tree, css_translator=PDFQueryTranslator()) if tree is not None else self.pq
if tree is None:
pq = self.pq
else:
pq = PyQuery(tree, css_translator=PDFQueryTranslator())
results = []
formatter = None
parent = pq
for search in searches:
if len(search) < 3:
search = list(search) + [formatter]
key, search, tmp_formatter = search
if key == 'with_formatter':
if isinstance(search, basestring): # is a pyquery method name, e.g. 'text'
formatter = lambda o, search=search: getattr(o, search)()
elif hasattr(search, '__call__') or not search: # is a method, or None to end formatting
formatter = search
else:
raise TypeError("Formatter should be either a pyquery method name or a callable function.")
elif key == 'with_parent':
parent = pq(search) if search else pq
else:
try:
result = parent("*").filter(search) if hasattr(search, '__call__') else parent(search)
except cssselect.SelectorSyntaxError, e:
raise cssselect.SelectorSyntaxError( "Error applying selector '%s': %s" % (search, e) )
if tmp_formatter:
result = tmp_formatter(result)
results += result if type(result) == tuple else [[key, result]]
if as_dict:
results = dict(results)
return results
# tree building stuff
def get_pyquery(self, tree=None, page_numbers=[]):
"""
Wrap given tree in pyquery and return.
If no tree supplied, will generate one from given page_numbers, or all page numbers.
"""
if tree is None:
if not page_numbers and self.tree is not None:
tree = self.tree
else:
tree = self.get_tree(page_numbers)
if hasattr(tree, 'getroot'):
tree = tree.getroot()
return PyQuery(tree, css_translator=PDFQueryTranslator())
def get_tree(self, *page_numbers):
"""
Return lxml.etree.ElementTree for entire document, or page numbers given if any.
"""
cache_key = "_".join(map(str, _flatten(page_numbers)))
tree = self._parse_tree_cacher.get(cache_key)
if tree is None:
# set up root
root = parser.makeelement("pdfxml")
if self.doc.info: #not all PDFs seem to have this info section
for k, v in self.doc.info[0].items():
root.set(k, unicode(v))
# add pages
if page_numbers:
pages = [[n, self.get_layout(self.get_page(n))] for n in _flatten(page_numbers)]
else:
pages = enumerate(self.get_layouts())
for n, page in pages:
page = self._xmlize(page)
page.set('page_index', unicode(n))
page.set('page_label', self.doc.get_page_number(n))
root.append(page)
self._clean_text(root)
# wrap root in ElementTree
tree = etree.ElementTree(root)
self._parse_tree_cacher.set(cache_key, tree)
return tree
def _clean_text(self, branch):
"""
Remove text from node if same text exists in its children.
Apply string formatter if set.
"""
if branch.text and self.input_text_formatter:
branch.text = self.input_text_formatter(branch.text)
try:
for child in branch:
self._clean_text(child)
if branch.text and branch.text.find(child.text) >= 0:
branch.text = branch.text.replace(child.text, '', 1)
except TypeError: # not an iterable node
pass
def _xmlize(self, node, root=None):
# collect attributes of current node
tags = self._getattrs(node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', 'linewidth', 'pts', 'index','name','matrix','word_margin' )
if type(node) == LTImage:
tags.update( self._getattrs(node, 'colorspace','bits','imagemask','srcsize','stream','name','pts','linewidth') )
elif type(node) == LTChar:
tags.update( self._getattrs(node, 'fontname','adv','upright','size') )
elif type(node) == LTPage:
tags.update( self._getattrs(node, 'pageid','rotate') )
# create node
branch = parser.makeelement(node.__class__.__name__, tags)
branch.layout = node
self._elements += [branch] # make sure layout keeps state
if root is None:
root = branch
# add text
if hasattr(node, 'get_text'):
branch.text = node.get_text()
# add children if node is an iterable
if hasattr(node, '__iter__'):
last = None
for child in node:
child = self._xmlize(child, root)
if self.merge_tags and child.tag in self.merge_tags:
if branch.text and child.text in branch.text:
continue
elif last is not None and last.tag in self.merge_tags:
last.text += child.text
last.set('_obj_id', last.get('_obj_id')+","+child.get('_obj_id'))
continue
# sort children by bounding boxes
if self.resort:
_append_sorted(root, child, _comp_bbox)
else:
branch.append(child)
last = child
return branch
def _getattrs(self, obj, *attrs):
""" Return dictionary of given attrs on given object, if they exist, processing through filter_value(). """
return dict( (attr, unicode(self._filter_value(getattr(obj, attr)))) for attr in attrs if hasattr(obj, attr))
def _filter_value(self, val):
if self.round_floats:
if type(val) == float:
val = round(val, self.round_digits)
elif hasattr(val, '__iter__'):
val = [self._filter_value(item) for item in val]
return val
# page access stuff
def get_page(self, page_number):
""" Get PDFPage object -- 0-indexed."""
return self._cached_pages(target_page=page_number)
def get_layout(self, page):
""" Get PDFMiner Layout object for given page object or page number. """
if type(page) == int:
page = self.get_page(page)
self.interpreter.process_page(page)
return self.device.get_result()
def get_layouts(self):
""" Get list of PDFMiner Layout objects for each page. """
return (self.get_layout(page) for page in self._cached_pages())
def _cached_pages(self, target_page=-1):
"""
Get a page or all pages from page generator, caching results.
This is necessary because PDFMiner searches recursively for pages,
so we won't know how many there are until we parse the whole document,
which we don't want to do until we need to.
"""
try:
# pdfminer < 20131022
self._pages_iter = self._pages_iter or self.doc.get_pages()
except AttributeError:
# pdfminer >= 20131022
self._pages_iter = self._pages_iter or PDFPage.create_pages(self.doc)
if target_page >= 0:
while len(self._pages) <= target_page:
next = self._pages_iter.next()
if not next:
return None
next.page_number = 0
self._pages += [next]
try:
return self._pages[target_page]
except IndexError:
return None
self._pages += list(self._pages_iter)
return self._pages
if __name__ == "__main__":
import doctest
pdf = PDFQuery("../examples/sample.pdf")
doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS)
add.elif to line 18 then add a for loop before the list and that should fix it if there is any problems contact me for support