How to extract text from PDF uploaded in Google App Engine using PyPDF2? - python-2.7

Is there any way to extract text and documentInfo from a PDF file uploaded via Google App Engine? I want to use PyPDF2, and my code is this:
pdf_file = self.request.POST['file'].file
pdf_reader = pypdf.PdfFileReader(pdf_file)
This gives me error:
Traceback (most recent call last):
....
File "/myrepo/myproj/main.py", line 154, in post
pdf_text = pypdf.PdfFileReader(pdf_file)
File "lib/PyPDF2/pdf.py", line 649, in __init__
self.read(stream)
File "lib/PyPDF2/pdf.py", line 1100, in read
raise utils.PdfReadError, "EOF marker not found"
PdfReadError: EOF marker not found
It gives this error for any file, even for files that can be read successfully from disk via open(filename, 'r').
Am I missing something? Thanks in advance!

The solution is to use get_uploads from blobstore_handlers.BlobstoreUploadHandler:
from cStringIO import StringIO

from google.appengine.ext import blobstore
from google.appengine.ext.webapp import blobstore_handlers
import PyPDF2
class UploadHandler(blobstore_handlers.BlobstoreUploadHandler):
    """Upload handler that opens a PDF stored in the Blobstore with PyPDF2."""

    def post(self):
        """Load the first blob uploaded under the 'file' field into PyPDF2.

        Responds with HTTP 400 when the request carries no upload for
        that field.
        """
        upload_files = self.get_uploads('file')
        if not upload_files:
            # Nothing was uploaded under 'file'; indexing [0] would raise
            # IndexError, so reject the request explicitly instead.
            self.error(400)
            return
        blob_info = upload_files[0]
        blob_reader = blobstore.BlobReader(blob_info)
        # Buffer the whole blob in memory and hand PyPDF2 a file-like object.
        blob_content = StringIO(blob_reader.read())
        pdf_info = PyPDF2.PdfFileReader(blob_content)

Related

Extract images in .jpg format from binary using unpickle (python)

I am trying to extract images from CIFAR-10 data binary file, i.e. data_batch_1.bin as .jpg.
But while doing unpickle I am getting an error.
My code is:
from PIL import Image
import numpy
def unpickle(file):
    """Open the file at path `file` and return the object pickled in it.

    NOTE(review): pickle.load only understands pickle streams, so any
    other file format fails inside it. Presumably the CIFAR-10 "-bin"
    batches are raw binary records rather than pickles -- confirm against
    the dataset description.
    NOTE(review): unpickling executes code chosen by the file's author;
    only use this on trusted files.
    """
    import pickle
    # Binary mode: a pickle stream is bytes, not text.
    with open(file, 'rb') as fo:
        # NOTE(review): the local name `dict` shadows the builtin.
        dict = pickle.load(fo)
    return dict
def save_as_image(img_flat, out_path=None):
    """Build a 32x32 RGB image from a flat pixel vector and show or save it.

    Args:
        img_flat: 1-D numpy array whose first 3072 entries hold the red,
            green and blue planes in that order, 1024 values each.
            Assumes a dtype PIL accepts (e.g. uint8) -- TODO confirm.
        out_path: optional file name to write the image to; PIL chooses
            the format from the extension (e.g. ".jpg"). When omitted the
            image is only displayed, which is the original behaviour.
    """
    plane = 32 * 32
    # Each consecutive run of 1024 entries is one colour channel of the
    # 32x32 image.
    channels = [
        img_flat[i * plane:(i + 1) * plane].reshape((32, 32))
        for i in range(3)
    ]
    # Stack the three planes along a new last axis -> shape (32, 32, 3).
    img = numpy.dstack(channels)
    im = Image.fromarray(img)
    if out_path is None:
        im.show()
    else:
        im.save(out_path)
abc = unpickle("/home/ubuntu/visit/cifar-10-batches-bin/data_batch_1.bin")
#print(abc)
data = abc["data"]
save_as_image(data[0])
I am getting an error as follows:
Traceback (most recent call last):
File "load.py", line 24, in <module>
abc = unpickle("/home/ubuntu/visit/cifar-10-batches-bin/data_batch_1.bin")
File "load.py", line 7, in unpickle
dict = pickle.load(fo)
File "/usr/lib/python2.7/pickle.py", line 1378, in load
return Unpickler(file).load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
KeyError: '\x06'
What could be the cause of this issue?

File writing in Django keeps having IOError

I'm running my app locally and I'm currently having an IOError during my file creation from the database. I am using Django 1.10, MongoDB as my database, and Celery 4.0.2 for my background tasks. The problem occurs in the tasks.py since that is where I access the db then store it in my django subfolder 'analysis_samples'.
Here is the traceback:
[2017-04-15 15:31:08,798: ERROR/PoolWorker-2] Task tasks.process_sample_input[0619194e-4300-4a1d-91b0-20766e048c4a] raised unexpected: IOError(2, 'No such file or directory')
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/celery/app/trace.py", line 367, in trace_task
R = retval = fun(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/celery/app/trace.py", line 622, in __protected_call__
return self.run(*args, **kwargs)
File "/home/user/django_apps/myapp/analysis/tasks.py", line 218, in process_sample_input
with open(sample_file_path, "w+") as f:
IOError: [Errno 2] No such file or directory: u'/home/user/django_apps/myapp/myapp/analysis_samples/58f1cc3c45015d127c3d68c1'
And here is the snippet of tasks.py:
from django.core.files import File
sys.path.append(settings.ANALYSIS_SAMPLES)
import base64
import os, sys
#shared_task(name='tasks.process_sample_input')
def process_sample_input(instance_id):
instance = Sample.objects.get(pk=instance_id)
#many code here..
try:
conn=pymongo.MongoClient(settings.MONGO_HOST, settings.MONGO_PORT)
db = conn.thugfs #connect to GridFS db of thug
thugfs_db = GridFS(db)
except pymongo.errors.ConnectionFailure, e:
logger.error("Could not connect to ThugFS MongoDB: %s" % e)
sample_file_folder = settings.ANALYSIS_SAMPLES
for sample_fs_id in sample_fs_ids:
sample_file = thugfs_db.get(ObjectId(sample_fs_id)).read()
sample_file = base64.b64decode(sample_file) #decode file from database
sample_file_path = os.path.join(sample_file_folder, sample_fs_id)
with open(sample_file_path, "w+") as f:
fileOut = File(f)
fileOut.write(sample_file)
settings.py:
ANALYSIS_SAMPLES = os.path.join(BASE_DIR, 'myapp/analysis_samples')
Can anyone see the point that caused the error? Any help will be appreciated.

Got EOFError during loading doc2vec model

I could not load a doc2vec model on my computer; I got the following error. However, when I load the same model on other computers, I can use it. Therefore, I know the model was built correctly.
What should I do?
This is the code:
# coding: utf-8
from gensim.models.doc2vec import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import LabeledSentence
import os
import pickle
pth='/home/fatemeh/Step2/input-output/model/iterator'
model= Doc2Vec.load(pth+'/my_model.doc2vec')
This is the error:
Traceback (most recent call last):
File "CreateAnnoyIndex.py", line 16, in <module>
model= Doc2Vec.load(pth+'/my_model.doc2vec')
File "/usr/local/lib/python2.7/dist-packages/gensim-0.13.3-py2.7-linux-x86_64.egg/gensim/models/word2vec.py", line 1762, in load
model = super(Word2Vec, cls).load(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/gensim-0.13.3-py2.7-linux-x86_64.egg/gensim/utils.py", line 248, in load
obj = unpickle(fname)
File "/usr/local/lib/python2.7/dist-packages/gensim-0.13.3-py2.7-linux-x86_64.egg/gensim/utils.py", line 912, in unpickle
return _pickle.loads(f.read())
EOFError
I think your model file causes the problem. Are you checking with the same model, i.e. one built in the same way? Please see this page.

Log warning from Selenium on Django [duplicate]

Whenever I try to construct a string based on self.live_server_url, I get python TypeError messages. For example, I've tried the following string constructions (form 1 & 2 below), but I experience the same TypeError. My desired string is the Live Server URL with "/lists" appended. NOTE: the actual test does succeed to create a server and I can manually access the server, and more specifically, I can manually access the exact URL that I'm trying to build programmatically (e.g. 'http://localhost:8081/lists').
TypeErrors occur with these string constructions.
# FORM 1
lists_live_server_url = '%s%s' % (self.live_server_url, '/lists')
# FORM 2
lists_live_server_url = '{0}{1}'.format(self.live_server_url, '/lists')
self.browser.get(lists_live_server_url)
There is no python error with this form (nothing appended to string), albeit my test fails (as I would expect since it isn't accessing /lists).
self.browser.get(self.live_server_url)
Here is the python error that I'm getting.
/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/bin/python3.4 /Applications/PyCharm.app/Contents/helpers/pycharm/django_test_manage.py test functional_tests.lists_tests.LiveNewVisitorTest.test_can_start_a_list_and_retrieve_it_later /Users/myusername/PycharmProjects/mysite_proj
Testing started at 11:55 AM ...
Creating test database for alias 'default'...
Traceback (most recent call last):
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/wsgiref/handlers.py", line 137, in run
self.result = application(self.environ, self.start_response)
File "/usr/local/lib/python3.4/site-packages/django/test/testcases.py", line 1104, in __call__
return super(FSFilesHandler, self).__call__(environ, start_response)
File "/usr/local/lib/python3.4/site-packages/django/core/handlers/wsgi.py", line 189, in __call__
response = self.get_response(request)
File "/usr/local/lib/python3.4/site-packages/django/test/testcases.py", line 1087, in get_response
return self.serve(request)
File "/usr/local/lib/python3.4/site-packages/django/test/testcases.py", line 1099, in serve
return serve(request, final_rel_path, document_root=self.get_base_dir())
File "/usr/local/lib/python3.4/site-packages/django/views/static.py", line 54, in serve
fullpath = os.path.join(document_root, newpath)
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/posixpath.py", line 82, in join
path += b
TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'
Am I unknowingly attempting to modify the live_server_url, which is leading to these TypeErrors? How could I programmatically build a string of live_server_url + "/lists"?
Here is the test that I am attempting...
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from django.test import LiveServerTestCase
class LiveNewVisitorTest(LiveServerTestCase):
    """Functional test that drives Chrome against Django's live test server."""

    def setUp(self):
        """Start a Chrome session with a 3-second implicit wait."""
        self.browser = webdriver.Chrome()
        self.browser.implicitly_wait(3)

    def tearDown(self):
        """End the browser session after each test."""
        # quit() shuts down the whole WebDriver session; close() would only
        # close the current window and leave the driver running.
        self.browser.quit()

    def test_can_start_a_list_and_retrieve_it_later(self):
        """Open the live server URL and expect 'To-Do' in the title and <h1>."""
        # Earlier attempts at building the URL, kept for reference:
        #self.browser.get('http://localhost:8000/lists')
        #self.browser.get('http://www.google.com')
        #lists_live_server_url = '%s%s' % (self.live_server_url, '/lists')
        #lists_live_server_url = '{0}{1}'.format(self.live_server_url, '/lists')
        lists_live_server_url = self.live_server_url
        self.browser.get(lists_live_server_url)
        self.assertIn('To-Do', self.browser.title)
        header_text = self.browser.find_element_by_tag_name('h1').text
        self.assertIn('To-Do', header_text)
See this discussion on Reddit featuring the same error Traceback.
Basically, this is not a problem with anything within the Selenium tests but rather with your project's static file configuration.
From your question, I believe the key line within the Traceback is:
File "/usr/local/lib/python3.4/site-packages/django/views/static.py", line 54, in serve
fullpath = os.path.join(document_root, newpath)
This line indicates that an unsuccessful os.path.join is being attempted within django.views.static.
Set STATIC_ROOT in your project's settings.py file and you should be good.
Using StaticLiveServerTestCase instead may also help.

Werkzeug test client and utf-8

Here's the code. When I send both fields it fails.
import unittest
class UnicodeTestCase(unittest.TestCase):
def test_unicode(self):
from cStringIO import StringIO
from flask import Flask, request
app = Flask(__name__)
app.config['TESTING'] = True
#app.route('/', methods=["POST"])
def test_view():
print request.values, request.files
return "OK"
file = (StringIO("0" * 1000), "filename.txt")
string = u"∆_∆"
client = app.test_client(use_cookies=False)
self.assertEquals(200, client.post('/', data={'file': file}).status_code)
self.assertEquals(200, client.post('/', data={'string': string}).status_code)
self.assertEquals(200, client.post('/', data={'file': file, 'string': string}).status_code)
On the last assert it fails with:
Error
Traceback (most recent call last):
File "/Users/user1/tests/test_uni.py", line 108, in test_unicode
self.assertEquals(200, client.post('/', data={'file': file, 'string': string}).status_code)
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/werkzeug/test.py", line 771, in post
return self.open(*args, **kw)
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/flask/testing.py", line 108, in open
follow_redirects=follow_redirects)
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/werkzeug/test.py", line 725, in open
environ = args[0].get_environ()
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/werkzeug/test.py", line 535, in get_environ
stream_encode_multipart(values, charset=self.charset)
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/werkzeug/test.py", line 104, in stream_encode_multipart
write('\r\n\r\n' + value)
File "/Users/user1/.virtualenvs/test/lib/python2.7/site-packages/werkzeug/test.py", line 71, in write
write_binary(string.encode(charset))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
It works fine when I send both fields with Postman (a Google Chrome extension).
Is this expected, and should I wrap the unicode fields in base64 or something similar? Or is it a bug in the werkzeug test client?
This looks like a test client bug. I have already run into another issue where a direct request works fine but the test client gives an unexpected result.
For me, at https://github.com/mitsuhiko/werkzeug/blob/master/werkzeug/test.py#L71 the string argument has type str. When only the string is sent, this method is not called; when only the file is sent, it is never called with your string. As a temporary workaround (Python 2 only) you can replace this method with the following:
def write(string):
    """Pass `string` to write_binary, encoding it with `charset` first
    unless it is already a `str` (a byte string on Python 2)."""
    payload = string if isinstance(string, str) else string.encode(charset)
    write_binary(payload)
I created a bug report for your example: https://github.com/mitsuhiko/flask/issues/973.