Cognitive face API Python 2.7.x - Bad Image Url - python-2.7

Trying to run the following program, which posts the faces stored in a folder using an image URL, but I get an error.
The API I'm using is Microsoft's Cognitive Face API v1.0.
Please help
import sys
import os, time
import cognitive_face as CF
from global_variables import personGroupId
import urllib
import sqlite3

Key = '---some key----'  # I removed the key here for security purposes
CF.Key.set(Key)

def get_person_id():
    person_id = ''
    extractId = str(sys.argv[1])[-2:]
    connect = sqlite3.connect("Face-DataBase")
    c = connect.cursor()
    cmd = "SELECT * FROM Students WHERE ID = " + extractId
    c.execute(cmd)
    row = c.fetchone()
    person_id = row[3]
    connect.close()
    return person_id

if len(sys.argv) is not 1:
    currentDir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    imageFolder = os.path.join(currentDir, "dataset/" + str(sys.argv[1]))
    person_id = get_person_id()
    for filename in os.listdir(imageFolder):
        if filename.endswith(".jpg"):
            print(filename)
            imgurl = urllib.request.pathname2url(os.path.join(imageFolder, filename))
            res = CF.face.detect(imgurl)
            if len(res) != 1:
                print("No face detected in image")
            else:
                res = CF.person.add_face(imgurl, personGroupId, person_id)
                print(res)
                time.sleep(6)
The error I get is an Invalid Image URL with status code 400:
User.22.1.jpg
Traceback (most recent call last):
File "add_person_faces.py", line 31, in <module>
res = CF.face.detect(imgurl)
File "C:\Users\Avina\Anaconda3\envs\virtual_platform\lib\site-packages\cognitive_face\face.py", line 41, in detect
data=data)
File "C:\Users\Avina\Anaconda3\envs\virtual_platform\lib\site-packages\cognitive_face\util.py", line 102, in request
error_msg.get('message'))
cognitive_face.util.CognitiveFaceException: Error when calling Cognitive Face API:
status_code: 400
code: InvalidURL
message: Invalid image URL.

You'll have to send a public url or send the image in binary format for the API to be able to access it.
Here is how you read the image:
try:
    with open('mypic.jpg', 'rb') as image:
        img = image.read()
        face_to_detect = bytearray(img)
        print("image reading done!")
except Exception as ex:
    print("exception in reading image bytes {0}".format(ex.args))
    has_exception = True
And then post it:
result = CF.face.detect(face_to_detect)
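Applied to the loop in the question, the same idea might look like this (a minimal sketch, assuming the cognitive_face client also accepts an open file object or the raw bytes for person.add_face the same way detect does above; the file is reopened for the second call so each read starts from the beginning):

for filename in os.listdir(imageFolder):
    if filename.endswith(".jpg"):
        img_path = os.path.join(imageFolder, filename)
        # hand the SDK the local file's binary content instead of a file:// URL
        with open(img_path, 'rb') as image:
            res = CF.face.detect(image)
        if len(res) != 1:
            print("No face detected in image")
        else:
            with open(img_path, 'rb') as image:
                res = CF.person.add_face(image, personGroupId, person_id)
            print(res)
            time.sleep(6)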

Related

ValueError: Missing scheme in request url: h

I am a beginner in Scrapy and Python. I tried to deploy the spider code on Scrapinghub and encountered the following error. Below is the code.
import scrapy
from bs4 import BeautifulSoup, SoupStrainer
import urllib2
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import re
import pkgutil
from pkg_resources import resource_string
from tues1402.items import Tues1402Item

data = pkgutil.get_data("tues1402", "resources/urllist.txt")

class SpiderTuesday(scrapy.Spider):
    name = 'tuesday'
    self.start_urls = [url.strip() for url in data]

    def parse(self, response):
        story = Tues1402Item()
        story['url'] = response.url
        story['title'] = response.xpath("//title/text()").extract()
        return story
is my spider.py code
import scrapy

class Tues1402Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
is the items.py code and
from setuptools import setup, find_packages

setup(
    name = 'tues1402',
    version = '1.0',
    packages = find_packages(),
    entry_points = {'scrapy': ['settings = tues1402.settings']},
    package_data = {'tues1402': ['resources/urllist.txt']},
    zip_safe = False,
)
is the setup.py code.
The following is the error.
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/scrapy/core/engine.py", line 126, in _next_request
request = next(slot.start_requests)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/init.py", line 70, in start_requests
yield self.make_requests_from_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/init.py", line 73, in make_requests_from_url
return Request(url, dont_filter=True)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/init.py", line 25, in init
self._set_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/init.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
Thank you in Advance
Your error means that the url h is not a valid url. You should print out your self.start_urls and see what urls you have there; you most likely have the string h as your first url.
Seems like your spider iterates through text instead of a list of urls here:
data = pkgutil.get_data("tues1402", "resources/urllist.txt")

class SpiderTuesday(scrapy.Spider):
    name = 'tuesday'
    self.start_urls = [url.strip() for url in data]
Assuming that you store your urls with some separator in the urllist.txt file, you should split them:
# assuming the file has one url per line
self.start_urls = [url.strip() for url in data.splitlines()]
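Putting the pieces together, the spider could end up looking roughly like this (a sketch; note that at class-body level the attribute is normally plain start_urls, without self, which the snippet in the question would also trip over):

import pkgutil
import scrapy
from tues1402.items import Tues1402Item

data = pkgutil.get_data("tues1402", "resources/urllist.txt")

class SpiderTuesday(scrapy.Spider):
    name = 'tuesday'
    # one url per line in urllist.txt; splitlines() stops the character-by-character iteration
    start_urls = [url.strip() for url in data.splitlines() if url.strip()]

    def parse(self, response):
        story = Tues1402Item()
        story['url'] = response.url
        story['title'] = response.xpath("//title/text()").extract()
        return story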

Python SnakeSQL Table not found error

I am learning Python and using SnakeSQL for my database operations. I am following the examples from both web.py and SnakeSQL. I combined the code from their web sites and tried to make the code below work, but I am getting this error:
Traceback (most recent call last):
File ".\app.py", line 9, in
cursor.execute("INSERT INTO Test (dateColumn, numberColumn) VALUES ('2004->11-8', 4)")
File "D:\Python\virtualenvs\new4\textweb\bin\SnakeSQL\driver\base.py", line >1552, in execute
self.info = self.connection._insert(parsedSQL['table'], >parsedSQL['columns'], parsedSQL['sqlValues'], parameters)
File "D:\Python\virtualenvs\new4\textweb\bin\SnakeSQL\driver\base.py", line >1040, in _insert
raise SQLError("Table '%s' not found."%(table))
error.SQLError: Table 'Test' not found.
And here is my app.py code:
import web
import SnakeSQL

render = web.template.render('templates/')

connection = SnakeSQL.connect(database='test', autoCreate=True)
connection = SnakeSQL.connect(database='test')
cursor = connection.cursor()
cursor.execute("INSERT INTO Test (dateColumn, numberColumn) VALUES ('2004-11-8', 4)")

urls = (
    '/', 'index'
)

class index:
    def GET(self, name):
        cursor.execute("SELECT * FROM Test")
        results = cursor.fetchall()
        return render.index(results)

if __name__ == "__main__":
    app = web.application(urls, globals())
    app.run()
What could be wrong here?
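No answer is recorded for this one, but the code above never issues a CREATE TABLE, and the traceback complains that Test does not exist, so the table most likely has to be created before the INSERT (autoCreate=True appears to create the database, not its tables). A one-off setup along these lines might be the missing step (a sketch; the column names come from the INSERT above, but the types are an assumption):

import SnakeSQL

connection = SnakeSQL.connect(database='test', autoCreate=True)
cursor = connection.cursor()
# column names match the INSERT in app.py; the types are an assumption
cursor.execute("CREATE TABLE Test (dateColumn Date, numberColumn Integer)")
cursor.execute("INSERT INTO Test (dateColumn, numberColumn) VALUES ('2004-11-8', 4)")
connection.commit()
connection.close()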

Downloading images with Python

I am working on a project where I need to download some images using Python. I have tried different things to fix it, but it is still not working. Here is some code I found and tried to use, but it does not seem to work. To be honest, I am a newbie at programming, so I would be grateful for some help.
Here is the code:
import json
import os
import time
import requests
import Image
from StringIO import StringIO
from requests.exceptions import ConnectionError

def go(query, pathA):
    BASE_URL = 'https://ajax.googleapis.com/ajax/services/search/images?'\
               'v=1.0&q=' + query + '&start=%d'
    BASE_PATH = os.path.join(pathA, query)
    if not os.path.exists(BASE_PATH):
        os.makedirs(BASE_PATH)
    start = 0
    while start < 60:
        r = requests.get(BASE_URL % start)
        for image_info in json.loads(r.text)['responseData']['results']:
            url = image_info['unescapedUrl']
            try:
                image_r = requests.get(url)
            except ConnectionError, e:
                print 'could not download %s' % url
                continue
            # Remove file-system path characters from name.
            title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
            fileII = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
            try:
                Image.open(StringIO(image_r.content)).save(fileII, 'JPEG')
            except IOError, e:
                # Throw away some gifs...blegh.
                print 'could not save %s' % url
                continue
            finally:
                fileII.close()
        print start
        start += 4  # 4 images per page.
        time.sleep(1.5)

# Example use
go('landscape', 'myDirectory')
The error I get when I run the code above is:
IOError: [Errno 22] invalid mode ('w') or filename: u'myDirectory\landscape\Nature - Photo gallery | MIRIADNA.COM.jpg'
Thanks in advance
This bit of code defines where you will save your image
# Remove file-system path characters from name.
title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
Reading your error message: since 'w' is a valid mode for opening a file, the problem has to be the filename. The title being used contains characters (such as '|') that Windows does not allow in file names, so the file cannot be created.
Try hardcoding the title to a simple and local path, such as
title = 'test'
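If you want to keep the real titles rather than a hardcoded one, a slightly more defensive version of those lines inside the loop could strip every character Windows refuses in file names, and open the file in binary mode so the JPEG bytes are not mangled (a sketch; image_info and BASE_PATH are the names from the code above):

import re

# drop characters that are not allowed in Windows file names
# (the '|' in the title from the error message is one of them)
title = re.sub(r'[<>:"/\\|?*]', '', image_info['titleNoFormatting'])
fileII = open(os.path.join(BASE_PATH, '%s.jpg' % title), 'wb')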

lxml not working with django, scraperwiki

I'm working on a django app that goes through Illinois' General Assembly website to scrape some pdfs. While deployed on my desktop it works fine until urllib2 times out. When I try to deploy on my Bluehost server, the lxml part of the code throws up an error. Any help would be appreciated.
import scraperwiki
from bs4 import BeautifulSoup
import urllib2
import lxml.etree
import re
from django.core.management.base import BaseCommand
from legi.models import Votes
class Command(BaseCommand):
def handle(self, *args, **options):
chmbrs =['http://www.ilga.gov/house/', 'http://www.ilga.gov/senate/']
for chmbr in chmbrs:
site = chmbr
url = urllib2.urlopen(site)
content = url.read()
soup = BeautifulSoup(content)
links = []
linkStats = []
x=0
y=0
table = soup.find('table', cellpadding=3)
for a in soup.findAll('a',href=True):
if re.findall('Bills', a['href']):
l = (site + a['href']+'&Primary=True')
links.append(str(l))
x+=1
print x
for link in links:
url = urllib2.urlopen(link)
content = url.read()
soup = BeautifulSoup(content)
table = soup.find('table', cellpadding=3)
for a in table.findAll('a',href=True):
if re.findall('BillStatus', a['href']):
linkStats.append(str('http://ilga.gov'+a['href']))
for linkStat in linkStats:
url = urllib2.urlopen(linkStat)
content = url.read()
soup = BeautifulSoup(content)
for a in soup.findAll('a',href=True):
if re.findall('votehistory', a['href']):
vl = 'http://ilga.gov/legislation/'+a['href']
url = urllib2.urlopen(vl)
content = url.read()
soup = BeautifulSoup(content)
for b in soup.findAll('a',href=True):
if re.findall('votehistory', b['href']):
llink = 'http://ilga.gov'+b['href']
try:
u = urllib2.urlopen(llink)
x = scraperwiki.pdftoxml(u.read())
root = lxml.etree.fromstring(x)
pages = list(root)
chamber = str()
for page in pages:
print "working_1"
for el in page:
print "working_2"
if el.tag == 'text':
if int(el.attrib['top']) == 168:
chamber = el.text
if re.findall("Senate Vote", chamber):
if int(el.attrib['top']) >= 203 and int(el.attrib['top']) < 231:
title = el.text
if (re.findall('House', title)):
title = (re.findall('[0-9]+', title))
title = "HB"+title[0]
elif (re.findall('Senate', title)):
title = (re.findall('[0-9]+', title))
title = "SB"+title[0]
if int(el.attrib['top']) >350 and int(el.attrib['top']) <650:
r = el.text
names = re.findall(r'[A-z-\u00F1]{3,}',r)
vs = re.findall(r'[A-Z]{1,2}\s',r)
for name in names:
legi = name
for vote in vs:
v = vote
if Votes.objects.filter(legislation=title).exists() == False:
c = Votes(legislation=title, legislator=legi, vote=v)
c.save()
print 'saved'
else:
print 'not saved'
elif int(el.attrib['top']) == 189:
chamber = el.text
if re.findall("HOUSE ROLL CALL", chamber):
if int(el.attrib['top']) > 200 and int(el.attrib['top']) <215:
title = el.text
if (re.findall('HOUSE', title)):
title = (re.findall('[0-9]+', title))
title = "HB"+title[0]
elif (re.findall('SENATE', title)):
title = (re.findall('[0-9]+', title))
title = "SB"+title[0]
if int(el.attrib['top']) >385 and int(el.attrib['top']) <1000:
r = el.text
names = re.findall(r'[A-z-\u00F1]{3,}',r)
votes = re.findall(r'[A-Z]{1,2}\s',r)
for name in names:
legi = name
for vote in votes:
v = vote
if Votes.objects.filter(legislation=title).exists() == False:
c = Votes(legislation=title, legislator=legi, vote=v)
c.save()
print 'saved'
else:
print 'not saved'
except:
pass
EDIT 1
Here's the error trace
Traceback (most recent call last):
File "manage.py", line 10, in <module>
execute_from_command_line(sys.argv)
File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 399, in execute_from_command_line
utility.execute()
File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 392, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 242, in run_from_argv
self.execute(*args, **options.__dict__)
File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 285, in execute
output = self.handle(*args, **options)
File "/home7/maythirt/GAB/legi/management/commands/vote.py", line 51, in handle
root = lxml.etree.fromstring(x)
File "lxml.etree.pyx", line 3032, in lxml.etree.fromstring (src/lxml/lxml.etree.c:68121)
File "parser.pxi", line 1786, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:102470)
File "parser.pxi", line 1674, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:101299)
File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:96481)
File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:91290)
File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:92476)
File "parser.pxi", line 633, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:91939)
lxml.etree.XMLSyntaxError: None
As Jonathan mentioned, it may be the output of scraperwiki.pdftoxml() that's causing a problem. You could display or log the value of x to confirm it.
Specifically, pdftoxml() runs an external program pdftohtml and uses temporary files to store the PDF and XML.
What I'd also check for is:
Is pdftohtml correctly set up on your server?
If so, does the conversion to XML work if you directly run it in a shell on the server with the PDF that the code's failing on? The command it's executing is pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "input.pdf" "output.xml"
If there's an issue when you directly run the command, then that's where your issue lies. With the way pdftohtml runs in the scraperwiki code, there's no easy way to tell whether the command fails.
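To rule out the first two points, you could run the same conversion by hand on the server and look at the exit status and stderr, for example (a sketch; sample.pdf stands for one of the PDFs the code is failing on):

import subprocess

# the same command scraperwiki builds, run directly so errors become visible
cmd = ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5', '-enc', 'UTF-8', '-noframes',
       'sample.pdf', 'sample.xml']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = proc.communicate()
print proc.returncode
print err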
The way I would go about this is to add a try/except clause, and when you get the error, simply save the xml file as well as the link to your hard drive. That way you can inspect the xml file separately.
It might be that scraperwiki.pdftoxml makes an illegal xml file for some reason. I've had that happen to me when using another pdftoxml tool.
And please refactor your code into more functions; it will become a lot easier to read and maintain :).
Another way would of course be to download all of the pdfs first and then parse them all. That way you can avoid hitting the website several times whenever you fail for some reason.
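Following the save-it-to-disk suggestion, the fetch-and-parse step from the loop could be pulled into a small helper that keeps the offending XML and link around when parsing blows up (a sketch; save_dir is a hypothetical path on your server, and llink is the PDF link from the loop in the question):

import os
import urllib2
import lxml.etree
import scraperwiki

def pdf_link_to_tree(llink, save_dir='/tmp/failed_pdfs'):
    """Fetch a PDF, convert it to XML and parse it, saving the XML when parsing fails."""
    x = scraperwiki.pdftoxml(urllib2.urlopen(llink).read())
    try:
        return lxml.etree.fromstring(x)
    except lxml.etree.XMLSyntaxError:
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        # keep the raw XML and the link so the bad document can be inspected by hand
        with open(os.path.join(save_dir, 'failed.xml'), 'w') as f:
            f.write(x)
        with open(os.path.join(save_dir, 'failed_links.txt'), 'a') as f:
            f.write(llink + '\n')
        raise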

"TypeError: cannot concatenate 'str' and 'NoneType' objects" in Python Google Adwords API Client

I can't use the download reports feature with the Python client. I'm using adwords-15.9.0 with v201306. It always fails with:
$ ./classifications.py
Traceback (most recent call last):
File "./classifications.py", line 48, in <module>
download_report(client, client_id)
File "./classifications.py", line 32, in download_report
file_path = report_downloader.DownloadReportWithAwql(report_query, 'CSV', file_path=path)
File "/Users/mike/.virtualenvs/xxx/lib/python2.7/site-packages/adspygoogle/adwords/ReportDownloader.py", line 127, in DownloadReportWithAwql
fileobj) or file_path
File "/Users/mike/.virtualenvs/xxx/lib/python2.7/site-packages/adspygoogle/adwords/ReportDownloader.py", line 169, in __DownloadAdHocReportWithAwql
return self.__DownloadReport(payload, return_micros, fileobj)
File "/Users/mike/.virtualenvs/xxx/lib/python2.7/site-packages/adspygoogle/adwords/ReportDownloader.py", line 184, in __DownloadReport
headers = self.__GenerateHeaders(return_micros)
File "/Users/mike/.virtualenvs/xxx/lib/python2.7/site-packages/adspygoogle/adwords/ReportDownloader.py", line 282, in __GenerateHeaders
self._headers['oauth2credentials'].apply(headers)
File "/Users/mike/.virtualenvs/xxx/lib/python2.7/site-packages/oauth2client/client.py", line 533, in apply
headers['Authorization'] = 'Bearer ' + self.access_token
TypeError: cannot concatenate 'str' and 'NoneType' objects
Example scripts get_report_fields.py and get_campaign_stats.py work fine but download_criteria_report.py and download_criteria_report_with_awql.py fail with the same error.
Any ideas?
My code:
#!/usr/bin/env python
import csv
import os
import MySQLdb as mdb
from adspygoogle.adwords.AdWordsClient import AdWordsClient

MATCH_TYPES = {
    'b': 'Broad',
    'e': 'Exact',
    'p': 'Phrase',
}

DEVICE_TYPES = {
    'c': 'Desktop',
    'm': 'Mobile',
    't': 'Tablet',
}

REPORT_TYPE = 'CREATIVE_CONVERSION_REPORT'

def download_report(client, client_id):
    # Initialize appropriate service.
    report_downloader = client.GetReportDownloader(version='v201306')
    # Create report query.
    report_query = ('SELECT AdGroupId', 'CampaignId', 'CreativeId FROM CREATIVE_CONVERSION_REPORT DURING LAST_7_DAYS')
    path = '/tmp/report_%d.csv' % client_id
    file_path = report_downloader.DownloadReportWithAwql(report_query, 'CSV', file_path=path)
    print 'Report was downloaded to \'%s\'.' % file_path

if __name__ == '__main__':
    client = AdWordsClient()
    conn = mdb.connect('xxx.us-east-1.rds.amazonaws.com', 'xxx', 'xxx', 'xxx');
    with conn:
        cur = conn.cursor(mdb.cursors.DictCursor)
        cur.execute("SELECT * FROM xxx.adwords_accounts")
        rows = cur.fetchall()
        for row in rows:
            client_id = row['id']
            client.SetClientCustomerId(client_id)
            download_report(client, client_id)
Something's wrong with your authentication as indicated by the OAuth2Credentials object's attribute access_token being None.
If you didn't already, take a look at the use_oath2.py example to see how authentication via OAuth2 is handled. You will also need to create a Google API Console application to obtain a client ID and secret.
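As a quick sanity check on that theory, you could make sure the credentials actually carry a token before handing them to the client, and refresh them if not (a sketch; oauth2_credentials stands for whatever oauth2client OAuth2Credentials object you build from your client ID, secret and refresh token):

import httplib2

# a credentials object built without an access token reports None until refreshed
if oauth2_credentials.access_token is None:
    oauth2_credentials.refresh(httplib2.Http())
print oauth2_credentials.access_token  # should no longer be None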
It's a known bug. Fixed in v15.9.1