BigQuery results run via Python in Google Cloud don't match results running on Mac - python-2.7

I have a Python app that runs a query on BigQuery and appends the results to a file. I've run this on a Mac workstation (Yosemite) and on a GC instance (Ubuntu 14.1), and the floating-point results differ. How can I make them the same? The Python environments are the same on both.
Run on Google Cloud instance:
1120224,2015-04-06,23989,866,55159.71274162368,0.04923989554019882,0.021414467106578683,0.03609987911125933,63.69481840834143
54897577,2015-04-06,1188089,43462,2802473.708558333,0.051049132980100984,0.021641920553251377,0.03658143455582873,64.4810111950286
Run on Mac workstation:
1120224,2015-04-06,23989,866,55159.712741623654,0.049239895540198794,0.021414467106578683,0.03609987911125933,63.694818408341405
54897577,2015-04-06,1188089,43462,2802473.708558335,0.05104913298010102,0.021641920553251377,0.03658143455582873,64.48101119502864
import sys
import pdb
import json
from collections import OrderedDict
from csv import DictWriter
from pprint import pprint
from apiclient import discovery
from apiclient.errors import HttpError
from oauth2client import tools
from oauth2client.client import AccessTokenRefreshError
import functools
import argparse
import httplib2
import time
from subprocess import call
def authenticate_SERVICE_ACCOUNT(service_acct_email, private_key_path):
""" Generic authentication through a service accounts.
Args:
service_acct_email: The service account email associated
with the private key private_key_path: The path to the private key file
"""
from oauth2client.client import SignedJwtAssertionCredentials
with open(private_key_path, 'rb') as pk_file:
key = pk_file.read()
credentials = SignedJwtAssertionCredentials(
service_acct_email,
key,
scope='https://www.googleapis.com/auth/bigquery')
http = httplib2.Http()
auth_http = credentials.authorize(http)
return discovery.build('bigquery', 'v2', http=auth_http)
def create_query(number_of_days_ago):
""" Create a query
Args:
number_of_days_ago: Default value of 1 gets yesterday's data
"""
q = 'SELECT xxxxxxxxxx'
return q
def translate_row(row, schema):
"""Apply the given schema to the given BigQuery data row.
Args:
row: A single BigQuery row to transform.
schema: The BigQuery table schema to apply to the row, specifically
the list of field dicts.
Returns:
Dict containing keys that match the schema and values that match
the row.
Adapted from the BigQuery Python client
https://github.com/tylertreat/BigQuery-Python/blob/master/bigquery/client.py
"""
log = {}
#pdb.set_trace()
# Match each schema column with its associated row value
for index, col_dict in enumerate(schema):
col_name = col_dict['name']
row_value = row['f'][index]['v']
if row_value is None:
log[col_name] = None
continue
# Cast the value for some types
if col_dict['type'] == 'INTEGER':
row_value = int(row_value)
elif col_dict['type'] == 'FLOAT':
row_value = float(row_value)
elif col_dict['type'] == 'BOOLEAN':
row_value = row_value in ('True', 'true', 'TRUE')
log[col_name] = row_value
return log
def extractResult(queryReply):
""" Extract a result from the query reply. Uses schema and rows to translate.
Args:
queryReply: the object returned by bigquery
"""
#pdb.set_trace()
result = []
schema = queryReply.get('schema', {'fields': None})['fields']
rows = queryReply.get('rows',[])
for row in rows:
result.append(translate_row(row, schema))
return result
def writeToCsv(results, filename, ordered_fieldnames, withHeader=True):
""" Create a csv file from a list of rows.
Args:
results: list of rows of data (first row is assumed to be a header)
ordered_fieldnames: an OrderedDict with the names of fields in the desired order - names must exist in the results header
withHeader: a boolean indicating whether to write out the header -
set to False if you are going to append data to an existing csv
"""
try:
the_file = open(filename, "w")
writer = DictWriter(the_file, fieldnames=ordered_fieldnames)
if withHeader:
writer.writeheader()
writer.writerows(results)
the_file.close()
except:
print "Unexpected error:", sys.exc_info()[0]
raise
def runSyncQuery (client, projectId, query, timeout=0):
results = []
try:
print 'timeout:%d' % timeout
jobCollection = client.jobs()
queryData = {'query':query,
'timeoutMs':timeout}
queryReply = jobCollection.query(projectId=projectId,
body=queryData).execute()
jobReference=queryReply['jobReference']
# Timeout exceeded: keep polling until the job is complete.
while(not queryReply['jobComplete']):
print 'Job not yet complete...'
queryReply = jobCollection.getQueryResults(
projectId=jobReference['projectId'],
jobId=jobReference['jobId'],
timeoutMs=timeout).execute()
# If the result has rows, print the rows in the reply.
if('rows' in queryReply):
#print 'has a rows attribute'
#pdb.set_trace();
result = extractResult(queryReply)
results.extend(result)
currentRow = len(queryReply['rows'])
# Loop through each page of data
while('rows' in queryReply and currentRow < int(queryReply['totalRows'])):
queryReply = jobCollection.getQueryResults(
projectId=jobReference['projectId'],
jobId=jobReference['jobId'],
startIndex=currentRow).execute()
if('rows' in queryReply):
result = extractResult(queryReply)
results.extend(result)
currentRow += len(queryReply['rows'])
except AccessTokenRefreshError:
print ("The credentials have been revoked or expired, please re-run"
"the application to re-authorize")
except HttpError as err:
print 'Error in runSyncQuery:'
pprint(err.content)
except Exception as err:
print 'Undefined error: %s' % err
return results
# Main
if __name__ == '__main__':
# Name of file
FILE_NAME = "results.csv"
# Default prior number of days to run query
NUMBER_OF_DAYS = "1"
# BigQuery project id as listed in the Google Developers Console.
PROJECT_ID = 'xxxxxx'
# Service account email address as listed in the Google Developers Console.
SERVICE_ACCOUNT = 'xxxxxx@developer.gserviceaccount.com'
KEY = "/usr/local/xxxxxxxx"
query = create_query(NUMBER_OF_DAYS)
# Authenticate
client = authenticate_SERVICE_ACCOUNT(SERVICE_ACCOUNT, KEY)
# Get query results
results = runSyncQuery (client, PROJECT_ID, query, timeout=0)
#pdb.set_trace();
# Write results to csv without header
ordered_fieldnames = OrderedDict([('f_split',None),('m_members',None),('f_day',None),('visitors',None),('purchasers',None),('demand',None), ('dmd_per_mem',None),('visitors_per_mem',None),('purchasers_per_visitor',None),('dmd_per_purchaser',None)])
writeToCsv(results, FILE_NAME, ordered_fieldnames, False)
# Backup current data
backupfilename = "data_bk-" + time.strftime("%y-%m-%d") + ".csv"
call(['cp','../data/data.csv',backupfilename])
# Concatenate new results to data
with open("../data/data.csv", "ab") as outfile:
with open("results.csv","rb") as infile:
line = infile.read()
outfile.write(line)

You mention that these come from aggregate sums of floating point data. As Felipe mentioned, floating point is awkward; it violates some of the mathematical identities that we tend to assume.
In this case, the associative property is the one that bites us. That is, usually (A+B)+C == A+(B+C). However, in floating point math, this isn't the case. Each operation is an approximation; you can see this better if you wrap with an 'approx' function: approx(approx(A+B) + C) is clearly different from approx(A + approx(B+C)).
If you think about how BigQuery computes aggregates, it builds an execution tree and computes the values to be aggregated at the leaves of the tree. As those answers become ready, they're passed back up to the higher levels of the tree and aggregated (let's say they're added). The "when they're ready" part makes it non-deterministic.
A node may get results back in the order A, B, C the first time and C, A, B the second time. This means the order in which partial results are combined changes: you'll get approx(approx(A + B) + C) the first time and approx(approx(C + A) + B) the second time. Note that since we're dealing with ordering, it may look like the commutative property is the problematic one, but it isn't; A + B in floating-point math is the same as B + A. The problem is really that you're adding partial results, which isn't associative.
Floating-point math has all sorts of nasty properties and should usually be avoided if you rely on exact, reproducible results.
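To make that concrete, here is a minimal, standalone Python sketch (nothing BigQuery-specific) showing that the grouping of additions changes the result:
# Floating-point addition is not associative: the grouping of the
# partial sums determines the final bits.
a, b, c = 0.1, 0.2, 0.3

left = (a + b) + c    # approx(approx(a + b) + c)
right = a + (b + c)   # approx(a + approx(b + c))

print(repr(left))     # 0.6000000000000001
print(repr(right))    # 0.6
print(left == right)  # False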

Assume floating point is non-deterministic:
https://randomascii.wordpress.com/2013/07/16/floating-point-determinism/
“the IEEE standard does not guarantee that the same program will
deliver identical results on all conforming systems.”
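If the goal is simply byte-identical CSV output on both machines, one pragmatic workaround (not from the answers above; it assumes you can live without the noisy trailing digits) is to round the float columns before writing them out, for example:
def normalize_floats(row, ndigits=6):
    """Round float values so run-to-run noise in the last few bits disappears."""
    return {k: (round(v, ndigits) if isinstance(v, float) else v)
            for k, v in row.items()}

# e.g. results = [normalize_floats(r) for r in results] before calling writeToCsv(...)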

Related

There is a text file; want to return the content of the file in JSON format for the matching section

There is a text file containing data in the form:
[sec1]
"ab": "s"
"sd" : "d"
[sec2]
"rt" : "ty"
"gh" : "rr"
"kk":"op"
We are supposed to return the data of the matching section in JSON format; for example, if the user asks for sec1, we send back the contents of sec1.
The format you specified is very similar to the TOML format. However, TOML uses equals signs for the assignment of key-value pairs.
If your format actually uses colons for the assignment, the following example may help you.
It uses regular expressions in conjunction with a defaultdict to read the data from the file. The section to be queried is extracted from the URL using a variable rule.
If there is no hit within the loaded data, the server responds with a 404 error (NOT FOUND).
import re
from collections import defaultdict
from flask import (
Flask,
abort,
jsonify
)
def parse(f):
data = defaultdict(dict)
section = None
for line in f:
if re.match(r'^\[[^\]]+\]$', line.strip()):
section = line.strip()[1:-1]
data[section] = dict()
continue
m = re.match(r'^"(?P<key>[^"]+)"\s*:\s*"(?P<val>[^"]+)"$', line.strip())
if m:
key,val = m.groups()
if not section:
raise OSError('illegal format')
data[section][key] = val
continue
return dict(data)
app = Flask(__name__)
@app.route('/<string:section>')
def data(section):
path = 'path/to/file'
with open(path) as f:
data = parse(f)
if section in data:
return jsonify(data[section])
abort(404)
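As a quick sanity check (assuming the file from the question is saved at the path/to/file used above), Flask's built-in test client can exercise the route without starting a server:
# Ad-hoc check of the route defined above.
with app.test_client() as client:
    ok = client.get('/sec2')
    print(ok.status_code)       # 200 when sec2 exists in the file
    print(ok.get_json())        # e.g. {'gh': 'rr', 'kk': 'op', 'rt': 'ty'}

    missing = client.get('/no-such-section')
    print(missing.status_code)  # 404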

Python 2.X on NiFi: problems with Ñ (and others) in json.loads

I'm using a Jython InvokeScriptedProcessor to transform data from a JSON structure into a SQL statement. I'm having trouble with a specific function, json.loads: it does not handle special characters like ñ, é, á, í...
It writes them in an odd form, and I haven't found a way to get it right.
e.g. (very simple)
{"id":"ÑUECO","value":3.141592,"datetime":"....","location":"ÑUECO"}
If we try to write it into SQL like
INSERT INTO .... (id, value) VALUES ("...",3.141592);
It fails. I cannot return the data through any relationship, success or failure, regardless of NiFi's version. Here is my code:
def process(self, inputStream, outputStream):
# read input json data from flowfile content
text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
data = json.loads(text)
Neither
data = json.loads(text.encode("utf-8"))
works properly. text arrives as a unicode string.
def __generate_sql_transaction(input_data):
""" Generate SQL statement """
sql = """
BEGIN;"""
_id = input_data.get("id")
_timestamp = input_data.get("timestamp")
_flowfile_metrics = input_data.get("metrics")
_flowfile_metadata = input_data.get("metadata")
self.valid = __validate_metrics_type(_flowfile_metrics)
if self.valid is True:
self.log.error("generate insert")
sql += """
INSERT INTO
{0}.{1} (id, timestamp, metrics""".format(schema, table)
if _flowfile_metadata:
sql += ", metadata"
sql += """)
VALUES
('{0}', '{1}', '{2}'""".format(_id.encode("utf-8"), _timestamp, json.dumps(_flowfile_metrics))
self.log.error("generate metadata")
if _flowfile_metadata:
sql += ", '{}'".format(json.dumps(_flowfile_metadata).encode("utf-8"))
sql += """)
ON CONFLICT ({})""".format(on_conflict)
if not bool(int(self.update)):
sql += """
DO NOTHING;"""
else:
sql += """
DO UPDATE
SET"""
if bool(int(self.preference)):
sql += """
metrics = '{2}' || {0}.{1}.metrics;""".format(schema, table, json.dumps(_flowfile_metrics))
else:
sql += """
metrics = {0}.{1}.metrics || '{2}';""".format(schema, table, json.dumps(_flowfile_metrics))
else:
return ""
sql += """
COMMIT;"""
return sql
I send the data to NiFi again with:
output = __generate_sql_transaction(data)
self.log.error("post generate_sql_transaction")
self.log.error(output.encode("utf-8"))
# If no sql_transaction is generated because requisites weren't met,
# set the processor output with the original flowfile input.
if output == "":
output = text
# write new content to flowfile
outputStream.write(
output.encode("utf-8")
)
That output looks like
INSERT INTO .... VALUES ("ÃUECO","2020-01-01T10:00:00",'{"value":3.1415}','{"location":"\u00d1UECO"}');
I have "Ñueco" also in metadata, and it doesn't works fine with id nor metadata
NOTE: It seems that InvokeScriptedProcessor works fine using Groove instead of Python. But my problem is I know nothing about Groovy...
Does anybody found a similar issue? How did you solve it?
Update:
Input Example:
{"id":"ÑUECO",
"metrics":{
"value":3.1415
},
"metadata":{
"location":"ÑUECO"
},
"timestamp":"2020-01-01 00:00:00+01:00"
}
Desired Output:
BEGIN;
INSERT INTO Table (id, timestamp, metrics, metadata)
VALUES ('ÑUECO',
'2020-01-01T00:00:00+01:00',
'{"value":3.1415}',
'{"location":"ÑUECO"}')
ON CONFLICT (id, timestamp)
DO UPDATE
SET
metrics='{"value":3.1415}' || Table.metrics;
COMMIT;
Real Output:
BEGIN;
INSERT INTO Table (id, timestamp, metrics, metadata)
VALUES ('ÃUECO',
'2020-01-01T00:00:00+01:00',
'{"value":3.1415}',
'{"location":"\u00d1UECO"}')
ON CONFLICT (id, timestamp)
DO UPDATE
SET
metrics='{"value":3.1415}' || Table.metrics;
COMMIT;
UPD:
Jython does not work correctly with byte strings, so don't use .encode('utf-8').
Use Java methods to write content back to the flow file with a specific encoding.
Below is a sample that reads and writes non-ASCII characters, including Ñ, correctly.
Use an ExecuteScript processor with Jython and replace the body of the _transform(text) function:
import traceback
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
class FlowWriter(StreamCallback):
def _transform(self, text):
# transform incoming text here
return '####' + text + '****'
def process(self, inputStream, outputStream):
text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
new_text = self._transform(text)
IOUtils.write(new_text, outputStream, StandardCharsets.UTF_8)
flowFile = session.get()
if flowFile != None:
try:
flowFile = session.write(flowFile, FlowWriter())
flowFile = session.putAttribute(flowFile, "filename", 'headerfile.xml')
session.transfer(flowFile, REL_SUCCESS)
session.commit()
except Exception as e:
log.error("{}\n{}".format(e,traceback.format_exc()))
session.rollback(True) # put file back and penalize it
I've recently found this answer.
https://stackoverflow.com/a/35882335/7634711
It's not a problem with NiFi. It's a problem with Python 2 and how it works with the json library. The problems will also appear in Python 3 if special characters occur in dict keys.
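For reference, the two symptoms above (the \u00d1 escape and the Ã mojibake) can be reproduced in plain Python 2.7, with nothing NiFi-specific involved; a minimal sketch:
# -*- coding: utf-8 -*-
import json

data = json.loads(u'{"id": "ÑUECO"}')   # values come back as unicode objects

# 1) json.dumps escapes non-ASCII by default -> "\u00d1UECO"
print json.dumps(data['id'])

# 2) ensure_ascii=False keeps the character; encode once, at the output boundary
print json.dumps(data['id'], ensure_ascii=False).encode('utf-8')

# 3) "ÃUECO"-style mojibake appears when UTF-8 bytes are decoded again as
#    Latin-1, i.e. the text is encoded twice somewhere in the pipeline
print u'ÑUECO'.encode('utf-8').decode('latin-1')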

BigQuery: job is done but job.query_results().total_bytes_processed returns None

The following code :
import time
from google.cloud import bigquery
client = bigquery.Client()
query = """\
select 3 as x
"""
dataset = client.dataset('dataset_name')
table = dataset.table(name='table_name')
job = client.run_async_query('job_name_76', query)
job.write_disposition = 'WRITE_TRUNCATE'
job.destination = table
job.begin()
retry_count = 100
while retry_count > 0 and job.state != 'DONE':
retry_count -= 1
time.sleep(10)
job.reload()
print job.state
print job.query_results().name
print job.query_results().total_bytes_processed
prints :
DONE
job_name_76
None
I do not understand why total_bytes_processed returns None, because the job is done and the documentation says:
total_bytes_processed:
Total number of bytes processed by the query.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#totalBytesProcessed
Return type: int, or NoneType
Returns: Count generated on the server (None until set by the server).
Looks like you are right. As you can see in the code, the current API does not populate the bytes-processed data.
This has been reported in this issue, and as you can see in tseaver's PR, this feature has already been implemented and awaits review/merging, so we'll probably have this code in production quite soon.
In the meantime you could get the result from the _properties attribute of the job, like:
from google.cloud.bigquery import Client
import types
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/key.json'
bc = Client()
query = 'your query'
job = bc.run_async_query('name', query)
job.begin()
wait_job(job)
query_results = job._properties['statistics'].get('query')
query_results should have the totalBytesProcessed you are looking for.
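Note that wait_job above is not part of the library; it is just shorthand for a polling loop like the one in the question. A minimal sketch:
import time

def wait_job(job, retry_count=100, delay=10):
    # Poll until the job reports DONE (no error handling; mirrors the question's loop).
    while retry_count > 0 and job.state != 'DONE':
        retry_count -= 1
        time.sleep(delay)
        job.reload()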

How to Add a Column in DynamoDB

Is there a way to add a new column to existing table in DynamoDB in Amazon's AWS?
Google didn't help.
The UpdateTable operation in http://docs.aws.amazon.com/cli/latest/reference/dynamodb/update-table.html?highlight=update%20table doesn't have any information related to adding a new column.
DynamoDB does not require schema definition, and so there is no such thing as a "column". You can just add a new item with a new attribute.
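For example, with boto3 (the table, key, and attribute names below are made up for illustration), adding an attribute needs no schema change at all:
import boto3

dynamodb = boto3.resource("dynamodb", region_name="us-east-1")  # assumed region
table = dynamodb.Table("Invoices")  # hypothetical table with partition key "invoiceId"

# Add (or overwrite) an attribute on an existing item.
table.update_item(
    Key={"invoiceId": "inv-001"},
    UpdateExpression="SET #s = :s",
    ExpressionAttributeNames={"#s": "status"},  # alias, in case the name is reserved
    ExpressionAttributeValues={":s": "PAID"},
)

# Or just write a new item carrying the extra attribute; existing items are untouched.
table.put_item(Item={"invoiceId": "inv-002", "status": "OPEN", "amount": 120})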
Well, let's not get dragged into the semantic discussion about the difference between "fields" and "columns". The word "column" does remind us of relational databases, which DynamoDB is not; in essence that means that DynamoDB does not have foreign keys.
DynamoDB does have "primary partition keys" and "index partition keys" though, just as with relational databases (although there are other strategies as well).
You do need to respect those keys when you add data. But aside from those requirements, you don't have to predefine your fields (except for the partition keys mentioned earlier).
Assuming that you are new to this, some additional good practices:
Add a numeric field to each record to store the time of creation in seconds. DynamoDB has an optional TTL (time-to-live) expiry feature, which requires this type of field in your data.
DynamoDB has no native date type, so you have to store dates as numeric fields or as strings. Given the previous remark, you may prefer a numeric type for them.
Don't store big documents in it, because there is a maximum fetch size of 16 MB and a maximum record size of 400 KB. Fortunately, AWS has S3 storage and other kinds of databases (e.g. DocumentDB).
There are many strategies for table keys:
If you only declare the partition-key, then it acts like a primary key (e.g. partition-key=invoiceId). That's fine.
If your object has a parent reference (e.g. invoices have a customer), then you probably want to add a sort-key (e.g. partition-key=customerId; sort-key=invoiceId). Together they behave like a composed key. The advantage is that you can do a lookup using both keys, or just the partition-key (e.g. request a specific invoice for a specific customer, or all invoices for that customer), as sketched below.
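To make the composed-key point concrete, a small boto3 sketch (hypothetical CustomerInvoices table with partition key customerId and sort key invoiceId):
import boto3
from boto3.dynamodb.conditions import Key

table = boto3.resource("dynamodb", region_name="us-east-1").Table("CustomerInvoices")

# A specific invoice for a specific customer: both keys.
one = table.get_item(Key={"customerId": "cust-42", "invoiceId": "inv-001"})
print(one.get("Item"))

# All invoices for that customer: partition key only.
all_for_customer = table.query(KeyConditionExpression=Key("customerId").eq("cust-42"))
print(all_for_customer["Items"])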
I installed NoSQL Workbench, connected to an existing DynamoDB table, and tried to update an existing item by adding a new attribute.
I figured out that we can only add a new attribute with one of these types - "SS", "NS", "BS" (String Set, Number Set, Binary Set).
In Workbench, we can generate code for the chosen operation.
I scanned my DynamoDB table and, for each item, added the new attribute with type "SS"; then I scanned again and updated the recently added attribute to type "S" in order to create a global secondary index (GSI) with the primary key "pk2NewAttr".
NoSQL Workbench related video - https://www.youtube.com/watch?v=Xn12QSNa4RE&feature=youtu.be&t=3666
Example in Python "how to scan all dynamodb Table" - https://gist.github.com/pgolding
You can achieve the same by doing the following:
Open DynamoDB and click on the Tables option in the left sidebar menu.
Search for your table by name and click on it.
Now select the orange button named Explore table items.
Scroll down and click on Create item.
You will now see an editor with a JSON value; click the Form button on the right side to add a new column and its type.
Note: this will insert one new record, and you can now see the new column as well.
A way to add a new column to an existing table in DynamoDB in Amazon's AWS:
We can store values in DynamoDB in two ways.
(i) In an RDBMS-like structure, we can "add a new column" by writing items that include the new column entry alongside the attributes the existing records were created with. DynamoDB is happy with records/rows that have values for some columns while other columns have no values.
(ii) In a NoSQL-like structure, where we store a JSON string within a single column to hold all the attributes as required: here we generate the JSON string, add the new attribute to it, and insert it back into the same column.
This script will either delete a record, or add a ttl field. You might want to tailor it to your column name and remove the delete stuff.
Usage:
usage: add_ttl.py [-h] --profile PROFILE [-d] [--force_delete_all] [-v] [-q]
Procedurally modify DynamoDB
optional arguments:
-h, --help show this help message and exit
--profile PROFILE AWS profile name
-d, --dryrun Dry run, take no action
--force_delete_all Delete all records, including valid, unexpired
-v, --verbose set loglevel to DEBUG
-q, --quiet set loglevel to ERROR
Script:
#!/usr/bin/env python3
# pylint:disable=duplicate-code
import argparse
import logging
import sys
from collections import Counter
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from functools import cached_property
from typing import Dict, Optional
import boto3
from dateutil.parser import isoparse
from tqdm import tqdm
LOGGER = logging.getLogger(__name__)
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
LOG_FORMAT = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
def setup_logging(loglevel=None, date_format=None, log_format=None):
"""Setup basic logging.
Args:
loglevel (int): minimum loglevel for emitting messages
"""
logging.basicConfig(
level=loglevel or logging.INFO,
stream=sys.stdout,
format=log_format or LOG_FORMAT,
datefmt=date_format or DATE_FORMAT,
)
def parse_args():
"""
Extract the CLI arguments from argparse
"""
parser = argparse.ArgumentParser(description="Procedurally modify DynamoDB")
parser.add_argument(
"--profile",
help="AWS profile name",
required=True,
)
parser.add_argument(
"-d",
"--dryrun",
action="store_true",
default=False,
help="Dry run, take no action",
)
parser.add_argument(
"--force_delete_all",
action="store_true",
default=False,
help="Delete all records, including valid, unexpired",
)
parser.add_argument(
"-v",
"--verbose",
dest="loglevel",
help="set loglevel to DEBUG",
action="store_const",
const=logging.DEBUG,
)
parser.add_argument(
"-q",
"--quiet",
dest="loglevel",
help="set loglevel to ERROR",
action="store_const",
const=logging.ERROR,
)
return parser.parse_args()
def query_yes_no(question, default="yes"):
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
if choice in valid:
return valid[choice]
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
@dataclass
class Table:
"""Class that wraps dynamodb and simplifies pagination as well as counting."""
region_name: str
table_name: str
_counter: Optional[Counter] = None
def __str__(self):
out = "\n" + ("=" * 80) + "\n"
for key, value in self.counter.items():
out += "{:<20} {:<2}\n".format(key, value)
return out
def str_table(self):
keys = list(self.counter.keys())
# Set the names of the columns.
fmt = "{:<20} " * len(keys)
return f"\n\n{fmt}\n".format(*keys) + f"{fmt}\n".format(
*list(self.counter.values())
)
@cached_property
def counter(self):
if not self._counter:
self._counter = Counter()
return self._counter
@cached_property
def client(self):
return boto3.client("dynamodb", region_name=self.region_name)
@cached_property
def table(self):
dynamodb = boto3.resource("dynamodb", region_name=self.region_name)
return dynamodb.Table(self.table_name)
@property
def items(self):
response = self.table.scan()
self.counter["Fetched Pages"] += 1
data = response["Items"]
with tqdm(desc="Fetching pages") as pbar:
while "LastEvaluatedKey" in response:
response = self.table.scan(
ExclusiveStartKey=response["LastEvaluatedKey"]
)
self.counter["Fetched Pages"] += 1
data.extend(response["Items"])
pbar.update(500)
self.counter["Fetched Items"] = len(data)
return data
@cached_property
def item_count(self):
response = self.client.describe_table(TableName=self.table_name)
count = int(response["Table"]["ItemCount"])
self.counter["Total Rows"] = count
return count
def delete_item(table, item):
return table.table.delete_item(
Key={
"tim_id": item["tim_id"],
}
)
def update_item(table: Table, item: Dict, ttl: int):
return table.table.update_item(
Key={"tim_id": item["tim_id"]},
UpdateExpression="set #t=:t",
ExpressionAttributeNames={
"#t": "ttl",
},
ExpressionAttributeValues={
":t": ttl,
},
ReturnValues="UPDATED_NEW",
)
def main():
setup_logging()
args = parse_args()
if not query_yes_no(
f"Performing batch operations with {args.profile}; is this correct?"
):
sys.exit(1)
sys.stdout.write(f"Setting up connection with {args.profile}\n")
boto3.setup_default_session(profile_name=args.profile)
table = Table(region_name="us-west-2", table_name="TimManager")
now = datetime.utcnow().replace(microsecond=0).astimezone(timezone.utc)
buffer = timedelta(days=7)
# #TODO list comprehension
to_update = []
to_delete = []
for item in tqdm(table.items, desc="Inspecting items"):
ttl_dt = isoparse(item["delivery_stop_time"])
if ttl_dt > now - buffer and not args.force_delete_all:
to_update.append(item)
else:
to_delete.append(item)
table.counter["Identified for update"] = len(to_update)
table.counter["Identified for delete"] = len(to_delete)
table.counter["Performed Update"] = 0
table.counter["Performed Delete"] = 0
if to_update and query_yes_no(
f"Located {len(to_update)} records to update with {args.profile}"
):
for item in tqdm(to_update, desc="Updating items"):
if not args.dryrun:
ttl_dt = isoparse(item["delivery_stop_time"])
response = update_item(table, item, int((ttl_dt + buffer).timestamp()))
if response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 200:
table.counter["Updated"] += 1
if to_delete and query_yes_no(
f"Located {len(to_delete)} records to delete with {args.profile}"
):
for item in tqdm(to_delete, desc="Deleting items"):
if not args.dryrun:
table.counter["Deleted"] += 1
response = delete_item(table, item)
if response.get("ResponseMetadata", {}).get("HTTPStatusCode") == 200:
table.counter["Deleted"] += 1
sys.stdout.write(str(table))
if __name__ == "__main__":
main()

Decoding utf-8 in Django while using unicode_literals in Python 2.7

I'm using Django to manage a Postgres database. I have a value stored in the database representing a city in Spain (Málaga). My Django project uses unicode strings for everything by putting from __future__ import unicode_literals at the beginning of each of the files I created.
I need to pull the city information from the database and send it to another server using an XML request. There is logging in place along the way so that I can observe the flow of data. When I try and log the value for the city I get the following traceback:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe1' in position 1: ordinal not in range(128)
Here is the code I use to log the values I'm passing.
def createXML(self, dict):
"""
.. method:: createXML()
Create a single-depth XML string based on a set of tuples
:param dict: Set of tuples (simple dictionary)
"""
xml_string = ''
for key in dict:
self.logfile.write('\nkey = {0}\n'.format(key))
if (isinstance(dict[key], basestring)):
self.logfile.write('basestring\n')
self.logfile.write('value = {0}\n\n'.format(dict[key].decode('utf-8')))
else:
self.logfile.write('value = {0}\n\n'.format(dict[key]))
xml_string += '<{0}>{1}</{0}>'.format(key, dict[key])
return xml_string
I'm basically saving all the information I have in a simple dictionary and using this function to generate an XML formatted string - this is beyond the scope of this question.
The error I am getting had me wondering what was actually being saved in the database. I have verified the value is utf-8 encoded. I created a simple script to extract the value from the database, decode it and print it to the screen.
from __future__ import unicode_literals
import psycopg2
# Establish the database connection
try:
db = psycopg2.connect("dbname = 'dbname' \
user = 'user' \
host = 'IP Address' \
password = 'password'")
cur = db.cursor()
except:
print "Unable to connect to the database."
# Get database info if any is available
command = "SELECT state FROM table WHERE id = 'my_id'"
cur.execute(command)
results = cur.fetchall()
state = results[0][0]
print "my state is {0}".format(state.decode('utf-8'))
Result: my state is Málaga
In Django I'm doing the following to create the HTTP request:
## Create the header
http_header = "POST {0} HTTP/1.0\nHost: {1}\nContent-Type: text/xml\nAuthorization: Basic {2}\nContent-Length: {3}\n\n"
req = http_header.format(service, host, auth, len(self.xml_string)) + self.xml_string
Can anyone help me correct the problem so that I can write this information to the database and be able to create the req string to send to the other server?
Am I getting this error as a result of how Django is handling this? If so, what is Django doing? Or, what am I telling Django to do that is causing this?
EDIT1:
I've tried to use Django's django.utils.encoding on this state value as well. I read a little from saltycrane about a possible hiccup Django might have with unicode/utf-8 stuff.
I tried to modify my logging to use the smart_str functionality.
def createXML(self, dict):
"""
.. method:: createXML()
Create a single-depth XML string based on a set of tuples
:param dict: Set of tuples (simple dictionary)
"""
xml_string = ''
for key in dict:
if (isinstance(dict[key], basestring)):
if (key == 'v1:State'):
var_str = smart_str(dict[key])
for index in range(0, len(var_str)):
var = bin(ord(var_str[index]))
self.logfile.write(var)
self.logfile.write('\n')
self.logfile.write('{0}\n'.format(var_str))
xml_string += '<{0}>{1}</{0}>'.format(key, dict[key])
return xml_string
Doing this I'm able to write the correct value to the log, but I narrowed down another possible problem with the .format() string functionality in Python. Of course my Google search for python format unicode had as its first result Issue 7300, which states that this is a known "issue" with Python 2.7.
Now, from another Stack Overflow post I found a "solution" that does not work in Django together with the smart_str functionality (or at least I've been unable to get them to work together).
I'm going to continue digging around and see if I can't find the underlying problem - or at least a work-around.
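For reference, the traceback at the top can be reproduced outside Django in a few lines; a minimal Python 2.7 sketch (file names are made up), which also previews the codecs.open approach used in the answer below:
# -*- coding: utf-8 -*-
import codecs

city = u'Málaga'

plain = open('plain.log', 'w')
try:
    plain.write(u'value = {0}\n'.format(city))   # implicit ascii encode -> fails
except UnicodeEncodeError as err:
    print err   # 'ascii' codec can't encode character u'\xe1' ...
finally:
    plain.close()

# codecs.open encodes on write, so unicode format strings just work.
with codecs.open('utf8.log', 'w', 'utf-8') as logfile:
    logfile.write(u'value = {0}\n'.format(city))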
EDIT2:
I found a work-around by simply concatenating strings rather than using the .format() functionality. I don't like this "solution" - it's ugly, but it got the job done.
def createXML(self, dict):
"""
.. method:: createXML()
Create a single-depth XML string based on a set of tuples
:param dict: Set of tuples (simple dictionary)
"""
xml_string = ''
for key in dict:
xml_string += '<{0}>'.format(key)
if (isinstance(dict[key], basestring)):
xml_string += smart_str(dict[key])
else:
xml_string += str(dict[key])
xml_string += '</{0}>'.format(key)
return xml_string
I'm going to leave this question unanswered as I'd love to find a solution that lets me use .format() the way it was intended.
This is the correct approach (the problem was with opening the file; with UTF-8 you MUST use codecs.open()):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
class Writer(object):
logfile = codecs.open("test.log", "w", 'utf-8')
def createXML(self, dict):
xml_string = ''
for key, value in dict.iteritems():
self.logfile.write(u'\nkey = {0}\n'.format(key))
if (isinstance(value, basestring)):
self.logfile.write(u'basestring\n')
self.logfile.write(u'value = {0}\n\n'.format( value))
else:
self.logfile.write(u'value = {0}\n\n'.format( value ))
xml_string += u'<{0}>{1}</{0}>'.format(key, value )
return xml_string
And this is from the Python console:
In [1]: from test import Writer
In [2]: d = { 'a' : u'Zażółć gęślą jaźń', 'b' : u'Och ja Ci zażółcę' }
In [3]: w = Writer()
In [4]: w.createXML(d)
Out[4]: u'<a>Za\u017c\xf3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144</a><b>Och ja Ci za\u017c\xf3\u0142c\u0119</b>'
And this is the test.log file:
key = a
basestring
value = Zażółć gęślą jaźń
key = b
basestring
value = Och ja Ci zażółcę