Python 2.7 and GCP Google BigQuery: extracts - compression not working

I'm using Python 2.7 (can't change right now) and v0.28 of the Google Python client library google.cloud.bigquery, and the compression="GZIP" or "NONE" argument/setting doesn't appear to be working for me. Can someone else try this out and let me know if it works for them?
In the code below you can see I've been playing with this, but each time my files on GCS appear to be uncompressed, no matter what I set for compression.
Note: my imports are for a larger set of code; not all are needed for this snippet.
from pandas.io import gbq
import google.auth
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import Table
import json
import re
from google.cloud import storage
bigquery_client = bigquery.Client(project=project)
dataset_ref = bigquery_client.dataset(dataset_name)
table_ref = dataset_ref.table(table_name)
job_id_prefix = "bqTools_export_job"
job_config = bigquery.LoadJobConfig()
# default is ","
if field_delimiter:
    job_config.field_delimiter = field_delimiter
# default is true
if print_header:
    job_config.print_header = print_header
# CSV, NEWLINE_DELIMITED_JSON, or AVRO
if destination_format:
    job_config.destination_format = destination_format
# GZIP or NONE
if compression:
    job_config.compression = compression
job_config.Compression = "GZIP"
job_config.compression = "GZIP"
job = bigquery_client.extract_table(table_ref, destination, job_config=job_config, job_id_prefix=job_id_prefix)
# job.begin()
job.result() # Wait for job to complete
returnMsg = 'Exported {}:{} to {}'.format(dataset_name, table_name, destination)
Related links:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.extract.compression
https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/job.html
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/bigquery/api/export_data_to_cloud_storage.py
I'm sure I'm doing something stupid, thank you for your help...Rich
EDIT BELOW
In the interest of sharing, here is what I think our final code will be...Rich
# export a table from bq into a file on gcs,
# the destination should look like the following, with no brackets {}
# gs://{bucket-name-here}/{file-name-here}
def export_data_to_gcs(dataset_name, table_name, destination,
                       field_delimiter=",", print_header=None,
                       destination_format="CSV", compression="GZIP", project=None):
    try:
        bigquery_client = bigquery.Client(project=project)
        dataset_ref = bigquery_client.dataset(dataset_name)
        table_ref = dataset_ref.table(table_name)
        job_id_prefix = "bqTools_export_job"
        job_config = bigquery.ExtractJobConfig()
        # default is ","
        if field_delimiter:
            job_config.field_delimiter = field_delimiter
        # default is true
        if print_header:
            job_config.print_header = print_header
        # CSV, NEWLINE_DELIMITED_JSON, or AVRO
        if destination_format:
            job_config.destination_format = destination_format
        # GZIP or NONE
        if compression:
            job_config.compression = compression
        # if it should be compressed, make sure there is a .gz on the filename; add it if needed
        if compression == "GZIP":
            if destination.lower()[-3:] != ".gz":
                destination = str(destination) + ".gz"
        job = bigquery_client.extract_table(table_ref, destination, job_config=job_config, job_id_prefix=job_id_prefix)
        # job.begin()
        job.result()  # Wait for job to complete
        returnMsg = 'Exported {}:{} to {}'.format(dataset_name, table_name, destination)
        return returnMsg
    except Exception as e:
        errorStr = 'ERROR (export_data_to_gcs): ' + str(e)
        print(errorStr)
        raise

For a table extract you should use ExtractJobConfig rather than LoadJobConfig.
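For illustration, a minimal sketch of what that might look like (the project, dataset, table, and bucket names here are placeholders, not from the original post):
# Minimal sketch (placeholder names): extract a table to GCS as gzipped CSV.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')               # placeholder project
table_ref = client.dataset('my_dataset').table('my_table')   # placeholder dataset/table

job_config = bigquery.ExtractJobConfig()  # ExtractJobConfig, not LoadJobConfig
job_config.destination_format = "CSV"
job_config.compression = "GZIP"           # "GZIP" or "NONE"

extract_job = client.extract_table(
    table_ref,
    "gs://my-bucket/my_table.csv.gz",     # placeholder destination URI
    job_config=job_config)
extract_job.result()                      # wait for the extract to finish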

Related

IICS taskflow - Notification task - attach file

I have an IICS taskflow with a mapping task and a notification task. The target of the mapping task is a csv stored in a server location.
With the notification task, I want to send an email with the CSV attached. Do you know if this is possible, or is there another way to send the target CSV by email?
Not the ideal solution, but you can write a simple Python program to send the CSV as an email: just provide the path of the CSV to the Python program, and then execute the script from Informatica as a command task (on success). It is super easy to do with a Python script.
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import os

def send_mail(body_text, fromaddr, recipient_list, smtp_login, smtp_pass, file_path):
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['To'] = ', '.join(recipient_list)
    msg['Subject'] = 'your Subject variable'
    msg.attach(MIMEText(body_text, 'plain'))
    # attach the file
    filename = os.path.basename(file_path)
    with open(file_path, "rb") as attachment:
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(attachment.read())
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', "attachment; filename=%s" % filename)
    msg.attach(part)
    # send the message
    server = smtplib.SMTP(host="your.mailserver.com", port=123)  # e.g. smtp.gmail.com
    server.starttls()
    server.login(smtp_login, smtp_pass)
    server.set_debuglevel(1)
    server.sendmail(fromaddr, recipient_list, msg.as_string())
    server.quit()

mainDir = '/path/to/file/'
sendFileName = 'yourfilename' + '.csv'
sendFilePath = mainDir + 'file_send/' + sendFileName
body_text = '''
Dear All,
Please find the attached document.
Thank You.
'''
smtp_login = "emailusername"
smtp_pass = "emailpassword"
recipient_list = ['abc@company.com', 'def@company.com']
file_path = os.path.abspath(sendFilePath)
fromaddr = 'emailusername@company.com'
send_mail(body_text=body_text, fromaddr=fromaddr, recipient_list=recipient_list,
          smtp_login=smtp_login, smtp_pass=smtp_pass, file_path=file_path)
This code may need some modifications, but I thought it might help someone.

I would like to export a DynamoDB table to an S3 bucket in CSV format using Python (Boto3)

This question has been asked earlier in the following link:
How to write dynamodb scan data's in CSV and upload to s3 bucket using python?
I have amended the code as advised in the comments. The code looks as follows:
import csv
import boto3
import json
dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('employee_details')
def lambda_handler(event, context):
    AWS_BUCKET_NAME = 'session5cloudfront'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(AWS_BUCKET_NAME)
    path = '/tmp/' + 'employees.csv'
    try:
        response = db.scan()
        myFile = open(path, 'w')
        for i in response['Items']:
            csv.register_dialect('myDialect', delimiter=' ', quoting=csv.QUOTE_NONE)
            with myFile:
                writer = csv.writer(myFile, dialect='myDialect')
                writer.writerows(i)
            print(i)
    except:
        print("error")
    bucket.put_object(
        ACL='public-read',
        ContentType='application/csv',
        Key=path,
        # Body=json.dumps(i),
    )
    # print("here")
    body = {
        "uploaded": "true",
        "bucket": AWS_BUCKET_NAME,
        "path": path,
    }
    # print("then here")
    return {
        "statusCode": 200,
        "body": json.dumps(body)
    }
I am a novice; please help me fix this code, as it has a problem inserting data into the file created in the S3 bucket.
Thanks
I have revised the code to be simpler and to also handle paginated responses for tables with more than 1MB of data:
import csv
import boto3
import json
TABLE_NAME = 'employee_details'
OUTPUT_BUCKET = 'my-bucket'
TEMP_FILENAME = '/tmp/employees.csv'
OUTPUT_KEY = 'employees.csv'
s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)
def lambda_handler(event, context):
    with open(TEMP_FILENAME, 'w') as output_file:
        writer = csv.writer(output_file)
        header = True
        first_page = True
        # Paginate results
        while True:
            # Scan DynamoDB table
            if first_page:
                response = table.scan()
                first_page = False
            else:
                response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
            for item in response['Items']:
                # Write header row?
                if header:
                    writer.writerow(item.keys())
                    header = False
                writer.writerow(item.values())
            # Last page?
            if 'LastEvaluatedKey' not in response:
                break
    # Upload temp file to S3
    s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)
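As a possible variation (not part of the original answer), the low-level boto3 client exposes a built-in Scan paginator that hides the LastEvaluatedKey bookkeeping; note that the client API returns items in DynamoDB's typed format, so values need unwrapping before they are written to CSV. A rough sketch, assuming the same table name:
import boto3

dynamodb_client = boto3.client('dynamodb')
paginator = dynamodb_client.get_paginator('scan')

# The paginator handles LastEvaluatedKey / ExclusiveStartKey internally.
for page in paginator.paginate(TableName='employee_details'):
    for item in page['Items']:
        # item is in DynamoDB's typed form, e.g. {'name': {'S': 'Bob'}},
        # so each attribute value needs to be unwrapped before use
        row = {key: list(value.values())[0] for key, value in item.items()}
        print(row)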

Dynamic handling of BigQuery table schema while inserting data into a BQ table from a variable

I am trying to append data to a BQ table using Python code, which requires dynamic schema handling.
Can anyone point me to a link or example that handles the above scenario?
Here is example code for loading a .csv file into BigQuery using the Python client library:
# from google.cloud import bigquery
# client = bigquery.Client()
# filename = '/path/to/file.csv'
# dataset_id = 'my_dataset'
# table_id = 'my_table'
dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True
with open(filename, "rb") as source_file:
    job = client.load_table_from_file(source_file, table_ref, job_config=job_config)
job.result()  # Waits for table load to complete.
print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))
Also check this part of the documentation to know more about appending data into tables from a source file using the same or different schema.
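If the goal is to append to an existing table while letting the schema grow, a rough sketch along these lines may help (this is not from the original answer; the dataset, table, and file names are placeholders, and ALLOW_FIELD_ADDITION only permits adding new nullable columns):
from google.cloud import bigquery

client = bigquery.Client()
table_ref = client.dataset('my_dataset').table('my_table')  # placeholder names

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True  # let BigQuery infer the schema of the incoming data
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
# allow new (nullable) columns to be added to the table's schema on append
job_config.schema_update_options = [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]

with open('/path/to/new_rows.csv', 'rb') as source_file:  # placeholder path
    job = client.load_table_from_file(source_file, table_ref, job_config=job_config)
job.result()  # wait for the append to complete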

Required parameter is missing error while writing to BigQuery with google.cloud.bigquery in Python

I am loading newline-delimited JSON into BigQuery using the following code snippet in Python 2.7:
from google.cloud import bigquery
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
bigquery_client = bigquery.Client()
dataset = bigquery_client.dataset('testGAData')
table_ref = dataset.table('gaData')
table = bigquery.Table(table_ref)
with open('gaData.json', 'rb') as source_file:
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job = bigquery_client.load_table_from_file(
        source_file, table, job_config=job_config)
It returns me the following error:
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 897, in load_table_from_file
raise exceptions.from_http_response(exc.response)
google.api_core.exceptions.BadRequest: 400 POST https://www.googleapis.com/upload/bigquery/v2/projects/test-project-for-experiments/jobs?uploadType=resumable: Required parameter is missing
Why am I getting this error? How can I fix this? Has anyone else faced a similar issue? Thanks in advance.
Edit: Added last para, included python imports and corrected the indents.
Issues observed with the initial code
You are missing the schema for your table. You can either use job_config.autodetect = True or job_config.schema = [bigquery.SchemaField("FIELD NAME", "FIELD TYPE")].
From the documentation, you should set job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON for a JSON file source.
You should pass your table_ref variable as an argument instead of your table variable in bigquery_client.load_table_from_file(source_file, table, job_config=job_config).
Link to the documentation
Working Code
The code below works for me. I am using Python 3 and google-cloud-bigquery v1.5:
from google.cloud import bigquery
client = bigquery.Client()
dataset_id, table_id = "TEST_DATASET", "TEST_TABLE"
data_ref = client.dataset(dataset_id)
table_ref = data_ref.table(table_id)
file_path = "path/to/test.json"
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
#job_config.autodetect = True
job_config.schema = [bigquery.SchemaField("Name", "STRING"), bigquery.SchemaField("Age", "INTEGER")]
with open(file_path, 'rb') as source_file:
    job = client.load_table_from_file(source_file, table_ref, location='US', job_config=job_config)
job.result()
print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id, table_id))
Output
>> Loaded 2 rows into TEST_DATASET:TEST_TABLE.

DynamoDB pagination using Boto3

We are using boto3 for our DynamoDB tables and need to do a full scan of them; based on other posts, that requires pagination. However, we are unable to find a working sample of pagination. Here is what we did:
import boto3
client_setting = boto3.client('dynamodb', region_name='ap-southeast-2')
paginator = client_setting.get_paginator('scan')
esk = {}
data = []
unconverted_ga = ourQuery(params1, params2)
for page in unconverted_ga:
    data.append(page)
    esk = page['LastEvaluatedKey']
We don't know exactly how to use esk as the ExclusiveStartKey of our next query. What should the value of the ExclusiveStartKey parameter be? We are still new to DynamoDB and there are many things we need to learn, including this. Thanks!
From the answer by Tay B at https://stackoverflow.com/a/38619425/3176550
import boto3

dynamodb = boto3.resource('dynamodb',
                          aws_session_token=aws_session_token,
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key,
                          region_name=region)
table = dynamodb.Table('widgetsTableName')

response = table.scan()
data = response['Items']
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    data.extend(response['Items'])  # data is a list, so extend it rather than update
After hours of searching, I've finally found a better solution. For those who are new to DynamoDB, you shouldn't miss this: http://docs.aws.amazon.com/amazondynamodb/latest/gettingstartedguide/GettingStarted.Python.04.html
from __future__ import print_function  # Python 2/3 compatibility
import boto3
import json
import decimal
from boto3.dynamodb.conditions import Key, Attr

# Helper class to convert a DynamoDB item to JSON.
class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)

dynamodb = boto3.resource('dynamodb', region_name='us-west-2', endpoint_url="http://localhost:8000")
table = dynamodb.Table('Movies')

fe = Key('year').between(1950, 1959)
pe = "#yr, title, info.rating"
# Expression Attribute Names for Projection Expression only.
ean = {"#yr": "year"}
esk = None

response = table.scan(
    FilterExpression=fe,
    ProjectionExpression=pe,
    ExpressionAttributeNames=ean
)
for i in response['Items']:
    print(json.dumps(i, cls=DecimalEncoder))

# As long as LastEvaluatedKey is in the response, there are still items to fetch
while 'LastEvaluatedKey' in response:
    response = table.scan(
        ProjectionExpression=pe,
        FilterExpression=fe,
        ExpressionAttributeNames=ean,
        ExclusiveStartKey=response['LastEvaluatedKey']
    )
    for i in response['Items']:
        print(json.dumps(i, cls=DecimalEncoder))
You can try the following code:
esk = None
while True:
    scan_generator = YourTableName.scan(max_results=10, exclusive_start_key=esk)
    for item in scan_generator:
        pass  # your code for processing
        # condition to check if entire table is scanned
    else:
        break
    # Load the last keys
    esk = scan_generator.kwargs['exclusive_start_key'].values()
Here is the reference documentation link.
Hope that helps
A bit more verbose, but I like it:
def fetch_from_table(last_key=None):
    if last_key:
        response = table.query(
            IndexName='advertCatalogIdx',
            KeyConditionExpression=Key('sk').eq('CATALOG'),
            Limit=5,
            ExclusiveStartKey=last_key
        )
    else:
        response = table.query(
            IndexName='advertCatalogIdx',
            KeyConditionExpression=Key('sk').eq('CATALOG'),
            Limit=5
        )
    # print(response)
    for item in response['Items']:
        print(item['address'])
    print('***************************')
    return response.get('LastEvaluatedKey')

last_key = fetch_from_table()
while last_key is not None:
    print("Running again : ")
    last_key = fetch_from_table(last_key)
import sys
import boto3

client = boto3.client('dynamodb')
marker = None
while True:
    paginator = client.get_paginator('list_tables')
    page_iterator = paginator.paginate(
        PaginationConfig={
            'MaxItems': 1000,
            'PageSize': 100,
            'StartingToken': marker})
    for page in page_iterator:
        tables = page['TableNames']
        for table in tables:
            print(table)
    try:
        marker = page['NextToken']
    except KeyError:
        sys.exit()