I am trying to launch a streaming Dataflow job that contains n pipelines.
Based on each configured topic and its corresponding BQ table, I want to launch one pipeline per topic inside a single streaming job.
My actual problem is that I have to create and upload a template for each and every project. What I want is to reuse the uploaded template and pass only configuration files when launching a new Dataflow job, changing the topic, subscription, dataset and BQ table.
At the moment I am unable to reuse the template.
Please help me with this and let me know whether it is possible, because Google provides only a one-to-one template, not a many-to-many template (e.g. three topics - three BQ tables (three data pipelines), n-n).
import logging
import os
import json
from google.cloud import storage
from apache_beam import Pipeline, ParDo, DoFn
from apache_beam.io import ReadFromPubSub, WriteToBigQuery, BigQueryDisposition
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, WorkerOptions, GoogleCloudOptions, \
def _get_storage_service():
storage_client = storage.Client \
print('storage service fetched')
return storage_client
class RuntimeOptions(PipelineOptions):
    """Pipeline options that add ``--bucket_name`` and ``--config_json_path``.

    Both options are registered as value-provider arguments.
    """

    def __init__(self, flags=None, **kwargs):
        super(RuntimeOptions, self).__init__(flags, **kwargs)

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register this class's custom arguments on *parser*.

        Declared as a classmethod because the first parameter is ``cls`` and
        the hook is looked up on the options class, not on an instance.
        """
        parser.add_value_provider_argument('--bucket_name', type=str)
        parser.add_value_provider_argument('--config_json_path', type=str)
class PipelineCreator:
def __init__(self):
self.options = PipelineOptions()
storage_client = storage.Client.from_service_account_json(
runtime_options = self.options.view_as(RuntimeOptions)
bucket_name = str(runtime_options.bucket_name)
config_json_path = str(runtime_options.config_json_path)
# get the bucket with name
bucket = storage_client.get_bucket(bucket_name)
# get bucket file as blob
blob = bucket.get_blob(config_json_path)
# convert to string and load config
json_data = blob.download_as_string()
self.configData = json.loads(json_data)
dataflow_config = self.configData['dataflow_config']
self.options.view_as(StandardOptions).streaming = bool(dataflow_config['streaming'])
self.options.view_as(SetupOptions).save_main_session = True
worker_options = self.options.view_as(WorkerOptions)
worker_options.max_num_workers = int(dataflow_config['max_num_worker'])
worker_options.autoscaling_algorithm = str(dataflow_config['autoscaling_algorithm'])
#worker_options.machine_type = str(dataflow_config['machine_type'])
#worker_options.zone = str(dataflow_config['zone'])
#worker_options.network = str(dataflow_config['network'])
#worker_options.subnetwork = str(dataflow_config['subnetwork'])
def run(self):
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'dataflow-service-account.json'
project_id = self.configData['project_id']
dataset_id = self.configData['dataset_id']
topics = self.configData['topics']
table_ids = self.configData['bq_table_ids']
error_table_id = self.configData['error_table_id']
logger = logging.getLogger(project_id)
pipeline = Pipeline(options=self.options)
size = len(topics)
for index in range(size):
pipeline_name = "pipeline_"+str(index)
logger.info("Launch pipeline :: "+pipeline_name)
messages = pipeline | 'Read PubSub Message in ' + pipeline_name >> ReadFromPubSub(topic=topics[index])
logger.info("Read PubSub Message")
valid_messages, invalid_messages = messages | 'Convert Messages to TableRows in ' + pipeline_name >> ParDo(TransformMessageToTableRow()).with_outputs('invalid', main='valid')
valid_messages | 'Write Messages to BigQuery in ' + pipeline_name >> WriteToBigQuery(table=table_ids[index],
class TransformMessageToTableRow(DoFn):
def process(self, element, *args, **kwargs):
logging.getLogger('dataflow').log(logging.INFO, element)
print element
print("element type ", type(element))
print("inside bq pardo")
import json
message_rows = json.loads(element)
# if using emulator, uncomment below line
message_rows = json.loads(message_rows)
print 'loaded element'
element = "[" + element + "]"
message_rows = json.loads(element)
except Exception as e:
from apache_beam import pvalue
yield [pvalue.TaggedOutput('invalid', [element, str(e)])]
print("message rows", type(message_rows))
if not isinstance(message_rows, list):
message_rows = [message_rows]
#rows = list()
if isinstance(message_rows, list):
for row in message_rows:
new_row = dict()
for k, v in row.items():
new_row[str(k)] = v
yield new_row
except Exception as e:
from apache_beam import pvalue
yield pvalue.TaggedOutput('invalid', [row, str(e)])
if __name__ == '__main__':
Here the runtime arguments are bucket_name and config_json_path, which cover all the configuration-related settings such as dataset, BQ table, topics/subscriptions and all workflow options.
Is this possible or not? Google provides only a one-to-one template, not a many-to-many template (e.g. three topics - three BQ tables (three data pipelines), n-n).

According to the previously answered thread "Unable to run multiple Pipelines in desired order by creating template in Apache Beam", you can run only one pipeline inside a template at any time.
You'll have to delegate the template creation to another service and pass the configuration to it; follow the link inside that thread and you'll find how-to examples.


How to manage stored procedures in AWS Redshift using an automation or command-line tool?

We've got a number of stored procedures that have been built in Redshift on AWS.
We need to download and upload these stored procedures so that they can be kept in GitHub as a means of tracking changes.
These procedures eventually need to be part of a CloudFormation template so that the infrastructure can be maintained as well.
Ideally this could be done using the AWS CLI, but there doesn't seem to be a command to do that.
How are AWS Redshift stored procedures managed in an automation/CI/CD environment?
I have a portion of a working solution.
import json
import psycopg2
import os
def run_sql_commands(input_command, database="mydatabasename", host_port=1234 ,host_url="datawarehouse.redshift.amazonaws.com"):
:param input_command: sql string to execute
:param database: which database to run the query
results = None
# db_user and db_pass will need to be set as environment variables
db_user = os.environ["db_user"]
db_pass = os.environ["db_pass"]
db_host = host_url
db_port = host_port
db_name = database
conn = psycopg2.connect(
"dbname={} port={} user={} host={} password={}".format(db_name, db_port, db_user, db_host, db_pass))
cursor = conn.cursor()
results = cursor.fetchall()
except Exception as e:
return None
return results
def get_arg_type(oid, database):
    """Return the ``typname`` stored in ``pg_catalog.pg_type`` for *oid*.

    :param oid: type OID, as an int or a numeric string
    :param database: which database to run the lookup against
    :raises ValueError: if *oid* is not an integer value
    :raises LookupError: if the query failed or returned no row
    """
    # Coerce to int so that only a number is ever interpolated into the SQL.
    sql = f"SELECT typname FROM pg_catalog.pg_type WHERE oid={int(oid)}"
    rows = run_sql_commands(sql, database)
    # run_sql_commands returns None when the query raises, so guard the
    # subscript instead of failing with an opaque TypeError/IndexError.
    if not rows:
        raise LookupError(f"no pg_type entry found for oid {oid!r}")
    return rows[0][0]
def download_all_procedures(database, file_location="./local-code-store"):
get_all_stored_procedure_sql = """
pg_catalog.pg_namespace n
JOIN pg_catalog.pg_proc p ON
pronamespace = n.oid
JOIN pg_user b ON
b.usesysid = p.proowner
nspname NOT IN ('information_schema', 'pg_catalog');
r = run_sql_commands(get_all_stored_procedure_sql, database)
for item in r:
table = item[0]
author = item[1]
procedure_name = item[2]
procedure_arguments = item[3]
procedure_argtypes = item[4]
procedure = item[5]
t_list = []
for this_oid in procedure_argtypes.split():
t = get_arg_type(this_oid, database="mydatabasename")
meta_data = {'table': table,
'author': author,
'arguments': procedure_arguments,
'argument_types': t_list}
filename = f'{file_location}/{database}/Schemas/{table}/{procedure_name}.sql'
os.makedirs(os.path.dirname(filename), exist_ok=True)
f = open(filename, 'w')
count = f.write(procedure.replace('\r\n', '\n'))
filename = f'{file_location}/{database}/Schemas/{table}/{procedure_name}.json'
os.makedirs(os.path.dirname(filename), exist_ok=True)
f = open(filename, 'w')
return counted_items
if __name__ == "__main__":
c = download_all_procedures("mydatabase")
print("... finished")

Copying S3 objects from one account to other using Lambda python

I'm using boto3 to copy files from an S3 bucket in one account to a bucket in another. I need functionality similar to aws s3 sync. Please see my code. My company has decided to 'PULL' from the other S3 bucket (source account). Please don't suggest replication, S3 batch, S3-triggered Lambda, etc. We have gone through all these options and my management does not want to do any configuration on the source side. Can you please review this code and let me know whether it works for thousands of objects? The source bucket has nearly 10000 objects. We will create this Lambda function in the destination account and create a CloudWatch event to trigger the Lambda once a day.
I am checking ETag so that modified files will be copied across when this function is triggered.
Edit: I simplified my code just to see whether pagination works. It works if I don't add client.copy(). If I add this line in the for loop, then after reading 3 or 4 objects it throws "errorMessage": "2021-08-07T15:29:07.827Z 82757747-7b72-4f29-ae9f-22e95f969d6c Task timed out after 3.00 seconds". Please advise. Please note that the 'test/' folder in my source bucket has around 1100 objects.
import os
import logging
import botocore
logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))
client = boto3.client('s3')
def handler(event, context):
    """Entry point: pass the event and the module-level logger to main().

    The ``context`` argument is accepted but not used.
    """
    main(event, logger)
def main(event, logger):
SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
DEST_BUCKET = os.environ.get('DST_BUCKET')
REGION = os.environ.get('REGION')
prefix = 'test/'
# Create a reusable Paginator
paginator = client.get_paginator('list_objects_v2')
print ('after paginator')
# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket=SOURCE_BUCKET,Prefix = prefix)
print ('after page iterator')
index = 0
for page in page_iterator:
for obj in page['Contents']:
index += 1
print ("I am looking for {} in the source bucket".format(obj['ETag']))
copy_source = {'Bucket': SOURCE_BUCKET, 'Key': obj['Key']}
client.copy(copy_source, DEST_BUCKET, obj['Key'])
logger.info("number of objects copied {}:".format(index))
except botocore.exceptions.ClientError as e:
This version works fine if I increase the Lambda timeout to 15 min and the memory to 512 MB. It checks whether the source object already exists in the destination before copying.
import boto3
import os
import logging
import botocore
from botocore.client import Config
logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))
config = Config(connect_timeout=5, retries={'max_attempts': 0})
client = boto3.client('s3', config=config)
#client = boto3.client('s3')
def handler(event, context):
    """Entry point: pass the event and the module-level logger to main().

    The ``context`` argument is accepted but not used.
    """
    main(event, logger)
def main(event, logger):
DEST_BUCKET = os.environ.get('DST_BUCKET')
SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
REGION = os.environ.get('REGION')
prefix = ''
# Create a reusable Paginator
paginator = client.get_paginator('list_objects_v2')
print ('after paginator')
# Create a PageIterator from the Paginator
page_iterator_src = paginator.paginate(Bucket=SOURCE_BUCKET,Prefix = prefix)
page_iterator_dest = paginator.paginate(Bucket=DEST_BUCKET,Prefix = prefix)
print ('after page iterator')
index = 0
for page_source in page_iterator_src:
for obj_src in page_source['Contents']:
flag = "FALSE"
for page_dest in page_iterator_dest:
for obj_dest in page_dest['Contents']:
# checks if source ETag already exists in destination
if obj_src['ETag'] in obj_dest['ETag']:
flag = "TRUE"
if flag == "TRUE":
if flag != "TRUE":
index += 1
client.copy_object(Bucket=DEST_BUCKET, CopySource={'Bucket': SOURCE_BUCKET, 'Key': obj_src['Key']}, Key=obj_src['Key'],)
print ("source ETag {} and destination ETag {}".format(obj_src['ETag'],obj_dest['ETag']))
print ("source Key {} and destination Key {}".format(obj_src['Key'],obj_dest['Key']))
print ("Number of objects copied{}".format(index))
logger.info("number of objects copied {}:".format(index))
except botocore.exceptions.ClientError as e:

'NoneType' object has no attribute 'name'

I was building a wine recommendation system using the k-means approach in Django. I made a cluster module in the admin and added 3 clusters manually. However, when I try to recommend wine to the logged-in user I get this error. Can you please help:
AttributeError at /reviews/recommendation/
'NoneType' object has no attribute 'name'
I am getting error in line:
here is the code for view.py
def user_recommendation_list(request):
# get request user reviewed wines
user_reviews = Review.objects.filter(user_name=request.user.username).prefetch_related('wine')
user_reviews_wine_ids = set(map(lambda x: x.wine.id, user_reviews))
# get request user cluster name (just the first one righ now)
user_cluster_name = \
except: # if no cluster assigned for a user, update clusters
user_cluster_name = \
# get usernames for other memebers of the cluster
user_cluster_other_members = \
Cluster.objects.get(name=user_cluster_name).users \
other_members_usernames = set(map(lambda x: x.username, user_cluster_other_members))
# get reviews by those users, excluding wines reviewed by the request user
other_users_reviews = \
Review.objects.filter(user_name__in=other_members_usernames) \
other_users_reviews_wine_ids = set(map(lambda x: x.wine.id, other_users_reviews))
# then get a wine list including the previous IDs, order by rating
wine_list = sorted(
key=lambda x: x.average_rating,
return render(
{'username': request.user.username,'wine_list': wine_list}
and here is the code for suggestions.py
from .models import Review, Wine, Cluster
from django.contrib.auth.models import User
from sklearn.cluster import KMeans
from scipy.sparse import dok_matrix, csr_matrix
import numpy as np
def update_clusters():
num_reviews = Review.objects.count()
update_step = ((num_reviews/100)+1) * 5
if num_reviews % update_step == 0: # using some magic numbers here, sorry...
# Create a sparse matrix from user reviews
all_user_names = map(lambda x: x.username, User.objects.only("username"))
all_wine_ids = set(map(lambda x: x.wine.id, Review.objects.only("wine")))
num_users = len(all_user_names)
ratings_m = dok_matrix((num_users, max(all_wine_ids)+1), dtype=np.float32)
for i in range(num_users): # each user corresponds to a row, in the order of all_user_names
user_reviews = Review.objects.filter(user_name=all_user_names[i])
for user_review in user_reviews:
ratings_m[i,user_review.wine.id] = user_review.rating
# Perform kmeans clustering
k = int(num_users / 10) + 2
kmeans = KMeans(n_clusters=k)
clustering = kmeans.fit(ratings_m.tocsr())
# Update clusters
new_clusters = {i: Cluster(name=i) for i in range(k)}
for cluster in new_clusters.values(): # clusters need to be saved before refering to users
for i,cluster_label in enumerate(clustering.labels_):
When you add data to the cluster table, you need to include the username of the currently logged-in user. For instance,
jadianes, carlos, and lluis
<username>, teus, yasset

Automating Date Range while extracting

The script below is what I use to extract data from Google Analytics. Here I am extracting data for the last week. I want to automate the date range so that I don't have to change date_range every week.
I also want to avoid sampling of the data by GA. Please guide me, in detail, on the correct way to automate this.
author = 'test#gmail.com (test)'
import argparse
import sys
import csv
import string
import datetime
import json
import time
from apiclient.errors import HttpError
from apiclient import sample_tools
from oauth2client.client import AccessTokenRefreshError
cam_name = sys.argv[1:]
class SampledDataError(Exception):
    """Signals that a query returned sampled data."""
def main(argv):
# Authenticate and construct service.
service, flags = sample_tools.init(
argv[0], 'analytics', 'v3', __doc__, __file__,
# Try to make a request to the API. Print the results or handle errors.
profile_id = profile_ids[profile]
if not profile_id:
print ('Could not find a valid profile for this user.')
metrics = argv[1]
dimensions = argv[2]
reportName = argv[3]
sort = argv[4]
filters = argv[5]
for start_date, end_date in date_ranges:
limit = ga_query(service, profile_id, 0,
start_date, end_date, metrics, dimensions, sort, filters).get('totalResults')
for pag_index in range(0, limit, 10000):
results = ga_query(service, profile_id, pag_index,
start_date, end_date, metrics, dimensions, sort, filters)
# if results.get('containsSampledData'):
# raise SampledDataError
print_results(results, pag_index, start_date, end_date, reportName)
except TypeError as error:
# Handle errors in constructing a query.
print ('There was an error in constructing your query : %s' % error)
except HttpError as error:
# Handle API errors.
print ('Arg, there was an API error : %s : %s' %
(error.resp.status, error._get_reason()))
except AccessTokenRefreshError:
# Handle Auth errors.
print ('The credentials have been revoked or expired, please re-run '
'the application to re-authorize')
except SampledDataError:
# force an error if ever a query returns data that is sampled!
print ('Error: Query contains sampled data!')
def ga_query(service, profile_id, pag_index, start_date, end_date, metrics, dimensions, sort, filters):
return service.data().ga().get(
ids='ga:' + profile_id,
def print_results(results, pag_index, start_date, end_date, reportName):
"""Prints out the results.
This prints out the profile name, the column headers, and all the rows of
results: The response returned from the Core Reporting API.
# New write header
if pag_index == 0:
if (start_date, end_date) == date_ranges[0]:
print ('Profile Name: %s' % results.get('profileInfo').get('profileName'))
columnHeaders = results.get('columnHeaders')
cleanHeaders = [str(h['name']) for h in columnHeaders]
print (reportName,'Now pulling data from %s to %s.' %(start_date, end_date))
# Print data table.
if results.get('rows', []):
for row in results.get('rows'):
for i in range(len(row)):
old, new = row[i], str()
for s in old:
new += s if s in string.printable else ''
row[i] = new
print ('No Rows Found')
limit = results.get('totalResults')
print (pag_index, 'of about', int(round(limit, -4)), 'rows.')
return None
# Uncomment this line & replace with 'profile name': 'id' to query a single profile
# Delete or comment out this line to loop over multiple profiles.
profile_ids = {'abc-Mobile': '12345',
'abc-Desktop': '23456',
'pqr-Mobile': '34567',
'pqr-Desktop': '45678',
'xyz-Mobile': '56789',
'xyz-Desktop': '67890'}
date_ranges = [
for profile in sorted(profile_ids):
print("Sequence 1",profile)
with open('qwerty.json') as json_data:
d = json.load(json_data)
for getThisReport in d["Reports"]:
print("Sequence 2",getThisReport["ReportName"])
reportName = getThisReport["ReportName"]
metrics = getThisReport["Metrics"]
dimensions = getThisReport["Dimensions"]
sort = getThisReport["sort"]
filters = getThisReport["filter"]
path = 'C:\\Projects\\DataExport\\test\\' #replace with path to your folder where csv file with data will be written
today = time.strftime('%Y%m%d')
filename = profile+'_'+reportName+'_'+today+'.csv' #replace with your filename. Note %s is a placeholder variable and the profile name you specified on row 162 will be written here
with open(path + filename, 'wt') as f:
writer = csv.writer(f,delimiter = '|', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
args = [sys.argv,metrics,dimensions,reportName,sort,filters]
if __name__ == '__main__': main(args)
print ( "Profile done. Next profile...")
print ("All profiles done.")
The Core Reporting API supports some interesting things as far as dates go.
All Analytics data requests must specify a date range. If you do not include start-date and end-date parameters in the request, the server returns an error. Date values can be for a specific date by using the pattern YYYY-MM-DD or relative by using today, yesterday, or the NdaysAgo pattern. Values must match [0-9]{4}-[0-9]{2}-[0-9]{2}|today|yesterday|[0-9]+(daysAgo).
So you can do something like:
start_date = '7daysAgo'
end_date = 'today'
Just remember that data doesn't finish processing for 24 - 48 hours, so your data for today, yesterday and the day before that may not be 100% accurate.

DynamoDB pagination using Boto3

We are using boto3 for our DynamoDB and we need to do a full scan of our tables. Based on another post, to do that we need to paginate. However, we are unable to find a working sample of pagination. Here is what we did.
import boto3
client_setting = boto3.client('dynamodb', region_name='ap-southeast-2')
paginator = client_setting.get_paginator('scan')
esk = {}
data = []
unconverted_ga = ourQuery(params1, params2)
for page in unconverted_ga:
esk = page['LastEvaluatedKey']
We don't know exactly how to pass esk as the ExclusiveStartKey of our next query. What is the expected value of the ExclusiveStartKey parameter? We are still new to DynamoDB and there are many things we need to learn, including this. Thanks!
From the answer by Tay B at https://stackoverflow.com/a/38619425/3176550
import boto3
dynamodb = boto3.resource('dynamodb',
table = dynamodb.Table('widgetsTableName')
response = table.scan()
data = response['Items']
while 'LastEvaluatedKey' in response:
response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
After an hour of searching, I've finally found a better solution. Those who are new to DynamoDB shouldn't miss this - http://docs.aws.amazon.com/amazondynamodb/latest/gettingstartedguide/GettingStarted.Python.04.html
from __future__ import print_function # Python 2/3 compatibility
import boto3
import json
import decimal
from boto3.dynamodb.conditions import Key, Attr
# Helper class to convert a DynamoDB item to JSON.
class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``decimal.Decimal`` values as numbers.

    Decimals with a fractional part are emitted as floats; whole-valued
    Decimals are emitted as ints. Any other unsupported type is passed on
    to the base encoder.
    """

    def default(self, o):
        if isinstance(o, decimal.Decimal):
            # Compare with != 0 rather than > 0: the remainder takes the sign
            # of the dividend, so Decimal('-1.5') % 1 is Decimal('-0.5') and a
            # "> 0" test would truncate negative fractions to an int.
            if o % 1 != 0:
                return float(o)
            return int(o)
        return super(DecimalEncoder, self).default(o)
dynamodb = boto3.resource('dynamodb', region_name='us-west-2', endpoint_url="http://localhost:8000")
table = dynamodb.Table('Movies')
fe = Key('year').between(1950, 1959)
pe = "#yr, title, info.rating"
# Expression Attribute Names for Projection Expression only.
ean = { "#yr": "year", }
esk = None
response = table.scan(
for i in response['Items']:
print(json.dumps(i, cls=DecimalEncoder))
# As long as LastEvaluatedKey is in the response, there are still items from the query left to fetch
while 'LastEvaluatedKey' in response:
response = table.scan(
ExpressionAttributeNames= ean,
for i in response['Items']:
print(json.dumps(i, cls=DecimalEncoder))
You can try with following code:
esk = None
while True:
scan_generator = YourTableName.scan(max_results=10, exclusive_start_key=esk)
for item in scan_generator:
# your code for processing
# condition to check if entire table is scanned
# Load the last keys
esk = scan_generator.kwargs['exclusive_start_key'].values()
Here is the reference documentation link.
Hope that helps
Bit more verbose but I like it.
def fetch_from_table(last_key=None):
if last_key:
response = table.query(
response = table.query(
# print(response)
for item in response['Items']:
return response.get('LastEvaluatedKey')
last_key = fetch_from_table()
while last_key != None:
print("Running again : ")
last_key = fetch_from_table(last_key)
import sys
import boto3
client = boto3.client('dynamodb')
marker = None
while True:
paginator = client.get_paginator('list_tables')
page_iterator = paginator.paginate(
'MaxItems': 1000,
'PageSize': 100,
'StartingToken': marker})
for page in page_iterator:
for table in tables:
print (table)
marker = page['NextToken']
except KeyError: