BigQuery displaying wrong results - Duplicating data from Cloud Function? - google-cloud-platform

I am a junior developer and I was put in charge of integrating the Facebook API into an existing project. However, the business team noticed that the Google Analytics results displayed in BigQuery are wrong, and they asked me to fix it. This is the architecture (diagram omitted): a cron job on a Compute Engine instance exports Google Analytics data to Cloud Storage, and a Cloud Function loads it into BigQuery.
What I have done is:
On BigQuery, I checked how far the results are from Google Analytics. I found a pattern: the values I get in BigQuery are always either 1, 2 or 3 times the original GA value.
I checked whether there are actually multiple cron jobs on the Compute Engine instance. There is only one cron job, and it runs once a day.
I verified the results on Google Cloud Storage, and the data in Google Cloud Storage is correct.
Based on that information, I strongly believe the issue is coming from the Cloud Function, as it is the only element between GCS and BigQuery. I have looked at the Cloud Function that is triggered by files arriving in GCS and I could not find any duplicate operations.
Do you know how I can find the issue?
Cloud Function
import os
from datetime import datetime

import pandas as pd
from google.cloud import bigquery, storage

BUCKET = "xxxx"
GOOGLE_PROJECT = "xxxx"

HEADER_MAPPING = {
    "Source/Medium": "source_medium",
    "Campaign": "campaign",
    "Last Non-Direct Click Conversions": "last_non_direct_click_conversions",
    "Last Non-Direct Click Conversion Value": "last_non_direct_click_conversion_value",
    "Last Click Prio Conversions": "last_click_prio_conversions",
    "Last Click Prio Conversion Value": "last_click_prio_conversion_value",
    "Data-Driven Conversions": "dda_conversions",
    "Data-Driven Conversion Value": "dda_conversion_value",
    "% Change in Conversions from Last Non-Direct Click to Last Click Prio": "last_click_prio_vs_last_click",
    "% Change in Conversions from Last Non-Direct Click to Data-Driven": "dda_vs_last_click"
}

SPEND_HEADER_MAPPING = {
    "Source/Medium": "source_medium",
    "Campaign": "campaign",
    "Spend": "spend"
}

tables_schema = {
    "google-analytics": [
        bigquery.SchemaField("date", bigquery.enums.SqlTypeNames.DATE, mode='REQUIRED'),
        bigquery.SchemaField("week", bigquery.enums.SqlTypeNames.INT64, mode='REQUIRED'),
        bigquery.SchemaField("goal", bigquery.enums.SqlTypeNames.STRING, mode='REQUIRED'),
        bigquery.SchemaField("source", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("medium", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("campaign", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("last_non_direct_click_conversions", bigquery.enums.SqlTypeNames.INT64, mode='NULLABLE'),
        bigquery.SchemaField("last_non_direct_click_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_conversions", bigquery.enums.SqlTypeNames.INT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_conversions", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_conversion_value", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("last_click_prio_vs_last_click", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
        bigquery.SchemaField("dda_vs_last_click", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE')
    ],
    "google-analytics-spend": [
        bigquery.SchemaField("date", bigquery.enums.SqlTypeNames.DATE, mode='REQUIRED'),
        bigquery.SchemaField("week", bigquery.enums.SqlTypeNames.INT64, mode='REQUIRED'),
        bigquery.SchemaField("source", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("medium", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("campaign", bigquery.enums.SqlTypeNames.STRING, mode='NULLABLE'),
        bigquery.SchemaField("spend", bigquery.enums.SqlTypeNames.FLOAT64, mode='NULLABLE'),
    ]
}


def download_from_gcs(file):
    # Download the triggering object from GCS into the function's /tmp directory.
    client = storage.Client()
    bucket = client.get_bucket(BUCKET)
    blob = bucket.get_blob(file['name'])
    file_name = os.path.basename(os.path.normpath(file['name']))
    blob.download_to_filename(f"/tmp/{file_name}")
    return file_name


def load_in_bigquery(file_object, dataset: str, table: str):
    # Run a CSV load job against the target table using the predefined schema.
    client = bigquery.Client()
    table_id = f"{GOOGLE_PROJECT}.{dataset}.{table}"
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        schema=tables_schema[table]
    )
    job = client.load_table_from_file(file_object, table_id, job_config=job_config)
    job.result()  # Wait for the job to complete.


def __order_columns(df: pd.DataFrame, spend=False) -> pd.DataFrame:
    # We want to have the source and medium columns at the third position
    # for a spend data frame and at the fourth position for other data frames,
    # because spend data frames don't have a goal column.
    pos = 2 if spend else 3
    cols = df.columns.tolist()
    cols[pos:2] = cols[-2:]
    cols = cols[:-2]
    return df[cols]


def __common_transformation(df: pd.DataFrame, date: str, goal: str) -> pd.DataFrame:
    # For any kind of data frame, we add date and week columns based on the
    # file name, and we split Source/Medium from the CSV into two columns.
    week_of_the_year = datetime.strptime(date, '%Y-%m-%d').isocalendar()[1]
    df.insert(0, 'date', date)
    df.insert(1, 'week', week_of_the_year)
    mapping = SPEND_HEADER_MAPPING if goal == "spend" else HEADER_MAPPING
    print(df.columns.tolist())
    df = df.rename(columns=mapping)
    print(df.columns.tolist())
    print(df)
    df["source_medium"] = df["source_medium"].str.replace(' ', '')
    df[["source", "medium"]] = df["source_medium"].str.split('/', expand=True)
    df = df.drop(["source_medium"], axis=1)
    df["week"] = df["week"].astype(int, copy=False)
    return df


def __transform_spend(df: pd.DataFrame) -> pd.DataFrame:
    df["spend"] = df["spend"].astype(float, copy=False)
    df = __order_columns(df, spend=True)
    return df[df.columns[:6]]


def __transform_attribution(df: pd.DataFrame, goal: str) -> pd.DataFrame:
    df.insert(2, 'goal', goal)
    df["last_non_direct_click_conversions"] = df["last_non_direct_click_conversions"].astype(int, copy=False)
    df["last_click_prio_conversions"] = df["last_click_prio_conversions"].astype(int, copy=False)
    df["dda_conversions"] = df["dda_conversions"].astype(float, copy=False)
    return __order_columns(df)


def transform(df: pd.DataFrame, file_name) -> pd.DataFrame:
    goal, date, *_ = file_name.split('_')
    df = __common_transformation(df, date, goal)
    # We only add goal in the attribution df (google-analytics table).
    return __transform_spend(df) if "spend" in file_name else __transform_attribution(df, goal)


def main(event, context):
    """Triggered by a change to a Cloud Storage bucket.
    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    file = event
    file_name = download_from_gcs(file)
    df = pd.read_csv(f"/tmp/{file_name}")
    transformed_df = transform(df, file_name)
    with open(f"/tmp/bq_{file_name}", "w") as file_object:
        file_object.write(transformed_df.to_csv(index=False))
    with open(f"/tmp/bq_{file_name}", "rb") as file_object:
        table = "google-analytics-spend" if "spend" in file_name else "google-analytics"
        load_in_bigquery(file_object, dataset='attribution', table=table)
Update
Yes, the Cloud Function is triggered by the GCS object finalize event. Moreover, the function won't be automatically retried on failure.
I am following your suggestions and I am now checking the logs on my Cloud Function page. In the last 10 lines of log data, it seems that 3 different instances of the Cloud Function were run. I am not able to get more details when I expand each line.
I am also going to check the BigQuery logs now. I guess the easiest solution would be to use BigQueryAuditMetadata and get logs about when the table was updated?
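If it helps, the same information is also available without BigQueryAuditMetadata by querying the INFORMATION_SCHEMA jobs view from a small script. A minimal sketch, assuming the attribution dataset from the question lives in the US multi-region (adjust the region qualifier otherwise):

from google.cloud import bigquery

client = bigquery.Client()

# Count the load jobs that targeted the dataset in the last day.
query = """
SELECT job_id, creation_time, destination_table.table_id AS table_id, state, error_result
FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
WHERE job_type = 'LOAD'
  AND destination_table.dataset_id = 'attribution'
  AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY creation_time DESC
"""

for row in client.query(query).result():
    print(row.creation_time, row.table_id, row.state, row.error_result)

If the same file produced two or three load jobs, it should show up immediately in that list.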

From my point of view, this is a very big topic, so it might be very difficult to provide one precise solution that solves all issues. I won't be able to solve the issue outright, but I can share some personal observations and suggestions.
The cloud function is triggered by the GCS object finalize event - can you check that this is correct, please? In that case, the event goes through Pub/Sub before triggering the cloud function invocation. There are 2 things to keep in mind:
Pub/Sub is based on an 'at least once' delivery paradigm, thus duplicate message deliveries are possible.
Such a cloud function invocation has an automatic acknowledgement, and the developer has no control over it. Pub/Sub cannot be used to control the overall process state. The longer the cloud function runs (up to the 540-second timeout or more), the higher the chance that Pub/Sub decides (internally) that the message has not been delivered and should be delivered again, resulting in a new invocation of the cloud function.
Some additional details are here: Issue: Cloud Function explicit acknowledgement of a pubsub message
Now, how to see if that happens. Personally, I would start with logging. When the cloud function starts, I would log the object name and some hash code (e.g. CRC32C or MD5, which are available from the event metadata) - just in the cloud function code. That way, the logs would show whether there are many cloud function invocations for one GCS object (if that happens). Another good idea is to log how long each cloud function execution takes.
By doing that we can check whether the cloud function is called more than once for a given GCS object; a minimal sketch is shown below.
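For example, a small logging helper along these lines could be called at the very top of the existing main(event, context). The object name, generation and hashes come from the GCS event payload; the event_id comes from the function context (this is just an illustrative sketch, not part of the original code):

import json
import time

def log_invocation(event, context):
    # Log enough metadata to correlate invocations with a single GCS object.
    print(json.dumps({
        "event_id": context.event_id,
        "object": event.get("name"),
        "generation": event.get("generation"),
        "crc32c": event.get("crc32c"),
        "md5": event.get("md5Hash"),
        "started_at": time.time(),
    }))

Filtering the Cloud Function logs by object name (or hash) then shows whether the same file produced several invocations; logging the time at start and end also gives the execution duration mentioned above.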
The next step: how we load the data. A 'load' is a job, which means there is a queue and a scheduler (somewhere in the GCP BigQuery service). A load job stays in the queue until it is picked up, and only then is it executed. All of that is very 'asynchronous'. Can you check whether there are failed load jobs, please? In the simplest case that can be done through the BigQuery UI; a sketch using the client library follows.
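A hedged alternative to the UI, using the google-cloud-bigquery client to list recent jobs and surface any failed load jobs (the one-day window is an arbitrary choice for this example):

from datetime import datetime, timedelta, timezone

from google.cloud import bigquery

client = bigquery.Client()

# List jobs created in the last 24 hours and print load jobs that failed.
since = datetime.now(timezone.utc) - timedelta(days=1)
for job in client.list_jobs(min_creation_time=since, all_users=True):
    if job.job_type == "load" and job.error_result:
        print(job.job_id, job.created, job.error_result)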
On top of that, loading from inside the cloud function's memory is, from my point of view, not only expensive but also risky and unreliable. Even simply saving the CSV into a GCS bucket and loading it from the bucket may be much better (see the sketch below).
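A minimal sketch of that variant, assuming the transformed CSV is first written to a staging bucket (the bucket name and path below are placeholders, not from the original code):

from google.cloud import bigquery, storage

def load_from_gcs(local_path: str, file_name: str, table_id: str):
    # Upload the transformed CSV to a staging bucket first...
    staging_bucket = storage.Client().bucket("my-staging-bucket")  # placeholder name
    staging_bucket.blob(f"bq/{file_name}").upload_from_filename(local_path)

    # ...then let BigQuery pull it directly from GCS instead of from function memory.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
    )
    uri = f"gs://my-staging-bucket/bq/{file_name}"
    job = bigquery.Client().load_table_from_uri(uri, table_id, job_config=job_config)
    job.result()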
Next: there is a quota of 1,500 load jobs per table per day, as far as I remember. If you have many files to load, you can easily exceed that quota.
An alternative way of 'loading' data is to use streaming inserts. Streaming does not have that quota limitation, but it is not free, so you pay for what you stream; a sketch follows.
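A hedged sketch of the streaming variant using insert_rows_json (the table_id is illustrative; transformed_df stands for the data frame produced by the existing transform() function):

from google.cloud import bigquery

client = bigquery.Client()
table_id = "your-project.attribution.google-analytics"  # placeholder project id

# Streaming inserts take plain dicts, one per row.
rows = transformed_df.to_dict(orient="records")
errors = client.insert_rows_json(table_id, rows)
if errors:
    print(f"Streaming insert returned errors: {errors}")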
I will stop here for now. Please let me know whether the above was useful, and in which direction you are going to develop your solution.
=> Update 04 February 2021 10:50 GMT
To avoid copy and paste - see the answer here: Cloud Function running multiple times instead of once

Related

Cron job not able to trigger AWS Step function

I followed this tutorial here to configure a cron job to kick off my AWS Step Functions Data Science SDK state machine. Every time it tries to trigger the state machine, it immediately fails.
I added the newly generated IAM role to have full access to SageMaker and Lambda, so I am not sure what is going on.
I am going to provide more information here. The way I set up my sklearn estimator is like this:
sklearn_estimator = SKLearn(
    entry_point=sm_script,
    role=role,
    instance_count=1,
    dependencies=[sm_utils_file, config_path, 'requirements.txt'],
    instance_type=training_instance,
    sagemaker_session=sm_sess,
    framework_version=FRAMEWORK_VERSION,
    base_job_name='{}-training'.format(base_name),
    hyperparameters={'config': config_file},
    metric_definitions=[
        {'Name': 'client_devices_validation_accuracy_top1', 'Regex': "client devices accuracy for top 1 = ([0-9.]+)"},
        {'Name': 'client_devices_validation_f1_top1', 'Regex': "client devices f1 for top 1 = ([0-9.]+)"},
        {'Name': 'account_and_password_validation_accuracy_top1', 'Regex': "account and password accuracy for top 1 = ([0-9.]+)"},
        {'Name': 'account_and_password_validation_f1_top1', 'Regex': "account and password f1 for top 1 = ([0-9.]+)"}]
)
As you can see, I have set some dependencies there that the model needs in order to train. The training step is defined like so:
training_step = steps.TrainingStep(
    "Train Step",
    estimator=sklearn_estimator,
    data={
        "train": sagemaker.TrainingInput(pre_train_utils.resolution_data_path()),
    },
    job_name=execution_input["TrainingJobName"],
    wait_for_completion=True,
)
The pre_train_utils.resolution_data_path() call grabs the newest data from Redshift, and pre_train_utils is declared as a dependency of the estimator, so it should be fine? I am now thinking this could be the problem?
Update:
I was able to find the error, which states:
An error occurred while executing the state 'Train Step' (entered at the event id #2). The JSONPath '$$.Execution.Input['TrainingJobName']' specified for the field 'TrainingJobName.$' could not be found in the input
Specifically, it needs a JSON input that looks like this in my case:
{
    "TrainingJobName": "tt-resolution-classifier-training-2022-09-02",
    "ModelName": "tt-resolution-classifier-model-2022-09-02",
    "EndpointName": "tt-resolution-classifier-endpoint",
    "LambdaFunctionName": "odi-ds-grab-ticket-training-metrics"
}
How do I pass this into the CloudWatch cron job? If I cannot pass it, then I cannot automatically have the state machine train and deploy the endpoint...
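One possible direction (a sketch, not a confirmed fix for this setup) is to attach a static JSON input to the EventBridge/CloudWatch Events rule target, so every scheduled run starts the state machine with that payload. The rule name, state machine ARN and role ARN below are placeholders:

import json

import boto3

events = boto3.client("events")

# Static input passed to the state machine on every scheduled trigger.
# Note: fixed, dated job names only work if they may repeat; otherwise the
# input has to be generated per run (e.g. by a small Lambda in front).
static_input = {
    "TrainingJobName": "tt-resolution-classifier-training-2022-09-02",
    "ModelName": "tt-resolution-classifier-model-2022-09-02",
    "EndpointName": "tt-resolution-classifier-endpoint",
    "LambdaFunctionName": "odi-ds-grab-ticket-training-metrics",
}

events.put_targets(
    Rule="my-training-cron-rule",  # placeholder rule name
    Targets=[{
        "Id": "train-state-machine",
        "Arn": "arn:aws:states:us-east-1:123456789012:stateMachine:my-state-machine",  # placeholder
        "RoleArn": "arn:aws:iam::123456789012:role/my-events-invoke-role",  # placeholder
        "Input": json.dumps(static_input),
    }],
)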

How to see progress when using Glue to export DynamoDB table

I'm trying to export every item in a DynamoDB table to S3. I found this tutorial https://aws.amazon.com/blogs/big-data/how-to-export-an-amazon-dynamodb-table-to-amazon-s3-using-aws-step-functions-and-aws-glue/ and followed the example. Basically,
table = glueContext.create_dynamic_frame.from_options(
    "dynamodb",
    connection_options={
        "dynamodb.input.tableName": table_name,
        "dynamodb.throughput.read.percent": read_percentage,
        "dynamodb.splits": splits
    }
)

glueContext.write_dynamic_frame.from_options(
    frame=table,
    connection_type="s3",
    connection_options={
        "path": output_path
    },
    format=output_format,
    transformation_ctx="datasink"
)
I tested it on a tiny table in a nonprod environment and it works fine. But my DynamoDB table in production is over 400 GB, with 200 million items. I suppose it'll take a while, but I have no idea how long to expect: hours, or even days? Is there any way to show progress, for example a count of how many items have been processed? I don't want to blindly start this job and wait.
One way would be to enable continuous logging for your AWS Glue job so you can monitor its progress (a sketch of this option is shown below).
Another way would be to trigger a Lambda function whenever a file is stored in S3, using Amazon S3 event notifications.
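For the first option, a hedged sketch of starting the job with the documented --enable-continuous-cloudwatch-log special parameter via boto3 (the job name is a placeholder):

import boto3

glue = boto3.client("glue")

# Continuous logging streams driver/executor logs to CloudWatch while the job
# runs, which makes it easier to follow the progress of a long export.
response = glue.start_job_run(
    JobName="dynamodb-export-job",  # placeholder job name
    Arguments={
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-metrics": "true",
    },
)
print(response["JobRunId"])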
Did you try the custom waiter class shown in the AWS docs?
For instance, a custom waiter for a Glue job run could look something like this:
# CustomWaiter and WaitState come from the custom_waiter helper used in the
# AWS SDK for Python code examples.
class JobCompleteWaiter(CustomWaiter):
    def __init__(self, client):
        super().__init__(
            "JobComplete",
            "get_job_run",
            "JobRun.JobRunState",
            {"SUCCEEDED": WaitState.SUCCEEDED, "FAILED": WaitState.FAILED},
            client,
            max_tries=100,
        )

    def wait(self, JobName, RunId):
        self._wait(JobName=JobName, RunId=RunId)
According to the boto3 docs, you should expect one of the following possible states from a job run: 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'|'TIMEOUT'.
So I chose to check whether it was SUCCEEDED or FAILED.

GCP Cloud Tasks: shorten period for creating a previously created named task

We are developing a GCP Cloud Tasks based queue process that sends a status email whenever a particular Firestore doc write-trigger fires. The reason we use Cloud Tasks is so that a delay can be created (using the scheduledTime property, 2 minutes in the future) before the email is sent, and to control dedup (by using a task-name formatted as: [firestore-collection-name]-[doc-id]), since the 'write' trigger on the Firestore doc can fire several times as the document is being created and then quickly updated by backend cloud functions.
Once the task's delay period has been reached, the cloud task runs and the email is sent with the updated Firestore document info included, after which the task is deleted from the queue and all is good.
Except:
If the user updates the Firestore doc (say 20 or 30 min later) we want to resend the status email but are unable to create the task using the same task-name. We get the following error:
409 The task cannot be created because a task with this name existed too recently. For more information about task de-duplication see https://cloud.google.com/tasks/docs/reference/rest/v2/projects.locations.queues.tasks/create#body.request_body.FIELDS.task.
This was unexpected, as the queue is empty at this point and the last task completed successfully. The documentation referenced in the error message says:
If the task's queue was created using Cloud Tasks, then another task
with the same name can't be created for ~1hour after the original task
was deleted or executed.
Question: is there some way in which this restriction can be bypassed, by lowering the amount of time or even removing the restriction altogether?
The short answer is no. As you've already pointed out, the docs are very clear about this behavior: you have to wait about 1 hour to create a task with the same name as one that was previously created. The API and client libraries do not allow you to decrease this time.
Having said that, I would suggest that instead of using the same Task ID, use different ones for the task and add an identifier in the body of the request. For example, using Python:
from google.cloud import tasks_v2
from google.protobuf import timestamp_pb2
import datetime

def create_task(project, queue, location, payload=None, in_seconds=None):
    client = tasks_v2.CloudTasksClient()
    parent = client.queue_path(project, location, queue)
    task = {
        'app_engine_http_request': {
            'http_method': 'POST',
            'relative_uri': '/task/' + queue
        }
    }
    if payload is not None:
        converted_payload = payload.encode()
        task['app_engine_http_request']['body'] = converted_payload
    if in_seconds is not None:
        d = datetime.datetime.utcnow() + datetime.timedelta(seconds=in_seconds)
        timestamp = timestamp_pb2.Timestamp()
        timestamp.FromDatetime(d)
        task['schedule_time'] = timestamp
    response = client.create_task(parent, task)
    print('Created task {}'.format(response.name))
    print(response)

# You can replace DOCUMENT_ID with USER_ID or something else to identify the task.
create_task(PROJECT_ID, QUEUE, REGION, DOCUMENT_ID)
Facing a similar problem of needing to debounce multiple instances of Firestore write-trigger functions, we worked around the default Cloud Tasks task-name based dedup mechanism (still a constraint in Nov 2022) by building a small debounce "helper" using Firestore transactions.
We're using a helper collection _syncHelper_ to implement a delayed throttle for side effects of write-trigger fires - in the OP's case, send 1 email for all writes within 2 minutes.
In our case we are using the Firebase Functions task queue utilities rather than interacting with Cloud Tasks directly, but that's immaterial to the solution. The key is to determine the task's execution time in advance and use that as the "dedup key":
const { getFirestore, Timestamp } = require('firebase-admin/firestore');
const { getFunctions } = require('firebase-admin/functions');

async function enqueueTask(shopId) {
  const queueName = 'doSomething';
  const now = new Date();
  const next = new Date(now.getTime() + 2 * 60 * 1000);
  try {
    const shouldEnqueue = await getFirestore().runTransaction(async t => {
      const syncRef = getFirestore().collection('_syncHelper_').doc(<collection_id-doc_id>);
      const doc = await t.get(syncRef);
      let data = doc.data();
      if (data?.timestamp.toDate() > now) {
        return false;
      }
      await t.set(syncRef, { timestamp: Timestamp.fromDate(next) });
      return true;
    });
    if (shouldEnqueue) {
      let queue = getFunctions().taskQueue(queueName);
      await queue.enqueue(
        { timestamp: next.toISOString() },
        { scheduleTime: next }
      );
    }
  } catch {
    ...
  }
}
This will ensure a new task is enqueued only if the "next execution" time has passed.
The execution operation (also a cloud function in our case) will remove the sync data entry if it hasn't been changed since it was executed:
const functions = require('firebase-functions');
const { getFirestore } = require('firebase-admin/firestore');

exports.doSomething = functions.tasks.taskQueue({
  retryConfig: {
    maxAttempts: 2,
    minBackoffSeconds: 60,
  },
  rateLimits: {
    maxConcurrentDispatches: 2,
  }
}).onDispatch(async data => {
  let { timestamp } = data;
  await sendYourEmailHere();
  await getFirestore().runTransaction(async t => {
    const syncRef = getFirestore().collection('_syncHelper_').doc(<collection_id-doc_id>);
    const doc = await t.get(syncRef);
    let data = doc.data();
    if (data?.timestamp.toDate() <= new Date(timestamp)) {
      await t.delete(syncRef);
    }
  });
});
This isn't a bulletproof solution (if the doSomething() execution function has high latency, for example), but it is good enough for 99% of our use cases.

How to read and parse data from PubSub topic into a beam pipeline and print it

I have a program which creates a topic in Pub/Sub and also publishes messages to the topic. I also have an automated Dataflow job (using a template) which saves these messages into my BigQuery table. Now I intend to replace the template-based job with a Python pipeline where my requirement is to read data from Pub/Sub, apply transformations, and save the data into BigQuery / publish it to another Pub/Sub topic. I started writing the script in Python and did a lot of trial and error to achieve it, but to my dismay, I could not. The code looks like this:
import apache_beam as beam
from apache_beam.io import WriteToText

TOPIC_PATH = "projects/test-pipeline-253103/topics/test-pipeline-topic"
OUTPUT_PATH = "projects/test-pipeline-253103/topics/topic-repub"

def run():
    o = beam.options.pipeline_options.PipelineOptions()
    p = beam.Pipeline(options=o)
    print("I reached here")
    # Read from PubSub into a PCollection.
    data = (
        p
        | "Read From Pub/Sub" >> beam.io.ReadFromPubSub(topic=TOPIC_PATH)
    )
    data | beam.io.WriteToPubSub(topic=OUTPUT_PATH)
    print("Lines: ", data)

run()
I will really appreciate it if I can get some help at the earliest.
Note: I have my project set up on Google Cloud and I have my script running locally.
Here is the working code.
import apache_beam as beam

TOPIC_PATH = "projects/test-pipeline-253103/topics/test-pipeline-topic"
OUTPUT_PATH = "projects/test-pipeline-253103/topics/topic-repub"

class PrintValue(beam.DoFn):
    def process(self, element):
        print(element)
        return [element]

def run():
    o = beam.options.pipeline_options.PipelineOptions()
    # Replace this with the --streaming execution param
    standard_options = o.view_as(beam.options.pipeline_options.StandardOptions)
    standard_options.streaming = True
    p = beam.Pipeline(options=o)
    print("I reached here")
    # Read from PubSub into a PCollection.
    data = p | beam.io.ReadFromPubSub(topic=TOPIC_PATH) | beam.ParDo(PrintValue()) | beam.io.WriteToPubSub(topic=OUTPUT_PATH)
    # Don't forget to run the pipeline!
    result = p.run()
    result.wait_until_finish()

run()
In summary
You forgot to run the pipeline. Indeed, Beam is a graph programming model: in your previous code you built your graph but never ran it. Here, at the end, we run it (a non-blocking call) and wait for the end (a blocking call).
When you start your pipeline, Beam mentions that Pub/Sub only works in streaming mode. Thus, you can start your code with the --streaming param, or do it programmatically as shown in my code.
Be careful: streaming mode means listening indefinitely on Pub/Sub. If you run this on Dataflow, your pipeline will stay up until you stop it. This can be expensive if you have few messages. Be sure that this is the target model.
An alternative is to use your pipeline for a limited period of time (you use one scheduler for starting it and another one for stopping it). But in that case, you have to stack messages. Here you use a topic as the entry of the pipeline. This option forces Beam to create a temporary subscription and to listen for messages on that subscription, which means that messages published before this subscription was created won't be received and processed.
The idea is to create a subscription; this way the messages will be stacked in it (up to 7 days by default). Then, use the subscription name in the entry of your pipeline: beam.io.ReadFromPubSub(subscription=SUB_PATH). The messages will be unstacked and processed by Beam (order not guaranteed!). A sketch of that variant follows.
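A minimal sketch of that alternative, assuming the subscription is created ahead of time with the google-cloud-pubsub client (the subscription name is a placeholder):

from google.cloud import pubsub_v1
import apache_beam as beam

PROJECT = "test-pipeline-253103"
TOPIC_PATH = f"projects/{PROJECT}/topics/test-pipeline-topic"
SUB_PATH = f"projects/{PROJECT}/subscriptions/test-pipeline-sub"  # placeholder name

# Create the subscription once (outside the pipeline); messages published
# after this point are retained until the pipeline reads them.
subscriber = pubsub_v1.SubscriberClient()
subscriber.create_subscription(name=SUB_PATH, topic=TOPIC_PATH)

o = beam.options.pipeline_options.PipelineOptions(streaming=True)
p = beam.Pipeline(options=o)
data = p | beam.io.ReadFromPubSub(subscription=SUB_PATH)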
Based on the Beam programming guide, you simply have to add a transform step to your pipeline. Here is an example of a transform:
class PrintValue(beam.DoFn):
    def process(self, element):
        print(element)
        return [element]
Add it to your pipeline
data | beam.ParDo(PrintValue()) | beam.io.WriteToPubSub(topic=OUTPUT_PATH)
You can add as many transforms as you want. You can test values and emit elements into tagged PCollections (to get multiple outputs) for fan-out, or use side inputs for fan-in.

list automated RDS snapshots created today and copy to other region using boto3

We are building an automated DR cold site in another region. Currently we are working on retrieving a list of RDS automated snapshots created today, to pass them to another function that copies them to the other AWS region.
The issue is with the RDS boto3 client, which returns the snapshot creation date in a format that makes filtering on creation date more difficult.
today = (datetime.today()).date()
rds_client = boto3.client('rds')
snapshots = rds_client.describe_db_snapshots(SnapshotType='automated')
harini = "datetime(" + today.strftime('%Y,%m,%d') + ")"
print harini
print snapshots
for i in snapshots['DBSnapshots']:
    if i['SnapshotCreateTime'].date() == harini:
        print(i['DBSnapshotIdentifier'])
        print (today)
Despite having already converted the date "harini" to the format 'SnapshotCreateTime': datetime(2015, 1, 1), the Lambda function is still unable to list the snapshots.
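For what it's worth, the comparison above checks a date object against a string, which can never be equal; SnapshotCreateTime is returned as a datetime, so comparing date objects directly (which is essentially what the last answer below does via strftime) should work. A minimal sketch of that check:

from datetime import date

import boto3

rds_client = boto3.client('rds')
snapshots = rds_client.describe_db_snapshots(SnapshotType='automated')

# SnapshotCreateTime is a datetime object, so compare date objects, not strings.
for snap in snapshots['DBSnapshots']:
    if snap['SnapshotCreateTime'].date() == date.today():
        print(snap['DBSnapshotIdentifier'])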
A better method is to copy the snapshots as they are created, by invoking a Lambda function from a CloudWatch event.
See the step-by-step instructions:
https://geektopia.tech/post.php?blogpost=Automating_The_Cross_Region_Copy_Of_RDS_Snapshots
Alternatively, you can issue a copy for each snapshot regardless of the date. The client will raise an exception for copies that already exist in the destination, and you can trap it like this:
# Written By GeekTopia
#
# Copy All Snapshots for an RDS Instance To a new region
# --Free to use under all conditions
# --Script is provided as is. No Warranty, Express or Implied

import json
import boto3
from botocore.exceptions import ClientError
import time

destinationRegion = "us-east-1"
sourceRegion = 'us-west-2'
rdsInstanceName = 'needbackups'

def lambda_handler(event, context):
    # We need two clients:
    #   rdsDestinationClient -- Used to start the copy processes. All cross-region
    #                           copies must be started from the destination and reference the source.
    #   rdsSourceClient      -- Used to list the snapshots that need to be copied.
    rdsDestinationClient = boto3.client('rds', region_name=destinationRegion)
    rdsSourceClient = boto3.client('rds', region_name=sourceRegion)

    # List all automated snapshots for a single instance
    snapshots = rdsSourceClient.describe_db_snapshots(DBInstanceIdentifier=rdsInstanceName, SnapshotType='automated')

    for snapshot in snapshots['DBSnapshots']:
        # Check that the snapshot is NOT in the process of being created
        if snapshot['Status'] == 'available':
            # Get the source snapshot ARN - always use the ARN when copying snapshots across regions
            sourceSnapshotARN = snapshot['DBSnapshotArn']

            # Build a new snapshot name
            sourceSnapshotIdentifer = snapshot['DBSnapshotIdentifier']
            targetSnapshotIdentifer = "{0}-ManualCopy".format(sourceSnapshotIdentifer)
            targetSnapshotIdentifer = targetSnapshotIdentifer.replace(":", "-")

            # Adding a delay to stop from reaching the API rate limit when there is a large amount of snapshots.
            # This should never occur in this use-case, but may if the script is modified to copy more than one instance.
            time.sleep(.2)

            # Execute copy
            try:
                copy = rdsDestinationClient.copy_db_snapshot(SourceDBSnapshotIdentifier=sourceSnapshotARN, TargetDBSnapshotIdentifier=targetSnapshotIdentifer, SourceRegion=sourceRegion)
                print("Started Copy of Snapshot {0} in {2} to {1} in {3} ".format(sourceSnapshotIdentifer, targetSnapshotIdentifer, sourceRegion, destinationRegion))
            except ClientError as ex:
                if ex.response['Error']['Code'] == 'DBSnapshotAlreadyExists':
                    print("Snapshot {0} already exists".format(targetSnapshotIdentifer))
                else:
                    print("ERROR: {0}".format(ex.response['Error']['Code']))

    return {
        'statusCode': 200,
        'body': json.dumps('Operation Complete')
    }
The code below will list the automated snapshots created today.
import boto3
from datetime import date, datetime

region_src = 'us-east-1'
client_src = boto3.client('rds', region_name=region_src)
date_today = datetime.today().strftime('%Y-%m-%d')

def get_db_snapshots_src():
    response = client_src.describe_db_snapshots(
        SnapshotType='automated',
        IncludeShared=False,
        IncludePublic=False
    )
    snapshotsInDay = []
    for i in response["DBSnapshots"]:
        if i["SnapshotCreateTime"].strftime('%Y-%m-%d') == date.isoformat(date.today()):
            snapshotsInDay.append(i)
    return snapshotsInDay