Upload Airflow XCom to Google Cloud Storage as JSON File

In Airflow, I have an XCom value from the task ID customer_schema that I want to write out as a JSON file named final_schema.json and upload to Google Cloud Storage. My bucket in Google Cloud Storage is named northern_industrial_customer. I tried to use the following FileToGoogleCloudStorageOperator, but it did not work.
Does anyone know how I can transfer my customer_schema XCom value to Google Cloud Storage as a JSON file named final_schema.json?
transfer_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='transfer_to_gcs',
    src="{{task_instance.xcom_pull(task_ids='customer_schema')}}",
    dst='final_schema.json',
    bucket='northern_industrial_customer',
    google_cloud_storage_conn_id=conn_id_gcs)

There is no built-in operator in Airflow that performs this operation, but Airflow is extensible and you can write your own custom operator.
import tempfile

from airflow.gcp.hooks.gcs import GoogleCloudStorageHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class ContentToGoogleCloudStorageOperator(BaseOperator):
    """
    Uploads text content to Google Cloud Storage.
    Optionally the content can be compressed for upload.

    :param content: Content to upload. (templated)
    :type content: str
    :param dst: Destination path within the specified bucket; it must be the full file path
        to the destination object on GCS, including the object name (e.g. `path/to/file.txt`). (templated)
    :type dst: str
    :param bucket: The bucket to upload to. (templated)
    :type bucket: str
    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
    :type gcp_conn_id: str
    :param mime_type: The mime-type string
    :type mime_type: str
    :param delegate_to: The account to impersonate, if any
    :type delegate_to: str
    :param gzip: Allows the content to be compressed and uploaded as gzip
    :type gzip: bool
    """
    template_fields = ('content', 'dst', 'bucket')

    @apply_defaults
    def __init__(self,
                 content,
                 dst,
                 bucket,
                 gcp_conn_id='google_cloud_default',
                 mime_type='application/octet-stream',
                 delegate_to=None,
                 gzip=False,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.content = content
        self.dst = dst
        self.bucket = bucket
        self.gcp_conn_id = gcp_conn_id
        self.mime_type = mime_type
        self.delegate_to = delegate_to
        self.gzip = gzip

    def execute(self, context):
        """
        Uploads the content to Google Cloud Storage
        """
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to
        )
        with tempfile.NamedTemporaryFile(prefix="gcs-local") as file:
            # NamedTemporaryFile is opened in binary mode, so encode the string content
            file.write(self.content.encode('utf-8'))
            file.flush()
            hook.upload(
                bucket_name=self.bucket,
                object_name=self.dst,
                mime_type=self.mime_type,
                filename=file.name,
                gzip=self.gzip,
            )
transfer_to_gcs = ContentToGoogleCloudStorageOperator(
    task_id='transfer_to_gcs',
    content="{{task_instance.xcom_pull(task_ids='customer_schema')}}",
    dst='final_schema.json',
    bucket='northern_industrial_customer',
    gcp_conn_id=conn_id_gcs)
Please note that in Airflow 2.0 the google_cloud_storage_conn_id parameter of the FileToGoogleCloudStorageOperator operator has been removed; you should use gcp_conn_id instead.
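If you are on Airflow 2 with the Google provider package installed, a custom operator may not even be necessary: in recent provider versions, GCSHook.upload accepts a data argument, so a small PythonOperator callable can push the XCom value straight to GCS. A rough sketch, reusing the bucket and connection names from the question (verify the upload signature against your provider version):

from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.hooks.gcs import GCSHook


def _upload_schema(ti, **_):
    # pull the XCom value pushed by the upstream customer_schema task
    content = ti.xcom_pull(task_ids='customer_schema')
    GCSHook(gcp_conn_id=conn_id_gcs).upload(
        bucket_name='northern_industrial_customer',
        object_name='final_schema.json',
        data=content,
        mime_type='application/json',
    )


transfer_to_gcs = PythonOperator(task_id='transfer_to_gcs', python_callable=_upload_schema)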

Related

How to Load data from Google Bucket to Google BigQuery Table?

Scenario that needs to be catered for:
The user will share a sales.csv file in a Google Cloud Storage bucket.
The sales.csv data should be loaded into Google BigQuery every time, with a timestamp.
Can someone guide me on how to do this following best practices?
For that you need to follow these steps:
Step 1: Create a Google Cloud Storage bucket
Step 2: Set up a Google Cloud Function triggered by uploads to that bucket
Step 3: Write the Cloud Function (Cloud Functions can be written in any of the supported runtimes; Python is used below)
import time

from google.cloud import bigquery


def load_sales_data(event, context):
    """Background Cloud Function triggered by a file upload to the bucket."""
    file = event
    timestamp = str(int(time.time() * 1000))
    table_name = f"sales_{timestamp}"

    bucket_name = file['bucket']  # your bucket name
    file_name = file['name']      # your file name

    bq_client = bigquery.Client()
    dataset = bq_client.dataset('my_dataset')  # your dataset name
    table = dataset.table(table_name)          # your table name

    schema = [
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("date", "DATE"),
        bigquery.SchemaField("amount", "FLOAT"),
    ]
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        schema=schema,  # schema is given explicitly, so autodetect is not needed
    )

    uri = f"gs://{bucket_name}/{file_name}"
    load_job = bq_client.load_table_from_uri(uri, table, job_config=job_config)
    load_job.result()  # wait for the load job to complete
    print(f"Data loaded into {table_name}")

Lambda task timeout but no application log

I have a Python Lambda that is triggered by S3 uploads to a specific folder. The Lambda function processes the uploaded file and outputs it to another folder in the same S3 bucket.
The issue is that when I do a bulk upload using the AWS console, some files do not get processed. I ended up setting up a dead letter queue to catch these invocations. While inspecting the messages in the queue, there is a request ID which I tried to find in the Lambda logs.
These are the logs for the request ID:
Now the odd part is that in the Python code the first line after the imports is print('Loading function'), yet it does not show up in the Lambda log.
I've added the Python code here. It should still print "Processing file name: " + key, which is inside the handler, right?
import urllib.parse
from datetime import datetime

import boto3

from constants import CONTENT_TYPE, XML_EXTENSION, VALIDATING
from xml_process import *  # expected to provide ET, XMLMapping, xml_parse, session
from s3Integration import download_file

print('Loading function')

s3 = boto3.client('s3')


def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    print("Processing file name: " + key)
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        xml_content = response["Body"].read()
        content_type = response["ContentType"]
        tree = ET.fromstring(xml_content)
        key_file_name = key.split("/")[1]
        # Creating a temporary copy by downloading the file to get the namespaces
        temp_file_name = "/tmp/" + key_file_name
        download_file(key, temp_file_name)
        namespaces = {node[0]: node[1] for _, node in ET.iterparse(temp_file_name, events=['start-ns'])}
        for name, value in namespaces.items():
            ET.register_namespace(name, value)
        # Preparing path for file processing
        processed_file = key_file_name.split(".")[0] + "_processed." + key_file_name.split(".")[1]
        print(processed_file, "processed")
        db_record = XMLMapping(file_path=key,
                               processed_file_path=processed_file,
                               uploaded_by="lambda",
                               status=VALIDATING, uploaded_date=datetime.now(), is_active=True)
        session.add(db_record)
        session.commit()
        if key_file_name.split(".")[1] == XML_EXTENSION:
            if content_type in CONTENT_TYPE:
                xml_parse(tree, db_record, processed_file, True)
            else:
                print("Content Type is not valid. Provided value: ", content_type)
        else:
            print("File extension is not valid. Provided extension: ", key_file_name.split(".")[1])
        return "success"
    except Exception as e:
        print(e)
        raise e
I don't think it's a permission issue, as other files uploaded in the same batch were processed successfully.
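(Side note: the dead letter queue mentioned above can be attached to the function with boto3 roughly as follows; the function name and queue ARN are placeholders, not values from this question.)

import boto3

lambda_client = boto3.client('lambda')
# hypothetical names: replace with your function name and SQS queue ARN
lambda_client.update_function_configuration(
    FunctionName='xml-processing-lambda',
    DeadLetterConfig={'TargetArn': 'arn:aws:sqs:us-east-1:123456789012:xml-dlq'},
)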

Share JPEG file stored on S3 via URL instead of downloading

I have recently completed this tutorial from AWS on how to create a thumbnail generator using Lambda and S3: https://docs.aws.amazon.com/lambda/latest/dg/with-s3-tutorial.html. Basically, I'm uploading an image file to my '-source' bucket, and then Lambda generates a thumbnail and uploads it to my '-thumbnail' bucket.
Everything works as expected. However, I wanted to use the S3 object URL in the '-thumbnail' bucket so that I can load the image from there for a small app I'm building. The issue I'm having is that the URL doesn't display the image in the browser but instead downloads the file. This causes my app to error out.
I did some research and learned that I had to change the content type to image/jpeg and also make the object public using an ACL. This works for all of the other buckets I have except the one that has the thumbnails. I have recreated this bucket several times. I even copied the settings from my existing buckets. I have compared the settings to all the other buckets and they appear to be the same.
I wanted to reach out and see if anyone has run into this type of issue before, or if there is something I might be missing.
Here is the code I'm using to generate the thumbnail.
import os
import sys
import uuid
import urllib.parse
from urllib.parse import unquote_plus

import boto3
from boto3.dynamodb.conditions import Key, Attr
import PIL.Image
from PIL.Image import core as _imaging

s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(os.environ['DB_TABLE_NAME'])


def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    recordId = key
    tmpkey = key.replace('/', '')
    download_path = '/tmp/{}{}'.format(uuid.uuid4(), tmpkey)
    upload_path = '/tmp/resized-{}'.format(tmpkey)
    try:
        s3.download_file(bucket, key, download_path)
        resize_image(download_path, upload_path)
        bucket = bucket.replace('source', 'thumbnail')
        s3.upload_file(upload_path, bucket, key)
        print(f"Thumbnail created and uploaded to {bucket} successfully.")
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
    else:
        s3.put_object_acl(ACL='public-read',
                          Bucket=bucket,
                          Key=key)
        # create image url to add to dynamo
        url = f"https://postreader-thumbnail.s3.us-west-2.amazonaws.com/{key}"
        print(url)
        # create record id to update the appropriate record in the 'Posts' table
        recordId = key.replace('.jpeg', '')
        # add the image_url column along with the image url as the value
        table.update_item(
            Key={'id': recordId},
            UpdateExpression="SET #statusAtt = :statusValue, #img_urlAtt = :img_urlValue",
            ExpressionAttributeValues={':statusValue': 'UPDATED', ':img_urlValue': url},
            ExpressionAttributeNames={'#statusAtt': 'status', '#img_urlAtt': 'img_url'},
        )


def resize_image(image_path, resized_path):
    with PIL.Image.open(image_path) as image:
        # change to standard/hard-coded size
        image.thumbnail(tuple(x / 2 for x in image.size))
        image.save(resized_path)
This can happen if the Content-Type of the file you're uploading is binary/octet-stream. You can modify your script as below to provide a custom content type while uploading:
s3.upload_file(upload_path, bucket, key, ExtraArgs={'ContentType': "image/jpeg"})
After more troubleshooting, the issue was apparently related to the bucket's name. I created a new bucket with a different name than it had previously, and after doing so I was able to upload and share images without issue.
I edited my code so that the Lambda uploads to the new bucket name, and I am able to share the image via URL without it downloading.

generate_presigned_url boto3 generates same URL until expiry when called using cloudfront

I am trying to generate an S3 presigned URL. The logic for this is written in Lambda@Edge, Python 3.7.
import boto3
from botocore.client import Config


def lambda_handler(event, context):
    request = event['Records'][0]['cf']['request']
    headers = request['headers']
    s3 = boto3.client('s3', config=Config(signature_version='s3v4'))
    url = s3.generate_presigned_url(ClientMethod='get_object',
                                    Params={'Bucket': 'BUCKET_NAME',
                                            'Key': 'abc.jpeg'},
                                    ExpiresIn=3600)
    response = {
        'status': '302',
        'statusDescription': 'Found',
        'headers': {
            'location': [{
                'key': 'Location',
                'value': url
            }]
        }
    }
    return response
When I test this code in plain Lambda, I get a different URL for each call. But if I add CloudFront and call the CloudFront domain name, the same URL is generated until it expires.
Why don't I get a different URL for each call, as in the first case?
You need to create a signer for CloudFront, and use generate_presigned_url from the signer. This snippet generates different URLs every time I try.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from datetime import datetime, timedelta

from botocore.signers import CloudFrontSigner
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization, hashes
from cryptography.hazmat.primitives.asymmetric import padding

# from cf_url_gen.config import Bucket, Operation, config  # project-specific import from the original snippet; not needed here


class CFClient(object):
    """CloudFront client class

    Creates a CloudFront client object for generating S3 resource URLs.
    """

    def __init__(
        self,
        cf_key_id: str = os.getenv("CF_KEY_ID"),
        cf_key_path: str = os.getenv("CF_KEY_PATH"),
        cf_ttl_sec: int = 300,
    ):
        """The __init__ method for the CFClient class

        Args:
            cf_key_id (str, optional): CloudFront key id, can be set via environment variable `CF_KEY_ID`
            cf_key_path (str, optional): CloudFront key path, can be set via environment variable `CF_KEY_PATH`
            cf_ttl_sec (int, optional): URL validity duration from UTC timestamp, default value 300 seconds
        """
        with open(cf_key_path, "rb") as key_file:
            _key = serialization.load_pem_private_key(key_file.read(), password=None, backend=default_backend())
        self.signer = CloudFrontSigner(cf_key_id, lambda m: _key.sign(m, padding.PKCS1v15(), hashes.SHA1()))
        self.ttl = cf_ttl_sec

    def get_url_with_prefix(self, file_path: str, url_prefix: str, is_signed: bool = True) -> str:
        """Method for generating CloudFront URLs for file paths with the given URL prefix

        Args:
            file_path (str): Full file path in the S3 bucket, excluding the bucket name
            url_prefix (str): URL prefix to use for signing or generating the S3 resource URL
            is_signed (bool): Flag indicating whether the URL should be signed or not

        Returns:
            CloudFront URL string based on the provided signing options
        """
        if not is_signed:
            return os.path.join(url_prefix, file_path)
        else:
            end_date = datetime.utcnow() + timedelta(seconds=self.ttl)
            file_url = os.path.join(url_prefix, file_path)
            return self.signer.generate_presigned_url(file_url, date_less_than=end_date)
Note: The URL prefix should be the CloudFront domain URL for your S3 bucket, usually something like https://bucketname.yourdomain.com.
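For completeness, a hypothetical usage sketch; the key ID, key path, domain and object key below are placeholders:

# create the client with your CloudFront key pair and sign a URL for one object
client = CFClient(cf_key_id="K1ABCDEFGHIJKL", cf_key_path="/path/to/cloudfront_private_key.pem")
signed_url = client.get_url_with_prefix("images/abc.jpeg", "https://bucketname.yourdomain.com")
print(signed_url)  # a freshly signed URL on every call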

Copying S3 objects from one account to other using Lambda python

I'm using boto3 to copy files from an S3 bucket in one account to another. I need functionality similar to aws s3 sync. Please see my code. My company has decided to 'PULL' from the other S3 bucket (the source account). Please don't suggest replication, S3 Batch, S3-triggered Lambda, etc. We have gone through all these options, and my management does not want to do any configuration on the source side. Can you please review this code and let me know whether it will work for thousands of objects? The source bucket has nearly 10000 objects. We will create this Lambda function in the destination account and create a CloudWatch event to trigger the Lambda once a day.
I am checking the ETag so that modified files are copied across when this function is triggered.
Edit: I simplified my code just to see whether pagination works. It works if I don't add client.copy(). If I add that line in the for loop, then after reading 3 or 4 objects it throws "errorMessage": "2021-08-07T15:29:07.827Z 82757747-7b72-4f29-ae9f-22e95f969d6c Task timed out after 3.00 seconds". Please advise. Please note that the 'test/' folder in my source bucket has around 1100 objects.
import os
import logging

import boto3
import botocore

logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))
client = boto3.client('s3')


def handler(event, context):
    main(event, logger)


def main(event, logger):
    try:
        SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
        DEST_BUCKET = os.environ.get('DST_BUCKET')
        REGION = os.environ.get('REGION')
        prefix = 'test/'
        # Create a reusable Paginator
        paginator = client.get_paginator('list_objects_v2')
        print('after paginator')
        # Create a PageIterator from the Paginator
        page_iterator = paginator.paginate(Bucket=SOURCE_BUCKET, Prefix=prefix)
        print('after page iterator')
        index = 0
        for page in page_iterator:
            for obj in page['Contents']:
                index += 1
                print("I am looking for {} in the source bucket".format(obj['ETag']))
                copy_source = {'Bucket': SOURCE_BUCKET, 'Key': obj['Key']}
                client.copy(copy_source, DEST_BUCKET, obj['Key'])
        logger.info("number of objects copied {}:".format(index))
    except botocore.exceptions.ClientError as e:
        raise
This version works fine if I increase the Lambda timeout to 15 minutes and the memory to 512 MB. It checks whether the source object already exists in the destination before copying.
import os
import logging

import boto3
import botocore
from botocore.client import Config

logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))

config = Config(connect_timeout=5, retries={'max_attempts': 0})
client = boto3.client('s3', config=config)
# client = boto3.client('s3')


def handler(event, context):
    main(event, logger)


def main(event, logger):
    try:
        DEST_BUCKET = os.environ.get('DST_BUCKET')
        SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
        REGION = os.environ.get('REGION')
        prefix = ''
        # Create a reusable Paginator
        paginator = client.get_paginator('list_objects_v2')
        print('after paginator')
        # Create a PageIterator from the Paginator
        page_iterator_src = paginator.paginate(Bucket=SOURCE_BUCKET, Prefix=prefix)
        page_iterator_dest = paginator.paginate(Bucket=DEST_BUCKET, Prefix=prefix)
        print('after page iterator')
        index = 0
        for page_source in page_iterator_src:
            for obj_src in page_source['Contents']:
                flag = "FALSE"
                # note: re-iterating page_iterator_dest re-lists the destination bucket
                # for every source object, which is slow for large buckets
                for page_dest in page_iterator_dest:
                    for obj_dest in page_dest['Contents']:
                        # checks if the source ETag already exists in the destination
                        if obj_src['ETag'] in obj_dest['ETag']:
                            flag = "TRUE"
                            break
                    if flag == "TRUE":
                        break
                if flag != "TRUE":
                    index += 1
                    client.copy_object(Bucket=DEST_BUCKET, CopySource={'Bucket': SOURCE_BUCKET, 'Key': obj_src['Key']}, Key=obj_src['Key'])
                    print("source ETag {} and destination ETag {}".format(obj_src['ETag'], obj_dest['ETag']))
                    print("source Key {} and destination Key {}".format(obj_src['Key'], obj_dest['Key']))
        print("Number of objects copied {}".format(index))
        logger.info("number of objects copied {}:".format(index))
    except botocore.exceptions.ClientError as e:
        raise
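(Not part of the original code, just a sketch of one way to avoid re-listing the destination bucket for every source object: collect the destination ETags into a set once, then compare. It keeps the same ETag-only comparison as the code above.)

def copy_changed_objects(client, source_bucket, dest_bucket, prefix=''):
    """Copy only objects whose ETag is not already present in the destination."""
    paginator = client.get_paginator('list_objects_v2')
    # list the destination once and remember its ETags
    dest_etags = set()
    for page in paginator.paginate(Bucket=dest_bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            dest_etags.add(obj['ETag'])
    copied = 0
    for page in paginator.paginate(Bucket=source_bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            if obj['ETag'] not in dest_etags:
                client.copy_object(Bucket=dest_bucket,
                                   CopySource={'Bucket': source_bucket, 'Key': obj['Key']},
                                   Key=obj['Key'])
                copied += 1
    return copied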