AWS Glue Job - Unable to pass the transformed Glue data to a Lambda function

I'm a beginner with the AWS Glue service and I'm having an issue with a Glue ETL job.
My use case:
1) A Glue crawler catalogs the JSON files in S3.
Note: new S3 files are created frequently.
2) A Glue job processes the cataloged data, transforms it, and stores the result in the S3 target location.
In addition, I'm calling a Lambda function that posts a notification to a Slack channel, which works. But I can't figure out how to send the transformed data to the Lambda function (see the sketch after the Lambda script below).
Glue job (Python) script:
The [bucket_name] placeholder in the code needs to be replaced with the actual value.
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "[bucket_name]", table_name = "data_source", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("doc_id", "string", "doc_id", "string"), ("skill_campaign", "string", "skill_campaign", "string"), ("disposition", "string", "disposition", "string"), ("customer_num", "string", "customer_num", "string"), ("call_date", "string", "call_date", "string"), ("segstart", "string", "segstart", "string"), ("segstop", "string", "segstop", "string"), ("duration", "string", "duration", "string"), ("vendor", "string", "vendor", "string"), ("user_id", "string", "user_id", "string"), ("supervisor_user_id", "string", "supervisor_user_id", "string"), ("evaluation_status_id", "string", "evaluation_status_id", "string"), ("assigned_to", "string", "assigned_to", "string"), ("evaluation_form_id", "string", "evaluation_form_id", "string")], transformation_ctx = "applymapping1")
datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": "s3://[bucket_name]/data-target"}, format = "json", transformation_ctx = "datasink2")
job.commit()
## Call Lambda function: how do I send the data that the ETL job above processed to this Lambda function?
client = boto3.client('lambda' , region_name='ap-south-1')
r_lambda = client.invoke(FunctionName='lambda-slack-notification', InvocationType='Event')
Lambda function (Node.js) script:
The placeholders [token], [channel], [username] and [text] in the code need to be replaced with actual values.
const https = require('https');

exports.handler = (event, context, callback) => {
    // Any payload passed by the caller is available in `event`.
    const options = {
        hostname: "slack.com",
        method: "POST",
        path: "/api/chat.postMessage?token=[token]&channel=[channel]&username=[username]&text=[text]",
    };
    const req = https.request(options, (res) => {
        res.setEncoding('utf8');
        res.on('data', (chunk) => {
            // code to execute
        });
        res.on('end', () => {
            // code to execute
        });
    });
    req.on('error', (e) => {
        callback(null, "Error has occurred");
    });
    req.end();
};
Thanks in advance...
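One way to pass the job's output to the Lambda (a minimal sketch, not a confirmed solution, assuming the Lambda only needs a small summary such as the output path and a record count rather than the full dataset): serialize the summary as JSON and pass it through the Payload parameter of invoke; the handler then receives it as event. The field names below are illustrative.

import json
import boto3

# Illustrative summary of what the job just wrote; adjust the fields as needed.
summary = {
    "target_path": "s3://[bucket_name]/data-target",
    "record_count": applymapping1.count(),   # DynamicFrame row count
    "job_name": args['JOB_NAME'],
}

client = boto3.client('lambda', region_name='ap-south-1')
client.invoke(
    FunctionName='lambda-slack-notification',
    InvocationType='Event',                        # asynchronous invoke
    Payload=json.dumps(summary).encode('utf-8'),   # arrives as `event` in the handler
)

Note that the payload for an asynchronous (Event) invocation is limited to 256 KB, so for anything larger the Lambda should be given a pointer (for example the S3 output path) and read the data itself.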

Related

AWS Glue Studio Keeps Omitting Dates

I have a problem.
I uploaded a CSV data file to my AWS S3 bucket and using a Glue crawler, inserted that data into a table object in a created database. I'm able to see all the data in Amazon Athena from this table, no problem.
The problem comes when I attempt to convert the data from CSV to Parquet format using AWS Glue Studio. In Amazon Athena, from the new table / database, I'm able to see all the data in Parquet format except for the dates.
Prior to uploading the data to AWS, I formatted the dates to a timestamp data type using Pandas in a Jupyter Notebook.
Here's the Python code produced by AWS Glue Studio. This one is a bit puzzling. Any feedback would be most welcome.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node S3 bucket
S3bucket_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="capstone-project-landing-area-raw-db",
    table_name="capstone_project_landing_area_raw",
    transformation_ctx="S3bucket_node1",
)

# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        ("id", "long", "id", "string"),
        ("order_status", "string", "order_status", "string"),
        ("order_products_value", "double", "order_products_value", "string"),
        ("order_freight_value", "double", "order_freight_value", "string"),
        ("order_items_qty", "long", "order_items_qty", "string"),
        ("customer_city", "string", "customer_city", "string"),
        ("customer_state", "string", "customer_state", "string"),
        ("customer_zip_code_prefix", "long", "customer_zip_code_prefix", "string"),
        ("product_name_length", "long", "product_name_length", "string"),
        ("product_description_length", "long", "product_description_length", "string"),
        ("product_photos_qty", "long", "product_photos_qty", "string"),
        ("review_score", "long", "review_score", "string"),
        ("order_purchase_timestamp", "string", "order_purchase_timestamp", "date"),
        ("order_approved_at", "string", "order_approved_at", "date"),
        ("order_delivered_customer_date", "string", "order_delivered_customer_date", "date"),
    ],
    transformation_ctx="ApplyMapping_node2",
)

# Script generated for node S3 bucket
S3bucket_node3 = glueContext.getSink(
    path="s3://capstone-project-cleansed-enriched",
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="S3bucket_node3",
)
S3bucket_node3.setCatalogInfo(
    catalogDatabase="capstone-project-cleansed-enriched-db",
    catalogTableName="capstone-project-cleansed-enriched",
)
S3bucket_node3.setFormat("glueparquet")
S3bucket_node3.writeFrame(ApplyMapping_node2)
job.commit()
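A minimal sketch of one workaround for the null dates, assuming the source columns contain timestamp-formatted strings: keep them as strings in the ApplyMapping node and cast them explicitly with Spark before writing. The format string is an assumption about how the dates were serialized and would need to match the actual data.

from pyspark.sql.functions import to_date
from awsglue.dynamicframe import DynamicFrame

# Map the three date columns as ("string", ..., "string") in ApplyMapping_node2,
# then cast them explicitly and convert back to a DynamicFrame for the sink.
df = ApplyMapping_node2.toDF()
for c in ["order_purchase_timestamp", "order_approved_at", "order_delivered_customer_date"]:
    df = df.withColumn(c, to_date(c, "yyyy-MM-dd HH:mm:ss"))   # assumed source format
casted = DynamicFrame.fromDF(df, glueContext, "casted")
# ...and write `casted` with S3bucket_node3.writeFrame(casted) instead of ApplyMapping_node2.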

Partition column from AWS Glue Catalog becomes null after mapping

I'm running a job on AWS Glue to create another catalog (B) from one previously made (A). The main problem, both in the Visual preview and in the PySpark script, appears when I try to use the partitions from A, which are year, month and day. After the ApplyMapping step these columns become completely null.
Visual example
And if I run the job trying to partition B by the year provided by A, it shows HIVE_DEFAULT_PARTITION in S3.
I'll share the code I have so far.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node S3 bucket
S3bucket_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="clean",
    table_name="sumn",
    transformation_ctx="S3bucket_node1",
)

# Script generated for node Apply Mapping
ApplyMapping_node1660246155493 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        ("date", "date", "date", "date"),
        ("producto", "string", "producto", "string"),
        ("volumennominacion", "double", "volumennominacion", "double"),
        ("estacion", "string", "estacion", "string"),
        ("proveedor", "string", "proveedor", "string"),
        ("preciocompraporlitro", "double", "preciocompraporlitro", "double"),
        ("precioventaporlitro", "double", "precioventaporlitro", "double"),
        ("year", "int", "year", "int"),
        ("month", "int", "month", "int"),
        ("day", "int", "day", "int"),
    ],
    transformation_ctx="ApplyMapping_node1660246155493",
)
job.commit()
Thanks.
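One thing that may be worth checking (a sketch based on the assumption that the crawler registered year, month and day as string partition keys, since partition values are derived from the S3 path): ApplyMapping tends to return null when the declared source type does not match the actual catalog type, so printing the schema and mapping from the real source type could resolve the nulls.

# Inspect the actual source types first.
S3bucket_node1.printSchema()

# If year/month/day turn out to be strings in the catalog, map from "string" instead of "int":
ApplyMapping_fixed = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        ("date", "date", "date", "date"),
        # ... other columns unchanged ...
        ("year", "string", "year", "int"),
        ("month", "string", "month", "int"),
        ("day", "string", "day", "int"),
    ],
    transformation_ctx="ApplyMapping_fixed",
)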

Not able to change the file name while writing a file to CSV and storing it in S3

I have been working with AWS Glue and created an ETL job in PySpark which reads data from the Data Catalog and writes it out as CSV, which gets stored in an S3 bucket.
But every time the job runs, it creates different files with different names, and I want to overwrite the same file each time the job runs. I am not able to find the correct code for it.
Is there a way to overwrite the same file (versioning is enabled in S3)?
Below is the code
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "test_db", table_name = "test_dash_data", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test_db", table_name = "test_dash_data", transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("id", "int", "id", "int"), ("value", "int", "value", "int"), ("email", "string", "email", "string"), ("age", "int", "age", "int")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("id", "int", "id", "int"), ("value", "int", "value", "int"), ("email", "string", "email", "string"), ("age", "int", "age", "int")], transformation_ctx = "applymapping1")
## #type: DataSink
## #args: [connection_type = "s3", connection_options = {"path": "s3://auroratos3dataimport/customerdata2"}, format = "csv", transformation_ctx = "datasink2"]
## #return: datasink2
## #inputs: [frame = applymapping1]
repartitioned1 = applymapping1.repartition(1)
datasink2 = glueContext.write_dynamic_frame.from_options(frame = repartitioned1, connection_type = "s3", connection_options = {"path": "s3://auroratos3dataimport/customerdata2"}, format = "csv", transformation_ctx = "datasink2")
job.commit()
Currently AWS Glue doesn't support 'overwrite' mode from pyspark but they are working on this feature.
There is a workaround though, using plain pyspark:
repartitioned1.toDF() \
    .write \
    .mode("overwrite") \
    .format("csv") \
    .save(s3_path)
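If the real requirement is a stable object key rather than overwrite semantics, another option (a sketch, assuming a single part file thanks to repartition(1) and that a small post-write step in the job is acceptable) is to copy the part file to a fixed key with boto3 and remove the original; the target key name below is illustrative.

import boto3

s3 = boto3.client('s3')
bucket = "auroratos3dataimport"
prefix = "customerdata2/"

# Find the single part-* file Spark produced and give it a fixed name.
contents = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)["Contents"]
part_key = next(obj["Key"] for obj in contents if "part-" in obj["Key"])
s3.copy_object(Bucket=bucket, Key=prefix + "customerdata.csv",
               CopySource={"Bucket": bucket, "Key": part_key})
s3.delete_object(Bucket=bucket, Key=part_key)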

How to add an index to an RDS database/table after AWS Glue script imports the data therein?

I have a typical AWS Glue-generated script that loads data from an S3 bucket to my Aurora database available through a JDBC Connection. For reference, it looks like this:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database = "dev-db", table_name = "attributes", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(
    frame = datasource0,
    mappings = [("id", "long", "id", "long"), ("value", "string", "value", "string"),
                ("name", "string", "name", "string")],
    transformation_ctx = "applymapping1")
resolvechoice2 = ResolveChoice.apply(
    frame = applymapping1, choice = "make_cols", transformation_ctx = "resolvechoice2")
dropnullfields3 = DropNullFields.apply(
    frame = resolvechoice2, transformation_ctx = "dropnullfields3")
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame = dropnullfields3,
    catalog_connection = "local-dev-aurora",
    connection_options = {"dbtable": "attributes", "database": "local-dev-db"},
    transformation_ctx = "datasink4")
job.commit()
The script above creates the table in the database in question and loads the CSV data from the bucket into it. The imported data is very large, and I then need to add the usual index to the RDS database table.
How can I specify that the id from the mapping (or, alternatively, a combination of fields) should be an index? Can I do it using the Python Glue functions, or is it necessary to connect to the database after the job.commit() and add the indexes there?
Adding an index is a SQL operation; Glue dynamic frames will not do anything with it.
So once the data is imported, run the CREATE INDEX query from the Glue job itself.
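A minimal sketch of what that could look like, assuming a MySQL-compatible Aurora cluster and that a driver such as PyMySQL has been made available to the job (for example via the --additional-python-modules job parameter); the connection details are placeholders and would normally come from the Glue connection or Secrets Manager rather than being hard-coded:

import pymysql  # assumed to be supplied to the job

conn = pymysql.connect(host="<aurora-endpoint>", user="<user>",
                       password="<password>", database="local-dev-db")
try:
    with conn.cursor() as cur:
        # Run after job.commit(), once the data has been written.
        cur.execute("CREATE INDEX idx_attributes_id ON attributes (id)")
    conn.commit()
finally:
    conn.close()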

Create partitioned data using AWS Glue and save into S3

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, year, month, dayofmonth, to_date, from_unixtime

## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "db_name", table_name = "table_name", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("dateregistered", "timestamp", "dateregistered", "timestamp"), ("id", "int", "id", "int")], transformation_ctx = "applymapping1")

df = applymapping1.toDF()
repartitioned_with_new_columns_df = (
    applymapping1.select("*")
    .withColumn("date_col", to_date(from_unixtime(col("dateRegistered"))))
    .withColumn("year", year(col("date_col")))
    .withColumn("month", month(col("date_col")))
    .withColumn("day", dayofmonth(col("date_col")))
    .drop(col("date_col"))
    # .repartition(1)
)
dyf = DynamicFrame.fromDF(repartitioned_with_new_columns_df, glueContext, "enriched")

datasink = glueContext.write_dynamic_frame.from_options(
    frame = dyf,
    connection_type = "s3",
    connection_options = {
        "path": "bucket-path",
        "partitionKeys": ["year", "month", "day"]
    },
    format = "json",
    transformation_ctx = "datasink")
job.commit()
I have the above script and I can't figure out why it is not working, or if it is even the correct approach.
Could someone please review it and let me know what I am doing wrong?
The goal here is to run this job daily, write this table partitioned as above, and save it in S3 as either JSON or Parquet.
You are referring to the wrong data frame when manipulating the columns: applymapping1 is a DynamicFrame, and the column functions (select, withColumn, drop) operate on the Spark DataFrame you already created from it.
applymapping1.select("*") should actually be df.select("*").
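A minimal sketch of the corrected block with that change applied, keeping the rest of the posted logic as-is:

df = applymapping1.toDF()
repartitioned_with_new_columns_df = (
    df.select("*")
    .withColumn("date_col", to_date(from_unixtime(col("dateRegistered"))))
    .withColumn("year", year(col("date_col")))
    .withColumn("month", month(col("date_col")))
    .withColumn("day", dayofmonth(col("date_col")))
    .drop(col("date_col"))
)
dyf = DynamicFrame.fromDF(repartitioned_with_new_columns_df, glueContext, "enriched")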