How to add an index to an RDS database/table after AWS Glue script imports the data therein? - amazon-web-services

I have a typical AWS Glue-generated script that loads data from an S3 bucket to my Aurora database available through a JDBC Connection. For reference, it looks like this:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "dev-db",
    table_name = "attributes", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings =
    [("id", "long", "id", "long"), ("value", "string", "value", "string"),
     ("name", "string", "name", "string")], transformation_ctx = "applymapping1")
resolvechoice2 = ResolveChoice.apply(frame = applymapping1,
    choice = "make_cols", transformation_ctx = "resolvechoice2")
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2,
    transformation_ctx = "dropnullfields3")
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = dropnullfields3,
    catalog_connection = "local-dev-aurora",
    connection_options = {"dbtable": "attributes", "database": "local-dev-db"},
    transformation_ctx = "datasink4")
job.commit()
The script above creates the table in the database in question and loads the CSV data from the bucket into it. The imported data is very large, and I then need to add the usual index to the RDS table.
How can I specify that the id from the mapping (or, alternatively, a combination of fields) should be an index? Can I do it using the Python Glue functions, or is it necessary to connect to the database after job.commit() and add the indexes separately?

Adding an index is a SQL (DDL) operation; Glue dynamic frames won't do anything with it.
So once the data is imported, run the CREATE INDEX query from the Glue job itself.
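For example, a minimal sketch of running the index creation at the end of the Glue script, after job.commit(). This is an assumption rather than part of the original answer: it presumes the Aurora cluster is MySQL-compatible, that the connection details shown (placeholders only) are replaced with the ones behind the "local-dev-aurora" connection, and that a client such as pymysql has been made available to the job (e.g. via the --additional-python-modules job parameter):
import pymysql  # assumed to be importable inside the Glue job

# Placeholder connection details for the Aurora cluster behind "local-dev-aurora".
conn = pymysql.connect(
    host="local-dev-aurora.cluster-xxxxxxxx.us-east-1.rds.amazonaws.com",
    user="admin",
    password="********",
    database="local-dev-db",
)
try:
    with conn.cursor() as cur:
        # Plain SQL DDL against the freshly loaded table; adjust the index
        # name and column list as needed.
        cur.execute("CREATE INDEX idx_attributes_id ON attributes (id)")
    conn.commit()
finally:
    conn.close()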

Related

Not able to change the file name while writing file into csv and storing into S3

I have been working on AWS Glue and I created an ETL job in PySpark which reads data from the Data Catalog and writes it out as CSV to an S3 bucket.
But every time the job runs, it creates different files with different names, and I want to overwrite the same file each time the job runs. I am not able to find the correct code for it.
Is there a way to overwrite the same file (versioning is enabled in S3)?
Below is the code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "test_db", table_name = "test_dash_data", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test_db", table_name = "test_dash_data", transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("id", "int", "id", "int"), ("value", "int", "value", "int"), ("email", "string", "email", "string"), ("age", "int", "age", "int")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("id", "int", "id", "int"), ("value", "int", "value", "int"), ("email", "string", "email", "string"), ("age", "int", "age", "int")], transformation_ctx = "applymapping1")
## #type: DataSink
## #args: [connection_type = "s3", connection_options = {"path": "s3://auroratos3dataimport/customerdata2"}, format = "csv", transformation_ctx = "datasink2"]
## #return: datasink2
## #inputs: [frame = applymapping1]
repartitioned1 = applymapping1.repartition(1)
datasink2 = glueContext.write_dynamic_frame.from_options(frame = repartitioned1, connection_type = "s3",
    connection_options = {"path": "s3://auroratos3dataimport/customerdata2"},
    format = "csv", transformation_ctx = "datasink2")
job.commit()
Currently AWS Glue doesn't support an 'overwrite' mode from PySpark, but they are working on this feature.
There is a workaround, though, using plain PySpark:
(repartitioned1.toDF()
    .write
    .mode("overwrite")
    .format("csv")
    .save(s3_path))
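For reference, wired into the script above it might look like the sketch below. Here s3_path is assumed to be the same target prefix as the original sink; note that Spark still picks its own part-file name, but "overwrite" replaces whatever was previously stored under that prefix:
s3_path = "s3://auroratos3dataimport/customerdata2"  # same prefix as the original sink

(repartitioned1.toDF()      # convert the DynamicFrame to a Spark DataFrame
    .write
    .mode("overwrite")      # replace the existing objects under s3_path
    .format("csv")
    .save(s3_path))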

AWS ETL job that creates a new column which is a substring of an existing column

I have a data source in an S3 bucket. The data source is the CSV file with one column "ID". I want to use AWS Glue to complete an ETL job. I want to extract the data from the S3 bucket, create a second column ("ID Suffix") which is the last two characters of the "ID", and then load this data file into a different S3 bucket. So if the "ID" is 1000031, I want the second column to be 31.
Here is the script that AWS Glue created for the simple task of extracting the file from one S3 bucket and putting it into another. I would like to edit it to accomplish the task above. If you can assist with this, I would greatly appreciate it. Thanks!
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "stackoverflow", table_name = "sample_data_csv", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "stackoverflow", table_name = "sample_data_csv", transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("id", "int", "id", "int")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("id", "int", "id", "int")], transformation_ctx = "applymapping1")
## #type: DataSink
## #args: [connection_type = "s3", connection_options = {"path": "s3://aws-glue-scripts-us-west-1/Sample data"}, format = "csv", transformation_ctx = "datasink2"]
## #return: datasink2
## #inputs: [frame = applymapping1]
datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": "s3://aws-glue-scripts-us-west-1/Sample data"}, format = "csv", transformation_ctx = "datasink2")
job.commit()
You can achieve this using Map.apply with a UDF defined. Refer to the input and output below, which I got after running the following script:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": ["s3://aws-glue-us-east-2/test.csv"]}, format = "csv")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("col0", "int", "id", "int")], transformation_ctx = "applymapping1")
def map_function(dynamicRecord):
    # take the last two characters of the id (cast to a string before slicing)
    sub_id = str(dynamicRecord["id"])[-2:]
    dynamicRecord["sub_id"] = sub_id
    return dynamicRecord
mapping1 = Map.apply(frame = applymapping1, f = map_function, transformation_ctx = "mapping1")
datasink2 = glueContext.write_dynamic_frame.from_options(frame = mapping1, connection_type = "s3", connection_options = {"path": "s3://aws-glue-us-east-2/Sample_output"}, format = "csv", transformation_ctx = "datasink2")
job.commit()
Once I ran this, I got the output below:
Input
id
1000031
1000032
1000034
1000035
1000036
1000037
1000039
1000030
Output:
sub_id,id
31,1000031
32,1000032
34,1000034
35,1000035
36,1000036
37,1000037
39,1000039
30,1000030

Creating a substring in AWS Glue

I have a data source in an S3 bucket. The data source is the CSV file with one column "ID". I want to use AWS Glue to complete an ETL job. I want to extract the data from the S3 bucket, create a second column ("ID Suffix") which is the last two characters of the "ID", and then load this data file into a different S3 bucket. So if the "ID" is 1000031, I want the second column to be 31.
Here is the script that AWS Glue created for the simple task of extracting the file from one S3 bucket and putting it into another. I would like to edit it to accomplish the task above.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "stackoverflow", table_name = "sample_data_csv", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "stackoverflow", table_name = "sample_data_csv", transformation_ctx = "datasource0")
## #type: ApplyMapping
## #args: [mapping = [("id", "int", "id", "int")], transformation_ctx = "applymapping1"]
## #return: applymapping1
## #inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("id", "int", "id", "int")], transformation_ctx = "applymapping1")
## #type: DataSink
## #args: [connection_type = "s3", connection_options = {"path": "s3://aws-glue-scripts-us-west-1/Sample data"}, format = "csv", transformation_ctx = "datasink2"]
## #return: datasink2
## #inputs: [frame = applymapping1]
datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": "s3://aws-glue-scripts-us-west-1/Sample data"}, format = "csv", transformation_ctx = "datasink2")
job.commit()
I would suggest using DataFrame functions since they are much easier to understand and use. In your case you can convert your dynamic frame to a DataFrame and call a simple PySpark function:
from pyspark.sql import functions as F
from awsglue.dynamicframe import DynamicFrame
new_df = datasource0.toDF()\
    .withColumn("id_suffix",
                # a negative start position counts from the end of the string
                F.substring(F.col("id").cast("string"), -2, 2))
And later convert it back to a dynamic frame:
dynamic_frame = DynamicFrame.fromDF(dataframe=new_df, glue_ctx=glueContext, name="generic-name")
Pyspark Functions
Furthermore, you don't need to set up the transformation context on every function; I already explained why here.
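If needed, the converted frame can then be written back to S3 in the same way as the generated sink. A minimal sketch, reusing the output path from the question and, per the note above, omitting the transformation context:
datasink2 = glueContext.write_dynamic_frame.from_options(
    frame = dynamic_frame,
    connection_type = "s3",
    connection_options = {"path": "s3://aws-glue-scripts-us-west-1/Sample data"},
    format = "csv")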

Create partitioned data using AWS Glue and save into S3

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col,year,month,dayofmonth,to_date,from_unixtime
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "db_name", table_name = "table_name", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("dateregistered", "timestamp", "dateregistered", "timestamp"), ("id", "int", "id", "int")], transformation_ctx = "applymapping1")
df = applymapping1.toDF()
repartitioned_with_new_columns_df = (applymapping1.select("*")
    .withColumn("date_col", to_date(from_unixtime(col("dateRegistered"))))
    .withColumn("year", year(col("date_col")))
    .withColumn("month", month(col("date_col")))
    .withColumn("day", dayofmonth(col("date_col")))
    .drop(col("date_col")))
#.repartition(1)
dyf = DynamicFrame.fromDF(repartitioned_with_new_columns_df, glueContext, "enriched")
datasink = glueContext.write_dynamic_frame.from_options(
frame = dyf,
connection_type = "s3",
connection_options = {
"path": "bucket-path",
"partitionKeys": ["year", "month", "day"]
},
format = "json",
transformation_ctx = "datasink")
job.commit()
I have the above script and I can't figure out why it is not working, or if it is even the correct approach.
Could someone please review it and let me know what I am doing wrong?
The goal is to run this job daily, write this table partitioned as above, and save it in S3 as either JSON or Parquet.
You are referring to the wrong data frame when manipulating the columns: applymapping1.select("*") should actually be df.select("*"), since applymapping1 is still a DynamicFrame while df is the Spark DataFrame created from it.
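A minimal sketch of the corrected block, keeping the rest of the question's logic unchanged (note that the script also needs from awsglue.dynamicframe import DynamicFrame for the later fromDF call):
# Build the new columns from the Spark DataFrame (df), not the DynamicFrame.
repartitioned_with_new_columns_df = (
    df.select("*")
      .withColumn("date_col", to_date(from_unixtime(col("dateRegistered"))))
      .withColumn("year", year(col("date_col")))
      .withColumn("month", month(col("date_col")))
      .withColumn("day", dayofmonth(col("date_col")))
      .drop(col("date_col"))
)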

AWS Glue hanging up & consuming a lot of time in ETL job

I am using AWS Glue, where I want to dump records from an Oracle table (which has 80 million rows) to Redshift. However, even after almost 2 hours the job remains in a hanging state, still nothing gets written to Amazon S3, and eventually I have to stop the job.
My code:
import sys
import boto3
import json
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
db_username = [removed]
db_password = [removed]
db_url = [removed]
table_name = [removed]
jdbc_driver_name = "oracle.jdbc.OracleDriver"
s3_output = [removed]
df = glueContext.read.format("jdbc").option("url", db_url).option("user", db_username).option("password", db_password).option("dbtable", table_name).option("driver", jdbc_driver_name).load()
df.printSchema()
datasource0 = DynamicFrame.fromDF(df, glueContext, "datasource0")
datasource0.schema()
datasource0.show()
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("correlation_id", "decimal", "correlation_id", "bigint"), ("machine_pin","varchar","machine_pin","varchar"),("messageguid","varchar","messageguid","varchar"), ("originating_domain_object_id", "decimal", "originating_domain_object_id", "bigint"), ("originating_message_type_id", "bigint", "originating_message_type_id", "bigint"), ("source_messageguid","varchar","source_messageguid","varchar"), ("timestamp_of_request","timestamp","timestamp_of_request","timestamp"),("token","varchar","token","varchar"),("id","decimal","id","bigint"),("file_attachment","decimal","file_attachment","bigint")], transformation_ctx = "applymapping1")
resolvechoice2 = ResolveChoice.apply(frame = applymapping1,choice = "make_cols", transformation_ctx = "resolvechoice2")
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = dropnullfields3, catalog_connection = "us01-isg-analytics", connection_options = {"dbtable": "analytics_team_data.message_details", "database": "jk_test"}, redshift_tmp_dir = "s3://aws-glue-scripts-823837687343-us-east-1/glue_op/", transformation_ctx = "datasink4")
When I use Apache Spark, it takes less than 1 hour to dump the data to Redshift. What modifications are required for performance optimization so that Glue dumps the data more quickly?