Best practices for writing Glue jobs (PySpark) - amazon-web-services

Looking at the example here (https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html), this seems to be the way to build a PySpark script: a class is instantiated to initialise the job, and it then calls a function defined outside the class (I am not sure whether this was done to allow unit testing). Is this normal behaviour, or would it be better to have the function inside the class? I was also wondering whether something such as this (https://github.com/aws-samples/aws-glue-jobs-unit-testing/blob/main/src/sample.py) is more common practice. I am really just looking for the best way to build out my scripts in a repeatable format that is also easy to unit test.
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
class GluePythonSampleTest:
    def __init__(self):
        params = []
        if '--JOB_NAME' in sys.argv:
            params.append('JOB_NAME')
        args = getResolvedOptions(sys.argv, params)

        self.context = GlueContext(SparkContext.getOrCreate())
        self.job = Job(self.context)

        if 'JOB_NAME' in args:
            jobname = args['JOB_NAME']
        else:
            jobname = "test"
        self.job.init(jobname, args)

    def run(self):
        dyf = read_json(self.context, "s3://awsglue-datasets/examples/us-legislators/all/persons.json")
        dyf.printSchema()
        self.job.commit()


def read_json(glue_context, path):
    dynamicframe = glue_context.create_dynamic_frame.from_options(
        connection_type='s3',
        connection_options={
            'paths': [path],
            'recurse': True
        },
        format='json'
    )
    return dynamicframe


if __name__ == '__main__':
    GluePythonSampleTest().run()
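For what it's worth, keeping read_json as a free function that accepts a GlueContext is exactly what makes it easy to unit test in isolation. A minimal pytest sketch, assuming awsglue is importable in your test environment and that the script above is saved as sample.py (both of those are assumptions), might look like this:

import pytest
from pyspark.context import SparkContext
from awsglue.context import GlueContext

from sample import read_json  # module name is an assumption


@pytest.fixture(scope="module")
def glue_context():
    # Share one local GlueContext across every test in the module.
    return GlueContext(SparkContext.getOrCreate())


def test_read_json_returns_rows(glue_context):
    # Uses the public sample dataset, so the test needs S3 access.
    dyf = read_json(
        glue_context,
        "s3://awsglue-datasets/examples/us-legislators/all/persons.json",
    )
    assert dyf.toDF().count() > 0

The same pattern applies to any transform you pull out of run(): pass the GlueContext (or a DynamicFrame) in as an argument rather than reaching for self.context, and the function can be exercised without constructing the job class at all.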

Related

How to develop and test Spark-specific AWS Glue code locally without using Docker

I have Spark code that runs in a Glue job, and the code needs to be tested locally and generate Sonar coverage reports for the unit tests. The issue is that I cannot import Glue-specific libraries locally. I installed the awsglue-local 1.0.2 and fake-awsglue Python libraries, and neither of them seems to work. At an organization level I have restrictions on using Docker, as stated here: https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-docker-image . The documentation here, https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-using-etl-library , is not for Windows.
code here:
import sys  # missing from the original snippet but required for sys.argv below

from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import DropNullFields
from awsglue.transforms import ApplyMapping
from awsglue.transforms import SelectFields
from awsglue.transforms import ResolveChoice
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
Errors here:
from dynamicframe import DynamicFrame
ModuleNotFoundError: No module named 'dynamicframe'
awsglue.utils.GlueArgumentError: the following arguments are required: --JOB_NAME
How do we solve this?
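For the GlueArgumentError at least, the first sample on this page sidesteps the problem by only requesting JOB_NAME when it is actually present on the command line. The same guard could be applied here (a sketch of that one workaround, not a fix for the missing-module errors; the fallback name is an assumption):

import sys
from awsglue.utils import getResolvedOptions

# Only ask getResolvedOptions for JOB_NAME when the runtime actually
# passed it, so a bare local run does not raise GlueArgumentError.
params = []
if '--JOB_NAME' in sys.argv:
    params.append('JOB_NAME')
args = getResolvedOptions(sys.argv, params)
job_name = args.get('JOB_NAME', 'local-test')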

An error occurred while calling o98.getDynamicFrame. ERROR: column "id" does not exist

I am using Glue to transfer data from one PostgreSQL DB to another PostgreSQL DB. I always have an issue with the id column when it is declared as a primary key, but when the primary-key constraint is removed on the database there is no error.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node PostgreSQL
PostgreSQL_node1654086903275 = glueContext.create_dynamic_frame.from_catalog(
    database="my_db",
    table_name="test_table_1",
    transformation_ctx="PostgreSQL_node1654086903275",
)

# Script generated for node Rename Field
RenameField_node1654086935942 = RenameField.apply(
    frame=PostgreSQL_node1654086903275,
    old_name="id",
    new_name="Id",
    transformation_ctx="RenameField_node1654086935942",
)

# Script generated for node PostgreSQL
PostgreSQL_node1654086963634 = glueContext.write_dynamic_frame.from_catalog(
    frame=RenameField_node1654086935942,
    database="my_db",
    table_name="test_table_",
    transformation_ctx="PostgreSQL_node1654086963634",
)
job.commit()
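One detail worth checking, since the job renames id to Id before the write: PostgreSQL folds unquoted identifiers to lowercase, so a capitalised column name can easily stop matching the column (and primary-key constraint) the target table actually has. If the target column really is lowercase id, a hedged alternative is to skip the rename and map the field explicitly with ApplyMapping instead of RenameField (a sketch only; the "long" type and the node name are assumptions, and every column you want to keep needs its own mapping entry because ApplyMapping drops unmapped fields):

# Sketch: keep the column named "id" and make its type explicit.
ApplyMapping_node_id = ApplyMapping.apply(
    frame=PostgreSQL_node1654086903275,
    mappings=[("id", "long", "id", "long")],  # add the remaining columns here too
    transformation_ctx="ApplyMapping_node_id",
)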

Retrieve RDS credentials from Secrets Manager in AWS Glue Python Script

I have a Glue script which is trying to read the RDS credentials I have stored in Secrets Manager, but the script keeps on running and never completes.
The IAM role that this Glue script runs with has the SecretsManagerReadWrite policy (AWS managed) attached.
import sys
import json  # missing from the original snippet but required for json.loads below
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrameCollection
from awsglue.dynamicframe import DynamicFrame
import boto3
import botocore
from botocore.errorfactory import ClientError
# import org.apache.spark.sql.functions.concat_ws
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from datetime import date
today = date.today()
current_day = today.strftime("%Y%m%d")
def str_to_arr(my_list):
    str = ""
    for item in my_list:
        if item:
            str += item
    str = str.split(" ")
    return '{"' + ' '.join([elem for elem in str]) + '"}'


str_to_arr_udf = udf(str_to_arr, StringType())


def AddPartitionKeys(glueContext, dfc) -> DynamicFrameCollection:
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = glueContext.add_ingestion_time_columns(df, "day")
    glue_df = DynamicFrame.fromDF(df, glueContext, "transform_date")
    return DynamicFrameCollection({"CustomTransform0": glue_df}, glueContext)
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'days', 's3_bucket', 'rds_endpoint', 'region_name', 'secret_name'])
region_name = args['region_name']
session = boto3.session.Session()
client = session.client("secretsmanager", region_name=region_name)
get_secret_value_response = client.get_secret_value(SecretId=args['secret_name'])
secret = get_secret_value_response['SecretString']
secret = json.loads(secret)
db_username = secret.get('username')
db_password = secret.get('password')
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
print("Below are the creds")
# print("DB USERNAME IS " , db_username)
# print("DB PWD IS " , db_password)
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job.commit()
What am I missing here?
I checked my work against this blog, and yet I am not able to get this script to complete successfully.
Update: after Mark's suggestion, I was able to figure out that I had to create a VPC interface endpoint for Secrets Manager. The steps are outlined here by AWS; I just had to make sure the policy on the endpoint granted access to the ARNs of the resources I want to reach in Secrets Manager.
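Separate from the VPC endpoint fix, one small hardening makes this failure mode easier to spot: give the Secrets Manager client short timeouts so a missing network path surfaces as a timeout error instead of an apparently hung job. A sketch (the timeout and retry values are assumptions):

import boto3
from botocore.config import Config

# Fail fast if Secrets Manager is unreachable from the Glue job's VPC.
client = boto3.session.Session().client(
    "secretsmanager",
    region_name=region_name,
    config=Config(connect_timeout=5, read_timeout=5, retries={"max_attempts": 2}),
)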

Why does APScheduler's get_jobs() come back empty?

This is my test.py:
from datetime import datetime, timedelta
import sys
import os
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore
jobstores = {
    # 'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
    'default': RedisJobStore(host='localhost', port=6379)
}
scheduler = BlockingScheduler(jobstores=jobstores)


def alarm(time):
    print('Alarm! This alarm was scheduled at %s.' % time)


if __name__ == '__main__':
    alarm_time = datetime.now() + timedelta(seconds=10)
    scheduler.add_job(alarm, 'interval', seconds=10, args=[datetime.now()], name='alarm_test')
    print('To clear the alarms, delete the example.sqlite file.')
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
I run python test.py and the job is scheduled successfully.
Then, in another terminal (over PuTTY), I open a Python shell:
python
>>> import redis
>>> from test import *
>>> r = redis.Redis()
>>> r.keys()
>>> r.zrange('apscheduler.run_times', 0, 1)
This finds the job id 57841c0ee05249efb466882265f2c495, but:
>>> ret = scheduler.get_jobs(jobstore='default')
ret is empty. Why? Thanks a lot.
Have you started the scheduler before running get_jobs()? If not, it will only list tentatively scheduled jobs. That's why you're not seeing the job.
Try this instead:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.redis import RedisJobStore
scheduler = BackgroundScheduler()
scheduler.add_jobstore('redis', host='localhost', port=6379)
scheduler.start(paused=True)
scheduler.print_jobs()
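Once the scheduler has been started (even paused), get_jobs() reads from the Redis job store rather than the list of tentatively scheduled jobs, so the persisted job shows up. For example, continuing from the snippet above:

# After scheduler.start(paused=True), get_jobs() reflects what is
# persisted in Redis instead of only tentatively scheduled jobs.
for job in scheduler.get_jobs(jobstore='default'):
    print(job.id, job.next_run_time)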

How to properly structure PySpark code to work with Spark Streaming and Kafka

I have the following code, which is supposed to connect to a local Kafka cluster and run a PySpark job:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## Constants
APP_NAME = "PythonStreamingDirectKafkaWordCount"

##OTHER FUNCTIONS/CLASSES

def main(sc):
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    # filename = sys.argv[1]

    # Execute Main functionality
    main(sc)
When I run this code, I get the following error:
ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PythonStreamingDirectKafkaWordCount, master=local[*]) created by __init__ at /home/ubuntu/spark-1.3.0-bin-hadoop2.4/hello1.py:30
What is the proper way to structure my code, to avoid this error?
Simply don't create the SparkContext twice. If it is created inside the main function, there is no reason to pass it in from outside:
def main():
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    ...
if __name__ == "__main__":
main()
Since terminating the StreamingContext stops the corresponding SparkContext, there is no good reason to keep these two apart.
SparkContext also provides getOrCreate, which can be used either to create a new context or to retrieve an existing one.
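Building on that getOrCreate suggestion, a minimal sketch (reusing the question's constants; the Kafka wiring is elided) could look like this:

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

APP_NAME = "PythonStreamingDirectKafkaWordCount"


def main():
    # getOrCreate returns the already-running SparkContext if one exists,
    # so the "Cannot run multiple SparkContexts" error cannot occur.
    conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
    sc = SparkContext.getOrCreate(conf)
    ssc = StreamingContext(sc, 2)
    # ... build the Kafka direct stream and word count as in the question ...
    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    main()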