AWS MWAA / Apache Airflow: how to debug on_failure_callback itself

I have a DAG like this:
import os
from datetime import timedelta

from xxx import on_failure_opsgenie
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

DAG_ID = os.path.basename(__file__).replace(".py", "")

DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email": ["airflow@example.com"],
    "email_on_failure": False,
    "email_on_retry": False,
}

def kaboom(*args, **kwargs):
    print("goodbye cruel world")
    print(args)
    print(kwargs)
    assert 1 == 2

with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    description="Print contents of airflow.cfg to logs",
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    on_failure_callback=on_failure_opsgenie,
) as dag:
    get_airflow_cfg_operator = PythonOperator(task_id="gonna_explode", python_callable=kaboom)
The DAG fails on purpose, as expected. However, on_failure_opsgenie is not doing what it should. How do I get the logs for, or otherwise debug, a failed on_failure_callback in AWS MWAA?
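As far as I can tell, a DAG-level on_failure_callback runs in the scheduler/DAG-processor process rather than in the task, so in MWAA its output and any traceback land in the scheduler log group in CloudWatch (which has to be enabled in the environment's logging configuration), not in the task log. One way to make failures of the callback itself easier to spot is to wrap it so it logs its own traceback before re-raising. A minimal sketch; the wrapper name is mine and not part of the original code:

import logging
import traceback

from xxx import on_failure_opsgenie

log = logging.getLogger(__name__)

def debuggable_on_failure(context):
    # Call the real callback, but log its own traceback if it blows up,
    # so the error is visible in the MWAA CloudWatch logs before re-raising.
    try:
        on_failure_opsgenie(context)
    except Exception:
        log.error("on_failure_opsgenie raised:\n%s", traceback.format_exc())
        raise

# ...and pass on_failure_callback=debuggable_on_failure to the DAG instead.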

Related

Trigger AWS Step Function Once file received in AWS s3 using Airflow File Sensor

I need to trigger an AWS Step Functions state machine whenever a file is received at an AWS S3 location, using the Airflow file sensor operator.
I'm trying this, but it's not working.
from airflow.models import DAG
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.sensors import S3KeySensor
import boto3

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2022, 2, 22),
    'email': ['nic@enye.tech'],
    'email_on_failure': False,
    'max_active_runs': 1,
    'email_on_retry': False,
    'retry_delay': timedelta(minutes=5)
}

dg = DAG('cloudwalker_s3_sensor',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False
         )

s3_buckname = 'demo1-s3-sensor'
s3_locat = 'demo/testfile.txt'
state_machine_arn = 'arn:......'

s3_sensor = S3KeySensor(
    task_id='s3_file_check',
    poke_interval=60,
    timeout=180,
    soft_fail=False,
    retries=2,
    bucket_key=s3_locat,
    bucket_name=s3_buckname,
    aws_conn_id='customer_demo',
    dag=dg)

def processing_func(**kwargs):
    print("Reading the file")
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=s3_buckname, Key=s3_locat)
    lin = obj['Body'].read().decode("utf-8")
    print(lin)

start_execution = StepFunctionStartExecutionOperator(task_id='start_execution', state_machine_arn=state_machine_arn)

s3_sensor >> func_task
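For reference, the snippet above never imports StepFunctionStartExecutionOperator and never defines func_task, so here is a sketch of how those missing pieces might look. The import path and task ids are assumptions on my part (in recent apache-airflow-providers-amazon releases the operator lives in airflow.providers.amazon.aws.operators.step_function; older releases shipped it in a step_function_start_execution module):

# Hypothetical completion of the snippet above; import path and task ids are assumptions.
from airflow.providers.amazon.aws.operators.step_function import StepFunctionStartExecutionOperator

func_task = PythonOperator(
    task_id='processing_task',
    python_callable=processing_func,
    provide_context=True,  # required on Airflow 1.x; not needed on 2.x
    dag=dg)

start_execution = StepFunctionStartExecutionOperator(
    task_id='start_execution',
    state_machine_arn=state_machine_arn,
    aws_conn_id='customer_demo',
    dag=dg)

s3_sensor >> func_task >> start_execution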

Airflow2 Creating Dag dynamically after function Run

Hello all, I am working with Airflow. Here is the scenario I am trying to resolve:
I want to create DAGs dynamically after a function runs.
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.email_operator import EmailOperator
    from airflow.utils.trigger_rule import TriggerRule
    from airflow.utils.task_group import TaskGroup
    import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

# ===============================================
default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
    'email': ['shahsoumil519@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': False,
}

dag = DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False)
# ================================================

class XcomHelper(object):

    def __init__(self, **context):
        self.context = context

    def get(self, key=None):
        """Get the value from XCom"""
        try:
            return self.context.get("ti").xcom_pull(key=key)
        except Exception as e:
            return "Error"

    def push(self, key=None, value=None):
        """Push the value on session"""
        try:
            self.context['ti'].xcom_push(key=key, value=value)
            return True
        except Exception as e:
            return False

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task(**context):
    DATA = ["soumil", "Shah"]
    for n in range(1, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

with DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False) as dag:
    simple_task = PythonOperator(task_id="simple_task",
                                 python_callable=simple_task,
                                 provide_context=True)
    simple_task
I want to create these DAGs based on the length of the DATA variable; that data comes from the database. I tried looking into:
https://www.astronomer.io/guides/dynamically-generating-dags
Can an Airflow task dynamically generate a DAG at runtime?
https://medium.com/@flavio.mtps/making-use-of-python-globals-to-dynamically-create-airflow-dags-124e556b704e
Any help would be great.
Revised Code :
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

def trigger_function():
    print("HEREE")
    simple_task()

with DAG(dag_id="project", schedule_interval="@once", default_args={'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}, catchup=False) as dag:
    trigger_function = PythonOperator(task_id="trigger_function", python_callable=trigger_function, provide_context=True)
    trigger_function
I removed a few lines from your code to keep the answer to the point. The code below will generate DAGs like hello_world_0, hello_world_1, ... based on the contents of DATA.
EDIT - I used Airflow v1.10.x, but the code should work for v2.x as well.
Suggestions:
Make the task names different from the DAG names.
The dag_number variable is currently not being used; it can be removed.
The DAGs will look like this -
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

simple_task()
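Since the question mentions that DATA ultimately comes from a database, a possible variation (a sketch only; the connection id, table, and column are invented for illustration) is to replace the hard-coded list with a query that runs when the scheduler parses the file. Keep that query cheap, because DAG files are re-parsed frequently:

# Hypothetical sketch: the connection id "my_metadata_db" and the SQL are assumptions.
from airflow.providers.postgres.hooks.postgres import PostgresHook

def fetch_data():
    hook = PostgresHook(postgres_conn_id="my_metadata_db")
    rows = hook.get_records("SELECT name FROM dag_sources")
    return [row[0] for row in rows]

def simple_task():
    DATA = fetch_data()  # instead of the hard-coded list
    for n in range(0, len(DATA)):
        dag_id = 'hello_world_{}'.format(str(n))
        globals()[dag_id] = create_dag(dag_id, '@daily', n,
                                       {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)})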

Airflow LocalFilesystemToGCSOperator marks the task with success but the file is not uploaded

I'm trying to upload a file from my local machine to GCS and I'm using the LocalFilesystemToGCSOperator. I'm following this how-to: https://airflow.readthedocs.io/en/latest/howto/operator/google/transfer/local_to_gcs.html#prerequisite-tasks. I've set up a connection to GCP with a path to a JSON key file. This is the DAG code:
import os

from airflow import models
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils import dates

BUCKET_NAME = 'bucket-name'
PATH_TO_UPLOAD_FILE = '...path-to/airflow/dags/example-text.txt'
DESTINATION_FILE_LOCATION = '/test-dir-input/example-text.txt'

with models.DAG(
    'example_local_to_gcs',
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
) as dag:
    upload_file = LocalFilesystemToGCSOperator(
        gcp_conn_id='custom_gcp_connection',
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
        mime_type='text/plain'
    )
When I trigger the DAG, it is marked as a success, but the file is not in the bucket.
It looks like there's a problem with your PATH_TO_UPLOAD_FILE and DESTINATION_FILE_LOCATION.
To give you an idea, here's a separate post that could also help you. The relevant parameters, similar to yours, were declared like this, for example:
src='/Users/john/Documents/tmp',
dst='gs://constantine-bucket',
bucket='constantine-bucket',
You should remove the ... and make sure that DESTINATION_FILE_LOCATION refers to your bucket name or a folder inside it, like this:
BUCKET_NAME = 'bucket-name'
PATH_TO_UPLOAD_FILE = '/path-to/airflow/dags/example-text.txt'
DESTINATION_FILE_LOCATION = 'gs://bucket-name/example-text.txt'
# Or in a folder on your bucket
# DESTINATION_FILE_LOCATION = 'gs://bucket-name/folder/example-text.txt'
The following code did the trick for me.
Please note that the service account used must have storage.objects permissions on the destination bucket to write the file.
import os
import datetime
from pathlib import Path

from airflow import DAG
from airflow.configuration import conf
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

comp_home_path = Path(conf.get("core", "dags_folder")).parent.absolute()
comp_bucket_path = "data/uploaded"  # <- if your file is within a folder
comp_local_path = os.path.join(comp_home_path, comp_bucket_path)

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime.today(),
    'end_date': None,
    'email': ['somename@somecompany.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=1)
}

sch_interval = None

dag = DAG(
    'mv_local_to_GCS',
    default_args=default_args,
    tags=["example"],
    catchup=False,
    schedule_interval=sch_interval
)

mv_local_gcs = LocalFilesystemToGCSOperator(
    task_id="local_to_gcs",
    src=comp_local_path + "/yourfilename.csv",  # PATH_TO_UPLOAD_FILE
    dst="somefolder/yournewfilename.csv",  # BUCKET_FILE_LOCATION
    bucket="yourproject",  # no 'gs://' and no trailing '/': just the bucket name; folders, if any, go in dst
    dag=dag
)

start = DummyOperator(task_id='Starting', dag=dag)

start >> mv_local_gcs

Why does APScheduler's get_jobs() return an empty list?

This is my test.py:
from datetime import datetime, timedelta
import sys
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore

jobstores = {
    # 'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
    'default': RedisJobStore(host='localhost', port=6379)
}

scheduler = BlockingScheduler(jobstores=jobstores)

def alarm(time):
    print('Alarm! This alarm was scheduled at %s.' % time)

if __name__ == '__main__':
    alarm_time = datetime.now() + timedelta(seconds=10)
    scheduler.add_job(alarm, 'interval', seconds=10, args=[datetime.now()], name='alarm_test')
    print('To clear the alarms, delete the example.sqlite file.')
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
I run python test.py and the job runs successfully.
Then, in another terminal (via PuTTY):
python
>>> import redis
>>> from test import *
>>> r = redis.Redis()
>>> r.keys()
>>> r.zrange('apscheduler.run_times',0,1)
This finds the job id 57841c0ee05249efb466882265f2c495.
>>> ret = scheduler.get_jobs(jobstore='default')
But ret is empty.
Why?
Thanks a lot.
Have you started the scheduler before running get_jobs()? If not, it will only list tentatively scheduled jobs. That's why you're not seeing the job.
Try this instead:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.redis import RedisJobStore
scheduler = BackgroundScheduler()
scheduler.add_jobstore('redis', host='localhost', port=6379)
scheduler.start(paused=True)
scheduler.print_jobs()

Airflow - Email Notification not working with SES

I have created two DAGs to check the email configuration for Airflow.
Basically, I want to get an email alert whenever a job fails.
I have also gone through the following links, but unfortunately I am not able to resolve the problem.
Link 1
Link 2
DAG One (successful job):
from datetime import datetime
from datetime import timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 6, 1),
    'email': ['firstnamelastname@company.com', 'firstnamelastname@company.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
    'email_on_success': True
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

def print_hello():
    return 'Hello world!'

dag = DAG('success', description='Simple tutorial DAG',
          schedule_interval='0 12 * * *', default_args=default_args,
          start_date=datetime(2017, 3, 20), catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
DAG Two (failing job):
from datetime import datetime
from datetime import timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 6, 1),
    'email': ['firstnamelastname@company.com', 'firstnamelastname@company.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
    'email_on_success': True
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

def print_hello():
    xxxx
    return 'Hello world!'

dag = DAG('success', description='Simple tutorial DAG',
          schedule_interval='0 12 * * *', default_args=default_args,
          start_date=datetime(2017, 3, 20), catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
I was expecting to get an email for both of the jobs, since both of them are configured with email_on_success and email_on_failure, but I did not receive any email.
Please have a look at the job run stats.
Here is my SMTP configuration under airflow.cfg:
smtp_host = email-smtp.ap-south-1.amazonaws.com
smtp_starttls = True
smtp_ssl = False
# Uncomment and set the user/pass settings if you want to use SMTP AUTH
smtp_user = XXXXXXXXXXXXXXXXXXX
smtp_password = XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
smtp_port = 587
smtp_mail_from = firstnamelastname@company.com
I obtained the username and password from "Create My SMTP Credentials" under the SES service. I also have a verified email address. The security group for my EC2 instance allows all outbound traffic (all protocols, all ports, destination 0.0.0.0/0).
What else am I missing here?
Is it possible to configure/generate logs for the email-sending process?
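On that last point: Airflow's own email helper uses the same smtp_* settings from airflow.cfg, so one way to isolate the problem (a sketch, not from the original post) is to call it directly from a Python shell on the Airflow host and watch for an SMTP exception. Note that while SES is in sandbox mode, both the sender and every recipient must be verified addresses:

# Quick smoke test of the smtp_* settings in airflow.cfg; the recipient is a placeholder.
from airflow.utils.email import send_email

send_email(
    to=["firstnamelastname@company.com"],
    subject="Airflow SMTP test",
    html_content="If this arrives, the SMTP/SES configuration is working.",
)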