AWS MWAA / Apache Airflow: how to debug on_failure_callback itself

I have a DAG like this:
import os
from datetime import timedelta

from xxx import on_failure_opsgenie
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

DAG_ID = os.path.basename(__file__).replace(".py", "")

DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email": ["airflow@example.com"],
    "email_on_failure": False,
    "email_on_retry": False,
}

def kaboom(*args, **kwargs):
    print("goodbye cruel world")
    print(args)
    print(kwargs)
    assert 1 == 2

with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    description="Print contents of airflow.cfg to logs",
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    on_failure_callback=on_failure_opsgenie,
) as dag:
    get_airflow_cfg_operator = PythonOperator(task_id="gonna_explode", python_callable=kaboom)
The DAG fails on purpose, as expected. However, on_failure_opsgenie is not doing what it should. How do I get the logs for, or otherwise debug, a failed on_failure_callback in AWS MWAA?
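As far as I can tell, a DAG-level on_failure_callback runs in the scheduler/DAG-processor process rather than in the task, so in MWAA its output and any traceback land in the scheduler log group in CloudWatch (which has to be enabled in the environment's logging configuration), not in the task log. One way to make failures of the callback itself easier to spot is to wrap it so it logs its own traceback before re-raising. A minimal sketch; the wrapper name is mine and not part of the original code:

import logging
import traceback

from xxx import on_failure_opsgenie

log = logging.getLogger(__name__)

def debuggable_on_failure(context):
    # Call the real callback, but log its own traceback if it blows up,
    # so the error is visible in the MWAA CloudWatch logs before re-raising.
    try:
        on_failure_opsgenie(context)
    except Exception:
        log.error("on_failure_opsgenie raised:\n%s", traceback.format_exc())
        raise

# ...and pass on_failure_callback=debuggable_on_failure to the DAG instead.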

Related

Trigger AWS Step Function Once file received in AWS s3 using Airflow File Sensor

I need to trigger an AWS Step Functions state machine whenever a file is received at an AWS S3 location, using the Airflow file sensor operator.
I'm trying this, but it's not working.
from airflow.models import DAG
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.sensors import S3KeySensor
import boto3

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2022, 2, 22),
    'email': ['nic@enye.tech'],
    'email_on_failure': False,
    'max_active_runs': 1,
    'email_on_retry': False,
    'retry_delay': timedelta(minutes=5)
}

dg = DAG('cloudwalker_s3_sensor',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False
         )

s3_buckname = 'demo1-s3-sensor'
s3_locat = 'demo/testfile.txt'
state_machine_arn = 'arn:......'

s3_sensor = S3KeySensor(
    task_id='s3_file_check',
    poke_interval=60,
    timeout=180,
    soft_fail=False,
    retries=2,
    bucket_key=s3_locat,
    bucket_name=s3_buckname,
    aws_conn_id='customer_demo',
    dag=dg)

def processing_func(**kwargs):
    print("Reading the file")
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=s3_buckname, Key=s3_locat)
    lin = obj['Body'].read().decode("utf-8")
    print(lin)

start_execution = StepFunctionStartExecutionOperator(task_id='start_execution', state_machine_arn=state_machine_arn)

s3_sensor >> func_task
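For reference, the snippet above never imports StepFunctionStartExecutionOperator and never defines func_task, so here is a sketch of how those missing pieces might look. The import path and task ids are assumptions on my part (in recent apache-airflow-providers-amazon releases the operator lives in airflow.providers.amazon.aws.operators.step_function; older releases shipped it in a step_function_start_execution module):

# Hypothetical completion of the snippet above; import path and task ids are assumptions.
from airflow.providers.amazon.aws.operators.step_function import StepFunctionStartExecutionOperator

func_task = PythonOperator(
    task_id='processing_task',
    python_callable=processing_func,
    provide_context=True,  # required on Airflow 1.x; not needed on 2.x
    dag=dg)

start_execution = StepFunctionStartExecutionOperator(
    task_id='start_execution',
    state_machine_arn=state_machine_arn,
    aws_conn_id='customer_demo',
    dag=dg)

s3_sensor >> func_task >> start_execution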

Airflow2 Creating Dag dynamically after function Run

Hello all, I am working with Airflow. Here is the scenario I am trying to resolve:
I want to create DAGs dynamically after a function runs.
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.email_operator import EmailOperator
    from airflow.utils.trigger_rule import TriggerRule
    from airflow.utils.task_group import TaskGroup
    import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

# ===============================================
default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
    'email': ['shahsoumil519@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': False,
}

dag = DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False)
# ================================================

class XcomHelper(object):

    def __init__(self, **context):
        self.context = context

    def get(self, key=None):
        """Get the value from XCom"""
        try:
            return self.context.get("ti").xcom_pull(key=key)
        except Exception as e:
            return "Error"

    def push(self, key=None, value=None):
        """Push the value on session"""
        try:
            self.context['ti'].xcom_push(key=key, value=value)
            return True
        except Exception as e:
            return False

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task(**context):
    DATA = ["soumil", "Shah"]
    for n in range(1, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

with DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False) as dag:
    simple_task = PythonOperator(task_id="simple_task",
                                 python_callable=simple_task,
                                 provide_context=True)
    simple_task
I want to create these DAGs based on the length of the DATA variable; that data comes from the database. I tried looking into:
https://www.astronomer.io/guides/dynamically-generating-dags
Can an Airflow task dynamically generate a DAG at runtime?
https://medium.com/@flavio.mtps/making-use-of-python-globals-to-dynamically-create-airflow-dags-124e556b704e
Any help would be great.
Revised Code :
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

def trigger_function():
    print("HEREE")
    simple_task()

with DAG(dag_id="project", schedule_interval="@once", default_args={'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}, catchup=False) as dag:
    trigger_function = PythonOperator(task_id="trigger_function", python_callable=trigger_function, provide_context=True)
    trigger_function
I removed a few lines from your code to keep the answer to the point. The code below will generate DAGs like hello_world_0, hello_world_1, ... based on the contents of DATA.
EDIT - I used Airflow v1.10.x, but the code should work for v2.x as well.
Suggestions:
Make the task names different from the DAG names.
The dag_number variable is currently not being used; it can be removed.
The DAGs will look like this -
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag

def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))

simple_task()
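Since the question mentions that DATA ultimately comes from a database, a possible variation (a sketch only; the connection id, table, and column are invented for illustration) is to replace the hard-coded list with a query that runs when the scheduler parses the file. Keep that query cheap, because DAG files are re-parsed frequently:

# Hypothetical sketch: the connection id "my_metadata_db" and the SQL are assumptions.
from airflow.providers.postgres.hooks.postgres import PostgresHook

def fetch_data():
    hook = PostgresHook(postgres_conn_id="my_metadata_db")
    rows = hook.get_records("SELECT name FROM dag_sources")
    return [row[0] for row in rows]

def simple_task():
    DATA = fetch_data()  # instead of the hard-coded list
    for n in range(0, len(DATA)):
        dag_id = 'hello_world_{}'.format(str(n))
        globals()[dag_id] = create_dag(dag_id, '@daily', n,
                                       {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)})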

Airflow LocalFilesystemToGCSOperator marks the task with success but the file is not uploaded

I'm trying to upload a file from my local machine to GCS and I'm using the LocalFilesystemToGCSOperator. I'm following this how-to: https://airflow.readthedocs.io/en/latest/howto/operator/google/transfer/local_to_gcs.html#prerequisite-tasks. I've set up a connection to GCP with a path to a JSON key file. This is the DAG code:
import os

from airflow import models
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils import dates

BUCKET_NAME = 'bucket-name'
PATH_TO_UPLOAD_FILE = '...path-to/airflow/dags/example-text.txt'
DESTINATION_FILE_LOCATION = '/test-dir-input/example-text.txt'

with models.DAG(
    'example_local_to_gcs',
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
) as dag:
    upload_file = LocalFilesystemToGCSOperator(
        gcp_conn_id='custom_gcp_connection',
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
        mime_type='text/plain'
    )
When I trigger the DAG, it is marked as a success, but the file is not in the bucket.
It looks like there's a problem with your PATH_TO_UPLOAD_FILE and DESTINATION_FILE_LOCATION.
To give you an idea, here's a separate post that could also help you. The relevant parameters, similar to yours, were declared like this, for example:
src='/Users/john/Documents/tmp',
dst='gs://constantine-bucket',
bucket='constantine-bucket',
You should remove the ... and make sure that DESTINATION_FILE_LOCATION refers to your bucket name or a folder inside it, like this:
BUCKET_NAME = 'bucket-name'
PATH_TO_UPLOAD_FILE = '/path-to/airflow/dags/example-text.txt'
DESTINATION_FILE_LOCATION = 'gs://bucket-name/example-text.txt'
# Or in a folder on your bucket
# DESTINATION_FILE_LOCATION = 'gs://bucket-name/folder/example-text.txt'
The following code did the trick for me.
Please note that the service account used must have storage.objects permissions on the destination bucket to write the file.
import os
import datetime
from pathlib import Path

from airflow import DAG
from airflow.configuration import conf
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

comp_home_path = Path(conf.get("core", "dags_folder")).parent.absolute()
comp_bucket_path = "data/uploaded"  # <- if your file is within a folder
comp_local_path = os.path.join(comp_home_path, comp_bucket_path)

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime.today(),
    'end_date': None,
    'email': ['somename@somecompany.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=1)
}

sch_interval = None

dag = DAG(
    'mv_local_to_GCS',
    default_args=default_args,
    tags=["example"],
    catchup=False,
    schedule_interval=sch_interval
)

mv_local_gcs = LocalFilesystemToGCSOperator(
    task_id="local_to_gcs",
    src=comp_local_path + "/yourfilename.csv",  # PATH_TO_UPLOAD_FILE
    dst="somefolder/yournewfilename.csv",  # BUCKET_FILE_LOCATION
    bucket="yourproject",  # no 'gs://' and no trailing '/': just the bucket name; folders, if any, go in dst
    dag=dag
)

start = DummyOperator(task_id='Starting', dag=dag)

start >> mv_local_gcs

Why does APScheduler's get_jobs() return an empty list?

This is my test.py:
from datetime import datetime, timedelta
import sys
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore

jobstores = {
    # 'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
    'default': RedisJobStore(host='localhost', port=6379)
}

scheduler = BlockingScheduler(jobstores=jobstores)

def alarm(time):
    print('Alarm! This alarm was scheduled at %s.' % time)

if __name__ == '__main__':
    alarm_time = datetime.now() + timedelta(seconds=10)
    scheduler.add_job(alarm, 'interval', seconds=10, args=[datetime.now()], name='alarm_test')
    print('To clear the alarms, delete the example.sqlite file.')
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
I run python test.py and the job runs successfully.
Then, in another terminal (via PuTTY):
python
>>> import redis
>>> from test import *
>>> r = redis.Redis()
>>> r.keys()
>>> r.zrange('apscheduler.run_times',0,1)
This finds the job id 57841c0ee05249efb466882265f2c495.
>>> ret = scheduler.get_jobs(jobstore='default')
But ret is empty.
Why?
Thanks a lot.
Have you started the scheduler before running get_jobs()? If not, it will only list tentatively scheduled jobs. That's why you're not seeing the job.
Try this instead:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.redis import RedisJobStore
scheduler = BackgroundScheduler()
scheduler.add_jobstore('redis', host='localhost', port=6379)
scheduler.start(paused=True)
scheduler.print_jobs()

Airflow - Email Notification not working with SES

I have created two DAGs to check the email configuration for Airflow.
Basically, I want to get an email alert whenever a job fails.
I have also gone through the following links, but unfortunately I am not able to resolve the problem.
Link 1
Link 2
DAG One (successful job):
from datetime import datetime
from datetime import timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 6, 1),
    'email': ['firstnamelastname@company.com', 'firstnamelastname@company.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
    'email_on_success': True
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

def print_hello():
    return 'Hello world!'

dag = DAG('success', description='Simple tutorial DAG',
          schedule_interval='0 12 * * *', default_args=default_args,
          start_date=datetime(2017, 3, 20), catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
DAG Two (failing job):
from datetime import datetime
from datetime import timedelta

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 6, 1),
    'email': ['firstnamelastname@company.com', 'firstnamelastname@company.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(seconds=5),
    'email_on_success': True
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

def print_hello():
    xxxx
    return 'Hello world!'

dag = DAG('success', description='Simple tutorial DAG',
          schedule_interval='0 12 * * *', default_args=default_args,
          start_date=datetime(2017, 3, 20), catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)

dummy_operator >> hello_operator
I was expecting to get an email for both of the jobs, since both of them are configured with email_on_success and email_on_failure, but I did not receive any email.
Please have a look at the job run stats.
Here is my SMTP configuration under airflow.cfg:
smtp_host = email-smtp.ap-south-1.amazonaws.com
smtp_starttls = True
smtp_ssl = False
# Uncomment and set the user/pass settings if you want to use SMTP AUTH
smtp_user = XXXXXXXXXXXXXXXXXXX
smtp_password = XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
smtp_port = 587
smtp_mail_from = firstnamelastname@company.com
I obtained the username and password from "Create My SMTP Credentials" under the SES service. I also have a verified email address. The security group for my EC2 instance allows all outbound traffic (all protocols, all ports, destination 0.0.0.0/0).
What else am I missing here?
Is it possible to configure/generate logs for the email-sending process?
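On that last point: Airflow's own email helper uses the same smtp_* settings from airflow.cfg, so one way to isolate the problem (a sketch, not from the original post) is to call it directly from a Python shell on the Airflow host and watch for an SMTP exception. Note that while SES is in sandbox mode, both the sender and every recipient must be verified addresses:

# Quick smoke test of the smtp_* settings in airflow.cfg; the recipient is a placeholder.
from airflow.utils.email import send_email

send_email(
    to=["firstnamelastname@company.com"],
    subject="Airflow SMTP test",
    html_content="If this arrives, the SMTP/SES configuration is working.",
)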