How can I get the tags in the on_dag_failure function on Airflow? - airflow-scheduler

Here is my DAG and task; when the task fails, it calls on_dag_failure.
dag = DAG(
    'my_dag_id',
    catchup=False,
    schedule_interval='00 01 17 * *',
    description='My dag desc',
    default_args=default_args,
    tags=['TAG_1', 'TAG_2'],
)

run_this_0 = BashOperator(
    task_id='my_task_id',
    bash_command='some cmd',
    on_failure_callback=on_dag_failure,
    execution_timeout=None,
    dag=dag,
)
In on_dag_failure, I want to get the tags that are defined on the DAG.
Is there any way to get them?
def on_dag_failure(context):
    tags = dag.tags  # how to get the tags from the dag?
    tags_str = ""
    for tag in tags:
        tags_str += tag + " "
    print(tags_str)

The models.DAG object should be available in the context variable provided to the function:
def on_dag_failure(context):
    tags_str = " ".join(context["dag"].tags)
    print(tags_str)
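For completeness, here is a minimal sketch of the callback wired up end to end, assuming Airflow 2.x. The start_date and the logging of the failed task are illustrative additions (the question keeps start_date in default_args); task_instance is another key that should be available in the failure-callback context.
from datetime import datetime
from airflow import DAG
from airflow.operators.bash import BashOperator

def on_dag_failure(context):
    # The DAG object comes straight from the callback context.
    tags_str = " ".join(context["dag"].tags)
    ti = context["task_instance"]
    print("Task " + ti.task_id + " failed in a DAG tagged: " + tags_str)

dag = DAG(
    'my_dag_id',
    start_date=datetime(2022, 1, 1),  # assumed; the question keeps this in default_args
    schedule_interval='00 01 17 * *',
    catchup=False,
    tags=['TAG_1', 'TAG_2'],
)

run_this_0 = BashOperator(
    task_id='my_task_id',
    bash_command='some cmd',
    on_failure_callback=on_dag_failure,
    dag=dag,
)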

Related

Schedule an Airflow DAG for running with parameters

I am trying to trigger an Airflow DAG externally and pass some parameters to the DAG. The DAG is scheduled to run every 3 minutes. My problem is that the parameters are only being used by the first DAG run.
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.python import PythonOperator
import os

dag_id = "proj"
home_path = os.path.expanduser("~")
runpath = os.path.join(home_path, "airflow/data", dag_id)

def load_data(ti):
    import os
    train = os.path.join(runpath, "mnist")
    test = os.path.join(runpath, "mnist.t")
    model = os.path.join(runpath, "trained.mnist")
    if not os.path.exists(train):
        os.system(
            "curl https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2 --output mnist.bz2"
        )
        os.system("bzip2 -d mnist.bz2")
    if not os.path.exists(test):
        os.system(
            "curl https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.t.bz2 --output mnist.t.bz2"
        )
        os.system("bzip2 -d mnist.t.bz2")
    ti.xcom_push(key="train_path", value=train)
    ti.xcom_push(key="test_path", value=test)
    ti.xcom_push(key="model_path", value=model)

def train(**context):
    import os
    ti = context["ti"]
    train = ti.xcom_pull(task_ids="load_data", key="train_path")
    model_path = ti.xcom_pull(task_ids="load_data", key="model_path")
    lr = context["dag_run"].conf["lr"]
    epochs = context["dag_run"].conf["epochs"]
    name = context["dag_run"].conf["name"]
    print(lr)
    print(epochs)
    # ... training code omitted in the question; model_final_name is produced there
    ti.xcom_push(key="model_name", value=model_final_name)

def validate(**context):
    ti = context["ti"]
    test = ti.xcom_pull(task_ids="load_data", key="test_path")
    model_path = ti.xcom_pull(task_ids="train", key="model_name")
    print(test)
    print(model_path)

with DAG(
    dag_id="project",
    default_args={"owner": "airflow"},
    start_date=datetime(2022, 8, 8),
    schedule_interval=timedelta(minutes=3),
    tags=["mnist_4"],
    catchup=False,
) as dag:
    print(runpath)
    os.makedirs(runpath, exist_ok=True)
    os.chdir(runpath)
    read_file = PythonOperator(
        task_id="load_data",
        python_callable=load_data,
        provide_context=True,
    )
    process_train = PythonOperator(
        task_id="train",
        python_callable=train,
        provide_context=True,
    )
    validate = PythonOperator(
        task_id="validate", python_callable=validate, provide_context=True
    )
    read_file >> process_train >> validate
I trigger the DAG with the command:
airflow dags trigger project --conf '{"epochs":1,"name":"trial_3","lr":0.001}'
Except for one run, all the other runs have failed with the following error:
KeyError: 'lr'
When I look at the conf for the DAG runs, only one run has the conf; the rest are empty.
If I look at the External Trigger field, only one run is true, which means that only the run I triggered manually received the conf; the rest were scheduled.
I want to know how to pass config to the scheduled DAG runs as well.
I hope this helps.
Indeed, dag run conf only works for manually triggered DAGs; that is the only case in which a conf can be passed.
For scheduled DAGs, you can set default params on the DAG; this post shows an example:
Airflow how to set default values for dag_run.conf
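A minimal sketch of that approach, assuming Airflow 2.x (the default values in params below are illustrative, not taken from the question): declare defaults on the DAG and fall back to them whenever dag_run.conf is empty, which is the case for scheduled runs.
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator

def train(**context):
    conf = context["dag_run"].conf or {}
    # Scheduled runs have no conf, so fall back to the DAG-level params.
    lr = conf.get("lr", context["params"]["lr"])
    epochs = conf.get("epochs", context["params"]["epochs"])
    name = conf.get("name", context["params"]["name"])
    print(lr, epochs, name)

with DAG(
    dag_id="project",
    start_date=datetime(2022, 8, 8),
    schedule_interval=timedelta(minutes=3),
    catchup=False,
    params={"lr": 0.001, "epochs": 1, "name": "scheduled_run"},  # assumed defaults
) as dag:
    PythonOperator(task_id="train", python_callable=train)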

How to trigger DAG3 after successful execution of DAG1 and DAG2 in Airflow?

I have created DAG1 and DAG2, each containing 2 tasks.
In DAG3 I trigger DAG1 and DAG2 using TriggerDagRunOperator, with the wait_for_completion parameter set and a poke_interval of 15 seconds.
But the problem is that the scheduler does not cope with this parameter: the scheduler stops.
Here is my code for DAG3:
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id="controller_dag",
    default_args={"owner": "airflow"},
    start_date=days_ago(2),
    schedule_interval="@once",
    tags=['example'],
)

def BYE():
    return "GOOD BYE from Controller_Dag"

trigger1 = TriggerDagRunOperator(
    task_id="trigger_dag_1",
    trigger_dag_id="target_dag_1",
    wait_for_completion=True,
    poke_interval=15,
    dag=dag,
)

trigger2 = TriggerDagRunOperator(
    task_id="trigger_dag_2",
    trigger_dag_id="target_dag_2",
    wait_for_completion=True,
    poke_interval=15,
    dag=dag,
)

Bye = PythonOperator(
    task_id="BYE",
    python_callable=BYE,
    dag=dag,
)

[trigger1, trigger2] >> Bye
If anyone knows why this is happening, please help me.
Thanks in advance!

How to schedule my crawler function in Django periodically using Celery?

Here I have a view, CrawlerHomeView, which is used to create a task object from a form. Now I want to schedule this task periodically with Celery.
I want to schedule this CrawlerHomeView process using the task object's search_frequency and by checking some of the task object's fields.
Task Model
class Task(models.Model):
    INITIAL = 0
    STARTED = 1
    COMPLETED = 2
    ERROR = 3  # assumed value; referenced below but not defined in the original snippet
    task_status = (
        (INITIAL, 'running'),
        (STARTED, 'running'),
        (COMPLETED, 'completed'),
        (ERROR, 'error'),
    )
    FREQUENCY = (
        ('1', '1 hrs'),
        ('2', '2 hrs'),
        ('6', '6 hrs'),
        ('8', '8 hrs'),
        ('10', '10 hrs'),
    )
    name = models.CharField(max_length=255)
    scraping_end_date = models.DateField(null=True, blank=True)
    search_frequency = models.CharField(max_length=5, null=True, blank=True, choices=FREQUENCY)
    status = models.IntegerField(choices=task_status)
tasks.py
I want to run the view posted below periodically (with a period equal to the task's search_frequency) if the task status is 0 or 1 and the task's scraping end date has not been crossed. But I got stuck here. How can I do this?
@periodic_task(run_every=crontab(hour="task.search_frequency"))  # how to do this with the task's search_frequency value?
def schedule_task(pk):
    task = Task.objects.get(pk=pk)
    if task.status == 0 or task.status == 1 and not datetime.date.today() > task.scraping_end_date:
        # perform the crawl function ---> def crawl() how ??
        if task.scraping_end_date == datetime.date.today():
            task.status = 2
            task.save()  # change the task status to completed.
views.py
I want to run this view periodically. How can I do it?
class CrawlerHomeView(LoginRequiredMixin, View):
    login_url = 'users:login'

    def get(self, request, *args, **kwargs):
        # all_task = Task.objects.all().order_by('-id')
        frequency = Task()
        categories = Category.objects.all()
        targets = TargetSite.objects.all()
        keywords = Keyword.objects.all()
        form = CreateTaskForm()
        context = {
            'targets': targets,
            'keywords': keywords,
            'frequency': frequency,
            'form': form,
            'categories': categories,
        }
        return render(request, 'index.html', context)

    def post(self, request, *args, **kwargs):
        form = CreateTaskForm(request.POST)
        if form.is_valid():
            # try:
            unique_id = str(uuid4())  # create a unique ID.
            obj = form.save(commit=False)
            # obj.keywords = keywords
            obj.created_by = request.user
            obj.unique_id = unique_id
            obj.status = 0
            obj.save()
            form.save_m2m()
            keywords = ''
            # for keys in ast.literal_eval(obj.keywords.all()):  # keywords change to csv
            for keys in obj.keywords.all():
                if keywords:
                    keywords += ', ' + keys.title
                else:
                    keywords += keys.title
            # tasks = request.POST.get('targets')
            # targets = ['thehimalayantimes', 'kathmandupost']
            # print('$$$$$$$$$$$$$$$ keywords', keywords)
            task_ids = []  # one Task/Project contains one or multiple scrapy tasks
            settings = {
                'spider_count': len(obj.targets.all()),
                'keywords': keywords,
                'unique_id': unique_id,  # unique ID for each record for DB
                'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
            }
            # res = ast.literal_eval(ini_list)
            for site_url in obj.targets.all():
                domain = urlparse(site_url.address).netloc  # parse the url and extract the domain
                spider_name = domain.replace('.com', '')
                task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
                # task = scrapyd.schedule('default', spider_name, settings=settings, url=obj.targets, domain=domain, keywords=obj.keywords)
            return redirect('crawler:task-list')
            # except:
            #     return render(request, 'index.html', {'form': form})
        return render(request, 'index.html', {'form': form, 'errors': form.errors})
Are there any suggestions or an answer for this problem?
After fighting Celery for 5 years in a 15k-tasks/second setup, I highly recommend you switch to Dramatiq, which has a sane, reliable, performant code base that isn't split across multiple convoluted packages and has worked perfectly in two of my newer projects so far.
From the author's motivation
I’ve used Celery professionally for years and my growing frustration with it is one of the reasons why I developed dramatiq. Here are some of the main differences between Dramatiq, Celery and RQ:
There's also a Django helper package: https://github.com/Bogdanp/django_dramatiq
Granted, you won't have a built-in celerybeat, but a cron job calling Python tasks is more robust anyway; we lost a good amount of data because celerybeat decided to stall regularly :)
There are two projects that aim to add periodic task creation: https://gitlab.com/bersace/periodiq and https://apscheduler.readthedocs.io/en/stable/
I haven't used those packages yet. What you could try with periodiq is selecting your database entries, looping through them, and defining a periodic task for each (but this requires regular restarts of the periodiq worker to pick up changes):
# tasks.py
import dramatiq
from dramatiq import get_broker
from periodiq import PeriodiqMiddleware, cron

from .models import Task  # assumption: the Task model shown above lives in this app's models

broker = get_broker()
broker.add_middleware(PeriodiqMiddleware(skip_delay=30))

for obj in Task.objects.all():
    @dramatiq.actor(periodic=cron(obj.frequency))
    def hourly(obj=obj):
        # import logic based on obj.name
        # Do something each hour…
        pass
For the error:
Exception Type: EncodeError
Exception Value: Object of type timedelta is not JSON serializable
Instead of defining the following variable in the Django settings:
CELERY_BEAT_SCHEDULE = {
    'task-first': {
        'task': 'scheduler.tasks.create_task',
        'schedule': timedelta(minutes=1)
    },
}
can you try the following in your celery file:
app.conf.beat_schedule = {
    'task-first': {
        'task': 'scheduler.tasks.create_task',
        'schedule': crontab(minute='*/1')
    }
}
This works for me, given that the Celery server is up and running.
Apart from this, why are you redirecting to 'list_tasks' after each task? What does it do exactly? Also, you have called the Celery task from the view with add_task_celery.delay(name, date, freq); is that just another way to add a task, apart from the periodic task defined using celery-beat?
Edit 1:
My structure looks as follows:
settings.py
CELERY_TIMEZONE = 'Asia/Kolkata'
CELERY_BROKER_URL = 'amqp://localhost'
celery.py
app.conf.beat_schedule = {
    'task1': {
        'task': '<app_name>.tasks.random_task',
        'schedule': crontab(minute=0, hour=0)
    },
}
Note that I have a file named tasks in my app folder, and there I have written a shared task as follows:
@shared_task
def random_task(total):
    ...
Apart from this, you should start both a celery beat process and a celery worker process, as follows:
celery -A <project_name>.celery worker -l error
celery -A <project_name>.celery beat -l error --scheduler django_celery_beat.schedulers:DatabaseScheduler
You can use any scheduler you want; in production I use the DatabaseScheduler. For testing, you can try the following command:
celery -A <project_name> beat -l info -S django
You should run all these commands from the project folder of the Django project.
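For reference, a minimal celery.py along these lines might look like the following sketch; the <project_name> and <app_name> placeholders match the ones used above, and this is not the poster's actual file.
# <project_name>/celery.py
import os
from celery import Celery
from celery.schedules import crontab

os.environ.setdefault('DJANGO_SETTINGS_MODULE', '<project_name>.settings')

app = Celery('<project_name>')
app.config_from_object('django.conf:settings', namespace='CELERY')
app.autodiscover_tasks()

app.conf.beat_schedule = {
    'task1': {
        'task': '<app_name>.tasks.random_task',
        'schedule': crontab(minute=0, hour=0),
    },
}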
I believe the problem is with the 2nd and 3rd parameters in the task definition, which are freq and date. From the error you posted, Object of type timedelta is not JSON serializable, it looks like it's complaining about the freq field, which is of type DurationField and returns a timedelta object.
Ideally, both fields must be serialized before being passed to the task.
One simple way would be:
1) You can explicitly serialize these fields, pass them to the task, and convert them back to datetime / timedelta objects inside the task (see the sketch after this list).
Alternatively, you can dump the whole data dict if there are too many items:
add_task_celery.delay(json.dumps(form.cleaned_data))
and then do json.loads(...) in the task.
2) Another thing you can try is to pass the serializer explicitly in the parameters (using apply_async instead of delay):
add_task_celery.apply_async((name, date, freq), serializer='json')
3) You can also set the CELERY_TASK_SERIALIZER = 'json' setting, if you haven't already (the default value is 'pickle').
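As a sketch of option 1 under the assumptions above (add_task_celery, name, date and freq come from the discussion; the exact conversions chosen here are illustrative, not prescribed by any library):
# in the view: serialize the non-JSON-friendly fields explicitly
add_task_celery.delay(
    name,
    date.isoformat(),        # datetime/date -> ISO 8601 string
    freq.total_seconds(),    # timedelta (DurationField) -> float seconds
)

# in tasks.py: rebuild the original objects inside the task
from datetime import datetime, timedelta
from celery import shared_task

@shared_task
def add_task_celery(name, date_str, freq_seconds):
    date = datetime.fromisoformat(date_str)
    freq = timedelta(seconds=freq_seconds)
    # ... continue with the original task logic using name, date and freq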

Unable to launch Multiple Streaming Pipelines (N to N Pipelines) Dynamically (Using Runtime Value Provider) in a Single Dataflow Job in Python

I am trying to launch a streaming Dataflow job which contains n pipelines.
Based on the configured topic and the corresponding BQ table for each topic, I want to launch a pipeline inside one streaming job.
My actual problem is that I have to create and upload a template for each and every project. What I want is to reuse the uploaded template and only pass configuration files when launching a new Dataflow job, changing the topic, subscription, dataset and BQ table.
Right now I am unable to reuse the template.
Please help me with this and let me know whether this is possible or not, because Google only provides one-to-one templates, not many-to-many templates (e.g. three topics to three BQ tables, i.e. three data pipelines, n-to-n).
import logging
import os
import json

from google.cloud import storage
from apache_beam import Pipeline, ParDo, DoFn
from apache_beam.io import ReadFromPubSub, WriteToBigQuery, BigQueryDisposition
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, WorkerOptions, GoogleCloudOptions, \
    SetupOptions


def _get_storage_service():
    storage_client = storage.Client \
        .from_service_account_json(
            json_credentials_path=r'C:\Users\dneema\PycharmProjects\iot_dataflow\df_stm_iot_pubsub_bq\service_account_credentials.json')
    print('storage service fetched')
    return storage_client


class RuntimeOptions(PipelineOptions):

    def __init__(self, flags=None, **kwargs):
        super(RuntimeOptions, self).__init__(flags, **kwargs)

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--bucket_name', type=str)
        parser.add_value_provider_argument('--config_json_path', type=str,)


class PipelineCreator:

    def __init__(self):
        self.options = PipelineOptions()
        storage_client = storage.Client.from_service_account_json(
            'service_account_credentials_updated.json')
        runtime_options = self.options.view_as(RuntimeOptions)
        bucket_name = str(runtime_options.bucket_name)
        config_json_path = str(runtime_options.config_json_path)
        # get the bucket with name
        bucket = storage_client.get_bucket(bucket_name)
        # get bucket file as blob
        blob = bucket.get_blob(config_json_path)
        # convert to string and load config
        json_data = blob.download_as_string()
        self.configData = json.loads(json_data)
        dataflow_config = self.configData['dataflow_config']
        self.options.view_as(StandardOptions).streaming = bool(dataflow_config['streaming'])
        self.options.view_as(SetupOptions).save_main_session = True
        worker_options = self.options.view_as(WorkerOptions)
        worker_options.max_num_workers = int(dataflow_config['max_num_worker'])
        worker_options.autoscaling_algorithm = str(dataflow_config['autoscaling_algorithm'])
        # worker_options.machine_type = str(dataflow_config['machine_type'])
        # worker_options.zone = str(dataflow_config['zone'])
        # worker_options.network = str(dataflow_config['network'])
        # worker_options.subnetwork = str(dataflow_config['subnetwork'])

    def run(self):
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'dataflow-service-account.json'
        project_id = self.configData['project_id']
        dataset_id = self.configData['dataset_id']
        topics = self.configData['topics']
        table_ids = self.configData['bq_table_ids']
        error_table_id = self.configData['error_table_id']
        logger = logging.getLogger(project_id)
        logger.info(self.options.display_data())
        pipeline = Pipeline(options=self.options)
        size = len(topics)
        for index in range(size):
            print(topics[index])
            pipeline_name = "pipeline_" + str(index)
            logger.info("Launch pipeline :: " + pipeline_name)
            messages = pipeline | 'Read PubSub Message in ' + pipeline_name >> ReadFromPubSub(topic=topics[index])
            logger.info("Read PubSub Message")
            valid_messages, invalid_messages = messages | 'Convert Messages to TableRows in ' + pipeline_name >> ParDo(TransformMessageToTableRow()).with_outputs('invalid', main='valid')
            valid_messages | 'Write Messages to BigQuery in ' + pipeline_name >> WriteToBigQuery(table=table_ids[index],
                                                                                                 dataset=dataset_id,
                                                                                                 project=project_id,
                                                                                                 write_disposition=BigQueryDisposition.WRITE_APPEND)
        pipeline.run().wait_until_finish()


class TransformMessageToTableRow(DoFn):

    def process(self, element, *args, **kwargs):
        logging.getLogger('dataflow').log(logging.INFO, element)
        print(element)
        print("element type ", type(element))
        print("inside bq pardo")
        import json
        try:
            message_rows = json.loads(element)
            # if using emulator, uncomment below line
            # message_rows = json.loads(message_rows)
            print('loaded element')
        except Exception:
            try:
                element = "[" + element + "]"
                message_rows = json.loads(element)
            except Exception as e:
                print(e)
                from apache_beam import pvalue
                yield [pvalue.TaggedOutput('invalid', [element, str(e)])]
        print(message_rows)
        print("message rows", type(message_rows))
        if not isinstance(message_rows, list):
            message_rows = [message_rows]
        # rows = list()
        if isinstance(message_rows, list):
            for row in message_rows:
                try:
                    new_row = dict()
                    for k, v in row.items():
                        new_row[str(k)] = v
                    # rows.append(new_row)
                    print(new_row)
                    yield new_row
                except Exception as e:
                    print(e)
                    from apache_beam import pvalue
                    yield pvalue.TaggedOutput('invalid', [row, str(e)])


if __name__ == '__main__':
    PipelineCreator().run()
Here the runtime arguments are bucket_name and config_json_path, which point to all the configuration-related settings such as the dataset, BQ tables, topics/subscriptions and all the worker options.
Is this possible or not? Google only provides one-to-one templates, not many-to-many templates (e.g. three topics to three BQ tables, i.e. three data pipelines, n-to-n).
Regarding the previously answered thread Unable to run multiple Pipelines in desired order by creating template in Apache Beam: you can run only one pipeline inside a template at any time.
You'll have to delegate the template creation to another service and pass the configuration with it; just follow the link inside the thread and you'll find how-to examples.

How can I give a dynamic time period to a periodic task in Django Celery?

I have a task to send reminder mails to my users one hour before a specific occasion. How can I set the time of the periodic task?
@periodic_task( ? )
def auto_email():
    subject = "Exam"
    date_time = datetime.now()
    body = ""
    to = []
    after_hour_time = date_time + timedelta(minutes=60)
    student_qs = Student.objects.filter(date_time=after_hour_time)
    for obj in student_qs:
        to.append(obj.email)
    send_mail(subject, body, settings.EMAIL_HOST_USER, to)
Thanks