Schedule an Airflow DAG for running with parameters - google-cloud-platform

I am trying to trigger an Airflow DAG externally while passing some parameters to it. The DAG is scheduled to run every 3 minutes. My problem is that the parameters are only picked up by the first DAG run.
from pyexpat import model
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.python import PythonOperator
import os

dag_id = "proj"
home_path = os.path.expanduser("~")
runpath = os.path.join(home_path, "airflow/data", dag_id)


def load_data(ti):
    import os

    train = os.path.join(runpath, "mnist")
    test = os.path.join(runpath, "mnist.t")
    model = os.path.join(runpath, "trained.mnist")
    if not os.path.exists(train):
        os.system(
            "curl https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2 --output mnist.bz2"
        )
        os.system("bzip2 -d mnist.bz2")
    if not os.path.exists(test):
        os.system(
            "curl https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.t.bz2 --output mnist.t.bz2"
        )
        os.system("bzip2 -d mnist.t.bz2")
    ti.xcom_push(key="train_path", value=train)
    ti.xcom_push(key="test_path", value=test)
    ti.xcom_push(key="model_path", value=model)


def train(**context):
    import os

    ti = context["ti"]
    train = ti.xcom_pull(task_ids="load_data", key="train_path")
    model_path = ti.xcom_pull(task_ids="load_data", key="model_path")
    lr = context["dag_run"].conf["lr"]
    epochs = context["dag_run"].conf["epochs"]
    name = context["dag_run"].conf["name"]
    print(lr)
    print(epochs)
    ti.xcom_push(key="model_name", value=model_final_name)


def validate(**context):
    ti = context["ti"]
    test = ti.xcom_pull(task_ids="load_data", key="test_path")
    model_path = ti.xcom_pull(task_ids="train", key="model_name")
    print(test)
    print(model_path)


with DAG(
    dag_id="project",
    default_args={"owner": "airflow"},
    start_date=datetime(2022, 8, 8),
    schedule_interval=timedelta(minutes=3),
    tags=["mnist_4"],
    catchup=False,
) as dag:
    print(runpath)
    os.makedirs(runpath, exist_ok=True)
    os.chdir(runpath)
    read_file = PythonOperator(
        task_id="load_data",
        python_callable=load_data,
        provide_context=True,
    )
    process_train = PythonOperator(
        task_id="train",
        python_callable=train,
        provide_context=True,
    )
    validate = PythonOperator(
        task_id="validate", python_callable=validate, provide_context=True
    )
    read_file >> process_train >> validate
I trigger the DAG with the command:
airflow dags trigger project --conf '{"epochs":1,"name":"trial_3","lr":0.001}'
Except for one run, all the other runs have failed with the following error:
KeyError: 'lr'
When I look at the conf for the DAG runs, only one run has the conf; the rest are empty.
If I look at the External Trigger field, only one run is true, which means only that one run was actually triggered externally; the rest were scheduled.
I want to know how to pass the config to the scheduled DAG runs as well.

I hope this helps.
Indeed, dag_run conf only works for manually triggered DAGs; that is the only case where the conf can be passed.
For scheduled DAGs, you can set default params in your DAG; this post shows an example:
Airflow how to set default values for dag_run.conf
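For example, a minimal sketch (reusing the parameter names from the question, not the author's exact code): declare default params on the DAG and fall back to them whenever dag_run.conf is empty, so scheduled runs no longer raise KeyError:

from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator


def train(**context):
    # conf is only populated for manually triggered runs;
    # params holds the defaults declared on the DAG below.
    conf = context["dag_run"].conf or {}
    params = context["params"]
    lr = conf.get("lr", params["lr"])
    epochs = conf.get("epochs", params["epochs"])
    name = conf.get("name", params["name"])
    print(lr, epochs, name)


with DAG(
    dag_id="project",
    start_date=datetime(2022, 8, 8),
    schedule_interval=timedelta(minutes=3),
    catchup=False,
    params={"lr": 0.001, "epochs": 1, "name": "scheduled_run"},
) as dag:
    PythonOperator(task_id="train", python_callable=train)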

Related

Cannot fix start manual transfer runs method for bigquery_datatransfer_v1

I am trying to run a scheduled query only when a table updates in BigQuery. For this I am trying to make this Python code work in Cloud Functions, but it is giving me an error. I would highly appreciate any help.
I am running this Python code:
import time
from google.protobuf.timestamp_pb2 import Timestamp
from google.cloud import bigquery_datatransfer_v1


def runQuery(parent, requested_run_time):
    client = bigquery_datatransfer_v1.DataTransferServiceClient()
    projectid = '629586xxxx'  # Enter your projectID here
    transferid = '60cc15f8-xxxx-xxxx-8ba2-xxxxx41bc'  # Enter your transferId here
    parent = client.transfer_config_path(projectid, transferid)
    start_time = Timestamp(seconds=int(time.time() + 10))
    response = client.start_manual_transfer_runs(parent, requested_run_time=start_time)
    print(response)
I get this error
start_manual_transfer_runs() got an unexpected keyword argument 'requested_run_time'
I did this using a Cloud Function and it worked for me.
import time
from google.protobuf.timestamp_pb2 import Timestamp
from google.cloud import bigquery_datatransfer_v1


def runQuery(parent, requested_run_time):
    client = bigquery_datatransfer_v1.DataTransferServiceClient()
    PROJECT_ID = 'graphical-reach-285218'
    TRANSFER_CONFIG_ID = '61adfc39-0000-206b-a7b0-089e08324288'
    parent = client.project_transfer_config_path(PROJECT_ID, TRANSFER_CONFIG_ID)
    start_time = bigquery_datatransfer_v1.types.Timestamp(seconds=int(time.time() + 10))
    response = client.start_manual_transfer_runs(parent, requested_run_time=start_time)
    print(parent)
    print(response)
Replace these with your project ID and transfer config ID. The above code goes in main.py, and in requirements.txt please keep:
google-cloud-bigquery-datatransfer==1
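Note that newer releases of the client library (google-cloud-bigquery-datatransfer 2.x and later) changed the method to take a single request object instead of keyword arguments, which may be why the original call raised the "unexpected keyword argument" error. A hedged sketch of the equivalent call for those versions (the IDs are placeholders):

import time
from google.protobuf.timestamp_pb2 import Timestamp
from google.cloud import bigquery_datatransfer_v1


def run_query(project_id, transfer_config_id):
    client = bigquery_datatransfer_v1.DataTransferServiceClient()
    parent = client.transfer_config_path(project_id, transfer_config_id)
    start_time = Timestamp(seconds=int(time.time() + 10))
    # Newer versions expect a single request dict/object.
    response = client.start_manual_transfer_runs(
        request={"parent": parent, "requested_run_time": start_time}
    )
    print(response)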

How to trigger DAG3 after successfully execution of DAG1 and DAG2 in airflow?

I have created DAG1 and DAG2, each containing 2 tasks.
In DAG3 I use TriggerDagRunOperator to trigger DAG1 and DAG2, setting the wait_for_completion parameter and a poke_interval of 15 seconds.
But the problem is that the scheduler does not work with these parameters: the scheduler stops.
Here is my code for DAG3:
from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id="controller_dag",
    default_args={"owner": "airflow"},
    start_date=days_ago(2),
    schedule_interval="@once",
    tags=['example'],
)


def BYE():
    return "GOOD BYE from Controller_Dag"


trigger1 = TriggerDagRunOperator(
    task_id="trigger_dag_1",
    trigger_dag_id="target_dag_1",
    wait_for_completion=True,
    poke_interval=15,
    dag=dag,
)
trigger2 = TriggerDagRunOperator(
    task_id="trigger_dag_2",
    trigger_dag_id="target_dag_2",
    wait_for_completion=True,
    poke_interval=15,
    dag=dag,
)
Bye = PythonOperator(
    task_id="BYE",
    python_callable=BYE,
    dag=dag,
)

[trigger1, trigger2] >> Bye
If anyone knows why this is happening, please help me.
Thanks in advance!

Possibility of creating DAGs in Airflow

Is there a way to create a DAG file dynamically from code and upload it to Airflow? (Airflow reads from the dags directory, but creating a file for every DAG and uploading it to that folder is slow.)
Is it possible to create a template DAG and populate it with new logic whenever it is needed?
I saw that they are working on an API. The current version only has a trigger-DAG option.
You can quite easily create multiple DAGs in a single file:

def create_dag(dag_id):
    dag = DAG(...)
    # some tasks added
    return dag

for dag_id in dags_lists:
    globals()[dag_id] = create_dag(dag_id)
If you create proper DAG objects with the template function (create_dag in the above example) and make them available in the globals object, Airflow will recognise them as individual DAGs.
Yes you can create dynamic DAGs as follows:
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def create_dag(dag_id,
               schedule,
               dag_number,
               default_args):

    def hello_world_py(*args):
        print('Hello World')
        print('This is DAG: {}'.format(str(dag_number)))

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args)

    with dag:
        t1 = PythonOperator(
            task_id='hello_world',
            python_callable=hello_world_py,
            dag_number=dag_number)

    return dag


# build a dag for each number in range(10)
for n in range(1, 10):
    dag_id = 'hello_world_{}'.format(str(n))

    default_args = {'owner': 'airflow',
                    'start_date': datetime(2018, 1, 1)}

    schedule = '@daily'
    dag_number = n

    globals()[dag_id] = create_dag(dag_id,
                                   schedule,
                                   dag_number,
                                   default_args)
Example from https://www.astronomer.io/guides/dynamically-generating-dags/
However, note that this can cause issues such as delays between the execution of tasks. This is because the Airflow scheduler and workers have to parse the entire file when scheduling or executing each task of a single DAG.
Since you would have many DAGs (let's say 100) inside the same file, all 100 DAG objects have to be parsed just to execute a single task of DAG1.
I would recommend building a tool that creates a single file per DAG.
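If you go that route, one possible (purely illustrative) approach is to render a small template into one .py file per DAG inside the dags folder, for example with Jinja2; the paths, DAG ids and schedules below are placeholders:

from pathlib import Path
from jinja2 import Template

DAG_TEMPLATE = Template('''
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world_py():
    print("Hello from {{ dag_id }}")


dag = DAG("{{ dag_id }}",
          schedule_interval="{{ schedule }}",
          default_args={"owner": "airflow",
                        "start_date": datetime(2018, 1, 1)})

with dag:
    PythonOperator(task_id="hello_world", python_callable=hello_world_py)
''')

# Write one small file per DAG so the scheduler only parses what it needs.
dags_folder = Path.home() / "airflow" / "dags"
for dag_id, schedule in [("hello_world_1", "@daily"), ("hello_world_2", "@hourly")]:
    (dags_folder / "{}.py".format(dag_id)).write_text(
        DAG_TEMPLATE.render(dag_id=dag_id, schedule=schedule)
    )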

How can I set a timeout on Dataflow?

I am using Composer to run my Dataflow pipeline on a schedule. If the job is taking over a certain amount of time, I want it to be killed. Is there a way to do this programmatically either as a pipeline option or a DAG parameter?
Not sure how to do it as a pipeline config option, but here is an idea.
You could launch a Task Queue task with its countdown set to your timeout value. When that task launches, you can check whether your Dataflow job is still running:
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
If it is, you can call update on it with the job state JOB_STATE_CANCELLED:
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/update
https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs#jobstate
This is done through the googleapiclient lib: https://developers.google.com/api-client-library/python/apis/discovery/v1
Here is an example of how to use it:
class DataFlowJobsListHandler(InterimAdminResourceHandler):
    def get(self, resource_id=None):
        """
        Wrapper to this:
        https://cloud.google.com/dataflow/docs/reference/rest/v1b3/projects.jobs/list
        """
        if resource_id:
            self.abort(405)
        else:
            credentials = GoogleCredentials.get_application_default()
            service = discovery.build('dataflow', 'v1b3', credentials=credentials)
            project_id = app_identity.get_application_id()
            _filter = self.request.GET.pop('filter', 'UNKNOWN').upper()
            jobs_list_request = service.projects().jobs().list(
                projectId=project_id,
                filter=_filter)  # 'ACTIVE'
            jobs_list = jobs_list_request.execute()
            return {
                '$cursor': None,
                'results': jobs_list.get('jobs', []),
            }
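And a hedged sketch of the cancellation step mentioned above, using the same discovery-based service: update the job with requestedState set to JOB_STATE_CANCELLED (project_id and job_id are placeholders, and regional jobs may need the projects.locations.jobs variant instead):

from googleapiclient import discovery
from oauth2client.client import GoogleCredentials


def cancel_dataflow_job(project_id, job_id):
    credentials = GoogleCredentials.get_application_default()
    service = discovery.build('dataflow', 'v1b3', credentials=credentials)
    # Requesting JOB_STATE_CANCELLED asks Dataflow to cancel the running job.
    request = service.projects().jobs().update(
        projectId=project_id,
        jobId=job_id,
        body={'requestedState': 'JOB_STATE_CANCELLED'},
    )
    return request.execute()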

Django Celerybeat PeriodicTask running far more than expected

I'm struggling with Django, Celery, djcelery & PeriodicTasks.
I've created a task that pulls an AdSense report in order to generate live stats. Here is my task:
import datetime
import httplib2
import logging
from apiclient.discovery import build
from celery.task import PeriodicTask
from django.contrib.auth.models import User
from oauth2client.django_orm import Storage
from .models import Credential, Revenue

logger = logging.getLogger(__name__)


class GetReportTask(PeriodicTask):
    run_every = datetime.timedelta(minutes=2)

    def run(self, *args, **kwargs):
        scraper = Scraper()
        scraper.get_report()


class Scraper(object):
    TODAY = datetime.date.today()
    YESTERDAY = TODAY - datetime.timedelta(days=1)

    def get_report(self, start_date=YESTERDAY, end_date=TODAY):
        logger.info('Scraping Adsense report from {0} to {1}.'.format(
            start_date, end_date))
        user = User.objects.get(pk=1)
        storage = Storage(Credential, 'id', user, 'credential')
        credential = storage.get()
        if not credential is None and credential.invalid is False:
            http = httplib2.Http()
            http = credential.authorize(http)
            service = build('adsense', 'v1.2', http=http)
            reports = service.reports()
            report = reports.generate(
                startDate=start_date.strftime('%Y-%m-%d'),
                endDate=end_date.strftime('%Y-%m-%d'),
                dimension='DATE',
                metric='EARNINGS',
            )
            data = report.execute()
            for row in data['rows']:
                date = row[0]
                revenue = row[1]
                try:
                    record = Revenue.objects.get(date=date)
                except Revenue.DoesNotExist:
                    record = Revenue()
                record.date = date
                record.revenue = revenue
                record.save()
        else:
            logger.error('Invalid Adsense Credentials')
I'm using Celery & RabbitMQ. Here are my settings:
# Celery/RabbitMQ
BROKER_HOST = "localhost"
BROKER_PORT = 5672
BROKER_USER = "myuser"
BROKER_PASSWORD = "****"
BROKER_VHOST = "myvhost"
CELERYD_CONCURRENCY = 1
CELERYD_NODES = "w1"
CELERY_RESULT_BACKEND = "amqp"
CELERY_TIMEZONE = 'America/Denver'
CELERYBEAT_SCHEDULER = 'djcelery.schedulers.DatabaseScheduler'
import djcelery
djcelery.setup_loader()
At first glance everything seems to work, but after turning on the logger and watching it run I have found that it runs the task at least four times in a row, sometimes more. It also seems to run every minute instead of every two minutes. I've tried changing run_every to use a crontab, but I get the same results.
I'm starting celerybeat using supervisor. Here is the command I use:
python manage.py celeryd -B -E -c 1
Any ideas as to why it's not working as expected?
Oh, and one more thing: after the day changes, it continues to use the date range it first ran with. So as days progress it keeps getting stats for the day the task started running, unless I run the task manually at some point, in which case it switches to the date I last ran it manually. Can someone tell me why this happens?
Consider creating a separate queue with one worker process and a fixed rate for this type of task, and add the tasks to this new queue instead of running them directly from celerybeat. That should help you figure out what is wrong with your code: whether the problem is celerybeat or your tasks running longer than expected.
@task(queue='create_report', rate_limit='0.5/m')
def create_report():
    scraper = Scraper()
    scraper.get_report()


class GetReportTask(PeriodicTask):
    run_every = datetime.timedelta(minutes=2)

    def run(self, *args, **kwargs):
        create_report.delay()

In settings.py:

CELERY_ROUTES = {
    'myapp.tasks.create_report': {'queue': 'create_report'},
}
Start an additional Celery worker that will handle the tasks in your queue:
celery worker -c 1 -Q create_report -n create_report.local
Problem 2: your YESTERDAY and TODAY variables are set at class level, so they are evaluated only once, when the class is defined at import time; within a long-running worker process they never advance, which is why the date range stays stuck.
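A minimal sketch of that fix: compute the dates inside get_report at call time instead of binding them as class attributes and default arguments that are evaluated only once:

import datetime


class Scraper(object):
    def get_report(self, start_date=None, end_date=None):
        # Evaluate the date range on every call, not once at class definition.
        today = datetime.date.today()
        end_date = end_date or today
        start_date = start_date or today - datetime.timedelta(days=1)
        # ... rest of the report logic unchanged ...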