Airflow 2: Creating DAGs dynamically after a function run - airflow-scheduler

Hello all, I am working with Airflow. Here is the scenario I am trying to resolve: I want to create DAGs dynamically after a function runs.
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.email_operator import EmailOperator
    from airflow.utils.trigger_rule import TriggerRule
    from airflow.utils.task_group import TaskGroup
    import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))

# ===============================================
default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
    'email': ['shahsoumil519@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': False,
}

dag = DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False)
# ================================================


class XcomHelper(object):

    def __init__(self, **context):
        self.context = context

    def get(self, key=None):
        """Get the Value from XCOM"""
        try:
            return self.context.get("ti").xcom_pull(key=key)
        except Exception as e:
            return "Error"

    def push(self, key=None, value=None):
        """Push the value on session"""
        try:
            self.context['ti'].xcom_push(key=key, value=value)
            return True
        except Exception as e:
            return False


def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag


def simple_task(**context):
    DATA = ["soumil", "Shah"]
    for n in range(1, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))


with DAG(dag_id="project", schedule_interval="@once", default_args=default_args, catchup=False) as dag:
    simple_task = PythonOperator(task_id="simple_task",
                                 python_callable=simple_task,
                                 provide_context=True)

    simple_task
I want to create these DAGs based on the length of the DATA variable; that data comes from the database.
I tried looking into:
https://www.astronomer.io/guides/dynamically-generating-dags
Can an Airflow task dynamically generate a DAG at runtime?
https://medium.com/@flavio.mtps/making-use-of-python-globals-to-dynamically-create-airflow-dags-124e556b704e
Any help would be great.
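For reference, here is a minimal sketch of the parse-time approach those guides describe, with a hypothetical get_data_from_db() standing in for the real database query (the helper name and its return value are assumptions, not part of the original code):

from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def get_data_from_db():
    # Hypothetical stand-in for the real database query.
    return ["soumil", "Shah"]


def create_dag(dag_id, schedule, default_args):
    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        PythonOperator(task_id='{}_task'.format(dag_id), python_callable=hello_world_py)
    return dag


# This loop runs every time the scheduler parses the file, so the set of
# registered DAGs follows whatever the database currently returns.
for n, _row in enumerate(get_data_from_db()):
    dag_id = 'hello_world_{}'.format(n)
    globals()[dag_id] = create_dag(
        dag_id,
        schedule='@daily',
        default_args={'owner': 'airflow', 'start_date': datetime(2018, 1, 1)},
    )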
Revised Code :
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))


def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag


def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))


def trigger_function():
    print("HEREE")
    simple_task()


with DAG(dag_id="project", schedule_interval="@once", default_args={'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}, catchup=False) as dag:
    trigger_function = PythonOperator(task_id="trigger_function", python_callable=trigger_function, provide_context=True)

    trigger_function

I removed a few lines from your code to keep the answer to the point. The code below will generate DAGs like hello_world_0, hello_world_1, ... based on the contents of DATA.
EDIT - I used Airflow v1.10.x, but the code should work for v2.x as well.
Suggestions:
Make the task names different from the DAG names.
The dag_number variable is currently not being used; it can be taken off. (A sketch applying both suggestions follows the code below.)
The full DAG file will look like this -
try:
    import os
    import sys
    from datetime import timedelta, datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator
    # from airflow.operators.email_operator import EmailOperator
    # from airflow.utils.trigger_rule import TriggerRule
    # from airflow.utils.task_group import TaskGroup
    # import pandas as pd
    print("All Dag modules are ok ......")
except Exception as e:
    print("Error {} ".format(e))


def create_dag(dag_id, schedule, dag_number, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        t1 = PythonOperator(task_id=dag_id, python_callable=hello_world_py)
    return dag


def simple_task():
    DATA = ["soumil", "Shah", "Shah2"]
    for n in range(0, len(DATA)):
        try:
            dag_id = 'hello_world_{}'.format(str(n))
            print("DAG ID : {} ".format(dag_id))
            default_args = {'owner': 'airflow', 'start_date': datetime(2018, 1, 1)}
            schedule = '@daily'
            dag_number = n
            globals()[dag_id] = create_dag(dag_id, schedule, dag_number, default_args)
        except Exception as e:
            print("Error : {} ".format(e))


simple_task()
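For illustration, a minimal sketch of create_dag with both suggestions applied (a task_id that differs from the dag_id, and dag_number dropped); the exact task name here is just an example:

def create_dag(dag_id, schedule, default_args):

    def hello_world_py():
        print('Hello World')

    dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args)
    with dag:
        # task_id deliberately differs from dag_id, and dag_number is no longer a parameter.
        PythonOperator(task_id='{}_print_task'.format(dag_id), python_callable=hello_world_py)
    return dag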

Related

Not able to run python pipeline through airflow BeamRunPythonPipelineOperator

I'm not able to run a Python pipeline through the Airflow BeamRunPythonPipelineOperator. Below is my complete code:
DAG FILE
import os
from datetime import datetime, timedelta

from airflow.utils.dates import days_ago
from airflow import DAG
from airflow.providers.google.cloud.operators.dataflow import DataflowConfiguration
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
from airflow.providers.google.cloud.operators.dataflow import DataflowTemplatedJobStartOperator

default_args = {
    "owner": "<...>",
    "start_date": days_ago(1),
    'dataflow_default_options': {
        "project": "<...>",
    }
}

dag = DAG(
    dag_id="word_count",
    default_args=default_args,
    schedule_interval="@once"
)

start_python_pipeline_dataflow_runner = BeamRunPythonPipelineOperator(
    task_id="start_python_pipeline_dataflow_runner",
    runner="DataflowRunner",
    py_file="gs://<...>/word_count.py",
    pipeline_options={
        'input': "gs://<...>/kinglear.txt",
        'output': "gs://<...>/output.txt",
        'temp_location': "gs://<...>/temp/",
        'staging_location': "gs://<...>/temp/",
    },
    py_options=[],
    py_requirements=['apache-beam[gcp]==2.26.0'],
    py_interpreter='python3',
    py_system_site_packages=False,
    dataflow_config=DataflowConfiguration(
        job_name='{{task.task_id}}', project_id="<...>", location="us-central1"
    ),
    dag=dag,
)
Python File (word_count.py)
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://<...>/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<...>/output.txt',
help='Output file to write results to.')
argv = [
'--project=<...>',
'--region=us-central1',
'--runner=DataflowRunner',
'--staging_location=gs://<...>/temp/',
'--temp_location=gs://<...>/temp/',
'--template_location=gs://<...>/templates/word_count_template'
]
known_args, pipeline_args = parser.parse_known_args(argv)
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(argv=argv,options=pipeline_options) as p:
# Read the text file[pattern] into a PCollection.
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
A screenshot of the Composer environment was attached here (not reproduced).
I am not able to see the Dataflow job in the console, nor the count results in the bucket. Could anyone suggest the right approach, or any suggestions on this?
Your DAG is OK; the problem is in the Beam Python file. There is an error in how you send the Dataflow args via argv; the better approach is to extend pipeline_args. The job is also not being submitted because you are passing argv to beam.Pipeline.
Following is the fixed code:
word_count.py :
"""A word-counting workflow."""
# pytype: skip-file
import argparse
import logging
import re
import os
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class WordExtractingDoFn(beam.DoFn):
"""Parse each line of input text into words."""
def process(self, element):
"""Returns an iterator over the words of this element.
The element is a line of text. If the line is blank, note that, too.
Args:
element: the element being processed
Returns:
The processed element.
"""
return re.findall(r'[\w\']+', element, re.UNICODE)
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the wordcount pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
dest='input',
default='gs://dataflow-samples/shakespeare/kinglear.txt',
help='Input file to process.')
parser.add_argument(
'--output',
dest='output',
default='gs://<bucket>/newoutput',
help='Output file to write results to.')
#argv = [
# '--project=<...>',
# '--region=us-central1',
# '--runner=DataflowRunner',
# '--staging_location=gs://<...>/temp/',
# '--temp_location=gs://<...>/temp/',
# '--template_location=gs://<...>/templates/word_count_template'
# ]
known_args, pipeline_args = parser.parse_known_args(argv)
pipeline_args.extend([
'--runner=DataflowRunner',
'--project=<project-name>',
'--region=<region>',
'--staging_location=gs://<bucket>/',
'--temp_location=gs://<bucket>/temp',
'--job_name=your-wordcount-job',
])
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
# The pipeline will be run on exiting the with block.
with beam.Pipeline(options=pipeline_options) as p:
lines = p | 'Read' >> ReadFromText(known_args.input)
counts = (
lines
| 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
| 'PairWIthOne' >> beam.Map(lambda x: (x, 1))
| 'GroupAndSum' >> beam.CombinePerKey(sum))
# Format the counts into a PCollection of strings.
def format_result(word, count):
return '%s: %d' % (word, count)
output = counts | 'Format' >> beam.MapTuple(format_result)
# Write the output using a "Write" transform that has side effects.
# pylint: disable=expression-not-assigned
output | 'Write' >> WriteToText(known_args.output)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()

AWS MWAA/Apache airflow: how to debug on_failure_callback itself

I have a DAG like this:
import os
from datetime import timedelta

from xxx import on_failure_opsgenie
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

DAG_ID = os.path.basename(__file__).replace(".py", "")

DEFAULT_ARGS = {
    "owner": "airflow",
    "depends_on_past": False,
    "email": ["airflow@example.com"],
    "email_on_failure": False,
    "email_on_retry": False,
}


def kaboom(*args, **kwargs):
    print("goodbye cruel world")
    print(args)
    print(kwargs)
    assert 1 == 2


with DAG(
    dag_id=DAG_ID,
    default_args=DEFAULT_ARGS,
    description="Print contents of airflow.cfg to logs",
    dagrun_timeout=timedelta(hours=2),
    start_date=days_ago(1),
    schedule_interval=None,
    on_failure_callback=on_failure_opsgenie,
) as dag:
    get_airflow_cfg_operator = PythonOperator(task_id="gonna_explode", python_callable=kaboom)
The DAG fails on purpose, as expected. However, on_failure_opsgenie is not doing what it should. How do I get the logs of, or otherwise debug, a failed on_failure_callback in AWS MWAA?
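One way to surface errors from the callback itself is to wrap it so any exception is logged before propagating. This is a sketch rather than an MWAA-specific answer: on_failure_opsgenie and its xxx import path are taken from the question, and the wrapper name is made up.

import logging

from xxx import on_failure_opsgenie  # import path as in the question

log = logging.getLogger(__name__)


def debuggable_on_failure(context):
    """Wraps the real callback so its own failures are logged instead of silently lost."""
    try:
        on_failure_opsgenie(context)
    except Exception:
        # For a DAG-level callback the traceback typically lands in the
        # scheduler/DAG-processing logs (which MWAA ships to CloudWatch);
        # for a task-level callback it lands in the failing task's log.
        log.exception("on_failure_opsgenie raised inside the failure callback")
        raise


# Then pass the wrapper instead of the raw callback:
# with DAG(..., on_failure_callback=debuggable_on_failure) as dag: ...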

Why does apscheduler get_jobs() come back empty?

This is my test.py:
from datetime import datetime, timedelta
import sys
import os

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore

jobstores = {
    # 'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
    'default': RedisJobStore(host='localhost', port=6379)
}
scheduler = BlockingScheduler(jobstores=jobstores)


def alarm(time):
    print('Alarm! This alarm was scheduled at %s.' % time)


if __name__ == '__main__':
    alarm_time = datetime.now() + timedelta(seconds=10)
    scheduler.add_job(alarm, 'interval', seconds=10, args=[datetime.now()], name='alarm_test')
    print('To clear the alarms, delete the example.sqlite file.')
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
I run python test.py and the job runs successfully.
Then, in another terminal (over PuTTY), I start python:
>>> import redis
>>> from test import *
>>> r = redis.Redis()
>>> r.keys()
>>> r.zrange('apscheduler.run_times', 0, 1)
This finds the job id 57841c0ee05249efb466882265f2c495, but:
>>> ret = scheduler.get_jobs(jobstore='default')
ret is empty. Why?
Thanks a lot.
Have you started the scheduler before running get_jobs()? If not, it will only list tentatively scheduled jobs. That's why you're not seeing the job.
Try this instead:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.redis import RedisJobStore
scheduler = BackgroundScheduler()
scheduler.add_jobstore('redis', host='localhost', port=6379)
scheduler.start(paused=True)
scheduler.print_jobs()

list_jobs function min_creation_time error

Below is my code in the Django framework (Python 2.7) to list the jobs in BigQuery. I want to filter to just the ones from the last two weeks, but the min_creation_time argument of the list_jobs() function does not work and errors out for some reason. Please suggest a fix.
from __future__ import unicode_literals
from django.shortcuts import render
import thd_gbq_tools as bq

# Create your views here.
from django.http import HttpResponse
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.client import GoogleCredentials
from google.cloud import bigquery
import uuid
import os
import logging
import time
import json
from datetime import datetime, timedelta
from django.template import loader
from django.shortcuts import render
import pandas as pd
from collections import OrderedDict
from datetime import date


def home(request):
    credentials = GoogleCredentials.get_application_default()
    # Construct the service object for interacting with the BigQuery API.
    bq_conn = build('bigquery', 'v2', credentials=credentials)
    job_query_dict = []

    import warnings
    warnings.filterwarnings("ignore")

    # Create the BigQuery client
    client = bigquery.Client(project='analytics-supplychain-thd')

    # List the jobs in the client
    jobs = client.list_jobs(all_users=True)  # API request
    for job in jobs:
        job_create_timestamp = datetime.strptime((str(job.created).replace('+', '.')).split('.')[0], '%Y-%m-%d %H:%M:%S')
        job_ended_timestamp = datetime.strptime((str(job.ended).replace('+', '.')).split('.')[0], '%Y-%m-%d %H:%M:%S')
        job_query_dict.append([job.job_id, job.user_email, job_create_timestamp, job_ended_timestamp, job.state])

    Table1 = sorted(job_query_dict, key=lambda x: (x[2]), reverse=True)
    return render(request, 'j2_response.html', {'Table1': Table1})
This is the code I am using to assign the parameter that indicates the last 10 minutes for min_creation_time:
from datetime import datetime,timedelta
from datetime import date
ten_mins_ago = datetime.utcnow() - timedelta(minutes=10)
When setting ten_mins_ago = datetime.utcnow() - timedelta(minutes=10), you are specifying that you want the BigQuery jobs run in the last 10 minutes.
You can try this code snippet to list the BigQuery jobs created in the last 2 weeks:
from google.cloud import bigquery
from datetime import datetime, timedelta
from pytz import timezone

client = bigquery.Client(project='[YOUR_PROJECT]')

local_timezone = timezone('US/Eastern')
two_weeks_ago = datetime.utcnow() - timedelta(days=14)
local_two_weeks = local_timezone.localize(two_weeks_ago)

for job in client.list_jobs(all_users=True, max_results=10, min_creation_time=local_two_weeks):
    print(job.job_id, job.user_email)
If this snippet works for you, you can integrate it into your code. Should you get any errors, please state them so we can look further into the issue.
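For what it's worth, here is a sketch of how that snippet might be folded back into the home() view from the question (the project name and template name are taken from the question; the two-week cutoff is computed the same way as above):

from datetime import datetime, timedelta

from django.shortcuts import render
from google.cloud import bigquery
from pytz import timezone


def home(request):
    client = bigquery.Client(project='analytics-supplychain-thd')

    # Same two-week cutoff as in the snippet above.
    local_timezone = timezone('US/Eastern')
    local_two_weeks = local_timezone.localize(datetime.utcnow() - timedelta(days=14))

    job_query_dict = []
    for job in client.list_jobs(all_users=True, min_creation_time=local_two_weeks):
        job_query_dict.append([job.job_id, job.user_email, job.created, job.ended, job.state])

    Table1 = sorted(job_query_dict, key=lambda x: x[2], reverse=True)
    return render(request, 'j2_response.html', {'Table1': Table1})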

Run a Scrapy spider in a Celery Task (django project)

I'm trying to run Scrapy (spider/crawl) from a Django project (a task in the admin interface using Celery). This is my code, and below is the error I get when I try to call the task from a Python shell.
djangoproject:
    -monapp:
        -tasks.py
        -spider.py
        -myspider.py
        -models.py
        .....
tasks.py:
from djcelery import celery
from demoapp.spider import *
from demoapp.myspider import *


@celery.task
def add(x, y):
    return x + y


@celery.task
def scra():
    result_queue = Queue()
    crawler = CrawlerWorker(MySpider(), result_queue)
    crawler.start()
    return "success"
spider.py:
from scrapy import project, signals
from scrapy.settings import Settings
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing


class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = Crawler(Settings())
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
myspider.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field


class TorentItem(Item):
    title = Field()
    desc = Field()


class MySpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['tanitjobs.com']
    start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/']
    rules = (
        Rule(SgmlLinkExtractor(allow=('page=*',),
                               restrict_xpaths=('//div[@class="pageNavigation"]',),
                               unique=True),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
        scraped_items = []
        for item in items:
            scraped_item = TorentItem()
            scraped_item['title'] = item.select('a/strong/text()').extract()
            scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
            scraped_items.append(scraped_item)
        return scraped_items
I got mine to work from the shell using a Django management command. Below is my code snippet; feel free to modify it to fit your needs.
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings

from django.core.management.base import BaseCommand

from myspiderproject.spiders.myspider import MySpider


class ReactorControl:

    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()


def setup_crawler(domain):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = MySpider(domain=domain)
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


reactor_control = ReactorControl()


class Command(BaseCommand):
    help = 'Crawls the site'

    def handle(self, *args, **options):
        setup_crawler('somedomain.com')
        reactor.run()  # the script will block here until the spider_closed signal was sent
Hope this helps.
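To tie this back to the Celery setup in the question, here is a minimal sketch of invoking such a management command from a task; the crawl_site command name and the use of shared_task are assumptions, not part of the answer above:

from celery import shared_task
from django.core.management import call_command


@shared_task
def scra():
    # Assumes the command above is saved as monapp/management/commands/crawl_site.py,
    # so Django registers it as "crawl_site".
    call_command('crawl_site')
    return "success"

Keep in mind that the Twisted reactor cannot be restarted within the same process, so a long-lived Celery worker may need to run the command in a subprocess instead.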