How to create a dynamic task sequence with the new Airflow version - google-cloud-platform

I am trying to create a sequence of tasks like below using Airflow 2.3+
START -> generate_files -> download_file -> STOP
But instead I am getting the flow below. The code is also given. Please advise.
from airflow import DAG
from airflow.decorators import task
from datetime import datetime
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule

with DAG('my_dag', start_date=days_ago(1), schedule_interval='@daily', catchup=False) as dag:
    START = BashOperator(task_id="start", bash_command='echo "starting batch pipeline"', do_xcom_push=False)
    STOP = BashOperator(task_id="stop", bash_command='echo "stopping batch pipeline"',
                        trigger_rule=TriggerRule.NONE_SKIPPED, do_xcom_push=False)

    @task
    def generate_files():
        return ["file_1", "file_2", "file_3"]

    @task
    def download_file(file):
        print(file)

    START >> download_file.expand(file=generate_files()) >> STOP

Define the DAG structure from START to generate_files explicitly:
files = generate_files()
START >> files >> download_file.expand(file=files) >> STOP
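For completeness, here is a minimal sketch of the whole DAG with that wiring applied (same tasks and names as above, assuming Airflow 2.3+ so that .expand() and XComArg dependencies are available):

from airflow import DAG
from airflow.decorators import task
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule

with DAG('my_dag', start_date=days_ago(1), schedule_interval='@daily', catchup=False) as dag:
    START = BashOperator(task_id="start", bash_command='echo "starting batch pipeline"', do_xcom_push=False)
    STOP = BashOperator(task_id="stop", bash_command='echo "stopping batch pipeline"',
                        trigger_rule=TriggerRule.NONE_SKIPPED, do_xcom_push=False)

    @task
    def generate_files():
        return ["file_1", "file_2", "file_3"]

    @task
    def download_file(file):
        print(file)

    # Keep a reference to the task's output so it can be wired into the graph explicitly.
    files = generate_files()
    START >> files >> download_file.expand(file=files) >> STOP

This produces START -> generate_files -> download_file (one mapped task per file) -> STOP.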

Related

Django supervisord and apscheduler

In a Django project, I have a supervisord program that starts APScheduler:
[program:apscheduler]
command=/home/user/Project/.venv/bin/python manage.py runapscheduler
In APScheduler I have one job:
# runapscheduler.py
import logging
import sys

from django.conf import settings
from django.core import management
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
from django.core.management.base import BaseCommand
from django_apscheduler.jobstores import DjangoJobStore
from django_apscheduler.models import DjangoJobExecution
from django_apscheduler import util

logger = logging.getLogger(__name__)


def my_management_command():
    management.call_command('MyCommand')


# The `close_old_connections` decorator ensures that database connections, that have become unusable or are obsolete,
# are closed before and after our job has run.
@util.close_old_connections
def delete_old_job_executions(max_age=604_800):
    """
    This job deletes APScheduler job execution entries older than `max_age` from the database. It helps to prevent the
    database from filling up with old historical records that are no longer useful.

    :param max_age: The maximum length of time to retain historical job execution records. Defaults to 7 days.
    """
    DjangoJobExecution.objects.delete_old_job_executions(max_age)


class Command(BaseCommand):
    help = "Runs APScheduler."

    def handle(self, *args, **options):
        scheduler = BlockingScheduler(timezone=settings.TIME_ZONE)
        scheduler.add_jobstore(DjangoJobStore(), "default")

        scheduler.add_job(
            my_management_command,
            trigger=CronTrigger(hour="*", minute="*"),  # Every hour
            id="MyCommand",  # The `id` assigned to each job MUST be unique
            max_instances=1,
            replace_existing=True,
        )
        logger.info("Added hourly job 'my_management_command'.")

        scheduler.add_job(
            delete_old_job_executions,
            trigger=CronTrigger(
                day_of_week="mon", hour="00", minute="00"
            ),  # Midnight on Monday, before start of the next work week.
            id="delete_old_job_executions",
            max_instances=1,
            replace_existing=True,
        )
        logger.info(
            "Added weekly job: 'delete_old_job_executions'."
        )

        try:
            logger.info("Starting scheduler...")
            scheduler.start()
        except KeyboardInterrupt:
            logger.info("Stopping scheduler...")
            scheduler.shutdown()
            logger.info("Scheduler shut down successfully!")
In my management command "MyCommand", I open TCP sockets to another server.
If I run the command outside of APScheduler, the sockets are closed correctly once the management command finishes.
When it is run through APScheduler, the sockets never close until I restart the job.
Any idea how to fix that?
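Nothing in the scheduler code above closes the sockets, and the management command itself is not shown, so the following is only an assumption: when the command runs standalone, process exit closes every socket for you, whereas under APScheduler the long-lived scheduler process keeps running, so any socket that is not closed explicitly stays open. A minimal sketch of closing the connection deterministically inside the command (the command name, host, port and payload are placeholders):

# myapp/management/commands/MyCommand.py (hypothetical layout)
import socket

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Talks to the remote server and closes the socket explicitly."

    def handle(self, *args, **options):
        # The context manager guarantees the socket is closed when the block
        # exits, even on exceptions, instead of relying on process exit
        # (which never happens inside a long-lived scheduler process).
        with socket.create_connection(("remote.example.com", 9000), timeout=10) as sock:
            sock.sendall(b"ping")
            reply = sock.recv(1024)
        self.stdout.write(f"received {reply!r}")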

How to set GOOGLE_APPLICATION_CREDENTIALS in beam pipeline to resolve Access Denied Error?

I am trying to insert data into BigQuery from a file using a Dataflow pipeline. The code below gives me an access denied error.
I have also set the application credentials via an environment variable, and I started getting an error which says
The Application Default Credentials are not available.
I need help to resolve this. Thanks in advance. Please find the code below:
from __future__ import absolute_import
import argparse
import logging
import re

import apache_beam as beam
import os
from apache_beam.options.pipeline_options import PipelineOptions


class DataIngestion:
    def parse_method(self, string_input):
        values = re.split(",",
                          re.sub('\r\n', '', re.sub(u'"', '', string_input)))
        row = dict(
            zip(('state', 'gender', 'year', 'name', 'number', 'created_date'),
                values))
        return row


def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "LocalPath\FileName.json"
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=projectID',
        '--staging_location=staging_location',
        '--temp_location=temp_location',
    ])

    data_ingestion = DataIngestion()
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    (p
     | 'Read from a File' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1)
     | 'String To BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema='state:STRING,gender:STRING,year:STRING,name:STRING,'
                    'number:STRING,created_date:STRING',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run().wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
You can remove the code that sets the credentials inside your Beam job and instead run the command below before running the job:
gcloud auth application-default login
The command above stores Application Default Credentials in a well-known location on your machine, where the Google client libraries (and therefore your job) can pick them up automatically.
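If you do want to point the job at an explicit service-account key instead, one option (a sketch only; the key path below is a placeholder, and the variable must be set before the pipeline or any Google client is created) is:

import os

import google.auth

# Hypothetical absolute path to a service-account key file; a raw string
# avoids backslash-escape surprises in Windows paths.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\path\to\service-account-key.json"

# Optional sanity check: raises DefaultCredentialsError if the Application
# Default Credentials still cannot be located.
credentials, project_id = google.auth.default()
print("Loaded Application Default Credentials for project:", project_id)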

Google Composer Dag not triggered Automatically or even via CMD

Below is my simple DAG/Python script that is inside the DAGs folder in the Google Cloud Storage bucket.
from airflow import DAG
import airflow
from airflow.operators import BashOperator
from datetime import datetime, timedelta, date
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator

from generate_csv_feeds import generate_csv

DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email_on_failure': False,
    'schedule_interval': '*/5 * * * *'
}

with DAG('DAG_MAIN', default_args=DEFAULT_DAG_ARGS, catchup=False) as dag:
    generate_csv = PythonOperator(
        task_id='generate_mktg_csv',
        python_callable=generate_csv,
        op_args=['get_data.sql', 'feeds_data_airflow.csv']
    )
    csv_generated = BashOperator(
        task_id='csv_generated',
        bash_command='echo CSV Generated Succesfully.')

    generate_csv >> csv_generated
The issue is that it does not get triggered automatically at all, nor does it get executed if I trigger it externally via the command line. But strangely it works when I run it from the Airflow UI. I need this to run every 5 minutes. I am not sure if this has anything to do with Google Composer. Any help would be appreciated. Thanks in advance.
I think this is due to your start_date being datetime.utcnow(). It is not recommended to use a moving start_date, especially datetime.utcnow(), because a DAG run is only triggered at start_date + schedule_interval, and as the start_date keeps moving forward, that point is never reached and the DAG is never triggered. See the FAQ: https://airflow.apache.org/faq.html#what-s-the-deal-with-start-date.
Try with a fixed start_date like datetime(2019, 8, 4).
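A sketch of what the fixed DAG definition could look like; note that schedule_interval is a DAG-level argument, so it is also worth passing it to the DAG constructor rather than leaving it in default_args (the start date below is only an example):

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator

from generate_csv_feeds import generate_csv

DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 8, 4),  # fixed, non-moving start date
    'email_on_failure': False,
}

# schedule_interval belongs on the DAG itself, not in default_args.
with DAG('DAG_MAIN',
         default_args=DEFAULT_DAG_ARGS,
         schedule_interval='*/5 * * * *',
         catchup=False) as dag:
    generate_csv_task = PythonOperator(
        task_id='generate_mktg_csv',
        python_callable=generate_csv,
        op_args=['get_data.sql', 'feeds_data_airflow.csv']
    )
    csv_generated = BashOperator(
        task_id='csv_generated',
        bash_command='echo CSV Generated Successfully.')

    generate_csv_task >> csv_generated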

Run RASA with flask

I want to run RASA with --enable-api inside the Python code rather than from the command line. Below is my code, which is not working. Let me know how I can do that. The issue is that once I hit the service, because the channel is 'cmdline', it drops to the command line. I don't know how to resolve this.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

import rasa_core
from rasa_core.agent import Agent
from rasa_core.policies.keras_policy import KerasPolicy
from rasa_core.policies.memoization import MemoizationPolicy
from rasa_core.interpreter import RasaNLUInterpreter
from rasa_core.utils import EndpointConfig
from rasa_core.run import serve_application
from rasa_core import config
from rasa_core.policies.fallback import FallbackPolicy
from rasa_core.policies.keras_policy import KerasPolicy

from flask import Flask
from flask_cors import CORS, cross_origin

app = Flask(__name__)
CORS(app)
logger = logging.getLogger(__name__)


@app.route("/conversations/default/respond", methods=['POST'])
def run_weather_bot(serve_forever=True):
    logging.basicConfig(level="ERROR")
    interpreter = RasaNLUInterpreter('C:\\xxxx_nlu\\models\\nlu\\default\\weathernlu')
    action_endpoint = EndpointConfig(url="http://xxx.xx.xx.xxx:5055/webhook")
    agent = Agent.load('C:\\xxxx_nlu\\models\\dialogue', interpreter=interpreter, action_endpoint=action_endpoint)
    rasa_core.run.serve_application(agent, channel='cmdline')
    return agent


if __name__ == '__main__':
    app.run("xxx.xx.xx.xxx", 5005, debug=True)
You're starting the RASA bot on the command line in your run_weather_bot function with the call below:
rasa_core.run.serve_application(agent, channel='cmdline')
As you can see, it is being served as a command-line application.
I have made some changes in your code for a conversation with the RASA chatbot. You can refer to the Agent documentation and the weather bot article for how to connect a RASA agent and how the agent handles an input message.
def rasa_agent():
    interpreter = RasaNLUInterpreter("Path for NLU")
    action_endpoint = EndpointConfig(url="Webhook URL")
    agent = Agent.load('Path to Dialogue', interpreter=interpreter, action_endpoint=action_endpoint)
    ## Next line runs rasa on the command line
    # rasa_core.run.serve_application(agent, channel='cmdline')
    return agent


@app.route("/conversations/default/respond", methods=['POST'])
def run_weather_bot(serve_forever=True):
    agent = rasa_agent()  # calling rasa agent
    ## Collect Query from POST request
    ## Send Query to Agent
    ## Get Response of BOT
    output = {}  ## Append output
    return jsonify(output)
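To make the placeholder steps concrete, here is one way the endpoint could be filled in. It assumes the old rasa_core Agent API, where handle_text() returns a list of bot responses, and an arbitrary JSON field name "query" for the incoming message, so treat it as a sketch rather than a drop-in answer:

from flask import Flask, request, jsonify

app = Flask(__name__)  # or reuse the app created earlier


@app.route("/conversations/default/respond", methods=['POST'])
def run_weather_bot():
    agent = rasa_agent()  # load the agent as defined above

    # Collect the query from the POST request body,
    # e.g. {"query": "will it rain tomorrow?"}.
    message = request.get_json(force=True).get("query", "")

    # Send the query to the agent; handle_text() returns a list of dicts
    # such as [{"recipient_id": "...", "text": "..."}].
    responses = agent.handle_text(message)

    # Return the bot responses to the caller.
    return jsonify({"responses": responses})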

How to write spark-transformed data back to a kafka broker using pyspark?

In my pyspark app, I intend to use Spark Streaming as a way of transforming Kafka messages "in flight". Each such message is initially received from a specific Kafka topic, needs to undergo some transformation (let's say, substituting one string for another), and the transformed version then needs to be posted to a different Kafka topic.
The first part (receiving a Kafka message) appears to be working fine:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## Constants
APP_NAME = "PythonStreamingDirectKafkaWordCount"

##OTHER FUNCTIONS/CLASSES

def main():
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    ...

    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    main()
What is the proper syntax to put something (let's say, a string) onto a different Kafka topic?
Should such a method be provided by KafkaUtils, or is it made available in some other way?
Within the handler function, we can do whatever we want with each record and then send that record to a different Kafka topic:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json

from kafka import SimpleProducer, KafkaClient
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')

def handler(message):
    records = message.collect()
    for record in records:
        producer.send('spark.out', str(record))
        producer.flush()

def main():
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 10)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(handler)

    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    main()
To run this:
spark-submit --jars spark-streaming-kafka-assembly_2.10-1.6.1.jar s.py localhost:9092 test
The correct way to do this, according to the Spark documentation:
https://spark.apache.org/docs/2.2.0/streaming-programming-guide.html#design-patterns-for-using-foreachrdd
from kafka import KafkaProducer

def kafka_sender(messages):
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    for message in messages:
        producer.send('alerts', bytes(message[0].encode('utf-8')))
        # For faster push
        # producer.flush()
    producer.flush()

# On your DStream
sentiment_data.foreachRDD(lambda rdd: rdd.foreachPartition(kafka_sender))