Below is my simple DAG/ Python script that is inside the DAGS folder on Google cloud bucket .
from airflow import DAG
import airflow
from airflow.operators import BashOperator
from datetime import datetime,timedelta , date
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from generate_csv_feeds import generate_csv
# Arguments handed to every task of the DAG through default_args.
# NOTE: 'start_date' is computed each time this file is parsed, so it is a
# moving value, and 'schedule_interval' is supplied here (as a task default)
# rather than as an argument to DAG().
DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email_on_failure': False,
    'schedule_interval': '*/5 * * * *',
}

with DAG('DAG_MAIN', default_args=DEFAULT_DAG_ARGS, catchup=False) as dag:
    # Calls the imported generate_csv callable with the two positional
    # arguments below. The operator is bound to the same name as the
    # callable; the right-hand side is evaluated before the rebinding.
    generate_csv = PythonOperator(
        task_id='generate_mktg_csv',
        python_callable=generate_csv,
        op_args=['get_data.sql', 'feeds_data_airflow.csv'],
    )

    # Echoes a confirmation message once the CSV task has finished.
    csv_generated = BashOperator(
        task_id='csv_generated',
        bash_command='echo CSV Generated Succesfully.',
    )

    generate_csv >> csv_generated
The issue is that it is never triggered automatically, nor does it run when I trigger it externally via the command line. Strangely, it does work when I run it from the Airflow UI. I need this to run every 5 minutes. I am not sure whether this has anything to do with Google Composer. Any help would be appreciated. Thanks in advance.
I think this is due to your start_date being datetime.utcnow(). It is not recommended to use moving start_date especially datetime.utcnow() because the DAG is triggered at start_date + schedule_interval and as the start_date is moving, the DAG is never triggered. See the FAQ https://airflow.apache.org/faq.html#what-s-the-deal-with-start-date.
Try a fixed start_date such as datetime(2019, 8, 4). (Write the month and day without leading zeros: datetime(2019, 08, 04) is a syntax error in Python 3.)
Related
I am trying to create a sequence of tasks like below using Airflow 2.3+
START -> generate_files -> download_file -> STOP
But instead I am getting the flow below. The code is also given. Please advise.
from airflow import DAG
from airflow.decorators import task
from datetime import datetime
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule
with DAG('my_dag', start_date=days_ago(1), schedule_interval='@daily', catchup=False) as dag:
    # Bookend tasks that only echo a message; neither pushes an XCom.
    START = BashOperator(task_id="start", bash_command='echo "starting batch pipeline"', do_xcom_push=False)
    STOP = BashOperator(task_id="stop", bash_command='echo "stopping batch pipeline"', trigger_rule=TriggerRule.NONE_SKIPPED, do_xcom_push=False)

    @task
    def generate_files():
        """Return the list of file names to be downloaded."""
        return ["file_1", "file_2", "file_3"]

    @task
    def download_file(file):
        """Print the single file name this mapped task instance receives."""
        print(file)

    # download_file is mapped over the output of generate_files(); note that
    # generate_files itself is not placed in the START >> ... >> STOP chain.
    START >> download_file.expand(file=generate_files()) >> STOP
Define the dag structure from START to generate_files explicitly
# Call the task once and keep its result so the same object can be placed in
# the chain after START and also passed as the mapping input.
files = generate_files()
# START -> generate_files -> mapped download_file -> STOP
START >> files >> download_file.expand(file=files) >> STOP
I'm trying to upload a file from my local machine to GCS and I'm using the LocalFilesystemToGCSOperator. I'm following this howto https://airflow.readthedocs.io/en/latest/howto/operator/google/transfer/local_to_gcs.html#prerequisite-tasks. I've set up a connection to GCP with a path to a json file. This is the DAG code:
import os
from airflow import models
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.utils import dates
# Target bucket, local source file and destination passed to the operator.
BUCKET_NAME = 'bucket-name'
PATH_TO_UPLOAD_FILE = '...path-to/airflow/dags/example-text.txt'
DESTINATION_FILE_LOCATION = '/test-dir-input/example-text.txt'

# No schedule: the DAG only runs when triggered.
with models.DAG(
    'example_local_to_gcs',
    default_args=dict(start_date=dates.days_ago(1)),
    schedule_interval=None,
) as dag:
    # Uploads src to dst in the given bucket through the named connection.
    upload_file = LocalFilesystemToGCSOperator(
        gcp_conn_id='custom_gcp_connection',
        task_id="upload_file",
        src=PATH_TO_UPLOAD_FILE,
        dst=DESTINATION_FILE_LOCATION,
        bucket=BUCKET_NAME,
        mime_type='text/plain',
    )
When I trigger the DAG it is marked as a success but the file is not in the bucket
It looks like there's a problem with your path_to_upload and destination_file_location.
To give you an idea, here's a separate post that could also help you. The relevant parameters similar to yours were declared like this for example:
src='/Users/john/Documents/tmp',
dst='gs://constantine-bucket',
bucket='constantine-bucket',
You should remove the ... and make sure that the destination_file_location refers to your bucket name or the folder inside it like this:
# Values this answer proposes for the question's three constants.
BUCKET_NAME = 'bucket-name'
# Absolute local path, without the leading '...' used in the question.
PATH_TO_UPLOAD_FILE = '/path-to/airflow/dags/example-text.txt'
# NOTE(review): another answer in this thread passes dst without a 'gs://'
# prefix (object path inside the bucket only) - confirm which form the
# operator expects before relying on this value.
DESTINATION_FILE_LOCATION = 'gs://bucket-name/example-text.txt'
# Or in a folder on your bucket
# DESTINATION_FILE_LOCATION = 'gs://bucket-name/folder/example-text.txt'
The following code did the trick for me.
Please note that the service account used must have storage.objects permissions in the destination-bucket to write the file.
import os
import datetime
from pathlib import Path
from airflow import DAG
from airflow.configuration import conf
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
# Parent directory of the configured dags_folder, as an absolute path.
comp_home_path = Path(conf.get("core", "dags_folder")).parent.absolute()
comp_bucket_path = "data/uploaded"  # <- if your file is within a folder
# Local directory that holds the file to upload.
comp_local_path = os.path.join(comp_home_path, comp_bucket_path)

# Defaults applied to every task in the DAG.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime.today(),
    'end_date': None,
    'email': ['somename@somecompany.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=1),
}

# No schedule: the DAG only runs when triggered.
sch_interval = None

dag = DAG(
    'mv_local_to_GCS',
    default_args=default_args,
    tags=["example"],
    catchup=False,
    schedule_interval=sch_interval,
)

mv_local_gcs = LocalFilesystemToGCSOperator(
    task_id="local_to_gcs",
    # PATH_TO_UPLOAD_FILE: file inside the local directory computed above.
    src=os.path.join(comp_local_path, "yourfilename.csv"),
    # BUCKET_FILE_LOCATION: object path inside the bucket, folders included.
    dst="somefolder/yournewfilename.csv",
    # Bucket name only: no 'gs://' prefix and no trailing '/'.
    bucket="yourproject",
    dag=dag,
)

start = DummyOperator(task_id='Starting', dag=dag)

start >> mv_local_gcs
I want to receive an email notification for task success, failure and retry in GCP composer using Sendgrid.
Currently, all the tasks in my DAG are running successfully. I want to receive notification in that case.
Also when certain tasks are failing or retrying, I want to get those notifications as well. I did the following steps and didn't receive any notification when I forced a task to fail.
Created GCP Composer environment, added environment variables.
SENDGRID_MAIL_FROM : abc@gmail.com
SENDGRID_API_KEY :
Created following DAG.
import json
from datetime import timedelta, datetime
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator
from airflow.operators.email_operator import EmailOperator
# Defaults applied to every task: e-mail on failure and on retry, two
# retries spaced five minutes apart.
default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime(2020, 3, 30),
    'email': ['abc@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

# Cron expression: minute 05 of hour 23, every day.
schedule_interval = "05 23 * * *"

dag = DAG(
    'DAG_NAME',
    default_args=default_args,
    schedule_interval=schedule_interval,
)

# Config variables
BQ_CONN_ID = ""
BQ_PROJECT = ""
BQ_DATASET = ""

# The operator arguments for tasks 1-3 were omitted in the original post;
# "..." stands in for them.
## Task 1
t1 = BigQueryCheckOperator(...)

## Task 2
t2 = BigQueryCheckOperator(...)

## Task 3
t3 = BigQueryOperator(...)

# Final task: sends a fixed test e-mail after the three BigQuery tasks.
t4 = EmailOperator(
    task_id='send_email',
    to='abc@gmail.com',
    subject='Airflow Alert',
    html_content=""" <h3>Email Test</h3> """,
    dag=dag,
)

# Setting up Dependencies
t1 >> t2 >> t3 >> t4
Am I missing anything? Please tell me what needs to be done, thanks.
Firstly, you need to check which versions of Composer and Sendgrid you are using.
For instance, the latest version of Sendgrid supported on airflow-1.10.3 is v5.6.0. You can refer to Airflow's setup.py to see which dependencies are installed for a specific Airflow version.
I recommend you to check the instructions for setting up Sendgrid with Cloud Composer once again. Make sure of a few things:
You set up the environment variables as the guide says. To configure Sendgrid as your email server, you need to provide SENDGRID_API_KEY (did you generate it with the right permissions? At a minimum, the key must have Mail Send permission) and SENDGRID_MAIL_FROM (is the address well-formed, e.g. noreply-composer@<your-domain>?) as environment variables.
In your airflow.cfg file, check if email_backend variable is set to use Sendgrid:
email_backend = airflow.contrib.utils.sendgrid.send_email
Try sending a test DAG, as the guide says, for example you can use this:
from airflow import DAG
from airflow.operators.email_operator import EmailOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
# Defaults applied to both tasks; failures are mailed to the address below.
default_args = {
    'owner': 'name.surname',
    'start_date': days_ago(1),
    'email_on_failure': True,
    'email': ['name.surname@company.com'],
}

# '@once' schedules a single run.
dag = DAG(
    'mail-test',
    schedule_interval='@once',
    default_args=default_args,
)

# Sends a test message directly.
send_mail = EmailOperator(
    task_id='sendmail',
    to='name.surname@company.com',
    subject='TEST Mail from Cloud Composer',
    html_content='Mail Contents',
    dag=dag,
)

# Always exits with status 1, so the task fails and exercises the
# email_on_failure path.
failed_bash = BashOperator(
    task_id='run_bash',
    bash_command='exit 1',
    dag=dag,
)

send_mail >> failed_bash
Additionally, please check the spam filter in your email client. If that continues to fail, I'd then start suspecting a firewall rule (if you have edited them) may be causing the issue.
Let me know about the results. I hope it helps.
For some reason I can't deploy DAG files on Google Composer if I import google.cloud.storage in the DAG. If I try to deploy such a DAG file then it doesn't get added to the DagBag so ends up with a non-link entry in the Airflow website and is not usable. At this point there's the usual information icon saying: This DAG isn't available in the web server's DagBag object. It shows up in this list because the scheduler marked it as active in the metadata database. Unlike an actual syntax error there is no error message at the top of the page.
I have boiled this down precisely as to whether I import google.cloud.storage or not. Not even whether I actually use it. For example this dag works fine if I comment out the storage import line, does not install in Composer if I replace it. Would anyone have any clue as to why?
import datetime
from airflow import DAG
from google.cloud import storage
from airflow.operators.python_operator import PythonOperator
# Defaults applied to every task: e-mail on failure and on retry, one retry
# after five minutes, fixed start date.
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'email': ['kevin@here.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': datetime.datetime(2017, 1, 1),
}
def ingest_file(**kwargs):
    """Placeholder task callable: ignore all keyword arguments, return 'OK'."""
    status = 'OK'
    return status
# Not scheduled, trigger only
dag = DAG('ingest_test', default_args=default_args, schedule_interval=None)
# Single task that calls ingest_file. provide_context is enabled and the
# callable accepts **kwargs, so any keyword arguments passed are ignored.
ingest = PythonOperator(task_id = 'ingest', provide_context = True,
python_callable = ingest_file, dag = dag)
If you require PyPi packages in your DAG or custom Operators then you don't get an error message, the DAG just doesn't deploy. If you're getting this then make sure all the packages you need are installed in the Composer environment.
Note, that the behaviour of being present then not present is still there but does actually settle after a short while
I am trying to execute a dataflow python file that reads a text file from a GCS bucket through an airflow DAG using its DataFlowPythonOperator. I have been able to execute the python file independently but it fails when I execute it through airflow. I am using a service account to authenticate for my default gcp connection.
The error I get when executing the job is:
{gcp_dataflow_hook.py:108} INFO - Start waiting for DataFlow process to complete.
{models.py:1417} ERROR - DataFlow failed with return code 2
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/airflow/models.py", line 1374, in run
result = task_copy.execute(context=context)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/operators/dataflow_operator.py", line 182, in execute
self.py_file, self.py_options)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/gcp_dataflow_hook.py", line 152, in start_python_dataflow
task_id, variables, dataflow, name, ["python"] + py_options)
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/gcp_dataflow_hook.py", line 138, in _start_dataflow
_Dataflow(cmd).wait_for_done()
File "/usr/local/lib/python2.7/dist-packages/airflow/contrib/hooks/gcp_dataflow_hook.py", line 119, in wait_for_done
self._proc.returncode))
Exception: DataFlow failed with return code 2
My airflow script:
from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator
from datetime import datetime, timedelta
# Default DAG parameters
# Values written as <...> are placeholders left by the author of the post.
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email': <email>,
'email_on_failure': False,
'email_on_retry': False,
'start_date': datetime(2018, 4, 30),
'retries': 1,
'retry_delay': timedelta(minutes=1),
# Nested options dict; only the project is set here.
'dataflow_default_options': {
'project': '<Project ID>'
}
}
# DAG scheduled with a fixed 60-minute interval.
dag = DAG(
dag_id='df_dag_readfromgcs',
default_args=default_args,
schedule_interval=timedelta(minutes=60)
)
# Single task that runs the Dataflow Python file given in py_file.
# NOTE(review): a leading '~' is normally expanded by a shell, not by Python
# itself - confirm that this path resolves on the worker running the task.
task1 = DataFlowPythonOperator(
task_id='task1',
py_file='~<path>/1readfromgcs.py',
gcp_conn_id='default_google_cloud_connection',
dag=dag
)
My Dataflow python file (1readfromgcs.py) contains the following code:
from __future__ import absolute_import
import argparse
import logging
import apache_beam as beam
import apache_beam.pipeline as pipeline
import apache_beam.io as beamio
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText
def runCode(argv=None):
    """Build a Beam pipeline that reads a text file and run it to completion.

    Args:
        argv: Command-line arguments to parse. ``--input`` is consumed here;
            every unrecognised argument is forwarded to the pipeline options.
            ``None`` makes argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='<Input file path>',
                        help='File name')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Fixed runner settings are appended after any caller-supplied
    # pipeline arguments.
    pipeline_args.extend([
        '--project=<project name>',
        '--runner=DataflowRunner',
        '--job_name=<job name>',
        '--region=europe-west1',
        '--staging_location=<GCS staging location>',
        '--temp_location=<GCS temp location>',
    ])
    pipeline_options = PipelineOptions(pipeline_args)

    p = beam.pipeline.Pipeline(options=pipeline_options)
    # The only step: read the input file. The result is not used further.
    rows = p | 'read' >> beam.io.ReadFromText(known_args.input)
    p.run().wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    runCode()
I am unable to debug and figure out the reason for this exception and as per my investigation in Airflow: https://github.com/apache/incubator-airflow/blob/master/airflow/contrib/hooks/gcp_dataflow_hook.py file, the error is arising from the following lines:
def wait_for_done(self):
    """Relay the child process's output until it exits, then check its status.

    While the process held in ``self._proc`` is still running, wait up to
    five seconds at a time for its stderr/stdout to become readable and log
    each line obtained from ``self._line`` at debug level.

    Raises:
        Exception: If the process finishes with a non-zero return code.
    """
    reads = [self._proc.stderr.fileno(), self._proc.stdout.fileno()]
    self.log.info("Start waiting for DataFlow process to complete.")
    while self._proc.poll() is None:
        # select() always returns three lists (never None); an empty
        # "readable" list means the 5-second timeout elapsed with no output.
        ready, _, _ = select.select(reads, [], [], 5)
        if ready:
            for fd in ready:
                line = self._line(fd)
                # Drop the last character (the line terminator) before logging.
                self.log.debug(line[:-1])
        else:
            self.log.info("Waiting for DataFlow process to complete.")
    # Compare by value: identity checks against an int literal are unreliable.
    if self._proc.returncode != 0:
        raise Exception("DataFlow failed with return code {}".format(
            self._proc.returncode))
Appreciate your thoughts and help with my issue.
This exception stems from _proc which is a subprocess. It returns an exit code from a shell.
I haven't worked with this component yet. Depending on what is being executed this exit code 2 will tell about the reason of the exit. E.g. this exit code in bash means:
Misuse of shell builtins
and could be connected to
Missing keyword or command, or permission problem
So it might be connected to the underlying DataFlow configuration. Try manually executing the file while impersonating the user airflow.