Import error: Python Dataflow job in Cloud Composer - google-cloud-platform

I can run a single file as a Dataflow job in Cloud Composer, but when I run it as a package it fails.
pipeline_jobs/
-- __init__.py
-- run.py (main file)
-- setup.py
-- data_pipeline/
----- __init__.py
----- tasks.py
----- transform.py
----- util.py
I'm getting this error:
WARNING - File "/tmp/dataflowd232f-run.py", line 14, in <module>
{gcp_dataflow_hook.py:120} WARNING - from data_pipeline.tasks import task
WARNING - ImportError: No module named data_pipeline.tasks.
This is the dag configuration:
from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.strptime("2017-11-01", "%Y-%m-%d"),
    'py_options': [],
    'dataflow_default_options': {
        'start-date': '20171101',
        'end-date': '20171101',
        'project': '<project-id>',
        'region': '<location>',
        'temp_location': 'gs://<bucket>/flow/tmp',
        'staging_location': 'gs://<bucket>/flow/staging',
        'setup_file': 'gs://<bucket>/dags/pipeline_jobs/setup.py',
        'runner': 'DataFlowRunner',
        'job_name': 'job_name_lookup',
        'task-id': 'run_pipeline'
    },
}

dag = DAG(
    dag_id='pipeline_01',
    default_args=default_args,
    max_active_runs=1,
    concurrency=1
)

task_1 = DataFlowPythonOperator(
    py_file='gs://<bucket>/dags/pipeline_jobs/run.py',
    gcp_conn_id='google_cloud_default',
    task_id='run_job',
    dag=dag)
I tried putting run.py into the dags folder, but I'm still getting the same error.
Any suggestions would be really helpful.
I tried doing this as well:
from pipeline_jobs.data_pipeline.tasks import task
but the issue is still the same.

Try putting the entire pipeline_jobs/ directory in the dags folder, following this instruction, and refer to the Dataflow py file by its local path on the Composer workers: /home/airflow/gcs/dags/pipeline_jobs/run.py.
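As a rough sketch (not the asker's confirmed fix), assuming the whole pipeline_jobs/ package has been copied under the dags/ bucket so it is mirrored to /home/airflow/gcs/dags/ on the workers, the operator could reference the local paths; the bucket and paths below are placeholders taken from the question:
task_1 = DataFlowPythonOperator(
    # Local path on the Composer workers, mirrored from gs://<bucket>/dags/
    py_file='/home/airflow/gcs/dags/pipeline_jobs/run.py',
    gcp_conn_id='google_cloud_default',
    task_id='run_job',
    options={
        # Assumption: pointing setup.py at the local copy so Dataflow can package data_pipeline/
        'setup_file': '/home/airflow/gcs/dags/pipeline_jobs/setup.py',
    },
    dag=dag)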

Related

Dataflow job not triggering on cloud from Composer(Airflow)

I am trying to execute an Apache Beam pipeline from Composer and I'm facing the issue below: the job does not get triggered on GCP.
Job log (some values below are parameterized so as not to reveal company-specific details):
INFO - Running command: java -jar /tmp/dataflow40103bb6-GcsToBqDataIngestion.jar --runner=DataflowRunner --project=<project_id> --zone=northamerica-northeast1-a --stagingLocation=gs:// --maxNumWorkers=1 --tempLocation=<> --region=northamerica-northeast1 --subnetwork=<network_link> --serviceAccount= --usePublicIps=false --pipelineConfig=pipeline_config/pgp_comm_apps.properties --workerMachineType=n1-standard-2 --env=dev --jobName=test-ingestion-7e20f260#-#{"workflow": "fds-test-dataflow", "task-id": "load-data", "execution-date": "2022-08-02T19:31:51.473861+00:00"}
DAG code:
import datetime

from airflow import models
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
# Operators; we need this to operate!
from airflow.operators.bash_operator import BashOperator

default_dag_args = {
    # The start_date describes when a DAG is valid / can be run. Set this to a
    # fixed point in time rather than dynamically, since it is evaluated every
    # time a DAG is parsed. See:
    # https://airflow.apache.org/faq.html#what-s-the-deal-with-start-date
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime(2022, 7, 27),
    'email': ['test@test.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'dataflow_default_options': {
        'project': '<prod_id>',
        # "region": "northamerica-northeast1",
        "zone": "northamerica-northeast1-a",
        'stagingLocation': 'gs://location',
    }
    # 'retry_delay': timedelta(minutes=30),
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'fds-test-dataflow',
        catchup=False,
        schedule_interval=None,
        # schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    task = DataFlowJavaOperator(
        task_id='load-data',
        gcp_conn_id="gcp_connection",
        job_name='test-ingestion',
        jar='gs://path_to_jar',
        delegate_to="<SA>",
        location='northamerica-northeast1',
        options={
            'maxNumWorkers': '1',
            'project': '<proj_id>',
            'tempLocation': 'gs://location/',
            'region': 'northamerica-northeast1',
            "zone": "northamerica-northeast1-a",
            'subnetwork': 'network',
            'serviceAccount': 'SA',
            'usePublicIps': 'false',
            'pipelineConfig': 'pipeline_config/pgp_comm_apps.properties',
            "currentTms": '"2022-06-28 10:00:00"',
            'labels': {},
            'workerMachineType': 'n1-standard-2',
            'env': 'dev'
        },
        dag=dag,)

    task

Django Huey Crontab every day at 10am

I'm using Django 4.0.4 and huey 2.4.3. What I would like to achieve is to run a task every day at 10am, using a periodic task.
My task folder path => project/apps/utils/tasks
+-- project/
| +-- apps/
| +-- utils/
| +-- tasks/
| +--__init__.py
| +--sms_task.py
| +--tasks.py
In the __init__.py file in the tasks folder I've imported all tasks:
__all__ = ["tasks", "sms_task"]
Here is my Huey config in the settings.py file:
HUEY = {
    'huey_class': 'huey.RedisHuey',  # Huey implementation to use.
    'name': 'ASISPO',  # Name of the Redis connection.
    'immediate': False,
    'connection': {
        'url': env('REDIS_URL', default=None),  # Allow Redis config via a DSN.
    },
    'consumer': {
        'blocking': True,
        'loglevel': True,
        'workers': 8,  # Number of consumer workers.
        'scheduler_interval': 1,  # Check schedule every second, -s.
        'health_check_interval': 5,  # Check worker health every second.
        'simple_log': True,
    },
}
What I've done so far in my tasks.py file:
@periodic_task(crontab(hour='10'))
def getjplus1():
    calculation = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    filter_fields = {
        'date_ope': calculation,
        'escalade': 'False',
        'rel_dou': 'False',
        'rel_dou_ok': 'False',
        'rel_hemo': 'False',
        'rel_hemo_ok': 'False'
    }
    for key, value in filter_fields.items():
        suivis_patient = SuiviPatient.objects.filter(
            response_suivi_patient__suivi_field_name__icontains=key,
            response_suivi_patient__response__icontains=value,
            status='planifié',
            archived=False
        )
        get_suivi_patient_j1(list(set(suivis_patient)))
But it's not running at 10am.
However, it works when I run the periodic_task every minute, like so:
@periodic_task(crontab(minute="*/1"))
def getjplus1():
    calculation = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    filter_fields = {
        'date_ope': calculation,
        'escalade': 'False',
        'rel_dou': 'False',
        'rel_dou_ok': 'False',
        'rel_hemo': 'False',
        'rel_hemo_ok': 'False'
    }
    for key, value in filter_fields.items():
        suivis_patient = SuiviPatient.objects.filter(
            response_suivi_patient__suivi_field_name__icontains=key,
            response_suivi_patient__response__icontains=value,
            status='planifié',
            archived=False
        )
        get_suivi_patient_j1(list(set(suivis_patient)))
Since you mentioned that you are using the huey package as part of your code, you can utilize its periodic_task decorator and pass a crontab to it.
from huey import crontab

@huey.periodic_task(crontab(hour='10'))
def every_ten_in_the_morning():
    print('This task runs every day at 10 in the morning.')
You can also check the documentation to learn more about huey periodic tasks: https://huey.readthedocs.io/en/latest/guide.html#periodic-tasks
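In a Django project the decorator would usually come from huey.contrib.djhuey rather than a bare huey instance. A minimal sketch, assuming huey's crontab leaves unspecified fields (such as the minute) at '*', so pinning minute='0' makes the task fire once at 10:00 instead of every minute of that hour:
from huey import crontab
from huey.contrib.djhuey import periodic_task

@periodic_task(crontab(hour='10', minute='0'))
def getjplus1():
    # existing task body from the question goes here
    ...
If the task fires at the wrong hour rather than not at all, it is also worth checking whether the consumer is scheduling in UTC versus your local timezone.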
Alternatively, you can install django-celery via pip and use Celery.
from celery.schedules import crontab
from celery.task import periodic_task

@periodic_task(run_every=crontab(hour=10, minute=0))
def every_day():
    print("This is run every day at 10:00")

Apache Airflow - Dag doesn't start even with start_date and schedule_interval defined

I am new to Airflow, but I've defined a DAG to send a basic email every day at 9am. My DAG is the following:
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.bash_operator import BashOperator
from airflow.operators.email_operator import EmailOperator
from airflow.utils.dates import days_ago

date_log = str(datetime.today())
my_email = ''

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(0),
    'email': ['my_email'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'concurrency': 1,
    'max_active_runs': 1
}

with DAG('TEST', default_args=default_args, schedule_interval='0 9 * * *',
         max_active_runs=1, catchup=False) as dag:
    t_teste = EmailOperator(dag=dag, task_id='successful_notification',
                            to='my_email',
                            subject='Airflow Dag ' + date_log,
                            html_content="""""")
    t_teste
I have all the configuration I need, and I have the webserver and scheduler running. I also have my DAG active in the UI. My problem is that my DAG seems to be doing nothing. It hasn't run for two days, and even when the scheduled time passes, it doesn't run as expected. I have already triggered it manually, and it ran successfully. But if I wait for the scheduled time, it does nothing.
Do you know what I am doing wrong?
Thanks!
Your DAG will never be scheduled. The Airflow scheduler calculates start_date + schedule_interval and schedules the DAG at the END of the interval.
>>> import airflow
>>> from airflow.utils.dates import days_ago
>>> print(days_ago(0))
2021-06-26 00:00:00+00:00
Calculating 2021-06-26 (today) + schedule_interval means the DAG will run on 2021-06-27 09:00; however, when we reach 2021-06-27 the calculation will produce 2021-06-28 09:00, and so on, so the DAG never actually runs.
The conclusion is: never use dynamic values in start_date!
To solve your issue, simply change:
'start_date': days_ago(0) to some static value like: 'start_date': datetime(2021, 6, 25)
Note that if you are running an older version of Airflow, you might also need to change the dag_id.
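A minimal sketch of the corrected default_args, with the static datetime from above and everything else left as in the question:
from datetime import datetime, timedelta
from airflow import DAG

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    # Static value: start_date + schedule_interval now points to a fixed,
    # reachable moment instead of moving forward on every DAG parse.
    'start_date': datetime(2021, 6, 25),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG('TEST', default_args=default_args, schedule_interval='0 9 * * *',
         max_active_runs=1, catchup=False) as dag:
    ...  # tasks unchanged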

Airflow - GCP - files from DAG folder are not showing up

I'm new to GCP. I have a sample Python script created in a GCP environment which is running fine. I want to schedule this in Airflow. I copied the file into the DAG folder in the environment (gs://us-west2-*******-6f9ce4ef-bucket/dags), but it's not showing up in the Airflow DAGs.
This is the location in airflow config.
dags_folder = /home/airflow/gcs/dags
Please let me know how to get my Python code to show up in Airflow. Do I have to set up anything else? I kept all the defaults.
Thanks in advance.
What you did is already correct: you placed your Python script in gs://auto-generated-bucket/dags/. I'm not sure whether you used the airflow library in your script, but this library lets you configure the behavior of your DAG in Airflow. You can see an example in the Cloud Composer quickstart.
You can check an in-depth tutorial of DAGs here.
Sample DAG (test_dag.py) that prints the dag_run.id:
# test_dag.py #
import datetime

import airflow
from airflow.operators import bash_operator

YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)

default_args = {
    'owner': 'Composer Example',
    'depends_on_past': False,
    'email': [''],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': YESTERDAY,
}

with airflow.DAG(
        'this_is_the_test_dag',  ## <-- This string will be displayed in the AIRFLOW web interface as the DAG name ##
        catchup=False,
        default_args=default_args,
        schedule_interval=datetime.timedelta(days=1)) as dag:

    # Print the dag_run id from the Airflow logs
    print_dag_run_conf = bash_operator.BashOperator(
        task_id='print_dag_run_conf', bash_command='echo {{ dag_run.id }}')
(Screenshot: gs://auto-generated-bucket/dags/ GCS location)
(Screenshot: Airflow web server)

airflow DAG keeps retrying without showing any errors

I use Google Cloud Composer. I have a DAG that uses the pandas read_csv() function to read a .csv.gz file. The DAG keeps retrying without showing any errors. Here is the Airflow log:
*** Reading remote log from gs://us-central1-data-airflo-dxxxxx-bucket/logs/youtubetv_gcpbucket_to_bq_daily_v2_csv/file_transfer_gcp_to_bq/2018-11-04T20:00:00/1.log.
[2018-11-05 21:03:58,123] {cli.py:374} INFO - Running on host airflow-worker-77846bb966-vgrbz
[2018-11-05 21:03:58,239] {models.py:1196} INFO - Dependencies all met for <TaskInstance: youtubetv_gcpbucket_to_bq_daily_v2_csv.file_transfer_gcp_to_bq 2018-11-04 20:00:00 [queued]>
[2018-11-05 21:03:58,297] {models.py:1196} INFO - Dependencies all met for <TaskInstance: youtubetv_gcpbucket_to_bq_daily_v2_csv.file_transfer_gcp_to_bq 2018-11-04 20:00:00 [queued]>
[2018-11-05 21:03:58,298] {models.py:1406} INFO -
-------------------------------------------------------------------------------
Starting attempt 1 of
-------------------------------------------------------------------------------
[2018-11-05 21:03:58,337] {models.py:1427} INFO - Executing <Task(BranchPythonOperator): file_transfer_gcp_to_bq> on 2018-11-04 20:00:00
[2018-11-05 21:03:58,338] {base_task_runner.py:115} INFO - Running: ['bash', '-c', u'airflow run youtubetv_gcpbucket_to_bq_daily_v2_csv file_transfer_gcp_to_bq 2018-11-04T20:00:00 --job_id 15096 --raw -sd DAGS_FOLDER/dags/testdags/youtubetv_gcp_to_bq_v2.py']
python code in DAG:
from datetime import datetime, timedelta
from airflow import DAG
from airflow import models
import os
import io, logging, sys
import pandas as pd
from io import BytesIO, StringIO
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.operators.python_operator import BranchPythonOperator
from airflow.operators.bash_operator import BashOperator
# GCP
from google.cloud import storage
import google.cloud
from google.cloud import bigquery
from google.oauth2 import service_account
from airflow.operators.slack_operator import SlackAPIPostOperator
from airflow.models import Connection
from airflow.utils.db import provide_session
from airflow.utils.trigger_rule import TriggerRule

def readCSV(checked_date, file_name, **kwargs):
    subDir = checked_date.replace('-', '/')
    fileobj = get_byte_fileobj(BQ_PROJECT_NAME, YOUTUBETV_BUCKET, subDir + "/" + file_name)
    df_chunks = pd.read_csv(fileobj, compression='gzip', memory_map=True, chunksize=1000000)  # returns a TextFileReader
    print("done readCSV")
    return df_chunks
DAG:
file_transfer_gcp_to_bq = BranchPythonOperator(
    task_id='file_transfer_gcp_to_bq',
    provide_context=True,
    python_callable=readCSV,
    op_kwargs={'checked_date': '2018-11-03', 'file_name': 'daily_events_xxxxx_partner_report.csv.gz'}
)
The DAG runs successfully on my local Airflow installation.
def readCSV(checked_date, file_name, **kwargs):
    subDir = checked_date.replace('-', '/')
    fileobj = get_byte_fileobj(BQ_PROJECT_NAME, YOUTUBETV_BUCKET, subDir + "/" + file_name)
    df = pd.read_csv(fileobj, compression='gzip', memory_map=True)
    return df
I tested get_byte_fileobj and it works as a standalone function.
Based on this discussion in the airflow google composer group, it is a known issue.
One of the reasons can be exhausting all of the Composer resources (in my case, memory).
I had a similar issue recently.
In my case it was because the Kubernetes workers were overloaded.
You can watch the worker performance on the Kubernetes dashboard to see whether your case is a cluster-overloading issue.
If so, you can try setting the Airflow configuration value celeryd_concurrency lower to reduce the parallelism in each worker and see whether the cluster load goes down.
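For reference, a rough sketch of how that override could be applied on Cloud Composer; the environment name, location, and the value 6 are placeholders, and this assumes your Composer version allows overriding the celery section's celeryd_concurrency property:
# Lower the number of task processes per Celery worker to reduce memory pressure
gcloud composer environments update my-composer-env \
    --location us-central1 \
    --update-airflow-configs=celery-celeryd_concurrency=6
After the update, watch the worker memory on the Kubernetes dashboard again to confirm the load actually drops before changing anything else.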