t1 = BashOperator(
    task_id='task_1',
    bash_command="Rscript Failure.R",
    dag=dag)

t2 = BashOperator(
    task_id='task_2',
    bash_command="Rscript Success.R",
    dag=dag)

t1fail = BashOperator(
    trigger_rule=TriggerRule.ONE_FAILED,
    task_id='task_1fail',
    bash_command="echo task1 failed",
    dag=dag)

t2fail = BashOperator(
    trigger_rule=TriggerRule.ONE_FAILED,
    task_id='task_2fail',
    bash_command="echo task2 failed",
    dag=dag)

tSuccess = BashOperator(
    task_id='t_Success',
    bash_command="echo task1 failed",
    dag=dag)
t2.set_upstream(t1)
tSuccess.set_upstream([t1, t2])
t2fail.set_upstream(t2)
t1fail.set_upstream(t1)
When task t1 fails, ideally only t1fail should be triggered.
But what I get is that t2fail is also triggered, because its trigger rule is set to ONE_FAILED. Is there a way I can make t2fail start only when t2 actually runs and fails?
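One possible workaround, offered here only as a hedged sketch (it is not from this thread): instead of modelling failure handling as extra tasks with the ONE_FAILED trigger rule, attach an on_failure_callback to each task. Airflow invokes the callback only for the task instance that actually failed, so a failure of task_1 cannot fire task_2's handler.

# Hedged sketch: per-task failure callbacks instead of ONE_FAILED tasks.
def notify_failure(context):
    # The callback receives the context of the task instance that failed.
    failed_task = context['task_instance'].task_id
    print('{} failed'.format(failed_task))

t1 = BashOperator(
    task_id='task_1',
    bash_command="Rscript Failure.R",
    on_failure_callback=notify_failure,
    dag=dag)

t2 = BashOperator(
    task_id='task_2',
    bash_command="Rscript Success.R",
    on_failure_callback=notify_failure,
    dag=dag)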
I have a DAG like the one below, and I need to use the value returned from a PythonOperator outside the tasks. How do I achieve this?
dag = DAG(
    dag_id='example_batch_submit_job',
    schedule_interval=None,
    start_date=datetime(2022, 7, 27),
    tags=['batch_job'],
    catchup=False)

def get_inputs(**kwargs):
    num_jobs = kwargs['dag_run'].conf['num_jobs']
    return num_jobs

run_this = PythonOperator(
    task_id='get_input',
    provide_context=True,
    python_callable=get_inputs,
    dag=dag,
)
jobs = num_jobs  # <------ How do I pass the returned value here?
for job in jobs:
    submit_batch_job = BatchOperator(
        task_id=f'submit_batch_job_{job}',
        job_name=JOB_NAME,
        job_queue=JOB_QUEUE,
        job_definition=JOB_DEFINITION,
        parameters={}
    )
For Airflow < 2.3.0:

@task
def make_list(count):
    context = get_current_context()
    for i in range(count):
        t = BatchOperator(
            task_id=f"submit_batch_job_{i}",
            job_name=JOB_NAME,
            job_queue=JOB_QUEUE,
            job_definition=JOB_DEFINITION,
            parameters={},
            overrides={},
        )
        t.execute(context)

job_list = make_list("{{ ti.xcom_pull(task_ids='get_input', key='return_value') }}")

run_this >> job_list
For Airflow >= 2.3.0:
You can use Dynamic Task Mapping, which creates tasks dynamically at runtime according to a parameter.
@task
def make_list(count):
    return [i for i in range(count)]

job_list = make_list("{{ ti.xcom_pull(task_ids='get_input', key='return_value') }}")

batch = BatchOperator.partial(
    task_id="submit_batch_job",
    job_name=JOB_NAME,
    job_queue=JOB_QUEUE,
    job_definition=JOB_DEFINITION,
    parameters={}
).expand(job_id=job_list)

run_this >> job_list >> batch
Also, note that num_jobs is a str unless you set render_template_as_native_obj=True on your DAG; if you don't, you just need to cast it: int(count).
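For reference, here is a minimal sketch (not from the answer above) showing both options; the dag_id, start date, and xcom_pull expression are copied from the question, the rest is illustrative.

from datetime import datetime

from airflow import DAG
from airflow.decorators import task

with DAG(
    dag_id='example_batch_submit_job',
    schedule_interval=None,
    start_date=datetime(2022, 7, 27),
    catchup=False,
    # Option 1: render Jinja templates to native Python objects, so the
    # pulled XCom arrives as an int instead of a str.
    render_template_as_native_obj=True,
) as dag:

    @task
    def make_list(count):
        # Option 2: if native rendering is not enabled, cast explicitly.
        return list(range(int(count)))

    job_list = make_list("{{ ti.xcom_pull(task_ids='get_input', key='return_value') }}")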
I am running a Vertex AI batch prediction using the Python API.
The function I am using is from the Google Cloud docs:
def create_batch_prediction_job_dedicated_resources_sample(
    key_path,
    project: str,
    location: str,
    model_display_name: str,
    job_display_name: str,
    gcs_source: Union[str, Sequence[str]],
    gcs_destination: str,
    machine_type: str = "n1-standard-2",
    sync: bool = True,
):
    credentials = service_account.Credentials.from_service_account_file(
        key_path)

    # Initialize an aiplatform object
    aiplatform.init(project=project, location=location, credentials=credentials)

    # Get a list of Models by Model name
    models = aiplatform.Model.list(filter=f'display_name="{model_display_name}"')
    model_resource_name = models[0].resource_name

    # Get the model
    my_model = aiplatform.Model(model_resource_name)

    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        machine_type=machine_type,
        sync=sync,
    )

    # batch_prediction_job.wait_for_resource_creation()
    batch_prediction_job.wait()

    print(batch_prediction_job.display_name)
    print(batch_prediction_job.resource_name)
    print(batch_prediction_job.state)
    return batch_prediction_job
datetime_today = datetime.datetime.now()
model_display_name = 'test_model'
key_path = 'vertex_key.json'
project = 'my_project'
location = 'asia-south1'
job_display_name = 'batch_prediction_' + str(datetime_today)
model_name = '1234'
gcs_source = 'gs://my_bucket/Cleaned_Data/user_item_pairs.jsonl'
gcs_destination = 'gs://my_bucket/prediction'
create_batch_prediction_job_dedicated_resources_sample(
    key_path, project, location, model_display_name,
    job_display_name, gcs_source, gcs_destination)
OUTPUT:
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/my_project/locations/asia-south1/batchPredictionJobs/37737350127597649 current state:
JobState.JOB_STATE_RUNNING
The above output is printed to the terminal over and over, every few seconds.
The issue is that the Python program calling this function keeps running until it is force-stopped. I have tried both batch_prediction_job.wait() and batch_prediction_job.wait_for_resource_creation(), with the same result.
How do I start a batch_prediction_job without waiting for it to complete, terminating the program just after the job has been created?
I gave you the wrong instruction in the comments; change the parameter to sync=False and the function should return right after it is executed.
Whether this function call should be synchronous (wait for pipeline run to finish before terminating) or asynchronous (return immediately)
sync=False
def create_batch_prediction_job_dedicated_resources_sample(
    # ...
    sync: bool = False,
):
UPDATE - Adding more details:
Check my notebook code here, where I tested it and it's working:
You have to change to sync=False AND remove/comment the following lines:
#batch_prediction_job.wait()
#print(batch_prediction_job.display_name)
#print(batch_prediction_job.resource_name)
#print(batch_prediction_job.state)
Your code edited:
def create_batch_prediction_job_dedicated_resources_sample(
    key_path,
    project: str,
    location: str,
    model_display_name: str,
    job_display_name: str,
    gcs_source: Union[str, Sequence[str]],
    gcs_destination: str,
    machine_type: str = "n1-standard-2",
    sync: bool = False,
):
    credentials = service_account.Credentials.from_service_account_file(key_path)

    # Initialize an aiplatform object
    aiplatform.init(project=project, location=location, credentials=credentials)

    # Get a list of Models by Model name
    models = aiplatform.Model.list(filter=f'display_name="{model_display_name}"')
    model_resource_name = models[0].resource_name

    # Get the model
    my_model = aiplatform.Model(model_resource_name)

    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        machine_type=machine_type,
        sync=sync,
    )
    return batch_prediction_job
datetime_today = datetime.datetime.now()
model_display_name = 'test_model'
key_path = 'vertex_key.json'
project = '<my_project_name>'
location = 'asia-south1'
job_display_name = 'batch_prediction_' + str(datetime_today)
model_name = '1234'
gcs_source = 'gs://<my_bucket_name>/Cleaned_Data/user_item_pairs.jsonl'
gcs_destination = 'gs://<my_bucket_name>/prediction'
create_batch_prediction_job_dedicated_resources_sample(key_path,
project,location,
model_display_name,
job_display_name,
gcs_source,
gcs_destination,
sync=False,)
Results sync=False:
Results sync=True:
I have a DAG like the one below (this is semi-pseudocode); I want to execute tasks in different branches based on their output.
# This is a method that returns a or b
def dosth():
    .....
    return a or b

t1 = PythonOperator(
    't1',
    python_callable=dosth
)

branchA = BashOperator(
    'branchA', ....
)

branchB = BashOperator(
    'branchB', ....
)
What I want is: if dosth returns a, the DAG should execute the task in branchA; if it returns b, the DAG should execute the task in branchB. Does anyone know how we can approach this?
Check this doc about Branching: https://airflow.apache.org/docs/stable/concepts.html?highlight=branch#branching
You need to use BranchPythonOperator, where you can specify the condition to be evaluated to decide which task should run next.
Example based on your semi-pseudocode:
def dosth():
    if some_condition:
        return 'branchA'
    else:
        return 'branchB'

t1 = BranchPythonOperator(
    task_id='t1',
    provide_context=True,
    python_callable=dosth,
    dag=dag)

branchA = BashOperator(
    task_id='branchA', ....
)

branchB = BashOperator(
    task_id='branchB', ....
)
The function you pass to python_callable should return the task_id of the next task that should run.
Another Example:
def branch_func(**kwargs):
    ti = kwargs['ti']
    xcom_value = int(ti.xcom_pull(task_ids='start_task'))
    if xcom_value >= 5:
        return 'continue_task'
    else:
        return 'stop_task'

start_op = BashOperator(
    task_id='start_task',
    bash_command="echo 5",
    xcom_push=True,
    dag=dag)

branch_op = BranchPythonOperator(
    task_id='branch_task',
    provide_context=True,
    python_callable=branch_func,
    dag=dag)

continue_op = DummyOperator(task_id='continue_task', dag=dag)
stop_op = DummyOperator(task_id='stop_task', dag=dag)

start_op >> branch_op >> [continue_op, stop_op]
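One follow-up note, not covered in the answer above: if the two branches later rejoin into a common downstream task, give that task a trigger rule that tolerates the skipped branch, otherwise it will be skipped as well. A minimal sketch extending the example (join_task is a hypothetical task added for illustration):

join_op = DummyOperator(
    task_id='join_task',
    trigger_rule='none_failed',  # run as long as no upstream task failed
    dag=dag)

# Both branches feed the join; only one of them will actually have run.
[continue_op, stop_op] >> join_op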
My Twisted server, which connects to four clients, initializes a SQLAlchemy engine once for each client. After the system has been running for a while, the following error is reported:
[Failure instance: Traceback: <class 'sqlalchemy.exc.ResourceClosedError'>: This result object does not return rows. It has been closed automatically.
/usr/lib/python2.7/threading.py:801:__bootstrap_inner
/usr/lib/python2.7/threading.py:754:run
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/_threads/_threadworker.py:46:work
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/_threads/_team.py:190:doWork
--- <exception caught here> ---
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/python/threadpool.py:250:inContext
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/python/threadpool.py:266:<lambda>
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/python/context.py:122:callWithContext
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/twisted/python/context.py:85:callWithContext
./data_server.py:231:check_update_mysqldb
./DRV/mysqldb_driver.py:61:search_device_by_mac
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/query.py:2895:first
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/query.py:2687:__getitem__
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/loading.py:98:instances
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/util/compat.py:265:raise_from_cause
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/loading.py:61:instances
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/query.py:3842:row_processor
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/orm/loading.py:361:_instance_processor
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/engine/result.py:654:_getter
/home/sites/data_collecting_server/venv/local/lib/python2.7/site-packages/sqlalchemy/engine/result.py:1088:_non_result
]
I solved the problem successfully. I wrote a test script that isolates sessions by calling them this way.
Here is my test code:
# coding=utf-8
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import or_
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import NullPool
from config.settings import MYSQLDB_SETTING
import threading
import multiprocessing
import time


class SqlalchemyDriver(object):
    def __init__(self, host, user, password, port, database):
        db = "mysql+mysqldb://{user}:{password}@{host}:{port}/{database}?"\
             "charset=utf8".format(
                 user=user,
                 password=password,
                 host=host,
                 port=port,
                 database=database
             )
        # self.engine = create_engine(db, poolclass=NullPool)
        self.engine = create_engine(db, pool_pre_ping=True, pool_size=10, max_overflow=10, pool_timeout=30)
        self.Session = scoped_session(sessionmaker(bind=self.engine, autoflush=False))
drv = SqlalchemyDriver(
    host=MYSQLDB_SETTING["host"],
    user=MYSQLDB_SETTING["user"],
    password=MYSQLDB_SETTING["password"],
    port=MYSQLDB_SETTING["port"],
    database=MYSQLDB_SETTING["database"]
)


class MysqldbDriver(object):
    def __init__(self):
        self.classes = self.get_table_classes()
        self.session = drv.Session()
        print(id(self.session))

    def close_session(self):
        drv.Session.remove()

    def get_table_classes(self):
        base = automap_base()
        base.prepare(drv.engine, reflect=True)
        return base.classes

    def search_device_by_mac(self, mac_address):
        time.sleep(1)
        res = self.session.query(self.classes.client_management_device). \
            filter_by(mac_address=mac_address).first()
        print(res)
        return res
def query_func():
    mysql = MysqldbDriver()
    mysql.search_device_by_mac('xx:xx:xx:xx:xx:xx')


def my_test():
    while True:
        t1 = threading.Thread(target=query_func)
        t2 = threading.Thread(target=query_func)
        t3 = threading.Thread(target=query_func)
        t4 = threading.Thread(target=query_func)
        t5 = threading.Thread(target=query_func)
        t6 = threading.Thread(target=query_func)
        t1.start()
        t2.start()
        t3.start()
        t4.start()
        t5.start()
        t6.start()
        time.sleep(3)
        t1.join()
        t2.join()
        t3.join()
        t4.join()
        t5.join()
        t6.join()
        time.sleep(1)
p1 = multiprocessing.Process(target=my_test)
p2 = multiprocessing.Process(target=my_test)
p3 = multiprocessing.Process(target=my_test)
p4 = multiprocessing.Process(target=my_test)
print('running')
p1.start()
p2.start()
p3.start()
p4.start()
time.sleep(1)
p1.join()
p2.join()
p3.join()
p4.join()
time.sleep(1)
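For readers who want the fix in isolation, here is a condensed sketch of the pattern the test script exercises (the DSN and the query are placeholders, not from the original code): one engine per process, a thread-local session obtained through scoped_session, and an explicit Session.remove() when the thread is done with it.

from sqlalchemy import create_engine, text
from sqlalchemy.orm import scoped_session, sessionmaker

# One engine per process; pool_pre_ping transparently replaces stale connections.
engine = create_engine(
    "mysql+mysqldb://user:password@host:3306/database?charset=utf8",
    pool_pre_ping=True,
    pool_size=10,
    max_overflow=10,
)

# scoped_session hands each thread its own Session object.
Session = scoped_session(sessionmaker(bind=engine, autoflush=False))


def worker():
    session = Session()  # thread-local session for the calling thread
    try:
        session.execute(text("SELECT 1"))
    finally:
        Session.remove()  # discard this thread's session when finished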
I am using MapReduce code, and I ran into a problem: the code stops responding after the map phase (which takes about an hour) finishes. I dug into the code and found that this function does not respond:
def wait(self, running, tag):
    """Test if any worker has finished its job.
    If so, decrease its key and make it available.
    """
    atimer = Timer('Wait')

    inittime = time()
    status = MPI.Status()
    while time() - inittime < self.config['jobwait']:
        if world.Iprobe(source=MPI.ANY_SOURCE, tag=tag, status=status):
            jobf = world.recv(source=status.source, tag=tag)
            idx = 0
            for ii, worker in enumerate(self.workers):
                if worker.id == status.source: idx = ii; break
            if self.config['verbosity'] >= 8:
                print('Freeing worker ' + str(self.workers[idx].id))
            worker = self.workers[idx]

            # faulty worker's job has already been cleaned
            if not worker.isFaulty():
                del running[jobf]
            else:
                self.nActive += 1

            worker.setFree()
            heapq._siftup(self.workers, idx)
This line does not respond:
if world.Iprobe(source=MPI.ANY_SOURCE, tag=tag, status=status):
I wonder whether there is a timeout for Iprobe() in mpi4py and how to set it. Is there an alternative to Iprobe() that plays the same role here?
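For what it's worth, Iprobe() itself never blocks; it returns a flag immediately, so any timeout has to come from the loop that calls it (which is what self.config['jobwait'] does above). Below is a minimal standalone sketch of that polling pattern with mpi4py, with a short sleep added so the loop does not spin at 100% CPU; the function name and parameters are illustrative, not from the original code.

from time import time, sleep

from mpi4py import MPI


def probe_with_timeout(comm, tag, timeout, poll_interval=0.01):
    """Return a received message, or None if nothing arrives within timeout."""
    status = MPI.Status()
    deadline = time() + timeout
    while time() < deadline:
        # Iprobe is non-blocking: it only reports whether a message is pending.
        if comm.Iprobe(source=MPI.ANY_SOURCE, tag=tag, status=status):
            return comm.recv(source=status.source, tag=tag)
        sleep(poll_interval)  # yield the CPU between polls
    return None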
Here is the preceding function that sends the message via .send():
def execTask(self, task):
    """Wrapper function calling mapping/reducing/finalizing phase tasks,
    dispatch tasks to workers until all finished and collect feedback.
    Faulty workers are removed from active duty work list.
    """
    atimer = Timer(task)

    print('Entering {0:s} phase...'.format(task))
    taskDict = {'Map': (self.mapIn, MAP_START, MAP_FINISH),
                'Init': (self.mapIn, INIT_START, MAP_FINISH),
                'Reduce': (self.reduceIn, REDUCE_START, REDUCE_FINISH)}

    # line up jobs and workers into priority queues
    jobs = taskDict[task][0][:]
    heapq.heapify(jobs); running = {}
    heapq.heapify(self.workers)

    while (jobs or running) and self.nActive > 0:
        # dispatch all jobs to all free workers
        while jobs and self.workers[0].isFree():
            job = heapq.heappop(jobs)
            worker = heapq.heappop(self.workers)
            world.send(job, dest=worker.id, tag=taskDict[task][1])
            print('hi')
            print(job)
            worker.setBusy(); heapq.heappush(self.workers, worker)
            running[job] = (time(), worker)
            if self.config['verbosity'] >= 6:
                print('Dispatching file ' + os.path.basename(job) + ' to worker ' + str(worker.id))
            # if no more free workers, break
            if not self.workers[0].isFree(): break

        # wait for finishing workers as well as do cleaning
        self.wait(running, taskDict[task][2])
        # print running
        self.clean(running, jobs)

    print('{0:s} phase completed'.format(task))
The whole code can be seen here:
https://drive.google.com/file/d/0B36fJi35SPIedWdjbW5NdzlCeTg/view?usp=sharing