I have a task that takes dynamic arguments and that I want to run periodically. How do I pass dynamic values to the task's arguments when the task is called by django celery beat?
Here is the task I want to run periodically:
#task(bind=True)
def generate_export(export_type, xform, export_id=None, options=None):
    """
    Create appropriate export object given the export type.

    :param export_type: one of the ``Export.*_EXPORT`` constants; selects
        the ``ExportBuilder`` method used to write the file.
    :param xform: form whose data is exported. Must not be None: its
        ``user.username`` and ``id_string`` are read immediately.
    :param export_id: ID of export object associated with the request.
    :param options: additional parameters required for the lookup. Optional;
        an omitted value is treated as an empty dict. Keys read here:

        binary_select_multiples: boolean flag
        dataview_pk: dataview pk
        end: end offset
        extension: export extension type (defaults to ``export_type``)
        group_delimiter: "/" or "."
        include_hxl: boolean flag
        include_images: boolean flag
        include_labels: boolean flag
        include_labels_only: boolean flag
        language: language labels as in the XLSForm/XForm
        query: filter_query for custom queries
        remove_group_name: boolean flag
        repeat_index_tags: ('[', ']') or ('_', '_')
        show_choice_labels: boolean flag
        split_select_multiples: boolean flag
        start: start offset
        value_select_multiples: boolean flag
        win_excel_utf8: removed from ``options`` unless exporting CSV

    :return: the export object. It is saved only when neither ``start`` nor
        ``end`` is given. When the builder raises ``SPSSIOError`` a FAILED
        export carrying the error message is saved and returned instead.
    """
    # The signature allows options=None, so guard before the first .get().
    options = {} if options is None else options

    username = xform.user.username
    id_string = xform.id_string
    end = options.get("end")
    extension = options.get("extension", export_type)
    filter_query = options.get("query")
    remove_group_name = options.get("remove_group_name", False)
    start = options.get("start")

    export_type_func_map = {
        Export.XLS_EXPORT: 'to_xls_export',
        Export.CSV_EXPORT: 'to_flat_csv_export',
        Export.DHIS2CSV_EXPORT: 'to_dhis2csv_export',
        Export.CSV_ZIP_EXPORT: 'to_zipped_csv',
        Export.SAV_ZIP_EXPORT: 'to_zipped_sav',
        Export.GOOGLE_SHEETS_EXPORT: 'to_google_sheets',
    }

    dataview = None
    if options.get("dataview_pk"):
        dataview = DataView.objects.get(pk=options.get("dataview_pk"))
        records = dataview.query_data(dataview, all_data=True,
                                      filter_query=filter_query)
        total_records = dataview.query_data(dataview,
                                            count=True)[0].get('count')
    else:
        records = query_data(xform, query=filter_query, start=start, end=end)

        if filter_query:
            total_records = query_data(xform, query=filter_query, start=start,
                                       end=end, count=True)[0].get('count')
        else:
            total_records = xform.num_of_submissions

    if isinstance(records, QuerySet):
        records = records.iterator()

    export_builder = ExportBuilder()

    export_builder.TRUNCATE_GROUP_TITLE = True \
        if export_type == Export.SAV_ZIP_EXPORT else remove_group_name
    export_builder.GROUP_DELIMITER = options.get(
        "group_delimiter", DEFAULT_GROUP_DELIMITER
    )
    export_builder.SPLIT_SELECT_MULTIPLES = options.get(
        "split_select_multiples", True
    )
    export_builder.BINARY_SELECT_MULTIPLES = options.get(
        "binary_select_multiples", False
    )
    export_builder.INCLUDE_LABELS = options.get('include_labels', False)
    export_builder.INCLUDE_LABELS_ONLY = options.get(
        'include_labels_only', False
    )
    export_builder.INCLUDE_HXL = options.get('include_hxl', False)
    export_builder.INCLUDE_IMAGES \
        = options.get("include_images", settings.EXPORT_WITH_IMAGE_DEFAULT)
    export_builder.VALUE_SELECT_MULTIPLES = options.get(
        'value_select_multiples', False)
    export_builder.REPEAT_INDEX_TAGS = options.get(
        "repeat_index_tags", DEFAULT_INDEX_TAGS
    )
    export_builder.SHOW_CHOICE_LABELS = options.get('show_choice_labels',
                                                    False)
    export_builder.language = options.get('language')

    # 'win_excel_utf8' is only relevant for CSV exports
    if 'win_excel_utf8' in options and export_type != Export.CSV_EXPORT:
        del options['win_excel_utf8']

    export_builder.set_survey(xform.survey, xform)

    # change the dhis2csv exports to standard csv format
    if extension == 'dhis2csv':
        extension = 'csv'

    columns_with_hxl = export_builder.INCLUDE_HXL and get_columns_with_hxl(
        xform.survey_elements)

    # get the export function by export type
    func = getattr(export_builder, export_type_func_map[export_type])

    # The context manager closes the temporary file on every exit path,
    # including the early return below and any unexpected exception.
    with NamedTemporaryFile(suffix=("." + extension)) as temp_file:
        try:
            func(
                temp_file.name, records, username, id_string, filter_query,
                start=start, end=end, dataview=dataview, xform=xform,
                options=options, columns_with_hxl=columns_with_hxl,
                total_records=total_records
            )
        except NoRecordsFoundError:
            # No records: carry on and store whatever the builder wrote.
            pass
        except SPSSIOError as e:
            export = get_or_create_export(
                export_id, xform, export_type, options)
            export.error_message = str(e)
            export.internal_status = Export.FAILED
            export.save()
            report_exception("SAV Export Failure", e, sys.exc_info())
            return export

        # generate filename
        basename = "%s_%s" % (
            id_string, datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f"))

        if remove_group_name:
            # add 'remove group name' flag to filename
            basename = "{}-{}".format(basename, GROUPNAME_REMOVED_FLAG)
        if dataview:
            basename = "{}-{}".format(basename, DATAVIEW_EXPORT)

        filename = basename + "." + extension

        # check filename is unique
        while not Export.is_filename_unique(xform, filename):
            filename = increment_index_in_filename(filename)

        file_path = os.path.join(
            username,
            'exports',
            id_string,
            export_type,
            filename)

        # seek to the beginning as required by storage classes
        temp_file.seek(0)
        export_filename = default_storage.save(file_path,
                                               File(temp_file, file_path))

    dir_name, basename = os.path.split(export_filename)

    # get or create export object
    export = get_or_create_export(export_id, xform, export_type, options)

    export.filedir = dir_name
    export.filename = basename
    export.internal_status = Export.SUCCESSFUL

    # Get URL of the exported sheet.
    if export_type == Export.GOOGLE_SHEETS_EXPORT:
        export.export_url = export_builder.url

    # Exports limited by a start/end offset are returned but not persisted.
    if start is None and end is None:
        export.save()
    return export
and this is where I call the tasks in the celery beat schedule:
# Celery beat schedule: each entry names a task, when to run it and its args.
CELERY_BEAT_SCHEDULE = {
'download_csv': {
# Dotted path of the task that beat should send.
'task': 'onadata.libs.utils.export_tools.generate_export',
# Cron-style schedule; per Celery's crontab docs minute='*' is every minute.
'schedule': crontab(minute='*'),
# Positional arguments for the task; this literal is empty here.
'args': ()
}
}
How do I pass parameters as arguments to the task?
There is no way to pass argument dynamically in Celery Beat. I think your function is not suitable with periodic task.
Instead of passing arguments directly to the generate_export function, change the function so that it looks up the values it needs internally. Alternatively, run it as a simple asynchronous task rather than a periodic one.
I faced a similar problem. The args field in beat_schedule is fixed at startup and does not change afterward.
But there is a hackish way to pass different arguments to your task.
Use the before_task_publish signal to add custom data in headers.
from celery.signals import before_task_publish
@before_task_publish.connect
def before_publish(sender=None, headers=None, body=None, **kwargs):
    """Add per-run arguments to the headers of an outgoing task message.

    Connected to the ``before_task_publish`` signal. Only messages whose
    sender is "tasks.generate_export" are touched; for those a
    ``custom_args`` entry is stored in ``headers``.
    """
    if sender == "tasks.generate_export":
        # Values must be serializable by the configured task serializer.
        headers["custom_args"] = {
            "export_type": "some_val",
            "xform": "some_val",
            "export_id": get_export_id(),
            "options": options_dict,
        }
By default, Celery uses JSON serializer. So, make sure the data you add to headers are JSON serializable. Alternatively, you can use pickle to serialize the data, but it brings security concerns with it.
Now you can access these headers in a bound task.
@task(bind=True)
def generate_export(self):
    """Bound task that reads its arguments from the request's custom_args."""
    # None when no "custom_args" entry was attached to the request.
    args = self.request.get("custom_args", None)
    # do something with args
Related
The issue
I'm trying out great expectations with dagster, as per this guide
My pipeline seems to execute correctly until it reaches this block:
# Build a validation op with the dagster_ge factory.
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='dev.data-pipeline-data-storage.data_pipelines.raw_data.sirene_update',
suite_name='suite.data_pipelines.raw_data.sirene_update',
)
# NOTE(review): this subscripts the factory's return value directly; the post
# reports that it fails with "'SolidDefinition' object is not subscriptable".
if expectation["success"]:
print("Success")
trying to call expectation["success"] results in a
# TypeError: 'SolidDefinition' object is not subscriptable
When I go inside the code of ge_validation_op_factory, there is a _ge_validation_fn that should yield an ExpectationResult, but somehow it gets converted into a SolidDefinition...
Dagster version = 0.15.9;
Great Expectations version = 0.15.44
Code to reproduce the error
In my code, I am trying to interact with an s3 bucket, so it would be a bit tedious to re-create the code for my example but here it is anyway:
In a gx_postprocessing.py
import json
import boto3
import dagster_ge
from dagster import (
op,
graph,
Field,
String,
OpExecutionContext,
)
from typing import List, Dict
@op(
    config_schema={
        "bucket": Field(
            String,
            description="s3 bucket name",
        ),
        "path_in_s3": Field(
            String,
            description="Prefix representing the path to data",
        ),
        "technical_date": Field(
            String,
            description="date string to fetch data",
        ),
        "file_name": Field(
            String,
            description="file name that contains the data",
        ),
    }
)
def read_in_json_datafile_from_s3(context: OpExecutionContext):
    """Fetch one JSON object from S3 and return its parsed content.

    The key is built from the op config as
    ``<path_in_s3>/technical_date=<technical_date>/<file_name>`` and the
    body is decoded as UTF-8 before being parsed.
    """
    config = context.op_config
    # Named object_key so the builtin ``object`` is not shadowed.
    object_key = (
        f"{config['path_in_s3']}/"
        f"technical_date={config['technical_date']}/"
        f"{config['file_name']}"
    )
    s3 = boto3.resource("s3")
    content_object = s3.Object(config["bucket"], object_key)
    file_content = content_object.get()["Body"].read().decode("utf-8")
    return json.loads(file_content)
@op
def process_example_dq(data: List[Dict]):
    """Return the number of records in ``data``."""
    return len(data)
@op
def postprocess_example_dq(numrows, expectation):
    """Return ``numrows`` when the validation result reports success.

    Raises:
        ValueError: if ``expectation["success"]`` is falsy.
    """
    if not expectation["success"]:
        raise ValueError("Great Expectations validation did not succeed")
    return numrows
#op
# Returns whatever dagster_ge.ge_validation_op_factory returns.
# NOTE(review): the factory is called inside this op body; the accompanying
# answer states it returns an op definition that belongs in the graph instead.
def validate_example_dq(context: OpExecutionContext):
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
return expectation
@graph(
    config={
        "read_in_json_datafile_from_s3": {
            "config": {
                "bucket": "my_bucket",
                "path_in_s3": "my_path",
                "technical_date": "2023-01-24",
                "file_name": "myfile_20230124.json",
            }
        },
    },
)
def example_update_evaluation():
    """Wire the example ops together: read, count, validate, post-process."""
    output_dict = read_in_json_datafile_from_s3()
    nb_items = process_example_dq(data=output_dict)
    expectation = validate_example_dq()
    # The row count is passed on together with the validation result.
    postprocess_example_dq(
        numrows=nb_items,
        expectation=expectation,
    )
Do not forget to add great_expectations_poc_pipeline to your __init__.py where the pipelines=[..] are listed.
In this example, dagster_ge.ge_validation_op_factory(...) is returning an OpDefinition, which is the same type of thing as (for example) process_example_dq, and should be composed in the graph definition the same way, rather than invoked within another op.
So instead, you'd want to have something like:
# Module-level op: bind the factory's result to a name and call that name
# inside the graph definition like any other op.
validate_example_dq = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
Then use that op inside your graph definition the same way you currently are (i.e. expectation = validate_example_dq())
I have a workflow where I put files into an S3 bucket, which triggers a Lambda function. The Lambda function extracts some info about the file and inserts a row into a DynamoDB table for each file:
def put_filename_in_db(dynamodb, filekey, filename):
    """Insert one row describing a file into the DynamoDB table.

    Args:
        dynamodb: object exposing ``Table(name)``.
        filekey: value stored under ``filekey``.
        filename: value stored under ``filename``.

    Returns:
        The response of ``table.put_item``.

    Raises:
        Exception: if ``put_item`` fails; the original error is chained as
            the cause so the real failure reason is not lost.
    """
    table = dynamodb.Table(dynamodb_table_name)
    # Everything except filekey/filename comes from module-level names.
    item = {
        'masterclient': masterclient,
        'filekey': filekey,
        'filename': filename,
        'filetype': filetype,
        'source_bucket_name': source_bucket_name,
        'unixtimestamp': unixtimestamp,
        'processed_on': None,
        'archive_path': None,
        'archived_on': None,
    }
    try:
        return table.put_item(Item=item)
    except Exception as e:
        raise Exception(
            f"Error writing {filename!r} to table {dynamodb_table_name!r}: {e}"
        ) from e
def get_files():
    """List the zip files under the incoming prefix of the source bucket.

    Returns:
        A list of ``{"filekey": ..., "filename": ...}`` dicts, one per key
        that ends with "zip" and does not start with the archive prefix.
        The list is empty when the listing contains no objects.
    """
    listing = s3_client.list_objects(
        Bucket=str(source_bucket_name), Prefix=Incoming_prefix)
    # 'Contents' is absent from the response when nothing matches the prefix.
    # NOTE(review): a single list_objects call returns one page only; confirm
    # that the incoming prefix never holds more keys than one page.
    file_list = []
    for entry in listing.get('Contents', []):
        filekey = entry['Key']
        if filekey.endswith("zip") and not filekey.startswith(Archive_prefix):
            filename = ...  # derivation elided in the original post
            file_list.append({"filekey": filekey, "filename": filename})
    logger.info(f'Found {len(file_list)} files to process: {file_list}')
    return file_list
def lambda_handler(event, context):
    """Store every file returned by get_files() and report HTTP 200."""
    for entry in get_files():
        put_filename_in_db(dynamodb, entry['filekey'], entry['filename'])
    return {'statusCode': 200}
This is how my DynamoDB table is defined in terraform:
# Table holding one item per file, keyed by file name.
resource "aws_dynamodb_table" "filenames" {
name = local.dynamodb_table_filenames
# On-demand billing; the provisioned capacity settings stay commented out.
billing_mode = "PAY_PER_REQUEST"
#read_capacity = 10
#write_capacity = 10
hash_key = "filename"
# Stream carries the new item image; the consuming Lambda reads NewImage.
stream_enabled = true
stream_view_type = "NEW_IMAGE"
attribute {
name = "filename"
type = "S"
}
}
# Connects the table's stream to the Lambda, starting at the latest position.
resource "aws_lambda_event_source_mapping" "allow_dynamodb_table_to_trigger_lambda" {
event_source_arn = aws_dynamodb_table.filenames.stream_arn
function_name = aws_lambda_function.trigger_stepfunction_lambda.arn
starting_position = "LATEST"
}
New entries in the DynamoDB table trigger another Lambda function which contains this:
def parse_file_info_from_trigger(event):
    """Extract the file attributes from the first record of a stream event.

    Returns:
        Tuple ``(filename, filetype, unixtimestamp, masterclient,
        source_bucket_name, filekey)``; ``filename`` comes from the record's
        ``Keys``, the rest from its ``NewImage``. All are read from the "S"
        entry of the attribute.
    """
    change = event['Records'][0]['dynamodb']
    filename = change['Keys']['filename']['S']
    image = change['NewImage']
    image_fields = ('filetype', 'unixtimestamp', 'masterclient',
                    'source_bucket_name', 'filekey')
    return (filename,) + tuple(image[field]['S'] for field in image_fields)
# Act on a stream event; only the first record's eventName is inspected.
def start_step_function(event, state_machine_zip_files_arn):
if event['Records'][0]['eventName'] == 'INSERT':
# Pull the file attributes out of the event.
filename, filetype, unixtimestamp, masterclient, source_bucket_name, filekey = parse_file_info_from_trigger(event)
# (the rest of the INSERT branch is elided in the original post)
......
else:
logger.info(f'This is not an Insert event')
However, the costs for this process are extremely high. If I start testing with a single file loaded into S3, the overall DynamoDB costs for that day were $0.785. If I do it for around 50 files for a day, that would mean my total costs per day are 40$, which seems too high if we want to run the workflow on a daily basis.
Am I doing something wrong? Or is DynamoDB generally expensive? If it's the latter, then what part exactly is costing so much? Or is it because put_filename_in_db is running in a loop?
The data_get function retrieves a value from a nested array or object using "dot" notation:
$data = ['products' => ['desk' => ['price' => 100]]];
$price = data_get($data, 'products.desk.price');
// 100
More detail in Laravel Doc
I wrote this function to do the same in Python:
def get_data(data, dot_path, default=None):
    """Retrieve a value from nested dicts/lists/tuples using "dot" notation.

    Each segment of ``dot_path`` is used as a dict key, or converted to an
    integer index when the current value is a list or tuple.

    Args:
        data: the nested structure to read from.
        dot_path: path such as ``'products.desk.price'`` or ``'items.0.id'``.
        default: returned when the path cannot be resolved, or when the
            resolved value is None.

    Returns:
        The value found at ``dot_path``, or ``default``. Falsy values that
        are not None (0, '', False, empty containers) are returned as found.
    """
    result = data
    for segment in dot_path.split('.'):
        try:
            if isinstance(result, dict):
                result = result[segment]
            elif isinstance(result, (list, tuple)):
                # Sequences need an integer index, not the raw string.
                result = result[int(segment)]
            else:
                # Cannot descend into a scalar.
                return default
        except (KeyError, IndexError, ValueError):
            return default
    return default if result is None else result
I have downloaded an archive of mails from my gmail account. I am using the following python(2.7) code taken from a blog to convert the contents of the archive to csv.
import mailbox
import csv
# Python 2.7: the csv module expects the output file in binary mode.
# The with-block closes (and so flushes) the file when the loop ends.
with open("clean_mail.csv", "wb") as csv_file:
    writer = csv.writer(csv_file)
    for message in mailbox.mbox('archive.mbox'):
        writer.writerow([message['subject'], message['from'], message['date']])
I want to include the body of the mail(the actual messages) too...but couldn't figure out how. I have not used python earlier, can someone help please? I have used other SO options given but couldn't get through.
To do the same task I have also tried the following code, but I get an indentation error on line 60 (return json_msg). I have tried different indentation options, but nothing helped.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
MBOX = 'Users/mymachine/client1/Takeout/Mail/archive.mbox'
OUT_FILE = 'Users/mymachine/client1/Takeout/Mail/archive.mbox.json'
def cleanContent(msg):
    """Return the text nodes of a quoted-printable encoded HTML fragment.

    The input is quoted-printable decoded and then parsed with
    BeautifulSoup. An empty string is returned when parsing fails.
    """
    msg = quopri.decodestring(msg)
    try:
        soup = BeautifulSoup(msg)
    except Exception:
        # Best effort: unparsable content is dropped. Unlike a bare except,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return ''
    return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise unsupported objects as lists."""

    def default(self, o):
        """Materialize ``o`` (e.g. a generator) into a list."""
        items = list(o)
        return items
def gen_json_msgs(mb):
    """Yield the JSON-ready form of each message until ``mb.next()`` is None."""
    msg = mb.next()
    while msg is not None:
        yield jsonifyMessage(msg)
        msg = mb.next()
def jsonifyMessage(msg):
    """Convert an email message into a JSON-serializable dict.

    The result holds every header (decoded as UTF-8, undecodable bytes
    ignored), a ``parts`` list with the content type and cleaned content of
    each non-multipart part, the To/Cc/Bcc headers split into lists, and
    ``Date`` rewritten as ``{'$date': <milliseconds>}``.
    """
    json_msg = {'parts': []}
    for (header, value) in msg.items():
        json_msg[header] = value.decode('utf-8', 'ignore')

    # Address headers may hold several comma-separated entries.
    for header in ['To', 'Cc', 'Bcc']:
        if json_msg.get(header):
            stripped = json_msg[header].replace('\n', '').replace('\t', '')\
                .replace('\r', '').replace(' ', '')
            json_msg[header] = stripped.decode('utf-8', 'ignore').split(',')

    for part in msg.walk():
        # Multipart containers carry no content of their own.
        if part.get_content_maintype() == 'multipart':
            continue
        content_type = part.get_content_type()
        raw = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_msg['parts'].append({
            'contentType': content_type,
            'content': cleanContent(raw),
        })

    # Replace the Date header by milliseconds derived from the parsed date.
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date' : millis}
    return json_msg
# Both files are closed when the block exits, even if a message fails to
# convert part-way through.
with open(MBOX, 'rb') as mbox_file, open(OUT_FILE, 'w') as f:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    # Write one JSON document per line.
    for msg in gen_json_msgs(mbox):
        if msg is not None:
            f.write(json.dumps(msg, cls=Encoder) + '\n')
Try this.
import mailbox
import csv
# Python 2.7: the csv module expects the output file in binary mode.
with open("clean_mail.csv", "wb") as csv_file:
    writer = csv.writer(csv_file)
    for message in mailbox.mbox('archive.mbox'):
        # walk() visits the message and all nested parts, so this covers
        # plain messages as well as multipart messages of any depth.
        content = ''.join(
            part.get_payload()
            for part in message.walk()
            if not part.is_multipart()
        )
        writer.writerow([message['subject'], message['from'], message['date'], content])
or this:
import mailbox
import csv
def get_message(message):
    """Return the body of ``message`` as text.

    A non-multipart message yields its payload unchanged. For a multipart
    message the payload of every direct part is converted with ``str`` and
    followed by a newline.
    """
    if message.is_multipart():
        return "".join(
            str(part.get_payload()) + '\n' for part in message.get_payload()
        )
    return message.get_payload()
if __name__ == "__main__":
    # Python 2.7: the csv module expects the output file in binary mode.
    # The with-block closes (and so flushes) the file when the loop ends.
    with open("clean_mail.csv", "wb") as csv_file:
        writer = csv.writer(csv_file)
        for message in mailbox.mbox("archive.mbox"):
            contents = get_message(message)
            writer.writerow([message["subject"], message["from"], message["date"], contents])
Find the documentation here.
A small improvement on Rahul's snippet to handle multipart content:
import sys
import mailbox
import csv
from email.header import decode_header
# Input mbox path and output CSV path are taken from the command line.
infile = sys.argv[1]
outfile = sys.argv[2]
# NOTE(review): the file object passed to csv.writer is not bound to a name,
# so nothing here closes it explicitly.
writer = csv.writer(open(outfile, "w"))
def get_content(part):
    """Return the concatenated text payload of ``part`` and all sub-parts.

    A string payload is returned as is; any other payload is treated as an
    iterable of parts whose contents are gathered recursively, in order.
    """
    payload = part.get_payload()
    if isinstance(payload, str):
        return payload
    return ''.join(get_content(child) for child in payload)
writer.writerow(['date', 'from', 'to', 'subject', 'content'])
for message in mailbox.mbox(infile):
    content = get_content(message)
    # Header lookups return None for a missing header; fall back to '' so a
    # message without From or Subject does not abort the whole export.
    sender = message['from'] or ''
    subject = message['subject'] or ''
    row = [
        message['date'],
        # Keep only the address from a "Name <address>" value.
        sender.strip('>').split('<')[-1],
        message['to'],
        decode_header(subject)[0][0],
        content,
    ]
    writer.writerow(row)
This is roughly what I'm trying to do:
# On a POST containing 'add_vehicle', append one more form to a valid formset.
def post(request):
VehicleFormSet = formset_factory(StaffVehicleForm)
if request.method == 'POST':
vehicle_formset = VehicleFormSet(request.POST)
if 'add_vehicle' in request.POST:
if vehicle_formset.is_valid():
# The new form is built at index total_form_count() via the formset's
# private _construct_form helper.
form_count = vehicle_formset.total_form_count()
vehicle_formset.forms.append(vehicle_formset._construct_form(form_count))
Basically, if a user clicks the "Add" button and their entry is valid, I want to add another blank form to the formset, and hide the previous one.
The problem with the code above is that I can't figure out how to increase total_form_count(). The way I have it now, it will work once, and then if you press it again, nothing will happen, presumably because form_count is the same. I also don't like calling _construct_form and relying on the internals.
class RequiredFormSet(BaseFormSet):
    """Formset whose forms may not be left empty and that can grow by one."""

    def add_form(self, **kwargs):
        """Append one unbound form and bump the management form counters."""
        # Construct the extra form at the next free index.
        next_index = self.total_form_count()
        new_form = self._construct_form(next_index, **kwargs)
        self.forms.append(new_form)
        self.forms[next_index].is_bound = False

        # The submitted data is replaced by a copy so it can be modified.
        self.data = self.data.copy()

        # Raise both hidden counters by one.
        prefix = self.management_form.prefix
        total_count_name = '{}-{}'.format(prefix, TOTAL_FORM_COUNT)
        initial_count_name = '{}-{}'.format(prefix, INITIAL_FORM_COUNT)
        self.data[total_count_name] = (
            self.management_form.cleaned_data[TOTAL_FORM_COUNT] + 1
        )
        self.data[initial_count_name] = (
            self.management_form.cleaned_data[INITIAL_FORM_COUNT] + 1
        )

    def add_fields(self, form, index):
        """Add the default fields, then forbid the form from being empty."""
        super(RequiredFormSet, self).add_fields(form, index)
        form.empty_permitted = False
That will do it. Only took 7 hours to figure out. And I still don't know why I need .is_bound = False to make the initial values not screw up.
I do this using javascript. Since the formset renders three management fields
<input type="hidden" id="id_TOTAL_FORMS" value="1" name="TOTAL_FORMS">
<input type="hidden" id="id_INITIAL_FORMS" value="1" name="INITIAL_FORMS">
<input type="hidden" id="id_MAX_NUM_FORMS" name="MAX_NUM_FORMS">
you can use javascript to increment the id_TOTAL_FORMS value, and just add in the extra fields. So I'd create my fieldset like this:
VehicleFormSet = modelformset_factory(StaffVehicleForm, extra = 0, max_num = None)
The tricky thing is to create the extra form fields in javascript. I usually use AJAX to fetch a new row from a custom view.
For posterity here is another way which works without JS (or alongside JS) and which does not require intimate knowledge of formset methods. Instead, you can just inspect the POST data and adjust it as if JS had done some work client-side. The following makes sure that there is always (at least) one empty form at the end of the formset:
def hsview( request):
    """Keep at least one empty form at the end of the formset, without JS.

    On POST the management data is inspected: when the last submitted form
    has a non-empty value in the probe field, TOTAL_FORMS is raised by one
    before the formset is bound, so one more blank form is rendered.
    """
    HS_formset = formset_factory( HSTestForm, extra=3 )
    prefix='XYZZY'
    testinpost, empty = 'key', ''  # field in the form and its default/empty value

    # I prefer to do the short init of unbound forms first, so I invert the usual test ...
    if request.method != 'POST':
        formset = HS_formset( prefix=prefix)
    else:
        # process POSTed forms data.
        # pull all relevant things out of POST data, because POST itself is not mutable
        # (it doesn't matter if prefix allows in extraneous items)
        data = { k:v for k,v in request.POST.items() if k.startswith(prefix) }

        # if there are no spare empty forms, tell it we want another form,
        # in place of or extra to client-side JS
        total_key = prefix + '-TOTAL_FORMS'
        try:
            n = int( data[total_key])
            last_value = data.get(
                '{}-{}-{}'.format(prefix, n-1, testinpost), empty)
        except (KeyError, TypeError, ValueError):
            # Unvalidated POST data may be missing or malformed. Leave it
            # untouched (n is undefined here) and let formset validation
            # report the problem.
            # log the error if it matters enough ...
            pass
        else:
            if last_value != empty:
                data[total_key] = n + 1

        # now the usual formset processing ...
        formset = HS_formset( data, prefix=prefix)
        # other_form = OtherForm( request.POST)
        if formset.is_valid():
            ...
I use RegEx in my Vue.js method:
addForm: function () {
this.count++
let form_count = this.count
form_count++
let formID = 'id_form-' + this.count
incremented_form = this.vue_form.replace(/form-\d/g, 'form-' + this.count)
this.formList.push(incremented_form)
this.$nextTick(() => {
let total_forms = document.getElementsByName('form-TOTAL_FORMS').forEach
(function (ele, idx) {
ele.value = form_count
})
})
},
delForm: function () {
if (this.count != 0) {
this.count--
let form_count = this.count
form_count++
let formID = 'id_form-' + this.count
this.formList.pop()
this.$nextTick(() => {
let total_forms = document.getElementsByName('form-TOTAL_FORMS').forEach
(function (ele, idx) {
ele.value = form_count
})
})
}
else return
},