How to use dagster with great expectations? - great-expectations

The issue
I'm trying out great expectations with dagster, as per this guide
My pipeline seems to execute correctly until it reaches this block:
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='dev.data-pipeline-data-storage.data_pipelines.raw_data.sirene_update',
suite_name='suite.data_pipelines.raw_data.sirene_update',
)
if expectation["success"]:
print("Success")
trying to call expectation["success"] results in a
# TypeError: 'SolidDefinition' object is not subscriptable
When I go inside the code of ge_validation_op_factory, there is a _ge_validation_fn that should yield ExpectationResult, but somehow it gets coverted into a SolidDefinition...
Dagster version = 0.15.9;
Great Expectations version = 0.15.44
Code to reproduce the error
In my code, I am trying to interact with an s3 bucket, so it would be a bit tedious to re-create the code for my example but here it is anyway:
In a gx_postprocessing.py
import json
import boto3
import dagster_ge
from dagster import (
op,
graph,
Field,
String,
OpExecutionContext,
)
from typing import List, Dict
#op(
config_schema={
"bucket": Field(
String,
description="s3 bucket name",
),
"path_in_s3": Field(
String,
description="Prefix representing the path to data",
),
"technical_date": Field(
String,
description="date string to fetch data",
),
"file_name": Field(
String,
description="file name that contains the data",
),
}
)
def read_in_json_datafile_from_s3(context: OpExecutionContext):
bucket = context.op_config["bucket"]
path_in_s3 = context.op_config["path_in_s3"]
technical_date = context.op_config["technical_date"]
file_name = context.op_config["file_name"]
object = f"{path_in_s3}/" f"technical_date={technical_date}/" f"{file_name}"
s3 = boto3.resource("s3")
content_object = s3.Object(bucket, object)
file_content = content_object.get()["Body"].read().decode("utf-8")
json_content = json.loads(file_content)
return json_content
#op
def process_example_dq(data: List[Dict]):
return len(data)
#op
def postprocess_example_dq(numrows, expectation):
if expectation["success"]:
return numrows
else:
raise ValueError
#op
def validate_example_dq(context: OpExecutionContext):
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
return expectation
#graph(
config={
"read_in_json_datafile_from_s3": {
"config": {
"bucket": "my_bucket",
"path_in_s3": "my_path",
"technical_date": "2023-01-24",
"file_name": "myfile_20230124.json",
}
},
},
)
def example_update_evaluation():
output_dict = read_in_json_datafile_from_s3()
nb_items = process_example_dq(data=output_dict)
expectation = validate_example_dq()
postprocess_example_dq(
numrows=nb_items,
expectation=expectation,
)
Do not forget to add great_expectations_poc_pipeline to your __init__.py where the pipelines=[..] are listed.

In this example, dagster_ge.ge_validation_op_factory(...) is returning an OpDefinition, which is the same type of thing as (for example) process_example_dq, and should be composed in the graph definition the same way, rather than invoked within another op.
So instead, you'd want to have something like:
validate_example_dq = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
Then use that op inside your graph definition the same way you currently are (i.e. expectation = validate_example_dq())

Related

Display name is not reflected on Vertex pipelines UI

I have a job that uses CustomContainerTrainingJobRunOp with example code
def custom_job(task_name, env, machine_type):
from google_cloud_pipeline_components.aiplatform import (
CustomContainerTrainingJobRunOp,
)
op = CustomContainerTrainingJobRunOp(
project=PROJECT_ID,
display_name=task_name,
location=REGION,
machine_type=machine_type,
replica_count=1,
container_uri=IMAGE_URI,
staging_bucket=STAGING,
)
return op
#dsl.pipeline(
name="name-of-the-pipeline", # This is ok
description="some job",
pipeline_root=PIPELINE_ROOT,
)
def pipeline(project_id: str = PROJECT_ID, gcp_region: str = REGION):
bq_task = custom_job("setup_bq", "stg")
compiler.Compiler().compile(pipeline_func=pipeline, package_path='file.json')
In the UI, the job name is shown as

Error when trying to use CustomPythonPackageTrainingJobRunOp in VertexAI pipeline

I am using the google cloud pipeline component CustomPythonPackageTrainingJobRunOp in a VertexAI pipeline . I have been able to run this package successfully as a CustomTrainingJob before. I can see multiple (11) error messages in the logs but the only one that seems to make sense to me is, "ValueError: too many values to unpack (expected 2) " but I am unable to figure out the solution. I can add all the other error messages too if required. I am logging some messages at the start of the training code so I know the errors happen before the training code is executed. I am completely stuck on this. Links to samples where someone has used CustomPythonPackageTrainingJobRunOp in a pipeline would very helpful as well. Below is the pipeline code that I am trying to execute:
import kfp
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
from google_cloud_pipeline_components import aiplatform as gcc_aip
#kfp.dsl.pipeline(name=pipeline_name)
def pipeline(
project: str = "adsfafs-321118",
location: str = "us-central1",
display_name: str = "vertex_pipeline",
python_package_gcs_uri: str = "gs://vertex/training/training-package-3.0.tar.gz",
python_module_name: str = "trainer.task",
container_uri: str = "us-docker.pkg.dev/vertex-ai/training/scikit-learn-cpu.0-23:latest",
staging_bucket: str = "vertex_bucket",
base_output_dir: str = "gs://vertex_artifacts/custom_training/"
):
gcc_aip.CustomPythonPackageTrainingJobRunOp(
display_name=display_name,
python_package_gcs_uri=python_package_gcs_uri,
python_module=python_module_name,
container_uri=container_uri,
project=project,
location=location,
staging_bucket=staging_bucket,
base_output_dir=base_output_dir,
args = ["--arg1=val1", "--arg2=val2", ...]
)
compiler.Compiler().compile(
pipeline_func=pipeline, package_path=package_path
)
api_client = AIPlatformClient(project_id=project_id, region=region)
response = api_client.create_run_from_job_spec(
package_path,
pipeline_root=pipeline_root_path
)
In the documentation for CustomPythonPackageTrainingJobRunOp, the type of the argument "python_module" seems to be "google.cloud.aiplatform.training_jobs.CustomPythonPackageTrainingJob" instead of string, which seems odd. However, I tried to re-define the pipeline, where I have replaced argument python_module in CustomPythonPackageTrainingJobRunOp with a CustomPythonPackageTrainingJob object instead of a string, as below but still getting the same error:
def pipeline(
project: str = "...",
location: str = "...",
display_name: str = "...",
python_package_gcs_uri: str = "...",
python_module_name: str = "...",
container_uri: str = "...",
staging_bucket: str = "...",
base_output_dir: str = "...",
):
job = aiplatform.CustomPythonPackageTrainingJob(
display_name= display_name,
python_package_gcs_uri=python_package_gcs_uri,
python_module_name=python_module_name,
container_uri=container_uri,
staging_bucket=staging_bucket
)
gcc_aip.CustomPythonPackageTrainingJobRunOp(
display_name=display_name,
python_package_gcs_uri=python_package_gcs_uri,
python_module=job,
container_uri=container_uri,
project=project,
location=location,
base_output_dir=base_output_dir,
args = ["--arg1=val1", "--arg2=val2", ...]
)
Edit:
Added the args that I was passing and had forgotten to add here.
Turns out that the way I was passing the args to the python module was incorrect. Instead of args = ["--arg1=val1", "--arg2=val2", ...], you need to specify args = ["--arg1", val1, "--arg2", val2, ...]

Scheduling tasks with dynamic arguments to run periodically with celery in django

I have a tasks that contains dynamic arguments I want to run periodically, how do I pass dynamic elements to the tasks arguments when the task is being called in django celery beat?
Here is the task I want to run periodically:
#task(bind=True)
def generate_export(export_type, xform, export_id=None, options=None):
"""
Create appropriate export object given the export type.
param: export_type
param: xform
params: export_id: ID of export object associated with the request
param: options: additional parameters required for the lookup.
binary_select_multiples: boolean flag
end: end offset
ext: export extension type
dataview_pk: dataview pk
group_delimiter: "/" or "."
query: filter_query for custom queries
remove_group_name: boolean flag
split_select_multiples: boolean flag
index_tag: ('[', ']') or ('_', '_')
show_choice_labels: boolean flag
language: language labels as in the XLSForm/XForm
"""
username = xform.user.username
id_string = xform.id_string
end = options.get("end")
extension = options.get("extension", export_type)
filter_query = options.get("query")
remove_group_name = options.get("remove_group_name", False)
start = options.get("start")
export_type_func_map = {
Export.XLS_EXPORT: 'to_xls_export',
Export.CSV_EXPORT: 'to_flat_csv_export',
Export.DHIS2CSV_EXPORT: 'to_dhis2csv_export',
Export.CSV_ZIP_EXPORT: 'to_zipped_csv',
Export.SAV_ZIP_EXPORT: 'to_zipped_sav',
Export.GOOGLE_SHEETS_EXPORT: 'to_google_sheets',
}
if xform is None:
xform = XForm.objects.get(
user__username__iexact=username, id_string__iexact=id_string)
dataview = None
if options.get("dataview_pk"):
dataview = DataView.objects.get(pk=options.get("dataview_pk"))
records = dataview.query_data(dataview, all_data=True,
filter_query=filter_query)
total_records = dataview.query_data(dataview,
count=True)[0].get('count')
else:
records = query_data(xform, query=filter_query, start=start, end=end)
if filter_query:
total_records = query_data(xform, query=filter_query, start=start,
end=end, count=True)[0].get('count')
else:
total_records = xform.num_of_submissions
if isinstance(records, QuerySet):
records = records.iterator()
export_builder = ExportBuilder()
export_builder.TRUNCATE_GROUP_TITLE = True \
if export_type == Export.SAV_ZIP_EXPORT else remove_group_name
export_builder.GROUP_DELIMITER = options.get(
"group_delimiter", DEFAULT_GROUP_DELIMITER
)
export_builder.SPLIT_SELECT_MULTIPLES = options.get(
"split_select_multiples", True
)
export_builder.BINARY_SELECT_MULTIPLES = options.get(
"binary_select_multiples", False
)
export_builder.INCLUDE_LABELS = options.get('include_labels', False)
export_builder.INCLUDE_LABELS_ONLY = options.get(
'include_labels_only', False
)
export_builder.INCLUDE_HXL = options.get('include_hxl', False)
export_builder.INCLUDE_IMAGES \
= options.get("include_images", settings.EXPORT_WITH_IMAGE_DEFAULT)
export_builder.VALUE_SELECT_MULTIPLES = options.get(
'value_select_multiples', False)
export_builder.REPEAT_INDEX_TAGS = options.get(
"repeat_index_tags", DEFAULT_INDEX_TAGS
)
export_builder.SHOW_CHOICE_LABELS = options.get('show_choice_labels',
False)
export_builder.language = options.get('language')
# 'win_excel_utf8' is only relevant for CSV exports
if 'win_excel_utf8' in options and export_type != Export.CSV_EXPORT:
del options['win_excel_utf8']
export_builder.set_survey(xform.survey, xform)
# change the dhis2csv exports to standard csv format
if extension == 'dhis2csv':
extension = 'csv'
temp_file = NamedTemporaryFile(suffix=("." + extension))
columns_with_hxl = export_builder.INCLUDE_HXL and get_columns_with_hxl(
xform.survey_elements)
# get the export function by export type
func = getattr(export_builder, export_type_func_map[export_type])
try:
func.__call__(
temp_file.name, records, username, id_string, filter_query,
start=start, end=end, dataview=dataview, xform=xform,
options=options, columns_with_hxl=columns_with_hxl,
total_records=total_records
)
except NoRecordsFoundError:
pass
except SPSSIOError as e:
export = get_or_create_export(export_id, xform, export_type, options)
export.error_message = str(e)
export.internal_status = Export.FAILED
export.save()
report_exception("SAV Export Failure", e, sys.exc_info())
return export
# generate filename
basename = "%s_%s" % (
id_string, datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f"))
if remove_group_name:
# add 'remove group name' flag to filename
basename = "{}-{}".format(basename, GROUPNAME_REMOVED_FLAG)
if dataview:
basename = "{}-{}".format(basename, DATAVIEW_EXPORT)
filename = basename + "." + extension
# check filename is unique
while not Export.is_filename_unique(xform, filename):
filename = increment_index_in_filename(filename)
file_path = os.path.join(
username,
'exports',
id_string,
export_type,
filename)
# seek to the beginning as required by storage classes
temp_file.seek(0)
export_filename = default_storage.save(file_path,
File(temp_file, file_path))
temp_file.close()
dir_name, basename = os.path.split(export_filename)
# get or create export object
export = get_or_create_export(export_id, xform, export_type, options)
export.filedir = dir_name
export.filename = basename
export.internal_status = Export.SUCCESSFUL
# do not persist exports that have a filter
# Get URL of the exported sheet.
if export_type == Export.GOOGLE_SHEETS_EXPORT:
export.export_url = export_builder.url
# if we should create a new export is true, we should not save it
if start is None and end is None:
export.save()
return export
and this is where I call the tasks in the celery beat schedule:
CELERY_BEAT_SCHEDULE = {
'download_csv': {
'task': 'onadata.libs.utils.export_tools.generate_export',
# There are 4 ways we can handle time, read further
'schedule': crontab(minute='*'),
# If you're using any arguments
'args': ()
}
}
how do I pass parameters into the arguments for the tasks??
There is no way to pass argument dynamically in Celery Beat. I think your function is not suitable with periodic task.
Instead of giving a factor directly to the generate_export function, it must be changed to get the required items within the function. Or change to a simple asynchronous operation.
I faced a similar problem. The args field in beat_schedule is fixed at startup and does not change afterward.
But there is a hackish way to pass different arguments to your task.
Use the before_task_publish signal to add custom data in headers.
from celery.signals import before_task_publish
#before_task_publish.connect
def before_publish(sender=None, headers=None, body=None, **kwargs):
if sender == "tasks.generate_export":
headers["custom_args"] = {
"export_type": "some_val"
"xform": "some_val"
"export_id": get_export_id()
"options": options_dict
}
By default, Celery uses JSON serializer. So, make sure the data you add to headers are JSON serializable. Alternatively, you can use pickle to serialize the data, but it brings security concerns with it.
Now you can access these headers in a bound task.
#task(bind=True)
def generate_export(self):
args = self.request.get("custom_args", None)
# do something with args

in tastypie, how can i set name for json result

In tastypie, I want set json result name.
I have a class that I use for it but I can set name in.
enter cclass ContentResource(ModelResource):
class Meta:
results = ListField(attribute='results')
queryset = Content.objects.all()
resource_name = 'content'
max_limit = None
#filtering = {"title": "contains"}
def alter_list_data_to_serialize(self, request, data_dict):
if isinstance(data_dict, dict):
if 'meta' in data_dict:
# Get rid of the "meta".
del(data_dict['meta'])
# Rename the objects.
data_dict['Mobile'] = data_dict['objects']
del(data_dict['objects'])
return data_dict
ode here it returns this
{"Mobile":
[
{
"added": "2015-07-23T11:30:20.911835",
"content_cast": "",
"content_company": "HamrahCinema",
"content_description": "so nice",
"content_director": "",
"content_duration": "2:20",
"content_filelanguage": null,
}
]
}
when I use /content/api/content every thing is ok, but when I use /content/api/content/1,"mobile" is removed.
as educated guess, I would suggest using alter_detail_data_to_serialize

Mock Stripe Methods in Python for testing

So I am trying to mock all the stripe web hooks in the method so that I can write the Unit test for it. I am using the mock library for mocking the stripe methods. Here is the method I am trying to mock:
class AddCardView(APIView):
"""
* Add card for the customer
"""
permission_classes = (
CustomerPermission,
)
def post(self, request, format=None):
name = request.DATA.get('name', None)
cvc = request.DATA.get('cvc', None)
number = request.DATA.get('number', None)
expiry = request.DATA.get('expiry', None)
expiry_month, expiry_year = expiry.split("/")
customer_obj = request.user.contact.business.customer
customer = stripe.Customer.retrieve(customer_obj.stripe_id)
try:
card = customer.sources.create(
source={
"object": "card",
"number": number,
"exp_month": expiry_month,
"exp_year": expiry_year,
"cvc": cvc,
"name": name
}
)
# making it the default card
customer.default_source = card.id
customer.save()
except CardError as ce:
logger.error("Got CardError for customer_id={0}, CardError={1}".format(customer_obj.pk, ce.json_body))
return Response({"success": False, "error": "Failed to add card"})
else:
customer_obj.card_last_4 = card.get('last4')
customer_obj.card_kind = card.get('type', '')
customer_obj.card_fingerprint = card.get('fingerprint')
customer_obj.save()
return Response({"success": True})
This is the method for unit testing:
#mock.patch('stripe.Customer.retrieve')
#mock.patch('stripe.Customer.create')
def test_add_card(self,create_mock,retrieve_mock):
response = {
'default_card': None,
'cards': {
"count": 0,
"data": []
}
}
# save_mock.return_value = response
create_mock.return_value = response
retrieve_mock.return_value = response
self.api_client.client.login(username = self.username, password = self.password)
res = self.api_client.post('/biz/api/auth/card/add')
print res
Now stripe.Customer.retrieve is being mocked properly. But I am not able to mock customer.sources.create. I am really stuck on this.
This is the right way of doing it:
#mock.patch('stripe.Customer.retrieve')
def test_add_card_failure(self, retrieve_mock):
data = {
'name': "shubham",
'cvc': 123,
'number': "4242424242424242",
'expiry': "12/23",
}
e = CardError("Card Error", "", "")
retrieve_mock.return_value.sources.create.return_value = e
self.api_client.client.login(username=self.username, password=self.password)
res = self.api_client.post('/biz/api/auth/card/add', data=data)
self.assertEqual(self.deserialize(res)['success'], False)
Even though the given answer is correct, there is a way more comfortable solution using vcrpy. That is creating a cassette (record) once a given record does not exist yet. When it does, the mocking is done transparently and the record will be replayed. Beautiful.
Having a vanilla pyramid application, using py.test, my test now looks like this:
import vcr
# here we have some FactoryBoy fixtures
from tests.fixtures import PaymentServiceProviderFactory, SSOUserFactory
def test_post_transaction(sqla_session, test_app):
# first we need a PSP and a User existent in the DB
psp = PaymentServiceProviderFactory() # type: PaymentServiceProvider
user = SSOUserFactory()
sqla_session.add(psp, user)
sqla_session.flush()
with vcr.use_cassette('tests/casettes/tests.checkout.services.transaction_test.test_post_transaction.yaml'):
# with that PSP we create a new PSPTransaction ...
res = test_app.post(url='/psps/%s/transaction' % psp.id,
params={
'token': '4711',
'amount': '12.44',
'currency': 'EUR',
})
assert 201 == res.status_code
assert 'id' in res.json_body
IMO, the following method is better than the rest of the answers
import unittest
import stripe
import json
from unittest.mock import patch
from stripe.http_client import RequestsClient # to mock the request session
stripe.api_key = "foo"
stripe.default_http_client = RequestsClient() # assigning the default HTTP client
null = None
false = False
true = True
charge_resp = {
"id": "ch_1FgmT3DotIke6IEFVkwh2N6Y",
"object": "charge",
"amount": 1000,
"amount_captured": 1000,
"amount_refunded": 0,
"billing_details": {
"address": {
"city": "Los Angeles",
"country": "USA",
},
"email": null,
"name": "Jerin",
"phone": null
},
"captured": true,
}
def get_customer_city_from_charge(stripe_charge_id):
# this is our function and we are writing unit-test for this function
charge_response = stripe.Charge.retrieve("foo-bar")
return charge_response.billing_details.address.city
class TestStringMethods(unittest.TestCase):
#patch("stripe.default_http_client._session")
def test_get_customer_city_from_charge(self, mock_session):
mock_response = mock_session.request.return_value
mock_response.content.decode.return_value = json.dumps(charge_resp)
mock_response.status_code = 200
city_name = get_customer_city_from_charge("some_id")
self.assertEqual(city_name, "Los Angeles")
if __name__ == '__main__':
unittest.main()
Advantages of this method
You can generate the corresponding class objects (here, the charge_response variable is a type of Charge--(source code))
You can use the dot (.) operator over the response (as we can do with real stripe SDK)
dot operator support for deep attributes