MissingConfigVariableError while creating DataContext in Great Expectations - great-expectations

Unable to create a DataContext with the following configuration. I am trying to use a Databricks Spark DataFrame datasource and an in-house DB as store_backend_defaults, and I get a MissingConfigVariableError exception. Could someone explain what I am missing?
import great_expectations as ge
import great_expectations.exceptions as ge_exceptions
from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults, DatabaseStoreBackendDefaults
from great_expectations.data_context import BaseDataContext
my_spark_datasource_config = DatasourceConfig(
    class_name="Datasource",
    execution_engine={"class_name": "SparkDFExecutionEngine"},
    data_connectors={
        "sample_sparkdf_runtime_data_connector": {
            "module_name": "great_expectations.datasource.data_connector",
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": [
                "some_key_maybe_pipeline_stage",
                "some_other_key_maybe_run_id"
            ]
        }
    }
)
data_context_config = DataContextConfig(
    config_version=2,
    plugins_directory=None,
    config_variables_file_path=None,
    datasources={"my_spark_datasource": my_spark_datasource_config},
    store_backend_defaults=DatabaseStoreBackendDefaults(
        default_credentials={
            "drivername": "PrestoSQL",
            "host": "*****",
            "port": "443",
            "username": "*****",
            "password": "*****",
            "database": "****"
        }
    ),
    anonymous_usage_statistics={"enabled": False}
)
context = BaseDataContext(project_config=data_context_config)
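To narrow down where the exception comes from, one option (a minimal isolation sketch, not a confirmed fix) is to build the same context with the FilesystemStoreBackendDefaults already imported above; if the context builds cleanly with a filesystem backend, the problem is isolated to the DatabaseStoreBackendDefaults credentials block:

# Hedged isolation test: swap the in-house DB backend for a local
# filesystem backend; "/tmp/ge_root" is an arbitrary writable directory.
fs_context_config = DataContextConfig(
    config_version=2,
    plugins_directory=None,
    config_variables_file_path=None,
    datasources={"my_spark_datasource": my_spark_datasource_config},
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory="/tmp/ge_root"
    ),
    anonymous_usage_statistics={"enabled": False},
)
context = BaseDataContext(project_config=fs_context_config)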

Related

Django/Wagtail Rest Framework API Ordering/Pagination

I am using Wagtail. I have serialized my API. I want to order it by -first_published_at, so that when someone hits my API URL api/v2/pages they see an ordered API without filtering via the URL. Here is my api.py code:
class ProdPagesAPIViewSet(BaseAPIViewSet):
    renderer_classes = [JSONRenderer]
    filter_backends = [
        FieldsFilter,
        ChildOfFilter,
        AncestorOfFilter,
        DescendantOfFilter,
        OrderingFilter,
        TranslationOfFilter,
        LocaleFilter,
        SearchFilter,
    ]
    meta_fields = ["type", "seo_title", "search_description", "first_published_at"]
    body_fields = ["id", "type", "seo_title", "search_description", "first_published_at", "title"]
    listing_default_fields = ["type", "seo_title", "search_description", "first_published_at", "id", "title", "alternative_title", "news_slug", "blog_image", "video_thumbnail", "categories", "blog_authors", "excerpt", "content", "content2", "tags"]
    nested_default_fields = []
    ordered_queryset = []
    name = "pages"
    model = AddStory

api_router.register_endpoint("pages", ProdPagesAPIViewSet)
I have tried ordered_queryset = [AddStory.objects.order_by('-first_published_at')], but the result is not ordered by the newest published stories. How should I write the query?
Here is my API response:
{
    "meta": {
        "total_count": 6
    },
    "items": [
        {
            "id": 4,
            "meta": {
                "type": "blog.AddStory",
                "seo_title": "",
                "search_description": "",
                "first_published_at": "2022-08-30T11:05:11.341355Z"
            }
        },
        {
            "id": 6,
            "meta": {
                "type": "blog.AddStory",
                "seo_title": "",
                "search_description": "",
                "first_published_at": "2022-08-30T11:13:47.114889Z"
            }
        },
        {
            "id": 7,
            "meta": {
                "type": "blog.AddStory",
                "seo_title": "",
                "search_description": "",
                "first_published_at": "2022-08-31T11:13:47.114889Z"
            }
        },
Solved after using get_queryset instead of ordered_queryset:
# api.py
class ProdPagesAPIViewSet(BaseAPIViewSet):
    renderer_classes = [JSONRenderer]
    filter_backends = [
        FieldsFilter,
        ChildOfFilter,
        AncestorOfFilter,
        DescendantOfFilter,
        OrderingFilter,
        TranslationOfFilter,
        LocaleFilter,
        SearchFilter,
    ]
    meta_fields = ["type", "seo_title", "search_description", "first_published_at"]
    body_fields = ["id", "type", "seo_title", "search_description", "first_published_at", "title"]
    listing_default_fields = ["type", "seo_title", "search_description", "first_published_at", "id", "title", "alternative_title", "news_slug", "blog_image", "video_thumbnail", "categories", "blog_authors", "excerpt", "content", "content2", "tags"]
    nested_default_fields = []

    def get_queryset(self):
        return self.model.objects.all().order_by("-first_published_at")

    name = "pages"
    model = AddStory

api_router.register_endpoint("pages", ProdPagesAPIViewSet)

Get Dimensions for USAGE_TYPE AWS Boto3 CostExplorer Client

I'm trying to get costs using the CostExplorer client in boto3, but I can't find the values to use as a Dimensions filter. The documentation says we can extract those values with GetDimensionValues, but how do I use GetDimensionValues?
response = client.get_cost_and_usage(
    TimePeriod={
        'Start': str(start_time).split()[0],
        'End': str(end_time).split()[0]
    },
    Granularity='DAILY',
    Filter={
        'Dimensions': {
            'Key': 'USAGE_TYPE',
            'Values': [
                'DataTransfer-In-Bytes'
            ]
        }
    },
    Metrics=[
        'NetUnblendedCost',
    ],
    GroupBy=[
        {
            'Type': 'DIMENSION',
            'Key': 'SERVICE'
        },
    ]
)
The boto3 reference for GetDimensionValues has a lot of details on how to use that call. Here's some sample code you might use to print out possible dimension values:
response = client.get_dimension_values(
    TimePeriod={
        'Start': '2022-01-01',
        'End': '2022-06-01'
    },
    Dimension='USAGE_TYPE',
    Context='COST_AND_USAGE',
)
for dimension_value in response["DimensionValues"]:
    print(dimension_value["Value"])
Output:
APN1-Catalog-Request
APN1-DataTransfer-Out-Bytes
APN1-Requests-Tier1
APN2-Catalog-Request
APN2-DataTransfer-Out-Bytes
APN2-Requests-Tier1
APS1-Catalog-Request
APS1-DataTransfer-Out-Bytes
.....
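Note that GetDimensionValues is paginated, so with many usage types the list above may be only the first page. A minimal sketch of walking all pages (assuming the same client as above) could look like this:

# Hedged sketch: keep requesting pages until no NextPageToken is returned.
kwargs = {
    'TimePeriod': {'Start': '2022-01-01', 'End': '2022-06-01'},
    'Dimension': 'USAGE_TYPE',
    'Context': 'COST_AND_USAGE',
}
while True:
    response = client.get_dimension_values(**kwargs)
    for dimension_value in response['DimensionValues']:
        print(dimension_value['Value'])
    token = response.get('NextPageToken')
    if not token:
        break
    kwargs['NextPageToken'] = token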

IndexError: list index out of range with moto

I am mocking an internal function which returns a DynamoDB query. The query had begins_with, which was throwing IndexError: list index out of range. I changed the query and removed begins_with, yet I am still getting the same error. If I remove the And condition from the KeyConditionExpression, then the query works.
Below is the query:
val = 'test#val#testing'
input_query = {
    'TableName': <table_name>,
    'KeyConditionExpression': '#23b62 = :23b62 And #23b63 = :23b63)',
    'FilterExpression': 'contains(#23b64, :23b64)',
    'ProjectionExpression': '#23b60,#23b61',
    'ExpressionAttributeNames': {'#23b60': 'level', '#23b61': 'test_id', '#23b62': 'PK', '#23b63': 'SK', '#23b64': 'used_in'},
    'ExpressionAttributeValues': {':23b62': {'S': 'testing'}, ':23b63': {'S': val}, ':23b64': {'S': 'test'}}
}
New query:
dynamodb_client.query(
    TableName="table",
    KeyConditionExpression="#PK = :PK And #SK = :SK",
    ExpressionAttributeNames={
        "#PK": "PK",
        "#SK": "SK"
    },
    FilterExpression="contains(Used, :used)",
    ExpressionAttributeValues={
        ":PK": {"S": "tests"},
        ":SK": {"S": "test#en#testing"},
        ":used": {"S": "testing"}
    }
)
Test case:
from botocore.exceptions import ClientError
from dynamodb_json import json_util as dynamodb_json
import logging
from contextlib import contextmanager
import pytest
from unittest.mock import patch

logger = logging.getLogger(__name__)  # used below but never defined in the original paste

@contextmanager
def ddb_setup(dynamodb_resource):
    table = dynamodb_resource.create_table(
        TableName='table',
        KeySchema=[
            {
                'AttributeName': 'PK',
                'KeyType': 'HASH'
            }, {
                'AttributeName': 'SK',
                'KeyType': 'SORT'
            },
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'PK',
                'AttributeType': 'S'
            }, {
                'AttributeName': 'SK',
                'AttributeType': 'S'
            },
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 1,
            'WriteCapacityUnits': 1,
        }
    )
    yield

class TestDynamoDB:
    def test_create_table(self, dynamodb_resource, dynamodb_client):
        with ddb_setup(dynamodb_resource):
            try:
                response = dynamodb_client.describe_table(TableName='table')
                resp = dynamodb_client.query(
                    TableName="table",
                    KeyConditionExpression="#PK = :PK And #SK = :SK",
                    ExpressionAttributeNames={
                        "#PK": "PK",
                        "#SK": "SK"
                    },
                    FilterExpression="contains(Used, :used)",
                    ExpressionAttributeValues={
                        ":PK": {"S": "tests"},
                        ":SK": {"S": "test#en#testing"},
                        ":used": {"S": "testing"}
                    }
                )
            except ClientError as err:
                logger.error(f"error: {err.response['Error']['Code']}")
                assert err.response['Error']['Code'] == 'ResourceNotFoundException'
Could anyone suggest how I can run this query with moto with the And condition?
Here is an example of a working test configuration using pytest and moto. I've added code that shows how to use the AND condition using the resource and client API.
import boto3
import boto3.dynamodb.conditions as conditions
import moto
import pytest

TABLE_NAME = "data"

@pytest.fixture
def test_table():
    with moto.mock_dynamodb():
        client = boto3.client("dynamodb")
        client.create_table(
            AttributeDefinitions=[
                {"AttributeName": "PK", "AttributeType": "S"},
                {"AttributeName": "SK", "AttributeType": "S"}
            ],
            TableName=TABLE_NAME,
            KeySchema=[
                {"AttributeName": "PK", "KeyType": "HASH"},
                {"AttributeName": "SK", "KeyType": "RANGE"}
            ],
            BillingMode="PAY_PER_REQUEST"
        )
        table = boto3.resource("dynamodb").Table(TABLE_NAME)
        table.put_item(Item={
            "PK": "pk_value",
            "SK": "sk_value"
        })
        yield TABLE_NAME

def test_query_with_and_using_resource(test_table):
    table = boto3.resource("dynamodb").Table(TABLE_NAME)
    response = table.query(
        KeyConditionExpression=conditions.Key("PK").eq("pk_value") & conditions.Key("SK").eq("sk_value")
    )
    assert len(response["Items"]) == 1

def test_query_with_and_using_client(test_table):
    client = boto3.client("dynamodb")
    response = client.query(
        TableName=TABLE_NAME,
        KeyConditionExpression="#PK = :PK AND #SK = :SK",
        ExpressionAttributeNames={
            "#PK": "PK",
            "#SK": "SK"
        },
        ExpressionAttributeValues={
            ":PK": {"S": "pk_value"},
            ":SK": {"S": "sk_value"}
        }
    )
    assert len(response["Items"]) == 1
First, we set up a table with a dummy item, and then there are two tests: the first uses the resource API and the second the client API. Maybe this helps you figure out the mistake.
AWS uses the keyword RANGE to indicate that something is a sort key. (No idea why..)
If you replace:
'KeyType': 'SORT'
with
'KeyType': 'RANGE'
the test passes.
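In other words, the corrected KeySchema in the fixture (only the KeyType of the second key changes) would read:

KeySchema=[
    {'AttributeName': 'PK', 'KeyType': 'HASH'},   # partition key
    {'AttributeName': 'SK', 'KeyType': 'RANGE'},  # sort key
],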
I'm assuming that real AWS throws a more obvious error when creating a table with an unknown KeyType. If you want, you can create a feature request on Moto's GitHub for Moto to replicate that behaviour and throw the same exception.

Flatten json return by DRF

I have a JSON API returned in the format below, but I want it to return JSON decomposing the namingzone key as specified under Target. Could anyone tell me how I can revise the serializer to achieve this? serializer.py is also specified below. For models.py and views.py, please refer to my previous post.
Current:
{
    "zone": {
        "zone": "office_enclosed",
        "namingzone": [
            {
                "naming": "moffice"
            }
        ]
    },
    "lpd": 11.9,
    "sensor": true
},
{
    "zone": {
        "zone": "office_open",
        "namingzone": [
            {
                "naming": "off"
            },
            {
                "naming": "office"
            }
        ]
    },
    "lpd": 10.5,
    "sensor": true
}
Target:
{
    "zone": "office_enclosed",
    "naming": "moffice",
    "lpd": 11.9,
    "sensor": true
},
{
    "zone": "office_open",
    "naming": "off",
    "lpd": 10.5,
    "sensor": true
},
{
    "zone": "office_open",
    "naming": "office",
    "lpd": 10.5,
    "sensor": true
}
serializer.py
class namingNewSerializer(serializers.ModelSerializer):
    class Meta:
        model = Naming
        fields = ('naming',)

class zoneSerializer(serializers.ModelSerializer):
    namingzone = namingNewSerializer(many=True)

    class Meta:
        model = Zone
        fields = ('zone', 'namingzone')

class lightSerializer(serializers.ModelSerializer):
    zone = zoneSerializer()

    class Meta:
        model = Light
        fields = ('zone', 'lpd', 'sensor')

class namingSerializer(serializers.ModelSerializer):
    zone = zoneSerializer()

    class Meta:
        model = Naming
        fields = ('zone', 'naming')
I would say using a Serializer might complicate the implementation. Rather, you can take a pythonic approach. Try like this:
class SomeView(APIView):
    ...
    def get(self, request, *args, **kwargs):
        data = lightSerializer(Light.objects.all(), many=True).data
        data = list(data)  # convert lazy object to list
        updated_data = list()
        for item in data:
            newdict = dict()
            zone = item['zone']
            newdict.update({'zone': zone['zone'], 'lpd': item['lpd'], 'sensor': item['sensor']})
            for naming_zone in zone.get('namingzone'):
                naming_zone.update(newdict)  # fixed: was newDict, a NameError
                updated_data.append(naming_zone)
        return Response(updated_data, status=status.HTTP_200_OK)
See the DRF fields documentation about source; it will help you:
https://www.django-rest-framework.org/api-guide/fields/#source
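For illustration, here is a minimal sketch of using source (assuming, as the serializers above suggest, that Light has a zone foreign key whose Zone model carries a zone name field); it pulls the nested zone name into a flat representation, though the per-naming fan-out would still need a loop like the view above:

class FlatLightSerializer(serializers.ModelSerializer):
    # Hypothetical serializer: 'source' follows the relation to Zone.zone
    zone = serializers.CharField(source='zone.zone', read_only=True)

    class Meta:
        model = Light
        fields = ('zone', 'lpd', 'sensor')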

Django Haystack - How to force exact attribute match without stemming?

I'm using Django 1.5 with django-haystack 2.0 and an Elasticsearch backend. I'm trying to search by an exact attribute match; however, I'm getting "similar" results even though I'm using both the __exact operator and the Exact() class. How can I prevent this behavior?
For example:
# models.py
class Person(models.Model):
    name = models.TextField()

# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
    text = indexes.CharField(document=True, use_template=True)
    name = indexes.CharField(model_attr="name")

    def get_model(self):
        return Person

    def index_queryset(self, using=None):
        return self.get_model().objects.all()

# templates/search/indexes/people/person_text.txt
{{ object.name }}
>>> p1 = Person(name="Simon")
>>> p1.save()
>>> p2 = Person(name="Simons")
>>> p2.save()

$ ./manage.py rebuild_index

>>> person_sqs = SearchQuerySet().models(Person)
>>> person_sqs.filter(name__exact="Simons")
[<SearchResult: people.person (name=u'Simon')>,
 <SearchResult: people.person (name=u'Simons')>]
>>> person_sqs.filter(name=Exact("Simons", clean=True))
[<SearchResult: people.person (name=u'Simon')>,
 <SearchResult: people.person (name=u'Simons')>]
I only want the search result for "Simons" - the "Simon" result should not show up.
Python3, Django 1.10, Elasticsearch 2.4.4.
TL;DR: define custom tokenizer (not filter)
Verbose explanation:
a) use EdgeNgramField:
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
    text = indexes.EdgeNgramField(document=True, use_template=True)
    ...
b) template:
# templates/search/indexes/people/person_text.txt
{{ object.name }}
c) create a custom search backend:
# backends.py
from django.conf import settings
from haystack.backends.elasticsearch_backend import (
    ElasticsearchSearchBackend,
    ElasticsearchSearchEngine,
)

class CustomElasticsearchSearchBackend(ElasticsearchSearchBackend):
    def __init__(self, connection_alias, **connection_options):
        super(CustomElasticsearchSearchBackend, self).__init__(
            connection_alias, **connection_options)
        setattr(self, 'DEFAULT_SETTINGS', settings.ELASTICSEARCH_INDEX_SETTINGS)

class CustomElasticsearchSearchEngine(ElasticsearchSearchEngine):
    backend = CustomElasticsearchSearchBackend
d) define custom tokenizer (not filter!):
# settings.py
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'apps.persons.backends.CustomElasticsearchSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack',
    },
}

ELASTICSEARCH_INDEX_SETTINGS = {
    "settings": {
        "analysis": {
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "custom_ngram_tokenizer",
                    "filter": ["asciifolding", "lowercase"]
                },
                "edgengram_analyzer": {
                    "type": "custom",
                    "tokenizer": "custom_edgengram_tokenizer",
                    "filter": ["asciifolding", "lowercase"]
                }
            },
            "tokenizer": {
                "custom_ngram_tokenizer": {
                    "type": "nGram",
                    "min_gram": 3,
                    "max_gram": 12,
                    "token_chars": ["letter", "digit"]
                },
                "custom_edgengram_tokenizer": {
                    "type": "edgeNGram",
                    "min_gram": 2,
                    "max_gram": 12,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    }
}

HAYSTACK_DEFAULT_OPERATOR = 'AND'
e) use AutoQuery (more versatile):
# views.py
search_value = 'Simons'
...
person_sqs = SearchQuerySet().models(Person).filter(
    content=AutoQuery(search_value)
)
f) reindex after changes:
$ ./manage.py rebuild_index
I was facing a similar problem. You can change the settings of your haystack Elasticsearch backend like this:
DEFAULT_SETTINGS = {
    'settings': {
        "analysis": {
            "analyzer": {
                "ngram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["haystack_ngram", "lowercase"]
                },
                "edgengram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["haystack_edgengram", "lowercase"]
                }
            },
            "tokenizer": {
                "haystack_ngram_tokenizer": {
                    "type": "nGram",
                    "min_gram": 6,
                    "max_gram": 15,
                },
                "haystack_edgengram_tokenizer": {
                    "type": "edgeNGram",
                    "min_gram": 6,
                    "max_gram": 15,
                    "side": "front"
                }
            },
            "filter": {
                "haystack_ngram": {
                    "type": "nGram",
                    "min_gram": 6,
                    "max_gram": 15
                },
                "haystack_edgengram": {
                    "type": "edgeNGram",
                    "min_gram": 6,
                    "max_gram": 15
                }
            }
        }
    }
}
Then tokens are only generated for terms of at least 6 characters (min_gram), so a shorter query like "Simon" will no longer match "Simons".
If you want results like "xyzsimonsxyz", then you would need to use the ngram analyzer instead of EdgeNGram, or you could use both, depending on your requirements; EdgeNGram generates tokens only from the beginning of a term.
With NGram, 'simons' will be one of the tokens generated for the term 'xyzsimonsxyz' (assuming max_gram >= 6) and you will get the expected results. The search_analyzer also needs to be different from the index analyzer, or you will get weird results.
Also, the index size might get pretty big with ngram if you have a huge chunk of text.
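To illustrate that point about separate analyzers (a hedged sketch in raw Elasticsearch 2.x mapping terms, not haystack configuration), a field can index with ngrams while analyzing queries with plain tokens:

# Hypothetical ES mapping fragment for the 'name' field:
name_mapping = {
    "type": "string",               # ES 2.x string field
    "analyzer": "ngram_analyzer",   # ngram tokens at index time
    "search_analyzer": "standard",  # plain tokens at query time
}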
Don't use CharField, use EdgeNgramField:
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
    text = indexes.CharField(document=True, use_template=True)
    name = indexes.EdgeNgramField(model_attr="name")

    def get_model(self):
        return Person

    def index_queryset(self, using=None):
        return self.get_model().objects.all()
And don't use filter, use autocomplete:
person_sqs = SearchQuerySet().models(Person)
person_sqs.autocomplete(name="Simons")
source: http://django-haystack.readthedocs.org/en/v2.0.0/autocomplete.html