How to build a multi-dimensional JSON native query for Druid? - Postman

I have data with multiple dimensions stored in a Druid cluster - for example, data about movies and the revenue they earned in each country where they were screened.
I'm trying to build a query whose result is a table of all the movies, the total revenue of each movie, and the revenue per country.
I managed to do it in Turnilo - it generated the following Druid query for me:
[
[
{
"queryType": "timeseries",
"dataSource": "movies_source",
"intervals": "2021-11-18T00:01Z/2021-11-21T00:01Z",
"granularity": "all",
"aggregations": [
{
"name": "__VALUE__",
"type": "doubleSum",
"fieldName": "revenue"
}
]
},
{
"queryType": "topN",
"dataSource": "movies_source",
"intervals": "2021-11-18T00:01Z/2021-11-21T00:01Z",
"granularity": "all",
"dimension": {
"type": "default",
"dimension": "movie_id",
"outputName": "movie_id"
},
"aggregations": [
{
"name": "revenue",
"type": "doubleSum",
"fieldName": "revenue"
}
],
"metric": "revenue",
"threshold": 50
}
],
[
{
"queryType": "topN",
"dataSource": "movies_source",
"intervals": "2021-11-18T00:01Z/2021-11-21T00:01Z",
"granularity": "all",
"filter": {
"type": "selector",
"dimension": "movie_id",
"value": "some_movie_id"
},
"dimension": {
"type": "default",
"dimension": "country",
"outputName": "country"
},
"aggregations": [
{
"name": "revenue",
"type": "doubleSum",
"fieldName": "revenue"
}
],
"metric": "revenue",
"threshold": 5
}
]
]
But it doesn't work when I try to use it as the body of a Postman request - I get:
{
"error": "Unknown exception",
"errorMessage": "Unexpected token (START_ARRAY), expected VALUE_STRING: need JSON String that contains type id (for subtype of org.apache.druid.query.Query)\n at [Source: (org.eclipse.jetty.server.HttpInputOverHTTP); line: 2, column: 3]",
"errorClass": "com.fasterxml.jackson.databind.exc.MismatchedInputException",
"host": null
}
How should I build the corresponding query so that it works with Postman?

I am not familiar with Turnilo, but have you tried using the Druid console to write SQL and convert it to a native query with the "Explain SQL query" option under the "Run/..." menu?
Your native queries seem to be doing a TopN instead of listing all movies, so I think the SQL might be something like:
SELECT movie_id, country_id, SUM(revenue) total_revenue
FROM movies_source
WHERE __time BETWEEN '2021-11-18 00:01:00' AND '2021-11-21 00:01:00'
GROUP BY movie_id, country_id
ORDER BY total_revenue DESC
LIMIT 50
I don't have your data source to test against, but I tested with the sample wikipedia data using a similar query structure:
SELECT namespace, cityName, sum(sum_added) total
FROM "wikipedia" r
WHERE cityName IS NOT NULL
AND __time BETWEEN '2015-09-12 00:00:00' AND '2015-09-15 00:00:00'
GROUP BY namespace, cityName
ORDER BY total DESC
limit 50
which results in the following Native query:
{
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "wikipedia"
},
"intervals": {
"type": "intervals",
"intervals": [
"2015-09-12T00:00:00.000Z/2015-09-15T00:00:00.001Z"
]
},
"virtualColumns": [],
"filter": {
"type": "not",
"field": {
"type": "selector",
"dimension": "cityName",
"value": null,
"extractionFn": null
}
},
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "namespace",
"outputName": "d0",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "cityName",
"outputName": "d1",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "longSum",
"name": "a0",
"fieldName": "sum_added",
"expression": null
}
],
"postAggregations": [],
"having": null,
"limitSpec": {
"type": "default",
"columns": [
{
"dimension": "a0",
"direction": "descending",
"dimensionOrder": {
"type": "numeric"
}
}
],
"limit": 50
},
"context": {
"populateCache": false,
"sqlOuterLimit": 101,
"sqlQueryId": "cd5aabed-5e08-49b7-af63-fe82c125d3ee",
"useApproximateCountDistinct": false,
"useApproximateTopN": false,
"useCache": false
},
"descending": false
}
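For reference, the Postman error above is Druid complaining that the request body starts with an array where it expects a single query object - the native endpoint takes one query per request. Below is a minimal Python sketch of sending a native query (or the SQL directly) over HTTP; the host and port are assumptions (a router on localhost:8888), and the query bodies are copied from the question and the SQL above, so adjust them for your cluster.
import json
import requests

DRUID = "http://localhost:8888"  # assumed router (8888); a broker is typically on 8082

# One native query object per request to the native endpoint (this is the topN from the question).
native_query = {
    "queryType": "topN",
    "dataSource": "movies_source",
    "intervals": "2021-11-18T00:01Z/2021-11-21T00:01Z",
    "granularity": "all",
    "dimension": {"type": "default", "dimension": "movie_id", "outputName": "movie_id"},
    "aggregations": [{"name": "revenue", "type": "doubleSum", "fieldName": "revenue"}],
    "metric": "revenue",
    "threshold": 50,
}
resp = requests.post(
    f"{DRUID}/druid/v2?pretty",
    headers={"Content-Type": "application/json"},
    data=json.dumps(native_query),
)
resp.raise_for_status()
print(resp.json())

# Alternatively, skip the native conversion and POST the SQL from above to the SQL endpoint.
sql = {
    "query": "SELECT movie_id, country_id, SUM(revenue) AS total_revenue "
             "FROM movies_source "
             "WHERE __time BETWEEN '2021-11-18 00:01:00' AND '2021-11-21 00:01:00' "
             "GROUP BY movie_id, country_id "
             "ORDER BY total_revenue DESC LIMIT 50"
}
resp = requests.post(f"{DRUID}/druid/v2/sql", json=sql)
resp.raise_for_status()
print(resp.json())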

Logic App: replace "," with "." in a CSV table column

I have a flow where I want to edit a column in the CSV table and replace the "," with a ".".
How do I do that? The replace function expression in Logic Apps does not return the column:
It asks me to take the complete body when I use the replace function.
Whereas the details column, which I want to edit, is available:
How should I replace the "," in the details column?
I did this, but then I don't see the variable I initialized.
For instance, I've taken this as my sample .csv file, which I'm retrieving from my storage account.
First I used Parse CSV as you did, then initialized a string variable and used the Append to string variable connector with the Productname column. Lastly, I used the replace function expression to replace ',' with '.'.
NOTE: I appended a '|' after each Productname value so the string can be split back into a list later.
Here is my Logic App workflow
The Compose connector expression:
split(replace(variables('Productname'),',','.'),'|')
OUTPUT:
Here is my workflow that you can refer to:
{
"definition": {
"$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#",
"actions": {
"Compose": {
"inputs": "#split(replace(variables('Productname'),',','.'),'|')",
"runAfter": {
"For_each_2": [
"Succeeded"
]
},
"type": "Compose"
},
"For_each_2": {
"actions": {
"Append_to_string_variable": {
"inputs": {
"name": "Productname",
"value": "#{items('For_each_2')?['Productname']}|"
},
"runAfter": {},
"type": "AppendToStringVariable"
}
},
"foreach": "#body('Parse_CSV')",
"runAfter": {
"Initialize_variable": [
"Succeeded"
]
},
"type": "Foreach"
},
"Get_blob_content_(V2)": {
"inputs": {
"host": {
"connection": {
"name": "#parameters('$connections')['azureblob']['connectionId']"
}
},
"method": "get",
"path": "/v2/datasets/#{encodeURIComponent(encodeURIComponent('AccountNameFromSettings'))}/files/#{encodeURIComponent(encodeURIComponent('JTJmY29udGFpbmVyMjQwOCUyZlByb2R1Y3RzLmNzdg=='))}/content"
},
"metadata": {
"JTJmY29udGFpbmVyMjQwOCUyZlByb2R1Y3RzLmNzdg==": "/container2408/Products.csv"
},
"runAfter": {},
"type": "ApiConnection"
},
"Initialize_variable": {
"inputs": {
"variables": [
{
"name": "Productname",
"type": "string"
}
]
},
"runAfter": {
"Parse_CSV": [
"Succeeded"
]
},
"type": "InitializeVariable"
},
"Parse_CSV": {
"inputs": {
"body": {
"content": "#{base64(body('Get_blob_content_(V2)'))}",
"headers": "Productid,Productname"
},
"host": {
"connection": {
"name": "#parameters('$connections')['plumsail']['connectionId']"
}
},
"method": "post",
"path": "/flow/v1/Documents/jobs/ParseCsv"
},
"runAfter": {
"Get_blob_content_(V2)": [
"Succeeded"
]
},
"type": "ApiConnection"
}
},
"contentVersion": "1.0.0.0",
"outputs": {},
"parameters": {
"$connections": {
"defaultValue": {},
"type": "Object"
}
},
"triggers": {
"manual": {
"inputs": {
"schema": {}
},
"kind": "Http",
"type": "Request"
}
}
},
"parameters": {
"$connections": {
"value": {
"azureblob": {
"connectionId": "/subscriptions/<subscription id>/resourceGroups/<Your resource group name>/providers/Microsoft.Web/connections/azureblob",
"connectionName": "azureblob",
"id": "/subscriptions/<subscription id>/providers/Microsoft.Web/locations/northcentralus/managedApis/azureblob"
},
"plumsail": {
"connectionId": "/subscriptions/<subscription id >/resourceGroups/<Your resource group name>/providers/Microsoft.Web/connections/plumsail",
"connectionName": "plumsail",
"id": "/subscriptions/<subscription id>/providers/Microsoft.Web/locations/northcentralus/managedApis/plumsail"
}
}
}
}
}
I used the items function expression and did it directly.
#replace(item()?['details'],',','')
This was a bit strange - it didn't work at first, but now it is working.

Google People API - Birthday - Date object is not returned in GET requests

In the Google People API - people.connections.list and other GET endpoints - the date object of the birthday field is not returned for some contacts. I have authenticated with the full contacts scope [https://www.googleapis.com/auth/contacts].
We also do not know the format of the "text" field in order to parse it, as it can contain any arbitrary string.
How can we parse a user's birthday? When is the date object not returned?
Sample Request
https://people.googleapis.com/v1/people/me/connections?pageSize=100&requestSyncToken=true&personFields=birthdays
Sample response (birthdays):
{
"birthdays": [
{
"metadata": {
"source": {
"id": "3ebd95668aeed9d7",
"type": "CONTACT"
},
"primary": true
},
"text": "2000-07-24"
}
],
"resourceName": "people/c4520933868599957975",
"etag": "<etag>"
},
{
"birthdays": [
{
"metadata": {
"source": {
"id": "5f5712ce0861c5b0",
"type": "CONTACT"
},
"primary": true
},
"text": "1880-03-11"
}
],
"resourceName": "people/c6869980432690169264",
"etag": "<etag>"
},
{
"birthdays": [
{
"date": {
"month": 1,
"year": 1990,
"day": 26
},
"metadata": {
"source": {
"id": "a16dde58e814a36",
"type": "CONTACT"
},
"primary": true
},
"text": "01/26/1990"
}
],
"resourceName": "people/c727012367875000886",
"etag": "<etag>"
},
{
"birthdays": [
{
"date": {
"month": 1,
"year": 1998,
"day": 1
},
"metadata": {
"source": {
"id": "f350f568dd11db1",
"type": "CONTACT"
},
"primary": true
},
"text": "Jan 1, 1998"
}
],
"resourceName": "people/c1095798948755479985",
"etag": "<etag>"
},
{
"birthdays": [
{
"metadata": {
"source": {
"id": "3652e00f0d28c9f4",
"type": "CONTACT"
},
"primary": true
},
"text": "random string accept"
}
],
"resourceName": "people/c3914437381388290548",
"etag": "<etag>"
}
]
The birthday field can contain a date object, a text value, or both, and they are not guaranteed to match.
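Given that, a practical approach is to prefer the date object when it is present and only fall back to parsing the text value against a few known patterns, treating anything else as free text. A minimal Python sketch - the list of text formats is an assumption based on the samples above, not something the API guarantees:
from datetime import datetime

# Formats seen in the sample response; "text" can be any arbitrary string, so this list is best-effort.
TEXT_FORMATS = ("%Y-%m-%d", "%m/%d/%Y", "%b %d, %Y")

def parse_birthday(birthday):
    """Return (year, month, day), with year possibly None, or None if the value is unparseable."""
    date = birthday.get("date")
    if date:
        # The year can be missing when a contact stored only month and day.
        return (date.get("year"), date.get("month"), date.get("day"))
    text = birthday.get("text", "")
    for fmt in TEXT_FORMATS:
        try:
            parsed = datetime.strptime(text, fmt)
            return (parsed.year, parsed.month, parsed.day)
        except ValueError:
            continue
    return None  # e.g. "random string accept" - keep the raw text instead

# Example with one birthday entry from the response above:
print(parse_birthday({"metadata": {"primary": True}, "text": "2000-07-24"}))  # (2000, 7, 24)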

How do I combine 2 search metrics with math expression in cloudwatch?

I am trying to display the percentage of memory used by a Lambda function in a CloudWatch graph. I know there are other ways I can pull the data, but for reasons outside the scope of this question, I would like to stick to using SEARCH to pull the metrics.
I have the following graph
{
"metrics": [
[ { "expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MemorySize\"', 'Average', 300)", "id": "m1", "visible": "true" } ],
[ { "expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MaxMemoryUsed\"', 'Average', 300)", "id": "m2", "visible": "true" } ],
[ { "expression": "m2/m1*100", "label": "pecentage memory used", "id": "e1", "stat": "Average" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "us-west-2",
"stat": "Average",
"period": 300,
"title": "Memory",
"yAxis": {
"left": {
"label": "Percentage Usage",
"showUnits": false
}
},
"liveData": false
}
The error I am getting:
Error in expression e1 [Unsupported operand type(s) for /: '[Array[TimeSeries], Array[TimeSeries]]']
Is there a way to combine the first 2 expressions to give me the percentage memory used?
The results of the SEARCH expressions are arrays of time series, so you cannot apply the operators (+ - * / ^) to them directly. As a workaround, you can reduce each expression to a single (average) time series with AVG() and then calculate the percentage.
The source should be similar to this:
{
"metrics": [
[ { "expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MemorySize\"', 'Average', 300)", "id": "m1", "visible": "false" } ],
[ { "expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MaxMemoryUsed\"', 'Average', 300)", "id": "m2", "visible": "false" } ],
[ { "expression": "AVG(m1)", "label": "AVGMemorySize", "id": "e1", "visible": "false" } ],
[ { "expression": "AVG(m2)", "label": "AVGMaxMemoryUsed", "id": "e2", "visible": "false" } ],
[ { "expression": "e2/e1*100", "label": "pecentage memory used", "id": "e3", "stat": "Average" } ]
],
"view": "timeSeries",
"stacked": false,
"region": "us-west-2",
"stat": "Average",
"period": 300,
"title": "Memory",
"yAxis": {
"left": {
"label": "Percentage Usage",
"showUnits": false
}
},
"liveData": false
}
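The same workaround can be checked outside the dashboard with the GetMetricData API, which supports both SEARCH and metric math expressions. A minimal boto3 sketch, assuming the SomeMetricNamespace namespace and us-west-2 region from the question:
import boto3
from datetime import datetime, timedelta, timezone

cloudwatch = boto3.client("cloudwatch", region_name="us-west-2")
now = datetime.now(timezone.utc)

resp = cloudwatch.get_metric_data(
    StartTime=now - timedelta(hours=3),
    EndTime=now,
    MetricDataQueries=[
        # SEARCH returns an array of time series, so keep these as hidden intermediates.
        {"Id": "m1", "Expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MemorySize\"', 'Average', 300)", "ReturnData": False},
        {"Id": "m2", "Expression": "SEARCH('{SomeMetricNamespace} MetricName=\"MaxMemoryUsed\"', 'Average', 300)", "ReturnData": False},
        # AVG collapses each array into a single time series, which can then be divided.
        {"Id": "e1", "Expression": "AVG(m1)", "ReturnData": False},
        {"Id": "e2", "Expression": "AVG(m2)", "ReturnData": False},
        {"Id": "e3", "Expression": "e2/e1*100", "Label": "percentage memory used"},
    ],
)

for result in resp["MetricDataResults"]:
    print(result["Label"], list(zip(result["Timestamps"], result["Values"])))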

Power Query M: transform a JSON array of key-value pairs into columns

Thanks for your help,
I'm trying to transform the array called 'tags', which holds a list of key-value pairs, into a set of columns - CustomerId, CustomerDisplayName, CustomerPath - where each column holds the respective value for each charge.
{
"periodFrom": "2020-11-09T00:00:00",
"periodTo": "2020-12-08T00:00:00",
"charges": [
{
"listPrice": 5.05,
"netPrice": 5.05,
"netPriceProrated": 5.05,
"subTotal": 5.05,
"currency": "CAD",
"isBilled": true,
"isProratable": true,
"deductions": [],
"fees": [],
"invoice": {
"number": "2822835",
"date": "2020-11-16T00:00:00",
"periodFrom": "2020-10-09T00:00:00",
"periodTo": "2020-11-08T00:00:00"
},
"taxes": [
{
"name": "GST",
"appliedRate": 5.0
},
{
"name": "QST",
"appliedRate": 9.975
}
],
"tags": [
{
"name": "CustomerId",
"value": "42c8edf4-365a-4068-bde6-33675832afbb"
},
{
"name": "CustomerDisplayName",
"value": "Blue Sky Group"
},
{
"name": "CustomerPath",
"value": "Devmesh Co-Branded/Blue Sky Group"
}
]
}
]
}
Here is the actual snippet of code I'm currently using:
let
...
Response2 = Table.FromRecords( { Response } ),
#"Expand1" = Table.ExpandListColumn(Response2, "charges"),
#"Expand2" = Table.ExpandRecordColumn(#"Expand1", "charges", {"productId", "productName", "sku", "chargeId", "chargeName", "chargeType", "periodFrom", "periodTo", "quantity", "listPrice", "netPrice", "netPriceProrated", "subTotal", "currency", "isBilled", "isProratable", "deductions", "fees", "invoice", "taxes", "tags"}, {"charges.productId", "charges.productName", "charges.sku", "charges.chargeId", "charges.chargeName", "charges.chargeType", "charges.periodFrom", "charges.periodTo", "charges.quantity", "charges.listPrice", "charges.netPrice", "charges.netPriceProrated", "charges.subTotal", "charges.currency", "charges.isBilled", "charges.isProratable", "charges.deductions", "charges.fees", "charges.invoice", "charges.taxes", "charges.tags"})
in
#"Expand2"

How to upgrade Data Pipeline definition from EMR 3.x to 4.x/5.x?

I would like to upgrade my AWS Data Pipeline definition to EMR 4.x or 5.x so that I can take advantage of Hive's latest features (version 2.0+), such as CURRENT_DATE and CURRENT_TIMESTAMP.
The change from EMR 3.x to 4.x/5.x requires the use of releaseLabel in EmrCluster, versus amiVersion.
When I use a "releaseLabel": "emr-4.1.0", I get the following error: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.tez.TezTask
Below is my data pipeline definition for EMR 3.x. It works well, so I hope others find it useful (along with the answer for EMR 4.x/5.x), since the common recommendation for importing data into DynamoDB from a file is to use Data Pipeline, yet no one has put forward a solid and simple working example (say, for a custom data format).
{
"objects": [
{
"type": "DynamoDBDataNode",
"id": "DynamoDBDataNode1",
"name": "OutputDynamoDBTable",
"dataFormat": {
"ref": "DynamoDBDataFormat1"
},
"region": "us-east-1",
"tableName": "testImport"
},
{
"type": "Custom",
"id": "Custom1",
"name": "InputCustomFormat",
"column": [
"firstName", "lastName"
],
"columnSeparator" : "|",
"recordSeparator" : "\n"
},
{
"type": "S3DataNode",
"id": "S3DataNode1",
"name": "InputS3Data",
"directoryPath": "s3://data.domain.com",
"dataFormat": {
"ref": "Custom1"
}
},
{
"id": "Default",
"name": "Default",
"scheduleType": "ondemand",
"failureAndRerunMode": "CASCADE",
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"pipelineLogUri": "s3://logs.data.domain.com"
},
{
"type": "HiveActivity",
"id": "HiveActivity1",
"name": "S3ToDynamoDBImportActivity",
"output": {
"ref": "DynamoDBDataNode1"
},
"input": {
"ref": "S3DataNode1"
},
"hiveScript": "INSERT OVERWRITE TABLE ${output1} SELECT reflect('java.util.UUID', 'randomUUID') as uuid, TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP())) as loadDate, firstName, lastName FROM ${input1};",
"runsOn": {
"ref": "EmrCluster1"
}
},
{
"type": "EmrCluster",
"name": "EmrClusterForImport",
"id": "EmrCluster1",
"coreInstanceType": "m1.medium",
"coreInstanceCount": "1",
"masterInstanceType": "m1.medium",
"amiVersion": "3.11.0",
"region": "us-east-1",
"terminateAfter": "1 Hours"
},
{
"type": "DynamoDBDataFormat",
"id": "DynamoDBDataFormat1",
"name": "OutputDynamoDBDataFormat",
"column": [
"uuid", "loadDate", "firstName", "lastName"
]
}
],
"parameters": []
}
A sample file could look like:
John|Doe
Jane|Doe
Carl|Doe
Bonus: rather than selecting CURRENT_DATE as a column, how can I set it as a variable in the hiveScript section? I tried "SET loadDate = CURRENT_DATE;\n\n INSERT OVERWRITE..." to no avail. Not shown in my example are other dynamic fields I would like to set before the query clause.