BigQuery Array Manipulations - google-cloud-platform

I need some help with BigQuery array manipulation, as follows:
Column1 represent the list of the content ids & Column2 represent the list of embedded content ids.
|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| Column1 | Column2 |
|---------------------------------------------------------------------------------------------------------------------------------------------------------|
|{"contentId":["1.5433912","1.5536755","1.5536970","1.5536380","1.5536809","1.5535567"]} |{'1.5433912':['1.5561001','1.5559520','1.5560946','1.5561026']} |
|----------------------------------------------------------------------------------------|----------------------------------------------------------------|
|{"contentId":["1.5536141","1.5535574","1.5534770","1.5535870"]} |{'1.5535574':['1.5527726','1.5533354','1.5533093']} |
|----------------------------------------------------------------------------------------|----------------------------------------------------------------|
|{"contentId":["1.5561069","1.5557612","1.5561433"]} |{'1.5561069':['1.5527726'],'1.5561433':['1.5533093']} |
|----------------------------------------------------------------------------------------|----------------------------------------------------------------|
The desired output is as follows:

Below is for BigQuery Standard SQL
#standardSQL
-- For each row: expand every contentId from Column1 in its original array
-- order; when Column2 has an "'id':[...]" entry keyed by that contentId,
-- substitute the embedded ids, otherwise keep the contentId itself.
SELECT ARRAY_CONCAT_AGG(
-- No matching entry in Column2 (refs is NULL, so the REGEXP_REPLACE is NULL):
-- keep the item itself as a one-element array
IF(REGEXP_REPLACE(SPLIT(refs, ':')[OFFSET(1)], r"\[|\]|'", '')IS NULL, [TRIM(item, '"')],
ARRAY(
SELECT ref
-- Take the "[...]" part after the ':', strip brackets and quotes, and split
-- the embedded ids, preserving their order via WITH OFFSET
FROM UNNEST(SPLIT(REGEXP_REPLACE(SPLIT(refs, ':')[OFFSET(1)], r"\[|\]|'", ''))) AS ref WITH OFFSET
ORDER BY OFFSET
))
-- This OFFSET is the item's position within the Column1 array
ORDER BY OFFSET) AS contentId
FROM `project.dataset.table` t,
-- One row per contentId, remembering its array position
UNNEST(JSON_EXTRACT_ARRAY(Column1, '$.contentId')) AS item WITH OFFSET
-- Pull each "'key':[ids]" fragment out of Column2 and attach it to the item it keys
LEFT JOIN UNNEST(REGEXP_EXTRACT_ALL(Column2, r"'.*?':\[.*?\]")) refs
ON STARTS_WITH(refs, "'" || TRIM(item, '"'))
-- FORMAT('%t', t) serves as a per-input-row grouping key so one output array
-- is rebuilt per source row
GROUP BY FORMAT('%t', t)
If applied to the sample data from your question, as in the example below:
#standardSQL
-- Same query as above, executed against the three sample rows from the question
WITH `project.dataset.table` AS (
SELECT '{"contentId":["1.5433912","1.5536755","1.5536970","1.5536380","1.5536809","1.5535567"]}' Column1, "{'1.5433912':['1.5561001','1.5559520','1.5560946','1.5561026']}" Column2 UNION ALL
SELECT '{"contentId":["1.5536141","1.5535574","1.5534770","1.5535870"]}', " {'1.5535574':['1.5527726','1.5533354','1.5533093']} " UNION ALL
SELECT '{"contentId":["1.5561069","1.5557612","1.5561433"]}', "{'1.5561069':['1.5527726'],'1.5561433':['1.5533093']}"
)
-- Replace each contentId that has an entry in Column2 with its embedded ids;
-- keep all other contentIds unchanged, preserving array order
SELECT ARRAY_CONCAT_AGG(
IF(REGEXP_REPLACE(SPLIT(refs, ':')[OFFSET(1)], r"\[|\]|'", '')IS NULL, [TRIM(item, '"')],
ARRAY(
SELECT ref
FROM UNNEST(SPLIT(REGEXP_REPLACE(SPLIT(refs, ':')[OFFSET(1)], r"\[|\]|'", ''))) AS ref WITH OFFSET
ORDER BY OFFSET
))
ORDER BY OFFSET) AS contentId
FROM `project.dataset.table` t,
UNNEST(JSON_EXTRACT_ARRAY(Column1, '$.contentId')) AS item WITH OFFSET
LEFT JOIN UNNEST(REGEXP_EXTRACT_ALL(Column2, r"'.*?':\[.*?\]")) refs
ON STARTS_WITH(refs, "'" || TRIM(item, '"'))
-- group back to one output array per input row
GROUP BY FORMAT('%t', t)
The result is exactly as in your expected example:
Row contentId
1 1.5561001
1.5559520
1.5560946
1.5561026
1.5536755
1.5536970
1.5536380
1.5536809
1.5535567
2 1.5536141
1.5527726
1.5533354
1.5533093
1.5534770
1.5535870
3 1.5527726
1.5557612
1.5533093

Related

combine column in json format in big query

I have columns in bigquery like this:
expected output:
I am trying to merge columns into json using bigquery.
I am taking the letter before the underscore (the common name) as the column, then converting.
I am trying this query:
-- Question's attempt: serialize the three a_* columns of each row into a single
-- JSON string. The per-prefix grouping described above is not applied here yet.
with selectdata as (
SELECT a_firstname, a_middlename,a_lastname FROM `account_id.Dataset.Table_name`
)
-- TO_JSON_STRING(t) emits one JSON object per row with the selected columns as keys
select TO_JSON_STRING(t) AS json_data FROM selectdata AS t;
How can I join columns with condition or with case to achieve this output in bigquery
Consider below approach
-- JS helper: list the keys of a JSON object (in object-declaration order)
create temp function extract_keys(input string) returns array<string> language js as """
return Object.keys(JSON.parse(input));
""";
-- JS helper: list the values of a JSON object, in the same order as the keys
create temp function extract_values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input));
""";
-- Pair every column name with its value by matching array offsets, bucket the
-- names by their prefix (text before '_'), build one JSON object per
-- (row, prefix) group, then pivot the prefixes into columns a/b/c.
select * except(row_id) from (
-- FORMAT('%t', t) is used as a synthetic per-row grouping key
select format('%t',t) row_id,
split(key, '_')[offset(0)] as col,
-- the suffix after '_' becomes the JSON key; safe_offset yields NULL instead of
-- an error for column names that have no underscore
'{' || string_agg(format('"%s":"%s"', split(key, '_')[safe_offset(1)], value)) || '}' as value
from your_table t, unnest(extract_keys(to_json_string(t))) key with offset
join unnest(extract_values(to_json_string(t))) value with offset
using(offset)
group by row_id, col
)
pivot (any_value(value) for col in ('a','b','c'))
if applied to sample data in your question - output is

Athena- extract substring from string - comma delimited

I want to create Athena view from Athena table.
In the table, the column value is "lastname, firstname", so I want to extract these values as 'lastname' and 'firstname' and then store them in separate columns in a view. Example: firstname needs to be stored in a new column 'first_name' and lastname needs to be stored in a new column 'last_name'.
What's the SQL function I can use here? I tried the split function, but it gives me an array.
Assuming the input strings have a fixed and known number of elements, you can do something like this:
-- Split the string once per row, then pick the parts out by position
-- (Presto/Trino array subscripts are 1-based).
WITH data(value) AS (
    VALUES ('Aaa,Bbb')
)
SELECT parts[1], parts[2]
FROM (
    SELECT split(value, ',') AS parts
    FROM data
)
=>
_col0 | _col1
-------+-------
Aaa | Bbb
(1 row)
-- The stored value is "lastname, firstname" (per the question), so token 1 is
-- the LAST name and token 2 is the FIRST name; the original answer had the two
-- parts swapped. trim() removes the space left after the comma.
create or replace view "names" as
select
trim(SPLIT_PART("column_name", ',', 2)) as first_name
, trim(SPLIT_PART("column_name", ',', 1)) as last_name
from myTable
You can use UNNEST on the split result:
-- Flatten the split parts so each element becomes its own row (Presto/Athena)
WITH dataset AS (
SELECT * FROM (VALUES
('aaa,bbb'),
('aaa1,bbb1')
) AS t (str))
SELECT str_col
FROM dataset
-- UNNEST turns the array returned by split() into rows; tmp(str_col) names the column
CROSS JOIN UNNEST(split(str, ',')) as tmp(str_col)
Output:
str_col
aaa
bbb
aaa1
bbb1
UPD
If you have at least one comma guaranteed, then it is as easy as:
-- Split each string once, then expose the two parts as named columns
-- (array subscripts are 1-based in Presto/Trino).
WITH dataset AS (
    SELECT * FROM (VALUES
        ('aaa,bbb'),
        ('aaa1,bbb1')
    ) AS t (str))
SELECT parts[1] last_name, parts[2] first_name
FROM (
    SELECT split(str, ',') AS parts
    FROM dataset
)
Output:
last_name
first_name
aaa
bbb
aaa1
bbb1
In case you can have a varying number of commas, limited to some maximum, you can use TRY:
-- When rows have a varying (but bounded) number of parts, TRY() converts the
-- out-of-bounds subscript error into NULL instead of failing the whole query
WITH dataset AS (
SELECT * FROM (VALUES
('aaa,bbb'),
('aaa1,bbb1,ddd1')
) AS t (str))
SELECT splt[1], splt[2], TRY(splt[3])
FROM
(SELECT split(str, ',') as splt
FROM dataset)
Output:
_col0
_col1
_col2
aaa
bbb
aaa1
bbb1
ddd1

Sum of array elements in Bigquery

I have to calculate the total number of positive values and of (negative + null or empty) values from the table — basically 2 values. I have the query below to list the negative, null, and positive values, but I want the overall counts. Please assist.
-- Question's attempt: three ARRAY subqueries per row, each counting one
-- category of event_data_results values (so counts are per-row, not totals).
SELECT
ARRAY(
-- values matching the name pattern that parse as integers and are <= 0
SELECT count(value),
FROM UNNEST(event_data_results) where REGEXP_CONTAINS(name, r'data.result.result') and ((REGEXP_CONTAINS(value, r'^-?\d+$') and SAFE_CAST(value AS INT64) <= 0 ))) AS negative_attributes,
ARRAY(
-- literal 'null' strings or empty strings
SELECT count(value) as neg_val,
FROM UNNEST(event_data_results) where value = 'null' or value='' ) AS null_attributes,
ARRAY(
-- values matching the name pattern that parse as integers and are > 0
SELECT count(value),
FROM UNNEST(event_data_results) where REGEXP_CONTAINS(name, r'data.result.result') and (REGEXP_CONTAINS(value, r'^-?\d+$') and SAFE_CAST(value AS INT64) > 0 )) AS positive_attributes
-- only rows that have at least one event key whose value is "attribute"
FROM `table` where EXISTS (SELECT 1 FROM UNNEST(event_keys) as keys , UNNEST(event_data_results) as results WHERE keys.value = "attribute")
event_keys, event_data_results, and data_metrics are all repeated structs.
The result should be: positive: 4, negative+null: 4.
Below is for BigQuery Standard SQL
#standardSQL
-- Count positive, negative, and null-or-zero values across all unnested
-- event_data_results, restricted to rows that carry an "attribute" key.
SELECT
    COUNTIF(r.value > 0) AS positive_attributes,
    COUNTIF(r.value < 0) AS negative_attributes,
    -- COALESCE maps NULL to 0 so NULLs and zeros are counted together
    COUNTIF(COALESCE(r.value, 0) = 0) AS null_or_zero_attributes
FROM `project.dataset.table`,
    UNNEST(event_data_results) AS r
WHERE EXISTS (
    SELECT 1
    FROM UNNEST(event_keys) AS k
    WHERE k.value = "attribute"
)
you can add here whatever conditions you need
Also, if result.value is a string, you can use SAFE_CAST(result.value AS INT64) as you already do — I was not focusing on that aspect of your case.

Pass a list/array in DB2 stored procedure

-- Clients whose customer set is exactly {4567, 5678}: every customer row must
-- be in the IN list (SUM equals COUNT(*)) and there must be exactly two rows.
SELECT cc.clientid
FROM customer_client cc
GROUP BY cc.clientid
HAVING SUM(CASE WHEN cc.customerid IN (4567, 5678) THEN 1 ELSE 0 END) = COUNT(*)
AND COUNT(*) = 2;
I'm calling this query from a Db2 stored procedure in which I have to pass the list of customer ids — any working suggestions?
I've tried passing it as below in procedure
CREATE PROCEDURE Find_Client_Customers (
IN IN_CUSTIDS VARCHAR(1000),
IN IN_CUST_COUNT INT)
but this is passing the list as a string.
You may use a string tokenizer:
-- Db2 table function: split a delimited string into (seq, tok) rows of bigints
-- using XQuery tokenize(); tokens that are not valid integers come back NULL.
create function regexp_tokenize_number(
source varchar(1024)
, pattern varchar(128))
returns table (seq int, tok bigint)
contains sql
deterministic
no external action
return
select seq, tok
-- tokenize($s, $p) splits the input on the pattern; each token is wrapped in an <i> element
from xmltable('for $id in tokenize($s, $p) return <i>{string($id)}</i>'
passing
source as "s"
, pattern as "p"
columns
-- 1-based position of the token within the input string
seq for ordinality
-- cast to bigint only when the token is a valid xs:long, otherwise NULL
, tok bigint path 'if (. castable as xs:long) then xs:long(.) else ()'
) t;
-- Example: '123, 456' tokenized on ',' yields rows (1, 123) and (2, 456)
select *
from table(regexp_tokenize_number('123, 456', ',')) t;
SEQ TOK
--- ---
1 123
2 456
In your case:
-- Same exact-set check as before, with the hard-coded IN list replaced by the
-- tokenized id string — this is where the procedure's IN_CUSTIDS would be passed.
SELECT cc.clientid
FROM customer_client cc
GROUP BY cc.clientid
HAVING SUM(CASE WHEN cc.customerid IN
(
select t.tok
from table(regexp_tokenize_number('4567, 5678', ',')) t
) THEN 1 ELSE 0 END) = COUNT(*)
AND COUNT(*) = 2;

regular expression to solve for the following

Example 1
asdk[wovkd'vk'psacxu5=205478499|205477661zamd;amd;a;d
Example 2
sadlmdlmdadsldu5=205478499|205477661|234567899amsd/samdamd
u5 can have multiple values separated by |
How can I capture all u5 values from a long string I have?
Below is for BigQuery Standard SQL
#standardSQL
WITH data AS (
SELECT 1 AS id, "asdk[wovkd'vk'psacxu5=205478499|205477661zamd;amd;a;d" AS junk UNION ALL
SELECT 2, "sadlmdlmdadsldu5=205478499|205477661|234567899amsd/samdamd"
)
SELECT id, SPLIT(REGEXP_EXTRACT(junk, r'(?i)u5=([\d|]*)'), '|') AS value
FROM data
with output as below
id value
1 205478499
205477661
2 205478499
205477661
234567899