Regular expression to remove duplicates from comma separated string - regex

I have following string:
'C,2,1,2,3,1'
I need a regular expression to remove duplicates and the result string should be like this:
'C,2,1,3'

If your input data is more than one string, I assume there is some kind of id column you can use to distinguish the strings from each other. If no such column exists, it can be created in the first factored subquery, for example by using rownum.
with
inputs ( id, str ) as (
select 1, 'C,2,1,2,3,1' from dual union all
select 2, 'A,ZZ,3,A,3,ZZ' from dual
),
unwrapped ( id, str, lvl, token ) as (
select id, str, level, regexp_substr(str, '[^,]+', 1, level)
from inputs
connect by level <= 1 + regexp_count(str, ',')
and prior id = id
and prior sys_guid() is not null
),
with_rn ( id, str, lvl, token, rn ) as (
select id, str, lvl, token, row_number() over (partition by id, token order by lvl)
from unwrapped
)
select id, str, listagg(token, ',') within group (order by lvl) as new_str
from with_rn
where rn = 1
group by id, str
order by id
;
ID STR NEW_STR
---- ------------------ --------------------
1 C,2,1,2,3,1 C,2,1,3
2 A,ZZ,3,A,3,ZZ A,ZZ,3

Try this:
with
-- your input data
t_in as (select 'C,2,1,2,3,1' as s from dual),
-- your string splitted into a table, a row per list item
t_split as (
select (regexp_substr(s,'(\w+)(,|$)',1,rownum,'c',1)) s,
level n
from t_in
connect by level <= regexp_count(s,'(\w+)(,|$)') + 1
),
-- this table grouped to obtain distinct values with
-- minimum levels for sorting
t_grouped as (
select s, min(n) n from t_split group by s
)
select listagg(s, ',') within group (order by n)
from t_grouped;
Depending on your Oracle version you might have to replace listagg with wm_concat (it's googlable)

Here another shorter solution:
select listagg(val, ',') within group(order by min(id))
from (select rownum as id,
trim(regexp_substr(str, '[^,]+', 1, level)) as val
from (select 'C,2,1,2,3,1' as str from dual)
connect by regexp_substr(str, '[^,]+', 1, level) is not null)
group by val;

Related

SQL for nested WITH CLAUSE - RESULTS OFFSET in Oracle 19c

Please suggest a way to implement nesting of (temp - results - select) as shown below?
I see that oracle 19c does not allow nesting of WITH clause.
with temp2 as
(
with temp1 as
(
__
__
),
results(..fields..) as
(
select ..<calc part>.. from temp1, results where __
)
select ..<calc part>.. from temp1 join results where __
),
results(..fields..) as
(
select ..<calc part>.. from temp2, results where __
)
select ..<calc part>.. from temp2 join results where __
For instance:
DB Fiddle
I need to calculate CALC3 in similar recursive way as of CALC
CREATE TABLE TEST ( DT DATE, NAME VARCHAR2(10), VALUE NUMBER(10,3));
insert into TEST values ( to_date( '01-jan-2021'), 'apple', 198.95 );
insert into TEST values ( to_date( '02-jan-2021'), 'apple', 6.15 );
insert into TEST values ( to_date( '03-jan-2021'), 'apple', 4.65 );
insert into TEST values ( to_date( '06-jan-2021'), 'apple', 20.85 );
insert into TEST values ( to_date( '01-jan-2021'), 'banana', 80.5 );
insert into TEST values ( to_date( '02-jan-2021'), 'banana', 9.5 );
insert into TEST values ( to_date( '03-jan-2021'), 'banana', 31.65 );
--Existing working code -
with t as
( select
test.*,
row_number() over ( partition by name order by dt ) as seq
from test
),
results(name, dt, value, calc ,seq) as
(
select name, dt, value, value/5 calc, seq
from t
where seq = 1
union all
select t.name, t.dt, t.value, ( 4 * results.calc + t.value ) / 5, t.seq
from t, results
where t.seq - 1 = results.seq
and t.name = results.name
)
select results.*, calc*3 as calc2 -- Some xyz complex logic as calc2
from results
order by name, seq;
Desired output:
CALC3 - grouped by name and dt -
((CALC3 of prev day record * 4) + CALC2 of current record )/ 5
i.e for APPLE
for 1-jan-21, CALC = ((0*4)+119.37)/5 = 23.87 -------> since it is 1st record, have taken 0 as CALC3 of prev day record
for 2-jan-21, CALC = ((23.87*4)+99.19)/5= 115.33 -----> prev CALC3 is considered from 1-jan-21 - 23.87 and 99.19 from current row
for 3-jan-21, CALC = ((115.33*4)+82.14)/5= 477.76 and so on
For BANANA
1-jan-21, CALC = ((0*4)+48.30)/5=9.66
1-jan-21, CALC = ((9.66*4)+44.34)/5=47.51
etc
You do not need to, you can just do it all in one level:
with temp1(...fields...) as
(
__
__
__
),
results1(...fields...) as
(
select ...<calc part>... from temp1 where __
),
temp2( ...fields...) as
(
select ...<calc part>... from temp1 join results1 where __
),
results2(...fields...) as
(
select ...<calc part>... from temp2 where __
)
select ...<calc part>... from temp2 join results2 where __
For your actual problem, you can use a MODEL clause:
SELECT dt,
name,
amount,
calc,
seq,
calc2,
calc3
FROM (
SELECT t.*,
ROW_NUMBER() OVER (PARTITION BY name ORDER BY dt) AS seq
FROM test t
)
MODEL
PARTITION BY (name)
DIMENSION BY (seq)
MEASURES ( dt, amount, 0 AS calc, 0 AS calc2, 0 as calc3)
RULES (
calc[1] = amount[1]/5,
calc[seq>1] = (amount[cv(seq)] + 4*calc[cv(seq)-1])/5,
calc2[seq] = 3*calc[cv(seq)],
calc3[1] = calc2[1]/5,
calc3[seq>1] = (calc2[cv(seq)] + 4*calc3[cv(seq)-1])/5
)
Which outputs:
DT
NAME
AMOUNT
CALC
SEQ
CALC2
CALC3
01-JAN-21
banana
80.5
16.1
1
48.3
9.66
02-JAN-21
banana
9.5
14.78
2
44.34
16.596
03-JAN-21
banana
31.65
18.154
3
54.462
24.1692
01-JAN-21
apple
198.95
39.79
1
119.37
23.874
02-JAN-21
apple
6.15
33.062
2
99.186
38.9364
03-JAN-21
apple
4.65
27.3796
3
82.1388
47.57688
06-JAN-21
apple
20.85
26.07368
4
78.22104
53.705712
db<>fiddle here

rewrite redshift query as athena

I am trying to convert this redshift query to athena.
select
a.customerid,
a.country,
a.stockcode,
a.description,
a.invoicedate,
a.sales_amt,
(b.nbr_months_active) as nbr_months_active
from
ecommerce_sales_data a
inner join (
select
customerid,
count(
distinct(
DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(
DATE_PART(mon, cast(invoicedate as date)),
2,
'00'
)
)
) as nbr_months_active
from
ecommerce_sales_data
group by
1
) b on a.customerid = b.customerid
This is what I have tried. It returns the results. But I am not sure if the results will match with redshift query in all cases.
WITH students_results(InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country) AS (VALUES
('536365','85123A','WHITE HANGING HEART T-LIGHT HOLDER','6','12/1/2010 8:26','2.55','17850','United Kingdom'),
('536365','71053','WHITE METAL LANTERN','6','12/1/2010 8:26','3.39','17850','United Kingdom'),
('536365','84406B','CREAM CUPID HEARTS COAT HANGER','8','12/1/2010 8:26','2.75','17850','United Kingdom')
)
select
a.customerid,
a.country,
a.stockcode,
a.description,
a.invoicedate,
cast(a.quantity as decimal(11,2)) * cast(a.unitprice as decimal(11,2)) as sales_amt,
(b.nbr_months_active) as nbr_months_active
from
students_results a
inner join (
select
customerid,
count(
distinct(
date_format(date_parse(invoicedate,'%m/%d/%Y %k:%i'), '%Y-%m')
)) as nbr_months_active
FROM students_results group by customerid) as b
on a.customerid = b.customerid
The source of Redshift query is here:
https://aws.amazon.com/blogs/machine-learning/build-multi-class-classification-models-with-amazon-redshift-ml/

BigQuery Arrays - check if Array contains specific values

I'm trying to see if a certain set of items exist within a BigQuery Array.
Below query works (Checking if a 1 item exists within an array):
WITH sequences AS
(
SELECT 1 AS id, [10,20,30,40] AS some_numbers
UNION ALL
SELECT 2 AS id, [20,30,40,50] AS some_numbers
UNION ALL
SELECT 3 AS id, [40,50,60,70] AS some_numbers
)
SELECT id, some_numbers
FROM sequences
WHERE 20 IN UNNEST(some_numbers)
What I'm not able to do is below (Checking if a more than 1 item exists within an array):
(This query errors)
WITH sequences AS
(
SELECT 1 AS id, [10,20,30,40] AS some_numbers
UNION ALL
SELECT 2 AS id, [20,30,40,50] AS some_numbers
UNION ALL
SELECT 3 AS id, [40,50,60,70] AS some_numbers
)
SELECT id, some_numbers
FROM sequences
WHERE (20,30) IN UNNEST(some_numbers)
I managed to find below workaround, but I feel there's a better way to do this:
WITH sequences AS
(
SELECT 1 AS id, [10,20,30,40] AS some_numbers
UNION ALL
SELECT 2 AS id, [20,30,40,50] AS some_numbers
UNION ALL
SELECT 3 AS id, [40,50,60,70] AS some_numbers
)
SELECT id, some_numbers
FROM sequences
WHERE (
(
SELECT COUNT(1)
FROM UNNEST(some_numbers) s
WHERE s in (20,30)
) > 1
)
Any suggestions are appreciated.
Not much to suggest... Official docs suggest to use exists:
WHERE EXISTS (SELECT *
FROM UNNEST(some_numbers) AS s
WHERE s in (20,30));
Assuming you are looking for rows where ALL elements in match array [20, 30] are found in target array (some_numbers). Also assuming no duplicate numbers in both (match and target) arrays
select id, some_numbers
from sequences a,
unnest([struct([20, 30] as match)]) b
where (
select count(1) = array_length(match)
from a.some_numbers num
join b.match num
using(num)
)

How to find the exact match of a string and replace in Oracle?

I am trying to replace the words in a sentence if the word exits in "words" column with a hyperlink along with it's word and id. The table contains the column id word and sentence. Below code I got it from one of the helpful fellow mate here.Thank you.
https://dbfiddle.uk/?rdbms=oracle_18&fiddle=8fe103264dc650ad4bd87b20f9c6931a
Create table temp(
id NUMBER,
word VARCHAR2(1000),
Sentence VARCHAR2(2000)
);
insert into temp
SELECT 1,'automation testing', 'automtestingation testing is popular kind of testing' FROM DUAL UNION ALL
SELECT 2,'testing','manual testing' FROM DUAL UNION ALL
SELECT 3,'manual testing','this is an old method of testing' FROM DUAL UNION ALL
SELECT 5,'B-number analysis','B-number analysis table' FROM DUAL UNION ALL
SELECT 6,'B-number analysis table','testing B-number analysis' FROM DUAL;
MERGE INTO temp dst
USING (
WITH ordered_words ( rn, id, word ) AS (
SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
id,
word
FROM temp
),
sentences ( rid, sentence, rn ) AS (
SELECT ROWID,
sentence,
COUNT(*) OVER () + 1
FROM temp
UNION ALL
SELECT s.rid,
REGEXP_REPLACE(
REGEXP_REPLACE(
s.sentence,
'(^|[^a-z])' || w.word || '($|[^a-z])',
'\1' || 'http://localhost/'|| w.id ||'/<u>'||w.word ||'<u>' || '\2',
1,
0,
'i'
),
'(^|[^a-z])' || w.word || '($|[^a-z])',
'\1' || w.word || '\2',
1,
0,
'i'
),
s.rn - 1
FROM sentences s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
)
SELECT rid, sentence
FROM sentences
WHERE rn = 1
) src
ON ( dst.ROWID = src.RID )
WHEN MATCHED THEN
UPDATE
SET sentence = src.sentence;
The value to be replaced is https://localhost/"id"/"word" If you see the value for id = 5 (B-number analysis) the sentence is https://localhost/6/localhost/5/B-number analysis table But the actual value is supposed to be https://localhost/6/B-number analysis table.
Current output:
ID WORD SENTENCE
1 automation testing automtestingation http://localhost/2/<u>testing<u>
is popular kind of http://localhost/2/<u>testing<u>
2 testing http://localhost/3/<u>manual
http://localhost/2/<u>testing<u><u>
3 manual testing this is an old method of
http://localhost/2/<u>testing<u>
5 B-number analysis http://localhost/6/<u>http://localhost/5/<u>B-
number analysis<u> table<u>
6 B-number analysis table http://localhost/2/<u>testing<u>
http://localhost/5/<u>B-number analysis<u>
Do it in two steps:
First replace the strings with the ids in some wrapper that is not going to appear in your text (i.e. testing maps to ${2})
Then, once all the replacements have been done, replace the wrapped ids with the urls (i.e. ${2} maps to http://localhost/2/<u>testing</u>)
Oracle Setup:
Create table temp(
id NUMBER,
word VARCHAR2(1000),
Sentence VARCHAR2(2000)
);
insert into temp
SELECT 1,'automation testing', 'automtestingation testing is popular kind of testing' FROM DUAL UNION ALL
SELECT 2,'testing','manual testing' FROM DUAL UNION ALL
SELECT 3,'manual testing','this is an old method of testing' FROM DUAL UNION ALL
SELECT 4,'punctuation','automation testing,manual testing,punctuation,automanual testing-testing' FROM DUAL UNION ALL
SELECT 5,'B-number analysis','B-number analysis table' FROM DUAL UNION ALL
SELECT 6,'B-number analysis table','testing B-number analysis' FROM DUAL UNION ALL
SELECT 7,'Not Matched','testing testing testing' FROM DUAL;
Merge:
MERGE INTO temp dst
USING (
WITH ordered_words ( rn, id, word ) AS (
SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
id,
word
FROM temp
),
sentences_with_ids ( rid, sentence, rn ) AS (
SELECT ROWID,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM temp
UNION ALL
SELECT s.rid,
REGEXP_REPLACE(
REGEXP_REPLACE(
s.sentence,
'(^|\W)' || w.word || '($|\W)',
'\1${'|| w.id ||'}\2'
),
'(^|\W)' || w.word || '($|\W)',
'\1${' || w.id || '}\2'
),
s.rn - 1
FROM sentences_with_ids s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
),
sentences_with_words ( rid, sentence, rn ) AS (
SELECT rid,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM sentences_with_ids
WHERE rn = 1
UNION ALL
SELECT s.rid,
REPLACE(
s.sentence,
'${' || w.id || '}',
'http://localhost/' || w.id || '/<u>' || w.word || '</u>'
),
s.rn - 1
FROM sentences_with_words s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
)
SELECT rid, sentence
FROM sentences_with_words
WHERE rn = 1
) src
ON ( dst.ROWID = src.RID )
WHEN MATCHED THEN
UPDATE
SET sentence = src.sentence;
Output:
ID | WORD | SENTENCE
-: | :---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | automation testing | automtestingation http://localhost/2/<u>testing</u> is popular kind of http://localhost/2/<u>testing</u>
2 | testing | http://localhost/3/<u>manual testing</u>
3 | manual testing | this is an old method of http://localhost/2/<u>testing</u>
4 | punctuation | http://localhost/1/<u>automation testing</u>,http://localhost/3/<u>manual testing</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>testing</u>-http://localhost/2/<u>testing</u>
5 | B-number analysis | http://localhost/6/<u>B-number analysis table</u>
6 | B-number analysis table | http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>
7 | Not Matched | http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u>
db<>fiddle here
Update:
Escape any special regular expression characters in the words:
MERGE INTO temp dst
USING (
WITH ordered_words ( rn, id, word, regex_safe_word ) AS (
SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
id,
word,
REGEXP_REPLACE( word, '([][)(}{|^$\.*+?])', '\\\1' )
FROM temp
),
sentences_with_ids ( rid, sentence, rn ) AS (
SELECT ROWID,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM temp
UNION ALL
SELECT s.rid,
REGEXP_REPLACE(
REGEXP_REPLACE(
s.sentence,
'(^|\W)' || w.regex_safe_word || '($|\W)',
'\1${'|| w.id ||'}\2'
),
'(^|\W)' || w.regex_safe_word || '($|\W)',
'\1${' || w.id || '}\2'
),
s.rn - 1
FROM sentences_with_ids s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
),
sentences_with_words ( rid, sentence, rn ) AS (
SELECT rid,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM sentences_with_ids
WHERE rn = 1
UNION ALL
SELECT s.rid,
REPLACE(
s.sentence,
'${' || w.id || '}',
'http://localhost/' || w.id || '/<u>' || w.word || '</u>'
),
s.rn - 1
FROM sentences_with_words s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
)
SELECT rid, sentence
FROM sentences_with_words
WHERE rn = 1
) src
ON ( dst.ROWID = src.RID )
WHEN MATCHED THEN
UPDATE
SET sentence = src.sentence;
Output:
ID | WORD | SENTENCE
-: | :---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | automation testing | automtestingation http://localhost/2/<u>testing</u> is popular kind of http://localhost/2/<u>testing</u>
2 | testing | http://localhost/3/<u>manual testing</u>
3 | manual testing | this is an old method of http://localhost/2/<u>testing</u>
4 | punctuation | http://localhost/1/<u>automation testing</u>,http://localhost/3/<u>manual testing</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>testing</u>-http://localhost/2/<u>testing</u>
5 | B-number analysis | http://localhost/6/<u>B-number analysis table</u>
6 | B-number analysis table | http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>
7 | Not Matched | http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u>
8 | ^[($ | http://localhost/2/<u>testing</u> characters http://localhost/8/<u>^[($</u> that need escaping in a regular expression
db<>fiddle here

Using REGEXP_EXTRACT extract 7 digit numbers

I have one requirement.I want to extract 7 digit numbers from one of the column and lookup with another table get another column for each 7 digit number and concatenate with "|".
Column Data: In this need to extract 7 digit numbers.
";2435034;1;5.98;;eVar36=bopis|ev2=2605,;1483528;1;17.97;;ev6=bopis|evar52=2605,;1010203;1;7.98;;ev6=bopis|ev2=2605"
Output(Extract 7 digit number):
2435034,1483528,1010203
Another table:
account name
2435034 D1
1483528 D2
1010203 D3
Final output is(after joining with another table):
account_nbr account_name
2435034|1483528|1010203 D1|D2|D3
I tried with the following command to extract 7 digit number. I was getting only first number, remaining number are not coming.
REGEXP_EXTRACT(REGEXP_REPLACE(";2435034;1;5.98;;eVar36=bopis|ev2=2605,;1483528;1;17.97;;ev6=bopis|evar52=2605,;1010203;1;7.98;;ev6=bopis|ev2=2605", r'[^\d]+', ','),r'[0-9]+')
This might be simple but not able to figure it out. Tried with GROUP_CONCAT and SPLIT function also getting following error.
Exactly one capturing group must be specified
Please let me know if you have any suggestions.
Thanks in advance.
Below is for BigQuery Standard SQL
#standardSQL
SELECT
STRING_AGG(nbr, '|' ORDER BY pos) account_nbr,
STRING_AGG(name, '|' ORDER BY pos) account_name,
data
FROM `project.dataset.yourTable` t,
UNNEST(REGEXP_EXTRACT_ALL(REGEXP_REPLACE(t.data, r'[^\d]+', ','),r'[0-9]{7}')) nbr WITH OFFSET pos
JOIN `project.dataset.anotherTable` x
ON CAST(x.account AS STRING) = nbr
GROUP BY data
You can test / play with it using dummy data from your question:
#standardSQL
WITH `project.dataset.yourTable` AS (
SELECT ";2435034;1;5.98;;eVar36=bopis|ev2=2605,;1483528;1;17.97;;ev6=bopis|evar52=2605,;1010203;1;7.98;;ev6=bopis|ev2=2605" data
), `project.dataset.anotherTable` AS (
SELECT 2435034 account, 'D1' name UNION ALL
SELECT 1483528, 'D2' UNION ALL
SELECT 1010203, 'D3'
)
SELECT
STRING_AGG(nbr, '|' ORDER BY pos) account_nbr,
STRING_AGG(name, '|' ORDER BY pos) account_name,
data
FROM `project.dataset.yourTable` t,
UNNEST(REGEXP_EXTRACT_ALL(REGEXP_REPLACE(t.data, r'[^\d]+', ','),r'[0-9]{7}')) nbr WITH OFFSET pos
JOIN `project.dataset.anotherTable` x
ON CAST(x.account AS STRING) = nbr
GROUP BY data
Update for new question in comments: records are getting filtered if t.data is null. Is there a way i can get the records even t.data is null? In my table some of the records doesnt have value for t.data
#standardSQL
WITH `project.dataset.yourTable` AS (
SELECT 1 id, ";2435031;1;5.98;;eVar36=bopis|ev2=2605,;1483528;1;17.97;;ev6=bopis|evar52=2605,;1010203;1;7.98;;ev6=bopis|ev2=2605" data UNION ALL
SELECT 2, NULL
), `project.dataset.anotherTable` AS (
SELECT 2435034 account, 'D1' name UNION ALL
SELECT 1483528, 'D2' UNION ALL
SELECT 1010203, 'D3'
)
SELECT
id,
(SELECT STRING_AGG(nbr, '|' ORDER BY pos)
FROM UNNEST(REGEXP_EXTRACT_ALL(
REGEXP_REPLACE(t.data, r'[^\d]+', ','),r'[0-9]{7}')) nbr WITH OFFSET pos
JOIN `project.dataset.anotherTable` x
ON CAST(x.account AS STRING) = nbr
) a_nbr,
(SELECT STRING_AGG(name, '|' ORDER BY pos)
FROM UNNEST(REGEXP_EXTRACT_ALL(
REGEXP_REPLACE(t.data, r'[^\d]+', ','),r'[0-9]{7}')) nbr WITH OFFSET pos
JOIN `project.dataset.anotherTable` x
ON CAST(x.account AS STRING) = nbr
) a_name,
data
FROM `project.dataset.yourTable` t
GROUP BY id, data