How to find the exact match of a string and replace in Oracle? - regex

I am trying to replace each word in a sentence with a hyperlink built from its word and id, whenever that word exists in the "word" column. The table contains the columns id, word and sentence. I got the code below from one of the helpful members here. Thank you.
https://dbfiddle.uk/?rdbms=oracle_18&fiddle=8fe103264dc650ad4bd87b20f9c6931a
-- Sample table: each row pairs a lookup word (and its id) with a sentence to rewrite.
CREATE TABLE temp (
  id       NUMBER,
  word     VARCHAR2(1000),
  sentence VARCHAR2(2000)
);
-- Seed rows; the column list spells out which value lands in which column.
INSERT INTO temp ( id, word, sentence )
  SELECT 1, 'automation testing', 'automtestingation testing is popular kind of testing' FROM DUAL
  UNION ALL
  SELECT 2, 'testing', 'manual testing' FROM DUAL
  UNION ALL
  SELECT 3, 'manual testing', 'this is an old method of testing' FROM DUAL
  UNION ALL
  SELECT 5, 'B-number analysis', 'B-number analysis table' FROM DUAL
  UNION ALL
  SELECT 6, 'B-number analysis table', 'testing B-number analysis' FROM DUAL;
-- Single-pass attempt: rewrites every temp.sentence by applying one word per
-- recursion step and inserting the link text directly.
-- Because the inserted link text itself contains the word, a word applied in a
-- later step can match inside a link inserted earlier (see the nested output
-- for id = 5 below).
MERGE INTO temp dst
USING (
-- Number the words: shortest first, ties broken by word descending.
WITH ordered_words ( rn, id, word ) AS (
SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
id,
word
FROM temp
),
-- Anchor: every sentence starts at rn = (row count + 1).
-- Each recursive step applies the word numbered rn - 1, so the highest-numbered
-- (longest) word is applied first and the rows with rn = 1 hold the final text.
sentences ( rid, sentence, rn ) AS (
SELECT ROWID,
sentence,
COUNT(*) OVER () + 1
FROM temp
UNION ALL
SELECT s.rid,
-- Outer call: re-matches the word (case-insensitively) and writes it back as w.word.
REGEXP_REPLACE(
-- Inner call: replaces the word, when delimited by non-letters or the string
-- ends, with the link text; \1 and \2 put the delimiters back.
REGEXP_REPLACE(
s.sentence,
'(^|[^a-z])' || w.word || '($|[^a-z])',
-- NOTE(review): the closing tag is written '<u>' here; probably meant '</u>' - confirm.
'\1' || 'http://localhost/'|| w.id ||'/<u>'||w.word ||'<u>' || '\2',
1,
0,
'i'
),
'(^|[^a-z])' || w.word || '($|[^a-z])',
'\1' || w.word || '\2',
1,
0,
'i'
),
s.rn - 1
FROM sentences s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
)
-- rn = 1 means every word has been applied.
SELECT rid, sentence
FROM sentences
WHERE rn = 1
) src
ON ( dst.ROWID = src.RID )
WHEN MATCHED THEN
UPDATE
SET sentence = src.sentence;
The replacement value should be https://localhost/"id"/"word". If you look at the row with id = 5 (B-number analysis), the sentence becomes https://localhost/6/localhost/5/B-number analysis table, but the value is supposed to be https://localhost/6/B-number analysis table.
Current output:
ID WORD SENTENCE
1 automation testing automtestingation http://localhost/2/<u>testing<u>
is popular kind of http://localhost/2/<u>testing<u>
2 testing http://localhost/3/<u>manual
http://localhost/2/<u>testing<u><u>
3 manual testing this is an old method of
http://localhost/2/<u>testing<u>
5 B-number analysis http://localhost/6/<u>http://localhost/5/<u>B-
number analysis<u> table<u>
6 B-number analysis table http://localhost/2/<u>testing<u>
http://localhost/5/<u>B-number analysis<u>

Do it in two steps:
First replace the strings with the ids in some wrapper that is not going to appear in your text (i.e. testing maps to ${2})
Then, once all the replacements have been done, replace the wrapped ids with the urls (i.e. ${2} maps to http://localhost/2/<u>testing</u>)
Oracle Setup:
-- Sample table: each row pairs a lookup word (and its id) with a sentence to rewrite.
CREATE TABLE temp (
  id       NUMBER,
  word     VARCHAR2(1000),
  sentence VARCHAR2(2000)
);
-- Seed rows, including punctuation-delimited words (id 4) and repeated adjacent words (id 7).
INSERT INTO temp ( id, word, sentence )
  SELECT 1, 'automation testing', 'automtestingation testing is popular kind of testing' FROM DUAL
  UNION ALL
  SELECT 2, 'testing', 'manual testing' FROM DUAL
  UNION ALL
  SELECT 3, 'manual testing', 'this is an old method of testing' FROM DUAL
  UNION ALL
  SELECT 4, 'punctuation', 'automation testing,manual testing,punctuation,automanual testing-testing' FROM DUAL
  UNION ALL
  SELECT 5, 'B-number analysis', 'B-number analysis table' FROM DUAL
  UNION ALL
  SELECT 6, 'B-number analysis table', 'testing B-number analysis' FROM DUAL
  UNION ALL
  SELECT 7, 'Not Matched', 'testing testing testing' FROM DUAL;
Merge:
-- Rewrites temp.sentence in two recursive passes:
--   1. id_pass   swaps each matched word for the placeholder ${id};
--   2. link_pass swaps each placeholder for its final link text.
-- Link text is only written in the second pass, so it is never searched for words.
MERGE INTO temp tgt
USING (
  WITH word_rank ( pos, id, word ) AS (
    -- Shortest word gets pos 1. Both passes walk pos downwards, so the
    -- highest-numbered (longest) word is handled first.
    SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
           id,
           word
    FROM   temp
  ),
  id_pass ( row_ref, txt, pos ) AS (
    -- Anchor: every sentence starts one step above the highest word position.
    SELECT ROWID,
           sentence,
           ( SELECT COUNT(*) + 1 FROM word_rank )
    FROM   temp
    UNION ALL
    -- The same replacement is applied twice because each match consumes its
    -- delimiters: of two occurrences separated by a single delimiter, the first
    -- call can only replace one and the second call picks up the other.
    SELECT cur.row_ref,
           REGEXP_REPLACE(
             REGEXP_REPLACE(
               cur.txt,
               '(^|\W)' || wr.word || '($|\W)',
               '\1${' || wr.id || '}\2'
             ),
             '(^|\W)' || wr.word || '($|\W)',
             '\1${' || wr.id || '}\2'
           ),
           cur.pos - 1
    FROM   id_pass cur
           INNER JOIN word_rank wr
             ON ( wr.pos = cur.pos - 1 )
  ),
  link_pass ( row_ref, txt, pos ) AS (
    -- Anchor: the fully tagged sentences (pos = 1 in the first pass).
    SELECT row_ref,
           txt,
           ( SELECT COUNT(*) + 1 FROM word_rank )
    FROM   id_pass
    WHERE  pos = 1
    UNION ALL
    -- Plain REPLACE: the placeholder is a literal, no regex needed.
    SELECT cur.row_ref,
           REPLACE(
             cur.txt,
             '${' || wr.id || '}',
             'http://localhost/' || wr.id || '/<u>' || wr.word || '</u>'
           ),
           cur.pos - 1
    FROM   link_pass cur
           INNER JOIN word_rank wr
             ON ( wr.pos = cur.pos - 1 )
  )
  SELECT row_ref, txt
  FROM   link_pass
  WHERE  pos = 1
) upd
ON ( tgt.ROWID = upd.row_ref )
WHEN MATCHED THEN
  UPDATE
  SET sentence = upd.txt;
Output:
ID | WORD | SENTENCE
-: | :---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | automation testing | automtestingation http://localhost/2/<u>testing</u> is popular kind of http://localhost/2/<u>testing</u>
2 | testing | http://localhost/3/<u>manual testing</u>
3 | manual testing | this is an old method of http://localhost/2/<u>testing</u>
4 | punctuation | http://localhost/1/<u>automation testing</u>,http://localhost/3/<u>manual testing</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>testing</u>-http://localhost/2/<u>testing</u>
5 | B-number analysis | http://localhost/6/<u>B-number analysis table</u>
6 | B-number analysis table | http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>
7 | Not Matched | http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u>
db<>fiddle here
Update:
Escape any special regular expression characters in the words:
-- Two-pass rewrite of temp.sentence: words are first swapped for ${id}
-- placeholders, then the placeholders are swapped for the link text.
-- Each word is regex-escaped before it is concatenated into a pattern.
MERGE INTO temp dst
USING (
-- Number the words (shortest first, ties by word descending) and build an
-- escaped copy for use inside regular expressions.
WITH ordered_words ( rn, id, word, regex_safe_word ) AS (
SELECT ROW_NUMBER() OVER ( ORDER BY LENGTH( word ) ASC, word DESC ),
id,
word,
-- Prefix each regex metacharacter in the bracket list with a backslash.
REGEXP_REPLACE( word, '([][)(}{|^$\.*+?])', '\\\1' )
FROM temp
),
-- Pass 1. Anchor: every sentence at rn = (word count + 1).
-- Each step applies the word numbered rn - 1, so the highest-numbered (longest)
-- word goes first; rows with rn = 1 have had every word applied.
sentences_with_ids ( rid, sentence, rn ) AS (
SELECT ROWID,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM temp
UNION ALL
SELECT s.rid,
-- The same replacement runs twice: a match consumes its delimiters, so of two
-- occurrences separated by a single delimiter only one is replaced per call.
REGEXP_REPLACE(
REGEXP_REPLACE(
s.sentence,
'(^|\W)' || w.regex_safe_word || '($|\W)',
'\1${'|| w.id ||'}\2'
),
'(^|\W)' || w.regex_safe_word || '($|\W)',
'\1${' || w.id || '}\2'
),
s.rn - 1
FROM sentences_with_ids s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
),
-- Pass 2. Starts from the fully tagged sentences and replaces one placeholder
-- per step with the link text, using the unescaped word.
sentences_with_words ( rid, sentence, rn ) AS (
SELECT rid,
sentence,
( SELECT COUNT(*) + 1 FROM ordered_words )
FROM sentences_with_ids
WHERE rn = 1
UNION ALL
SELECT s.rid,
REPLACE(
s.sentence,
'${' || w.id || '}',
'http://localhost/' || w.id || '/<u>' || w.word || '</u>'
),
s.rn - 1
FROM sentences_with_words s
INNER JOIN ordered_words w
ON ( s.rn - 1 = w.rn )
)
-- rn = 1 means every placeholder has been processed.
SELECT rid, sentence
FROM sentences_with_words
WHERE rn = 1
) src
ON ( dst.ROWID = src.RID )
WHEN MATCHED THEN
UPDATE
SET sentence = src.sentence;
Output:
ID | WORD | SENTENCE
-: | :---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | automation testing | automtestingation http://localhost/2/<u>testing</u> is popular kind of http://localhost/2/<u>testing</u>
2 | testing | http://localhost/3/<u>manual testing</u>
3 | manual testing | this is an old method of http://localhost/2/<u>testing</u>
4 | punctuation | http://localhost/1/<u>automation testing</u>,http://localhost/3/<u>manual testing</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>testing</u>-http://localhost/2/<u>testing</u>
5 | B-number analysis | http://localhost/6/<u>B-number analysis table</u>
6 | B-number analysis table | http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>
7 | Not Matched | http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u> http://localhost/2/<u>testing</u>
8 | ^[($ | http://localhost/2/<u>testing</u> characters http://localhost/8/<u>^[($</u> that need escaping in a regular expression
db<>fiddle here

Related

SQL for nested WITH CLAUSE - RESULTS OFFSET in Oracle 19c

Please suggest a way to implement nesting of (temp - results - select) as shown below?
I see that oracle 19c does not allow nesting of WITH clause.
with temp2 as
(
with temp1 as
(
__
__
),
results(..fields..) as
(
select ..<calc part>.. from temp1, results where __
)
select ..<calc part>.. from temp1 join results where __
),
results(..fields..) as
(
select ..<calc part>.. from temp2, results where __
)
select ..<calc part>.. from temp2 join results where __
For instance:
DB Fiddle
I need to calculate CALC3 recursively, in the same way that CALC is calculated.
-- Sample data: one VALUE per fruit NAME and day DT.
CREATE TABLE TEST ( DT DATE, NAME VARCHAR2(10), VALUE NUMBER(10,3));
-- ANSI DATE literals are used instead of to_date() without a format model,
-- which depends on the session's NLS_DATE_FORMAT / NLS_DATE_LANGUAGE.
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-01', 'apple', 198.95 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-02', 'apple', 6.15 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-03', 'apple', 4.65 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-06', 'apple', 20.85 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-01', 'banana', 80.5 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-02', 'banana', 9.5 );
INSERT INTO TEST ( DT, NAME, VALUE ) VALUES ( DATE '2021-01-03', 'banana', 31.65 );
-- Existing working code: per name, CALC is a recursive weighted average
-- (4 parts previous CALC, 1 part current VALUE); CALC2 is derived from CALC.
WITH numbered AS (
  -- Number each name's rows in date order so row n can find row n - 1.
  SELECT dt,
         name,
         value,
         ROW_NUMBER() OVER ( PARTITION BY name ORDER BY dt ) AS seq
  FROM   test
),
running ( name, dt, value, calc, seq ) AS (
  -- First row of each name: there is no previous CALC.
  SELECT name,
         dt,
         value,
         value / 5,
         seq
  FROM   numbered
  WHERE  seq = 1
  UNION ALL
  -- Each later row combines the previous row's CALC with its own VALUE.
  SELECT cur.name,
         cur.dt,
         cur.value,
         ( 4 * prev.calc + cur.value ) / 5,
         cur.seq
  FROM   numbered cur
         INNER JOIN running prev
           ON (     prev.seq  = cur.seq - 1
                AND prev.name = cur.name )
)
SELECT r.name,
       r.dt,
       r.value,
       r.calc,
       r.seq,
       r.calc * 3 AS calc2 -- stand-in for some more complex logic
FROM   running r
ORDER BY r.name, r.seq;
Desired output:
CALC3 - grouped by name and dt -
((CALC3 of prev day record * 4) + CALC2 of current record )/ 5
i.e for APPLE
for 1-jan-21, CALC3 = ((0*4)+119.37)/5 = 23.87 -------> since it is the 1st record, 0 is taken as the CALC3 of the previous day's record
for 2-jan-21, CALC3 = ((23.87*4)+99.19)/5 = 38.94 -----> previous CALC3 is taken from 1-jan-21 (23.87) and CALC2 (99.19) from the current row
for 3-jan-21, CALC3 = ((38.94*4)+82.14)/5 = 47.58 and so on
For BANANA
1-jan-21, CALC3 = ((0*4)+48.30)/5 = 9.66
2-jan-21, CALC3 = ((9.66*4)+44.34)/5 = 16.60
etc
You do not need to, you can just do it all in one level:
with temp1(...fields...) as
(
__
__
__
),
results1(...fields...) as
(
select ...<calc part>... from temp1 where __
),
temp2( ...fields...) as
(
select ...<calc part>... from temp1 join results1 where __
),
results2(...fields...) as
(
select ...<calc part>... from temp2 where __
)
select ...<calc part>... from temp2 join results2 where __
For your actual problem, you can use a MODEL clause:
-- Computes CALC, CALC2 and CALC3 per name with a MODEL clause: rows are
-- addressed by their sequence number within the name, so each rule can read
-- the previous row's result.
SELECT dt,
       name,
       amount,
       calc,
       seq,
       calc2,
       calc3
FROM (
  -- The TEST table stores the figure in VALUE; expose it as AMOUNT, the name
  -- the MODEL measures and the outer select list use.
  SELECT t.dt,
         t.name,
         t.value AS amount,
         ROW_NUMBER() OVER (PARTITION BY name ORDER BY dt) AS seq
  FROM test t
)
MODEL
  PARTITION BY (name)
  DIMENSION BY (seq)
  -- calc, calc2 and calc3 start at 0 and are filled in by the rules.
  MEASURES ( dt, amount, 0 AS calc, 0 AS calc2, 0 as calc3)
  RULES (
    -- First row seeds the recursion; later rows use the previous row's value.
    calc[1] = amount[1]/5,
    calc[seq>1] = (amount[cv(seq)] + 4*calc[cv(seq)-1])/5,
    calc2[seq] = 3*calc[cv(seq)],
    calc3[1] = calc2[1]/5,
    calc3[seq>1] = (calc2[cv(seq)] + 4*calc3[cv(seq)-1])/5
  )
Which outputs:
DT
NAME
AMOUNT
CALC
SEQ
CALC2
CALC3
01-JAN-21
banana
80.5
16.1
1
48.3
9.66
02-JAN-21
banana
9.5
14.78
2
44.34
16.596
03-JAN-21
banana
31.65
18.154
3
54.462
24.1692
01-JAN-21
apple
198.95
39.79
1
119.37
23.874
02-JAN-21
apple
6.15
33.062
2
99.186
38.9364
03-JAN-21
apple
4.65
27.3796
3
82.1388
47.57688
06-JAN-21
apple
20.85
26.07368
4
78.22104
53.705712
db<>fiddle here

rewrite redshift query as athena

I am trying to convert this redshift query to athena.
-- Redshift: returns every sales row together with the number of distinct
-- calendar months in which that row's customer has at least one invoice.
select
a.customerid,
a.country,
a.stockcode,
a.description,
a.invoicedate,
a.sales_amt,
(b.nbr_months_active) as nbr_months_active
from
ecommerce_sales_data a
inner join (
-- One row per customer: count of distinct year-month keys.
select
customerid,
count(
distinct(
-- Key is the year part, '-', then the month part left-padded to width 2.
DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(
DATE_PART(mon, cast(invoicedate as date)),
2,
'00'
)
)
) as nbr_months_active
from
ecommerce_sales_data
group by
-- Positional reference: 1 is customerid, the first select-list column.
1
) b on a.customerid = b.customerid
This is what I have tried. It returns the results. But I am not sure if the results will match with redshift query in all cases.
-- Athena: inline sample rows, then the same report as the Redshift query -
-- each sales row plus the number of distinct invoice months of its customer.
WITH students_results (InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country) AS (
    VALUES
        ('536365','85123A','WHITE HANGING HEART T-LIGHT HOLDER','6','12/1/2010 8:26','2.55','17850','United Kingdom'),
        ('536365','71053','WHITE METAL LANTERN','6','12/1/2010 8:26','3.39','17850','United Kingdom'),
        ('536365','84406B','CREAM CUPID HEARTS COAT HANGER','8','12/1/2010 8:26','2.75','17850','United Kingdom')
),
-- One row per customer: how many distinct 'YYYY-MM' keys its invoices fall into.
months_per_customer AS (
    SELECT
        customerid,
        COUNT(DISTINCT date_format(date_parse(invoicedate, '%m/%d/%Y %k:%i'), '%Y-%m')) AS nbr_months_active
    FROM students_results
    GROUP BY customerid
)
SELECT
    s.customerid,
    s.country,
    s.stockcode,
    s.description,
    s.invoicedate,
    -- The sample columns are text, so both factors are cast before multiplying.
    CAST(s.quantity AS decimal(11,2)) * CAST(s.unitprice AS decimal(11,2)) AS sales_amt,
    m.nbr_months_active AS nbr_months_active
FROM students_results AS s
INNER JOIN months_per_customer AS m
    ON s.customerid = m.customerid
The source of Redshift query is here:
https://aws.amazon.com/blogs/machine-learning/build-multi-class-classification-models-with-amazon-redshift-ml/

finding the row with earliest date for each customerID who purchased specific product and return the date in new column

I'm working with PowerBI and have the following table:
customer_id|item_id| date
1 | A | 01/01/01
1 | B | 01/01/01
1 | A | 02/02/02
1 | A | 03/03/03
2 | A | 03/03/03
2 | C | 03/03/03
...
I would like to find the earliest date for each customer_id who purchased item A and return 1 in a new column. So that I get a new column in the table that looks like the following:
customer_id | item_id | date | Column_want
1 | A | 01/01/01 | 1
1 | B | 01/01/01 | blank
1 | A | 02/02/02 | blank
1 | A | 03/03/03 | blank
2 | A | 03/03/03 | 1
2 | C | 03/03/03 | blank
...
I've tried to filter the column by item A and then using TOPN(1,...) to choose only the top rows. However, it doesn't seem to work.
This seems like such a trivial request. Is there any smarter way around this?
It's possible to use TOPN for this but that function returns an entire row of a table so it looks pretty clunky like this:
Column_want =
// Returns 1 when the current row is item "A" and its date equals the earliest
// item-"A" date of the same customer; IF has no else branch, so other rows are blank.
IF (
Table1[item_id] = "A" && Table1[date]
// The one-row, one-column table built below is compared as a single date value.
= SELECTCOLUMNS (
// Keep the single row with the smallest date (ASC)...
TOPN (
1,
// ...among the item "A" rows of the current row's customer.
FILTER (
Table1,
Table1[item_id] = "A"
&& Table1[customer_id] = EARLIER ( Table1[customer_id] )
),
Table1[date], ASC
),
"date", Table1[date]
),
1
)
I'd suggest something more like this:
Column_Want =
// Returns 1 when the current row's date equals the MIN date computed by
// CALCULATE under the filter below; no else branch, so other rows are blank.
// NOTE(review): the IF itself does not test the current row's item_id - confirm
// that rows of other items sharing that date come out blank as intended.
IF (
Table1[date]
= CALCULATE (
MIN ( Table1[date] ),
// Item "A" rows taken from ALLEXCEPT over customer_id and item_id.
FILTER (
ALLEXCEPT ( Table1, Table1[customer_id], Table1[item_id] ),
Table1[item_id] = "A"
)
),
1
)
Or this:
Column_Want =
// Returns 1 when the current row's date equals the smallest date returned by
// MINX over the filtered rows; no else branch, so other rows are blank.
IF (
Table1[date]
= MINX (
FILTER (
Table1,
// EARLIER reads the outer (current) row: the filter is empty unless the current
// row is item "A", and otherwise keeps the rows of the current row's customer.
// NOTE(review): the inner rows are not restricted to item "A" - confirm the result
// when the customer bought another item on an earlier date.
EARLIER ( Table1[item_id] ) = "A"
&& Table1[customer_id] = EARLIER ( Table1[customer_id] )
),
Table1[date]
),
1
)
You could create a calculated column using variables:
Column_want =
// Flags the row(s) holding a customer's earliest purchase of item "A" with 1;
// every other row stays blank.
VAR Customer_id = 'Table'[customer_id]
// Earliest date among this customer's item "A" rows only, so an earlier
// purchase of a different item can neither shift the date nor take the flag.
VAR Earliest_date =
    MINX (
        FILTER (
            'Table',
            'Table'[customer_id] = Customer_id
                && 'Table'[item_id] = "A"
        ),
        'Table'[date]
    )
RETURN
    // The row's own item is checked too: other items bought on that date stay blank.
    IF ( 'Table'[item_id] = "A" && 'Table'[date] = Earliest_date, 1, BLANK () )
The idea is to calculate the earliest date on which a particular Customer ID purchased item A, using MINX over that customer's item A rows (Earliest_date variable). The row's own item_id is checked as well, so that other items the customer bought on that same date are not tagged as 1. Hope this helps.

Replacing everything but a specific pattern in a string (Oracle)

I want to replace everything in a string with '' except for a given pattern using Oracle's regexp_replace.
In my case the pattern refers to German licence plates. The pattern is contained in the usage column (verwendungszweck_bez) of a revenue table (of a bank). The pattern can be matched by ([a-z]{1,3})[- ]([a-z]{1,2}) ?([0-9]{1,4}). Now I'd like to reverse the matching pattern in order to match everything except for the pattern.
The usage column looks like this:
ALLIANZ VERSICHERUNGS-AG VERTRAG AS-9028000568 KFZ-VERSICHERUNG KFZ-VERS. XX-Y 427 01.01.19 - 31.12.19
XX-Y 427 would be the pattern I'm interested in. The string can contain more than one license plate:
AXA VERSICHERUNG AG 40301089910 KFZ HAFTPFLICHT ABC-RM10 37,35 + 40330601383 KFZ HAFTPFLIVHT ABC-LX 283 21,19
In this case I need ABC-RM10 and ABC-LX 283.
So far I just replace everything from the string with regexp_replace:
regexp_replace(lower(a.verwendungszweck_bez),'^(.*?)kfz','')
because there's always 'kfz' in the string and the licence plate information follows (not necessarily direct) after that.
-- Extracts the first licence plate that follows 'kfz' and normalises it to the
-- form XXX-YY 1234 in upper case.
upper(
  regexp_replace(
    regexp_substr(
      -- Strip everything up to and including the first 'kfz'. The pattern must
      -- have balanced parentheses: '^(.*?)kfz', not '(^(.*?)kfz'.
      regexp_replace(lower(a.verwendungszweck_bez), '^(.*?)kfz', ''),
      '([a-z]{1,3})[- ]([a-z]{1,2}) ?([0-9]{1,4})',
      1,
      1
    ),
    -- Re-emit the three captured parts with fixed separators.
    '([a-z]{1,3})[- ]([a-z]{1,2}) ?([0-9]{1,4})',
    '\1-\2 \3'
  )
)
This works but I'm sure there's a better solution.
The result should be a list of customers, licence plates and count of cars like this:
Customer|licence plates |count
1234567 |XX-Y 427| 1
1255599 |ABC-RM 10 + ABC-LX 283| 2
You can use a recursive sub-query to find the items. Also, you can use UPPER and TRANSLATE to normalise the data to remove the optional separators in the number plates and convert it into a single case:
Test Data:
-- Sample usage texts, one per row, covering the matching edge cases.
CREATE TABLE test_data ( value ) AS
  SELECT 'ALLIANZ VERSICHERUNGS-AG VERTRAG AS-9028000568 KFZ-VERSICHERUNG KFZ-VERS. XX-Y 427 01.01.19 - 31.12.19' FROM DUAL
  UNION ALL
  -- Edge case: 'UNG AG 4030' inside this text must not be taken for a plate.
  SELECT 'AXA VERSICHERUNG AG 40301089910 KFZ HAFTPFLICHT ABC-RM10 37,35 + 40330601383 KFZ HAFTPFLIVHT ABC-LX 283 21,19' FROM DUAL
  UNION ALL
  -- Edge case: several plates directly adjacent to each other.
  SELECT 'AA-A1BB-BB222CC C3333' FROM DUAL
  UNION ALL
  -- Edge case: the same plate written with different separators and letter case.
  SELECT 'AA-A1 AA-A 1 aa a1' FROM DUAL
Query:
-- Extracts every licence plate from test_data.value, one plate per recursion
-- step, and counts how often each normalised plate occurs over all rows.
WITH items ( value, item, next_pos ) AS (
-- Anchor: first plate in each value, searched from position 1.
SELECT value,
-- item: sub-expression 2 of the match (the plate without its delimiters),
-- upper-cased; TRANSLATE then removes spaces and hyphens.
TRANSLATE( UPPER( REGEXP_SUBSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', 1, 1, 'i', 2 ) ), '_ -', '_' ),
-- next_pos: position just after the plate (return option 1) minus 1, i.e. the
-- plate's last character, so the next search's leading delimiter can match it.
-- With no match REGEXP_INSTR returns 0 and next_pos becomes -1.
REGEXP_INSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', 1, 1, 1, 'i', 2 ) - 1
FROM test_data
UNION ALL
-- Recursive step: same search, starting at the previous next_pos.
SELECT value,
TRANSLATE( UPPER( REGEXP_SUBSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', next_pos, 1, 'i', 2 ) ), '_ -', '_' ),
REGEXP_INSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', next_pos, 1, 1, 'i', 2 ) - 1
FROM items
-- Stop once the previous step found nothing.
WHERE next_pos > 0
)
SELECT item,
COUNT(*)
FROM items
-- Drop the terminating "no match" row of each value.
WHERE item IS NOT NULL AND NEXT_POS > 0
GROUP BY item
Output:
ITEM | COUNT(*)
:------- | -------:
CCC3333 | 1
AAA1 | 4
XXY427 | 1
ABCRM10 | 1
ABCLX283 | 1
BBBB222 | 1
db<>fiddle here
The result should be a list of customers ...
You haven't given any information on how customers relate to this; that part is left as an exercise to the reader (who hopefully has the client values somewhere and can correlate them to the input).
Update:
If you want the count of unique number plates per row then:
-- Extracts every licence plate from test_data.value, one plate per recursion
-- step, and reports per source row the distinct normalised plates and their count.
WITH items ( rid, value, item, next_pos ) AS (
-- Anchor: first plate in each value; rid carries the source row through the recursion.
SELECT ROWID,
value,
-- item: sub-expression 2 of the match (the plate without its delimiters),
-- upper-cased; TRANSLATE then removes spaces and hyphens.
TRANSLATE( UPPER( REGEXP_SUBSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', 1, 1, 'i', 2 ) ), '_ -', '_' ),
-- next_pos: position just after the plate (return option 1) minus 1, i.e. the
-- plate's last character, so the next search's leading delimiter can match it.
-- With no match REGEXP_INSTR returns 0 and next_pos becomes -1.
REGEXP_INSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', 1, 1, 1, 'i', 2 ) - 1
FROM test_data
UNION ALL
-- Recursive step: same search, starting at the previous next_pos.
SELECT rid,
value,
TRANSLATE( UPPER( REGEXP_SUBSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', next_pos, 1, 'i', 2 ) ), '_ -', '_' ),
REGEXP_INSTR( value, '([^a-z]|^)([a-z]{1,3}[- ][a-z]{1,2} ?\d{1,4})(\D|$)', next_pos, 1, 1, 'i', 2 ) - 1
FROM items
-- Stop once the previous step found nothing.
WHERE next_pos > 0
)
-- One output row per source row: plates joined with ' + ' in alphabetical order.
SELECT LISTAGG( item, ' + ' ) WITHIN GROUP ( ORDER BY item ) AS items,
COUNT(*)
FROM (
-- DISTINCT collapses a plate that appears several times in the same row.
SELECT DISTINCT
rid,
item
FROM items
-- Drop the terminating "no match" row of each value.
WHERE item IS NOT NULL AND NEXT_POS > 0
)
GROUP BY rid;
Which outputs:
ITEMS | COUNT(*)
:----------------------- | -------:
XXY427 | 1
ABCLX283 + ABCRM10 | 2
AAA1 + BBBB222 + CCC3333 | 3
AAA1 | 1
db<>fiddle here

Regular expression to remove duplicates from comma separated string

I have following string:
'C,2,1,2,3,1'
I need a regular expression to remove duplicates and the result string should be like this:
'C,2,1,3'
If your input data is more than one string, I assume there is some kind of id column you can use to distinguish the strings from each other. If no such column exists, it can be created in the first factored subquery, for example by using rownum.
-- De-duplicates each comma-separated string, keeping the first occurrence of
-- every token and the original token order.
with
  -- sample strings, each with an id to tell them apart
  src ( id, str ) as (
    select 1, 'C,2,1,2,3,1' from dual
    union all
    select 2, 'A,ZZ,3,A,3,ZZ' from dual
  ),
  -- one row per token; pos is the token's position within its string
  tokens ( id, str, pos, token ) as (
    select id,
           str,
           level,
           regexp_substr(str, '[^,]+', 1, level)
    from   src
    connect by level <= regexp_count(str, ',') + 1
           -- keep each hierarchy inside its own string...
           and prior id = id
           -- ...and stop the self-join on id from being reported as a cycle
           and prior sys_guid() is not null
  ),
  -- occurrence = 1 marks the first time a token appears in its string
  first_seen ( id, str, pos, token, occurrence ) as (
    select id,
           str,
           pos,
           token,
           row_number() over (partition by id, token order by pos)
    from   tokens
  )
select id,
       str,
       listagg(token, ',') within group (order by pos) as new_str
from   first_seen
where  occurrence = 1
group by id, str
order by id
;
ID STR NEW_STR
---- ------------------ --------------------
1 C,2,1,2,3,1 C,2,1,3
2 A,ZZ,3,A,3,ZZ A,ZZ,3
Try this:
-- De-duplicates one comma-separated string, keeping the first occurrence of
-- every item and the original item order.
with
  -- your input data
  t_in as (select 'C,2,1,2,3,1' as s from dual),
  -- your string split into a table, one row per list item;
  -- n is the item's position in the list
  t_split as (
    select regexp_substr(s, '(\w+)(,|$)', 1, level, 'c', 1) s,
           level n
    from   t_in
    -- exactly one row per item: a bound of count + 1 would add a trailing NULL item
    connect by level <= regexp_count(s, '(\w+)(,|$)')
  ),
  -- this table grouped to obtain distinct values with
  -- minimum positions for sorting
  t_grouped as (
    select s, min(n) n from t_split group by s
  )
select listagg(s, ',') within group (order by n)
from   t_grouped;
Depending on your Oracle version you might have to replace listagg with wm_concat (it's googlable)
Here is another, shorter solution:
-- De-duplicates one comma-separated string: the inner query splits it into
-- numbered tokens, the outer query keeps one row per token value and joins
-- the values ordered by the smallest id at which each was seen.
select listagg(val, ',') within group (order by min(id))
from (
  select rownum as id,
         trim(regexp_substr(csv, '[^,]+', 1, level)) as val
  from (
    select 'C,2,1,2,3,1' as csv
    from   dual
  )
  -- keep generating rows until there is no further token
  connect by regexp_substr(csv, '[^,]+', 1, level) is not null
)
group by val;