Find out if a string contains only ASCII characters - regex

I need to know whether a string contains only ASCII characters. So far I use this REGEX:
-- Prints 'Pure ASCII' when every character of str lies in the bracket range
-- that runs from the space character to the tilde; the ^ and $ anchors together
-- with "+" require the whole, non-empty string to match.
DECLARE
str VARCHAR2(100) := 'xyz';
BEGIN
IF REGEXP_LIKE(str, '^[ -~]+$') THEN
DBMS_OUTPUT.PUT_LINE('Pure ASCII');
END IF;
END;
/
Pure ASCII
' ' (space) and ~ (tilde) are the first and the last printable character in ASCII, respectively.
Problem is, this REGEXP_LIKE fails on certain NLS-Settings:
-- Reproduction: the identical range check run after switching the session to
-- NLS_SORT = 'GERMAN'. With that setting the range [ -~] is rejected and the
-- block fails with ORA-12728 instead of printing 'Pure ASCII'.
ALTER SESSION SET NLS_SORT = 'GERMAN';
DECLARE
str VARCHAR2(100) := 'xyz';
BEGIN
IF REGEXP_LIKE(str, '^[ -~]+$') THEN
DBMS_OUTPUT.PUT_LINE('Pure ASCII');
END IF;
END;
/
ORA-12728: invalid range in regular expression
ORA-06512: at line 4
Does anybody know a solution that works independently of the current user's NLS settings? Is this behavior intentional, or should it be considered a bug?

You can use TRANSLATE to do this. Basically, translate away all the ASCII printable characters (there aren't that many of them) and see what you have left.
Here is a query that does it:
-- Classify each sample string as printable ASCII only ('Yes') or not ('No').
-- TRANSLATE removes every character of its second argument that has no
-- counterpart in the third argument. CHR(0) is mapped to itself only so that
-- the third argument is not NULL (a NULL argument would make TRANSLATE return
-- NULL for every input). Whatever is left over is therefore not one of the 95
-- printable ASCII characters, space (32) through tilde (126).
-- The list must contain '@' between '?' and 'A'; with '#' in that position
-- every string containing '@' would be reported as non-ASCII.
WITH input (p_string_to_test) AS (
    SELECT 'This this string' FROM DUAL UNION ALL
    SELECT 'Test this ' || CHR(7) || ' string too!' FROM DUAL UNION ALL
    SELECT 'xxx' FROM DUAL
)
SELECT p_string_to_test,
       CASE
           WHEN TRANSLATE(p_string_to_test,
                          CHR(0) || q'[ !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~]',
                          CHR(0)) IS NULL
           THEN 'Yes'
           ELSE 'No'
       END AS is_ascii
FROM input;
+-------------------------+----------+
| P_STRING_TO_TEST | IS_ASCII |
+-------------------------+----------+
| This this string | Yes |
| Test this string too! | No |
| xxx | Yes |
+-------------------------+----------+

ASCII function with upper limit of 127 may be used :
-- Walks through the sample text one character at a time, keeps the highest
-- ASCII() value seen so far and reports pure ASCII only when that maximum
-- stays below 128.
declare
  l_text     nvarchar2(100) := '\xyz~*-=)(/&%+$#£>|"éß';
  l_char     nvarchar2(1);
  l_max_code number := 0;
begin
  for l_pos in 1 .. length(l_text)
  loop
    l_char     := substrc(l_text, l_pos, 1);
    l_max_code := greatest(ascii(l_char), l_max_code);
  end loop;

  if l_max_code < 128 then
    dbms_output.put_line('String is composed of Pure ASCII characters');
  else
    dbms_output.put_line('String has non-ASCII characters');
  end if;
end;

I think I will go for one of these two
-- Option 1: convert str to the US7ASCII character set and compare the result
-- with the original; the message is printed only when both are equal.
IF CONVERT(str, 'US7ASCII') = str THEN
DBMS_OUTPUT.PUT_LINE('Pure ASCII');
END IF;
-- Option 2: compare the ASCIISTR rendering with the original. Backslashes are
-- replaced by '/' on both sides first - presumably because ASCIISTR escapes
-- the backslash itself; TODO confirm against the ASCIISTR documentation.
IF ASCIISTR(REPLACE(str, '\', '/')) = REPLACE(str, '\', '/') THEN
DBMS_OUTPUT.PUT_LINE('Pure ASCII');
END IF;

Related

PL/SQL. Parse clob UTF8 chars with regexp_like regular expressions

I want to check if any line of my clob have strange characters like (ñ§). These characters are read from a csv-file with an unexpected encoding (UTF-8) which converts some of them.
I tried to filter each line using a regular expression but it's not working as intended. Is there a way to know the encoding of a csv-file when read?
How could I fix the regular expression to allow lines with only these characters? a-zA-Z 0-9 .,;:"'()-_& space tab.
Clob example read from csv:
l_clob clob :='
"exp","objc","objc","OBR","031110-5","S","EXAMPLE","NAME","08/03/2018",,"122","3","12,45"
"xp","objc","obj","OBR","031300-5","S","EXAMPLE","NAME","08/03/2018",,"0","0","0"
';
Another clob:
-- Splits the CLOB into lines at each line feed, tests every line against
-- csvregexp (case-insensitively) and prints 'Ok' or 'Error' followed by the
-- line itself.
DECLARE
   l_clob           CLOB
      := '"exp","objc","objc","OBR","031110-5","S","EXAMPLE","NAME","08/03/2018",,"122","3","12,45"
"xp","objc","obj","OBR","031300-5","S","EXAMPLE","NAME","08/03/2018",,"0","0","0"';
   csvregexp        CONSTANT VARCHAR2 (1000)
      := '^([''"]+[-&\s(a-z0-9)]*[''"]+[,:;\t\s]?)?[''"]+[-&\s(a-z0-9)]*[''"]+' ;
   l_total_length   PLS_INTEGER := LENGTH (l_clob);
   l_offset         PLS_INTEGER := 1;     -- start of the current line
   l_eol            PLS_INTEGER;          -- position of the line feed ending it
   l_line           VARCHAR2 (32767);
   l_verdict        VARCHAR2 (5);
BEGIN
   WHILE l_offset <= l_total_length
   LOOP
      l_eol := INSTR (l_clob, CHR (10), l_offset);

      -- No further line feed: the last line runs to the end of the CLOB.
      IF l_eol = 0
      THEN
         l_eol := l_total_length + 1;
      END IF;

      l_line := SUBSTR (l_clob, l_offset, l_eol - l_offset);

      -- 'i' makes the match case-insensitive.
      l_verdict :=
         CASE WHEN REGEXP_LIKE (l_line, csvregexp, 'i') THEN 'Ok' ELSE 'Error' END;

      DBMS_OUTPUT.put_line (l_verdict);
      DBMS_OUTPUT.put_line (l_line);

      -- Continue right after the line feed.
      l_offset := l_eol + 1;
   END LOOP;
END;
If you only want to allow a specific set of characters, you can use this regex:
Your Regex
-- Whole-line whitelist: letters, digits, space, tab and . , ; : " ' ( ) _ & -
-- The hyphen is placed last in the bracket expression so that it is a literal;
-- written as ")-_" it forms a range from ")" to "_" and silently admits
-- * + / < = > ? @ [ \ ] ^ as well. CHR(9) adds the tab character.
-- NOTE(review): "/" is not part of the requested set, so values such as
-- 08/03/2018 are rejected - add it to the list if dates must pass.
csvregexp CONSTANT VARCHAR2 (1000) := '^[a-zA-Z0-9 .,;:"''()_&' || CHR (9) || '-]+$' ;
Regex-Details
^ Start of your string - no chars before this - prevents partial match
[] a set of allowed chars
[]+ a set of allowed chars. Has to be one char minimum up to inf. (* instead of + would mean 0-inf.)
[a-zA-Z]+ 1 to inf. letters
[a-zA-Z0-9]+ 1 to inf. letters and numbers
$ end of your string - no chars behind this - prevents partial match
I think you can work it out with this ;-)
If you know that your input could be in another encoding, you could try to convert it and check it against the regex again.
Example-convert
select convert('täst','us7ascii', 'utf8') from dual;

Regex: How to Implement Negative Lookbehind in PL/SQL

How do I match all the strings that begin with lookup. and end with _id but are not prefixed by msg? Here below are some examples:
lookup.asset_id -> should match
lookup.msg_id -> shouldn't match
lookup.whateverelse_id -> should match
I know Oracle does not support negative lookbehind (i.e. (?<!))... so I've tried to explicitly enumerate the possibilities using alternation:
-- Case 1: the name before _id is "asset", so a non-zero count is expected.
if regexp_count('i_asset := lookup.asset_id;', 'lookup\.[^\(]+([^m]|m[^s]|ms[^g])_id') <> 0 then
dbms_output.put_line('match'); -- this matches as expected
end if;
-- Case 2: the name before _id is "msg"; the intent is a count of zero.
if regexp_count('i_msg := lookup.msg_id;', 'lookup\.[^\(]+([^m]|m[^s]|ms[^g])_id') <> 0 then
dbms_output.put_line('match'); -- this shouldn’t match
-- but it does like the previous example... why?
end if;
The second regexp_count expression shouldn't match... but it does, just like the first one. Am I missing something?
EDIT
In the real use case, I have a string that contains PL/SQL code that might contain more than one lookup.xxx_id instance:
declare
l_source_code varchar2(2048) := '
...
curry := lookup.curry_id(key_val => ''CHF'', key_type => ''asset_iso'');
asset : = lookup.asset_id(key_val => ''UBSN''); -- this is wrong since it does
-- not specify key_type
...
msg := lookup.msg_id(key_val => ''hello''); -- this is fine since msg_id does
-- not require key_type
';
...
end;
I need to determine whether there is at least one wrong lookup, i.e. all occurrences, except lookup.msg_id, must also specify the key_type parameter.
With lookup\.[^\(]+([^m]|m[^s]|ms[^g])_id, you are basically asking to check for a string
starting with lookup. denoted by lookup\.,
followed by at least one character different from ( denoted by [^\(]+,
followed by either -- ( | | )
one character different from m -- [^m], or
two characters: m plus no s -- m[^s], or
three characters: ms and no g -- ms[^g], and
ending in _id denoted by _id.
So, for lookup.msg_id, the first part matches obviously, the second consumes ms, and leaves the g for the first alternative of the third.
This could be fixed by patching up the third part so that every alternative is always three characters long, as in lookup\.[^\(]+([^m]..|m[^s].|ms[^g])_id. This, however, would fail on everything where the part between lookup. and _id is not at least four characters long:
-- Runs the "last three characters before _id are not msg" pattern against
-- sample names and shows, per row, whether the name contains lookup.msg_id and
-- how often the pattern matched.
-- Every alternative of the group consumes exactly three characters
-- ([^m].. / m[^s]. / ms[^g]); a two-character m[^s] would wrongly reject names
-- such as lookup.xmap_id.
WITH
    input (s, r) AS (
        SELECT 'lookup.asset_id', 'should match' FROM DUAL UNION ALL
        SELECT 'lookup.msg_id', 'shouldn''t match' FROM DUAL UNION ALL
        SELECT 'lookup.whateverelse_id', 'should match' FROM DUAL UNION ALL
        SELECT 'lookup.a_id', 'should match' FROM DUAL UNION ALL
        SELECT 'lookup.ab_id', 'should match' FROM DUAL UNION ALL
        SELECT 'lookup.abc_id', 'should match' FROM DUAL
    )
SELECT
    r,
    s,
    INSTR(s, 'lookup.msg_id') AS has_msg,
    REGEXP_COUNT(s, 'lookup\.[^\(]+([^m]..|m[^s].|ms[^g])_id') AS matched
FROM input
;
| R | S | HAS_MSG | MATCHED |
|-----------------|------------------------|---------|---------|
| should match | lookup.asset_id | 0 | 1 |
| shouldn't match | lookup.msg_id | 1 | 0 |
| should match | lookup.whateverelse_id | 0 | 1 |
| should match | lookup.a_id | 0 | 0 |
| should match | lookup.ab_id | 0 | 0 |
| should match | lookup.abc_id | 0 | 0 |
If you have just to make sure, there is no msg in the position in question, you might want to go for
(INSTR(s, 'lookup.msg_id') = 0) AND REGEXP_COUNT(s, 'lookup\.[^\(]+_id') <> 0
For code clarity REGEXP_INSTR(s, 'lookup\.[^\(]+_id') > 0 might be preferable…
@j3d Just comment if further detail is required.
With the requirements still being kind of vague…
Split the string at the semicolon.
Check each substring s to comply:
-- Returns the sample statements that are acceptable:
--   * a lookup.msg_id call with a single key_val argument, or
--   * any other lookup.<name>_id call that mentions key_type and whose
--     argument list consists of key_<name> => '<value>' items only.
WITH input (s) AS (
    SELECT ' curry := lookup.curry_id(key_val => ''CHF'', key_type => ''asset_iso'');' FROM DUAL UNION ALL
    SELECT 'curry := lookup.curry_id(key_val => ''CHF'', key_type => ''asset_iso'');' FROM DUAL UNION ALL
    SELECT 'asset := lookup.asset_id(key_val => ''UBSN'');' FROM DUAL UNION ALL
    SELECT 'msg := lookup.msg_id(key_val => ''hello'');' FROM DUAL
)
SELECT s
FROM input
WHERE
    -- msg_id only needs key_val
    REGEXP_LIKE(s, '^\s*[a-z]+\s+:=\s+lookup\.msg_id\(key_val => ''[a-zA-Z0-9]+''\);$')
    OR (
        -- everything that is not a msg_id call ...
        NOT REGEXP_LIKE(s, '^\s*[a-z]+\s+:=\s+lookup\.msg_id')
        -- ... must pass key_type as an argument ...
        AND REGEXP_LIKE(s, '[(,]\s*key_type')
        -- ... and be a well-formed call with named key_* arguments.
        AND REGEXP_LIKE(s,
            '^\s*[a-z]+\s+:=\s+lookup\.[a-z]+_id\(( ?key_[a-z]+ => ''[a-zA-Z_]+?'',?)+\);$')
    )
;
| S |
|--------------------------------------------------------------------------|
|[tab] curry := lookup.curry_id(key_val => 'CHF', key_type => 'asset_iso');|
| curry := lookup.curry_id(key_val => 'CHF', key_type => 'asset_iso');|
| msg := lookup.msg_id(key_val => 'hello');|
This would tolerate a superfluous comma right before the closing parenthesis. But if the input is syntactically correct, such a comma won't exist.

Checking for a pure string using regexp_like

I need to check a "substring of the first 6 characters" of an input string for a pure string.
-- Tests the first six characters of p_str and prints which kind of string it is.
declare
p_str varchar2(30) := 'ABCD1240';
l_result varchar2(20); -- declared but never used below
begin
-- NOTE(review): the pattern has neither anchors nor a quantifier, so
-- REGEXP_LIKE is true as soon as ANY one of the six characters is alphabetic.
if REGEXP_LIKE(substr(p_str,1,6), '[[:alpha:]]') then
dbms_output.put_line('It is a pure string');
else
dbms_output.put_line('It is an alphanumeric');
end if;
end;
/
I can see that the first 6 characters of the string ABCD1290 is alphanumeric as it contains 12.
But, the output that is printed says otherwise.
Am I doing something wrong with the "alpha" in regexp_like ?
I thought alpha was supposed to be pure characters and not numbers.
Here, ABCD1290 should give me: alphanumeric as output.
ABCDXY90 should be : pure string
Try this:
-- For each sample value, reports whether its first six characters are all
-- letters ('alpha') or not ('numeric').
declare
  l_res varchar2(100);
begin
  for i in (select 'abcdef123' val from dual union
            select '123abc123' from dual union
            select '123456abc' from dual)
  loop
    -- [[:alpha:]]{6} accepts letters only. \D{6} merely excludes digits and
    -- would also accept spaces and punctuation, e.g. 'ab-cd ef'.
    if REGEXP_LIKE(i.val, '^[[:alpha:]]{6}')
    then
      l_res := 'alpha';
    else
      l_res := 'numeric';
    end if;
    dbms_output.put_line(i.val || ' is ' || l_res);
  end loop;
end;
123456abc is numeric
123abc123 is numeric
abcdef123 is alpha

Removing LEADING and TRAILING keywords from a String in PL/SQL

I need to remove certain keywords from an input string and return the new string. Keywords are stored in another table like MR, MRS, DR, PVT, PRIVATE, CO, COMPANY, LTD, LIMITED etc. They are two kind of keywords LEADING - MR, MRS, DR and TRAILING - PVT, PRIVATE, CO, COMPANY, LTD, LIMITED etc.
So if a keyword is LEADING, we have to remove it from the beginning, and if it is TRAILING, we have to remove it from the end. For example, MR Jones MRS COMPANY should return JONES MRS, and MR MRS Jones PVT COMPANY should return MRS JONES PVT (in the single pass MR and COMPANY are trimmed, so the name becomes MRS JONES PVT). Only the very first occurrence of a reserved keyword at the beginning and at the end of the input string should be removed: if there are multiple LEADING keywords at the beginning, only the first one is removed and the others are kept, as in the example above; the same applies to TRAILING keywords.
I have written the function below, and it is working fine but it is not efficient and I believe performance of this can be improved a lot(may be using regular expression). Below is the function:
CREATE OR REPLACE FUNCTION replace_keyword (p_in_name IN VARCHAR2)
   RETURN VARCHAR2
IS
   -- Strips at most one reserved keyword from the start and at most one from
   -- the end of a name.
   --
   -- p_in_name : raw name; it is upper-cased and trimmed before matching.
   -- Returns   : the upper-cased name without its first word when that word is
   --             an active LEADING keyword in RSRV_KEY_WORDS, and without its
   --             last word when that word is an active TRAILING keyword; runs
   --             of whitespace are collapsed to a single blank.
   -- Raises    : ORA-20001 wrapping any error raised during processing.
   --
   -- The keyword test is a single filtered lookup per position instead of a
   -- row-by-row scan of the whole keyword table.
   l_name        VARCHAR2 (4000);
   l_blank_pos   PLS_INTEGER;

   -- TRUE when p_word is an active keyword registered for p_position.
   FUNCTION is_reserved (p_word IN VARCHAR2, p_position IN VARCHAR2)
      RETURN BOOLEAN
   IS
      l_hits   PLS_INTEGER;
   BEGIN
      SELECT COUNT (*)
        INTO l_hits
        FROM rsrv_key_words
       WHERE active = 'Y'
         AND UPPER (position) = p_position
         AND key_word = p_word
         AND ROWNUM = 1;          -- one hit is enough

      RETURN l_hits > 0;
   END is_reserved;

   -- Collapses runs of whitespace to one blank and trims both ends.
   FUNCTION tidy (p_text IN VARCHAR2)
      RETURN VARCHAR2
   IS
   BEGIN
      RETURN TRIM (REGEXP_REPLACE (p_text, '[[:space:]]{2,}', ' '));
   END tidy;
BEGIN
   --Remove the leading and trailing blank spaces
   l_name := TRIM (UPPER (p_in_name));

   --remove LEADING keyword: only a first word that is followed by a blank counts
   l_blank_pos := INSTR (l_name, ' ');

   IF l_blank_pos > 0
      AND is_reserved (SUBSTR (l_name, 1, l_blank_pos - 1), 'LEADING')
   THEN
      l_name := SUBSTR (l_name, l_blank_pos + 1);
   END IF;

   l_name := tidy (l_name);

   --remove TRAILING keyword: the last word is everything after the last blank.
   --With no blank at all the whole name is the last word, and removing it
   --leaves NULL.
   l_blank_pos := INSTR (l_name, ' ', -1);

   IF is_reserved (SUBSTR (l_name, l_blank_pos + 1), 'TRAILING')
   THEN
      l_name := SUBSTR (l_name, 1, l_blank_pos - 1);
   END IF;

   l_name := tidy (l_name);

   RETURN l_name;
EXCEPTION
   WHEN OTHERS
   THEN
      raise_application_error (
         -20001,
         'An error was encountered - ' || SQLCODE || ' -ERROR- ' || SQLERRM);
END;
/
I can't really say whether this will be faster, but I would give it a try:
Assuming the keywords in RSRV_KEY_WORDS do not change very often, I would create a function that builds a regular expression from the table and have Oracle cache the result:
create or replace function get_lead_and_trail_regexp return varchar2
result_cache relies_on (RSRV_KEY_WORDS) is
  -- Builds one regular expression that matches
  --   * an active LEADING keyword at the very start of a name, together with
  --     the blanks around it, or
  --   * an active TRAILING keyword at the very end of a name, together with
  --     the blanks around it.
  -- Blanks before the leading keyword and after the trailing keyword are
  -- optional, so the pattern works on trimmed as well as on padded names.
  -- Returns NULL when there are no active keywords at all.
  -- NOTE(review): keywords are inserted into the pattern verbatim; this
  -- assumes they contain no regex metacharacters - verify against the data.
  l_leading  varchar2(4000);
  l_trailing varchar2(4000);
  l_pattern  varchar2(8100);
begin
  select ( select listagg(key_word, '|') within group (order by key_word)
             from RSRV_KEY_WORDS
            where ACTIVE = 'Y'
              and upper(POSITION) = 'LEADING' ),
         ( select listagg(key_word, '|') within group (order by key_word)
             from RSRV_KEY_WORDS
            where ACTIVE = 'Y'
              and upper(POSITION) = 'TRAILING' )
    into l_leading, l_trailing
    from dual;

  -- An empty keyword list would produce an empty group that matches bare
  -- blanks, so each branch is only added when its list is not empty.
  if l_leading is not null then
    l_pattern := '^[ ]*(' || l_leading || ')[ ]+';
  end if;

  if l_trailing is not null then
    if l_pattern is not null then
      l_pattern := l_pattern || '|';
    end if;
    l_pattern := l_pattern || '[ ]+(' || l_trailing || ')[ ]*$';
  end if;

  return l_pattern;
end get_lead_and_trail_regexp;
You can then use the regular expression to remove first leading and first trailing keywords in one stroke:
l_name := REGEXP_REPLACE (l_name, get_lead_and_trail_regexp , ' ');
and then carry on with removing any duplicate spaces.
I have tested the regular expression with java.lang.String.replaceAll as I do not currently have an Oracle database available, but I believe it will work with REGEXP_REPLACE too.

High Surrogate Detection in Oracle Regular Expression

I have a web service that pulls text from an NCLOB column and returns the data via XML. The NCLOB column is populated by extracting text from documents, so there are occasions where invalid XML characters are placed in the XML, causing the consuming system to fail.
As per the W3C, the range of valid characters is:
#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
/* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
We have tried a few different RegExp patterns, and we're close, but we're not completely there yet. Here is the closest we've come. All of the invalid characters are replaced except for the high surrogates (DB9B - DBFF).
REGEXP_REPLACE(
TEXT,
'[^[:print:]' || chr(13) || chr(10) || ']|[' || UNISTR('\FFFE-\FFFF') || ']',
'*')
We have also tried this, but none of the surrogates (D800 - DFFE) are replaced.
REGEXP_REPLACE(REPLACE(TEXT, unistr('\0000'), ' '),
'[' || unistr('\0001-\0008') || ']'
|| '|[' || unistr('\000B-\000C') || ']'
|| '|[' || unistr('\000E-\001F') || ']'
|| '|[' || unistr('\D800-\DFFF') || ']'
|| '|[' || unistr('\FFFE-\FFFF') || ']',' ')
How can we match the high surrogates? Any thoughts or guidance would be most appreciated.
You could write your own function, since REGEXP_REPLACE does not seem to work for the high surrogates. Here's an example (tested on 9.2 and 11.2):
CREATE OR REPLACE FUNCTION replace_invalid(p_clob NCLOB) RETURN NCLOB IS
-- Returns a copy of p_clob in which every character whose raw value, read as
-- a binary integer, lies between hex DB9B and hex DBFF is replaced by '*'.
-- All other characters are copied unchanged.
--
-- p_clob  : text to scan; NULL yields NULL.
-- NOTE(review): comparing the raw value with 16-bit bounds assumes a
-- two-byte national character set - confirm for the target database.
c_low    CONSTANT PLS_INTEGER := to_number('DB9B', 'xxxx');
c_high   CONSTANT PLS_INTEGER := to_number('DBFF', 'xxxx');
l_result NCLOB;
l_char   NVARCHAR2(1 char);
BEGIN
-- LENGTH(NULL) is NULL, and a NULL loop bound raises ORA-06502.
IF p_clob IS NULL THEN
RETURN NULL;
END IF;
FOR i IN 1 .. length(p_clob) LOOP
l_char := substr(p_clob, i, 1);
IF utl_raw.cast_to_binary_integer(utl_raw.cast_to_raw(l_char))
BETWEEN c_low AND c_high THEN
l_result := l_result || N'*';
ELSE
l_result := l_result || l_char;
END IF;
END LOOP;
RETURN l_result;
END;
It should run with large NCLOB, here's an example with a clob > 32k characters:
SQL> DECLARE
2 l_in NCLOB;
3 l_out NCLOB;
4 BEGIN
5 FOR i IN 1 .. to_number('DBFF', 'xxxx') LOOP
6 l_in := l_in || nchr(i);
7 END LOOP;
8 dbms_output.put_line('l_in length:' || length(l_in));
9 l_out := replace_invalid(l_in);
10 dbms_output.put_line('l_out length:' || length(l_out));
11 dbms_output.put_line('chars replaced:'
12 || (length(l_out) - length(REPLACE(l_out, '*', ''))));
13 END;
14 /
l_in length:56319
l_out length:56319
chars replaced:102
You can use series of calls TRANSLATE.
For example,
SELECT UNISTR('abc\00e5\00f1\00f6') Source FROM DUAL;
SELECT TRANSLATE(UNISTR('abc\00e5\00f1\00f6'), UNISTR('a\00e5\00f1'), 'a') Final FROM DUAL;
Source
------
abcåñö
Final
------
abcö
SQLFiddle