I'm trying to define a recursive function in SAS, as follows:
%macro f(n);
%if &n<=1 %then %put f(&n)=&n;
%else %put f(&n)=%eval(f(%eval(&n-1))+f(%eval(&n-2)));
%mend;
but it doesn't work when n>=2.
How can I solve this?
Recursive macros are highly atypical. You will have a much better recursive programming experience using PROC DS2 and implementing methods therein.
Regardless, you must understand that macro is not a normal functional programming language; it is a source code generator with side-effects. Your macro might not emit any source code and be programmed solely for side effects, or it might be written to emit source code according to a template and have no other side effects.
%macro fib(n);
  /* Emits, as generated text, the n-th Fibonacci number where fib(0)=0 and fib(1)=1. */
  /* A negative argument stops processing through %abort cancel.                      */
  %local prev1 prev2;
  %if &n < 0 %then %do;
    %abort cancel;
  %end;
  %if &n = 0 %then %do;
    /* recursion sentinel: emit source code 0 */
    0
  %end;
  %else %if &n = 1 %then %do;
    /* recursion sentinel: emit source code 1 */
    1
  %end;
  %else %do;
    /* emit the %eval sum of the text generated by the two recursive invocations */
    %let prev1 = %eval(&n-1);
    %let prev2 = %eval(&n-2);
    %eval(%fib(&prev1) + %fib(&prev2))
  %end;
%mend;
/* Each %put writes the text generated by %fib to the log; the expected value is noted per call. */
%put %fib(0); /* 0 */
%put %fib(1); /* 1 */
%put %fib(2); /* 1 */
%put %fib(3); /* 2 */
%put %fib(4); /* 3 */
%put %fib(5); /* 5 */
%put %fib(6); /* 8 */
The net result is emitted by the macro facility (sub-system).
The code generated for %fib(6) would be:
/* Schematic expansion of %put %fib(6). Each comment gives the n of the %fib(n) call that produced the annotated text. */
%put
%eval ( /* 6 */
%eval ( /* 5 */
%eval ( /* 4 */
%eval ( /* 3 */
%eval ( /* 2 */
1 /* 1 */
+
0 /* 0 */
)
+
%eval ( /* 1 */
1
)
)
+
%eval ( /* 2 */
1 /* 1 */
+
0 /* 0 */
)
)
+
%eval ( /* 3 */
%eval ( /* 2 */
1 /* 1 */
+
0 /* 0 */
)
+
%eval ( /* 1 */
1
)
)
)
+
%eval ( /* 4 */
%eval ( /* 3 */
%eval ( /* 2 */
1 /* 1 */
+
0 /* 0 */
)
+
%eval ( /* 1 */
1
)
)
+
%eval ( /* 2 */
1 /* 1 */
+
0 /* 0 */
)
)
)
;
%macro f(n);
  /* Emits f(n) as generated text, where f(n) = n for n <= 1 and           */
  /* f(n) = f(n-1) + f(n-2) otherwise (so f(0)=0, f(1)=1, f(2)=1, f(3)=2). */
  /* The base case emits &n itself so that f(0) is 0 rather than 1; the    */
  /* results for n >= 1 are the same as with a base case of 1 for n <= 2.  */
  /* n must be an integer: the implicit %eval does integer arithmetic.     */
  %if &n<=1 %then &n;
  %else %eval(%f(%eval(&n-1))+%f(%eval(&n-2)));
%mend;
Related
I wrote the following SAS macro program code.
The purpose of the code is to repeat PROCREG while the condition(&maxVIFval > 1000) is satisfied.
However the %DO %WHILE loop is executed continually even though &maxVIFval has a value of 1000 or less.
I wonder what the reason is.
I'm looking for help..
Forgive my poor English.
%MACRO RegRepeat ;
%global maxVIFvar ; /* maxVIFvar: global symbol table */
%global maxVIFval ; /* maxVIFval: global symbol table */
%global j ; /* j : global symbol table */
%let maxVIFval = 10000 ;
%let j = 1 ;
%do %WHILE( &maxVIFval > 1000 ) ;
/* Regression procedure below */
PROC REG DATA=KNHANES.HNAP1319 ;
TITLE ;
TITLE "MAX VIF: &maxVIFval / Repeated Number: &j" ;
WEIGHT wt_pft_pool ;
WHERE HE_COPD = 3 ;
MODEL HE_fev1_new = SEX_new AGE OBE INCM_LH DJ4_dg_new sm1_former sm1_current at_risk_occp
&airPollution / lackfit ADJRSQ VIF ;
ods output ParameterEstimates = PE ;
RUN ;
/* SQL procedure below */
PROC SQL ;
CREATE TABLE WORK.maxVIF AS
SELECT Variable, max(VarianceInflation) AS maxVIF
FROM WORK.PE
HAVING VarianceInflation = MAX(VarianceInflation) ;
QUIT ;
/* DATA Step below */
DATA WORK.maxVIFtoMV ;
SET WORK.maxVIF ;
CALL SYMPUT("maxVIFvar", VARIABLE);
CALL SYMPUT("maxVIFval", maxVIF);
RUN ;
%let airPollution = %sysfunc(tranwrd(&airPollution, &maxVIFvar, )) ;
%let j = %eval(&j+1) ;
%PUT MAX VIF: &maxVIFval / Repeated Number: &j ;
%end ;
%MEND ;
%RegRepeat
These are sentences from the log window.
MPRINT(REGREPEAT): TITLE "MAX VIF: 10000 / Repeated Number: 1" ;
MPRINT(REGREPEAT): TITLE "MAX VIF: 1588.907368 / Repeated Number: 2" ;
MPRINT(REGREPEAT): TITLE "MAX VIF: 1297.8441436 / Repeated Number: 3" ;
MPRINT(REGREPEAT): TITLE "MAX VIF: 696.59402283 / Repeated Number: 4" ;
MPRINT(REGREPEAT): TITLE "MAX VIF: 569.83744288 / Repeated Number: 5" ;
Please let me know if you need any additional information.
Thank you for your kindness.
To fix this issue, use %sysevalf(&maxVIFval > 1000) instead of &maxVIFval > 1000.
When SAS compares two numbers in macro, it has different rules when numbers contain decimal point or not. When there is a decimal point, SAS treats numbers in macro as characters:
/* The left operand contains a decimal point, so the implicit %EVAL compares both operands as text. */
/* "6" sorts after "1", the condition is true, and 0 is written to the log.                         */
%if 696.59402283>1000 %then %do;
%put 0;
%end;
%else %do;
%put 1;
%end;
0
SAS compares characters char by char from left to right when comparing characters, "6" is larger than "1" in ascii code sequence, so we have 696.59402283>1000.
When there are no decimal points, SAS treats numbers in macro as numbers:
/* Both operands are integers, so the implicit %EVAL compares them numerically. */
/* 696 > 1000 is false and 1 is written to the log.                             */
%if 696>1000 %then %do;
%put 0;
%end;
%else %do;
%put 1;
%end;
1
The comparison is based on mathematical rules.
I suggest you always use the %sysevalf() function if you want to do numeric comparison in macro code, unless you are 100% sure there are no decimal points.
I'm trying to define a macro function that returns unique list items from a space separated list. This macro itself uses other macros that I tested and seem to work fine by themselves (see examples below), it's all very simple code.
However for some reason the code runs indefinitely and I don't know how to debug it properly. I usually debug using %put statements but they don't print here as there's no error and I stop the code manually.
Here is the main macro, followed by my other convenience macros that I use, you can execute the whole batch to load the macros and then check out given examples.
*---------------------------------------------------------------;
* LIST_UNIQUE ;
* Return only unique items from list, ;
* in order of first appearance ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_unique(); ** (nothing)
%put %list_unique(a); ** a
%put %list_unique(a a); ** doesn't work (should be a)
%put %list_unique(a b); ** doesn't work (should be a b)
*/
%macro list_unique(data);
%local out curr_item;
%do i=1 %to %list_length(&data);
%let curr_item = %extract(&data,&i);
%if not %list_in(&curr_item,&out) %then %let out = &out &curr_item;
%end;
&out
%mend;
*---------------------------------------------------------------;
* LIST_LENGTH ;
* Length of space separated list ;
*---------------------------------------------------------------;
/* EXAMPLES :
%put %list_length(); ** 0
%put %list_length(item1 item2 item3); ** 3
*/
%macro list_length(data);
%sysfunc(countw(&data,%str( )))
%mend;
*---------------------------------------------------------------;
* LIST_IN ;
* check if item is in list ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_in(,a); ** 0
%put %list_in(a,); ** 0
%put %list_in(a,a); ** 1
%put %list_in(a,a a); ** 1
%put %list_in(b,a b c d); ** 1
%put %list_in(e,a b c d); ** 0
*/
%macro list_in
(item /* item to search in list */
,list /* space separated list to quote */
);
/* exception when list has null length */
%if not %length(&list) %then 0%return;
/* general case */
%do i=1 %to %list_length(&list);
%if %extract_pos(&list,&i) = &item %then 1%return;
%end;
0
%mend;
*-------------------------------------------------------------------------------;
* EXTRACT_POS ;
* Extracts subset of values from space separated list ;
*-------------------------------------------------------------------------------;
/* EXAMPLES
%put %extract_pos(,1); ** (nothing)
%put %extract_pos(a b c d,); ** (nothing)
%put %extract_pos(a b c d,1); ** a
%put %extract_pos(a b c d,2 1:3 1); ** b a b c a
*/
%macro extract_pos
(data
,ind
);
%local i j token output new_ind;
%do i=1 %to %sysfunc(countw(&ind,%str( )));
%let token = %scan(&ind,&i,%str( ));
%if %index(&token,:) %then %do; /* if token with ':' */
%do j=%scan(&token,1,:) %to %scan(&token,2,:);
%let output = &output %scan(&data,&j,%str( ));
%end;
%end;
%else %do; /* if simple token */
%let output = &output %scan(&data,&token,%str( ));
%end;
%end;
&output
%mend;
You cannot protect macros you call from modifying your macro variables, but if the macros are designed properly they will NOT. Unless you are INTENDING to modify any existing macro variable you need to define your macro variables as local. One or more of your macros were using the macro variable I without defining it as local. So if there already existed a macro variable named I then the macro modified the existing variable's value.
Also one of your macros was calling %extract() instead of %extract_pos().
I also simplified your %list_in() macro to just be a call to an existing SAS function, like your %list_length() macro.
%macro list_unique
/*---------------------------------------------------------------
Emit the distinct items of a space delimited list. An item is
appended to the result the first time it is seen, so the result
keeps the order of first appearance.
Calls the %list_length, %extract_pos and %list_in macros.
---------------------------------------------------------------*/
(data /* Space delimited list of items */
);
  %local i curr_item out n_items seen;
  %let n_items = %list_length(&data);
  %let i = 1;
  %do %while (&i <= &n_items);
    %let curr_item = %extract_pos(&data,&i);
    /* seen is the 0/1 text emitted by %list_in for the result built so far */
    %let seen = %list_in(&curr_item,&out);
    %if not &seen %then %do;
      %let out=&out &curr_item;
    %end;
    %let i = %eval(&i + 1);
  %end;
  &out
%mend list_unique;
%macro list_length(data);
  /* Emit the number of words in &data as counted by COUNTW with a blank delimiter. */
  %local n_words;
  %let n_words = %sysfunc(countw(&data,%str( )));
  &n_words
%mend list_length;
%macro list_in
/*---------------------------------------------------------------
Emit 1 when INDEXW finds item as a blank delimited word in list,
otherwise emit 0.
---------------------------------------------------------------*/
(item /* item to find in list */
,list /* space separated list to search */
);
  %local word_pos;
  /* INDEXW gives the position of the word, or 0 when it is not found */
  %let word_pos = %sysfunc(indexw(&list,&item,%str( )));
  %sysevalf(&word_pos,boolean)
%mend list_in;
%macro extract_pos
/*-------------------------------------------------------------------------------
Emit the words of data found at the requested positions, in the order the
positions are listed. A position token is either a single position or a range
written as start:end.
-------------------------------------------------------------------------------*/
(data /* Space delimited list of values */
,ind /* Space delimited list of positions or position ranges */
);
  %local i j token output n_tokens first last;
  %let n_tokens = %sysfunc(countw(&ind,%str( )));
  %do i=1 %to &n_tokens;
    %let token = %scan(&ind,&i,%str( ));
    /* For start:end the first and last colon-delimited pieces are the bounds; */
    /* for a single position both pieces are that same position.               */
    %let first = %scan(&token,1,:);
    %let last = %scan(&token,-1,:);
    %do j=&first %to &last;
      %let output = &output %scan(&data,&j,%str( ));
    %end;
  %end;
  &output
%mend extract_pos;
Test
831 %put %list_unique(); %** (nothing);
832 %put %list_unique(a); %** a ;
a
833 %put %list_unique(a a); %** doesnot work (should be a);
a
834 %put %list_unique(a b); %** doesnot work (should be a b);
a b
Enable Macro Debugging by adding this line to the beginning of your code, which will resolve the macro code and variables:
Options macrogen symbolgen mlogic mprint mfile;
Run your code and check your log for details,
When done Disable Macro Debugging by replacing the options in step 1 with the options below:
Options nomacrogen NoSymbolgen nomlogic nomprint nomfile;
For more details you can check the SAS Debugging documentation
http://support.sas.com/documentation/cdl/en/mcrolref/61885/HTML/default/viewer.htm#a001066200.htm
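The macro variable I is shared with the macros you call: a called macro that does not declare I as %local reuses the caller's I. An easy fix is to use a different loop variable in each macro (or to declare it %local in each macro).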
Found some documentation on SAS logic of sharing. SAS Blogs
Note: code edited after remarks from #user667489 but issues remain.
I built a fairly simple macro to return the intersection of 2 space separated lists as a new space separated list but for some reason the definition of the macro returns errors.
The macro loops through both lists and keep an element if a match is found (very straightforward, no handling of duplicates or optimization).
I cannot make sense of the log, which shows a combination of following error messages:
ERROR: Macro keyword LET appears as text.
ERROR: Macro keyword MACRO appears as text.
%macro list_intersection
(list1= /* space separated list, or unique term */
,list2= /* space separated list, or unique term */
);
%local output;
%local i;
%local j;
%let i = 1;
%let j = 1;
%do %while (%length(%scan(&list1,&i)));
%do %while (%length(%scan(&list2,&j)));
%if (%scan(&list1,&i) = %scan(&list2,&j)) %then
%let output = &output %scan(&list1,&i);
%let j = %eval(&j+1);
%end;
%let i = %eval(&i+1);
%end;
&output
%mend;
Can you help me pinpoint the issue?
I'm also open to a more efficient/robust/simple way of achieving the same output.
Reproducible log
Using SAS 9.3, I put above code in a separate program for it not to be polluted, save project, close and reopen. Open program, click run button, and here is the complete log:
1 The SAS System 09:50 Monday, January 22, 2018
1 ;*';*";*/;quit;run;
2 OPTIONS PAGENO=MIN;
3 %LET _CLIENTTASKLABEL='Program3';
ERROR: Macro keyword LET appears as text.
4 %LET _CLIENTPROJECTPATH='F:\CI\Projects\Wealth Indicators\106 Explore DIM_PHYSICALPERSON\SAS\Rework_table_auto2.egp';
ERROR: Macro keyword LET appears as text.
5 %LET _CLIENTPROJECTNAME='Rework_table_auto2.egp';
ERROR: Macro keyword LET appears as text.
6 %LET _SASPROGRAMFILE=;
ERROR: Macro keyword LET appears as text.
7
8 ODS _ALL_ CLOSE;
9 OPTIONS DEV=ACTIVEX;
10 GOPTIONS XPIXELS=0 YPIXELS=0;
11 FILENAME EGSR TEMP;
12 ODS tagsets.sasreport13(ID=EGSR) FILE=EGSR STYLE=HtmlBlue
12 ! STYLESHEET=(URL="file:///C:/Program%20Files/SASHome/x86/SASEnterpriseGuide/5.1/Styles/HtmlBlue.css") NOGTITLE NOGFOOTNOTE
12 ! GPATH=&sasworklocation ENCODING=UTF8 options(rolap="on");
13
14 GOPTIONS ACCESSIBLE;
15 %macro list_intersection
ERROR: Macro keyword MACRO appears as text.
16 (list1= /* space separated list, or unique term */
17 ,list2= /* space separated list, or unique term */
18 );
19 %local output;
ERROR: Macro keyword LOCAL appears as text.
20 %local i;
ERROR: Macro keyword LOCAL appears as text.
21 %local j;
ERROR: Macro keyword LOCAL appears as text.
22 %let i = 1;
ERROR: Macro keyword LET appears as text.
23 %let j = 1;
ERROR: Macro keyword LET appears as text.
24 %do %while (%length(%scan(&list1,&i)));
ERROR: Macro keyword DO appears as text.
25 %do %while (%length(%scan(&list2,&j)));
ERROR: Macro keyword DO appears as text.
26 %if (%scan(&list1,&i) = %scan(&list2,&j)) %then
ERROR: Macro keyword IF appears as text.
27 %let output = &output %scan(&list1,&i);
28 %let j = %eval(&j+1);
ERROR: Macro keyword LET appears as text.
29 %end;
ERROR: Macro keyword END appears as text.
30 %let i = %eval(&i+1);
ERROR: Macro keyword LET appears as text.
31 %end;
ERROR: Macro keyword END appears as text.
32 &output
33 %mend;
ERROR: Macro keyword MEND appears as text.
34
35 GOPTIONS NOACCESSIBLE;
36 %LET _CLIENTTASKLABEL=;
ERROR: Macro keyword LET appears as text.
37 %LET _CLIENTPROJECTPATH=;
2 The SAS System 09:50 Monday, January 22, 2018
ERROR: Macro keyword LET appears as text.
38 %LET _CLIENTPROJECTNAME=;
ERROR: Macro keyword LET appears as text.
39 %LET _SASPROGRAMFILE=;
ERROR: Macro keyword LET appears as text.
40
41 ;*';*";*/;quit;run;
42 ODS _ALL_ CLOSE;
43
44
45 QUIT; RUN;
46
Initial code before edit:
%macro list_intersection
(list1= /* space separated list, or unique term */
,list2= /* space separated list, or unique term */
);
%local output =;
%local i = 1;
%local j = 1;
%do %while (%length(%scan(&list1,&i)));
%do %while (%length(%scan(&list2,&j)));
%if (%scan(&list1,&i) = %scan(&list2,&j) %then
%local output = &output %scan(&list1,&i);
%let j = %eval(&j+1);
%end;
%let i = %eval(&i+1);
%end;
&output
%mend;
A few things immediately stand out:
You cannot use %local to set a value for a macro variable. Instead of %local i=1; you must write two separate statements: %local i; %let i = 1;. %local initialises macro variables to an empty string.
You have unbalanced brackets in your %if statement.
Try moving %let j = %eval(&j+1); into the outer %do %while loop.
Also, you probably want to make sure that %scan only uses space as a delimiter - it defaults to space plus . < ( + & ! $ * ) ; ^ - / , % |
Here's a working version:
%macro list_intersection
(list1= /* space separated list, or unique term */
,list2= /* space separated list, or unique term */
);
  /* Emit every word of list1 once for each equal word found in list2, */
  /* scanning both lists with a blank as the only delimiter.           */
  %local output i j word1 word2;
  %let i = 1;
  %let word1 = %scan(&list1,&i,%str( ));
  %do %while (%length(&word1));
    /* restart the scan of list2 for each word of list1 */
    %let j = 1;
    %let word2 = %scan(&list2,&j,%str( ));
    %do %while (%length(&word2));
      %if &word1 = &word2 %then
        %let output = &output &word1;
      %let j = %eval(&j+1);
      %let word2 = %scan(&list2,&j,%str( ));
    %end;
    %let i = %eval(&i+1);
    %let word1 = %scan(&list1,&i,%str( ));
  %end;
  &output
%mend;
%put %list_intersection(list1=1 2 3,list2=2 3 4);
You can use SAS functions to make that much easier. Use the COUNTW() function to find the upper bound for the %DO loop. Use the FINDW() function to test if word is found in the other list.
%macro list_intersection
(list1 /* space separated list of terms */
,list2 /* space separated list of terms */
);
  /* Emit the words of list1, in order, for which FINDW locates the word in list2. */
  %local output i next n_terms;
  %let n_terms = %sysfunc(countw(&list1,%str( )));
  %let i = 1;
  %do %while (&i <= &n_terms);
    %let next=%scan(&list1,&i,%str( ));
    %if %sysfunc(findw(&list2,&next,,s)) %then %do;
      %let output=&output &next ;
    %end;
    %let i = %eval(&i + 1);
  %end;
  &output
%mend;
You could include the i modifier in the findw() call to make it case insensitive. You could also test if the word is already in the output string to eliminate duplicates.
%macro list_intersection_nodups
(list1 /* space separated list of terms */
,list2 /* space separated list of terms */
);
  /* Emit the words of list1 that FINDW locates in list2, skipping any word   */
  /* already present in the result. Both lookups use the i modifier, so they  */
  /* ignore case.                                                             */
  %local output i next in_list2 in_output;
  %do i=1 %to %sysfunc(countw(&list1,%str( ))) ;
    %let next=%scan(&list1,&i,%str( ));
    %let in_list2 = %sysfunc(findw(&list2,&next,,si));
    %let in_output = %sysfunc(findw(&output,&next,,si));
    %if &in_list2 and not &in_output %then %do;
      %let output=&output &next ;
    %end;
  %end;
  &output
%mend;
Example:
274 %put %list_intersection_nodups(A B a C,a c d);
A C
I've been reading up on how to DROP variables from my dataset that have null values in every observation - it seems the best way to do this is using the %DROPMISS macro function - however I'm getting an error msg - below is the code I'm trying and the error msg
Code
%DROPMISS (DSIN=dataset1, DSOUT=dataset2);
Log
4665 %DROPMISS (DSIN=dataset1, DSOUT=dataset2);
-
180
WARNING: Apparent invocation of macro DROPMISS not resolved.
ERROR 180-322: Statement is not valid or it is used out of proper order.
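You need to define the dropmiss macro before you use it. Note also that the macro below names its dataset parameters DSNIN and DSNOUT (not DSIN and DSOUT), so once it is defined, call it as %DROPMISS(dataset1, dataset2).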
You can find it here in the appendix (page3)
http://support.sas.com/resources/papers/proceedings10/048-2010.pdf
Or better formatted here:
/******************/
options nomprint noSYMBOLGEN MLOGIC;
/****************************/
/* DROPMISS: copy &DSNIN to &DSNOUT, dropping the variables whose MAX() over */
/* all rows of &DSNIN is missing (numeric) or blank (character). Variables   */
/* listed in NODROP= are excluded from the check via a DROP= dataset option  */
/* on the PROC CONTENTS input.                                               */
%macro DROPMISS( DSNIN /* name of input SAS dataset
*/
, DSNOUT /* name of output SAS dataset
*/
, NODROP= /* [optional] variables to be omitted from dropping even if
they have only missing values */
) ;
/* PURPOSE: To find both Character and Numeric the variables that have only
missing values and drop them if
* they are not in &NODROP
*
* NOTE: if there are no variables in the dataset, produce no variables
processing code
*
*
* EXAMPLE OF USE:
* %DROPMISS( DSNIN, DSNOUT )
* %DROPMISS( DSNIN, DSNOUT, NODROP=A B C D--H X1-X100 )
* %DROPMISS( DSNIN, DSNOUT, NODROP=_numeric_ )
* %DROPMISS( DSNIN, DSNOUT, NOdrop=_character_ )
*/
%local I ;
/* NOTE(review): only I is declared %local. N_CHAR, N_NUM, N_CHAR_1, DROP_NUM and DROP_CHAR are assigned with %let below without a %local or %global declaration. */
%if "&DSNIN" = "&DSNOUT"
%then %do ;
%put /------------------------------------------------\ ;
%put | ERROR from DROPMISS: | ;
%put | Input Dataset has same name as Output Dataset. | ;
%put | Execution terminating forthwith. | ;
%put \------------------------------------------------/ ;
%goto L9999 ;
%end ;
/*###################################################################*/
/* begin executable code
/*####################################################################/
/*===================================================================*/
/* Create dataset of variable names that have only missing values
/* exclude from the computation all names in &NODROP
/*===================================================================*/
proc contents data=&DSNIN( drop=&NODROP ) memtype=data noprint out=_cntnts_( keep=
name type ) ; run ;
%let N_CHAR = 0 ;
%let N_NUM = 0 ;
data _null_ ;
set _cntnts_ end=lastobs nobs=nobs ;
if nobs = 0 then stop ;
n_char + ( type = 2 ) ;
n_num + ( type = 1 ) ;
/* create macro vars containing final # of char, numeric variables */
if lastobs
then do ;
call symput( 'N_CHAR', left( put( n_char, 5. ))) ;
call symput( 'N_NUM' , left( put( n_num , 5. ))) ;
end ;
run ;
/*===================================================================*/
/* if there are no variables in dataset, stop further processing
/*===================================================================*/
%if %eval( &N_NUM + &N_CHAR ) = 0
%then %do ;
%put /----------------------------------\ ;
%put | ERROR from DROPMISS: | ;
%put | No variables in dataset. | ;
%put | Execution terminating forthwith. | ;
%put \----------------------------------/ ;
%goto L9999 ;
%end ;
/*===================================================================*/
/* put global macro names into global symbol table for later retrieval
/*===================================================================*/
%LET NUM0 =0;
%LET CHAR0 = 0;
%IF &N_NUM >0 %THEN %DO;
%do I = 1 %to &N_NUM ;
%global NUM&I ;
%end ;
%END;
%if &N_CHAR > 0 %THEN %DO;
%do I = 1 %to &N_CHAR ;
%global CHAR&I ;
%end ;
%END;
/*===================================================================*/
/* create macro vars containing variable names
/* efficiency note: could compute n_char, n_num here, but must declare macro names
to be
global b4 stuffing them
/*
/*===================================================================*/
proc sql noprint ;
%if &N_CHAR > 0 %then %str( select name into :CHAR1 - :CHAR&N_CHAR from
_cntnts_ where type = 2 ; ) ;
%if &N_NUM > 0 %then %str( select name into :NUM1 - :NUM&N_NUM from
_cntnts_ where type = 1 ; ) ;
quit ;
/*===================================================================*/
/* Determine the variables that are missing
/*
/*===================================================================*/
%IF &N_CHAR > 1 %THEN %DO;
%let N_CHAR_1 = %EVAL(&N_CHAR - 1);
%END;
/* The select list always ends with MAX(&&CHAR&N_CHAR) into :CHARMAX&N_CHAR. When N_CHAR is 0 this */
/* resolves through CHAR0, which was set to 0 above, so the generated term is MAX(0) into :CHARMAX0. */
Proc sql ;
select %do I= 1 %to &N_NUM; max (&&NUM&I) , %end; %IF &N_CHAR > 1 %THEN %DO;
%do I= 1 %to &N_CHAR_1; max(&&CHAR&I), %END; %end; MAX(&&CHAR&N_CHAR)
into
%do I= 1 %to &N_NUM; :NUMMAX&I , %END; %IF &N_CHAR > 1 %THEN %DO;
%do I= 1 %to &N_CHAR_1; :CHARMAX&I,%END; %END; :CHARMAX&N_CHAR
from &DSNIN;
quit;
/*===================================================================*/
/* initialize DROP_NUM, DROP_CHAR global macro vars
/*===================================================================*/
%let DROP_NUM = ;
%let DROP_CHAR = ;
/* The two DATA _NULL_ steps below contain only macro statements: the loops build DROP_NUM and */
/* DROP_CHAR at macro execution time and generate no DATA step statements between DATA and RUN. */
%if &N_NUM > 0 %THEN %DO;
DATA _NULL_;
%do I = 1 %to &N_NUM ;
%IF &&NUMMAX&I =. %THEN %DO;
%let DROP_NUM = &DROP_NUM %qtrim( &&NUM&I ) ;
%END;
%end ;
RUN;
%END;
%IF &N_CHAR > 0 %THEN %DO;
DATA _NULL_;
%do I = 1 %to &N_CHAR ;
%IF "%qtrim(&&CHARMAX&I)" eq "" %THEN %DO;
%let DROP_CHAR = &DROP_CHAR %qtrim( &&CHAR&I ) ;
%END;
%end ;
RUN;
%END;
/*===================================================================*/
/* Create output dataset
/*===================================================================*/
data &DSNOUT ;
%if &DROP_CHAR ^= %then %str(DROP &DROP_CHAR ; ) ; /* drop char variables
that
have only missing values */
%if &DROP_NUM ^= %then %str(DROP &DROP_NUM ; ) ; /* drop num variables
that
have only missing values */
set &DSNIN ;
%if &DROP_CHAR ^= or &DROP_NUM ^= %then %do;
%put /----------------------------------\ ;
%put | Variables dropped are &DROP_CHAR &DROP_NUM | ;
%put \----------------------------------/ ;
%end;
%if &DROP_CHAR = and &DROP_NUM = %then %do;
%put /----------------------------------\ ;
%put | No variables are dropped |;
%put \----------------------------------/ ;
%end;
run ;
%L9999:
%mend DROPMISS ;
I have looked everywhere online for good parsing code, but all the examples are very trivial. The following Perl regular expression works fine, but only for 5 bytes.
rx1=prxparse("s/<.*?>//");
My table contains a text field with strings something like this:
<meta name="generator" content="HTML Tidy, see www.w3.org" /> Test
<table style="WIDTH: 360.0pt;BORDER-COLLAPSE: collapse;" border="0"
cellspacing="0" cellpadding="0" width="480"> <tr style="HEIGHT: 15.0pt;">
<td style="BORDER-BOTTOM: rgb(236,233,216);BORDER-LEFT: rgb(236,233,216);
BACKGROUND-COLOR: transparent;WIDTH: 360.0pt;HEIGHT: 15.0pt; " width="480">
So it contains <table> <tr <td . . . and other complex html structures. How to parse this kind of html into plain text ?
What #Joe says in the comments is true but unfortunately it doesn't negate the fact that people often need to solve this kind of problem. Below are some macros that I use when I need to extract certain values out of XML/HTML. It's not perfect but it's gotten the job done for everything I've needed.
The major limitation of the below macros is that they require the HTML/XML they are parsing to exist in a single field in SAS. The size limitation of a single field in SAS is 32767 chars, which means that if your HTML file is bigger than that then you will need to take just the subset of it that you need to work with.
Examples are included and the best way to figure out how it works is just to run the examples.
/*****************************************************************************
** PROGRAM: PRXEXTRACT.SAS
**
** SEARCHES THROUGH AN XML (OR HTML) FILE FOR AN ELEMENT AND EXTRACTS THE
** VALUE BETWEEN AN ELEMENTS TAGS.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iField : The fieldname to save the result to.
** iType : (N or C) for Numeric or Character.
** iLength : The length of the field to create.
** iXMLField : The name of the field that contains the XML blob to parse.
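** iDelimiterType: (1/2/3). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
** 3 USES ENCODED VALUES FOR <>.
** iSequence : Which match of the element to extract. Defaults to 1.
** iAttributesField: (Optional) The fieldname to save the attribute text of the matched element to.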
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 14-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 16-FEB-2011 BY:RP
** - ADDED OPTION TO CHANGE DELIMITERS FROM <> TO []
** 1.1 MODIFIED: 17-FEB-2011 BY:RP
** - CORRECTED ERROR WHEN MATCH RETURNS A LENGTH OF ZERO
** - CORRECTED MISSING AMPERSAND FROM IDELIMITERTYPE CHECK.
** - ADDED ESCAPING QUOTES TO [] DELIMITER TYPE
** - CORRECTED WARNING WHEN MATCH RETURNS MISSING NUMERIC FIELD
** 1.2 MODIFIED: 25-FEB-2011 BY:RP
** - ADDED DELIMITER TYPES TO WORK WITH MASKED HTML CODES
** 1.3 MODIFIED: 11-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
** 1.4 MODIFIED: 14-MAR-2011 BY:RP
** - CORRECTED TO REMOVE FALSE MATCHES FROM PRIOR VERSION. ADDED EXAMPLE.
** 1.5 MODIFIED: 10-APR-2012 BY:RP
** - CORRECTED PROBLEM WITH ZERO LENGTH STRING MATCHES
** 1.6 MODIFIED: 22-MAY-2012 BY:RP
** - ADDED ABILITY TO CAPTURE ATTRIBUTES
*****************************************************************************/
/* Generates DATA step statements (call it inside a DATA step) that search &iXMLField for the */
/* &iSequence-th occurrence of element &iElement and assign the text between its tags to      */
/* &iField. When iAttributesField is given, the attribute text of that element is saved too.  */
%macro prxExtract(iElement=, iField=, iType=, iLength=, iXMLField=, iDelimiterType=1, iSequence=1, iAttributesField=);
%local delim_open delim_close;
/* Remove line feed (10) and carriage return (13) characters from the field before matching */
crLf = byte(10) || byte(13);
&iXMLField = compress(&iXMLField,crLf,);
%if &iDelimiterType eq 1 %then %do;
%let delim_open = <;
%let delim_close = >;
%end;
%else %if &iDelimiterType eq 2 %then %do;
%let delim_open = \[;
%let delim_close = \];
%end;
%else %if &iDelimiterType eq 3 %then %do;
%let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
%let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
%end;
%else %do;
%put ERR%str()ROR (prxExtract.sas): You specified an incorrect option for the iDelimiterType parameter.;
%end;
/* NOTE(review): after the message above the macro keeps generating code with empty delimiters. */
%if %sysfunc(index(&iField,[)) %then %do;
/* DONT DO THIS IF ITS AN ARRAY */
%end;
%else %do;
%if "%upcase(&iType)" eq "N" %then %do;
attrib &iField length=&iLength format=best.;
%end;
%else %do;
attrib &iField length=$&iLength format=$&iLength..;
%end;
%end;
/*
** BREAKDOWN OF REGULAR EXPRESSION (EXAMPLE USES < AND > AS DELIMS AND ANI AS THE ELEMENT BEING LOOKED FOR):
**
** &delim_open&iElement --> FINDS <ANI
** ((\s+.*?)&delim_close|&delim_close){1}? --> CAPTURE GROUP 1. FINDS THE SHORTEST SINGLE INSTANCE OF EITHER:
** - ONE OR MORE WHITESPACE CHARACTERS FOLLOWED BY ANYTHING UNTIL A > CHARACTER.
** THE TEXT BEFORE THE > IS CAPTURE GROUP 2 (READ BELOW AS THE ATTRIBUTES)
** - OR JUST A > CHARACTER
** (.*?) --> CAPTURE GROUP 3. FINDS WHAT WE ARE SEARCHING FOR (READ BELOW WITH PRXPOSN BUFFER 3).
** &delim_open --> FINDS <
** \/ --> FINDS THE / CHARACTER. THE FIRST SLASH ESCAPES IT SO IT KNOWS ITS NOT A SPECIAL REGEX SLASH
** &iElement&delim_close --> FINDS ANI>
*/
prx_id = prxparse("/&delim_open&iElement((\s+.*?)&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
prx_start = 1;
prx_stop = length(&iXMLField);
prx_sequence = 0;
/* Walk through the matches one at a time, counting them in prx_sequence until the requested one is reached */
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
do while (prx_pos > 0);
prx_sequence = prx_sequence + 1;
if prx_sequence = &iSequence then do;
if prx_length > 0 then do;
/* Capture buffer 3 is the value between the opening and closing tags */
call prxposn(prx_id, 3, prx_pos, prx_length);
%if "%upcase(&iType)" eq "N" %then %do;
length prx_tmp_n $200;
prx_tmp_n = substr(&iXMLField, prx_pos, prx_length);
if cats(prx_tmp_n) ne "" then do;
&iField = input(substr(&iXMLField, prx_pos, prx_length), ?best.);
end;
%end;
%else %do;
if prx_length ne 0 then do;
&iField = substr(&iXMLField, prx_pos, prx_length);
end;
else do;
&iField = "";
end;
%end;
**
** ALSO SAVE THE ATTRIBUTES TO A FIELD IF REQUESTED
*;
%if "%upcase(&iAttributesField)" ne "" %then %do;
/* Capture buffer 2 is the whitespace and attribute text that follows the element name */
call prxposn(prx_id, 2, prx_pos, prx_length);
if prx_length ne 0 then do;
&iAttributesField = substr(&iXMLField, prx_pos, prx_length);
end;
else do;
&iAttributesField = "";
end;
%end;
end;
end;
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
end;
/* Keep the helper variables out of the output dataset (every variable whose name starts with prx) */
drop crLf prx:;
%mend;
Example for a single element:
data example;
xml = "<test><ANI2Digits>00</ANI2Digits><XNI xniattrib=1>7606256091</XNI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>"; * NOTE THE XML MUST BE ALL ON ONE LINE;
%prxExtract(iElement=xni, iField=my_xni, iType=c, iLength=15, iXMLField=xml, iSequence=1, iAttributesField=my_xni_attribs);
run;
Example for repeating elements:
data example;
xml = "<test><ANI2Digits>00</ANI2Digits><ANI>7606256091</ANI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>"; * NOTE THE XML MUST BE ALL ON ONE LINE;
%prxExtract(iElement=ani2digits, iField=ani2digits, iType=c, iLength=50, iXMLField=xml);
length ani1-ani6 $15;
length attr1-attr6 $100;
array arrani [1:6] $ ani1-ani6;
array arrattr [1:6] $ attr1-attr6;
%prxCount (iElement=ani, iXMLField=xml, iDelimiterType=1);
do cnt=1 to prx_count;
%prxExtract(iElement=ani, iField=arrani[cnt], iType=c, iLength=15, iXMLField=xml, iSequence=cnt, iAttributesField=arrattr[cnt]);
end;
run;
Finally - if you need the version for multiple elements you will also need the prxcount macro:
/*****************************************************************************
** PROGRAM: MACROS.PRXCOUNT.SAS
**
** RETURNS THE NUMBER OF TIMES AN ELEMENT IS FOUND IN AN HTML/XML FILE.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iXMLField : The name of the field that contains the XML blob to parse.
** iDelimiterType: (1/2/3). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
** 3 USES ENCODED VALUES FOR <>.
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 25-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 14-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
*****************************************************************************/
/* Generates DATA step statements (call it inside a DATA step) that count how many times   */
/* element &iElement occurs in &iXMLField and leave the count in the variable prx_count.   */
/* prx_count can be used by later statements of the same DATA step, but the DROP statement */
/* at the end keeps it and the other helper variables out of the output dataset.           */
/*   iElement       : the element to count                                                 */
/*   iXMLField      : the variable that holds the XML/HTML text                            */
/*   iDelimiterType : 1 uses < and >, 2 uses [ and ], 3 uses the encoded forms of < and >  */
%macro prxCount(iElement=, iXMLField=, iDelimiterType=1);
  %local delim_open delim_close;
  /* Remove line feed (10) and carriage return (13) characters from the field before matching */
  crLf = byte(10) || byte(13);
  &iXMLField = compress(&iXMLField,crLf,);
  %if &iDelimiterType eq 1 %then %do;
    %let delim_open = <;
    %let delim_close = >;
  %end;
  %else %if &iDelimiterType eq 2 %then %do;
    %let delim_open = \[;
    %let delim_close = \];
  %end;
  %else %if &iDelimiterType eq 3 %then %do;
    %let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
    %let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
  %end;
  %else %do;
    %put ERR%str()ROR (prxCount.sas): You specified an incorrect option for the iDelimiterType parameter.;
  %end;
  /* Opening tag (with or without attributes), lazily matched content, then the closing tag; case-insensitive */
  prx_id = prxparse("/&delim_open&iElement(\s+.*?&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
  prx_count = 0;
  prx_start = 1;
  prx_stop = length(&iXMLField);
  call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  do while (prx_pos > 0);
    prx_count = prx_count + 1;
    /* No CALL PRXPOSN here: counting needs no capture buffer, and the CALL PRXNEXT */
    /* below would overwrite prx_pos and prx_length again before they are read.     */
    call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  end;
  drop crLf prx_:;
%mend;