Why is SAS "proc sql" so much slower than a "data step"? - sas

I was trying to calculate past average stock returns. I find that the following "data step" code performs much better than the "proc sql" code.
The data step code:
/*
 * %same(start=, end=)
 *
 * Adds two columns to WORK.SAME, keyed by (stkcd, ym):
 *   same_<start>_<end> : mean of ret_dm over MSF1 rows of the same stkcd and
 *                        the same month value for which
 *                        start <= a.ym - b.ym <= end
 *   sc_<start>_<end>   : count of non-missing ret_dm values in that mean
 *
 * Reads MSF1 and WORK.SAME, rewrites WORK.SAME, and uses WORK.TEMP as a
 * scratch table that is removed before the macro returns.
 */
%macro same(start = ,end = );
    /* Remove a leftover scratch object named TEMP, whether it is a view or a
       table. Checking for existence first keeps the log free of the
       "does not exist" warning that an unconditional DROP produces. */
    %if %sysfunc(exist(temp, view)) %then %do;
        proc sql; drop view temp; quit;
    %end;
    %if %sysfunc(exist(temp)) %then %do;
        proc sql; drop table temp; quit;
    %end;

    /* Self-join MSF1 and aggregate per (stkcd, ym). Selecting a.* alongside
       the aggregates makes PROC SQL remerge the statistics onto every row,
       and DISTINCT then collapses the repeated rows. */
    proc sql;
        create table temp
        as select distinct a.*, mean(b.ret_dm) as same_&start._&end, count(b.ret_dm) as sc_&start._&end
        from msf1 as a left join msf1 as b
        on a.stkcd = b.stkcd and &start <= a.ym - b.ym <= &end and a.month = b.month
        group by a.stkcd,a.ym;
    quit;

    /* Attach the two new columns to the accumulating SAME table. */
    proc sql;
        create table same
        as select a.*, b.same_&start._&end, b.sc_&start._&end
        from same as a left join temp as b
        on a.stkcd = b.stkcd and a.ym = b.ym;
    quit;

    proc sql; drop table temp;quit;
%mend;

/* Seed SAME with the rows of MSF, then add the 1..12 window columns. */
data same; set msf;run;
%same(start = 1, end = 12);
The proc sql code:
/*
 * %MA_1
 *
 * For every p from 2 to 9 and every lag listed in the global macro variable
 * laglist, computes column ma_<lag>_<p>: the mean of prc over rows of
 * price<p> for the same stkcd whose count value lies between 0 and <lag>
 * below the count of the current msf row. Each column is then left-joined
 * onto ma_allprc by stkcd and date.
 *
 * Reads price2..price9, tradingdate and stock; rewrites price<p>, msf and
 * ma_allprc; uses getname, temp and ma_<lag>_<p> as scratch tables.
 */
%macro MA_1;
%do p = 2 %to 9; *;
%put p &p;
/* Send the log to JUNK while the preparation steps run. */
proc printto log = junk ; run;
/* Add count and ym from tradingdate to price<p>, matching on date. */
proc sql;
create table price&p
as select distinct a.*, b.count,b.ym
from price&p as a left join tradingdate as b
on a.date = b.date;
quit;
proc sort data = price&p; by stkcd ym date;quit;
/* Keep only the last row of each stkcd/ym group in sort order. */
data msf;
set price&p;
by stkcd ym date;
if last.ym;
run;
/* Restore the default log destination. */
proc printto; run;
/* Iterate over the blank-delimited entries of laglist. */
%do j = 1 %to %sysfunc(countw(&laglist));
%let lag = %scan(&laglist,&j);
%put lag &lag;
/*********************************************/
/* Drop the accumulator so PROC APPEND below starts from nothing. */
proc sql; drop table ma_&lag._&p ;quit;
/* NOTE(review): 2018 is hard-coded; presumably the number of rows in
   STOCK - confirm against the data. */
%do i = 1 %to 2018; *;
proc printto log = junk ; run;
/* Copy stkcd from the i-th row of STOCK into macro variable stkcd. */
data getname;
set stock;
if _n_ = &i;
call symput('stkcd',stkcd);
run;
proc printto; run;
%put &i &stkcd;
proc printto log = junk ; run;
/* One stock at a time: both inputs are filtered to the current stkcd
   before the range join, then prc is averaged per stkcd/date. */
proc sql;
create table temp
as select distinct a.*, mean(b.prc) as ma_&lag._&p
from msf (where = (stkcd = "&stkcd" )) as a left join price&p (where = (stkcd = "&stkcd" )) as b
on a.stkcd = b.stkcd and 0 <= a.count - b.count <= &lag
group by a.stkcd, a.date
order by a.stkcd, a.date;
quit;
/* Stack this stock's rows onto the accumulator for the lag/p pair. */
proc append base = ma_&lag._&p data = temp force; quit;
proc printto; run;
%end;
/* Clear the log window. */
dm "log; clear;";
/* Left-join the finished column onto ma_allprc by stkcd and date. */
proc sql;
create table ma_allprc
as select a.*, b.ma_&lag._&p
from ma_allprc as a left join ma_&lag._&p as b
on a.stkcd = b.stkcd and a.date = b.date;
quit;
proc sql; drop table ma_&lag._&p;quit;
%end;
%end;
%mend;
/* Lags that the macro reads through laglist. */
%let laglist = 5 10 20 50 100 200 500 1000 2000; * ;
/* Seed the result table with the rows of msf, then run the macro. */
data ma_allprc; set msf;run;
%ma_1;
"Proc sql" is much slower than I expected: the "data step" version takes about 3 hours, but the "proc sql" version takes about 2 days.
I even have to loop over each stock when using proc sql, because otherwise it takes up too much memory. I admit that using proc sql to calculate past averages is clumsy, but currently I have no better ideas. :(
Does anybody have a solution for this?

Related

SAS macro variable do cannot resolve [duplicate]

This question already has answers here:
Why won't my macro variable resolve?
(2 answers)
Closed 6 years ago.
This is a rather stupid example, but it keeps the essence of what I am trying to do (using SAS University Edition):
/* Lookup table: each row pairs a table name (tables) with a column name (cols). */
data TableList;
input tables $ cols $;
cards;
tab1 col
tab2 cul
;
run;
/*
 * %test - for each of the first two table names in TableList, tries to load
 * the matching cols value into macro variable col and write it to the log.
 *
 * NOTE(review): as written this version fails. The WHERE clause below
 * compares against the single-quoted literal '&t'. Macro references are not
 * resolved inside single quotes, so the comparison is against the two
 * characters "&t", no row matches, INTO never creates col, and %put reports
 * that COL is not resolved.
 */
%macro test;
/* Collect all table names into one blank-delimited macro variable. */
proc sql;
select tables
into:tabs separated by " "
from TableList;
quit;
%do i=1 %to 2;
/* i-th table name from the list. */
%let t = %scan(&tabs,&i);
proc sql;
select cols
into: col
from TableList
where tables='&t';
quit;
%put &col;
%end;
%mend;
%test;
The problem with this is when I run this code I got this error message:
WARNING: Apparent symbolic reference COL not resolved.
&col
Why is this? Doesn't SAS replace &col with its actual value at run time?
UPDATE:
Setting "&t" instead of '&t' solved my problem. The code is now working.
/* Lookup table: each row pairs a table name with a column name. */
data TableList;
    input tables $ cols $;
    datalines;
tab1 col
tab2 cul
;
run;

/*
 * %test - for each of the first two table names in TableList, loads the
 * matching cols value into macro variable col and writes it to the log.
 */
%macro test;
    /* All table names, blank-delimited, in macro variable tabs. */
    proc sql;
        select tables into :tabs separated by " "
            from TableList;
    quit;

    %do i = 1 %to 2;
        %let t = %scan(&tabs, &i);

        /* Double quotes let the macro reference resolve before the compare. */
        proc sql;
            select cols into :col
                from TableList
                where tables = "&t";
        quit;

        %put Column &col;
    %end;
%mend test;

%test;
Several issues here
where tables='&t' will not work because of the single quotes: macro variables are not resolved inside single-quoted strings, so you have to use double quotes whenever a string contains a macro variable reference.
Also &t is not defined
This seems to work (i.e. printing cul in the log) , but I had to define t manually.
/* Lookup table: each row pairs a table name with a column name. */
data TableList;
    input tables $ cols $;
    datalines;
tab1 col
tab2 cul
;
run;

/* Global t defined up front; the macro below reassigns it on every pass. */
%let t=tab2;

/*
 * %test - walks the first two table names in TableList and, for each one,
 * prints the cols value of the matching row to the log.
 */
%macro test;
    proc sql;
        select tables into :tabs separated by " "
            from TableList;
    quit;

    %do i = 1 %to 2;
        %let t = %scan(&tabs, &i);

        proc sql;
            select cols into :col
                from TableList
                where tables = "&t";
        quit;

        %put &col;
    %end;
%mend test;

%test;

How to get minimum and maximum value of all the columns of a table?

How to get minimum and maximum value of all the columns of a table? Please note that the columns may be both numeric, date or character. We have to find min and max of all the variables in following format:
Name_of_columns, minimum, maximum
Here's a macro that will do what you're asking for, and which doesn't require you to know the variable names or their types:
/*
 * %maxmin
 *
 * Builds test.results with one row per variable of test.hashval. Each row
 * holds the variable name (NAME) and its maximum (MAX) and minimum (MIN),
 * all stored as character values.
 *
 * Side effects: writes test.contents, deletes and recreates test.results,
 * and uses WORK.TEMP as a scratch table.
 */
%macro maxmin;
/* get variable names */
proc contents noprint data = test.hashval out=test.contents;run;
/* Count the variables. Re-assigning cnt with %let strips the blanks that
   INTO leaves around the number, so name&cnt below forms a valid name. */
proc sql noprint;
select count(*) into: cnt from test.contents;quit;
%let cnt = &cnt;
/* Store each variable name in name1 .. name<cnt>. */
proc sql noprint;
select name into: name1 - : name&cnt from test.contents;quit;
/* get length of all variable names and results */
proc delete data = test.results; run;
/* Running maxima of the text lengths; they size the columns created below. */
%let name_len = 0;
%let max_len = 0;
%let min_len = 0;
%do i = 1 %to &cnt;
/* Max and min of the i-th variable, stored as text in max<i> and min<i>. */
proc sql noprint;
select max(&&name&i),min(&&name&i) into: max&i, :min&i from test.hashval;quit;
/* %cmpres removes leading/trailing blanks and collapses repeated blanks. */
%let max&i = %cmpres(&&max&i);
%let min&i = %cmpres(&&min&i);
%if (&name_len < %length(&&name&i)) %then %let name_len = %length(&&name&i);
%if (&max_len < %length(&&max&i)) %then %let max_len = %length(&&max&i);
%if (&min_len < %length(&&min&i)) %then %let min_len = %length(&&min&i);
%end;
/*create results */
%do i = 1 %to &cnt;
/* One-row table for the i-th variable, using the widths found above. */
data temp;
length NAME $&name_len MAX $&max_len MIN $&min_len;
NAME = "&&name&i";
MAX = "&&max&i";
MIN = "&&min&i";
run;
proc append base = test.results data= temp force;run;
%end;
%mend maxmin;
%maxmin;
/*
 * Template: replace each "..." with the remaining columns of myTable.
 *
 * The view holds exactly two rows: the first carries every column's minimum,
 * the second every column's maximum. Numeric columns are converted to text
 * with PUT, presumably so that all transposed values share one character
 * type.
 *
 * UNION ALL is used instead of UNION on purpose. UNION sorts the rows and
 * removes duplicates, so the minimum row is not guaranteed to come first,
 * and a table whose minima equal its maxima collapses to a single row, which
 * leaves no col2 to rename below.
 */
proc sql;
create view myExtrema_1 as
Select min(alphaVar) as alphaVar, ..., put(min(numVar),best32.) as numVar, ...
from myTable
Union All
Select max(alphaVar), ..., put(max(numVar),best32.), ...
from myTable;
quit;
/* Turn the two rows into one row per variable: col1 = minimum, col2 = maximum. */
proc transpose data=myExtrema_1
out=myExtrema(rename=(
_name_ = Variable
col1 = Minimum
col2 = Maximum
));
var alphaVar ... numVar ...;
run;
On request of the commenter, I tested it with
/*
 * Minimum and maximum of every column of sasHelp.Class.
 *
 * The view holds exactly two rows: minima first, maxima second. Numeric
 * columns are converted to text with PUT so that every column of the view
 * is character.
 *
 * UNION ALL is used instead of UNION on purpose. UNION sorts the rows and
 * removes duplicates, so the minimum row is not guaranteed to come first,
 * and identical minimum/maximum rows would collapse to one row, which leaves
 * no col2 to rename below.
 */
proc sql;
create view Class_1 as
Select min(Name) as Name
, min(Sex) as Sex
, put(min(Age),best32.) as Age
, put(min(Height),best32.) as Height
, put(min(Weight),best32.) as Weight
from sasHelp.Class
Union All
Select max(Name) as Name
, max(Sex) as Sex
, put(max(Age),best32.) as Age
, put(max(Height),best32.) as Height
, put(max(Weight),best32.) as Weight
from sasHelp.Class;
quit;
/* One output row per variable: col1 = minimum, col2 = maximum. */
proc transpose data=Class_1
out=Class(rename=(
_name_ = Variable
col1 = Minimum
col2 = Maximum
));
var Name Sex Age Height Weight;
run;

Crosstable displaying frequency combination of N variables in SAS

What I've got:
a table of 20 rows in SAS (originally 100k)
various binary attributes (columns)
What I'm looking to get:
A crosstable displaying the frequency of the attribute combinations
like this:
Attribute1 Attribute2 Attribute3 Attribute4
Attribute1 5 0 1 2
Attribute2 0 3 0 3
Attribute3 2 0 5 4
Attribute4 1 2 0 10
*The actual sum of combinations is made up and probably not 100% logical
The code I currently have:
/* Dummy data: 20 rows of four 0/1 flags. A flag is 1 when its
   rand("Normal") draw exceeds 0.5. */
data monthly_sales (drop = row k);
    array flag{4} Attribute1-Attribute4;
    do row = 1 to 20;
        do k = 1 to dim(flag);
            flag{k} = rand("Normal") > 0.5;
        end;
        output;
    end;
run;
I guess this can be done smarter, but this seems to work. First I created a table that should hold all the frequencies:
/* Empty 4x4 grid: four rows, each with Attribute1-Attribute4 missing. */
data crosstable;
    array cell{4} Attribute1-Attribute4;
    call missing(of cell{*});
    do row = 1 to 4;
        output;
    end;
    drop row;
run;
Then I loop through all the combinations, inserting the count into the crosstable:
/*
 * %lup - fills the 4x4 crosstable. For every pair i <= j it counts the rows
 * of monthly_sales where both flags are true and writes that count into
 * cells (row i, Attribute j) and (row j, Attribute i).
 */
%macro lup();
    %do i = 1 %to 4;
        %do j = &i %to 4;
            /* Rows in which both attributes are true. */
            proc sql noprint;
                select count(*)
                    into :Antall&i&j
                    from monthly_sales
                    where Attribute&i and Attribute&j;
            quit;

            /* The table is symmetric, so one count fills both mirrored cells. */
            data crosstable;
                set crosstable;
                if _n_ = &i then Attribute&j = &&Antall&i&j;
                if _n_ = &j then Attribute&i = &&Antall&i&j;
            run;
        %end;
    %end;
%mend;

%lup;
Note that since the frequency count for (i,j)=(j,i) you do not need to do both.
I'd recommend using the built-in SAS tools for this sort of thing, and probably displaying your data slightly differently as well, unless you really want a diagonal table. e.g.
/* Dummy data as before, plus a constant count = 1 for PROC SUMMARY to sum. */
data monthly_sales (drop = row k);
    array flag{4} Attribute1-Attribute4;
    do row = 1 to 20;
        do k = 1 to dim(flag);
            flag{k} = rand("Normal") > 0.5;
        end;
        count = 1;
        output;
    end;
run;

/* Option 1: four-way frequency table written to a data set. */
proc freq data = monthly_sales noprint;
    tables attribute1 * attribute2 * attribute3 * attribute4
        / out = frequency_table;
run;

/* Option 2: sum of count per combination of the four class variables. */
proc summary data = monthly_sales nway;
    class attribute1 attribute2 attribute3 attribute4;
    var count;
    output out = summary_table (drop = _TYPE_ _FREQ_) sum(COUNT) = ;
run;
Either of these gives you a table with 1 row for each combination of attributes in your data, which is slightly different from what you requested, but conveys the same information. You can force proc summary to include rows for combinations of class variables that don't exist in your data by using the completetypes option in the proc summary statement.
It's definitely worth taking the time to get familiar with proc summary if you're doing statistical analysis in SAS - you can include additional output statistics and process multiple variables with minimal additional code and processing overhead.
Update: it's possible to produce the desired table without resorting to macro logic, albeit a rather complex process:
/*
 * Builds the attribute-by-attribute frequency table WANT from monthly_sales
 * without macro loops: summarise, filter, transpose to long form, pair the
 * rows with a self-join, then transpose back to a square table.
 */
/* Sum count for every 1-way and 2-way combination of the class variables;
   COMPLETETYPES also emits combinations that do not occur in the data. */
proc summary data = monthly_sales completetypes;
ways 1 2; /*Calculate only 1 and 2-way summaries*/
class attribute1 attribute2 attribute3 attribute4;
var count;
output out = summary_table(drop = _TYPE_ _FREQ_) sum(COUNT)= ;
run;
/*Eliminate unnecessary output rows*/
data summary_table;
set summary_table;
/* All variables whose names start with "attribute". */
array a{*} attribute:;
/* Number of attributes equal to 1 on this row (missing values are ignored). */
sum = sum(of a[*]);
/* Reset on every row because the sum statement below retains its target. */
missing = 0;
do i = 1 to dim(a);
missing + missing(a[i]);
/* Replace each flag by flag * count: 1 becomes the count, 0 stays 0,
   missing stays missing. */
a[i] = a[i] * count;
end;
/*We want rows where two attributes are both 1 (sum = 2),
or one attribute is 1 and the others are all missing*/
if sum = 2 or (sum = 1 and missing = dim(a) - 1);
drop i missing sum;
/* Identifier for each kept row, used as the BY/join key below. */
edge = _n_;
run;
/*Transpose into long format - 1 row per combination of vars*/
proc transpose data = summary_table out = tr_table(where = (not(missing(col1))));
by edge;
var attribute:;
run;
/*Use cartesian join to produce table containing desired frequencies (still not in the right shape)*/
option linesize = 150;
/* _method and _tree ask PROC SQL to write its execution plan to the log.
   Within one edge, a group of 4 joined rows comes from a 2-attribute row
   (only the off-diagonal pairs are kept); a group of 1 comes from a
   single-attribute row (the diagonal). */
proc sql noprint _method _tree;
create table diagonal as
select a._name_ as aname,
b._name_ as bname,
a.col1 as count
from tr_table a, tr_table b
where a.edge = b.edge
group by a.edge
having (count(a.edge) = 4 and aname ne bname) or count(a.edge) = 1
order by aname, bname
;
quit;
/*Transpose the table into the right shape*/
proc transpose data = diagonal out = want(drop = _name_);
by aname;
id bname;
var count;
run;
/*Re-order variables and set missing values to zero*/
data want;
/* Naming the variables before SET fixes their order in the output. */
informat aname attribute1-attribute4;
set want;
array a{*} attribute:;
do i = 1 to dim(a);
/* sum(x, 0) turns a missing cell into 0 and leaves other values unchanged. */
a[i] = sum(a[i],0);
end;
drop i;
run;
Yeah, user667489 was right, I just added some extra code to get the cross-frequency table looking good. First, I created a table with 10 million rows and 10 variables:
/* Test data: 10,000,000 rows of ten 0/1 flags. A flag is 1 when its
   rand("Normal") draw exceeds 0.5. */
data monthly_sales (drop = row k);
    array flag{10} Attribute1-Attribute10;
    do row = 1 to 10000000;
        do k = 1 to dim(flag);
            flag{k} = rand("Normal") > 0.5;
        end;
        output;
    end;
run;
Create an empty 10x10 crosstable:
/* Empty 10x10 grid: ten rows, each with Attribute1-Attribute10 missing. */
data crosstable;
    array cell{10} Attribute1-Attribute10;
    call missing(of cell{*});
    do row = 1 to 10;
        output;
    end;
    drop row;
run;
Create a frequency table using proc freq:
/* Ten-way frequency table: one output row per observed combination of the
   ten flags, with its count. */
proc freq data = monthly_sales noprint;
    tables attribute1 * attribute2 * attribute3 * attribute4 * attribute5
         * attribute6 * attribute7 * attribute8 * attribute9 * attribute10
         / out = frequency_table;
run;
Loop through all the combinations of Attributes and sum the "count" variable. Insert it into the crosstable:
/*
 * %lup - fills the 10x10 crosstable from frequency_table. For every pair
 * i <= j it sums count over the rows where both flags are true and writes
 * the total into cells (row i, Attribute j) and (row j, Attribute i).
 */
%macro lup();
    %do i = 1 %to 10;
        %do j = &i %to 10;
            /* Total count over combinations in which both attributes are true. */
            proc sql noprint;
                select sum(count)
                    into :Antall&i&j
                    from frequency_table
                    where Attribute&i and Attribute&j;
            quit;

            /* The table is symmetric, so one total fills both mirrored cells. */
            data crosstable;
                set crosstable;
                if _n_ = &i then Attribute&j = &&Antall&i&j;
                if _n_ = &j then Attribute&i = &&Antall&i&j;
            run;
        %end;
    %end;
%mend;

%lup;

sas how to use a list to store count distinct of all the variables in a table

I want to store the count distinct of each variable from a table in another. I wanted to use a loop for it, over the list of the variables. So first, I stored the variables names in "vars", doing this:
/* Collect the column names of HW.ORDERS into the blank-delimited macro
   variable vars. */
proc sql;
    select name
        into :vars separated by ' '
        from dictionary.columns
        where libname = 'HW'
          and memname = "ORDERS";
quit;
Then, I created another list with the result of the count distinct with the following code:
/*
 * %g - for the first three names in the macro variable vars, counts the
 * distinct values of that column in hw.ORDERS, appends each count to the
 * blank-delimited list b, and writes the list to the log.
 *
 * NOTE(review): b is first assigned inside the macro, so unless a global b
 * already exists it is local to %g and is gone once the macro ends. Code
 * that runs afterwards cannot resolve it.
 */
%macro g();
%let b=;
/* NOTE(review): the upper bound 3 is hard-coded rather than taken from the
   number of names in vars. */
%do i = 1 %to 3;
/* i-th column name. */
%let a=%scan(&vars,&i);
proc sql;
select count(distinct &a)
into :gaby from hw.ORDERS;
quit;
/* Append this column's distinct count to the running list. */
%let b=&b &gaby;
%end;
%put &b;
%mend g;
%g();
After this, I wanted to add both lists to a table. I can add the vars variable, but not the b variable.
/*
 * Builds table a with three rows: timept (1..3), variable (the i-th name in
 * vars) and dist (intended to be the i-th distinct count in b).
 *
 * NOTE(review): "&vars" and "&b" are resolved when the step is compiled. If
 * no global macro variable b exists at that point, "&b" stays as literal
 * text; SCAN treats & as a delimiter and returns "b".
 */
data a;
/* Copy the current values of vars and b into macro variables lista and
   lista1 while the step runs. */
call symput('lista', symget('vars'));
call symput('lista1', symget('b'));
do i=1 to 3;
timept=i;
variable=scan("&vars",i);
dist=scan("&b",i);
output;
end;
run;
The table correctly shows the names of the variables, but instead of the distinct counts (which were stored in b) it shows the letter "b".
Is there a way to do this? Also, is there an easier way to do it?
Thanks!!!!!!!!!!
You're pretty close. I would just use a single SQL pass and create an output table directly. If you want it in a column form, then use PROC TRANSPOSE.
/* Collect the column names of SASHELP.SHOES into the blank-delimited macro
   variable vars, then echo the list to the log. */
proc sql noprint;
    select name
        into :vars separated by ' '
        from dictionary.columns
        where libname = 'SASHELP'
          and memname = "SHOES";
quit;

%put &vars;
/*
 * %create_table - builds WORK.OUTPUT with a single row holding
 * count(distinct <var>) for every name in the macro variable vars, using one
 * pass over sashelp.shoes. Each output column keeps the name of the source
 * column.
 */
%macro create_table();
    %local i n var;
    %let n = %sysfunc(countw(&vars));

    proc sql noprint;
        create table output as
        select
        %do i = 1 %to &n;
            %let var = %scan(&vars, &i);
            /* Emit a separating comma before every column except the first. */
            %if &i > 1 %then %str(,);
            count(distinct &var) as &var
        %end;
        from sashelp.shoes;
    quit;
%mend;

%create_table;

/* Reshape to one row per variable: variable = column name, Dist = its count. */
proc transpose data = output
               out  = want (rename = (_NAME_ = variable COL1 = Dist));
run;

How to loop through a macro variable in SAS

I have an example like this:
/* INTO with a single target and no SEPARATED BY stores only the first row's
   dealno in macro variable deal_no. QUIT ends the PROC SQL step so that it
   does not stay open until the next step boundary. */
proc sql;
select dealno into :deal_no
from deal_table;
quit;
Now I want to loop over the variable deal_no, which should contain all the dealno values in table deal_table, but I don't know how to do it.
Another option is to add 'separated by' to the SQL code, which puts a delimiter between the values. You can then use the SCAN function in a data step, or %SCAN in a macro, to loop through the values and perform whatever task you want. Example below.
/* Put every age from sashelp.class into one comma-delimited macro variable
   and echo it to the log. */
proc sql noprint;
    select age
        into :age separated by ','
        from sashelp.class;
quit;

%put &age.;

/* Split the list back into one row per value. COUNTW and SCAN are both
   called with their default delimiters, so they agree on the word count. */
data test (drop = i);
    do i = 1 to countw("&age.");
        age = scan("&age.", i);
        output;
    end;
run;
If you do
%put &deal_no;
you can see that it only contains the first value of dealno, not all of them.
To avoid that you can do something like this:
/* The CREATE TABLE runs first so that the automatic macro variable sqlobs
   holds the row count when the INTO range below is read. */
proc sql;
    create table counter as
        select dealno from deal_table;

    /* One macro variable per row: deal_no_1 .. deal_no_<row count>. */
    select dealno
        into :deal_no_1 - :deal_no_&sqlobs
        from deal_table;
quit;

/* Keep the row count, because later PROC SQL statements reset sqlobs. */
%let N = &sqlobs;

/* %loop - writes deal_no_1 .. deal_no_<N> to the log, one value per line. */
%macro loop;
    %do i = 1 %to &N;
        %put &&deal_no_&i;
    %end;
%mend;

%loop; run;
Here's another solution.
/* Put every age from ageData into one blank-delimited macro variable and
   echo it to the log. */
proc sql noprint;
    select age
        into :ageVals separated by ' '
        from ageData;
quit;

%put &ageVals;

/* %loopAgeVals - walks the list word by word, writing each value to the log,
   and stops at the first empty word. */
%macro loopAgeVals;
    %let i = 1;
    %let ageVal = %scan(&ageVals, &i);

    %do %while ("&ageVal" ne "");
        %put &ageVal;
        %let i = %eval(&i + 1);
        %let ageVal = %scan(&ageVals, &i);
    %end;
%mend;

%loopAgeVals;