sas proc sql having with condition for positive and negative values - sas

I don't know how to select, in proc sql, the min for positive values of diff (to have the 25/11/2022) and the max for negatives diff (to have the 11/11/2022).
Do I need to use the case statement or having ?
Thank you for your help !

This is one way of doing it:
/* Sample data: one key (NO_K) with four dates and a signed offset (diff)
   for each date. NO_K is read as a numeric value, so the leading zero in
   the raw data is not stored. */
data have;
  /* The colon modifier is list-input syntax: it reads DATE1 with the
     DDMMYY10. informat up to the next blank. */
  input NO_K DATE1 :ddmmyy10. diff;
  /* A FORMAT statement takes a plain format name; the colon modifier is
     not valid here. */
  format DATE1 ddmmyy10.;
  datalines;
055800 04/11/2022 -1209600
055800 11/11/2022 -604800
055800 25/11/2022 604800
055800 30/11/2022 1036800
;
run;
/* Select the row with the smallest positive diff and the row with the
   largest negative diff, then stack the two results into WANT. */
proc sql;
  /* WHERE keeps one side of zero; HAVING then compares every remaining row
     with the summary value computed over those rows. */
  create table want1 as
    select *
      from have
     where 0 < diff
    having diff = min(diff);

  create table want2 as
    select *
      from have
     where 0 > diff
    having diff = max(diff);
quit;

/* Positive-side result first, negative-side result second. */
data want;
  set want1
      want2;
run;

You could use the below should it need to be entirely in PROC SQL
/* Single-step variant: each branch keeps the rows of HAVE whose diff equals
   a scalar subquery result (smallest positive diff / largest negative diff).
   UNION combines the branches and removes exact duplicate rows. */
proc sql;
  create table want as
    select h.*
      from have as h
     where h.diff = (select min(p.diff)
                       from have as p
                      where p.diff > 0)
    union
    select h.*
      from have as h
     where h.diff = (select max(n.diff)
                       from have as n
                      where n.diff < 0);
quit;

Related

Why SAS "proc sql" is way too slower than "data step"

I was trying to calculate past average stock returns. I find using the following "data step" code is much better than using "proc sql" code.
The data step code:
/* Macro SAME(start=, end=)
   For every (stkcd, ym) row of MSF1, averages ret_dm over the rows of MSF1
   that have the same stkcd, the same value of month, and a ym that lies
   between START and END units before the current row's ym. The mean and the
   count of non-missing ret_dm are then left-joined onto table SAME as two
   new columns named same_<start>_<end> and sc_<start>_<end>.
   NOTE(review): MSF1 is not created anywhere in the visible code, while SAME
   is seeded from MSF below - confirm the two are meant to differ. */
%macro same(start = ,end = );
/* Remove a leftover view named TEMP before the table TEMP is created. */
proc sql;drop view temp;quit;
proc sql;
create table temp
as select distinct a.*, mean(b.ret_dm) as same_&start._&end, count(b.ret_dm) as sc_&start._&end
from msf1 as a left join msf1 as b
on a.stkcd = b.stkcd and &start <= a.ym - b.ym <= &end and a.month = b.month
group by a.stkcd,a.ym;
quit;
/* SAME is rebuilt from itself, so each macro call adds two more columns. */
proc sql;
create table same
as select a.*, b.same_&start._&end, b.sc_&start._&end
from same as a left join temp as b
on a.stkcd = b.stkcd and a.ym = b.ym;
quit;
proc sql; drop table temp;quit;
%mend;
/* Seed the accumulating result table, then add the 1-to-12 window columns. */
data same; set msf;run;
%same(start = 1, end = 12);
The proc sql code:
/* Macro MA_1
   For p = 2..9 and for every lag listed in the global macro variable
   LAGLIST, computes a moving average of prc per stock and joins it onto
   MA_ALLPRC as column ma_<lag>_<p>. The work is done one stock at a time:
   the innermost loop reads row i of STOCK, runs a range self-join for that
   single stkcd, and appends the result to ma_<lag>_<p>.
   Reads: price2..price9, tradingdate, stock, msf, ma_allprc, LAGLIST.
   Writes: price&p (overwritten), msf (overwritten), ma_allprc. */
%macro MA_1;
%do p = 2 %to 9; *;
%put p &p;
/* Route the log to JUNK while the noisy steps run; the bare PROC PRINTTO
   further down switches it back. */
proc printto log = junk ; run;
/* Attach count and ym from TRADINGDATE to every row of price&p by date. */
proc sql;
create table price&p
as select distinct a.*, b.count,b.ym
from price&p as a left join tradingdate as b
on a.date = b.date;
quit;
proc sort data = price&p; by stkcd ym date;quit;
/* Keep the last row of each stkcd/ym group; this replaces table MSF. */
data msf;
set price&p;
by stkcd ym date;
if last.ym;
run;
proc printto; run;
%do j = 1 %to %sysfunc(countw(&laglist));
%let lag = %scan(&laglist,&j);
%put lag &lag;
/*********************************************/
/* Start from an empty accumulator table for this lag/p combination. */
proc sql; drop table ma_&lag._&p ;quit;
/* The upper bound 2018 is hard-coded.
   NOTE(review): presumably the number of rows in STOCK - confirm. */
%do i = 1 %to 2018; *;
proc printto log = junk ; run;
/* Copy stkcd from row i of STOCK into macro variable STKCD. */
data getname;
set stock;
if _n_ = &i;
call symput('stkcd',stkcd);
run;
proc printto; run;
%put &i &stkcd;
proc printto log = junk ; run;
/* For each MSF row of this stock, average prc over the price&p rows of the
   same stock whose count is between 0 and the lag behind the row's count. */
proc sql;
create table temp
as select distinct a.*, mean(b.prc) as ma_&lag._&p
from msf (where = (stkcd = "&stkcd" )) as a left join price&p (where = (stkcd = "&stkcd" )) as b
on a.stkcd = b.stkcd and 0 <= a.count - b.count <= &lag
group by a.stkcd, a.date
order by a.stkcd, a.date;
quit;
proc append base = ma_&lag._&p data = temp force; quit;
proc printto; run;
%end;
/* Clear the log window between lags. */
dm "log; clear;";
/* Add the finished column to the accumulating result table. */
proc sql;
create table ma_allprc
as select a.*, b.ma_&lag._&p
from ma_allprc as a left join ma_&lag._&p as b
on a.stkcd = b.stkcd and a.date = b.date;
quit;
proc sql; drop table ma_&lag._&p;quit;
%end;
%end;
%mend;
/* Lag window lengths, expressed in units of the count column. */
%let laglist = 5 10 20 50 100 200 500 1000 2000; * ;
/* Seed the result table from MSF before the macro starts adding columns. */
data ma_allprc; set msf;run;
%ma_1;
"Proc sql" is much slower than I thought. "Data step" takes about 3 hours, but "Proc sql" takes about 2 days.
I even have to loop over each stock when using proc sql, cause it takes up too much of the memory space, I have to say that using proc sql to calculate past averages is dumb, but currently I have no better ideas. :(
Does anybody have a solution with that..

Finding the max value of a variable in SAS per ID per time period

/* Distinct (formatted_date, Contract, late_days) combinations, sorted by
   date and contract. The original GROUP BY 1,2 had no summary function, so
   PROC SQL treats it as an ORDER BY; the ordering is now stated directly
   and by column name rather than by position. */
proc sql;
  create table abc as
    select distinct
           formatted_date,
           Contract,
           late_days
      from merged_dpd_raw_2602
     order by formatted_date, Contract;
quit;
This gives me the 3 variables I'm working with.
they have the form
|ID|Date in YYMMs.10| number|
/* Largest late_days value per contract across all dates in SASA.
   GROUP BY already yields one row per contract, so DISTINCT is not needed,
   and the grouping column is named instead of referenced by position. */
proc sql;
  create table max_dpd_per_contract as
    select contract,
           max(late_days) as DPD_for_contract
      from sasa
     group by contract;
quit;
this gives me the maximum number for the entire period but how do I go on to make it per period?
I'm guessing the timeseries procedure should be used here.
/* Attempt to accumulate late_days per contract with PROC TIMESERIES, taking
   the maximum within each time interval, and write the series to SASA2.
   NOTE(review): the goal stated in the question is a monthly maximum, but
   the ID statement below accumulates per day (interval=day) - confirm
   whether interval=month is what is needed.
   NOTE(review): TREND is followed by the word "maximum"; verify against the
   PROC TIMESERIES documentation that this statement does what is intended. */
proc timeseries data=sasa
out=sasa2;
by contract;
id formatted_date interval=day ACCUMULATE=maximum ;
trend maximum ;
var late_days;
run;
but I am unsure how to continue.
I want to find the maximum value of the variable "late_days" per given time period (month). So for contract A, for the time period jan2018, the max late_days value is X.
how the data looks:https://imgur.com/iIufDAx
In SQL you will want to calculate your aggregate within a group that uses a computed month value.
Example:
/* Simulated daily data for 2020: ten contracts, each carrying a running
   days_late counter. One uniform draw is consumed per day. */
data have;
  call streaminit(2021);
  length contract date days_late 8;
  format date date9.;

  do contract = 1 to 10;
    days_late = 0;
    do date = '01jan2020'd to '31dec2020'd;
      if days_late = 0 then do;
        /* Not late yet: becomes late with probability 0.25
           (the comparison evaluates to 1 or 0 and is added). */
        days_late + (rand('uniform') < 0.25);
      end;
      else do;
        /* Already late: stays late another day with probability 0.55,
           otherwise the counter resets. */
        if rand('uniform') < .55 then days_late + 1;
        else days_late = 0;
      end;
      output;
    end;
  end;
run;
/* Raise an error (instead of a note) when a referenced format is missing. */
options fmterr;

/* Maximum days_late per contract and calendar month. INTNX with an
   increment of 0 moves each date to the first day of its month, and that
   computed value is the grouping key. */
proc sql;
  create table want as
    select
        contract
      , intnx('month', date, 0) as month format = monyy7.
      , max(days_late) as max_days_late
    from
        have
    group by
        contract, month
  ;
quit;  /* end the SQL step explicitly instead of leaving it open */
You will get the same results using Proc MEANS
/* Same aggregation with PROC MEANS: date is a CLASS variable displayed with
   a month-level format, so class levels are formed from the formatted
   value. NWAY keeps only the contract-by-date combinations. */
proc means data=have nway noprint;
  format date monyy7.;
  class contract date;
  output out=want_2
         max(days_late) = max_days_late;
run;

Date comparison using PROC SQL within SAS

I'm using PROC SQL within SAS and trying to get a count where the current month is equal to the month on a date field I'm reading. the format of the input date is - mmddyy10.
This is a sample of what I'm trying –
/* Four sample jobs whose last-run value is a Julian date (yyddd);
   DATEJUL converts it to a SAS date value. */
data test;
  input job $ lastrun;
  DateNew = datejul(lastrun);
  format DateNew mmddyy10.;
  cards;
joba 19300
jobb 19200
jobc 19303
jobx 19288
;
run;

/* List the rows to check the converted dates. */
proc print data=test;
run;
/* Question's attempt: count the jobs whose datenew falls in the previous
   month. As written it does not do that, for two reasons:
     - datenew already holds a date value (it comes from DATEJUL), while
       DATEPART expects a datetime value;
     - the right-hand side is one single day (the same day of the previous
       month), so the test is an equality with one date, not a month match. */
proc sql;
select
count(job) AS cnt_LastMonth
from test
where datepart(datenew) = intnx('month', today(), -1, 'same');
quit;
In this example I'm expecting the cnt_LastMonth to return 3, however it returns 0.
You can't calculate datepart from date variable, only from datetime. And if you want to compare dates that belong to one month, don't ignore year value.
/* Count the jobs whose DateNew lies in the month before the current one. */
proc sql;
  create table qert as
    select count(job) as cnt_LastMonth
      from test
     /* Both sides are moved to the first day of their month ('b'), so the
        comparison matches on month and year together.
        A MONTH()/YEAR() test such as
          month(DateNew) = month(today())-1 and year(DateNew) = year(today())
        is an alternative, but in January month(today())-1 evaluates to 0
        and nothing matches. */
     where intnx('month', today(), -1, 'b') = intnx('month', DateNew, 0, 'b');
quit;
/* Count the jobs last run in the previous calendar month.
   The month is derived from today's date rather than hard-coded, and the
   year is compared as well so the same month of another year is not
   counted. */
proc sql;
  select count(job) as cnt_LastMonth
    from test
   where month(DateNew) = month(intnx('month', today(), -1))
     and year(DateNew)  = year(intnx('month', today(), -1));
quit;
OR
/* Nested-select variant: the inner query adds the same day of the previous
   month to every row, the middle query extracts month and year from both
   dates, and the outer query counts the rows where both parts agree.
   The year is compared too, so only the immediately preceding month
   matches. */
proc sql;
  select count(a2.job) as cnt_LastMonth
    from (select a1.*,
                 month(a1.Date_Minus_1) as Month_filter,
                 year(a1.Date_Minus_1)  as Year_filter,
                 month(a1.DateNew)      as Month,
                 year(a1.DateNew)       as Year
            from (select *,
                         intnx('month', today(), -1, 's') as Date_Minus_1 format=mmddyy10.
                    from test) as a1) as a2
   where a2.Month = a2.Month_filter
     and a2.Year  = a2.Year_filter;
quit;  /* PROC SQL is ended by QUIT; a RUN statement has no effect in it */

Use a macro instead of 25 proc sql steps?

I have a SAS code (SQL) that has to repeat for 25 times; for each month/year combination (see code below). How can I use a macro in this code?
/* Per household: total RG_count over the rows valid on 01AUG2017, plus a
   1/0 flag that is 1 when that total is at least 2. */
proc sql;
  create table hh_oud_AUG_17 as
    select hh_key,
           sum(RG_count)        as RG_count_aug_17,
           (sum(RG_count) >= 2) as loyabo_recht_aug_17
      from basis_RG_oud
     where valid_from_dt <= '01AUG2017'd
       and '01AUG2017'd <= valid_to_dt
     group by hh_key
     order by hh_key;
quit;
/* Per household: total RG_count over the rows valid on 01SEP2017, plus a
   1/0 flag that is 1 when that total is at least 2. */
proc sql;
  create table hh_oud_SEP_17 as
    select hh_key,
           sum(RG_count)        as RG_count_sep_17,
           (sum(RG_count) >= 2) as loyabo_recht_sep_17
      from basis_RG_oud
     where valid_from_dt <= '01SEP2017'd
       and '01SEP2017'd <= valid_to_dt
     group by hh_key
     order by hh_key;
quit;
If you use a data step to do this, you can put all the desired columns in the same output dataset rather than using a macro to create 25 separate datasets:
/* Build two space-separated lists of variable names, one name per month for
   25 consecutive months starting at August 2017, and store them in the
   macro variables VLIST1 (RG_count_yymm) and VLIST2 (loyabo_recht_yymm). */
data _null_;
  length suffix $4 vlist1 vlist2 $1000;
  first_month = '01aug2017'd;
  do k = 0 to 24;
    /* yymmn4. renders the month as a 4-character yymm suffix */
    suffix = put(intnx('month', first_month, k, 's'), yymmn4.);
    vlist1 = catx(' ', vlist1, cats('RG_count_', suffix));
    vlist2 = catx(' ', vlist2, cats('loyabo_recht_', suffix));
  end;
  call symput('vlist1', vlist1);
  call symput('vlist2', vlist2);
run;

/* Echo both lists to the log for inspection. */
%put vlist1 = &vlist1;
%put vlist2 = &vlist2;
/* Produce the output table: one row per hh_key carrying, for each of the 25
   months named in VLIST1 / VLIST2, the summed RG_count of the rows valid on
   the first day of that month and a 1/0 flag (sum >= 2).
   Requires the macro variables VLIST1 and VLIST2 and an input sorted by
   hh_key.
   Changes from the earlier version of this step:
     - the array is named MONTH_STATS, because an array may not share its
       name with a variable and the input already has a variable RG_count;
     - the compile-time SET names the table that is actually read;
     - RG_count is summed (matching sum(RG_count) in the SQL steps) instead
       of adding 1 per row;
     - the flags are derived once per household after the accumulation loop,
       so the input is read only once. */
data want;
  if 0 then set basis_RG_oud;   /* fixes variable order; never executes */
  start_month = '01aug2017'd;
  /* row 1 = monthly sums (VLIST1), row 2 = monthly flags (VLIST2) */
  array month_stats[2, 0:24] &vlist1 &vlist2;

  /* Read every row of one hh_key within a single DATA step iteration. */
  do until (last.hh_key);
    set basis_RG_oud;
    by hh_key;
    do i = 0 to hbound2(month_stats);
      if valid_from_dt <= intnx('month', start_month, i, 's') <= valid_to_dt
        then month_stats[1, i] = sum(month_stats[1, i], RG_count);
    end;
  end;

  /* A missing sum (no valid row that month) compares as less than 2,
     so its flag is 0. */
  do i = 0 to hbound2(month_stats);
    month_stats[2, i] = (month_stats[1, i] >= 2);
  end;
  /* the implicit OUTPUT writes one row per hh_key here */
run;
Create a second data set that enumerates (is a list of) the months to be examined. Cross Join the original data to that second data set. Create a single output table (or view) that contains the month as a categorical variable and aggregates based on that. You will be able to by-group process, classify or subset based on the month variable.
/* One row per calendar month from January 2017 through December 2018,
   each holding the first day of that month. */
data months;
  format month monyy7.;
  do _k = 0 to 23;
    month = intnx('month', '01jan2017'd, _k, 'b');
    output;
  end;
  drop _k;
run;
/* Pair every source row with every month in MONTHS, keep the pairs where
   the month's date falls inside the row's validity window, and aggregate
   per month and household. The result is a single long table with month as
   a categorical column. */
proc sql;
  create table want as
    select
        month, hh_key,
        sum(RG_count) as RG_count,
        case when sum(RG_count) >= 2 then 1 else 0 end as loyabo_recht
    from
        basis_RG_oud
      cross join
        months
    where
        valid_from_dt <= month <= valid_to_dt
    group
        by month, hh_key
    order
        by month, hh_key
  ;
quit;  /* end the SQL step explicitly before any later step starts */
…
/* Some analysis */
BY MONTH;
…
/* Some tabulation */
CLASS MONTH;
TABLE … MONTH …
WHERE year(month) = 2018;

sas proc sql - get min date and add 1 year

I have a dataset with IDs, and each ID has multiple dates (actually datetime). I want to use PROC SQL to get the minimum datetime and also add 1 year to the minimum. I'm trying to do this all in one PROC SQL but have been fumbling and can't get this to work. Below are two attempts. Would appreciate any advice.
/* Test data: for each of five person ids, 100 random dates on or after
   01JAN2012 (offset 0-999 days), plus the matching midnight datetime. */
data have;
  format date mmddyy10. dt datetime15.;
  do person_id = 100 to 500 by 100;
    do i = 1 to 100;
      jdate = int(ranuni(123987) * 1000);   /* day offset, 0..999 */
      date  = '01jan2012'd + jdate;
      dt    = dhms(date, 0, 0, 0);
      output;
    end;
  end;
run;
*** TRY1: THIS DOES NOT WORK - GETS MIN DATE/TIME AND REMERGES WITH EVERY RECORD***;
/* Why: the select list mixes a summary function (min(dt)) with an
   expression on the row-level column dt and has no GROUP BY, so the summary
   value is remerged onto every input row. followup_date is computed from
   each row's own dt, not from the minimum. */
proc sql;
create table try1 as
select min(dt) as index_dt format=datetime15. ,
(dt + 365*24*60*60) as followup_date format=datetime15.
from have
;
quit;
*** TRY2: USE MIN() IN "HAVING" STATEMENT ***;
*** PROBLEMATIC IF PERSON_ID HAS MIN(DT) OCCUR MULTIPLE TIMES ***;
/* Keeps, within each person_id, the rows whose dt equals that person's
   minimum dt. Every row tied at the minimum is kept, so a person can appear
   more than once. followup_date adds 365 days expressed in seconds. */
proc sql;
create table try2 as
select person_id,
dt as index_dt format=datetime15.,
(dt + 365*24*60*60) as followup_date format=datetime15.
from have
group by person_id
having dt=min(dt)
;
quit;
Try this:
/* Earliest datetime in HAVE and the moment 365 days later. CALCULATED lets
   the second column reuse the alias defined just before it, so both columns
   are summary values and the result is a single row. */
proc sql;
  create table try1 as
    select min(dt) as index_dt format=datetime15.,
           (calculated index_dt) + 365 * 86400 as followup_date format=datetime15.
      from have;
quit;
The trick here is using the "calculated" keyword.
Also you may want to do the following to add a year on instead of your multiplications:
/* Earliest datetime in HAVE and the same moment one calendar year later.
   INTNX with the 'dtyear' interval works directly on datetime values, and
   'same' alignment keeps the month, day and time of day. This replaces the
   put/compress/input round trip through text, which dropped the seconds
   (time5.) and relied on string parsing. */
proc sql;
  create table try1 as
    select
        min(dt) as index_dt format=datetime15.,
        intnx('dtyear', calculated index_dt, 1, 'same') as followup_date format=datetime15.
    from have
  ;
quit;
Try using "select distinct person_id" instead of "select person_id" - that should help with your issue with duplicates. I'm not sure if SAS treats 365*24*3600 as the correct number of seconds per year, so that may be a contributing factor as well.
I don't think you can do this using only PROC SQL. I would do it this way:
/* Test data: for each of five person ids, 100 random dates on or after
   01JAN2012 (offset 0-999 days), plus the matching midnight datetime. */
data have;
  format date mmddyy10. dt datetime15.;
  do person_id = 100 to 500 by 100;
    do i = 1 to 100;
      jdate = int(ranuni(123987) * 1000);   /* day offset, 0..999 */
      date  = '01jan2012'd + jdate;
      dt    = dhms(date, 0, 0, 0);
      output;
    end;
  end;
run;
/* Macro DO_ELABORATION(ds=)
   Builds HAVE_FINAL with one row per person_id found in the dataset named
   by DS, holding followup_date = that person's earliest dt plus one
   calendar year.
     ds - input dataset containing person_id and a datetime column dt.
   Work table HAVE_FINAL_TMP is left behind; HAVE_FINAL is replaced on every
   call, so rerunning the macro does not accumulate duplicate rows.
   Compared with the row-by-row version this replaces:
     - the per-row loop (whose SET carried a duplicated OBS= option and
       produced one output row per input row) is replaced by a single
       grouped query;
     - the minimum stays a numeric column instead of passing through a macro
       variable as text;
     - INTNX uses 'same' alignment; without it the result is moved to the
       start of the following year. */
%macro do_elaboration(ds=);
  /* PROC SQL step: earliest datetime per person */
  proc sql;
    create table have_final_tmp as
      select person_id,
             min(dt) as min_dt
        from &ds.
       group by person_id;
  quit;

  /* DATA step: move each minimum forward by one year with INTNX */
  data have_final;
    set have_final_tmp;
    followup_date = intnx('dtyear', min_dt, 1, 'same');
    format followup_date datetime15.;
    drop min_dt;
  run;
%mend do_elaboration;

/* call the macro */
%do_elaboration(ds=have);
I wrote this code quickly and have not tested it, so you should check it, but the concept should be clear.