Double output for long to wide transformation of sequential data - sas

The data is as follows:
Random characters so the bot lets me post my edit:sdnasdinaiwefjaepofj
ID|Character_date|Flag|SASDATE
A |2012_01 |0/1 |MONYY
A |2012_02 |0/1 |MONYY
.................
F |2012_02 |0/1 |MONYY
I want to transform it to be wide but with 12 months intervals.
So for each date there would be an account with 12 columns each indicating a date of flag activation with a horizon of 12 months
ID|Character_date|SASDATE|Flag_actived_date1 |Flag_actived_date2 |Flag_actived_date3
|Flag_actived_date4 |Flag_actived_date5 |Flag_actived_date6 |Flag_actived_date7 |...
A |2012_01 |MONYY |(if flag 1 the date)|....
B |2012_01 |MONYY |(if flag 1 the date)|....
C |2012_01 |MONYY |(if flag 1 the date)|....
...............
A |2012_02 |MONYY |(if flag 1 the date)|....
B |2012_02 |MONYY |(if flag 1 the date)|....
C |2012_02 |MONYY |(if flag 1 the date)|....
data pd_base_ttd2;
set pd_base_std end=eof;
format ttd best12. def_count best12.;
ARRAY def_dates{0:13} def_dates1-def_dates14;
;retain count def_dates1-def_dates14 def_count;
by descending credit_id ;
if first.credit_id then do;
count=0;
def_count=0;
do i=0 to 13;def_dates{i}=0;end;
;end;
if default_flag=1 then do;
def_dates{mod(count,12)}=date_obs;
count=count+1;
def_count=def_count+1;
;end;
else if default_flag=0 then count=count+1;
if last.credit_id or mod(count,12)=11 then output;
run;
DATA pd_base_std;
if 0 then set pd_base_ttd2(keep=credit_id YYYY_mm) pd_base_std;
if _n_ eq 1 then do;
declare hash h_cf(dataset:'pd_base_ttd2',hashexp:15, multidata:'Y');
h_cf.defineKey('credit_id','YYYY_mm');
h_cf.defineData('credit_id','YYYY_mm');
h_cf.defineDone();
end;
set pd_base_std;
rc_cf = h_cf.find();
do while(rc_cf=0);
rc_cf=h_cf.find_next();
end;
if rc_cf ne 0 then output;
drop rc_cf;
run;
proc sort data=pd_base_std ; by descending credit_id ; run;
data pd_base_all; set pd_base_ttd2 pd_base_all; run;
And repeat that over and over.
What I want is something like this.:
data pd_base_ttd2 pd_base_std ;
do untill(eof);
set pd_base_std end=eof;
format ttd best12. def_count best12.;
ARRAY def_dates{0:13} def_dates1-def_dates14;
;retain count def_dates1-def_dates14 def_count;
by descending credit_id ;
if first.credit_id then do;
count=0;
def_count=0;
do i=0 to 13;def_dates{i}=0;end;
;end;
if default_flag=1 then do;
def_dates{mod(count,12)}=date_obs;
count=count+1;
def_count=def_count+1;
;end;
else if default_flag=0 then count=count+1;
if last.credit_id or mod(count,12)=11 then
do;
output to pd_base_ttd2;
rc=1;
end;
end;
if rc=1 then delete;
do _N_=1 by 1 untill(last.credit_id);
set pd_base_std end=ef;
by descending credit_id;
output pd_base_std ;
end;
run;

I over complicated my approaches, probably eager to test out stuff I read about.
data pd_base_ttd2 pd_base_std ;
do _N_=1 by 1 until(last.credit_id);
set lgd.pd_base_std end=eof;
format ttd best12. def_count best12.;
ARRAY def_dates{0:13} def_dates1-def_dates14;
retain count def_dates1-def_dates14 def_count;
by descending credit_id ;
if first.credit_id then do;
count=0;
def_count=0;
do i=0 to 13;def_dates{i}=0;end;
;end;
if default_flag=1 then do;
def_dates{mod(count,12)}=date_obs;
count=count+1;
def_count=def_count+1;
;end;
else if default_flag=0 then count=count+1;
if last.credit_id or mod(count,12)=11 then output pd_base_ttd2;
else output pd_base_std ;
end;
run;
Now the question is how to this without a macro loop but instead making sas read the set again and again? Like: ok you reached the end, now start over, it would also have to be able to keep the descending order of both credit_id and YYYY_MM.

Related

Populate SAS variable by repeating values

I have a SAS table with a lot of missing values. This is only a simple example.
The real table is much bigger (>1000 rows) and the numbers is not the same. But what is the same is that I have a column a that have no missing numbers. Column b and c have a sequence that is shorter than the length of a.
a b c
1 1b 1000
2 2b 2000
3 3b
4
5
6
7
What I want is to fill b an c with repeating the sequences until they columns are full. The result should look like this:
a b c
1 1b 1000
2 2b 2000
3 3b 1000
4 1b 2000
5 2b 1000
6 3b 2000
7 1b 1000
I have tried to make a macro but it become to messy.
The hash-of-hashes solution is the most flexible here, I suspect.
data have;
infile datalines delimiter="|";
input a b $ c;
datalines;
1|1b|1000
2|2b|2000
3|3b|
4| |
5| |
6| |
7| |
;;;;
run;
%let vars=b c;
data want;
set have;
rownum = _n_;
if _n_=1 then do;
declare hash hoh(ordered:'a');
declare hiter hih('hoh');
hoh.defineKey('varname');
hoh.defineData('varname','hh');
hoh.defineDone();
declare hash hh();
do varnum = 1 to countw("&vars.");
varname = scan("&vars",varnum);
hh = _new_ hash(ordered:'a');
hh.defineKey("rownum");
hh.defineData(varname);
hh.defineDone();
hoh.replace();
end;
end;
do rc=hih.next() by 0 while (rc=0);
if strip(vvaluex(varname)) in (" ",".") then do;
num_items = hh.num_items;
rowmod = mod(_n_-1,num_items)+1;
hh.find(key:rowmod);
end;
else do;
hh.replace();
end;
rc = hih.next();
end;
keep a &Vars.;
run;
Basically, one hash is built for each variable you are using. They're each added to the hash of hashes. Then we iterate over that, and search to see if the variable requested is populated. If it is then we add it to its hash. If it isn't then we retrieve the appropriate one.
Assuming that you can tell how many rows to use for each variable by counting how many non-missing values are in the column then you could use this code generation technique to generate a data step that will use the POINT= option SET statements to cycle through the first Nx observations for variable X.
First get a list of the variable names;
proc transpose data=have(obs=0) out=names ;
var _all_;
run;
Then use those to generate a PROC SQL select statement to count the number of non-missing values for each variable.
filename code temp ;
data _null_;
set names end=eof ;
file code ;
if _n_=1 then put 'create table counts as select ' ;
else put ',' #;
put 'sum(not missing(' _name_ ')) as ' _name_ ;
if eof then put 'from have;' ;
run;
proc sql noprint;
%include code /source2 ;
quit;
Then transpose that so that again you have one row per variable name but this time it also has the counts in COL1.
proc transpose data=counts out=names ;
var _all_;
run;
Now use that to generate SET statements needed for a DATA step to create the output from the input.
filename code temp;
data _null_;
set names ;
file code ;
length pvar $32 ;
pvar = cats('_point',_n_);
put pvar '=mod(_n_-1,' col1 ')+1;' ;
put 'set have(keep=' _name_ ') point=' pvar ';' ;
run;
Now use the generated statements.
data want ;
set have(drop=_all_);
%include code / source2;
run;
So for your example data file with variables A, B and C and 7 total observations the LOG for the generated data step looks like this:
1229 data want ;
1230 set have(drop=_all_);
1231 %include code / source2;
NOTE: %INCLUDE (level 1) file CODE is file .../#LN00026.
1232 +_point1 =mod(_n_-1,7 )+1;
1233 +set have(keep=a ) point=_point1 ;
1234 +_point2 =mod(_n_-1,3 )+1;
1235 +set have(keep=b ) point=_point2 ;
1236 +_point3 =mod(_n_-1,2 )+1;
1237 +set have(keep=c ) point=_point3 ;
NOTE: %INCLUDE (level 1) ending.
1238 run;
NOTE: There were 7 observations read from the data set WORK.HAVE.
NOTE: The data set WORK.WANT has 7 observations and 3 variables.
Populate a temporary array with the values, then check the row and add the appropriate value.
Setup the data
data have;
infile datalines delimiter="|";
input a b $ c;
datalines;
1|1b|1000
2|2b|2000
3|3b|
4| |
5| |
6| |
7| |
;
Get a count of the non-null values
proc sql noprint;
select count(*)
into :n_b
from have
where b ^= "";
select count(*)
into :n_c
from have
where c ^=.;
quit;
Now populate the missing values by repeating the contents of each array.
data want;
set have;
/*Temporary Arrays*/
array bvals[&n_b] $ 32 _temporary_;
array cvals[&n_c] _temporary_;
if _n_ <= &n_b then do;
/*Populate the b array*/
bvals[_n_] = b;
end;
else do;
/*Fill the missing values*/
b = bvals[mod(_n_+&n_b-1,&n_b)+1];
end;
if _n_ <= &n_c then do;
/*populate C values array*/
cvals[_n_] = c;
end;
else do;
/*fill in the missing C values*/
c = cvals[mod(_n_+&n_c-1,&n_c)+1];
end;
run;
data want;
set have;
n=mod(_n_,3);
if n=0 then b='3b';
else b=cats(n,'b');
if n in (1,0) then c=1000;
else c=2000;
drop n;
run;

SAS code for calculating IV

I found some code from obseveupdate websit. They are used for IV calculation. When I run it code it goes through, but all IV and Woe are zeros. I changed another data set to try, also get zeros for all variables. Could you help me figure out why?
data inputdata;
length Region $ 20 age $ 20 Gender $ 20;
infile datalines dsd dlm= ':' truncover;
input Region $ age $ Gender $ target ;
datalines;
Scotland:18-25:Male:1
Scotland:18-25:Female:0
Scotland:26-35:Male:0
Wales:26-35:Male:1
Wales:36-45:Female:0
Wales:26-35:Male:1
London:36-45:Male:1
London:26-35:Male:0
London:18-25:Unknown:1
London:36-45:Male:0
Northern Ireland:36-45:Female:0
Northern Ireland:26-35:Male:1
Northern Ireland:36-45:Male:0
Engand (Not London):45+:Female:0
Engand (Not London):18-25:Male:1
Engand (Not London):26-35:Female:0
Engand (Not London):45+:Female:0
Engand (Not London):36-45:Female:1
Engand (Not London):45+:Female:1
;
data _tempdata;
set inputdata;;
n=_n_;
run;
proc sort data=_tempdata;
by target n;
run;
proc transpose data=_tempdata out = _tempdata;
by target n;
var _character_ _numeric_;
run;
proc sort data=_tempdata out=_tempdata;
by _name_ target;
run;
proc freq data=_tempdata;
by _name_ target;
tables col1 /out=_tempdata;
run;
proc sort data=_tempdata;
by _name_ col1;
run;
proc transpose data=_tempdata out=_tempdata;
by _name_ col1;
id target;
var percent;
run;
data IV_Table(keep=variable IV) WOE_Table(keep=variable attribute woe);
set _tempdata;
by _name_;
rename col1=attribute _name_=variable;
_0=sum(_0,0)/100; *Convert to percent and convert null to zero;
_1=sum(_1,0)/100; *Convert to percent and convert null to zero;
woe=log(_0/_1)*100;output WOE_Table;*Output WOE;
if _1 ne 0 and _0 ne 0 then do;
raw=(_0-_1)*log(_0/_1);
end;
else raw=0;
IV+sum(raw,0);*Culmulativly add to IV, set null to zero;
if last._name_ then do; *only _tempdata the last final row;
output IV_table;
IV=0;
end;
where upcase(_name_) ^='TARGET' and upcase(_name_) ^= 'N';run;
proc sort data=IV_table;by descending IV;run;
title1 "IV Listing";proc print data=IV_table;run;
proc sort data=woe_table;
by variable WOE;
run;
title1 "WOE Listing";
proc print data=WOE_Table;run;

Indicator variable for maximum value by groups

Is there any more elegant way than that presented below for the following task:
to create Indicator Variables (below "MAX_X1" and "MAX_X2") whithin each group (below "key1") of multiple observation (below "key2") with value 1 if this observation corresponds to the maximum value of the variable in eache group and 0 otherwise
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc means data=have noprint;
by key1;
var x1 x2;
output out=max
max= / autoname;
run;
data want;
merge have max;
by key1;
drop _:;
run;
proc sql;
title "MAX";
select name into :MAXvars separated by ' '
from dictionary.columns
WHERE LIBNAME="WORK" AND MEMNAME="WANT" AND NAME like "%_Max"
order by name;
quit;
title;
data want; set want;
array MAX (*) &MAXvars;
array XVars (*) x1 x2;
array Indicators (*) MAX_X1 MAX_X2;
do i=1 to dim(MAX);
if XVars[i]=MAX[i] then Indicators[i]=1; else Indicators[i]=0;
end;
drop i;
run;
Thanks for any suggestion of optimization
Proc sql can be used with a group by statement to allow summary functions across values of a variable.
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc sql;
create table want
as select
key1,
key2,
x1,
x2,
case
when x1 = max(x1) then 1
else 0 end as max_x1,
case
when x2 = max(x2) then 1
else 0 end as max_x2
from have
group by key1
order by key1, key2;
quit;
It is also possible to do this in a single data step, provided that you read the input dataset twice - this is an example of a double DOW-loop.
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
/*Sort by key1 (or generate index) if not already sorted*/
proc sort data = have;
by key1;
run;
data want;
if 0 then set have;
array xvars[3,2] x1 x2 x1_max_flag x2_max_flag t_x1_max t_x2_max;
/*1st DOW-loop*/
do _n_ = 1 by 1 until(last.key1);
set have;
by key1;
do i = 1 to 2;
xvars[3,i] = max(xvars[1,i],xvars[3,i]);
end;
end;
/*2nd DOW-loop*/
do _n_ = 1 to _n_;
set have;
do i = 1 to 2;
xvars[2,i] = (xvars[1,i] = xvars[3,i]);
end;
output;
end;
drop i t_:;
run;
This may be a bit complicated to understand, so here's a rough explanation of how it flows:
Read one by group with the first DOW-loop, updating rolling max variables as each row is read in. Don't output anything yet.
Now read the same by-group again using the second DOW-loop, checking to see whether each row is equal to the rolling max and outputting each row.
Go back to first DOW-loop, read the next by-group and repeat.

how to swap first and last observations in sas data set if it contain 3 observations

I have a data set with 3 observations, 1 2 3
4 5 6
7 8 9 , now i have to interchange 1 2 3 and 7 8 9.
How can do this in base sas?
If you just want to sort your dataset by a variable in descending order, use proc sort:
data example;
input number;
datalines;
123
456
789
;
run;
proc sort data = example;
by descending number;
run;
If you want to re-order a dataset in a more complex way, create a new variable containing the position that you want each row to be in, and then sort it by that variable.
If you want to swap the contents of the first and last observations while leaving the rest of the dataset in place, you could do something like this.
data class;
set sashelp.class;
run;
data firstobs;
i = 1;
set sashelp.class(obs = 1);
run;
data lastobs;
i = nobs;
set sashelp.class nobs = nobs point = nobs;
output;
stop;
run;
data transaction;
set lastobs firstobs;
/*Swap the values of i for first and last obs*/
retain _i;
if _n_ = 1 then do;
_i = i;
i = 1;
end;
if _n_ = 2 then i = _i;
drop _i;
run;
data class;
set transaction(keep = i);
modify class point = i;
set transaction;
run;
This modifies just the first and last observations, which should be quite a bit faster than sorting or replacing a large dataset. You can do a similar thing with the update statement, but that only works if your dataset is already sorted / indexed by a unique key.
By Sandeep Sharma:sandeep.sharmas091#gmail.com
data testy;
input a;
datalines;
1
2
3
4
5
6
7
8
9
;
run;
data ghj;
drop y;
do i=nobs-2 to nobs;
set testy point=i nobs=nobs;
output;
end;
do n=4 to nobs-3;
set testy point=n;
output;
end;
do y=1 to 3;
set testy;
output;
end;
stop;
run;

Multiple hash objects in SAS

I have two SAS data sets. The first is relatively small, and contains unique dates and a corresponding ID:
date dateID
1jan90 10
2jan90 15
3jan90 20
...
The second data set very large, and has two date variables:
dt1 dt2
1jan90 2jan90
3jan90 1jan90
...
I need to match both dt1 and dt2 to dateID, so the output would be:
id1 id2
10 15
20 10
Efficiency is very important here. I know how to use a hash object to do one match, so I could do one data step to do the match for dt1 and then another step for dt2, but I'd like to do both in one data step. How can this be done?
Here's how I would do the match for just dt1:
data tbl3;
if 0 then set tbl1 tbl2;
if _n_=1 then do;
declare hash dts(dataset:'work.tbl2');
dts.DefineKey('date');
dts.DefineData('dateid');
dts.DefineDone();
end;
set tbl1;
if dts.find(key:date)=0 then output;
run;
A format would probably work just as efficiently given the size of your hash table...
data fmt ;
retain fmtname 'DTID' type 'N' ;
set tbl1 ;
start = date ;
label = dateid ;
run ;
proc format cntlin=fmt ; run ;
data tbl3 ;
set tbl2 ;
id1 = put(dt1,DTID.) ;
id2 = put(dt2,DTID.) ;
run ;
Edited version based on below comments...
data fmt ;
retain fmtname 'DTID' type 'I' ;
set tbl1 end=eof ;
start = date ;
label = dateid ;
output ;
if eof then do ;
hlo = 'O' ;
label = . ;
output ;
end ;
run ;
proc format cntlin=fmt ; run ;
data tbl3 ;
set tbl2 ;
id1 = input(dt1,DTID.) ;
id2 = input(dt2,DTID.) ;
run ;
I don't have SAS in front of me right now to test it but the code would look like this:
data tbl3;
if 0 then set tbl1 tbl2;
if _n_=1 then do;
declare hash dts(dataset:'work.tbl2');
dts.DefineKey('date');
dts.DefineData('dateid');
dts.DefineDone();
end;
set tbl1;
date = dt1;
if dts.find()=0 then do;
id1 = dateId;
end;
date = dt2;
if dts.find()=0 then do;
id2 = dateId;
end;
if dt1 or dt2 then do output; * KEEP ONLY RECORDS THAT MATCHED AT LEAST ONE;
drop date dateId;
run;
I agree with the format solution, for one, but if you want to do the hash solution, here it goes. The basic thing here is that you define the key as the variable you're matching, not in the hash itself.
data tbl2;
informat date DATE7.;
input date dateID;
datalines;
01jan90 10
02jan90 15
03jan90 20
;;;;
run;
data tbl1;
informat dt1 dt2 DATE7.;
input dt1 dt2;
datalines;
01jan90 02jan90
03jan90 01jan90
;;;;
run;
data tbl3;
if 0 then set tbl1 tbl2;
if _n_=1 then do;
declare hash dts(dataset:'work.tbl2');
dts.DefineKey('date');
dts.DefineData('dateid');
dts.DefineDone();
end;
set tbl1;
rc1 = dts.find(key:dt1);
if rc1=0 then id1=dateID;
rc2 = dts.find(key:dt2);
if rc2=0 then id2=dateID;
if rc1=0 and rc2=0 then output;
run;