sas adding column names and rows based on max and min values - sas

I want the data to be transposed (date) and arranged accordingly and fillin gaps
Eg:
/*COUNT ID BASED ON bucket and Sold_month*/
PROC SQL;
CREATE TABLE want AS
SELECT bucket, Sold_month,COUNT(ID) as IID from TRANS_2 GROUP by bucket,
Sold_month;
quit;
/*Format date*/
proc datasets ;
modify want;
FORMAT Sold_month YYMMN6.;
quit;
/*TRANSPOSE*/
PROC TRANSPOSE data=want out=wantf (drop=_:) prefix=D_ ;
by bucket;
var IID;
id Sold_month;
format IID best.;
run;
/*Sorting the dates Sold month*/
proc contents data=wantf out=col_names(keep=name) noprint;
run;
proc sort data=col_names out=col_names_sorted;
by name;
run;
proc sql;
create table col_names_sorted_n as
select name from col_names_sorted where name<>'bucket';
run;
data _null_;
set col_names_sorted_n;
by name;
retain sorted_cols;
length sorted_cols $2500.;
if _n_ = 1 then sorted_cols =name ;
else sorted_cols = catx(' ', sorted_cols, name);
call symput('sorted_cols', sorted_cols);
run;
%put &sorted_cols;
/* Final SOLD DATA- */
data output_sorted;
retain bucket &sorted_cols;
set wantf ;
run;
/*ADDING 0 to missing values*/
proc stdize data=output_sorted out=Transaction reponly missing=0;
run;
This works but I dont get the missing column names and rows which I want to get by seeing the maximum and minimum values of bucket(0-max) and sold_dt(all year_month from min year-month to max year-month). I dont want to manually enter all the columns/buckets but derive the max and min and do. But bucket should start from 0 to max available

Might be easiest to just use PROC SUMMARY with COMPLETETYPES option.
proc summary data=TRANS_2 nway completetypes ;
class bucket sold_month ;
var id;
format sold_month yymmn6. ;
output out=tall n=iid;
run;
proc transpose data=tall out=want(drop=_name_) prefix=d_ ;
by bucket ;
id sold_month;
var iid ;
run;
If you have totally missing BUCKET values, for example you have no records for BUCKET=4 at all, then you will need add some records. Might be easier to add them on the summarized table in which case you could skip the COMPLETETYPES and do it on your own. So summarize the data:
proc sql noprint;
create table tall as
select bucket
, intnx('month',sold_month,0,'b') as sold_month format=yymmn6.
, count(id) as IID
from TRANS_2
group by 1,2
;
create table tall_dummy as
select max(bucket) as bucket
, min(sold_month) as min
, max(sold_month) as max
from tall
;
quit;
Generate table of all bucket numbers and and all months with zero for count.
data tall_dummy ;
set tall_dummy ;
keep bucket sold_month iid;
iid=0;
do bucket = 0 to bucket ;
do i=0 to intck('month',min,max);
sold_month=intnx('month',min,i,'b') ;
output;
end;
end;
format sold_month yymmn6.;
run;
Then merge with the actually bucket*month combinations and replace the zeros with actual counts.
data tall_full ;
merge tall_dummy tall ;
by bucket sold_month;
run;
Then you can transpose.
proc transpose data=tall_full out=want(drop=_name_) prefix=d_ ;
by bucket ;
id sold_month;
var iid ;
run;

Related

SAS Demographic Table

I have been trying to create a demographic table like below this but I can't seem append the different tables. Please advise on where I can make adjustments in the code.
Group A Group B
chort 1 cohort 2 cohort 3 subtotal cohort 4 cohort 5 cohort 6 subtotal
Age
n
mean
sd
median
min
Gender
n
female
male
Race
n
white
asian
hispanic
black
My Code:
PROC FORMAT;
value content
1=' '
2='Age'
3='Gender'
4='Race'
value sex
1=' n'
2=' female'
3=' male';
value race
1=' n'
2=' white'
3=' asian'
4=' hispanic'
5=' black';
value stat
1=' n'
2=' Mean'
3=' Std. Dev.'
4=' Median'
5=' Minimum';
RUN;
DATA testtest;
SET test.test(keep = id group cohort age gender race);
RUN;
data tottest;
set testtest;
output;
if prxmatch('m/COHORT 1|COHORT 2|COHORT 3/oi', cohort) then do;
cohort='Subtotal';
output;
end;
if prxmatch('m/COHORT 4|COHORT 5|COHORT 6/oi', cohort) then do;
cohort='Subtotal';
output;
end;
run;
data count;
if 0 then set testtest nobs=npats;
call symput('npats',put(npats,1.));
stop;
run;
proc freq data=tottest;
tables cohort /out=patk0 noprint;
tables cohort*sex /out=sex0 noprint;
tables cohort*race /out=race0 noprint;
run;
PROC MEANS DATA = testtest n mean std min median;
class cohort;
VAR age;
RUN;
I know that I would have to transpose it and out it in a report. But before I do that, how do I get the variable out of my proc means, proc freq, etc?

How to filter a dataset according to its quantile

In the following code, how could I keep only the observations superior to the 95th quantile?
data test;
input business_ID $ count;
datalines;
'busi1' 2
'busi1' 10
'busi1' 4
'busi2' 1
'busi3' 2
'busi3' 1
;
run;
proc sort data = test;
by descending count;
run;
I don't know how to cleanly stock the quartile and then re-use it with an if condition.
Thanks
Edit : I can determine the quantile with this code :
proc means data=test noprint;
var count;
output out=quantile P75= / autoname;
run;
But how can I relate to it in the Test dataset so that I can select every observations above that quantile?
You could either read the value of the quantile in a macro variable to use in a subsequent if or where condition:
proc means data=test noprint;
var count;
output out=quantile P75= / autoname;
run;
data _null_;
set quantile;
call symput('quantile',count_p75);
run;
data test;
set test;
where count > &quantile.;
run;
or you could use an SQL subquery
proc means data=test noprint;
var count;
output out=quantile P75= / autoname;
run;
proc sql undo_policy=none;
create table test as
select *
from test
where count > (select count_p75 from quantile)
;
quit;
(Note that your question mentions the 95th quantile whereas your sample code mentions the 75th)
User2877959's solution is solid. Recently I did this with Proc Rank. The solution is a bit 'work around-y', but saves a lot of typing.
proc rank data=Input groups=1000 out=rank_out;
var var_to_rank;
ranks Rank_val;
run;
data seventy_five;
set rank_out;
if rank_val>750;
run;
More on Rank: http://documentation.sas.com/?docsetId=proc&docsetTarget=p0le3p5ngj1zlbn1mh3tistq9t76.htm&docsetVersion=9.4&locale=en

How can I extract the unique values of a variable and their counts in SAS

Suppose I have these data read into SAS:
I would like to list each unique name and the number of months it appeared in the data above to give a data set like this:
I have looked into PROC FREQ, but I think I need to do this in a DATA step, because I would like to be able to create other variables within the new data set and otherwise be able to manipulate the new data.
Data step:
proc sort data=have;
by name month;
run;
data want;
set have;
by name month;
m=month(lag(month));
if first.id then months=1;
else if month(date)^=m then months+1;
if last.id then output;
keep name months;
run;
Pro Sql:
proc sql;
select distinct name,count(distinct(month(month))) as months from have group by name;
quit;
While it's possible to do this in a data step, you wouldn't; you'd use proc freq or similar. Almost every PROC can give you an output dataset (rather than just print to the screen).
PROC FREQ data=sashelp.class;
tables age/out=age_counts noprint;
run;
Then you can use this output dataset (age_counts) as a SET input to another data step to perform your further calculations.
You can also use proc sql to group the variable and count how many are in that group. It might be faster than proc freq depending on how large your data is.
proc sql noprint;
create table counts as
select AGE, count(*) as AGE_CT from sashelp.class
group by AGE;
quit;
If you want to do it in a data step, you can use a Hash Object to hold the counted values:
data have;
do i=1 to 100;
do V = 'a', 'b', 'c';
output;
end;
end;
run;
data _null_;
set have end=last;
if _n_ = 1 then do;
declare hash cnt();
rc = cnt.definekey('v');
rc = cnt.definedata('v','v_cnt');
rc = cnt.definedone();
call missing(v_cnt);
end;
rc = cnt.find();
if rc then do;
v_cnt = 1;
cnt.add();
end;
else do;
v_cnt = v_cnt + 1;
cnt.replace();
end;
if last then
rc = cnt.output(dataset: "want");
run;
This is very efficient as it is a single loop over the data. The WANT data set contains the key and count values.

Transpose the dataset

I have the original dataset
What I want is:
My code:
proc transpose data = lib.original
out= lib.new(rename=(col1=Mean col2=Median));
var WBCmean RBCmean WBCmedian RBCmedian;
run;
But I get
Can you give some hint?
EDIT
If I add by statement,
proc transpose data = lib.original
out= lib.new;
by Gender;
var WBCmean RBCmean WBCmedian RBCmedian;
run;
then I get
One way is to convert to a tall skinny format.
proc transpose data = have out=middle ;
by gender ;
var WBCmean RBCmean WBCmedian RBCmedian;
run;
Then generate the new variables needed,
data middle;
set middle ;
testcode = substr(_name_,1,3);
_name_ = substr(_name_,4);
run;
you might need to sort the data depending on the order of variable names in your original VAR statement,
proc sort data=middle;
by gender testcode _name_;
run;
and then transpose into the new arrangement.
proc transpose data=middle out=want ;
by gender testcode ;
id _name_;
var col1 ;
run;

SAS code for calculating IV

I found some code from obseveupdate websit. They are used for IV calculation. When I run it code it goes through, but all IV and Woe are zeros. I changed another data set to try, also get zeros for all variables. Could you help me figure out why?
data inputdata;
length Region $ 20 age $ 20 Gender $ 20;
infile datalines dsd dlm= ':' truncover;
input Region $ age $ Gender $ target ;
datalines;
Scotland:18-25:Male:1
Scotland:18-25:Female:0
Scotland:26-35:Male:0
Wales:26-35:Male:1
Wales:36-45:Female:0
Wales:26-35:Male:1
London:36-45:Male:1
London:26-35:Male:0
London:18-25:Unknown:1
London:36-45:Male:0
Northern Ireland:36-45:Female:0
Northern Ireland:26-35:Male:1
Northern Ireland:36-45:Male:0
Engand (Not London):45+:Female:0
Engand (Not London):18-25:Male:1
Engand (Not London):26-35:Female:0
Engand (Not London):45+:Female:0
Engand (Not London):36-45:Female:1
Engand (Not London):45+:Female:1
;
data _tempdata;
set inputdata;;
n=_n_;
run;
proc sort data=_tempdata;
by target n;
run;
proc transpose data=_tempdata out = _tempdata;
by target n;
var _character_ _numeric_;
run;
proc sort data=_tempdata out=_tempdata;
by _name_ target;
run;
proc freq data=_tempdata;
by _name_ target;
tables col1 /out=_tempdata;
run;
proc sort data=_tempdata;
by _name_ col1;
run;
proc transpose data=_tempdata out=_tempdata;
by _name_ col1;
id target;
var percent;
run;
data IV_Table(keep=variable IV) WOE_Table(keep=variable attribute woe);
set _tempdata;
by _name_;
rename col1=attribute _name_=variable;
_0=sum(_0,0)/100; *Convert to percent and convert null to zero;
_1=sum(_1,0)/100; *Convert to percent and convert null to zero;
woe=log(_0/_1)*100;output WOE_Table;*Output WOE;
if _1 ne 0 and _0 ne 0 then do;
raw=(_0-_1)*log(_0/_1);
end;
else raw=0;
IV+sum(raw,0);*Culmulativly add to IV, set null to zero;
if last._name_ then do; *only _tempdata the last final row;
output IV_table;
IV=0;
end;
where upcase(_name_) ^='TARGET' and upcase(_name_) ^= 'N';run;
proc sort data=IV_table;by descending IV;run;
title1 "IV Listing";proc print data=IV_table;run;
proc sort data=woe_table;
by variable WOE;
run;
title1 "WOE Listing";
proc print data=WOE_Table;run;