Loop through SAS variables and create data sets - sas

I have a SAS data set t3. I want to run a data step inside a loop through a set of variables to create additional sets based on the variable value = 1, and rank two variables bal and otheramt in each subset, and then merge the ranks for each subset onto the original data set. Each rank column needs to be dynamically named so I know what subset is getting ranked. I know how to do proc rank and macros basically but do not know how to do this in the most dynamic way inside of a macro. Can you assist?
ID
bal
otheramt
firstvar
secondvar
lastvar
444
581
100
1
1
555
255
200
1
1
1
666
255
300
--------------
1
--------------
/* Skeleton macro from the question: only the start of the data step is written;
   the comments inside list the steps the author still wants to implement. */
%macro dog();
data new;
set t3;
/* NOTE(review): the array is declared with 5 elements, so the positional range
   FirstVar--LastVar must span exactly 5 variables in t3 -- confirm against the data */
ARRAY Indicators(5) FirstVar--LastVar;
/*create data set for each of the subsets if firstvar = 1, secondvar = 1 ... lastvar = 1 */
/*for each new data set, rank by bal and otheramt*/
/*name the new rank columns [FirstVar]BalRank, [FirstVar]OtherAmtRank; */
/*merge the new ranks onto the original data set by ID*/
%mend;
%dog()
The Proc rank section would be something like this, but I would need the rank columns to have information about what subset I am ranking.
proc rank data=subset1 out=subset1ranked;
var bal otheramt;
ranks bal_rank otheramt_rank;
run;

Instead of using macro, use data transformation and reshaping that allows simpler steps to be written.
Example:
Rows are split into multiple rows based on the flags so that BY-group processing in PROC RANK can occur. Two transposes are then required to reshape the results back to a single row per id.
/* Test data: 100 ids, two numeric measures (foo, bar) and three 0/1 flags. */
data have;
call streaminit(20230216);
do id = 1 to 100;
foo = rand('integer', 50,150);
bar = rand('integer', 100,200);
flag1 = rand('integer', 0, 1);
flag2 = rand('integer', 0, 1);
flag3 = rand('integer', 0, 1);
output;
end;
run;
/* Write one copy of the row per flag that is set; rows with no flag set are not output. */
data step1;
set have;
/* important: the group value becomes part of the variable name later */
if flag1 then do; group='flag1_'; output; end;
if flag2 then do; group='flag2_'; output; end;
if flag3 then do; group='flag3_'; output; end;
drop flag:;
run;
/* PROC RANK with a BY statement needs the data ordered by group. */
proc sort data=step1;
by group;
run;
/* Rank foo and bar separately within each group. */
proc rank data=step1 out=step2;
by group;
var foo bar;
ranks foo_rank bar_rank;
run;
/* Re-order by id so the transposes below can use id as the leading BY variable. */
proc sort data=step2;
by id group;
run;
* pivot (reshape) so there is one row per ranked var;
proc transpose data=step2 out=step3(drop=_label_);
by id foo bar group;
var foo_rank bar_rank;
run;
* pivot again so there is one row per id;
/* the ID statement names each new column from group followed by _name_ (e.g. flag1_foo_rank) */
proc transpose data=step3 out=step4(drop=_name_);
by id;
var col1;
id group _name_;
run;
* merge so those 0 0 0 flag rows remain intact;
data want;
merge have step4;
by id;
run;

Since we don't have much sample data, I created test data from sashelp.class with some indicator variables like yours.
/* Test data: sashelp.class plus three indicator columns built by rounding a random draw;
   sex and weight are dropped. */
data have;
set sashelp.class;
/* NOTE(review): no CALL STREAMINIT is issued, so the indicator values are presumably
   different on each run -- confirm if reproducibility matters */
firstvar=round(rand('uniform',1));
secondvar=round(rand('uniform',1));
thirdvar=round(rand('uniform',1));
drop sex weight;
run;
Partial output:
Name Age Height firstvar secondvar thirdvar
Alfred 14 69 1 0 1
Alice 13 56.5 0 1 1
Barbara 13 65.3 1 0 0
Carol 14 62.8 0 0 0
To dynamically rank data based on indicator variables, I created a macro that accepts a list of indicators and rank variables. The 2 lists help to create the specific variable names you requested. Here's the macro call:
%rank(indicators=firstvar secondvar thirdvar,
rank_vars=age height);
Here's part of the final output. Notice the indicators in the sample output above coincide with the ranks in this output. Also note that Carol is not in the output because she had no indicators set to 1.
Name Age Height firstvar_age_rank firstvar_height_rank secondvar_age_rank secondvar_height_rank thirdvar_age_rank thirdvar_height_rank
Alfred 14 69 8 11 . . 6.5 10
Alice 13 56.5 . . 3.5 2 4.5 2
Barbara 13 65.3 6.5 8 . . . .
Henry 14 63.5 . . 5.5 5 . .
The full macro is listed below. It has 3 parts.
Create a temp data set with a group variable that contains the number of the indicator variable based on the order of the variable in the list. Whenever an indicator = 1 the obs is output. If an obs has all 3 indicators set to 1 then it will be output 3 times with the group variable set to the number of each indicator variable. This step is important because proc rank will rank groups independently.
Generate the rankings on the temp data set. Each group will be ranked independently of the other groups and can be done in one step.
Construct the final data set by essentially transposing the ranked data into columns.
/*
 * rank: rank a list of variables separately inside each indicator subset and
 * return one row per key with columns named <indicator>_<variable>_rank.
 *
 * Parameters
 *   indicators= space-separated list of 0/1 indicator variables (required)
 *   rank_vars=  space-separated list of numeric variables to rank (required)
 *   data=       input data set                      (default: have)
 *   out=        output data set                     (default: want)
 *   id=         single variable identifying a row   (default: name)
 *
 * Side effects: creates/overwrites the work data sets TEMP and RANKS.
 * Rows with no indicator equal to 1 do not appear in the output.
 */
%macro rank(indicators=, rank_vars=, data=have, out=want, id=name);
  /* keep every helper macro variable out of the caller's scope */
  %local cnt_ind cnt_vars i j ivar jvar rank_names wide_names;

  %if %length(&indicators) = 0 or %length(&rank_vars) = 0 %then %do;
    %put ERROR: rank macro requires non-empty INDICATORS= and RANK_VARS= lists.;
    %return;
  %end;

  %let cnt_ind  = %sysfunc(countw(&indicators));
  %let cnt_vars = %sysfunc(countw(&rank_vars));

  /* names PROC RANK writes: <variable>_rank (built once, reused below) */
  %let rank_names = ;
  %do j = 1 %to &cnt_vars;
    %let jvar = %scan(&rank_vars, &j);
    %let rank_names = &rank_names &jvar._rank;
  %end;

  /* names of the final columns: <indicator>_<variable>_rank */
  %let wide_names = ;
  %do i = 1 %to &cnt_ind;
    %let ivar = %scan(&indicators, &i);
    %do j = 1 %to &cnt_vars;
      %let jvar = %scan(&rank_vars, &j);
      %let wide_names = &wide_names &ivar._&jvar._rank;
    %end;
  %end;

  /* Part 1: one output row per indicator that equals 1; group is the
     position of that indicator in the list, so a row can be output several times */
  data temp;
    set &data;
    array indicators(*) &indicators;
    do i = 1 to dim(indicators);
      if indicators(i) = 1 then do;
        group = i;
        output;
      end;
    end;
    drop i &indicators;
  run;

  proc sort data=temp;
    by group;
  run;

  /* Part 2: rank every group independently in a single step */
  proc rank data=temp out=ranks;
    by group;
    var &rank_vars;
    ranks &rank_names;
  run;

  proc sort data=ranks;
    by &id group;
  run;

  /* Part 3: construct the final data set by moving each group's ranks
     into its own set of columns, one output row per key */
  data &out;
    set ranks;
    by &id;
    /* retain declares the new columns and carries them across the rows of one key */
    retain &wide_names;
    if first.&id then call missing(of &wide_names);
    %do i = 1 %to &cnt_ind;
      %let ivar = %scan(&indicators, &i);
      if group = &i then do;
      %do j = 1 %to &cnt_vars;
        %let jvar = %scan(&rank_vars, &j);
        &ivar._&jvar._rank = &jvar._rank;
      %end;
      end;
    %end;
    if last.&id then output;
    drop group &rank_names;
  run;
%mend rank;
When constructing the final data set and transposing the rank variables, there are a couple of options. The first option shown above is to dynamically build a series of if statements. Here is what the code generates:
MPRINT(RANK): * option 1: build series of IF statements;
MPRINT(RANK): if group = 1 then do;
MPRINT(RANK): firstvar_age_rank = age_rank;
MPRINT(RANK): firstvar_height_rank = height_rank;
MPRINT(RANK): end;
MPRINT(RANK): if group = 2 then do;
MPRINT(RANK): secondvar_age_rank = age_rank;
MPRINT(RANK): secondvar_height_rank = height_rank;
MPRINT(RANK): end;
MPRINT(RANK): if group = 3 then do;
MPRINT(RANK): thirdvar_age_rank = age_rank;
MPRINT(RANK): thirdvar_height_rank = height_rank;
MPRINT(RANK): end;
The 2nd option is to use an array and mathematically calculate the index into the array by the group number and variable number. Here is the snippet of macro code to replace the if series code:
/* Fragment meant to replace the generated IF series inside the final data step of the macro;
   at this point the macro variable vars holds the full list of <indicator>_<variable>_rank names. */
* option 2: create arrays and calculate index into array
* by group number and variable number;
array ranks(*) &vars;
/* rebuild vars as the list of <variable>_rank names written by PROC RANK */
array rankvars(*)
%let vars = ;
%do i = 1 %to &cnt_vars;
%let var = %scan(&rank_vars, &i);
%let vars = &vars &var._rank;
%end;
&vars;
/* group g owns the slice of ranks() starting at dim(rankvars)*(g-1)+1 */
/* NOTE(review): idx and i become data step variables here; the drop list shown in the
   macro does not name them, so they would presumably remain in the output -- confirm */
%str(idx = dim(rankvars) * (group - 1);)
%str(do i = 1 to dim(rankvars);)
%str(ranks(idx + i) = rankvars(i);)
%str(end;)
Here is the generated code:
MPRINT(RANK): * option 2: create arrays and calculate index into array * by group number and variable number;
MPRINT(RANK): array ranks(*) firstvar_age_rank firstvar_height_rank secondvar_age_rank secondvar_height_rank thirdvar_age_rank
thirdvar_height_rank;
MPRINT(RANK): array rankvars(*) age_rank height_rank;
MPRINT(RANK): idx = dim(rankvars) * (group - 1);
MPRINT(RANK): do i = 1 to dim(rankvars);
MPRINT(RANK): ranks(idx + i) = rankvars(i);
MPRINT(RANK): end;
It takes a minute to understand the array option, but once you do, it is preferable to generating IF statements. As the number of variables increases, the code generated by the array option stays the same and operates more efficiently.

Related

Divide a dataset into subsets based on a column and perform a repeated operation for subsets

I need to perform the same operation on many different periods. In my sample data for two periods: 402 and 403.
I cannot understand the concept of how I can make a loop that will do it for me.
At the end, I'd like to have final1 for period 402, final2 for period 403 etc.
Sample data that I use for testing:
data one;
input period $ a $ b $ c $ d e;
cards;
402 a . a 1 3
402 . b . 2 4
402 a a a . 5
402 . . b 3 5
403 a a a . 6
403 a a a . 7
403 a a a 2 8
;
run;
This is how I manually choose one period of one data:
data new;
set one;
where period='402';
run;
This is how I calculate different things for the given period e.g. number of missing data, non-missing, total:
1 - For numeric variables:
proc iml;
use new;
read all var _NUM_ into x[colname=nNames];
n = countn(x,"col");
nmiss = countmiss(x,"col");
ntotal = n + nmiss;
2 - and similarly for char variables:
/* Read the character columns, then close the input data set opened by USE. */
read all var _CHAR_ into x[colname=cNames];
/* fixed: was "close nww", which does not match the data set opened with "use new" */
close new;
/* per-column counts of non-missing and missing character values */
c = countn(x,"col");
cmiss = countmiss(x,"col");
ctotal = c + cmiss;
Save numeric and char results:
create cnt1Data var {nNames n nmiss ntotal};
append;
close cnt1Data;
create cnt2Data var {cNames c cmiss ctotal};
append;
close cnt2Data;
Rename columns to be the same:
data cnt1Datatemp;
set cnt1Data;
rename nNames = Name n = nonMissing nmiss = missing ntotal = total;
run;
data cnt2Datatemp;
set cnt2Data;
rename cNames = Name c = nonMissing cmiss = missing ctotal = total;
run;
and merge data into the final set:
data final;
set cnt1Datatemp cnt2Datatemp;
run;
Final data for period 402 should look like:
a b c d e
2 2 1 1 0 - missing
2 2 3 3 4 - non-missing
4 4 4 4 4 - total
and respectively for period 403:
a b c d e
0 0 0 2 0 - missing
3 3 3 1 3 - non-missing
3 3 3 3 3 - total
You can make something similar with simple SQL query.
create table miss_count as select period
, sum(missing(A)) as A
, sum(missing(B)) as B
...
from have
group by period
;
Results:
period a b c d e
402 2 2 1 1 0
403 0 0 0 2 0
If you add in
, count(*) as nobs
then you have all the information you need to calculate all of the counts you wanted.
If the number of variables is short enough you can even generate the code into a macro variable (limit of 64K bytes in a macro variable)
proc sql noprint;
select catx(' ','sum(missing(',nliteral(name),')) as',nliteral(name))
into :varlist separated by ','
from dictionary.columns
where libname='WORK' and memname='ONE' and lowcase(name) ne 'period'
;
create table miss_count as select period,count(*) as nobs,&varlist
from one
group by period
;
quit;
Results:
period nobs a b c d e
402 4 2 2 1 1 0
403 3 0 0 0 2 0
It is much easier to find this information in sql;
/* Missing / non-missing / total counts of variable a for a single period. */
proc sql;
select sum(a is not missing) as fil_a
, sum(a is missing) as mis_a
, count(*) as tot_a
from one
/* period is read with "$" (character) in data set ONE, so the literal must be quoted;
   comparing it with the number 402 is a data-type mismatch error in PROC SQL */
where period eq '402';
quit;
You can even handle all periods at once using group by.
There are a few ways to make this work for all variables in a dataset (except for some group by variables). For instance:
/*
 * count_missing: for every variable of WORK.ONE except PERIOD, report per period
 * the number of missing values, non-missing values and the total row count.
 *
 * Output: data set REPORT_MISSING with three rows per period
 * (count_of = 'missing' / 'non missing' / 'total') and one numeric column per
 * analysed variable; the helper columns total and mis_<var> are kept as well.
 * Also creates the SQL view COUNT_MISSING.
 */
%macro count_missing();
    /* keep helper macro variables out of the caller's scope */
    %local no_var var_list var_nr var;

    proc sql noprint;
        /* list of variables to analyse; SQLOBS gives the number of names selected,
           which avoids mixing count(*) with a non-aggregated column */
        select name
        into :var_list separated by ' '
        from sasHelp.vcolumn
        where libName eq 'WORK' and memName eq 'ONE' and upcase(name) ne 'PERIOD';
        %let no_var = &sqlobs;

        /* one row per period; period is selected so the rows can be told apart */
        create view count_missing as
        select period
             , count(*) as total
        %do var_nr = 1 %to &no_var;
            %let var = %scan(&var_list, &var_nr);
             , sum(&var is missing) as mis_&var
        %end;
        from work.one
        group by period;
    quit;

    /* expand each period into three report rows */
    data report_missing;
        set count_missing;
        length count_of $ 32;

        count_of = 'missing';
        %do var_nr = 1 %to &no_var;
            %let var = %scan(&var_list, &var_nr);
            &var = mis_&var;
        %end;
        output;

        count_of = 'non missing';
        %do var_nr = 1 %to &no_var;
            %let var = %scan(&var_list, &var_nr);
            &var = total - mis_&var;
        %end;
        output;

        count_of = 'total';
        %do var_nr = 1 %to &no_var;
            %let var = %scan(&var_list, &var_nr);
            &var = total;
        %end;
        output;
    /* fixed: the step was closed with "end;", which has no matching DO */
    run;
%mend;
%count_missing();
You don't need iml to summarize data over observations. You can do that with a retain statement too. Moreover, using by processing with first and last, you can process all periods in one go.
/* One output row per period with the missing / non-missing / total counts of variable a.
   Requires the input to be sorted (or grouped) by period. */
data final;
set one;
by period;
/* restart the counters at the first row of every period */
if first.period then do;
mis_a = 0;
total = 0;
end;
/* SAS has no += operator: the sum statement "var + expr" accumulates and retains implicitly */
if missing(a) then mis_a + 1;
total + 1;
/* subsetting IF: only the last row of each period is written out */
if last.period;
fil_a = total - mis_a;
/* fixed: the step was closed with "end;" instead of "run;" */
run;
This is by far the fastest way to handle a big dataset if the data is sorted by period.
To make it work for a set of variables not known upfront, you can apply the same techniques as in my other solution.

how to create variables that names are concat with two array variable names

I have a HCC dataset DATA_HCC that with member ID and 79 binary variables:
Member_ID HCC1 HCC2 HCC6 HCC8 ... HCC189
XXXXXXX1 1 0 1 0 ... 0
XXXXXXX2 0 0 1 0 ... 0
XXXXXXX3 0 1 0 0 ... 1
I am trying to create a output dataset that could create new binary variables for all the combination of those 79 variables. Each new variable represents if a member had both of the variables as 1.
%LET hccList = HCC1 HCC2 HCC6 HCC8 HCC9 HCC10 HCC11 HCC12 HCC17 HCC18 HCC19 HCC21 HCC22 HCC23 HCC27
HCC28 HCC29 HCC33 HCC34 HCC35 HCC39 HCC40 HCC46 HCC47 HCC48 HCC54 HCC55 HCC57 HCC58
HCC70 HCC71 HCC72 HCC73 HCC74 HCC75 HCC76 HCC77 HCC78 HCC79 HCC80 HCC82 HCC83 HCC84
HCC85 HCC86 HCC87 HCC88 HCC96 HCC99 HCC100 HCC103 HCC104 HCC106 HCC107 HCC108 HCC110
HCC111 HCC112 HCC114 HCC115 HCC122 HCC124 HCC134 HCC135 HCC136 HCC137 HCC157 HCC158
HCC161 HCC162 HCC166 HCC167 HCC169 HCC170 HCC173 HCC176 HCC186 HCC188 HCC189;
/* Question's attempt, kept as posted: every pair (i, j) with i < j of the HCC array
   should produce a new variable named <name_i>_<name_j>. */
DATA COUNT_HCC; SET DATA_HCC;
ARRAY HCC [*] &hccList.;
DO i = 1 TO DIM(HCC);
DO j = i+1 TO DIM(HCC);
/* NOTE(review): %LET is a macro statement, so HCC_COMBO holds the literal text CATX(...)
   rather than a computed name -- presumably why the next line is read as a reference to an
   array called CATX; confirm against the log */
%LET HCC_COMBO = CATX('_', VARNAME(HCC[i]), VARNAME(HCC[j]));
&HCC_COMBO. = MIN(HCC[i], HCC[j]);
END;
END;
RUN;
I tried to use CATX function to just concat the two variable names but it didn't work.
Here is the log error that I got:
ERROR: Undeclared array referenced: CATX.
ERROR: Variable CATX has not been declared as an array.
ERROR 71-185: The VARNAME function call does not have enough arguments.
And the results output sample would like this:
Member_ID HCC1_HCC2 HCC1_HCC6 HCC1_HCC8 ... HCC188_HCC189
XXXXXXX1 0 1 0 ... 0
XXXXXXX2 0 0 0 ... 0
XXXXXXX3 0 0 0 ... 1
To achieve dynamic variable name generation, use a macro to create the variables that you need. The below code generates dynamic variable names and generates data step code to create the variables.
/*
 * get_hcc_combo_mins: generate, for every unordered pair of names in the macro
 * variable hccList, the assignment
 *     <name1>_<name2> = min(<name1>, <name2>);
 * Must be invoked inside a DATA step in which those variables exist.
 * Reads the macro variable hccList from the calling environment.
 */
%macro get_hcc_combo_mins;
    /* keep loop counters and scratch names out of the caller's scope */
    %local i j n hcc1 hcc2;
    /* count the names once instead of on every loop bound */
    %let n = %sysfunc(countw(&hccList.));
    %do i = 1 %to %eval(&n. - 1);
        %let hcc1 = %scan(&hccList., &i.);
        %do j = %eval(&i. + 1) %to &n.;
            %let hcc2 = %scan(&hccList., &j.);
            &hcc1._&hcc2. = min(&hcc1., &hcc2.);
        %end;
    %end;
%mend get_hcc_combo_mins;
DATA COUNT_HCC; SET DATA_HCC;
ARRAY HCC [*] &hccList.;
%get_hcc_combo_mins;
RUN;
The macro %get_hcc_combo_mins generates this code in the data step:
HCC1_HCC2 = min(HCC1, HCC2);
HCC1_HCC6 = min(HCC1, HCC6);
HCC1_HCC8 = min(HCC1, HCC8);
...
There may be other ways to do this all within one data step that I'm not aware of, but macros can get the job done.
A DATA step with CALL LEXCOMB can generate the variable-name pairs, and CALL EXECUTE can then submit a statement for each pair using those names.
Example:
Presume the variables are named HCC:, and that the specific ones present are not known a priori.
data have;
call streaminit(1234);
do id = 1 to 100;
array hcc hcc1 hcc3 hcc5 hcc7 hcc10-hcc79 hcc150 hcc155 hcc180 hcc190-hcc191;
do over hcc;
hcc = rand('uniform', dim(hcc)) < _i_;
end;
output;
end;
run;
/* Generates and submits (via CALL EXECUTE) a DATA step named PAIRWISE that adds one
   variable per pair of HCC: variables in HAVE. */
data _null_;
set have;
array hcc hcc:;
/* overwrite the values with their array positions so LEXCOMB rearranges indices */
do _n_ = 1 to dim(hcc);
hcc(_n_) = _n_;
end;
call execute("data pairwise; set have;");
do _n_ = 1 to comb(dim(hcc),2);
/* after the call, the first two elements hold the index pair of combination number _n_ */
call lexcomb(_n_, 2, of hcc(*));
index1 = hcc(1);
index2 = hcc(2);
name1 = vname(hcc(index1));
name2 = vname(hcc(index2));
put name1=;
/* queues a statement of the form: <name1>_<name2>=<name1> and <name2>; */
call execute (cats(
catx( '_',name1,name2),
'=',
catx(' and ',name1,name2),
';'
));
end;
call execute('run;');
/* only the variable names are needed, so stop after the first observation */
stop;
run;
See if you can use this as a template.
/* Example data */
data have (drop = i j);
array h {*} HCC1 HCC2 HCC6 HCC8 HCC9 HCC10 HCC11 HCC12 HCC17 HCC18 HCC19 HCC21 HCC22 HCC23 HCC27
HCC28 HCC29 HCC33 HCC34 HCC35 HCC39 HCC40 HCC46 HCC47 HCC48 HCC54 HCC55 HCC57 HCC58
HCC70 HCC71 HCC72 HCC73 HCC74 HCC75 HCC76 HCC77 HCC78 HCC79 HCC80 HCC82 HCC83 HCC84
HCC85 HCC86 HCC87 HCC88 HCC96 HCC99 HCC100 HCC103 HCC104 HCC106 HCC107 HCC108 HCC110
HCC111 HCC112 HCC114 HCC115 HCC122 HCC124 HCC134 HCC135 HCC136 HCC137 HCC157 HCC158
HCC161 HCC162 HCC166 HCC167 HCC169 HCC170 HCC173 HCC176 HCC186 HCC188 HCC189;
do i = 1 to 10;
do j = 1 to dim (h);
h [j] = rand('uniform') > .5;
end;
output;
end;
run;
/* Create long version of output data */
/* Long format: for every input row, one output row per pair (i < j) of HC: variables. */
data temp (drop = i j);
set have;
array a {*} HC:;
do i = 1 to dim (a)-1;
do j = i+1 to dim (a);
/* v: name of the pair variable, d: product of the two values, n: input row number */
v = catx('_', vname (a[i]), vname (a[j]));
d = a [i] * a [j];
n = _N_;
output;
end;
end;
run;
/* Transpose to wide format */
proc transpose data=temp out=temp2 (drop=_: n);
by n;
id v;
var d;
run;
/* Merge back with original data */
data want;
merge have temp2;
run;

Creating variables that count the "levels" of other variables

I have a dataset analogous to the simplified table below (let's call it "DS_have"):
SurveyID Participant FavoriteColor FavoriteFood SurveyMonth
S101 G92 Blue Pizza Jan
S102 B34 Blue Cake Feb
S103 Z28 Green Cake Feb
S104 V11 Red Cake Feb
S105 P03 Yellow Pizza Mar
S106 A71 Red Pizza Mar
S107 C48 Green Cake Mar
S108 G92 Blue Cake Apr
...
I'd like to create a set of numeric variables that identify the discrete categories/levels of each variable in the dataset above. The result should look like the following dataset ("DS_want"):
SurveyID Participant FavoriteColor FavoriteFood SurveyMonth ColorLevels FoodLevels ParticipantLevels MonthLevels
S101 G92 Blue Pizza Jan 1 1 1 1
S102 B34 Blue Cake Feb 1 2 2 2
S103 Z28 Green Cake Feb 2 2 3 2
S104 V11 Red Cake Feb 3 2 4 2
S105 P03 Yellow Pizza Mar 4 1 5 3
S106 A71 Red Pizza Mar 3 1 6 3
S107 C48 Green Cake Mar 2 2 7 3
S108 G92 Blue Cake Apr 1 1 1 4
...
Essentially, I want to know what syntax I should use to generate unique numerical values for each "level" or category of variables in the DS_Have dataset. Note that I cannot use conditional if/then statements to create the values in the ":Levels" variables for each category, as the number of levels for some variables is in the thousands.
One straightforward solution is to use proc tabulate to generate a tabulated list, then iterate over that and create informats to convert the text to a number; then you just use input to code them.
*store variables you want to work with in a macro variable to make this easier;
%let vars=FavoriteColor FavoriteFood SurveyMonth;
*run a tabulate to get the unique values;
proc tabulate data=have out=freqs;
class &vars.;
tables (&vars.),n;
run;
*if you prefer to have this in a particular order, sort by that now - otherwise you may have odd results (as this will). Sort by _TYPE_ then your desired order.;
*Now create a dataset to read in for informat.;
/* Builds the control data set (fmtname / start / label / type / hlo) that is passed to
   PROC FORMAT CNTLIN= to create one informat per variable in &vars. */
data for_fmt;
if 0 then set freqs;
array vars &vars.;
retain type 'i';
do label = 1 by 1 until (last._type_); *for each _type_, start with 1 and increment by 1;
set freqs;
by _type_ notsorted;
which_var = find(_type_,'1'); *parses the '100' value from TYPE to see which variable this row is doing something to. May not work if many variables - need another solution to identify which (depends on your data what works);
start = coalescec(vars[which_var]);
/* informat name is the variable name followed by the letter I */
fmtname = cats(vname(vars[which_var]),'I');
output;
if first._type_ then do; *set up what to do if you encounter a new value not coded - set it to missing;
hlo='o'; *this means OTHER;
start=' ';
label=.;
output;
/* restore the state changed for the OTHER row so the DO loop keeps counting from 1 */
hlo=' ';
label=1;
end;
end;
run;
proc format cntlin=for_fmt; *import to format catalog via PROC FORMAT;
quit;
Then code them like this (you might create a macro to do this looping over the &vars macro variable).
data want;
set have;
color_code = input(FavoriteColor,FavoriteColorI.);
run;
Another approach - create a hash object to keep track of the levels encountered for each variable, and read the dataset twice via a double DOW-loop, applying the level numbers on the second pass. It's perhaps not as elegant as Joe's solution, but it should use slightly less memory and I suspect it will scale to a somewhat larger number of variables.
/*
 * levels_rename: write &OUT as a copy of &DATA with one extra numeric column per
 * variable in &VARS, named by the matching word of &NEWVARS, holding a level number.
 *   DATA    input data set (read twice)
 *   OUT     output data set
 *   VARS    space-separated variables to number
 *   NEWVARS space-separated names for the level columns, in the same order as VARS
 */
%macro levels_rename(DATA,OUT,VARS,NEWVARS);
%local i NUMVARS VARNAME;
data &OUT;
/* compile-time only: brings the input variables into the PDV before the hashes are declared */
if 0 then set &DATA;
length LEVEL 8;
%let i = 1;
%let VARNAME = %scan(&VARS,&i);
/* declare one hash per variable, h1..hN: key is the variable, data is LEVEL */
%do %while(&VARNAME ne );
declare hash h&i();
rc = h&i..definekey("&VARNAME");
rc = h&i..definedata("LEVEL");
rc = h&i..definedone();
%let i = %eval(&i + 1);
%let VARNAME = %scan(&VARS,&i);
%end;
%let NUMVARS = %eval(&i - 1);
/* pass 1: try to add every row's value with the next free level number */
/* NOTE(review): relies on add() leaving an existing key untouched, so a value keeps the
   level given at its first occurrence -- confirm against the hash object documentation */
do _n_ = 1 by 1 until(eof);
set &DATA end = eof;
%do i = 1 %to &NUMVARS;
LEVEL = h&i..num_items + 1;
rc = h&i..add();
%end;
end;
/* pass 2: re-read the same number of rows and look each value's level up */
do _n_ = 1 to _n_;
set &DATA;
%do i = 1 %to &NUMVARS;
rc = h&i..find();
%scan(&NEWVARS,&i) = LEVEL;
%end;
output;
end;
drop LEVEL;
run;
%mend;
%levels_rename(sashelp.class,class_renamed,NAME SEX, NAME_L SEX_L);

Crosstable displaying frequency combination of N variables in SAS

What I've got:
a table of 20 rows in SAS (originally 100k)
various binary attributes (columns)
What I'm looking to get:
A crosstable displaying the frequency of the attribute combinations
like this:
Attribute1 Attribute2 Attribute3 Attribute4
Attribute1 5 0 1 2
Attribute2 0 3 0 3
Attribute3 2 0 5 4
Attribute4 1 2 0 10
*The actual sum of combinations is made up and probably not 100% logical
The code I currently have:
/*create dummy data*/
data monthly_sales (drop=i);
do i=1 to 20;
Attribute1=rand("Normal")>0.5;
Attribute2=rand("Normal")>0.5;
Attribute3=rand("Normal")>0.5;
Attribute4=rand("Normal")>0.5;
output;
end;
run;
I guess this can be done smarter, but this seems to work. First I created a table that should hold all the frequencies:
data crosstable;
Attribute1=.;Attribute2=.;Attribute3=.;Attribute4=.;output;output;output;output;
run;
Then I loop through all the combinations, inserting the count into the crosstable:
/* Fills the 4x4 CROSSTABLE: for every pair i <= j, counts the rows of MONTHLY_SALES where
   both Attribute&i and Attribute&j are true and stores the count in cells (j,i) and (i,j). */
%macro lup();
%do i=1 %to 4;
%do j=&i %to 4;
proc sql noprint;
/* the count is stored in a macro variable named Antall<i><j> */
select count(*) into :Antall&i&j
from monthly_sales (where=(Attribute&i and Attribute&j));
quit;
/* CROSSTABLE is rewritten once per pair; _n_ selects the row to update */
data crosstable;
set crosstable;
if _n_=&j then Attribute&i=&&Antall&i&j;
if _n_=&i then Attribute&j=&&Antall&i&j;
run;
%end;
%end;
%mend;
%lup;
Note that since the frequency count for (i,j)=(j,i) you do not need to do both.
I'd recommend using the built-in SAS tools for this sort of thing, and probably displaying your data slightly differently as well, unless you really want a diagonal table. e.g.
data monthly_sales (drop=i);
do i=1 to 20;
Attribute1=rand("Normal")>0.5;
Attribute2=rand("Normal")>0.5;
Attribute3=rand("Normal")>0.5;
Attribute4=rand("Normal")>0.5;
count = 1;
output;
end;
run;
proc freq data = monthly_sales noprint;
table attribute1 * attribute2 * attribute3 * attribute4 / out = frequency_table;
run;
proc summary nway data = monthly_sales;
class attribute1 attribute2 attribute3 attribute4;
var count;
output out = summary_table(drop = _TYPE_ _FREQ_) sum(COUNT)= ;
run;
Either of these gives you a table with 1 row for each combination of attributes in your data, which is slightly different from what you requested, but conveys the same information. You can force proc summary to include rows for combinations of class variables that don't exist in your data by using the completetypes option in the proc summary statement.
It's definitely worth taking the time to get familiar with proc summary if you're doing statistical analysis in SAS - you can include additional output statistics and process multiple variables with minimal additional code and processing overhead.
Update: it's possible to produce the desired table without resorting to macro logic, albeit a rather complex process:
proc summary data = monthly_sales completetypes;
ways 1 2; /*Calculate only 1 and 2-way summaries*/
class attribute1 attribute2 attribute3 attribute4;
var count;
output out = summary_table(drop = _TYPE_ _FREQ_) sum(COUNT)= ;
run;
/*Eliminate unnecessary output rows*/
data summary_table;
set summary_table;
array a{*} attribute:;
sum = sum(of a[*]);
missing = 0;
do i = 1 to dim(a);
missing + missing(a[i]);
a[i] = a[i] * count;
end;
/*We want rows where two attributes are both 1 (sum = 2),
or one attribute is 1 and the others are all missing*/
if sum = 2 or (sum = 1 and missing = dim(a) - 1);
drop i missing sum;
edge = _n_;
run;
/*Transpose into long format - 1 row per combination of vars*/
proc transpose data = summary_table out = tr_table(where = (not(missing(col1))));
by edge;
var attribute:;
run;
/*Use cartesian join to produce table containing desired frequencies (still not in the right shape)*/
option linesize = 150;
proc sql noprint _method _tree;
create table diagonal as
select a._name_ as aname,
b._name_ as bname,
a.col1 as count
from tr_table a, tr_table b
where a.edge = b.edge
group by a.edge
having (count(a.edge) = 4 and aname ne bname) or count(a.edge) = 1
order by aname, bname
;
quit;
/*Transpose the table into the right shape*/
proc transpose data = diagonal out = want(drop = _name_);
by aname;
id bname;
var count;
run;
/*Re-order variables and set missing values to zero*/
data want;
informat aname attribute1-attribute4;
set want;
array a{*} attribute:;
do i = 1 to dim(a);
a[i] = sum(a[i],0);
end;
drop i;
run;
Yeah, user667489 was right, I just added some extra code to get the cross-frequency table looking good. First, I created a table with 10 million rows and 10 variables:
data monthly_sales (drop=i);
do i=1 to 10000000;
Attribute1=rand("Normal")>0.5;
Attribute2=rand("Normal")>0.5;
Attribute3=rand("Normal")>0.5;
Attribute4=rand("Normal")>0.5;
Attribute5=rand("Normal")>0.5;
Attribute6=rand("Normal")>0.5;
Attribute7=rand("Normal")>0.5;
Attribute8=rand("Normal")>0.5;
Attribute9=rand("Normal")>0.5;
Attribute10=rand("Normal")>0.5;
output;
end;
run;
Create an empty 10x10 crosstable:
data crosstable;
Attribute1=.;Attribute2=.;Attribute3=.;Attribute4=.;Attribute5=.;Attribute6=.;Attribute7=.;Attribute8=.;Attribute9=.;Attribute10=.;
output;output;output;output;output;output;output;output;output;output;
run;
Create a frequency table using proc freq:
proc freq data = monthly_sales noprint;
table attribute1 * attribute2 * attribute3 * attribute4 * attribute5 * attribute6 * attribute7 * attribute8 * attribute9 * attribute10
/ out = frequency_table;
run;
Loop through all the combinations of Attributes and sum the "count" variable. Insert it into the crosstable:
/* Fills the 10x10 CROSSTABLE from the pre-aggregated FREQUENCY_TABLE: for every pair i <= j,
   sums COUNT over the rows where both Attribute&i and Attribute&j are true and stores the
   result in cells (j,i) and (i,j). */
%macro lup();
%do i=1 %to 10;
%do j=&i %to 10;
proc sql noprint;
/* the sum is stored in a macro variable named Antall<i><j> */
select sum(count) into :Antall&i&j
from frequency_table (where=(Attribute&i and Attribute&j));
quit;
/* CROSSTABLE is rewritten once per pair; _n_ selects the row to update */
data crosstable;
set crosstable;
if _n_=&j then Attribute&i=&&Antall&i&j;
if _n_=&i then Attribute&j=&&Antall&i&j;
run;
%end;
%end;
%mend;
%lup;

Split SAS dataset

I have a SAS dataset that looks like this:
id | dept | ...
1 A
2 A
3 A
4 A
5 A
6 A
7 A
8 A
9 B
10 B
11 B
12 B
13 B
Each observation represents a person.
I would like to split the dataset into "team" datasets, each dataset can have a maximum of 3 observations.
For the example above this would mean creating 3 datasets for dept A (2 of these datasets would contain 3 observations and the third dataset would contain 2 observations). And 2 datasets for dept B (1 containing 3 observations and the other containing 2 observations).
Like so:
First dataset (deptA1):
id | dept | ...
1 A
2 A
3 A
Second dataset (deptA2)
id | dept | ...
4 A
5 A
6 A
Third dataset (deptA3)
id | dept | ...
7 A
8 A
Fourth dataset (deptB1)
id | dept | ...
9 B
10 B
11 B
Fifth dataset (deptB2)
id | dept | ...
12 B
13 B
The full dataset I'm using contains thousands of observations with over 50 depts. I can work out how many datasets per dept are required and I think a macro is the best way to go as the number of datasets required is dynamic. But I can't figure out the logic to create the datasets so that they have a maximum of 3 observations. Any help appreciated.
Another version.
Compared to DavB version, it only processes input data once and splits it into several tables in single datastep.
Also if more complex splitting rule is required, it can be implemented in datastep view WORK.SOURCE_PREP.
data WORK.SOURCE;
infile cards;
length ID 8 dept $1;
input ID dept;
cards;
1 A
2 A
3 A
4 A
5 A
6 A
7 A
8 A
9 B
10 B
11 B
12 B
13 B
14 C
15 C
16 C
17 C
18 C
19 C
20 C
;
run;
proc sort data=WORK.SOURCE;
by dept ID;
run;
/* DATA step view that tags every row of WORK.SOURCE with the name of the table it belongs to:
   WORK.<dept><n>, with at most 3 rows per table. */
data WORK.SOURCE_PREP / view=WORK.SOURCE_PREP;
set WORK.SOURCE;
by dept;
length table_name $32;
/* restart both counters on the first row of each dept */
if first.dept then do;
count = 1;
table = 1;
end;
else count + 1;
/* a fourth row in the current table starts the next table */
if count > 3 then do;
count = 1;
table + 1;
end;
/* variable TABLE_NAME to hold table name */
TABLE_NAME = catt('WORK.', dept, put(table, 3. -L));
run;
/* prepare list of tables */
proc sql noprint;
create table table_list as
select distinct TABLE_NAME from WORK.SOURCE_PREP where not missing(table_name)
;
%let table_cnt=&sqlobs;
select table_name into :table_list separated by ' ' from table_list;
select table_name into :tab1 - :tab&table_cnt from table_list;
quit;
%put &table_list;
/*
 * loop_when: generate the WHEN clauses of a SELECT block, one per table name.
 *   cnt  number of macro variables to walk through
 *   var  common prefix of those macro variables (&var.1 .. &var.&cnt)
 * For each k it emits:  when ("<value of &var.k>") output <value of &var.k>;
 */
%macro loop_when(cnt, var);
    /* local index so a caller's own i is not overwritten */
    %local i;
    %do i=1 %to &cnt;
        when ("&&&var.&i") output &&&var.&i;
    %end;
%mend;
data &table_list;
set WORK.SOURCE_PREP;
select (TABLE_NAME);
/* generate OUTPUT statements */
%loop_when(&table_cnt, tab)
end;
run;
You could try this:
/*
 * split: write the rows of &inds. to data sets named dept<dept value><n>, each holding at
 * most &maxobs. rows of a single dept.
 *   inds=   input data set; read with BY dept, so it must be ordered by dept
 *   maxobs= maximum number of rows per output data set
 */
%macro split(inds=,maxobs=);
proc sql noprint;
/* dept1, dept2, ... : the distinct dept values in sorted order */
select distinct dept into :dept1-:dept9999
from &inds.
order by dept;
/* numds1, numds2, ... : number of data sets needed for each dept, in the same order */
select ceil(count(*)/&maxobs.) into :numds1-:numds9999
from &inds.
group by dept
order by dept;
quit;
%let numdept=&sqlobs;
/* DATA statement: lists every output data set; &&dept&i&&j. resolves in two passes,
   first to &dept<i>&j. and then to <dept value><j> */
data %do i=1 %to &numdept.;
%do j=1 %to &&numds&i;
dept&&dept&i&&j.
%end;
%end;;
set &inds.;
by dept;
/* counter is the row number within the current dept */
if first.dept then counter=0;
counter+1;
/* generated IF / ELSE IF chain: outer level picks the dept, inner level picks the
   first data set k whose upper bound maxobs*k covers the counter */
%do i=1 %to &numdept.;
%if &i.=1 %then %do;
if
%end;
%else %do;
else if
%end;
dept="&&dept&i" then do;
%do k=1 %to &&numds&i.;
%if &k.=1 %then %do;
if
%end;
%else %do;
else if
%end;
counter<=&maxobs.*&k. then output dept&&dept&i&&k.;
%end;
end;
%end;
run;
%mend split;
%split(inds=YOUR_DATASET,maxobs=3);
Just replace the INDS parameter value in the %SPLIT macro call to the name of your input data set.