SUM() in SAS from same table where statement - sas

I'm trying to figure out how to make the following happen in SAS:
For each state, I would like to add the 'Both' amount in the compare column to the Item1 amount and to the Item2 amount separately.

One way to do it:
/* Demo input: for every state there is one 'Both' row and one row per item. */
data have;
    infile datalines;
    /* 8 bytes is what list input would assign to these character columns by default */
    length state $8 compare $8;
    input state compare comp_cnt;
datalines;
NY Both 4000
NY Item1 3500
NY Item2 2000
KY Both 5000
KY Item1 3000
KY Item2 4000
;
run;
/* Report each Item row with its state's 'Both' amount added on top. */
proc sql;
    select
        items.state,
        items.compare,
        /* COALESCE keeps the item's own amount when the state has no 'Both'
           row; a plain "+" with the unmatched (missing) value would turn the
           whole result into a missing value. */
        items.comp_cnt + coalesce(both_amt.comp_cnt, 0) as comp_cnt
    from
        (select state, compare, comp_cnt
           from have
          where compare ne 'Both') as items
    left join
        (select state, comp_cnt
           from have
          where compare = 'Both') as both_amt
    on items.state = both_amt.state;
quit;

Related

Bundling healthcare claims using SAS/SQL

Observations from the "other_claims" data set are to be summed with the observations in the "event_claims" data set under the following conditions:
"Other_claims" that occur within a 90-day window after the event claim's "stay_discharge_dt" are to be summed with the event cost ("cost_event").
If the "other_claim" partially overlaps with the 90-day period, only overlapping days are to be included.
The included fraction: (# of overlapping days)/(total # of days of the other_claim)
Here's the sql solution I am considering. I'm curious if this could be more efficient?
/* Index hospital stays: one row per patient event with its cost. */
data event_claims;
    /* Both dates are read with modified list input (:mmddyy10.), so each
       blank-delimited date is parsed regardless of its column position.
       The original used "#14", which is a LINE pointer (jump to input
       record 14) rather than the column pointer "@14" that was intended. */
    input patient_id
          stay_admission_dt :mmddyy10.
          stay_discharge_dt :mmddyy10.
          doctor
          cost_event;
    format stay_admission_dt stay_discharge_dt mmddyy10.;
datalines;
1 06/10/2019 06/15/2019 45 20000
2 10/18/2018 10/22/2018 78 30000
;
run;
/* Follow-up services per patient, each with a date range and a cost. */
data other_claims;
    /* patient_id keeps the full 8-byte numeric length: a 3-byte numeric can
       only hold integers up to 8192 exactly, which silently corrupts larger ids. */
    length patient_id 8 type $19;
    /* Every field uses list input. The ":" modifier applies the informat to
       the blank-delimited token, so a cost wider than 7 characters
       (e.g. $100,000) is no longer truncated as it was with fixed-width
       dollar7.0. COMMA12. strips the dollar sign and the commas. */
    input patient_id
          type
          service_start_date :mmddyy10.
          service_end_date   :mmddyy10.
          service_cost       :comma12.;
    format service_start_date service_end_date mmddyy10.;
datalines;
1 skilled_nursing 06/15/2019 06/25/2019 $7,000
1 home-health 06/25/2019 08/25/2019 $24,000
1 office_visit 07/1/2019 07/1/2019 $200
1 home_health 08/26/2019 09/26/2019 $12,000
2 er_visit 10/15/2018 10/16/2018 $1,500
2 home_health 10/23/2018 11/23/2018 $8,000
2 outpatient_services 01/18/2019 1/22/2019 $5,000
;
run;
/* Step 1: pair every event with the same patient's other claims and work out
   which share of each other claim falls inside the 90-day post-discharge window.
   Columns come from the data sets as they are defined above: the key is
   patient_id and the event cost is cost_event (the original referenced
   person_id and a.service_cost, which do not exist in event_claims). */
proc sql;
    create table events_others as
    select
        a.patient_id
       ,a.stay_admission_dt
       ,a.stay_discharge_dt
       ,a.stay_discharge_dt + 90 as service_deadline format=mmddyy10.
       ,b.service_start_date
       ,b.service_end_date
        /* A claim is payable when it starts on or after admission and no later
           than the deadline. Events without any other claim (unmatched left
           join) have a missing start date, which compares lower than the
           admission date and is therefore flagged as not payable. */
       ,case
            when b.service_start_date > calculated service_deadline
              or b.service_start_date < a.stay_admission_dt
            then "service not payable"
            else "payable"
        end as payable
        /* Days by which a payable claim runs past the deadline, expressed as
           a negative number (0 when the claim ends inside the window).
           SAS dates are day counts, so plain subtraction yields days. */
       ,case
            when calculated payable = "payable"
             and b.service_end_date > calculated service_deadline
            then calculated service_deadline - b.service_end_date
            else 0
        end as overlap
       ,a.cost_event
       ,b.service_cost as service_cost_other
        /* Included fraction of the other claim:
             not payable             -> 0
             fully inside the window -> 1 (the original returned 0 here and so
                                           dropped every fully covered claim)
             runs past the deadline  -> days inside window / total claim days */
       ,case
            when calculated payable ne "payable" then 0
            when calculated overlap = 0 then 1
            else (b.service_end_date - b.service_start_date + calculated overlap)
                 / (b.service_end_date - b.service_start_date)
        end as partial_factor
       ,calculated partial_factor * b.service_cost as final_other_cost format=dollar9.2
    from event_claims as a
    left join other_claims as b
        on a.patient_id = b.patient_id
    /* No GROUP BY: nothing is aggregated in this step, and SAS would only
       turn such a GROUP BY into an ORDER BY with a warning. */
    order by a.patient_id
            ,a.stay_admission_dt
    ;
quit;

/* Step 2: attach the per-event total of the included other costs to every
   detail row and add it to the event cost. The total gets its own name
   (total_other_cost) because a.* already carries final_other_cost; reusing
   that name made the original reference ambiguous. */
proc sql;
    create table total_cost_of_care as
    select
        d.*
       ,t.total_other_cost format=dollar12.2
        /* COALESCE: an event with no payable other claim still gets a total */
       ,d.cost_event + coalesce(t.total_other_cost, 0) as total_episode_cost format=dollar12.2
    from events_others as d
    inner join
        (select patient_id
               ,stay_admission_dt
               ,sum(final_other_cost) as total_other_cost
           from events_others
          group by patient_id
                  ,stay_admission_dt
        ) as t
      on  d.patient_id = t.patient_id
      and d.stay_admission_dt = t.stay_admission_dt
    ;
quit;

Calculating proportion and cumulative data in SAS

I have a dataset called stores.I want to extract total_sales(retail_price),
proportion of sales and cumulative proportion of sales by each store in
SAS.
Sample dataset : - Stores
Date Store_Postcode Retail_Price month Distance
08/31/2013 CR7 8LE 470 8 7057.8
10/26/2013 CR7 8LE 640 10 7057.8
08/19/2013 CR7 8LE 500 8 7057.8
08/17/2013 E2 0RY 365 8 1702.2
09/22/2013 W4 3PH 395.5 12 2522
06/19/2013 W4 3PH 360.5 6 1280.9
11/15/2013 W10 6HQ 475 12 3213.5
06/20/2013 W10 6HQ 500 1 3213.5
09/18/2013 E7 8NW 315 9 2154.8
10/23/2013 E7 8NW 570 10 5777.9
11/18/2013 W10 6HQ 455 11 3213.5
08/21/2013 W10 6HQ 530 8 3213.5
Code i tried: -
/* One row per store: total sales and that store's share of all sales. */
proc sql;
    create table work.Top_sellers as
    select
        Store_postcode as Stores,
        sum(Retail_price) as Total_Sales,
        /* Share = store total / grand total. The original divided the
           ungrouped Retail_price by the group sum, which makes SAS remerge
           the data: one output row per sale, each holding a within-store
           ratio instead of the store's proportion of all sales. */
        round(sum(Retail_price) / (select sum(Retail_price) from work.stores), 0.01)
            as Proportion_of_sales
    from work.stores
    group by Store_postcode
    order by Total_Sales;
quit;
I've no idea on how to calculate cumulative variable in proc sql...
Please help me improve my code!!
Computing a cumulative result in SQL requires the data to have an explicit unique ordered key and the query involves a reflexive join with 'triangular' criteria for the cumulative aspect.
/* Sample data: ids 100..120, each with a pseudo-random integer sales figure
   drawn with the fixed seed 123 so the data is reproducible. */
data have;
    do id = 100 to 120;
        sales = ceil(25 * ranuni(123) + 10);
        output;
    end;
run;
/* Running total by self-join: each row is matched with every row whose id is
   less than or equal to its own, and the matched sales are summed. */
proc sql;
    create table want as
    select
        cur.id,
        cur.sales,
        sum(prev.sales) as sales_cusum
    from have as cur
    inner join have as prev
        on prev.id <= cur.id      /* triangular match: all rows up to the current one */
    group by cur.id, cur.sales
    order by cur.id;
quit;
A second way is to re-compute the cusum on a row-by-row basis:
/* Running total by correlated scalar subquery: for every row, sum the sales
   of all rows with an id up to and including that row's id. */
proc sql;
    create table want as
    select
        cur.id,
        cur.sales,
        /* Both references to HAVE carry their own alias, so the correlation is
           unambiguous without wrapping the table in an inline "select *" view.
           The original alias INNER is a PROC SQL reserved word and is avoided. */
        (select sum(upto.sales)
           from have as upto
          where upto.id <= cur.id) as cusum
    from have as cur;
quit;   /* end the SQL step explicitly; the original left it open */
I change my mind, CDF is a different calculation.
Here's how to do this via a data step. First calculate the cumulative totals (I used a data step here, but you could use PROC EXPAND if you have SAS/ETS).
* Copy the demo data into WORK.SHOES, ordered by sales within each region;
proc sort out=shoes data=sashelp.shoes;
    by region sales;
run;
* Build two data sets in one pass over the sorted shoes data:
  CTOTAL holds every row plus the running sales total within its region,
  LAST holds one row per region (region and its final running total);
data cTotal last (keep = region cTotal);
    set shoes;
    by region;
    retain cTotal;

    * the running total restarts with the first row of each region;
    if first.region then cTotal = sales;
    else cTotal = cTotal + sales;

    * every row goes to CTOTAL - only the closing row of a region goes to LAST;
    output cTotal;
    if last.region then output last;
run;
*merge in results and calculate percentages;
* Attach each region total to its detail rows and compute the cumulative share;
data calcs;
    * RENAME= uses the documented parenthesised form, the region total arrives as TOTAL;
    merge cTotal last (rename=(cTotal=Total));
    by region;
    * skip the division for a zero total so PERCENT stays missing without a
      division-by-zero error in the log;
    if Total ne 0 then percent = cTotal / Total;
run;
If you need a more efficient solution, I'd try a DoW solution.

Counting categorical variables on row in SAS

Sample Data
I was wondering if it is possible to use data instead of proc to count the number of categorical variables on a row as shown in 'count' example above. This will allow me to further use the data e.g COUNT=1 or COUNT > 1 to check morbidity.
Also will it be possible to then count the number of each diagnosis in the entire data set per patient while accounting for duplicates if there is any? For example there are 3 CB's and 2 AA's in this data set but CB should be 2 because patient 2 had it recorded twice.
Thank you for your time and have a lovely new year.
Your question is not clear, but you could handle your diagnoses using UNION ALL and COUNT(DISTINCT ...):
-- Number of different diagnosis codes per patient.
-- The four diagnosis columns are stacked into one column first; UNION ALL is
-- enough here because COUNT(DISTINCT ...) removes the duplicates itself.
select
    patient,
    count(distinct diag) as distinct_diag_count
from (
    select patient, diag1 as diag
    from my_table
    union all
    select patient, diag2
    from my_table
    union all
    select patient, diag3
    from my_table
    union all
    select patient, diag4
    from my_table
) t
group by patient
or simply use UNION (which removes duplicate rows) and a plain COUNT:
-- Number of different diagnosis codes per patient, variant 2.
-- UNION (without ALL) removes duplicate (patient, diag) pairs while stacking
-- the four columns, so a plain COUNT over the result counts each code once.
select
    patient,
    count(diag) as diag_count
from (
    select patient, diag1 as diag
    from my_table
    union
    select patient, diag2
    from my_table
    union
    select patient, diag3
    from my_table
    union
    select patient, diag4
    from my_table
) t
group by patient
The image indicates that for each row you want a count of the number of columns with non-missing values. Additionally, you apparently have some way to do this using a PROC step, but would like to know how using a DATA step.
In DATA step you can count the number of non-missing values indirectly using CMISS, or directly using COUNTC against a constructed value:
/* Sample data: one row per patient with up to four diagnosis codes.
   A single period in the data is read as a missing (blank) code. */
data have;
    attrib pid length=8 diag1-diag4 length=$5;
    /* Plain list input. The "&" modifier that followed pid makes SAS read up
       to the next run of two blanks, which does not fit these single-blank
       delimited lines and would swallow the whole line into pid. */
    input pid diag1-diag4;
datalines;
1 AA J9 HH6 .
2 CB . . CB
3 J10 AA CB J10
4 B B . F90 .
5 J10 . . .
6 . . . .
;
run;
/* Count the non-missing diagnosis codes on each row in two ways. */
data have_with_count;
    set have;
    array dx[*] diag1-diag4;

    /* way 1: number of columns minus the number of missing ones */
    count = dim(dx) - cmiss(of dx[*]);

    /* way 2: CATX skips blank values, so with the trailing sentinel the
       joined string holds exactly one '~' per non-missing code */
    count_way2 = countc(catx('~', of dx[*], 'SENTINEL'), '~');
run;
In order to work against a MySQL data source you will also need a libref that connects you to that remote data server.
Added
Counting distinct values across a row can be accomplished using a hash or sortc. Consider this example that sorts a copy of the row data (as an array) and counts the unique values within:
/* Count the distinct non-missing diagnosis codes on each row (UNIQ). */
data want;
    set have;
    array dx[*] diag1-diag4;
    array sorted[4] $5 _temporary_;

    /* sort a scratch copy so diag1-diag4 themselves keep their order */
    do i = 1 to dim(dx);
        sorted[i] = dx[i];
    end;
    call sortc(of sorted[*]);

    /* after sorting, equal codes are adjacent: count every change of value */
    uniq = 0;
    do i = 1 to dim(sorted);
        if not missing(sorted[i]) then do;
            /* the first non-missing code takes this branch, so sorted[i-1]
               is only read when i is at least 2 */
            if uniq = 0 then uniq = 1;
            else uniq = uniq + (sorted[i] ne sorted[i-1]);
        end;
    end;
    drop i;
run;
With Richard's dummy data to count number of diagnosis and unique number of diagnosis:
/* Per row: DIAG_NUM = number of non-missing diagnosis codes (left missing
   when the row has none), UNIQUE_DIAG = number of different codes. */
data want;
    set have;
    array dx[*] diag:;
    length temp $30;          /* blank-separated list of the codes seen so far */
    call missing(diag_num);
    do i = 1 to dim(dx);      /* explicit index instead of the obsolete DO OVER */
        if not missing(dx[i]) then do;
            diag_num + 1;
            /* FINDW looks for the code as a whole word inside the list. The
               original WHICHC(var, temp) compared the code with the entire
               list string, so a repeat was only caught while the list held a
               single code (J10 AA CB J10 was counted as 4 unique codes). */
            if not findw(temp, strip(dx[i]), ' ') then temp = catx(' ', temp, dx[i]);
        end;
    end;
    /* blank is the only delimiter, so codes containing punctuation count once */
    unique_diag = countw(temp, ' ');
    drop temp i;
run;

Using the sum of the columns, to create a new varible

I have data set, that has States, Corn, and Cotton. I want to create a new variable, Corn_Pct in SAS (% of state corn output relative to the country's output of corn). The same for Cotton_pct.
sample of data: (numbers are not real)
State Corn Cotton
TX 135 500
AK 120 350
...
Can anyone help?
You can do this using a simple Proc SQL. Let the dataset be "Test",
/* Add each state's share of the overall corn and cotton totals.
   There is no GROUP BY, so the SUM() results cover the whole table and SAS
   remerges them onto every row. */
proc sql;
    create table test_percent as
    select
        t.*,
        t.Corn   / sum(t.Corn)   as Corn_Pct   format=percent7.1,
        t.Cotton / sum(t.Cotton) as Cotton_Pct format=percent7.1
    from test as t;
quit;
If you have many columns, you can use Arrays and do loops to automatically generate percentages everytime.
I have calculated the total of a column in Inner Query and then used that total for the calculation in outer query using Cross Join
Hey Try this:-
/*My Dataset */
/* Four sample states with their corn and cotton output. */
data Test;
    infile datalines;
    input State $ Corn Cotton;
datalines;
TK 135 500
AK 120 350
CK 100 250
FG 200 300
;
run;
/*Code*/
/* Percentages via an explicit one-row totals table: the inline view holds the
   two column sums and the cross join attaches them to every state row. */
proc sql;
    create table test_percent as
    select
        a.*,
        (a.corn * 100 / tot.sm_corn)     as Corn_pct,
        (a.Cotton * 100 / tot.sm_cotton) as Cotton_pct
    from test as a
    cross join
        (select
             sum(corn)   as sm_corn,
             sum(Cotton) as sm_cotton
           from test) as tot;
quit;
/*My Output*/
State Corn Cotton Corn_pct Cotton_pct
TK 135 500 24.32432432 35.71428571
AK 120 350 21.62162162 25
CK 100 250 18.01801802 17.85714286
FG 200 300 36.03603604 21.42857143
Here you have an alternative using proc means and data step:
/* Column totals of corn and cotton, written to the one-row data set TEST2. */
proc means data=test sum noprint;
    /* Name the analysis variables explicitly: without VAR, "sum=corn cotton"
       is matched by position against every numeric variable in TEST. */
    var corn cotton;
    output out=test2(keep=corn cotton) sum=corn cotton;
run;   /* PROC MEANS is ended by RUN rather than QUIT */
/* Divide every state's corn and cotton by the totals held in TEST2. */
data test_percent (drop=corn_sum cotton_sum);
    /* Read the one-row totals once. Variables that come from a SET statement
       are retained, so corn_sum and cotton_sum stay available on every row.
       This replaces the SYMPUT/SYMGET round trip, which converted the totals
       to text and back (precision loss and conversion notes in the log). */
    if _n_ = 1 then set test2(rename=(corn=corn_sum cotton=cotton_sum));
    set test;
    Corn_pct   = corn / corn_sum;
    Cotton_pct = cotton / cotton_sum;
run;

Mean procedure with SAS

I want to find the mean of following datalines;
With the approach I am trying, I get the mean based on the number of observations, which in this case is 6. But I want it based on Day, i.e. Mean = (sum of Timeread) / (number of days), where the number of days is 3.
name Day Timeread
X 1 12
X 1 23
X 1 12
X 2 8
X 2 5
X 3 3
This is the code I used
/* Mean reading time per day for each name: total Timeread divided by the
   number of days, not by the number of observations. */

/* Step 1: collapse to one row per Name and Day holding that day's total. */
proc summary data = xyz nway missing;
    class Name Day;
    var timeread;
    output out = DayTotals (keep = Name Day timeread) sum=;
run;

/* Step 2: averaging the daily totals per Name gives total / number of days. */
proc summary data = DayTotals nway missing;
    class Name;
    var timeread;
    output out = Average mean=;
run;

proc print data = Average;
run;
I'm not sure how to do this with proc mean but you can do this in SQL like so:
/* Per name: total reading time divided by the number of different days. */
proc sql noprint;
    create table want as
    select
        h.name,
        sum(h.timeread) / count(distinct h.day) as daily_mean
    from have as h
    group by h.name;
quit;
This uses the HAVE dataset from @CarolinaJay65's answer.
If you are just wanting the mean of total timeread by total distinct days
/* Sample readings: six observations for one name spread over three days. */
data HAVE;
    infile datalines;
    input name $ Day Timeread;
datalines;
X 1 12
X 1 23
X 1 12
X 2 8
X 2 5
X 3 3
;
run;
/* Per name: number of different days, total reading time and their ratio. */
proc sql;
    create table WANT as
    select
        Name
        /* Counted inside the group. The original scalar subquery counted the
           distinct days of the whole table, so every name was divided by the
           overall number of days as soon as HAVE held more than one name. */
       ,count(distinct Day) as DAYS
       ,sum(timeread) as TIMEREAD_TOTAL
       ,calculated TIMEREAD_TOTAL / calculated DAYS as MEAN
    from HAVE
    group by Name;
quit;