I'm new to SAS and I'm at a dead end.
I need to build a final table C with the full set of attributes and the "intersection" of the versioning, i.e. as soon as a version change occurs in either the Tarifs or the Abonents table, the version in C should also change. If the version changes simultaneously in both tables, the version in C should change only once.
Tarifs
abon_id tariff_plan type from_date to_date
1 1 1 01OCT2005 01JAN2040
2 1 2 05NOV2005 01DEC2006
2 2 2 02DEC2006 01DEC2007
2 2 1 02DEC2007 01JAN2040
3 0 0 07NOV1917 11JUN1991
3 1 1 12JUN1991 01JAN2040
4 1 1 12JUN1991 01JAN2040
Abonents
abon_id name sex from_date to_date
1 Igor M 01OCT2005 01JAN2040
2 Vasya M 05NOV2005 01AUG2006
2 Lena F 02AUG2006 02SEP2007
2 Julia F 03SEP2007 01JAN2040
3 USSR Country 07NOV1917 11JUN1991
3 Russia Country 12JUN1991 01JAN2040
4 Petya M 12AUG1991 01JAN2040
Resulting table should be:
C:
abon_id tariff_plan type name sex fd td
1 1 1 Igor M 01oct2005 01jan2040
2 1 2 Vasya M 05nov2005 01aug2006
2 1 2 Lena F 02aug2006 01dec2006
2 2 2 Lena F 02dec2006 02sep2007
2 2 2 Julia F 03sep2007 01dec2007
2 2 1 Julia F 02dec2007 01jan2040
3 0 0 USSR Country 07nov1917 11jun1991
3 1 1 Russia Country 12jun1991 01jan2040
4 1 1 . . 12jun1991 11aug1991
4 1 1 Petya M 12aug1991 01jan2040
So far I have something like:
/* Attempt to combine the two version histories with a BY-merge.
   fd1/td1 hold the interval contributed by Tarifs, fd2/td2 the one from Abonents.
   NOTE(review): this step refers to fd/td, while the tables shown above name
   their date columns from_date/to_date - confirm the inputs were renamed. */
data out;
  retain fd1 fd2 td1 td2;
  format fd1 fd2 td1 td2 ddmmyy10.;
  merge Tarifs(in=x) Abonents(in=y);
  by abon_id fd;
  /* Reset the work variables on every row, then refill them from whichever
     input contributed to this row. */
  fd1 = 0; fd2 = 0; td1 = 0; td2 = 0;
  if x then do;
    fd1 = fd;
    td1 = td;
  end;
  if y then do;
    fd2 = fd;
    td2 = td;
  end;
  if fd1 <= fd2 then do;
    fd = fd1;
    /* Fixed: the original tested "f2", a variable that is never assigned
       (always missing); fd2 is the value that was meant. */
    if fd2 < td1 and fd2 < td2 then td = fd2;
    else if td1 < td2 then td = td1;
    else td = td2;
  end;
  else do;
    fd = fd2;
    if fd1 < td1 and fd1 < td2 then td = fd1;
    else if td1 < td2 then td = td1;
    else td = td2;
  end;
run;
But I think I'm doing something wrong. Please help me!
You can use an SQL union to combine the overlapping intervals with the tariff periods that start before the first abonent record.
data tarifs;
input
abon_id tariff_plan type from_date: date9. to_date date9.;
format _numeric_ 4. from_date to_date date9.;
datalines;
1 1 1 01OCT2005 01JAN2040
2 1 2 05NOV2005 01DEC2006
2 2 2 02DEC2006 01DEC2007
2 2 1 02DEC2007 01JAN2040
3 0 0 07NOV1917 11JUN1991
3 1 1 12JUN1991 01JAN2040
4 1 1 12JUN1991 01JAN2040
data abonents;
length abon_id 8 name $10 sex $10;
input
abon_id name sex from_date: date9. to_date date9.;
format from_date to_date date9.;
datalines;
1 Igor M 01OCT2005 01JAN2040
2 Vasya M 05NOV2005 01AUG2006
2 Lena F 02AUG2006 02SEP2007
2 Julia F 03SEP2007 01JAN2040
3 USSR Country 07NOV1917 11JUN1991
3 Russia Country 12JUN1991 01JAN2040
4 Petya M 12AUG1991 01JAN2040
;
proc sql;
  create table want as
  (
    /* Part 1: every tariff interval paired with every abonent interval of the
       same abon_id that overlaps it; fd/td are the bounds of the overlap. */
    select
      A.abon_id, A.tariff_plan, A.type
      , B.name, B.sex
      , case
          when A.from_date < B.from_date then B.from_date else A.from_date
        end as fd format=date9.
      , case
          when A.to_date > B.to_date then B.to_date else A.to_date
        end as td format=date9.
    from tarifs A
    inner join abonents B
      on A.abon_id = B.abon_id
      /* General interval-overlap test. The original only checked whether one
         of B's end points fell inside A, which misses an abonent interval that
         starts before and ends after the tariff interval. */
      and A.from_date <= B.to_date
      and B.from_date <= A.to_date
  )
  union
  (
    /* Part 2: tariff intervals that start before the first abonent record of
       the same abon_id; name and sex are blank for that leading gap. */
    select
      A.abon_id, A.tariff_plan, A.type
      , ' ' as name , ' ' as sex
      , A.from_date as fd
      , min(B.from_date)-1 as td
    from tarifs A
    left join abonents B
      on A.abon_id = B.abon_id
    group by
      B.abon_id
    having
      A.from_date < min(B.from_date)
  )
  ;
quit;
A simple merge can not accomplish the task because you need to cross join on abon_id.
A cross join can be accomplished in DATA Step by multidata hashing the abonents, linear traversing the tariffs with SET and iterating over find/find_next.
Example
data tarifs;
input
abon_id tariff_plan type from_date: date9. to_date date9.;
format _numeric_ 4. from_date to_date date9.;
datalines;
1 1 1 01OCT2005 01JAN2040
2 1 2 05NOV2005 01DEC2006
2 2 2 02DEC2006 01DEC2007
2 2 1 02DEC2007 01JAN2040
3 0 0 07NOV1917 11JUN1991
3 1 1 12JUN1991 01JAN2040
4 1 1 12JUN1991 01JAN2040
5 1 1 06JAN2021 31DEC2031
data abonents;
length abon_id 8 name $10 sex $10;
input
abon_id name sex from_date: date9. to_date date9.;
format from_date to_date date9.;
datalines;
1 Igor M 01OCT2005 01JAN2040
2 Vasya M 05NOV2005 01AUG2006
2 Lena F 02AUG2006 02SEP2007
2 Julia F 03SEP2007 01JAN2040
3 USSR Country 07NOV1917 11JUN1991
3 Russia Country 12JUN1991 01JAN2040
4 Petya M 12AUG1991 01JAN2040
;
/* For every tariff row, walk all abonent rows with the same abon_id (held in
   a multidata hash) and output the overlapping part of the two intervals.
   A tariff period that starts before the earliest abonent record, or that has
   no abonent record at all, is output once with name and sex missing. */
data want(keep=abon_id tariff_plan type name sex tariffed:);
  /* Never executed; it only defines the host variables the hash binds to. */
  if 0 then set tarifs abonents;
  if _n_ = 1 then do;
    declare hash abon (dataset:'abonents', multidata:'y');
    abon.defineKey('abon_id');
    abon.defineData('name', 'sex', 'from_date', 'to_date');
    abon.defineDone();
  end;
  set tarifs (rename=(from_date=fd to_date=td));

  /* Earliest abonent from_date for this abon_id; stays missing when the
     hash holds no entry for it. */
  min_from = .;
  if abon.find() = 0 then do until (abon.find_next() ne 0);
    /* Start of the overlap: the later of the two from-dates. */
    if fd <= from_date <= td then tariffed_fd = from_date;
    else
    if from_date <= fd <= to_date then tariffed_fd = fd;
    /* End of the overlap: the earlier of the two to-dates. */
    if fd <= to_date <= td then tariffed_td = to_date;
    else
    if from_date <= td <= to_date then tariffed_td = td;
    /* Both bounds are set only when the intervals really overlap. */
    if nmiss(of tariffed:) = 0 then output;
    min_from = min(min_from, from_date);
    call missing (of tariffed:);
  end;

  if missing(min_from) then do;
    /* No abonent record: keep the whole tariff period. The original reused
       from_date left over from the previous tariff row here. */
    tariffed_fd = fd;
    tariffed_td = td;
    call missing (name, sex);
    output;
  end;
  else if fd < min_from then do;
    /* Leading gap before the first abonent record. The original used the
       from_date of the last hash item visited, not the earliest one. */
    tariffed_fd = fd;
    tariffed_td = min(min_from - 1, td);
    call missing (name, sex);
    output;
  end;
  format min_from tariffed: date9.;
run;
Related
I have this database:
data temp;
  /* The dates are written as dd/mm/yyyy, so they need an informat; without
     one SAS tries to read them as plain numbers. */
  input ID monitoring_date :ddmmyy10. score ;
  format monitoring_date ddmmyy10.;
datalines;
1 10/11/2006 0
1 10/12/2006 0
1 15/01/2007 1
1 20/01/2007 1
1 20/04/2007 1
2 10/08/2008 0
2 11/09/2008 0
2 17/10/2008 1
2 12/11/2008 0
3 10/12/2008 0
3 10/08/2008 0
3 11/09/2008 0
3 17/10/2009 1
3 12/12/2009 1
3 05/01/2010 0
4 10/12/2006 0
4 10/08/2006 0
4 11/09/2006 0
4 17/10/2007 0
4 12/12/2007 0
4 09/04/2008 1
4 05/08/2008 1
5 10/12/2013 0
5 03/09/2013 0
5 11/09/2013 0
5 19/10/2014 0
5 10/12/2014 1
5 14/01/2015 1
6 10/12/2017 0
6 10/08/2018 0
6 11/09/2018 0
6 17/10/2018 1
6 12/12/2018 1
6 09/04/2019 1
6 25/07/2019 0
6 05/08/2019 1
6 15/09/2019 0
;
I would like to create a new dataset with a new column that holds, for each ID, the date of the first progression of the score from 0 to 1, provided this progression stays stable for at least 3 months until the end of monitoring; otherwise date_progression = . :
data want;
input ID date_progression;
datalines;
1 15/01/2007
2 .
3 .
4 09/04/2008
5 .
6 .
;
I really have no idea to code this and I would like to get the wanted data to generate a cox model where the progression (Yes/No) is my event.
I am really stuck !
Thank you in advance for your help!
A DOW loop can process the ID groups, tracking for a single active run of 1s. A run has a start date and duration.
Example:
/* One output row per ID: the start date of the run of 1s that is still active
   on the last record, provided that run spans at least 3 months.
   NOTE(review): assumes HAVE is sorted by ID and chronologically within ID,
   and that DATE is a numeric SAS date - confirm against the input step. */
data want;
  /* DOW loop: a single pass of the DATA step reads one whole ID group.
     start and pscore are not retained, so each ID begins with them missing. */
  do _n_ = 1 by 1 until (last.id);
    set have;
    by id;
    select;
      when (pscore = 0 and score = 1) start = date;  /* 0 -> 1: a run of 1s begins */
      when (pscore = 1 and score = 0) start = .;     /* 1 -> 0: the run is broken */
      otherwise;                                      /* no transition */
    end;
    pscore = score;
  end;
  /* After the loop, score and date are those of the ID's last record.
     The original counted records in the run (dur >= 3) rather than elapsed
     time, so a run of two records four months apart was rejected. */
  if score = 1 and not missing(start) then
    if intck('month', start, date, 'continuous') >= 3 then progression_date = start;
  keep ID progression_date;
  format progression_date yymmdd10.;
run;
I have data that looks like -
data abc;
  /* The colon modifier makes each date a delimited (list-input) field.
     Plain "date9." is formatted input: the second date would start on the
     blank after the first one and lose its last digit. */
  input ID $ drug $ episode start_date :date9. end_date :date9.;
  format start_date end_date date9.;
  informat start_date end_date date9.;
datalines ;
1 A 1 01Jan2012 30Mar2012
1 A 2 01May2012 03Jul2012
1 A 3 28Sep2012 28Oct2012
1 A 4 01Nov2012 30Dec2012
1 B 1 01Apr2012 10May2012
1 B 2 02Nov2012 28Dec2012
1 B 3 01Jan2012 30Mar2012
1 C 1 01Jul2012 02Aug2012
;
run;
Here we have subjects and the drugs they take. A new episode of one drug means that the person discontinued.
If the start date (start date of 1st episode) of second drug consumed , lies in between the episodes of first drug , then we will ignore all the further episodes of 1st drug.
Eg. here 1 april (start date of drug B) lies after the first episode of drug A, so episode 2,3,4 of drug A would be deleted.
Similarly the start date for drug C lies after the end date of episode 1 for drug B then episode 2 of drug B would be deleted.
The maximum number of episodes a subject can have is 15.
The resultant dataset should look like -
ID Drug Episode start_date end_date
1 A 1 1-Jan 30-Mar
1 B 1 1-Apr 10-May
1 C 1 1-Jul 2-Aug
How about this? I added another ID to the example data for demonstration.
data abc;
input ID $ drug $ episode start_date :date9. end_date :date9.;
format start_date end_date date9.;
datalines ;
1 A 1 01Jan2012 30Mar2012
1 A 2 01May2012 03Jul2012
1 A 3 28Sep2012 28Oct2012
1 A 4 01Nov2012 30Dec2012
1 B 1 01Apr2012 10May2012
1 B 2 02Nov2012 28Dec2012
1 B 3 01Jan2012 30Mar2012
1 C 1 01Jul2012 02Aug2012
2 A 1 01Jan2012 30Mar2012
2 A 2 01May2012 03Jul2012
2 A 3 28Sep2012 28Oct2012
2 A 4 01Nov2012 30Dec2012
2 B 1 01Apr2012 10May2012
2 B 2 02Nov2012 28Dec2012
2 B 3 01Jan2012 30Mar2012
2 C 1 01Jul2012 02Aug2012
;
run;
/* Keep the episodes of a drug only up to the point where the next drug of the
   same ID starts. Drugs are numbered 1, 2, 3... within an ID (variable d) in
   BY-group order, and the hash maps (ID, d) to the start date of that drug's
   episode 1. */
data want;
  format ID drug episode start_date end_date;
  keep ID drug episode start_date end_date;
  declare hash h ();
  h.definekey ('ID', 'd');
  h.definedata ('_start_date');
  h.definedone ();

  /* Pass 1: load the first-episode start date of every (ID, drug number). */
  do until (lr1);
    set abc (rename= (start_date = _start_date)) end=lr1;
    by ID drug;
    if first.ID then d = 0;
    if first.drug then d + 1;
    if episode = 1 then h.add();
  end;

  /* Pass 2: once an episode starts after the next drug's start date, drop it
     and every later episode of the current drug. */
  do until (lr2);
    set abc end=lr2;
    by ID drug;
    if first.ID then d = 0;
    if first.drug then do;
      d + 1; flag = 0;
    end;
    rc = h.find(key : ID, key : d+1);
    /* Compare only when a next drug exists. After a failed find,
       _start_date still holds the value of an earlier lookup, which the
       original compared against. */
    if rc = 0 and start_date > _start_date then flag=1;
    if flag = 0 then output;
  end;
  retain flag;
run;
Result:
ID drug episode start_date end_date
1 A 1 01JAN2012 30MAR2012
1 B 1 01APR2012 10MAY2012
1 C 1 01JUL2012 02AUG2012
2 A 1 01JAN2012 30MAR2012
2 B 1 01APR2012 10MAY2012
2 C 1 01JUL2012 02AUG2012
In a summarized dataset, I have the status of an event at each hour after baseline in which it was recorded. I also have the last hour the event could have been recorded. I want to create a new dataset with one record for each hour from the first through the last hour, with the status for each record being the one from the last recorded status.
Here is an example dataset:
data new;
input hour status last_hour;
cards;
2 1 12
4 1 12
5 1 12
6 1 12
7 0 12
9 1 12
10 0 12
;
run;
In this case, the first recorded hour was the second, and the last recorded hour was the 10th. The last possible hour to record data was the 12th.
The final dataset should look like so:
0 . 12
1 . 12
2 1 12
3 1 12
4 1 12
5 1 12
6 1 12
7 0 12
8 0 12
9 1 12
10 0 12
11 0 12
12 0 12
I sort of have it working with this series of data steps, but I'm not sure if there's a cleaner way I'm not seeing.
/* Build the full hour grid (0 through last_hour) once per id.
   NOTE(review): the sample dataset NEW shown above has no id column - this
   step assumes the real data does and is sorted by it. */
data step1;
  /* last_hour must be read because it is the loop bound; the original
     keep=id hour dropped it, leaving the bound missing. */
  set new (keep=id last_hour);
  by id;
  /* One grid per id is enough; the original repeated it for every row. */
  if first.id;
  do hour = 0 to last_hour;
    output;
  end;
run;
proc sort data=step1;
by id hour;
run;
proc sql;
create table step2 as
select distinct a.id, a.hour, b.status
from step1 as a
left join new as b
on a.id = b.id
and a.hour = b.hour
order by a.id, a.hour;
quit;
/* Carry the last recorded status forward within each id. */
data step3;
  set step2;
  by id hour;
  retain previous_status;
  /* A new id starts with nothing to carry forward; later rows with no status
     of their own take the remembered one. */
  if first.id then previous_status = .;
  else if status = . and previous_status > . then status = previous_status;
  /* Remember the latest known status for the following rows. */
  if status > . then previous_status = status;
run;
Seeing your code, it seems you left out of your question the fact that you also have id's. So this is a newer solution that deals with different id's. See further below for my first solution ignoring id's.
Since last_hour is always 12, I left it out of the have dataset. It will be added later on.
data have;
input id hour status;
cards;
1 2 1
1 4 1
1 5 1
1 6 1
1 7 0
1 9 1
1 10 0
2 2 1
2 4 1
2 5 1
2 6 1
2 7 0
2 9 1
2 10 0
;
Create an hours dataset that just contains the numbers 0 through 12:
/* Lookup table with one row per hour, 0 through 12. */
data hours;
  do hour = 0 to 12;
    output;
  end;
run;
Create a temporary dataset that will have the right number of rows (13 rows for every id, with valid hour values where they exist in the have table).
proc sql;
create table tmp as
select distinct t1.id, t2.hour, 12 as last_hour
from have as t1
cross join
(select hour from hours) as t2;
quit;
Then use merge and retain to fill in the missing status values where appropriate.
/* Attach the recorded statuses to the full id-by-hour grid and carry the last
   recorded status forward into the hours that have none. */
data want;
  merge have
        tmp;
  by id hour;
  retain status_previous;
  /* Nothing is carried over from the previous id. */
  if first.id then status_previous = .;
  /* The original skipped the first row of each id entirely, so a status
     recorded on that row was never remembered for the following hours. */
  if status ne . then status_previous = status;
  else status = status_previous;
  drop status_previous;
run;
Previous solution (no id's)
If last_hour is always 12, then this should do it:
data have;
input hour status last_hour;
datalines;
2 1 12
4 1 12
5 1 12
6 1 12
7 0 12
9 1 12
10 0 12
;
data hours;
do i = 0 to 12;
hour = i;
last_hour = 12;
output;
end;
drop i;
run;
data want;
merge have
hours;
by hour;
retain status_previous;
if status ne . then status_previous = status;
else if status_previous ne . then status = status_previous;
drop status_previous;
run;
Here is a very basic question, but I'm unable to find an easy way to do it.
I have a dataset that references different highschools and students :
Highschool Students Sexe
A 1 m
A 2 m
A 3 m
A 4 f
A 5 f
B 1 m
B 2 m
And I'd like to create two new variables that count the number of male and female in each schools :
Highschool Students Sexe Nb_m Nb_f
A 1 m 1 0
A 2 m 2 0
A 3 m 3 0
A 4 f 3 1
A 5 f 3 2
B 1 m 1 0
B 2 m 2 0
And I can finally extract the last line with the total that would look like this :
Highschool Students Sexe Nb_m Nb_f
A 5 f 3 2
B 2 m 2 0
Any ideas ?
You can do this in a single PROC SQL step...
Also, I don't think you really need the value of Sexe from the last row.
proc sql ;
  /* One row per school with the number of female and male students.
     NOTE(review): the input dataset name is not given in the question;
     your_dataset is a placeholder - replace it with the real name. */
  create table want as
  select Highschool,
         sum(case when Sexe = 'f' then 1 else 0 end) as Nb_f,
         sum(case when Sexe = 'm' then 1 else 0 end) as Nb_m,
         /* Columns computed in the same SELECT must be referenced with
            CALCULATED. */
         calculated Nb_f + calculated Nb_m as Students
  from your_dataset
  group by Highschool
  order by Highschool ;
quit ;
First you have to sort your dataset by Highschool:
proc sort data = your_dataset;
by Highschool;
run;
then you use
- retain to not reset Nb_m and Nb_f at every record;
- the last.Highschool automatic variable and an output statement to write only the last observation for every school.
/* One output row per school, holding the number of male and female students
   and their total. Requires your_dataset to be sorted by Highschool. */
data new_dataset;
  set your_dataset;
  by Highschool;
  retain Nb_m Nb_f;
  /* Start every school from zero, including the first one, so the total never
     depends on the counters' initial values. */
  if first.Highschool then do;
    Nb_m = 0;
    Nb_f = 0;
  end;
  if Sexe = 'm' then
    Nb_m + 1;
  /* Count only 'f' as female; the original counted every value other than
     'm' (blank included) as female. */
  else if Sexe = 'f' then
    Nb_f + 1;
  if last.Highschool then do;
    Students = Nb_m + Nb_f;
    output;
  end;
run;
I have a dataset with 4 observations (rows) per person.
I want to create three new variables that calculate the difference between the second and first, third and second, and fourth and third rows.
I think retain can do this, but I'm not sure how.
Or do I need an array?
Thanks!
data test;
input person var;
datalines;
1 5
1 10
1 12
1 20
2 1
2 3
2 5
2 90
;
run;
data test;
set test;
by person notsorted;
retain pos;
array diffs{*} diff0-diff3;
retain diff0-diff3;
if first.person then do;
pos = 0;
end;
pos + 1;
diffs{pos} = dif(var);
if last.person then output;
drop var diff0 pos;
run;
Why not use The Lag function.
data test; input person var;
cards;
1 5
1 10
1 12
1 20
2 1
2 3
2 5
2 90
run;
data test; set test;
by person;
LagVar=Lag(Var);
difference=var-Lagvar;
if first.person then difference=.;
run;
An alternative approach without arrays.
/*-- Data from simonn's answer --*/
data SO1019005;
input person var;
datalines;
1 5
1 10
1 12
1 20
2 1
2 3
2 5
2 90
;
run;
/*-- Why not just do a transpose? --*/
proc transpose data=SO1019005 out=NewData;
by person;
run;
/*-- Now calculate your new vars --*/
data NewDataWithVars;
set NewData;
NewVar1 = Col2 - Col1;
NewVar2 = Col3 - Col2;
Newvar3 = Col4 - Col3;
run;
Why not use the dif() function instead?
/* test data */
data one;
do id = 1 to 2;
do v = 1 to 4 by 1;
output;
end;
end;
run;
/* check */
proc print data=one;
run;
/* on lst
Obs id v
1 1 1
2 1 2
3 1 3
4 1 4
5 2 1
6 2 2
7 2 3
8 2 4
*/
/* now create diff within id */
data two;
set one;
by id notsorted; /* assuming already in order */
dif = ifn(first.id, ., dif(v));
run;
proc print data=two;
run;
/* on lst
Obs id v dif
1 1 1 .
2 1 2 1
3 1 3 1
4 1 4 1
5 2 1 .
6 2 2 1
7 2 3 1
8 2 4 1
*/
/* One row per person with the three differences between consecutive values
   (2nd-1st, 3rd-2nd, 4th-3rd). A row is written only when the person's
   fourth record is reached.
   NOTE(review): abs() discards the sign of each difference - confirm that is
   wanted. */
data output_data;
  retain count previous_value diff1 diff2 diff3;
  /* Fixed: the original read "set data input_data" with no semicolon, so the
     BY statement was swallowed into the SET statement. */
  set input_data;
  by person;
  if first.person then do;
    count = 0;
  end;
  else do;
    /* count is the number of records already seen for this person. */
    count = count + 1;
    if count = 1 then diff1 = abs(value - previous_value);
    if count = 2 then diff2 = abs(value - previous_value);
    if count = 3 then do;
      diff3 = abs(value - previous_value);
      output output_data;
    end;
  end;
  /* Remember this row's value for the next row's difference. */
  previous_value = value;
run;