/* Virtual dairy farmer * Richard A. DeVenezia * May 17, 2003 * * A response to this SAS-L posting From: "Arjen" Subject: Matching dates and periods from different datasets (Epidemiology) Date: 13 May 2005 01:48:00 -0700 Hi all, I'm studying disease transmission on a dairy farm. I have two datasets. Dataset 'Herd' consists of the following variables: AnimalID DamID (ID of Animal's mother) DOB (Date of Birth) Status (Disease status, positive, negative or U - unknown) Status_conf (Date at which positive status was confirmed) The second dataset, 'Shedders', contains calving information on al DAMS that have disease status 'positive'. That is, variables are: DamID Status (always positive) Status_conf DOC (Date of Calving; a given DamID can occur several times in the dataset depending on the number of calvings) Conf_at_calving (The confirmation of positive disease status is very likely to occur at an older age; therefore a cow may have had several calvings (say 3, at which the variable = "No") when disease status was not yet confirmed, and the fourth calving with Conf_at_calving="Yes") Now, I'd like to determine whether an animal from dataset 'Herd' is born within 30 days of the calving of a shedding dam - ANY shedding dam (because animals mostly are born in calving pens that are 'visited' by all calving dams). So, I need to combine the DOB-variable from 'Herd' with the Conf_at_calving-variable (must be "Yes") and DOC-variable from 'Shedders'. Preferably this would result in a new variable 'Exposed' that would say 'Y', 'N' or 'U'. Example, consider: Data Herd; input AnimalID DamID DOB STATUS STAT_Conf; cards; 10 9 11000 1 12500 11 9 11420 0 . 12 11 12000 0 . ; run; Data Shedders; input DamID Status Status_conf DOC Conf_at_calving \$; cards; 7 1 9000 11990 Y 9 1 11400 10600 N 9 1 11400 11000 N 9 1 11400 11420 Y ; run; Ideally this would result in: Animal 10 Exposed: No Animal 11 Exposed: Yes (because its Dam #9 was positive at 11420) Animal 12 Exposed: Yes (because Dam #7 calved at 11990 (< 30 days before 12000) and at that time had a confirmed positive disease status) Of course, the information in 'Shedders' could be stored in the 'Herd'-table, because all animals are members of the same population, if that would make it easier. Does anybody have any ideas on how to solve this problem? Thanks so much for your efforts! Arjen */ %let seed=20050513; proc format; value sex 0='F' 1='M'; run; %macro monotonic(); %mend; data gen0; length motherId id \$8 sex dob dot rot dod calfCount 4 calfId \$8; do _n_ = 1 to 10; id = resolve ('%monotonic'); id = symget ('sysindex'); sex = 0; dob = 1 + floor(9*30*ranuni(&seed)); donc = floor(dob + 20*30 + 15*30*ranuni(&seed)); * first calf at 20 to 35 months; motherId = ""; calfId = ""; calfcount = 0; dod=.; dot=.; rot=.; output; end; format sex sex. calfcount rot 2.; format dob donc dod dot yymmdd10.; label dob='Date of Birth' donc='Date of Next Calving' dod='Date of Death' dot='Date of Test'; label rot='Result of Test'; label calfId='Most recent calf'; run; proc sort data=gen0; by donc; run; data gen0; set gen0; by donc; if first.donc then seq=1; else seq+1; run; %let MAXCDATE = %eval (3*365+%sysfunc(today())); data _null_ ;*/ debug; if 0 then set gen0; declare hash herd (dataset:'gen0',ordered:'a'); herd.defineKey ('donc', 'seq'); herd.defineData ('motherId','id', 'sex','dob','dot','rot','dod','calfCount','donc','calfId','seq'); herd.defineDone(); array cdate [0:&MAXCDATE] _temporary_; do date = 1 to today(); do _n_ = 1 by 1 while (0=herd.find (key:date,key:_n_)); calfId = resolve ('%monotonic'); calfId = symget ('sysindex'); calfCount + 1; herd.remove(); if date-dob > 5*365 then do; * thank you for your service - rip; dod = date + 6*7*ranuni(&seed); * assign random disease test results to culled dams; _x_ = ranuni(&seed); if _x_ < 0.25 then do; dot = floor(dob + (dod-dob)*ranuni(&seed)); rot = +1; end; else if _x_ < 0.50 then do; dot = floor(dob + (dod-dob)*ranuni(&seed)); rot = -1; end; else do; dot = .; rot = .; end; cdate[0]+1; seq = cdate[0]; donc = .; herd.add(); end; else do; * prepare for next birth; donc = floor(date+9*30+10*ranuni(&seed)-5); cdate[donc]+1; seq=cdate[donc]; herd.add(); end; * identify the calf; motherId = id; id = calfId; calfId = ""; calfCount = 0; dob = date; dod = .; sex = ranuni (&seed) < 0.5; * cull the males and 53% of the females; if sex = 0 then donc = floor(dob + 20*30 + 15*30*ranuni(&seed)) * ( ranuni(&seed)<(1-0.53) ); else donc = 0; * cull at 6wk to 7mo; if donc = 0 then dod = floor(dob + 1.5*30 + 5.5*30*ranuni(&seed)); cdate[donc]+1; seq = cdate[donc]; if donc=0 then donc=.; * assign random disease test results ; _x_ = ranuni(&seed); if _x_ < 0.25 then rot = +1; else if _x_ < 0.80 then rot = -1; else rot = .; * assign random date of test ; if rot then do; if dod then /* culled */ dot = floor(dob + (dod-dob)*ranuni(&seed)); else /* future dam */ dot = 50 + 5*365*ranuni(&seed); end; else dot = .; herd.add(); end; end; herd.output(dataset:'herdHistory'); stop; format date yymmdd10.; run; proc sort data=herdHistory(drop=donc seq); by dob; run; /* *------------------------------------------------; * Determine Herd Count on a given day; * Method 1, SQL; data dates; do date = 1 to today(); output; end; format date yymmdd10.; run; proc sql; create table HerdCountOnDate as select date , (select count(id) from herdHistory where (dob^=. and dod^=. and dates.date between dob and dod) or (dob^=. and dod =. and dates.date >= dob) or (dod^=. and dob =. and dates.date <= dod) ) as herdsize from dates ; quit; *------------------------------------------------; * Determine Herd Count on a given day; * Method 2, DATA Step; data explode; set herdHistory (keep=dob dod id); if dob = . then dob=1; if dod = . then dod=today(); do date=dob to dod; output; end; keep date id; format date yymmdd10.; run; proc sql; create table HerdCountOnDateM2 as select date, count(*) as herdsize from explode group by date; quit; *------------------------------------------------; * Verify both methods results in identical counts; data differentCounts; merge HerdCountOnDate HerdCountOnDateM2 (rename=herdsize=herdsize2) ; by date; if herdsize ne . and herdsize2 ne . and herdsize ne herdsize2 then output; run; */ %let studyDate = '01JAN2003'D; proc sql; create view herd (label="Herd (as of &studyDate)") as select * from herdHistory where &studyDate between dob and dod or dob = . and &studyDate <= dod or dob <= &studyDate and dod = . ; create view peers (label="Self and peers - mother status (as of &studyDate)") as select animal.* , peer.id as peerid , peer.motherId as peerMotherId , ( select case when mama.dot > &studyDate then . else mama.rot end from herdHistory as mama where mama.id = peer.motherId ) as peerMotherStatus format=2. from herdHistory as animal , herdHistory as peer where animal.dob - peer.dob between 0 and 30 order by animal.dob, animal.id, peer.dob ; create table exposed(label="Calf age exposure to Shedding Dams (&studyDate)") as select unique herd.* , count(peers.id)-1 as nPeers , sum(peers.peerMotherStatus = +1) as nPeersPositive , sum(peers.peerMotherStatus = -1) as nPeersNegative , sum(peers.peerMotherStatus = .) as nPeersUnknown , case when calculated nPeersPositive > 0 then 'Exposed' when calculated nPeersNegative = calculated nPeers then 'Not-Exposed' else 'Exposure-Unknown' end as exposure length=20 from herd left join peers on herd.id = peers.id group by herd.id ; quit;