setwd("/Users/jameshanley/Documents/Courses/634/JUPITER") # fill in the path to YOUR datafile ds=read.table("JUPITERdataset.txt") # ds: jh shorthand for "dataset" n=length(ds$tx) ; n # number of observations in ds ds[seq(1,n,500),] # list every 500th observation # events & person days in tx=1 (statin) and tx=0 (placebo) groups sum(ds$terminated.by.event[ds$tx==1]);length(ds$last.day[ds$tx==1]); sum(ds$terminated.by.event[ds$tx==0]);length(ds$last.day[ds$tx==0]); # Kaplan-Meier curves library(survival) # have a look at programs within survival package.. # JH uses the Package Manager in Packages and Data menu # to look at contents of different packages... # survfit is used to fit Kaplan-Meier curve(s) # you must supply the time and event-indicator as a pair, as Surv object # in survival analysis, event indicator = 0 usually used for a censored observation, # and 1 for an observation terminated by the event of interest. fit=survfit( Surv(ds$last.day,ds$terminated.by.event) ~ ds$tx) plot(fit) # why so messy ... ?? # structure of fit [object.oriented] helps to know # what the various parts of the results are stored under what name.. str(fit) # if use the Surv(time,status)~grouping variable form, it fits as many # curves as there are groups; they can then be accessed as [1], [2] etc.. # now, by being able to get at the relevant components, and using # general plot function, have control over the appearance of plot.. plot(fit[1]$time/365,1-fit[1]$surv,type="s", xlab="Year", ylab="Cumulative Incidence") points(fit[2]$time/365,1-fit[2]$surv,type="s") # ======= viewed from an "event-rate" perspective ============ # overall numbers of cases, person years, and event rates by treatment cases.tx1 = sum(ds$terminated.by.event[ds$tx==1]) # sum of 0's and 1s py.tx1 = (1/365)*sum(ds$last.day[ds$tx==1]) # easier to deal in p-years than p-days rate.tx1 = 100*(cases.tx1/py.tx1); round(c(cases.tx1,py.tx1,rate.tx1),2) cases.tx0 = sum(ds$terminated.by.event[ds$tx==0]) py.tx0 = (1/365)*sum(ds$last.day[ds$tx==0]) rate.tx0 = 100*(cases.tx0/py.tx0); round(c(cases.tx0,py.tx0,rate.tx0),2) # ********* WINDOW-SPECIFIC RATES .. window width = 1 year # printed excerpts are in "last in, first out" order # creates 4 vectors # WARNING: this nest piece of code, which splits each observation # into the year by year contributions made by each subject, # was easy to code, but is somewhat inneficient.. and may take # a minute or so to run.. still faster than by hand! # to let you know where it is at, it prints selected messages.. # printed excerpts are in "last in, first out" order # IF this code takes too long on your computer, # you can always read in the already-baked result (the time.slice.ds dataset) # from the course website... verbose=T Tx=c(); years=c(); e=c(); year=c(); # 4 vectors to be created for (i in 1:n){ yrs=ceiling(ds$last.day[i]/365) if(verbose & i == (100*floor(i/100)) ) print(paste( "subject",toString(i),"contributes to the first",toString(yrs),"1-year-wide window(s)")) for (yr in 1:yrs){ if (yr> # a year older... << >> not quite since not all 18000 have contributed to all years # Dr Genest, in his seminar on this study, in RVH in Fall, told us there was an outcry # and protest against this study in the US after the 1st year, and so Astra Zeneca # then began to recruit manymore of its patients from overseas.. so those now in year 5 # may be of a different provenance from those who who are coming along behind them...) R CODE to compute year-specific statistics # for you to supply .. # question: # how similar are the year-specific rate ratios comparing statin vs placebo. ? # are they homogeneous enough that you think it makes sense to combine them # into 1 summary rate ratio? # (remember than some of the jumpig around will be because of random variation.. # the Poisson numerators are not all that big in higher years, and so the rate # ratio is not that stable.) # If you consider them the rate ratios "poolable", then pool them # the dataset format makes it easy to program the summary ratio given # in Rothman 2002, eqn # 8-5 for summary Incidence Ratio # [there shouldn't be "confounding" by year, but just the same... ] # since RAMQ will (and mds) be more interested -- for cost effectiveness -- # in rate differences, while you are at it, use equation 8-4 to calculate # a summary Incidence difference # [again, the summary one probably won't be v. different from the crude one.. # but its worth being aware of how these calculations would need to be # done in a non-experimental situation, or one where an rct dataset is smaller, # and there could be major treatment imbalances on a stronger variable than # year...) # === last point... (should actually be sooner) === # double check that nothing got lost in the programming of the time- # slicing.. compare overall totals with those at beginning of this section.. # can also use for crude rate ratios etc ... sapply(summarystats[2:5],sum) # ========