# modified Jan 27, 2010 to include analysis of # FU-interval-specific event rates and rate ratios setwd("/Users/jameshanley/Documents/Courses/634/survival_analysis") setwd("/Users/jameshanley/Documents/Courses/634/survival") # fill in the path to YOUR datafile ds=read.csv("2gStentsIndivData.csv") # ds: jh shorthand for "dataset" # 1st and last 5 records head(ds) ; tail(ds) n=length(ds$tx) ; n # number of observations in ds ds[seq(1,n,100),] # list every 100th observation # events & person days in tx=1 (2g) and tx=0 (1g) groups length(ds$day[ds$tx==1]); sum(ds$day[ds$tx==1]); sum(ds$event[ds$tx==1]); length(ds$day[ds$tx==0]); sum(ds$day[ds$tx==0]); sum(ds$event[ds$tx==0]); # Kaplan-Meier curves library(survival) # have a look at programs within survival package.. # JH uses the Package Manager in Packages and Data menu # to look at contents of different packages... # survfit is used to fit Kaplan-Meier curve(s) # you must supply the time and event-indicator as a pair, as Surv object # in survival analysis, event indicator = 0 usually used for a censored observation, # and 1 for an observation terminated by the event of interest. ## if want Nelson-Aalen curve... ## na.fit = fit=survfit( Surv(ds$day,ds$event) ~ ds$tx, type="fleming-harrington") ## (documentation is not very clear on this...) ## str(na.fit) ## plot(na.fit) fit=survfit( Surv(ds$day,ds$event) ~ ds$tx) plot(fit) # not very elegant # structure of fit [object.oriented] helps to know # what the various parts of the results are stored under what name.. str(fit) # if use the Surv(time,status)~grouping variable form, it fits as many # curves as there are groups; they can then be accessed as [1], [2] etc.. # now, by being able to get at the relevant components, and using # general plot function, have control over the appearance of plot.. # plot your own plot(fit[1]$time/365,1-fit[1]$surv,type="s", xlab="Year", ylab="Cumulative Incidence",col="red") points(fit[2]$time/365,1-fit[2]$surv,type="s",col="blue") # ======= viewed from an "event-rate" perspective ============ # overall numbers of cases, person years, and event rates by treatment cases.tx1 = sum(ds$event[ds$tx==1]) # sum of 0's and 1s py.tx1 = (1/365)*sum(ds$day[ds$tx==1]) # easier to deal in p-years than p-days rate.tx1 = 100*(cases.tx1/py.tx1); round(c(cases.tx1,py.tx1,rate.tx1),2) cases.tx0 = sum(ds$event[ds$tx==0]) py.tx0 = (1/365)*sum(ds$day[ds$tx==0]) rate.tx0 = 100*(cases.tx0/py.tx0); round(c(cases.tx0,py.tx0,rate.tx0),2) # ********* FU.INTERVAL-SPECIFIC RATES .. # add 1 to all days, so no failures at exactly t=0 # (most survival analysis programs cannot handle [ie will exclude!] # events that coincide with entry) ds$day = ds$day+ 1 # to track the time-slices for each person, give each patient an ID ds$ID=1:length(ds$day) # cutpoints to create FU intervals for use in survSplit function cutpoints = seq(10,360,10) # use = c(...) if wsih to have unequal-width intervals split=survSplit(ds,cut=cutpoints,end="day",event="event",start="start.day") split$pt.days = split$day- split$start.day head(split) tail(split) # interval- and tx- specific numerators and denominators events.interval.tx=aggregate(split$event, by=list(tx=split$tx,mid.interval=split$start.day+5),FUN="sum") PT.interval.tx=aggregate(split$pt.days, by=list(tx=split$tx,mid.interval=split$start.day+5),FUN="sum") events.interval.tx PT.interval.tx dta= cbind(PT.interval.tx,events.interval.tx$x) names(dta)[3]="Pt.days" names(dta)[4]="events" dta # fit a "multiplicative rates" model to the data starting at specified.start.of.FU = 30 # (alterable... ) fit=glm(events~tx+mid.interval, family=poisson,offset=log(Pt.days), data=subset(dta,mid.interval >= specified.start.of.FU) ) summary(fit) exp(fit$coefficients) # non-multiplicative ??? look at coeff. of product term FIT=glm(events~tx + mid.interval +tx*mid.interval, family=poisson,offset=log(Pt.days), data=subset(dta,mid.interval >= specified.start.of.FU) ) summary(FIT) # 1st 10 days only FIT10=glm(events~tx + mid.interval, family=poisson,offset=log(Pt.days), data=subset(dta,mid.interval < 10 ) ) summary(FIT10) dta[1:2,] log( (18/8820) / (33/8806) ) # log RateRatio sqrt(1/18 + 1/33) # Std. Error of log RateRatio -- # compare wth Std. Error in regression # (est. of coeff. for mid.interval not possible: no variation in mid.interval)