# modified Jan 27, 2010 to include analysis of
# FU-interval-specific event rates and rate ratios

setwd("/Users/jameshanley/Documents/Courses/634/survival_analysis") 
setwd("/Users/jameshanley/Documents/Courses/634/survival") 

  # fill in the path to YOUR datafile

ds=read.csv("2gStentsIndivData.csv") # ds: jh shorthand for "dataset"

# 1st and last 5 records

head(ds) ; tail(ds)

n=length(ds$tx) ; n # number of observations in ds

ds[seq(1,n,100),] # list every 100th observation

# events & person days in tx=1 (2g) and tx=0 (1g) groups

length(ds$day[ds$tx==1]);
sum(ds$day[ds$tx==1]); 
sum(ds$event[ds$tx==1]);

length(ds$day[ds$tx==0]);
sum(ds$day[ds$tx==0]);
sum(ds$event[ds$tx==0]);

# Kaplan-Meier curves

library(survival)

# have a look at programs within survival package..
# JH uses the Package Manager in Packages and Data menu
# to look at contents of different packages...

# survfit is used to fit Kaplan-Meier curve(s)
# you must supply the time and event-indicator as a pair, as Surv object
# in survival analysis, event indicator = 0 usually used for a censored observation,
# and 1 for an observation terminated by the event of interest.

## if want Nelson-Aalen curve...
## na.fit = fit=survfit( Surv(ds$day,ds$event) ~ ds$tx, type="fleming-harrington")
## (documentation is not very clear on this...)

## str(na.fit)

## plot(na.fit) 

fit=survfit( Surv(ds$day,ds$event) ~ ds$tx)

plot(fit)  # not very elegant


# structure of fit [object.oriented] helps to know
# what the various parts of the results are stored under what name..

str(fit) 

# if use the Surv(time,status)~grouping variable form, it fits as many
# curves as there are groups; they can then be accessed as [1], [2] etc.. 

# now, by being able to get at the relevant components, and using
# general plot function, have control over the appearance of plot..

# plot your own

plot(fit[1]$time/365,1-fit[1]$surv,type="s",
  xlab="Year", ylab="Cumulative Incidence",col="red")
points(fit[2]$time/365,1-fit[2]$surv,type="s",col="blue")


# ======= viewed from an "event-rate" perspective ============

# overall numbers of cases, person years, and event rates by treatment

cases.tx1  = sum(ds$event[ds$tx==1]) # sum of 0's and 1s
py.tx1 = (1/365)*sum(ds$day[ds$tx==1]) # easier to deal in p-years than p-days
rate.tx1 = 100*(cases.tx1/py.tx1);
round(c(cases.tx1,py.tx1,rate.tx1),2)

cases.tx0  = sum(ds$event[ds$tx==0])
py.tx0 = (1/365)*sum(ds$day[ds$tx==0])
rate.tx0 = 100*(cases.tx0/py.tx0);
round(c(cases.tx0,py.tx0,rate.tx0),2)


# *********  FU.INTERVAL-SPECIFIC RATES .. 

# add 1 to all days, so no failures at exactly t=0
# (most survival analysis programs cannot handle [ie will exclude!] 
# events that coincide with entry)

ds$day = ds$day+ 1

# to track the time-slices for each person,  give each patient an ID 

ds$ID=1:length(ds$day)

# cutpoints to create FU intervals for use in survSplit function

cutpoints = seq(10,360,10) # use = c(...) if wsih to have unequal-width intervals

split=survSplit(ds,cut=cutpoints,end="day",event="event",start="start.day")
split$pt.days = split$day- split$start.day
head(split)
tail(split)

# interval- and tx- specific numerators and denominators

events.interval.tx=aggregate(split$event,
  by=list(tx=split$tx,mid.interval=split$start.day+5),FUN="sum")
  
PT.interval.tx=aggregate(split$pt.days,
  by=list(tx=split$tx,mid.interval=split$start.day+5),FUN="sum")

events.interval.tx
PT.interval.tx

dta= cbind(PT.interval.tx,events.interval.tx$x)
names(dta)[3]="Pt.days"
names(dta)[4]="events"

dta

# fit a "multiplicative rates" model to the data starting at

 specified.start.of.FU =   30  # (alterable... )  

fit=glm(events~tx+mid.interval,
  family=poisson,offset=log(Pt.days), 
  data=subset(dta,mid.interval >= specified.start.of.FU) )
summary(fit)

exp(fit$coefficients)

# non-multiplicative ???  look at coeff. of product term

FIT=glm(events~tx + mid.interval +tx*mid.interval,
  family=poisson,offset=log(Pt.days), 
  data=subset(dta,mid.interval >= specified.start.of.FU) )
summary(FIT)

# 1st 10 days only

FIT10=glm(events~tx + mid.interval,
  family=poisson,offset=log(Pt.days), 
  data=subset(dta,mid.interval < 10 ) )
summary(FIT10)

dta[1:2,]

log( (18/8820) / (33/8806) ) # log RateRatio

sqrt(1/18 + 1/33)  # Std. Error of log RateRatio -- 
# compare wth Std. Error in regression
# (est. of coeff. for mid.interval not possible: no variation in mid.interval)