#  analyses of titanic data     jh 2017.11.06

setwd("/Users/jameshanley/Desktop/bios601-2017/Week11Ch06")

library(Epi)

######  titanic data ###############
 
titanic = read.table("titanic.txt",header=T)
titanic[1:10,]
titanic[427:436,]
summary(titanic)
titanic[titanic$last <= titanic$born,]

# if wish, drop passenger names;

titanic=titanic[,3:7]
summary(titanic)

# Define as Lexis object with timescales calendar time and age 

L=Lexis(entry=list(per=1912),
        exit=list( per=last, age=last-born),
        exit.status = dead,data=titanic)
names(L)

plot(L,cex.lab=2,xlab="year",cex.axis=1.25)
summary(L$lex.dur)
abline(v=seq(1910,2010,10),col="lightblue",lwd=0.5)
abline(h=seq(0,100,10),col="lightblue",lwd=0.5)

LL <- splitLexis(  L, breaks = seq(1911,2005,5), time.scale="per")
LL <- splitLexis( LL, breaks = c(0,1,seq(0,105,5)), time.scale="age" )
length(LL$lex.dur)
LL[1:30,]

LL.male=LL[LL$ifemale==0,];   LL.male[101:124,]
LL.female=LL[LL$ifemale==1,]; LL.female[101:124,]

sum(LL.male$lex.dur) ; sum(LL.female$lex.dur)                       
                        
########## USA death.rates #########################################

us.rates=read.table("USmx-5x5.txt",header=T) ; dim(us.rates)
head(us.rates); tail(us.rates)

# augment .. use 1991-1995 for 1996-2000 and for 2001-2005

us.rates=rbind(us.rates,us.rates[451:475,],us.rates[451:475,])
dim(us.rates)

# drop 1901-05 and 1906-10

us.rates=us.rates[51:525,] ; dim(us.rates)

# array them (dropping ages 105-)

us.males  =array( us.rates[,4], c(25,19) ); us.males  = us.males[1:22,];
us.females=array( us.rates[,3], c(25,19) ); us.females= us.females[1:22,];
rm(us.rates)
                          
## log death rates by age band

par(mfrow = c(1,2), # 1 x 2 pictures on one plot
          pty = "s")       # square plotting region,
                           # independent of device size
# 1911-1915

plot(1:22,log2(us.males[1:22,1]),
    col="blue",main="log2 rates, 1911-1915",xlab="age band",   ylim=c(-13,-1),cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="")
points(1:22,log2(us.females[1:22,1]),
col="red",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2)

# 1991-1995

plot(1:22,log2(us.males[1:22,19]),
 col="blue",main="log2 rates, 1991-1995",
 xlab="age band",cex.lab=2,
 cex.main=2,cex.axis=2,
 ylab="",ylim=c(-13,-1),pch=19,cex=1.25)
points(1:22,log2(us.females[1:22,19]),
  col="red",cex=1.25,pch=19)

##### death rates in 3 age-bands, from 1910 to 1995

par(mfrow = c(1,3),pty = "s")  

plot(1:17,log2(us.males[1,1:17]),
  col="blue",ylim=c(-11,-3),
  main="log2 rates, 0-1 age band",
  xlab="17 quinquennia 1911-1995",
  ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=1.25,pch=19)
points(1:17,log2(us.females[1,1:17]),
  col="red",cex=1.25, pch=19)

plot(1:17,log2(us.females[7,1:17]),
  col = "red",cex.lab=2,cex.main=2,cex.axis=2,
  ylab="",ylim=c(-11,-3),
  main="log2 rates, 25-29 age band",
  xlab="17 quinquennia 1911-1995", 
  pch=19,cex=1.25)
points(1:17,log2(us.males[7,1:17]),col = "blue",
  cex=1.25,pch=19)

plot(1:17,log2(us.females[15,1:17]),
   col = "red",cex.lab=2,cex.main=2,
   cex.axis=2,ylab="",cex=1.25,pch=19,
   ylim=c(-11,-3),
   main="log2 rates,  65-69 age band",xlab="17 quinquennia 1911-1995")
points(1:17,log2(us.males[15,1:17]),
 col = "blue",cex=1.25,pch=19)
                           

#### Death rates in Titanic survivors vs US General Population #####

# Overall no. of deaths and p-t

observed.deaths = sum(LL[,"lex.Xst"]); observed.deaths
person.time = sum(LL[,"lex.dur"]); person.time


# *** aggregates within age-period cells, using aggregate ***********
	
# age categories (a.c for short) 

# deal with 01- and 1-4 .. ie convert to categories

LL$a.c = 1*(LL$age==0) + 2 * ( (LL$age > 0) & (LL$age < 6) ) + 
         (2+floor(LL$age/5)) * ( LL$age > 5) ; LL[1:30,c("age","a.c")]

#  t  is short for total  

# p is short for (calendar) period ; m.cat for whether male

t.us=aggregate(LL[,c("lex.Xst","lex.dur")],      
           by=list(a.cat=LL$a.c,
                   p.cat=floor((LL$per-1907)/5),
                   m.cat=1-LL$ifemale,
                   c.cat=LL$class ),
           sum) ; 
        
       dim(t.us); t.us[1:10,]
 
  #  augment with a column of rates and one of null expectations
   
  t.us$usrate = NA ;
  n.rows = length(t.us$a.cat)
  
  for (k in 1:n.rows) {
  	i = t.us[k,"a.cat"]; j = t.us[k,"p.cat"]; s=t.us[k,"m.cat"];
  	if (s==1) t.us$usrate[k] = us.males[i,j] else t.us$usrate[k] = us.females[i,j]
  	} 
  	
  t.us[1:20,]  
  
  # titanic (index) vs USA(ref) hr = O/E (see footnote in assignment)
  
  ( O = sum( t.us$lex.Xst ) ) # observed deaths
  ( E = sum( t.us$usrate * t.us$lex.dur ) ) # expected deaths
  ( O.E.ratio = O/E )
  
  ( round(c(O,E,O.E.ratio),2) )
  
  # males .. subset then go back to hratio  t.us=t.us[t.us$m.cat==0,]  
              
  ( null.chi = abs ( (O - E ) / sqrt(E) ) )
   
  c("ChiSq=",round(null.chi^2,2), "Chi",round(null.chi,2), "P-value", 
    round(2*(1-pnorm(null.chi)),4) ) 
  
  ( test.based.ci.95 = round(O.E.ratio^(1+1.96*c(1,0,-1)/null.chi),2) );    
     
# Q6 is limited to the Titanic passengers
# but need to segregate the experience by class
# as well as gender, age and year

# so, aggregate the relevant items in LL,
# by class, gender, age, and year

# first, the person-time, ie durations in each cell

# death = 1 if eXit status is 1 (dead) at end of slice
# death = 0 if eXit status is 0 (alive)

# 10 yrs wide calendar year bins, 5 yrs wide age bins
  

D = tapply( status(LL,"exit")==1,
  list( period=10*floor(LL$per/10)+5,
        age=5*floor(LL$age/5)+2,
        ifemale=LL$ifemale,
        class=LL$class ) , sum )

PY= tapply( dur(LL),
  list( period=10*floor(LL$per/10)+5,
        age=5*floor(LL$age/5)+2,
        ifemale=LL$ifemale,
        class=LL$class ) , sum )


#deaths per year
sum(D,na.rm=TRUE)/sum(PY,na.rm=TRUE)

#years per (until) death 
sum(PY,na.rm=TRUE)/sum(D,na.rm=TRUE)

## curious: where are the person years, by class
  
PY. = tapply(LL$lex.dur,
  list(age=5*floor(LL$age/5),class=LL$class),sum)

plot( as.numeric(dimnames(PY.)$age), PY.[,1] ,
  col=1,pch="1", ylab="Number of PY",xlab="ageBin")
points( as.numeric(dimnames(PY.)$age), PY.[,2] ,
  col=2,pch="2", ylab="Number of PY",xlab="ageBin")
points( as.numeric(dimnames(PY.)$age), PY.[,3] ,
  col=3,pch="3", ylab="Number of PY",xlab="ageBin")
   
# MH comparison of classes 2 & 3 versus 1 (ref),
# matched on gender, binned-age and binned-period
	       
for (class in 2:3) {
	
  print( noquote(" "))
  print( noquote(c("class:",class," vs. class 1")))	
  print( noquote(" "))
  
  
  mh.num = sum(D[,,,class]*PY[,,,1]/
	               (PY[,,,class]+PY[,,,1]),na.rm=TRUE )
	               
  mh.den = sum(D[,,,1]*PY[,,,class]/
	               (PY[,,,class]+PY[,,,1]),na.rm=TRUE )
	               
  print(round(c(mh.num,mh.den,mh.num/mh.den),2))
}

#  if you want to fit some Poisson regression models
# instead of matching, you could just use the 
# LL dataset directly...  e.g.

summary(glm(lex.Xst ~ ifemale+age+per+as.factor(class) +
            offset(log(lex.dur)),
            family=poisson, data=LL) )

# you would need to convert the beta.hats to
# multiplicative factors.. i.e by what percentage
# do neighbouring years of ages or genders differ?