#  analyses of titanic data     jh 2008.01.20

library(Epi)

######  titanic data ###############
 
titanic = read.table("titanic.txt",header=T)
titanic[1:10,]
titanic[427:436,]

# if wish, drop passenger names;

titanic=titanic[,3:7]
summary(titanic)

# Define as Lexis object with timescales calendar time and age 

L=Lexis(entry=list(per=1912),exit=list( per=last, age=last-born),exit.status = dead,data=titanic)
names(L)
plot(L,cex.lab=2,xlab="year",cex.axis=1.5)
summary(L$lex.dur)

LL <- splitLexis(  L, breaks = seq(1911,2005,5), time.scale="per")
LL <- splitLexis( LL, breaks = c(0,1,seq(0,105,5)), time.scale="age" )
length(LL$lex.dur)
LL[1:30,]

LL.male=LL[LL$ifemale==0,];   LL.male[101:124,]
LL.female=LL[LL$ifemale==1,]; LL.female[101:124,]

# Tabulate the deaths and the person-years (using Lexis e.g.)

# jh did the sex.specific work this way 
# before happening on the much easier R aggregate fn.

# see ********* using aggregate" below for more flexible ways *****

tab(LL)
tab(LL.male)
tab(LL.female)

d=tapply( status(LL,"exit")==1, list( timeBand(LL,"age","left"),
                                    timeBand(LL,"per","left") ), sum )
                                                                        
d.female=tapply( status(LL.female,"exit")==1, list( timeBand(LL.female,"age","left"),
                                    timeBand(LL.female,"per","left") ), sum )
                                    
py=tapply( dur(LL),  list( timeBand(LL,"age","left"),
                        timeBand(LL,"per","left") ), sum )
                        
py.female=tapply( dur(LL.female),  list( timeBand(LL.female,"age","left"),
                        timeBand(LL.female,"per","left") ), sum )

d.male=d-d.female; py.male=py - py.female
dim(py);dim(py.male);dim(py.female)

py

# the 10 week old passenger 

y.range=c(1910,2013); a.range=c(-5,125)
plot(y.range,a.range,col="white",xlim=y.range,
   ylim=a.range,xlab="",ylab="",
   cex.lab=1.2,cex.axis=1.2,axes=FALSE,
   frame.plot=F,asp=(y.range[2]-y.range[1])/(a.range[2]=a.range[1]))
for (a in c(0,1, seq(5,105,5)) ){lines(c(1911,2006),c(a,a))}
for (y in seq(1911,2006,5)){lines(c(y,y),c(0,105)) }
for (y in seq(1911,2006,5)){
	if (y < 2000) mille ="1" else mille="2"
	if (y < 2000) century ="9" else century="0" 
	if (century=="0") decade="0" else decade=paste(floor((y-1900)/10))
	yr = (y-1900)-10*floor((y-1900)/10);
	text(y,118, mille,cex=0.8);
	text(y,115, century,cex=0.8);
	text(y,112, decade,cex=0.8);
	text(y,109, paste(yr),cex=0.8); }	
for (a in seq(0,105,10)){text(2012.5,a, paste(a),cex=0.8,adj=c(1,0.5))}
lines(c(1912,2003),c(0,91),col="red")
text(1912,-2,"1912",cex=2,col="darkgray");
text(2012.5,105,"Age",adj=c(1,0.5),cex=0.8);
text(1961,-2, "Year",cex=1.2);
lines(c(1912,1912),c(-1,0))
points(rep(1912,104),seq(1,104,1),cex=0.25,col="darkgray")

                        
                        
########## USA death.rates #########################################

us.rates=read.table("USmx-5x5.txt",header=T) ; dim(us.rates)

# augment .. use 1991-1995 for 1996-2000 and for 2001-2005

us.rates=rbind(us.rates,us.rates[451:475,],us.rates[451:475,])
dim(us.rates)

# drop 1901-05 and 1906-10

us.rates=us.rates[51:525,] ; dim(us.rates)

# array them (dropping ages 105-)

us.males  =array( us.rates[,4], c(25,19) ); us.males  = us.males[1:22,];
us.females=array( us.rates[,3], c(25,19) ); us.females= us.females[1:22,];
rm(us.rates)
                          

## log death rates by age band

par(mfrow = c(1,2), # 2 x 2 pictures on one plot
          pty = "s")       # square plotting region,
                           # independent of device size
# 1911-1915

plot(1:22,log2(us.males[1:22,1]),
    col="blue",main="log2 rates, 1911-1915",xlab="age band",
    ylim=c(-13,-1),cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="")
points(1:22,log2(us.females[1:22,1]),
    col="red",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2)

# 1991-1995

plot(1:22,log2(us.males[1:22,19]),col="blue",main="log2 rates, 1991-1995",xlab="age band",cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="",ylim=c(-13,-1))
points(1:22,log2(us.females[1:22,19]),col="red",cex.lab=2,cex.main=2,cex.axis=2,cex=2)

############# death rates in 3 age-bands, from 1910 to 1995

par(mfrow = c(1,3),pty = "s")  

plot(1:17,log2(us.males[1,1:17]),col="blue",ylim=c(-11,-3),main="log2 rates, 0-1 age band",xlab="17 quinquennia 1911-1995",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2)
points(1:17,log2(us.females[1,1:17]),col="red",cex=2)

plot(1:17,log2(us.females[7,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,cex=2,
ylab="",ylim=c(-11,-3),main="log2 rates, 25-29 age band",xlab="17 quinquennia 1911-1995")
points(1:17,log2(us.males[7,1:17]),col = "blue",cex=2)

plot(1:17,log2(us.females[15,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,ylab="",cex=2,
ylim=c(-11,-3),main="log2 rates,  65-69 age band",xlab="17 quinquennia 1911-1995")
points(1:17,log2(us.males[15,1:17]),col = "blue",cex=2)
                           
######### Death rates in Titanic survivors vs US General Population ##########

observed.deaths = sum(LL[,"lex.Xst"]); observed.deaths
person.time = sum(LL[,"lex.dur"]); person.time

d.tmp=d; d.tmp[is.na(d)]=0 ; d=d.tmp ; 
py.tmp=py; py.tmp[is.na(py.tmp)]=0 ; py=py.tmp ; 

d.tmp=d.male; d.tmp[is.na(d.tmp)]=0 ; d.male=d.tmp ; 
py.tmp=py.male; py.tmp[is.na(py.tmp)]=0 ; py.male=py.tmp ;

d.tmp=d.female; d.tmp[is.na(d.tmp)]=0 ; d.female=d.tmp ; 
py.tmp=py.female; py.tmp[is.na(py.tmp)]=0 ; py.female=py.tmp ; 

# all
  o=sum(apply(d,1,sum)); e=sum(apply(py*(us.males+us.females)/2,1,sum)); c(o,e,o/e)
# males
  o=sum(apply(d.male,1,sum)); e=sum(apply(py.male*us.males,1,sum)); c(o,e,o/e)
# females
  o=sum(apply(d.female,1,sum)); e=sum(apply(py.female*us.females,1,sum)); c(o,e,o/e)

# confounding by age ?  ave age of the p-y at beginning

ave.age.1914.females=sum( py.female[,1]*c(0.5,3,seq(7.5,102.5,5)) ) / 
                     sum(c(0.5,3,seq(7.5,102.5,5)))
ave.age.1914.males  =sum( py.male[,1]  *c(0.5,3,seq(7.5,102.5,5)) ) / 
                     sum(c(0.5,3,seq(7.5,102.5,5)))

# ?? confounding by age?

ave.age.1912.female=1912-mean(L[L$ifemale==1,"born"])
ave.age.1912.male=1912-mean(L[L$ifemale==0,"born"])
round(c(ave.age.1912.male,ave.age.1912.female),1)

# mantel-haenszel over age-period   females vs US, males vs US 

mh.female =sum( apply(d.female*(10^6)/(10^6+py.female),             1,sum) )/
           sum( apply(us.females*(10^6)*py.female/(10^6+py.female), 1,sum) )

mh.male =  sum( apply(d.male*(10^6)/(10^6+py.male),             1,sum) )/
           sum( apply(us.males*(10^6)*py.male/(10^6+py.male), 1,sum) )

c(mh.male, mh.female)

# what does a rate (hazard) ratio of hr=0.835 mean for remaining
# longevity of 30 yr old females in 1912 ? (i.e. facing age bands B and up)

# females 30 years and up ie B=8 and up, males from 25 years onwards B=7 

 who="F"; B=8; hr = 0.835; rates=us.females;  
 
 who="M"; B=7; hr = 0.94 ; rates=us.males; 

s.initial.us=1; s.initial.titanic=1;
ave.longevity.us  = 0; 
ave.longevity.titanic  = 0;
for (i in B:22) {
	s.final.us=s.initial.us*exp(- 5 * rates[i,(i-B)+1] );
	ave.longevity.us = ave.longevity.us + 
	                   5*(s.initial.us+s.final.us)/2;
	s.initial.us=s.final.us;
	
	s.final.titanic=s.initial.titanic*exp(- 5 * rates[i,(i-B)+1] * hr );
	ave.longevity.titanic = ave.longevity.titanic + 
	                        5*(s.initial.titanic+s.final.titanic)/2;
	s.initial.titanic=s.final.titanic;	
	}
who; round(c(ave.longevity.titanic, 
        ave.longevity.us, 
        ave.longevity.titanic - ave.longevity.us),1)
        
# passenger class distributions

table(titanic$class,titanic$ifemale)

summary(lm(titanic$class ~ titanic$ifemale))

##########

# rate regression  internal to titanic passengers

summary( glm(lex.Xst ~ ifemale + age + per , family=poisson, 
              offset = log(lex.dur),data=LL) )
              
              
# rate regression  US population only..

a.mid=c(0.5,3,seq(7.5,102.5,5)); 
y.mid=seq(1913,2003,5); 

m = c(); a=c(); since.1900=c(); r=c();


	  for (k in 0:1){
	  	for (i in 2:20){
	    for (j in 1:19){
		      a=c(a,a.mid[i]);
		      since.1900=c(since.1900,y.mid[j]-1900);
		      if (k==0) r=c(r,us.females[i,j])
		      else r=c(r,us.males[i,j]);
		      m=c(m,k)
		               }
	                 }
	         }

	   
summary( glm(log(r) ~ m + a + since.1900) )


# ************** using aggregate ***********
	
# deal with 01- and 1-4 .. ie convert to categories

LL$a.c = 1*(LL$age==0) + 2 * ( (LL$age > 0) & (LL$age < 6) ) + 
         (2+floor(LL$age/5)) * ( LL$age > 5) ; LL[1:30,c("age","a.c")]

t.us=aggregate(LL[,c("lex.Xst","lex.dur")],      
           by=list(a.cat=LL$a.c,
                   p.cat=floor((LL$per-1907)/5),
                   m.cat=1-LL$ifemale,
                   c.cat=LL$class ),
           sum) ; 
        
       dim(t.us); t.us[1:10,]
 
  #  augment with a column of rates and one of null expectations
   
  t.us$usrate = NA ;
  n.rows = length(t.us$a.cat)
  
  for (k in 1:n.rows) {
  	i = t.us[k,"a.cat"]; j = t.us[k,"p.cat"]; s=t.us[k,"m.cat"];
  	if (s==1) t.us$usrate[k] = us.males[i,j] else t.us$usrate[k] = us.females[i,j]
  	} 
  	
  t.us[1:20,]  
  
  # titanic (index) vs USA   hr.mh = O/E if experience in ref cat >> in index
  
  mh.num = sum( t.us$lex.Xst )
  mh.den = sum( t.us$usrate * t.us$lex.dur )
  mh.ratio = mh.num/mh.den
  
  c( round(mh.num,2), round(mh.den,2), round(mh.ratio,2) )
  
  # males .. subset then go back to hratio  t.us=t.us[t.us$m.cat==0,]  
              
  null.chi = abs ( (mh.num - mh.den ) / sqrt(mh.den) )
   
  c("ChiSq=",round(null.chi^2,2), "Chi",round(null.chi,2), "P-value", 
    round(2*(1-pnorm(null.chi)),3) ) 
  
  test.based.ci.95 = round(mh.ratio^(1+1.96*c(1,-1)/null.chi),2) ; test.based.ci.95   
     
  # --