# analyses of titanic data jh 2008.01.20 library(Epi) ###### titanic data ############### titanic = read.table("titanic.txt",header=T) titanic[1:10,] titanic[427:436,] # if wish, drop passenger names; titanic=titanic[,3:7] summary(titanic) # Define as Lexis object with timescales calendar time and age L=Lexis(entry=list(per=1912),exit=list( per=last, age=last-born),exit.status = dead,data=titanic) names(L) plot(L,cex.lab=2,xlab="year",cex.axis=1.5) summary(L$lex.dur) LL <- splitLexis( L, breaks = seq(1911,2005,5), time.scale="per") LL <- splitLexis( LL, breaks = c(0,1,seq(0,105,5)), time.scale="age" ) length(LL$lex.dur) LL[1:30,] LL.male=LL[LL$ifemale==0,]; LL.male[101:124,] LL.female=LL[LL$ifemale==1,]; LL.female[101:124,] # Tabulate the deaths and the person-years (using Lexis e.g.) # jh did the sex.specific work this way # before happening on the much easier R aggregate fn. # see ********* using aggregate" below for more flexible ways ***** tab(LL) tab(LL.male) tab(LL.female) d=tapply( status(LL,"exit")==1, list( timeBand(LL,"age","left"), timeBand(LL,"per","left") ), sum ) d.female=tapply( status(LL.female,"exit")==1, list( timeBand(LL.female,"age","left"), timeBand(LL.female,"per","left") ), sum ) py=tapply( dur(LL), list( timeBand(LL,"age","left"), timeBand(LL,"per","left") ), sum ) py.female=tapply( dur(LL.female), list( timeBand(LL.female,"age","left"), timeBand(LL.female,"per","left") ), sum ) d.male=d-d.female; py.male=py - py.female dim(py);dim(py.male);dim(py.female) py # the 10 week old passenger y.range=c(1910,2013); a.range=c(-5,125) plot(y.range,a.range,col="white",xlim=y.range, ylim=a.range,xlab="",ylab="", cex.lab=1.2,cex.axis=1.2,axes=FALSE, frame.plot=F,asp=(y.range[2]-y.range[1])/(a.range[2]=a.range[1])) for (a in c(0,1, seq(5,105,5)) ){lines(c(1911,2006),c(a,a))} for (y in seq(1911,2006,5)){lines(c(y,y),c(0,105)) } for (y in seq(1911,2006,5)){ if (y < 2000) mille ="1" else mille="2" if (y < 2000) century ="9" else century="0" if (century=="0") decade="0" else decade=paste(floor((y-1900)/10)) yr = (y-1900)-10*floor((y-1900)/10); text(y,118, mille,cex=0.8); text(y,115, century,cex=0.8); text(y,112, decade,cex=0.8); text(y,109, paste(yr),cex=0.8); } for (a in seq(0,105,10)){text(2012.5,a, paste(a),cex=0.8,adj=c(1,0.5))} lines(c(1912,2003),c(0,91),col="red") text(1912,-2,"1912",cex=2,col="darkgray"); text(2012.5,105,"Age",adj=c(1,0.5),cex=0.8); text(1961,-2, "Year",cex=1.2); lines(c(1912,1912),c(-1,0)) points(rep(1912,104),seq(1,104,1),cex=0.25,col="darkgray") ########## USA death.rates ######################################### us.rates=read.table("USmx-5x5.txt",header=T) ; dim(us.rates) # augment .. use 1991-1995 for 1996-2000 and for 2001-2005 us.rates=rbind(us.rates,us.rates[451:475,],us.rates[451:475,]) dim(us.rates) # drop 1901-05 and 1906-10 us.rates=us.rates[51:525,] ; dim(us.rates) # array them (dropping ages 105-) us.males =array( us.rates[,4], c(25,19) ); us.males = us.males[1:22,]; us.females=array( us.rates[,3], c(25,19) ); us.females= us.females[1:22,]; rm(us.rates) ## log death rates by age band par(mfrow = c(1,2), # 2 x 2 pictures on one plot pty = "s") # square plotting region, # independent of device size # 1911-1915 plot(1:22,log2(us.males[1:22,1]), col="blue",main="log2 rates, 1911-1915",xlab="age band", ylim=c(-13,-1),cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="") points(1:22,log2(us.females[1:22,1]), col="red",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2) # 1991-1995 plot(1:22,log2(us.males[1:22,19]),col="blue",main="log2 rates, 1991-1995",xlab="age band",cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="",ylim=c(-13,-1)) points(1:22,log2(us.females[1:22,19]),col="red",cex.lab=2,cex.main=2,cex.axis=2,cex=2) ############# death rates in 3 age-bands, from 1910 to 1995 par(mfrow = c(1,3),pty = "s") plot(1:17,log2(us.males[1,1:17]),col="blue",ylim=c(-11,-3),main="log2 rates, 0-1 age band",xlab="17 quinquennia 1911-1995",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2) points(1:17,log2(us.females[1,1:17]),col="red",cex=2) plot(1:17,log2(us.females[7,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,cex=2, ylab="",ylim=c(-11,-3),main="log2 rates, 25-29 age band",xlab="17 quinquennia 1911-1995") points(1:17,log2(us.males[7,1:17]),col = "blue",cex=2) plot(1:17,log2(us.females[15,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,ylab="",cex=2, ylim=c(-11,-3),main="log2 rates, 65-69 age band",xlab="17 quinquennia 1911-1995") points(1:17,log2(us.males[15,1:17]),col = "blue",cex=2) ######### Death rates in Titanic survivors vs US General Population ########## observed.deaths = sum(LL[,"lex.Xst"]); observed.deaths person.time = sum(LL[,"lex.dur"]); person.time d.tmp=d; d.tmp[is.na(d)]=0 ; d=d.tmp ; py.tmp=py; py.tmp[is.na(py.tmp)]=0 ; py=py.tmp ; d.tmp=d.male; d.tmp[is.na(d.tmp)]=0 ; d.male=d.tmp ; py.tmp=py.male; py.tmp[is.na(py.tmp)]=0 ; py.male=py.tmp ; d.tmp=d.female; d.tmp[is.na(d.tmp)]=0 ; d.female=d.tmp ; py.tmp=py.female; py.tmp[is.na(py.tmp)]=0 ; py.female=py.tmp ; # all o=sum(apply(d,1,sum)); e=sum(apply(py*(us.males+us.females)/2,1,sum)); c(o,e,o/e) # males o=sum(apply(d.male,1,sum)); e=sum(apply(py.male*us.males,1,sum)); c(o,e,o/e) # females o=sum(apply(d.female,1,sum)); e=sum(apply(py.female*us.females,1,sum)); c(o,e,o/e) # confounding by age ? ave age of the p-y at beginning ave.age.1914.females=sum( py.female[,1]*c(0.5,3,seq(7.5,102.5,5)) ) / sum(c(0.5,3,seq(7.5,102.5,5))) ave.age.1914.males =sum( py.male[,1] *c(0.5,3,seq(7.5,102.5,5)) ) / sum(c(0.5,3,seq(7.5,102.5,5))) # ?? confounding by age? ave.age.1912.female=1912-mean(L[L$ifemale==1,"born"]) ave.age.1912.male=1912-mean(L[L$ifemale==0,"born"]) round(c(ave.age.1912.male,ave.age.1912.female),1) # mantel-haenszel over age-period females vs US, males vs US mh.female =sum( apply(d.female*(10^6)/(10^6+py.female), 1,sum) )/ sum( apply(us.females*(10^6)*py.female/(10^6+py.female), 1,sum) ) mh.male = sum( apply(d.male*(10^6)/(10^6+py.male), 1,sum) )/ sum( apply(us.males*(10^6)*py.male/(10^6+py.male), 1,sum) ) c(mh.male, mh.female) # what does a rate (hazard) ratio of hr=0.835 mean for remaining # longevity of 30 yr old females in 1912 ? (i.e. facing age bands B and up) # females 30 years and up ie B=8 and up, males from 25 years onwards B=7 who="F"; B=8; hr = 0.835; rates=us.females; who="M"; B=7; hr = 0.94 ; rates=us.males; s.initial.us=1; s.initial.titanic=1; ave.longevity.us = 0; ave.longevity.titanic = 0; for (i in B:22) { s.final.us=s.initial.us*exp(- 5 * rates[i,(i-B)+1] ); ave.longevity.us = ave.longevity.us + 5*(s.initial.us+s.final.us)/2; s.initial.us=s.final.us; s.final.titanic=s.initial.titanic*exp(- 5 * rates[i,(i-B)+1] * hr ); ave.longevity.titanic = ave.longevity.titanic + 5*(s.initial.titanic+s.final.titanic)/2; s.initial.titanic=s.final.titanic; } who; round(c(ave.longevity.titanic, ave.longevity.us, ave.longevity.titanic - ave.longevity.us),1) # passenger class distributions table(titanic$class,titanic$ifemale) summary(lm(titanic$class ~ titanic$ifemale)) ########## # rate regression internal to titanic passengers summary( glm(lex.Xst ~ ifemale + age + per , family=poisson, offset = log(lex.dur),data=LL) ) # rate regression US population only.. a.mid=c(0.5,3,seq(7.5,102.5,5)); y.mid=seq(1913,2003,5); m = c(); a=c(); since.1900=c(); r=c(); for (k in 0:1){ for (i in 2:20){ for (j in 1:19){ a=c(a,a.mid[i]); since.1900=c(since.1900,y.mid[j]-1900); if (k==0) r=c(r,us.females[i,j]) else r=c(r,us.males[i,j]); m=c(m,k) } } } summary( glm(log(r) ~ m + a + since.1900) ) # ************** using aggregate *********** # deal with 01- and 1-4 .. ie convert to categories LL$a.c = 1*(LL$age==0) + 2 * ( (LL$age > 0) & (LL$age < 6) ) + (2+floor(LL$age/5)) * ( LL$age > 5) ; LL[1:30,c("age","a.c")] t.us=aggregate(LL[,c("lex.Xst","lex.dur")], by=list(a.cat=LL$a.c, p.cat=floor((LL$per-1907)/5), m.cat=1-LL$ifemale, c.cat=LL$class ), sum) ; dim(t.us); t.us[1:10,] # augment with a column of rates and one of null expectations t.us$usrate = NA ; n.rows = length(t.us$a.cat) for (k in 1:n.rows) { i = t.us[k,"a.cat"]; j = t.us[k,"p.cat"]; s=t.us[k,"m.cat"]; if (s==1) t.us$usrate[k] = us.males[i,j] else t.us$usrate[k] = us.females[i,j] } t.us[1:20,] # titanic (index) vs USA hr.mh = O/E if experience in ref cat >> in index mh.num = sum( t.us$lex.Xst ) mh.den = sum( t.us$usrate * t.us$lex.dur ) mh.ratio = mh.num/mh.den c( round(mh.num,2), round(mh.den,2), round(mh.ratio,2) ) # males .. subset then go back to hratio t.us=t.us[t.us$m.cat==0,] null.chi = abs ( (mh.num - mh.den ) / sqrt(mh.den) ) c("ChiSq=",round(null.chi^2,2), "Chi",round(null.chi,2), "P-value", round(2*(1-pnorm(null.chi)),3) ) test.based.ci.95 = round(mh.ratio^(1+1.96*c(1,-1)/null.chi),2) ; test.based.ci.95 # --