# analyses of titanic data jh 2015.11.07 library(Epi) ###### titanic data ############### titanic = read.table("titanic.txt",header=T) titanic[1:10,] titanic[427:436,] # if wish, drop passenger names; titanic=titanic[,3:7] summary(titanic) # Define as Lexis object with timescales calendar time and age L=Lexis(entry=list(per=1912),exit=list( per=last, age=last-born),exit.status = dead,data=titanic) names(L) plot(L,cex.lab=2,xlab="year",cex.axis=1.5) summary(L$lex.dur) LL <- splitLexis( L, breaks = seq(1911,2005,5), time.scale="per") LL <- splitLexis( LL, breaks = c(0,1,seq(0,105,5)), time.scale="age" ) length(LL$lex.dur) LL[1:30,] # change names : lex.dur -> n.years # lex.Xst -> death names(LL)[c(4,6)] = c("n.years","death") LL[1:30,] LL.male=LL[LL$ifemale==0,]; LL.male[101:124,] LL.female=LL[LL$ifemale==1,]; LL.female[101:124,] ########## USA death.rates ######################################### us.rates=read.table("USmx-5x5.txt",header=T) ; dim(us.rates) # augment .. use 1991-1995 for 1996-2000 and for 2001-2005 us.rates=rbind(us.rates,us.rates[451:475,],us.rates[451:475,]) dim(us.rates) # drop 1901-05 and 1906-10 us.rates=us.rates[51:525,] ; dim(us.rates) # array them (dropping ages 105-) us.males =array( us.rates[,4], c(25,19) ); us.males = us.males[1:22,]; us.females=array( us.rates[,3], c(25,19) ); us.females= us.females[1:22,]; rm(us.rates) ## log death rates by age band par(mfrow = c(1,2), # 1 x 2 pictures on one plot pty = "s") # square plotting region, # independent of device size # 1911-1915 plot(1:22,log2(us.males[1:22,1]), col="blue",main="log2 rates, 1911-1915",xlab="age band", ylim=c(-13,-1),cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="") points(1:22,log2(us.females[1:22,1]), col="red",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2) # 1991-1995 plot(1:22,log2(us.males[1:22,19]),col="blue",main="log2 rates, 1991-1995",xlab="age band",cex.lab=2,cex.main=2,cex.axis=2,cex=2,ylab="",ylim=c(-13,-1)) points(1:22,log2(us.females[1:22,19]),col="red",cex.lab=2,cex.main=2,cex.axis=2,cex=2) ############# death rates in 3 age-bands, from 1910 to 1995 par(mfrow = c(1,3),pty = "s") plot(1:17,log2(us.males[1,1:17]),col="blue",ylim=c(-11,-3),main="log2 rates, 0-1 age band",xlab="17 quinquennia 1911-1995",ylab="",cex.lab=2,cex.main=2,cex.axis=2,cex=2) points(1:17,log2(us.females[1,1:17]),col="red",cex=2) plot(1:17,log2(us.females[7,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,cex=2, ylab="",ylim=c(-11,-3),main="log2 rates, 25-29 age band",xlab="17 quinquennia 1911-1995") points(1:17,log2(us.males[7,1:17]),col = "blue",cex=2) plot(1:17,log2(us.females[15,1:17]),col = "red",cex.lab=2,cex.main=2,cex.axis=2,ylab="",cex=2, ylim=c(-11,-3),main="log2 rates, 65-69 age band",xlab="17 quinquennia 1911-1995") points(1:17,log2(us.males[15,1:17]),col = "blue",cex=2) ######### Death rates in Titanic survivors vs US General Population ########## # Overall no. of deaths and p-t observed.deaths = sum(LL[,"n.years"]); observed.deaths person.time = sum(LL[,"death"]); person.time # ************** aggregates within age-period cells, using aggregate *********** # age categories (a.c for short) # deal with 01- and 1-4 .. ie convert to categories LL$a.c = 1*(LL$age==0) + 2 * ( (LL$age > 0) & (LL$age < 6) ) + (2+floor(LL$age/5)) * ( LL$age > 5) ; LL[1:30,c("age","a.c")] # t is short for total # p is short for (calendar) period ; m.cat for whether male t.us=aggregate(LL[,c("death","n.years")], by=list(a.cat=LL$a.c, p.cat=floor((LL$per-1907)/5), m.cat=1-LL$ifemale, c.cat=LL$class ), sum) ; dim(t.us); t.us[1:10,] # augment with a column of rates and one of null expectations t.us$usrate = NA ; n.rows = length(t.us$a.cat) for (k in 1:n.rows) { i = t.us[k,"a.cat"]; j = t.us[k,"p.cat"]; s=t.us[k,"m.cat"]; if (s==1) t.us$usrate[k] = us.males[i,j] else t.us$usrate[k] = us.females[i,j] } t.us[1:20,] # titanic (index) vs USA(ref) hr.mh = O/E (see footnote in assignment) mh.num = sum( t.us$death ) # observed deaths mh.den = sum( t.us$usrate * t.us$n.years ) # expected deaths mh.ratio = mh.num/mh.den round(c(mh.num,mh.den,mh.ratio),2) # males .. subset then go back to hratio t.us=t.us[t.us$m.cat==0,] null.chi = abs ( (mh.num - mh.den ) / sqrt(mh.den) ) c("ChiSq=",round(null.chi^2,2), "Chi",round(null.chi,2), "P-value", round(2*(1-pnorm(null.chi)),4) ) test.based.ci.95 = round(mh.ratio^(1+1.96*c(1,-1)/null.chi),2) ; test.based.ci.95 # Q6 is limited to the Titanic passengers # but need to segregate the experience by class # as well as gender, age and year # so, aggregate the relevant items in LL, # by class, gender, age, and year # first, the person-time, ie durations in each cell # death = 1 if eXit status is 1 (dead) at end of slice # death = 0 if eXit status is 0 (alive) # 10 yrs wide calendar year bins, 5 yrs wide age bins subtotals=aggregate(LL[,c("death","n.years")], by=list(period=10*floor(LL$per/10)+5, age=5*floor(LL$age/5)+2, ifemale=LL$ifemale, class=LL$class), FUN=sum) head(subtotals) sum(subtotals$death) sum(subtotals$n.years) #deaths per year sum(subtotals$death)/sum(subtotals$n.years) #years per (until) death sum(subtotals$n.years)/sum(subtotals$death) ## curious: where are the person years, by class PY. = aggregate(subtotals$n.years, by= list(age=subtotals$age,class=subtotals$class), FUN=sum) plot(PY.$age,PY.$x, col=PY.$class+1,pch=PY.$class+1, ylab="Number of PY",xlab="ageBin") legend(80,700, c("1st","2nd","3rd"),col=2:4,pch=2:4) # MH comparison of classes 2 & 3 versus 1 (ref), # matched on gender, binned-age and binned-period # old trick for multi-dimensional index in a single # subtotals[1:10,] subtotals$cell = subtotals$period+ subtotals$ifemale/10+ subtotals$age/1000 for (class in 2:3) { print( noquote(" ")) print( noquote(c("class:",class," vs. class 1"))) print( noquote(" ")) # put data for 2 compared classes in same row index.cells = (subtotals$class==class) ref.cells = (subtotals$class== 1) dta = merge(subtotals[index.cells, c("cell","death","n.years") ], subtotals[ ref.cells, c("cell","death","n.years") ], by.x = "cell", by.y = "cell" ) print( head(dta)) print( tail(dta)) mh.num = sum(dta$death.x*dta$n.years.y/ (dta$n.years.x+dta$n.years.y) ) mh.den = sum(dta$death.y*dta$n.years.x/ (dta$n.years.x+dta$n.years.y) ) print(round(c(mh.num,mh.den,mh.num/mh.den),2)) } # if you want to fit some Poisson regression models # instead of matching, you could just use the # LL dataset directly... e.g. summary(glm(death ~ ifemale+age+per+as.factor(class) + offset(log(n.years)), family=poisson, data=LL) ) # you would need to convert the beta.hats to # multiplicative factors.. i.e by what percentage # do neighbouring years of ages or genders differ?