## data and analysis, after Woolf 1955 (jh Jan 1, 2011 ) ## added Feb 2015: retro- and pro-spective logistic regressions ## of Woolf data -- data= scan( what=list("character", "integer","integer","integer","integer") ) London 911 579 4578 4219 Manchester 361 246 4532 3775 Newcastle 396 219 6598 5261 # ds=data.frame( 1:3, as.numeric(data[[2]]), as.numeric(data[[3]]), as.numeric(data[[4]]), as.numeric(data[[5]]) ) colnames(ds)=list("city","o","a","O","A") ds$x = (ds$o/ds$O)/ (ds$a/ds$A) ; ds$y = log(ds$x) ; ds ds$w = 1/ ( 1/ds$o + 1/ds$a + 1/ds$O + 1/ds$A ) ; ds Y = sum(ds$w * ds$y)/ sum(ds$w ) SD.Y = sqrt(1/sum(ds$w) ) X = exp(Y) X.limits = X * exp( c(-1.96,0,1.96)*SD.Y ) ; round(X.limits,2) ############## (prospective) logistic regression of Woolf's data, ############## with city as a "factor" n.case.series = c(ds[1,2], ds[1,3], ds[2,2], ds[2,3], ds[3,2], ds[3,3] ); n.base.series = c(ds[1,4], ds[1,5], ds[2,4], ds[2,5], ds[3,4], ds[3,5] ); city = c(1,1,2,2,3,3); # strata, used as factor O = rep(c(1,0),3) # predictor, or 'exposure' # n.case.series : n.base.series split is the 'outcome' prospective.DS=data.frame(city,O,n.case.series,n.base.series) ; prospective.DS prospective.fit = glm(cbind(n.case.series,n.base.series)~ as.factor(city) + O, family=binomial, data = prospective.DS) summary(prospective.fit) round(exp(prospective.fit$coefficients),2) fitted.count.cases = prospective.fit$fitted.values* (n.case.series+n.base.series) ## .f = "fitted" ds$o.f = fitted.count.cases[c(1,3,5)] ; ds$a.f = fitted.count.cases[c(2,4,6)] ; ds$O.f = (ds$o+ds$O) - ds$o.f ; ds$A.f = (ds$a+ds$A) - ds$a.f ; ds$IDR.fitted = ( ds$o.f / ds$O.f ) / ( ds$a.f / ds$A.f ) ; ds$w.f = 1 / ( 1/ds$o.f + 1/ds$a.f + 1/ds$O.f + 1/ds$A.f ) ; round(ds,2) ds # SD using fitted values... SD.log.idr = sqrt(1/sum(ds$w.f) ) ; round(SD.log.idr,5) # ----- compare with SE in output from logistic regression # Yes, SE [= sqrt(Var)] for log(Rate Ratio) estimate in logistic regression # of case-control (or other) data uses a `Woolf-like' variance # based on the FITTED cell frequencies.. ### RETROSPECTIVE logistic regression of Woolf's data, ### with city as a "factor" ### cf article `Are there 2 logistic regressions? by Breslow and Powers in.case.series = rep(c(1,0),3) # this is now the 'predictor' or 'exposure' # n.O : n.A split is now the 'outcome' n.O = c(ds[1,2], ds[1,4], ds[2,2], ds[2,4], ds[3,2], ds[3,4] ); n.A = c(ds[1,3], ds[1,5], ds[2,3], ds[2,5], ds[3,3], ds[3,5] ); retro.DS=data.frame(city, in.case.series, n.O, n.A) ; retro.DS retro.fit = glm(cbind(n.O,n.A)~as.factor(city) + in.case.series, family=binomial, data=retro.DS) summary(retro.fit) round(exp(retro.fit$coefficients),2) # cf Article `Are there 2 logistic regressions?' by Breslow and Powers. # available in bios602 Resources for `case-control' studies ############## amount of information in case-control study as fumction of the control:case ratio ### simulation using Woolf's data for London actual.data = ds[1,2:5]; n.cases = (actual.data$o + actual.data$a) ; n.cases n.controls = (actual.data$O + actual.data$A) ; n.controls actual.data$n.cases = n.cases actual.data$n.ctls = n.controls actual.data$ctl.case.ratio = round(n.controls/n.cases,1) actual.data$Var.logOR = round(sum(1/ds[1,2:5]),6); actual.data$info.logOR = round(1/actual.data$Var.logOR,1); actual.data$scaled.info.logOR = NA; actual.data simulated.data = actual.data; Ratios= c( seq(1000,200,-100), seq(100,20,-10), 10:1 ) row=1 for (ratio in Ratios ) { row = row + 1; simulated.data = rbind(simulated.data,actual.data) simulated.data[row,3:4] = ratio * simulated.data[row,3:4] / (n.controls/n.cases); simulated.data$n.ctls[row] = sum(simulated.data[row,3:4]); simulated.data$Var.logOR[row] = round(sum(1/simulated.data[row,1:2]) + sum(1/simulated.data[row,3:4]),6); simulated.data$info.logOR[row] = round(1/simulated.data$Var.logOR[row],1); simulated.data$ctl.case.ratio[row] = ratio ; simulated.data$scaled.info.logOR[row] = simulated.data$info.logOR[row]/simulated.data$info.logOR[2] } simulated.data plot(simulated.data$ctl.case.ratio, simulated.data$info.logOR, log="x") plot(simulated.data$ctl.case.ratio,simulated.data$scaled.info.logOR,log="x",ylim=c(0,1),pch=19,cex=1) for (ratio in Ratios) { segments(ratio, 0, ratio, 1, col="lightblue", lwd=0.75) if(ratio < 10 & ratio !=4) text(ratio, 0, toString(ratio), col="blue", cex=0.75, adj=c(0.5,1.25)) } for (y in seq(0,1,.1)) { segments(1, y, 1000, y, col="lightblue", lwd=0.75) } segments(4, 0, 4, simulated.data$scaled.info.logOR[simulated.data$ctl.case.ratio==4], col="blue", lwd=1.5) ; text(4, 0, "4", col="blue", cex=1.25, adj=c(0.5,1.25)) segments(1, 0.8, 4, 0.8, col="blue", lwd=1.5); text(1, 0.8, "80% efficiency", col="blue", cex=1.25, adj=c(0,-0.25))