# simple example of EM algorithm # in deja vu example where we have MLE by direct method # for concise description of EM, see section 2.2 # of the Tibshari et al paper on darts # Fisher's data on 300 errors from N(0,sigma) distribution # task is MLE of sigma or sigma.sq # bin frequencies f =c(114, 84, 53, 24, 14, 6, 3, 1, 1) ; n=sum(f); n lower.lim = 0:8 ; upper.lim=c(1:8,Inf) # directly log.lik=function(sigma) sum(f*log(2*(pnorm(upper.lim,0,sigma) - pnorm(lower.lim,0,sigma)) )) log.lik(2) optim(2,log.lik,method="BFGS",control=list(fnscale=-1)) # E-M algorithm # log.lik(sigma.sq) if know actual y value... # Lik = { (sigma.sq)^(-1/2) } * exp[-y^2 / (2 sigma.sq) ] # log.lik = -(1/2) log(sigma.sq) - y^2 / (2 sigma.sq) # E[log.lik | only know which bin its in] = # -(1/2) log(sigma.sq) - E[y^2 | y in the bin its in ] / (2 sigma.sq) # so we need to calcualte the average y^2 of the possible y's within each bin. yy.pdf = function(y,mean,sd) (y^2) * dnorm(y,mean,sd) # for getting ave sq # initial guess sigma = 2 print( c(sigma, log.lik(sigma) ) ) print( noquote(" " ) ) n.iterations = 5 for (iteration in 1:n.iterations ) { # E step .. ave. squared value of obsns. # within bin (not necessarily sq of mid-bin value... # it depends on pdf shape in the bin) ave.squared.value=rep(NA, length(f)) for (i in 1:length(f)) ave.squared.value[i] = integrate(yy.pdf, lower.lim[i], upper.lim[i], mean=0,sd=sigma)$value / integrate(dnorm, lower.lim[i], upper.lim[i], mean=0,sd=sigma)$value # the denominator scales the (restricted) pdf to make it a legitimate pdf # just for the possible values in that bin # M step sigma.sq = sum(f*ave.squared.value) / n sigma = sqrt(sigma.sq) print(noquote( c("iteration",iteration, "sigma", round(sigma,6) ) ) ) print( noquote(" " ) ) print( noquote( c("ave.squared.values", round(ave.squared.value,6) ) )) print( noquote(" " ) ) print( log.lik(sigma) ) print( noquote(" " ) ) } # plot(0:n.iterations,sigma.estimate, xlim=c(0,n.iterations),xlab="iteration") optim(2,log.lik,method="BFGS",control=list(fnscale=-1)) ######## # Estimation of infection rate ('Rate') in # In the control group in the Uganda trial, # 2319 initially HIV- men were tested at the 6-month, or 0.5year follow-up, # and 19 of them were found to be HIV+, and the remaining 2300 were found to be HIV-. # can treat 19/2319 as a binomial with p(pos) = 1 - p(neg) = 1 - exp(-0.5*Rate) # MLE of p = 19/2319 ; # so, MLE of Rate is mle.via.p = -2 * log(1 - 19/2319) mle.via.p # 0.01645387 infections per man-year at risk log.lik = function(Rate) 19* log( 1-exp(-0.5*Rate) ) - 2300*0.5*Rate optim(0.012,log.lik,method="BFGS",control=list(fnscale=-1,parscale=0.001)) $par [1] 0.01645387 $value [1] -110.2065 # a good approx is to say that the average infection time was 1/2 way in the 6 mo # ave.t=0.25 # so total time at risk = 2319*0.5 - 19*ave.t = 1154.75 man-years # so rate = 19/1154.75 = 0.01645378 infections per man-year at risk # notice that this placement at mid-bin is too far to right, thereby # increased the man-months, and reducing the rate # EM # assume we know time t_i of each of i=1:19 infections # lik = prod_i (Rate exp[-0.5*Rate*t_i ) * [exp(-0.5*Rate)]^2300 # loglik = sum { log(Rate) -0.5*Rate*t_i } + 2300 log(1-p) # E[loglik | all we know is that 19 were in interval 0 to 0.5 years] t.pdf = function(t,Rate) t * dexp(t,Rate) # for getting ave Rate = 0.01 print( c(iteration, Rate, ave.t, log.lik(Rate) ) ) print( noquote(" " ) ) Rate.estimate=Rate n.iterations = 5 for (iteration in 1:n.iterations ) { # E step .. ave. t of infection. # within bin (not necessarily mid-bin value... # it depends on pdf shape in the bin) ave.t = integrate(t.pdf, 0, 0.5, Rate=Rate)$value / integrate(dexp, 0, 0.5, rate= Rate)$value # the denominator scales the (restricted) pdf to make it a legitimate pdf # just for the possible values in that bin # M step Rate = 19 / (2300*0.5+19*ave.t) print( c(iteration, Rate, ave.t, log.lik(Rate) ) ) } # jh 2012.10.09