# simple example of EM algorithm
# in deja vu example where we have MLE by direct method

# for concise description of EM, see section 2.2
# of the Tibshari et al paper on darts

# Fisher's data on 300 errors from N(0,sigma) distribution
# task is MLE of sigma or sigma.sq

# bin frequencies

f =c(114, 84, 53, 24, 14, 6, 3, 1, 1) ; n=sum(f); n
lower.lim = 0:8 ; upper.lim=c(1:8,Inf)

# directly

log.lik=function(sigma) sum(f*log(2*(pnorm(upper.lim,0,sigma) - pnorm(lower.lim,0,sigma)) ))
log.lik(2)
optim(2,log.lik,method="BFGS",control=list(fnscale=-1))

# E-M algorithm

# log.lik(sigma.sq) if know actual y value...

# Lik =  { (sigma.sq)^(-1/2) } *  exp[-y^2 / (2 sigma.sq) ]

# log.lik = -(1/2) log(sigma.sq)  - y^2 / (2 sigma.sq)

# E[log.lik | only know which bin its in] = 

#         -(1/2) log(sigma.sq) - E[y^2 | y in the bin its in ]  / (2 sigma.sq)

# so we need to calcualte the average y^2 of the possible y's within each bin.

yy.pdf = function(y,mean,sd) (y^2) * dnorm(y,mean,sd) # for getting ave sq

# initial guess

sigma = 2
print( c(sigma, log.lik(sigma) ) )
print( noquote(" " ) )


n.iterations = 5


for (iteration in 1:n.iterations ) {
	
  # E step  ..  ave. squared value of obsns. 
  # within bin (not necessarily sq of mid-bin value... 
  # it depends on pdf shape in the bin)

  ave.squared.value=rep(NA, length(f))
  
  for (i in 1:length(f)) ave.squared.value[i] = 
    integrate(yy.pdf, lower.lim[i], upper.lim[i], mean=0,sd=sigma)$value /
    integrate(dnorm, lower.lim[i], upper.lim[i], mean=0,sd=sigma)$value 
        # the denominator scales the (restricted) pdf to make it a legitimate pdf 
        # just for the possible values in that bin
  
 # M step 

  sigma.sq =  sum(f*ave.squared.value) / n
  
  sigma = sqrt(sigma.sq)
  
  
  print(noquote( c("iteration",iteration, "sigma", round(sigma,6) ) ) )
  print( noquote(" " ) )
  print( noquote( c("ave.squared.values", round(ave.squared.value,6) ) ))
  print( noquote(" " ) )
  print( log.lik(sigma)  ) 
  print( noquote(" " ) )

}

# plot(0:n.iterations,sigma.estimate, xlim=c(0,n.iterations),xlab="iteration")

optim(2,log.lik,method="BFGS",control=list(fnscale=-1))


########

# Estimation of infection rate ('Rate') in 

# In the control group in the Uganda trial, 
# 2319 initially HIV- men were tested at the 6-month, or 0.5year follow-up, 
# and 19 of them were found to be HIV+, and the remaining 2300 were found to be HIV-.

# can treat 19/2319 as a binomial with p(pos) = 1 - p(neg) = 1 - exp(-0.5*Rate)
# MLE of p = 19/2319 ;
# so, MLE of Rate is 
mle.via.p = -2 * log(1 - 19/2319) 
mle.via.p 
# 0.01645387

infections per man-year at risk

log.lik = function(Rate) 19* log( 1-exp(-0.5*Rate) ) - 2300*0.5*Rate 

optim(0.012,log.lik,method="BFGS",control=list(fnscale=-1,parscale=0.001))

$par
[1] 0.01645387

$value
[1] -110.2065


# a good approx is to say that the average infection time was 1/2 way in the 6 mo
# 
ave.t=0.25 
# so total time at risk = 2319*0.5 - 19*ave.t = 1154.75 man-years
# so rate = 19/1154.75 = 0.01645378 infections per man-year at risk
# notice that this placement at mid-bin is too far to right, thereby
# increased the man-months, and reducing the rate

# EM

# assume we know time t_i of each of i=1:19 infections
# lik = prod_i (Rate exp[-0.5*Rate*t_i )  *   [exp(-0.5*Rate)]^2300
# loglik = sum { log(Rate) -0.5*Rate*t_i } + 2300 log(1-p)
# E[loglik | all we know is that 19 were in interval 0 to 0.5 years] 

t.pdf = function(t,Rate) t * dexp(t,Rate) # for getting ave 

Rate = 0.01

print( c(iteration, Rate, ave.t, log.lik(Rate) ) )
print( noquote(" " ) )

Rate.estimate=Rate

n.iterations = 5

for (iteration in 1:n.iterations ) {
	
  # E step  ..  ave. t of infection. 
  # within bin (not necessarily mid-bin value... 
  # it depends on pdf shape in the bin)

  ave.t = integrate(t.pdf, 0, 0.5, Rate=Rate)$value /
          integrate(dexp, 0, 0.5, rate= Rate)$value 
        # the denominator scales the (restricted) pdf to make it a legitimate pdf 
        # just for the possible values in that bin
  
 # M step 

  Rate =  19 / (2300*0.5+19*ave.t)
  
  print( c(iteration, Rate, ave.t, log.lik(Rate) ) ) 

}


# jh 2012.10.09