# 2012.10.14

# below are several fits ..

# ideally, one should use the Poisson model (with inbuilt var = mu) rather than the
# N(,) model with inbuilt (assumed) same variance regardless of mu

# also, this rate problem is different from the usual ones we have
# with mortality, cancer, etc.. in that in such cases, we know
# the age at which the event (cancer, death) occurred.
# whereas, with the mutations, we are not there to watch the mutations happen
# we only get to see the total, at the end... 

# thus the need to integrate the rate function from birth to the age
# at which the father fathered the child...

# This examples has some lessons about parametrizations and 'centering'
# of the ages (the 'x') in the likelihood — and in any regressions

# recall from regular (simple) regression, the correlation of
# beta_0_hat and beta_1_hat becomes more pronounced the further the
# x's are from zero, and that the correlation is zero only if the
# x's are centered ..  

# the same issues occur when using 2 x's, but one of the x's is the square
# of the other.... i.e., x and x^2 (or here age and (1/2) age^2 )  

# age and age^2 are very  correlated, but age-30 and its square are 
# less so.... or even age-15 and its square...

# so its important for maximization/numerical purposes to
# make the 2-D search less challenging......

setwd("/Users/jameshanley/Dropbox/Courses/bios601/Intensity-Rates")

ds=read.table("mutationsFathersMothersAge.txt")
colnames(ds)=c("fathers.age", "mothers.age", "n.mutations")
ds$aa =  (1/2) * ds$fathers.age * ds$fathers.age

head(ds)

cor(ds)
plot(ds)

m=lm(n.mutations~mothers.age,data=ds) ; summary(m)

f=lm(n.mutations ~ fathers.age,data=ds) ; summary(f)
f.no.int = lm(n.mutations~-1+fathers.age,data=ds) ; summary(f.no.int)


fm=lm(n.mutations~fathers.age+mothers.age,data=ds); summary(fm)

glm.f=glm(n.mutations~-1+fathers.age,
          family=poisson(link="identity"),data=ds); 
summary(glm.f)

glm.fm=glm(n.mutations~fathers.age+mothers.age,
          family=poisson(link="identity"),data=ds); 
summary(glm.fm)


## 

f.linear.rate=lm(n.mutations~-1+fathers.age+aa,data=ds) ; summary(f.linear.rate)

f.linear.rate.p=glm(n.mutations~-1+fathers.age+aa,data=ds,
family=poisson(link="identity")) ; summary(f.linear.rate.p) # Normal errors

plot(sort(ds$fathers.age),sort(f.linear.rate$fitted.values),type="l")

lines(ds$fathers.age,f.no.int$fitted.values,col="red")

lines(ds$fathers.age,f$fitted.values,col="blue")

plot(15:40,f.no.int$coefficients[1]+0*(15:40))
lines(15:40,f.linear.rate$coefficients[1]+f.linear.rate$coefficients[2]*(15:40))

plot(15:40,f.linear.rate$coefficients[1]+
          f.linear.rate$coefficients[2]*(15:40) +
          f.linear.rate$coefficients[3]*(15:40),type="l")
          
########  added later 

age=ds$fathers.age

half.age.sq = (age^2)/2

mutations=ds$n.mutations
         
mu = function(lambda)  lambda * age 

log.lik = function(lambda) -sum( mu(lambda) ) + sum( mutations * log(  mu(lambda) ) )

optim(2,log.lik,method="BFGS",hessian=T,control=list(fnscale=-1,parscale=0.001))


mu.linear = function(lambda,beta)  lambda * age +  beta * half.age.sq 

log.lik.linear = function(par)  -sum( mu.linear(par[1],par[2] ) ) + 
                           sum( mutations * log( mu.linear(par[1],par[2]) ) )

optim(c(2.2,-0.01),log.lik.linear,method="BFGS",hessian=T,control=list(fnscale=-1, parscale=c(0.1,0.001))) 

summary( glm(mutations ~ -1 + age + half.age.sq, family=poisson(link="identity"),
  control=list(trace=T, epsilon=0.0000000000001) ) ) 


# this wont work in outer function (since lower upper and f are themselves vectors), 
# so 'force' the issue ... 

log.lik.LINEAR = function(p1,p2)  -sum( mu.linear(p1,p2 ) ) + 
                           sum( mutations * log( mu.linear(p1,p2) ) )
                           
log.lik.vector = Vectorize( log.lik.LINEAR, c("p1", "p2") )

p1 = seq(2.20,2.3,0.0002) ; length(p1) ; p2= seq(0,0.005, 0.00001) ; length(p2)

log.lik.values = outer(p1, p2, log.lik.vector)

max.log.lik.values  = max(log.lik.values) ; max.log.lik.values

i.max = unlist(apply(log.lik.values,1, function(x) which(x==max.log.lik.values) )) ; i.max
j.max = unlist(apply(log.lik.values,2, function(x) which(x==max.log.lik.values) )) ; j.max

mle = c(p1[i.max],p2[j.max]) ; mle

rel.log.lik.values = log.lik.values  - max.log.lik.values

contour( p1, p2, rel.log.lik.values, nlevels=100,xlab="p1", ylab="p2", labcex=1.1)
points(mle[1],mle[2],col="red")


######  re-parameterize  , with different age 'origins' or 'centres'

# 

age.c=25  #### age.'centre' .... vary this and see the diff. in shape of contours

rate = function(a,lambda.at.c,beta) lambda.at.c + beta*(a-age.c)


# integral = lambda*age + beta*( (1/2) * age^2 ) - 30 * beta * age

# mu.linear.c = function(a,lambda.at.30,beta) integrate(rate, 0, a, 
#    lambda.at.30 = lambda.at.30, beta=beta)$value
   
mu.linear.c = function(lambda.at.c,beta) lambda.at.c*age + beta*half.age.sq - age.c * beta * age


mu.linear.c(2, 0.01)

log.lik.LINEAR.c = function(p1,p2) -sum( mu.linear.c(p1,p2 ) ) + 
                           sum( mutations * log( mu.linear.c(p1,p2) ) )
                           
log.lik.LINEAR.c(2, .01)

log.lik.vector.c = Vectorize( log.lik.LINEAR.c, c("p1", "p2") )

p1 = seq(2.02,2.15,0.0005) ; length(p1) ; p2= seq(-0.02, 0.002,  0.0001) ; length(p2)

log.lik.values.c = outer(p1, p2, log.lik.vector.c)

# max.log.lik.values.c  = max(log.lik.values.c) ; max.log.lik.values.c

contour( p1, p2, log.lik.values.c, nlevels=100,xlab="p1", ylab="p2", labcex=0.75)

# need to use a single vector

log.lik.LINEAR.c = function(para) -sum( mu.linear.c(para[1],para[2] ) ) + 
                           sum( mutations * log( mu.linear.c(para[1],para[2]) ) )
                           
                           
optim(c(2.2,-0.01),log.lik.LINEAR.c,method="BFGS",hessian=T,
  control=list(fnscale=-1, parscale=c(0.1,0.001)))