# 2019 -- non-parametric ... plots etc...

# Avalanche Data -- see article

setwd("/Users/jameshanley/Documents/Courses/bios601/Likelihood")

ds = read.table("avalanche-data.txt", header=T, comment.char="#")
head(ds); tail(ds)
summary(ds$time)

# 2013.10.07 .. ML estimation via optimize (1-para) and optim (multi-para)

# dataset with 549 indiv. obsns

max.live = 2000  # after seeing plot... nobody survived past 1000 min

time = pmin(ds$time, max.live)

# single exponential

exp.logLik = function(mean) {
  sum( ds$status     * log(pexp(time, rate=1/mean)) +
       (1-ds$status) * log(pexp(time, rate=1/mean, lower.tail=FALSE)) )
}

exp.logLik(20)

fit.exp = optimize(exp.logLik, c(1,100), maximum=TRUE)

plot(1:120, pexp(1:120, rate=1/fit.exp$maximum, lower.tail=FALSE),
     type="l", ylim=c(0,1),
     ylab="Estimated Proportion Alive",
     xlab="Minutes since being buried")

# mixture of 2 lognormals or 2 gammas

mixture.logLik = function(par, model="gamma", fitted=FALSE) {

  TIME = 1:240

  logit.F1 = par[1]
  F1 = exp(logit.F1)/(1+exp(logit.F1))

  if(model=="logNormals") {
    mean.log = par[2:3]; log.sd.of.log = par[4:5]
    sd.of.log = exp(log.sd.of.log)
    CDF1 = plnorm(time, mean.log[1], sd.of.log[1])
    CDF2 = plnorm(time, mean.log[2], sd.of.log[2])
    CDF1.TIME = plnorm(TIME, mean.log[1], sd.of.log[1])
    CDF2.TIME = plnorm(TIME, mean.log[2], sd.of.log[2])
  }

  if(model=="gamma") {
    mean.1.stage = exp(par[2:3]); shape = exp(par[4:5])
    CDF1 = pgamma(time, mean.1.stage[1], shape[1])
    CDF2 = pgamma(time, mean.1.stage[2], shape[2])
    CDF1.TIME = pgamma(TIME, mean.1.stage[1], shape[1])
    CDF2.TIME = pgamma(TIME, mean.1.stage[2], shape[2])
  }

  CDF      = F1*CDF1      + (1-F1)*CDF2
  CDF.TIME = F1*CDF1.TIME + (1-F1)*CDF2.TIME

  if(!fitted) return(
    sum( ds$status     * log( F1*CDF1     + (1-F1)*CDF2     ) +
         (1-ds$status) * log( F1*(1-CDF1) + (1-F1)*(1-CDF2) ) ) )

  if(fitted) return( cbind(1-CDF1.TIME, 1-CDF2.TIME, 1-CDF.TIME) )
}

par = c(0, 0,1, 5,5)

mixture.logLik(c(0, 0,1, 3,3), model="logNormals")

fit = optim(c(0, 0,1, 5,5), mixture.logLik, control=list(fnscale=-1),
            model="logNormals", fitted=FALSE)

par = c(0, 0,0, 1,1)

fit = optim(par, mixture.logLik, control=list(fnscale=-1),
            model="gamma", fitted=FALSE)
fit

fitted = mixture.logLik(fit$par, model="gamma", fitted=TRUE)

lines(1:240, fitted[,1], lty="dotted")
lines(1:240, fitted[,2], lty="dotted")
lines(1:240, fitted[,3], lty="dotted", col="red")

# jh sept 23. 2010 + Oct 11, 2011

# older attempts

plot(ds$time, ds$status)
plot(ds$time, ds$status, xlim=c(0,10000))
max(ds$time[ds$status==0])

library(fitdistrplus)  # install it if you don't already have it

# dataset with 549 indiv. obsns

max.live = 2000  # after seeing plot... nobody survived past 1000 min

# fill in the ??? for fitdistrplus (hint: status = 1 means died before extrication)

left  = (ds$status == ???)*ds$time  + (ds$status == ???)*0
right = (ds$status == ???)*max.live + (ds$status == ???)*pmin(ds$time, max.live)

DS = data.frame(left, right); head(DS); tail(DS)

fit.exp = mledist(DS, "exp", start=list(rate=1/500))
fit.exp$convergence
fit.exp$estimate
fit.exp$loglik
1/fit.exp$estimate

# check out shape of hazard fn. for gamma distrn.

y = 1:100  # effective range if mean is set to, say, 20 or so

# fool around with shape to get incr. or decr. hazard

pdf = dgamma(y, shape=1, rate=1/3)
comp.cdf = pgamma(y, shape=1, rate=1/3, lower.tail=FALSE)
hazard = pdf / comp.cdf
plot(y, hazard)

######

mledist(DS, "gamma", start=list(shape=1, rate=1/1000))

# try starting values that do not give too many NAs

fit.gamma = mledist(DS, "gamma", start=list(shape=1, rate=1/10))
fit.gamma$convergence
fit.gamma$estimate
fit.gamma$loglik
fit.gamma$estimate[1]/fit.gamma$estimate[2]   # fitted mean = shape/rate
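
# A quick model comparison of the two mledist fits above (a sketch): the
# exponential is the shape=1 special case of the gamma, so -- once the ???
# are filled in -- twice the gain in log-likelihood is approximately
# chi-squared with 1 df.  'LRT' is a new name, not in the original script.
LRT = 2*(fit.gamma$loglik - fit.exp$loglik)
c(LRT = LRT, p.value = pchisq(LRT, df=1, lower.tail=FALSE))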
# compute fitted hazard fn. -- plug in the mle's

hgamma = function(x, shape, rate)
  dgamma(x, shape, rate) / pgamma(x, shape, rate, lower.tail=FALSE)

t = 1:100
plot(t, dgamma(t, 10, 1/5))
plot(t, hgamma(t, 10, 1.5))

####### by hand, 2010 attempts to be generic ....
####### can borrow the 'from scratch' likelihood, and modify it for
####### gamma, so as to verify the mle's produced by mledist in 2011 !

################ MLE of parameters of 1-para model for avalanche data

# 1-parameter models, with family as an argument and the parameter called theta
# (exponential-based) likelihood: theta = mean, so rate = 1/mean

exponential.log.lik = function(MEAN) {
  prob.died.before.extricated = pexp(ds$time, 1/MEAN)
  prob.died.before.extricated[prob.died.before.extricated > 0.999999] = 0.999999
  # this avoids log of a virtually zero survival probability
  prob.alive.when.extricated = 1 - prob.died.before.extricated
  logL = sum( ds$status     * log(prob.died.before.extricated) +
              (1-ds$status) * log(prob.alive.when.extricated) )
  return(logL)
}

range = seq(1, 500, length=50)
log.lik = unlist(lapply(range, exponential.log.lik)); log.lik
# I have to ask around for a better way to avoid lists and lapply
plot(log(range), log.lik)

# zoom in as needed by changing range
Max.Log.lik = max(log.lik)
MLE = range[log.lik >= Max.Log.lik]; MLE
range = seq(45, 63, length=500)   # finer grid -- re-run the log.lik lines above

# fitted Survival curve
S.curve = 1 - pexp(1:240, 1/MLE)
plot(1:240, S.curve)
plot(1:240, 1 - pgamma(1:240, 1, 1/MLE))   # shape=1 gamma = the same exponential

############### older material.........
############### we will come back to this when we study the Newton-Raphson
############### method of finding the mle

# MLE via Newton-Raphson algorithm NOV 4, 2010
# see http://en.wikipedia.org/wiki/Newton's_method

# in 1-D, if we want to find x.root such that f[x.root] = 0:
#   x.root = x.previous - f[x.previous]/f'[x.previous]
# in our case f[theta] = d logL / d theta
# so we have, in 1 dimension,
#   theta.new = theta.previous
#             - (d logL/d theta)[theta.previous] / (d^2 logL/d theta^2)[theta.previous]
# or, in k dimensions, where
#   d logL/d theta     is a vector of length k (often called the score vector, U),
#   d^2 logL/d theta^2 is a k x k matrix (MINUS this matrix is the observed
#     Information matrix, I, whose inverse estimates the variance):
#   theta.root = theta.previous
#              - Inv{ d^2 logL/d theta^2 }[theta.previous] x (d logL/d theta)[theta.previous]

# EXAMPLE: 1-d, theta = mean of exponential distribution
# logL contribution from i-th data point, a person extricated at time t,
#   = status*log(prob.died.before.t) + (1-status)*log(prob.alive.at.t)
# where prob.alive.at.t = exp(-t/theta) and prob.died.before.t = 1 - exp(-t/theta)
# d logL / d theta
#   = status     * ( d (prob.died.before.t)   / d theta ) / ( prob.died.before.t )
#   + (1-status) * ( d (1-prob.died.before.t) / d theta ) / ( 1-prob.died.before.t )
#   = status     * ( (-1) * exp(-t/theta) * (t/theta^2) ) / ( 1 - exp(-t/theta) )
#   + (1-status) * (        exp(-t/theta) * (t/theta^2) ) / ( exp(-t/theta) )
# with some help from Mathematica

firstD = function(theta) sum(
  (-1)*ds$status * exp(-ds$time/theta) * (ds$time/theta^2) / (1-exp(-ds$time/theta)) +
  (1-ds$status) * (ds$time/theta^2) )

secondD = function(theta) sum(
  (-1)*ds$status * exp(-2*ds$time/theta) * (ds$time^2/theta^4) / (1-exp(-ds$time/theta))^2 +
   (2)*ds$status * exp(  -ds$time/theta) * (ds$time  /theta^3) / (1-exp(-ds$time/theta))   +
  (-1)*ds$status * exp(  -ds$time/theta) * (ds$time^2/theta^4) / (1-exp(-ds$time/theta))   +
  (-2)*(1-ds$status) * (ds$time/theta^3) )

ds$time[ds$time > 6000] = 6000
ds$survive.t = 1 - ds$status
head(ds); tail(ds)

firstD(10); secondD(10)
firstD(20); secondD(20)
firstD(30); secondD(30)
firstD(40); secondD(40)
firstD(50); secondD(50)
firstD(60); secondD(60)
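
# A numerical sanity check (a sketch) on the Mathematica-assisted
# derivatives: compare firstD() and secondD() with central differences of
# exponential.log.lik().  num.firstD and num.secondD are new helper names,
# and the step size h is an arbitrary choice.
num.firstD  = function(theta, h=0.001)
  ( exponential.log.lik(theta+h) - exponential.log.lik(theta-h) ) / (2*h)
num.secondD = function(theta, h=0.001)
  ( exponential.log.lik(theta+h) - 2*exponential.log.lik(theta) +
    exponential.log.lik(theta-h) ) / h^2
cbind(theta      = c(20,40,60),
      U.analytic = sapply(c(20,40,60), firstD),
      U.numeric  = sapply(c(20,40,60), num.firstD),
      H.analytic = sapply(c(20,40,60), secondD),
      H.numeric  = sapply(c(20,40,60), num.secondD))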
plot(20:500, sapply(20:500, firstD))
plot(20:500, sapply(20:500, secondD))

# start with trial value
theta.trial = 20
c(theta.trial, firstD(theta.trial), secondD(theta.trial))

# update (repeat this step until the estimate settles down)
theta.trial = theta.trial - firstD(theta.trial) / secondD(theta.trial)
c(theta.trial, exponential.log.lik(theta.trial),
  firstD(theta.trial), secondD(theta.trial), -1/secondD(theta.trial))

var.theta.hat = -1/secondD(theta.trial)

# double check -- can fit a generalized linear model:
# since Prob[survive] = exp(-t/theta),
# log[ Prob[survive] ] = -t/theta, which has the form of a linear predictor
# with a slope [beta] of -1/theta, and no intercept,
# so can use Bernoulli regression with a log link

fit.exp = glm(ds$survive.t ~ -1 + ds$time, family=binomial(link="log"))
summary(fit.exp)
-1/fit.exp$coefficients    # this is theta.hat
summary(fit.exp)$cov.unscaled

# double check variances from the 2 approaches...
# var(1/theta.hat) = (approx) var(theta.hat) x square of jacobian
#                  = var(theta.hat) x [ (-1/theta.hat^2) ]^2
var.theta.hat * (1/theta.trial^2)^2
# not sure why they do not match EXACTLY ..
# (the delta method is only approximate, and theta.trial may not be fully converged)

###################### MLE of parameters of various 2-para models for avalanche data ####

general2para.log.lik = function(family, para1, para2) {
  if(family=="gamma") { rate = 1/para2   # para2 = scale, so rate = 1/scale
    cdf = function(t, para1, para2) pgamma(t, para1, rate) }
  if(family=="beta")
    cdf = function(t, para1, para2) pbeta(t, para1, para2)
  if(family=="lognormal")
    cdf = function(t, para1, para2) plnorm(t, para1, para2)   # mean.log, sd.log
  prob.died.before.extricated = cdf(ds$time, para1, para2)
  prob.died.before.extricated[prob.died.before.extricated > 0.999999] = 0.999999
  # this avoids log of a virtually zero survival probability
  prob.alive.when.extricated = 1 - prob.died.before.extricated
  logL = sum( ds$status     * log(prob.died.before.extricated) +
              (1-ds$status) * log(prob.alive.when.extricated) )
  return(logL)
}

# numerical (forward-difference) score vector and hessian;
# NOTE: these redefine the firstD and secondD of the exponential section

firstD = function(family, para1, epsilon1, para2, epsilon2) {
  Dpara1 = ( general2para.log.lik(family, para1+epsilon1, para2) -
             general2para.log.lik(family, para1,          para2) ) / epsilon1
  Dpara2 = ( general2para.log.lik(family, para1, para2+epsilon2) -
             general2para.log.lik(family, para1, para2         ) ) / epsilon2
  return( c(Dpara1, Dpara2) )
}

secondD = function(family, para1, epsilon1, para2, epsilon2) {
  Dpara1 = ( firstD(family, para1+epsilon1, epsilon1, para2, epsilon2) -
             firstD(family, para1,          epsilon1, para2, epsilon2) ) / epsilon1
  Dpara2 = ( firstD(family, para1, epsilon1, para2+epsilon2, epsilon2) -
             firstD(family, para1, epsilon1, para2,          epsilon2) ) / epsilon2
  return( matrix(c(Dpara1, Dpara2), nrow=2) )
}

# start with trial values
family = "lognormal"; family = "gamma"
general2para.log.lik(family, 4, 5)

shape = 2; scale = 5
theta = c(shape, scale)
theta
firstD(family, shape, 0.001, scale, 0.001)
secondD(family, shape, 0.001, scale, 0.001)
general2para.log.lik(family, shape, scale)

# update (repeat from here until the estimates settle down)
theta = theta - firstD(family, shape, 0.001, scale, 0.001) %*%
                solve( secondD(family, shape, 0.001, scale, 0.001) )
theta
shape = theta[1]; scale = theta[2]; shape*scale   # fitted mean = shape x scale
firstD(family, shape, 0.001, scale, 0.001)
secondD(family, shape, 0.001, scale, 0.001)
general2para.log.lik(family, shape, scale)
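
# A sketch that automates the 'repeat the update' step above with a loop,
# stopping when the Newton-Raphson step is small.  The starting values,
# epsilons, and tolerance are arbitrary choices, and there is no
# step-halving safeguard, so plain NR can wander off from poor starts.
theta = c(2, 5)   # (shape, scale)
for(iter in 1:50) {
  U = firstD(family, theta[1], 0.001, theta[2], 0.001)
  H = secondD(family, theta[1], 0.001, theta[2], 0.001)
  step = solve(H, U)           # H^{-1} U, without forming the inverse
  theta = theta - step
  if( max(abs(step)) < 1e-6 ) break
}
c(iterations=iter, shape=theta[1], scale=theta[2],
  logL = general2para.log.lik(family, theta[1], theta[2]))
# (shape, 1/scale) should roughly agree with fit.gamma$estimate, the
# (shape, rate) pair from the fitdistrplus section -- the stated point of
# this 'by hand' exercise; small differences are expected since the two
# likelihoods handle the censoring limits slightly differently.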
########## non parametric ......

left  = (ds$status==0)*ds$time + (ds$status==1)*0
right = (ds$status==0)*2000    + (ds$status==1)*pmin(ds$time, 2000)
cbind(left, right, ds$time, ds$status)

support = seq(0.5, 1999.5, 1); L = length(support); L
N = length(left); N

m = matrix(rep(NA, L*N), nrow=N)
for (i in 1:length(support)) m[,i] = 1*(support[i] > left & support[i] < right)
m[,1:10]
sums = apply(m, 2, "sum")
# plot(support, sums, type="l")

# eliminate redundant support points (ie identical columns)

fromLeft = FALSE

if(fromLeft) {
  M = m[,1]; t = support[1]
  for(col in 2:L) if( sum( m[,col]==m[,col-1] ) < N ) {
    t = c(t, support[col])
    M = cbind(M, m[,col])
  }
  n.atoms = length(t)
}

if(!fromLeft) {
  M = m[,L]; t = support[L]
  for(col in (L-1):1) if( sum( m[,col]==m[,col+1] ) < N ) {
    t = c(t, support[col])
    M = cbind(M, m[,col])
  }
  n.atoms = length(t)
  t = t[n.atoms:1]
  M = M[, n.atoms:1]   # reverse the COLUMNS, so the atoms run left to right
}

M[,1:20]
n.atoms
SUMS = apply(M, 2, "sum"); SUMS

plot(-10, -1000, ylim=c(0,430), xlim=c(0,150),
     xlab="Minute", ylab="Observation")
for(i in seq(1, 422, 10)) {
  segments(left[i], i, right[i], i, lwd=2,
           col=c("red","green")[1 + 1*(left[i] > 0)])
  tt = t[t >= left[i] & t <= right[i]]
  points(tt, rep(i, length(tt)), cex=0.5, pch=19)
}

# self-consistency

# start with ...
s.hat = 1 - cumsum(SUMS)/sum(SUMS)
# plot(t, s.hat, type="l")

index.of.last.support.point.before.each.extraction =
  unlist(lapply(ds$time, function(T) sum(t < T)))

n.known.gt.t = unlist(lapply(t, function(T) sum(ds$status==0 & ds$time > T)))

for(iteration in 1:100) {

  all.s.hats = s.hat[index.of.last.support.point.before.each.extraction]

  n.presumed.gt.t = unlist( lapply( t[1:n.atoms], function(T) {
    #print("T"); print(T)
    which.of.n.atoms = (1:n.atoms)[t == T]
    #print("which.of.n.atoms"); print(which.of.n.atoms)
    S = s.hat[which.of.n.atoms]
    #print("S"); print(S)

    # extricated alive before T
    extracted.alive.before.T = (ds$status==0 & ds$time < T)
    indices.of.these = (1:N)[extracted.alive.before.T]
    #print("indices.of.these"); print(indices.of.these)
    S.for.each.of.these = all.s.hats[indices.of.these]
    #print("S.for.each.of.these"); print(S.for.each.of.these)
    prob = S / S.for.each.of.these
    #print("probs"); print(prob)
    n.before.expect.gt.t = sum(prob)

    # extricated dead after T
    #print("AFTER")
    extracted.dead.after.T = (ds$status==1 & ds$time > T)
    indices.of.these = (1:N)[extracted.dead.after.T]
    #print("indices.of.these"); print(indices.of.these)
    S.for.each.of.these = all.s.hats[indices.of.these]
    #print("S.for.each.of.these"); print(S.for.each.of.these)
    prob = (S - S.for.each.of.these) / (1 - S.for.each.of.these)
    #print("probs"); print(prob)
    n.after.that.were.gt.t = sum(prob)

    n.fitted = n.before.expect.gt.t + n.after.that.were.gt.t
    return(n.fitted)
  } ) )
  #n.presumed.gt.t

  new.s.hat = (n.known.gt.t + n.presumed.gt.t)/N
  print( max( abs( new.s.hat - s.hat ) ) )
  s.hat = new.s.hat
}

plot(t, s.hat, type="l", xlim=c(0,165))
abline(v=seq(0,100,10))

prob.mass = c(1, new.s.hat[1:(n.atoms-1)]) - new.s.hat
sum(prob.mass)

plot(-10, -1000, ylim=c(0,430), xlim=c(0,150),
     xlab="Minute", ylab="Observation")
for(i in seq(1, 422, 10)) {
  segments(left[i], i, right[i], i, lwd=2,
           col=c("red","green")[1 + 1*(left[i] > 0)])
  tt   = t[t >= left[i] & t <= right[i]]
  size = prob.mass[t >= left[i] & t <= right[i]]
  points(tt, rep(i, length(tt)), cex=5*size, pch=19)
}
abline(v=seq(0,100,10))

############

plot(t, s.hat, type="l", xlim=c(0,165))

delta.t = t - c(0, t[1:(n.atoms-1)]); delta.t; sum(delta.t)
integral.s.hat = sum(s.hat * delta.t); integral.s.hat

delta.s.hat = c(1, s.hat[1:(n.atoms-1)]) - s.hat; delta.s.hat
pdf.hat = delta.s.hat / delta.t
plot(t, pdf.hat, type="l", xlim=c(0,100))
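
# Cross-check (a sketch): survfit() in the survival package computes the
# Turnbull NPMLE for interval-censored data, which the self-consistency
# iterations above should approximate.  With type="interval2", an NA marks
# the unknown side of the interval; 'lower', 'upper' and 'turnbull' are new
# names introduced here.
library(survival)
lower = ifelse(ds$status==1, NA, ds$time)               # died: left-censored at t
upper = ifelse(ds$status==1, pmin(ds$time, 2000), NA)   # alive: right-censored at t
turnbull = survfit( Surv(lower, upper, type="interval2") ~ 1 )
plot(t, s.hat, type="l", col="red", xlim=c(0,165))
lines(turnbull$time, turnbull$surv, type="s")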
h.hat = pdf.hat / s.hat
plot(t, h.hat, type="l", xlim=c(0,100))

######

# library(Icens)
# package 'Icens' is not available on CRAN (for R version 3.6.1);
# it is distributed via Bioconductor, e.g. BiocManager::install("Icens")

data = matrix(c(left, right), nrow=N); data
EM = EMICM(data, EMstep=TRUE, ICMstep=TRUE, keepiter=FALSE,
           tol=1e-07, maxiter=1000)
sum(EM$pf)

############

# https://www.r-project.org/conferences/useR-2010/tutorials/Fay_1.pdf
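
# Following up on the EMICM fit above (a sketch): for the usual 'icsurv'
# return value, EM$pf holds the probability masses and EM$intmap the
# corresponding Turnbull intervals (a 2 x k matrix of endpoints), so the
# EMICM survival curve can be laid over the self-consistency one.
plot(EM$intmap[2,], 1 - cumsum(EM$pf), type="s", xlim=c(0,165),
     xlab="Minute", ylab="Estimated Proportion Alive")
lines(t, s.hat, col="red")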