# version 2008.02.24

# females;

# Deaths from all causes, lung cancer and breast cancer
# among Quebec females, 1971, 1976, 1980-1992 2002

# Item       content
# ---------  --------------------------------------------------------------
# sex      2 = female [1=male], in case you want to consolidate into 1 file
# year     71, 76, 1971, 1976, 1980(1)1992 2002
# pop      population
# dac      number of deaths, all causes
# dlungca  number of deaths, lung cancer
# dbrca    number of deaths, breast cancer
# agefrom  lower limit of age category (0,1, 5,10,15, ...  85)
# ageto    upper limit of age category (1,5,10,15,20, ... 100*)
# * last age category is '85+'

females=read.table("q_f_mortality_data.txt",header=TRUE)
 
# subset to certain years and certain variables,
# and create a new 'year' variable 
# (original dataset had the year 1971 as 71 and 2002 as 02)

ds = females[

          (females$year==71 | females$year==02 ) &
          (females$agefrom >= 40 & females$agefrom <= 80 ) ,
          
          c(2,3,4,7,8)
                     ] ; 
                     
ds[ds$year==71,"yrs.since71"] = 0 ;
ds[ds$year== 2,"yrs.since71"] = 31 ; ds

#### age-standardized comparison of 1991 and 2002 death rates #########

# create common age structure for age-standardized comparison 

  ds[1:9,"combined.pop"]   = ds[1:9,"pop"]+ds[10:18,"pop"]
  
  ds[10:18,"combined.pop"] = ds[1:9,"pop"]+ds[10:18,"pop"] ;
  
  ds$combined.pop = ds$combined.pop/2; ds

# expected deaths if common age, but 1971 and 2002 rates
 
  expected = (ds$dac / ds$pop) * ds$combined.pop; expected
 
# tot. expected deaths (1971 followed by 2002, and common popl'n size/structure)

 e=c(sum(expected[1:9]),sum(expected[10:18]),sum(ds$combined.pop[1:9])); e
 
# observed (1971 deaths & pop, followed by 2002)

 o=c(sum(ds$dac[1:9]),sum(ds$pop[1:9]),sum(ds$dac[10:18]),sum(ds$pop[10:18])); o
 
 # crude rates, and the 2002/1971 ratio  
  
   round(c(o[1]/o[2], o[3]/o[4], (o[3]/o[4]) / (o[1]/o[2])  ),5)

 # standardized.rates, and the 2002/1971 ratio  
 
   round(c( e[1]/e[3], e[2]/e[3], (e[2]/e[3]) / (e[1]/e[3])  ),5)

# age-specific rate ratios, 2002/1971  
 
  round( ( ds$dac[10:18]/ds$pop[10:18] ) / ( ds$dac[1:9]/ds$pop[1:9] ) ,2)

  
 

### some multiplicative models for rates ###########################

  glm(dac~as.factor(agefrom) + yrs.since71,
      family=poisson,offset=log(pop),data=ds)

  glm(dac~as.factor(agefrom) + as.factor(yrs.since71),
      family=poisson,offset=log(pop),data=ds)
      
# exp(beta_hat for 2002 vs 1991) ... the 10th coefficient..  

  exp(coef(glm(dac~as.factor(agefrom) + as.factor(yrs.since71),
            family=poisson,offset=log(pop),data=ds)))[10]
            
# using age as a 'continuous' variable [as Gompertz did]
    
  glm(dac~ agefrom + as.factor(yrs.since71),
      family=poisson,offset=log(pop),data=ds)

# etc etc ...

# males;

# same structure, except, instead of dac dlungca dbrca,
# it is dac dprca [Prostate cancer] dlungca -- note also different order of variables 

  males=read.table("q_m_mortality_data.txt",header=TRUE)



# -------- for comparisons of males vs females

# as in computing in general, several ways to 'get there'.. e.g

# first extract variables of interest in each dataset

  females=read.table("q_f_mortality_data.txt",header=TRUE)

  f.ds   =females[females$year    ==  2 & 
                  females$agefrom >= 40 &
                  females$agefrom <= 80   ,
                  c(1,3,4,7)] ; f.ds

  males  =read.table("q_m_mortality_data.txt",header=TRUE)

  m.ds   =males[males$year    ==  2 & 
                  males$agefrom >= 40 &
                  males$agefrom <= 80   ,
                  c(1,3,4,7)] ; m.ds

# concatenate them vertically so 9 + 9= 18 observations

  ds = rbind(f.ds,m.ds) ; ds; # rbind puts rows under each other
                              # cbind puts columns beside each other
                        
# and use an indicator variable for males

# this way, never have to check which is sex=1 and sex=2 (or 1 and 0)
# i.male = iNDICATOR of male -- no more jokes about dummy vraiables.
# If someone overheard you talking about sex and dummy variables,
# what would they think you were referring to?

# much more sensible to speak of an indicator variable for males
# [or for females if you prefer .. jh chose males in this particular
# example because he suspects the coefficient will be positive,
# and its easier to deal with exp(beta_hat) > 1 and so IDR > 1 ]

# 'indicator of male' also avoids the argument as to whether it
# should be sex or gender -- finding sex=1 and sex=2 
# in a dataset is statistically ambiguous to say the least! 

  ds[,"i.male"] = 2 - ds[,"sex"] ; ds
 
 
# Fit Gompertz model -- parallel lines for log(rate) vs age

  glm(dac~agefrom+i.male,family=poisson,offset=log(pop),data=ds)

  round(exp(coef(glm(dac~agefrom+i.male,
                     family=poisson,offset=log(pop),data=ds))),
       6)
               
# better if we start age at 40, so intercept means something..

  ds[,"years.above.40"] = ds[,"agefrom"]-40 ; ds

# Fit Gompertz model -- parallel lines for log(rate) vs age

  glm(dac~years.above.40 + i.male,family=poisson,offset=log(pop),data=ds)

# Fit Gompertz models with separate ie sex-specific 
# straight lines for log(rate) vs age
# (effect modification on the log scale)

  summary(glm(dac~years.above.40 + i.male + years.above.40*i.male,
              family=poisson,offset=log(pop),data=ds))

# does product term (so we have different slopes for different folks )
# improve the fit ?

# (of course, this assumes that straight line fits are adequate) 

# plotting... (some examples)

fitted.rates = (1/ds$pop) *
  fitted(glm(dac~years.above.40 + i.male + years.above.40*i.male,
              family=poisson,offset=log(pop),data=ds))

observed.rates = (1/ds$pop) * ds$dac

plot(ds$years.above.40,fitted.rates)

plot(ds$years.above.40[1:9],log(fitted.rates)[1:9],"l",col="red",
ylim=c(-7,-2) )
lines(ds$years.above.40[10:18],log(fitted.rates)[10:18],col="blue")
points(ds$years.above.40[1:9],log(observed.rates)[1:9],col="red",cex=2)
points(ds$years.above.40[10:18],log(observed.rates)[10:18],col="blue",cex=2)

# etc etc


# ---------------  Source for data: ---------------------------

# Ministere de la Sante et des Services sociaux (MSSS).
# Statistiques sur les causes de mortalite au Quebec: 1971, 1976, 1980-92.
# Quebec: MSSS, 1994.

# Tableau 75  La population selon l'age

# Tableau composé à partir des données transmises par monsieur
# Gilles Pelletier du Service des études opérationnelles et données
#statistiques du ministère de la Santé et des Services sociaux.
# Les données ont été générées de la façon suivante:

#> 1971: estimation de la population, Statistique Canada, 
#        corrigées du sous-dénombrement, publiées le 22 octobre 1993;

#> 1976-1980 et 1982-1985:   estimations du Bureau de la statistique du Québec,
#        corrigées du sous-dénombrement;

#> 1987-1990 estimations du ministère de Santé et des Services sociaux
#       corrigées selon la méthode du Bureau de la statistique du Québec;

#> 1981, 1986 et 1991:   recensement canadien, 
#        corrigé par le Bureau de la statistique du Québec;

#> 1992-1996:    projections provisoires du Bureau de
#        la statistique du Québec de mars 1993.
        
# http://www.stat.gouv.qc.ca/donstat/societe/demographie/struc_poplt/201_02.htm

# http://www.stat.gouv.qc.ca/donstat/societe/demographie/naisn_deces/index.htm#deces

#Tableau  2  Nombre de deces pour ensemble des causes selon l'age

#Tableau 17  Nombre de deces par tumeur maligne de la tranchee,
#            des bronches et du poumons (CODE CIM-9: 162) selon l'age
                         
#Tableau 19  Nombre de deces par tumeur maligne du sein
#            (CODE CIM-9: 174) selon l'age
            

#Tableau 21  Nombre de deces par tumeur maligne de la prostate
#          (CODE CIM-9: 174) selon l'age