# version 2008.02.24 # females; # Deaths from all causes, lung cancer and breast cancer # among Quebec females, 1971, 1976, 1980-1992 2002 # Item content # --------- -------------------------------------------------------------- # sex 2 = female [1=male], in case you want to consolidate into 1 file # year 71, 76, 1971, 1976, 1980(1)1992 2002 # pop population # dac number of deaths, all causes # dlungca number of deaths, lung cancer # dbrca number of deaths, breast cancer # agefrom lower limit of age category (0,1, 5,10,15, ... 85) # ageto upper limit of age category (1,5,10,15,20, ... 100*) # * last age category is '85+' females=read.table("q_f_mortality_data.txt",header=TRUE) # subset to certain years and certain variables, # and create a new 'year' variable # (original dataset had the year 1971 as 71 and 2002 as 02) ds = females[ (females$year==71 | females$year==02 ) & (females$agefrom >= 40 & females$agefrom <= 80 ) , c(2,3,4,7,8) ] ; ds[ds$year==71,"yrs.since71"] = 0 ; ds[ds$year== 2,"yrs.since71"] = 31 ; ds #### age-standardized comparison of 1991 and 2002 death rates ######### # create common age structure for age-standardized comparison ds[1:9,"combined.pop"] = ds[1:9,"pop"]+ds[10:18,"pop"] ds[10:18,"combined.pop"] = ds[1:9,"pop"]+ds[10:18,"pop"] ; ds$combined.pop = ds$combined.pop/2; ds # expected deaths if common age, but 1971 and 2002 rates expected = (ds$dac / ds$pop) * ds$combined.pop; expected # tot. expected deaths (1971 followed by 2002, and common popl'n size/structure) e=c(sum(expected[1:9]),sum(expected[10:18]),sum(ds$combined.pop[1:9])); e # observed (1971 deaths & pop, followed by 2002) o=c(sum(ds$dac[1:9]),sum(ds$pop[1:9]),sum(ds$dac[10:18]),sum(ds$pop[10:18])); o # crude rates, and the 2002/1971 ratio round(c(o[1]/o[2], o[3]/o[4], (o[3]/o[4]) / (o[1]/o[2]) ),5) # standardized.rates, and the 2002/1971 ratio round(c( e[1]/e[3], e[2]/e[3], (e[2]/e[3]) / (e[1]/e[3]) ),5) # age-specific rate ratios, 2002/1971 round( ( ds$dac[10:18]/ds$pop[10:18] ) / ( ds$dac[1:9]/ds$pop[1:9] ) ,2) ### some multiplicative models for rates ########################### glm(dac~as.factor(agefrom) + yrs.since71, family=poisson,offset=log(pop),data=ds) glm(dac~as.factor(agefrom) + as.factor(yrs.since71), family=poisson,offset=log(pop),data=ds) # exp(beta_hat for 2002 vs 1991) ... the 10th coefficient.. exp(coef(glm(dac~as.factor(agefrom) + as.factor(yrs.since71), family=poisson,offset=log(pop),data=ds)))[10] # using age as a 'continuous' variable [as Gompertz did] glm(dac~ agefrom + as.factor(yrs.since71), family=poisson,offset=log(pop),data=ds) # etc etc ... # males; # same structure, except, instead of dac dlungca dbrca, # it is dac dprca [Prostate cancer] dlungca -- note also different order of variables males=read.table("q_m_mortality_data.txt",header=TRUE) # -------- for comparisons of males vs females # as in computing in general, several ways to 'get there'.. e.g # first extract variables of interest in each dataset females=read.table("q_f_mortality_data.txt",header=TRUE) f.ds =females[females$year == 2 & females$agefrom >= 40 & females$agefrom <= 80 , c(1,3,4,7)] ; f.ds males =read.table("q_m_mortality_data.txt",header=TRUE) m.ds =males[males$year == 2 & males$agefrom >= 40 & males$agefrom <= 80 , c(1,3,4,7)] ; m.ds # concatenate them vertically so 9 + 9= 18 observations ds = rbind(f.ds,m.ds) ; ds; # rbind puts rows under each other # cbind puts columns beside each other # and use an indicator variable for males # this way, never have to check which is sex=1 and sex=2 (or 1 and 0) # i.male = iNDICATOR of male -- no more jokes about dummy vraiables. # If someone overheard you talking about sex and dummy variables, # what would they think you were referring to? # much more sensible to speak of an indicator variable for males # [or for females if you prefer .. jh chose males in this particular # example because he suspects the coefficient will be positive, # and its easier to deal with exp(beta_hat) > 1 and so IDR > 1 ] # 'indicator of male' also avoids the argument as to whether it # should be sex or gender -- finding sex=1 and sex=2 # in a dataset is statistically ambiguous to say the least! ds[,"i.male"] = 2 - ds[,"sex"] ; ds # Fit Gompertz model -- parallel lines for log(rate) vs age glm(dac~agefrom+i.male,family=poisson,offset=log(pop),data=ds) round(exp(coef(glm(dac~agefrom+i.male, family=poisson,offset=log(pop),data=ds))), 6) # better if we start age at 40, so intercept means something.. ds[,"years.above.40"] = ds[,"agefrom"]-40 ; ds # Fit Gompertz model -- parallel lines for log(rate) vs age glm(dac~years.above.40 + i.male,family=poisson,offset=log(pop),data=ds) # Fit Gompertz models with separate ie sex-specific # straight lines for log(rate) vs age # (effect modification on the log scale) summary(glm(dac~years.above.40 + i.male + years.above.40*i.male, family=poisson,offset=log(pop),data=ds)) # does product term (so we have different slopes for different folks ) # improve the fit ? # (of course, this assumes that straight line fits are adequate) # plotting... (some examples) fitted.rates = (1/ds$pop) * fitted(glm(dac~years.above.40 + i.male + years.above.40*i.male, family=poisson,offset=log(pop),data=ds)) observed.rates = (1/ds$pop) * ds$dac plot(ds$years.above.40,fitted.rates) plot(ds$years.above.40[1:9],log(fitted.rates)[1:9],"l",col="red", ylim=c(-7,-2) ) lines(ds$years.above.40[10:18],log(fitted.rates)[10:18],col="blue") points(ds$years.above.40[1:9],log(observed.rates)[1:9],col="red",cex=2) points(ds$years.above.40[10:18],log(observed.rates)[10:18],col="blue",cex=2) # etc etc # --------------- Source for data: --------------------------- # Ministere de la Sante et des Services sociaux (MSSS). # Statistiques sur les causes de mortalite au Quebec: 1971, 1976, 1980-92. # Quebec: MSSS, 1994. # Tableau 75 La population selon l'age # Tableau composé à partir des données transmises par monsieur # Gilles Pelletier du Service des études opérationnelles et données #statistiques du ministère de la Santé et des Services sociaux. # Les données ont été générées de la façon suivante: #> 1971: estimation de la population, Statistique Canada, # corrigées du sous-dénombrement, publiées le 22 octobre 1993; #> 1976-1980 et 1982-1985: estimations du Bureau de la statistique du Québec, # corrigées du sous-dénombrement; #> 1987-1990 estimations du ministère de Santé et des Services sociaux # corrigées selon la méthode du Bureau de la statistique du Québec; #> 1981, 1986 et 1991: recensement canadien, # corrigé par le Bureau de la statistique du Québec; #> 1992-1996: projections provisoires du Bureau de # la statistique du Québec de mars 1993. # http://www.stat.gouv.qc.ca/donstat/societe/demographie/struc_poplt/201_02.htm # http://www.stat.gouv.qc.ca/donstat/societe/demographie/naisn_deces/index.htm#deces #Tableau 2 Nombre de deces pour ensemble des causes selon l'age #Tableau 17 Nombre de deces par tumeur maligne de la tranchee, # des bronches et du poumons (CODE CIM-9: 162) selon l'age #Tableau 19 Nombre de deces par tumeur maligne du sein # (CODE CIM-9: 174) selon l'age #Tableau 21 Nombre de deces par tumeur maligne de la prostate # (CODE CIM-9: 174) selon l'age