setwd("/Users/jameshanley/Dropbox/Courses/bios601/CaseControlStudies/BirthMonthSports") # model for success rate, # lambda = E(NHL)/1Million births (say) # data #### model for expected data # E[NHL | month of birth ] = lambda[month] * BIRTHS.that.month # form of lambda[month] function is for user to decide.. # first, the numerators SKIP=FALSE if(SKIP) { ds = read.csv("NHLnameB.csv",header=TRUE,as.is=rep(TRUE,11)) head(ds); tail(ds) ; length(ds$Birthplace) CDN = c( grep("BC",ds$Birthplace) , grep("ALTA",ds$Birthplace), grep("SASK",ds$Birthplace), grep("MAN",ds$Birthplace), grep("ONT",ds$Birthplace), grep("PQ",ds$Birthplace), grep("NB",ds$Birthplace), grep("NS",ds$Birthplace), grep("PEI",ds$Birthplace), grep("NFLD",ds$Birthplace), grep("NW",ds$Birthplace), grep("Yuk",ds$Birthplace) ) ds=ds[CDN,] head(ds); tail(ds) ; length(ds$Birthplace) ds$month = as.numeric(substr(ds$Birthdate,1,2)) table(ds$month) num.B = as.numeric(table(ds$month)) } num.B = c(63,29,51,41,48,39,35,37,44,38,30,34) num.C = c(50,53,38,53,46,38,42,31,25,34,37,36); num.D = c(22,40, 8,27,15,20,24,28,20,19,12,21) ; # Shuyan num.H = c(41,33,30,30,31,33,27,18,21,24,29,22);  # Terry num.J = c( 7, 9,13,12,10,16, 8, 6, 6, 6, 8, 7); # Golsa  num.P = c(37,34,39,38,27,29,26,23,38,25,24,28); # num.P num.L = c(32,21,33,31,27,27,19,18,20,23,28,17); # num.L num.M = c(56,51,55,53,41,49,42,42,50,43,36,35); # Yi  num.S = c(38,56,42,38,48,29,37,29,38,30,34,37); # Yi NHL = num.B+num.C+num.D+num.H+num.J+num.P+num.L+num.M+num.S NHL ############ Denominators ## What if we DO have populations based denominarors ## these data are not well synchronized (they referr o recent births), ## are used her only for didactic illustrative purposes All = read.csv("BirthsMonthYearCanada.csv",header=TRUE) str(All) B=apply(All[2:13,],1,sum) ; B ; sum(B) B=c(485587,461859,524724,519407,541186,521102, 535241,521685,522027,500305,467838,473563); month=-5:6 # whatever centering you want... sum(NHL)/sum(B) # grade 76 estimate glm(NHL ~ -1+B, family=poisson(link="identity")) glm(NHL ~ 1+offset(log(B)), family=poisson) glm(NHL ~ 1+month+offset(log(B)), family=poisson) # etc to be completed by student... ## what if we do not have B, but obtain a denominator sample ## that we hope is representative and unbiased... ## from http://en.wikipedia.org/wiki/List_of_current_Canadian_senators_by_age ## accessed feb 2015 Senators = read.csv("CanadianSenators.csv",header=TRUE,as.is=TRUE) head(Senators); tail(Senators) str(Senators) Senators$MonthOfBirth = substr(Senators$DateOfOfbirth,4,6) convert=function(mmm) { result=NA if(mmm !="") result=which(month.abb==mmm) return(result) } convert("Dec") Senators$Month = unlist( lapply(Senators$MonthOfBirth,convert) ) sens = table( Senators$Month ) ### from Feb 2015 ## http://en.wikipedia.org/wiki ## List_of_current_members_of_the_Canadian_House_of_Commons_by_age MPs = read.csv("CanadianMPs.csv",header=TRUE,as.is=TRUE) head(MPs); tail(MPs) str(MPs) mps = table( substr(MPs$Date.of.Birth,6,7) ) # sample.of.Births b = sens+mps[2:13] ### regression approaches ## SE's for parameter estimates of parameters in rate model ## need to recognize fact that sample.of.Births is a Sample, ## it is not like B. ## indeed, the denom series is SMALLER than num (NHL) series ## to be filled in by student. ## .......... # ============================================================ ### other non-regression (graphical) apparoaches, B data...