setwd("/Users/jameshanley/Dropbox/Courses/bios601/CaseControlStudies/BirthMonthSports")


# model for success rate,  

# lambda = E(NHL)/1Million births (say)

#   data 

#### model for expected data

# E[NHL | month of birth ] = lambda[month] * BIRTHS.that.month

#  form of lambda[month] function is for user to decide..

# first, the numerators 

SKIP=FALSE
if(SKIP) {	

  ds = read.csv("NHLnameB.csv",header=TRUE,as.is=rep(TRUE,11))
  head(ds); tail(ds) ; length(ds$Birthplace)

  CDN = c( grep("BC",ds$Birthplace) ,
         grep("ALTA",ds$Birthplace),
         grep("SASK",ds$Birthplace),
         grep("MAN",ds$Birthplace),
         grep("ONT",ds$Birthplace),
         grep("PQ",ds$Birthplace),
         grep("NB",ds$Birthplace),
         grep("NS",ds$Birthplace),
         grep("PEI",ds$Birthplace),
         grep("NFLD",ds$Birthplace),
         grep("NW",ds$Birthplace),
         grep("Yuk",ds$Birthplace)
            
       )
  ds=ds[CDN,]
  head(ds); tail(ds) ; length(ds$Birthplace)
  ds$month = as.numeric(substr(ds$Birthdate,1,2))
  table(ds$month)
  num.B = as.numeric(table(ds$month))
}


num.B     = c(63,29,51,41,48,39,35,37,44,38,30,34)
num.C     = c(50,53,38,53,46,38,42,31,25,34,37,36); 
num.D     = c(22,40, 8,27,15,20,24,28,20,19,12,21) ; # Shuyan 
num.H     = c(41,33,30,30,31,33,27,18,21,24,29,22);  # Terry
num.J     = c( 7, 9,13,12,10,16, 8, 6, 6, 6, 8, 7);  # Golsa 
num.P     = c(37,34,39,38,27,29,26,23,38,25,24,28);  # num.P
num.L     = c(32,21,33,31,27,27,19,18,20,23,28,17);  # num.L
num.M     = c(56,51,55,53,41,49,42,42,50,43,36,35);  # Yi 
num.S     = c(38,56,42,38,48,29,37,29,38,30,34,37);  # Yi

NHL = num.B+num.C+num.D+num.H+num.J+num.P+num.L+num.M+num.S

NHL

############ Denominators

## What if we DO have populations based denominarors

## these data are not well synchronized (they referr o recent births),
## are used her only for didactic illustrative purposes

All = read.csv("BirthsMonthYearCanada.csv",header=TRUE)
str(All)
B=apply(All[2:13,],1,sum) ; B ; sum(B)

B=c(485587,461859,524724,519407,541186,521102,
    535241,521685,522027,500305,467838,473563);


month=-5:6  # whatever centering you want...

sum(NHL)/sum(B)  # grade 76 estimate 
glm(NHL ~ -1+B, family=poisson(link="identity"))

glm(NHL ~ 1+offset(log(B)), family=poisson)

glm(NHL ~ 1+month+offset(log(B)), family=poisson)

#  etc to be completed by student...


## what if we do not have B, but obtain a denominator sample
## that we hope is representative and unbiased...

## from http://en.wikipedia.org/wiki/List_of_current_Canadian_senators_by_age
## accessed feb 2015

Senators = read.csv("CanadianSenators.csv",header=TRUE,as.is=TRUE)
head(Senators); tail(Senators)
str(Senators)

Senators$MonthOfBirth = substr(Senators$DateOfOfbirth,4,6)

convert=function(mmm) {
	result=NA
	if(mmm !="") result=which(month.abb==mmm)
	return(result)
}
convert("Dec")
Senators$Month = unlist( lapply(Senators$MonthOfBirth,convert) )
sens = table( Senators$Month )

### from Feb 2015
## http://en.wikipedia.org/wiki
##    List_of_current_members_of_the_Canadian_House_of_Commons_by_age

MPs = read.csv("CanadianMPs.csv",header=TRUE,as.is=TRUE)
head(MPs); tail(MPs)
str(MPs)

mps = table( substr(MPs$Date.of.Birth,6,7) )

# sample.of.Births

b  = sens+mps[2:13]

### regression approaches

## SE's for parameter estimates of parameters in rate model
## need to recognize fact that  sample.of.Births is a Sample,
## it is not like B. 

## indeed, the denom series is SMALLER than num (NHL) series

## to be filled in by student.

## ..........


# ============================================================

### other non-regression (graphical)  apparoaches, B data...