/*
Code Sheet for the Variables in the Low Birth Weight Study
Described in Section 1.6.2 Page 25-26, 2nd edition of ALR

Variable        Description           Codes/Values          Name

 1 Identification Code           ID Number             ID

 2 Low Birth Weight              1 = BWT<=2500g,       LOW
                                 0 = BWT>2500g

 3 Age of Mother                 Years                 AGE

 4 Weight of Mother at           Pounds                LWT
  Last Menstrual Period

 5 Race                          1 = White, 2 = Black  RACE
                                 3  = Other

 6 Smoking Status                0 = No, 1 = Yes       SMOKE
   During Pregnancy
 7 History of Premature Labor    0 = None, 1 = One,    PTL
                                 2 = Two, etc.

 8 History of Hypertension       0 = No, 1 = Yes       HT

 9 Presence of Uterine           0 = No, 1 = Yes       UI
  Irritability

10 Number of Physician Visits    0 = None, 1 = One,    FTV
   During the First Trimester    2 = Two, etc.

11 Birth Weight                  Grams                 BWT

*/

/*  --- for SAS ---  */

/* given the size of the raw data file, better to keep
   data separate from the program.. so use INFILE
   rather than LINES

   so download the lowbwt.dat file , save it somewhere
   on the hard disk (remember the path!), then have the
   INFILE statement point to it... ie give the full path

   e.g. if you store the .dat file in sub-directory or folder
        called c:\681folder\ , path would be "c:\681folder\lowbwt.dat"

   The MISSOVER option on INFILE is an important safeguard: it stops
   SAS from going on to the next line of raw data in order to find
   as many data items as there are variables in the INPUT statement
   (e.g. if for some reason a record had blank fields).
   With MISSOVER, the damage is limited to the offending record.
*/

/* Listing layout: 85 characters per line, 60 lines per page */
/* (LS= and PS= are the short forms of LINESIZE= and PAGESIZE=) */
OPTIONS LS=85
        PS=60;
RUN;

DATA lowbwt;   /* use a 2-part name (libref.lowbwt) to create a permanent  */
               /* dataset rather than re-creating the dataset each time;   */
               /* next time you would read it from the library instead of  */
               /* re-running this DATA step                                */

 /* Windows example (commented out): edit the path to wherever you saved  */
 /* lowbwt.dat, then swap it with the INFILE statement below              */
*INFILE "c:\681folder\lowbwt.dat" MISSOVER;

 /* MISSOVER: a record with too few fields gets missing values for the    */
 /* remaining variables instead of SAS reading on into the next line      */
INFILE "Macintosh HD:User:dad:courses:681:alr_2:lowbwt.dat" MISSOVER;

 /* list input: 11 blank-separated numeric fields, in file order          */
INPUT ID low  age  lwt  race smoke  ptl ht  ui ftv  bwt ;

 /* subsetting IF: drop the observation when a blank line was read        */
 /* (ID comes back missing); nothing below runs for such a record         */
 IF id ne . ;

 /* make our OWN indicator variables for race (reference = 1, White)      */
 /* and be more memorable than 'race_2 and race_3' !                      */
 /*   ir_black : <<i>>ndicator of <<r>>ace=black                          */
 /*   ir_other : <<i>>ndicator of <<r>>ace=other                          */
 /* A missing race must give MISSING indicators, not 0: otherwise the     */
 /* record would silently be counted in the reference (White) category.   */
 IF race = . THEN DO;
    ir_black = . ;
    ir_other = . ;
 END;
 ELSE DO;
    ir_black = (race = 2);   /* comparison evaluates to 1 if true, else 0 */
    ir_other = (race = 3);
 END;

RUN;

* -------------------------------------------------------;

/* Descriptive statistics for every analysis variable, as a check on the  */
/* data that were read and on the derived race indicators.                */
/* (PROC UNIVARIATE is a useful alternative.)                             */
TITLE1 CHECK the Raw data and coding;

PROC MEANS DATA=lowbwt;
 VAR low age lwt race
     ir_black ir_other
     smoke ptl ht ui ftv
     bwt ;
RUN;

/* Frequency tables for the categorical variables and counts, followed by */
/* race cross-tabulated with each indicator to verify the recoding.       */
PROC FREQ DATA=lowbwt;
 TABLES low race smoke ptl ui ftv ;        /* one-way tables              */
 TABLES race*(ir_black ir_other) ;         /* race*ir_black race*ir_other */
RUN;

* -------------------------------------------------------;

TITLE1 Table 2.2 FIT Logistic regression by PROC GENMOD;
TITLE2 (Generalized linear model: specify distribution & link) ;

/* Logistic regression fitted as a generalized linear model:              */
/* binomial distribution with a logit link.                               */
/* DESCENDING is needed so that the model is for prob[low=1]; without it  */
/* GENMOD models the lower-ordered response level (low=0) and every       */
/* coefficient comes out with the opposite sign to Table 2.2.             */
/* COVB / CORRB print the covariance and correlation matrices of the      */
/* estimated coefficients.                                                */
PROC GENMOD DATA = lowbwt DESCENDING;
 MODEL low = age lwt ir_black ir_other ftv /
          DIST=BINOMIAL LINK=LOGIT  COVB CORRB ;
RUN;

* -------------------------------------------------------;

TITLE1 Table 2.2   FIT Logistic regression by PROC LOGISTIC;
TITLE2 use DESCENDING to make sure it models prob[low=1];
TITLE3 store predicted values in a new variable ;
TITLE4 Ask for variance-covariance (and correlation) matrix  ;
TITLE5 of estimated coefficients: use COVB and CORRB options ;
TITLE6 check against Table 2.2                          ;

/* Same five-covariate model, this time with PROC LOGISTIC.               */
/*   DESC  = DESCENDING : model prob[low=1]                               */
/*   P=    = PREDICTED= : fitted probabilities go to fitted_p in WORK.b   */
PROC LOGISTIC DATA = lowbwt DESC;
 MODEL low = age lwt ir_black ir_other ftv
       / COVB CORRB ;
 OUTPUT OUT = b
        P   = fitted_p;
RUN;

TITLE1 Table 2.3  FIT Logistic regression by PROC LOGISTIC;
TITLE2 .;
TITLE3 ...................LWT and race     ;
TITLE4 .  ;

/* Reduced model: mother's weight plus the two race indicators.           */
/*   DESC = DESCENDING : model prob[low=1]                                */
/*   P=   = PREDICTED= : fitted probabilities go to fitted_p              */
/* Note: OUT = b replaces any dataset b left by an earlier step.          */
PROC LOGISTIC DATA = lowbwt DESC;
 MODEL low = lwt ir_black ir_other
       / COVB CORRB ;
 OUTPUT OUT = b
        P   = fitted_p;
RUN;



* -------------------------------------------------------;


/* --- Stata section ---  */

/* given the size of the raw data file, better to keep
   data separate from the program.. so use infile
   rather than input

   so download the lowbwt.dat file , save it somewhere
   on the hard disk (remember the path!), then use the
   'Set working Folder' (under Stata's File menu) to 
   point to the folder where it is stored
   
*/

* start

* begin with no data in memory
clear

* free-format read: 11 numeric variables per record, in file order
* (lowbwt.dat is looked for in the current working folder)
infile id low age lwt race smoke ptl ht ui ftv bwt using lowbwt.dat

* Build our OWN 0/1 indicators for race, with names that are easier
* to remember than 'race_2 and race_3':
*
*   ir_black   <<i>>ndicator of <<r>>ace=black   (race == 2)
*   ir_other   <<i>>ndicator of <<r>>ace=other   (race == 3)
*
* The if-qualifier leaves the indicator missing when race is missing
* (numeric missing values sort above every number, hence race < .)
generate ir_black = (race == 2) if race < .
generate ir_other = (race == 3) if race < .

* -------------------------------------------------------;

* CHECK the Raw data and coding

* descriptive statistics for every analysis variable
summarize low age lwt race ir_black ir_other smoke ptl ht ui ftv bwt

* one-way frequency tables
tab1 low race smoke ptl ui ftv

* race against each indicator, with column percentages,
* to confirm the recoding
foreach ind of varlist ir_black ir_other {
    tab2 race `ind', column
}


* Table 2.2: logistic regression fitted as a generalized linear model,
* i.e. by specifying the distribution (binomial) and the link (logit)

glm low age lwt ir_black ir_other ftv, link(logit) family(binomial)

* variance-covariance (and correlation) matrix of the estimated
* coefficients:
*   vce        covariance matrix
*   vce, corr  the same matrix on the correlation scale

vce

vce, corr

* Table 2.2 again, this time with the logistic command
* (check the results against Table 2.2)

* logistic reports odds ratios ...
logistic low age lwt ir_black ir_other ftv

* ... typing logit with no arguments redisplays the same fit
* with coefficients on the logit scale
logit

* covariance, then correlation, matrix of the estimated coefficients
vce
vce, corr

* store the predicted probabilities in a new variable p
* (pr, the probability of a positive outcome, is the default statistic)
predict p, pr


* Table 2.3  FIT Logistic regression by logistic
* lwt and race only

* logistic reports odds ratios ...
logistic low  lwt ir_black ir_other

* ... typing logit with no arguments redisplays the same fit
* with coefficients on the logit scale
logit

* covariance, then correlation, matrix of the estimated coefficients
vce
vce, corr

* predict will not overwrite an existing variable, and p already
* holds the fitted probabilities from the Table 2.2 model, so the
* Table 2.3 probabilities go into a variable of their own
* (a second "predict p" stops with "variable p already defined")
predict p_lwt_race