/* ILLUSTRATION OF PARTIAL REGRESSION PLOTS
   with a Y variable measured on a numerical scale..
   i.e., Y = birthweight and PROC REG (SAS) or regress (Stata)

   NOTE: this file holds two independent programs -- a SAS section
   and a Stata section.  Run each section in its own package.      */

/*
Code Sheet for the Variables in the Low Birth Weight Study
Described in Section 1.6.2 Page 25-26, 2nd edition of ALR

Variable  Description                    Codes/Values           Name
--------  -----------------------------  ---------------------  -----
 1        Identification Code            ID Number              ID
 2        Low Birth Weight               1 = BWT<=2500g,        LOW
                                         0 = BWT>2500g
 3        Age of Mother                  Years                  AGE
 4        Weight of Mother at            Pounds                 LWT
          Last Menstrual Period
 5        Race                           1 = White, 2 = Black   RACE
                                         3 = Other
 6        Smoking Status                 0 = No, 1 = Yes        SMOKE
          During Pregnancy
 7        History of Premature Labor     0 = None, 1 = One,     PTL
                                         2 = Two, etc.
 8        History of Hypertension        0 = No, 1 = Yes        HT
 9        Presence of Uterine            0 = No, 1 = Yes        UI
          Irritability
10        Number of Physician Visits     0 = None, 1 = One      FTV
          During the First Trimester     2 = Two, etc.
11        Birth Weight                   Grams                  BWT
*/

/* --- for SAS --- */

/*
 given the size of the raw data file, better to keep data separate
 from the program.. so use INFILE rather than LINES

 so download the lowbwt.dat file, save it somewhere on the hard disk
 (remember the path!), then have the INFILE statement point to it...
 ie give the full path

 e.g. if you store the .dat file in a sub-directory or folder called
 c:\681folder\ , the path would be "c:\681folder\lowbwt.dat"

 The MISSOVER option in INFILE is an important safeguard against SAS
 going on to the next line of raw data in order to find as many data
 items as there are variables in the INPUT statement (e.g. if for some
 reason you had blank fields).  With MISSOVER, you can limit the damage
 to the offending record.
*/

OPTIONS LINESIZE=75 PAGESIZE=40 ;   /* change #chars/line #lines/page */
RUN;

DATA lowbwt;   /* make a 2-part name if wish to create perm. file   */
               /* rather than re-creating the dataset each time     */
               /* then next time would use library instead of DATA step */

  *INFILE "c:\681folder\lowbwt.dat" MISSOVER ;
  INFILE "Macintosh HD:User:dad:courses:681:alr_2:lowbwt.dat" MISSOVER;

  INPUT ID low age lwt race smoke ptl ht ui ftv bwt ;

  /* make our OWN indicator variables for race                  */
  /* and be more memorable than 'race_2 and race_3' !           */
  /*   ir_black : I(ndicator) of R(ace) = black                 */
  /*   ir_other : I(ndicator) of R(ace) = other                 */
  /* Built only when race is known, so that a record with a     */
  /* missing race is not silently coded as white (0,0).  This   */
  /* matches the "if !missing(race)" guard in the Stata section. */
  /* When race is missing the indicators stay missing.          */
  IF race ne . THEN DO;
    ir_black = (race = 2);
    ir_other = (race = 3);
  END;

  IF id ne . ;   /* skips obsn. if my Mac sees blank line */
RUN;

* -------------------------------------------------------;

TITLE FIT regression by PROC REG [ focus on lwt ] ;

/* multivariable model; PARTIAL requests the partial regression plots */
PROC REG DATA = lowbwt ;
  MODEL bwt = ir_black ir_other smoke lwt / PARTIAL ;
RUN;

* DOUBLE CHECK... using continuous lwt as the X under scrutiny ;

* (1);
PROC REG DATA = lowbwt ;
  MODEL bwt = ir_black ir_other smoke ;   /* OMIT lwt                      */
  OUTPUT OUT = b residual = y_res;        /* get y residuals from other Xs */
RUN;

* (2);
/* reads dataset b so that y_res is carried along into dataset c */
PROC REG DATA = b ;
  MODEL lwt = ir_black ir_other smoke ;   /* what remains of lwt after     */
  OUTPUT OUT = c residual = x_res;        /* predicting lwt from other Xs  */
RUN;

* (3);
/* now regress (1) on (2) and see that it gives
   same slope 3.934 for lwt as in multivariable model */
PROC REG DATA = c ;
  MODEL y_res = x_res ;
RUN;

PROC PLOT DATA = c ;
  PLOT y_res * x_res ;
RUN;

* -------------------------------------------------------;
* -------------------------------------------------------;

/* ILLUSTRATION OF assessing linearity of logits
   with a Y variable measured on a binary scale..
   i.e., Y = low (low birth weight indicator) and X = lwt
   PROC LOGISTIC or GENMOD (SAS) */

* -------------------------------------------------------;

TITLE Treat lwt as linear ;

PROC LOGISTIC DATA=lowbwt DESCENDING;
  MODEL low = ir_black ir_other smoke lwt ;
RUN;

/* distribution of lwt -- presumably the source of the cutpoints
   110 / 121 / 140 used below (confirm against the quartiles printed) */
PROC UNIVARIATE DATA = lowbwt;
  VAR lwt;
RUN;

DATA with_q;
  SET lowbwt;
  /* Indicator variables for four lwt categories.
     Guard against missing lwt: SAS treats a missing value as smaller
     than any number, so an unguarded "lwt < 110" would put a record
     with missing lwt into category 1 (the reference category).
     With the guard, all four indicators stay missing instead.       */
  IF lwt ne . THEN DO;
    lwt_q1 = (lwt < 110);
    lwt_q2 = (lwt >= 110 and lwt < 121);
    lwt_q3 = (lwt >= 121 and lwt < 140);
    lwt_q4 = (lwt >= 140);
  END;
RUN;

/* replace the earlier title, which no longer describes the model */
TITLE "Treat lwt as categorical (lwt_q1 = reference category)" ;

/* lwt_q1 is left out of the MODEL, so it is the reference category */
PROC LOGISTIC DATA=with_q DESCENDING OUTEST = betas;
  MODEL low = ir_black ir_other smoke lwt_q2 lwt_q3 lwt_q4 / cl ;
RUN;

* -------------------------------------------------------;

/* --- Stata section --- */

/*
 given the size of the raw data file, better to keep data separate
 from the program.. so use infile rather than input

 so download the lowbwt.dat file, save it somewhere on the hard disk
 (remember the path!), then use the 'Set working Folder' (under
 Stata's File menu) to point to the folder where it is stored
*/

* start
clear
infile id low age lwt race smoke ptl ht ui ftv bwt using lowbwt.dat

* make our OWN indicator variables for race
* and be more memorable than 'race_2 and race_3' !
*
* ir_black   I(ndicator) of R(ace) = black
* ir_other   I(ndicator) of R(ace) = other
*
gen ir_black = race == 2 if !missing(race)
gen ir_other = race == 3 if !missing(race)

* -------------------------------------------------------;

* multivariable model (lwt listed once -- it was duplicated in the varlist)
regress bwt ir_black ir_other smoke lwt

* added-variable (partial regression) plot for lwt
avplot lwt

* DOUBLE CHECK... using continuous lwt as the X under scrutiny ;

* (1)
regress bwt ir_black ir_other smoke   /* OMIT lwt */
predict y_res , residual              /* get y residuals from other Xs */

* (2)
regress lwt ir_black ir_other smoke   /* what remains of lwt after */
predict x_res , residual              /* predicting lwt from other Xs */

* (3) now regress (1) on (2) and see that it gives
*     same slope 3.934 for lwt as in multivariable model
regress y_res x_res

* NOTE(review): -plot- looks like the legacy text-mode scatterplot command;
* in newer Stata -scatter y_res x_res- should be the graphical
* equivalent -- confirm for your Stata version
plot y_res x_res