/* --- for SAS --- */ /* given the size of the raw data file, better to keep data separate from the program.. so use INFILE rather than LINES so download the chdage.dat file , save it somewhere on the hard disk (remember the path!), then have the INFILE statement point to it... ie give the full path e.g. if you store the .dat file in sub-directory or folder called c:\681folder\ , path would be "c:\681folder\chdage.dat" MOSSOVER option in infile is important safeguard against SAS taking going into next line of raw data in order to find as many data items as there are variables in the INPUT statement (eg if for some reason you had blank fields With MISSOVER, can limit the damage to the offending record. */ OPTIONS LINESIZE=85 PAGESIZE=60 ; * change #chars/line #lines/page */ RUN; DATA chdage; /* make a 2-part name if wish to create perm. file */ /* rather than re-creating the dataset each time */ /* then next time would use library instead of DATA step */ *INFILE "c:\681folder\chdage.dat" ; INFILE "Macintosh HD:User:dad:courses:681:alr_1:chdage.dat" MISSOVER; INPUT id age chd; /* create age categories (some other, faster ways too) */ if 20 <= age <= 29 then age_mid = 25 ; if 30 <= age <= 34 then age_mid = 32 ; if 35 <= age <= 39 then age_mid = 37 ; if 40 <= age <= 44 then age_mid = 42 ; if 45 <= age <= 49 then age_mid = 47 ; if 50 <= age <= 54 then age_mid = 52 ; if 55 <= age <= 59 then age_mid = 57 ; if 60 <= age <= 69 then age_mid = 65 ; IF id ne . ; RUN; * -------------------------------------------------------; TITLE PLOT the Raw data ; PROC PLOT DATA=chdage; PLOT chd * age / HPOS=75 VPOS=15 ; RUN; * -------------------------------------------------------; TITLE Table 1.2 CHD vs Categorised ages; PROC FREQ DATA=chdage; TABLE age_mid * chd / NOCL NOPERCENT ; RUN; * -------------------------------------------------------; TITLE1 make means (proportions) of chd by age_mid; TITLE2 put into new file called summary; PROC MEANS DATA=chdage; CLASS age_mid; VAR chd ; output out = summary MEAN=prev_chd ; /* mean is a prevalence proportion */ RUN; PROC PRINT DATA=summary; RUN; * -------------------------------------------------------; TITLE PLOT the data with age categorized; PROC PLOT DATA = summary; PLOT prev_chd * age_mid / HPOS=75 VPOS=25 ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC GENMOD; TITLE2 GENMOD fits Generalized linear models ; PROC GENMOD DATA = chdage; MODEL chd = age / DIST = BINOMIAL LINK = LOGIT ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 coefficients have wrong sign... ; TITLE3 see message in SAS log window .. LOGISTIC is modeling prob[chd=0] ; PROC LOGISTIC DATA = chdage; MODEL chd = age ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 use DESCENDING to make sure it models prob[chd=1]; TITLE3 check against Table 1.3 ; PROC LOGISTIC DATA = chdage DESCENDING; MODEL chd = age ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 use DESCENDING to make sure it models prob[chd=1]; TITLE3 store predicted values in a new varaible ; PROC LOGISTIC DATA = chdage DESCENDING; MODEL chd = age ; OUTPUT OUT = b PREDICTED= fitted_p; RUN; * ---------------------------- ; TITLE1 PLOT the prevalences fiited by PROC LOGISTIC; TITLE2 If you wish, use high-res ; PROC PLOT DATA = b; PLOT fitted_p * age / HPOS=75 VPOS=45 ; RUN; * ---------------------------- ; TITLE1 using high-res.. see onlinedoc for all the options ; PROC GPLOT DATA = b; PLOT fitted_p * age ; RUN; * -------------------------------------------------------; TITLE1 combine fitted points for smooth curve (in b) ; TITLE2 with observed prevalences (in summary); PROC SORT DATA = b; BY age_mid; PROC SORT DATA = summary; BY age_mid; data c; MERGE b summary; BY age_mid; RUN; * ---------------------------- ; TITLE1 Overlay the observed and fitted prevalences; PROC PLOT DATA = c; PLOT prev_chd * age="O" fitted_p * age = "F" / OVERLAY HPOS=65 VPOS=40 ; WHERE (age = age_mid); RUN; * ---------------------------- ; TITLE1 using high-res.. see onlinedoc for all the options ; PROC GPLOT DATA = c; PLOT prev_chd * age fitted_p * age / OVERLAY ; WHERE (age = age_mid); RUN; * -------------------------------------------------------; /* --- Stata section --- */ /* --- Stata section --- */ /* --- Stata section --- */ /* --- for Stata --- */ /* given the size of the raw data file, better to keep data separate from the program.. so use INFILE rather than LINES so download the chdage.dat file , save it somewhere on the hard disk (remember the path!), then have the INFILE statement point to it... ie give the full path e.g. if you store the .dat file in sub-directory or folder called c:\681folder\ , path would be "c:\681folder\chdage.dat" */ *INFILE "c:\681folder\chdage.dat" ; INFILE "Macintosh HD:User:dad:courses:681:alr_1:chdage.dat" MISSOVER; INPUT id age chd; /* create age categories (some other, faster ways too) */ if 20 <= age <= 29 then age_mid = 25 ; if 30 <= age <= 34 then age_mid = 32 ; if 35 <= age <= 39 then age_mid = 37 ; if 40 <= age <= 44 then age_mid = 42 ; if 45 <= age <= 49 then age_mid = 47 ; if 50 <= age <= 54 then age_mid = 52 ; if 55 <= age <= 59 then age_mid = 57 ; if 60 <= age <= 69 then age_mid = 65 ; IF id ne . ; RUN; * -------------------------------------------------------; TITLE PLOT the Raw data ; PROC PLOT DATA=chdage; PLOT chd * age / HPOS=75 VPOS=15 ; RUN; * -------------------------------------------------------; TITLE Table 1.2 CHD vs Categorised ages; PROC FREQ DATA=chdage; TABLE age_mid * chd / NOCL NOPERCENT ; RUN; * -------------------------------------------------------; TITLE1 make means (proportions) of chd by age_mid; TITLE2 put into new file called summary; PROC MEANS DATA=chdage; CLASS age_mid; VAR chd ; output out = summary MEAN=prev_chd ; /* mean is a prevalence proportion */ RUN; PROC PRINT DATA=summary; RUN; * -------------------------------------------------------; TITLE PLOT the data with age categorized; PROC PLOT DATA = summary; PLOT prev_chd * age_mid / HPOS=75 VPOS=25 ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC GENMOD; TITLE2 GENMOD fits Generalized linear models ; PROC GENMOD DATA = chdage; MODEL chd = age / DIST = BINOMIAL LINK = LOGIT ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 coefficients have wrong sign... ; TITLE3 see message in SAS log window .. LOGISTIC is modeling prob[chd=0] ; PROC LOGISTIC DATA = chdage; MODEL chd = age ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 use DESCENDING to make sure it models prob[chd=1]; TITLE3 check against Table 1.3 ; PROC LOGISTIC DATA = chdage DESCENDING; MODEL chd = age ; RUN; * -------------------------------------------------------; TITLE1 FIT Logistic regression by PROC LOGISTIC; TITLE2 use DESCENDING to make sure it models prob[chd=1]; TITLE3 store predicted values in a new varaible ; PROC LOGISTIC DATA = chdage DESCENDING; MODEL chd = age ; OUTPUT OUT = b PREDICTED= fitted_p; RUN; * ---------------------------- ; TITLE1 PLOT the prevalences fiited by PROC LOGISTIC; TITLE2 If you wish, use high-res ; PROC PLOT DATA = b; PLOT fitted_p * age / HPOS=75 VPOS=45 ; RUN; * ---------------------------- ; TITLE1 using high-res.. see onlinedoc for all the options ; PROC GPLOT DATA = b; PLOT fitted_p * age ; RUN; * -------------------------------------------------------; TITLE1 combine fitted points for smooth curve (in b) ; TITLE2 with observed prevalences (in summary); PROC SORT DATA = b; BY age_mid; PROC SORT DATA = summary; BY age_mid; data c; MERGE b summary; BY age_mid; RUN; * ---------------------------- ; TITLE1 Overlay the observed and fitted prevalences; PROC PLOT DATA = c; PLOT prev_chd * age="O" fitted_p * age = "F" / OVERLAY HPOS=65 VPOS=40 ; WHERE (age = age_mid); RUN; * ---------------------------- ; TITLE1 using high-res.. see onlinedoc for all the options ; PROC GPLOT DATA = c; PLOT prev_chd * age fitted_p * age / OVERLAY ; WHERE (age = age_mid); RUN; * -------------------------------------------------------;