SAS code for bootstrap standard
error of estimate of First Principal Component
Appendix to forthcoming article
Creating non-parametric bootstrap
samples using Poisson frequencies
in
Computer Methods and Programs in Biomedicine
James A. Hanley[a*], Brenda MacGibbon[b]
[a] Department of Epidemiology, Biostatistics and Occupational Health,
McGill University, Montreal, Quebec, H3A 1A2, Canada
[b] GERAD and Département de mathématiques, Université du Québec
à Montréal
data scores;
INFILE 'path:scores_Efron_Tibshirani.txt' ;
INPUT mec vec alg ana sta; RUN;
OPTIONS NOTES; RUN;
%let verbose = %str( );
%let verbose = %str(*);
/* use * to suppress printing
blank to allow printing */
*****************************
PROPOSED version
*****************************;
* PROPOSED version --
Poisson frequencies ;
* OPTION 1
--------
fast if dataset is small,
no macro required ;
* 200 bootstrap samples of data;
* 200 x 88 = 17600 observations
x
7 variables
[all in 1 file]
( 5 analysis variables
+ sample_identifier
+ n_times variable) ;
title 'PROPOSED -
Poisson frequencies -1 LONG file';
%let samples = 200;
%let data = %str(scores);
* generate samples;
data all;
set &data;
DO B_sample = 1 to &samples;
n_times = RANPOI(5555555,1.0);
if n_times > 0 then OUTPUT;
END;
RUN;
* analyze samples & store output;
PROC SORT DATA =all; by B_sample;
PROC princomp DATA =all NOPRINT
COVARIANCE VARDEF=N OUTSTAT=stats;
by B_sample;
VAR mec vec alg ana sta;
FREQ n_times;
RUN;
* extract relevant statistic
for each dataset;
DATA eigenv ;
SET stats; by B_sample;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta); OUTPUT;
END;
&verbose PROC PRINT DATA=eigenv;
RUN;
* obtain distrn. of
bootstrap estimates;
PROC UNIVARIATE DATA=eigenv;
VAR PCT1 ; RUN;
* ***************************** ;
* Proposed version -
Poisson frequencies ;
* OPTION 2 ..
--------
macro after frequencies added;
* 1 file, with frequencies appended;
* original dataset
+ 200 n_times counts ;
* 200 observations x 205 variables
(5 anal. vars + 200 n_times counts) ;
title 'PROPOSED -
Poisson frequencies - 1 WIDE file';
%let samples = 200;
%let data = %str(scores);
* generate frequencies & add as vars;
DATA all;
DROP B_sample;
ARRAY n_times(&samples)
nt1-nt&samples;
set &data ;
DO B_sample = 1 to &samples;
n_times(B_sample)
= RANPOI(3333333,1.0);
END;
RUN;
* macro to use a different
frequency var. each time;
%MACRO process(samples);
%do s = 1 %to &samples;
%put &s;
* process;
PROC princomp DATA = all
NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;
FREQ nt&s;
WHERE (nt&s > 0);
* extract;
DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END;
RUN;
* delete file;
PROC DATASETS nolist nodetails;
delete stats ;
* append results;
PROC DATASETS NOLIST NODETAILS;
APPEND BASE = b_values
DATA = eigenval;
%end;
%MEND process;
* clear;
proc datasets nolist nodetails;
delete b_values ; RUN;
OPTIONS NONOTES; RUN;
* invoke macro;
%process(&samples); RUN;
proc univariate data=b_values;
var PCT1; run;
* ******************************* ;
* Proposed version --
Poisson frequencies ;
* OPTION 3 .. ;
* macro to create, process and
delete each sample in turn;
title PROPOSED - Poisson
frequencies - 1 file at a time;
%MACRO b_strap(data,n,samples);
* seeds;
DATA r_nos; KEEP nu_seed;
Do i = 1 to 5000;
* increase for more samples;
nu_seed = int(100000000
*ranuni(5555555));
OUTPUT;
END;
RUN;
%do s = 1 %to &samples;
%put &s;
* create ;
data b_sample;
RETAIN count 0;
which = &s;
If count = 0 THEN DO;
SET r_nos point = which;
count = 1;
END;
seed = nu_seed;
Do i = 1 to &n;
set &data point = i ;
times = ranpoi(seed,1.0);
if times > 0 then OUTPUT;
IF _error_ = 1 then abort;
END;
STOP;
RUN;
* process ;
PROC princomp DATA = b_sample
NOPRINT COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;
FREQ times;
* extract/save statistic ;
DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END; RUN;
* delete sample;
PROC DATASETS nolist nodetails;
delete stats ;
* append to datafile of statistics;
PROC DATASETS NOLIST NODETAILS;
APPEND BASE = b_values
DATA = eigenval;
%end;
%MEND b_strap;
* clear the statistics datafile;
proc datasets nolist nodetails;
delete b_values; RUN;
OPTIONS NONOTES; RUN;
/* turns off notes */
* invoke macro;
%b_strap(scores,88,200); RUN;
proc univariate data=b_values;
var PCT1; RUN;
**********************************
STANDARD version
********************************** ;
* using pointer (random access) to
* sample with replacement from orig.;
* OPTION IF DATASET SMALL
(no macro);
title 'STANDARD - 1 LONG file';
%let n = 88; %let samples = 200;
%let seed = 5555555;
%let data = %str(scores);
* create samples;
DATA all;
DO B_sample = 1 to &samples;
DO i = 1 to &n;
which = int(
&n*ranuni(&seed)+1);
set &data point = which;
OUTPUT;
IF _error_ = 1 then abort;
END;
END;
STOP; RUN;
* process ;
PROC SORT DATA =all; by B_sample;
PROC princomp DATA =all NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
by B_sample;
VAR mec vec alg ana sta; RUN;
* extract ;
DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END; RUN;
proc univariate data = eigenval;
var PCT1; RUN;
* *********************************;
* STANDARD version using pointer
(random access) to sample with
replacement from original ;
* OPTION IF DATASET LARGE
so have macro create/process each
bootstrap sample;
title 'STANDARD - 1 file at time ';
%let samples = 200;
* seeds;
DATA r_nos; KEEP nu_seed;
Do i = 1 to &samples;
nu_seed = int(
100000000*ranuni(5555555));
OUTPUT;
END;
RUN;
%MACRO b_strap(data,n,samples);
%do s = 1 %to &samples;
%put &s;
* create ;
DATA b_sample;
RETAIN count 0;
which = &s;
IF count = 0 THEN DO;
SET r_nos point = which;
count = 1;
END;
seed = nu_seed;
Do i = 1 to &n;
which = int(
&n*ranuni(seed)+1);
set &data point = which;
OUTPUT;
IF _error_ = 1 then abort;
END;
STOP; RUN;
* process ;
PROC princomp DATA = b_sample
NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;
* extract ;
DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL"
THEN DO;
PCT1 = mec/
(mec+vec+alg+ana+sta);
OUTPUT;
END;
RUN;
* delete ;
PROC DATASETS nolist nodetails;
delete out ;
* append results ;
PROC DATASETS NOLIST
NODETAILS;
APPEND BASE = b_values
DATA = eigenval;
%end;
%MEND b_strap;
* invoke;
proc datasets nolist nodetails;
delete b_values ; RUN;
OPTIONS NONOTES; RUN;
%b_strap(scores,88,&samples); RUN;
proc univariate data=b_values;
var PCT1; RUN;
created: April. 11, 2006
|