SAS code for bootstrap standard error of estimate of First Principal Component

Appendix to forthcoming article

Creating non-parametric bootstrap samples using Poisson frequencies
in
Computer Methods and Programs in Biomedicine

James A. Hanley[a*], Brenda MacGibbon[b]


[a] Department of Epidemiology, Biostatistics and Occupational Health,
McGill University, Montreal, Quebec, H3A 1A2, Canada

[b] GERAD and Département de mathématiques, Université du Québec à Montréal


data scores;
INFILE 'path:
scores_Efron_Tibshirani.txt' ;
INPUT mec vec alg ana sta; RUN;

OPTIONS NOTES; RUN;

%let verbose = %str( );
%let verbose = %str(*);

/* use * to suppress printing
blank to allow printing */

*****************************
PROPOSED version
*****************************;

* PROPOSED version --
Poisson frequencies ;

* OPTION 1
--------

fast if dataset is small,
no macro required ;

* 200 bootstrap samples of data;

* 200 x 88 = 17600 observations
x
7 variables
[all in 1 file]

( 5 analysis variables
+ sample_identifier
+ n_times variable) ;

title 'PROPOSED -
Poisson frequencies -1 LONG file';

%let samples = 200;
%let data = %str(scores);


* generate samples;

data all;
set &data;
DO B_sample = 1 to &samples;
n_times = RANPOI(5555555,1.0);
if n_times > 0 then OUTPUT;
END;
RUN;

* analyze samples & store output;

PROC SORT DATA =all; by B_sample;
PROC princomp DATA =all NOPRINT
COVARIANCE VARDEF=N OUTSTAT=stats;
by B_sample;
VAR mec vec alg ana sta;
FREQ n_times;
RUN;

* extract relevant statistic
for each dataset;

DATA eigenv ;
SET stats; by B_sample;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta); OUTPUT;
END;
&verbose PROC PRINT DATA=eigenv;
RUN;

* obtain distrn. of
bootstrap estimates;

PROC UNIVARIATE DATA=eigenv;
VAR PCT1 ; RUN;

* ***************************** ;
* Proposed version -
Poisson frequencies ;

* OPTION 2 ..
--------
macro after frequencies added;

* 1 file, with frequencies appended;

* original dataset
+ 200 n_times counts ;

* 200 observations x 205 variables
(5 anal. vars + 200 n_times counts) ;

title 'PROPOSED -
Poisson frequencies - 1 WIDE file';

%let samples = 200;
%let data = %str(scores);

* generate frequencies & add as vars;

DATA all;

DROP B_sample;
ARRAY n_times(&samples)
nt1-nt&samples;
set &data ;
DO B_sample = 1 to &samples;
n_times(B_sample)
= RANPOI(3333333,1.0);
END;
RUN;

* macro to use a different
frequency var. each time;

%MACRO process(samples);

%do s = 1 %to &samples;

%put &s;

* process;

PROC princomp DATA = all
NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;
FREQ nt&s;
WHERE (nt&s > 0);

* extract;

DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END;
RUN;

* delete file;

PROC DATASETS nolist nodetails;
delete stats ;

* append results;

PROC DATASETS NOLIST NODETAILS;
APPEND BASE = b_values
DATA = eigenval;

%end;

%MEND process;

* clear;

proc datasets nolist nodetails;
delete b_values ; RUN;

OPTIONS NONOTES; RUN;

* invoke macro;

%process(&samples); RUN;

proc univariate data=b_values;
var PCT1; run;
* ******************************* ;
* Proposed version --
Poisson frequencies ;

* OPTION 3 .. ;

* macro to create, process and
delete each sample in turn;

title PROPOSED - Poisson
frequencies - 1 file at a time;

%MACRO b_strap(data,n,samples);

* seeds;

DATA r_nos; KEEP nu_seed;
Do i = 1 to 5000;
* increase for more samples;
nu_seed = int(100000000
*ranuni(5555555));
OUTPUT;
END;
RUN;

%do s = 1 %to &samples;

%put &s;

* create ;

data b_sample;
RETAIN count 0;
which = &s;
If count = 0 THEN DO;
SET r_nos point = which;
count = 1;
END;
seed = nu_seed;
Do i = 1 to &n;
set &data point = i ;
times = ranpoi(seed,1.0);
if times > 0 then OUTPUT;
IF _error_ = 1 then abort;
END;
STOP;
RUN;
* process ;

PROC princomp DATA = b_sample
NOPRINT COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;
FREQ times;

* extract/save statistic ;

DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END; RUN;

* delete sample;

PROC DATASETS nolist nodetails;
delete stats ;

* append to datafile of statistics;

PROC DATASETS NOLIST NODETAILS;
APPEND BASE = b_values
DATA = eigenval;

%end;

%MEND b_strap;

* clear the statistics datafile;

proc datasets nolist nodetails;
delete b_values; RUN;

OPTIONS NONOTES; RUN;
/* turns off notes */

* invoke macro;

%b_strap(scores,88,200); RUN;

proc univariate data=b_values;
var PCT1; RUN;
**********************************
STANDARD version
********************************** ;
* using pointer (random access) to
* sample with replacement from orig.;

* OPTION IF DATASET SMALL
(no macro);

title 'STANDARD - 1 LONG file';

%let n = 88; %let samples = 200;
%let seed = 5555555;
%let data = %str(scores);

* create samples;

DATA all;
DO B_sample = 1 to &samples;
DO i = 1 to &n;
which = int(
&n*ranuni(&seed)+1);
set &data point = which;
OUTPUT;
IF _error_ = 1 then abort;
END;
END;
STOP; RUN;

* process ;

PROC SORT DATA =all; by B_sample;
PROC princomp DATA =all NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
by B_sample;
VAR mec vec alg ana sta; RUN;

* extract ;

DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL" THEN DO;
PCT1 = mec/
( mec+vec+alg+ana+sta);
OUTPUT;
END; RUN;
proc univariate data = eigenval;
var PCT1; RUN;
* *********************************;
* STANDARD version using pointer
(random access) to sample with
replacement from original ;

* OPTION IF DATASET LARGE

so have macro create/process each
bootstrap sample;

title 'STANDARD - 1 file at time ';

%let samples = 200;

* seeds;

DATA r_nos; KEEP nu_seed;
Do i = 1 to &samples;
nu_seed = int(
100000000*ranuni(5555555));
OUTPUT;
END;
RUN;

%MACRO b_strap(data,n,samples);

%do s = 1 %to &samples;

%put &s;

* create ;

DATA b_sample;
RETAIN count 0;
which = &s;
IF count = 0 THEN DO;
SET r_nos point = which;
count = 1;
END;
seed = nu_seed;
Do i = 1 to &n;
which = int(
&n*ranuni(seed)+1);
set &data point = which;
OUTPUT;
IF _error_ = 1 then abort;
END;
STOP; RUN;

* process ;

PROC princomp DATA = b_sample
NOPRINT
COVARIANCE VARDEF=N
OUTSTAT=stats;
VAR mec vec alg ana sta;

* extract ;

DATA eigenval ; keep PCT1;
SET stats;
IF _TYPE_ = "EIGENVAL"
THEN DO;
PCT1 = mec/
(mec+vec+alg+ana+sta);
OUTPUT;
END;
RUN;

* delete ;

PROC DATASETS nolist nodetails;
delete out ;

* append results ;

PROC DATASETS NOLIST
NODETAILS;
APPEND BASE = b_values
DATA = eigenval;

%end;

%MEND b_strap;

* invoke;

proc datasets nolist nodetails;
delete b_values ; RUN;

OPTIONS NONOTES; RUN;

%b_strap(scores,88,&samples); RUN;

proc univariate data=b_values;
var PCT1; RUN;


created: April. 11, 2006