SAS source codes for performing EM estimation
of CACE and computing bootstrap standard errors
options ls=80 ps=54;
libname ss '~/jobs';
**************************;
* MACROS ;
**************************;
*==========================================================================;
%macro numobs(dsn);
*==========================================================================;
* Macro numobs obtains the number of observations in the data set dsn and
;
* puts this number in macro variable: no ;
* -----------------------------------------------------------------------
;
* Input parameter: dsn=input dataset ;
* Output parameter: no=number of observations in dsn ;
*--------------------------------------------------------------------------;
%global no;
data _null_;
if 0 then set &dsn nobs=count;
call symput('no',left(put(count,8.)));
stop;
run;
%mend numobs;
*==========================================================================;
%macro bootstrp(dsn,out);
*==========================================================================;
* Macro bootstrap draws a bootstrap sample for experimental and control
;
* groups from input dataset dsn and produces drawing counts in ;
* output dataset out ;
* -------------------------------------------------------------------------
;
* Input parameter:
* dsn=input dataset to draw bootstrap sample ;
* Output parameter:
* out=output dataset containing drawn count of each observation, case ID;
* and show/no-show status ;
*--------------------------------------------------------------------------;
* draw bootstrap sample from experimental group;
data exp; set &dsn;
if show=. then delete;
data exp; set exp;
draw=_n_;
run;
* obtain number of observations in experimental group;
%numobs(exp);
%let n=&no;
data temp(keep=draw) sd(keep=seed); set sd;
do i=1 to &n;
call ranuni(seed,u);
draw=round(u*(&n-1)+1);
output temp;
if i=&n then output sd;
end;
proc freq noprint data=temp;
tables draw / out=b;
* merge drawn counts to dataset containing experimental group;
data exp; merge exp b;
by draw;
run;
* draw bootstrap sample from controls;
data &out; set &dsn;
if show ne . then delete;
data &out; set &out;
draw=_n_;
*obtain number of observations in control group;
%numobs(&out);
%let n=&no;
data temp(keep=draw) sd(keep=seed); set sd;
do i=1 to &n;
call ranuni(seed,u);
draw=round(u*(&n-1)+1);
output temp;
if i=&n then output sd;
end;
proc freq noprint data=temp;
tables draw / out=b;
* merge drawn counts to dataset containing controls;
data &out; merge &out b; by draw; run;
proc append base=&out data=exp;
data &out; set &out;
if count=. then count=0;
keep &id count show;
run;
%mend bootstrp;
*==========================================================================;
%macro bootloop(seed,b,d,dd,id,mun,muco,sgsq,pn);
*==========================================================================;
* Macro bootloop iterates the macros bootstrp and iterem ;
* to obtain parameter estimates via EM for b bootstrap samples ;
* -----------------------------------------------------------------------
;
* Input parameters: ;
* seed = seed for random number drawing ;
* b = number of bootstrap samples ;
* d = input dataset ;
* dd = working dataset ;
* id = identifying variable for cases in input dataset ;
* mun = starting value for mean of never-takers ;
* muco = starting value for mean of control compliers ;
* sgsq = starting value for common variance ;
* pn = starting value for probability of noncompliance ;
* ;
* Output parameters: ;
* lgest = SAS dataset containing logistic regression ;
* parameter estimates from the b bootstrap samples ;
* btest = SAS dataset containing regression parameter ;
* estimates from the b bootstrap samples ;
*--------------------------------------------------------------------------;
data sd;
seed=&seed;
run;
%do bb=1 %to &b;
* remove previous bootstrap count for cases;
data ⅆ set ⅆ drop count; run;
* create temporary dataset (one) for storing bootstrap counts;
data one; set &d; keep &id show; run;
* obtain a bootstrap sample for dataset d and stores the count in dataset
h;
%bootstrp(one,h);
* merge new bootstrap counts to working dataset;
data h; set h;
if show in (0 1) then do; ci=show; wt=count; output; end;
if show=. then do; ci=0; wt=0.5*count; output; ci=1; wt=0.5*count; output;
end;
run;
proc sort data=h; by &id show ci;
proc sort data=ⅆ by &id show ci;
data ⅆ merge &dd h; by &id show ci;
run;
title "Bootstrap iteration &bb";
* call macro iterem to perform EM iterations for bootstrap drawn;
%iterem(&mun,&muco,&sgsq,&pn,&d,&dd);
data logest; set logest;
drop _link_ _type_ _name_ _lnlike_;
run;
%if &bb>1 %then %do;
proc append base=lgest data=logest;
proc append base=btest data=outlk;
run;
%end;
%else %do;
data lgest; set logest; data btest; set outlk; run;
%end;
%end; * Bootstrap loop;
%mend bootloop;
*==========================================================================;
%macro em;
*==========================================================================;
* Macro em performs the E and M steps in the EM algorithm and ;
* computes the loglikelihood value for the step ;
* -----------------------------------------------------------------------
;
* Input parameters: ;
* (same as macro iterem) ;
* Output parameters: ;
* outlg = SAS dataset containing logistic regression parameter estimates;
* of each EM iteration ;
* outf = SAS dataset containing regression parameter parameter estimates,;
* log-likelihood values of each EM iteration ;
*--------------------------------------------------------------------------;
* ========================= E-step ============================ ;
* update weights for noncompliance in control group;
data in1; set in1;
if show=0 then do; nevertk=1; comply=0; ci=show; wt=1*count; output; end;
if show=1 then do; nevertk=0; comply=1; ci=show; wt=1*count; output; end;
if show=. then do;
gn=exp(-(y-predn)**2/(2*sig2));
gc0=exp(-(y-predc0)**2/(2*sig2));
probnshw=probn*gn/((1-probn)*gc0+probn*gn);
nevertk=probnshw; comply=1-probnshw;
if ci=0 then do; wt=nevertk*count; output; end;
if ci=1 then do; wt=comply*count; output; end;
end;
run;
proc sort data=in1; by descending ci;
run;
* ========================= M-step ============================ ;
* logistic regression of noncompliance on covariates;
proc logistic data=in1 order=data outest=logest noprint;
model ci=&xcompl;
weight wt;
output out=lout p=prob&k;
data out; set lout;
probn=1-prob&k;
keep probn;
%if k>1 %then %do;
data in1; set in1; drop probn; run;
%end;
* create covariates for use in regression of y;
data in1; merge in1 out;
numnvtk=wt*(1-ci);
pprob=wt*probn;
* compute mean of predicted probabilities - mean_prn and;
* estimated proportion of never-takers - prob_btk;
proc means sum noprint data=in1;
var numnvtk pprob;
output out=pout sum=totnvtk totprobn;
data pout; set pout;
mean_prn=totprobn/&nall;
prob_ntk=totnvtk/&nall;
keep mean_prn prob_ntk;
* linear regression of y (outcome of interest) on covariates;
proc reg data=in1 outest=est noprint;
model y=&xregn / sse;
weight wt;
output out=regout p=predy;
run;
* compute ML estimate of predicted means and common variance;
data est; merge mvars est;
sig2=_sse_/&nall;
mu_n=&mn;
mu_c0=&mcc;
mu_c1=&mec;
CACE=&cace;
keep intercep &xregn sig2 mu_n mu_c0 mu_c1 CACE;
data est; merge pout est;
run;
data estout; set est;
do i=1 to &ndup; output; end;
keep sig2;
run;
* compute likelihood value;
data regout; merge regout estout;
if ci=0 then do;
like=probn/sqrt(2*3.14159265359*sig2)*exp(-(y-predy)**2/(2*sig2));
output;
end;
else do;
like=(1-probn)/sqrt(2*3.14159265359*sig2)*exp(-(y-predy)**2/(2*sig2));
output;
end;
* compute contribution to log-likelihood from controls;
data ctrl0; set regout;
if show ne . or ci=1 then delete;
like0=like;
predn=predy;
keep like0 predn id count;
proc sort data=ctrl0; by id;
data ctrl1; set regout;
if show ne . or ci=0 then delete;
like1=like;
predc0=predy;
keep like1 predc0 id count;
proc sort data=ctrl1; by id;
data ctrl; merge ctrl0 ctrl1; by id;
ci=0;
lnlkc=count*log(like1+like0);
proc means n sum data=ctrl noprint;
var lnlkc;
output out=outctrl sum=llk_c;
* compute contribution to log-likelihood from experimental never-takers;
data in2; set regout;
if show ne 0 then delete;
lnlken=count*log(like);
proc means n sum data=in2 noprint;
var lnlken;
output out=outexpn sum=llk_en;
* compute contribution to log-likelihood from experimental compliers;
data in2; set regout;
if show ne 1 then delete;
lnlkec=count*log(like);
proc means n sum data=in2 noprint;
var lnlkec;
output out=outexpc sum=llk_ec;
* compute log-likelihood at k-th iteration;
data outlk; merge est outctrl outexpn outexpc;
lnlk&k=llk_c+llk_en+llk_ec;
lnlike=lnlk&k;
iter=&k;
drop _type_ _freq_;
run;
data logest; set logest;
drop _link_ _type_ _name_ _lnlike_;
run;
%if &k>1 %then %do;
proc append base=outlg data=logest;
data outlk; set outlk; drop lnlk&k;
proc append base=outf data=outlk;
run;
%end;
%else %do;
data outlg; set logest;
data outf; set outlk; drop lnlk&k;
run;
%end;
* update value of sig2 in working dataset dd;
data in1; merge in1 estout; run;
* update value of predn, predc0 in working dataset dd;
data ctrl; set ctrl;
output;
ci=1; output;
keep id ci predn predc0;
proc sort data=ctrl; by id ci;
proc sort data=in1; by id ci;
data in1; merge in1 ctrl; by id ci;
run;
%mend em;
*==========================================================================;
%macro iterem(mun,muco,sgsq,pn,d,dd);
*==========================================================================;
* Macro iterem calls the macro em iteratively to execute EM iterations
;
* until convergence of the algorithm ;
* -----------------------------------------------------------------------
;
* Input parameters: ;
* mun = starting value for mean of never-takers ;
* muco = starting value for mean of control compliers ;
* sgsq = starting value for common variance ;
* pn = starting value for probability of noncompliance ;
* d = input dataset ;
* dd = working dataset ;
*--------------------------------------------------------------------------;
* Initialization: create working dataset (in1) for use in EM iteration;
data in1; set ⅆ
file print;
* set starting values for EM;
* predn = mean of never-taker distribution;
* predc0 = mean of control complier distribution;
* sig2 = common variance of the distributions;
predn=&mun;
predc0=&muco;
sig2=&sgsq;
probn=&pn;
* print starting values in output file;
if _n_=1 then do;
put " Starting values for EM:";
put " mu_n="predn " mu_c0="predc0 " sigma2="sig2;
put " probability of noncompliance="probn;
end;
run;
%numobs(&d);
%let nall=&no; * nall=number of observations in dataset;
%numobs(in1);
%let ndup=&no; * ndup=number of observations in dataset with duplicated
controls;
data prvest;
prev=1; * prev=log-likelihood value at previous iteration;
run;
* iterates macro em and check for convergence;
* lnlike=current log-likelihood value;
* convergence criteria: |(prev-lnlike)/prev|<0.0000001;
* maximum number of EM iterations allowed: 100;
%let k=0; * k=number of iterations;
%let u=1;
%do %until(&u=0);
%let k=%eval(&k+1);
%em;
data prvest; merge prvest outlk;
v=abs((prev-lnlike)/prev);
if v<0.0000001 then call symput('u','0');
prev=lnlike;
keep prev;
run;
%put "u=&u";
%if &u>0 %then %do;
%if &k>100 %then %do;
%let u=0;
data _null_; file print;
put "************ WARNING: EM Algorithm DID NOT converge in &k
iterations! ***********";
run;
%end;
%end;
%else %do;
data _null_; file print;
put "******************** EM Algorithm converged in &k iterations!
*******************";
run;
%end;
%end; * EM iteration loop;
%mend iterem;
/*--------------------------------------------------------------------------------------*/
/* SAS codes to perform EM esimation of the mean and variance of the distribution
of */
/* the dependent variable (y) for different participant types: */
/* NEVER-TAKERS, CONTROL COMPLIERS, AND EXPERIMENTAL COMPLIERS */
/* with mean of the distributions dependent on covariates (xregn) and with
*/
/* variances of the distributions EQUAL */
/* The probability of compliance is modeled with a logistic regression
dependent */
/* on covariates (xcompl) */
/*--------------------------------------------------------------------------------------*/
**********************************************************************;
* MAIN PROGRAM ;
* (NOTE: all of the following steps are REQUIRED in order to execute the
macros properly);
**********************************************************************;
* input dataset;
* consists of a treatment group and a control group and;
* show/no-show status of treatment group is known;
data in;
infile '~/jobs/in.dat';
** variables REQUIRED in input dataset (use the same variable name as specified);
* y = outcome variable of interest;
* id = identifying variable for each case;
* grp = randomized group indicator;
*(1=experimental group; 0=control group);
* show = compliance status;
* (1=show in experimental group; 0=no-show in experimental group; .=control
group);
** other variables needed (specify your own variable names);
* covariates for use in regression for Y;
* covariates for use in logistic regression for compliance;
input age assertv grp deprsn0 deprsn2 deprsn3 diff_dep econhard h_age h_asrt
h_ecn h_mot h_nmar h_nwht h_sch id income l_inc motv2 nmarr risk01 schgrd
show y;
run;
* create working dataset inwrk from input dataset for use in EM iterations;
data inwrk; set in;
** variables REQUIRED in this dataset (use the same variable name as specified):;
* id = identifying variable for each case;
* y = dependent variable for regression;
* show = observed compliance status;
* = 1 for shows in treatment group/compliers;
* = 0 for no-shows in treatment group/never-takers;
* = . for control group;
* ci = compliance status to be used in model fitting;
* = show if show is not missing;
* = 1 if show is missing (control group);
* = 0 if show is missing (control group);
* wt = weight for compliance status ci and bootstrap count;
* =1 if show is not missing and remains constant in EM;
* =0.5 if show is missing (control group) and will be updated in EM;
* initialize value of ci and wt;
if show in (1 0) then do; ci=show; wt=1*count; output; end;
if show=. then do; ci=1; wt=0.5*count; output; ci=0; wt=0.5*count; output;
end;
* count = contribution of each case to analysis, value will be updated
when computing the bootstrap standard errors;
* = number of times the case was randomly picked by bootstrap;
* = 0 if case was not picked by bootstrap;
* initialize value of count;
count=1;
data inwrk; set inwrk;
* create variable for experimental compliers (use variable name as specified);
* ri = indicator for experimental compliers;
ri=ci*grp;
run;
* specify the covariates used in model for y;
* needed for computation of means;
%let mvar=risk01 deprsn0;
* specify the labels for the mean of the covariates specified;
%let mlabl=mean_rsk mean_dep;
* compute means of covariates used in regression for Y and;
* store the means in SAS dataset mvars;
* these means are used in computing the predicted means;
proc means mean data=in;
title "Means of covariates used in regression for Y";
var &mvar;
output out=mvars mean=&mlabl;
data mvars; set mvars;
keep &mlabl;
run;
**** Fitting Model ****;
** specify the variables needed for model fitting,;
* the computation of predicted means, and the CACE;
* independent variables (xcompl) for logistic regression;
%let xcompl=risk01 age schgrd motv2 h_age h_sch h_mot h_asrt h_nmar h_ecn
h_nwht l_inc;
* independent variables (xregn) for regression: covariates for reduced
model;
* variables REQUIRED in the model: ci ri;
%let xregn=ci ri risk01 deprsn0;
* predicted means for model for Y;
* mn=predicted mean for never-takers;
%let mn=intercep+mean_rsk*risk01+mean_dep*deprsn0;
* mcc=predicted mean for control compliers;
%let mcc=intercep+ci+mean_rsk*risk01+mean_dep*deprsn0;
* mec=predicted mean for experimental compliers;
%let mec=intercep+ci+ri+mean_rsk*risk01+mean_dep*deprsn0;
* CACE for model;
%let cace=ri;
* start EM iterations;
title "Start EM algorithm";
%iterem(-0.087758,-0.067319,0.35331,0.5,in,inwrk);
proc print data=outlg;
title "Summary of EM iterations";
title2 "Parameter estimates from logistic regression for noncompliance";
proc print data=outf;
title2 "Parameter estimates from regression for Y";
run;
* obtain bootstrap estimates for parameter estimates and the CACE; %bootloop(342305947,10,in,inwrk,id,-0.087758,-0.067319,0.35331,0.5);
proc means n mean std var data=lgest;
title "Summary of Bootstrap iterations";
title2 "Logistic Regression estimates from &b bootstrap samples";
title3 "Bootstrap means and standard errors";
proc means n mean std var data=btest;
title2 "Regression coefficient estimates from &b bootstrap samples";
title3 "Bootstrap means and standard errors";
run;