Skip to content

Study 1

The codes will also give Root-Mean-Square-Error(RMSE) and Mean for each case of inclusive strategy.

The missing mechanism (missing.function) is customizable and will be shown in a plot. In the scatter plot, only Y is missing while X is complete.
AUX_{sign} is randomly assigned to be -1 or 1.
AUX_1 = AUX_{sign} \times AUX, that is, AUX_1 \times AUX_{sign} = AUX
AUX_{plus} = AUX + constant and AUX_{plus} \ge 1
AUX_2=AUX_{sign} \times AUX_{plus}^\frac{1}{2}, that is, AUX_2^2=AUX_{plus}

The missing rate is just conditional to AUX. Theoretically, it should be MAR including AUX, or AUX_{plus}, or AUX_2, or (AUX_{sign},AUX_1). This simulation study shown that AUX_2, and AUX_{sign},AUX_1 can't help EM or MI, although their nonlinear function AUX or AUX_{plus} can.

The following R codes also gave convenient EM and MI function-call interface for SEM user.

rm(list=ls(all=TRUE));seed=2009;
cor3=c(.6,.54,.9);
##cor3=c(.6,.24,.4);
missing.rate=.25;
missing.function=function(AUX) {
logit= -3.6 + 5*AUX;
exp(logit)/(1+exp(logit))}
N=500;N.rep=30;
aux.list<-list(NULL
,c('AUX.sign')
,c('AUX.sign','AUX.1')
,c('AUX')
,c('AUX.sign','AUX.1','AUX')
,c('AUX.2')
,c('AUX.plus')
,c('AUX.plus','AUX.2')
);

require(MASS);
require(norm);
em <- function(dt,showits=FALSE){
dt <- as.matrix(dt);
require(norm);
thetahat<-em.norm(s<-prelim.norm(dt),showits=showits);
res<-getparam.norm(s,thetahat,corr=FALSE);
res.m<-cbind( res$mu,res$sigma);
rownames(res.m)<-colnames(dt);
colnames(res.m)<-c('.mean',colnames(dt));
res.m
}

p.rep <- function(N.rep=2
,N=500
,cor3=c(.6,.54,.9)
,
missing.rate=.25
,
missing.function=function(AUX) {
logit= -3.6 + 5*AUX;
exp(logit)/(1+exp(logit))}
){
if (exists('seed')) {set.seed(seed);cat('Random Seed:',seed,'\n\n')};
sigma<-diag(3);
sigma[row(sigma) < col(sigma)]<-cor3;
sigma<-pmax(sigma,t(sigma));
colnames(sigma)<-rownames(sigma)<-c('X','Y','AUX');
print(sigma);

AUX <- rnorm(1e6,0,sd=sqrt(sigma[3,3]));
r1 = missing.function(AUX);
mean.r=mean(r1);
if (missing.rate < mean.r) {
missing.adjust <- function (AUX) {
missing.function(AUX)*missing.rate/mean.r};
}else{
missing.adjust <- function (AUX) {
1- (1-missing.function(AUX)) * (1-missing.rate)/(1-mean.r)};
}
plot(missing.adjust,-3,3
,xlab='AUX',ylab='missing.rate'
,ylim=c(0,1),main='dark:adjusted\ngrey:original');
plot(missing.function,-3,3,col='grey',add=TRUE);
rm('AUX','r1');

prt<-proc.time();
for (i.rep in 1:N.rep){
dt = data.frame(mvrnorm(N,rep(0,3),sigma));
names(dt)<-colnames(sigma);

dt$Y.mr = missing.adjust(dt$AUX);
dt$Y.mis = ifelse(runif(N,0,1)<dt$Y.mr,NA,dt$Y);
dt$AUX.sign <- ifelse(sample(N) < N/2,1,-1) ;
dt$AUX.1 <- dt$AUX * dt$AUX.sign;
dt$AUX.2 <- sqrt(dt$AUX.plus <- (dt$AUX - min(dt$AUX) + 1))*dt$AUX.sign;

res<-matrix(c(0,.6),nrow=1);
colnames(res)<-c('YfromI','YfromX');
rownames(res)[nrow(res)]<-c('population');
res<-rbind(res,lm(Y~X,dt)$coefficients);
rownames(res)[nrow(res)]<-c('X,Y');

for (k in 1:length(aux.list)){
res.em<-em(dt[,c('X','Y.mis',aux.list[[k]])]);
res<-rbind(res, c(
res.em['Y.mis','.mean']-res.em['X','.mean']*res.em['Y.mis','X']/res.em['X','X']
,res.em['Y.mis','X']/res.em['X','X']
));
rownames(res)[nrow(res)]<-paste(c('X','Y.mis',aux.list[[k]]),collapse=',');
}

if (!exists('res.YfromI')) {
res.YfromI <- res[,'YfromI'];
res.YfromX <- res[,'YfromX'];
}else{
res.YfromI <- cbind(res.YfromI,res[,'YfromI']);
res.YfromX <- cbind(res.YfromX,res[,'YfromX']);
}

}
colnames(res.YfromI)<-colnames(res.YfromX)<-NULL;
print(proc.time()-prt);

res.RMSE <- cbind(sqrt(rowMeans(res.YfromI^2)),sqrt(rowMeans((res.YfromX-sigma[2,1])^2)));
res.M <- cbind(rowMeans(res.YfromI),rowMeans(res.YfromX));
colnames(res.RMSE)<-colnames(res.M)<-c('YfromI','YfromX');
list(YfromI=res.YfromI,YfromX=res.YfromX
,RMSE=res.RMSE,M=res.M,sigma=sigma,last.data=dt);
}
##seed=2009;
res <- p.rep(N.rep,cor3=cor3,N=N,missing.rate=missing.rate,missing.function=missing.function);
res$RMSE
res$M
##res <- p.rep(40000,cor3=cor3,N=N,missing.rate=missing.rate,missing.function=missing.function);
##try large repetitions on local R platform

require(R2HTML);require(Rpad);
.HTML.file = "";
HTMLhr();HTMLh2('Population');HTML(res$sigma);
HTMLhr();HTMLh2('Root Mean Square Error');HTML(res$RMSE);
HTMLhr();HTMLh2('Mean');HTML(res$M);

############
## To verify MI, will get simiar results like EM
## Only use data of the last replication
############

##layout(cbind(c(1,1,1),2:4));

dt<-res$last.data;
plot(Y~X,data=dt ,main='grey dash & circle:complete\ndark line & dot:incomplete',type='n');
points(Y~X,data=dt[is.na(dt$Y.mis),],col='grey');
points(dt$X,dt$Y.mis,pch=20);
abline(lm(Y~X,data=dt),col='grey',lty='dashed');
abline(lm(Y~X,data=dt[!is.na(dt$Y.mis),]));
abline(v=0,col='grey');
abline(h=mean(dt$Y.mis,na.rm=TRUE));
abline(h=mean(dt$Y),col='grey',lty='dashed');

plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='grey:AUX dotted:AUXplus');
points(dt$AUX[order(dt$AUX)],dt$Y.mr[order(dt$AUX)],type='l',col='grey');
points(dt$AUX.plus[order(dt$AUX)],dt$Y.mr[order(dt$AUX)],type='l',lty='dotted');

plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='AUX1=AUXsign*AUX');
points(dt$AUX.1[order(dt$AUX.1)],dt$Y.mr[order(dt$AUX.1)],pch='.');

plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='AUX2=AUXsign*sqrt(AUXplus)');
points(dt$AUX.2[order(dt$AUX.2)],dt$Y.mr[order(dt$AUX.2)],pch='.');

dt$AUX.r <- residuals(lm(AUX~X,data=dt));
dt$AUX.sign.r <- residuals(lm(AUX.sign ~ X,data=dt));
##For lisrel
## write.table(dt,choose.files(),na="-99999",row.names=FALSE)
##Or for Mplus
## write.table(dt,choose.files(),na="-99999",row.names=FALSE,col.names=FALSE)
## gsub('[.]','',paste('NAMES ARE',paste(colnames(dt),collapse=' '),';'))

############
## Mulitple Imputation
############
sem.mi<-function(ram,data
,
MI=NA
,
raw=FALSE
,
fixed.x=NULL
,
aux.x=NULL
,
confidence=.90
,
N.mi=30
,
N.step = 100
,
showits=TRUE
,...){
require(sem);
dt <- as.matrix(data);
if(!is.na(MI)){
print(sort(table(dt), decreasing = TRUE)[1:5]);
### plot(data,pch='.');
dt[dt==MI] <- NA;
print(sort(table(dt), decreasing = TRUE)[1:5]);
##Check the missing token!
}
if (showits) cat("Verify: Number of iterations should be less than", N.step,'\n');
thetahat<-em.norm(s<-prelim.norm(dt),showits=showits);
rngseed(ifelse(exists('seed'),max(seed,1),2009));
sem.list<-list();
est <- std.err <- vector("list",N.mi);

prt=proc.time();
for (i in 1:N.mi) {
### i=1
dt.im<-dt;
dt.im[is.na(dt)]<-da.norm(s,thetahat,steps=N.step,return.ymis=TRUE)$ymis;
dt.im<-dt.im[,setdiff(colnames(dt.im),aux.x)];
if (raw){
sem.list[[i]]<-sem(ram,raw.moments(dt.im),nrow(dt.im),raw=TRUE,fixed.x=fixed.x);
}else{
sem.list[[i]]<-sem(ram,var(dt.im),nrow(dt.im),raw=FALSE,fixed.x=fixed.x);
}
est[[i]]<-matrix(summary(sem.list[[i]])$coeff$Estimate);
std.err[[i]]<-matrix(summary(sem.list[[i]])$coeff$'Std Error');
}
##time/repetition
(proc.time()-prt)/N.mi

coeff.mi<-mi.inference(est,std.err,confidence=confidence);
##estimated relative efficiency to infinite repetitions
coeff.mi$relative.efficiency<-N.mi/((coeff.mi<-mi.inference(est,std.err))$fminf+N.mi);
res<-list();
res$coef<-matrix(nrow=nrow(est[[i]]),ncol=0);
for (j in 1:length(coeff.mi)) res$coef <- cbind(res$coef,coeff.mi[[j]]);
rownames(res$coef)=names(sem.list[[i]]$coeff);
colnames(res$coef)=names(coeff.mi);
##
res$list<-sem.list;
res
}

ram<-matrix(byrow=TRUE,ncol=3,data=c(
'Y.mis <- X','YfromX',NA,
'X <- .Intercept','XfromI',NA,
'Y.mis <- .Intercept','YfromI',NA,
'X <-> X','varX',NA,
'Y.mis <-> Y.mis','varY',NA));
class(ram)<-'mod';

##
EMvsMI<-matrix(nrow=length(aux.list),ncol=6);
colnames(EMvsMI)=c('EM.YfromI','MI.YfromI','Efficiency.YfromI'
,'EM.YfromX','MI.YfromX','Efficiency.YfromX');
rownames(EMvsMI)=rownames(res$YfromI)[2+1:length(aux.list)]
EMvsMI[,c(1,4)] <- cbind(res$YfromI[2+1:length(aux.list),ncol(res$YfromI)]
,res$YfromX[2+1:length(aux.list),ncol(res$YfromX)]);
prt=proc.time();
for(k in 1:length(aux.list)){
dt.a<-res$last.data[,c('X','Y.mis',aux.list[[k]])];
.Intercept=dt.a$X*0+1;
data=cbind(dt.a,.Intercept);
res.mi = sem.mi(ram,data
,raw=TRUE,fixed.x='.Intercept',aux.x=aux.list[[k]]
,N.step=100,N.mi=10)
EMvsMI[k,c(2,5)]<-res.mi$coef[c('YfromI','YfromX'),'est']
EMvsMI[k,c(3,6)]<-res.mi$coef[c('YfromI','YfromX'),'relative.efficiency']
}
proc.time()-prt
EMvsMI
HTMLhr();HTMLh2('EM vs MI');HTML(EMvsMI);

dt.a<-res$last.data[,c('Y','Y.mis','AUX.2')];
plot(Y~AUX.2,data=dt.a
,col=ifelse(is.na(Y.mis),'grey','black')
,ylim=c(-5,5)
,main='Linear (dot), curvilinear (dash) and LOESS (blue) prediction');
AUX.2<-data.frame(AUX.2=0.01*(-300:300));
points(AUX.2$AUX.2
,predict(lm(Y.mis~AUX.2,na.omit(dt.a)),newdata=AUX.2)
,type='l',lty='dotted');
points(AUX.2$AUX.2
,predict(lm(Y.mis~1+AUX.2+I(AUX.2^2),na.omit(dt.a)),newdata=AUX.2)
,type='l',lty='dashed');
points(AUX.2$AUX.2
,predict(loess(Y.mis~AUX.2,na.omit(dt.a))
,newdata=AUX.2$AUX.2)
,type='l',col='blue');


Root Mean Square Error of 40000 replications

YfromI YfromX
population 0.0000 0.0000
X,Y 0.0358 0.0360
X,Y.mis 0.2625 0.1511
X,Y.mis,AUX.sign 0.2624 0.1511
X,Y.mis,AUX.sign,AUX.1 0.2630 0.1517
X,Y.mis,AUX 0.0390 0.0389
X,Y.mis,AUX.sign,AUX.1,AUX 0.0390 0.0389
X,Y.mis,AUX.2 0.2625 0.1512
X,Y.mis,AUX.plus 0.0390 0.0389
X,Y.mis,AUX.plus,AUX.2 0.0390 0.0389


Mean of 40000 replications

YfromI YfromX
population 0.00000 0.60000
X,Y 0.00004 0.59977
X,Y.mis -0.25998 0.45392
X,Y.mis,AUX.sign -0.25998 0.45394
X,Y.mis,AUX.sign,AUX.1 -0.26049 0.45337
X,Y.mis,AUX -0.00002 0.59966
X,Y.mis,AUX.sign,AUX.1,AUX -0.00002 0.59967
X,Y.mis,AUX.2 -0.26004 0.45392
X,Y.mis,AUX.plus -0.00002 0.59966
X,Y.mis,AUX.plus,AUX.2 -0.00002 0.59967