The codes will also give Root-Mean-Square-Error(RMSE) and Mean for each case of inclusive strategy.
The missing mechanism (missing.function) is customizable and will be shown in a plot. In the scatter plot, only Y is missing while X is complete.
is randomly assigned to be -1 or 1.
, that is,
and
, that is,
The missing rate is just conditional to . Theoretically, it should be MAR including
, or
, or
, or
. This simulation study shown that
, and
can't help EM or MI, although their nonlinear function
or
can.
The following R codes also gave convenient EM and MI function-call interface for SEM user.
rm(list=ls(all=TRUE));seed=2009;
cor3=c(.6,.54,.9);
##cor3=c(.6,.24,.4);
missing.rate=.25;
missing.function=function(AUX) {
logit= -3.6 + 5*AUX;
exp(logit)/(1+exp(logit))}
N=500;N.rep=30;
aux.list<-list(NULL
,c('AUX.sign')
,c('AUX.sign','AUX.1')
,c('AUX')
,c('AUX.sign','AUX.1','AUX')
,c('AUX.2')
,c('AUX.plus')
,c('AUX.plus','AUX.2')
);require(MASS);
require(norm);
em <- function(dt,showits=FALSE){
dt <- as.matrix(dt);
require(norm);
thetahat<-em.norm(s<-prelim.norm(dt),showits=showits);
res<-getparam.norm(s,thetahat,corr=FALSE);
res.m<-cbind( res$mu,res$sigma);
rownames(res.m)<-colnames(dt);
colnames(res.m)<-c('.mean',colnames(dt));
res.m
}p.rep <- function(N.rep=2
,N=500
,cor3=c(.6,.54,.9)
,
missing.rate=.25
,
missing.function=function(AUX) {
logit= -3.6 + 5*AUX;
exp(logit)/(1+exp(logit))}
){
if (exists('seed')) {set.seed(seed);cat('Random Seed:',seed,'\n\n')};
sigma<-diag(3);
sigma[row(sigma) < col(sigma)]<-cor3;
sigma<-pmax(sigma,t(sigma));
colnames(sigma)<-rownames(sigma)<-c('X','Y','AUX');
print(sigma);AUX <- rnorm(1e6,0,sd=sqrt(sigma[3,3]));
r1 = missing.function(AUX);
mean.r=mean(r1);
if (missing.rate < mean.r) {
missing.adjust <- function (AUX) {
missing.function(AUX)*missing.rate/mean.r};
}else{
missing.adjust <- function (AUX) {
1- (1-missing.function(AUX)) * (1-missing.rate)/(1-mean.r)};
}
plot(missing.adjust,-3,3
,xlab='AUX',ylab='missing.rate'
,ylim=c(0,1),main='dark:adjusted\ngrey:original');
plot(missing.function,-3,3,col='grey',add=TRUE);
rm('AUX','r1');prt<-proc.time();
for (i.rep in 1:N.rep){
dt = data.frame(mvrnorm(N,rep(0,3),sigma));
names(dt)<-colnames(sigma);dt$Y.mr = missing.adjust(dt$AUX);
dt$Y.mis = ifelse(runif(N,0,1)<dt$Y.mr,NA,dt$Y);
dt$AUX.sign <- ifelse(sample(N) < N/2,1,-1) ;
dt$AUX.1 <- dt$AUX * dt$AUX.sign;
dt$AUX.2 <- sqrt(dt$AUX.plus <- (dt$AUX - min(dt$AUX) + 1))*dt$AUX.sign;res<-matrix(c(0,.6),nrow=1);
colnames(res)<-c('YfromI','YfromX');
rownames(res)[nrow(res)]<-c('population');
res<-rbind(res,lm(Y~X,dt)$coefficients);
rownames(res)[nrow(res)]<-c('X,Y');for (k in 1:length(aux.list)){
res.em<-em(dt[,c('X','Y.mis',aux.list[[k]])]);
res<-rbind(res, c(
res.em['Y.mis','.mean']-res.em['X','.mean']*res.em['Y.mis','X']/res.em['X','X']
,res.em['Y.mis','X']/res.em['X','X']
));
rownames(res)[nrow(res)]<-paste(c('X','Y.mis',aux.list[[k]]),collapse=',');
}if (!exists('res.YfromI')) {
res.YfromI <- res[,'YfromI'];
res.YfromX <- res[,'YfromX'];
}else{
res.YfromI <- cbind(res.YfromI,res[,'YfromI']);
res.YfromX <- cbind(res.YfromX,res[,'YfromX']);
}}
colnames(res.YfromI)<-colnames(res.YfromX)<-NULL;
print(proc.time()-prt);res.RMSE <- cbind(sqrt(rowMeans(res.YfromI^2)),sqrt(rowMeans((res.YfromX-sigma[2,1])^2)));
res.M <- cbind(rowMeans(res.YfromI),rowMeans(res.YfromX));
colnames(res.RMSE)<-colnames(res.M)<-c('YfromI','YfromX');
list(YfromI=res.YfromI,YfromX=res.YfromX
,RMSE=res.RMSE,M=res.M,sigma=sigma,last.data=dt);
}
##seed=2009;
res <- p.rep(N.rep,cor3=cor3,N=N,missing.rate=missing.rate,missing.function=missing.function);
res$RMSE
res$M
##res <- p.rep(40000,cor3=cor3,N=N,missing.rate=missing.rate,missing.function=missing.function);
##try large repetitions on local R platformrequire(R2HTML);require(Rpad);
.HTML.file = "";
HTMLhr();HTMLh2('Population');HTML(res$sigma);
HTMLhr();HTMLh2('Root Mean Square Error');HTML(res$RMSE);
HTMLhr();HTMLh2('Mean');HTML(res$M);############
## To verify MI, will get simiar results like EM
## Only use data of the last replication
##############layout(cbind(c(1,1,1),2:4));
dt<-res$last.data;
plot(Y~X,data=dt ,main='grey dash & circle:complete\ndark line & dot:incomplete',type='n');
points(Y~X,data=dt[is.na(dt$Y.mis),],col='grey');
points(dt$X,dt$Y.mis,pch=20);
abline(lm(Y~X,data=dt),col='grey',lty='dashed');
abline(lm(Y~X,data=dt[!is.na(dt$Y.mis),]));
abline(v=0,col='grey');
abline(h=mean(dt$Y.mis,na.rm=TRUE));
abline(h=mean(dt$Y),col='grey',lty='dashed');plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='grey:AUX dotted:AUXplus');
points(dt$AUX[order(dt$AUX)],dt$Y.mr[order(dt$AUX)],type='l',col='grey');
points(dt$AUX.plus[order(dt$AUX)],dt$Y.mr[order(dt$AUX)],type='l',lty='dotted');plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='AUX1=AUXsign*AUX');
points(dt$AUX.1[order(dt$AUX.1)],dt$Y.mr[order(dt$AUX.1)],pch='.');plot(c(dt$AUX.1,dt$AUX,dt$AUX.2,dt$AUX.plus),c(dt$Y.mr%x%rep(1,4)),ylim=c(0,1),type='n'
,ylab='missing rate'
,xlab='AUX2=AUXsign*sqrt(AUXplus)');
points(dt$AUX.2[order(dt$AUX.2)],dt$Y.mr[order(dt$AUX.2)],pch='.');dt$AUX.r <- residuals(lm(AUX~X,data=dt));
dt$AUX.sign.r <- residuals(lm(AUX.sign ~ X,data=dt));
##For lisrel
## write.table(dt,choose.files(),na="-99999",row.names=FALSE)
##Or for Mplus
## write.table(dt,choose.files(),na="-99999",row.names=FALSE,col.names=FALSE)
## gsub('[.]','',paste('NAMES ARE',paste(colnames(dt),collapse=' '),';'))############
## Mulitple Imputation
############
sem.mi<-function(ram,data
,
MI=NA
,
raw=FALSE
,
fixed.x=NULL
,
aux.x=NULL
,
confidence=.90
,
N.mi=30
,
N.step = 100
,
showits=TRUE
,...){
require(sem);
dt <- as.matrix(data);
if(!is.na(MI)){
print(sort(table(dt), decreasing = TRUE)[1:5]);
### plot(data,pch='.');
dt[dt==MI] <- NA;
print(sort(table(dt), decreasing = TRUE)[1:5]);
##Check the missing token!
}
if (showits) cat("Verify: Number of iterations should be less than", N.step,'\n');
thetahat<-em.norm(s<-prelim.norm(dt),showits=showits);
rngseed(ifelse(exists('seed'),max(seed,1),2009));
sem.list<-list();
est <- std.err <- vector("list",N.mi);prt=proc.time();
for (i in 1:N.mi) {
### i=1
dt.im<-dt;
dt.im[is.na(dt)]<-da.norm(s,thetahat,steps=N.step,return.ymis=TRUE)$ymis;
dt.im<-dt.im[,setdiff(colnames(dt.im),aux.x)];
if (raw){
sem.list[[i]]<-sem(ram,raw.moments(dt.im),nrow(dt.im),raw=TRUE,fixed.x=fixed.x);
}else{
sem.list[[i]]<-sem(ram,var(dt.im),nrow(dt.im),raw=FALSE,fixed.x=fixed.x);
}
est[[i]]<-matrix(summary(sem.list[[i]])$coeff$Estimate);
std.err[[i]]<-matrix(summary(sem.list[[i]])$coeff$'Std Error');
}
##time/repetition
(proc.time()-prt)/N.micoeff.mi<-mi.inference(est,std.err,confidence=confidence);
##estimated relative efficiency to infinite repetitions
coeff.mi$relative.efficiency<-N.mi/((coeff.mi<-mi.inference(est,std.err))$fminf+N.mi);
res<-list();
res$coef<-matrix(nrow=nrow(est[[i]]),ncol=0);
for (j in 1:length(coeff.mi)) res$coef <- cbind(res$coef,coeff.mi[[j]]);
rownames(res$coef)=names(sem.list[[i]]$coeff);
colnames(res$coef)=names(coeff.mi);
##
res$list<-sem.list;
res
}ram<-matrix(byrow=TRUE,ncol=3,data=c(
'Y.mis <- X','YfromX',NA,
'X <- .Intercept','XfromI',NA,
'Y.mis <- .Intercept','YfromI',NA,
'X <-> X','varX',NA,
'Y.mis <-> Y.mis','varY',NA));
class(ram)<-'mod';##
EMvsMI<-matrix(nrow=length(aux.list),ncol=6);
colnames(EMvsMI)=c('EM.YfromI','MI.YfromI','Efficiency.YfromI'
,'EM.YfromX','MI.YfromX','Efficiency.YfromX');
rownames(EMvsMI)=rownames(res$YfromI)[2+1:length(aux.list)]
EMvsMI[,c(1,4)] <- cbind(res$YfromI[2+1:length(aux.list),ncol(res$YfromI)]
,res$YfromX[2+1:length(aux.list),ncol(res$YfromX)]);
prt=proc.time();
for(k in 1:length(aux.list)){
dt.a<-res$last.data[,c('X','Y.mis',aux.list[[k]])];
.Intercept=dt.a$X*0+1;
data=cbind(dt.a,.Intercept);
res.mi = sem.mi(ram,data
,raw=TRUE,fixed.x='.Intercept',aux.x=aux.list[[k]]
,N.step=100,N.mi=10)
EMvsMI[k,c(2,5)]<-res.mi$coef[c('YfromI','YfromX'),'est']
EMvsMI[k,c(3,6)]<-res.mi$coef[c('YfromI','YfromX'),'relative.efficiency']
}
proc.time()-prt
EMvsMI
HTMLhr();HTMLh2('EM vs MI');HTML(EMvsMI);dt.a<-res$last.data[,c('Y','Y.mis','AUX.2')];
plot(Y~AUX.2,data=dt.a
,col=ifelse(is.na(Y.mis),'grey','black')
,ylim=c(-5,5)
,main='Linear (dot), curvilinear (dash) and LOESS (blue) prediction');
AUX.2<-data.frame(AUX.2=0.01*(-300:300));
points(AUX.2$AUX.2
,predict(lm(Y.mis~AUX.2,na.omit(dt.a)),newdata=AUX.2)
,type='l',lty='dotted');
points(AUX.2$AUX.2
,predict(lm(Y.mis~1+AUX.2+I(AUX.2^2),na.omit(dt.a)),newdata=AUX.2)
,type='l',lty='dashed');
points(AUX.2$AUX.2
,predict(loess(Y.mis~AUX.2,na.omit(dt.a))
,newdata=AUX.2$AUX.2)
,type='l',col='blue');
Root Mean Square Error of 40000 replications
|
Mean of 40000 replications
|