- initialcov<-function(X,meanvec)
- {
- for (i in 1:ncol(X))
- {
- X[is.na(X[,i]),i]=meanvec[i]
- }
- covmat=cov(X)*(nrow(X)-1)/nrow(X)
- return(covmat)
- }
- empredict<-function(X, emmean, emcov)
- {
- completeX=X
- rowmiss=sort(unique(row(X)[is.na(X)]))
- nrowmiss=length(rowmiss) #number of rows with missing
- T2=0
- for (i in 1:nrowmiss)
- {
- wholerow=X[rowmiss[i],]
- x2=wholerow[!is.na(wholerow)]
- colmiss=col(t(as.matrix(wholerow)))[is.na(t(as.matrix(wholerow)))]
- mu1=emmean[colmiss] #missing
- mu2=emmean[-colmiss] #available
- sigma11=emcov[colmiss,colmiss]
- sigma12=emcov[colmiss,-colmiss]
- sigma21=emcov[-colmiss,colmiss]
- sigma22=emcov[-colmiss,-colmiss]
- x1=mu1+sigma12%*%solve(sigma22)%*%(x2-mu2)
- wholerow[is.na(wholerow)]=x1
- completeX[rowmiss[i],]=wholerow
- matprod=completeX[rowmiss[i],]%*%t(completeX[rowmiss[i],])
- x1x1=sigma11-sigma12%*%solve(sigma22)%*%sigma21+x1%*%t(x1)
- matprod[colmiss,colmiss]=x1x1
- T2=T2+matprod
- }
- T1=apply(completeX,2,sum)
- rownomiss=seq(1,nrow(X))[-rowmiss]
- nrownomiss=length(rownomiss)
- for (i in 1:nrownomiss)
- {
- wholerow=X[rownomiss[i],]
- T2=T2+wholerow%*%t(wholerow)
- }
- return(list(T1=T1, T2=T2))
- }
- emestimate<-function(T1, T2, n)
- {
- means=T1/n
- covs=T2/n-means%*%t(means)
- return(list(means=means, covs=covs))
- }
- EM<-function(X, delta=0.0001) #X is a n*p matrix with missing
- {
- x=as.matrix(X)
- #initial estimate
- emmean=apply(X,2,mean,na.rm=TRUE)
- emcov=initialcov(X, emmean)
- if (min(eigen(emcov)$values)<0)
- {
- stop("the initial estimate of covariance matrix is negative definite")
- }
- n=nrow(X) #number of rows
- meandiff=9999999
- covdiff=9999999
- while(meandiff>delta & covdiff>delta)
- {
- #prediction
- T1=empredict(X, emmean, emcov)$T1
- T2=empredict(X, emmean, emcov)$T2
- #estimate
- tempmean=emestimate(T1, T2, n)$means
- tempcov=emestimate(T1, T2, n)$covs
- meandiff=sqrt(sum((tempmean-emmean)^2))
- covdiff=sqrt(sum((tempcov-emcov)^2))
- emmean=tempmean
- emcov=tempcov
- }
- return(list(emmean=emmean, emcov=emcov))
- }
- > X
- [,1] [,2] [,3]
- [1,] NA 0 3
- [2,] 7 2 6
- [3,] 5 1 2
- [4,] NA NA 5
- > EM(X,delta=0.00000001)
- $emmean
- [1] 6.055589 1.115385 4.000000
- $emcov
- [,1] [,2] [,3]
- [1,] 0.5974356 0.3743413 1.2152566
- [2,] 0.3743413 0.6200690 0.8653846
- [3,] 1.2152566 0.8653846 2.5000000
Raw Paste