Last active
September 13, 2019 15:52
-
-
Save davidckatz/6fa74187728145741583e75eb8e04e45 to your computer and use it in GitHub Desktop.
Impute missing 3D landmarks using partial least squares regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSTRUCTIONS (see the file below for the R script) | |
This function imputes missing landmark data in one specimen based upon the locations of the other, recorded landmarks. The predicted relationship is estimated using two-block partial least squares analysis (PLS) on a set of complete case observations. The complete case data is divided into two blocks: one includes all landmarks that were recorded in the incomplete specimen, the other all landmarks that were not. PLS is used to identify axes that maximize covariance between the two blocks, and these relationships are used to predict missing landmarks in the incomplete case. | |
INPUTS | |
1. n*(p*k) matrix of observations in which row 1 is the case with missing data and the remaining rows are complete cases | |
2. Optionally, a vector of group identities (group.id) for all specimens, with length = nrow(X). | |
VALUES (returns) | |
1. Vector of landmark coordinates rescaled by estimated centroid size. | |
NOTES | |
1. As written, only accepts 3D landmark data. | |
2. Support functions nested within impute.pls2B include a complete set of functions for Generalized Procrustes Analysis. | |
3. If the group identity vector is supplied, a pooled within-group covariance matrix is computed. If not, the ordinary covariance matrix for the complete dataset is computed. Pooling may produce better estimates because it reduces the influence of among group deviances on the covariance matrix. The group.id option should only be used where at least one complete case is from the same group as the missing case. | |
STEP-THROUGH | |
1. Generalized Procrustes Analysis on complete cases. | |
2. If group.id is included, pooling. | |
3. Compute initial estimate of missing coordinate data based on fitting the incomplete case to the complete case consensus configuration. | |
4. Iterate PLS estimation procedure until the difference between the current and previous estimated shapes differs by less than an a priori defined threshold. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See the file above for instructions. | |
impute.pls2B <- function(X, group.id=NA) | |
{ | |
################################################ | |
# FUNCTIONS UTLIZED DURING IMPUTATION | |
# Generalized Procrustes Analysis | |
GPA <- function(d){ | |
# REMOVE LOCATION | |
# array of centroid data | |
cent.array <- array.of.centroids(d) | |
# translate each specimen by the centroid of included landmarks | |
trans.array <- d-cent.array | |
# REMOVE SCLAE | |
# vector of centroid sizes | |
cent.size <- rep(NA, dim(trans.array)[1]) | |
for(i in 1:dim(trans.array)[1]) | |
{cent.size[i]<-c.size(trans.array[i,,], cent=centroid(trans.array[i,,]))} | |
# scale to unit size | |
s.array <- trans.array/cent.size | |
# REMOVE ROTATION | |
# initial fitting of all specimens to the first specimen in the array | |
gpa.fitting <- gpa.iteration(scaled=s.array) | |
r.array <- gpa.fitting$x | |
r.mean <- gpa.fitting$mean.x | |
scaled.mean <- gpa.fitting$scaled.mean | |
list(r.array = r.array, r.mean=r.mean, | |
cent.size = cent.size, scaled.mean=scaled.mean) | |
} | |
# SUPPORT FUNCTIONS | |
# array of centroids (used to translate specimen array to origin) | |
array.of.centroids <- function(d) | |
{ | |
# matrix of centroids | |
cent.matrix <- apply(d, 1, centroid) | |
# make conformable arrays of centroids for each specimen | |
cent.array <- array(NA, dim=c(dim(d))) | |
# fill centroid array | |
for(j in 1:dim(cent.array)[1]){ | |
cent.array[j,,] <- matrix(rep(cent.matrix[,j], dim(cent.array)[2]), | |
dim(cent.array)[2], dim(cent.array)[3], byrow=TRUE) | |
} | |
return(cent.array) | |
} | |
# centroid size of landmark configuration | |
c.size <- function(d, cent=centroid(d)) | |
{ | |
centroid.mat <- matrix(cent, dim(d)[1], dim(d)[2], byrow=TRUE) | |
summed.sq.dists <- sum(apply((d-centroid.mat)^2, 1, sum)) | |
cs <- sqrt(summed.sq.dists) | |
return(cs) | |
} | |
# centroid of landmark configuation | |
centroid <- function(centroid.of) | |
{ | |
centroid.unity.vector <- matrix(1, nrow(centroid.of), 1) | |
C <- t(1/nrow(centroid.of) * t(centroid.of) %*% centroid.unity.vector) | |
return (C) | |
} | |
# iterative rotations for Procrustes fitting | |
gpa.iteration <- function(scaled){ # requires centered, scaled n*p*k array of landmark data. k=3 | |
# FIRST ROTATION | |
x <- array(NA, dim=c(dim(scaled))) | |
# rotation of each specimen by its own rotation matrix | |
for (n in 1:dim(scaled)[1]) { | |
x.mean <- scaled[1,,] | |
rm <- rotation.matrix(x.mean, scaled[n,,])$RM #rotation matrix for each specimen | |
x[n,,] <- scaled[n,,] %*% rm # rotate all specimens to first | |
} | |
# ITERATIVE ROTATIONS | |
delta.sos <- array.sos(x, x.mean) # provides criteria for when to stop iterating | |
sos.old <- delta.sos | |
while (delta.sos > .001) | |
{ | |
# shrinkage prevention! makes the mean specimen of unit centroid size | |
unscaled.mean <- apply(x, c(2, 3), mean) | |
scaled.mean <- unscaled.mean/c.size(unscaled.mean) | |
# rotations | |
for(m in 1:dim(x)[1]) | |
{ | |
rm <- rotation.matrix(scaled.mean, x[m,,])$RM | |
x[m,,] <- x[m,,] %*% rm | |
} | |
# data to evaluate the while loop criterion | |
sos.new <- array.sos(x, scaled.mean) | |
delta.sos <- sos.old-sos.new | |
sos.old <- sos.new | |
} | |
# recompute mean a final time | |
mean.x <- apply(x, c(2, 3), mean) | |
list(x = x, mean.x = mean.x, scaled.mean=scaled.mean) | |
} | |
# sum of squares (proc dist) for an array of landmark configurations | |
array.sos <- function(x, C) | |
{ | |
imputed.lms <- imputed.lms | |
#find the distance of each coordinate from the mean | |
diff <- sweep(x, c(2,3), C, "-") | |
#sum across the squares of each coordinate distance distance | |
sos <- sum(diff^2, na.rm=TRUE) | |
return(sos) | |
} | |
# translate a matrix (geometry rather than matrix multiplication | |
# it's easier with geometry to handle rows with missing data) | |
translate <- function(translate.what, translate.by=translate.what, imputed.lms=NA) | |
{ | |
unity.vector <- | |
matrix(1, nrow(translate.what), 1) | |
#translation matrix | |
trans.matrix <- | |
unity.vector %*% centroid(translate.by) | |
#tranlation by translation matrix | |
translate <- | |
translate.what - trans.matrix | |
return(translate) | |
} | |
# rotation matrix to fit one shape to another | |
rotation.matrix <- function(hold.this, rotate.this) | |
{ | |
#decompose | |
decomp <- svd(t(hold.this) %*% rotate.this) | |
U <- decomp$u | |
V <- decomp$v | |
D <- decomp$d | |
#if there is a 0 element to D, change it to a 1 | |
if(any(D==0)) {D[which(D==0)] <- 1} | |
# preliminary rotation matrix | |
pre.S <- matrix(0, length(D), length(D)) | |
diag(pre.S) <- sign(D) | |
pre.RM <- V %*% pre.S %*% t(U) | |
determ <- det(pre.RM) | |
# test for reflection. if yes, adjust rotation matrix. | |
if (determ >= 0) {S <- pre.S} else { | |
S <- matrix(0, length(D), length(D)) | |
for (i in 1:length(D)) | |
{S[i,i] <- ifelse (D[i]==min(D), -(sign(D[i])), sign(D[i]))} | |
} | |
#Produce the appropriate rotation matrix | |
RM <- V%*%S%*%t(U) | |
list(RM=RM, determ=determ) | |
} | |
# ordinary Procrustes analysis (fit one specimen to a reference). | |
# assumes reference (hold.this) is already centered and scaled. | |
opa.one <- function(hold.this, rotate.this) | |
{ | |
# translate and scaledspecimen to be rotated | |
rotate.this <- translate(rotate.this)/c.size(rotate.this) | |
#rotate matrix | |
rm <- rotation.matrix(hold.this, rotate.this)$RM | |
# rotated configuration | |
opa <- rotate.this%*%rm | |
list(opa=opa, cs=cs) | |
} | |
#converts n*p*k array to n*(p*k) matrix of proper setup | |
lm.array.to.matrix <- function(lm.array) | |
{ | |
lm.matrix <- aperm(lm.array, c(1,3,2)) | |
dim(lm.matrix) <- | |
c(dim(lm.array)[1], dim(lm.array)[2] * dim(lm.array)[3]) | |
return(lm.matrix) | |
} | |
# centering data on group (pop, species, etc) means | |
group.centering <- function(m, grp.vec) | |
{ | |
groups <- unique(grp.vec) | |
#empty matrix for group means | |
mean.mat <- matrix(NA, nrow=length(groups), ncol=ncol(m)) | |
#loop to fill matrix of group means | |
for(i in 1:length(groups)) | |
{ | |
grp.dat <- subset(m, grp.vec == groups[i]) | |
mean.mat[i,] <- apply(grp.dat, 2, mean) | |
} | |
#expanded matrix of mean with nrow = nrow(m) | |
matches <- match(grp.vec, groups) | |
matched.means <- mean.mat[matches,] | |
#within group residuals | |
group.centered <- m - matched.means | |
list(group.centered = group.centered, | |
matched.means = matched.means, | |
groups = groups, | |
mean.mat=mean.mat) | |
} | |
######################################################################### | |
# DATA PREP: COMPLETE CASES | |
# rearrange complete case matrix as a n*p*k 3D array | |
d <- aperm(array(X[-1,], dim=c(nrow(X)-1, 3, ncol(complete)/3)), c(1,3,2)) | |
# GPA of complete cases | |
cc.gpa <- GPA(d) | |
cc.array <- cc.gpa$r.array | |
cc.mat <- lm.array.to.matrix(cc.array) | |
cc.mean <- cc.gpa$r.mean | |
cc.cs <- cc.gpa$cent.size | |
cc.scaledmu <- cc.gpa$scaled.mean | |
# setting up residual covariance matrix, depending on whether | |
# there is group id data. | |
# Scenario 1: there is group data | |
if(length(group.id)>1){ | |
cc.group.id <- group.id[-1] | |
ctr <- group.centering(cc.mat,cc.group.id) | |
resid <- ctr$group.centered | |
mus <- ctr$mean.mat | |
grps <- ctr$groups | |
matched.means <- ctr$matched.means | |
missing.mean <- matched.means[which(unique(cc.group.id)==group.id[1]),]} else { | |
# Scenario 2: no group id information | |
matched.means <- matrix(as.vector(t(cc.mean)),nrow(cc.mat),ncol(cc.mat), byrow=T) | |
missing.mean <- matched.means[1,] | |
resid <- cc.mat-matched.means | |
mus <- matched.means[1,] | |
} | |
################################################################# | |
# DATA PREP: INCOMPLETE SPECIMEN | |
# matrix for the incomplete case (and its centroid size) | |
imp.case <- matrix(X[1,],ncol(X)/3,3,byrow=T) | |
# identify missing landmarks | |
imputed.lms <- which(is.na(imp.case[,1])) | |
# estimate centroid size of missing case with all landmarks is centroid size | |
# for recorded landmarks/% total CS attributable to those landmarks on mean shape | |
mean.form <- (cc.mean/c.size(cc.mean))*mean(cc.cs) | |
cs.ratio <- c.size(mean.form[-imputed.lms,])/c.size(mean.form) | |
missing.cs <- c.size(imp.case[-imputed.lms,])/cs.ratio | |
########################################################## | |
# FIRST IMPUTATION OF MISSING LANDMARK (SUB IN THE MEAN COORDINATE) | |
# empty vector for the imputed outcome | |
imputed <- rep(NA, ncol(X)) | |
# translate partial complete case so partial centroid is at origin | |
part.mean <- translate(translate.what=cc.mean,translate.by=cc.mean[-imputed.lms,]) | |
# translate and scale the incomplete specimen so it is same size as cs(part.mean) | |
imp.t <- translate(imp.case, imp.case[-imputed.lms,]) | |
imp.ts <- (imp.t/c.size(imp.case[-imputed.lms,]))*c.size(part.mean[-imputed.lms,]) | |
# useful index information for vector form of imputed specimen | |
missing.coords <- which(is.na(as.vector(t(imp.ts)))) | |
# ordinary procrustes analysis | |
imp.rm <- rotation.matrix(part.mean[-imputed.lms,],imp.ts[-imputed.lms,])$RM | |
missing.fit <- imp.ts%*%imp.rm | |
# swap in mean landmark coordinates for missing coordinates | |
missing.fit[imputed.lms,]<-part.mean[imputed.lms,] | |
# translate and scale to cs=1 to complete original config. for PLS loop | |
one<-translate(missing.fit)/c.size(missing.fit) | |
###################################################### | |
# ITERATIVE PLS PREDICTION | |
# initial values for while loop criterion | |
proc.dist <- 99 | |
while(proc.dist > 0.0001) | |
{ | |
#PLS | |
#define blocks | |
predictor.block <- resid[,-missing.coords] | |
missing.block <- resid[,missing.coords] | |
two.blocks <- cbind(predictor.block, missing.block) | |
#covariance matrix and cross block covariance matrix | |
R <- (t(two.blocks)%*%two.blocks)/(nrow(two.blocks)-1) | |
R1 <- R[1:ncol(predictor.block),1:ncol(predictor.block)] | |
R2 <- R[-(1:ncol(predictor.block)),-(1:ncol(predictor.block))] | |
R12 <- R[1:nrow(R1), -(1:ncol(R1))] | |
#singular value decomposition of cross block covariance matrix | |
pls.svd <- svd(R12) | |
S<-diag(pls.svd$d) | |
U<-pls.svd$u | |
V<-pls.svd$v | |
#scores | |
predictor.scores <- predictor.block%*%U | |
# PLS PREDICTIVE EQUATION | |
#linear model to predict missing scores from block of available scores | |
pls.lm <- lm(missing.block~predictor.scores)$coefficients | |
# IMPUTATION BASED ON pls.lm | |
# score of imputed specimen (predictor block lms) on the singular value axes | |
one <- opa.one(cc.scaledmu,one)$opa | |
one.vec <- as.vector(t(one)) | |
# current imputed shape as matrix | |
#one<-matrix(missing.vec, length(one)/3,3,byrow=T) | |
# residuals and predictor scores for imputed shape | |
imp.resids <- one.vec - missing.mean | |
imp.predictor.scores <- imp.resids[-missing.coords]%*%U | |
#Predict coordinates with regression (ALPHA + XB + ADD.BACK.MEAN) | |
pred.coords <- pls.lm[1,] + imp.predictor.scores%*%pls.lm[-1,] + missing.mean[missing.coords] | |
#fill in the imputation matrix row for the specimen. | |
for(j in 1:length(missing.coords)){one.vec[missing.coords[j]]<-pred.coords[j]} | |
# new imputed shape as matrix | |
redone <- matrix(one.vec,nrow(one),ncol(one), byrow=T) | |
redone <- translate(redone)/c.size(redone) | |
# TEST CRITERION FOR ESTIMATING AGAIN | |
one.redone <- opa.one(one, redone)$opa | |
proc.dist <- sqrt(sum((one.redone-one)^2)) | |
one <- redone | |
print(proc.dist) | |
} | |
imputed<-as.vector(t(one))*missing.cs | |
return(imputed) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment