Full imputation pipeline: run missForest on selected traits, back-transform to original scale, validate via PCA comparison.
imputeTraits<-function(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed,
sdInputed, nm_PCA, percImpute=1, dimensions, nboot=100){
require(missForest)
require(Metrics)
# Data: extract the species with complete 'selectedTraits'. For imputation all available traits are used: 'columsImputation'
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
completeSp <- rownames(na.omit(traitsData[,selectedTraits]))
missingSp <- setdiff(rownames(traitsData), completeSp)
traitWithoutNA<-traitsData[completeSp, columsImputation]
traitWithNA <- traitsData[missingSp, columsImputation]
#transform the traits with NA values into a mask (1 for existing traits, NA for NA traits)
traitNAMask <- replace(traitWithNA, traitWithNA > -Inf, 1)
output <- list()
for (i in dimensions){
output[[i]] <- matrix(NA, nc = nboot, nr = nrow(traitsData), dimnames = list(rownames(traitsData), paste0("Sim. ", 1:nboot)))
names(output)[i] <- paste0("PC", i)
}
RMSE<-matrix(NA,nc=2,nr=nboot,dimnames = list(paste0("Sim.",1:nboot),c("RMSE_PC1","RMSE_PC2")))
for (b in 1:nboot){
cat(paste0("\n..................SIMULATION ",b," out of ",nboot,"................\n"))
# Among the complete data, let's add NA values for 'percImpute' rows:
traitSimulNA<-traitWithoutNA[sample(1:nrow(traitWithoutNA), percImpute*nrow(traitWithoutNA)), ]
randomMask <- traitNAMask[sample(1:nrow(traitNAMask) , nrow(traitSimulNA)), ]
# We create missing values in the traitSimulNA matrix by "pasting" them from real missing observations (this way keeping the structure of missing data):
traitSimulNA <- traitSimulNA * randomMask
# We simulate this missing traits using the same imputation method than using in the paper:
# Using the phylogenetic eigenvalues
# We combine the new dataset of species without NA ('traitWithoutNA') and the other species with 'real' NA:
# we impute all traits
traitSimulNAAll <- rbind(traitSimulNA, # Our new NA simulated
traitWithoutNA[setdiff(rownames(traitWithoutNA), rownames(traitSimulNA)), ]) # the species with complete data without the species with new NA
traitsAux <- rbind(traitSimulNAAll, traitWithNA) # the other species
# Imputation with 'missForest'
imputeRFModel <- missForest(xmis = traitsAux)$ximp
traitsinPhylImput <- as.matrix(imputeRFModel)
# We scale the data using the same mean and SD that we used in the paper:
traitsinPhylImputScaled<-traitsinPhylImput[,selectedTraits]
for (i in 1:length(selectedTraits)){
traitsinPhylImputScaled[,i]<-(traitsinPhylImput[, selectedTraits[i]] - meanInputed[i]) / sdInputed[i]
}
colnames(traitsinPhylImputScaled)<-names(meanInputed)
# We predict the position of the imputed species in the functional space:
PCApositionNA <- predict(PCAmodel, newdata = traitsinPhylImputScaled[rownames(traitSimulNA), ])
# Imputed with PCA compared to original position of the species with the functional space
imputed<-PCApositionNA[,dimensions]
original<-traitPCA[rownames(PCApositionNA),]
RMSE1<-(mean((original[,1] - imputed[,1]) ** 2)) / (max(traitPCA[, 1]) - min(traitPCA[, 1]))
RMSE2<-(mean((original[,2] - imputed[,2]) ** 2)) / (max(traitPCA[, 2]) - min(traitPCA[, 2]))
RMSE[b,]<-c(RMSE1,RMSE2)
for (i in dimensions){
output[[i]][rownames(imputed),b] <- imputed[, i]
}
print(apply(RMSE,2,mean,na.rm=T))
}
return(list(output,RMSE))
}