imputeTraits(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed, ...)

Full imputation pipeline: run missForest on selected traits, back-transform to original scale, validate via PCA comparison.

machine learningstatisticsfunctional diversity

Args:traitsData — trait data frame with NAsselectedTraits — columns to imputetraitPCA — PCA of complete casesPCAmodel — reference PCA modelmeanInputed — column means for back-transformation

imputeTraits<-function(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed,
                       sdInputed, nm_PCA, percImpute=1, dimensions, nboot=100){
  require(missForest)
  require(Metrics)
  
  # Data: extract the species with complete 'selectedTraits'.  For imputation all available traits are used: 'columsImputation'
  columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
  completeSp <- rownames(na.omit(traitsData[,selectedTraits]))
  missingSp <- setdiff(rownames(traitsData), completeSp)
  traitWithoutNA<-traitsData[completeSp, columsImputation]
  traitWithNA <- traitsData[missingSp, columsImputation]
  #transform the traits with NA values into a mask (1 for existing traits, NA for NA traits)
  traitNAMask <- replace(traitWithNA, traitWithNA > -Inf, 1)
  output <- list()
  for (i in dimensions){
    output[[i]] <- matrix(NA, nc = nboot, nr = nrow(traitsData), dimnames = list(rownames(traitsData), paste0("Sim. ", 1:nboot)))
    names(output)[i] <- paste0("PC", i)
  }
  RMSE<-matrix(NA,nc=2,nr=nboot,dimnames = list(paste0("Sim.",1:nboot),c("RMSE_PC1","RMSE_PC2")))
  for (b in 1:nboot){
    cat(paste0("\n..................SIMULATION ",b," out of ",nboot,"................\n"))
    
    # Among the complete data, let's add NA values for 'percImpute' rows:
    traitSimulNA<-traitWithoutNA[sample(1:nrow(traitWithoutNA), percImpute*nrow(traitWithoutNA)), ]
    randomMask <- traitNAMask[sample(1:nrow(traitNAMask) , nrow(traitSimulNA)), ]
    
    # We create missing values in the traitSimulNA matrix by "pasting" them from real missing observations (this way keeping the structure of missing data):
    traitSimulNA <-  traitSimulNA * randomMask
    
    # We simulate this missing traits using the same imputation method than using in the paper:
    # Using the phylogenetic eigenvalues
    # We combine the new dataset of species without NA ('traitWithoutNA') and the other species with 'real' NA:
    # we impute all traits 
    traitSimulNAAll <- rbind(traitSimulNA, # Our new NA simulated
                             traitWithoutNA[setdiff(rownames(traitWithoutNA), rownames(traitSimulNA)),  ]) # the species with complete data without the species with new NA 
    traitsAux <- rbind(traitSimulNAAll, traitWithNA) # the other species 
    # Imputation with 'missForest'
    imputeRFModel <- missForest(xmis = traitsAux)$ximp
    traitsinPhylImput <- as.matrix(imputeRFModel)
    
    # We scale the data using the same mean and SD that we used in the paper: 
    traitsinPhylImputScaled<-traitsinPhylImput[,selectedTraits]
    for (i in 1:length(selectedTraits)){
      traitsinPhylImputScaled[,i]<-(traitsinPhylImput[, selectedTraits[i]] - meanInputed[i]) / sdInputed[i]  
    }
    colnames(traitsinPhylImputScaled)<-names(meanInputed)
    
    # We predict the position of the imputed species in the functional space:
    PCApositionNA <- predict(PCAmodel, newdata = traitsinPhylImputScaled[rownames(traitSimulNA), ])
    
    # Imputed with PCA compared to original position of the species with the functional space
    imputed<-PCApositionNA[,dimensions]
    original<-traitPCA[rownames(PCApositionNA),]
    
    RMSE1<-(mean((original[,1] - imputed[,1]) ** 2)) / (max(traitPCA[, 1]) - min(traitPCA[, 1]))
    RMSE2<-(mean((original[,2] - imputed[,2]) ** 2)) / (max(traitPCA[, 2]) - min(traitPCA[, 2]))
    
    RMSE[b,]<-c(RMSE1,RMSE2)
    
    for (i in dimensions){
      output[[i]][rownames(imputed),b]  <- imputed[, i] 
    }
    print(apply(RMSE,2,mean,na.rm=T))
  }
  return(list(output,RMSE))
}