mean_impute(df, by_group)

Simple mean imputation: replace NA values with column means (optionally per taxonomic group). Fast fallback before random-forest imputation.

imputation, utility

Args: df — species × traits; by_group = NULL — grouping column (e.g. "order")

#' Mean-impute missing values in the numeric columns of a data frame
#'
#' Every `NA` in a numeric column is replaced by the mean of the observed
#' (non-missing) values of that column. When `by_group` is given, the mean of
#' the row's group is used first; rows whose group has no observed values (or
#' whose group label is itself missing) fall back to the column-wide mean.
#'
#' @param df A data frame. Only columns for which `is.numeric()` is `TRUE`
#'   are imputed; all other columns are returned untouched.
#' @param by_group `NULL` (default) for column-wide means, or a single column
#'   name (or position) in `df` that defines the groups. The grouping column
#'   itself is never imputed.
#' @return `df` with missing numeric values filled in. A column with no
#'   observed values at all stays missing (its mean is `NaN`).
mean_impute <- function(df, by_group = NULL) {
  # vapply() keeps the result logical even for a zero-column input, where
  # sapply() would return a list and break the subscript below.
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  group_key <- NULL
  if (!is.null(by_group)) {
    # Accept a single column position as well as a name.
    if (is.numeric(by_group) && length(by_group) == 1L) {
      by_group <- names(df)[by_group]
    }
    if (!is.character(by_group) || length(by_group) != 1L ||
        is.na(by_group) || !by_group %in% names(df)) {
      stop("`by_group` must identify a single existing column of `df`",
           call. = FALSE)
    }
    # Look groups up by label. Indexing the tapply() result with a numeric
    # grouping column would otherwise be interpreted as positions.
    group_key <- as.character(df[[by_group]])
    # A numeric grouping column must not be imputed itself.
    num_cols <- setdiff(num_cols, by_group)
  }

  for (col in num_cols) {
    values <- df[[col]]
    missing <- is.na(values)
    # Computed before any filling so it reflects observed data only.
    global_mean <- mean(values, na.rm = TRUE)

    if (!is.null(group_key)) {
      group_means <- tapply(values, group_key, mean, na.rm = TRUE)
      values[missing] <- group_means[group_key[missing]]
      # Groups without observed values yield NaN/NA; handled below.
      missing <- is.na(values)
    }

    values[missing] <- global_mean
    df[[col]] <- values
  }
  df
}
# ── Example ──────────────────────────────────────────────────────
# Knock random holes into two iris trait columns, then fill them with
# per-species means.
df <- iris
df$Sepal.Length[sample(150, 25)] <- NA
df$Petal.Width[sample(150, 30)] <- NA
cat("Before:", sum(is.na(df)), "NAs\n")

df_imp <- mean_impute(df, by_group = "Species")
# Columns 1:4 are the numeric measurements of iris.
cat("After: ", sum(is.na(df_imp[, 1:4])), "NAs\n")