This page collects reusable R utilities (installation helpers, summary/stat functions, trait workflows, PCA, TPD utilities, and assorted snippets).
All chunks are shown for copy/paste but not evaluated (eval = FALSE).

Contents

Library

packages <- c(
  "ade4","ape","berryFunctions","betapart","biscale","cowplot","PCA_TRAITS.table",
  "dplyr","funrar","geiger","ggplot2","ggpubr","lsmeans","missForest","motmot",
  "multcomp","mvMORPH","paleotree","pals","paran","phylobase","phytools","picante",
  "plotly","plotrix","psych","quanteda","ratematrix","RColorBrewer","readr","rgbif",
  "rnaturalearth","rredlist","sf","shape","stats","tidyr","TPD","vegan",
  "VennDiagram","viridis","wesanderson"
)

installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}
# Packages loading
lapply(packages, library, character.only = TRUE)

Statistics

distrib <- function(x, k = 6) {
  # NA removing
  x <- x[is.na(x) == FALSE]

  Nx <- length(x)
  sum <- round(summary(x), k)
  res <- c(Nx, sum)
  names(res) <- c("N", names(sum))
  return(res)
} # end of function distrib

hms_span <- function(start, end) {
  dsec <- as.numeric(difftime(end, start, unit = "secs"))
  hours <- floor(dsec / 3600)
  minutes <- floor((dsec - 3600 * hours) / 60)
  seconds <- dsec - 3600 * hours - 60 * minutes
  paste0(
    sapply(c(hours, minutes, seconds), function(x) {
      formatC(x, width = 2, format = "d", flag = "0")
    }),
    collapse = ":"
  )
}

sesandpvalues_bis <- function(obs, rand, nreps, probs = c(0.025, 0.975), rnd = 2) {
  SES <- (obs - mean(rand)) / sd(rand)
  pValsSES <- rank(c(obs, rand))[1] / (length(rand) + 1)
  results <- round(c(obs, SES, mean(rand), quantile(rand, prob = probs), pValsSES, nreps), rnd)
  names(results) <- c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
  return(results)
}

extract_t_test <- function(x, y, ...) {
  z <- t.test(x, y)
  stat <- z$statistic
  pval <- z$p.value
  pvalStars <- ifelse(pval > 0.05, ".",
                      ifelse(pval > 0.01, "*",
                             ifelse(pval > 0.001, "**", "***")))
  est <- z$estimate
  sdv <- c(sd(x), sd(y))

  res <- data.frame(est[1], sdv[1], est[2], sdv[2], stat, pval, pvalStars)
  res[1:4] <- round(as.numeric(res[1:4]), 2)
  names(res) <- c("Mean x", "sd x", "Mean y", "sd y", "t", "Pvalues", "P")
  return(res)
}

extract_indices_list <- function(ind) {
  indSES <- matrix(
    NA, nr = ncol(ind), nc = 7,
    dimnames = list(
      colnames(ind),
      c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
    )
  )
  for (i in 1:nrow(indSES)) {
    indSES[i, ] <- sesandpvalues_bis(
      obs = ind[1, i],
      rand = ind[-1, i],
      nreps = length(ind[, 1]) - 1,
      probs = c(0.025, 0.975),
      rnd = 2
    )
  }
  rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
  return(indSES)
}

extract_indices_change <- function(ind) {
  indSES <- matrix(
    NA, nr = ncol(ind), nc = 7,
    dimnames = list(
      colnames(ind),
      c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
    )
  )
  for (i in 1:nrow(indSES)) {
    indSES[i, ] <- sesandpvalues_bis(
      obs = ((ind[2, i] - ind[1, i]) / ind[1, i]) * 100,
      rand = ((ind[-c(1:2), i] - ind[1, i]) / ind[1, i]) * 100,
      nreps = length(ind[, 1]) - 2,
      probs = c(0.025, 0.975),
      rnd = 2
    )
  }
  rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
  return(indSES)
}

extract_indices_change_log <- function(ind) {
  indSES <- matrix(
    NA, nr = ncol(ind), nc = 7,
    dimnames = list(
      colnames(ind),
      c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
    )
  )
  for (i in 1:nrow(indSES)) {
    indSES[i, ] <- sesandpvalues_bis(
      obs = log(ind[2, i] / ind[1, i]),
      rand = log(ind[-c(1:2), i] / ind[1, i]),
      nreps = length(ind[, 1]) - 2,
      probs = c(0.025, 0.975),
      rnd = 2
    )
  }
  rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
  return(indSES)
}

Sorting data

standard_names <- function(Traits, nrowTraits, source = 163, dim = NA) {
  require(taxize)
  standNamesTraits <- as.data.frame(
    matrix(
      NA,
      nrow = nrowTraits,
      ncol = 1,
      dimnames = list(rownames(Traits), "IUCNName")
    )
  )
  batchSize <- 1000

  for (i in 1:(ceiling(nrowTraits / batchSize))) {
    cat(paste0("\nBatch ", i, " out of ", (ceiling(nrowTraits / batchSize)), "\n"))
    rowsSelect <- ((i - 1) * batchSize + 1):min((i * batchSize), nrowTraits)
    taxoAux <- as.data.frame(
      taxize::gnr_resolve(
        names = rownames(standNamesTraits)[rowsSelect],
        preferred_data_sources = source, # 163 is IUCN
        best_match_only = TRUE,
        canonical = TRUE
      )
    )
    standNamesTraits[taxoAux$user_supplied_name, "IUCNName"] <- taxoAux$matched_name2
  }

  standNamesTraits[, "IUCNName"] <- gsub(
    x = standNamesTraits[, "IUCNName"],
    pattern = " ",
    replacement = "_"
  )
  naIUCN <- which(is.na(standNamesTraits[, "IUCNName"]))
  standNamesTraits[naIUCN, "IUCNName"] <- rownames(standNamesTraits)[naIUCN]

  # Some species could not be resolved into IUCN naming. Remove them:
  yesName <- which(!is.na(standNamesTraits[, "IUCNName"]))
  if (dim == 1) {
    return(Traits[yesName])
  } else {
    return(Traits[yesName, ])
  }
}

addToPhylo <- function(phylogenyTraits, GROUPTraits) {
  phylogenyTraits <- force.ultrametric(tree = phylogenyTraits, method = c("extend"))
  tipsToAdd <- which(!rownames(GROUPTraits) %in% phylogenyTraits$tip.label)
  namesToAdd <- rownames(GROUPTraits)[tipsToAdd]
  genusToAdd <- unlist(strsplit(namesToAdd, "_"))[
    which(1:length(strsplit(namesToAdd, "_")) %% 2 == 1)
  ]
  phylogenyTraitsAdd <- phylogenyTraits

  for (add in 1:length(namesToAdd)) {
    cat(paste("\rADDING SP TO GENUS IN PHYLOGENY", add, "/", length(namesToAdd), "\r"))
    phylogenyTraitsAdd <- add.species.to.genus(
      tree = phylogenyTraitsAdd,
      species = namesToAdd[add],
      genus = genusToAdd[add],
      where = "root"
    )
  }
  return(phylogenyTraitsAdd)
}

RedToTraits <- function(IUCNGROUPtoTraits) {
  RedToTraits <- matrix(
    rep(NA, nrow(IUCNGROUPtoTraits)),
    ncol = 1,
    dimnames = list(IUCNGROUPtoTraits$Species, "IUCN")
  )

  for (i in 1:nrow(IUCNGROUPtoTraits)) {
    selRows <- which(IUCNGROUPtoTraits$Species == IUCNGROUPtoTraits$Species[i])
    if (length(selRows) > 1) {
      dataAux <- IUCNGROUPtoTraits[selRows, ]
      if (any(is.na(dataAux$infra_name))) {
        coger <- which(is.na(dataAux$infra_name))
        if (length(coger) == 1) {
          RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <- as.character(dataAux$categorySimp[coger])
        } else {
          if (length(unique(dataAux$categorySimp[coger])) == 1) {
            RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
              as.character(unique(dataAux$categorySimp[coger]))
          } else {
            RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
              as.character(sort(unique(dataAux$categorySimp))[1])
          }
        }
      } else {
        RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
          as.character(sort(unique(dataAux$categorySimp))[1])
      }
    } else {
      RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
        as.character(IUCNGROUPtoTraits[selRows, "categorySimp"])
    }
  }
  return(RedToTraits)
}

imputation_trait <- function(GROUPData, selectedTraits, shortnames = c("am", "bs", "ls", "os")) {
  require(missForest)
  columnsTraits <- 1:(which(colnames(GROUPData) == "Eigen.1") - 1)
  GROUPData[, columnsTraits] <- log10(GROUPData[, columnsTraits])

  # NOTE: All traits are kept for imputation
  columsImputation <- 1:(which(colnames(GROUPData) == "IUCN") - 1)
  GROUPTraitslogImputed <- as.matrix(missForest(xmis = GROUPData[, columsImputation])$ximp)

  GROUPTraitsImputed <- GROUPTraitsMissing <- GROUPData
  GROUPTraitsImputed[, columnsTraits] <- GROUPTraitslogImputed[, columnsTraits]

  GROUPTraitsMissing <- GROUPTraitsMissing[, selectedTraits]
  GROUPTraitsImputed <- GROUPTraitsImputed[, selectedTraits]
  colnames(GROUPTraitsMissing) <- colnames(GROUPTraitsImputed) <- shortnames

  GROUPOtherInfo <- GROUPData[, (max(columnsTraits) + 1):ncol(GROUPData)]
  GROUPTraitsImputed <- data.frame(GROUPTraitsImputed, IUCN = GROUPData$IUCN)
  GROUPTraitsMissing <- data.frame(GROUPTraitsMissing, IUCN = GROUPData$IUCN)

  return(list(GROUPTraitsImputed, GROUPTraitsMissing))
}

PCA

# Perform PCA
# TRAITS: table of traits sp x tr : MUST BE SCALED AND log10
make.PCA <- function(TRAITS, dimensionsAux, savePCA) {
  paranAux <- paran(TRAITS)
  PCA_TRAITS <- list()
  PCA_TRAITS$traits <- TRAITS
  PCA_TRAITS$PCA <- princomp(PCA_TRAITS$traits)
  PCA_TRAITS$dimensions <- dimensionsAux
  PCA_TRAITS$variance <- (summary(PCA_TRAITS$PCA)[1][[1]]^2)[1:dimensionsAux] /
    sum(summary(PCA_TRAITS$PCA)[1][[1]]^2)
  PCA_TRAITS$loadings <- PCA_TRAITS$PCA$loadings
  PCA_TRAITS$traitsUse <- data.frame(PCA_TRAITS$PCA$scores[, 1:PCA_TRAITS$dimensions])
  if (!is.na(savePCA)) {
    saveRDS(PCA_TRAITS, savePCA)
  }
  return(PCA_TRAITS)
}

# Possible to turn it as function for all groups (Nat Comm 2021)
make.PCA.groups <- function(traitsImputed, traitsMissing,
                            group = c("Plants","Birds","Mammals","Reptiles","Amphibians","Fishes")) {
  require(paran)
  paranAux <- paran(traitsImputed[, 1:(ncol(traitsImputed) - 1)])
  if (paranAux$Retained < 2) paranAux$Retained <- 2
  dimensionsAux <- paranAux$Retained

  PCAImpute <- list()
  PCAImpute$traits <- traitsImputed[, 1:(ncol(traitsImputed) - 1)]
  PCAImpute$traitsMissing <- traitsMissing[, 1:(ncol(traitsImputed) - 1)]
  PCAImpute$nImputed <- rowSums(is.na(PCAImpute$traitsMissing))
  PCAImpute$PCA <- princomp(traitsImputed[, 1:(ncol(traitsImputed) - 1)])
  PCAImpute$dimensions <- dimensionsAux
  PCAImpute$variance <- (summary(PCAImpute$PCA)[1][[1]]^2)[1:dimensionsAux] /
    sum(summary(PCAImpute$PCA)[1][[1]]^2)

  if (group %in% c("Birds","Mammals")) { # Mammals & Birds
    multPC1 <- ifelse(PCAImpute$PCA$loadings["bm", "Comp.1"] > 0, 1, -1)
    PCAImpute$PCA$scores[, 1] <- multPC1 * PCAImpute$PCA$scores[, 1]
    PCAImpute$PCA$loadings[, "Comp.1"] <- multPC1 * PCAImpute$PCA$loadings[, "Comp.1"]
  }

  if (group == "Reptiles") { # Reptiles
    multPC2 <- ifelse(PCAImpute$PCA$loadings["inc", "Comp.2"] > 0, -1, 1)
    PCAImpute$PCA$scores[, 2] <- multPC2 * PCAImpute$PCA$scores[, 2]
    PCAImpute$PCA$loadings[, "Comp.2"] <- multPC2 * PCAImpute$PCA$loadings[, "Comp.2"]
  }

  if (group == "Fishes") { # Fishes
    multPC1 <- ifelse(PCAImpute$PCA$loadings["svl", "Comp.1"] > 0, 1, -1)
    multPC2 <- ifelse(PCAImpute$PCA$loadings["svl", "Comp.2"] < 0, 1, -1)
    PCAImpute$PCA$scores[, 1] <- multPC1 * PCAImpute$PCA$scores[, 1]
    PCAImpute$PCA$loadings[, "Comp.1"] <- multPC1 * PCAImpute$PCA$loadings[, "Comp.1"]
    PCAImpute$PCA$scores[, 2] <- multPC2 * PCAImpute$PCA$scores[, 2]
    PCAImpute$PCA$loadings[, "Comp.2"] <- multPC2 * PCAImpute$PCA$loadings[, "Comp.2"]
  }

  PCAImpute$loadings <- PCAImpute$PCA$loadings
  PCAImpute$traitsUse <- data.frame(PCAImpute$PCA$scores[, 1:PCAImpute$dimensions])

  treathCat <- c("EX", "EW", "CR", "EN", "VU")
  noThreatCat <- c("LC", "NT")
  PCAImpute$Threat <- with(traitsImputed,
                           ifelse(IUCN %in% treathCat, 1,
                                  ifelse(IUCN %in% noThreatCat, 0, NA)))
  return(PCAImpute)
}


# =========================
# Example for birds
# =========================
# 1. Load data
# =========================
PCA <- readRDS("data/processed/PCA_Birds.rds")
shortNames <- read.csv("data/processed/Shortnames_Birds.csv")
type <- names(PCA)
title_space = c('Combined','Locomotion','Reproduction','Diet')
# =========================
# 2. Define functions
# =========================

PCA_plot <- function(PCoAPlot,PCoACorPlot,multAx1,multAx,title,legend){
  Plan12=ggplot(data = data.frame(PCoAPlot$vectors)) +
    geom_point(aes(x = Axis.1, y = Axis.2),col="grey89") +
    theme_classic()+
    ggtitle(title)+
    xlab(paste0("PCoA 1 (",round(PCoAPlot$values[1,2]*100,2),"%)"))+
    ylab(paste0("PCoA 2 (",round(PCoAPlot$values[2,2]*100,2),"%)"))+
    #coord_fixed() + ## need aspect ratio of 1!
    geom_segment(data = data.frame(PCoACorPlot),
                 aes(x = 0, xend = Axis.1*multAx, y = 0, yend = Axis.2*multAx),
                 arrow = arrow(length = unit(0.25, "cm")), colour = PCoACorPlot$color) +
    geom_text(data = data.frame(PCoACorPlot), 
              aes(x =  Axis.1*multAx1, y = Axis.2*multAx1, 
                  label =  names), colour = PCoACorPlot$color,
              size = 5) +
    theme(
      plot.title = element_text(
        size = 18,       # Increase size (adjust as needed)
        face = "bold"    # Make it bold
      )
    )
  
  if(!is.null(legend)){
    Plan12 = Plan12 + annotate(geom = "text",label = legend, x = Inf, y = -Inf, hjust = 1, vjust = -0.5)      # Make text bold)
  }
  
  
  
  Plan12
}

# ---------- utilities ----------
.sanitize_axes <- function(df) {
  stopifnot(all(c("Axis.1","Axis.2") %in% names(df)))
  df <- df[is.finite(df$Axis.1) & is.finite(df$Axis.2), , drop = FALSE]
  df$Axis.1 <- as.numeric(df$Axis.1)
  df$Axis.2 <- as.numeric(df$Axis.2)
  df
}

.safe_limits <- function(x, pad_factor = 0.08) {
  xr <- range(x, finite = TRUE)
  if (!is.finite(xr[1]) || !is.finite(xr[2])) return(c(-1, 1))
  if (xr[1] == xr[2]) {
    eps <- ifelse(abs(xr[1]) > 0, abs(xr[1]) * pad_factor, 1e-6)
    return(c(xr[1] - eps, xr[2] + eps))
  }
  width <- diff(xr)
  c(xr[1] - width * pad_factor, xr[2] + width * pad_factor)
}

# KDE grid using {ks}
.kde_grid <- function(df, gridsize = 181, H = NULL) {
  if (!requireNamespace("ks", quietly = TRUE)) {
    stop("Please install.packages('ks') to use the KDE-based density background.")
  }
  df <- .sanitize_axes(df)
  if (nrow(df) < 3) return(list(grid = NULL, x = NULL, y = NULL, z = NULL))
  
  xlim <- .safe_limits(df$Axis.1); ylim <- .safe_limits(df$Axis.2)
  
  X <- cbind(
    pmin(pmax(df$Axis.1, xlim[1]), xlim[2]),
    pmin(pmax(df$Axis.2, ylim[1]), ylim[2])
  )
  
  if (is.null(H)) {
    H <- try(ks::Hpi(x = X), silent = TRUE)
    if (inherits(H, "try-error") || any(!is.finite(as.vector(H)))) {
      H <- try(ks::Hscv(x = X), silent = TRUE)
    }
    if (inherits(H, "try-error") || any(!is.finite(as.vector(H)))) {
      v <- apply(X, 2, stats::var, na.rm = TRUE); v[v <= 0 | !is.finite(v)] <- 1e-6
      H <- diag(v)
    }
  }
  
  kde <- ks::kde(
    x = X, H = H,
    xmin = c(xlim[1], ylim[1]), xmax = c(xlim[2], ylim[2]),
    gridsize = c(gridsize, gridsize)
  )
  
  gx <- kde$eval.points[[1]]
  gy <- kde$eval.points[[2]]
  gz <- kde$estimate  # matrix [length(gx) x length(gy)]
  
  grid <- expand.grid(x = gx, y = gy)
  grid$z <- as.vector(gz)
  
  list(grid = grid, x = gx, y = gy, z = gz)
}

# Compute density cutoffs (levels) that enclose given probabilities on the grid
.hdr_levels_grid <- function(gx, gy, gz, probs = c(0.25, 0.5, 0.99)) {
  # assume regular grid (true for ks::kde)
  dx <- mean(diff(gx)); dy <- mean(diff(gy))
  w <- as.vector(gz)
  ord <- order(w, decreasing = TRUE)
  cum_mass <- cumsum(w[ord]) * dx * dy
  # normalize to 1 (numerical integration may be slightly off)
  cum_mass <- cum_mass / max(cum_mass, na.rm = TRUE)
  
  sapply(probs, function(p) {
    idx <- which(cum_mass >= p)[1]
    if (is.na(idx)) min(w, na.rm = TRUE) else w[ord][idx]
  })
}

# Build contour paths + label positions from a raster grid
.contour_data <- function(gx, gy, gz, levels, prob_labels) {
  cls <- contourLines(x = gx, y = gy, z = gz, levels = levels)  # note t()
  if (length(cls) == 0) return(list(lines = NULL, labels = NULL))
  lines_df <- do.call(rbind, lapply(seq_along(cls), function(i) {
    data.frame(x = cls[[i]]$x, y = cls[[i]]$y, level = cls[[i]]$level, id = i)
  }))
  labels_df <- do.call(rbind, lapply(seq_along(cls), function(i) {
    n <- length(cls[[i]]$x); j <- floor(n/2)
    data.frame(x = cls[[i]]$x[j], y = cls[[i]]$y[j], level = cls[[i]]$level, id = i)
  }))
  # map level -> probability text
  lvl_map <- setNames(prob_labels, levels)
  labels_df$prob <- lvl_map[as.character(labels_df$level)]
  list(lines = lines_df, labels = labels_df)
}

# ---------- funspace-style plot ----------
PCA_plot_funspace <- function(PCoAPlot, PCoACorPlot, multAx1, multAx, title, legend = NULL,
                              probs = c(0.25, 0.5, 0.99),
                              gridsize = 181, H = NULL, bins_filled = 30) {
  
  df <- data.frame(PCoAPlot$vectors)
  df <- .sanitize_axes(df)
  
  # KDE grid
  KG <- .kde_grid(df, gridsize = gridsize, H = H)
  
  # density background
  # continuous raster background
  p <- ggplot() +
    geom_raster(
      data = KG$grid,
      aes(x = x, y = y, fill = z)
    ) +
    scale_fill_gradientn(
      colours = c(
        "#FFFFFF",  # white
        "#FC8D59",  # medium orange-red
        "#EF6548",  # strong red-orange
        "#D7301F",  # deep red
        "#990000"   # dark brownish red
      ),
      guide = "none"
    )
  
  # HDR probability contours + labels (0.25, 0.5, 0.95 by default)
  levs <- .hdr_levels_grid(KG$x, KG$y, KG$z, probs = probs)
  cd <- .contour_data(KG$x, KG$y, KG$z, levels = as.numeric(levs),
                      prob_labels = paste0(probs))
  if (!is.null(cd$lines)) {
    p <- p +
      geom_path(data = cd$lines, aes(x, y, group = id),
                colour = "black", linewidth = 0.6)
  }
  if (!is.null(cd$labels)) {
    p <- p +
      geom_text(data = cd$labels, aes(x, y, label = prob),
                colour = "black", size = 3)
  }
  
  # trait loading arrows + labels
  p <- p +
    geom_segment(
      data = data.frame(PCoACorPlot),
      aes(x = 0, xend = Axis.1 * multAx, y = 0, yend = Axis.2 * multAx),
      arrow = arrow(length = unit(0.25, "cm")),
      colour = PCoACorPlot$color, linewidth = 0.5
    ) +
    geom_text(
      data = data.frame(PCoACorPlot),
      aes(x = Axis.1 * multAx1, y = Axis.2 * multAx1, label = names),
      colour = PCoACorPlot$color, size = 5
    ) +
    theme_classic() +
    ggtitle(title) +
    xlab(paste0("PCoA 1 (", round(PCoAPlot$values[1, 2] * 100, 2), "%)")) +
    ylab(paste0("PCoA 2 (", round(PCoAPlot$values[2, 2] * 100, 2), "%)")) +
    coord_fixed() +
    theme(plot.title = element_text(size = 18, face = "bold"))
  
  if (!is.null(legend)) {
    p <- p + annotate("text", label = legend, x = Inf, y = -Inf, hjust = 1, vjust = -0.5)
  }
  
  p
}


## 2.1 Generate a PCA correlation plot with density legend
make_pca_plot <- function(pca_object, correlation_data, multAx1, multAx, title) {
  legend_text <- paste0(
    "Space occupied by:\n",
    "50% = ", round(pca_object$ALLDensity[1, "0.5"], 2), "\n",
    "99% = ", round(pca_object$ALLDensity[1, "0.99"], 2)
  )
  PCA_plot(pca_object$PCoA, correlation_data, multAx1, multAx, title, legend_text)
}
make_pca_plot_34 <- function(pca_object, correlation_data, multAx1, multAx, title) {
  legend_text <- paste0(
    "Space occupied by:\n",
    "50% = ", round(pca_object$ALLDensity[1, "0.5"], 2), "\n",
    "99% = ", round(pca_object$ALLDensity[1, "0.99"], 2)
  )
  PCA_plot_34(pca_object$PCoA, correlation_data, multAx1, multAx, title, legend_text)
}

## 2.2 Format PCA correlation table with colors and labels
format_correlation_table <- function(PCoACorPlot, shortNames) {
  cbind.data.frame(
    PCoACorPlot,
    color = shortNames[match(rownames(PCoACorPlot), shortNames$original), "color"],
    names = shortNames[match(rownames(PCoACorPlot), shortNames$original), "short"]
  )
}

## 2.3 Perform Procrustes analysis between PCA spaces
run_procrustes <- function(PCA_list) {
  combNames <- combn(names(PCA_list), 2)
  combNames <- apply(combNames, 2, function(x) paste0(x[1], "_", x[2]))
  procrustes_table <- matrix(NA, ncol = 1, nrow = length(combNames),
                             dimnames = list(combNames, 'Birds'))
  
  for (j in 1:length(PCA_list)) {
    x <- PCA_list[[j]]$PCoA$vectors
    for (i in 1:length(PCA_list)) {
      if (i > j) {
        y <- PCA_list[[i]]$PCoA$vectors
        rownames(y) = rownames(x)
        prcTest <- ade4::procuste.rtest(as.data.frame(x), as.data.frame(y), nrepet = 999)
        cor_coef <- prcTest$obs
        p_val <- prcTest$pvalue
        signif <- ifelse(p_val < 0.001, "***", ifelse(p_val < 0.01, "**", ifelse(p_val < 0.05, "*", "ns")))
        procrustes_table[paste0(names(PCA_list)[j], "_", names(PCA_list)[i]), 1] <- 
          paste0(round(cor_coef, 3), " ", signif)
      }
    }
  }
  return(procrustes_table)
}

# =========================
# 3. Perform analysis
# =========================
multAx <- 0.25
multAx1 <- 0.29
plotPCAList <- plotPCAList_34 <- list()
densityTable <- NULL

for (i in seq_along(PCA)) {
  cor_table <- format_correlation_table(PCA[[i]]$PCoACor, shortNames)
  plotPCAList[[i]] <- PCA_plot_funspace(PCA[[i]]$PCoA, cor_table, multAx1, multAx, paste0(title_space[i],"-trait space"))
  #plotPCAList_34[[i]] <- PCA_plot_funspace(PCA[[i]], cor_table, multAx1, multAx, paste0(title_space[i],"-trait space"))
  
  densityTable <- rbind(densityTable,
                        data.frame(`50` = PCA[[i]]$ALLDensity[1, "0.5"],
                                   `99` = PCA[[i]]$ALLDensity[1, "0.99"],
                                   `100` = PCA[[i]]$ALLDensity[1, "1"],
                                   row.names = names(PCA)[i]))
}

names(plotPCAList)  <- type

# =========================
# 4. Save outputs
# =========================

# 4.1 Density table
write.csv(densityTable, file = "results/tables/BirdstableDensity.csv")

# 4.2 PCA plots grid
ggarrange(
  NULL, plotPCAList[["MLD"]], NULL,
  plotPCAList[["M"]], plotPCAList[["L"]], plotPCAList[["D"]],
  hjust = 0, align = "v", ncol = 3, nrow = 2
) %>% ggexport(
  filename = "results/figures/TraitsSpacesForFramework.png",
  width = 2800, height = 1800, res = 200, pointsize = 5
)

# 4.3 Procrustes correlation table
procrustes_res <- run_procrustes(PCA)
write.csv(procrustes_res, file = "results/tables/ProcrustesTable.csv")

Functional diversity

TPDsMean without ITV

# PCA_TRAITS: object from PCA previously done
dimensions <- PCA_TRAITS$dimensions
traitsUSE <- PCA_TRAITS$traitsUse
colnames(traitsUSE) <- paste0("Comp.", 1:dimensions)
gridSize <- ifelse(dimensions == 4, 30, 100)
sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))

TPDsAux <- TPDsMean(
  species = rownames(traitsUSE),
  means = traitsUSE,
  sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
  alpha = 0.95,
  n_divisions = gridSize
)

make.TPD.2D.high.def <- function(targetGroupName, PCAImpute, alphaUse = 0.95,
                                 saveFile = paste0(getwd(), "/TPDs2DHighDef.rds")) {
  require(TPD)
  cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
  dimensions <- 2
  traitsUSE <- PCAImpute$traitsUse[, 1:dimensions]
  colnames(traitsUSE) <- paste0("Comp.", 1:2)
  gridSize <- 200
  sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))

  TPDsAux <- TPDsMean(
    species = rownames(traitsUSE),
    means = traitsUSE,
    sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
    alpha = alphaUse,
    n_divisions = gridSize
  )
  saveRDS(TPDsAux, saveFile)
  TPDsAux <- NULL
  gc()
}

make.TPD.2D.low.def <- function(targetGroupName, PCAImpute, alphaUse = 0.95,
                                saveFile = paste0(getwd(), "/TPDs2DLowDef.rds")) {
  require(TPD)
  cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
  dimensions <- 2
  traitsUSE <- PCAImpute$traitsUse[, 1:dimensions]
  colnames(traitsUSE) <- paste0("Comp.", 1:2)
  gridSize <- 100
  sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))

  TPDsAux <- TPDsMean(
    species = rownames(traitsUSE),
    means = traitsUSE,
    sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
    alpha = alphaUse,
    n_divisions = gridSize
  )
  saveRDS(TPDsAux, saveFile)
  TPDsAux <- NULL
  gc()
}

make.TPDLarge <- function(targetGroupName, PCAImpute, alphaUse = 0.95, TPDsMean_large,
                          saveFile = paste0(getwd(), "/TPDsLarge.rds")) {
  cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
  dimensions <- PCAImpute$dimensions
  traitsUSE <- PCAImpute$traitsUse
  colnames(traitsUSE) <- paste0("Comp.", 1:dimensions)
  gridSize <- ifelse(dimensions == 4, 30, 100)
  sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))

  TPDsAux <- TPDsMean_large(
    species = rownames(traitsUSE),
    means = traitsUSE,
    sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
    alpha = alphaUse,
    n_divisions = gridSize
  )
  saveRDS(TPDsAux, saveFile)
  TPDsAux <- NULL
  gc()
}

Function TPD package updated

TPDRichness <- function(TPDc = NULL, TPDs = NULL) {
  if (is.null(TPDc) & is.null(TPDs)) {
    stop("At least one of 'TPDc' or 'TPDs' must be supplied")
  }
  if (!is.null(TPDc) & class(TPDc) != "TPDcomm") {
    stop("Class mismatch: please specify if your object is a TPDc or a TPDs (TPDcomm expected).")
  }
  if (!is.null(TPDs) & class(TPDs) != "TPDsp") {
    stop("Class mismatch: please specify if your object is a TPDc or a TPDs (TPDsp expected).")
  }

  results <- list()

  Calc_FRich <- function(x) {
    results_FR <- numeric()
    if (class(x) == "TPDcomm") {
      TPD <- x$TPDc$TPDc
      names_aux <- names(x$TPDc$TPDc)
      cell_volume <- x$data$cell_volume
    }
    if (class(x) == "TPDsp") {
      TPD <- x$TPDs
      names_aux <- names(x$TPDs)
      cell_volume <- x$data$cell_volume
    }
    for (i in 1:length(TPD)) {
      TPD_aux <- TPD[[i]]
      TPD_aux[TPD_aux > 0] <- cell_volume
      results_FR[i] <- sum(TPD_aux)
    }
    names(results_FR) <- names_aux
    return(results_FR)
  }

  if (!is.null(TPDc)) {
    results$communities <- list()
    results$communities$FRichness <- Calc_FRich(TPDc)
  }
  if (!is.null(TPDs)) {
    if (TPDs$data$type == "One population_One species" |
        TPDs$data$type == "One population_Multiple species") {
      results$species <- list()
      results$species$FRichness <- Calc_FRich(TPDs)
    } else {
      results$populations <- list()
      results$populations$FRichness <- Calc_FRich(TPDs)
    }
    if (TPDs$data$method == "mean") {
      message("WARNING: When TPDs are calculated using the TPDsMean function, Evenness and Divergence are meaningless!!")
    }
  }
  return(results)
}

imageTPD <- function(x, thresholdPlot = 0.99) {
  TPDList <- x$TPDc$TPDc
  imageTPD <- list()
  for (comm in 1:length(TPDList)) {
    percentile <- rep(NA, length(TPDList[[comm]]))
    TPDList[[comm]] <- cbind(index = 1:length(TPDList[[comm]]),
                             prob = TPDList[[comm]], percentile)
    orderTPD <- order(TPDList[[comm]][, "prob"], decreasing = TRUE)
    TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
    TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "prob"])
    TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "index"]), ]
    imageTPD[[comm]] <- TPDList[[comm]]
  }
  names(imageTPD) <- names(TPDList)

  trait1Edges <- unique(x$data$evaluation_grid[, 1])
  trait2Edges <- unique(x$data$evaluation_grid[, 2])
  imageMat <- array(
    NA,
    c(length(trait1Edges), length(trait2Edges), length(imageTPD)),
    dimnames = list(trait1Edges, trait2Edges, names(TPDList))
  )
  for (comm in 1:length(TPDList)) {
    percentileSpace <- x$data$evaluation_grid
    percentileSpace$percentile <- imageTPD[[comm]][, "percentile"]
    for (i in 1:length(trait2Edges)) {
      colAux <- subset(percentileSpace, percentileSpace[, 2] == trait2Edges[i])
      imageMat[, i, comm] <- colAux$percentile
    }
    imageMat[imageMat > thresholdPlot] <- NA
  }
  return(imageMat)
}

overlapF <- function(x, y) {
  if (!identical(x$eval.points, y$eval.points)) stop("Not identical eval points!")
  TPDx <- x$estimate
  if (class(x$eval.points) == "list") {
    x$eval.points <- expand.grid(x$eval.points)
  }
  cellEdge <- numeric()
  for (i in 1:ncol(x$eval.points)) {
    cellEdge[i] <- unique(x$eval.points[, i])[2] - unique(x$eval.points[, i])[1]
  }
  cellSizex <- prod(cellEdge)
  TPDx <- TPDx * cellSizex
  TPDx <- TPDx / sum(TPDx)

  TPDy <- y$estimate
  TPDy <- TPDy * cellSizex
  TPDy <- TPDy / sum(TPDy)

  OL <- sum(pmin(TPDx, TPDy))
  return(OL)
}

percentileTPD <- function(x) {
  TPDList <- x$TPDc$TPDc
  results <- list()
  for (comm in 1:length(TPDList)) {
    percentile <- rep(NA, nrow(TPDList[[comm]]))
    TPDList[[comm]] <- cbind(TPDList[[comm]], percentile)
    orderTPD <- order(TPDList[[comm]][, "notZeroProb"], decreasing = TRUE)
    TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
    TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "notZeroProb"])
    TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "notZeroIndex"]), ]
    results[[comm]] <- TPDList[[comm]]
  }
  names(results) <- names(TPDList)
  return(results)
}

quantileTPD <- function(x, thresholdPlot = 0.99) {
  TPDList <- x$TPDc$TPDc
  quantileTPD <- list()
  for (comm in 1:length(TPDList)) {
    percentile <- rep(NA, length(TPDList[[comm]]))
    TPDList[[comm]] <- cbind(index = 1:length(TPDList[[comm]]),
                             prob = TPDList[[comm]], percentile)
    orderTPD <- order(TPDList[[comm]][, "prob"], decreasing = TRUE)
    TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
    TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "prob"])
    TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "index"]), ]
    quantileTPD[[comm]] <- TPDList[[comm]]
  }
  names(quantileTPD) <- names(TPDList)

  trait1Edges <- unique(x$data$evaluation_grid[, 1])
  trait2Edges <- unique(x$data$evaluation_grid[, 2])
  imageMat <- array(
    NA,
    c(length(trait1Edges), length(trait2Edges), length(quantileTPD)),
    dimnames = list(trait1Edges, trait2Edges, names(TPDList))
  )
  for (comm in 1:length(TPDList)) {
    percentileSpace <- x$data$evaluation_grid
    percentileSpace$percentile <- quantileTPD[[comm]][, "percentile"]
    for (i in 1:length(trait2Edges)) {
      colAux <- subset(percentileSpace, percentileSpace[, 2] == trait2Edges[i])
      imageMat[, i, comm] <- 1 - colAux$percentile
    }
    imageMat[imageMat < (1 - thresholdPlot)] <- 0
  }
  return(imageMat)
}

MAE_TPD <- function(x, y) {
  diffXY <- abs(x - y)
  MAE <- mean(diffXY)
  return(MAE)
}

FishMORPH

################################################################################
# data preparation
################################################################################

# --- Load the data ---

trait <- readRDS("dataOriginal/FishMORPH_Traits.rds")
phylogeny <- readRDS("dataOriginal/FishMORPH_Phylogeny.rds")
traitNames <- colnames(trait)[-c(1:6)]

# --- Prepare the data ---

# Species list
list_sp <- gsub("\\.", " ", as.character(trait$Genus.species))

# Lenght-weight fishbase
lgtwgt <- as.data.frame(length_weight(list_sp))
lgtwgt <- lgtwgt[!is.na(lgtwgt$a) & !is.na(lgtwgt$b), ]

# Best coeff
ab <- lgtwgt %>%
  group_by(Species) %>%
  slice_max(order_by = CoeffDetermination, with_ties = FALSE, na_rm = TRUE) %>%
  dplyr::select(Species, a, b) %>%
  distinct()

# Other traits
speciesInfo <- species(list_sp) %>% data.table()
speciesInfoSub <- speciesInfo[, .(Species, Fresh, LongevityWild, Length, Weight, LTypeMaxM)]
speciesInfoSub <- unique(speciesInfoSub)
speciesInfoSub$Species <- gsub(" ", ".", speciesInfoSub$Species)
speciesInfoSub <- speciesInfoSub %>%
  mutate(
    Length = ifelse(LTypeMaxM != "SL", NA, Length),
    Weight2 = ifelse(Species %in% gsub(" ", ".", rownames(ab)),
                     ab[gsub("\\.", " ", Species), "a"] * Length^ab[gsub("\\.", " ", Species), "b"],
                     NA)
  ) %>%
  mutate(Weight = ifelse(is.na(Weight) & !is.na(Weight2), Weight2, Weight))

# fusion with Fishmorph
fishTraits <- merge(trait[,6:15], speciesInfoSub[, .(Species, Length, Weight)], 
                    by.x = "Genus.species", by.y = "Species", all.x = TRUE)

fishTraits <- fishTraits %>%
  mutate(across(-Genus.species, ~log10(.x + 1))) %>%
  rename(species = Genus.species) %>%
  mutate(species = gsub("\\.", "_", species))

spToKeep <- fishTraits %>% # keep only freshwater species
  dplyr::select(species) %>%
  mutate(species = gsub("_", " ", species)) %>%
  pull() %>%
  rfishbase::species() %>%
  as.data.table() %>%
  filter(Fresh == 1) %>%
  dplyr::select(Species) %>%
  mutate(Species = gsub(" ", "_", Species)) %>%
  pull()

fishTraits <- fishTraits %>%
  filter(species %in% spToKeep)

#dir.create("dataPrepared/Fish", showWarnings = FALSE, recursive = TRUE)

# write.table(fishTraits, "dataPrepared/Fish/fishTraitsMissing.txt")
fishTraits <- read.table( "dataPrepared/Fish/fishTraitsMissing.txt")

# --- PCoA phylogenetic ---

phylogeny$tip.label <- gsub("\\.", "_", phylogeny$tip.label)
phylogeny <- drop.tip(phylogeny, setdiff(phylogeny$tip.label, fishTraits$species))
phylogenyTraits <- phytools::force.ultrametric(phylogeny)
phylDiss <- sqrt(cophenetic(phylogenyTraits))
pcoaPhyl <- cmdscale(phylDiss, k = 10) #long

rownames(pcoaPhyl) = fishTraits$species
colnames(pcoaPhyl) = paste0("Eigen.", 1:10)

# write.table(pcoaPhyl, "dataPrepared/Fish/pcoaPhylogenyFish.txt")
pcoaPhyl <- read.table("dataPrepared/Fish/pcoaPhylogenyFish.txt", header = T, stringsAsFactors = F)

# --- Taxonomic standardization (quick) ---

list_sp_raw <- fishTraits$species
list_sp_raw <- gsub("_", " ", list_sp_raw)
list_sp_raw <- stringr::str_squish(list_sp_raw)

verified_names <- taxize::gna_verifier( # long
  names = list_sp_raw,
  data_sources = c(11), 
  all_matches = FALSE,   
  capitalize = TRUE,         
  species_group = TRUE,      
  output_type = "table"      
)

new_names <- gsub(" ", "_", verified_names$matchedCanonicalSimple)

rownames(pcoaPhyl) <- new_names
fishTraits$species <- new_names
traitsAndPCOA <- cbind(fishTraits, pcoaPhyl)

# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOA.txt")
traitsAndPCOA <- read.table("dataPrepared/Fish/traitsWithPCOA.txt", header = T, stringsAsFactors = F)

# --- IUCN data ---

species_list <- unique(str_trim(traitsAndPCOA$species))
iucn_statut <- read.csv("dataOriginal/assessments.csv", header = TRUE, stringsAsFactors = FALSE) # IUCN data 2024
iucn_clean <- iucn_statut %>%
  mutate(scientificName = str_trim(scientificName))
species_list <- gsub("_", " ", species_list)
matched_species <- inner_join(
  tibble(species = species_list),
  iucn_clean,
  by = c("species" = "scientificName")
)

species_to_update <- tibble(species = species_list) %>%
  anti_join(iucn_clean, by = c("species" = "scientificName"))
synonyms_info <- rfishbase::synonyms(
  species_list = species_to_update$species,
  server = "fishbase",
  version = "latest",
  fields = NULL
)
synonyms_info_filtered <- synonyms_info %>%
  filter(!Status %in% c("misapplied name", "ambiguous synonym", "provisionally accepted name"))
synonyms_mapping <- synonyms_info_filtered %>%
  dplyr::select(Species, synonym) %>%
  distinct()

iucn_clean <- iucn_clean %>%
  mutate(scientificName = if_else(
    scientificName %in% synonyms_mapping$Species,
    synonyms_mapping$synonym[match(scientificName, synonyms_mapping$Species)],
    scientificName
  ))
matched_species <- inner_join(
  tibble(species = species_list),
  iucn_clean,
  by = c("species" = "scientificName")
)

species_to_update <- tibble(species = species_list) %>%
  anti_join(iucn_clean, by = c("species" = "scientificName")) %>%
  as.data.frame()
rownames(species_to_update) <- species_to_update$species

# manual correction
species_to_check <- read.csv("dataOriginal/species_to_update_900_done.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)
colnames(species_to_check) <- c("scientificName", "redlistCategory")

iucn_clean <- iucn_clean[, c("scientificName", "redlistCategory")]
acronyms <- c(
  "Critically Endangered" = "CR",
  "Endangered" = "EN",
  "Vulnerable" = "VU",
  "Near Threatened" = "NT",
  "Least Concern" = "LC",
  "Data Deficient" = "DD",
  "Extinct" = "EX",
  "Extinct in the Wild" = "EW",
  "Not Evaluated" = "NE"
)
iucn_clean <- iucn_clean %>%
  mutate(redlistCategory = dplyr::recode(redlistCategory, !!!acronyms))
iucn_clean <- bind_rows(iucn_clean, species_to_check)
iucn_clean <- iucn_clean %>%
  filter(scientificName %in% species_list)
iucn_clean <- iucn_clean %>%
  group_by(scientificName) %>%
  slice(1) %>%
  ungroup()

traitsAndPCOA$species <- gsub("_", " ", traitsAndPCOA$species)
traitsAndPCOA$IUCN <- iucn_clean$redlistCategory[match(traitsAndPCOA$species, iucn_clean$scientificName)]

# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOAIUCN.txt")
traitsAndPCOAIUCN <- read.table("dataPrepared/Fish/traitsWithPCOAIUCN.txt", header = T, stringsAsFactors = F)

# --- Complete taxonomy ---

taxInfo <- pblapply(traitsAndPCOAIUCN$species, function(sp) {
  tryCatch({
    traitdataform::get_gbif_taxonomy(sp, subspecies = TRUE, higherrank = TRUE,
                                     conf_threshold = 80, resolve_synonyms = FALSE)[1, ]
  }, error = function(e) NULL)
}) %>% rbindlist(fill = TRUE)

# --- Final table ---

fishTraitsPhylogenyIUCN <- data.table(traitsAndPCOAIUCN)

# write.table(fishTraitsPhylogenyIUCN, "dataPrepared/Fish/AllDataFish_clean.txt")
fishData <- read.table("dataPrepared/Fish/traitsWithPCOAIUCN.txt", header = T, stringsAsFactors = F)

# define columns
columnsTraits <- 2:(which(colnames(fishData) == "Eigen.1") - 1)
columnsImputation <- 2:(which(colnames(fishData) == "IUCN") - 1)

# --- missForest imputation ---

set.seed(123) 
imputed_forest <- missForest(xmis = fishData[, columnsImputation]) # it takes long
print(imputed_forest$OOBerror)

traits_names <- colnames(fishData)[columnsTraits]
fishData_imputed_forest <- fishData
fishData_imputed_forest[traits_names] <- imputed_forest$ximp[traits_names]

# write.table(fishData_imputed_forest, "dataPrepared/Fish/fishData_imputed_forest.txt", row.names = FALSE)
fishData_imputed_forest <- read.table("dataPrepared/Fish/fishData_imputed_forest.txt", header = T)

# --- trait selection and renaming ---

selectedTraits <- c("EdHd", "EhBd", "JlHd", "MoBd", "BlBd", "HdBd",
                    "PFiBd", "PFlBl", "CFdCPd", "Length", "Weight")

newTraitNames <- c("es", "ep", "ms", "mp", "elo", "wid", "pp", "ps", "cs", "svl", "bm")

fishTraitsMissing <- fishData[, selectedTraits]
colnames(fishTraitsMissing) <- newTraitNames
rownames(fishTraitsMissing) <- fishData$species

fishTraitsImputed <- fishData_imputed_forest[, selectedTraits]
colnames(fishTraitsImputed) <- newTraitNames
rownames(fishTraitsImputed) <- fishData$species

# --- add IUCN to the table ---

fishTraitsMissing <- data.frame(fishTraitsMissing, IUCN = fishData$IUCN)
# write.table(fishTraitsMissing, "dataPrepared/Fish/TraitFishMissing.txt")
fishTraitsImputed <- data.frame(fishTraitsImputed, IUCN = fishData$IUCN)
# write.table(fishTraitsImputed, "dataPrepared/Fish/TraitFishImputed.txt")

# --- ACP and TPD ---

# it takes long !!!
results <- computePCAandTPDs(fishTraitsImputed[,!colnames(fishTraitsImputed) == "IUCN"])
# saveRDS(results, "output/All_fish.rds")
pca_trait = results$PCA
# saveRDS(pca_trait, "output/PCA_fish.rds")
tpd_trait = results$TPDs
# saveRDS(tpd_trait, "output/TPDs_fish.rds")

# --- (optionnal) other fish info ---

fishOtherInfo <- fishData[, (max(columnsTraits) + 1):ncol(fishData)]

# --- add uses to pca_traits ---

df_scraping <- readRDS("output/fish_human_uses_binary_FB.rds") %>% as.data.table()
df_uni <- fread("data/uni.csv")

setnames(df_scraping, c("species_name", "aquarium", "fisheries", "bait", "game_fish", "aquaculture"),
         c("Species", "Aquarium", "Fisheries", "Bait", "Game_fish", "Aquaculture"))
setnames(df_uni, "Game fish", "Game_fish", skip_absent = TRUE)

binary_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game_fish", "Bait")

merged_df <- merge(
  df_uni,
  df_scraping[, c("Species", binary_cols), with = FALSE],
  by = "Species",
  suffixes = c("", ".scraping"),
  all.x = TRUE
)

for (col in binary_cols) {
  col_scraping <- paste0(col, ".scraping")
  if (col_scraping %in% colnames(merged_df)) {
    merged_df[[col]] <- pmax(
      as.numeric(merged_df[[col]]),
      as.numeric(merged_df[[col_scraping]]),
      na.rm = TRUE
    )
    merged_df[[col_scraping]] <- NULL
  }
}

merged_df[, All_uses := as.integer(Fisheries + Aquaculture + Aquarium + Game_fish + Bait > 0)]

merged_df_clean <- merged_df %>%
  select(Species, all_of(binary_cols), All_uses) %>%
  rename("Game fish" = Game_fish, "All uses" = All_uses)

usage_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game fish", "Bait", "All uses")

uses_df <- as.data.frame(merged_df_clean[, c("Species", usage_cols), with = FALSE])
uses_df <- uses_df[!is.na(uses_df$Species) & uses_df$Species != "NA", ]
rownames(uses_df) <- uses_df$Species
uses_df$Species <- NULL

uses_df["Centromochlus musaica", ] <- list( # miss match in the species name for Centromochlus musaica
  Fisheries = 0,
  Aquaculture = 0,
  Aquarium = 1, # only one use associated to Centromochlus musaica
  `Game fish` = 0,
  Bait = 0,
  `All uses` = 1
)

pca_trait$uses <- uses_df[species_ref, usage_cols, drop = FALSE]
use <- pca_trait$uses
saveRDS(pca_trait, "output/pca_trait.rds")

# --- create the matrice ---

pca_trait <- readRDS("output/pca_trait.rds")
  
species_scores <- as.data.frame(pca_trait$traits_scores[,1:4])
species_uses <- as.data.frame(pca_trait$uses)

species_uses$rownames <- rownames(species_uses)
species_scores$rownames <- rownames(species_scores)
species_scores_uses <- merge(species_uses, species_scores, by = "rownames")
rownames(species_scores_uses) <- species_scores_uses$rownames
species_scores_uses$rownames <- NULL
species_uses$rownames <- NULL
MatriceFish <- t(as.matrix(species_uses))
column_names <- rownames(species_uses)
MatriceFish_1 <- matrix(1, nrow = 1, ncol = length(column_names))
colnames(MatriceFish_1) <- column_names
rownames(MatriceFish_1) <- "all"
MatriceFish <- rbind(MatriceFish, MatriceFish_1)

write.csv(MatriceFish, "output/MatriceFish.csv", row.names = TRUE)

Imputation

imputeTraits<-function(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed,
                       sdInputed, nm_PCA, percImpute=1, dimensions, nboot=100){
  require(missForest)
  require(Metrics)
  
  # Data: extract the species with complete 'selectedTraits'.  For imputation all available traits are used: 'columsImputation'
  columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
  completeSp <- rownames(na.omit(traitsData[,selectedTraits]))
  missingSp <- setdiff(rownames(traitsData), completeSp)
  traitWithoutNA<-traitsData[completeSp, columsImputation]
  traitWithNA <- traitsData[missingSp, columsImputation]
  #transform the traits with NA values into a mask (1 for existing traits, NA for NA traits)
  traitNAMask <- replace(traitWithNA, traitWithNA > -Inf, 1)
  output <- list()
  for (i in dimensions){
    output[[i]] <- matrix(NA, nc = nboot, nr = nrow(traitsData), dimnames = list(rownames(traitsData), paste0("Sim. ", 1:nboot)))
    names(output)[i] <- paste0("PC", i)
  }
  RMSE<-matrix(NA,nc=2,nr=nboot,dimnames = list(paste0("Sim.",1:nboot),c("RMSE_PC1","RMSE_PC2")))
  for (b in 1:nboot){
    cat(paste0("\n..................SIMULATION ",b," out of ",nboot,"................\n"))
    
    # Among the complete data, let's add NA values for 'percImpute' rows:
    traitSimulNA<-traitWithoutNA[sample(1:nrow(traitWithoutNA), percImpute*nrow(traitWithoutNA)), ]
    randomMask <- traitNAMask[sample(1:nrow(traitNAMask) , nrow(traitSimulNA)), ]
    
    # We create missing values in the traitSimulNA matrix by "pasting" them from real missing observations (this way keeping the structure of missing data):
    traitSimulNA <-  traitSimulNA * randomMask
    
    # We simulate this missing traits using the same imputation method than using in the paper:
    # Using the phylogenetic eigenvalues
    # We combine the new dataset of species without NA ('traitWithoutNA') and the other species with 'real' NA:
    # we impute all traits 
    traitSimulNAAll <- rbind(traitSimulNA, # Our new NA simulated
                             traitWithoutNA[setdiff(rownames(traitWithoutNA), rownames(traitSimulNA)),  ]) # the species with complete data without the species with new NA 
    traitsAux <- rbind(traitSimulNAAll, traitWithNA) # the other species 
    # Imputation with 'missForest'
    imputeRFModel <- missForest(xmis = traitsAux)$ximp
    traitsinPhylImput <- as.matrix(imputeRFModel)
    
    # We scale the data using the same mean and SD that we used in the paper: 
    traitsinPhylImputScaled<-traitsinPhylImput[,selectedTraits]
    for (i in 1:length(selectedTraits)){
      traitsinPhylImputScaled[,i]<-(traitsinPhylImput[, selectedTraits[i]] - meanInputed[i]) / sdInputed[i]  
    }
    colnames(traitsinPhylImputScaled)<-names(meanInputed)
    
    # We predict the position of the imputed species in the functional space:
    PCApositionNA <- predict(PCAmodel, newdata = traitsinPhylImputScaled[rownames(traitSimulNA), ])
    
    # Imputed with PCA compared to original position of the species with the functional space
    imputed<-PCApositionNA[,dimensions]
    original<-traitPCA[rownames(PCApositionNA),]
    
    RMSE1<-(mean((original[,1] - imputed[,1]) ** 2)) / (max(traitPCA[, 1]) - min(traitPCA[, 1]))
    RMSE2<-(mean((original[,2] - imputed[,2]) ** 2)) / (max(traitPCA[, 2]) - min(traitPCA[, 2]))
    
    RMSE[b,]<-c(RMSE1,RMSE2)
    
    for (i in dimensions){
      output[[i]][rownames(imputed),b]  <- imputed[, i] 
    }
    print(apply(RMSE,2,mean,na.rm=T))
  }
  return(list(output,RMSE))
} 


for (gr in 2:5){
  if(gr == 2){
    pcoaPhyl <- read.table("dataPrepared/Mammals/pcoaPhylogenyMammals.txt",row.names = NULL )
    traitsData <- read.table("dataPrepared/Mammals/AllDataMammals.txt", header=T)
    columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
    traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
    columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
    selectedTraits <- c("litter_or_clutch_size_n", "litters_or_clutches_per_y",
                        "adult_body_mass_g", "longevity_y", "gestation_d", "adult_svl_cm",
                        "weaning_d", "female_maturity_d")
    
    nm_PCA<-c("ls", "ly", "bm", "long", "gest", "svl", "wea", "fmat")
    
    traitsDataImputed <- read.table("dataPrepared/Mammals/mammalTraitsImputedIUCN.txt")
    meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
    sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
    
    traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <- 
      scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
  }
  if(gr == 3){
    pcoaPhyl <- read.table("dataPrepared/Aves/pcoaPhylogenyAves.txt",row.names = NULL )
    traitsData <-read.table("dataPrepared/aves/AllDataAves.txt", header=T)
    columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
    traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
    columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
    selectedTraits <- c("litter_or_clutch_size_n", "adult_body_mass_g", "egg_mass_g",
                        "incubation_d", "longevity_y", "fledging_age_d", "litters_or_clutches_per_y", 
                        "adult_svl_cm")
    
    nm_PCA<-c("ls", "bm", "em", "inc", "long", "fa", "ly", "svl")
    
    traitsDataImputed <- read.table("dataPrepared/aves/aveTraitsImputedIUCN.txt")
    meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
    sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
    
    traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <- 
      scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
  }
  if(gr == 4){
    pcoaPhyl <- read.table("dataPrepared/reptiles/pcoaPhylogenyReptiles.txt",row.names = NULL )
    traitsData <-read.table("dataPrepared/reptiles/AllDataReptiles.txt", header=T)
    columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
    traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
    columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
    selectedTraits <- c("litter_or_clutch_size_n", "litters_or_clutches_per_y",
                        "adult_body_mass_g", "incubation_d", "longevity_y", "no_sex_svl_cm")
    
    nm_PCA<-c("ls", "ly", "bm", "inc", "long", "svl")
    
    traitsDataImputed <- read.table("dataPrepared/reptiles/reptileTraitsImputedIUCN.txt")
    meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
    sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
    
    traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <- 
      scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
  }
  if(gr == 5){
    pcoaPhyl <- read.table("dataPrepared/Amphibians/pcoaPhylogenyAmphibians.txt",row.names = NULL )
    traitsData <-read.table("dataPrepared/Amphibians/AllDataAmphibians.txt", header=T)
    columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
    traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
    columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1) 
    selectedTraits <- c("Age_at_maturity_min_y",
                        "Body_size_mm", "Litter_size_max_n",
                        "Offspring_size_min_mm")
    
    nm_PCA<-c("am", "bs", "ls", "os")
    
    traitsDataImputed <- read.table("dataPrepared/Amphibians/amphibianTraitsImputedIUCN.txt")
    meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
    sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
    
    traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <- 
      scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
  }
  if(gr == 6){
    pcoaPhyl <- read.table("dataPrepared/Fish/pcoaPhylogenyFish.txt",row.names = NULL )
    traitsData <- read.table("dataPrepared/Fish/AllDataFish.txt", header=T)
    selectedTraits <-  c("es","ep","ms","mp","elo","wid","pp","ps","cs","svl","bm")
    colnames(traitsData)[1:length(selectedTraits)] <- selectedTraits
    
    traitsDataImputed <- read.table("dataPrepared/Fish/fishTraitsImputedIUCN.txt")
    meanInputed<-apply(traitsDataImputed[, !colnames(traitsDataImputed) %in% c("IUCN")], 2, mean)
    sdInputed<-apply(traitsDataImputed[, !colnames(traitsDataImputed) %in% c("IUCN")],2,sd)
    traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <- 
      scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
    
  }
  
  traitAll<-PCAImputedALL[[gr]]$traitsMissing
  traitPCA<-PCAImputedALL[[gr]]$traitsUse
  PCAmodel<-PCAImputedALL[[gr]]$PCA
  dimensions<-c(1:PCAImputedALL[[gr]]$dimensions)
  nreps <- 100
  cat(paste0("\n....................GROUP:",targetGroupName[gr],"............................\n"))
  matCov <- imputeTraits(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed, sdInputed, nm_PCA, 
                         percImpute = 0.50, dimensions, nboot = nreps)
  saveRDS(matCov,file=paste0("dataPrepared/CovarianceMatrix_",targetGroupName[gr],".RDS"))
}

Others functions

# Regex to select all symbols but '_'. useful for identify formulas in dplyr::
"[^[:alnum:][:space:]_]"

x <- iris
x <- x[, 1:2]
colnames(x) <- c("x1", "x2")

library(dplyr)

funfun <- function(data, expr = "log(x1 / x2) * 10^2") {
  expr <- parse(text = expr)
  data %>% mutate(x3 = eval(expr))
}

funfun(x)

## How to make a gower for big data table
library(cluster)       # daisy() and Gower distance
library(data.table)    # large data handling
library(parallel)      # parallel computing
library(bigstatsr)     # very large matrices

load(here::here("outputs", "test_phenofish_final.RData"))  # assumes object `test_phenofish_final`

data_morph <- test_phenofish_final %>%
  filter(trait_type == "morphological" & fresh == 1 & trait_coding == "continuous")

data_morph <- data_morph %>%
  dplyr::group_by(specimen_id, trait_name) %>%
  mutate(trait_value = as.numeric(trait_value))

data_morph <- data_morph %>%
  dplyr::group_by(phenofish_name, trait_name) %>%
  dplyr::summarise(trait_value = mean(trait_value), .groups = "drop")

species_traits <- reshape2::dcast(data_morph, phenofish_name ~ trait_name)
colnames(species_traits)[1] <- "species"
rownames(species_traits) <- species_traits[, 1]
traits <- species_traits[, -1]

block_size <- 10000
n <- nrow(traits)

gower_matrix <- matrix(NA, n, n)

compute_gower_block <- function(start_idx, end_idx, traits_data) {
  block_data <- traits_data[start_idx:end_idx, , drop = FALSE]
  gower_dist_block <- daisy(block_data, metric = "gower")
  return(as.matrix(gower_dist_block))
}

block_indices <- split(1:n, ceiling(1:n / block_size))

cl <- makeCluster(detectCores() - 1)
clusterExport(cl, varlist = c("traits", "daisy", "compute_gower_block"))

gower_blocks <- parLapply(cl, block_indices, function(indices) {
  start_idx <- indices[1]
  end_idx <- tail(indices, 1)
  compute_gower_block(start_idx, end_idx, traits)
})

for (i in 1:length(gower_blocks)) {
  idx_range <- block_indices[[i]]
  gower_matrix[idx_range, idx_range] <- gower_blocks[[i]]
}

stopCluster(cl)

print(gower_matrix[1:100, 1:100])
save(gower_matrix, file = "gower_matrix.RData")

pcoa_res <- ape::pcoa(as.dist(gower_matrix))
pca_res <- cmdscale(as.dist(gower_matrix), k = 2)
summary(pca_res)