This page collects reusable R utilities (installation helpers,
summary/stat functions, trait workflows, PCA, TPD utilities, and
assorted snippets).
All chunks are shown for copy/paste but not evaluated
(eval = FALSE).
packages <- c(
"ade4","ape","berryFunctions","betapart","biscale","cowplot","PCA_TRAITS.table",
"dplyr","funrar","geiger","ggplot2","ggpubr","lsmeans","missForest","motmot",
"multcomp","mvMORPH","paleotree","pals","paran","phylobase","phytools","picante",
"plotly","plotrix","psych","quanteda","ratematrix","RColorBrewer","readr","rgbif",
"rnaturalearth","rredlist","sf","shape","stats","tidyr","TPD","vegan",
"VennDiagram","viridis","wesanderson"
)
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Packages loading
lapply(packages, library, character.only = TRUE)distrib <- function(x, k = 6) {
# NA removing
x <- x[is.na(x) == FALSE]
Nx <- length(x)
sum <- round(summary(x), k)
res <- c(Nx, sum)
names(res) <- c("N", names(sum))
return(res)
} # end of function distrib
hms_span <- function(start, end) {
dsec <- as.numeric(difftime(end, start, unit = "secs"))
hours <- floor(dsec / 3600)
minutes <- floor((dsec - 3600 * hours) / 60)
seconds <- dsec - 3600 * hours - 60 * minutes
paste0(
sapply(c(hours, minutes, seconds), function(x) {
formatC(x, width = 2, format = "d", flag = "0")
}),
collapse = ":"
)
}
sesandpvalues_bis <- function(obs, rand, nreps, probs = c(0.025, 0.975), rnd = 2) {
SES <- (obs - mean(rand)) / sd(rand)
pValsSES <- rank(c(obs, rand))[1] / (length(rand) + 1)
results <- round(c(obs, SES, mean(rand), quantile(rand, prob = probs), pValsSES, nreps), rnd)
names(results) <- c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
return(results)
}
extract_t_test <- function(x, y, ...) {
z <- t.test(x, y)
stat <- z$statistic
pval <- z$p.value
pvalStars <- ifelse(pval > 0.05, ".",
ifelse(pval > 0.01, "*",
ifelse(pval > 0.001, "**", "***")))
est <- z$estimate
sdv <- c(sd(x), sd(y))
res <- data.frame(est[1], sdv[1], est[2], sdv[2], stat, pval, pvalStars)
res[1:4] <- round(as.numeric(res[1:4]), 2)
names(res) <- c("Mean x", "sd x", "Mean y", "sd y", "t", "Pvalues", "P")
return(res)
}
extract_indices_list <- function(ind) {
indSES <- matrix(
NA, nr = ncol(ind), nc = 7,
dimnames = list(
colnames(ind),
c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
)
)
for (i in 1:nrow(indSES)) {
indSES[i, ] <- sesandpvalues_bis(
obs = ind[1, i],
rand = ind[-1, i],
nreps = length(ind[, 1]) - 1,
probs = c(0.025, 0.975),
rnd = 2
)
}
rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
return(indSES)
}
extract_indices_change <- function(ind) {
indSES <- matrix(
NA, nr = ncol(ind), nc = 7,
dimnames = list(
colnames(ind),
c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
)
)
for (i in 1:nrow(indSES)) {
indSES[i, ] <- sesandpvalues_bis(
obs = ((ind[2, i] - ind[1, i]) / ind[1, i]) * 100,
rand = ((ind[-c(1:2), i] - ind[1, i]) / ind[1, i]) * 100,
nreps = length(ind[, 1]) - 2,
probs = c(0.025, 0.975),
rnd = 2
)
}
rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
return(indSES)
}
extract_indices_change_log <- function(ind) {
indSES <- matrix(
NA, nr = ncol(ind), nc = 7,
dimnames = list(
colnames(ind),
c("Observed", "SES", "MeanRd", "CI025Rd", "CI975Rd", "Pval", "Nreps")
)
)
for (i in 1:nrow(indSES)) {
indSES[i, ] <- sesandpvalues_bis(
obs = log(ind[2, i] / ind[1, i]),
rand = log(ind[-c(1:2), i] / ind[1, i]),
nreps = length(ind[, 1]) - 2,
probs = c(0.025, 0.975),
rnd = 2
)
}
rownames(indSES) <- gsub("\\.", " ", rownames(indSES))
return(indSES)
}standard_names <- function(Traits, nrowTraits, source = 163, dim = NA) {
require(taxize)
standNamesTraits <- as.data.frame(
matrix(
NA,
nrow = nrowTraits,
ncol = 1,
dimnames = list(rownames(Traits), "IUCNName")
)
)
batchSize <- 1000
for (i in 1:(ceiling(nrowTraits / batchSize))) {
cat(paste0("\nBatch ", i, " out of ", (ceiling(nrowTraits / batchSize)), "\n"))
rowsSelect <- ((i - 1) * batchSize + 1):min((i * batchSize), nrowTraits)
taxoAux <- as.data.frame(
taxize::gnr_resolve(
names = rownames(standNamesTraits)[rowsSelect],
preferred_data_sources = source, # 163 is IUCN
best_match_only = TRUE,
canonical = TRUE
)
)
standNamesTraits[taxoAux$user_supplied_name, "IUCNName"] <- taxoAux$matched_name2
}
standNamesTraits[, "IUCNName"] <- gsub(
x = standNamesTraits[, "IUCNName"],
pattern = " ",
replacement = "_"
)
naIUCN <- which(is.na(standNamesTraits[, "IUCNName"]))
standNamesTraits[naIUCN, "IUCNName"] <- rownames(standNamesTraits)[naIUCN]
# Some species could not be resolved into IUCN naming. Remove them:
yesName <- which(!is.na(standNamesTraits[, "IUCNName"]))
if (dim == 1) {
return(Traits[yesName])
} else {
return(Traits[yesName, ])
}
}
addToPhylo <- function(phylogenyTraits, GROUPTraits) {
phylogenyTraits <- force.ultrametric(tree = phylogenyTraits, method = c("extend"))
tipsToAdd <- which(!rownames(GROUPTraits) %in% phylogenyTraits$tip.label)
namesToAdd <- rownames(GROUPTraits)[tipsToAdd]
genusToAdd <- unlist(strsplit(namesToAdd, "_"))[
which(1:length(strsplit(namesToAdd, "_")) %% 2 == 1)
]
phylogenyTraitsAdd <- phylogenyTraits
for (add in 1:length(namesToAdd)) {
cat(paste("\rADDING SP TO GENUS IN PHYLOGENY", add, "/", length(namesToAdd), "\r"))
phylogenyTraitsAdd <- add.species.to.genus(
tree = phylogenyTraitsAdd,
species = namesToAdd[add],
genus = genusToAdd[add],
where = "root"
)
}
return(phylogenyTraitsAdd)
}
RedToTraits <- function(IUCNGROUPtoTraits) {
RedToTraits <- matrix(
rep(NA, nrow(IUCNGROUPtoTraits)),
ncol = 1,
dimnames = list(IUCNGROUPtoTraits$Species, "IUCN")
)
for (i in 1:nrow(IUCNGROUPtoTraits)) {
selRows <- which(IUCNGROUPtoTraits$Species == IUCNGROUPtoTraits$Species[i])
if (length(selRows) > 1) {
dataAux <- IUCNGROUPtoTraits[selRows, ]
if (any(is.na(dataAux$infra_name))) {
coger <- which(is.na(dataAux$infra_name))
if (length(coger) == 1) {
RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <- as.character(dataAux$categorySimp[coger])
} else {
if (length(unique(dataAux$categorySimp[coger])) == 1) {
RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
as.character(unique(dataAux$categorySimp[coger]))
} else {
RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
as.character(sort(unique(dataAux$categorySimp))[1])
}
}
} else {
RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
as.character(sort(unique(dataAux$categorySimp))[1])
}
} else {
RedToTraits[IUCNGROUPtoTraits$Species[i], "IUCN"] <-
as.character(IUCNGROUPtoTraits[selRows, "categorySimp"])
}
}
return(RedToTraits)
}
imputation_trait <- function(GROUPData, selectedTraits, shortnames = c("am", "bs", "ls", "os")) {
require(missForest)
columnsTraits <- 1:(which(colnames(GROUPData) == "Eigen.1") - 1)
GROUPData[, columnsTraits] <- log10(GROUPData[, columnsTraits])
# NOTE: All traits are kept for imputation
columsImputation <- 1:(which(colnames(GROUPData) == "IUCN") - 1)
GROUPTraitslogImputed <- as.matrix(missForest(xmis = GROUPData[, columsImputation])$ximp)
GROUPTraitsImputed <- GROUPTraitsMissing <- GROUPData
GROUPTraitsImputed[, columnsTraits] <- GROUPTraitslogImputed[, columnsTraits]
GROUPTraitsMissing <- GROUPTraitsMissing[, selectedTraits]
GROUPTraitsImputed <- GROUPTraitsImputed[, selectedTraits]
colnames(GROUPTraitsMissing) <- colnames(GROUPTraitsImputed) <- shortnames
GROUPOtherInfo <- GROUPData[, (max(columnsTraits) + 1):ncol(GROUPData)]
GROUPTraitsImputed <- data.frame(GROUPTraitsImputed, IUCN = GROUPData$IUCN)
GROUPTraitsMissing <- data.frame(GROUPTraitsMissing, IUCN = GROUPData$IUCN)
return(list(GROUPTraitsImputed, GROUPTraitsMissing))
}# Perform PCA
# TRAITS: table of traits sp x tr : MUST BE SCALED AND log10
make.PCA <- function(TRAITS, dimensionsAux, savePCA) {
paranAux <- paran(TRAITS)
PCA_TRAITS <- list()
PCA_TRAITS$traits <- TRAITS
PCA_TRAITS$PCA <- princomp(PCA_TRAITS$traits)
PCA_TRAITS$dimensions <- dimensionsAux
PCA_TRAITS$variance <- (summary(PCA_TRAITS$PCA)[1][[1]]^2)[1:dimensionsAux] /
sum(summary(PCA_TRAITS$PCA)[1][[1]]^2)
PCA_TRAITS$loadings <- PCA_TRAITS$PCA$loadings
PCA_TRAITS$traitsUse <- data.frame(PCA_TRAITS$PCA$scores[, 1:PCA_TRAITS$dimensions])
if (!is.na(savePCA)) {
saveRDS(PCA_TRAITS, savePCA)
}
return(PCA_TRAITS)
}
# Possible to turn it as function for all groups (Nat Comm 2021)
make.PCA.groups <- function(traitsImputed, traitsMissing,
group = c("Plants","Birds","Mammals","Reptiles","Amphibians","Fishes")) {
require(paran)
paranAux <- paran(traitsImputed[, 1:(ncol(traitsImputed) - 1)])
if (paranAux$Retained < 2) paranAux$Retained <- 2
dimensionsAux <- paranAux$Retained
PCAImpute <- list()
PCAImpute$traits <- traitsImputed[, 1:(ncol(traitsImputed) - 1)]
PCAImpute$traitsMissing <- traitsMissing[, 1:(ncol(traitsImputed) - 1)]
PCAImpute$nImputed <- rowSums(is.na(PCAImpute$traitsMissing))
PCAImpute$PCA <- princomp(traitsImputed[, 1:(ncol(traitsImputed) - 1)])
PCAImpute$dimensions <- dimensionsAux
PCAImpute$variance <- (summary(PCAImpute$PCA)[1][[1]]^2)[1:dimensionsAux] /
sum(summary(PCAImpute$PCA)[1][[1]]^2)
if (group %in% c("Birds","Mammals")) { # Mammals & Birds
multPC1 <- ifelse(PCAImpute$PCA$loadings["bm", "Comp.1"] > 0, 1, -1)
PCAImpute$PCA$scores[, 1] <- multPC1 * PCAImpute$PCA$scores[, 1]
PCAImpute$PCA$loadings[, "Comp.1"] <- multPC1 * PCAImpute$PCA$loadings[, "Comp.1"]
}
if (group == "Reptiles") { # Reptiles
multPC2 <- ifelse(PCAImpute$PCA$loadings["inc", "Comp.2"] > 0, -1, 1)
PCAImpute$PCA$scores[, 2] <- multPC2 * PCAImpute$PCA$scores[, 2]
PCAImpute$PCA$loadings[, "Comp.2"] <- multPC2 * PCAImpute$PCA$loadings[, "Comp.2"]
}
if (group == "Fishes") { # Fishes
multPC1 <- ifelse(PCAImpute$PCA$loadings["svl", "Comp.1"] > 0, 1, -1)
multPC2 <- ifelse(PCAImpute$PCA$loadings["svl", "Comp.2"] < 0, 1, -1)
PCAImpute$PCA$scores[, 1] <- multPC1 * PCAImpute$PCA$scores[, 1]
PCAImpute$PCA$loadings[, "Comp.1"] <- multPC1 * PCAImpute$PCA$loadings[, "Comp.1"]
PCAImpute$PCA$scores[, 2] <- multPC2 * PCAImpute$PCA$scores[, 2]
PCAImpute$PCA$loadings[, "Comp.2"] <- multPC2 * PCAImpute$PCA$loadings[, "Comp.2"]
}
PCAImpute$loadings <- PCAImpute$PCA$loadings
PCAImpute$traitsUse <- data.frame(PCAImpute$PCA$scores[, 1:PCAImpute$dimensions])
treathCat <- c("EX", "EW", "CR", "EN", "VU")
noThreatCat <- c("LC", "NT")
PCAImpute$Threat <- with(traitsImputed,
ifelse(IUCN %in% treathCat, 1,
ifelse(IUCN %in% noThreatCat, 0, NA)))
return(PCAImpute)
}
# =========================
# Example for birds
# =========================
# 1. Load data
# =========================
PCA <- readRDS("data/processed/PCA_Birds.rds")
shortNames <- read.csv("data/processed/Shortnames_Birds.csv")
type <- names(PCA)
title_space = c('Combined','Locomotion','Reproduction','Diet')
# =========================
# 2. Define functions
# =========================
PCA_plot <- function(PCoAPlot,PCoACorPlot,multAx1,multAx,title,legend){
Plan12=ggplot(data = data.frame(PCoAPlot$vectors)) +
geom_point(aes(x = Axis.1, y = Axis.2),col="grey89") +
theme_classic()+
ggtitle(title)+
xlab(paste0("PCoA 1 (",round(PCoAPlot$values[1,2]*100,2),"%)"))+
ylab(paste0("PCoA 2 (",round(PCoAPlot$values[2,2]*100,2),"%)"))+
#coord_fixed() + ## need aspect ratio of 1!
geom_segment(data = data.frame(PCoACorPlot),
aes(x = 0, xend = Axis.1*multAx, y = 0, yend = Axis.2*multAx),
arrow = arrow(length = unit(0.25, "cm")), colour = PCoACorPlot$color) +
geom_text(data = data.frame(PCoACorPlot),
aes(x = Axis.1*multAx1, y = Axis.2*multAx1,
label = names), colour = PCoACorPlot$color,
size = 5) +
theme(
plot.title = element_text(
size = 18, # Increase size (adjust as needed)
face = "bold" # Make it bold
)
)
if(!is.null(legend)){
Plan12 = Plan12 + annotate(geom = "text",label = legend, x = Inf, y = -Inf, hjust = 1, vjust = -0.5) # Make text bold)
}
Plan12
}
# ---------- utilities ----------
.sanitize_axes <- function(df) {
stopifnot(all(c("Axis.1","Axis.2") %in% names(df)))
df <- df[is.finite(df$Axis.1) & is.finite(df$Axis.2), , drop = FALSE]
df$Axis.1 <- as.numeric(df$Axis.1)
df$Axis.2 <- as.numeric(df$Axis.2)
df
}
.safe_limits <- function(x, pad_factor = 0.08) {
xr <- range(x, finite = TRUE)
if (!is.finite(xr[1]) || !is.finite(xr[2])) return(c(-1, 1))
if (xr[1] == xr[2]) {
eps <- ifelse(abs(xr[1]) > 0, abs(xr[1]) * pad_factor, 1e-6)
return(c(xr[1] - eps, xr[2] + eps))
}
width <- diff(xr)
c(xr[1] - width * pad_factor, xr[2] + width * pad_factor)
}
# KDE grid using {ks}
.kde_grid <- function(df, gridsize = 181, H = NULL) {
if (!requireNamespace("ks", quietly = TRUE)) {
stop("Please install.packages('ks') to use the KDE-based density background.")
}
df <- .sanitize_axes(df)
if (nrow(df) < 3) return(list(grid = NULL, x = NULL, y = NULL, z = NULL))
xlim <- .safe_limits(df$Axis.1); ylim <- .safe_limits(df$Axis.2)
X <- cbind(
pmin(pmax(df$Axis.1, xlim[1]), xlim[2]),
pmin(pmax(df$Axis.2, ylim[1]), ylim[2])
)
if (is.null(H)) {
H <- try(ks::Hpi(x = X), silent = TRUE)
if (inherits(H, "try-error") || any(!is.finite(as.vector(H)))) {
H <- try(ks::Hscv(x = X), silent = TRUE)
}
if (inherits(H, "try-error") || any(!is.finite(as.vector(H)))) {
v <- apply(X, 2, stats::var, na.rm = TRUE); v[v <= 0 | !is.finite(v)] <- 1e-6
H <- diag(v)
}
}
kde <- ks::kde(
x = X, H = H,
xmin = c(xlim[1], ylim[1]), xmax = c(xlim[2], ylim[2]),
gridsize = c(gridsize, gridsize)
)
gx <- kde$eval.points[[1]]
gy <- kde$eval.points[[2]]
gz <- kde$estimate # matrix [length(gx) x length(gy)]
grid <- expand.grid(x = gx, y = gy)
grid$z <- as.vector(gz)
list(grid = grid, x = gx, y = gy, z = gz)
}
# Compute density cutoffs (levels) that enclose given probabilities on the grid
.hdr_levels_grid <- function(gx, gy, gz, probs = c(0.25, 0.5, 0.99)) {
# assume regular grid (true for ks::kde)
dx <- mean(diff(gx)); dy <- mean(diff(gy))
w <- as.vector(gz)
ord <- order(w, decreasing = TRUE)
cum_mass <- cumsum(w[ord]) * dx * dy
# normalize to 1 (numerical integration may be slightly off)
cum_mass <- cum_mass / max(cum_mass, na.rm = TRUE)
sapply(probs, function(p) {
idx <- which(cum_mass >= p)[1]
if (is.na(idx)) min(w, na.rm = TRUE) else w[ord][idx]
})
}
# Build contour paths + label positions from a raster grid
.contour_data <- function(gx, gy, gz, levels, prob_labels) {
cls <- contourLines(x = gx, y = gy, z = gz, levels = levels) # note t()
if (length(cls) == 0) return(list(lines = NULL, labels = NULL))
lines_df <- do.call(rbind, lapply(seq_along(cls), function(i) {
data.frame(x = cls[[i]]$x, y = cls[[i]]$y, level = cls[[i]]$level, id = i)
}))
labels_df <- do.call(rbind, lapply(seq_along(cls), function(i) {
n <- length(cls[[i]]$x); j <- floor(n/2)
data.frame(x = cls[[i]]$x[j], y = cls[[i]]$y[j], level = cls[[i]]$level, id = i)
}))
# map level -> probability text
lvl_map <- setNames(prob_labels, levels)
labels_df$prob <- lvl_map[as.character(labels_df$level)]
list(lines = lines_df, labels = labels_df)
}
# ---------- funspace-style plot ----------
PCA_plot_funspace <- function(PCoAPlot, PCoACorPlot, multAx1, multAx, title, legend = NULL,
probs = c(0.25, 0.5, 0.99),
gridsize = 181, H = NULL, bins_filled = 30) {
df <- data.frame(PCoAPlot$vectors)
df <- .sanitize_axes(df)
# KDE grid
KG <- .kde_grid(df, gridsize = gridsize, H = H)
# density background
# continuous raster background
p <- ggplot() +
geom_raster(
data = KG$grid,
aes(x = x, y = y, fill = z)
) +
scale_fill_gradientn(
colours = c(
"#FFFFFF", # white
"#FC8D59", # medium orange-red
"#EF6548", # strong red-orange
"#D7301F", # deep red
"#990000" # dark brownish red
),
guide = "none"
)
# HDR probability contours + labels (0.25, 0.5, 0.95 by default)
levs <- .hdr_levels_grid(KG$x, KG$y, KG$z, probs = probs)
cd <- .contour_data(KG$x, KG$y, KG$z, levels = as.numeric(levs),
prob_labels = paste0(probs))
if (!is.null(cd$lines)) {
p <- p +
geom_path(data = cd$lines, aes(x, y, group = id),
colour = "black", linewidth = 0.6)
}
if (!is.null(cd$labels)) {
p <- p +
geom_text(data = cd$labels, aes(x, y, label = prob),
colour = "black", size = 3)
}
# trait loading arrows + labels
p <- p +
geom_segment(
data = data.frame(PCoACorPlot),
aes(x = 0, xend = Axis.1 * multAx, y = 0, yend = Axis.2 * multAx),
arrow = arrow(length = unit(0.25, "cm")),
colour = PCoACorPlot$color, linewidth = 0.5
) +
geom_text(
data = data.frame(PCoACorPlot),
aes(x = Axis.1 * multAx1, y = Axis.2 * multAx1, label = names),
colour = PCoACorPlot$color, size = 5
) +
theme_classic() +
ggtitle(title) +
xlab(paste0("PCoA 1 (", round(PCoAPlot$values[1, 2] * 100, 2), "%)")) +
ylab(paste0("PCoA 2 (", round(PCoAPlot$values[2, 2] * 100, 2), "%)")) +
coord_fixed() +
theme(plot.title = element_text(size = 18, face = "bold"))
if (!is.null(legend)) {
p <- p + annotate("text", label = legend, x = Inf, y = -Inf, hjust = 1, vjust = -0.5)
}
p
}
## 2.1 Generate a PCA correlation plot with density legend
make_pca_plot <- function(pca_object, correlation_data, multAx1, multAx, title) {
legend_text <- paste0(
"Space occupied by:\n",
"50% = ", round(pca_object$ALLDensity[1, "0.5"], 2), "\n",
"99% = ", round(pca_object$ALLDensity[1, "0.99"], 2)
)
PCA_plot(pca_object$PCoA, correlation_data, multAx1, multAx, title, legend_text)
}
make_pca_plot_34 <- function(pca_object, correlation_data, multAx1, multAx, title) {
legend_text <- paste0(
"Space occupied by:\n",
"50% = ", round(pca_object$ALLDensity[1, "0.5"], 2), "\n",
"99% = ", round(pca_object$ALLDensity[1, "0.99"], 2)
)
PCA_plot_34(pca_object$PCoA, correlation_data, multAx1, multAx, title, legend_text)
}
## 2.2 Format PCA correlation table with colors and labels
format_correlation_table <- function(PCoACorPlot, shortNames) {
cbind.data.frame(
PCoACorPlot,
color = shortNames[match(rownames(PCoACorPlot), shortNames$original), "color"],
names = shortNames[match(rownames(PCoACorPlot), shortNames$original), "short"]
)
}
## 2.3 Perform Procrustes analysis between PCA spaces
run_procrustes <- function(PCA_list) {
combNames <- combn(names(PCA_list), 2)
combNames <- apply(combNames, 2, function(x) paste0(x[1], "_", x[2]))
procrustes_table <- matrix(NA, ncol = 1, nrow = length(combNames),
dimnames = list(combNames, 'Birds'))
for (j in 1:length(PCA_list)) {
x <- PCA_list[[j]]$PCoA$vectors
for (i in 1:length(PCA_list)) {
if (i > j) {
y <- PCA_list[[i]]$PCoA$vectors
rownames(y) = rownames(x)
prcTest <- ade4::procuste.rtest(as.data.frame(x), as.data.frame(y), nrepet = 999)
cor_coef <- prcTest$obs
p_val <- prcTest$pvalue
signif <- ifelse(p_val < 0.001, "***", ifelse(p_val < 0.01, "**", ifelse(p_val < 0.05, "*", "ns")))
procrustes_table[paste0(names(PCA_list)[j], "_", names(PCA_list)[i]), 1] <-
paste0(round(cor_coef, 3), " ", signif)
}
}
}
return(procrustes_table)
}
# =========================
# 3. Perform analysis
# =========================
multAx <- 0.25
multAx1 <- 0.29
plotPCAList <- plotPCAList_34 <- list()
densityTable <- NULL
for (i in seq_along(PCA)) {
cor_table <- format_correlation_table(PCA[[i]]$PCoACor, shortNames)
plotPCAList[[i]] <- PCA_plot_funspace(PCA[[i]]$PCoA, cor_table, multAx1, multAx, paste0(title_space[i],"-trait space"))
#plotPCAList_34[[i]] <- PCA_plot_funspace(PCA[[i]], cor_table, multAx1, multAx, paste0(title_space[i],"-trait space"))
densityTable <- rbind(densityTable,
data.frame(`50` = PCA[[i]]$ALLDensity[1, "0.5"],
`99` = PCA[[i]]$ALLDensity[1, "0.99"],
`100` = PCA[[i]]$ALLDensity[1, "1"],
row.names = names(PCA)[i]))
}
names(plotPCAList) <- type
# =========================
# 4. Save outputs
# =========================
# 4.1 Density table
write.csv(densityTable, file = "results/tables/BirdstableDensity.csv")
# 4.2 PCA plots grid
ggarrange(
NULL, plotPCAList[["MLD"]], NULL,
plotPCAList[["M"]], plotPCAList[["L"]], plotPCAList[["D"]],
hjust = 0, align = "v", ncol = 3, nrow = 2
) %>% ggexport(
filename = "results/figures/TraitsSpacesForFramework.png",
width = 2800, height = 1800, res = 200, pointsize = 5
)
# 4.3 Procrustes correlation table
procrustes_res <- run_procrustes(PCA)
write.csv(procrustes_res, file = "results/tables/ProcrustesTable.csv")# PCA_TRAITS: object from PCA previously done
dimensions <- PCA_TRAITS$dimensions
traitsUSE <- PCA_TRAITS$traitsUse
colnames(traitsUSE) <- paste0("Comp.", 1:dimensions)
gridSize <- ifelse(dimensions == 4, 30, 100)
sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))
TPDsAux <- TPDsMean(
species = rownames(traitsUSE),
means = traitsUSE,
sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
alpha = 0.95,
n_divisions = gridSize
)
make.TPD.2D.high.def <- function(targetGroupName, PCAImpute, alphaUse = 0.95,
saveFile = paste0(getwd(), "/TPDs2DHighDef.rds")) {
require(TPD)
cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
dimensions <- 2
traitsUSE <- PCAImpute$traitsUse[, 1:dimensions]
colnames(traitsUSE) <- paste0("Comp.", 1:2)
gridSize <- 200
sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))
TPDsAux <- TPDsMean(
species = rownames(traitsUSE),
means = traitsUSE,
sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
alpha = alphaUse,
n_divisions = gridSize
)
saveRDS(TPDsAux, saveFile)
TPDsAux <- NULL
gc()
}
make.TPD.2D.low.def <- function(targetGroupName, PCAImpute, alphaUse = 0.95,
saveFile = paste0(getwd(), "/TPDs2DLowDef.rds")) {
require(TPD)
cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
dimensions <- 2
traitsUSE <- PCAImpute$traitsUse[, 1:dimensions]
colnames(traitsUSE) <- paste0("Comp.", 1:2)
gridSize <- 100
sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))
TPDsAux <- TPDsMean(
species = rownames(traitsUSE),
means = traitsUSE,
sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
alpha = alphaUse,
n_divisions = gridSize
)
saveRDS(TPDsAux, saveFile)
TPDsAux <- NULL
gc()
}
make.TPDLarge <- function(targetGroupName, PCAImpute, alphaUse = 0.95, TPDsMean_large,
saveFile = paste0(getwd(), "/TPDsLarge.rds")) {
cat(paste("\n ESTIMATING TPDs for: ", targetGroupName, "\n"))
dimensions <- PCAImpute$dimensions
traitsUSE <- PCAImpute$traitsUse
colnames(traitsUSE) <- paste0("Comp.", 1:dimensions)
gridSize <- ifelse(dimensions == 4, 30, 100)
sdTraits <- sqrt(diag(Hpi.diag(traitsUSE)))
TPDsAux <- TPDsMean_large(
species = rownames(traitsUSE),
means = traitsUSE,
sds = matrix(rep(sdTraits, nrow(traitsUSE)), byrow = TRUE, ncol = dimensions),
alpha = alphaUse,
n_divisions = gridSize
)
saveRDS(TPDsAux, saveFile)
TPDsAux <- NULL
gc()
}TPDRichness <- function(TPDc = NULL, TPDs = NULL) {
if (is.null(TPDc) & is.null(TPDs)) {
stop("At least one of 'TPDc' or 'TPDs' must be supplied")
}
if (!is.null(TPDc) & class(TPDc) != "TPDcomm") {
stop("Class mismatch: please specify if your object is a TPDc or a TPDs (TPDcomm expected).")
}
if (!is.null(TPDs) & class(TPDs) != "TPDsp") {
stop("Class mismatch: please specify if your object is a TPDc or a TPDs (TPDsp expected).")
}
results <- list()
Calc_FRich <- function(x) {
results_FR <- numeric()
if (class(x) == "TPDcomm") {
TPD <- x$TPDc$TPDc
names_aux <- names(x$TPDc$TPDc)
cell_volume <- x$data$cell_volume
}
if (class(x) == "TPDsp") {
TPD <- x$TPDs
names_aux <- names(x$TPDs)
cell_volume <- x$data$cell_volume
}
for (i in 1:length(TPD)) {
TPD_aux <- TPD[[i]]
TPD_aux[TPD_aux > 0] <- cell_volume
results_FR[i] <- sum(TPD_aux)
}
names(results_FR) <- names_aux
return(results_FR)
}
if (!is.null(TPDc)) {
results$communities <- list()
results$communities$FRichness <- Calc_FRich(TPDc)
}
if (!is.null(TPDs)) {
if (TPDs$data$type == "One population_One species" |
TPDs$data$type == "One population_Multiple species") {
results$species <- list()
results$species$FRichness <- Calc_FRich(TPDs)
} else {
results$populations <- list()
results$populations$FRichness <- Calc_FRich(TPDs)
}
if (TPDs$data$method == "mean") {
message("WARNING: When TPDs are calculated using the TPDsMean function, Evenness and Divergence are meaningless!!")
}
}
return(results)
}
imageTPD <- function(x, thresholdPlot = 0.99) {
TPDList <- x$TPDc$TPDc
imageTPD <- list()
for (comm in 1:length(TPDList)) {
percentile <- rep(NA, length(TPDList[[comm]]))
TPDList[[comm]] <- cbind(index = 1:length(TPDList[[comm]]),
prob = TPDList[[comm]], percentile)
orderTPD <- order(TPDList[[comm]][, "prob"], decreasing = TRUE)
TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "prob"])
TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "index"]), ]
imageTPD[[comm]] <- TPDList[[comm]]
}
names(imageTPD) <- names(TPDList)
trait1Edges <- unique(x$data$evaluation_grid[, 1])
trait2Edges <- unique(x$data$evaluation_grid[, 2])
imageMat <- array(
NA,
c(length(trait1Edges), length(trait2Edges), length(imageTPD)),
dimnames = list(trait1Edges, trait2Edges, names(TPDList))
)
for (comm in 1:length(TPDList)) {
percentileSpace <- x$data$evaluation_grid
percentileSpace$percentile <- imageTPD[[comm]][, "percentile"]
for (i in 1:length(trait2Edges)) {
colAux <- subset(percentileSpace, percentileSpace[, 2] == trait2Edges[i])
imageMat[, i, comm] <- colAux$percentile
}
imageMat[imageMat > thresholdPlot] <- NA
}
return(imageMat)
}
overlapF <- function(x, y) {
if (!identical(x$eval.points, y$eval.points)) stop("Not identical eval points!")
TPDx <- x$estimate
if (class(x$eval.points) == "list") {
x$eval.points <- expand.grid(x$eval.points)
}
cellEdge <- numeric()
for (i in 1:ncol(x$eval.points)) {
cellEdge[i] <- unique(x$eval.points[, i])[2] - unique(x$eval.points[, i])[1]
}
cellSizex <- prod(cellEdge)
TPDx <- TPDx * cellSizex
TPDx <- TPDx / sum(TPDx)
TPDy <- y$estimate
TPDy <- TPDy * cellSizex
TPDy <- TPDy / sum(TPDy)
OL <- sum(pmin(TPDx, TPDy))
return(OL)
}
percentileTPD <- function(x) {
TPDList <- x$TPDc$TPDc
results <- list()
for (comm in 1:length(TPDList)) {
percentile <- rep(NA, nrow(TPDList[[comm]]))
TPDList[[comm]] <- cbind(TPDList[[comm]], percentile)
orderTPD <- order(TPDList[[comm]][, "notZeroProb"], decreasing = TRUE)
TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "notZeroProb"])
TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "notZeroIndex"]), ]
results[[comm]] <- TPDList[[comm]]
}
names(results) <- names(TPDList)
return(results)
}
quantileTPD <- function(x, thresholdPlot = 0.99) {
TPDList <- x$TPDc$TPDc
quantileTPD <- list()
for (comm in 1:length(TPDList)) {
percentile <- rep(NA, length(TPDList[[comm]]))
TPDList[[comm]] <- cbind(index = 1:length(TPDList[[comm]]),
prob = TPDList[[comm]], percentile)
orderTPD <- order(TPDList[[comm]][, "prob"], decreasing = TRUE)
TPDList[[comm]] <- TPDList[[comm]][orderTPD, ]
TPDList[[comm]][, "percentile"] <- cumsum(TPDList[[comm]][, "prob"])
TPDList[[comm]] <- TPDList[[comm]][order(TPDList[[comm]][, "index"]), ]
quantileTPD[[comm]] <- TPDList[[comm]]
}
names(quantileTPD) <- names(TPDList)
trait1Edges <- unique(x$data$evaluation_grid[, 1])
trait2Edges <- unique(x$data$evaluation_grid[, 2])
imageMat <- array(
NA,
c(length(trait1Edges), length(trait2Edges), length(quantileTPD)),
dimnames = list(trait1Edges, trait2Edges, names(TPDList))
)
for (comm in 1:length(TPDList)) {
percentileSpace <- x$data$evaluation_grid
percentileSpace$percentile <- quantileTPD[[comm]][, "percentile"]
for (i in 1:length(trait2Edges)) {
colAux <- subset(percentileSpace, percentileSpace[, 2] == trait2Edges[i])
imageMat[, i, comm] <- 1 - colAux$percentile
}
imageMat[imageMat < (1 - thresholdPlot)] <- 0
}
return(imageMat)
}
MAE_TPD <- function(x, y) {
diffXY <- abs(x - y)
MAE <- mean(diffXY)
return(MAE)
}################################################################################
# data preparation
################################################################################
# --- Load the data ---
trait <- readRDS("dataOriginal/FishMORPH_Traits.rds")
phylogeny <- readRDS("dataOriginal/FishMORPH_Phylogeny.rds")
traitNames <- colnames(trait)[-c(1:6)]
# --- Prepare the data ---
# Species list
list_sp <- gsub("\\.", " ", as.character(trait$Genus.species))
# Lenght-weight fishbase
lgtwgt <- as.data.frame(length_weight(list_sp))
lgtwgt <- lgtwgt[!is.na(lgtwgt$a) & !is.na(lgtwgt$b), ]
# Best coeff
ab <- lgtwgt %>%
group_by(Species) %>%
slice_max(order_by = CoeffDetermination, with_ties = FALSE, na_rm = TRUE) %>%
dplyr::select(Species, a, b) %>%
distinct()
# Other traits
speciesInfo <- species(list_sp) %>% data.table()
speciesInfoSub <- speciesInfo[, .(Species, Fresh, LongevityWild, Length, Weight, LTypeMaxM)]
speciesInfoSub <- unique(speciesInfoSub)
speciesInfoSub$Species <- gsub(" ", ".", speciesInfoSub$Species)
speciesInfoSub <- speciesInfoSub %>%
mutate(
Length = ifelse(LTypeMaxM != "SL", NA, Length),
Weight2 = ifelse(Species %in% gsub(" ", ".", rownames(ab)),
ab[gsub("\\.", " ", Species), "a"] * Length^ab[gsub("\\.", " ", Species), "b"],
NA)
) %>%
mutate(Weight = ifelse(is.na(Weight) & !is.na(Weight2), Weight2, Weight))
# fusion with Fishmorph
fishTraits <- merge(trait[,6:15], speciesInfoSub[, .(Species, Length, Weight)],
by.x = "Genus.species", by.y = "Species", all.x = TRUE)
fishTraits <- fishTraits %>%
mutate(across(-Genus.species, ~log10(.x + 1))) %>%
rename(species = Genus.species) %>%
mutate(species = gsub("\\.", "_", species))
spToKeep <- fishTraits %>% # keep only freshwater species
dplyr::select(species) %>%
mutate(species = gsub("_", " ", species)) %>%
pull() %>%
rfishbase::species() %>%
as.data.table() %>%
filter(Fresh == 1) %>%
dplyr::select(Species) %>%
mutate(Species = gsub(" ", "_", Species)) %>%
pull()
fishTraits <- fishTraits %>%
filter(species %in% spToKeep)
#dir.create("dataPrepared/Fish", showWarnings = FALSE, recursive = TRUE)
# write.table(fishTraits, "dataPrepared/Fish/fishTraitsMissing.txt")
fishTraits <- read.table( "dataPrepared/Fish/fishTraitsMissing.txt")
# --- PCoA phylogenetic ---
phylogeny$tip.label <- gsub("\\.", "_", phylogeny$tip.label)
phylogeny <- drop.tip(phylogeny, setdiff(phylogeny$tip.label, fishTraits$species))
phylogenyTraits <- phytools::force.ultrametric(phylogeny)
phylDiss <- sqrt(cophenetic(phylogenyTraits))
pcoaPhyl <- cmdscale(phylDiss, k = 10) #long
rownames(pcoaPhyl) = fishTraits$species
colnames(pcoaPhyl) = paste0("Eigen.", 1:10)
# write.table(pcoaPhyl, "dataPrepared/Fish/pcoaPhylogenyFish.txt")
pcoaPhyl <- read.table("dataPrepared/Fish/pcoaPhylogenyFish.txt", header = T, stringsAsFactors = F)
# --- Taxonomic standardization (quick) ---
list_sp_raw <- fishTraits$species
list_sp_raw <- gsub("_", " ", list_sp_raw)
list_sp_raw <- stringr::str_squish(list_sp_raw)
verified_names <- taxize::gna_verifier( # long
names = list_sp_raw,
data_sources = c(11),
all_matches = FALSE,
capitalize = TRUE,
species_group = TRUE,
output_type = "table"
)
new_names <- gsub(" ", "_", verified_names$matchedCanonicalSimple)
rownames(pcoaPhyl) <- new_names
fishTraits$species <- new_names
traitsAndPCOA <- cbind(fishTraits, pcoaPhyl)
# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOA.txt")
traitsAndPCOA <- read.table("dataPrepared/Fish/traitsWithPCOA.txt", header = T, stringsAsFactors = F)
# --- IUCN data ---
species_list <- unique(str_trim(traitsAndPCOA$species))
iucn_statut <- read.csv("dataOriginal/assessments.csv", header = TRUE, stringsAsFactors = FALSE) # IUCN data 2024
iucn_clean <- iucn_statut %>%
mutate(scientificName = str_trim(scientificName))
species_list <- gsub("_", " ", species_list)
matched_species <- inner_join(
tibble(species = species_list),
iucn_clean,
by = c("species" = "scientificName")
)
species_to_update <- tibble(species = species_list) %>%
anti_join(iucn_clean, by = c("species" = "scientificName"))
synonyms_info <- rfishbase::synonyms(
species_list = species_to_update$species,
server = "fishbase",
version = "latest",
fields = NULL
)
synonyms_info_filtered <- synonyms_info %>%
filter(!Status %in% c("misapplied name", "ambiguous synonym", "provisionally accepted name"))
synonyms_mapping <- synonyms_info_filtered %>%
dplyr::select(Species, synonym) %>%
distinct()
iucn_clean <- iucn_clean %>%
mutate(scientificName = if_else(
scientificName %in% synonyms_mapping$Species,
synonyms_mapping$synonym[match(scientificName, synonyms_mapping$Species)],
scientificName
))
matched_species <- inner_join(
tibble(species = species_list),
iucn_clean,
by = c("species" = "scientificName")
)
species_to_update <- tibble(species = species_list) %>%
anti_join(iucn_clean, by = c("species" = "scientificName")) %>%
as.data.frame()
rownames(species_to_update) <- species_to_update$species
# manual correction
species_to_check <- read.csv("dataOriginal/species_to_update_900_done.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)
colnames(species_to_check) <- c("scientificName", "redlistCategory")
iucn_clean <- iucn_clean[, c("scientificName", "redlistCategory")]
acronyms <- c(
"Critically Endangered" = "CR",
"Endangered" = "EN",
"Vulnerable" = "VU",
"Near Threatened" = "NT",
"Least Concern" = "LC",
"Data Deficient" = "DD",
"Extinct" = "EX",
"Extinct in the Wild" = "EW",
"Not Evaluated" = "NE"
)
iucn_clean <- iucn_clean %>%
mutate(redlistCategory = dplyr::recode(redlistCategory, !!!acronyms))
iucn_clean <- bind_rows(iucn_clean, species_to_check)
iucn_clean <- iucn_clean %>%
filter(scientificName %in% species_list)
iucn_clean <- iucn_clean %>%
group_by(scientificName) %>%
slice(1) %>%
ungroup()
traitsAndPCOA$species <- gsub("_", " ", traitsAndPCOA$species)
traitsAndPCOA$IUCN <- iucn_clean$redlistCategory[match(traitsAndPCOA$species, iucn_clean$scientificName)]
# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOAIUCN.txt")
traitsAndPCOAIUCN <- read.table("dataPrepared/Fish/traitsWithPCOAIUCN.txt", header = T, stringsAsFactors = F)
# --- Complete taxonomy ---
taxInfo <- pblapply(traitsAndPCOAIUCN$species, function(sp) {
tryCatch({
traitdataform::get_gbif_taxonomy(sp, subspecies = TRUE, higherrank = TRUE,
conf_threshold = 80, resolve_synonyms = FALSE)[1, ]
}, error = function(e) NULL)
}) %>% rbindlist(fill = TRUE)
# --- Final table ---
fishTraitsPhylogenyIUCN <- data.table(traitsAndPCOAIUCN)
# write.table(fishTraitsPhylogenyIUCN, "dataPrepared/Fish/AllDataFish_clean.txt")
fishData <- read.table("dataPrepared/Fish/traitsWithPCOAIUCN.txt", header = T, stringsAsFactors = F)
# define columns
columnsTraits <- 2:(which(colnames(fishData) == "Eigen.1") - 1)
columnsImputation <- 2:(which(colnames(fishData) == "IUCN") - 1)
# --- missForest imputation ---
set.seed(123)
imputed_forest <- missForest(xmis = fishData[, columnsImputation]) # it takes long
print(imputed_forest$OOBerror)
traits_names <- colnames(fishData)[columnsTraits]
fishData_imputed_forest <- fishData
fishData_imputed_forest[traits_names] <- imputed_forest$ximp[traits_names]
# write.table(fishData_imputed_forest, "dataPrepared/Fish/fishData_imputed_forest.txt", row.names = FALSE)
fishData_imputed_forest <- read.table("dataPrepared/Fish/fishData_imputed_forest.txt", header = T)
# --- trait selection and renaming ---
selectedTraits <- c("EdHd", "EhBd", "JlHd", "MoBd", "BlBd", "HdBd",
"PFiBd", "PFlBl", "CFdCPd", "Length", "Weight")
newTraitNames <- c("es", "ep", "ms", "mp", "elo", "wid", "pp", "ps", "cs", "svl", "bm")
fishTraitsMissing <- fishData[, selectedTraits]
colnames(fishTraitsMissing) <- newTraitNames
rownames(fishTraitsMissing) <- fishData$species
fishTraitsImputed <- fishData_imputed_forest[, selectedTraits]
colnames(fishTraitsImputed) <- newTraitNames
rownames(fishTraitsImputed) <- fishData$species
# --- add IUCN to the table ---
fishTraitsMissing <- data.frame(fishTraitsMissing, IUCN = fishData$IUCN)
# write.table(fishTraitsMissing, "dataPrepared/Fish/TraitFishMissing.txt")
fishTraitsImputed <- data.frame(fishTraitsImputed, IUCN = fishData$IUCN)
# write.table(fishTraitsImputed, "dataPrepared/Fish/TraitFishImputed.txt")
# --- ACP and TPD ---
# it takes long !!!
results <- computePCAandTPDs(fishTraitsImputed[,!colnames(fishTraitsImputed) == "IUCN"])
# saveRDS(results, "output/All_fish.rds")
pca_trait = results$PCA
# saveRDS(pca_trait, "output/PCA_fish.rds")
tpd_trait = results$TPDs
# saveRDS(tpd_trait, "output/TPDs_fish.rds")
# --- (optionnal) other fish info ---
fishOtherInfo <- fishData[, (max(columnsTraits) + 1):ncol(fishData)]
# --- add uses to pca_traits ---
df_scraping <- readRDS("output/fish_human_uses_binary_FB.rds") %>% as.data.table()
df_uni <- fread("data/uni.csv")
setnames(df_scraping, c("species_name", "aquarium", "fisheries", "bait", "game_fish", "aquaculture"),
c("Species", "Aquarium", "Fisheries", "Bait", "Game_fish", "Aquaculture"))
setnames(df_uni, "Game fish", "Game_fish", skip_absent = TRUE)
binary_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game_fish", "Bait")
merged_df <- merge(
df_uni,
df_scraping[, c("Species", binary_cols), with = FALSE],
by = "Species",
suffixes = c("", ".scraping"),
all.x = TRUE
)
for (col in binary_cols) {
col_scraping <- paste0(col, ".scraping")
if (col_scraping %in% colnames(merged_df)) {
merged_df[[col]] <- pmax(
as.numeric(merged_df[[col]]),
as.numeric(merged_df[[col_scraping]]),
na.rm = TRUE
)
merged_df[[col_scraping]] <- NULL
}
}
merged_df[, All_uses := as.integer(Fisheries + Aquaculture + Aquarium + Game_fish + Bait > 0)]
merged_df_clean <- merged_df %>%
select(Species, all_of(binary_cols), All_uses) %>%
rename("Game fish" = Game_fish, "All uses" = All_uses)
usage_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game fish", "Bait", "All uses")
uses_df <- as.data.frame(merged_df_clean[, c("Species", usage_cols), with = FALSE])
uses_df <- uses_df[!is.na(uses_df$Species) & uses_df$Species != "NA", ]
rownames(uses_df) <- uses_df$Species
uses_df$Species <- NULL
uses_df["Centromochlus musaica", ] <- list( # miss match in the species name for Centromochlus musaica
Fisheries = 0,
Aquaculture = 0,
Aquarium = 1, # only one use associated to Centromochlus musaica
`Game fish` = 0,
Bait = 0,
`All uses` = 1
)
pca_trait$uses <- uses_df[species_ref, usage_cols, drop = FALSE]
use <- pca_trait$uses
saveRDS(pca_trait, "output/pca_trait.rds")
# --- create the matrice ---
pca_trait <- readRDS("output/pca_trait.rds")
species_scores <- as.data.frame(pca_trait$traits_scores[,1:4])
species_uses <- as.data.frame(pca_trait$uses)
species_uses$rownames <- rownames(species_uses)
species_scores$rownames <- rownames(species_scores)
species_scores_uses <- merge(species_uses, species_scores, by = "rownames")
rownames(species_scores_uses) <- species_scores_uses$rownames
species_scores_uses$rownames <- NULL
species_uses$rownames <- NULL
MatriceFish <- t(as.matrix(species_uses))
column_names <- rownames(species_uses)
MatriceFish_1 <- matrix(1, nrow = 1, ncol = length(column_names))
colnames(MatriceFish_1) <- column_names
rownames(MatriceFish_1) <- "all"
MatriceFish <- rbind(MatriceFish, MatriceFish_1)
write.csv(MatriceFish, "output/MatriceFish.csv", row.names = TRUE)imputeTraits<-function(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed,
sdInputed, nm_PCA, percImpute=1, dimensions, nboot=100){
require(missForest)
require(Metrics)
# Data: extract the species with complete 'selectedTraits'. For imputation all available traits are used: 'columsImputation'
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
completeSp <- rownames(na.omit(traitsData[,selectedTraits]))
missingSp <- setdiff(rownames(traitsData), completeSp)
traitWithoutNA<-traitsData[completeSp, columsImputation]
traitWithNA <- traitsData[missingSp, columsImputation]
#transform the traits with NA values into a mask (1 for existing traits, NA for NA traits)
traitNAMask <- replace(traitWithNA, traitWithNA > -Inf, 1)
output <- list()
for (i in dimensions){
output[[i]] <- matrix(NA, nc = nboot, nr = nrow(traitsData), dimnames = list(rownames(traitsData), paste0("Sim. ", 1:nboot)))
names(output)[i] <- paste0("PC", i)
}
RMSE<-matrix(NA,nc=2,nr=nboot,dimnames = list(paste0("Sim.",1:nboot),c("RMSE_PC1","RMSE_PC2")))
for (b in 1:nboot){
cat(paste0("\n..................SIMULATION ",b," out of ",nboot,"................\n"))
# Among the complete data, let's add NA values for 'percImpute' rows:
traitSimulNA<-traitWithoutNA[sample(1:nrow(traitWithoutNA), percImpute*nrow(traitWithoutNA)), ]
randomMask <- traitNAMask[sample(1:nrow(traitNAMask) , nrow(traitSimulNA)), ]
# We create missing values in the traitSimulNA matrix by "pasting" them from real missing observations (this way keeping the structure of missing data):
traitSimulNA <- traitSimulNA * randomMask
# We simulate this missing traits using the same imputation method than using in the paper:
# Using the phylogenetic eigenvalues
# We combine the new dataset of species without NA ('traitWithoutNA') and the other species with 'real' NA:
# we impute all traits
traitSimulNAAll <- rbind(traitSimulNA, # Our new NA simulated
traitWithoutNA[setdiff(rownames(traitWithoutNA), rownames(traitSimulNA)), ]) # the species with complete data without the species with new NA
traitsAux <- rbind(traitSimulNAAll, traitWithNA) # the other species
# Imputation with 'missForest'
imputeRFModel <- missForest(xmis = traitsAux)$ximp
traitsinPhylImput <- as.matrix(imputeRFModel)
# We scale the data using the same mean and SD that we used in the paper:
traitsinPhylImputScaled<-traitsinPhylImput[,selectedTraits]
for (i in 1:length(selectedTraits)){
traitsinPhylImputScaled[,i]<-(traitsinPhylImput[, selectedTraits[i]] - meanInputed[i]) / sdInputed[i]
}
colnames(traitsinPhylImputScaled)<-names(meanInputed)
# We predict the position of the imputed species in the functional space:
PCApositionNA <- predict(PCAmodel, newdata = traitsinPhylImputScaled[rownames(traitSimulNA), ])
# Imputed with PCA compared to original position of the species with the functional space
imputed<-PCApositionNA[,dimensions]
original<-traitPCA[rownames(PCApositionNA),]
RMSE1<-(mean((original[,1] - imputed[,1]) ** 2)) / (max(traitPCA[, 1]) - min(traitPCA[, 1]))
RMSE2<-(mean((original[,2] - imputed[,2]) ** 2)) / (max(traitPCA[, 2]) - min(traitPCA[, 2]))
RMSE[b,]<-c(RMSE1,RMSE2)
for (i in dimensions){
output[[i]][rownames(imputed),b] <- imputed[, i]
}
print(apply(RMSE,2,mean,na.rm=T))
}
return(list(output,RMSE))
}
for (gr in 2:5){
if(gr == 2){
pcoaPhyl <- read.table("dataPrepared/Mammals/pcoaPhylogenyMammals.txt",row.names = NULL )
traitsData <- read.table("dataPrepared/Mammals/AllDataMammals.txt", header=T)
columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
selectedTraits <- c("litter_or_clutch_size_n", "litters_or_clutches_per_y",
"adult_body_mass_g", "longevity_y", "gestation_d", "adult_svl_cm",
"weaning_d", "female_maturity_d")
nm_PCA<-c("ls", "ly", "bm", "long", "gest", "svl", "wea", "fmat")
traitsDataImputed <- read.table("dataPrepared/Mammals/mammalTraitsImputedIUCN.txt")
meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <-
scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
}
if(gr == 3){
pcoaPhyl <- read.table("dataPrepared/Aves/pcoaPhylogenyAves.txt",row.names = NULL )
traitsData <-read.table("dataPrepared/aves/AllDataAves.txt", header=T)
columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
selectedTraits <- c("litter_or_clutch_size_n", "adult_body_mass_g", "egg_mass_g",
"incubation_d", "longevity_y", "fledging_age_d", "litters_or_clutches_per_y",
"adult_svl_cm")
nm_PCA<-c("ls", "bm", "em", "inc", "long", "fa", "ly", "svl")
traitsDataImputed <- read.table("dataPrepared/aves/aveTraitsImputedIUCN.txt")
meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <-
scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
}
if(gr == 4){
pcoaPhyl <- read.table("dataPrepared/reptiles/pcoaPhylogenyReptiles.txt",row.names = NULL )
traitsData <-read.table("dataPrepared/reptiles/AllDataReptiles.txt", header=T)
columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
selectedTraits <- c("litter_or_clutch_size_n", "litters_or_clutches_per_y",
"adult_body_mass_g", "incubation_d", "longevity_y", "no_sex_svl_cm")
nm_PCA<-c("ls", "ly", "bm", "inc", "long", "svl")
traitsDataImputed <- read.table("dataPrepared/reptiles/reptileTraitsImputedIUCN.txt")
meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <-
scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
}
if(gr == 5){
pcoaPhyl <- read.table("dataPrepared/Amphibians/pcoaPhylogenyAmphibians.txt",row.names = NULL )
traitsData <-read.table("dataPrepared/Amphibians/AllDataAmphibians.txt", header=T)
columnsTraits <- 1:(which(colnames(traitsData) == "Eigen.1")-1)
traitsData[,columnsTraits] <- log10(traitsData[, columnsTraits])
columsImputation <- 1:(which(colnames(traitsData) == "IUCN")-1)
selectedTraits <- c("Age_at_maturity_min_y",
"Body_size_mm", "Litter_size_max_n",
"Offspring_size_min_mm")
nm_PCA<-c("am", "bs", "ls", "os")
traitsDataImputed <- read.table("dataPrepared/Amphibians/amphibianTraitsImputedIUCN.txt")
meanInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,mean)
sdInputed<-apply(traitsDataImputed[,!colnames(traitsDataImputed)%in%c("IUCN")],2,sd)
traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <-
scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
}
if(gr == 6){
pcoaPhyl <- read.table("dataPrepared/Fish/pcoaPhylogenyFish.txt",row.names = NULL )
traitsData <- read.table("dataPrepared/Fish/AllDataFish.txt", header=T)
selectedTraits <- c("es","ep","ms","mp","elo","wid","pp","ps","cs","svl","bm")
colnames(traitsData)[1:length(selectedTraits)] <- selectedTraits
traitsDataImputed <- read.table("dataPrepared/Fish/fishTraitsImputedIUCN.txt")
meanInputed<-apply(traitsDataImputed[, !colnames(traitsDataImputed) %in% c("IUCN")], 2, mean)
sdInputed<-apply(traitsDataImputed[, !colnames(traitsDataImputed) %in% c("IUCN")],2,sd)
traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)] <-
scale(traitsDataImputed[, 1:(ncol(traitsDataImputed)-1)])
}
traitAll<-PCAImputedALL[[gr]]$traitsMissing
traitPCA<-PCAImputedALL[[gr]]$traitsUse
PCAmodel<-PCAImputedALL[[gr]]$PCA
dimensions<-c(1:PCAImputedALL[[gr]]$dimensions)
nreps <- 100
cat(paste0("\n....................GROUP:",targetGroupName[gr],"............................\n"))
matCov <- imputeTraits(traitsData, selectedTraits, traitPCA, PCAmodel, meanInputed, sdInputed, nm_PCA,
percImpute = 0.50, dimensions, nboot = nreps)
saveRDS(matCov,file=paste0("dataPrepared/CovarianceMatrix_",targetGroupName[gr],".RDS"))
}# Regex to select all symbols but '_'. useful for identify formulas in dplyr::
"[^[:alnum:][:space:]_]"
x <- iris
x <- x[, 1:2]
colnames(x) <- c("x1", "x2")
library(dplyr)
funfun <- function(data, expr = "log(x1 / x2) * 10^2") {
expr <- parse(text = expr)
data %>% mutate(x3 = eval(expr))
}
funfun(x)
## How to make a gower for big data table
library(cluster) # daisy() and Gower distance
library(data.table) # large data handling
library(parallel) # parallel computing
library(bigstatsr) # very large matrices
load(here::here("outputs", "test_phenofish_final.RData")) # assumes object `test_phenofish_final`
data_morph <- test_phenofish_final %>%
filter(trait_type == "morphological" & fresh == 1 & trait_coding == "continuous")
data_morph <- data_morph %>%
dplyr::group_by(specimen_id, trait_name) %>%
mutate(trait_value = as.numeric(trait_value))
data_morph <- data_morph %>%
dplyr::group_by(phenofish_name, trait_name) %>%
dplyr::summarise(trait_value = mean(trait_value), .groups = "drop")
species_traits <- reshape2::dcast(data_morph, phenofish_name ~ trait_name)
colnames(species_traits)[1] <- "species"
rownames(species_traits) <- species_traits[, 1]
traits <- species_traits[, -1]
block_size <- 10000
n <- nrow(traits)
gower_matrix <- matrix(NA, n, n)
compute_gower_block <- function(start_idx, end_idx, traits_data) {
block_data <- traits_data[start_idx:end_idx, , drop = FALSE]
gower_dist_block <- daisy(block_data, metric = "gower")
return(as.matrix(gower_dist_block))
}
block_indices <- split(1:n, ceiling(1:n / block_size))
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, varlist = c("traits", "daisy", "compute_gower_block"))
gower_blocks <- parLapply(cl, block_indices, function(indices) {
start_idx <- indices[1]
end_idx <- tail(indices, 1)
compute_gower_block(start_idx, end_idx, traits)
})
for (i in 1:length(gower_blocks)) {
idx_range <- block_indices[[i]]
gower_matrix[idx_range, idx_range] <- gower_blocks[[i]]
}
stopCluster(cl)
print(gower_matrix[1:100, 1:100])
save(gower_matrix, file = "gower_matrix.RData")
pcoa_res <- ape::pcoa(as.dist(gower_matrix))
pca_res <- cmdscale(as.dist(gower_matrix), k = 2)
summary(pca_res)