FishMORPH — data preparation

Load FishMORPH trait and phylogeny objects, standardise species names and prepare the full species list.

data, morphology
Args: FishMORPH_Traits.rds — trait RDS; FishMORPH_Phylogeny.rds — phylogeny RDS
################################################################################
# data preparation
################################################################################

# --- Load the data ---

# Both inputs are RDS files read from the current working directory.
phylogeny <- readRDS(file = "FishMORPH_Phylogeny.rds")
trait <- readRDS(file = "FishMORPH_Traits.rds")

# Names of every column of the trait table after the first six.
traitNames <- colnames(trait)[-seq_len(6)]

# --- Prepare the data ---

# Species list: the dots of "Genus.species" are replaced by spaces.
list_sp <- gsub(".", " ", as.character(trait$Genus.species), fixed = TRUE)

# Length-weight fishbase
# Records returned by length_weight() for these species; a record is kept
# only when both coefficients a and b are available.
lgtwgt <- as.data.frame(length_weight(list_sp))
lgtwgt <- lgtwgt[!(is.na(lgtwgt$a) | is.na(lgtwgt$b)), ]

# Best coeff
# For each species keep the single length-weight record with the highest
# CoeffDetermination (records with a missing value are ignored), then retain
# only the species name and the two coefficients. The result is ungrouped so
# that later steps work on a plain tibble.
ab <- lgtwgt %>%
  group_by(Species) %>%
  slice_max(order_by = CoeffDetermination, with_ties = FALSE, na_rm = TRUE) %>%
  ungroup() %>%
  dplyr::select(Species, a, b) %>%
  distinct()

# Other traits
# Species-level information returned by species(); only the columns used
# below are kept, duplicated rows are removed and names are rewritten as
# "Genus.species" so they can be merged with the trait table.
speciesInfo <- species(list_sp) %>% data.table()
speciesInfoSub <- speciesInfo[, .(Species, Fresh, LongevityWild, Length, Weight, LTypeMaxM)]
speciesInfoSub <- unique(speciesInfoSub)
speciesInfoSub$Species <- gsub(" ", ".", speciesInfoSub$Species)
speciesInfoSub <- speciesInfoSub %>%
  mutate(
    # Lengths that are not recorded as "SL" are discarded.
    Length = ifelse(LTypeMaxM != "SL", NA, Length),
    # Weight estimated as a * Length^b. `ab` is a tibble, so it has no
    # species row names: the coefficients are looked up by matching the
    # space-separated species name against ab$Species. Species without
    # coefficients (or without a usable Length) get NA.
    # NOTE(review): confirm that the length type of the selected
    # length-weight records is compatible with the SL lengths kept above.
    Weight2 = {
      ab_row <- match(gsub("\\.", " ", Species), ab$Species)
      ab$a[ab_row] * Length^ab$b[ab_row]
    }
  ) %>%
  # The estimate is only used where no weight is reported.
  mutate(Weight = ifelse(is.na(Weight) & !is.na(Weight2), Weight2, Weight))

# fusion with Fishmorph
# Left join: every row of columns 6 to 15 of the trait table is kept and
# receives Length and Weight from speciesInfoSub when the species is found.
fishTraits <- merge(
  x = trait[, 6:15],
  y = speciesInfoSub[, .(Species, Length, Weight)],
  by.x = "Genus.species",
  by.y = "Species",
  all.x = TRUE
)

# The key column becomes `species`; all other columns are transformed with
# log10(x + 1) and the dots in species names are turned into underscores.
fishTraits <- fishTraits %>%
  rename(species = Genus.species) %>%
  mutate(
    across(-species, ~ log10(.x + 1)),
    species = gsub("\\.", "_", species)
  )

# keep only freshwater species
# The species of fishTraits are looked up with rfishbase::species() (names
# written with spaces); names whose Fresh flag equals 1 are returned with
# underscores again.
spToKeep <- gsub("_", " ", fishTraits$species) %>%
  rfishbase::species() %>%
  as.data.table() %>%
  filter(Fresh == 1) %>%
  pull(Species) %>%
  gsub(" ", "_", .)

fishTraits <- filter(fishTraits, species %in% spToKeep)

# dir.create("dataPrepared/Fish", showWarnings = FALSE, recursive = TRUE)

# The table computed above is replaced by the copy stored on disk; the
# commented call is the one that writes that copy.
# write.table(fishTraits, "dataPrepared/Fish/fishTraitsMissing.txt")
fishTraits <- read.table(file = "dataPrepared/Fish/fishTraitsMissing.txt")

# --- PCoA phylogenetic ---

# Tip labels use the same "Genus_species" form as fishTraits$species; tips
# that are not in the trait table are pruned before computing distances.
phylogeny$tip.label <- gsub("\\.", "_", phylogeny$tip.label)
phylogeny <- drop.tip(phylogeny, setdiff(phylogeny$tip.label, fishTraits$species))
phylogenyTraits <- phytools::force.ultrametric(phylogeny)
phylDiss <- sqrt(cophenetic(phylogenyTraits))
pcoaPhyl <- cmdscale(phylDiss, k = 10) # long

# cmdscale() returns one row per tip, in the order of the distance matrix
# (tree order), not in the order of fishTraits. Rows are therefore aligned
# on the species names before being relabelled; species of fishTraits that
# are absent from the tree get a row of NA instead of shifting the others.
pcoaPhyl <- pcoaPhyl[match(fishTraits$species, rownames(pcoaPhyl)), , drop = FALSE]
rownames(pcoaPhyl) <- fishTraits$species
# Columns are numbered from the axes actually returned, which can be fewer
# than the k requested.
colnames(pcoaPhyl) <- paste0("Eigen.", seq_len(ncol(pcoaPhyl)))

# The coordinates computed above are replaced by the copy stored on disk;
# the commented call is the one that writes that copy.
# write.table(pcoaPhyl, "dataPrepared/Fish/pcoaPhylogenyFish.txt")
pcoaPhyl <- read.table("dataPrepared/Fish/pcoaPhylogenyFish.txt", header = TRUE, stringsAsFactors = FALSE)

# --- Taxonomic standardization (quick) ---

# Names are submitted with spaces and without redundant whitespace.
list_sp_raw <- fishTraits$species
list_sp_raw <- gsub("_", " ", list_sp_raw)
list_sp_raw <- stringr::str_squish(list_sp_raw)

verified_names <- taxize::gna_verifier( # long
  names = list_sp_raw,
  data_sources = c(11),
  all_matches = FALSE,
  capitalize = TRUE,
  species_group = TRUE,
  output_type = "table"
)

# The matched names are used positionally, so the verifier must have
# returned exactly one result per submitted name.
matched_names <- verified_names$matchedCanonicalSimple
if (length(matched_names) != length(list_sp_raw)) {
  stop(
    "gna_verifier() returned ", length(matched_names), " names for ",
    length(list_sp_raw), " submitted species",
    call. = FALSE
  )
}

# A name that could not be resolved keeps its submitted spelling instead of
# becoming NA, which would be rejected as a row name and would lose the
# species identity.
unresolved <- is.na(matched_names) | !nzchar(matched_names)
matched_names[unresolved] <- list_sp_raw[unresolved]

new_names <- gsub(" ", "_", matched_names)

rownames(pcoaPhyl) <- new_names
fishTraits$species <- new_names
# Column-bind by position: both tables are expected to be in the same
# species order.
traitsAndPCOA <- cbind(fishTraits, pcoaPhyl)

# The table built above is replaced by the copy stored on disk; the
# commented call is the one that writes that copy.
# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOA.txt")
traitsAndPCOA <- read.table("dataPrepared/Fish/traitsWithPCOA.txt", header = TRUE, stringsAsFactors = FALSE)

# --- IUCN data ---

# Species of the trait table: trimmed, de-duplicated, then written with
# spaces instead of underscores.
species_list <- gsub("_", " ", unique(str_trim(traitsAndPCOA$species)))

# IUCN data 2024
iucn_statut <- read.csv("dataOriginal/assessments.csv", header = TRUE, stringsAsFactors = FALSE)
iucn_clean <- mutate(iucn_statut, scientificName = str_trim(scientificName))

# First pass: species whose name is found as such in the IUCN table, and
# species that are not.
matched_species <- tibble(species = species_list) %>%
  inner_join(iucn_clean, by = c("species" = "scientificName"))

species_to_update <- tibble(species = species_list) %>%
  anti_join(iucn_clean, by = c("species" = "scientificName"))

# Synonym records for the unmatched names; three statuses are excluded
# before building the Species / synonym lookup.
synonyms_info <- rfishbase::synonyms(
  species_list = species_to_update$species,
  server = "fishbase",
  version = "latest",
  fields = NULL
)
synonyms_info_filtered <- filter(
  synonyms_info,
  !(Status %in% c("misapplied name", "ambiguous synonym", "provisionally accepted name"))
)
synonyms_mapping <- distinct(dplyr::select(synonyms_info_filtered, Species, synonym))

# IUCN names found in the `Species` column of the lookup are replaced by the
# `synonym` of the first matching record; all other names are left as is.
iucn_clean <- iucn_clean %>%
  mutate(
    scientificName = {
      lookup_row <- match(scientificName, synonyms_mapping$Species)
      if_else(!is.na(lookup_row), synonyms_mapping$synonym[lookup_row], scientificName)
    }
  )

# Second pass with the renamed IUCN table.
matched_species <- tibble(species = species_list) %>%
  inner_join(iucn_clean, by = c("species" = "scientificName"))

# Species that are still unmatched, as a data frame indexed by species name.
species_to_update <- tibble(species = species_list) %>%
  anti_join(iucn_clean, by = c("species" = "scientificName")) %>%
  as.data.frame()
row.names(species_to_update) <- species_to_update$species
# manual correction
# Semicolon-separated file whose two columns are renamed to match the
# IUCN table.
species_to_check <- read.csv(
  file = "dataOriginal/species_to_update_900_done.csv",
  sep = ";",
  header = TRUE,
  stringsAsFactors = FALSE
)
names(species_to_check) <- c("scientificName", "redlistCategory")

# Full category label -> acronym.
acronyms <- c(
  "Critically Endangered" = "CR",
  "Endangered" = "EN",
  "Vulnerable" = "VU",
  "Near Threatened" = "NT",
  "Least Concern" = "LC",
  "Data Deficient" = "DD",
  "Extinct" = "EX",
  "Extinct in the Wild" = "EW",
  "Not Evaluated" = "NE"
)

# Keep name and category, recode the categories, append the manually
# checked species, restrict to the species of the trait table and keep the
# first row of each species (rows of the IUCN table come before the
# appended ones).
iucn_clean <- iucn_clean[, c("scientificName", "redlistCategory")] %>%
  mutate(redlistCategory = dplyr::recode(redlistCategory, !!!acronyms)) %>%
  bind_rows(species_to_check) %>%
  filter(scientificName %in% species_list) %>%
  group_by(scientificName) %>%
  slice(1) %>%
  ungroup()

# Attach the category to each species; species without a match get NA.
traitsAndPCOA$species <- gsub("_", " ", traitsAndPCOA$species)
traitsAndPCOA$IUCN <- with(
  iucn_clean,
  redlistCategory[match(traitsAndPCOA$species, scientificName)]
)

# The stored copy is read into a separate object; the commented call is the
# one that writes that copy.
# write.table(traitsAndPCOA, "dataPrepared/Fish/traitsWithPCOAIUCN.txt")
traitsAndPCOAIUCN <- read.table("dataPrepared/Fish/traitsWithPCOAIUCN.txt", header = TRUE, stringsAsFactors = FALSE)

# --- Complete taxonomy ---

# One GBIF taxonomy lookup per species; only the first returned row is kept.
# A failed lookup contributes no row to the final table, but it is reported
# so that missing species can be traced instead of disappearing silently.
taxInfo <- pblapply(traitsAndPCOAIUCN$species, function(sp) {
  tryCatch(
    traitdataform::get_gbif_taxonomy(sp, subspecies = TRUE, higherrank = TRUE,
                                     conf_threshold = 80, resolve_synonyms = FALSE)[1, ],
    error = function(e) {
      message("GBIF taxonomy lookup failed for '", sp, "': ", conditionMessage(e))
      NULL
    }
  )
}) %>% rbindlist(fill = TRUE)

# --- Final table ---

fishTraitsPhylogenyIUCN <- data.table(traitsAndPCOAIUCN)

# NOTE(review): the commented write targets AllDataFish_clean.txt while the
# table is read back from traitsWithPCOAIUCN.txt -- confirm which file is
# intended.
# write.table(fishTraitsPhylogenyIUCN, "dataPrepared/Fish/AllDataFish_clean.txt")
fishData <- read.table(
  file = "dataPrepared/Fish/traitsWithPCOAIUCN.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# define columns
# Traits: from column 2 up to the column before "Eigen.1".
# Imputation input: from column 2 up to the column before "IUCN".
columnsTraits <- seq(2, which(colnames(fishData) == "Eigen.1") - 1)
columnsImputation <- seq(2, which(colnames(fishData) == "IUCN") - 1)

# --- missForest imputation ---

traits_names <- colnames(fishData)[columnsTraits]

# Seed fixed before the imputation so that the run is reproducible.
set.seed(123)
imputed_forest <- missForest(fishData[, columnsImputation]) # it takes long
print(imputed_forest$OOBerror)

# Only the trait columns are taken from the imputed table; every other
# column of fishData is kept unchanged.
fishData_imputed_forest <- replace(
  fishData,
  traits_names,
  imputed_forest$ximp[traits_names]
)

# The table built above is replaced by the copy stored on disk; the
# commented call is the one that writes that copy.
# write.table(fishData_imputed_forest, "dataPrepared/Fish/fishData_imputed_forest.txt", row.names = FALSE)
fishData_imputed_forest <- read.table("dataPrepared/Fish/fishData_imputed_forest.txt", header = TRUE)

# --- trait selection and renaming ---

# The two vectors are parallel: the i-th selected column receives the i-th
# new name.
selectedTraits <- c(
  "EdHd", "EhBd", "JlHd", "MoBd", "BlBd", "HdBd",
  "PFiBd", "PFlBl", "CFdCPd", "Length", "Weight"
)
newTraitNames <- c(
  "es", "ep", "ms", "mp", "elo", "wid",
  "pp", "ps", "cs", "svl", "bm"
)

# Same selection and renaming for the table with missing values and for the
# imputed one; both are indexed by the species names of fishData.
fishTraitsMissing <- setNames(fishData[, selectedTraits], newTraitNames)
rownames(fishTraitsMissing) <- fishData$species

fishTraitsImputed <- setNames(fishData_imputed_forest[, selectedTraits], newTraitNames)
rownames(fishTraitsImputed) <- fishData$species

# --- add IUCN to the table ---

fishTraitsMissing$IUCN <- fishData$IUCN
# write.table(fishTraitsMissing, "dataPrepared/Fish/TraitFishMissing.txt")
fishTraitsImputed$IUCN <- fishData$IUCN
# write.table(fishTraitsImputed, "dataPrepared/Fish/TraitFishImputed.txt")

# --- PCA and TPD ---

# it takes long !!!
# computePCAandTPDs() receives the imputed traits without the IUCN column;
# its PCA and TPDs elements are kept as separate objects.
results <- computePCAandTPDs(
  fishTraitsImputed[, colnames(fishTraitsImputed) != "IUCN"]
)
# saveRDS(results, "output/All_fish.rds")
pca_trait <- results$PCA
# saveRDS(pca_trait, "output/PCA_fish.rds")
tpd_trait <- results$TPDs
# saveRDS(tpd_trait, "output/TPDs_fish.rds")

# --- (optional) other fish info ---

# Every column of fishData located after the last trait column.
fishOtherInfo <- fishData[, seq(max(columnsTraits) + 1, ncol(fishData))]

# --- add uses to pca_traits ---

# Two sources of human-use flags: one read from an RDS file and one from a
# CSV. Column names of both are harmonised before merging.
df_scraping <- readRDS("output/fish_human_uses_binary_FB.rds") %>% as.data.table()
df_uni <- fread("data/uni.csv")

setnames(df_scraping, c("species_name", "aquarium", "fisheries", "bait", "game_fish", "aquaculture"),
         c("Species", "Aquarium", "Fisheries", "Bait", "Game_fish", "Aquaculture"))
setnames(df_uni, "Game fish", "Game_fish", skip_absent = TRUE)

binary_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game_fish", "Bait")

# Left join on Species: every row of df_uni is kept; the columns coming from
# df_scraping carry the ".scraping" suffix.
merged_df <- merge(
  df_uni,
  df_scraping[, c("Species", binary_cols), with = FALSE],
  by = "Species",
  suffixes = c("", ".scraping"),
  all.x = TRUE
)

# For each use, keep the larger of the two flags (an NA in one source is
# ignored when the other source has a value), then drop the suffixed column.
for (col in binary_cols) {
  col_scraping <- paste0(col, ".scraping")
  if (col_scraping %in% colnames(merged_df)) {
    merged_df[[col]] <- pmax(
      as.numeric(merged_df[[col]]),
      as.numeric(merged_df[[col_scraping]]),
      na.rm = TRUE
    )
    merged_df[[col_scraping]] <- NULL
  }
}

# 1 when the flags of a species sum to more than 0.
merged_df[, All_uses := as.integer(Fisheries + Aquaculture + Aquarium + Game_fish + Bait > 0)]

# select() is namespace-qualified as everywhere else in this script, so the
# call does not depend on which attached package provides `select`.
merged_df_clean <- merged_df %>%
  dplyr::select(Species, all_of(binary_cols), All_uses) %>%
  rename("Game fish" = Game_fish, "All uses" = All_uses)

usage_cols <- c("Fisheries", "Aquaculture", "Aquarium", "Game fish", "Bait", "All uses")

# Plain data frame indexed by species name; rows without a species name are
# dropped.
uses_df <- as.data.frame(merged_df_clean[, c("Species", usage_cols), with = FALSE])
uses_df <- uses_df[!is.na(uses_df$Species) & uses_df$Species != "NA", ]
rownames(uses_df) <- uses_df$Species
uses_df$Species <- NULL

# Mismatch in the species name for Centromochlus musaica: its row is set by
# hand.
uses_df["Centromochlus musaica", ] <- list(
  Fisheries = 0,
  Aquaculture = 0,
  Aquarium = 1, # only one use associated to Centromochlus musaica
  `Game fish` = 0,
  Bait = 0,
  `All uses` = 1
)

# `species_ref` is not defined in this script. When no such object exists,
# the species of the PCA scores are used, so that the uses are stored in the
# same species order as the scores.
# NOTE(review): confirm that the row names of pca_trait$traits_scores are
# the intended reference when species_ref is not provided elsewhere.
if (!exists("species_ref")) {
  species_ref <- rownames(pca_trait$traits_scores)
}

pca_trait$uses <- uses_df[species_ref, usage_cols, drop = FALSE]
use <- pca_trait$uses
saveRDS(pca_trait, "output/pca_trait.rds")

# --- create the matrix ---

pca_trait <- readRDS(file = "output/pca_trait.rds")

# Scores on the first four columns, and the use flags, as data frames.
species_scores <- as.data.frame(pca_trait$traits_scores[, 1:4])
species_uses <- as.data.frame(pca_trait$uses)

# Uses and scores are joined on their row names; the key column is then
# moved back to the row names of the joined table.
species_scores$rownames <- rownames(species_scores)
species_scores_uses <- merge(
  cbind(species_uses, rownames = rownames(species_uses)),
  species_scores,
  by = "rownames"
)
rownames(species_scores_uses) <- species_scores_uses$rownames
species_scores_uses$rownames <- NULL

# Uses as rows and species as columns, plus a final row "all" filled
# with 1.
column_names <- rownames(species_uses)
MatriceFish_1 <- matrix(
  1,
  nrow = 1,
  ncol = length(column_names),
  dimnames = list("all", column_names)
)
MatriceFish <- rbind(t(as.matrix(species_uses)), MatriceFish_1)

write.csv(MatriceFish, file = "output/MatriceFish.csv", row.names = TRUE)