01_DATA_load_and_clean — Step 1

Load AVONET BirdLife morphological traits, fetch IUCN Red List categories via API v4 (rredlist), merge into a single species × traits table, and resolve taxonomic duplicates. Results cached to data/processed/BirdTraitCombined.csv.

data loading taxonomy IUCN API
Inputs: data/raw/AVONET.xlsx IUCN_REDLIST_KEY (.Renviron) traitdata::amniota traitdata::elton_birds
Outputs: data/processed/BirdTraitCombined.csv data/processed/iucn_cache.rds
# ---- Reproducibility -------------------------------------------------------
GLOBAL_SEED <- 20251029
set.seed(GLOBAL_SEED)
source("scripts/00_GeneralScript.R")

########################################################################
### 1. Loading original trait and taxonomic standardization
########################################################################

if(!("BirdTraitCombined.csv" %in% list.files("data/raw"))){

  ### 1.a. AVONET morphological traits
  avonet_raw <- readxl::read_excel(
    path = "data/raw/AVONET.xlsx",
    sheet = "AVONET1_BirdLife",
    col_types = c(
      "skip", "text", "text", "text", "skip", "skip", "skip", "skip", "skip", "skip",
      "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
      "numeric", "numeric", "numeric", "numeric",
      rep("skip", 16)
    )
  )
  avonet_raw <- as.data.frame(avonet_raw)
  avonet_raw <- subset(avonet_raw, Family1 != "Apterygidae")  # remove flightless

  ### 1.b. IUCN categories via API v4 (rredlist >= 1.0.0)
  if (!requireNamespace("rredlist", quietly = TRUE)) install.packages("rredlist")
  library(rredlist)
  if (utils::packageVersion("rredlist") < "1.0.0")
    stop("rredlist >= 1.0.0 required (API v4). install.packages('rredlist')")

  `%||%` <- function(a, b) if (is.null(a) || length(a) == 0L) b else a

  parse_binomial <- function(x, sep = "[_ ]+") {
    parts <- strsplit(as.character(x), sep)
    data.frame(
      genus   = sapply(parts, function(p) if (length(p) >= 1L) p[1] else NA_character_),
      species = sapply(parts, function(p) if (length(p) >= 2L) p[2] else NA_character_),
      stringsAsFactors = FALSE
    )
  }

  query_iucn <- function(genus, species, key) {
    if (is.na(genus) || is.na(species) || genus == "" || species == "")
      return(NA_character_)
    tryCatch({
      res <- rredlist::rl_species(genus = genus, species = species, key = key)
      if (is.null(res) || is.null(res$assessments)) return(NA_character_)
      asm <- res$assessments
      if (is.data.frame(asm)) {
        if (nrow(asm) == 0L) return(NA_character_)
        latest_idx <- which(asm$latest == TRUE)
        if (length(latest_idx) == 0L) latest_idx <- 1L
        return(asm$red_list_category_code[latest_idx[1]])
      }
      if (is.list(asm) && length(asm) > 0L) {
        latest_idx <- which(vapply(asm, function(a) isTRUE(a$latest), logical(1L)))
        if (length(latest_idx) == 0L) latest_idx <- 1L
        return(asm[[latest_idx[1]]]$red_list_category_code %||% NA_character_)
      }
      NA_character_
    }, error = function(e) {
      if (!grepl("404|not found", conditionMessage(e), ignore.case = TRUE))
        message(sprintf("  [ERROR] %s %s — %s", genus, species, conditionMessage(e)))
      NA_character_
    })
  }

  fetch_iucn_categories <- function(binomials, key,
                                    cache_path = "data/processed/iucn_cache.rds",
                                    rate_limit_s = 0.6,
                                    retry_failed = FALSE) {
    binomials <- unique(binomials[!is.na(binomials) & binomials != ""])
    cache <- if (file.exists(cache_path)) readRDS(cache_path) else list()
    to_query <- if (retry_failed) {
      binomials[!binomials %in% names(cache) |
                  vapply(binomials, function(sp) is.null(cache[[sp]]) || is.na(cache[[sp]]), logical(1L))]
    } else {
      binomials[!binomials %in% names(cache)]
    }
    if (length(to_query) > 0L) {
      parsed <- parse_binomial(to_query)
      for (i in seq_along(to_query)) {
        cache[[to_query[i]]] <- query_iucn(parsed$genus[i], parsed$species[i], key = key)
        if (i %% 100L == 0L) {
          dir.create(dirname(cache_path), showWarnings = FALSE, recursive = TRUE)
          saveRDS(cache, cache_path)
        }
        if (i < length(to_query)) Sys.sleep(rate_limit_s)
      }
      dir.create(dirname(cache_path), showWarnings = FALSE, recursive = TRUE)
      saveRDS(cache, cache_path)
    }
    out <- unlist(cache[binomials]); names(out) <- binomials; out
  }

  API_KEY <- Sys.getenv("IUCN_REDLIST_KEY")
  if (API_KEY == "") stop("Set IUCN_REDLIST_KEY in .Renviron (https://api.iucnredlist.org)")

  iucn_categories <- fetch_iucn_categories(
    binomials = unique(avonet_raw$Species1), key = API_KEY,
    cache_path = "data/processed/iucn_cache.rds", rate_limit_s = 0.6
  )
  avonet_raw$iucn_category <- iucn_categories[avonet_raw$Species1]

  ### 1.c. Taxonomic fix + merge AMNIOTE + EltonTraits
  avonet_merged <- avonet_raw
  avonet_merged$scientificNameStd <- avonet_merged$Species1
  avonet_merged$GenusSpecies      <- avonet_merged$Species1

  avonet_traitdata_map <- traitdata::avonet
  avonet_traitdata_map$GenusSpecies <- paste(avonet_traitdata_map$Genus, avonet_traitdata_map$Species)
  avonet_traitdata_map <- unique(avonet_traitdata_map[, c("GenusSpecies", "scientificNameStd")])
  colnames(avonet_traitdata_map) <- c("Species1", "scientificNameStd_traitdata")
  avonet_merged <- merge(avonet_merged, avonet_traitdata_map, by = "Species1", all.x = TRUE)
  avonet_merged <- avonet_merged[!is.na(avonet_merged$scientificNameStd), ]
  avonet_merged <- avonet_merged[order(avonet_merged$scientificNameStd), ]

  species_to_remove <- c(
    'Atlantisia rogersi', 'Casuarius bennetti', 'Casuarius casuarius',
    'Casuarius unappendiculatus', 'Dromaius novaehollandiae',
    'Rhea americana', 'Rhea pennata', 'Struthio camelus'
  )
  avonet_filtered <- subset(avonet_merged, !(scientificNameStd %in% species_to_remove))

  # Deduplicate: keep row matching GenusSpecies, else row with fewest NAs
  duplicated_species <- names(which(table(avonet_filtered$scientificNameStd) > 1))
  avonet_final <- subset(avonet_filtered, !(scientificNameStd %in% duplicated_species))
  for (sp in duplicated_species) {
    entries <- avonet_filtered[avonet_filtered$scientificNameStd == sp, ]
    if (sp %in% entries$GenusSpecies) {
      avonet_final <- rbind(avonet_final, entries[entries$GenusSpecies == sp, ])
    } else {
      na_counts <- apply(entries, 1, function(row) sum(is.na(row)))
      avonet_final <- rbind(avonet_final, entries[which.min(na_counts), , drop = FALSE])
    }
  }

  resolve_duplicates <- function(df_grouped) {
    if (any(df_grouped$GenusSpecies == df_grouped$scientificNameStd[1])) {
      return(df_grouped %>% filter(GenusSpecies == scientificNameStd[1]) %>% slice(1))
    } else {
      return(df_grouped %>% mutate(n_NA = rowSums(is.na(.))) %>% arrange(n_NA) %>% slice(1) %>% select(-n_NA))
    }
  }

  clean_amniote_aves <- function(data = traitdata::amniota) {
    aves_data <- data %>% filter(Class == "Aves") %>% distinct() %>%
      filter(!is.na(scientificNameStd)) %>% mutate(GenusSpecies = paste(Genus, Species))
    dup_names <- aves_data %>% count(scientificNameStd) %>% filter(n > 1) %>% pull(scientificNameStd)
    aves_clean    <- aves_data %>% filter(!scientificNameStd %in% dup_names)
    aves_resolved <- aves_data %>% filter(scientificNameStd %in% dup_names) %>%
      group_by(scientificNameStd) %>% group_modify(~ resolve_duplicates(.x)) %>% ungroup()
    final_data <- bind_rows(aves_clean, aves_resolved)
    stopifnot(!any(duplicated(final_data$scientificNameStd)))
    return(final_data)
  }
  aveTraitsFinal <- clean_amniote_aves()

  clean_elton_diet_aves <- function(elton_birds_data) {
    elton_clean <- elton_birds_data %>%
      mutate(Genus = trimws(Genus), Species = trimws(Species),
             scientificNameStd = paste(Genus, Species), GenusSpecies = paste(Genus, Species)) %>%
      select(-c(1:7, 10, 36:42)) %>% distinct() %>% filter(!is.na(scientificNameStd))
    dup_names <- elton_clean %>% count(scientificNameStd) %>% filter(n > 1) %>% pull(scientificNameStd)
    elton_ok <- elton_clean %>% filter(!scientificNameStd %in% dup_names)
    dup_resolved <- elton_clean %>% filter(scientificNameStd %in% dup_names) %>%
      group_by(scientificNameStd) %>%
      group_modify(~ {
        match_gs <- .x %>% filter(GenusSpecies == .x$scientificNameStd[1])
        if (nrow(match_gs) > 0) return(slice(match_gs, 1))
        .x %>% mutate(n_NA = rowSums(is.na(.))) %>% arrange(n_NA) %>% slice(1) %>% select(-n_NA)
      }) %>% ungroup()
    elton_final <- bind_rows(elton_ok, dup_resolved)
    stopifnot(!any(duplicated(elton_final$scientificNameStd)))
    return(elton_final)
  }
  aveDietFinal <- clean_elton_diet_aves(traitdata::elton_birds)

  birdTraits <- avonet_final %>%
    rename(scientificNameStd_birdlife = scientificNameStd,
           scientificNameStd = scientificNameStd_traitdata) %>%
    full_join(aveDietFinal, by = "scientificNameStd") %>%
    full_join(aveTraitsFinal, by = "scientificNameStd") %>%
    mutate(scientificNameStd_traitdata = scientificNameStd,
           scientificNameStd = ifelse(!is.na(scientificNameStd_birdlife),
                                      scientificNameStd_birdlife, scientificNameStd_traitdata))

  cols_to_keep <- c("scientificNameStd",
                    colnames(birdTraits)[c(18,47:53,15)],
                    colnames(birdTraits)[4:14],
                    colnames(birdTraits)[21:30],
                    colnames(birdTraits)[35:42],
                    colnames(birdTraits)[54:82])
  birdTraits <- birdTraits[, unique(cols_to_keep)]
  if (any(duplicated(birdTraits$scientificNameStd))) {
    birdTraits <- birdTraits %>% group_by(scientificNameStd) %>% slice(1) %>% ungroup()
  }
  write.csv(birdTraits, file = "data/processed/BirdTraitCombined.csv", row.names = FALSE)

} else {
  birdTraits <- read.csv("data/processed/BirdTraitCombined.csv")
}