query_fishmorph(query, fishmorph_path, ...)

Retrieve all FishMORPH rows matching a taxon name (species, genus, family, or order). Automatically falls back to broader taxonomic levels when the exact match is not found, with an interactive or automatic fallback strategy.

FishMORPH taxonomy query fallback · GBIF
Arguments
ArgumentDefaultDescription
queryTaxon name: species binomial, genus, family, or order
fishmorph_pathPath to TraitFishImputed.txt
fishmorph_infoNULLPre-computed GBIF taxonomy table (from $fishmorph_info of a previous call)
level"auto"Taxonomic level to search: "species", "genus", "family", "order", or "auto" (detected from the query)
fallbackTRUEIf query not found at requested level, try progressively broader levels
interactiveFALSEIf TRUE and fallback matches found, ask user to confirm before returning broader results
min_n1Minimum number of rows required; warn if below
cache_pathNULLPath to save/load the GBIF taxonomy as .rds (e.g. "cache/fm_taxonomy.rds"). Created automatically on first run, loaded instantly on subsequent calls.
conf_threshold80GBIF synonym confidence (used only if fishmorph_info is NULL)
verboseTRUEPrint match summary
Returns (data.frame)
Filtered rows from FishMORPH with an extra column match_level ("species" / "genus" / "family" / "order") indicating at which taxonomic level each row was matched. Returns NULL with a message if nothing is found at any level.
query_fishmorph <- function(
    query,
    fishmorph_path,
    fishmorph_info   = NULL,
    level            = "auto",
    fallback         = TRUE,
    interactive      = FALSE,
    min_n            = 1,
    conf_threshold   = 80,
    cache_path       = NULL,   # path to save/load fishmorph_info as .rds (e.g. "cache/fm_taxonomy.rds")
    verbose          = TRUE
) {
  # ── Packages ──────────────────────────────────────────────────────────────
  if (!requireNamespace("data.table", quietly=TRUE)) install.packages("data.table")
  if (!requireNamespace("pbapply",    quietly=TRUE)) install.packages("pbapply")
  if (!requireNamespace("traitdataform",quietly=TRUE)) install.packages("traitdataform")
  library(data.table); library(pbapply); library(traitdataform)

  query <- trimws(query)

  # ── 1. Load FishMORPH ────────────────────────────────────────────────────
  fishTraits <- read.table(fishmorph_path)
  traitCols  <- setdiff(colnames(fishTraits), "IUCN")
  fishMorph  <- trimws(rownames(fishTraits))

  # ── 2. Build or reuse taxonomy lookup (with optional disk cache) ────────
  if (is.null(fishmorph_info)) {
    # Try to load from disk cache first
    if (!is.null(cache_path) && file.exists(cache_path)) {
      if (verbose) message("Loading cached FishMORPH taxonomy from: ", cache_path)
      fishmorph_info <- readRDS(cache_path)
    } else {
      if (verbose) {
        message("Resolving FishMORPH taxonomy via GBIF (~5-10 min on first run)...")
        if (!is.null(cache_path))
          message("Will save to cache: ", cache_path, "  (reused on all future calls)")
      }
      fishmorph_info <- pbapply::pblapply(fishMorph, function(sp) {
        tryCatch(
          traitdataform::get_gbif_taxonomy(sp, subspecies=TRUE, higherrank=TRUE,
            conf_threshold=conf_threshold, resolve_synonyms=TRUE)[1,],
          error=function(e) NULL)
      }) |> data.table::rbindlist(fill=TRUE) |> data.table::as.data.table()
      # Save to disk cache
      if (!is.null(cache_path)) {
        dir.create(dirname(cache_path), recursive=TRUE, showWarnings=FALSE)
        saveRDS(fishmorph_info, cache_path)
        if (verbose) message("Taxonomy cached to: ", cache_path)
      }
    }
  } else {
    fishmorph_info <- data.table::as.data.table(fishmorph_info)
  }

  # Attach species names
  fishmorph_info[, species := fishMorph]
  fishmorph_info[, genus   := sub(" .*$", "", fishMorph)]

  # ── 3. Auto-detect level ─────────────────────────────────────────────────
  # Heuristic: 2 words → species, 1 word → could be genus/family/order
  if (level == "auto") {
    if (grepl(" ", query)) {
      level <- "species"
    } else {
      # Check if it matches any known genus, family, or order
      if (query %in% fishmorph_info$genus) {
        level <- "genus"
      } else if (query %in% fishmorph_info$family) {
        level <- "family"
      } else if (query %in% fishmorph_info$order) {
        level <- "order"
      } else {
        level <- "species"   # will fail gracefully below
      }
    }
    if (verbose) message("Auto-detected level: ", level)
  }

  # ── 4. Fallback chain ────────────────────────────────────────────────────
  fallback_chain <- c("species","genus","family","order")
  start_idx      <- match(level, fallback_chain)
  if (is.na(start_idx)) stop("level must be one of: species, genus, family, order, auto")

  levels_to_try  <- if (fallback) fallback_chain[start_idx:length(fallback_chain)] else level

  # ── 5. Search each level ─────────────────────────────────────────────────
  for (lv in levels_to_try) {

    # Build match index
    idx <- switch(lv,
      species = which(fishMorph == query |
                      fishmorph_info$scientificName == query |
                      fishmorph_info$verbatimScientificName == query),
      genus   = which(fishmorph_info$genus == query),
      family  = which(fishmorph_info$family == query),
      order   = which(fishmorph_info$order == query)
    )
    idx <- unique(idx[!is.na(idx)])

    if (length(idx) == 0) {
      if (verbose) message(sprintf("  '%s' not found at %s level.", query, lv))
      if (!fallback || lv == utils::tail(levels_to_try, 1)) {
        message(sprintf(
          "\n No match found for '%s' at any level (%s).\n",
          query, paste(levels_to_try, collapse=" → ")))
        return(invisible(NULL))
      }
      next  # try next level
    }

    # Found something
    result <- fishTraits[idx, traitCols, drop=FALSE]
    result$match_level <- lv
    result$matched_name <- fishMorph[idx]
    # Add taxonomy columns
    result$genus  <- fishmorph_info$genus[idx]
    result$family <- fishmorph_info$family[idx]
    result$order  <- fishmorph_info$order[idx]

    if (verbose) {
      if (lv != level) {
        message(sprintf(
          "  Not found at '%s' level — matched at '%s' level instead.", level, lv))
      }
      message(sprintf("  Found %d row(s) matching '%s' (%s level).",
                      nrow(result), query, lv))
    }

    # Warn if very few rows
    if (nrow(result) < min_n) {
      warning(sprintf(
        "Only %d row(s) returned for '%s'. Consider using a broader taxonomic level.",
        nrow(result), query), call.=FALSE)
    }

    # Interactive confirmation for fallback
    if (interactive && lv != level) {
      ans <- readline(sprintf(
        "Found %d row(s) at %s level for '%s'. Use these? [y/n]: ",
        nrow(result), lv, query))
      if (!tolower(trimws(ans)) %in% c("y","yes","o","oui")) {
        message("Cancelled by user.")
        return(invisible(NULL))
      }
    }

    return(result)
  }

  invisible(NULL)
}

# ── Examples ──────────────────────────────────────────────────────────────────
# Load FishMORPH
# fm <- read.table("data/TraitFishImputed.txt")
#
# ── 1. Single species
# query_fishmorph("Gobio gobio", fishmorph_path="data/TraitFishImputed.txt")
#
# ── 2. Species not found → auto-fallback to genus
# query_fishmorph("Gobio vulgaris",   # old synonym, not in FishMORPH
#                 fishmorph_path="data/TraitFishImputed.txt",
#                 fallback=TRUE)
# # → "Not found at species level — matched at genus level instead."
# # → returns all Gobio species in FishMORPH
#
# ── 3. Query a genus directly
# query_fishmorph("Hoplias",
#                 fishmorph_path="data/TraitFishImputed.txt",
#                 level="genus")
#
# ── 4. Query a family
# query_fishmorph("Cichlidae",
#                 fishmorph_path="data/TraitFishImputed.txt",
#                 level="family")
#
# ── 5. Query an order
# query_fishmorph("Characiformes",
#                 fishmorph_path="data/TraitFishImputed.txt",
#                 level="order")
#
# ── 6. Disk cache — compute once, instant forever
# First call: slow (GBIF queries), saves taxonomy to disk
# r1 <- query_fishmorph("Gobio gobio",
#                       fishmorph_path = "data/TraitFishImputed.txt",
#                       cache_path     = "cache/fm_taxonomy.rds")
# → "Resolving FishMORPH taxonomy via GBIF (~5-10 min)..."
# → "Taxonomy cached to: cache/fm_taxonomy.rds"
#
# All subsequent calls load from disk instantly (no GBIF):
# r2 <- query_fishmorph("Cichlidae",
#                       fishmorph_path = "data/TraitFishImputed.txt",
#                       cache_path     = "cache/fm_taxonomy.rds",
#                       level          = "family")
# → "Loading cached FishMORPH taxonomy from: cache/fm_taxonomy.rds"
# → Found 312 row(s) matching 'Cichlidae' (family level).
#
# r3 <- query_fishmorph("Hoplias",
#                       fishmorph_path = "data/TraitFishImputed.txt",
#                       cache_path     = "cache/fm_taxonomy.rds")
#
# Tip: you can also pass the in-memory object directly (even faster):
# fm_tax <- readRDS("cache/fm_taxonomy.rds")
# r4 <- query_fishmorph("Characiformes", fishmorph_path="...",
#                       fishmorph_info = fm_tax, level="order")
#
# ── 7. Interactive confirmation when fallback is triggered
# query_fishmorph("Hoplias macroptera",  # hypothetical absent species
#                 fishmorph_path="...",
#                 interactive=TRUE)
# # → prompts: "Found 8 row(s) at genus level for 'Hoplias macroptera'. Use these? [y/n]:"