01_DATA_load_and_clean — Step 2

Build a bird megatree (from GitHub), compute phylogenetic PCoA eigenvectors, add missing species via phytools::add.species.to.genus, then impute missing trait values with missForest using parallel processing.

missForest phylogenetics trait imputation
Inputs: data/processed/BirdTraitCombined.csv bird_megatree (GitHub)
Outputs: data/processed/BirdPhylogeny.tre data/processed/pcoaPhylogenyAves.txt data/processed/phenoBirdsImputedAll.csv data/processed/phenoBirdsImputed.csv
if(!("BirdTraitCombined.csv" %in% list.files("data/processed"))){

  phylogeny_raw   <- ape::read.tree('https://raw.githubusercontent.com/megatrees/bird_20221117/main/bird_megatree.tre')
  phylogeny_ultra <- phytools::force.ultrametric(phylogeny_raw, method = "extend")

  sp_phylo <- phylogeny_ultra$tip.label
  sp_trait <- gsub(" ", "_", birdTraits$scientificNameStd)

  # Synonymy resolution via ITIS
  sp_to_check  <- sp_trait[!sp_trait %in% sp_phylo]
  chk_synonyms <- taxize::synonyms(sp_to_check, db = "itis", rows = 1)

  birdTraits$scientificNameStd    <- gsub(" ", "_", birdTraits$scientificNameStd)
  birdTraits$scientificNameStdNEW <- birdTraits$scientificNameStd

  for (i in seq_along(chk_synonyms)) {
    orig_name   <- sp_to_check[i]
    current_syn <- chk_synonyms[[i]]
    if ("acc_name" %in% colnames(current_syn)) {
      birdTraits$scientificNameStdNEW[birdTraits$scientificNameStd == orig_name] <-
        gsub(" ", "_", current_syn$acc_name[1])
    } else if ("syn_name" %in% colnames(current_syn)) {
      syn_candidates <- gsub(" ", "_", current_syn$syn_name)
      match_found    <- syn_candidates[syn_candidates %in% sp_phylo]
      if (length(match_found) > 0)
        birdTraits$scientificNameStdNEW[birdTraits$scientificNameStd == orig_name] <- match_found[1]
    }
  }
  is_missing <- is.na(birdTraits$scientificNameStdNEW)
  birdTraits$scientificNameStdNEW[is_missing] <- birdTraits$scientificNameStd[is_missing]

  # Add missing species to phylogeny
  tips_missing   <- birdTraits$scientificNameStdNEW[!birdTraits$scientificNameStdNEW %in% phylogeny_ultra$tip.label]
  names_to_add   <- unique(tips_missing)
  genus_to_add   <- sapply(strsplit(names_to_add, "_"), `[`, 1)
  phylogeny_final <- phylogeny_ultra
  for (i in seq_along(names_to_add)) {
    phylogeny_final <- phytools::add.species.to.genus(
      tree = phylogeny_final, species = names_to_add[i],
      genus = genus_to_add[i], where = "root"
    )
  }
  write.tree(phylogeny_final, file = "data/processed/BirdPhylogeny.tre")

  # Phylogenetic PCoA (10 axes)
  phylo_dist  <- sqrt(cophenetic(phylogeny_final))
  pcoa_phylo  <- cmdscale(phylo_dist, k = 10)
  colnames(pcoa_phylo) <- paste0("Eigen.", 1:ncol(pcoa_phylo))
  pcoa_df     <- as.data.frame(pcoa_phylo)
  pcoa_df$scientificNameStd <- rownames(pcoa_df)
  write.table(pcoa_df, "data/processed/pcoaPhylogenyAves.txt",
              row.names = FALSE, quote = FALSE, sep = "\t")

  # Merge + clean negative values
  clean_bird_traits_phylogeny <- function(birdTraits, pcoaPhyl,
                                          output_file = "data/processed/phenoBirds.csv") {
    merged_data <- merge(birdTraits, pcoaPhyl,
                         by.x = "scientificNameStd", by.y = "scientificNameStd", all.x = TRUE)
    numeric_columns_present <- intersect(
      c("Beak.Length_Culmen","Beak.Length_Nares","Beak.Width","Beak.Depth",
        "Tarsus.Length","Wing.Length","Kipps.Distance","Secondary1",
        "Hand-Wing.Index","Tail.Length","Mass",
        "female_maturity_d","litter_or_clutch_size_n","litters_or_clutches_per_y",
        "adult_body_mass_g","maximum_longevity_y","gestation_d","weaning_d",
        "birth_or_hatching_weight_g","weaning_weight_g","egg_mass_g","incubation_d",
        "fledging_age_d","longevity_y","male_maturity_d",
        "inter_litter_or_interbirth_interval_y","female_body_mass_g","male_body_mass_g",
        "no_sex_body_mass_g","egg_width_mm","egg_length_mm","fledging_mass_g",
        "adult_svl_cm","female_svl_cm","birth_or_hatching_svl_cm",
        "female_svl_at_maturity_cm","female_body_mass_at_maturity_g",
        "no_sex_svl_cm","no_sex_maturity_d"),
      colnames(merged_data)
    )
    merged_data[numeric_columns_present] <- lapply(
      merged_data[numeric_columns_present],
      function(x) if (is.numeric(x)) replace(x, x < 0, NA) else x
    )
    rownames(merged_data) <- merged_data$scientificNameStd
    merged_data <- merged_data[, !(colnames(merged_data) %in% c("scientificNameStd","scientificNameStdNEW"))]
    merged_data <- merged_data[, colSums(!is.na(merged_data)) > 0]
    write.csv(merged_data, file = output_file, row.names = TRUE)
    invisible(merged_data)
  }
  birdTraitsPhy <- clean_bird_traits_phylogeny(birdTraits, pcoa_df)
  write.csv(birdTraitsPhy, file = "data/processed/phenoBirds.csv")

} else {
  birdTraitsPhy <- read.csv("data/processed/phenoBirds.csv")

  morphoColumn <- c("Tarsus.Length","Wing.Length","Kipps.Distance","Secondary1",
                    "Hand.Wing.Index","Tail.Length","Mass","adult_svl_cm")
  lhtColum     <- c("litter_or_clutch_size_n","litters_or_clutches_per_y",
                    "adult_body_mass_g","maximum_longevity_y","birth_or_hatching_weight_g",
                    "incubation_d","fledging_age_d","longevity_y","egg_mass_g",
                    "male_maturity_d","inter_litter_or_interbirth_interval_y",
                    "female_body_mass_g","no_sex_body_mass_g",
                    "egg_length_mm","fledging_mass_g","no_sex_maturity_d")
  dietColumn   <- c('Diet.Inv','Diet.Vend','Diet.Vect','Diet.Vfish','Diet.Vunk',
                    'Diet.Scav','Diet.Fruit','Diet.Nect','Diet.Seed','Diet.PlantO')
  phyloColumn  <- paste0('Eigen.', 1:10)
  numericColumn <- c(morphoColumn, lhtColum)

  birdTraitsPhy[, numericColumn] <- replace(birdTraitsPhy[, numericColumn],
                                            birdTraitsPhy[, numericColumn] < 0, NA)
  birdTraitsPhy <- birdTraitsPhy[!is.na(apply(birdTraitsPhy[, dietColumn], 1, sum)), ]

  birdTraitsToImpute <- cbind(scale(log10(birdTraitsPhy[, numericColumn])),
                              birdTraitsPhy[, dietColumn],
                              birdTraitsPhy[, phyloColumn])
  colToImputeAll <- c(numericColumn, dietColumn, phyloColumn)

  doParallel::registerDoParallel(cores = 14)
  birdTraitslogImputed <- as.matrix(
    missForest(xmis = birdTraitsToImpute[, colToImputeAll], parallelize = "variables")$ximp
  )
  birdTraitsPhy[, colToImputeAll] <- birdTraitslogImputed[, colToImputeAll]
  write.csv(birdTraitsPhy, file = "data/processed/phenoBirdsImputedAll.csv", row.names = FALSE)

  colToImpute <- c(lhtColum, phyloColumn)
  birdTraitslogImputed <- as.matrix(
    missForest(xmis = birdTraitsToImpute[, colToImpute], parallelize = "variables")$ximp
  )
  birdTraitsPhy[, colToImpute] <- birdTraitslogImputed[, colToImpute]
  write.csv(birdTraitsPhy, file = "data/processed/phenoBirdsImputed.csv", row.names = FALSE)
}