Build a bird megatree (from GitHub), compute phylogenetic PCoA eigenvectors, add missing species via phytools::add.species.to.genus, then impute missing trait values with missForest using parallel processing.
if(!("BirdTraitCombined.csv" %in% list.files("data/processed"))){
phylogeny_raw <- ape::read.tree('https://raw.githubusercontent.com/megatrees/bird_20221117/main/bird_megatree.tre')
phylogeny_ultra <- phytools::force.ultrametric(phylogeny_raw, method = "extend")
sp_phylo <- phylogeny_ultra$tip.label
sp_trait <- gsub(" ", "_", birdTraits$scientificNameStd)
# Synonymy resolution via ITIS
sp_to_check <- sp_trait[!sp_trait %in% sp_phylo]
chk_synonyms <- taxize::synonyms(sp_to_check, db = "itis", rows = 1)
birdTraits$scientificNameStd <- gsub(" ", "_", birdTraits$scientificNameStd)
birdTraits$scientificNameStdNEW <- birdTraits$scientificNameStd
for (i in seq_along(chk_synonyms)) {
orig_name <- sp_to_check[i]
current_syn <- chk_synonyms[[i]]
if ("acc_name" %in% colnames(current_syn)) {
birdTraits$scientificNameStdNEW[birdTraits$scientificNameStd == orig_name] <-
gsub(" ", "_", current_syn$acc_name[1])
} else if ("syn_name" %in% colnames(current_syn)) {
syn_candidates <- gsub(" ", "_", current_syn$syn_name)
match_found <- syn_candidates[syn_candidates %in% sp_phylo]
if (length(match_found) > 0)
birdTraits$scientificNameStdNEW[birdTraits$scientificNameStd == orig_name] <- match_found[1]
}
}
is_missing <- is.na(birdTraits$scientificNameStdNEW)
birdTraits$scientificNameStdNEW[is_missing] <- birdTraits$scientificNameStd[is_missing]
# Add missing species to phylogeny
tips_missing <- birdTraits$scientificNameStdNEW[!birdTraits$scientificNameStdNEW %in% phylogeny_ultra$tip.label]
names_to_add <- unique(tips_missing)
genus_to_add <- sapply(strsplit(names_to_add, "_"), `[`, 1)
phylogeny_final <- phylogeny_ultra
for (i in seq_along(names_to_add)) {
phylogeny_final <- phytools::add.species.to.genus(
tree = phylogeny_final, species = names_to_add[i],
genus = genus_to_add[i], where = "root"
)
}
write.tree(phylogeny_final, file = "data/processed/BirdPhylogeny.tre")
# Phylogenetic PCoA (10 axes)
phylo_dist <- sqrt(cophenetic(phylogeny_final))
pcoa_phylo <- cmdscale(phylo_dist, k = 10)
colnames(pcoa_phylo) <- paste0("Eigen.", 1:ncol(pcoa_phylo))
pcoa_df <- as.data.frame(pcoa_phylo)
pcoa_df$scientificNameStd <- rownames(pcoa_df)
write.table(pcoa_df, "data/processed/pcoaPhylogenyAves.txt",
row.names = FALSE, quote = FALSE, sep = "\t")
# Merge + clean negative values
clean_bird_traits_phylogeny <- function(birdTraits, pcoaPhyl,
output_file = "data/processed/phenoBirds.csv") {
merged_data <- merge(birdTraits, pcoaPhyl,
by.x = "scientificNameStd", by.y = "scientificNameStd", all.x = TRUE)
numeric_columns_present <- intersect(
c("Beak.Length_Culmen","Beak.Length_Nares","Beak.Width","Beak.Depth",
"Tarsus.Length","Wing.Length","Kipps.Distance","Secondary1",
"Hand-Wing.Index","Tail.Length","Mass",
"female_maturity_d","litter_or_clutch_size_n","litters_or_clutches_per_y",
"adult_body_mass_g","maximum_longevity_y","gestation_d","weaning_d",
"birth_or_hatching_weight_g","weaning_weight_g","egg_mass_g","incubation_d",
"fledging_age_d","longevity_y","male_maturity_d",
"inter_litter_or_interbirth_interval_y","female_body_mass_g","male_body_mass_g",
"no_sex_body_mass_g","egg_width_mm","egg_length_mm","fledging_mass_g",
"adult_svl_cm","female_svl_cm","birth_or_hatching_svl_cm",
"female_svl_at_maturity_cm","female_body_mass_at_maturity_g",
"no_sex_svl_cm","no_sex_maturity_d"),
colnames(merged_data)
)
merged_data[numeric_columns_present] <- lapply(
merged_data[numeric_columns_present],
function(x) if (is.numeric(x)) replace(x, x < 0, NA) else x
)
rownames(merged_data) <- merged_data$scientificNameStd
merged_data <- merged_data[, !(colnames(merged_data) %in% c("scientificNameStd","scientificNameStdNEW"))]
merged_data <- merged_data[, colSums(!is.na(merged_data)) > 0]
write.csv(merged_data, file = output_file, row.names = TRUE)
invisible(merged_data)
}
birdTraitsPhy <- clean_bird_traits_phylogeny(birdTraits, pcoa_df)
write.csv(birdTraitsPhy, file = "data/processed/phenoBirds.csv")
} else {
birdTraitsPhy <- read.csv("data/processed/phenoBirds.csv")
morphoColumn <- c("Tarsus.Length","Wing.Length","Kipps.Distance","Secondary1",
"Hand.Wing.Index","Tail.Length","Mass","adult_svl_cm")
lhtColum <- c("litter_or_clutch_size_n","litters_or_clutches_per_y",
"adult_body_mass_g","maximum_longevity_y","birth_or_hatching_weight_g",
"incubation_d","fledging_age_d","longevity_y","egg_mass_g",
"male_maturity_d","inter_litter_or_interbirth_interval_y",
"female_body_mass_g","no_sex_body_mass_g",
"egg_length_mm","fledging_mass_g","no_sex_maturity_d")
dietColumn <- c('Diet.Inv','Diet.Vend','Diet.Vect','Diet.Vfish','Diet.Vunk',
'Diet.Scav','Diet.Fruit','Diet.Nect','Diet.Seed','Diet.PlantO')
phyloColumn <- paste0('Eigen.', 1:10)
numericColumn <- c(morphoColumn, lhtColum)
birdTraitsPhy[, numericColumn] <- replace(birdTraitsPhy[, numericColumn],
birdTraitsPhy[, numericColumn] < 0, NA)
birdTraitsPhy <- birdTraitsPhy[!is.na(apply(birdTraitsPhy[, dietColumn], 1, sum)), ]
birdTraitsToImpute <- cbind(scale(log10(birdTraitsPhy[, numericColumn])),
birdTraitsPhy[, dietColumn],
birdTraitsPhy[, phyloColumn])
colToImputeAll <- c(numericColumn, dietColumn, phyloColumn)
doParallel::registerDoParallel(cores = 14)
birdTraitslogImputed <- as.matrix(
missForest(xmis = birdTraitsToImpute[, colToImputeAll], parallelize = "variables")$ximp
)
birdTraitsPhy[, colToImputeAll] <- birdTraitslogImputed[, colToImputeAll]
write.csv(birdTraitsPhy, file = "data/processed/phenoBirdsImputedAll.csv", row.names = FALSE)
colToImpute <- c(lhtColum, phyloColumn)
birdTraitslogImputed <- as.matrix(
missForest(xmis = birdTraitsToImpute[, colToImpute], parallelize = "variables")$ximp
)
birdTraitsPhy[, colToImpute] <- birdTraitslogImputed[, colToImpute]
write.csv(birdTraitsPhy, file = "data/processed/phenoBirdsImputed.csv", row.names = FALSE)
}