ITV error measurement

Defined the error made by the operator

utility
# -----------------------------------------------------------------------------
# 0. Libraries
# -----------------------------------------------------------------------------

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lme4)
library(glmmTMB)
library(DHARMa)
library(performance)


# -----------------------------------------------------------------------------
# 1. Load data
# -----------------------------------------------------------------------------

data_error <- read_csv("data/data_error.csv")

str(data_error)


# -----------------------------------------------------------------------------
# 2. Image reference dimensions
# -----------------------------------------------------------------------------

img_width    <- 4000
img_height   <- 3000
img_diagonal <- sqrt(img_width^2 + img_height^2)


# -----------------------------------------------------------------------------
# 3. Preprocessing — coerce factor/character columns, reshape to long format
# -----------------------------------------------------------------------------

# Cast metadata columns to character
df <- data_error %>%
  mutate(across(c(Code, Genus_species, Site, Lot_1), as.character))

# Identify landmark coordinate columns (pattern: "<n>_X" or "<n>_Y")
xy_cols <- names(df)[grepl("^\\d+_[XY]$", names(df))]

# Pivot from wide to long: one row per (individual × landmark × replicate)
long <- df %>%
  mutate(rep = row_number()) %>%
  pivot_longer(
    cols      = all_of(xy_cols),
    names_to  = c("landmark", "coord"),
    names_pattern = "^(\\d+)_(X|Y)$",
    values_to = "value"
  ) %>%
  pivot_wider(names_from = coord, values_from = value) %>%
  mutate(landmark = as.integer(landmark)) %>%
  filter(
    !is.na(X), !is.na(Y),
    !landmark %in% c(20, 21)   # remove semi-landmarks or problematic points
  )


# -----------------------------------------------------------------------------
# 4. Reference scale — inter-landmark distance between landmarks 1 and 2
# -----------------------------------------------------------------------------
# Option A (implemented): median distance across replicates per individual.
# Option B (alternative, preferred): distance computed on mean coordinates —
#   avoids digitization noise in the scale estimate.
#
#   ref_species <- long %>%
#     filter(landmark %in% c(1, 2)) %>%
#     group_by(Code, landmark) %>%
#     summarise(mX = mean(X), mY = mean(Y), .groups = "drop") %>%
#     pivot_wider(names_from = landmark, values_from = c(mX, mY)) %>%
#     mutate(ref_value = sqrt((mX_1 - mX_2)^2 + (mY_1 - mY_2)^2)) %>%
#     select(Code, ref_value)

ref_species <- long %>%
  filter(landmark %in% c(1, 2)) %>%
  pivot_wider(
    names_from  = landmark,
    values_from = c(X, Y),
    names_sep   = "_"
  ) %>%
  mutate(ref_value = sqrt((X_1 - X_2)^2 + (Y_1 - Y_2)^2)) %>%
  group_by(Code) %>%
  summarise(ref_value = median(ref_value, na.rm = TRUE), .groups = "drop")


# -----------------------------------------------------------------------------
# 5. Per-landmark digitization error
# -----------------------------------------------------------------------------
# For each replicate, compute the Euclidean distance from the landmark
# position to the mean position across replicates, then normalize by the
# reference inter-landmark distance (expressed as %).

landmark_err <- long %>%
  group_by(Code, Genus_species, Site, landmark) %>%
  reframe(
    X_coord = X,
    Y_coord = Y,
    mean_X  = mean(X, na.rm = TRUE),
    mean_Y  = mean(Y, na.rm = TRUE),
    dist    = sqrt((X - mean_X)^2 + (Y - mean_Y)^2)
  ) %>%
  left_join(ref_species, by = "Code") %>%
  mutate(dist_norm = (dist / ref_value) * 100)


# -----------------------------------------------------------------------------
# 6. Summary tables
# -----------------------------------------------------------------------------

# Error per individual (averaged over all landmarks and replicates)
indiv_err <- landmark_err %>%
  group_by(Code, Genus_species) %>%
  summarise(
    n_obs        = n(),
    mean_dist    = mean(dist_norm, na.rm = TRUE),
    sd_dist      = sd(dist_norm,   na.rm = TRUE),
    .groups = "drop"
  )

# Error per species
species_err <- landmark_err %>%
  group_by(Genus_species) %>%
  summarise(
    n_obs        = n(),
    mean_dist    = mean(dist_norm, na.rm = TRUE),
    sd_dist      = sd(dist_norm,   na.rm = TRUE),
    .groups = "drop"
  )

# Error per site
site_err <- landmark_err %>%
  group_by(Site) %>%
  summarise(
    n_obs        = n(),
    mean_dist    = mean(dist_norm, na.rm = TRUE),
    sd_dist      = sd(dist_norm,   na.rm = TRUE),
    .groups = "drop"
  )

# Global error (all individuals, landmarks, and replicates)
global_err <- landmark_err %>%
  summarise(
    n_obs        = n(),
    mean_dist    = mean(dist_norm, na.rm = TRUE),
    sd_dist      = sd(dist_norm,   na.rm = TRUE)
  )

# Error per landmark point (ranked by decreasing mean error)
point_err <- landmark_err %>%
  group_by(landmark) %>%
  summarise(
    n_obs        = n(),
    mean_dist    = mean(dist_norm, na.rm = TRUE),
    sd_dist      = sd(dist_norm,   na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_dist))


# -----------------------------------------------------------------------------
# 7. Export results
# -----------------------------------------------------------------------------

dir.create("output", showWarnings = FALSE)

write_csv(landmark_err, "output/erreur_landmark_par_individu.csv")
write_csv(indiv_err,    "output/erreur_par_individu.csv")
write_csv(species_err,  "output/erreur_par_espece.csv")
write_csv(site_err,     "output/erreur_par_site.csv")
write_csv(global_err,   "output/erreur_globale.csv")
write_csv(point_err,    "output/erreur_par_landmark.csv")


# -----------------------------------------------------------------------------
# 8. Visualization — digitization error by individual
# -----------------------------------------------------------------------------

p_indiv <- ggplot(
  indiv_err,
  aes(x = reorder(Code, sd_dist), y = sd_dist, fill = Genus_species)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Erreur de mesure par individu",
    x     = "Individu (Code)",
    y     = "Écart-type des distances normalisées (%)"
  ) +
  theme_minimal(base_size = 12)

ggsave("output/erreur_par_individu.png", p_indiv, width = 10, height = 8, dpi = 300)