diff --git a/R/RunResidenceExtraction_dplyr b/R/RunResidenceExtraction_dplyr new file mode 100644 index 0000000..54363c5 --- /dev/null +++ b/R/RunResidenceExtraction_dplyr @@ -0,0 +1,241 @@ +#' Extract Residence and Non-residence Events Within an Acoustic Detection Database +#' +#' @description +#' A vectorised, \pkg{dplyr}-based rewrite of the original \code{RunResidenceExtraction()}. +#' It quantifies how long a transmitter stayed within the detection field of a given +#' location (a *residence*), and the movements between consecutive locations +#' (a *non-residence*). Logic is fully vectorised, so it runs orders of magnitude +#' faster than the original record-by-record loop. Per-tag work is mapped with +#' \pkg{furrr}, so multiple tags can be processed in parallel. +#' +#' Residency is qualified two ways, used together: +#' 1. a maximum gap between successive detections (timeout) defines the raw events, and +#' 2. an optional *daily* threshold - a minimum number of detections on each day, +#' across a minimum number of calendar days - for an event to count as residency. +#' The per-day floor also applies to day one, so an event needs at least that many +#' detections to ever be 'real'. +#' +#' Non-residences are returned in a long arrival/departure shape: each movement becomes +#' two rows (one departure, one arrival) sharing a movement_id, with a haversine distance +#' (km) attached so you have how far each movement was. sex is carried through to all outputs. +#' +#' For background on the residency concept, see: +#' https://link.springer.com/article/10.1186/s40462-022-00364-z +#' +#' written by Pablo Fuenzalida and various AI LLMs on June 2025, and merged on June 2026 +#' to become this function +#' +#' @param dat a tidy detection data frame containing (at least) the columns +#' tag_id, datetime, receiver_name, station_name, location, latitude, longitude, sex. +#' lat/lon are required for the haversine distance in the non-residence output. +#' @param location_col the spatial level to analyse residency at: one of +#' "location", "station_name" or "receiver_name". Default "station_name". +#' @param time_threshold_secs maximum gap (in seconds) allowed between successive detections +#' before a residence event is closed ("timeout"). +#' @param min_detections_per_day optional daily threshold: minimum detections required on *each* +#' day of an event (including day one) for it to qualify as residency. +#' Default NULL (no daily filter). +#' @param min_residence_days optional daily threshold: minimum number of calendar days (inclusive) +#' an event must span to qualify as residency. Default NULL (no daily filter). +#' @param cores number of cores to use for parallel processing across tags. Default 2. +#' +#' @return a list with three data frames: +#' \code{residences} (residence event table; includes both duration_secs and duration_days), +#' \code{residenceslog} (detection-level log), and +#' \code{nonresidences} (long arrival/departure movement table with haversine distance in km). +#' +#' @importFrom dplyr arrange group_by mutate ungroup summarise filter lead lag transmute bind_rows n row_number first last if_else if_all everything select +#' @importFrom lubridate as_date +#' @importFrom furrr future_map +#' @importFrom future plan multisession sequential +#' @importFrom geosphere distHaversine +#' @export +RunResidenceExtraction <- function(dat, + location_col = "station_name", + time_threshold_secs, + min_detections_per_day = NULL, # NULL = skip the daily threshold + min_residence_days = NULL, # NULL = skip the daily threshold + cores = 2) { + + # --- setup -------------------------------------------------------------------- + + # which column holds the spatial level we're analysing residency at + if (!location_col %in% c("location", "station_name", "receiver_name")) # guard bad input + stop("location_col must be one of 'location', 'station_name' or 'receiver_name'.") + + # are we applying the daily thresholds? only if BOTH daily args are supplied + use_daily_threshold <- !is.null(min_detections_per_day) && !is.null(min_residence_days) + + # standardise into a tidy frame, copying the chosen level into a generic 'loc' column. + # we keep station_name + coordinates for the non-residence output, and sex to carry through. + dat <- dat %>% # start from the detection data + dplyr::mutate( + tag_id = as.character(tag_id), # tag id as chr (grouping key) + datetime = as.POSIXct(datetime), # ensure datetime is POSIXct + loc = as.character(.data[[location_col]]),# chosen residency level + station_name = as.character(station_name), # carried over for the output + latitude = as.numeric(latitude), # needed for haversine distance + longitude = as.numeric(longitude), # needed for haversine distance + sex = as.character(sex)) %>% # strip any names attr, carry through + dplyr::select(tag_id, datetime, loc, station_name, # keep only what we use downstream + latitude, longitude, sex) %>% + dplyr::filter(dplyr::if_all(dplyr::everything(), ~ !is.na(.))) %>% # drop rows with NAs (your rule) + dplyr::arrange(tag_id, datetime) # sort by tag then time (essential) + + # set up the parallel backend; tidy up on exit so we don't leave workers running + future::plan(future::multisession, workers = cores) # spin up worker sessions + on.exit(future::plan(future::sequential), add = TRUE) # return to sequential when done + + # split the data by tag so each tag is processed independently and in parallel + tag_data <- split(dat, dat$tag_id) # one list element per tag id + + # --- residences --------------------------------------------------------------- + + # per-tag worker: extract residence events for a single tag's detections + extract_residence <- function(df_tag) { # df_tag = all dets for one tag + events <- df_tag %>% # already time-ordered upstream + dplyr::mutate( + time_gap = as.numeric(difftime(datetime, # gap between consecutive dets + dplyr::lag(datetime), # previous detection's time + units = "secs")), + # a new residence event starts on the first row, after a timeout, or on a location change + new_event = dplyr::if_else(is.na(time_gap) | # first detection of the tag + time_gap > time_threshold_secs | # gap exceeds timeout + loc != dplyr::lag(loc),# moved to a new location + 1L, 0L), + event_id = cumsum(new_event),# cumulative counter labels events + day = lubridate::as_date(datetime))# calendar day for daily thresholds + + # summarise each event to one row, computing daily stats we may filter on below + events_summ <- events %>% + dplyr::group_by(event_id) %>%# collapse each event to one row + dplyr::summarise( + start_datetime = dplyr::first(datetime), # first detection in the event + end_datetime = dplyr::last(datetime), # last detection in the event + tag_id = dplyr::first(tag_id), # carry the tag id over + loc = dplyr::first(loc),# the location of this residence + station_name = dplyr::first(station_name),# carried over for the output + latitude = dplyr::first(latitude), # event location lat + longitude = dplyr::first(longitude), # event location lon + sex = dplyr::first(sex), # carry sex through + n_detections = dplyr::n(), # number of detections in the event + n_days_incl = as.integer(max(day) - min(day)) + 1L, # inclusive calendar-day span + # how many days within the event met the per-day detection threshold (incl. day one) + days_meeting_threshold = if (use_daily_threshold) + sum(as.numeric(table(day)) >= min_detections_per_day) else NA_integer_, + .groups = "drop") + + # apply the optional daily threshold: span enough days AND meet the per-day count on every day. + # the per-day floor also covers day one, so a one-detection blip can never be a residence. + if (use_daily_threshold) { + events_summ <- events_summ %>% + dplyr::filter(n_days_incl >= min_residence_days,# long enough in calendar days + days_meeting_threshold == n_days_incl) # every day meets the per-day count + } + + events_summ %>% + dplyr::mutate( + duration_secs = as.numeric(difftime(end_datetime, start_datetime, units = "secs")), # secs + duration_days = round(as.numeric(difftime(end_datetime, start_datetime, # days + units = "days")), 2), + residence_event = dplyr::row_number()) %>% # renumber events 1..n per tag + dplyr::select(start_datetime, end_datetime, residence_event, # tidy column order + tag_id, loc, station_name, latitude, longitude, + sex, duration_secs, duration_days, n_detections) + } + + # run the residence extraction across all tags in parallel, then stack into one frame + residences <- furrr::future_map(tag_data, extract_residence) %>% # parallel per-tag map + dplyr::bind_rows() # combine all tags' residences + + # --- non-residences (long arrival / departure shape) -------------------------- + + # build the movement base: connect each residence event with the next using lead(). + # one row here = one movement (departure location -> arrival location). + move_base <- residences %>% + dplyr::arrange(tag_id, start_datetime) %>% # order events within each tag + dplyr::group_by(tag_id) %>%# work per tag + dplyr::mutate( + next_loc = dplyr::lead(loc),# location moved *to* + next_station = dplyr::lead(station_name),# station moved *to* + next_time = dplyr::lead(start_datetime), # arrival time at next location + next_latitude = dplyr::lead(latitude),# arrival lat + next_longitude = dplyr::lead(longitude)) %>%# arrival lon + dplyr::filter(!is.na(next_loc),# drop final event (no move after) + loc != next_loc) %>% # a movement = a change in location + dplyr::mutate(movement_id = dplyr::row_number()) %>% # id links a departure to its arrival + dplyr::ungroup() + + # haversine distance (km) for each movement, applied to both rows of the movement. + # accounts for the earth being a sphere, so a little better than a direct distance + move_base <- move_base %>% + dplyr::mutate( + distance = round(geosphere::distHaversine( + cbind(longitude, latitude),# departure point (lon, lat) + cbind(next_longitude, next_latitude)# arrival point (lon, lat) + ) / 1000, 2))# metres -> km, 2 dp + + # departure rows: detection 1 of each movement (the location left behind) + departures <- move_base %>% + dplyr::transmute(tag_id,# tag id + datetime = end_datetime, # time the tag departed + loc,# location departed from + station_name, # station departed from + latitude, longitude,# departure coordinates + sex, # carry sex through + movement = "departure", # row type + movement_id, # links to the matching arrival + distance) # haversine distance of the move (km) + + # arrival rows: detection 2 of each movement (the location moved to) + arrivals <- move_base %>% + dplyr::transmute(tag_id, # tag id + datetime = next_time, # time the tag arrived + loc = next_loc, # location arrived at + station_name = next_station, # station arrived at + latitude = next_latitude, # arrival coordinates + longitude = next_longitude, + sex, # carry sex through + movement = "arrival", # row type + movement_id, # links to the matching departure + distance) # same distance on both rows + + # combine into the long arrival/departure database (double the rows of move_base) + nonresidences <- dplyr::bind_rows(departures, arrivals) %>% + dplyr::arrange(tag_id, movement_id, movement, datetime) + + # --- residences log ------------------------------------------------------------ + + # the log is the detection-level record of which event each ping belongs to. + extract_log <- function(df_tag) { # rebuild the per-detection labelling + df_tag %>% + dplyr::mutate( + time_gap = as.numeric(difftime(datetime, dplyr::lag(datetime), units = "secs")), + new_event = dplyr::if_else(is.na(time_gap) | + time_gap > time_threshold_secs | + loc != dplyr::lag(loc), 1L, 0L), + residence_event = cumsum(new_event)) %>% # same event id as above + dplyr::group_by(residence_event) %>% + dplyr::mutate(record = dplyr::row_number() - 1L, # 0-indexed record within event + elapsed = dplyr::if_else(is.na(time_gap), 0, time_gap)) %>% # 0 on first ping + dplyr::ungroup() %>% + dplyr::transmute(datetime, residence_event, record, # tidy log column order + tag_id, loc, sex, elapsed) # sex carried through + } + + # build the log across all tags in parallel and stack it + residenceslog <- furrr::future_map(tag_data, extract_log) %>% + dplyr::bind_rows() + + # --- tidy names to match the chosen level ------------------------------------- + + # restore the user-facing location column name (e.g. location / station_name / receiver_name) + names(residences)[names(residences) == "loc"] <- location_col + names(residenceslog)[names(residenceslog) == "loc"] <- location_col + names(nonresidences)[names(nonresidences) == "loc"] <- location_col + + # return the three data frames as a list + list(residences = as.data.frame(residences), # residence event table + residenceslog = as.data.frame(residenceslog),# detection-level log + nonresidences = as.data.frame(nonresidences)) # long arrival/departure + distance +}