Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 241 additions & 0 deletions R/RunResidenceExtraction_dplyr
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#' Extract Residence and Non-residence Events Within an Acoustic Detection Database
#'
#' @description
#' A vectorised, \pkg{dplyr}-based rewrite of the original \code{RunResidenceExtraction()}.
#' It quantifies how long a transmitter stayed within the detection field of a given
#' location (a *residence*), and the movements between consecutive locations
#' (a *non-residence*). Logic is fully vectorised, so it runs orders of magnitude
#' faster than the original record-by-record loop. Per-tag work is mapped with
#' \pkg{furrr}, so multiple tags can be processed in parallel.
#'
#' Residency is qualified two ways, used together:
#' 1. a maximum gap between successive detections (timeout) defines the raw events, and
#' 2. an optional *daily* threshold - a minimum number of detections on each day,
#' across a minimum number of calendar days - for an event to count as residency.
#' The per-day floor also applies to day one, so an event needs at least that many
#' detections to ever be 'real'.
#'
#' Non-residences are returned in a long arrival/departure shape: each movement becomes
#' two rows (one departure, one arrival) sharing a movement_id, with a haversine distance
#' (km) attached so you have how far each movement was. sex is carried through to all outputs.
#'
#' For background on the residency concept, see:
#' https://link.springer.com/article/10.1186/s40462-022-00364-z
#'
#' written by Pablo Fuenzalida and various AI LLMs on June 2025, and merged on June 2026
#' to become this function
#'
#' @param dat a tidy detection data frame containing (at least) the columns
#' tag_id, datetime, receiver_name, station_name, location, latitude, longitude, sex.
#' lat/lon are required for the haversine distance in the non-residence output.
#' @param location_col the spatial level to analyse residency at: one of
#' "location", "station_name" or "receiver_name". Default "station_name".
#' @param time_threshold_secs maximum gap (in seconds) allowed between successive detections
#' before a residence event is closed ("timeout").
#' @param min_detections_per_day optional daily threshold: minimum detections required on *each*
#' day of an event (including day one) for it to qualify as residency.
#' Default NULL (no daily filter).
#' @param min_residence_days optional daily threshold: minimum number of calendar days (inclusive)
#' an event must span to qualify as residency. Default NULL (no daily filter).
#' @param cores number of cores to use for parallel processing across tags. Default 2.
#'
#' @return a list with three data frames:
#' \code{residences} (residence event table; includes both duration_secs and duration_days),
#' \code{residenceslog} (detection-level log), and
#' \code{nonresidences} (long arrival/departure movement table with haversine distance in km).
#'
#' @importFrom dplyr arrange group_by mutate ungroup summarise filter lead lag transmute bind_rows n row_number first last if_else if_all everything select
#' @importFrom lubridate as_date
#' @importFrom furrr future_map
#' @importFrom future plan multisession sequential
#' @importFrom geosphere distHaversine
#' @export
RunResidenceExtraction <- function(dat,
location_col = "station_name",
time_threshold_secs,
min_detections_per_day = NULL, # NULL = skip the daily threshold
min_residence_days = NULL, # NULL = skip the daily threshold
cores = 2) {

# --- setup --------------------------------------------------------------------

# which column holds the spatial level we're analysing residency at
if (!location_col %in% c("location", "station_name", "receiver_name")) # guard bad input
stop("location_col must be one of 'location', 'station_name' or 'receiver_name'.")

# are we applying the daily thresholds? only if BOTH daily args are supplied
use_daily_threshold <- !is.null(min_detections_per_day) && !is.null(min_residence_days)

# standardise into a tidy frame, copying the chosen level into a generic 'loc' column.
# we keep station_name + coordinates for the non-residence output, and sex to carry through.
dat <- dat %>% # start from the detection data
dplyr::mutate(
tag_id = as.character(tag_id), # tag id as chr (grouping key)
datetime = as.POSIXct(datetime), # ensure datetime is POSIXct
loc = as.character(.data[[location_col]]),# chosen residency level
station_name = as.character(station_name), # carried over for the output
latitude = as.numeric(latitude), # needed for haversine distance
longitude = as.numeric(longitude), # needed for haversine distance
sex = as.character(sex)) %>% # strip any names attr, carry through
dplyr::select(tag_id, datetime, loc, station_name, # keep only what we use downstream
latitude, longitude, sex) %>%
dplyr::filter(dplyr::if_all(dplyr::everything(), ~ !is.na(.))) %>% # drop rows with NAs (your rule)
dplyr::arrange(tag_id, datetime) # sort by tag then time (essential)

# set up the parallel backend; tidy up on exit so we don't leave workers running
future::plan(future::multisession, workers = cores) # spin up worker sessions
on.exit(future::plan(future::sequential), add = TRUE) # return to sequential when done

# split the data by tag so each tag is processed independently and in parallel
tag_data <- split(dat, dat$tag_id) # one list element per tag id

# --- residences ---------------------------------------------------------------

# per-tag worker: extract residence events for a single tag's detections
extract_residence <- function(df_tag) { # df_tag = all dets for one tag
events <- df_tag %>% # already time-ordered upstream
dplyr::mutate(
time_gap = as.numeric(difftime(datetime, # gap between consecutive dets
dplyr::lag(datetime), # previous detection's time
units = "secs")),
# a new residence event starts on the first row, after a timeout, or on a location change
new_event = dplyr::if_else(is.na(time_gap) | # first detection of the tag
time_gap > time_threshold_secs | # gap exceeds timeout
loc != dplyr::lag(loc),# moved to a new location
1L, 0L),
event_id = cumsum(new_event),# cumulative counter labels events
day = lubridate::as_date(datetime))# calendar day for daily thresholds

# summarise each event to one row, computing daily stats we may filter on below
events_summ <- events %>%
dplyr::group_by(event_id) %>%# collapse each event to one row
dplyr::summarise(
start_datetime = dplyr::first(datetime), # first detection in the event
end_datetime = dplyr::last(datetime), # last detection in the event
tag_id = dplyr::first(tag_id), # carry the tag id over
loc = dplyr::first(loc),# the location of this residence
station_name = dplyr::first(station_name),# carried over for the output
latitude = dplyr::first(latitude), # event location lat
longitude = dplyr::first(longitude), # event location lon
sex = dplyr::first(sex), # carry sex through
n_detections = dplyr::n(), # number of detections in the event
n_days_incl = as.integer(max(day) - min(day)) + 1L, # inclusive calendar-day span
# how many days within the event met the per-day detection threshold (incl. day one)
days_meeting_threshold = if (use_daily_threshold)
sum(as.numeric(table(day)) >= min_detections_per_day) else NA_integer_,
.groups = "drop")

# apply the optional daily threshold: span enough days AND meet the per-day count on every day.
# the per-day floor also covers day one, so a one-detection blip can never be a residence.
if (use_daily_threshold) {
events_summ <- events_summ %>%
dplyr::filter(n_days_incl >= min_residence_days,# long enough in calendar days
days_meeting_threshold == n_days_incl) # every day meets the per-day count
}

events_summ %>%
dplyr::mutate(
duration_secs = as.numeric(difftime(end_datetime, start_datetime, units = "secs")), # secs
duration_days = round(as.numeric(difftime(end_datetime, start_datetime, # days
units = "days")), 2),
residence_event = dplyr::row_number()) %>% # renumber events 1..n per tag
dplyr::select(start_datetime, end_datetime, residence_event, # tidy column order
tag_id, loc, station_name, latitude, longitude,
sex, duration_secs, duration_days, n_detections)
}

# run the residence extraction across all tags in parallel, then stack into one frame
residences <- furrr::future_map(tag_data, extract_residence) %>% # parallel per-tag map
dplyr::bind_rows() # combine all tags' residences

# --- non-residences (long arrival / departure shape) --------------------------

# build the movement base: connect each residence event with the next using lead().
# one row here = one movement (departure location -> arrival location).
move_base <- residences %>%
dplyr::arrange(tag_id, start_datetime) %>% # order events within each tag
dplyr::group_by(tag_id) %>%# work per tag
dplyr::mutate(
next_loc = dplyr::lead(loc),# location moved *to*
next_station = dplyr::lead(station_name),# station moved *to*
next_time = dplyr::lead(start_datetime), # arrival time at next location
next_latitude = dplyr::lead(latitude),# arrival lat
next_longitude = dplyr::lead(longitude)) %>%# arrival lon
dplyr::filter(!is.na(next_loc),# drop final event (no move after)
loc != next_loc) %>% # a movement = a change in location
dplyr::mutate(movement_id = dplyr::row_number()) %>% # id links a departure to its arrival
dplyr::ungroup()

# haversine distance (km) for each movement, applied to both rows of the movement.
# accounts for the earth being a sphere, so a little better than a direct distance
move_base <- move_base %>%
dplyr::mutate(
distance = round(geosphere::distHaversine(
cbind(longitude, latitude),# departure point (lon, lat)
cbind(next_longitude, next_latitude)# arrival point (lon, lat)
) / 1000, 2))# metres -> km, 2 dp

# departure rows: detection 1 of each movement (the location left behind)
departures <- move_base %>%
dplyr::transmute(tag_id,# tag id
datetime = end_datetime, # time the tag departed
loc,# location departed from
station_name, # station departed from
latitude, longitude,# departure coordinates
sex, # carry sex through
movement = "departure", # row type
movement_id, # links to the matching arrival
distance) # haversine distance of the move (km)

# arrival rows: detection 2 of each movement (the location moved to)
arrivals <- move_base %>%
dplyr::transmute(tag_id, # tag id
datetime = next_time, # time the tag arrived
loc = next_loc, # location arrived at
station_name = next_station, # station arrived at
latitude = next_latitude, # arrival coordinates
longitude = next_longitude,
sex, # carry sex through
movement = "arrival", # row type
movement_id, # links to the matching departure
distance) # same distance on both rows

# combine into the long arrival/departure database (double the rows of move_base)
nonresidences <- dplyr::bind_rows(departures, arrivals) %>%
dplyr::arrange(tag_id, movement_id, movement, datetime)

# --- residences log ------------------------------------------------------------

# the log is the detection-level record of which event each ping belongs to.
extract_log <- function(df_tag) { # rebuild the per-detection labelling
df_tag %>%
dplyr::mutate(
time_gap = as.numeric(difftime(datetime, dplyr::lag(datetime), units = "secs")),
new_event = dplyr::if_else(is.na(time_gap) |
time_gap > time_threshold_secs |
loc != dplyr::lag(loc), 1L, 0L),
residence_event = cumsum(new_event)) %>% # same event id as above
dplyr::group_by(residence_event) %>%
dplyr::mutate(record = dplyr::row_number() - 1L, # 0-indexed record within event
elapsed = dplyr::if_else(is.na(time_gap), 0, time_gap)) %>% # 0 on first ping
dplyr::ungroup() %>%
dplyr::transmute(datetime, residence_event, record, # tidy log column order
tag_id, loc, sex, elapsed) # sex carried through
}

# build the log across all tags in parallel and stack it
residenceslog <- furrr::future_map(tag_data, extract_log) %>%
dplyr::bind_rows()

# --- tidy names to match the chosen level -------------------------------------

# restore the user-facing location column name (e.g. location / station_name / receiver_name)
names(residences)[names(residences) == "loc"] <- location_col
names(residenceslog)[names(residenceslog) == "loc"] <- location_col
names(nonresidences)[names(nonresidences) == "loc"] <- location_col

# return the three data frames as a list
list(residences = as.data.frame(residences), # residence event table
residenceslog = as.data.frame(residenceslog),# detection-level log
nonresidences = as.data.frame(nonresidences)) # long arrival/departure + distance
}