diff --git a/DESCRIPTION b/DESCRIPTION index 46b1f38..b4aa44e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,12 @@ Package: BayesianWebs Title: Bayesian Modelling of Bipartite Networks -Version: 0.0.7 +Version: 0.0.8 Authors@R: c( person("Francisco", "Rodriguez-Sanchez", , "f.rodriguez.sanc@gmail.com", role = c("aut", "cre"), - comment = c(ORCID = "0000-0002-7981-1599")) + comment = c(ORCID = "0000-0002-7981-1599")), + person("William K.", "Petry", , "wpetry@ncsu.edu", role = "ctb", + comment = c(ORCID = "0000-0002-5230-5987")) ) Description: Bayesian modelling of bipartite network structure, following the approach of Young et al. . @@ -20,9 +22,13 @@ Imports: ggplot2, network.tools, tidybayes +Suggests: + rlang, + testthat (>= 3.0.0) Remotes: stan-dev/cmdstanr, Pakillo/network.tools Depends: R (>= 4.1) LazyData: true +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index f0ce0b1..9ff6f22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,4 +12,7 @@ export(plot_prior) export(plot_residuals) export(predict_counts) export(prepare_data) +export(stancode) import(cmdstanr) +importFrom(rlang,"!!!") +importFrom(rlang,parse_exprs) diff --git a/R/check_model.R b/R/check_model.R index 00567c0..fbc73cf 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -9,7 +9,7 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) #' check_model(fit, data = data) @@ -38,9 +38,3 @@ check_lp <- function(fit = NULL) { gg } - - - - - - diff --git a/R/fit_model.R b/R/fit_model.R index 7841bbc..91b1e99 100644 --- a/R/fit_model.R +++ b/R/fit_model.R @@ -1,51 +1,125 @@ -#' Fit model +#' Fit bipartite network model #' #' @param data A named list containing the required data, as obtained from #' [prepare_data()]. -#' @param model character. One of "Young2021", "sampling_effort", or -#' "varying_preferences", or a path to a file describing the Stan model in case -#' you want to use a modified Stan model. -#' @param beta Rate of exponential prior on `r` (preference) parameter. -#' Default beta is 0.01. Increase it if you have large count numbers -#' (can examine the resultant prior using [plot_prior()]). -#' @param ... Further arguments for [cmdstanr::sample()], like `iter_warmup`, -#' `iter_sampling`, or `thin`, among others. It is recommended to increase the -#' number of iterations (e.g. iter_sampling = 10000). +#' @param plant_effort How to model the sampling effort allocated to each plant. If unknown, `"param"` (default) estimates a single parameter during the modelling step that applies to all plants, \eqn{C}. Alternatively, `"data"` allows measured values of sampling effort to be supplied in `data`, \eqn{C_i}. +#' @param animal_pref How to model the increase in visits of animals to plants with which they share a link relative to unlinked plants. `"fixed"` (default), fits a single preference parameter for all animals during the modelling step, \eqn{r}. `"varying"` fits a separate preference parameter for each animal type, \eqn{r_i}. +#' @param plant_abun How the model treats the abundances of interacting plants. `"param"` (default) treats these abundances as a parameter that the model estimates, \eqn{\sigma_i}. `"data"` treats these abundances as measured and requires the user to supply the absolute abundances of each plant in `data`. +#' @param animal_abun How the model treats the abundances of interacting animals. `"param"` (default) treats these abundances as a parameter that the model estimates, \eqn{\tau_i}. `"data"` treats these abundances as measured and requires the user to supply the absolute abundances of each animal in `data`. +#' @param beta Rate parameter of exponential prior on the preference parameter or parameters \eqn{r} or \eqn{r_i}. The default, 0.01, is suitable for small counts and should be increased for larger counts. Use `plot_prior()` to visualize. +#' @param alpha_sigma Vector of concentration parameters for Dirichlet prior on the relative abundances of interacting plants, \eqn{\sigma_i}, when `plant_abun = "param"`. Ignored otherwise. Defaults to 1. +#' @param alpha_tau Concentration parameters of Dirichlet prior on the relative abundances of interacting plants, \eqn{\tau_i}, when `animal_abun = "param"`. Ignored otherwise. Defaults to 1. +#' @param ... Further arguments passed to [cmdstanr::sample()], like `iter_warmup`, +#' `iter_sampling`, or `thin`, among others. #' #' @return A fitted model ([cmdstanr::CmdStanMCMC()] object). #' @export #' @import cmdstanr #' +#' @details +#' This function allows fits generalized variations of the bipartite network structure model of Young et al. 2021 [](https://doi.org/10.1038%2Fs41467-021-24149-x). Briefly, the observed interaction count between each plant and animal pair is treated as a random draw from a Poisson distribution with a mean of: +#' \deqn{\mu_{ij} = C_i\sigma_i\tau_j\left(1 + r_iB_{ij}\right).} +#' Here, the subscripts \eqn{i} and \eqn{j} are indices that correspond to the plant (row) and animal (column) species, respectively. \eqn{C_i} is the observer's sampling effort allocated to each plant species; we expect to see more interactions with plants the longer they are observed. \eqn{\sigma_i} and \eqn{\tau_i} are the relative contributions of each plant and animal to the total pool of interactions. We expect to observe higher interaction counts for species that contribute most to the interactions made by their guild. The observed interaction count should be compounded when species of both guilds are those that are most interactive. Importantly, these terms capture more than just each species' relative abundances in the their guild. An animal that is highly active but numerically rare animal could have a higher \eqn{\tau_i} than a numerically dominant, but less active animal. \eqn{r_i} is the increase in visits of animals to plants with which they share a link relative to unlinked plants, and \eqn{B_{ij}} is the inferred absence (0) or presence (1) of a link between the plant-animal pair. +#' +#' \bold{Prior distributions} +#' +#' The default priors are very weak to accommodate a wide range of empirical datasets. They may fail for exceptional datasets and they do not capture system-specific prior information that users may have from additional data. +#' +#' The animal preference parameter, \eqn{r_i}, has an exponential prior with a rate parameter `beta`. The default, \eqn{\mathrm{exp}(0.01)}, implies a mean of 1/0.01 = 100 with 95% of the probability mass falling between about 2.5-370. Increase `beta` for animal communities that weakly increase visitation in the presence of a link, or decrease for communities that strongly increase visitation in the presence of a link. +#' +#' The relative interaction abundances for plants, \eqn{\sigma_i}, has a Dirichlet prior that accepts a vector of concentration parameters, `alpha_sigma`, with a length equal to the number of plant species (rows) in the interaction matrix. The default, \eqn{\mathrm{Dir}(1, ..., 1)}, implies a uniform distribution of proportions. Increasing some of the concentration parameters will shift the probability mass to favor higher relative interaction abundances for the corresponding plants. Increasing all concentration parameters while maintaining their relative sizes reduces the variance. +#' +#' The relative interaction abundances for animals, \eqn{\tau_j}, has a Dirichlet prior that accepts a vector of concentration parameters, `alpha_tau`, with a length equal to the number of animal species (columns) in the interaction matrix. The default, \eqn{\mathrm{Dir}(1, ..., 1)}, implies a uniform distribution of proportions. Increasing some of the concentration parameters will shift the probability mass to favor higher relative interaction abundances for the corresponding animals Increasing all concentration parameters while maintaining their relative sizes reduces the variance. +#' +#' At this time, prior choices have not been implemented for sampling effort, \eqn{C_i}, when `plant_effort = "param"`. +#' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) -#' fit <- fit_model(dt) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) +#' # fit the model used by Young et al. 2021 +#' fit <- fit_model(dt, plant_effort = "param", animal_pref = "fixed", +#' plant_abun = "param", animal_abun = "param") +#' # changing options and priors +#' fit2 <- fit_model(dt, plant_effort = "data", animal_pref = "varying", +#' plant_abun = "param", animal_abun = "param", +#' beta = rep(0.01, 21), +#' alpha_sigma = c(2.5, 3, 4, 3, 1, 1, 5, 2), +#' alpha_tau = 2) fit_model <- function(data = NULL, - model = c("sampling_effort", "Young2021", "varying_preferences"), + plant_effort = c("param", "data"), + animal_pref = c("fixed", "varying"), + plant_abun = c("param", "data"), + animal_abun = c("param", "data"), beta = 0.01, + alpha_sigma = 1, + alpha_tau = 1, ...) { if (is.null(data)) { stop("Please provide the data as produced by 'prepare_data()'") } - model <- match.arg(model) # TODO: remove this later to allow for user models + plant_effort <- match.arg(plant_effort) + animal_pref <- match.arg(animal_pref) + plant_abun <- match.arg(plant_abun) + animal_abun <- match.arg(animal_abun) - if (model %in% c("sampling_effort", "Young2021", "varying_preferences")) { - mod.stan <- system.file(paste0("models/", model, ".stan"), package = "BayesianWebs") - } else { - mod.stan <- model + ## Check that data provided are suitable for the specified model variant + if (plant_effort == "data" & is.null(data$C)) stop('Sampling effort, C, must be supplied in data when plant_effort = "data"') + if (plant_abun == "data") { + if (is.null(data$abun_p)) stop('Plant abundances, abun_p, must be supplied in `data` when plant_abun = "data"') } + if (animal_abun == "data") { + if (is.null(data$abun_a)) stop('Animal abundances, abun_a, must be supplied in `data` when animal_abun = "data"') + } + stopifnot(beta > 0, alpha_sigma > 0, alpha_tau > 0) - ## Add beta to data file - stopifnot(beta > 0) + ## Check prior parameters and compose prior parameter vectors + if (animal_pref == "fixed") { + stopifnot(length(beta) == 1L) + } else if (animal_pref == "varying") { + if (length(beta) == 1L) { + message("Single value supplied to 'beta' will be used as the prior parameter for each animal.") + beta <- rep(beta, times = ncol(data$M)) + } else if (length(beta) != ncol(data$M)) { + stop("`beta` must have length 1 (same value applied to each animal) or be equal to the number of animal species.") + } + } + if (plant_abun == "param") { + if (length(alpha_sigma) == 1L) { + message("Single value supplied to 'alpha_sigma' will be used as the prior parameter for each plant.") + alpha_sigma <- rep(alpha_sigma, times = nrow(data$M)) + } else if (length(alpha_sigma) != nrow(data$M)) { + stop("`alpha_sigma` must have length 1 (same value applied to each plant) or be equal to the number of plant species.") + } + } + if (animal_abun == "param") { + if (length(alpha_tau) == 1L) { + message("Single value supplied to 'alpha_tau' will be used as the prior parameter for each animal.") + alpha_tau <- rep(alpha_tau, ncol(data$M)) + } else if (length(alpha_tau) != ncol(data$M)) { + stop("`alpha_tau` must have length 1 (same value applied to each animal) or be equal to the number of animal species.") + } + } + + + ## Add priors to data list data$beta <- beta + data$alpha_sigma <- alpha_sigma + data$alpha_tau <- alpha_tau + ## Make Stan code + ## TODO: Pre-compile all model variants during package installation? + mod.stan <- stancode(plant_effort = plant_effort, + animal_pref = animal_pref, + plant_abun = plant_abun, + animal_abun = animal_abun) + tf <- cmdstanr::write_stan_file(mod.stan) - ## compile model - model.eff <- cmdstanr::cmdstan_model(mod.stan) + ## Compile model + model.eff <- cmdstanr::cmdstan_model(tf) + ## Fit model fit <- model.eff$sample(data, ...) - } diff --git a/R/get_posterior.R b/R/get_posterior.R index 7a3021b..4e99591 100644 --- a/R/get_posterior.R +++ b/R/get_posterior.R @@ -5,11 +5,12 @@ #' @param param character. Name of the parameter to retrieve the posterior samples. #' #' @return A data frame +#' @importFrom rlang parse_exprs !!! #' @export #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) #' get_posterior(fit, dt, param = "connectance") #' @@ -29,64 +30,57 @@ get_posterior <- function(fit = NULL, "animal.abund", "int.prob", "link")) { - - # r = avg. visits from mutualists (preference) - # rho = connectance - # sigma = plant.abund - # tau = animal.abund - # Q = interaction probability - + ## parse arguments & construct parameter set to extract param <- match.arg(param) - - params_all <- function(fit) { - if (fit$metadata()$model_name == "varying_preferences_model") { - out <- tidybayes::spread_draws(fit, rho, r[Animal], sigma[Plant], tau[Animal], Q[Plant, Animal]) - } else { - out <- tidybayes::spread_draws(fit, rho, r, sigma[Plant], tau[Animal], Q[Plant, Animal]) - } - return(out) + all_vars <- tidybayes::get_variables(fit) + if (sum(grepl("^r(\\[\\d+\\])?$", all_vars)) >= 1) { + sel_r <- "r[Animal]" + } else { + sel_r <- "r" } - - params_preference <- function(fit) { - if (fit$metadata()$model_name == "varying_preferences_model") { - out <- tidybayes::spread_draws(fit, r[Animal]) - } else { - out <- tidybayes::spread_draws(fit, r) - } - return(out) + if (sum(grepl("^sigma", all_vars)) > 1) { + sel_sigma <- "sigma[Plant]" + } else if (sum(grepl("^sigma", all_vars)) == 1) { + sel_sigma <- "sigma" + } else { + sel_sigma <- NULL + } + if (sum(grepl("^tau", all_vars)) > 1) { + sel_tau <- "tau[Animal]" + } else if (sum(grepl("^tau", all_vars)) == 1) { + sel_tau <- "tau" + } else { + sel_tau <- NULL } - post <- switch( param, - all = params_all(fit), + all = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(c("rho", sel_r, sel_sigma, + sel_tau, "Q[Plant, Animal]"))), connectance = tidybayes::spread_draws(fit, rho), - preference = params_preference(fit), - plant.abund = tidybayes::spread_draws(fit, sigma[Plant]), - animal.abund = tidybayes::spread_draws(fit, tau[Animal]), - int.prob = tidybayes::spread_draws(fit, Q[Plant, Animal]), + preference = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_r)), + plant_abund = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_sigma)), + animal_abund = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_tau)), + int_prob = tidybayes::spread_draws(fit, Q[Plant, Animal]), link = tidybayes::spread_draws(fit, Q[Plant, Animal]), ) # use more informative names - param.names <- c( + param_names <- c( connectance = "rho", preference = "r", - plant.abund = "sigma", - animal.abund = "tau", - int.prob = "Q") + plant_abund = "sigma", + animal_abund = "tau", + int_prob = "Q") - post <- dplyr::rename(post, dplyr::any_of(param.names)) + post <- dplyr::rename(post, dplyr::any_of(param_names)) ## generate posteriors of link existence if (param == "all" | param == "link") { post <- is_there_link(post) } - - ## rename plants and animals with original labels - if ("Animal" %in% names(post)) { animals <- data.frame(Animal = 1:ncol(data$M), Animal.name = colnames(data$M)) post <- post |> @@ -102,6 +96,4 @@ get_posterior <- function(fit = NULL, } return(post) - - } diff --git a/R/globals.R b/R/globals.R index 108091f..657acd9 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,5 +1,5 @@ utils::globalVariables(c(".chain", ".draw", ".lower", ".upper", - "Animal", "Count.obs", "Count.pred", "Plant", "Q", - "Visits", "animal.abund", "count", "effort", - "int.prob", "lp__", "plant.abund", "preference", + "Animal", "Count_obs", "Count_pred", "Plant", "Q", + "Visits", "animal_abund", "count", "effort", + "int_prob", "lp__", "plant_abund", "preference", "prior", "r", "rho", "sigma", "tau")) diff --git a/R/is_there_link.R b/R/is_there_link.R index b8eacb1..6ebb9ba 100644 --- a/R/is_there_link.R +++ b/R/is_there_link.R @@ -9,8 +9,7 @@ #' is_there_link <- function(df = NULL) { - - stopifnot("int.prob" %in% names(df)) + stopifnot("int_prob" %in% names(df)) df |> - dplyr::mutate(link = stats::rbinom(1, size = 1, prob = int.prob)) + dplyr::mutate(link = stats::rbinom(1, size = 1, prob = int_prob)) } diff --git a/R/plot_counts_pred.R b/R/plot_counts_pred.R index 6f246fd..028c607 100644 --- a/R/plot_counts_pred.R +++ b/R/plot_counts_pred.R @@ -1,7 +1,9 @@ #' Plot heatmap of predicted counts #' -#' @param pred.df A data frame containing the predicted counts, as generated by +#' @param pred_df A data frame containing the predicted counts, as generated by #' [predict_counts()] +#' @param .width probability cutoff to determine the width of the resulting interval (default 0.95) +#' @param .point point summary function, which takes a vector and returns a single value, e.g. [mean()], [median()], or [Mode()] #' @param ... Further arguments for [network.tools::plot_web_heatmap()]. #' #' @return A ggplot object @@ -9,17 +11,17 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) -#' pred.df <- predict_counts(fit, dt) -#' plot_counts_pred(pred.df) -#' plot_counts_pred(pred.df, sort = FALSE) +#' pred_df <- predict_counts(fit, dt) +#' plot_counts_pred(pred_df) +#' plot_counts_pred(pred_df, sort = FALSE) -plot_counts_pred <- function(pred.df = NULL, ...) { +plot_counts_pred <- function(pred_df = NULL, .width = 0.95, .point = mean, ...) { - pred.df |> + pred_df |> dplyr::select(Plant, Animal, count) |> - tidybayes::mean_qi() |> + tidybayes::point_interval(.width = .width, .point = .point) |> network.tools::plot_web_heatmap(int.var = "count", ...) + ggplot2::labs(title = "Predicted counts", fill = "") } diff --git a/R/plot_counts_pred_obs.R b/R/plot_counts_pred_obs.R index bae0876..62d50fb 100644 --- a/R/plot_counts_pred_obs.R +++ b/R/plot_counts_pred_obs.R @@ -1,11 +1,12 @@ #' Plot predicted versus observed counts #' -#' @param pred.df A data frame containing the predicted counts, as generated by +#' @param pred_df A data frame containing the predicted counts, as generated by #' `predict_counts()` #' @param data Data list (from `prepare_data()`) #' @param byplant Logical. If TRUE, show predicted and observed counts per plant #' (using `ggplot2::facet_wrap()`). If FALSE, show all interactions in the same plot. -#' @param width width of the credible interval (default = 0.95). +#' @param .width probability cutoff to determine the width of the resulting interval (default 0.95). +#' @param .point point summary function, which takes a vector and returns a single value, e.g. [mean()], [median()], or [Mode()] #' @param ... Further arguments to be passed to `ggplot2::facet_wrap()` if #' `byplant = TRUE`, or to `tidybayes::geom_pointinterval()` if `byplant = FALSE`. #' @@ -14,22 +15,23 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) -#' pred.df <- predict_counts(fit, dt) -#' plot_counts_pred_obs(pred.df, dt) -#' plot_counts_pred_obs(pred.df, dt, fatten_point = 3) -#' plot_counts_pred_obs(pred.df, dt, byplant = TRUE) -#' plot_counts_pred_obs(pred.df, dt, byplant = TRUE, scale = "free") +#' pred_df <- predict_counts(fit, dt) +#' plot_counts_pred_obs(pred_df, dt) +#' plot_counts_pred_obs(pred_df, dt, fatten_point = 3) +#' plot_counts_pred_obs(pred_df, dt, byplant = TRUE) +#' plot_counts_pred_obs(pred_df, dt, byplant = TRUE, scale = "free") -plot_counts_pred_obs <- function(pred.df = NULL, data = NULL, byplant = FALSE, width = 0.95, ...) { +plot_counts_pred_obs <- function(pred_df = NULL, data = NULL, byplant = FALSE, .width = 0.95, + .point = mean, ...) { obs <- data$M |> - network.tools::wide2long(int.name = "Count.obs") + network.tools::wide2long(int.name = "Count_obs") - pred <- pred.df |> - dplyr::select(Plant, Animal, Count.pred = count) |> - tidybayes::mean_qi(.width = width) |> + pred <- pred_df |> + dplyr::select(Plant, Animal, Count_pred = count) |> + tidybayes::point_interval(.width = .width, .point = .point) |> dplyr::left_join(obs, by = c("Plant", "Animal")) gg <- ggplot2::ggplot(pred) + @@ -38,17 +40,14 @@ plot_counts_pred_obs <- function(pred.df = NULL, data = NULL, byplant = FALSE, w if (isTRUE(byplant)) { gg <- gg + - tidybayes::geom_pointinterval(ggplot2::aes(x = Count.obs, y = Count.pred, + tidybayes::geom_pointinterval(ggplot2::aes(x = Count_obs, y = Count_pred, ymin = .lower, ymax = .upper)) + ggplot2::facet_wrap(~Plant, ...) } else { gg <- gg + - tidybayes::geom_pointinterval(ggplot2::aes(x = Count.obs, y = Count.pred, + tidybayes::geom_pointinterval(ggplot2::aes(x = Count_obs, y = Count_pred, ymin = .lower, ymax = .upper), ...) } - return(gg) - - } diff --git a/R/plot_interaction_prob.R b/R/plot_interaction_prob.R index 990bc34..5a55a22 100644 --- a/R/plot_interaction_prob.R +++ b/R/plot_interaction_prob.R @@ -10,16 +10,16 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) #' post <- get_posterior(fit, dt) #' plot_interaction_prob(post) plot_interaction_prob <- function(post = NULL) { post |> - dplyr::select(Plant, Animal, int.prob) |> + dplyr::select(Plant, Animal, int_prob) |> tidybayes::mean_qi() |> - network.tools::plot_web_heatmap(int.var = "int.prob", sort = FALSE) + + network.tools::plot_web_heatmap(int.var = "int_prob", sort = FALSE) + ggplot2::labs(title = "Mean interaction probability", fill = "Probability") } diff --git a/R/plot_prior.R b/R/plot_prior.R index bede4d6..ebb9b3d 100644 --- a/R/plot_prior.R +++ b/R/plot_prior.R @@ -20,7 +20,7 @@ #' #' ## Providing fitted model #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) #' plot_prior(fit = fit, data = dt) @@ -66,14 +66,9 @@ plot_prior <- function(beta = NULL, fit = NULL, data = NULL) { ggplot2::geom_density(ggplot2::aes(preference), bounds = c(0, Inf), fill = "grey90", data = df.post, alpha = 0.5) - - } - } - return(gg) - } @@ -81,6 +76,6 @@ plot_prior <- function(beta = NULL, fit = NULL, data = NULL) { get_beta <- function(fit = NULL) { datafile <- readLines(fit$data_file()) # reading json beta.info <- datafile[grep("beta", datafile)] - beta <- as.numeric(gsub('.*beta\": ', "", beta.info)) + beta <- as.numeric(regmatches(beta.info, regexpr("\\d+(?:\\.\\d+)?(?:[eE][-+]?\\d+)?", beta.info))) return(beta) } diff --git a/R/plot_residuals.R b/R/plot_residuals.R index a47a4c5..d1e4dc4 100644 --- a/R/plot_residuals.R +++ b/R/plot_residuals.R @@ -1,8 +1,6 @@ -#' Plot heatmap of residuals -#' #' Plot heatmap of residuals (observed - predicted counts). #' -#' @param pred.df A data frame containing the predicted counts, as generated by +#' @param pred_df A data frame containing the predicted counts, as generated by #' [predict_counts()] #' @param data Data list (from `prepare_data()`) #' @param ... Further arguments for [network.tools::plot_web_heatmap()]. @@ -12,18 +10,18 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) -#' pred.df <- predict_counts(fit, dt) -#' plot_residuals(pred.df, dt) -#' plot_residuals(pred.df, dt, sort = FALSE) +#' pred_df <- predict_counts(fit, dt) +#' plot_residuals(pred_df, dt) +#' plot_residuals(pred_df, dt, sort = FALSE) -plot_residuals <- function(pred.df = NULL, data = NULL, ...) { +plot_residuals <- function(pred_df = NULL, data = NULL, ...) { obs <- data$M |> network.tools::wide2long() - pred <- pred.df |> + pred <- pred_df |> dplyr::select(Plant, Animal, count) |> tidybayes::mean_qi() |> dplyr::select(Plant, Animal, count) |> diff --git a/R/predict_counts.R b/R/predict_counts.R index be8b819..10665bc 100644 --- a/R/predict_counts.R +++ b/R/predict_counts.R @@ -11,24 +11,36 @@ #' #' @examplesIf interactive() #' data(web) -#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web))) +#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web))) #' fit <- fit_model(dt, refresh = 0) #' predict_counts(fit, dt) predict_counts <- function(fit = NULL, data = NULL) { - ## generate posteriors post <- get_posterior(fit, data, param = "all") - ## add sampling effort per plant - C <- data.frame(Plant = rownames(data$M), effort = data$C) - post <- dplyr::left_join(post, C, by = "Plant") + ## fill in plant sampling effort data if not estimated as a parameter + if (!"plant_effort" %in% names(post)) { + df_e <- data.frame(Plant = rownames(data$M), + plant_effort = unname(data$C)) + post <- dplyr::left_join(x = post, y = df_e, by = "Plant") + } + + ## fill abundance data if not estimated as a parameter + if (!"plant_abund" %in% names(post)) { + df_p <- data.frame(Plant = rownames(data$M), + plant_abund = data$abun_p / sum(data$abun_p)) + post <- dplyr::left_join(x = post, y = df_p, by = "Plant") + } + if (!"animal_abund" %in% names(post)) { + df_a <- data.frame(Animal = colnames(data$M), + animal_abund = data$abun_a / sum(data$abun_a)) + post <- dplyr::left_join(x = post, y = df_a, by = "Animal") + } ## calculate predicted counts post <- dplyr::mutate(post, - count = (1 - int.prob) * effort * plant.abund * animal.abund + - int.prob * effort * plant.abund * animal.abund * (1 + preference)) - + count = (1 - int_prob) * plant_effort * plant_abund * animal_abund + + int_prob * plant_effort * plant_abund * animal_abund * (1 + preference)) return(post) - } diff --git a/R/prepare_data.R b/R/prepare_data.R index 56da6d5..c7c42ae 100644 --- a/R/prepare_data.R +++ b/R/prepare_data.R @@ -1,43 +1,51 @@ #' Prepare the data for modelling #' -#' @param mat An integer matrix containing quantitative (not qualitative or binary) -#' count data con interaction frequency (e.g. visits to flowers, number of fruits +#' @param mat A matrix containing quantitative counts (not qualitative or binary) +#' of observed interactions (e.g. visits to flowers, number of fruits #' consumed per plant or species). Plants must be in rows, Animals must be in columns. -#' @param sampl.eff A numeric vector with the sampling effort (e.g. observation hours) +#' @param plant_effort A numeric vector with the sampling effort (e.g. observation hours) #' spent on each plant. +#' @param plant_abun (optional) A numeric vector with the relative or absolute interacting abundances of each plant. +#' @param animal_abun (optional) A numeric vector with the relative or absolute interacting abundances of each animal. #' -#' @return A named list with all the data required to run the model. +#' @return A named list containing data to be passed to the bipartite network model. #' @export #' #' @examples #' data(web) -#' prepare_data(web, sampl.eff = rep(20, nrow(web))) - -prepare_data <- function(mat = NULL, sampl.eff = NULL) { +#' prepare_data(web, plant_effort = rep(20, nrow(web))) +prepare_data <- function(mat, plant_effort = NULL, plant_abun = NULL, animal_abun = NULL) { ## Checks - stopifnot(is.matrix(mat)) - stopifnot(is.integer(mat)) - if (min(mat) < 0) { - stop("There cannot be negative counts") + if (!is.integer(mat) & !all(mat == as.integer(mat))) stop("Elements of 'mat' must be integers.") + if (min(mat) < 0) stop("Elements of 'mat' must be >=0.") + if (max(mat) == 1) warning("'mat' should be a quantitative (not qualitative/binary) matrix with count data. Are you sure the maximum observed count is 1?") + if (!is.null(plant_effort)) { + stopifnot(length(plant_effort) == nrow(mat)) + stopifnot(is.numeric(plant_effort)) + stopifnot(all(plant_effort > 0)) + if (max(plant_effort) > 100) warning("Large values for `plant_effort` may interfere with sampling efficiency and model convergence. Consider rescaling to different units (e.g. hours instead of minutes) to obtain values <100.") } - if (max(mat) == 1) { - warning("mat must be a quantitative (not qualitative/binary) matrix with count data. Are you sure the maximum observed count is 1?") + if (!is.null(plant_abun)) { + stopifnot(length(plant_abun) == nrow(mat)) + stopifnot(is.numeric(plant_abun)) + stopifnot(all(plant_abun > 0)) } - - stopifnot(length(sampl.eff) == nrow(mat)) - stopifnot(is.numeric(sampl.eff)) - if (max(sampl.eff) > 100) { - warning("sampl.eff takes large values, which could hinder model convergence. Consider using different units (e.g. hours instead of minutes) to obtain smaller numbers") + if (!is.null(animal_abun)) { + stopifnot(length(animal_abun) == ncol(mat)) + stopifnot(is.numeric(animal_abun)) + stopifnot(all(animal_abun > 0)) } - ## - - ## Data list - dt <- list(M = mat, n_p = nrow(mat), n_a = ncol(mat), C = sampl.eff) - # dt <- tidybayes::compose_data(M = mat, n_p = nrow(mat), n_a = ncol(mat), C = sampl.eff) - - dt + ## Ensure interaction counts are stored as integers + p <- rownames(mat) + a <- colnames(mat) + mat <- matrix(as.integer(mat), nrow = nrow(mat), ncol = ncol(mat), dimnames = list(p, a)) + ## Prepare data list, dropping NULL elements + dt <- list(M = mat, n_p = nrow(mat), n_a = ncol(mat), C = plant_effort, + abun_p = plant_abun, abun_a = animal_abun) + dt <- Filter(Negate(is.null), dt) + return(dt) } diff --git a/R/stancode.R b/R/stancode.R new file mode 100644 index 0000000..171b75e --- /dev/null +++ b/R/stancode.R @@ -0,0 +1,181 @@ +#' Make Stan code +#' +#' @returns A string containing a Stan model. +#' @author William K. Petry +#' @noRd +#' @keywords internal +#' @export +#' +#' @details +#' This function is designed to be used internally. See `fit_model` for documentation +#' of arguments. +#' +stancode <- function(plant_effort = c("param", "data"), + animal_pref = c("fixed", "varying"), + plant_abun = c("param", "data"), + animal_abun = c("param", "data")) { + # validate arguments + plant_effort <- match.arg(plant_effort) + animal_pref <- match.arg(animal_pref) + plant_abun <- match.arg(plant_abun) + animal_abun <- match.arg(animal_abun) + + # generate data block + scode_data <- paste0( + "data {\n", + " // Dimensions of the data matrix, and matrix itself.\n", + " int n_p;\n", + " int n_a;\n", + " array[n_p, n_a] int M;\n", + switch(plant_effort, + param = "", + data = " vector[n_p] C;\n"), + switch(animal_pref, + fixed = " real beta;\n", + varying = " vector[n_a] beta;\n"), + switch(plant_abun, + param = " vector[n_p] alpha_sigma;\n", + data = " vector[n_p] abun_p;\n"), + switch(animal_abun, + param = " vector[n_a] alpha_tau;\n", + data = " vector[n_a] abun_a;\n"), + "}" + ) + + # generate transformed data block + scode_tdata <- paste0( + "transformed data {\n", + ifelse(plant_abun == "data" | animal_abun == "data", + "// Normalize abundance data\n", ""), + switch(plant_abun, + param = "", + data = " simplex[n_p] sigma = abun_p / sum(abun_p);\n"), + switch(animal_abun, + param = "", + data = " simplex[n_a] tau = abun_a / sum(abun_a);\n"), + "// Pre-compute the marginals of M to save computation in the model loop.\n", + " array[n_p] int M_rows = rep_array(0, n_p);\n", + " array[n_a] int M_cols = rep_array(0, n_a);\n", + " int M_tot = 0;\n", + " for (i in 1:n_p) {\n", + " for (j in 1:n_a) {\n", + " M_rows[i] += M[i, j];\n", + " M_cols[j] += M[i, j];\n", + " M_tot += M[i, j];\n", + " }\n", + " }\n", + "}" + ) + + # generate parameter block + scode_param <- paste0( + "parameters {\n", + switch(plant_effort, + param = " real C;\n", + data = ""), + switch(animal_pref, + fixed = " real r;\n", + varying = " vector [n_a] r;\n"), + switch(plant_abun, + param = " simplex[n_p] sigma;\n", + data = ""), + switch(animal_abun, + param = " simplex[n_a] tau;\n", + data = ""), + " real rho;\n", + "}" + ) + + # generate model block + scode_model <- paste0( + "model {\n", + " // Priors\n", + switch(animal_pref, + fixed = " r ~ exponential(beta);\n", + varying = paste0(" for (j in 1:n_a) {\n", + " r[j] ~ exponential(beta[j]);\n", + " }\n")), + switch(plant_abun, + param = " sigma ~ dirichlet(alpha_sigma);\n", + data = ""), + switch(animal_abun, + param = " tau ~ dirichlet(alpha_tau);\n", + data = ""), + " // Global sums and parameters\n", + " target += M_tot * log(C) - C;\n", + " // Weighted marginals of the data matrix\n", + paste0(" for (i in 1:n_p) {\n", + " target += M_rows[i] * log(sigma[i]);\n", + " }\n", + " for (j in 1:n_a) {\n", + " target += M_cols[j] * log(tau[j]);\n", + " }\n"), + " // Pairwise loop\n", + paste0(" for (i in 1:n_p) {\n", + " for (j in 1:n_a) {\n", + " real nu_ij_0 = log(1 - rho);\n"), + paste0(" real nu_ij_1 = log(rho) + M[i,j] * log(1 + ", + switch(animal_pref, + fixed = "r", + varying = "r[j]"), + ") - ", + switch(plant_effort, + param = "C", + data = "C[i]"), + " * ", + switch(animal_pref, + fixed = "r", + varying = "r[j]"), + " * ", + "sigma[i] * tau[j];\n"), + " if (nu_ij_0 > nu_ij_1)\n", + " target += nu_ij_0 + log1p_exp(nu_ij_1 - nu_ij_0);\n", + " else\n", + " target += nu_ij_1 + log1p_exp(nu_ij_0 - nu_ij_1);\n", + " }\n", + " }\n", + "}" + ) + + # generate generated quantities block + scode_gquant <- paste0( + "generated quantities {\n", + " // Posterior edge probability matrix\n", + " array[n_p, n_a] real Q;\n", + " // Pointwise log-likelihood matrix\n", + " array[n_p, n_a] real log_lik;\n", + " for (i in 1:n_p) {\n", + " for (j in 1:n_a) {\n", + " real nu_ij_0 = log(1 - rho);\n", + paste0(" real nu_ij_1 = log(rho) + M[i,j] * log(1 + ", + switch(animal_pref, + fixed = "r", + varying = "r[j]"), + ") - ", + switch(plant_effort, + param = "C", + data = "C[i]"), + " * ", + switch(animal_pref, + fixed = "r", + varying = "r[j]"), + " * sigma[i] * tau[j];\n"), + " if (nu_ij_1 > 0)\n", + " Q[i, j] = 1 / (1 + exp(nu_ij_0 - nu_ij_1));\n", + " else\n", + " Q[i, j] = exp(nu_ij_1) / (exp(nu_ij_0) + exp(nu_ij_1));\n", + " // pointwise log-likelihood\n", + " if (nu_ij_0 > nu_ij_1)\n", + " log_lik[i, j] = nu_ij_0 + log1p_exp(nu_ij_1 - nu_ij_0);\n", + " else\n", + " log_lik[i, j] = nu_ij_1 + log1p_exp(nu_ij_0 - nu_ij_1);\n", + " }\n", + " }\n", + "}" + ) + + # make code + scode <- paste(scode_data, scode_tdata, scode_param, scode_model, scode_gquant, + sep = "\n") + return(scode) +} diff --git a/docs/404.html b/docs/404.html index 725655a..d77cbc7 100644 --- a/docs/404.html +++ b/docs/404.html @@ -20,7 +20,7 @@ BayesianWebs - 0.0.7 + 0.0.8