Pakillo · wpetry · Feb 7, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 10, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,12 @@
 Package: BayesianWebs
 Title: Bayesian Modelling of Bipartite Networks
-Version: 0.0.7
+Version: 0.0.8
 Authors@R: 
     c(
       person("Francisco", "Rodriguez-Sanchez", , "f.rodriguez.sanc@gmail.com", role = c("aut", "cre"),
-           comment = c(ORCID = "0000-0002-7981-1599"))
+           comment = c(ORCID = "0000-0002-7981-1599")),
+      person("William K.", "Petry", , "wpetry@ncsu.edu", role = "ctb",
+             comment = c(ORCID = "0000-0002-5230-5987"))
     )
 Description: Bayesian modelling of bipartite network structure, following the approach
     of Young et al. <doi:10.1038/s41467-021-24149-x>.
@@ -20,9 +22,13 @@ Imports:
     ggplot2,
     network.tools,
     tidybayes
+Suggests:
+    rlang,
+    testthat (>= 3.0.0)
 Remotes:
     stan-dev/cmdstanr,
     Pakillo/network.tools
 Depends: 
     R (>= 4.1)
 LazyData: true
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,4 +12,7 @@ export(plot_prior)
 export(plot_residuals)
 export(predict_counts)
 export(prepare_data)
+export(stancode)
 import(cmdstanr)
+importFrom(rlang,"!!!")
+importFrom(rlang,parse_exprs)
diff --git a/R/check_model.R b/R/check_model.R
@@ -9,7 +9,7 @@
 #'
 #' @examplesIf interactive()
 #' data(web)
-#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web)))
+#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web)))
 #' fit <- fit_model(dt, refresh = 0)
 #' check_model(fit, data = data)
 
@@ -38,9 +38,3 @@ check_lp <- function(fit = NULL) {
   gg
 
 }
-
-
-
-
-
-
diff --git a/R/fit_model.R b/R/fit_model.R
@@ -1,51 +1,125 @@
-#' Fit model
+#' Fit bipartite network model
 #'
 #' @param data A named list containing the required data, as obtained from
 #'  [prepare_data()].
-#' @param model character. One of "Young2021", "sampling_effort", or
-#' "varying_preferences", or a path to a file describing the Stan model in case
-#' you want to use a modified Stan model.
-#' @param beta Rate of exponential prior on `r` (preference) parameter.
-#' Default beta is 0.01. Increase it if you have large count numbers
-#' (can examine the resultant prior using [plot_prior()]).
-#' @param ... Further arguments for [cmdstanr::sample()], like `iter_warmup`,
-#' `iter_sampling`, or `thin`, among others. It is recommended to increase the
-#' number of iterations (e.g. iter_sampling = 10000).
+#' @param plant_effort How to model the sampling effort allocated to each plant. If unknown, `"param"` (default) estimates a single parameter during the modelling step that applies to all plants, \eqn{C}. Alternatively, `"data"` allows measured values of sampling effort to be supplied in `data`, \eqn{C_i}.
+#' @param animal_pref How to model the increase in visits of animals to plants with which they share a link relative to unlinked plants. `"fixed"` (default), fits a single preference parameter for all animals during the modelling step, \eqn{r}. `"varying"` fits a separate preference parameter for each animal type, \eqn{r_i}.
+#' @param plant_abun How the model treats the abundances of interacting plants. `"param"` (default) treats these abundances as a parameter that the model estimates, \eqn{\sigma_i}. `"data"` treats these abundances as measured and requires the user to supply the absolute abundances of each plant in `data`.
+#' @param animal_abun How the model treats the abundances of interacting animals. `"param"` (default) treats these abundances as a parameter that the model estimates, \eqn{\tau_i}. `"data"` treats these abundances as measured and requires the user to supply the absolute abundances of each animal in `data`.
+#' @param beta Rate parameter of exponential prior on the preference parameter or parameters \eqn{r} or \eqn{r_i}. The default, 0.01, is suitable for small counts and should be increased for larger counts. Use `plot_prior()` to visualize.
+#' @param alpha_sigma Vector of concentration parameters for Dirichlet prior on the relative abundances of interacting plants, \eqn{\sigma_i}, when `plant_abun = "param"`. Ignored otherwise. Defaults to 1.
+#' @param alpha_tau Concentration parameters of Dirichlet prior on the relative abundances of interacting plants, \eqn{\tau_i}, when `animal_abun = "param"`. Ignored otherwise. Defaults to 1.
+#' @param ... Further arguments passed to [cmdstanr::sample()], like `iter_warmup`,
+#' `iter_sampling`, or `thin`, among others.
 #'
 #' @return A fitted model ([cmdstanr::CmdStanMCMC()] object).
 #' @export
 #' @import cmdstanr
 #'
+#' @details
+#' This function allows fits generalized variations of the bipartite network structure model of Young et al. 2021 [<doi:10.1038/s41467-021-24149-x>](https://doi.org/10.1038%2Fs41467-021-24149-x). Briefly, the observed interaction count between each plant and animal pair is treated as a random draw from a Poisson distribution with a mean of:
+#' \deqn{\mu_{ij} = C_i\sigma_i\tau_j\left(1 + r_iB_{ij}\right).}
+#' Here, the subscripts \eqn{i} and \eqn{j} are indices that correspond to the plant (row) and animal (column) species, respectively. \eqn{C_i} is the observer's sampling effort allocated to each plant species; we expect to see more interactions with plants the longer they are observed. \eqn{\sigma_i} and \eqn{\tau_i} are the relative contributions of each plant and animal to the total pool of interactions. We expect to observe higher interaction counts for species that contribute most to the interactions made by their guild. The observed interaction count should be compounded when species of both guilds are those that are most interactive. Importantly, these terms capture more than just each species' relative abundances in the their guild. An animal that is highly active but numerically rare animal could have a higher \eqn{\tau_i} than a numerically dominant, but less active animal. \eqn{r_i} is the increase in visits of animals to plants with which they share a link relative to unlinked plants, and \eqn{B_{ij}} is the inferred absence (0) or presence (1) of a link between the plant-animal pair.
+#'
+#' \bold{Prior distributions}
+#'
+#' The default priors are very weak to accommodate a wide range of empirical datasets. They may fail for exceptional datasets and they do not capture system-specific prior information that users may have from additional data.
+#'
+#' The animal preference parameter, \eqn{r_i}, has an exponential prior with a rate parameter `beta`. The default, \eqn{\mathrm{exp}(0.01)}, implies a mean of 1/0.01 = 100 with 95% of the probability mass falling between about 2.5-370. Increase `beta` for animal communities that weakly increase visitation in the presence of a link, or decrease for communities that strongly increase visitation in the presence of a link.
+#'
+#' The relative interaction abundances for plants, \eqn{\sigma_i}, has a Dirichlet prior that accepts a vector of concentration parameters, `alpha_sigma`, with a length equal to the number of plant species (rows) in the interaction matrix. The default, \eqn{\mathrm{Dir}(1, ..., 1)}, implies a uniform distribution of proportions. Increasing some of the concentration parameters will shift the probability mass to favor higher relative interaction abundances for the corresponding plants. Increasing all concentration parameters while maintaining their relative sizes reduces the variance.
+#'
+#' The relative interaction abundances for animals, \eqn{\tau_j}, has a Dirichlet prior that accepts a vector of concentration parameters, `alpha_tau`, with a length equal to the number of animal species (columns) in the interaction matrix. The default, \eqn{\mathrm{Dir}(1, ..., 1)}, implies a uniform distribution of proportions. Increasing some of the concentration parameters will shift the probability mass to favor higher relative interaction abundances for the corresponding animals Increasing all concentration parameters while maintaining their relative sizes reduces the variance.
+#'
+#' At this time, prior choices have not been implemented for sampling effort, \eqn{C_i}, when `plant_effort = "param"`.
+#'
 #' @examplesIf interactive()
 #' data(web)
-#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web)))
-#' fit <- fit_model(dt)
+#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web)))
+#' # fit the model used by Young et al. 2021
+#' fit <- fit_model(dt, plant_effort = "param", animal_pref = "fixed",
+#'                  plant_abun = "param", animal_abun = "param")
+#' # changing options and priors
+#' fit2 <- fit_model(dt, plant_effort = "data", animal_pref = "varying",
+#'                   plant_abun = "param", animal_abun = "param",
+#'                   beta = rep(0.01, 21),
+#'                   alpha_sigma = c(2.5, 3, 4, 3, 1, 1, 5, 2),
+#'                   alpha_tau = 2)
 
 fit_model <- function(data = NULL,
-                      model = c("sampling_effort", "Young2021", "varying_preferences"),
+                      plant_effort = c("param", "data"),
+                      animal_pref = c("fixed", "varying"),
+                      plant_abun = c("param", "data"),
+                      animal_abun = c("param", "data"),
                       beta = 0.01,
+                      alpha_sigma = 1,
+                      alpha_tau = 1,
                       ...) {
 
   if (is.null(data)) {
     stop("Please provide the data as produced by 'prepare_data()'")
   }
 
-  model <- match.arg(model)  # TODO: remove this later to allow for user models
+  plant_effort <- match.arg(plant_effort)
+  animal_pref <- match.arg(animal_pref)
+  plant_abun <- match.arg(plant_abun)
+  animal_abun <- match.arg(animal_abun)
 
-  if (model %in% c("sampling_effort", "Young2021", "varying_preferences")) {
-    mod.stan <- system.file(paste0("models/", model, ".stan"), package = "BayesianWebs")
-  } else {
-    mod.stan <- model
+  ## Check that data provided are suitable for the specified model variant
+  if (plant_effort == "data" & is.null(data$C)) stop('Sampling effort, C, must be supplied in data when plant_effort = "data"')
+  if (plant_abun == "data") {
+    if (is.null(data$abun_p)) stop('Plant abundances, abun_p, must be supplied in `data` when plant_abun = "data"')
   }
+  if (animal_abun == "data") {
+    if (is.null(data$abun_a)) stop('Animal abundances, abun_a, must be supplied in `data` when animal_abun = "data"')
+  }
+  stopifnot(beta > 0, alpha_sigma > 0, alpha_tau > 0)
 
-  ## Add beta to data file
-  stopifnot(beta > 0)
+  ## Check prior parameters and compose prior parameter vectors
+  if (animal_pref == "fixed") {
+    stopifnot(length(beta) == 1L)
+  } else if (animal_pref == "varying") {
+    if (length(beta) == 1L) {
+      message("Single value supplied to 'beta' will be used as the prior parameter for each animal.")
+      beta <- rep(beta, times = ncol(data$M))
+    } else if (length(beta) != ncol(data$M)) {
+      stop("`beta` must have length 1 (same value applied to each animal) or be equal to the number of animal species.")
+    }
+  }
+  if (plant_abun == "param") {
+    if (length(alpha_sigma) == 1L) {
+      message("Single value supplied to 'alpha_sigma' will be used as the prior parameter for each plant.")
+      alpha_sigma <- rep(alpha_sigma, times = nrow(data$M))
+    } else if (length(alpha_sigma) != nrow(data$M)) {
+      stop("`alpha_sigma` must have length 1 (same value applied to each plant) or be equal to the number of plant species.")
+    }
+  }
+  if (animal_abun == "param") {
+    if (length(alpha_tau) == 1L) {
+      message("Single value supplied to 'alpha_tau' will be used as the prior parameter for each animal.")
+      alpha_tau <- rep(alpha_tau, ncol(data$M))
+    } else if (length(alpha_tau) != ncol(data$M)) {
+      stop("`alpha_tau` must have length 1 (same value applied to each animal) or be equal to the number of animal species.")
+    }
+  }
+
+
+  ## Add priors to data list
   data$beta <- beta
+  data$alpha_sigma <- alpha_sigma
+  data$alpha_tau <- alpha_tau
 
+  ## Make Stan code
+  ## TODO: Pre-compile all model variants during package installation?
+  mod.stan <- stancode(plant_effort = plant_effort,
+                       animal_pref = animal_pref,
+                       plant_abun = plant_abun,
+                       animal_abun = animal_abun)
+  tf <- cmdstanr::write_stan_file(mod.stan)
 
-  ## compile model
-  model.eff <- cmdstanr::cmdstan_model(mod.stan)
+  ## Compile model
+  model.eff <- cmdstanr::cmdstan_model(tf)
 
+  ## Fit model
   fit <- model.eff$sample(data, ...)
-
 }
diff --git a/R/get_posterior.R b/R/get_posterior.R
@@ -5,11 +5,12 @@
 #' @param param character. Name of the parameter to retrieve the posterior samples.
 #'
 #' @return A data frame
+#' @importFrom rlang parse_exprs !!!
 #' @export
 #'
 #' @examplesIf interactive()
 #' data(web)
-#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web)))
+#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web)))
 #' fit <- fit_model(dt, refresh = 0)
 #' get_posterior(fit, dt, param = "connectance")
 #'
@@ -29,64 +30,57 @@ get_posterior <- function(fit = NULL,
                                     "animal.abund",
                                     "int.prob",
                                     "link")) {
-
-  # r = avg. visits from mutualists (preference)
-  # rho = connectance
-  # sigma = plant.abund
-  # tau = animal.abund
-  # Q = interaction probability
-
+  ## parse arguments & construct parameter set to extract
   param <- match.arg(param)
-
-  params_all <- function(fit) {
-    if (fit$metadata()$model_name == "varying_preferences_model") {
-      out <- tidybayes::spread_draws(fit, rho, r[Animal], sigma[Plant], tau[Animal], Q[Plant, Animal])
-    } else {
-      out <- tidybayes::spread_draws(fit, rho, r, sigma[Plant], tau[Animal], Q[Plant, Animal])
-    }
-    return(out)
+  all_vars <- tidybayes::get_variables(fit)
+  if (sum(grepl("^r(\\[\\d+\\])?$", all_vars)) >= 1) {
+    sel_r <- "r[Animal]"
+  } else {
+    sel_r <- "r"
   }
-
-  params_preference <- function(fit) {
-    if (fit$metadata()$model_name == "varying_preferences_model") {
-      out <- tidybayes::spread_draws(fit, r[Animal])
-    } else {
-      out <- tidybayes::spread_draws(fit, r)
-    }
-    return(out)
+  if (sum(grepl("^sigma", all_vars)) > 1) {
+    sel_sigma <- "sigma[Plant]"
+  } else if (sum(grepl("^sigma", all_vars)) == 1) {
+    sel_sigma <- "sigma"
+  } else {
+    sel_sigma <- NULL
+  }
+  if (sum(grepl("^tau", all_vars)) > 1) {
+    sel_tau <- "tau[Animal]"
+  } else if (sum(grepl("^tau", all_vars)) == 1) {
+    sel_tau <- "tau"
+  } else {
+    sel_tau <- NULL
   }
-
 
   post <- switch(
     param,
-    all = params_all(fit),
+    all = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(c("rho", sel_r, sel_sigma,
+                                                               sel_tau, "Q[Plant, Animal]"))),
     connectance = tidybayes::spread_draws(fit, rho),
-    preference = params_preference(fit),
-    plant.abund = tidybayes::spread_draws(fit, sigma[Plant]),
-    animal.abund = tidybayes::spread_draws(fit, tau[Animal]),
-    int.prob = tidybayes::spread_draws(fit, Q[Plant, Animal]),
+    preference = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_r)),
+    plant_abund = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_sigma)),
+    animal_abund = tidybayes::spread_draws(fit, !!!rlang::parse_exprs(sel_tau)),
+    int_prob = tidybayes::spread_draws(fit, Q[Plant, Animal]),
     link = tidybayes::spread_draws(fit, Q[Plant, Animal]),
   )
 
   # use more informative names
-  param.names <- c(
+  param_names <- c(
     connectance = "rho",
     preference = "r",
-    plant.abund = "sigma",
-    animal.abund = "tau",
-    int.prob = "Q")
+    plant_abund = "sigma",
+    animal_abund = "tau",
+    int_prob = "Q")
 
-  post <- dplyr::rename(post, dplyr::any_of(param.names))
+  post <- dplyr::rename(post, dplyr::any_of(param_names))
 
   ## generate posteriors of link existence
   if (param == "all" | param == "link") {
     post <- is_there_link(post)
   }
 
-
-
   ## rename plants and animals with original labels
-
   if ("Animal" %in% names(post)) {
     animals <- data.frame(Animal = 1:ncol(data$M), Animal.name = colnames(data$M))
     post <- post |>
@@ -102,6 +96,4 @@ get_posterior <- function(fit = NULL,
   }
 
   return(post)
-
-
 }
diff --git a/R/globals.R b/R/globals.R
@@ -1,5 +1,5 @@
 utils::globalVariables(c(".chain", ".draw", ".lower", ".upper",
-                         "Animal", "Count.obs", "Count.pred", "Plant", "Q",
-                         "Visits", "animal.abund", "count", "effort",
-                         "int.prob", "lp__", "plant.abund", "preference",
+                         "Animal", "Count_obs", "Count_pred", "Plant", "Q",
+                         "Visits", "animal_abund", "count", "effort",
+                         "int_prob", "lp__", "plant_abund", "preference",
                          "prior", "r", "rho", "sigma", "tau"))
diff --git a/R/is_there_link.R b/R/is_there_link.R
@@ -9,8 +9,7 @@
 #'
 
 is_there_link <- function(df = NULL) {
-
-  stopifnot("int.prob" %in% names(df))
+  stopifnot("int_prob" %in% names(df))
   df |>
-    dplyr::mutate(link = stats::rbinom(1, size = 1, prob = int.prob))
+    dplyr::mutate(link = stats::rbinom(1, size = 1, prob = int_prob))
 }
diff --git a/R/plot_counts_pred.R b/R/plot_counts_pred.R
@@ -1,25 +1,27 @@
 #' Plot heatmap of predicted counts
 #'
-#' @param pred.df A data frame containing the predicted counts, as generated by
+#' @param pred_df A data frame containing the predicted counts, as generated by
 #' [predict_counts()]
+#' @param .width probability cutoff to determine the width of the resulting interval (default 0.95)
+#' @param .point point summary function, which takes a vector and returns a single value, e.g. [mean()], [median()], or [Mode()]
 #' @param ... Further arguments for [network.tools::plot_web_heatmap()].
 #'
 #' @return A ggplot object
 #' @export
 #'
 #' @examplesIf interactive()
 #' data(web)
-#' dt <- prepare_data(mat = web, sampl.eff = rep(20, nrow(web)))
+#' dt <- prepare_data(mat = web, plant_effort = rep(20, nrow(web)))
 #' fit <- fit_model(dt, refresh = 0)
-#' pred.df <- predict_counts(fit, dt)
-#' plot_counts_pred(pred.df)
-#' plot_counts_pred(pred.df, sort = FALSE)
+#' pred_df <- predict_counts(fit, dt)
+#' plot_counts_pred(pred_df)
+#' plot_counts_pred(pred_df, sort = FALSE)
 
-plot_counts_pred <- function(pred.df = NULL, ...) {
+plot_counts_pred <- function(pred_df = NULL, .width = 0.95, .point = mean, ...) {
 
-  pred.df |>
+  pred_df |>
     dplyr::select(Plant, Animal, count) |>
-    tidybayes::mean_qi() |>
+    tidybayes::point_interval(.width = .width, .point = .point) |>
     network.tools::plot_web_heatmap(int.var = "count", ...) +
     ggplot2::labs(title = "Predicted counts", fill = "")
 }