diff --git a/.Rbuildignore b/.Rbuildignore index 2e29672b..c01f3f58 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,7 +2,8 @@ ^doc$ ^docs$ ^LDATS_model.pdf$ -^working.R$ +^notes.R$ +^tempering.R$ ^.travis.yml$ ^.*\.Rproj$ ^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION index d25e84a7..634fc688 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,20 +36,18 @@ Encoding: UTF-8 LazyData: true Imports: coda, + compositions, digest, - dplyr, extraDistr, graphics, grDevices, - here, lubridate, - magrittr, + mcmc, memoise, methods, mvtnorm, nnet, progress, - reshape, stats, topicmodels, viridis @@ -58,10 +56,7 @@ Suggests: pkgdown, rmarkdown, testthat, - vdiffr, - clue, - RCurl, - tidyr + vdiffr VignetteBuilder: knitr RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index ebd8b0ce..a0155003 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,58 +1,65 @@ # Generated by roxygen2: do not edit by hand -S3method(logLik,LDA_VEM) +S3method(AIC,LDA) +S3method(logLik,LDA) +S3method(logLik,TS) S3method(logLik,TS_fit) -S3method(logLik,multinom_TS_fit) +S3method(logLik,mlm) +S3method(plot,LDA) S3method(plot,LDA_TS) -S3method(plot,LDA_VEM) S3method(plot,LDA_set) -S3method(plot,TS_fit) -S3method(print,LDA_TS) -S3method(print,TS_fit) -S3method(print,TS_on_LDA) +S3method(plot,TS) +S3method(plot,TS_set) +S3method(print,TS) export(AICc) +export(LDA) export(LDA_TS) export(LDA_TS_control) +export(LDA_call) +export(LDA_control) export(LDA_msg) export(LDA_plot_bottom_panel) export(LDA_plot_top_panel) -export(LDA_set) -export(LDA_set_control) export(TS) +export(TS_call) export(TS_control) export(TS_diagnostics_plot) -export(TS_on_LDA) +export(TS_msg) export(TS_summary_plot) export(autocorr_plot) -export(check_LDA_TS_inputs) -export(check_LDA_models) -export(check_LDA_set_inputs) -export(check_TS_inputs) -export(check_TS_on_LDA_inputs) -export(check_changepoints) +export(check_LDA) +export(check_LDAs) +export(check_TS) +export(check_class) export(check_control) 
export(check_document_covariate_table) export(check_document_term_table) -export(check_formula) export(check_formulas) -export(check_multinom_TS_inputs) export(check_nchangepoints) -export(check_seeds) +export(check_nonneg_integer) +export(check_nonneg_integer_matrix) +export(check_pos_integer) +export(check_replicates) export(check_timename) export(check_topics) export(check_weights) -export(conform_LDA_TS_data) +export(conform_data) export(count_trips) -export(diagnose_ptMCMC) export(document_weights) export(ecdf_plot) export(est_changepoints) export(est_regressors) export(eta_diagnostics_plots) export(eval_step) -export(expand_TS) +export(identity_LDA) export(iftrue) +export(ldats_classic) +export(ldats_classic_control) +export(leave_p_out) +export(list_depth) export(logsumexp) +export(measure_LDA) +export(measure_TS) export(measure_eta_vcov) export(measure_rho_vcov) export(memoise_fun) @@ -61,16 +68,16 @@ export(mirror_vcov) export(modalvalue) export(multinom_TS) export(multinom_TS_chunk) +export(multinom_TS_control) export(normalize) +export(null_rule) +export(package_LDA) export(package_LDA_TS) -export(package_LDA_set) export(package_TS) -export(package_TS_on_LDA) export(package_chunk_fits) +export(package_sequential_TS) export(posterior_plot) export(pred_gamma_TS_plot) -export(prep_LDA_control) -export(prep_TS_data) export(prep_chunks) export(prep_cpts) export(prep_ids) @@ -79,40 +86,61 @@ export(prep_proposal_dist) export(prep_ptMCMC_inputs) export(prep_saves) export(prep_temp_sequence) -export(print_model_run_message) +export(prepare_LDA) +export(prepare_TS) export(process_saves) export(propose_step) export(proposed_step_mods) +export(random_loo) export(rho_diagnostics_plots) export(rho_hist) export(rho_lines) +export(run_LDA) +export(run_TS) export(select_LDA) export(select_TS) -export(set_LDA_TS_plot_cols) +export(sequential_TS) +export(sequential_TS_control) +export(set_LDA_TS_plot_colors) export(set_LDA_plot_colors) export(set_TS_summary_plot_cols) 
export(set_gamma_colors) export(set_rho_hist_colors) -export(sim_LDA_TS_data) -export(sim_LDA_data) -export(sim_TS_data) +export(simplex_TS) +export(simplex_TS_chunk) +export(simplex_TS_control) +export(simulate_LDA_TS_data) +export(simulate_LDA_data) +export(simulate_TS_data) +export(soft_call) export(softmax) export(step_chains) export(summarize_etas) export(summarize_rhos) export(swap_chains) +export(systematic_loo) export(take_step) +export(topicmodels_LDA) export(trace_plot) export(update_cpts) export(update_ids) +export(update_list) export(update_pbar) export(update_saves) export(verify_changepoint_locations) +import(mvtnorm) importFrom(coda,HPDinterval) importFrom(coda,as.mcmc) importFrom(coda,autocorr) importFrom(coda,autocorr.diag) importFrom(coda,effectiveSize) +importFrom(compositions,acomp) +importFrom(compositions,alr) +importFrom(compositions,alrInv) +importFrom(compositions,clr) +importFrom(compositions,clrInv) +importFrom(compositions,ilr) +importFrom(compositions,ilrInv) importFrom(digest,digest) importFrom(extraDistr,rcat) importFrom(extraDistr,rdirichlet) @@ -128,9 +156,10 @@ importFrom(graphics,points) importFrom(graphics,rect) importFrom(graphics,text) importFrom(lubridate,is.Date) -importFrom(magrittr,"%>%") +importFrom(mcmc,temper) importFrom(memoise,memoise) importFrom(methods,is) +importFrom(mvtnorm,dmvnorm) importFrom(mvtnorm,rmvnorm) importFrom(nnet,multinom) importFrom(progress,progress_bar) @@ -139,6 +168,7 @@ importFrom(stats,acf) importFrom(stats,as.formula) importFrom(stats,coef) importFrom(stats,ecdf) +importFrom(stats,lm) importFrom(stats,logLik) importFrom(stats,median) importFrom(stats,rgeom) @@ -148,5 +178,4 @@ importFrom(stats,sd) importFrom(stats,terms) importFrom(stats,var) importFrom(stats,vcov) -importFrom(topicmodels,LDA) importFrom(viridis,viridis) diff --git a/NEWS.md b/NEWS.md index bca6d351..17301778 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,43 @@ Version numbers follow [Semantic Versioning](https://semver.org/). 
-# LDATS 0.2.5(https://github.com/weecology/ldats/releases/tag/v0.2.5) +# LDATS 0.3.0 +*Ongoing* + +## Generalization of "LDA" +* The phrase "LDA" could be any of the Linguistic Decomposition Analyses including any of a number of true Latent Dirichlet Allocation functions or models, so LDA is no longer fully imported from topicmodels but keeping topicmodels the package imported to allow calling of topicmodels::LDA from inside LDA as the default. +* Existing functionality allows a simple switching to the Gibbs sampler version of the topicmodels LDA or an entirely different modeling function. + +## Additional renamings and small things +* seed is now replicate + +## Function renamings/restructuring +* LDA_set is now LDA and TS_on_LDA is now TS +* LDA and TS are both structured similarly (see pipeline doc) with "prepare", "run", "package" as the workflow. + +## Introduction of soften logical variable +* Designed to help soften errors in pipelines through wrapping in tryCatch. + + +## Expansion of control lists +* The functions that can be toggled via control list are expanded (from measurer and selector) to now include base LDA and TS model functions, TS response and method functions, and arguments for all functions. + +## Allowance of single-topic models +* As a placeholder/identity model for comparison to models that include decomposition. + +## Data conforming +* Data can come into LDA_TS, LDA, and TS in a variety of forms, and depending on usages, might take a variety of different forms. +* The purpose of the new conform_data function is to generalize and extract the code used to shuttle between data formats from functions and replace it with a single line. +* Currently it's not saving the test/train split explicitly, just implicitly via the data encoding that exists. 
+ +## Data subsetting +* New facilities for subsetting the data for purposes such as cross validation +* Generalized functions for random and systematic leave-outs, with flexible number left out and (potentially) asymmetric buffers. +* Specific functions for typical use (systematic leave 1 out with no buffer, e.g.). + + + +# [LDATS 0.2.5](https://github.com/weecology/ldats/releases/tag/v0.2.5) *2019-12-22* ## General editing of simulation functions diff --git a/R/LDA.R b/R/LDA.R index 55f77fd7..df5975ac 100644 --- a/R/LDA.R +++ b/R/LDA.R @@ -1,315 +1,349 @@ -#' @title Run a set of Latent Dirichlet Allocation models + +#' @title Run a set of Linguistic Decomposition Analysis models #' -#' @description For a given dataset consisting of counts of words across -#' multiple documents in a corpus, conduct multiple Latent Dirichlet -#' Allocation (LDA) models (using the Variational Expectation -#' Maximization (VEM) algorithm; Blei \emph{et al.} 2003) to account for [1] -#' uncertainty in the number of latent topics and [2] the impact of initial -#' values in the estimation procedure. \cr \cr -#' \code{LDA_set} is a list wrapper of \code{\link[topicmodels]{LDA}} -#' in the \code{topicmodels} package (Grun and Hornik 2011). \cr \cr -#' \code{check_LDA_set_inputs} checks that all of the inputs -#' are proper for \code{LDA_set} (that the table of observations is -#' conformable to a matrix of integers, the number of topics is an integer, -#' the number of seeds is an integer and the controls list is proper). -#' -#' @param document_term_table Table of observation count data (rows: -#' documents, columns: terms. May be a class \code{matrix} or -#' \code{data.frame} but must be conformable to a matrix of integers, -#' as verified by \code{\link{check_document_term_table}}. +#' @description Conduct Linguistic Decomposition Analyses. 
\cr \cr +#' \code{LDA} provides the main interface for Linguistic Decomposition +#' Analysis conducted within the LDATS application of (Christensen +#' \emph{et al.} 2018). \cr \cr +#' \code{prepare_LDA} pre-prepares the LDA model objects for simpler +#' use within the subfunctions. \cr \cr +#' \code{check_LDA} ensures that the inputs are proper. +#' See \code{\link{check_topics}}, \code{\link{check_replicates}}, +#' and \code{\link{check_control}} for specifics. \cr \cr +#' \code{LDA_control} defines and creates the control list used to fit +#' the LDA model. \cr \cr +#' \code{run_LDA} runs (via \code{\link{LDA_call}}) all LDA models +#' as set up by \code{prep_LDA_models}. \cr \cr +#' \code{LDA_call} runs (via \code{\link{do.call}}) a single LDA model +#' as set up by \code{prep_LDA_models}. \cr \cr +#' \code{LDA_msg} produces a model-running message if desired. \cr \cr +#' \code{measure_LDA} determines the fit value used to select among the +#' models. \cr \cr +#' \code{select_LDA} chooses the best model(s) of interest based on their +#' measured values and the selector function. \cr \cr +#' \code{package_LDA} sets the class and names the elements of the results +#' \code{list} from \code{\link{LDA_call}} applied to the +#' combination of TS models requested for the data input(s). +#' +#' @details For a (potentially subset) dataset consisting of counts of words +#' across multiple documents in a corpus, +#' \enumerate{ +#' \item Conduct multiple Linguistic Decomposition Analysis (LDA) models +#' (e.g., Latent Dirichlet Allocation using the Variational Expectation +#' Maximization (VEM) algorithm; Blei \emph{et al.} 2003, Grun and +#' Hornik 2011), +#' \item Select from the LDA model results to pick those used in the Time +#' Series (TS) models, and +#' \item Package the results. 
+#' } +#' +#' @param data Any of the data structures allowable for LDATS analyses: +#' \code{matrix} or \code{data.frame} document term table, +#' \code{list} of document term and covariate tables, a \code{list} of +#' training and test sets of the two tables, or a \code{list} of multiple +#' replicate splits of training and test sets of the two tables. \cr +#' See \code{\link{conform_data}}, which is used to ensure data structure +#' validity for the desired model. #' #' @param topics Vector of the number of topics to evaluate for each model. #' Must be conformable to \code{integer} values. #' -#' @param nseeds Number of seeds (replicate starts) to use for each +#' @param replicates Number of replicate starts to use for each #' value of \code{topics}. Must be conformable to \code{integer} value. #' -#' @param control A \code{list} of parameters to control the running and -#' selecting of LDA models. Values not input assume default values set -#' by \code{\link{LDA_set_control}}. Values for running the LDAs replace -#' defaults in (\code{LDAcontol}, see \code{\link[topicmodels]{LDA}} (but if -#' \code{seed} is given, it will be overwritten; use \code{iseed} instead). -#' -#' @return -#' \code{LDA_set}: \code{list} (class: \code{LDA_set}) of LDA models -#' (class: \code{LDA_VEM}). -#' \code{check_LDA_set_inputs}: an error message is thrown if any input is -#' improper, otherwise \code{NULL}. -#' -#' @references -#' Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet -#' Allocation. \emph{Journal of Machine Learning Research} -#' \strong{3}:993-1022. -#' \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. +#' @param control A \code{list} of parameters to control the fitting of the +#' LDA model. Values not input assume defaults set by +#' \code{\link{LDA_control}}. #' -#' Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic -#' Models. \emph{Journal of Statistical Software} \strong{40}:13. 
-#' \href{https://www.jstatsoft.org/article/view/v040i13}{link}. +#' @param LDA,LDAs model \code{list} (\code{LDA}) or a \code{list} of LDA +#' model \code{list}s (\code{LDAs}). #' -#' @examples -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) -#' -#' @export +#' @param model Main LDA \code{function}. #' -LDA_set <- function(document_term_table, topics = 2, nseeds = 1, - control = list()){ - check_LDA_set_inputs(document_term_table, topics, nseeds, control) - control <- do.call("LDA_set_control", control) - mod_topics <- rep(topics, each = length(seq(2, nseeds * 2, 2))) - iseed <- control$iseed - mod_seeds <- rep(seq(iseed, iseed + (nseeds - 1)* 2, 2), length(topics)) - nmods <- length(mod_topics) - mods <- vector("list", length = nmods) - for (i in 1:nmods){ - LDA_msg(mod_topics[i], mod_seeds[i], control) - control_i <- prep_LDA_control(seed = mod_seeds[i], control = control) - mods[[i]] <- LDA(document_term_table, k = mod_topics[i], - control = control_i) - } - package_LDA_set(mods, mod_topics, mod_seeds) -} - -#' @title Calculate the log likelihood of a VEM LDA model fit +#' @param model_args \code{list} of (named) arguments to be used in +#' \code{model} via \code{\link{LDA_call}}. #' -#' @description Imported but updated calculations from topicmodels package, as -#' applied to Latent Dirichlet Allocation fit with Variational Expectation -#' Maximization via \code{\link[topicmodels]{LDA}}. +#' @param nsubsets Number of data subsets. #' -#' @details The number of degrees of freedom is 1 (for alpha) plus the number -#' of entries in the document-topic matrix. The number of observations is -#' the number of entries in the document-term matrix. +#' @param subset_rule \code{function} used to subset the data. #' -#' @param object A \code{LDA_VEM}-class object. 
+#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). #' -#' @param ... Not used, simply included to maintain method compatibility. +#' @param soften \code{logical} indicator of whether the model should error +#' softly or if errors should trigger a full-stop to the pipeline. #' -#' @return Log likelihood of the model \code{logLik}, also with \code{df} -#' (degrees of freedom) and \code{nobs} (number of observations) values. +#' @param measurer \code{function} used in evaluation of the LDA +#' models; \code{measurer} creates a value for each model. #' +#' @param measurer_args \code{list} of (named) arguments to be used in +#' \code{measurer} via \code{\link{do.call}}. +#' +#' @param selector \code{function} used in evaluation of the LDA +#' models; \code{selector} operates on the values to choose the models. +#' +#' @param selector_args \code{list} of (named) arguments to be used in +#' \code{selector} via \code{\link{do.call}}. +#' +#' @param ... Not passed along to the output, rather included to allow for +#' automated removal of unneeded controls. +#' +#' @return +#' \code{LDA},\code{package_LDA}: class \code{LDA_set} \code{list} of +#' both selected and all results from \code{\link{LDA_call}} applied for +#' each model on each data input(s) as well as the control \code{list} +#' used to fit the model. \cr \cr +#' \code{prepare_LDA}: \code{list} of \code{list}s, each of which is a +#' preliminary model object for an LDA model fit. \cr \cr +#' \code{check_LDA}: an error message is thrown if any input is improper, +#' otherwise \code{NULL}. +#' \code{LDA_control}: \code{list} of controls for the LDA model, with +#' named elements corresponding to the arguments. +#' \code{run_LDA}: \code{LDA_set} \code{list} of model results from all +#' runs of a \code{} function, such as +#' \code{\link{topicmodels_LDA}}. 
\cr \cr +#' \code{LDA_call}: \code{LDA} \code{list} of model results from a single +#' run of a \code{} function, such as +#' \code{\link{topicmodels_LDA}}. \cr \cr +#' \code{measure_LDA}: \code{vector} of values corresponding to the model +#' evaluations. \cr \cr +#' \code{select_LDA}: \code{list} of selected models' \code{list}s. +#' #' @references -#' Buntine, W. 2002. Variational extensions to EM and multinomial PCA. -#' \emph{European Conference on Machine Learning, Lecture Notes in Computer -#' Science} \strong{2430}:23-34. \href{https://bit.ly/327sltH}{link}. +#' Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet +#' Allocation. \emph{Journal of Machine Learning Research} +#' \strong{3}:993-1022. +#' \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. +#' +#' Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018. +#' Long-term community change through multiple rapid transitions in a +#' desert rodent community. \emph{Ecology} \strong{99}:1523-1529. +#' \href{https://doi.org/10.1002/ecy.2373}{link}. #' #' Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic #' Models. \emph{Journal of Statistical Software} \strong{40}:13. #' \href{https://www.jstatsoft.org/article/view/v040i13}{link}. #' -#' Hoffman, M. D., D. M. Blei, and F. Bach. 2010. Online learning for -#' latent Dirichlet allocation. \emph{Advances in Neural Information -#' Processing Systems} \strong{23}:856-864. -#' \href{https://bit.ly/2LEr5sb}{link}. 
+#' @name LDA #' -#' @examples -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 2) -#' logLik(r_LDA[[1]]) +#' + + +#' @rdname LDA #' #' @export #' -logLik.LDA_VEM <- function(object, ...){ - val <- sum(object@loglikelihood) - df <- as.integer(object@control@estimate.alpha) + length(object@beta) - attr(val, "df") <- df - attr(val, "nobs") <- object@Dim[1] * object@Dim[2] - class(val) <- "logLik" - val +LDA <- function(data, topics = 2, replicates = 1, control = list()){ + LDAs <- prepare_LDA(data = data, topics = topics, replicates = replicates, + control = control) + LDAs <- run_LDA(LDAs = LDAs) + LDAs <- package_LDA(LDAs = LDAs) + LDAs } -#' @rdname LDA_set -#' + +#' @rdname LDA +#' #' @export #' -check_LDA_set_inputs <- function(document_term_table, topics, nseeds, - control){ - check_document_term_table(document_term_table) - check_topics(topics) - check_seeds(nseeds) - check_control(control) +check_LDA <- function(topics = 2, replicates = 1, control = list()){ + check_topics(topics = topics) + check_replicates(replicates = replicates) + check_control(control = control) } -#' @title Set the control inputs to include the seed -#' -#' @description Update the control list for the LDA model with the specific -#' seed as indicated. And remove controls not used within the LDA itself. -#' -#' @param seed \code{integer} used to set the seed of the specific model. -#' -#' @param control Named list of control parameters to be used in -#' \code{\link[topicmodels]{LDA}} Note that if \code{control} has an -#' element named \code{seed} it will be overwritten by the \code{seed} -#' argument of \code{prep_LDA_control}. +#' @rdname LDA #' -#' @return \code{list} of controls to be used in the LDA. 
+#' @export #' -#' @examples -#' prep_LDA_control(seed = 1) +prepare_LDA <- function(data, topics = 2, replicates = 1, control = list()){ + check_LDA(topics = topics, replicates = replicates, control = control) + control <- do.call("LDA_control", control) + messageq("----- Linguistic Decomposition Analyses -----", control$quiet) + data <- conform_data(data = data, control = control) + subsets <- names(data) + if(length(replicates) < length(topics)){ + replicates <- rep(replicates, length(topics)) + } + LDA_topics <- rep(topics, replicates) + LDA_reps <- sequence(replicates) + LDA_subsets <- rep(subsets, each = length(LDA_reps)) + LDA_reps <- rep(LDA_reps, length(subsets)) + LDA_topics <- rep(LDA_topics, length(subsets)) + nLDA <- length(LDA_topics) + LDAs <- vector("list", length = nLDA) + topicword <- rep(NA, nLDA) + for(i in 1:nLDA){ + LDAs[[i]] <- list(data = data[[LDA_subsets[i]]], + data_subset = LDA_subsets[i], + topics = LDA_topics[i], + replicate = LDA_reps[i], + control = control) + topicword[i] <- ifelse(LDA_topics[i] == 1, "topic", "topics") + } + name_tab <- data.frame(paste("data subset", LDA_subsets), + paste(",", LDA_topics, topicword), + paste(", replicate", LDA_reps)) + names(LDAs) <- apply(name_tab, 1, paste0, collapse = "") + + LDAs +} + + + +#' @rdname LDA #' #' @export #' -prep_LDA_control <- function(seed, control = list()){ - control$quiet <- NULL - control$measurer <- NULL - control$selector <- NULL - control$iseed <- NULL - control$seed <- seed - control +run_LDA <- function(LDAs){ + nLDA <- length(LDAs) + for (i in 1:nLDA){ + LDAs[[i]] <- LDA_call(LDA = LDAs[[i]]) + } + LDAs } -#' @title Select the best LDA model(s) for use in time series -#' -#' @description Select the best model(s) of interest from an -#' \code{LDA_set} object, based on a set of user-provided functions. The -#' functions default to choosing the model with the lowest AIC value. 
+ +#' @rdname LDA #' -#' @param LDA_models An object of class \code{LDA_set} produced by -#' \code{\link{LDA_set}}. +#' @export #' -#' @param control A \code{list} of parameters to control the running and -#' selecting of LDA models. Values not input assume default values set -#' by \code{\link{LDA_set_control}}. Values for running the LDAs replace -#' defaults in (\code{LDAcontol}, see \code{\link[topicmodels]{LDA}} (but if -#' \code{seed} is given, it will be overwritten; use \code{iseed} instead). +LDA_call <- function(LDA){ + LDA_msg(LDA = LDA) + fun <- LDA$control$model + args <- update_list(LDA$control$model_args, LDA = LDA) + soft_call(what = fun, args = args, soften = LDA$control$soften) +} + + +#' @rdname LDA #' -#' @return A reduced version of \code{LDA_models} that only includes the -#' selected LDA model(s). The returned object is still an object of -#' class \code{LDA_set}. +#' @export #' -#' @examples -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) -#' select_LDA(r_LDA) +LDA_msg <- function(LDA){ + subset_msg <- paste0(" - data subset ", LDA$data_subset) + topic_word <- ifelse(LDA$topics == 1, " topic", " topics") + topic_msg <- paste0(", ", LDA$topics, topic_word) + rep_msg <- paste0(", replicate ", LDA$rep) + messageq(paste0(subset_msg, topic_msg, rep_msg), LDA$control$quiet) +} + + +#' @rdname LDA #' #' @export #' -select_LDA <- function(LDA_models = NULL, control = list()){ - if("LDA_set" %in% attr(LDA_models, "class") == FALSE){ - stop("LDA_models must be of class LDA_set") - } - control <- do.call("LDA_set_control", control) - measurer <- control$measurer - selector <- control$selector - lda_measured <- vapply(LDA_models, measurer, 0) %>% - matrix(ncol = 1) - lda_selected <- apply(lda_measured, 2, selector) - which_selected <- which(lda_measured %in% lda_selected) - out <- LDA_models[which_selected] - class(out) <- c("LDA_set", "list") +package_LDA <- function(LDAs){ + 
selected_LDAs <- select_LDA(LDAs = LDAs) + out <- list(selected_LDAs = selected_LDAs, LDAs = LDAs) + class(out) <- c("LDA_set", "list") out } -#' @title Package the output from LDA_set -#' -#' @description Name the elements (LDA models) and set the class -#' (\code{LDA_set}) of the models returned by \code{\link{LDA_set}}. -#' -#' @param mods Fitted models returned from \code{\link[topicmodels]{LDA}}. + +#' @rdname LDA #' -#' @param mod_topics Vector of \code{integer} values corresponding to the -#' number of topics in each model. -#' -#' @param mod_seeds Vector of \code{integer} values corresponding to the -#' seed used for each model. -#' -#' @return \code{lis} (class: \code{LDA_set}) of LDA models (class: -#' \code{LDA_VEM}). -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' topics <- 2 -#' nseeds <- 2 -#' control <- LDA_set_control() -#' mod_topics <- rep(topics, each = length(seq(2, nseeds * 2, 2))) -#' iseed <- control$iseed -#' mod_seeds <- rep(seq(iseed, iseed + (nseeds - 1)* 2, 2), length(topics)) -#' nmods <- length(mod_topics) -#' mods <- vector("list", length = nmods) -#' for (i in 1:nmods){ -#' LDA_msg(mod_topics[i], mod_seeds[i], control) -#' control_i <- prep_LDA_control(seed = mod_seeds[i], control = control) -#' mods[[i]] <- topicmodels::LDA(document_term_table, k = mod_topics[i], -#' control = control_i) -#' } -#' package_LDA_set(mods, mod_topics, mod_seeds) -#' } -#' #' @export #' -package_LDA_set <- function(mods, mod_topics, mod_seeds){ - if (!("LDA_VEM" %in% class(mods[[1]]))){ - stop("mods not of class LDA_VEM") +select_LDA <- function(LDAs){ + nLDAs <- length(LDAs) + maxtopics <- 0 + for(i in 1:nLDAs){ + maxtopics <- max(c(maxtopics, LDAs[[i]]$topics)) } - check_topics(mod_topics) - if (!is.numeric(mod_seeds) || any(mod_seeds%% 1 != 0)){ - stop("mod_seeds must be integers") + if(maxtopics == 1){ + return(LDAs) } - names(mods) <- paste0("k: ", mod_topics, ", seed: ", mod_seeds) - 
class(mods) <- c("LDA_set", "list") - mods + vals <- measure_LDA(LDAs = LDAs) + fun <- LDAs[[1]]$control$selector + args <- update_list(LDAs[[1]]$control$selector_args, x = vals) + args[names(args) == ""] <- NULL + selection <- do.call(what = fun, args = args) + LDAs[selection] } -#' @title Create the model-running-message for an LDA +#' @rdname LDA #' -#' @description Produce and print the message for a given LDA model. -#' -#' @param mod_topics \code{integer} value corresponding to the number of -#' topics in the model. -#' -#' @param mod_seeds \code{integer} value corresponding to the seed used for -#' the model. -#' -#' @param control Class \code{LDA_controls} list of control parameters to be -#' used in \code{LDA} (note that "seed" will be overwritten). +#' @export #' -#' @examples -#' LDA_msg(mod_topics = 4, mod_seeds = 2) +measure_LDA <- function(LDAs){ + nLDAs <- length(LDAs) + vals <- rep(NA, nLDAs) + for(i in 1:nLDAs){ + fun <- LDAs[[i]]$control$measurer + args <- LDAs[[i]]$control$measurer_args + args <- update_list(args, object = LDAs[[i]]) + args[names(args) == ""] <- NULL + vals_i <- do.call(what = fun, args = args) + if(length(vals_i) != 0){ + vals[i] <- vals_i + } + } + vals +} + + +#' @rdname LDA #' #' @export #' -LDA_msg <- function(mod_topics, mod_seeds, control = list()){ - control <- do.call("LDA_set_control", control) - check_topics(mod_topics) - check_seeds(mod_seeds) - topic_msg <- paste0("Running LDA with ", mod_topics, " topics ") - seed_msg <- paste0("(seed ", mod_seeds, ")") - messageq(paste0(topic_msg, seed_msg), control$quiet) +LDA_control <- function(model = topicmodels_LDA, + model_args = list(method = "VEM", seeded = TRUE), + measurer = AIC, + measurer_args = list(NULL), + selector = which.min, + selector_args = list(NULL), + nsubsets = 1, + subset_rule = NULL, + soften = TRUE, + quiet = FALSE, ...){ + list(model = model, model_args = model_args, + measurer = measurer, measurer_args = measurer_args, + selector = selector, 
selector_args = selector_args, + nsubsets = nsubsets, subset_rule = subset_rule, + soften = soften, quiet = quiet) } -#' @title Create control list for set of LDA models + + +#' @title Determine the AIC of a Linguistic Decomposition Analysis +#' model #' -#' @description This function provides a simple creation and definition of -#' the list used to control the set of LDA models. It is set up to be easy -#' to work with the existing control capacity of -#' \code{\link[topicmodels]{LDA}}. +#' @description Convenience function to extract and format the AIC +#' of a \code{LDA}-class object fit by \code{\link{LDA_call}}. #' -#' @param quiet \code{logical} indicator of whether the model should run -#' quietly. +#' @param object Class \code{LDA} object to be evaluated. +#' +#' @param ... Not used, simply included to maintain method compatibility. #' -#' @param measurer,selector Function names for use in evaluation of the LDA -#' models. \code{measurer} is used to create a value for each model -#' and \code{selector} operates on the values to choose the model(s) to -#' pass on. +#' @param k Per-parameter numeric penalty. #' -#' @param iseed \code{integer} initial seed for the model set. +#' @return AIC of the model. #' -#' @param ... Additional arguments to be passed to -#' \code{\link[topicmodels]{LDA}} as a \code{control} input. +#' @export +#' +AIC.LDA <- function(object, ..., k = 2){ + lls <- logLik(object) + -2 * as.numeric(lls) + k * attr(lls, "df") +} + + +#' @title Determine the log likelihood of a Linguistic Decomposition Analysis +#' model #' -#' @return \code{list} for controlling the LDA model fit. +#' @description Convenience function to extract and format the log likelihood +#' of a \code{LDA}-class object fit by \code{\link{LDA_call}}. #' -#' @examples -#' LDA_set_control() +#' @param object Class \code{LDA} object to be evaluated. +#' +#' @param ... Not used, simply included to maintain method compatibility. 
+#' +#' @return Log likelihood of the model \code{logLik}, also with \code{df} +#' (degrees of freedom) and \code{nobs} (number of observations) values. #' #' @export #' -LDA_set_control <- function(quiet = FALSE, measurer = AIC, selector = min, - iseed = 2, ...){ - list(quiet = quiet, measurer = measurer, selector = selector, - iseed = iseed, ...) +logLik.LDA <- function(object, ...){ + object$log_likelihood } + diff --git a/R/LDATS.R b/R/LDATS.R index e43f6e96..8bed6ebe 100644 --- a/R/LDATS.R +++ b/R/LDATS.R @@ -1,18 +1,18 @@ #' @importFrom coda as.mcmc autocorr autocorr.diag effectiveSize HPDinterval +#' @importFrom compositions acomp alr alrInv clr clrInv ilr ilrInv #' @importFrom digest digest #' @importFrom extraDistr rcat rdirichlet #' @importFrom graphics abline axis hist mtext par plot points rect text #' @importFrom grDevices devAskNewPage rgb #' @importFrom lubridate is.Date -#' @importFrom magrittr %>% +#' @importFrom mcmc temper #' @importFrom memoise memoise #' @importFrom methods is -#' @importFrom mvtnorm rmvnorm +#' @importFrom mvtnorm rmvnorm dmvnorm #' @importFrom nnet multinom #' @importFrom progress progress_bar -#' @importFrom stats acf AIC as.formula coef ecdf logLik median rgeom rnorm +#' @importFrom stats acf AIC as.formula coef ecdf lm logLik median rgeom rnorm #' runif sd terms var vcov -#' @importFrom topicmodels LDA #' @importFrom viridis viridis #' diff --git a/R/LDA_TS.R b/R/LDA_TS.R index d9c7bb17..3c57e2aa 100644 --- a/R/LDA_TS.R +++ b/R/LDA_TS.R @@ -1,94 +1,182 @@ -#' @title Run a full set of Latent Dirichlet Allocations and Time -#' Series models -#' -#' @description Conduct a complete LDATS analysis (Christensen -#' \emph{et al.} 2018), including running a suite of Latent Dirichlet -#' Allocation (LDA) models (Blei \emph{et al.} 2003, Grun and Hornik 2011) -#' via \code{\link{LDA_set}}, selecting LDA model(s) via -#' \code{\link{select_LDA}}, running a complete set of Bayesian Time Series -#' (TS) models (Western and Kleykamp 
2004) via \code{\link{TS_on_LDA}} on -#' the chosen LDA model(s), and selecting the best TS model via -#' \code{\link{select_TS}}. \cr \cr -#' \code{conform_LDA_TS_data} converts the \code{data} input to -#' match internal and sub-function specifications. \cr \cr -#' \code{check_LDA_TS_inputs} checks that the inputs to -#' \code{LDA_TS} are of proper classes for a full analysis. +#' @title Run a set of Linguistic Decomposition Analysis models coupled to +#' Bayesian Time Series models #' -#' @param data Either a document term table or a list including at least -#' a document term table (with the word "term" in the name of the element) -#' and optionally also a document covariate table (with the word -#' "covariate" in the name of the element). -#' \cr \cr -#' The document term table is a table of observation count data (rows: -#' documents, columns: terms) that may be a \code{matrix} or -#' \code{data.frame}, but must be conformable to a matrix of integers, -#' as verified by \code{\link{check_document_term_table}}. -#' \cr \cr -#' The document covariate table is a table of associated data (rows: -#' documents, columns: time index and covariate options) that may be a -#' \code{matrix} or \code{data.frame}, but must be a conformable to a data -#' table, as verified by \code{\link{check_document_covariate_table}}. Every -#' model needs a covariate to describe the time value for each document -#' (in whatever units and whose name in the table is input in -#' \code{timename}) that dictates the application of the change points. -#' \strong{\emph{If a covariate table is not provided, the model assumes the -#' observations were equi-spaced in time}}. All covariates named within -#' specific models in \code{formulas} must be included. -#' -#' @param topics Vector of the number of topics to evaluate for each model. -#' Must be conformable to \code{integer} values. 
-#' -#' @param nseeds \code{integer} number of seeds (replicate starts) to use for -#' each value of \code{topics} in the LDAs. Must be conformable to -#' \code{integer} value. -#' -#' @param formulas Vector of \code{\link[stats]{formula}}(s) for the -#' continuous (non-change point) component of the time series models. Any -#' predictor variable included in a formula must also be a column in the -#' \code{document_covariate_table}. Each element (formula) in the vector -#' is evaluated for each number of change points and each LDA model. -#' -#' @param nchangepoints Vector of \code{integer}s corresponding to the number -#' of change points to include in the time series models. 0 is a valid input -#' corresponding to no change points (\emph{i.e.}, a singular time series -#' model), and the current implementation can reasonably include up to 6 -#' change points. Each element in the vector is the number of change points +#' @description Analyze compositional time series using the Linguistic +#' Decomposition Analysis coupled to Bayesian Time Series models +#' generally following Christensen \emph{et al.} (2018). +#' \code{LDA_TS} is the primary model function. \cr \cr +#' \code{LDA_TS_control} defines the control \code{list} arguments for +#' \code{\link{LDA_TS}}. \cr \cr +#' \code{package_LDA_TS} combines the results from each model component. 
+#' +#' @details For a (potentially subset) dataset consisting of counts of words +#' across multiple documents in a corpus, +#' \enumerate{ +#' \item Conduct multiple Linguistic Decomposition Analysis (LDA) models +#' (e.g., Latent Dirichlet Allocation using the Variational Expectation +#' Maximization (VEM) algorithm; Blei \emph{et al.} 2003), +#' \item Select from the LDA model results to pick those used in the Time +#' Series (TS) models, +#' \item Conduct multiple compositional Bayesian TS models +#' (e.g., changepoint softmax regression; Ripley 1996, Venables +#' and Ripley 2002, Western and Kleykamp 2004, Bishop 2006, Ruggieri +#' 2013) via a generalized linear modeling approach (McCullagh and +#' Nelder 1989) and using parallel tempering Markov Chain Monte Carlo +#' (ptMCMC) methods (Earl and Deem 2005), +#' \item Select from the TS model results to pick those used to summarize +#' the whole model, and +#' \item Package the results. +#' } +#' +#' @param formulas Vector of \code{\link[stats]{formula}}(s) defining the +#' regression between the change points. Any predictor variable included +#' must also be a column in \code{data} and any (compositional) response +#' variable must be a set of columns in \code{data}. \cr +#' Each element (formula) in the vector is evaluated for each number of +#' change points and each LDA model. \cr +#' (See \code{\link{TS}}.) +#' +#' @param nchangepoints \code{integer}-conformable vector corresponding to the +#' number of change points to include in the models. 0 is valid (corresponds +#' to no change points, so a singular time series model) and the current +#' implementation can reasonably include up to 6 change points. The +#' number of change points is used to dictate the segmentation of the +#' time series into chunks fit with separate models dictated by +#' \code{formula}. 
\cr +#' Each element in the vector is the number of change points #' used to segment the data for each formula (entry in \code{formulas}) -#' component of the TS model, for each selected LDA model. -#' +#' component of the TS model, for each selected LDA model. \cr +#' (See \code{\link{TS}}.) +#' #' @param timename \code{character} element indicating the time variable #' used in the time series. Defaults to \code{"time"}. The variable must be #' integer-conformable or a \code{Date}. If the variable named #' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param weights Optional input for overriding standard weighting for -#' documents in the time series. Defaults to \code{TRUE}, -#' translating to an appropriate weighting of the documents -#' based on the size (number of words) each document (the result of -#' \code{\link[topicmodels]{LDA}} is a matrix of proportions, which does not -#' account for size differences among documents. Alternatively can be -#' \code{NULL} for an equal weighting among documents or a \code{numeric} -#' vector. -#' -#' @param control A \code{list} of parameters to control the running and -#' selecting of LDA and TS models. Values not input assume default values -#' set by \code{\link{LDA_TS_control}}. -#' -#' @param quiet \code{logical} indicator for \code{conform_LDA_TS_data} to -#' indicate if messages should be printed. +#' timestep being 1 day, which is often not desired behavior. \cr +#' (See \code{\link{TS}}.) +#' +#' @param weights Optional class \code{numeric} vector of weights for each +#' document. Defaults to \code{NULL}, translating to an equal weight for +#' each document. 
When using \code{\link{sequential_TS}} in a standard LDATS +#' analysis, it is advisable to weight the documents by their total size, +#' as the result of, e.g., \code{\link[topicmodels]{LDA}} is a matrix of +#' proportions, which does not account for size differences among documents. +#' For most models, a scaling of the weights (so that the average is 1) is +#' most appropriate, and this is accomplished using \code{document_weights}. +#' \cr +#' (See \code{\link{TS}}.) +#' +#' @param control \code{list} of parameters to control the fitting of the +#' LDATS model. Values not input assume defaults set by +#' \code{\link{LDA_TS_control}}. +#' +#' @param data Any of the data structures allowable for LDATS analyses: +#' \code{matrix} or \code{data.frame} document term table, +#' \code{list} of document term and covariate tables, a \code{list} of +#' training and test sets of the two tables, or a \code{list} of multiple +#' replicate splits of training and test sets of the two tables. \cr +#' See \code{\link{conform_data}}, which is used to ensure data structure +#' validity for the desired model. +#' +#' @param topics \code{integer}-conformable \code{vector} of the number of +#' topics to evaluate for each model. \cr +#' (See \code{\link{LDA}}.) +#' +#' @param replicates \code{integer}-conformable number of replicate starts to +#' use for each value of \code{topics}. \cr +#' (See \code{\link{LDA}}.) +#' +#' @param LDAs \code{LDA_set} \code{list} of selected and all LDAs from +#' \code{\link{LDA}}. +#' +#' @param TSs \code{TS_set} \code{list} of selected and all TSs from +#' \code{\link{TS}}. +#' +#' @param LDA_model Main LDA \code{function}. +#' +#' @param LDA_model_args \code{list} of (named) arguments to be used in +#' \code{LDA_model} via \code{\link{LDA_call}}. +#' +#' @param TS_model Main TS \code{function}. +#' +#' @param TS_model_args \code{list} of (named) arguments to be used in +#' \code{TS_model}. 
+#' +#' @param TS_response \code{function} used to model the compositional +#' response. +#' +#' @param TS_response_args \code{list} of (named) arguments to be used in +#' \code{TS_response} via \code{\link{do.call}}. +#' \cr \cr +#' Could be managed via a \code{_TS_control} function like +#' \code{\link{multinom_TS_control}}. +#' +#' @param TS_method \code{function} used to drive the sampler of the TS +#' models; \code{TS_method} defines and operates the computational +#' procedure. \cr \cr +#' Current pre-built options include \code{\link{ldats_classic}}. +#' +#' @param TS_method_args \code{list} of (named) arguments to be used in +#' \code{TS_method} via \code{\link{do.call}}. +#' \cr \cr +#' Could be managed via a \code{_control} function like +#' \code{\link{ldats_classic_control}}. +#' +#' @param summary_prob Probability used for summarizing the posterior +#' distributions (via the highest posterior density interval, see +#' \code{\link[coda]{HPDinterval}}). +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). +#' +#' @param soften \code{logical} indicator of whether the model should error +#' softly or if errors should trigger a full-stop to the pipeline. +#' +#' @param TS_measurer \code{function} used in evaluation of the TS +#' models; \code{measurer} creates a value for each model. +#' +#' @param TS_measurer_args \code{list} of (named) arguments to be used in +#' \code{TS_measurer} via \code{\link{do.call}}. +#' +#' @param TS_selector \code{function} used in evaluation of the TS +#' models; \code{TS_selector} operates on the values to choose the models. +#' +#' @param TS_selector_args \code{list} of (named) arguments to be used in +#' \code{TS_selector} via \code{\link{do.call}}. +#' +#' @param LDA_measurer \code{function} used in evaluation of the LDA +#' models; \code{LDA_measurer} creates a value for each model. 
+#' +#' @param LDA_measurer_args \code{list} of (named) arguments to be used in +#' \code{LDA_measurer} via \code{\link{do.call}}. +#' +#' @param LDA_selector \code{function} used in evaluation of the LDA +#' models; \code{LDA_selector} operates on the values to choose the models. +#' +#' @param LDA_selector_args \code{list} of (named) arguments to be used in +#' \code{LDA_selector} via \code{\link{do.call}}. +#' +#' @param ... Not passed along to the output, rather included to allow for +#' automated removal of unneeded controls. +#' +#' @param nsubsets Number of data subsets. +#' +#' @param subset_rule \code{function} used to subset the data. #' #' @return -#' \code{LDA_TS}: a class \code{LDA_TS} list object including all -#' fitted LDA and TS models and selected models specifically as elements -#' \code{"LDA models"} (from \code{\link{LDA_set}}), -#' \code{"Selected LDA model"} (from \code{\link{select_LDA}}), -#' \code{"TS models"} (from \code{\link{TS_on_LDA}}), and -#' \code{"Selected TS model"} (from \code{\link{select_TS}}). \cr \cr -#' \code{conform_LDA_TS_data}: a data \code{list} that is ready for analyses -#' using the stage-specific functions. \cr \cr -#' \code{check_LDA_TS_inputs}: an error message is thrown if any input is -#' improper, otherwise \code{NULL}. +#' \code{LDA_TS},\code{package_LDA_TS}: class-\code{LDA_TS} \code{list} +#' with all fitted LDA and TS models and selected models specifically +#' as elements named +#' \describe{ +#' \item{\code{LDA models}}{\code{list} of all and selected models as +#' well as controls from \code{\link{LDA}}} +#' \item{\code{TS models}}{\code{list} of all and selected models as +#' well as controls from \code{\link{TS}}} +#' \item{\code{control}}{\code{list} of overall model controls} +#' } \cr \cr +#' \code{LDA_TS_control}: \code{list} of \code{list}s and single elements +#' that control fitting of the LDATS model, with named elements +#' corresponding to the arguments. #' #' @references #' Blei, D. M., A. 
Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet @@ -96,293 +184,124 @@ #' \strong{3}:993-1022. #' \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. #' +#' Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. +#' Springer, New York, NY, USA. +#' #' Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018. #' Long-term community change through multiple rapid transitions in a #' desert rodent community. \emph{Ecology} \strong{99}:1523-1529. #' \href{https://doi.org/10.1002/ecy.2373}{link}. #' +#' Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory, +#' applications, and new perspectives. \emph{Physical Chemistry Chemical +#' Physics} \strong{7}: 3910-3916. +#' \href{https://doi.org/10.1039/B509983H}{link}. +#' #' Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic #' Models. \emph{Journal of Statistical Software} \strong{40}:13. #' \href{https://www.jstatsoft.org/article/view/v040i13}{link}. #' +#' McCullagh, P. and J. A. Nelder. 1989. \emph{Generalized Linear Models}. +#' 2nd Edition. Chapman and Hall, New York, NY, USA. +#' +#' Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. +#' Cambridge University Press, Cambridge, UK. +#' +#' Ruggieri, E. 2013. A Bayesian approach to detecting change points in +#' climatic records. \emph{International Journal of Climatology} +#' \strong{33}:520-528. +#' \href{https://doi.org/10.1002/joc.3447}{link}. +#' +#' Venables, W. N. and B. D. Ripley. 2002. \emph{Modern Applied +#' Statistics with S}. Fourth Edition. Springer, New York, NY, USA. +#' #' Western, B. and M. Kleykamp. 2004. A Bayesian change point model for #' historical time series analysis. \emph{Political Analysis} #' \strong{12}:354-374. #' \href{https://doi.org/10.1093/pan/mph023}{link}. 
#' -#' @examples -#' data(rodents) -#' \donttest{ -#' mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, -#' nchangepoints = 1, timename = "newmoon") -#' } -#' conform_LDA_TS_data(rodents) -#' check_LDA_TS_inputs(rodents, timename = "newmoon") +#' @name LDA_TS +#' + +#' @rdname LDA_TS #' #' @export #' -LDA_TS <- function(data, topics = 2, nseeds = 1, formulas = ~ 1, +LDA_TS <- function(data, topics = 2, replicates = 1, formulas = ~ 1, nchangepoints = 0, timename = "time", weights = TRUE, control = list()){ - check_LDA_TS_inputs(data, topics, nseeds, formulas, nchangepoints, - timename, weights, control) control <- do.call("LDA_TS_control", control) - data <- conform_LDA_TS_data(data, control$quiet) - dtt <- data$document_term_table - dct <- data$document_covariate_table - weights <- iftrue(weights, document_weights(dtt)) - messageq("----Latent Dirichlet Allocation----", control$quiet) - LDAs <- LDA_set(dtt, topics, nseeds, control$LDA_set_control) - sel_LDA <- select_LDA(LDAs, control$LDA_set_control) - messageq("----Time Series Models----", control$quiet) - TSs <- TS_on_LDA(sel_LDA, dct, formulas, nchangepoints, timename, weights, - control$TS_control) - sel_TSs <- select_TS(TSs, control$TS_control) - package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs) + LDAs <- LDA(data = data, topics = topics, replicates = replicates, + control = control$LDA_control) + TSs <- TS(LDAs = LDAs, formulas = formulas, nchangepoints = nchangepoints, + timename = timename, weights = weights, + control = control$TS_control) + package_LDA_TS(LDAs = LDAs, TSs = TSs, control = control) } #' @rdname LDA_TS #' #' @export #' -conform_LDA_TS_data <- function(data, quiet = FALSE){ - if(inherits(data, "data.frame") | inherits(data, "matrix")){ - msg <- "covariate table not provided, assuming equi-spaced data" - messageq(msg, quiet) - nobs <- nrow(data) - covariate <- data.frame(time = 1:nobs) - data <- list(document_term_table = data, - document_covariate_table = covariate) - } else 
if(inherits(data, "list")){ - which_term <- grep("term", names(data), ignore.case = TRUE) - which_covariate <- grep("covariate", names(data), ignore.case = TRUE) - if(length(which_term) != 1){ - stop("one, and only one, element in `data` can include `term`") - } - if (length(which_covariate) == 0){ - msg <- "covariate table not provided, assuming equi-spaced data" - messageq(msg, quiet) - nobs <- nrow(data[[which_term]]) - covariate <- data.frame(time = 1:nobs) - data$document_covariate_table <- covariate - } else if(length(which_covariate) > 1){ - stop("at most one element in `data` can include `covariate`") - } - names(data)[which_term] <- "document_term_table" - names(data)[which_covariate] <- "document_covariate_table" - } else{ - stop("data must be a data.frame, matrix, or list") - } - data +package_LDA_TS <- function(LDAs, TSs, control){ + out <- list("LDA models" = LDAs, "TS models" = TSs, control = control) + class(out) <- c("LDA_TS", "list") + out } #' @rdname LDA_TS -#' -#' @export -#' -check_LDA_TS_inputs <- function(data = NULL, - topics = 2, nseeds = 1, formulas = ~ 1, - nchangepoints = 0, - timename = "time", - weights = TRUE, - control = list()){ - check_control(control) - control <- do.call("LDA_TS_control", control) - data <- conform_LDA_TS_data(data) - weights <- iftrue(weights, document_weights(data$document_term_table)) - check_document_covariate_table(data$document_covariate_table, - document_term_table = data$document_term_table) - check_timename(data$document_covariate_table, timename) - check_formulas(formulas, data$document_covariate_table, control$TS_control) - check_nchangepoints(nchangepoints) - check_weights(weights) - check_document_term_table(data$document_term_table) - check_topics(topics) - check_seeds(nseeds) -} - -#' @title Print the selected LDA and TS models of LDA_TS object -#' -#' @description Convenience function to print only the selected elements of a -#' \code{LDA_TS}-class object returned by \code{\link{LDA_TS}} -#' -#' 
@param x Class \code{LDA_TS} object to be printed. -#' -#' @param ... Not used, simply included to maintain method compatibility. -#' -#' @return The selected models in \code{x} as a two-element \code{list} with -#' the TS component only returning the non-hidden components. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, -#' nchangepoints = 1, timename = "newmoon") -#' print(mod) -#' } #' #' @export #' -print.LDA_TS <- function(x, ...){ - print(x[["Selected LDA model"]]) - print(x[["Selected TS model"]]) - list(LDA = x[["Selected LDA model"]], TS = x[["Selected TS model"]]) -} +LDA_TS_control <- function(LDA_model = topicmodels_LDA, + LDA_model_args = + list(method = "VEM", seeded = TRUE), + LDA_measurer = AIC, + LDA_measurer_args = list(NULL), + LDA_selector = which.min, + LDA_selector_args = list(NULL), + TS_model = sequential_TS, + TS_model_args = + list(control = sequential_TS_control()), + TS_response = multinom_TS, + TS_response_args = + list(control = multinom_TS_control()), + TS_method = ldats_classic, + TS_method_args = + list(control = ldats_classic_control()), + TS_measurer = AIC, + TS_measurer_args = list(NULL), + TS_selector = which.min, + TS_selector_args = list(NULL), + summary_prob = 0.95, + nsubsets = 1, + subset_rule = NULL, + soften = TRUE, + quiet = FALSE, ...){ -#' @title Package the output of LDA_TS -#' -#' @description Combine the objects returned by \code{\link{LDA_set}}, -#' \code{\link{select_LDA}}, \code{\link{TS_on_LDA}}, and -#' \code{\link{select_TS}}, name them as elements of the list, and -#' set the class of the list as \code{LDA_TS}, for the return from -#' \code{\link{LDA_TS}}. -#' -#' @param LDAs List (class: \code{LDA_set}) of LDA models (class: -#' \code{LDA}), as returned by \code{\link{LDA_set}}. -#' -#' @param sel_LDA A reduced version of \code{LDAs} that only includes the -#' LDA model(s) selected by \code{\link{select_LDA}}. 
Still should be of -#' class \code{LDA_set}. -#' -#' @param TSs Class \code{TS_on_LDA} list of results from \code{\link{TS}} -#' applied for each model on each LDA model input, as returned by -#' \code{\link{TS_on_LDA}}. -#' -#' @param sel_TSs A reduced version of \code{TSs} (of class \code{TS_fit}) -#' that only includes the TS model chosen via \code{\link{select_TS}}. -#' -#' @return Class \code{LDA_TS}-class object including all fitted models and -#' selected models specifically, ready to be returned from -#' \code{\link{LDA_TS}}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' data <- rodents -#' control <- LDA_TS_control() -#' dtt <- data$document_term_table -#' dct <- data$document_covariate_table -#' weights <- document_weights(dtt) -#' LDAs <- LDA_set(dtt, 2, 1, control$LDA_set_control) -#' sel_LDA <- select_LDA(LDAs, control$LDA_set_control) -#' TSs <- TS_on_LDA(sel_LDA, dct, ~1, 1, "newmoon", weights, -#' control$TS_control) -#' sel_TSs <- select_TS(TSs, control$TS_control) -#' package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs) -#' } -#' -#' @export -#' -package_LDA_TS <- function(LDAs, sel_LDA, TSs, sel_TSs){ - if (!("LDA_set" %in% class(LDAs))){ - stop("LDAs not of class LDA_set") - } - if (!("LDA_set" %in% class(sel_LDA))){ - stop("sel_LDA not of class LDA_set") - } - if (!("TS_on_LDA" %in% class(TSs))){ - stop("TSs not of class TS_on_LDA") - } - if (!("TS_fit" %in% class(sel_TSs))){ - stop("sel_TS not of class TS_fit") - } + LDA_control <- LDA_control(model = LDA_model, model_args = LDA_model_args, + measurer = LDA_measurer, + measurer_args = LDA_measurer_args, + selector = LDA_selector, + selector_args = LDA_selector_args, + nsubsets = nsubsets, subset_rule = subset_rule, + soften = soften, quiet = quiet) + TS_control <- TS_control(model = TS_model, + model_args = TS_model_args, + response = TS_response, + response_args = TS_response_args, + method = TS_method, + method_args = TS_method_args, + measurer = TS_measurer, + measurer_args = 
TS_measurer_args, + selector = TS_selector, + selector_args = TS_selector_args, + summary_prob = summary_prob, + soften = soften, quiet = quiet) - out <- list("LDA models" = LDAs, "Selected LDA model" = sel_LDA, - "TS models" = TSs, "Selected TS model" = sel_TSs) - class(out) <- c("LDA_TS", "list") - out + list(LDA_control = LDA_control, TS_control = TS_control, + nsubsets = nsubsets, subset_rule = subset_rule, + soften = soften, quiet = quiet) } -#' @title Create the controls list for the LDATS model -#' -#' @description Create and define a list of control options used to run the -#' LDATS model, as implemented by \code{\link{LDA_TS}}. -#' -#' @param quiet \code{logical} indicator of whether the model should run -#' quietly. -#' -#' @param measurer_LDA,selector_LDA Function names for use in evaluation of -#' the LDA models. \code{measurer_LDA} is used to create a value for each -#' model and \code{selector_LDA} operates on the values to choose the model. -#' -#' @param iseed \code{integer} initial seed for the LDA model set. -#' -#' @param ... Additional arguments to be passed to -#' \code{\link[topicmodels]{LDA}} as a \code{control} input. -#' -#' @param memoise \code{logical} indicator of whether the multinomial -#' functions should be memoised (via \code{\link[memoise]{memoise}}). -#' Memoisation happens to both \code{\link{multinom_TS}} and -#' \code{\link{multinom_TS_chunk}}. -#' -#' @param response \code{character} element indicating the response variable -#' used in the time series. Should be set to \code{"gamma"} for LDATS. -#' -#' @param lambda \code{numeric} "weight" decay term used to set the prior -#' on the regressors within each chunk-level model. Defaults to 0, -#' corresponding to a fully vague prior. -#' -#' @param measurer_TS,selector_TS Function names for use in evaluation of the -#' TS models. \code{measurer_TS} is used to create a value for each model -#' and \code{selector_TS} operates on the values to choose the model. 
-#' -#' @param ntemps \code{integer} number of temperatures (chains) to use in the -#' ptMCMC algorithm. -#' -#' @param penultimate_temp Penultimate temperature in the ptMCMC sequence. -#' -#' @param ultimate_temp Ultimate temperature in the ptMCMC sequence. -#' -#' @param q Exponent controlling the ptMCMC temperature sequence from the -#' focal chain (reference with temperature = 1) to the penultimate chain. 0 -#' (default) implies a geometric sequence. 1 implies squaring before -#' exponentiating. -#' -#' @param nit \code{integer} number of iterations (steps) used in the ptMCMC -#' algorithm. -#' -#' @param magnitude Average magnitude (defining a geometric distribution) -#' for the proposed step size in the ptMCMC algorithm. -#' -#' @param burnin \code{integer} number of iterations to remove from the -#' beginning of the ptMCMC algorithm. -#' -#' @param thin_frac Fraction of iterations to retain, from the ptMCMC. Must be -#' \eqn{(0, 1]}, and the default value of 1 represents no thinning. -#' -#' @param summary_prob Probability used for summarizing the posterior -#' distributions (via the highest posterior density interval, see -#' \code{\link[coda]{HPDinterval}}) of the TS model. -#' -#' @param seed Input to \code{set.seed} in the time series model for -#' replication purposes. -#' -#' @return \code{list} of control \code{lists}, with named elements -#' \code{LDAcontrol}, \code{TScontrol}, and \code{quiet}. -#' -#' @examples -#' LDA_TS_control() -#' -#' @export -#' -LDA_TS_control <- function(quiet = FALSE, measurer_LDA = AIC, - selector_LDA = min, iseed = 2, - memoise = TRUE, response = "gamma", lambda = 0, - measurer_TS = AIC, selector_TS = min, ntemps = 6, - penultimate_temp = 2^6, ultimate_temp = 1e10, - q = 0, nit = 1e4, magnitude = 12, burnin = 0, - thin_frac = 1, summary_prob = 0.95, - seed = NULL, ...){ - LDAcontrol <- LDA_set_control(quiet = quiet, measurer = measurer_LDA, - selector = selector_LDA, iseed = iseed, ...) 
- TScontrol <- TS_control(memoise = memoise, response = response, - lambda = lambda, measurer = measurer_TS, - selector = selector_TS, ntemps = ntemps, - penultimate_temp = penultimate_temp, - ultimate_temp = ultimate_temp, q = q, - nit = nit, magnitude = magnitude, quiet = quiet, - burnin = burnin, thin_frac = thin_frac, - summary_prob = summary_prob, seed = seed) - list(LDA_set_control = LDAcontrol, TS_control = TScontrol, quiet = quiet) -} diff --git a/R/LDA_TS_plots.R b/R/LDA_TS_plots.R index e0726fe8..76955cbb 100644 --- a/R/LDA_TS_plots.R +++ b/R/LDA_TS_plots.R @@ -1,8 +1,16 @@ #' @title Plot the key results from a full LDATS analysis #' -#' @description Generalization of the \code{\link[graphics]{plot}} function to -#' work on fitted LDA_TS model objects (class \code{LDA_TS}) returned by -#' \code{\link{LDA_TS}}). +#' @description +#' \code{plot.LDA_TS} generalizes the \code{\link[graphics]{plot}} function +#' to work on fitted LDA_TS model objects (class \code{LDA_TS}) returned +#' by \code{\link{LDA_TS}}). \cr \cr +#' \code{set_LDA_TS_plot_colors} produces the options for the colors +#' controlling the panels of the LDATS summary plots, needed because +#' the change point histogram panel should be in a different color scheme +#' than the LDA and fitted time series model panels, which should be +#' in a matching color scheme. See \code{\link{set_LDA_plot_colors}}, +#' \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, +#' and \code{\link{set_rho_hist_colors}} for specific details on usage. #' #' @param x A \code{LDA_TS} object of a full LDATS model fit by #' \code{\link{LDA_TS}}. @@ -25,9 +33,26 @@ #' summary plot. Currently only defined for \code{"median"} and #' \code{"mode"}. #' +#' @param rho_cols,gamma_cols Colors to be used in the specific plot. Any +#' valid color values (\emph{e.g.}, see +#' \code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be +#' input as with a standard plot. 
The default (\code{NULL}) triggers use +#' of \code{\link[viridis]{viridis}} color options (see +#' \code{rho_option},\code{gamma_option}). +#' +#' @param rho_option,gamma_option A \code{character} string indicating +#' the color option from \code{\link[viridis]{viridis}} to use if +#' "cols == NULL". Four options are available: "magma" (or "A"), +#' "inferno" (or "B"), "plasma" (or "C"), "viridis" (or "D", the default +#' option) and "cividis" (or "E"). +#' +#' @param rho_alpha,gamma_alpha Numeric value [0,1] that indicates the +#' transparency of the colors used. Supported only on some devices, see +#' \code{\link[grDevices]{rgb}}. +#' #' @param cols \code{list} of elements used to define the colors for the two #' panels of the summary plot, as generated simply using -#' \code{\link{set_LDA_TS_plot_cols}}. \code{cols} has two elements: +#' \code{\link{set_LDA_TS_plot_colors}}. \code{cols} has two elements: #' \code{LDA} and \code{TS}, each corresponding the set of plots for #' its stage in the full model. \code{LDA} contains entries \code{cols} #' and \code{option} (see \code{\link{set_LDA_plot_colors}}). \code{TS} @@ -37,97 +62,47 @@ #' \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, #' and \code{\link{set_rho_hist_colors}}). #' -#' @return \code{NULL}. +#' @return +#' \code{plot.LDA_TS}: \code{NULL}. \cr \cr +#' \code{set_LDA_TS_plot_colors}: \code{list} of elements used to define +#' the colors for the summary plots, which has two +#' elements: \code{LDA} and \code{TS}, each corresponding the set of +#' plots for its stage in the full model. \code{LDA} contains entries +#' \code{cols} and \code{options} (see \code{\link{set_LDA_plot_colors}}). 
+#' \code{TS} contains two entries, \code{rho} and \code{gamma}, each +#' corresponding to the related panel, and each containing default values +#' for entries named \code{cols}, \code{option}, and \code{alpha} (see +#' \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, +#' and \code{\link{set_rho_hist_colors}}). +#' +#' @name plot.LDA_TS #' -#' @examples -#' \donttest{ -#' data(rodents) -#' mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, -#' nchangepoints = 1, timename = "newmoon") -#' plot(mod, binwidth = 5, xlab = "New moon") -#' } + +#' @rdname plot.LDA_TS #' #' @export #' -plot.LDA_TS <- function(x, ..., - cols = set_LDA_TS_plot_cols(), +plot.LDA_TS <- function(x, ..., cols = set_LDA_TS_plot_colors(), bin_width = 1, xname = NULL, border = NA, selection = "median"){ oldpar <- par(no.readonly = TRUE) on.exit(par(oldpar)) - tname <- x$"Selected TS model"$timename - tvar <- x$"Selected TS model"$data[ , tname] - plot.LDA_set(x$"Selected LDA model", xtime = tvar, xname = NULL, + tname <- x$"TS models"$"selected_TSs"[[1]]$timename + tvar <- x$"TS models"$"selected_TSs"[[1]]$data$train$ts_data[ , tname] + plot.LDA_set(x$"LDA models", xtime = tvar, xname = NULL, cols = cols$LDA$cols, option = cols$LDA$option, LDATS = TRUE) - plot.TS_fit(x$"Selected TS model", plot_type = "summary", cols = cols$TS, + plot.TS_set(x$"TS models", plot_type = "summary", cols = cols$TS, bin_width = bin_width, xname = xname, border = border, selection = selection, LDATS = TRUE) } - - -#' @title Create the list of colors for the LDATS summary plot -#' -#' @description A default list generator function that produces the options -#' for the colors controlling the panels of the LDATS summary plots, needed -#' because the change point histogram panel should be in a different color -#' scheme than the LDA and fitted time series model panels, which should be -#' in a matching color scheme. 
See \code{\link{set_LDA_plot_colors}}, -#' \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, -#' and \code{\link{set_rho_hist_colors}} for specific details on usage. -#' -#' @param rho_cols Colors to be used to plot the histograms of change points. -#' Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -#' \code{\link[grDevices]{rgb}}) can be input as with a standard plot. -#' The default (\code{rho_cols = NULL}) triggers use of -#' \code{\link[viridis]{viridis}} color options (see \code{rho_option}). -#' -#' @param rho_option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if `rho_cols == NULL`. Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C"), "viridis" (or "D", the default option) and "cividis" (or "E"). -#' -#' @param rho_alpha Numeric value [0,1] that indicates the transparency of the -#' colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @param gamma_cols Colors to be used to plot the LDA topic proportions, -#' time series of observed topic proportions, and time series of fitted -#' topic proportions. Any valid color values (\emph{e.g.}, see -#' \code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be -#' input as with a standard plot. The default (\code{gamma_cols = NULL}) -#' triggers use of \code{\link[viridis]{viridis}} color options (see -#' \code{gamma_option}). -#' -#' @param gamma_option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if gamma_cols == NULL`. Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C", the default option), "viridis" (or "D") and "cividis" (or "E"). -#' -#' @param gamma_alpha Numeric value [0,1] that indicates the transparency of -#' the colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. 
-#' -#' @return \code{list} of elements used to define the colors for the two -#' panels of the summary plot, as generated simply using -#' \code{\link{set_LDA_TS_plot_cols}}. \code{cols} has two elements: -#' \code{LDA} and \code{TS}, each corresponding the set of plots for -#' its stage in the full model. \code{LDA} contains entries \code{cols} -#' and \code{options} (see \code{\link{set_LDA_plot_colors}}). \code{TS} -#' contains two entries, \code{rho} and \code{gamma}, each corresponding -#' to the related panel, and each containing default values for entries -#' named \code{cols}, \code{option}, and \code{alpha} (see -#' \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, -#' and \code{\link{set_rho_hist_colors}}). -#' -#' @examples -#' set_LDA_TS_plot_cols() +#' @rdname plot.LDA_TS #' #' @export #' -set_LDA_TS_plot_cols <- function(rho_cols = NULL, rho_option = "D", - rho_alpha = 0.4, gamma_cols = NULL, - gamma_option = "C", gamma_alpha = 0.8){ +set_LDA_TS_plot_colors <- function(rho_cols = NULL, rho_option = "D", + rho_alpha = 0.4, gamma_cols = NULL, + gamma_option = "C", gamma_alpha = 0.8){ list( LDA = list(cols = gamma_cols, option = gamma_option, alpha = gamma_alpha), TS = set_TS_summary_plot_cols(rho_cols = rho_cols, @@ -137,4 +112,4 @@ set_LDA_TS_plot_cols <- function(rho_cols = NULL, rho_option = "D", gamma_option = gamma_option, gamma_alpha = gamma_alpha) ) -} +} \ No newline at end of file diff --git a/R/LDA_models.R b/R/LDA_models.R new file mode 100644 index 00000000..f967131f --- /dev/null +++ b/R/LDA_models.R @@ -0,0 +1,131 @@ + +#' @title Latent Dirichlet Allocation Linguistic Decomposition Analysis +#' as conducted via the topicmodels package +#' +#' @description Fit the standard LDATS LDA model (a true Latent Dirichlet +#' Allocation) using \code{\link[topicmodels]{LDA}} (Grun and Hornik 2011). 
+#' Default methodology is the Variational Expectation Maximization routine
+#' (VEM) as described by Blei \emph{et al.} (2003) and implemented by
+#' Grun and Hornik (2011). \cr \cr
+#' If the model is defined to only fit one topic, \code{\link{identity_LDA}}
+#' is used by default.
+#'
+#' @param LDA A prepared (via \code{\link{prepare_LDA}}) LDA model
+#' \code{list}.
+#'
+#' @param ... Additional arguments to be passed to
+#' \code{\link[topicmodels]{LDA}} as a \code{control} input.
+#'
+#' @param seeded \code{logical} indicator of if the LDA should be a seeded
+#' replicate.
+#'
+#' @param method Fitting routine used in \code{\link[topicmodels]{LDA}}.
+#' Currently, only \code{"VEM"} and \code{"Gibbs"} are supported.
+#'
+#' @return \code{LDA} \code{list} with components
+#' \describe{
+#' \item{alpha}{parameter estimate.}
+#' \item{beta}{parameter estimate.}
+#' \item{terms}{\code{character} \code{vector} of term names.}
+#' \item{document_topic_matrix}{estimated latent topic compositions.}
+#' \item{test_document_topic_matrix}{estimated latent topic compositions
+#' of the test data (not presently available for usage).}
+#' \item{log_likelihood}{model log likelihood.}
+#' \item{data}{data object used to fit the LDA model.}
+#' \item{data_subset}{number of the data subset from the whole data set.}
+#' \item{topics}{\code{integer} number of topics in the model.}
+#' \item{replicate}{\code{integer} replicate number.}
+#' \item{control}{\code{list} of controls used to fit the model. See
+#' \code{\link{LDA_control}}.}
+#' }
+#'
+#' @references
+#' Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet
+#' Allocation. \emph{Journal of Machine Learning Research}
+#' \strong{3}:993-1022.
+#' \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}.
+#'
+#' Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic
+#' Models. \emph{Journal of Statistical Software} \strong{40}:13.
+#' \href{https://www.jstatsoft.org/article/view/v040i13}{link}.
+#'
+#' @export
+#'
+topicmodels_LDA <- function(LDA, method = "VEM", seeded = TRUE, ...){
+  data <- LDA$data
+  topics <- LDA$topics
+  rep <- LDA$rep
+  data_subset <- LDA$data_subset
+  if(topics == 1){
+    identity_LDA(LDA)
+  } else{
+    fun_control <- list(...)
+    if(seeded){
+      fun_control <- update_list(fun_control, seed = rep * 2)
+    }
+    mod <- topicmodels::LDA(x = data$train$document_term_table, k = topics,
+                            method = method, control = fun_control)
+    mod_ll <- sum(mod@loglikelihood)
+    alpha <- tryCatch(as.integer(mod@control@estimate.alpha),
+                      error = function(x){0})
+    df <- alpha + length(mod@beta)
+    attr(mod_ll, "df") <- df
+    attr(mod_ll, "nobs") <- mod@Dim[1] * mod@Dim[2]
+    class(mod_ll) <- "logLik"
+    out <- update_list(LDA, params = list(alpha = mod@alpha, beta = mod@beta),
+                       document_topic_table = mod@gamma,
+                       terms = mod@terms,
+                       test_document_topic_matrix = NULL, #not available
+                       log_likelihood = mod_ll)
+    class(out) <- c("LDA", "list")
+    out
+  }
+}
+
+#' @title Identity Linguistic Decomposition Analysis
+#'
+#' @description This function acts as an "identity" model, wherein the
+#' output is functionally the input. This allows for "single-topic" models
+#' that do not actually decompose the data to be included in the model set.
+#'
+#' @param LDA A prepared (via \code{\link{prepare_LDA}}) LDA model
+#' \code{list}.
+#'
+#' @return \code{LDA} \code{list} with components (many of which are
+#' placeholders):
+#' \describe{
+#' \item{alpha}{parameter estimate.}
+#' \item{beta}{parameter estimate.}
+#' \item{terms}{\code{character} \code{vector} of term names.}
+#' \item{document_topic_matrix}{estimated latent topic compositions.}
+#' \item{test_document_topic_matrix}{estimated latent topic compositions
+#' of the test data (not presently available for usage).}
+#' \item{log_likelihood}{model log likelihood.}
+#' \item{data}{data object used to fit the LDA model.}
+#' \item{data_subset}{number of the data subset from the whole data set.}
+#' \item{topics}{\code{integer} number of topics in the model.}
+#' \item{replicate}{\code{integer} replicate number.}
+#' \item{control}{\code{list} of controls used to fit the model. See
+#' \code{\link{LDA_control}}.}
+#' }
+#'
+#' @export
+#'
+identity_LDA <- function(LDA){
+  data <- LDA$data
+  rep <- LDA$rep
+  data_subset <- LDA$data_subset
+  document_topic_table <- matrix(1, ncol = 1,
+                                 nrow = NROW(data$train$document_term_table))
+  beta <- apply(data$train$document_term_table, 2, sum)
+  beta <- log(beta/sum(beta))
+  beta <- matrix(beta, nrow = 1)
+  out <- update_list(LDA, params = list(alpha = 1, beta = beta),
+                     document_topic_table = document_topic_table,
+                     log_likelihood = NULL, data = data,
+                     topics = 1, rep = rep, data_subset = data_subset,
+                     terms = colnames(data$train$document_term_table),
+                     test_document_topic_matrix = NULL) #not available
+  class(out) <- c("LDA", "list")
+  out
+}
diff --git a/R/LDA_plots.R b/R/LDA_plots.R
index ba3f7462..4ba463dc 100644
--- a/R/LDA_plots.R
+++ b/R/LDA_plots.R
@@ -1,46 +1,20 @@
-#' @title Plot a set of LDATS LDA models
-#'
-#' @description Generalization of the \code{\link[graphics]{plot}} function to
-#' work on a list of LDA topic models (class \code{LDA_set}).
-#'
-#' @param x An \code{LDA_set} object of LDA topic models.
-#'
-#' @param ... Additional arguments to be passed to subfunctions.
-#' -#' @return \code{NULL}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) -#' plot(r_LDA) -#' } -#' -#' @export -#' -plot.LDA_set <- function(x, ...){ - on.exit(devAskNewPage(FALSE)) - if (length(x) > 1){ - devAskNewPage(TRUE) - } - y <- lapply(x, plot, ...) - y <- NULL -} #' @title Plot the results of an LDATS LDA model #' -#' @description Create an LDATS LDA summary plot, with a top panel showing -#' the topic proportions for each word and a bottom panel showing the topic -#' proportions of each document/over time. The plot function is defined for -#' class \code{LDA_VEM} specifically (see \code{\link[topicmodels]{LDA}}). -#' \cr \cr +#' @description +#' \code{plot.LDA} creates an LDATS LDA summary plot, with a top panel +#' showing the topic proportions for each word and a bottom panel showing +#' the topic proportions of each document/over time. \cr \cr +#' \code{plot.LDA_set} plots either the \code{selected} model results or +#' all of them from a \code{LDA_set} of \code{LDA} models. \cr \cr #' \code{LDA_plot_top_panel} creates an LDATS LDA summary plot -#' top panel showing the topic proportions word-by-word. \cr \cr +#' top panel showing the topic proportions word-by-word. \cr \cr #' \code{LDA_plot_bottom_panel} creates an LDATS LDA summary plot -#' bottom panel showing the topic proportions over time/documents. +#' bottom panel showing the topic proportions over time/documents. \cr \cr +#' \code{set_LDA_plot_colors} creates the set of colors to be used in +#' the LDA plots based on the variety of argument options. #' -#' @param x Object of class \code{LDA_VEM}. +#' @param x Object of class \code{LDA}. #' #' @param xtime Optional x values used to plot the topic proportions according #' to a specific time value (rather than simply the order of observations). @@ -71,28 +45,50 @@ plot.LDA_set <- function(x, ...){ #' #' @param ... 
Not used, retained for alignment with base function. #' -#' @return \code{NULL}. +#' @param selected \code{logical} indicator of if only the selected LDAs +#' (the first element in \code{x}) should be plotted or if all the LDAs +#' (the second element in \code{x}) should be plotted. #' -#' @examples -#' \donttest{ -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 4, nseeds = 10) -#' best_lda <- select_LDA(r_LDA)[[1]] -#' plot(best_lda, option = "cividis") -#' LDA_plot_top_panel(best_lda, option = "cividis") -#' LDA_plot_bottom_panel(best_lda, option = "cividis") -#' } +#' @return +#' \code{plot.LDA},\code{plot.LDA_set},\code{LDA_plot_top_panel}, +#' \code{LDA_plot_bottom_panel}: \code{NULL}. \cr \cr +#' \code{set_LDA_plot_colors}: \code{vector} of \code{character} hex codes +#' indicating colors to use. #' -#' @export +#' @name plot.LDA +#' + + + +#' @rdname plot.LDA #' -plot.LDA_VEM <- function(x, ..., xtime = NULL, xname = NULL, cols = NULL, +#' @export +#' +plot.LDA <- function(x, ..., xtime = NULL, xname = NULL, cols = NULL, option = "C", alpha = 0.8, LDATS = FALSE){ LDA_plot_top_panel(x, cols, option, alpha, TRUE, LDATS) LDA_plot_bottom_panel(x, xtime, xname, cols, option, alpha, TRUE, LDATS) } -#' @rdname plot.LDA_VEM + +#' @rdname plot.LDA +#' +#' @export +#' +plot.LDA_set <- function(x, ..., selected = TRUE){ + if(selected){ + x <- x[[1]] + } else{ + x <- x[[2]] + } + on.exit(devAskNewPage(FALSE)) + if (length(x) > 1){ + devAskNewPage(TRUE) + } + y <- lapply(x, plot, ...) 
+ y <- NULL +} +#' @rdname plot.LDA #' #' @export #' @@ -101,13 +97,17 @@ LDA_plot_top_panel <- function(x, cols = NULL, option = "C", alpha = 0.8, oldpar <- par(no.readonly = TRUE) on.exit(par(oldpar)) cols <- set_LDA_plot_colors(x, cols, option, alpha) - gamma <- x@gamma - beta <- exp(x@beta) + gamma <- x$document_topic_table + beta <- exp(x$params$beta) nobs <- nrow(gamma) ntopics <- ncol(gamma) nwords <- ncol(beta) beta_order <- apply(beta, 2, order) beta_sorted <- apply(beta, 2, sort) + if(length(dim(beta_sorted)) == 0){ + beta_order <- matrix(beta_order, nrow = 1) + beta_sorted <- matrix(beta_sorted, nrow = 1) + } counter <- 1 rect_mat <- matrix(NA, nrow = nwords * ntopics, ncol = 4) @@ -143,7 +143,7 @@ LDA_plot_top_panel <- function(x, cols = NULL, option = "C", alpha = 0.8, col = rect_col[i]) } axis(2, at = seq(0, max_y, 0.1), labels = FALSE, tck = -0.02) - mtext(side = 1, at = seq(1, nwords, 1), text = x@terms, tck = 0, + mtext(side = 1, at = seq(1, nwords, 1), text = x$terms, tck = 0, cex = 0.5, line = 0) if (LDATS){ @@ -165,7 +165,7 @@ LDA_plot_top_panel <- function(x, cols = NULL, option = "C", alpha = 0.8, } } -#' @rdname plot.LDA_VEM +#' @rdname plot.LDA #' #' @export #' @@ -175,7 +175,7 @@ LDA_plot_bottom_panel <- function(x, xtime = NULL, xname = NULL, cols = NULL, oldpar <- par(no.readonly = TRUE) on.exit(par(oldpar)) cols <- set_LDA_plot_colors(x, cols, option, alpha) - gamma <- x@gamma + gamma <- x$document_topic_table ntopics <- ncol(gamma) if (is.null(xtime)){ @@ -202,44 +202,13 @@ LDA_plot_bottom_panel <- function(x, xtime = NULL, xname = NULL, cols = NULL, } } -#' @title Prepare the colors to be used in the LDA plots -#' -#' @description Based on the inputs, create the set of colors to be used in -#' the LDA plots made by \code{\link{plot.LDA_TS}}. -#' -#' @param x Object of class \code{LDA}. -#' -#' @param cols Colors to be used to plot the topics. 
-#' Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -#' \code{\link[grDevices]{rgb}}) can be input as with a standard plot. -#' The default (\code{cols = NULL}) triggers use of -#' \code{\link[viridis]{viridis}} color options (see \code{option}). -#' -#' @param option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if `cols == NULL`. Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C", the default option), "viridis" (or "D") and "cividis" (or "E"). +#' @rdname plot.LDA #' -#' @param alpha Numeric value [0,1] that indicates the transparency of the -#' colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @return \code{vector} of \code{character} hex codes indicating colors to -#' use. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' lda_data <- rodents$document_term_table -#' r_LDA <- LDA_set(lda_data, topics = 4, nseeds = 10) -#' set_LDA_plot_colors(r_LDA[[1]]) -#' } -#' -#' @export +#' @export #' set_LDA_plot_colors <- function(x, cols = NULL, option = "C", alpha = 0.8){ - gamma <- x@gamma + gamma <- x$document_topic_table ntopics <- ncol(gamma) if (length(cols) == 0){ cols <- viridis(ntopics, option = option, alpha = alpha, end = 0.9) diff --git a/R/TS.R b/R/TS.R index f963788c..85d2b4fc 100644 --- a/R/TS.R +++ b/R/TS.R @@ -1,40 +1,65 @@ -#' @title Conduct a single multinomial Bayesian Time Series analysis -#' -#' @description This is the main interface function for the LDATS application -#' of Bayesian change point Time Series analyses (Christensen \emph{et al.} -#' 2018), which extends the model of Western and Kleykamp (2004; -#' see also Ruggieri 2013) to multinomial (proportional) response data using -#' softmax regression (Ripley 1996, Venables and Ripley 2002, Bishop 2006) -#' using a generalized linear modeling approach (McCullagh and Nelder 1989). 
-#' The models are fit using parallel tempering Markov Chain Monte Carlo -#' (ptMCMC) methods (Earl and Deem 2005) to locate change points and -#' neural networks (Ripley 1996, Venables and Ripley 2002, Bishop 2006) to -#' estimate regressors. \cr \cr -#' \code{check_TS_inputs} checks that the inputs to -#' \code{TS} are of proper classes for a full analysis. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. See \code{Examples}. -#' -#' @param formula \code{\link[stats]{formula}} defining the regression between -#' relationship the change points. Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param nchangepoints \code{integer} corresponding to the number of -#' change points to include in the model. 0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current + +#' @title Conduct a Bayesian compositional Time Series analysis +#' +#' @description Analyze compositional Time Series models using Bayesian +#' sampling methods. \cr \cr +#' \code{TS} is the main interface function for the LDATS application +#' of Bayesian change point Time Series analyses (Christensen +#' \emph{et al.} 2018). \cr \cr +#' \code{prepare_TS} pre-prepares the TS model objects for simpler +#' use within the subfunctions. 
\cr \cr
+#' \code{check_TS} ensures that the inputs are proper.
+#' See \code{\link{check_LDAs}},
+#' \code{\link{check_document_covariate_table}},
+#' \code{\link{check_formulas}}, \code{\link{check_nchangepoints}},
+#' \code{\link{check_timename}}, \code{\link{check_weights}},
+#' and \code{\link{check_control}} for specifics. \cr \cr
+#' \code{TS_control} defines and creates the control \code{list} for the TS
+#' model running. \cr \cr
+#' \code{run_TS} runs (via \code{\link{TS_call}}) all TS models
+#' as set up by \code{prepare_TS}. \cr \cr
+#' \code{TS_call} runs (via \code{\link{do.call}}) a single TS model
+#' as set up by \code{prepare_TS}. \cr \cr
+#' \code{TS_msg} produces a model-running message if desired. \cr \cr
+#' \code{measure_TS} determines the fit value used to select among the
+#' models. \cr \cr
+#' \code{select_TS} chooses the best model(s) of interest based on their
+#' measured values and the selector function. \cr \cr
+#' \code{package_TS} sets the class and names the elements of the results
+#' \code{list} from \code{\link{TS_call}} applied to the
+#' combination of TS models requested for the LDA model(s) input.
+#'
+#' @details For a (potentially subset) dataset consisting of proportions of
+#' topics across multiple documents in a corpus
+#' \enumerate{
+#' \item Conduct multiple compositional Bayesian TS models
+#' (e.g., changepoint softmax regression; Ripley 1996, Venables
+#' and Ripley 2002, Western and Kleykamp 2004, Bishop 2006, Ruggieri
+#' 2013) via a generalized linear modeling approach (McCullagh and
+#' Nelder 1989) and using parallel tempering Markov Chain Monte Carlo
+#' (ptMCMC) methods (Earl and Deem 2005),
+#' \item Select from the TS model results to pick those used to summarize
+#' the whole model, and
+#' \item Package the results.
+#' }
+#'
+#' @param formulas Vector of \code{\link[stats]{formula}}(s) defining the
+#' regression between the change points.
Any predictor variable included +#' must also be a column in \code{data} and any (compositional) response +#' variable must be a set of columns in \code{data}. \cr +#' Each element (formula) in the vector is evaluated for each number of +#' change points and each LDA model. +#' +#' @param nchangepoints \code{integer}-conformable vector corresponding to the +#' number of change points to include in the models. 0 is valid (corresponds +#' to no change points, so a singular time series model) and the current #' implementation can reasonably include up to 6 change points. The #' number of change points is used to dictate the segmentation of the #' time series into chunks fit with separate models dictated by -#' \code{formula}. +#' \code{formula}. \cr +#' Each element in the vector is the number of change points +#' used to segment the data for each formula (entry in \code{formulas}) +#' component of the TS model, for each selected LDA model. #' #' @param timename \code{character} element indicating the time variable #' used in the time series. Defaults to \code{"time"}. The variable must be @@ -44,60 +69,97 @@ #' #' @param weights Optional class \code{numeric} vector of weights for each #' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS +#' each document. When using \code{\link{TS_call}} in a standard LDATS #' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of +#' as the result of, e.g., \code{\link[topicmodels]{LDA}} is a matrix of #' proportions, which does not account for size differences among documents. #' For most models, a scaling of the weights (so that the average is 1) is #' most appropriate, and this is accomplished using \code{document_weights}. 
#' #' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by +#' Time Series model. Values not input assume defaults set by #' \code{\link{TS_control}}. +#' +#' @param LDAs Class \code{LDA_set} \code{list} of both selected and all +#' results from \code{\link{LDA}}. +#' +#' @param TS,TSs time series model \code{list} (\code{TS}) or a \code{list} +#' of many time series model \code{list}s (\code{TSs}). +#' +#' @param response \code{character} element indicating the response variable +#' used in the time series. \cr \cr +#' Must have a corresponding \code{_TS} function. +#' +#' @param response_args \code{list} of (named) arguments to be used in +#' \code{response} via \code{\link{do.call}}. +#' \cr \cr +#' Could be managed via a \code{_TS_control} function like +#' \code{\link{multinom_TS_control}}. +#' +#' @param summary_prob Probability used for summarizing the posterior +#' distributions (via the highest posterior density interval, see +#' \code{\link[coda]{HPDinterval}}). +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). 
#' -#' @return \code{TS}: \code{TS_fit}-class list containing the following -#' elements, many of -#' which are hidden for \code{print}ing, but are accessible: -#' \describe{ -#' \item{data}{\code{data} input to the function.} -#' \item{formula}{\code{\link[stats]{formula}} input to the function.} -#' \item{nchangepoints}{\code{nchangepoints} input to the function.} -#' \item{weights}{\code{weights} input to the function.} -#' \item{control}{\code{control} input to the function.} -#' \item{lls}{Iteration-by-iteration -#' \link[=logLik.multinom_TS_fit]{logLik} values for the -#' full time series fit by \code{\link{multinom_TS}}.} -#' \item{rhos}{Iteration-by-iteration change point estimates from -#' \code{\link{est_changepoints}}.} -#' \item{etas}{Iteration-by-iteration marginal regressor estimates from -#' \code{\link{est_regressors}}, which have been -#' unconditioned with respect to the change point locations.} -#' \item{ptMCMC_diagnostics}{ptMCMC diagnostics, -#' see \code{\link{diagnose_ptMCMC}}} -#' \item{rho_summary}{Summary table describing \code{rhos} (the change -#' point locations), -#' see \code{\link{summarize_rhos}}.} -#' \item{rho_vcov}{Variance-covariance matrix for the estimates of -#' \code{rhos} (the change point locations), see -#' \code{\link{measure_rho_vcov}}.} -#' \item{eta_summary}{Summary table describing \code{ets} (the -#' regressors), -#' see \code{\link{summarize_etas}}.} -#' \item{eta_vcov}{Variance-covariance matrix for the estimates of -#' \code{etas} (the regressors), see -#' \code{\link{measure_eta_vcov}}.} -#' \item{logLik}{Across-iteration average of log-likelihoods -#' (\code{lls}).} -#' \item{nparams}{Total number of parameters in the full model, -#' including the change point locations and regressors.} -#' \item{deviance}{Penalized negative log-likelihood, based on -#' \code{logLik} and \code{nparams}.} -#' } -#' \code{check_TS_inputs}: An error message is thrown if any input -#' is not proper, else \code{NULL}. 
-#'
-#' @references
+#' @param soften \code{logical} indicator of whether the model should error
+#' softly or if errors should trigger a full-stop to the pipeline.
+#'
+#' @param model Main Time Series \code{function}.
+#'
+#' @param model_args \code{list} of (named) arguments to be used in
+#' \code{model} via \code{\link{TS_call}}.
+#'
+#' @param method \code{function} used to drive the sampler of the TS
+#' models; \code{method} defines and operates the computational procedure.
+#' \cr \cr
+#' Current pre-built options include \code{\link{ldats_classic}}.
+#'
+#' @param method_args \code{list} of (named) arguments to be used in
+#' \code{method} via \code{\link{do.call}}.
+#' \cr \cr
+#' Could be managed via a \code{_control} function like
+#' \code{\link{ldats_classic_control}}.
+#'
+#' @param measurer \code{function} used in evaluation of the TS
+#' models; \code{measurer} creates a value for each model.
+#'
+#' @param measurer_args \code{list} of (named) arguments to be used in
+#' \code{measurer} via \code{\link{do.call}}.
+#'
+#' @param selector \code{function} used in evaluation of the TS
+#' models; \code{selector} operates on the values to choose the models.
+#'
+#' @param selector_args \code{list} of (named) arguments to be used in
+#' \code{selector} via \code{\link{do.call}}.
+#'
+#' @param ... Not passed along to the output, rather included to allow for
+#' automated removal of unneeded controls.
+#'
+#' @return
+#' \code{TS}, \code{package_TS}: class \code{TS_set} \code{list} of both
+#' selected and all results from \code{\link{TS_call}} applied for
+#' each model on each LDA model input as well as the control \code{list}
+#' used to fit the model. \cr \cr
+#' \code{prepare_TS}: \code{list} of \code{list}s, each of which is a
+#' preliminary model object for a Time Series model fit. \cr \cr
+#' \code{check_TS}: an error message is thrown if any input is improper,
+#' otherwise \code{NULL}.
+#' \code{TS_control}: \code{list} of named control elements for +#' model fitting. +#' \code{measure_TS}: \code{vector} of values corresponding to the model +#' evaluations. \cr \cr +#' \code{select_TS}: \code{list} of selected models' \code{list}s. \cr \cr +#' \code{run_TS}: \code{TS_set} \code{list} of model results from all +#' runs of a \code{} function, such as +#' \code{\link{topicmodels_LDA}}. \cr \cr +#' \code{TS_call}: \code{TS} \code{list} of model results from a single +#' run of a \code{} function, such as +#' \code{\link{sequential_TS}}. \cr \cr +#' \code{TS_msg}: a message is produced. +#' +#' @references #' Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. #' Springer, New York, NY, USA. #' @@ -130,873 +192,338 @@ #' \strong{12}:354-374. #' \href{https://doi.org/10.1093/pan/mph023}{link}. #' -#' @examples -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' \donttest{ -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' } -#' check_TS_inputs(data, timename = "newmoon") +#' @name TS +#' + + +#' @rdname TS #' #' @export #' -TS <- function(data, formula = gamma ~ 1, nchangepoints = 0, +TS <- function(LDAs, formulas = ~ 1, nchangepoints = 0, timename = "time", weights = NULL, control = list()){ - check_TS_inputs(data, formula, nchangepoints, timename, weights, control) - control <- do.call("TS_control", control) - set.seed(control$seed) - data <- data[order(data[,timename]), ] - rho_dist <- est_changepoints(data, formula, nchangepoints, timename, - weights, control) - eta_dist <- est_regressors(rho_dist, data, formula, timename, weights, - control) - package_TS(data, formula, timename, weights, control, rho_dist, eta_dist) + TSs <- 
prepare_TS(LDAs = LDAs, formulas = formulas, + nchangepoints = nchangepoints, timename = timename, + weights = weights, control = control) + TSs <- run_TS(TSs = TSs) + TSs <- package_TS(TSs = TSs) + TSs } #' @rdname TS -#' +#' #' @export #' -check_TS_inputs <- function(data, formula = gamma ~ 1, nchangepoints = 0, - timename = "time", weights = NULL, - control = list()){ - check_formula(data, formula) - check_nchangepoints(nchangepoints) - check_weights(weights) - check_timename(data, timename) - check_control(control) - return() +check_TS <- function(LDAs, formulas = ~ 1, nchangepoints = 0, + timename = "time", weights = NULL, control = list()){ + check_LDAs(LDAs = LDAs) + check_document_covariate_table(LDAs = LDAs) + check_formulas(LDAs = LDAs, formulas = formulas) + check_nchangepoints(nchangepoints = nchangepoints) + check_timename(LDAs = LDAs, timename = timename) + check_weights(weights = weights) + check_control(control = control) } -#' @title Summarize the Time Series model -#' -#' @description Calculate relevant summaries for the run of a Time Series -#' model within \code{\link{TS}} and package the output as a -#' \code{TS_fit}-class object. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. -#' -#' @param formula \code{\link[stats]{formula}} defining the regression between -#' relationship the change points. 
Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using \code{document_weights}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @param rho_dist List of saved data objects from the ptMCMC estimation of -#' change point locations returned by \code{\link{est_changepoints}} -#' (unless \code{nchangepoints} is 0, then \code{NULL}). -#' -#' @param eta_dist Matrix of draws (rows) from the marginal posteriors of the -#' coefficients across the segments (columns), as estimated by -#' \code{\link{est_regressors}}. 
-#' -#' @return \code{TS_fit}-class list containing the following elements, many of -#' which are hidden for \code{print}ing, but are accessible: -#' \describe{ -#' \item{data}{\code{data} input to the function.} -#' \item{formula}{\code{\link[stats]{formula}} input to the function.} -#' \item{nchangepoints}{\code{nchangepoints} input to the function.} -#' \item{weights}{\code{weights} input to the function.} -#' \item{timename}{\code{timename} input to the function.} -#' \item{control}{\code{control} input to the function.} -#' \item{lls}{Iteration-by-iteration -#' \link[=logLik.multinom_TS_fit]{logLik} values for the -#' full time series fit by \code{\link{multinom_TS}}.} -#' \item{rhos}{Iteration-by-iteration change point estimates from -#' \code{\link{est_changepoints}}.} -#' \item{etas}{Iteration-by-iteration marginal regressor estimates from -#' \code{\link{est_regressors}}, which have been -#' unconditioned with respect to the change point locations.} -#' \item{ptMCMC_diagnostics}{ptMCMC diagnostics, -#' see \code{\link{diagnose_ptMCMC}}} -#' \item{rho_summary}{Summary table describing \code{rhos} (the change -#' point locations), -#' see \code{\link{summarize_rhos}}.} -#' \item{rho_vcov}{Variance-covariance matrix for the estimates of -#' \code{rhos} (the change point locations), see -#' \code{\link{measure_rho_vcov}}.} -#' \item{eta_summary}{Summary table describing \code{ets} (the -#' regressors), -#' see \code{\link{summarize_etas}}.} -#' \item{eta_vcov}{Variance-covariance matrix for the estimates of -#' \code{etas} (the regressors), see -#' \code{\link{measure_eta_vcov}}.} -#' \item{logLik}{Across-iteration average of log-likelihoods -#' (\code{lls}).} -#' \item{nparams}{Total number of parameters in the full model, -#' including the change point locations and regressors.} -#' \item{AIC}{Penalized negative log-likelihood, based on -#' \code{logLik} and \code{nparams}.} -#' } -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- 
rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' formula <- gamma ~ 1 -#' nchangepoints <- 1 -#' control <- TS_control() -#' data <- data[order(data[,"newmoon"]), ] -#' rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", -#' weights, control) -#' eta_dist <- est_regressors(rho_dist, data, formula, "newmoon", weights, -#' control) -#' package_TS(data, formula, "newmoon", weights, control, rho_dist, -#' eta_dist) -#' } +#' @rdname TS #' #' @export #' -package_TS <- function(data, formula, timename, weights, control, rho_dist, - eta_dist){ - - check_formula(data, formula) - check_weights(weights) - check_control(control) - check_timename(data, timename) - control <- do.call("TS_control", control) - nchangepoints <- dim(rho_dist$cpts)[1] - if (is.null(nchangepoints)){ - nchangepoints <- 0 - mod <- multinom_TS(data, formula, changepoints = NULL, timename, weights, - control) - mod <- mod[[1]][[1]] - lls <- as.numeric(logLik(mod)) - rhos <- NULL - } else{ - lls <- rho_dist$lls[1, ] - rhos <- t(array(rho_dist$cpts[ , 1, ], dim = dim(rho_dist$cpts)[c(1, 3)])) - } - - ptMCMC_diagnostics <- diagnose_ptMCMC(rho_dist) - rho_summary <- summarize_rhos(rhos, control) - rho_vcov <- measure_rho_vcov(rhos) - eta_summary <- summarize_etas(eta_dist, control) - eta_vcov <- measure_eta_vcov(eta_dist) - - logLik <- mean(lls) - ncoefs <- ncol(eta_dist) - nparams <- nchangepoints + ncoefs - AIC <- -2 * logLik + 2 * nparams - - out <- list(data = data, formula = formula, nchangepoints = nchangepoints, - timename = timename, weights = weights, - control = control, lls = lls, rhos = rhos, - etas = eta_dist, ptMCMC_diagnostics = ptMCMC_diagnostics, - rho_summary = rho_summary, rho_vcov = rho_vcov, - eta_summary = eta_summary, eta_vcov = 
eta_vcov, - logLik = logLik, nparams = nparams, AIC = AIC) - class(out) <- c("TS_fit", "list") - to_hide <- c("data", "weights", "control", "lls", "rhos", "etas", - "rho_vcov", "eta_vcov") - if (nchangepoints == 0){ - to_hide <- c(to_hide, "ptMCMC_diagnostics", "rho_summary") +run_TS <- function(TSs){ + nTS <- length(TSs) + for (i in 1:nTS){ + TSs[[i]] <- TS_call(TS = TSs[[i]]) } - attr(out, "hidden") <- to_hide - out + TSs } -#' @title Print a Time Series model fit -#' -#' @description Convenience function to print only the most important -#' components of a \code{TS_fit}-class object fit by -#' \code{\link{TS}}. -#' -#' @param x Class \code{TS_fit} object to be printed. -#' -#' @param ... Not used, simply included to maintain method compatibility. -#' -#' @return The non-hidden parts of \code{x} as a \code{list}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' print(TSmod) -#' } + +#' @rdname TS #' #' @export #' -print.TS_fit <- function(x, ...){ - hid <- attr(x, "hidden") - notHid <- !names(x) %in% hid - print(x[notHid]) +TS_call <- function(TS){ + TS_msg(TS = TS) + fun <- TS$control$model + args <- update_list(TS$control$model_args, TS = TS) + soft_call(what = fun, args = args, soften = TS$control$soften) } -#' @title Summarize the regressor (eta) distributions -#' -#' @description \code{summarize_etas} calculates summary statistics for each -#' of the chunk-level regressors. -#' \cr \cr -#' \code{measure_ets_vcov} generates the variance-covariance matrix for -#' the regressors. 
-#' -#' @param etas Matrix of regressors (columns) across iterations of the -#' ptMCMC (rows), as returned from \code{\link{est_regressors}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{summarize_etas}: table of summary statistics for chunk-level -#' regressors including mean, median, mode, posterior interval, standard -#' deviation, MCMC error, autocorrelation, and effective sample size for -#' each regressor. \cr \cr -#' \code{measure_eta_vcov}: variance-covariance matrix for chunk-level -#' regressors. -#' -#' @examples -#' etas <- matrix(rnorm(100), 50, 2) -#' summarize_etas(etas) -#' measure_eta_vcov(etas) -#' -#' @export -#' -summarize_etas <- function(etas, control = list()){ - check_control(control) - control <- do.call("TS_control", control) - if (!is.matrix(etas)){ - stop("etas should be a matrix") - } - prob <- control$summary_prob - Mean <- round(apply(etas, 2, mean), 4) - Median <- round(apply(etas, 2, median), 4) - SD <- round(apply(etas, 2, sd), 4) - MCMCerr <- round(SD / sqrt(nrow(etas)), 4) - HPD <- HPDinterval(as.mcmc(etas), prob = prob) - Lower <- round(HPD[ , "lower"], 4) - Upper <- round(HPD[ , "upper"], 4) - AC10 <- tryCatch(t(round(autocorr.diag(as.mcmc(etas), lag = 10), 4)), - error = function(x){"-"}) - ESS <- effectiveSize(etas) - out <- data.frame(Mean, Median, Lower, Upper, SD, MCMCerr, AC10, ESS) - colnames(out)[3:4] <- paste0(c("Lower_", "Upper_"), paste0(prob*100, "%")) - colnames(out)[7] <- "AC10" - rownames(out) <- colnames(etas) - out -} -#' @rdname summarize_etas + +#' @rdname TS #' #' @export #' -measure_eta_vcov <- function(etas){ - if (!is.matrix(etas)){ - stop("expecting etas to be a matrix") - } - out <- var(etas) - colnames(out) <- colnames(etas) - rownames(out) <- colnames(etas) - out -} +TS_msg 
<- function(TS){ + subset_msg <- paste0(" - data subset ", TS$data_subset) + topic_msg <- paste0(", ", TS$topics, " topics") + rep_msg <- paste0(", replicate ", TS$rep) -#' @title Summarize the rho distributions -#' -#' @description \code{summarize_rho} calculates summary statistics for each -#' of the change point locations. -#' \cr \cr -#' \code{measure_rho_vcov} generates the variance-covariance matrix for the -#' change point locations. -#' -#' @param rhos Matrix of change point locations (columns) across iterations of -#' the ptMCMC (rows) or \code{NULL} if no change points are in the model, -#' as returned from \code{\link{est_changepoints}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{summarize_rhos}: table of summary statistics for change point -#' locations including mean, median, mode, posterior interval, standard -#' deviation, MCMC error, autocorrelation, and effective sample size for -#' each change point location. \cr \cr -#' \code{measure_rho_vcov}: variance-covariance matrix for change -#' point locations. 
-#' -#' @examples -#' rhos <- matrix(sample(80:100, 100, TRUE), 50, 2) -#' summarize_rhos(rhos) -#' measure_rho_vcov(rhos) -#' -#' @export -#' -summarize_rhos <- function(rhos, control = list()){ - check_control(control) - control <- do.call("TS_control", control) - if (is.null(rhos)) { - return() - } - prob <- control$summary_prob - Mean <- round(apply(rhos, 2, mean), 2) - Median <- apply(rhos, 2, median) - Mode <- apply(rhos, 2, modalvalue) - SD <- round(apply(rhos, 2, sd), 2) - MCMCerr <- round(SD / sqrt(nrow(rhos)), 4) - HPD <- HPDinterval(as.mcmc(rhos), prob = prob) - Lower <- HPD[ , "lower"] - Upper <- HPD[ , "upper"] - AC10 <- t(round(autocorr.diag(as.mcmc(rhos), lag = 10), 4)) - ESS <- effectiveSize(rhos) - out <- data.frame(Mean, Median, Mode, Lower, Upper, SD, MCMCerr, AC10, ESS) - colnames(out)[4:5] <- paste0(c("Lower_", "Upper_"), paste0(prob*100, "%")) - colnames(out)[8] <- "AC10" - rownames(out) <- sprintf("Changepoint_%d", seq_len(nrow(out))) - out + formula_msg <- paste0(", ", deparse(TS$formula)) + nchangepoints <- TS$nchangepoints + txt <- ifelse(nchangepoints == 1, " change point", " change points") + changepoints_msg <- paste0(", ", nchangepoints, txt) + msg <- paste0(subset_msg, topic_msg, rep_msg, formula_msg, changepoints_msg) + messageq(msg, TS$control$quiet) } -#' @rdname summarize_rhos -#' -#' @export -#' -measure_rho_vcov <- function(rhos){ - if (is.null(rhos)) { - return() - } - if (!is.matrix(rhos)){ - stop("expecting rhos to be a matrix") - } - out <- var(rhos) - colnames(out) <- sprintf("CP_%d", 1:dim(out)[1]) - rownames(out) <- sprintf("CP_%d", 1:dim(out)[2]) - out -} -#' @title Estimate the distribution of regressors, unconditional on the -#' change point locations -#' -#' @description This function uses the marginal posterior distributions of -#' the change point locations (estimated by \code{\link{est_changepoints}}) -#' in combination with the conditional (on the change point locations) -#' posterior distributions of the 
regressors (estimated by -#' \code{\link{multinom_TS}}) to estimate the marginal posterior -#' distribution of the regressors, unconditional on the change point -#' locations. -#' -#' @details The general approach follows that of Western and Kleykamp -#' (2004), although we note some important differences. Our regression -#' models are fit independently for each chunk (segment of time), and -#' therefore the variance-covariance matrix for the full model -#' has \code{0} entries for covariances between regressors in different -#' chunks of the time series. Further, because the regression model here -#' is a standard (non-hierarchical) softmax (Ripley 1996, Venables and -#' Ripley 2002, Bishop 2006), there is no error term in the regression -#' (as there is in the normal model used by Western and Kleykamp 2004), -#' and so the posterior distribution used here is a multivariate normal, -#' as opposed to a multivariate t, as used by Western and Kleykamp (2004). -#' -#' @param rho_dist List of saved data objects from the ptMCMC estimation of -#' change point locations (unless \code{nchangepoints} is 0, then -#' \code{NULL}) returned from \code{\link{est_changepoints}}. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. -#' -#' @param formula \code{\link[stats]{formula}} defining the regression between -#' relationship the change points. 
Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using \code{document_weights}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{matrix} of draws (rows) from the marginal posteriors of the -#' coefficients across the segments (columns). -#' -#' @references -#' Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. -#' Springer, New York, NY, USA. -#' -#' Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. -#' Cambridge University Press, Cambridge, UK. -#' -#' Venables, W. N. and B. D. Ripley. 2002. \emph{Modern and Applied -#' Statistics with S}. Fourth Edition. Springer, New York, NY, USA. -#' -#' Western, B. and M. Kleykamp. 2004. A Bayesian change point model for -#' historical time series analysis. \emph{Political Analysis} -#' \strong{12}:354-374. -#' \href{https://doi.org/10.1093/pan/mph023}{link}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' formula <- gamma ~ 1 -#' nchangepoints <- 1 -#' control <- TS_control() -#' data <- data[order(data[,"newmoon"]), ] -#' rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", -#' weights, control) -#' eta_dist <- est_regressors(rho_dist, data, formula, "newmoon", weights, -#' control) -#' } +#' @rdname TS #' #' @export #' -est_regressors <- function(rho_dist, data, formula, timename, weights, +prepare_TS <- function(LDAs, formulas = ~ 1, nchangepoints = 0, + timename = "time", weights = NULL, control = list()){ - check_formula(data, formula) - check_weights(weights) - check_control(control) + check_TS(LDAs = LDAs, formulas = formulas, nchangepoints = nchangepoints , + timename = timename, weights = weights, control = control) control <- do.call("TS_control", control) - if (!is.null(rho_dist)){ - if (any(names(rho_dist)[1:3] != c("cpts", "lls", "ids"))){ - stop("expecting rho_dist to have elements cpts, lls, ids") + messageq("----- Time Series Analyses -----", control$quiet) + if (!is(formulas, "list")) { + if (is(formulas, "formula")) { + formulas <- c(formulas) + } else{ + stop("formulas does not contain formula(s)") } - } - if (is.null(rho_dist)){ - mod <- multinom_TS(data, formula, changepoints = NULL, timename, weights, - control) - mod <- mod[[1]][[1]] - mv <- as.vector(t(coef(mod))) - vcv <- mirror_vcov(mod) - eta <- rmvnorm(control$nit, mv, vcv) - seg_names <- rep(1, ncol(vcv)) - coef_names <- colnames(vcv) - colnames(eta) <- paste(seg_names, coef_names, sep = "_") - return(eta) + } else if (!all(vapply(formulas, is, TRUE, "formula"))) { + stop("formulas does not contain all 
formula(s)") } - focal_rho <- rho_dist$cpts[ , 1, ] - nchangepts <- dim(rho_dist$cpts)[1] - if (nchangepts == 1){ - collapsedrho <- focal_rho - } else{ - collapsedrho <- apply(focal_rho, 2, paste, collapse = "_") + formulas2 <- formulas + for (i in seq_along(formulas)) { + tformula <- paste(as.character(formulas[[i]]), collapse = "") + formulas2[[i]] <- as.formula(paste("gamma", tformula)) } - freq_r <- table(collapsedrho) - unique_r <- names(freq_r) - nr <- length(unique_r) - n_topic <- ncol(data$gamma) - n_covar <- length(attr(terms(formula), "term.labels")) - n_eta_segment <- (n_topic - 1) * (n_covar + 1) - n_changept <- dim(rho_dist$cpts)[1] - n_segment <- n_changept + 1 - n_eta <- n_eta_segment * n_segment - eta <- matrix(NA, nrow = control$nit, ncol = n_eta) - pbar <- prep_pbar(control, "eta", nr) - - for(i in 1:nr){ - update_pbar(pbar, control) - cpts <- as.numeric(strsplit(unique_r[i], "_")[[1]]) - mods <- multinom_TS(data, formula, cpts, timename, weights, control) - ndraws <- freq_r[i] - colindex1 <- 1 - for(j in 1:n_segment){ - colindex2 <- colindex1 + n_eta_segment - 1 - seg_mod <- mods[[1]][[j]] - mv <- as.vector(t(coef(seg_mod))) - vcv <- mirror_vcov(seg_mod) - drawn <- rmvnorm(ndraws, mv, vcv) - rows_in <- which(collapsedrho == unique_r[i]) - cols_in <- colindex1:colindex2 - eta[rows_in, cols_in] <- drawn - colindex1 <- colindex2 + 1 - } + formulas <- formulas2 + nmods <- length(LDAs[[1]]) + mods <- 1:nmods + tab <- expand.grid(LDA = mods, formulas = formulas, + nchangepoints = nchangepoints, stringsAsFactors = FALSE) + + nTSs <- NROW(tab) + TSs <- vector("list", length = nTSs) + for(i in 1:nTSs){ + lda <- LDAs[[1]][[tab$LDA[i]]] + + ts_data <- lda$data + ts_data$train$ts_data <- lda$data$train$document_covariate_table + ts_data$train$ts_data$gamma <- lda$document_topic_table + + ts_data$test$ts_data <- lda$data$test$document_covariate_table + ts_data$test$ts_data$gamma <- lda$test_document_topic_table + + weights <- iftrue(weights, + 
document_weights(lda$data$train$document_term_table)) + + TSs[[i]] <- list(data = ts_data, + data_subset = lda[["data_subset"]], + formula = tab$formula[[i]], + nchangepoints = tab$nchangepoints[i], + weights = weights, + timename = timename, + control = control, + topics = lda$topics, replicate = lda$replicate) } - seg_names <- rep(1:n_segment, each = n_eta_segment) - coef_names <- rep(colnames(vcv), n_segment) - colnames(eta) <- paste(seg_names, coef_names, sep = "_") - eta + cp_text <- ifelse(tab[, 3] == 1 , "change point", "change points") + name_tab <- data.frame(paste("LDA", tab[ , 1]), + paste(",", tab[ , 2]), + paste(",", tab[ , 3], cp_text)) + names(TSs) <- apply(name_tab, 1, paste0, collapse = "") + TSs + } -#' @title Use ptMCMC to estimate the distribution of change point locations -#' -#' @description This function executes ptMCMC-based estimation of the -#' change point location distributions for multinomial Time Series analyses. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. -#' -#' @param formula \code{\link[stats]{formula}} defining the regression between -#' relationship the change points. Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param nchangepoints \code{integer} corresponding to the number of -#' change points to include in the model. 
0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current -#' implementation can reasonably include up to 6 change points. The -#' number of change points is used to dictate the segmentation of the -#' time series into chunks fit with separate models dictated by -#' \code{formula}. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using \code{document_weights}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return List of saved data objects from the ptMCMC estimation of -#' change point locations (unless \code{nchangepoints} is 0, then -#' \code{NULL} is returned). 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' formula <- gamma ~ 1 -#' nchangepoints <- 1 -#' control <- TS_control() -#' data <- data[order(data[,"newmoon"]), ] -#' rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", -#' weights, control) -#' } + +#' @rdname TS #' #' @export #' -est_changepoints <- function(data, formula, nchangepoints, timename, weights, - control = list()){ - check_TS_inputs(data, formula, nchangepoints, timename, weights, control) - control <- do.call("TS_control", control) - if (nchangepoints == 0){ - return(NULL) - } - saves <- prep_saves(nchangepoints, control) - inputs <- prep_ptMCMC_inputs(data, formula, nchangepoints, timename, - weights, control) - cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) - ids <- prep_ids(control) - pbar <- prep_pbar(control, "rho") - - for(i in 1:control$nit){ - update_pbar(pbar, control) - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } - process_saves(saves, control) +package_TS <- function(TSs){ + selected_TSs <- select_TS(TSs = TSs) + out <- list(selected_TSs = selected_TSs, TSs = TSs) + class(out) <- c("TS_set", "list") + out } -#' @title Initialize and tick through the progress bar -#' -#' @description \code{prep_pbar} creates and \code{update_pbar} steps -#' through the progress bars (if desired) in \code{\link{TS}} -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. 
Values not input assume defaults set by -#' \code{\link{TS_control}}. Of use here is \code{quiet} which is a -#' a \code{logical} indicator of whether there should be information -#' (i.e. the progress bar) printed during the run or not. Default is -#' \code{TRUE}. -#' -#' @param bar_type "rho" (for change point locations) or "eta" (for -#' regressors). -#' -#' @param nr \code{integer} number of unique realizations, needed when -#' \code{bar_type} = "eta". -#' -#' @param pbar The progress bar object returned from \code{prep_pbar}. -#' -#' @return \code{prep_pbar}: the initialized progress bar object. \cr \cr -#' \code{update_pbar}: the ticked-forward \code{pbar}. -#' -#' @examples -#' pb <- prep_pbar(control = list(nit = 2)); pb -#' pb <- update_pbar(pb); pb -#' pb <- update_pbar(pb); pb +#' @rdname TS #' #' @export #' -prep_pbar <- function(control = list(), bar_type = "rho", - nr = NULL){ - check_control(control) - control <- do.call("TS_control", control) - if (!(bar_type %in% c("eta", "rho"))){ - stop("bar_type must be eta or rho") - } - if (!is.null(nr)){ - if (!is.numeric(nr) || any(nr %% 1 != 0)){ - stop("nr must be integer-valued") - } - } - form <- " [:bar] :percent eta: :eta" - if (bar_type == "rho"){ - msg <- " Estimating changepoint distribution" - out <- progress_bar$new(form, control$nit, width = 60) - } - if (bar_type == "eta"){ - msg <- " Estimating regressor distribution" - out <- progress_bar$new(form, nr, width = 60) - } - messageq(msg, control$quiet) - out +select_TS <- function(TSs){ + +# use softcall! 
+ + vals <- measure_TS(TSs = TSs) + fun <- TSs[[1]]$control$selector + args <- update_list(TSs[[1]]$control$selector_args, x = vals) + args[names(args) == ""] <- NULL + selection <- do.call(what = fun, args = args) + TSs[selection] } -#' @rdname prep_pbar +#' @rdname TS #' #' @export #' -update_pbar <- function(pbar, control = list()){ - if (!("progress_bar" %in% class(pbar))){ - stop("pbar must be of class progress_bar") - } - check_control(control) - control <- do.call("TS_control", control) - if (control$quiet){ - return() +measure_TS <- function(TSs){ + +# use softcall! + + nTSs <- length(TSs) + vals <- rep(NA, nTSs) + for(i in 1:nTSs){ + fun <- TSs[[i]]$control$measurer + args <- TSs[[i]]$control$measurer_args + args <- update_list(args, object = TSs[[i]]) + args[names(args) == ""] <- NULL + vals_i <- do.call(what = fun, args = args) + if(length(vals_i) != 0){ + vals[i] <- vals_i + } } - pbar$tick() + vals } -#' @title Check that a formula is proper -#' -#' @description Check that \code{formula} is actually a -#' \code{\link[stats]{formula}} and that the -#' response and predictor variables are all included in \code{data}. -#' -#' @param formula \code{formula} to evaluate. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. -#' -#' @return An error message is thrown if \code{formula} is not proper, -#' else \code{NULL}. 
-#' -#' @examples -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' check_formula(data, gamma ~ 1) +#' @rdname TS #' #' @export #' -check_formula <- function(data, formula){ - - if (!is(formula, "formula")){ - stop("formula does not contain a single formula") - } +TS_control <- function(model = sequential_TS, + model_args = list(control = sequential_TS_control()), + response = multinom_TS, + response_args = list(control = multinom_TS_control()), + method = ldats_classic, + method_args = list(control = ldats_classic_control()), + summary_prob = 0.95, + measurer = AIC, + measurer_args = list(NULL), + selector = which.min, + selector_args = list(NULL), + soften = TRUE, + quiet = FALSE, ...){ + list(model = model, model_args = model_args, + response = response, response_args = response_args, + method = method, method_args = method_args, + measurer = measurer, measurer_args = measurer_args, + selector = selector, selector_args = selector_args, + summary_prob = summary_prob, soften = soften, quiet = quiet) +} - respLoc <- attr(terms(formula), "response") - if (respLoc == 0){ - stop("formula inputs should include response variable") - } - resp <- as.character(attr(terms(formula), "variables"))[-1][respLoc] - pred <- attr(terms(formula), "term.labels") - if (!resp %in% colnames(data)){ - stop("formula includes response not present in data") - } - if (!all(pred %in% colnames(data))){ - misses <- pred[which(pred %in% colnames(data) == FALSE)] - mis <- paste(misses, collapse = ", ") - stop(paste0("formula includes predictors not present in data: ", mis)) - } - return() -} -#' @title Create the controls list for the Time Series model -#' -#' @description This function provides a simple creation and definition of a -#' list used to control the time 
series model fit occurring within -#' \code{\link{TS}}. +#' @title Print a Time Series model #' -#' @param memoise \code{logical} indicator of whether the multinomial -#' functions should be memoised (via \code{\link[memoise]{memoise}}). -#' Memoisation happens to both \code{\link{multinom_TS}} and -#' \code{\link{multinom_TS_chunk}}. +#' @description Convenience function to print only the most important +#' components of a \code{TS}-class object fit by +#' \code{\link{sequential_TS}}. #' -#' @param response \code{character} element indicating the response variable -#' used in the time series. +#' @param x Class \code{TS} object to be printed. #' -#' @param lambda \code{numeric} "weight" decay term used to set the prior -#' on the regressors within each chunk-level model. Defaults to 0, -#' corresponding to a fully vague prior. +#' @param ... Not used, simply included to maintain method compatibility. #' -#' @param measurer,selector Function names for use in evaluation of the TS -#' models. \code{measurer} is used to create a value for each model -#' and \code{selector} operates on the values to choose the model. +#' @return The non-hidden parts of \code{x} are printed and returned +#' invisibly as a \code{list}. #' -#' @param ntemps \code{integer} number of temperatures (chains) to use in the -#' ptMCMC algorithm. +#' @export #' -#' @param penultimate_temp Penultimate temperature in the ptMCMC sequence. +print.TS <- function(x, ...){ + hid <- attr(x, "hidden") + notHid <- !(names(x) %in% hid) + print(x[notHid]) +} + + +#' @title Determine the log likelihood of a Time Series model #' -#' @param ultimate_temp Ultimate temperature in the ptMCMC sequence. +#' @description Convenience function to extract and format the log likelihood +#' of a \code{TS}-class object fit by \code{\link{sequential_TS}}. #' -#' @param q Exponent controlling the ptMCMC temperature sequence from the -#' focal chain (reference with temperature = 1) to the penultimate chain. 
0
-#' (default) implies a geometric sequence. 1 implies squaring before
-#' exponentiating.
+#' @param object Class \code{TS} object to be evaluated.
 #'
-#' @param nit \code{integer} number of iterations (steps) used in the ptMCMC
-#' algorithm.
+#' @param ... Not used, simply included to maintain method compatibility.
 #'
-#' @param magnitude Average magnitude (defining a geometric distribution)
-#' for the proposed step size in the ptMCMC algorithm.
+#' @return Log likelihood of the model \code{logLik}, also with \code{df}
+#' (degrees of freedom) and \code{nobs} (number of observations) values.
 #'
-#' @param quiet \code{logical} indicator of whether the model should run
-#' quietly (if \code{FALSE}, a progress bar and notifications are printed).
+#' @export
 #'
-#' @param burnin \code{integer} number of iterations to remove from the
-#' beginning of the ptMCMC algorithm.
+logLik.TS <- function(object, ...){
+  val <- object$logLik
+  attr(val, "df") <- object$nparams
+  attr(val, "nobs") <- nrow(object$data)
+  class(val) <- "logLik"
+  val
+}
+
+#' @title Prepare the time chunk table for a change point Time Series model
 #'
-#' @param thin_frac Fraction of iterations to retain, must be \eqn{(0, 1]},
-#' and the default value of 1 represents no thinning.
+#' @description Creates the table containing the start and end times for each
+#' chunk within a time series, based on the change points (used to break up
+#' the time series) and the range of the time series. \cr \cr
+#' If there are no change points (i.e. \code{changepoints = NULL}), there is
+#' still a single chunk defined by the start and end of the time series.
 #'
-#' @param summary_prob Probability used for summarizing the posterior
-#' distributions (via the highest posterior density interval, see
-#' \code{\link[coda]{HPDinterval}}). 
+#' @param data Class \code{data.frame} object including the predictor and +#' response variables, but specifically here containing the column indicated +#' by the \code{timename} input. #' -#' @param seed Input to \code{set.seed} for replication purposes. +#' @param changepoints Numeric vector indicating locations of the change +#' points. Must be conformable to \code{integer} values. #' -#' @return \code{list}, with named elements corresponding to the arguments. +#' @param timename \code{character} element indicating the time variable +#' used in the time series. Defaults to \code{"time"}. The variable must be +#' integer-conformable or a \code{Date}. If the variable named +#' is a \code{Date}, the input is converted to an integer, resulting in the +#' timestep being 1 day, which is often not desired behavior. #' -#' @examples -#' TS_control() +#' @return \code{data.frame} of \code{start} and \code{end} times (columns) +#' for each chunk (rows). #' -#' @export +#' @export #' -TS_control <- function(memoise = TRUE, response = "gamma", lambda = 0, - measurer = AIC, selector = min, ntemps = 6, - penultimate_temp = 2^6, ultimate_temp = 1e10, q = 0, - nit = 1e4, magnitude = 12, quiet = FALSE, burnin = 0, - thin_frac = 1, summary_prob = 0.95, seed = NULL){ - list(memoise = memoise, response = response, lambda = lambda, - measurer = measurer, selector = selector, ntemps = ntemps, - penultimate_temp = penultimate_temp, ultimate_temp = ultimate_temp, - q = q, nit = nit, magnitude = magnitude, quiet = quiet, - burnin = burnin, thin_frac = thin_frac, summary_prob = summary_prob, - seed = seed) - +prep_chunks <- function(data, changepoints = NULL, timename = "time"){ + start <- c(min(data[ , timename]), changepoints + 1) + end <- c(changepoints, max(data[ , timename])) + data.frame(start, end) } -#' @title Determine the log likelihood of a Time Series model + +#' @title Verify the change points of a time series model #' -#' @description Convenience function to extract and 
format the log likelihood -#' of a \code{TS_fit}-class object fit by \code{\link{multinom_TS}}. +#' @description Verify that a time series can be broken into a set +#' of chunks based on input change points. #' -#' @param object Class \code{TS_fit} object to be evaluated. +#' @param data Class \code{data.frame} object including the predictor and +#' response variables. #' -#' @param ... Not used, simply included to maintain method compatibility. +#' @param changepoints Numeric vector indicating locations of the change +#' points. Must be conformable to \code{integer} values. #' -#' @return Log likelihood of the model \code{logLik}, also with \code{df} -#' (degrees of freedom) and \code{nobs} (number of observations) values. +#' @param timename \code{character} element indicating the time variable +#' used in the time series. Defaults to \code{"time"}. The variable must be +#' integer-conformable or a \code{Date}. If the variable named +#' is a \code{Date}, the input is converted to an integer, resulting in the +#' timestep being 1 day, which is often not desired behavior. #' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' logLik(TSmod) -#' } +#' @return Logical indicator of the check passing \code{TRUE} or failing +#' \code{FALSE}. 
#' -#' @export +#' @export #' -logLik.TS_fit <- function(object, ...){ - val <- object$logLik - attr(val, "df") <- object$nparams - attr(val, "nobs") <- nrow(object$data) - class(val) <- "logLik" - val -} \ No newline at end of file +verify_changepoint_locations <- function(data, changepoints = NULL, + timename = "time"){ + if (is.null(changepoints)){ + return(TRUE) + } + first_time <- min(data[ , timename]) + last_time <- max(data[ , timename]) + time_check <- any(changepoints <= first_time | changepoints >= last_time) + sort_check <- is.unsorted(changepoints, strictly = TRUE) + !(time_check | sort_check) +} + + + diff --git a/R/TS_methods.R b/R/TS_methods.R new file mode 100644 index 00000000..61683c4a --- /dev/null +++ b/R/TS_methods.R @@ -0,0 +1,625 @@ +#' @title Estimate changepoints using the LDATS classic ptMCMC method +#' +#' @description Uses the LDATS classic parallel tempering Markov Chain Monte +#' Carlo (ptMCMC) methods (Earl and Deem 2005) to fit a changepoint +#' model, following Christensen \emph{et al.} (2018). \cr \cr +#' \code{ldats_classic} is the top-level function for the sampler. \cr \cr +#' \code{ldats_classic_control} defines and creates a control \code{list} +#' for use with \code{\link{ldats_classic}}. +#' \code{prep_ptMCMC_inputs} packages the static inputs (controls and +#' data structures) used by the ptMCMC algorithm in the context of +#' estimating change points. +#' \code{prep_proposal_dist} prep-calculates the proposal distribution +#' for the ptMCMC algorithm in order to decrease computation time. +#' The proposal distribution is a joint of three distributions: +#' [1] a multinomial distribution selecting among the change points within +#' the chain, [2] a binomial distribution selecting the direction of the +#' step of the change point (earlier or later in the time series), and +#' [3] a geometric distribution selecting the magnitude of the step. 
+#' \code{prep_ids} creates the active vector of identities (ids) for each
+#' of the chains in the ptMCMC algorithm, which are used to track trips
+#' of the particles among chains. \cr \cr
+#' \code{update_ids} updates the active vector of identities (ids) for
+#' each of the chains in the ptMCMC algorithm after each iteration.
+#' \cr \cr
+#' \code{prep_cpts} initializes each chain using a draw from the available
+#' times (i.e. assuming a uniform prior), the best fit (by likelihood)
+#' draw is put in the focal chain with each subsequently worse fit placed
+#' into the subsequently hotter chain. \cr \cr
+#' \code{update_cpts} updates the change points after every iteration in
+#' the ptMCMC algorithm. \cr \cr
+#' \code{prep_saves} creates the data structure used to save the
+#' output from each iteration of the ptMCMC algorithm. \cr \cr
+#' \code{update_saves} adds to the data structure after each iteration.
+#' \cr \cr
+#' \code{process_saves} processes (burn-in iterations are dropped and the
+#' remaining iterations are thinned) the saved data objects after the
+#' ptMCMC is complete. \cr \cr
+#' \code{prep_temp_sequence} creates the series of temperatures used in the
+#' ptMCMC algorithm. \cr \cr
+#' \code{step_chains} steps the chains forward one iteration
+#' of the within-chain component of the ptMCMC algorithm. \cr \cr
+#' \code{propose_step} makes the proposal for the next step. \cr \cr
+#' \code{eval_step} evaluates the proposal. \cr \cr
+#' \code{take_step} updates the configuration. \cr \cr
+#' \code{proposed_step_mods} proposes the models for all chains in a given
+#' step of the sampler. \cr \cr
+#' \code{swap_chains} handles the among-chain swapping based on
+#' temperatures and likelihood differentials. \cr \cr
+#' \code{count_trips} counts the full trips (from one extreme temperature
+#' chain to the other and back again; Katzgraber \emph{et al.} 2006) for
+#' each of the ptMCMC particles, as identified by their id on
+#' initialization. 
+#'
+#' @details For each iteration of the ptMCMC algorithm, all of the chains
+#' have the potential to take a step. The possible step is proposed under
+#' a proposal distribution (here for change points we use a symmetric
+#' geometric distribution), the proposed step is then evaluated and either
+#' accepted or not (following the Metropolis-Hastings rule; Metropolis,
+#' \emph{et al.} 1953, Hastings 1970, Gupta \emph{et al.} 2018), and then
+#' accordingly taken or not (the configurations are updated). \cr \cr
+#' The ptMCMC algorithm couples the chains (which are
+#' taking their own walks on the distribution surface) through "swaps",
+#' where neighboring chains exchange configurations (Geyer 1991, Falcioni
+#' and Deem 1999) following the Metropolis criterion (Metropolis
+#' \emph{et al.} 1953). This allows them to share information and search the
+#' surface in combination (Earl and Deem 2005).
+#'
+#' @param TS \code{list} time series model object.
+#'
+#' @param control A \code{list} of parameters to control the fitting of the
+#' time series model via the LDATS classic ptMCMC method. Values not input
+#' assume defaults set by \code{\link{ldats_classic_control}}.
+#'
+#' @param ids \code{vector} (for \code{update_ids}, \code{swap_chains})
+#' of the existing chain ids or \code{matrix} (rows: chains,
+#' columns: iterations; for \code{count_trips})
+#' of identifiers of the particles in each chain for
+#' each iteration of the ptMCMC algorithm.
+#'
+#' @param prop_changepts \code{matrix} of proposed change points across
+#' chains.
+#'
+#' @param inputs \code{list} of static inputs for use within the ptMCMC
+#' algorithm.
+#'
+#' @param i \code{integer} iteration index.
+#'
+#' @param cpts \code{matrix} of change point locations across chains.
+#'
+#' @param saves The existing list of saved data objects.
+#'
+#' @param steps Chain configuration after within-temperature steps.
+#'
+#' @param swaps Chain configuration after among-temperature swaps. 
+#' +#' @param prop_step Proposed step output from \code{propose_step}. +#' +#' @param accept_step \code{logical} indicator of acceptance of each chain's +#' proposed step. +#' +#' @param chainsin Chain configuration to be evaluated for swapping. +#' +#' @param ntemps \code{integer} number of temperatures (chains) to use in the +#' ptMCMC algorithm. +#' +#' @param penultimate_temp Penultimate temperature in the ptMCMC sequence. +#' +#' @param ultimate_temp Ultimate temperature in the ptMCMC sequence. +#' +#' @param q Exponent controlling the ptMCMC temperature sequence from the +#' focal chain (reference with temperature = 1) to the penultimate chain. 0 +#' (default) implies a geometric sequence. 1 implies squaring before +#' exponentiating. +#' +#' @param nit \code{integer} number of iterations (steps) used in the ptMCMC +#' algorithm. +#' +#' @param magnitude Average magnitude (defining a geometric distribution) +#' for the proposed step size in the ptMCMC algorithm. +#' +#' @param burnin \code{integer} number of iterations to remove from the +#' beginning of the ptMCMC algorithm. +#' +#' @param thin_frac Fraction of iterations to retain, must be \eqn{(0, 1]}, +#' and the default value of 1 represents no thinning. +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). +#' +#' @param memoise \code{logical} indicator of whether the response +#' function should be memoised (via \code{\link[memoise]{memoise}}). +#' +#' @return +#' \code{ldats_classic}: \code{list} of changepoint locations, log +#' likelihoods, and model diagnostics. \cr \cr +#' \code{ldats_classic_control}: \code{list} of named control elements +#' for use in \code{\link{ldats_classic}}. \cr \cr +#' \code{prep_ptMCMC_inputs}: \code{list} containing the static +#' inputs for use within the ptMCMC algorithm for estimating change +#' points. 
\cr \cr +#' \code{prep_temp_sequence}: \code{vector} of temperatures. +#' \code{prep_proposal_dist}: \code{list} of two \code{matrix} elements: +#' [1] the size of the proposed step for each iteration of each chain and +#' [2] the identity of the change point location to be shifted by the +#' step for each iteration of each chain. +#' \code{prep_ids},\code{update_ids}: \code{vector} of ids. \cr \cr +#' \code{prep_cpts},\code{update_cpts}: \code{list} of [1] \code{matrix} +#' of change points (rows) for each temperature (columns) and [2] +#' \code{vector} of log-likelihood values for each of the chains. \cr \cr +#' \code{prep_saves},\code{update_saves},\code{process_saves}: \code{list} +#' of ptMCMC objects: change points (\code{$cpts}), +#' log-likelihoods (\code{$lls}), chain ids (\code{$ids}), +#' step acceptances (\code{$step_accepts}), and swap acceptances +#' (\code{$swap_accepts}). \cr \cr +#' \code{step_chains}: \code{list} of change points, log-likelihoods, +#' and logical indicators of acceptance for each chain. \cr \cr +#' \code{propose_step}: \code{list} of change points and +#' log-likelihood values for the proposal. \cr \cr +#' \code{eval_step}: \code{logical} vector indicating if each +#' chain's proposal was accepted. \cr \cr +#' \code{take_step}: \code{list} of change points, log-likelihoods, +#' and logical indicators of acceptance for each chain. \cr \cr +#' \code{swap_chains}: \code{list} of updated change points, +#' log-likelihoods, and chain ids, as well as a \code{vector} of +#' acceptance indicators for each swap. \cr \cr +#' \code{proposed_step_mods}: \code{list} of models associated with the +#' proposed step, with an element for each chain. \cr \cr +#' \code{count_trips}: \code{list} of [1] \code{vector} of within particle +#' trip counts (\code{$trip_counts}), and [2] \code{vector} of +#' within-particle average trip rates (\code{$trip_rates}). +#' +#' @references +#' +#' Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018. 
+#' Long-term community change through multiple rapid transitions in a
+#' desert rodent community. \emph{Ecology} \strong{99}:1523-1529.
+#' \href{https://doi.org/10.1002/ecy.2373}{link}.
+#'
+#' Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory,
+#' applications, and new perspectives. \emph{Physical Chemistry Chemical
+#' Physics} \strong{7}: 3910-3916.
+#' \href{https://doi.org/10.1039/B509983H}{link}.
+#'
+#' Falcioni, M. and M. W. Deem. 1999. A biased Monte Carlo scheme for
+#' zeolite structure solution. \emph{Journal of Chemical Physics}
+#' \strong{110}: 1754-1766.
+#' \href{https://aip.scitation.org/doi/10.1063/1.477812}{link}.
+#'
+#' Geyer, C. J. 1991. Markov Chain Monte Carlo maximum likelihood. \emph{In
+#' Computing Science and Statistics: Proceedings of the 23rd Symposium on
+#' the Interface}. pp 156-163. American Statistical Association, New York,
+#' USA. \href{https://www.stat.umn.edu/geyer/f05/8931/c.pdf}{link}.
+#'
+#' Gupta, S., L. Hainsworth, J. S. Hogg, R. E. C. Lee, and J. R. Faeder.
+#' 2018. Evaluation of parallel tempering to accelerate Bayesian parameter
+#' estimation in systems biology.
+#' \href{https://arxiv.org/abs/1801.09831}{link}.
+#'
+#' Hastings, W. K. 1970. Monte Carlo sampling methods using Markov Chains
+#' and their applications. \emph{Biometrika} \strong{57}:97-109.
+#' \href{https://doi.org/10.2307/2334940}{link}.
+#'
+#' Katzgraber, H. G., S. Trebst, D. A. Huse, and M. Troyer. 2006.
+#' Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of
+#' Statistical Mechanics: Theory and Experiment} \strong{3}:P03018
+#' \href{https://bit.ly/2LICGXh}{link}.
+#'
+#' Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E.
+#' Teller. 1953. Equations of state calculations by fast computing machines.
+#' \emph{Journal of Chemical Physics} \strong{21}: 1087-1092.
+#' \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. 
+#'
+#' @name ldats_classic
+#'
+
+
+
+#' @rdname ldats_classic
+#'
+#' @export
+#'
+ldats_classic <- function(TS, control = list()){
+  TS$control$method_args$control <- do.call("ldats_classic_control", control)
+  saves <- prep_saves(TS = TS)
+  inputs <- prep_ptMCMC_inputs(TS = TS)
+  cpts <- prep_cpts(TS = TS)
+  ids <- prep_ids(TS = TS)
+  pbar <- prep_pbar(control = TS$control$method_args$control, type = "rho")
+
+  # NOTE(review): loop bound must use the merged control (defaults filled in
+  # by ldats_classic_control above); the raw `control` argument has no `nit`
+  # element when the user relies on defaults, so `1:control$nit` errors.
+  for(i in 1:TS$control$method_args$control$nit){
+    update_pbar(pbar = pbar, control = TS$control$method_args$control)
+    steps <- step_chains(TS = TS, i = i, cpts = cpts, inputs = inputs)
+    swaps <- swap_chains(chainsin = steps, inputs = inputs, ids = ids)
+    saves <- update_saves(i = i, saves = saves, steps = steps, swaps = swaps)
+    cpts <- update_cpts(cpts = cpts, swaps = swaps)
+    ids <- update_ids(ids = ids, swaps = swaps)
+  }
+
+  process_saves(saves = saves, TS = TS)
+
+}
+
+#' @rdname ldats_classic
+#'
+#' @export
+#'
+count_trips <- function(ids){
+  nit <- ncol(ids)
+  ntemps <- nrow(ids)
+  last_extreme <- NA
+  last_extreme_vector <- numeric(nit)
+  trips <- numeric(ntemps)
+  for(i in 1:ntemps){
+    for(j in 1:nit){
+      if(ids[1, j] == i){
+        last_extreme <- "bottom"
+      }
+      if(ids[ntemps, j] == i){
+        last_extreme <- "top"
+      }
+      last_extreme_vector[j] <- last_extreme
+    }
+    first_top <- match("top", last_extreme_vector)
+    if (is.na(first_top)){
+      trips[i] <- 0
+    } else{
+      last_pos <- rle(last_extreme_vector[first_top:nit])$values
+      trips[i] <- sum(last_pos == "bottom")
+    }
+  }
+  trip_rates <- trips / nit
+  list(trip_counts = trips, trip_rates = trip_rates)
+}
+
+
+
+#' @rdname ldats_classic
+#'
+#' @export
+#'
+swap_chains <- function(chainsin, inputs, ids){
+  temps <- inputs$temps
+  itemps <- 1/temps
+  ntemps <- length(temps)
+  revtemps <- seq(ntemps - 1, 1)
+  lls <- chainsin$lls
+  changepts <- chainsin$changepts
+  accept_swap <- rep(FALSE, ntemps - 1)
+
+  for (j in revtemps){
+    cutoff <- exp((itemps[j] - itemps[j + 1]) * (lls[j + 1] - lls[j]))
+    accept <- runif(1) < cutoff
+    if 
(accept) { + + accept_swap[j] <- TRUE + placeholder <- changepts[, j] + changepts[ , j] <- changepts[, j + 1] + changepts[ , j + 1] <- placeholder + + placeholder <- lls[j] + lls[j] <- lls[j + 1] + lls[j + 1] <- placeholder + + placeholder <- ids[j] + ids[j] <- ids[j + 1] + ids[j + 1] <- placeholder + } + } + list(changepts = changepts, lls = lls, ids = ids, accept_swap = accept_swap) +} + +#' @rdname ldats_classic +#' +#' @export +#' +ldats_classic_control <- function(ntemps = 6, penultimate_temp = 2^6, + ultimate_temp = 1e10, q = 0, + nit = 1e4, magnitude = 12, + burnin = 0, thin_frac = 1, + memoise = TRUE, quiet = FALSE){ + list(ntemps = ntemps, penultimate_temp = penultimate_temp, + ultimate_temp = ultimate_temp, q = q, nit = nit, + magnitude = magnitude, burnin = burnin, thin_frac = thin_frac, + memoise = memoise, quiet = quiet) +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_cpts <- function(TS){ + data <- TS$data$train$ts_data + temps <- prep_temp_sequence(TS = TS) + ntemps <- length(temps) + min_time <- min(data[ , TS$timename]) + max_time <- max(data[ , TS$timename]) + times <- seq(min_time, max_time, 1) + avail_times <- times[-c(1, length(times))] + cps <- matrix(NA, nrow = TS$nchangepoints, ncol = ntemps) + for (i in 1:ntemps){ + cp_times <- sort(sample(avail_times, TS$nchangepoints, replace = FALSE)) + cps[ , i] <- cp_times + } + lls <- rep(NA, ntemps) + for (i in 1:ntemps){ + fun <- TS$control$response + fun <- memoise_fun(fun, TS$control$memoise) + args <- list(data = data, formula = TS$formula, changepoints = cps[ , i], + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + modfit <- soft_call(what = fun, args = args, soften = TRUE) + lls[i] <- modfit$logLik + } + + if(any(lls == -Inf)){ + iter <- 1 + while(all(lls == -Inf)){ + + cps <- matrix(NA, nrow = TS$nchangepoints, ncol = ntemps) + for (i in 1:ntemps){ + cp_times <- sort(sample(avail_times, TS$nchangepoints, + replace = FALSE)) + cps[ , i] <- 
cp_times + } + lls <- rep(NA, ntemps) + for (i in 1:ntemps){ + fun <- TS$control$response + fun <- memoise_fun(fun, TS$control$memoise) + args <- list(data = data, formula = TS$formula, + changepoints = cps[ , i], + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + modfit <- soft_call(what = fun, args = args, soften = TRUE) + lls[i] <- modfit$logLik + } + iter <- iter + 1 + if(iter > 10){ + stop("max number of starts tried with all -Inf LogLiks") + } + } + to_replace <- which(lls == -Inf) + to_use <- which(lls != -Inf) + for(i in 1:length(to_replace)){ + to_use_i <- sample(to_use, 1) + lls[to_replace[i]] <- lls[to_use_i] + cps[, to_replace[i]] <- cps[ , to_use_i] + } + } + + cps <- cps[ , order(lls, decreasing = TRUE), drop = FALSE] + lls <- sort(lls, decreasing = TRUE) + + out <- list(cps, lls) + names(out) <- c("changepts", "lls") + out +} + +#' @rdname ldats_classic +#' +#' @export +#' +update_cpts <- function(cpts, swaps){ + list(changepts = swaps$changepts, lls = swaps$lls) +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_temp_sequence <- function(TS){ + ntemps <- TS$control$method_args$control$ntemps + penultimate_temp <- TS$control$method_args$control$penultimate_temp + ultimate_temp <- TS$control$method_args$control$ultimate_temp + q <- TS$control$method_args$control$q + sequence <- seq(0, log2(penultimate_temp), length.out = ntemps - 1) + log_temps <- sequence^(1 + q) / log2(penultimate_temp)^q + c(2^(log_temps), ultimate_temp) +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_saves <- function(TS){ + nchangepoints <- TS$nchangepoints + ntemps <- TS$control$method_args$control$ntemps + nit <- TS$control$method_args$control$nit + cpts <- array(NA, c(nchangepoints, ntemps, nit)) + lls <- matrix(NA, ntemps, nit) + ids <- matrix(NA, ntemps, nit) + step_accepts <- matrix(FALSE, ntemps, nit) + swap_accepts <- matrix(FALSE, ntemps - 1, nit) + list(cpts = cpts, lls = lls, ids = ids, step_accepts = 
step_accepts, + swap_accepts = swap_accepts) +} + +#' @rdname ldats_classic +#' +#' @export +#' +update_saves <- function(i, saves, steps, swaps){ + saves$cpts[ , , i] <- swaps$changepts + saves$lls[ , i] <- swaps$lls + saves$ids[ , i] <- swaps$ids + saves$step_accepts[ , i] <- steps$accept_step + saves$swap_accepts[ , i] <- swaps$accept_swap + saves +} + +#' @rdname ldats_classic +#' +#' @export +#' +process_saves <- function(saves, TS){ + nit <- TS$control$method_args$control$nit + iters <- 1:nit + if (TS$control$method_args$control$burnin > 0){ + iters <- iters[-(1:TS$control$method_args$control$burnin)] + } + niters <- length(iters) + thin_interval <- ceiling(1/TS$control$method_args$control$thin_frac) + iters_thinned <- seq(1, niters, by = thin_interval) + dims <- c(dim(saves$cpts)[1:2], length(iters_thinned)) + + trips <- count_trips(saves$ids) + diagnostics <- list(step_acceptance_rate = rowMeans(saves$step_accepts), + swap_acceptance_rate = rowMeans(saves$swap_accepts), + trip_counts = trips$trip_counts, + trip_rates = trips$trip_rates) + + + saves$cpts <- array(saves$cpts[ , , iters_thinned], dim = dims) + saves$lls <- saves$lls[, iters_thinned] + saves$ids <- saves$ids[, iters_thinned] + saves$step_accepts <- saves$step_accepts[ , iters_thinned] + saves$swap_accepts <- saves$swap_accepts[ , iters_thinned] + + saves$diagnostics <- diagnostics + + saves +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_ptMCMC_inputs <- function(TS){ + fun <- TS$control$response + fun <- memoise_fun(fun, TS$control$method_args$control$memoise) + list(control = TS$control$method_args$control, + temps = prep_temp_sequence(TS = TS), + pdist = prep_proposal_dist(TS = TS), + formula = TS$formula, + weights = TS$weights, + data = TS$data$train$ts_data, + response = TS$response, + timename = TS$timename, + fun = fun) +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_proposal_dist <- function(TS){ + nchangepoints <- TS$nchangepoints + ntemps <- 
TS$control$method_args$control$ntemps + nit <- TS$control$method_args$control$nit + if(nchangepoints == 0){ + steps <- matrix(0, nrow = nit, ncol = ntemps) + which_steps <- matrix(numeric(0), nrow = nit, ncol = ntemps) + } else{ + magnitude <- TS$control$method_args$control$magnitude + step_signs <- sample(c(-1, 1), nit * ntemps, replace = TRUE) + step_magnitudes <- 1 + rgeom(nit * ntemps, 1 / magnitude) + steps <- matrix(step_signs * step_magnitudes, nrow = nit) + which_steps <- sample.int(nchangepoints, nit * ntemps, replace = TRUE) + which_steps <- matrix(which_steps, nrow = nit) + } + list(steps = steps, which_steps = which_steps) +} + +#' @rdname ldats_classic +#' +#' @export +#' +prep_ids <- function(TS){ + ntemps <- TS$control$method_args$control$ntemps + if (!is.numeric(ntemps) || any(ntemps %% 1 != 0)){ + stop("ntemps must be integer-valued") + } + 1:ntemps +} + +#' @rdname ldats_classic +#' +#' @export +#' +update_ids <- function(ids, swaps){ + swaps$ids +} + +#' @rdname ldats_classic +#' +#' @export +#' +step_chains <- function(TS, i, cpts, inputs){ + prop_step <- propose_step(TS = TS, i = i, cpts = cpts, inputs = inputs) + accept_step <- eval_step(i = i, cpts = cpts, prop_step = prop_step, + inputs = inputs) + take_step(cpts = cpts, prop_step = prop_step, accept_step = accept_step) +} + +#' @rdname ldats_classic +#' +#' @export +#' +propose_step <- function(TS, i, cpts, inputs){ + + pdist <- inputs$pdist + ntemps <- length(inputs$temps) + selection <- cbind(pdist$which_steps[i, ], 1:ntemps) + prop_changepts <- cpts$changepts + curr_changepts_s <- cpts$changepts[selection] + prop_changepts_s <- curr_changepts_s + pdist$steps[i, ] + if(all(is.na(prop_changepts_s))){ + prop_changepts_s <- NULL + } + prop_changepts[selection] <- prop_changepts_s + mods <- proposed_step_mods(TS = TS, prop_changepts = prop_changepts, + inputs = inputs) + lls <- vapply(mods, logLik, 0) + list(changepts = prop_changepts, lls = lls) +} + +#' @rdname ldats_classic +#' +#' @export 
+#' +eval_step <- function(i, cpts, prop_step, inputs){ + temps <- inputs$temps + ntemps <- length(temps) + itemps <- 1 / temps + runif(ntemps) < exp((prop_step$lls - cpts$lls) * itemps) +} + +#' @rdname ldats_classic +#' +#' @export +#' +take_step <- function(cpts, prop_step, accept_step){ + changepts <- cpts$changepts + lls <- cpts$lls + changepts[ , accept_step] <- prop_step$changepts[ , accept_step] + lls[accept_step] <- prop_step$lls[accept_step] + list(changepts = changepts, lls = lls, accept_step = accept_step) +} + +#' @rdname ldats_classic +#' +#' @export +#' +proposed_step_mods <- function(TS, prop_changepts, inputs){ + + data <- inputs$data + formula <- inputs$formula + weights <- inputs$weights + TS_function <- inputs$TS_function + ntemps <- length(inputs$temps) + control <- inputs$control + timename <- inputs$timename + out <- vector("list", length = ntemps) + for (i in 1:ntemps){ + + fun <- inputs$fun + args <- list(data = data, formula = TS$formula, + changepoints = prop_changepts[ , i], + timename = TS$timename, weights = TS$weights, + control = control) + out[[i]] <- soft_call(what = fun, args = args, soften = TRUE) + } + out +} + + + diff --git a/R/TS_models.R b/R/TS_models.R new file mode 100644 index 00000000..3f2fecb5 --- /dev/null +++ b/R/TS_models.R @@ -0,0 +1,503 @@ +#' @title Estimate a Time Series model sequentially +#' +#' @description This set of functions estimates the Time Series model +#' by sequential methods that first estimate the change point locations +#' with full flexibility of the regressor models between change points, +#' then estimate the regressors between the change points, initially +#' conditional on their locations, but with marginal estimation to produce +#' regressor values unconditional on change point locations. \cr \cr +#' \code{sequential_TS} combines each stage of the model estimation and +#' packages the model results in a consistent output. 
 \cr \cr +#' \code{sequential_TS_control} defines and creates the control \code{list} +#' used to fit the sequential Time Series model. \cr \cr +#' \code{est_changepoints} estimates the change point location +#' distributions for multinomial Time Series analyses. \cr \cr +#' \code{est_regressors} uses the marginal posterior distributions of +#' the change point locations (estimated by +#' \code{\link{est_changepoints}}) in combination with the conditional +#' (on the change point locations) posterior distributions of the +#' regressors (estimated by a \code{_TS} function) to +#' estimate the marginal posterior distribution of the regressors, +#' unconditional on the change point locations. \cr \cr +#' \code{package_sequential_TS} calculates relevant summaries for the run of +#' a sequential Time Series model within \code{\link{sequential_TS}} and +#' packages the output as a \code{TS}-class object. \cr \cr +#' \code{summarize_etas} calculates summary statistics for each +#' of the chunk-level regressors. \cr \cr +#' \code{measure_eta_vcov} generates the variance-covariance matrix for +#' the regressors. \cr \cr +#' \code{summarize_rhos} calculates summary statistics for each +#' of the change point locations. \cr \cr +#' \code{measure_rho_vcov} generates the variance-covariance matrix for the +#' change point locations. 
+#' +#' @param etas \code{matrix} of regressors (columns) across iterations of the +#' sampler (rows), as returned from \code{\link{est_regressors}}. +#' +#' @param rhos \code{matrix} of change point locations (columns) across +#' iterations of the sampler (rows) or \code{NULL} if no change points are +#' in the model, as returned from \code{\link{est_changepoints}}. +#' +#' @param TS Time series model \code{list}. +#' +#' @param summary_prob Probability used for summarizing the posterior +#' distributions (via the highest posterior density interval, see +#' \code{\link[coda]{HPDinterval}}). +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). +#' +#' @param soften \code{logical} indicator of whether the model should error +#' softly or if errors should trigger a full-stop to the pipeline. +#' +#' @param method \code{function} used to drive the sampler of the TS +#' models; \code{method} defines and operates the computational procedure. +#' \cr \cr +#' Current pre-built options include \code{\link{ldats_classic}}. +#' +#' @param method_args \code{list} of (named) arguments to be used in +#' \code{method} via \code{\link{do.call}}. +#' \cr \cr +#' Could be managed via a \code{_control} function like +#' \code{\link{ldats_classic_control}}. +#' +#' @param ... Not passed along to the output, rather included to allow for +#' automated removal of unneeded controls. +#' +#' @details The general approach follows that of Western and Kleykamp +#' (2004), although we note some important differences. Our regression +#' models are fit independently for each chunk (segment of time), and +#' therefore the variance-covariance matrix for the full model +#' has \code{0} entries for covariances between regressors in different +#' chunks of the time series. \cr \cr +#' Further differences are model-specific. 
 For example, the original softmax +#' multinomial regression model used here is a standard (non-hierarchical) +#' model (Ripley 1996, Venables and Ripley 2002, Bishop 2006), meaning that +#' there is no error term in the regression (as there is in the normal +#' model used by Western and Kleykamp 2004), and so the posterior +#' distribution used here is a multivariate normal, as opposed to a +#' multivariate t, as used by Western and Kleykamp (2004). +#' +#' @references +#' Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. +#' Springer, New York, NY, USA. +#' +#' Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. +#' Cambridge University Press, Cambridge, UK. +#' +#' Venables, W. N. and B. D. Ripley. 2002. \emph{Modern Applied +#' Statistics with S}. Fourth Edition. Springer, New York, NY, USA. +#' +#' Western, B. and M. Kleykamp. 2004. A Bayesian change point model for +#' historical time series analysis. \emph{Political Analysis} +#' \strong{12}:354-374. +#' \href{https://doi.org/10.1093/pan/mph023}{link}. 
+#' +#' @return +#' \code{sequential_TS} and \code{package_sequential_TS}: +#' \code{TS}-class list containing the following elements, many of +#' which are hidden for \code{print}ing, but are accessible: +#' \describe{ +#' \item{data}{\code{data} input to the function.} +#' \item{formula}{\code{\link[stats]{formula}} input to the function.} +#' \item{nchangepoints}{\code{nchangepoints} input to the function.} +#' \item{weights}{\code{weights} input to the function.} +#' \item{timename}{\code{timename} input to the function.} +#' \item{control}{\code{control} input to the function.} +#' \item{lls}{Iteration-by-iteration +#' \link[=logLik.TS_fit]{logLik} values for the +#' full time series fit by \code{\link{multinom_TS}}.} +#' \item{rhos}{Iteration-by-iteration change point estimates from +#' \code{\link{est_changepoints}} and diagnostics.} +#' \item{focal_rhos}{Simplified object of just the change point +#' locations of interest.} +#' \item{etas}{Iteration-by-iteration marginal regressor estimates from +#' \code{\link{est_regressors}}, which have been +#' unconditioned with respect to change point locations.} +#' \item{rho_summary}{Summary table describing \code{rhos} (the change +#' point locations), see +#' \code{\link{summarize_rhos}}.} +#' \item{rho_vcov}{Variance-covariance matrix for the estimates of +#' \code{rhos} (the change point locations), see +#' \code{\link{measure_rho_vcov}}.} +#' \item{eta_summary}{Summary table describing \code{etas} (the +#' regressors), see +#' \code{\link{summarize_etas}}.} +#' \item{eta_vcov}{Variance-covariance matrix for the estimates of +#' \code{etas} (the regressors), see +#' \code{\link{measure_eta_vcov}}.} +#' \item{logLik}{Across-iteration average of log-likelihoods +#' (\code{lls}).} +#' \item{nparams}{Total number of parameters in the full model, +#' including the change point locations and regressors.} +#' } \cr \cr +#' \code{sequential_TS_control}: \code{list} of named control elements for +#' sequential model 
 fitting. \cr \cr +#' \code{est_changepoints}: \code{list} of saved data objects from the +#' estimation of change point locations, unless \code{nchangepoints} +#' is 0, then \code{NULL}. \cr \cr +#' \code{est_regressors}: \code{matrix} of draws (rows) from the marginal +#' posteriors of the coefficients across the segments (columns). \cr \cr +#' \code{summarize_etas}: table of summary statistics for chunk-level +#' regressors including mean, median, mode, posterior interval, standard +#' deviation, MCMC error, autocorrelation, and effective sample size for +#' each regressor. \cr \cr +#' \code{measure_eta_vcov}: variance-covariance matrix for chunk-level +#' regressors. \cr \cr +#' \code{summarize_rhos}: table of summary statistics for change point +#' locations including mean, median, mode, posterior interval, standard +#' deviation, MCMC error, autocorrelation, and effective sample size for +#' each change point location. \cr \cr +#' \code{measure_rho_vcov}: variance-covariance matrix for change +#' point locations. 
+#' +#' @name sequential_TS +#' + +#' @rdname sequential_TS +#' +#' @export +#' +sequential_TS <- function(TS, control = list()){ + TS$control$model_args$control <- do.call("sequential_TS_control", control) + rho_dist <- est_changepoints(TS = TS) + eta_dist <- est_regressors(rho_dist = rho_dist, TS = TS) + package_sequential_TS(TS = TS, rho_dist = rho_dist, eta_dist = eta_dist) +} + +#' @rdname sequential_TS +#' +#' @export +#' +package_sequential_TS <- function(TS, rho_dist, eta_dist){ + if(is.null(rho_dist)){ + focal_rhos <- NULL + data <- TS$data$train$ts_data + fun <- TS$control$response + args <- list(data = data, formula = TS$formula, changepoints = NULL, + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + mod <- soft_call(what = fun, args = args, soften = TRUE) + lls <- as.numeric(logLik(mod)) + + } else{ + vals <- rho_dist$cpts[ , 1, , drop = FALSE] + dims <- dim(rho_dist$cpts)[c(1, 3)] + lls <- rho_dist$lls[1, ] + focal_rhos <- t(array(vals, dim = dims)) + } + + rho_summary <- summarize_rhos(rhos = focal_rhos, TS = TS) + rho_vcov <- measure_rho_vcov(rhos = focal_rhos) + eta_summary <- summarize_etas(etas = eta_dist, TS = TS) + eta_vcov <- measure_eta_vcov(etas = eta_dist) + + logLik <- mean(lls) + ncoefs <- ncol(eta_dist) + nparams <- TS$nchangepoints + ncoefs + + out <- update_list(TS, focal_rhos = focal_rhos, rhos = rho_dist, + etas = eta_dist, rho_summary = rho_summary, + rho_vcov = rho_vcov, eta_summary = eta_summary, + eta_vcov = eta_vcov, logLik = logLik, nparams = nparams) + class(out) <- c("TS", "list") + to_hide <- c("data", "weights", "control", "lls", "rhos", "etas", + "focal_rhos", "rho_vcov", "eta_vcov") + if (TS$nchangepoints == 0){ + to_hide <- c(to_hide, "rho_summary") + } + attr(out, "hidden") <- to_hide + out +} + +#' @rdname sequential_TS +#' +#' @export +#' +est_changepoints <- function(TS){ + if (TS$nchangepoints == 0){ + return(NULL) + } + fun <- TS$control$method + args <- 
update_list(TS$control$method_args, TS = TS) + soft_call(what = fun, args = args, soften = TS$control$soften) +} + + +#' @rdname sequential_TS +#' +#' @export +#' +est_regressors <- function(rho_dist, TS){ + data <- TS$data$train$ts_data + if(is.null(rho_dist)){ + + msg <- " - estimating regressor distribution" + messageq(msg, TS$control$quiet) + + fun <- TS$control$response + args <- list(data = data, formula = TS$formula, changepoints = NULL, + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + mod <- soft_call(what = fun, args = args, soften = TRUE) + + seg_mod <- mod[[1]][[1]] + mod_class <- class(seg_mod) + coefs <- coef(seg_mod) + weighted <- !all(seg_mod$weights == 1) + if(all(c("multinom", "nnet") %in% mod_class)){ + coefs <- t(coefs) + vcv <- mirror_vcov(seg_mod) + } else if(all(c("mlm", "lm") %in% mod_class)){ + nresps <- length(summary(seg_mod)) + + if(weighted){ + resp_vcv_dim <- dim(vcov(summary(seg_mod)[[1]])) + full_vcv <- matrix(0, resp_vcv_dim[1] * nresps, + resp_vcv_dim[1] * nresps) + for(i in 1:nresps){ + in_row <- (1 + (i - 1) * resp_vcv_dim[1]):(i * resp_vcv_dim[1]) + in_col <- in_row + full_vcv[in_row, in_col] <- vcov(summary(seg_mod)[[i]]) + } + vcv <- full_vcv + coef_names <- rep(row.names(coef(summary(seg_mod)[[1]])), nresps) + resp_names <- rep(1:nresps, each = NROW(coef(summary(seg_mod)[[1]]))) + colnames(vcv) <- paste(resp_names, coef_names, sep =":") + } else{ + vcv <- vcov(seg_mod) + resp_names <- rep(1:nresps, each = NROW(coef(summary(seg_mod)[[1]]))) + colnames(vcv) <- paste(resp_names, colnames(vcv), sep ="") + rownames(vcv) <- paste(resp_names, rownames(vcv), sep ="") + } + } else{ + vcv <- mirror_vcov(seg_mod) + } + + mv <- as.vector(coefs) + + eta <- rmvnorm(TS$control$method_args$control$nit, mv, vcv) + seg_names <- rep(1, ncol(vcv)) + coef_names <- colnames(vcv) + colnames(eta) <- paste(seg_names, coef_names, sep = "_") + + return(eta) + } + + + focal_rho <- rho_dist$cpts[ , 1, ] + 
nchangepts <- dim(rho_dist$cpts)[1] + if (nchangepts == 1){ + collapsedrho <- focal_rho + } else{ + collapsedrho <- apply(focal_rho, 2, paste, collapse = "_") + } + freq_r <- table(collapsedrho) + unique_r <- names(freq_r) + nr <- length(unique_r) + n_topic <- ncol(data$gamma) + n_covar <- length(attr(terms(TS$formula), "term.labels")) + + fun <- TS$control$response + fun <- memoise_fun(fun, TS$control$method_args$control$memoise) + args <- list(data = data, formula = TS$formula, changepoints = NULL, + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + mod <- soft_call(what = fun, args = args, soften = TRUE) + mod_class <- class(mod[[1]][[1]]) + + + if(all(c("mlm", "lm") %in% mod_class) && + TS$control$response_args$control$transformation == "clr"){ + n_eta_segment <- (n_topic) * (n_covar + 1) + } else{ + n_eta_segment <- (n_topic - 1) * (n_covar + 1) + } + + n_changept <- dim(rho_dist$cpts)[1] + n_segment <- n_changept + 1 + n_eta <- n_eta_segment * n_segment + n_iter <- dim(rho_dist$cpts)[3] + eta <- matrix(NA, nrow = n_iter, ncol = n_eta) + pbar <- prep_pbar(control = TS$control$model_args$control, + type = "eta", nr = nr) + + for(i in 1:nr){ + update_pbar(pbar = pbar, control = TS$control$model_args$control) + cpts <- as.numeric(strsplit(unique_r[i], "_")[[1]]) + + + data <- TS$data$train$ts_data + fun <- TS$control$response + fun <- memoise_fun(fun, TS$control$method_args$control$memoise) + args <- list(data = data, formula = TS$formula, changepoints = cpts, + timename = TS$timename, weights = TS$weights, + control = TS$control$response_args$control) + mod <- soft_call(what = fun, args = args, soften = TRUE) + + + ndraws <- freq_r[i] + colindex1 <- 1 + for(j in 1:n_segment){ + colindex2 <- colindex1 + n_eta_segment - 1 + seg_mod <- mod[[1]][[j]] + + coefs <- coef(seg_mod) + mod_class <- class(seg_mod) + weighted <- !all(seg_mod$weights == 1) + + if(all(c("multinom", "nnet") %in% mod_class)){ + coefs <- t(coefs) + vcv 
<- mirror_vcov(seg_mod) + + } else if(all(c("mlm", "lm") %in% mod_class)){ + nresps <- length(summary(seg_mod)) + if(weighted){ + resp_vcv_dim <- dim(vcov(summary(seg_mod)[[1]])) + full_vcv <- matrix(0, resp_vcv_dim[1] * nresps, + resp_vcv_dim[1] * nresps) + for(k in 1:nresps){ + in_row <- (1 + (k - 1) * resp_vcv_dim[1]):(k * resp_vcv_dim[1]) + in_col <- in_row + full_vcv[in_row, in_col] <- vcov(summary(seg_mod)[[k]]) + } + vcv <- full_vcv + coef_names <- rep(row.names(coef(summary(seg_mod)[[1]])), nresps) + resp_names <- rep(1:nresps, + each = NROW(coef(summary(seg_mod)[[1]]))) + colnames(vcv) <- paste(resp_names, coef_names, sep =":") + } else{ + vcv <- vcov(seg_mod) + resp_names <- rep(1:nresps, + each = NROW(coef(summary(seg_mod)[[1]]))) + colnames(vcv) <- paste(resp_names, colnames(vcv), sep ="") + rownames(vcv) <- paste(resp_names, rownames(vcv), sep ="") + } + } else{ + vcv <- mirror_vcov(seg_mod) + } + + + mv <- as.vector(coefs) + drawn <- rmvnorm(ndraws, mv, vcv) + rows_in <- which(collapsedrho == unique_r[i]) + cols_in <- colindex1:colindex2 + eta[rows_in, cols_in] <- drawn + colindex1 <- colindex2 + 1 + } + } + seg_names <- rep(1:n_segment, each = n_eta_segment) + coef_names <- rep(colnames(vcv), n_segment) + colnames(eta) <- paste(seg_names, coef_names, sep = "_") + eta +} + + +#' @rdname sequential_TS +#' +#' @export +#' +sequential_TS_control <- function(method = ldats_classic, + method_args = ldats_classic_control(), + summary_prob = 0.95, soften = TRUE, + quiet = FALSE, ...){ + list(method = method, method_args = method_args, + summary_prob = summary_prob, soften = soften, quiet = quiet) +} + +#' @rdname sequential_TS +#' +#' @export +#' +summarize_etas <- function(etas, TS){ + if (!is.matrix(etas)){ + stop("etas should be a matrix") + } + prob <- TS$control$summary_prob + Mean <- round(apply(etas, 2, mean), 4) + Median <- round(apply(etas, 2, median), 4) + SD <- round(apply(etas, 2, sd), 4) + MCMCerr <- round(SD / sqrt(nrow(etas)), 4) + HPD <- 
HPDinterval(as.mcmc(etas), prob = prob) + Lower <- round(HPD[ , "lower"], 4) + Upper <- round(HPD[ , "upper"], 4) + AC10 <- tryCatch(t(round(autocorr.diag(as.mcmc(etas), lag = 10), 4)), + error = function(x){"-"}) + ESS <- effectiveSize(etas) + out <- data.frame(Mean, Median, Lower, Upper, SD, MCMCerr, AC10, ESS) + colnames(out)[3:4] <- paste0(c("Lower_", "Upper_"), paste0(prob*100, "%")) + colnames(out)[7] <- "AC10" + rownames(out) <- colnames(etas) + out +} + +#' @rdname sequential_TS +#' +#' @export +#' +measure_eta_vcov <- function(etas){ + if (!is.matrix(etas)){ + stop("expecting etas to be a matrix") + } + out <- var(etas) + colnames(out) <- colnames(etas) + rownames(out) <- colnames(etas) + out +} + + + +#' @rdname sequential_TS +#' +#' @export +#' +summarize_rhos <- function(rhos, TS){ + if (is.null(rhos)) { + return() + } + prob <- TS$control$summary_prob + Mean <- round(apply(rhos, 2, mean), 2) + Median <- apply(rhos, 2, median) + Mode <- apply(rhos, 2, modalvalue) + SD <- round(apply(rhos, 2, sd), 2) + MCMCerr <- round(SD / sqrt(nrow(rhos)), 4) + HPD <- HPDinterval(as.mcmc(rhos), prob = prob) + Lower <- HPD[ , "lower"] + Upper <- HPD[ , "upper"] + AC10 <- t(round(autocorr.diag(as.mcmc(rhos), lag = 10), 4)) + ESS <- effectiveSize(rhos) + out <- data.frame(Mean, Median, Mode, Lower, Upper, SD, MCMCerr, AC10, ESS) + colnames(out)[4:5] <- paste0(c("Lower_", "Upper_"), paste0(prob*100, "%")) + colnames(out)[8] <- "AC10" + rownames(out) <- sprintf("Changepoint_%d", seq_len(nrow(out))) + out +} + +#' @rdname sequential_TS +#' +#' @export +#' +measure_rho_vcov <- function(rhos){ + if (is.null(rhos)) { + return() + } + if (!is.matrix(rhos)){ + stop("expecting rhos to be a matrix") + } + out <- var(rhos) + colnames(out) <- sprintf("CP_%d", 1:dim(out)[1]) + rownames(out) <- sprintf("CP_%d", 1:dim(out)[2]) + out +} diff --git a/R/TS_on_LDA.R b/R/TS_on_LDA.R deleted file mode 100644 index c57712fe..00000000 --- a/R/TS_on_LDA.R +++ /dev/null @@ -1,704 +0,0 @@ -#' 
@title Conduct a set of Time Series analyses on a set of LDA models -#' -#' @description This is a wrapper function that expands the main Time Series -#' analyses function (\code{\link{TS}}) across the LDA models (estimated -#' using \code{\link[topicmodels]{LDA}} or \code{\link{LDA_set}} and the -#' Time Series models, with respect to both continuous time formulas and the -#' number of discrete changepoints. This function allows direct passage of -#' the control parameters for the parallel tempering MCMC through to the -#' main Time Series function, \code{\link{TS}}, via the -#' \code{ptMCMC_controls} argument. \cr \cr -#' \code{check_TS_on_LDA_inputs} checks that the inputs to -#' \code{TS_on_LDA} are of proper classes for a full analysis. -#' -#' @param LDA_models List of LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param document_covariate_table Document covariate table (rows: documents, -#' columns: time index and covariate options). Every model needs a -#' covariate to describe the time value for each document (in whatever -#' units and whose name in the table is input in \code{timename}) -#' that dictates the application of the change points. -#' In addition, all covariates named within specific models in -#' \code{formula} must be included. Must be a conformable to a data table, -#' as verified by \code{\link{check_document_covariate_table}}. -#' -#' @param formulas Vector of \code{\link[stats]{formula}}(s) for the -#' continuous (non-change point) component of the time series models. Any -#' predictor variable included in a formula must also be a column in the -#' \code{document_covariate_table}. Each element (formula) in the vector -#' is evaluated for each number of change points and each LDA model. 
-#' -#' @param nchangepoints Vector of \code{integer}s corresponding to the number -#' of change points to include in the time series models. 0 is a valid input -#' corresponding to no change points (\emph{i.e.}, a singular time series -#' model), and the current implementation can reasonably include up to 6 -#' change points. Each element in the vector is the number of change points -#' used to segment the data for each formula (entry in \code{formulas}) -#' component of the TS model, for each selected LDA model. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using \code{document_weights}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{TS_on_LDA}: \code{TS_on_LDA}-class \code{list} of results -#' from \code{\link{TS}} applied for each model on each LDA model input. 
-#' \cr \cr -#' \code{check_TS_inputs}: An error message is thrown if any input -#' is not proper, else \code{NULL}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, -#' nchangepoints = 0:1, timename = "newmoon", weights) -#' } -#' -#' @export -#' -TS_on_LDA <- function(LDA_models, document_covariate_table, formulas = ~ 1, - nchangepoints = 0, timename = "time", weights = NULL, - control = list()){ - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, weights, control) - control <- do.call("TS_control", control) - mods <- expand_TS(LDA_models, formulas, nchangepoints) - nmods <- nrow(mods) - TSmods <- vector("list", nmods) - - for(i in 1:nmods){ - print_model_run_message(mods, i, LDA_models, control) - formula_i <- mods$formula[[i]] - nchangepoints_i <- mods$nchangepoints[i] - data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, i) - TSmods[[i]] <- TS(data_i, formula_i, nchangepoints_i, timename, weights, - control) - } - package_TS_on_LDA(TSmods, LDA_models, mods) - -} - -#' @title Prepare the model-specific data to be used in the TS analysis -#' of LDA output -#' -#' @description Append the estimated topic proportions from a fitted LDA model -#' to the document covariate table to create the data structure needed for -#' \code{\link{TS}}. -#' -#' @param document_covariate_table Document covariate table (rows: documents, -#' columns: time index and covariate options). 
Every model needs a -#' covariate to describe the time value for each document (in whatever -#' units and whose name in the table is input in \code{timename}) -#' that dictates the application of the change points. -#' In addition, all covariates named within specific models in -#' \code{formula} must be included. Must be a conformable to a data table, -#' as verified by \code{\link{check_document_covariate_table}}. -#' -#' @param LDA_models List of LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param mods The \code{data.table} created by \code{\link{expand_TS}} that -#' contains each of the models (defined by the LDA model to use and the and -#' formula number of changepoints for the TS model). Indexed here by -#' \code{i}. -#' -#' @param i \code{integer} index referencing the row in \code{mods} to use. -#' -#' @return Class \code{data.frame} object including [1] the time variable -#' (indicated in \code{control}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated -#' in \code{formula}), ready for input into \code{TS}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' mods <- expand_TS(LDA_models, formulas = ~1, nchangepoints = 0) -#' data1 <- prep_TS_data(document_covariate_table, LDA_models, mods) -#' } -#' -#' @export -#' -prep_TS_data <- function(document_covariate_table, LDA_models, mods, i = 1){ - check_document_covariate_table(document_covariate_table, LDA_models) - check_LDA_models(LDA_models) - if(is(LDA_models, "LDA")){ - LDA_models <- c(LDA_models) - class(LDA_models) <- c("LDA_set", "list") - } - data_i <- document_covariate_table - data_i$gamma <- LDA_models[[mods$LDA[i]]]@gamma - data_i -} - -#' @title Select the best Time Series model -#' -#' @description Select the best model of interest from an -#' \code{TS_on_LDA} object generated by \code{\link{TS_on_LDA}}, based on -#' a set of user-provided functions. The functions default to choosing the -#' model with the lowest AIC value. \cr \cr -#' Presently, the set of functions should result in a singular selected -#' model. If multiple models are chosen via the selection, only the first -#' is returned. -#' -#' @param TS_models An object of class \code{TS_on_LDA} produced by -#' \code{\link{TS_on_LDA}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return A reduced version of \code{TS_models} that only includes the -#' selected TS model. The returned object is a single TS model object of -#' class \code{TS_fit}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, -#' nchangepoints = 0:1, timename = "newmoon", weights) -#' select_TS(mods) -#' } -#' -#' @export -#' -select_TS <- function(TS_models, control = list()){ - if (!("TS_on_LDA" %in% class(TS_models))){ - stop("TS_models must be of class TS_on_LDA") - } - check_control(control) - control <- do.call("TS_control", control) - measurer <- control$measurer - selector <- control$selector - TS_measured <- vapply(TS_models, measurer, 0) %>% - matrix(ncol = 1) - TS_selected <- apply(TS_measured, 2, selector) - which_selected <- which(TS_measured %in% TS_selected) - if (length(which_selected) > 1){ - warning("Selection results in multiple models, returning first") - which_selected <- which_selected[1] - } - out <- TS_models[[which_selected]] - class(out) <- c("TS_fit", "list") - out -} - -#' @title Package the output of TS_on_LDA -#' -#' @description Set the class and name the elements of the results list -#' returned from applying \code{\link{TS}} to the combination of TS models -#' requested for the LDA model(s) input. -#' -#' @param TSmods list of results from \code{\link{TS}} applied for each model -#' on each LDA model input. -#' -#' @param LDA_models List of LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param models \code{data.frame} object returned from -#' \code{\link{expand_TS}} that contains the combinations of LDA models, -#' and formulas and nchangepoints used in the TS models. 
-#' -#' @return Class \code{TS_on_LDA} list of results from \code{\link{TS}} -#' applied for each model on each LDA model input. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' mods <- expand_TS(LDA_models, c(~ 1, ~ newmoon), 0:1) -#' nmods <- nrow(mods) -#' TSmods <- vector("list", nmods) -#' for(i in 1:nmods){ -#' formula_i <- mods$formula[[i]] -#' nchangepoints_i <- mods$nchangepoints[i] -#' data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, i) -#' TSmods[[i]] <- TS(data_i, formula_i, nchangepoints_i, "newmoon", -#' weights, TS_control()) -#' } -#' package_TS_on_LDA(TSmods, LDA_models, mods) -#' } -#' -#' @export -#' -package_TS_on_LDA <- function(TSmods, LDA_models, models){ - check_LDA_models(LDA_models) - if(is(LDA_models, "LDA")){ - LDA_models <- c(LDA_models) - class(LDA_models) <- c("LDA_set", "list") - } - nmodels <- nrow(models) - nms <- rep(NA, nmodels) - for (i in 1:nmodels){ - nms[i] <- paste0(names(LDA_models)[models$LDA[i]], ", ", - deparse(models$formula[[i]]), ", ", - models$nchangepoints[i], " changepoints") - } - names(TSmods) <- nms - class(TSmods) <- list("TS_on_LDA", "list") - TSmods -} - - -#' @title Print a set of Time Series models fit to LDAs -#' -#' @description Convenience function to print only the names of a -#' \code{TS_on_LDA}-class object generated by \code{\link{TS_on_LDA}}. -#' -#' @param x Class \code{TS_on_LDA} object to be printed. -#' -#' @param ... Not used, simply included to maintain method compatibility. -#' -#' @return \code{character} \code{vector} of the names of \code{x}'s models. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, -#' nchangepoints = 0:1, timename = "newmoon", weights) -#' print(mods) -#' } -#' -#' @export -#' -print.TS_on_LDA <- function(x, ...){ - print(names(x)) -} - -#' @title Print the message to the console about which combination of the -#' Time Series and LDA models is being run -#' -#' @description If desired, print a message at the beginning of every model -#' combination stating the TS model and the LDA model being evaluated. -#' -#' @param models \code{data.frame} object returned from -#' \code{\link{expand_TS}} that contains the combinations of LDA models, -#' and formulas and nchangepoints used in the TS models. -#' -#' @param i \code{integer} index of the row to use from \code{models}. -#' -#' @param LDA_models List of LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. Of particular importance here is -#' the \code{logical}-class element named \code{quiet}. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' nchangepoints <- 0:1 -#' mods <- expand_TS(LDA_models, formulas, nchangepoints) -#' print_model_run_message(mods, 1, LDA_models, TS_control()) -#' } -#' -#' @export -#' -print_model_run_message <- function(models, i, LDA_models, control){ - control <- do.call("TS_control", control) - equation <- deparse(models$formula[[i]]) - chngpt_msg <- paste0("with ", models$nchangepoints[i], " changepoints ") - reg_msg <- paste0("and equation ", equation) - ts_msg <- paste0(chngpt_msg, reg_msg) - lda_msg <- names(LDA_models)[models$LDA[i]] - msg <- paste0("Running TS model ", ts_msg, " on LDA model ", lda_msg, "\n") - messageq(msg, control$quiet) -} - -#' @title Expand the TS models across the factorial combination of -#' LDA models, formulas, and number of change points -#' -#' @description Expand the completely crossed combination of model inputs: -#' LDA model results, formulas, and number of change points. -#' -#' @param LDA_models List of LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param formulas Vector of \code{\link[stats]{formula}}(s) for the -#' continuous (non-change point) component of the time series models. Any -#' predictor variable included in a formula must also be a column in the -#' \code{document_covariate_table}. Each element (formula) in the vector -#' is evaluated for each number of change points and each LDA model. -#' -#' @param nchangepoints Vector of \code{integer}s corresponding to the number -#' of change points to include in the time series models. 
0 is a valid input -#' corresponding to no change points (\emph{i.e.}, a singular time series -#' model), and the current implementation can reasonably include up to 6 -#' change points. Each element in the vector is the number of change points -#' used to segment the data for each formula (entry in \code{formulas}) -#' component of the TS model, for each selected LDA model. -#' -#' @return Expanded \code{data.frame} table of the three values (columns) for -#' each unique model run (rows): [1] the LDA model (indicated -#' as a numeric element reference to the \code{LDA_models} object), [2] the -#' regressor formula, and [3] the number of changepoints. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) -#' LDA_models <- select_LDA(LDAs) -#' weights <- document_weights(document_term_table) -#' formulas <- c(~ 1, ~ newmoon) -#' nchangepoints <- 0:1 -#' expand_TS(LDA_models, formulas, nchangepoints) -#' } -#' -#' @export -#' -expand_TS <- function(LDA_models, formulas, nchangepoints){ - check_LDA_models(LDA_models) - check_nchangepoints(nchangepoints) - if (is(LDA_models, "LDA")) { - LDA_models <- c(LDA_models) - class(LDA_models) <- c("LDA_set", "list") - } - if (!is(formulas, "list")) { - if (is(formulas, "formula")) { - formulas <- c(formulas) - } else{ - stop("formulas does not contain formula(s)") - } - } else if (!all(vapply(formulas, is, TRUE, "formula"))) { - stop("formulas does not contain all formula(s)") - } - formulas - - out <- formulas - for (i in seq_along(formulas)) { - tformula <- paste(as.character(formulas[[i]]), collapse = "") - out[[i]] <- as.formula(paste("gamma", tformula)) - } - formulas <- out - nmods <- length(LDA_models) - mods <- 1:nmods - out <- expand.grid(mods, formulas, nchangepoints, stringsAsFactors = FALSE) - colnames(out) <- c("LDA", "formula", 
"nchangepoints") - out -} - -#' @title Check that nchangepoints vector is proper -#' -#' @description Check that the vector of numbers of changepoints is -#' conformable to integers greater than 1. -#' -#' @param nchangepoints Vector of the number of changepoints to evaluate. -#' -#' @return An error message is thrown if \code{nchangepoints} is not proper, -#' else \code{NULL}. -#' -#' @examples -#' check_nchangepoints(0) -#' check_nchangepoints(2) -#' -#' @export -#' -check_nchangepoints <- function(nchangepoints){ - if (!is.numeric(nchangepoints) || any(nchangepoints %% 1 != 0)){ - stop("nchangepoints must be integer-valued") - } - if (any(nchangepoints < 0)){ - stop("nchangepoints must be non-negative") - } - return() -} - -#' @title Check that weights vector is proper -#' -#' @description Check that the vector of document weights is numeric and -#' positive and inform the user if the average weight isn't 1. -#' -#' @param weights Vector of the document weights to evaluate, or \code{TRUE} -#' for triggering internal weighting by document sizes. -#' -#' @return An error message is thrown if \code{weights} is not proper, -#' else \code{NULL}. 
-#' -#' @examples -#' check_weights(1) -#' wts <- runif(100, 0.1, 100) -#' check_weights(wts) -#' wts2 <- wts / mean(wts) -#' check_weights(wts2) -#' check_weights(TRUE) -#' -#' @export -#' -check_weights <- function(weights){ - if(is.logical(weights)){ - if(weights){ - return() - } else{ - stop("if logical, weights need to be TRUE") - } - } - if(!is.null(weights)){ - if (!is.numeric(weights)){ - stop("weights vector must be numeric") - } - if (any(weights <= 0)){ - stop("weights must be positive") - } - if (round(mean(weights)) != 1){ - warning("weights should have a mean of 1, fit may be unstable") - } - } - return() -} - -#' @title Check that LDA model input is proper -#' -#' @description Check that the \code{LDA_models} input is either a set of -#' LDA models (class \code{LDA_set}, produced by -#' \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -#' produced by \code{\link[topicmodels]{LDA}}). -#' -#' @param LDA_models List of LDA models or singular LDA model to evaluate. -#' -#' @return An error message is thrown if \code{LDA_models} is not proper, -#' else \code{NULL}. -#' -#' @examples -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDAs <- LDA_set(document_term_table, topics = 2, nseeds = 1) -#' LDA_models <- select_LDA(LDAs) -#' check_LDA_models(LDA_models) -#' -#' @export -#' -check_LDA_models <- function(LDA_models){ - if(("LDA_set" %in% class(LDA_models)) == FALSE){ - if(is(LDA_models, "LDA") == FALSE){ - stop("LDA_models is not an LDA object or LDA_set object") - } - } - return() -} - -#' @title Check that the document covariate table is proper -#' -#' @description Check that the table of document-level covariates is -#' conformable to a data frame and of the right size (correct number of -#' documents) for the document-topic output from the LDA models. -#' -#' @param document_covariate_table Document covariate table to evaluate. 
-#' -#' @param LDA_models Reference LDA model list (class \code{LDA_set}) that -#' includes as its first element a properly fitted \code{LDA} model with -#' a \code{gamma} slot with the document-topic distribution. -#' -#' @param document_term_table Optional input for checking when -#' \code{LDA_models} is \code{NULL} -#' -#' @return An error message is thrown if \code{document_covariate_table} is -#' not proper, else \code{NULL}. -#' -#' @examples -#' data(rodents) -#' check_document_covariate_table(rodents$document_covariate_table) -#' -#' @export -#' -check_document_covariate_table <- function(document_covariate_table, - LDA_models = NULL, - document_term_table = NULL){ - dct_df <- tryCatch(data.frame(document_covariate_table), - warning = function(x){NA}, error = function(x){NA}) - if(is(LDA_models, "LDA")){ - LDA_models <- c(LDA_models) - class(LDA_models) <- c("LDA_set", "list") - } - if (length(dct_df) == 1 && is.na(dct_df)){ - stop("document_covariate_table is not conformable to a data frame") - } - if (!is.null(LDA_models)){ - if (nrow(data.frame(document_covariate_table)) != - nrow(LDA_models[[1]]@gamma)){ - stop("number of documents in covariate table is not equal to number of - documents observed") - } - } else if (!is.null(document_term_table)){ - if (nrow(data.frame(document_covariate_table)) != - nrow(data.frame(document_term_table))){ - stop("number of documents in covariate table is not equal to number of - documents observed") - } - } - return() -} - -#' @title Check that the time vector is proper -#' -#' @description Check that the vector of time values is included in the -#' document covariate table and that it is either a integer-conformable or -#' a \code{date}. If it is a \code{date}, the input is converted to an -#' integer, resulting in the timestep being 1 day, which is often not -#' desired behavior. -#' -#' @param document_covariate_table Document covariate table used to query -#' for the time column. 
-#' -#' @param timename Column name for the time variable to evaluate. -#' -#' @return An error message is thrown if \code{timename} is -#' not proper, else \code{NULL}. -#' -#' @examples -#' data(rodents) -#' check_timename(rodents$document_covariate_table, "newmoon") -#' -#' @export -#' -check_timename <- function(document_covariate_table, timename){ - if (!("character" %in% class(timename))){ - stop("timename is not a character value") - } - if (length(timename) > 1){ - stop("timename can only be one value") - } - covariate_names <- colnames(document_covariate_table) - if ((timename %in% covariate_names) == FALSE){ - stop("timename not present in document covariate table") - } - time_covariate <- document_covariate_table[ , timename] - if (!(is.Date(time_covariate)) & - (!is.numeric(time_covariate) || !all(time_covariate %% 1 == 0))){ - stop("covariate indicated by timename is not an integer or a date") - } - return() -} - -#' @title Check that formulas vector is proper and append the response -#' variable -#' -#' @description Check that the vector of formulas is actually formatted -#' as a vector of \code{\link[stats]{formula}} objects and that the -#' predictor variables are all included in the document covariate table. -#' -#' @param formulas Vector of the formulas to evaluate. -#' -#' @param document_covariate_table Document covariate table used to evaluate -#' the availability of the data required by the formula inputs. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return An error message is thrown if \code{formulas} is -#' not proper, else \code{NULL}. 
-#' -#' @examples -#' data(rodents) -#' check_formulas(~ 1, rodents$document_covariate_table) -#' -#' @export -#' -check_formulas <- function(formulas, document_covariate_table, - control = list()){ - check_document_covariate_table(document_covariate_table) - check_control(control) - control <- do.call("TS_control", control) - # response <- control$response - dct <- document_covariate_table - if (!is(formulas, "list")) { - if (is(formulas, "formula")) { - formulas <- c(formulas) - } else{ - stop("formulas does not contain formula(s)") - } - } else if (!all(vapply(formulas, is, TRUE, "formula"))) { - stop("formulas does not contain all formula(s)") - } - resp <- unlist(lapply(lapply(formulas, terms), attr, "response")) - pred <- unlist(lapply(lapply(formulas, terms), attr, "term.labels")) - if (any(resp != 0)) { - stop("formula inputs should not include response variable") - } - if (!all(pred %in% colnames(dct))) { - misses <- pred[which(pred %in% colnames(dct) == FALSE)] - mis <- paste(misses, collapse = ", ") - stop(paste0("formulas include predictors not present in data: ", mis)) - } - return() -} - -#' @rdname TS_on_LDA -#' -#' @export -#' -check_TS_on_LDA_inputs <- function(LDA_models, document_covariate_table, - formulas = ~ 1, nchangepoints = 0, - timename = "time", weights = NULL, - control = list()){ - check_LDA_models(LDA_models) - check_document_covariate_table(document_covariate_table, LDA_models) - check_timename(document_covariate_table, timename) - check_formulas(formulas, document_covariate_table, control) - check_nchangepoints(nchangepoints) - check_weights(weights) - check_control(control) -} diff --git a/R/TS_plots.R b/R/TS_plots.R index b405fa48..ed356fad 100644 --- a/R/TS_plots.R +++ b/R/TS_plots.R @@ -1,15 +1,71 @@ -#' @title Plot an LDATS TS model -#' -#' @description Generalization of the \code{\link[graphics]{plot}} function to -#' work on fitted TS model objects (class \code{TS_fit}) returned from -#' \code{\link{TS}}. 
+#' @title Plot an LDATS Time Series model +#' +#' @description +#' \code{plot.TS} is a generalization of the \code{\link[graphics]{plot}} +#' function to work on fitted TS model objects (class \code{TS}) +#' returned from \code{\link{TS}}. \cr \cr +#' \code{plot.TS_set} plots a \code{TS_set} of \code{TS} models, either +#' just the \code{selected} models or all. \cr \cr +#' \code{TS_diagnostics_plot} makes the 4-panel figures (showing trace +#' plots, posterior ECDF, posterior density, and iteration +#' autocorrelation) for each of the parameters (change point locations +#' and regressors) fitted within a compositional time series model (fit +#' by \code{\link{TS}}). \cr \cr +#' \code{eta_diagnostics_plots} creates the diagnostic plots +#' for the regressors (etas) of a time series model. \cr \cr +#' \code{rho_diagnostics_plots} creates the diagnostic plots +#' for the change point locations (rho) of a time series model. \cr \cr +#' \code{trace_plot} produces a trace plot for the parameter of interest +#' (rho or eta) as part of \code{\link{TS_diagnostics_plot}}. A +#' horizontal line is added to show the median of the posterior. \cr \cr +#' \code{ecdf_plot} makes a vanilla ECDF (empirical cumulative distribution +#' function) plot using \code{\link[stats]{ecdf}} for the parameter of +#' interest (rho or eta) as part of \code{\link{TS_diagnostics_plot}}. +#' A horizontal line is added to show the median of the posterior. \cr \cr +#' \code{autocorr_plot} produces a vanilla ACF plot using +#' \code{\link[stats]{acf}} for the parameter of interest (rho or eta) +#' as part of \code{\link{TS_diagnostics_plot}}.\cr \cr +#' \code{posterior_plot} makes a vanilla histogram plot using +#' \code{\link[graphics]{hist}} for the parameter of interest (rho or eta) +#' as part of \code{\link{TS_diagnostics_plot}}. A vertical line is added +#' to show the median of the posterior. 
\cr \cr +#' \code{TS_summary_plot} produces a two-panel figure of [1] the change +#' point distributions as histograms over time and [2] the time series of +#' the fitted topic proportions over time, based on a selected set of +#' change point locations. \cr \cr +#' \code{pred_gamma_TS_plot} produces a time series of the +#' fitted topic proportions over time, based on a selected set of change +#' point locations. \cr \cr +#' \code{rho_hist}: make a plot of the change point distributions as +#' histograms over time. \cr \cr +#' \code{rho_lines} adds vertical lines to the plot of the time series of +#' fitted proportions associated with the change points of interest. +#' \cr \cr +#' \code{set_gamma_colors} creates the set of colors to be used in +#' the time series of the fitted gamma (topic proportion) values. \cr \cr +#' \code{set_rho_hist_colors} creates the set of colors to be used in +#' the change point histogram. \cr \cr +#' \code{set_TS_summary_plot_cols} acts as a default \code{list} +#' generator function that produces the options for the colors +#' controlling the panels of the TS summary plots, so needed +#' because the panels should be in different color schemes. See +#' \code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} +#' for specific details on usage. #' -#' @param x A \code{TS_fit} object of a multinomial time series model fit by -#' \code{\link{TS}}. +#' @param x In \code{plot.TS}, a \code{TS_fit} object of a multinomial time +#' series model fit by \code{\link{TS}}. In \code{plot.TS_set}, a +#' \code{TS_set} \code{list} of \code{TS} objects. #' #' @param ... Additional arguments to be passed to subfunctions. Not currently #' used, just retained for alignment with \code{\link[graphics]{plot}}. #' +#' @param selected \code{logical} indicator of if only the selected TSs +#' (the first element in \code{x}) should be plotted or if all the TSs +#' (the second element in \code{x}) should be plotted. 
+#' +#' @param spec_rhos \code{numeric} vector indicating the locations along the +#' x axis where the specific change points being used are located. +#' #' @param plot_type "diagnostic" or "summary". #' #' @param bin_width Width of the bins used in the histograms of the summary @@ -27,38 +83,71 @@ #' summary plot. Currently only defined for \code{"median"} and #' \code{"mode"}. #' -#' @param cols \code{list} of elements used to define the colors for the two -#' panels of the summary plot, as generated simply using -#' \code{\link{set_TS_summary_plot_cols}}. \code{cols} has two elements -#' \code{rho} and \code{gamma}, each corresponding to the related panel, -#' and each containing default values for entries named \code{cols}, -#' \code{option}, and \code{alpha}. See \code{\link{set_gamma_colors}} and -#' \code{\link{set_rho_hist_colors}} for details on usage. +#' @param together \code{logical} indicating if the subplots are part of a +#' larger plot output. +#' +#' @param cols,rho_cols,gamma_cols +#' In \code{plot.TS}, \code{cols} is a \code{list} of elements used to +#' define the colors for the two panels of the summary plot, as generated +#' simply using \code{\link{set_TS_summary_plot_cols}}. +#' \code{cols} has two elements \code{rho} and \code{gamma}, each +#' corresponding to the related panel, and each containing default values +#' for entries named \code{cols}, \code{option}, and \code{alpha}. \cr +#' For \code{rho_cols} and \code{gamma_cols} always and for \code{cols} in +#' \code{set_rho_hist_colors}, \code{set_gamma_colors}, +#' \code{rho_hist}, and \code{pred_gamma_TS_plot}, colors to be used in +#' the specific plot. Any valid color values (\emph{e.g.}, see +#' \code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be +#' input as with a standard plot. The default (\code{NULL}) triggers use +#' of \code{\link[viridis]{viridis}} color options (see +#' \code{option},\code{rho_option},\code{gamma_option}). 
#' #' @param LDATS \code{logical} indicating if the plot is part of a larger #' LDATS plot output. #' -#' @param interactive \code{logical} input, should be code{TRUE} unless +#' @param interactive \code{logical} input, should be \code{TRUE} unless #' testing. +#' +#' @param ylab \code{character} value used to label the y axis. +#' +#' @param draw \code{vector} of parameter values drawn from the posterior +#' distribution, indexed to the iteration by the order of the vector. +#' +#' @param xlab \code{character} value used to label the x axis. #' -#' @return \code{NULL}. +#' @param option,rho_option,gamma_option A \code{character} string indicating +#' the color option from \code{\link[viridis]{viridis}} to use if +#' "cols == NULL". Four options are available: "magma" (or "A"), +#' "inferno" (or "B"), "plasma" (or "C"), "viridis" (or "D", the default +#' option) and "cividis" (or "E"). +#' +#' @param alpha,rho_alpha,gamma_alpha Numeric value [0,1] that indicates the +#' transparency of the colors used. Supported only on some devices, see +#' \code{\link[grDevices]{rgb}}. #' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' plot(TSmod) -#' } +#' @return +#' \code{plot.TS},\code{plot.TS_set},\code{TS_diagnostics_plot}, +#' \code{eta_diagnostics_plots},\code{rho_diagnostics_plots}, +#' \code{trace_plot},\code{posterior_plot},\code{autocorr_plot}, +#' \code{ecdf_plot},\code{TS_summary_plot},\code{pred_gamma_TS_plot}, +#' \code{rho_hist},\code{rho_lines}:\code{NULL}. 
\cr \cr +#' \code{set_rho_hist_cols},\code{set_gamma_colors}: \code{vector} of +#' \code{character} hex codes indicating colors to use. +#' \code{set_TS_summary_plot_cols}: \code{list} of elements used to define +#' the colors for the two panels. Contains two elements \code{rho} and +#' \code{gamma}, each corresponding to the related panel, and each +#' containing default values for entries named \code{cols}, +#' \code{option}, and \code{alpha}. +#' +#' @name plot.TS +#' + + +#' @rdname plot.TS #' #' @export #' -plot.TS_fit <- function(x, ..., plot_type = "summary", interactive = FALSE, +plot.TS <- function(x, ..., plot_type = "summary", interactive = FALSE, cols = set_TS_summary_plot_cols(), bin_width = 1, xname = NULL, border = NA, selection = "median", LDATS = FALSE){ @@ -69,37 +158,25 @@ plot.TS_fit <- function(x, ..., plot_type = "summary", interactive = FALSE, } } -#' @title Plot the diagnostics of the parameters fit in a TS model -#' -#' @description Plot 4-panel figures (showing trace plots, posterior ECDF, -#' posterior density, and iteration autocorrelation) for each of the -#' parameters (change point locations and regressors) fitted within a -#' multinomial time series model (fit by \code{\link{TS}}). \cr \cr -#' \code{eta_diagnostics_plots} creates the diagnostic plots -#' for the regressors (etas) of a time series model. \cr \cr -#' \code{rho_diagnostics_plots} creates the diagnostic plots -#' for the change point locations (rho) of a time series model. +#' @rdname plot.TS #' -#' @param interactive \code{logical} input, should be code{TRUE} unless -#' testing. +#' @export #' -#' @param x Object of class \code{TS_fit}, generated by \code{\link{TS}} to -#' have its diagnostics plotted. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' TS_diagnostics_plot(TSmod) -#' } +plot.TS_set <- function(x, ..., selected = TRUE){ + if(selected){ + x <- x[[1]] + } else{ + x <- x[[2]] + } + on.exit(devAskNewPage(FALSE)) + if (length(x) > 1){ + devAskNewPage(TRUE) + } + y <- lapply(x, plot, ...) + y <- NULL +} + +#' @rdname plot.TS #' #' @export #' @@ -108,7 +185,7 @@ TS_diagnostics_plot <- function(x, interactive = TRUE){ eta_diagnostics_plots(x, interactive) } -#' @rdname TS_diagnostics_plot +#' @rdname plot.TS #' #' @export #' @@ -138,7 +215,7 @@ eta_diagnostics_plots <- function(x, interactive){ } } -#' @rdname TS_diagnostics_plot +#' @rdname plot.TS #' #' @export #' @@ -146,7 +223,7 @@ rho_diagnostics_plots <- function(x, interactive){ on.exit(devAskNewPage(FALSE)) oldpar <- par(no.readonly = TRUE) on.exit(par(oldpar)) - rhos <- x$rhos + rhos <- x$focal_rhos if (is.null(rhos)){ return() } @@ -162,155 +239,59 @@ rho_diagnostics_plots <- function(x, interactive){ } } -#' @title Produce the trace plot panel for the TS diagnostic plot of a -#' parameter -#' -#' @description Produce a trace plot for the parameter of interest (rho or -#' eta) as part of \code{\link{TS_diagnostics_plot}}. A horizontal line -#' is added to show the median of the posterior. -#' -#' @param x Vector of parameter values drawn from the posterior distribution, -#' indexed to the iteration by the order of the vector. -#' -#' @param ylab \code{character} value used to label the y axis. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' trace_plot(rnorm(100, 0, 1)) + + +#' @rdname plot.TS #' -#' @export +#' @export #' -trace_plot <- function(x, ylab = "parameter value"){ - plot(x, type = "l", lwd = 1, col = 0, +trace_plot <- function(draw, ylab = "parameter value"){ + plot(draw, type = "l", lwd = 1, col = 0, xlab = "Iteration", ylab = ylab, las = 1, bty = "L") - ext <- 0.01 * length(x) - points(c(-ext, length(x) + ext), rep(median(x), 2), type = "l", lwd = 2, - lty = 2) - points(seq_along(x), x, type = "l", col = rgb(0.4, 0.4, 0.4, alpha = 0.9)) + ext <- 0.01 * length(draw) + points(c(-ext, length(draw) + ext), rep(median(draw), 2), + type = "l", lwd = 2, lty = 2) + points(seq_along(draw), draw, type = "l", col = rgb(0.4, 0.4, 0.4, + alpha = 0.9)) } -#' @title Produce the posterior distribution ECDF panel for the TS -#' diagnostic plot of a parameter -#' -#' @description Produce a vanilla ECDF (empirical cumulative distribution -#' function) plot using \code{ecdf} for the parameter of interest (rho or -#' eta) as part of \code{\link{TS_diagnostics_plot}}. A horizontal line -#' is added to show the median of the posterior. -#' -#' @param x Vector of parameter values drawn from the posterior distribution, -#' indexed to the iteration by the order of the vector. -#' -#' @param xlab \code{character} value used to label the x axis. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' ecdf_plot(rnorm(100, 0, 1)) + +#' @rdname plot.TS #' -#' @export +#' @export #' -ecdf_plot <- function(x, xlab = "parameter value"){ - ECDF <- ecdf(x) +ecdf_plot <- function(draw, xlab = "parameter value"){ + ECDF <- ecdf(draw) plot(ECDF, main = "", xlab = xlab, ylab = "%", las = 1, bty = "L") abline(a = 0.5, b = 0, lwd = 2, lty = 2) } -#' @title Produce the posterior distribution histogram panel for the TS -#' diagnostic plot of a parameter -#' -#' @description Produce a vanilla histogram plot using \code{hist} for the -#' parameter of interest (rho or eta) as part of -#' \code{\link{TS_diagnostics_plot}}. A vertical line is added to show the -#' median of the posterior. -#' -#' @param x Vector of parameter values drawn from the posterior distribution, -#' indexed to the iteration by the order of the vector. -#' -#' @param xlab \code{character} value used to label the x axis. -#' -#' @return \code{NULL}. -#' -#' @examples -#' posterior_plot(rnorm(100, 0, 1)) + + +#' @rdname plot.TS #' -#' @export +#' @export #' -posterior_plot <- function(x, xlab = "parameter value"){ - hist(x, las = 1, main = "", xlab = xlab) - points(rep(median(x), 2), c(0, 1e5), type = "l", lwd = 2, lty = 2) +posterior_plot <- function(draw, xlab = "parameter value"){ + hist(draw, las = 1, main = "", xlab = xlab) + points(rep(median(draw), 2), c(0, 1e5), type = "l", lwd = 2, lty = 2) } -#' @title Produce the autocorrelation panel for the TS diagnostic plot of -#' a parameter -#' -#' @description Produce a vanilla ACF plot using \code{\link[stats]{acf}} for -#' the parameter of interest (rho or eta) as part of -#' \code{\link{TS_diagnostics_plot}}. -#' -#' @param x Vector of parameter values drawn from the posterior distribution, -#' indexed to the iteration by the order of the vector. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' autocorr_plot(rnorm(100, 0, 1)) + + +#' @rdname plot.TS #' -#' @export +#' @export #' -autocorr_plot <- function(x){ - acf(x, las = 1, ylab = "Autocorrelation") +autocorr_plot <- function(draw){ + acf(draw, las = 1, ylab = "Autocorrelation") } -#' @title Create the list of colors for the TS summary plot -#' -#' @description A default list generator function that produces the options -#' for the colors controlling the panels of the TS summary plots, so needed -#' because the panels should be in different color schemes. See -#' \code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} for -#' specific details on usage. -#' -#' @param rho_cols Colors to be used to plot the histograms of change points. -#' Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -#' \code{\link[grDevices]{rgb}}) can be input as with a standard plot. -#' The default (\code{rho_cols = NULL}) triggers use of -#' \code{\link[viridis]{viridis}} color options (see \code{rho_option}). -#' -#' @param rho_option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if `rho_cols == NULL`. Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C"), "viridis" (or "D", the default option) and "cividis" (or "E"). -#' -#' @param rho_alpha Numeric value [0,1] that indicates the transparency of the -#' colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @param gamma_cols Colors to be used to plot the LDA topic proportions, -#' time series of observed topic proportions, and time series of fitted -#' topic proportions. Any valid color values (\emph{e.g.}, see -#' \code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be -#' input as with a standard plot. The default (\code{gamma_cols = NULL}) -#' triggers use of \code{\link[viridis]{viridis}} color options (see -#' \code{gamma_option}). 
-#' -#' @param gamma_option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if gamma_cols == NULL`. Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C"), "viridis" (or "D", the default option) and "cividis" (or "E"). -#' -#' @param gamma_alpha Numeric value [0,1] that indicates the transparency of -#' the colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @return \code{list} of elements used to define the colors for the two -#' panels. Contains two elements \code{rho} and \code{gamma}, each -#' corresponding to the related panel, and each containing default values -#' for entries named \code{cols}, \code{option}, and \code{alpha}. -#' -#' @examples -#' set_TS_summary_plot_cols() + + +#' @rdname plot.TS #' -#' @export +#' @export #' set_TS_summary_plot_cols <- function(rho_cols = NULL, rho_option = "D", rho_alpha = 0.4, gamma_cols = NULL, @@ -322,80 +303,30 @@ set_TS_summary_plot_cols <- function(rho_cols = NULL, rho_option = "D", ) } -#' @title Create the summary plot for a TS fit to an LDA model -#' -#' @description Produces a two-panel figure of [1] the change point -#' distributions as histograms over time and [2] the time series of the -#' fitted topic proportions over time, based on a selected set of -#' change point locations. \cr \cr -#' \code{pred_gamma_TS_plot} produces a time series of the -#' fitted topic proportions over time, based on a selected set of change -#' point locations. \cr \cr -#' \code{rho_hist}: make a plot of the change point -#' distributions as histograms over time. -#' -#' @param x Object of class \code{TS_fit} produced by \code{\link{TS}}. -#' -#' @param cols \code{list} of elements used to define the colors for the two -#' panels, as generated simply using \code{\link{set_TS_summary_plot_cols}}. 
-#' Has two elements \code{rho} and \code{gamma}, each corresponding to the -#' related panel, and each containing default values for entries named -#' \code{cols}, \code{option}, and \code{alpha}. See -#' \code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} for -#' details on usage. -#' -#' @param bin_width Width of the bins used in the histograms, in units of the -#' x-axis (the time variable used to fit the model). -#' -#' @param xname Label for the x-axis in the summary time series plot. Defaults -#' to \code{NULL}, which results in usage of the \code{timename} element -#' of the control list (held in\code{control$TS_control$timename}). To have -#' no label printed, set \code{xname = ""}. -#' -#' @param border Border for the histogram, default is \code{NA}. -#' -#' @param selection Indicator of the change points to use. Currently only -#' defined for "median" and "mode". -#' -#' @param together \code{logical} indicating if the subplots are part of a -#' larger LDA plot output. + + +#' @rdname plot.TS #' -#' @param LDATS \code{logical} indicating if the plot is part of a larger -#' LDATS plot output. -#' -#' @return \code{NULL}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' TS_summary_plot(TSmod) -#' pred_gamma_TS_plot(TSmod) -#' rho_hist(TSmod) -#' } -#' -#' @export +#' @export #' TS_summary_plot <- function(x, cols = set_TS_summary_plot_cols(), bin_width = 1, xname = NULL, border = NA, selection = "median", LDATS = FALSE){ - rc <- cols$rho - rho_cols <- set_rho_hist_colors(x$rhos, rc$cols, rc$option, rc$alpha) - rho_hist(x, rho_cols, bin_width, xname, border, TRUE, LDATS) - gc <- cols$gamma - gamma_cols <- set_gamma_colors(x, gc$cols, gc$option, gc$alpha) - pred_gamma_TS_plot(x, selection, gamma_cols, xname, TRUE, LDATS) + rho_cols <- set_rho_hist_colors(x = x, cols = cols$rho$cols, + option = cols$rho$option, + alpha = cols$rho$alpha) + rho_hist(x = x, cols = rho_cols, bin_width = bin_width, xname = xname, + border = border, together = TRUE, LDATS = LDATS) + gamma_cols <- set_gamma_colors(x = x, cols = cols$gamma$cols, + option = cols$gamma$option, + alpha = cols$gamma$alpha) + pred_gamma_TS_plot(x = x, selection = selection, cols = gamma_cols, + xname = xname, together = TRUE, LDATS = LDATS) } -#' @rdname TS_summary_plot +#' @rdname plot.TS #' -#' @export +#' @export #' pred_gamma_TS_plot <- function(x, selection = "median", cols = set_gamma_colors(x), @@ -409,7 +340,7 @@ pred_gamma_TS_plot <- function(x, selection = "median", } else{ par(fig = c(0, 1, 0, 1)) } - rhos <- x$rhos + rhos <- x$focal_rhos nrhos <- ncol(rhos) if (!is.null(nrhos)){ if (selection == "median"){ @@ -423,11 +354,17 @@ pred_gamma_TS_plot <- function(x, selection = "median", spec_rhos <- NULL } x$control$timename <- NULL # to remove 
from v0.1.0 model fits - seg_mods <- multinom_TS(x$data, x$formula, spec_rhos, - x$timename, x$weights, x$control) - nsegs <- length(seg_mods[[1]]) - t1 <- min(x$data[, x$timename]) - t2 <- max(x$data[, x$timename]) + + fun <- x$control$response + args <- list(data = x$data$train$ts_data, formula = x$formula, + changepoints = spec_rhos, timename = x$timename, + weights = x$weights, + control = x$control$response_args$control) + mod <- soft_call(what = fun, args = args, soften = TRUE) + + nsegs <- length(mod[[1]]) + t1 <- min(x$data$train$ts_data[, x$timename]) + t2 <- max(x$data$train$ts_data[, x$timename]) if (is.null(xname)){ xname <- x$timename @@ -440,14 +377,14 @@ pred_gamma_TS_plot <- function(x, selection = "median", axis(1) mtext(side = 2, line = 3.5, cex = 1.25, "Proportion") mtext(side = 1, line = 2.5, cex = 1.25, xname) - ntopics <- ncol(as.matrix(x$data[[x$control$response]])) + ntopics <- ncol(as.matrix(x$data$train$ts_data$gamma)) seg1 <- c(0, spec_rhos[-length(rhos)]) seg2 <- c(spec_rhos, t2) - time_obs <- rep(NA, nrow(x$data)) - pred_vals <- matrix(NA, nrow(x$data), ntopics) + time_obs <- rep(NA, nrow(x$data$train$ts_data)) + pred_vals <- matrix(NA, nrow(x$data$train$ts_data), ntopics) sp1 <- 1 for (i in 1:nsegs){ - mod_i <- seg_mods[[1]][[i]] + mod_i <- mod[[1]][[i]] spec_vals <- sp1:(sp1 + nrow(mod_i$fitted.values) - 1) pred_vals[spec_vals, ] <- mod_i$fitted.values time_obs[spec_vals] <- mod_i$timevals @@ -461,29 +398,9 @@ pred_gamma_TS_plot <- function(x, selection = "median", } } -#' @title Add change point location lines to the time series plot +#' @rdname plot.TS #' -#' @description Adds vertical lines to the plot of the time series of fitted -#' proportions associated with the change points of interest. -#' -#' @param spec_rhos \code{numeric} vector indicating the locations along the -#' x axis where the specific change points being used are located. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' pred_gamma_TS_plot(TSmod) -#' rho_lines(200) -#' } -#' -#' @export +#' @export #' rho_lines <- function(spec_rhos) { if(is.null(spec_rhos)) { @@ -495,9 +412,10 @@ rho_lines <- function(spec_rhos) { } } -#' @rdname TS_summary_plot + +#' @rdname plot.TS #' -#' @export +#' @export #' rho_hist <- function(x, cols = set_rho_hist_colors(x$rhos), bin_width = 1, xname = NULL, border = NA, together = FALSE, @@ -511,10 +429,10 @@ rho_hist <- function(x, cols = set_rho_hist_colors(x$rhos), bin_width = 1, } else{ par(fig = c(0, 1, 0, 1), mar = c(4, 5, 1, 1)) } - rhos <- x$rhos + rhos <- x$focal_rhos nrhos <- ncol(rhos) niter <- nrow(rhos) - timeobs <- x$data[, x$timename] + timeobs <- x$data$train$ts_data[, x$timename] timerange <- range(timeobs) timevals <- seq(timerange[1], timerange[2], 1) ntimes <- length(timevals) @@ -557,44 +475,9 @@ rho_hist <- function(x, cols = set_rho_hist_colors(x$rhos), bin_width = 1, } } -#' @title Prepare the colors to be used in the change point histogram -#' -#' @description Based on the inputs, create the set of colors to be used in -#' the change point histogram. -#' -#' @param x \code{matrix} of change point locations (element \code{rhos}) -#' from an object of class \code{TS_fit}, fit by \code{\link{TS}}. -#' -#' @param cols Colors to be used to plot the histograms of change points. -#' Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -#' \code{\link[grDevices]{rgb}}) can be input as with a standard plot. 
-#' The default (\code{rho_cols = NULL}) triggers use of -#' \code{\link[viridis]{viridis}} color options (see \code{rho_option}). -#' -#' @param option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if "cols == NULL". Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C"), "viridis" (or "D", the default option) and "cividis" (or "E"). -#' -#' @param alpha Numeric value [0,1] that indicates the transparency of the -#' colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @return Vector of \code{character} hex codes indicating colors to use. -#' -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' set_rho_hist_colors(TSmod$rhos) -#' } + + +#' @rdname plot.TS #' #' @export #' @@ -604,7 +487,7 @@ set_rho_hist_colors <- function(x = NULL, cols = NULL, option = "D", return(NULL) } - nrhos <- ncol(x) + nrhos <- ncol(x$focal_rhos) if (length(cols) == 0){ cols <- viridis(nrhos, option = option, alpha = alpha, end = 0.9) } @@ -630,39 +513,10 @@ set_rho_hist_colors <- function(x = NULL, cols = NULL, option = "D", cols } -#' @title Prepare the colors to be used in the gamma time series -#' -#' @description Based on the inputs, create the set of colors to be used in -#' the time series of the fitted gamma (topic proportion) values. -#' -#' @param x Object of class \code{TS_fit}, fit by \code{\link{TS}}. -#' -#' @param cols Colors to be used to plot the time series of fitted topic -#' proportions. 
-#' -#' @param option A \code{character} string indicating the color option -#' from \code{\link[viridis]{viridis}} to use if "cols == NULL". Four -#' options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -#' (or "C"), "viridis" (or "D", the default option) and "cividis" (or "E"). -#' -#' @param alpha Numeric value [0,1] that indicates the transparency of the -#' colors used. Supported only on some devices, see -#' \code{\link[grDevices]{rgb}}. -#' -#' @return Vector of \code{character} hex codes indicating colors to use. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) -#' set_gamma_colors(TSmod) -#' } + + + +#' @rdname plot.TS #' #' @export #' @@ -671,7 +525,7 @@ set_gamma_colors <- function(x, cols = NULL, option = "D", alpha = 1){ return(NULL) } - ntopics <- ncol(as.matrix(x$data[x$control$response])) + ntopics <- ncol(as.matrix(x$data$train$ts_data$gamma)) if (length(cols) == 0){ cols <- viridis(ntopics, option = option, alpha = alpha, end = 0.9) } diff --git a/R/TS_responses.R b/R/TS_responses.R new file mode 100644 index 00000000..6fbfe27c --- /dev/null +++ b/R/TS_responses.R @@ -0,0 +1,413 @@ +#' @title Fit a simplex-based change point Time Series model +#' +#' @description +#' \code{simplex_TS} fits a set of simplex regression models +#' (Aitchison 1986, Aitchison \emph{et al.} 2002) to a time +#' series of compositional data divided into multiple segments (a.k.a. +#' chunks) based on given locations for a set of change points, using +#' e.g., the isometric log ratio (ILR) transformation +#' (Egozcue \emph{et al.} 2003, Pawlowsky-Glahn 2003. 
\cr \cr +#' \code{simplex_TS_chunk} fits a simplex regression model using, e.g., the +#' ILR transformation to a defined chunk of time (a.k.a. segment) +#' \code{[chunk$start, chunk$end]} within a time series. \cr \cr +#' \code{simplex_TS_control} defines and creates the control \code{list} for +#' fitting. +#' +#' @param data \code{data.frame} including [1] the time variable (indicated +#' in \code{timename}), [2] the predictor variables (required by +#' \code{formula}) and [3], the compositional response variable (indicated +#' in \code{formula}). \cr \cr +#' Note that the response variables should be formatted as a +#' \code{data.frame} object named as indicated by the +#' \code{response} entry in the \code{control} list, such as \code{gamma} +#' for a standard TS analysis on LDA output. \cr \cr +#' See \code{Examples}. +#' +#' @param formula \code{\link[stats]{formula}} defining the regression +#' relationship between the change points. Any +#' predictor variable included must also be a column in +#' \code{data} and any (compositional) response variable must be a set of +#' columns in \code{data}. +#' +#' @param changepoints Numeric vector indicating locations of the change +#' points. Must be conformable to \code{integer} values. +#' +#' @param chunk Length-2 vector of times: [1] \code{start}, the start time +#' for the chunk and [2] \code{end}, the end time for the chunk. +#' +#' @param timename \code{character} element indicating the time variable +#' used in the time series. Defaults to \code{"time"}. The variable must be +#' integer-conformable or a \code{Date}. If the variable named +#' is a \code{Date}, the input is converted to an integer, resulting in the +#' timestep being 1 day, which is often not desired behavior. +#' +#' @param weights Optional class \code{numeric} vector of weights for each +#' document. Defaults to \code{NULL}, translating to an equal weight for +#' each document. 
When using \code{simplex_TS} in a standard LDATS +#' analysis, it is advisable to weight the documents by their total size, +#' as the result of \code{\link{topicmodels_LDA}} is a matrix of +#' proportions, which does not account for size differences among documents. +#' For most models, a scaling of the weights (so that the average is 1) is +#' most appropriate, and this is accomplished using +#' \code{\link{document_weights}}. +#' +#' @param control A \code{list} of parameters to control the fitting of the +#' Time Series model. Values not input assume defaults set by +#' \code{\link{TS_control}}. +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). +#' +#' @param transformation Ratio \code{function} to use for the transformation +#' to the simplex geometry. Options include \code{\link[compositions]{alr}}, +#' \code{\link[compositions]{clr}}, and \code{\link[compositions]{ilr}}. +#' +#' @param ... Not passed along to the output, rather included to allow for +#' automated removal of unneeded controls. +#' +#' @return +#' \code{simplex_TS}: \code{TS_fit} \code{list} of [1] chunk-level model +#' fits (\code{"chunk models"}), [2] the total log likelihood across +#' all chunks (\code{"logLik"}), and [3] a \code{data.frame} of chunk +#' beginning and ending times (with columns \code{"start"} and +#' \code{"end"}). \cr \cr +#' \code{simplex_TS_chunk}: fitted model object for the chunk, +#' of class \code{lm}. \cr \cr +#' \code{simplex_TS_control}: \code{list}, with named elements +#' corresponding to response function controls. +#' +#' @references +#' Aitchison, J. 1986. \emph{The Statistical Analysis of Compositional +#' Data}. Monographs on Statistics and Applied Probability. Chapman & Hall +#' Ltd., London, UK. +#' +#' Aitchison, J, C. Barcelo-Vidal, J.J. Egozcue, and V. Pawlowsky-Glahn. +#' 2002. 
A concise guide to the algebraic geometric structure of the +#' simplex, the sample space for compositional data analysis, Terra Nostra, +#' Schriften der Alfred Wegener-Stiftung, 03/2003. +#' +#' Egozcue J.J., V. Pawlowsky-Glahn, G. Mateu-Figueras and C. Barcelo-Vidal. +#' 2003. Isometric logratio transformations for compositional data analysis. +#' \emph{Mathematical Geology}, \strong{35}:279-300. +#' +#' Pawlowsky-Glahn, V. 2003. Statistical modelling on coordinates. In: +#' Thio-Henestrosa, S. and J. A. Martin-Fernandez, Eds. +#' \emph{Proceedings of the 1st International Workshop on Compositional Data +#' Analysis}, Universitat de Girona, ISBN 84-8458-111-X. +#' \href{http://ima.udg.es/Activitats/CoDaWork03}{link}. +#' +#' @name simplex_TS +#' + + +#' @rdname simplex_TS +#' +#' @export +#' +simplex_TS <- function(data, formula, changepoints = NULL, + timename = "time", weights = NULL, + control = list()){ + control <- do.call("simplex_TS_control", control) + if (!verify_changepoint_locations(data, changepoints, timename)){ + out <- list("chunk models" = NA, "logLik" = -Inf, "chunks" = NA) + class(out) <- c("TS_fit", "list") + return(out) + } + + chunks <- prep_chunks(data, changepoints, timename) + nchunks <- nrow(chunks) + fits <- vector("list", length = nchunks) + for (i in 1:nchunks){ + fits[[i]] <- simplex_TS_chunk(data = data, formula = formula, + chunk = chunks[i, ], timename = timename, + weights = weights, control = control) + } + package_chunk_fits(chunks, fits) +} + +#' @rdname simplex_TS +#' +#' @export +#' +simplex_TS_chunk <- function(data, formula, chunk, timename = "time", + weights = NULL, control = list()){ + formula <- as.formula(format(formula)) + time_obs <- data[ , timename] + chunk_start <- as.numeric(chunk["start"]) + chunk_end <- as.numeric(chunk["end"]) + in_chunk <- time_obs >= chunk_start & time_obs <= chunk_end + simplex_data <- data[ , !grepl("gamma", colnames(data)), drop = FALSE] + simplex_data$gamma <- 
do.call(control$transformation, +                               list(x = acomp(data$gamma))) + + +  fit <- lm(formula, simplex_data, weights, subset = in_chunk) +  fit$timevals <- time_obs[which(in_chunk)] +  fit +} + + +#' @rdname simplex_TS +#' +#' @export +#' +simplex_TS_control <- function(transformation = ilr, quiet = FALSE, ...){ +  list(transformation = transformation, quiet = quiet) +} + + +#' Log-likelihood of multivariate linear regression model +#' +#' @method logLik mlm +#' @param object multivariate linear regression model fit with +#' \code{\link{lm}}. +#' of class \code{mlm} +#' @param ... not used. +#' @return log-lik at (unrestricted) maximum with df as attribute. +#' @author Andi Boeck +#' @import mvtnorm +#' @export +#' @examples y <- cbind(rnorm(10), rnorm(10)); x <- 1:10; +#' mod <- lm(y~x) +#' logLik(mod) + + +#' @title Determine the log likelihood of a multivariate linear regression +#' model +#' +#' @description Convenience function to extract and format the log likelihood +#' of a multivariate linear regression, such as fitted by the +#' \code{\link{simplex_TS}} models. +#' +#' @details Adapted from the function contained in the old R-Forge Atools +#' package (\href{https://rdrr.io/rforge/Atools/src/R/logLik.mlm.R}{see}). +#' +#' @param object multivariate linear regression model fit using +#' \code{\link[stats]{lm}} and of class \code{mlm} +#' +#' @param ... Not used, simply included to maintain method compatibility. +#' +#' @return Log likelihood of the model \code{logLik}, also with \code{df} +#' (degrees of freedom) and \code{nobs} (number of observations) values. 
+#' +#' @export +#' +logLik.mlm <- function(object, ...){ +  resids <- residuals(object) +  n <- nrow(resids) +  Sigma_ML <- crossprod(resids) / n +  ans <- sum(dmvnorm(resids, sigma = Sigma_ML, log = TRUE)) + +  df <- length(coef(object)) + nrow(Sigma_ML) * (nrow(Sigma_ML) + 1) / 2 +  attr(ans, "nobs") <- n +  attr(ans, "df") <- df +  class(ans) <- "logLik" +  ans +} + +#' @title Fit a multinomial change point Time Series model +#' +#' @description +#' \code{multinom_TS} fits a set of multinomial regression models (via +#' \code{\link[nnet]{multinom}}, Venables and Ripley 2002) to a time +#' series of data divided into multiple segments (a.k.a. chunks) based on +#' given locations for a set of change points. \cr \cr +#' \code{multinom_TS_chunk} fits a multinomial regression model (via +#' \code{\link[nnet]{multinom}}, Ripley 1996, Venables and Ripley 2002) +#' to a defined chunk of time (a.k.a. segment) +#' \code{[chunk$start, chunk$end]} within a time series. \cr \cr +#' \code{multinom_TS_control} defines and creates the control \code{list} +#' for fitting. +#' +#' @param data \code{data.frame} including [1] the time variable (indicated +#' in \code{timename}), [2] the predictor variables (required by +#' \code{formula}) and [3], the multinomial response variable (indicated in +#' \code{formula}). \cr \cr +#' Note that the response variables should be formatted as a +#' \code{data.frame} object named as indicated by the +#' \code{response} entry in the \code{control} list, such as \code{gamma} +#' for a standard TS analysis on LDA output. \cr \cr +#' See \code{Examples}. +#' +#' @param formula \code{\link[stats]{formula}} defining the regression +#' relationship between the change points. Any +#' predictor variable included must also be a column in +#' \code{data} and any (multinomial) response variable must be a set of +#' columns in \code{data}. +#' +#' @param changepoints Numeric vector indicating locations of the change +#' points. 
Must be conformable to \code{integer} values. +#' +#' @param chunk Length-2 vector of times: [1] \code{start}, the start time +#' for the chunk and [2] \code{end}, the end time for the chunk. +#' +#' @param timename \code{character} element indicating the time variable +#' used in the time series. Defaults to \code{"time"}. The variable must be +#' integer-conformable or a \code{Date}. If the variable named +#' is a \code{Date}, the input is converted to an integer, resulting in the +#' timestep being 1 day, which is often not desired behavior. +#' +#' @param weights Optional class \code{numeric} vector of weights for each +#' document. Defaults to \code{NULL}, translating to an equal weight for +#' each document. When using \code{multinom_TS} in a standard LDATS +#' analysis, it is advisable to weight the documents by their total size, +#' as the result of \code{\link{topicmodels_LDA}} is a matrix of +#' proportions, which does not account for size differences among documents. +#' For most models, a scaling of the weights (so that the average is 1) is +#' most appropriate, and this is accomplished using +#' \code{\link{document_weights}}. +#' +#' @param control A \code{list} of parameters to control the fitting of the +#' Time Series model including the parallel tempering Markov Chain +#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by +#' \code{\link{TS_control}}. +#' +#' @param lambda \code{numeric} "weight" decay term used to set the prior +#' on the regressors within each chunk-level model. Defaults to 0, +#' corresponding to a fully vague prior. +#' +#' @param quiet \code{logical} indicator of whether the model should run +#' quietly (if \code{FALSE}, a progress bar and notifications are printed). +#' +#' @param ... Not passed along to the output, rather included to allow for +#' automated removal of unneeded controls. 
+#' +#' @return +#' \code{multinom_TS}: Object of class \code{TS_fit}, which is a +#' \code{list} of [1] chunk-level model fits (\code{"chunk models"}), +#' [2] the total log likelihood combined across all chunks +#' (\code{"logLik"}), and [3] a \code{data.frame} of chunk beginning and +#' ending times (with columns \code{"start"} and \code{"end"}). \cr \cr +#' \code{multinom_TS_chunk}: fitted model object for the chunk, +#' of classes \code{multinom} and \code{nnet}. \cr \cr +#' \code{multinom_TS_control}: \code{list}, with named elements +#' corresponding to response function controls. +#' +#' @references +#' Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. +#' Cambridge University Press, Cambridge, UK. +#' +#' Venables, W. N. and B. D. Ripley. 2002. \emph{Modern and Applied +#' Statistics with S}. Fourth Edition. Springer, New York, NY, USA. +#' +#' @name multinom_TS +#' + + +#' @rdname multinom_TS +#' +#' @export +#' +multinom_TS <- function(data, formula, changepoints = NULL, + timename = "time", weights = NULL, + control = list()){ + control <- do.call("multinom_TS_control", control) + if (!verify_changepoint_locations(data, changepoints, timename)){ + out <- list("chunk models" = NA, "logLik" = -Inf, "chunks" = NA) + class(out) <- c("TS_fit", "list") + return(out) + } + + chunks <- prep_chunks(data, changepoints, timename) + nchunks <- nrow(chunks) + fits <- vector("list", length = nchunks) + for (i in 1:nchunks){ + fits[[i]] <- multinom_TS_chunk(data = data, formula = formula, + chunk = chunks[i, ], timename = timename, + weights = weights, control = control) + } + package_chunk_fits(chunks, fits) +} + +#' @rdname multinom_TS +#' +#' @export +#' +multinom_TS_chunk <- function(data, formula, chunk, timename = "time", + weights = NULL, control = list()){ + formula <- as.formula(format(formula)) + time_obs <- data[ , timename] + chunk_start <- as.numeric(chunk["start"]) + chunk_end <- as.numeric(chunk["end"]) + in_chunk <- time_obs >= 
chunk_start & time_obs <= chunk_end + fit <- multinom(formula, data, weights, subset = in_chunk, trace = FALSE, + decay = control$lambda) + fit$timevals <- time_obs[which(in_chunk)] + fit +} + +#' @rdname multinom_TS +#' +#' @export +#' +multinom_TS_control <- function(lambda = 0, quiet = FALSE, ...){ + list(lambda = lambda, quiet = quiet) +} + + + +#' @title Package the output of the chunk-level TS models into a TS_fit list +#' +#' @description Takes the list of fitted chunk-level models returned from +#' a \code{_TS_chunk} function and packages it as a +#' \code{TS_fit} object. This involves naming the model fits based +#' on the chunk time windows, combining the log likelihood values across the +#' chunks, and setting the class of the output object. +#' +#' @param chunks Data frame of \code{start} and \code{end} times for each +#' chunk (row). +#' +#' @param fits List of chunk-level fits returned by \code{TS_chunk_memo}, +#' the memoised version of \code{\link{multinom_TS_chunk}}. +#' +#' @return Object of class \code{TS_fit}, which is a list of [1] +#' chunk-level model fits, [2] the total log likelihood combined across +#' all chunks, and [3] the chunk time data table. +#' +#' @export +#' +#' +package_chunk_fits <- function(chunks, fits){ + nchunks <- nrow(chunks) + chunk_times <- paste0("(", chunks[ , "start"], " - ", chunks[ , "end"], ")") + names(fits) <- paste("chunk", 1:nchunks, chunk_times, "model") + ll <- (vapply(fits, logLik, 0)) + ll[ll == Inf] <- -Inf + ll <- sum(ll) + out <- list("chunk models" = fits, "logLik" = ll, "chunks" = chunks) + class(out) <- c("TS_fit", "list") + out +} + +#' @title Log likelihood of a TS model (as a TS_fit-class list) +#' +#' @description Convenience function to simply extract the \code{logLik} +#' element (and \code{df} and \code{nobs}) from a \code{TS_fit} +#' object fit by a \code{_TS} function. +#' +#' @param object A \code{TS_fit}-class object. +#' +#' @param ... 
Not used, simply included to maintain method compatibility. +#' +#' @return Log likelihood of the model, as class \code{logLik}, with +#' attributes \code{df} (degrees of freedom) and \code{nobs} (the number of +#' weighted observations, accounting for size differences among documents). +#' +#' @export +#' +logLik.TS_fit <- function(object, ...){ + ll <- object$logLik + df <- NA + nobs <- NA + if (object$logLik != -Inf){ + nchunks <- nrow(object$chunks) + dfperchunk <- length(coef(object$"chunk models"[[1]])) + df <- nchunks - 1 + dfperchunk * nchunks + nobs <- 0 + for(i in 1:nchunks){ + nobs <- nobs + sum(object$"chunk models"[[i]]$weights) + } + } + structure(ll, df = df, nobs = nobs, class = "logLik") +} + diff --git a/R/argument_checking.R b/R/argument_checking.R new file mode 100644 index 00000000..ddb445cd --- /dev/null +++ b/R/argument_checking.R @@ -0,0 +1,339 @@ +#' @title Check that arguments are properly formatted for usage +#' +#' @description Verify the class, structure, and values of inputted arguments +#' to ensure proper LDATS modeling. \cr \cr +#' \code{check_class} is a general class-verifier. \cr \cr +#' \code{check_nonneg_integer} is a specified checking function for +#' \code{numeric} values that must be integer-conformable and non-negative +#' (0 is acceptable). \cr \cr +#' \code{check_nonneg_integer_matrix} is a specified checking function for +#' tables of \code{numeric} values that must be integer-conformable and +#' non-negative (0 is acceptable) and that must be conformable to +#' a \code{matrix}. \cr \cr +#' \code{check_pos_integer} is a specified checking function for +#' \code{numeric} values that must be integer-conformable and positive +#' (0 is not acceptable). \cr \cr +#' \code{check_control} verifies that a control \code{list} is a \code{list} +#' \cr \cr +#' \code{check_topics} ensures that the vector of numbers of topics is +#' positive integer-conformable. 
\cr \cr +#' \code{check_replicates} ensures that the number of replicates is +#' positive integer-conformable. \cr \cr +#' \code{check_nchangepoints} ensures that the number of change points is +#' non-negative integer-conformable. \cr \cr +#' \code{check_document_term_table} ensures that the table of document +#' term counts is conformable to a \code{matrix} of non-negative integers. +#' \cr \cr +#' \code{check_LDAs} verifies that the argument is either a +#' \code{LDA} or \code{LDA_set} \code{list}. \cr \cr +#' \code{check_document_covariate_table} checks that the table of +#' document-level covariates in the \code{LDAs} data is +#' conformable to a data frame and of the right size (correct number of +#' documents) for the document-topic output from the LDA models. \cr \cr +#' \code{check_weights} ensures that the vector of document weights is +#' \code{numeric} and positive and informs the user if the average weight +#' isn't 1. \cr \cr +#' \code{check_timename} checks that the vector of time values is included +#' in the \code{document_covariate_table} and that it is either +#' \code{integer}-conformable or a \code{Date}. +#' If it is a \code{Date}, the input is converted to an +#' \code{integer}, resulting in the timestep being 1 day, which is often +#' not desired behavior. \cr \cr +#' \code{check_formulas} verifies that the input contains only +#' \code{\link[stats]{formula}}s and that the response and predictor +#' variables are all included in \code{LDAs} data sets. \cr \cr +#' \code{check_nchangepoints} checks that the \code{vector} of numbers of +#' changepoints is conformable to non-negative \code{integers}. \cr \cr +#' +#' @param object An object whose class should be checked against +#' \code{eclass}. +#' +#' @param eclass Expected class of \code{object} to be checked. If more +#' than one option is included, any are sufficient (\code{object} only +#' needs to be one \code{eclass}, not all). +#' +#' @param control Control \code{list} to evaluate. 
+#' +#' @param topics \code{vector} of the number of topics to evaluate for each +#' model. Must be conformable to positive \code{integer} values. +#' +#' @param replicates \code{integer} number of replicate starts to use for +#' each value of \code{topics} in the LDAs. Must be conformable to +#' positive \code{integer} values. +#' +#' @param document_term_table Table of observation count data (rows: +#' documents, columns: terms). May be a \code{matrix} or +#' \code{data.frame} but must be conformable to a matrix of non-negative +#' \code{integers}. +#' +#' @param nchangepoints \code{integer}-conformable \code{vector} of the +#' number of changepoints to evaluate (must be non-negative). +#' +#' @param weights \code{numeric} \code{vector} of the document weights to +#' evaluate, or \code{TRUE} for triggering internal weighting by document +#' sizes. +#' +#' @param LDAs \code{LDA_models} \code{list} of LDA models or singular LDA +#' model (\code{LDA}) to evaluate. +#' +#' @param timename Column name for the time variable to evaluate in the +#' \code{document_covariate_table} if provided. +#' +#' @param formulas \code{vector} of the \code{\link[stats]{formula}}s +#' to evaluate. +#' +#' @return an error message is thrown if the input is improper, otherwise +#' \code{NULL}. 
+#' +#' +#' @name argument_checking +#' + + + +#' @rdname argument_checking +#' +#' @export +#' +check_class <- function(object, eclass = "list"){ + if(any(eclass == "nonneg_integer")){ + check_nonneg_integer(object) + return(invisible()) + } + if(any(eclass == "nonneg_integer_matrix")){ + check_nonneg_integer_matrix(object) + return(invisible()) + } + if(any(eclass == "pos_integer")){ + check_pos_integer(object) + return(invisible()) + } + if (!any(eclass %in% class(object))){ + object_name <- deparse(substitute(object)) + failed_eclass <- eclass[which(!(eclass %in% class(object)))] + failed_eclass <- paste0(failed_eclass, collapse = " or ") + stop(paste0(object_name, " is not a ", failed_eclass)) + } +} + +#' @rdname argument_checking +#' +#' @export +#' +check_nonneg_integer <- function(object){ + object_name <- deparse(substitute(object)) + + if (!is.numeric(object) || any(object %% 1 != 0)){ + stop(paste0(object_name, " must be integer-valued")) + } + if (any(object < 0)){ + stop(paste0(object_name, " must be non-negative")) + } +} + + +#' @rdname argument_checking +#' +#' @export +#' +check_nonneg_integer_matrix <- function(object){ + object_name <- deparse(substitute(object)) + object_m <- as.matrix(object) + check_nonneg_integer(object_m) +} + + + +#' @rdname argument_checking +#' +#' @export +#' +check_pos_integer <- function(object){ + object_name <- deparse(substitute(object)) + if (!is.numeric(object) || any(object %% 1 != 0)){ + stop(paste0(object_name, " must be integer-valued")) + } + if (any(object <= 0)){ + stop(paste0(object_name, " must be positive")) + } +} + +#' @rdname argument_checking +#' +#' @export +#' +check_control <- function(control, eclass = "list"){ + check_class(object = control, eclass = eclass) +} + + +#' @rdname argument_checking +#' +#' @export +#' +check_topics <- function(topics){ + check_class(object = topics, eclass = "pos_integer") +} + + +#' @rdname argument_checking +#' +#' @export +#' +check_replicates <- 
function(replicates){ + check_class(object = replicates, eclass = "pos_integer") +} + +#' @rdname argument_checking +#' +#' @export +#' +check_nchangepoints <- function(nchangepoints){ + check_class(object = nchangepoints, eclass = "nonneg_integer") +} + +#' @rdname argument_checking +#' +#' @export +#' +check_document_term_table <- function(document_term_table){ + check_class(object = document_term_table, eclass = "nonneg_integer_matrix") + +} + + +#' @rdname argument_checking +#' +#' @export +#' +check_weights <- function(weights){ + if(is.logical(weights)){ + if(weights){ + return(invisible()) + } else{ + stop("if logical, weights need to be TRUE") + } + } + if(!is.null(weights)){ + if (!is.numeric(weights)){ + stop("weights vector must be numeric") + } + if (any(weights <= 0)){ + stop("weights must be positive") + } + if (round(mean(weights)) != 1){ + warning("weights should have a mean of 1, fit may be unstable") + } + } +} + +#' @rdname argument_checking +#' +#' @export +#' +check_LDAs <- function(LDAs){ + check_class(object = LDAs, eclass = c("LDA_set", "LDA")) +} + + + + +#' @rdname argument_checking +#' +#' @export +#' +check_timename <- function(LDAs, timename){ + + document_cov_table <- LDAs[[1]][[1]]$data$train$document_covariate_table + if (!("character" %in% class(timename))){ + stop("timename is not a character value") + } + if (length(timename) > 1){ + stop("timename can only be one value") + } + covariate_names <- colnames(document_cov_table) + if ((timename %in% covariate_names) == FALSE){ + stop("timename not present in document covariate table") + } + time_covariate <- document_cov_table[ , timename] + if (!(is.Date(time_covariate)) & + (!is.numeric(time_covariate) || !all(time_covariate %% 1 == 0))){ + stop("covariate indicated by timename is not an integer or a date") + } +} + + + + + + +#' @rdname argument_checking +#' +#' @export +#' +check_formulas <- function(LDAs, formulas){ + + dct <- LDAs[[1]][[1]]$data$train$document_covariate_table + 
control <- LDAs[[1]][[1]]$control + + # response <- control$response + + if (!is(formulas, "list")) { + if (is(formulas, "formula")) { + formulas <- c(formulas) + } else{ + stop("formulas does not contain formula(s)") + } + } else if (!all(vapply(formulas, is, TRUE, "formula"))) { + stop("formulas does not contain all formula(s)") + } + resp <- unlist(lapply(lapply(formulas, terms), attr, "response")) + pred <- unlist(lapply(lapply(formulas, terms), attr, "term.labels")) + if (any(resp != 0)) { + stop("formula inputs should not include response variable") + } + if (!all(pred %in% colnames(dct))) { + misses <- pred[which(pred %in% colnames(dct) == FALSE)] + mis <- paste(misses, collapse = ", ") + stop(paste0("formulas include predictors not present in data: ", mis)) + } +} + + + + + +#' @rdname argument_checking +#' +#' @export +#' +check_document_covariate_table <- function(LDAs){ + + dct <- LDAs[[1]][[1]]$data$train$document_covariate_table + dtt <- LDAs[[1]][[1]]$data$train$document_term_table + + + + dct_df <- tryCatch(data.frame(dct), + warning = function(x){NA}, error = function(x){NA}) + if(is(LDAs, "LDA")){ + LDAs <- c(LDAs) + class(LDAs) <- c("LDA_set", "list") + } + if (length(dct_df) == 1 && is.na(dct_df)){ + stop("document_covariate_table is not conformable to a data frame") + } + if (!is.null(LDAs)){ + if (nrow(data.frame(dct)) != nrow(LDAs[[1]][[1]]$document_topic_table)){ + stop("number of documents in covariate table is not equal to number of + documents observed") + } + } else if (!is.null(dtt)){ + if (nrow(data.frame(dct)) != nrow(data.frame(dtt))){ + stop("number of documents in covariate table is not equal to number of + documents observed") + } + } +} + + diff --git a/R/data_preparation.R b/R/data_preparation.R new file mode 100644 index 00000000..2050ce6c --- /dev/null +++ b/R/data_preparation.R @@ -0,0 +1,264 @@ +#' @title Conform data for LDATS modeling +#' +#' @description Given any of a variety of possible data input types +#' 
(\code{data.frame}/\code{matrix}, \code{list}, \code{list} of +#' \code{list}s, or \code{list} of \code{list} of \code{list}s) and +#' controls, this produces a properly formatted set of data (sets) for +#' LDATS modeling. +#' +#' @details This function makes use of the \code{\link{list_depth}} +#' utility that recursively works through an object to tell you +#' how nested a list is. \cr \cr +#' Working up from the most elemental version of \code{data} possible, +#' if it's not a \code{list}, but the data are a term table, the +#' covariate table is added with assumed equi-spaced data and +#' the data are now a \code{list}. \cr +#' Then, if it is a \code{list} but only of depth 1 (a \code{list} of two +#' tables), it is wrapped in a \code{list} to make it depth-2, +#' functionally a 1-subset data set. \cr +#' Then, if it is a \code{list} of depth two, it may need to be expanded to +#' a multiple-subset data set, to allow for cross validation methods, for +#' example. So, the \code{list} of depth 2 is replicated out to create a +#' longer \code{list} that is still depth 2 but is now of length +#' \code{control$nsubsets}. \cr +#' Then, the subsetting of the data occurs according to the +#' \code{control$subset_rule}, and each depth-2 \code{list} is split to a +#' final level of training and testing subsets of the data, making the +#' \code{list} depth 3. \cr \cr +#' The training and testing data are constructed as trimmed versions of the +#' two tables, even if no data are required for testing. +#' +#' @param data A document term table, \code{list} of document term and +#' covariate tables, a \code{list} of training and test sets of the two +#' tables, or a \code{list} of multiple replicate splits of training and +#' test sets of the two tables. +#' +#' @param control \code{list} of control options for the data conforming. +#' +#' @return \code{list} of properly formatted LDATS data. 
+#' +#' @name conform_data +#' + +#' @rdname conform_data +#' +#' @export +#' +conform_data <- function(data, control = list()){ + depth <- list_depth(data) + if(depth == 0){ + if(inherits(data, "data.frame") | inherits(data, "matrix")){ + msg <- "covariate table not provided, assuming equi-spaced data" + messageq(msg, control$quiet) + nobs <- nrow(data) + covariate <- data.frame(time = 1:nobs) + data <- list(document_term_table = data, + document_covariate_table = covariate) + depth <- list_depth(data) + } else{ + stop("improper data format") + } + } + if(depth == 1){ + which_term <- grep("term", names(data), ignore.case = TRUE) + which_covariate <- grep("covariate", names(data), ignore.case = TRUE) + if(length(which_term) != 1){ + stop("one, and only one, element in `data` can include `term`") + } + if (length(which_covariate) == 0){ + msg <- "covariate table not provided, assuming equi-spaced data" + messageq(msg, control$quiet) + nobs <- nrow(data[[which_term]]) + covariate <- data.frame(time = 1:nobs) + data$document_covariate_table <- covariate + } else if(length(which_covariate) > 1){ + stop("at most one element in `data` can include `covariate`") + } + names(data)[which_term] <- "document_term_table" + names(data)[which_covariate] <- "document_covariate_table" + data <- list(data) + depth <- list_depth(data) + } + if(depth == 2){ + nsubsets_in <- length(data) + nsubsets_out <- control$nsubsets + + if(nsubsets_in != 1 && nsubsets_in != nsubsets_out){ + stop("mimatched request for data subsets") + } + + if(nsubsets_out > 0){ + data_1 <- data[[1]] + for(i in 1:nsubsets_out){ + + rule <- control$rule + if(is.null(rule)){ + rule <- null_rule + } + if(nsubsets_in == 1){ + data_i <- data_1 + } else if (nsubsets_in > 1){ + data_i <- data[[i]] + } + + + if(!all(c("test", "train") %in% names(data_i))){ + which_term <- grep("term", names(data_i), ignore.case = TRUE) + which_covariate <- grep("covariate", names(data_i), + ignore.case = TRUE) + if(length(which_term) != 
1){ + stop("one, and only one, element in `data` can include `term`") + } + if (length(which_covariate) == 0){ + msg <- "covariate table not provided, assuming equi-spaced data" + messageq(msg, control$quiet) + nobs <- nrow(data_i[[which_term]]) + covariate <- data.frame(time = 1:nobs) + data_i$document_covariate_table <- covariate + } else if(length(which_covariate) > 1){ + stop("at most one element in `data` can include `covariate`") + } + names(data_i)[which_term] <- "document_term_table" + names(data_i)[which_covariate] <- "document_covariate_table" + } + dtt <- data_i$document_term_table + dct <- data_i$document_covariate_table + args <- list(data = dtt, iteration = i) + test_train <- do.call(what = rule, args = args) + in_train <- test_train == "train" + in_test <- test_train == "test" + train <- list(document_term_table = dtt[in_train, , drop = FALSE], + document_covariate_table = + dct[in_train, , drop = FALSE]) + test <- list(document_term_table = dtt[in_test, , drop = FALSE], + document_covariate_table = dct[in_test, , drop = FALSE]) + data[[i]] <- list(test = test, train = train) + } + names(data) <- 1:nsubsets_out + } + depth <- list_depth(data) + } + if(depth == 3){ + + nsubsets_in <- length(data) + nsubsets_out <- control$nsubsets + + if(nsubsets_in != 1 && nsubsets_in != nsubsets_out){ + stop("mimatched request for data subsets") + } + + for(i in 1:nsubsets_out){ + + data_i <- data[[i]] + which_train <- grep("train", names(data_i), ignore.case = TRUE) + which_test <- grep("test", names(data_i), ignore.case = TRUE) + if(length(which_train) != 1){ + stop("only one, element in a `data` subset can include `train`") + } + if(length(which_test) != 1){ + stop("only one, element in a `data` subset can include `test`") + } + for(j in 1:2){ + data_ij <- data[[i]][[j]] + which_term <- grep("term", names(data_ij), ignore.case = TRUE) + which_covariate <- grep("covariate", names(data_ij), + ignore.case = TRUE) + if(length(which_term) != 1){ + stop("one, and only 
one, element in `data` can include `term`") + } + if (length(which_covariate) == 0){ + stop("covariate table not provided, can't be made from split data") + } else if(length(which_covariate) > 1){ + stop("at most one element in `data` can include `covariate`") + } + } + } + } + data + +} + +#' @title Subset data sets +#' +#' @description For use within, e.g., cross validation methods, these +#' functions subdivide the data into testing and training subsets. \cr \cr +#' \code{null_rule} places all data in the training set. \cr \cr +#' \code{random_loo} conducts randomized leave-one-out with no buffer. +#' \cr \cr +#' \code{systematic_loo} conducts systematic leave-one-out with no buffer. +#' Assumes 1:1 between iteration and datum location to drop. \cr \cr +#' \code{leave_p_out} is a fully flexible leave p out function allowing for +#' asymmetric buffers and randomization. If \code{random = TRUE}, the test +#' data are selected randomly, otherwise locations are used. +#' +#' @param data \code{data.frame} or \code{matrix} of data to be split. +#' +#' @param iteration \code{integer}-conformable value indicating which +#' iteration through the process the current implementation is. +#' +#' @param p \code{integer}-conformable value of how many samples to leave out. +#' +#' @param pre,post \code{integer}-conformable values of how many samples +#' to include in the buffer around the focal left out data. Can be +#' asymmetric. +#' +#' @param random \code{logical} indicator of if the left out data should be +#' randomly selected. +#' +#' @param locations \code{integer}-conformable values referencing which +#' data to hold out. +#' +#' @return \code{character} \code{vector} of \code{"train"} and \code{"test"} +#' values. 
+#' +#' @name data_subsetting +#' + + +#' @rdname data_subsetting +#' +#' @export +#' +null_rule <- function(data, iteration = 1){ + n <- NROW(data) + rep("train", n) +} + + +#' @rdname data_subsetting +#' +#' @export +#' +systematic_loo <- function(data, iteration = 1){ + leave_p_out(data = data, random = FALSE, locations = iteration) +} + +#' @rdname data_subsetting +#' +#' @export +#' +random_loo <- function(data, iteration = 1){ + leave_p_out(data = data) +} + +#' @rdname data_subsetting +#' +#' @export +#' +leave_p_out <- function(data, p = 1, pre = 0, post = 0, + random = TRUE, locations = NULL){ + n <- NROW(data) + test_train <- rep("train", n) + + if(random){ + locations <- sample(1:n, p) + } + + for(i in 1:p){ + hold_out <- (locations[i] - pre):(locations[i] + post) + test_train[hold_out] <- "out" + } + test_train[locations] <- "test" + test_train +} + diff --git a/R/multinom_TS.R b/R/multinom_TS.R deleted file mode 100644 index b43b3114..00000000 --- a/R/multinom_TS.R +++ /dev/null @@ -1,397 +0,0 @@ -#' @title Fit a multinomial change point Time Series model -#' -#' @description Fit a set of multinomial regression models (via -#' \code{\link[nnet]{multinom}}, Venables and Ripley 2002) to a time series -#' of data divided into multiple segments (a.k.a. chunks) based on given -#' locations for a set of change points. \cr \cr -#' \code{check_multinom_TS_inputs} checks that the inputs to -#' \code{multinom_TS} are of proper classes for an analysis. -#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. 
Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. See \code{Examples}. -#' -#' @param formula \code{\link[stats]{formula}} defining the regression between -#' relationship the change points. Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param changepoints Numeric vector indicating locations of the change -#' points. Must be conformable to \code{integer} values. Validity -#' checked by \code{\link{check_changepoints}} and -#' \code{\link{verify_changepoint_locations}}. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using -#' \code{\link{document_weights}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. 
Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{multinom_TS}: Object of class \code{multinom_TS_fit}, -#' which is a list of [1] -#' chunk-level model fits (\code{"chunk models"}), [2] the total log -#' likelihood combined across all chunks (\code{"logLik"}), and [3] a -#' \code{data.frame} of chunk beginning and ending times (\code{"logLik"} -#' with columns \code{"start"} and \code{"end"}). \cr \cr -#' \code{check_multinom_TS_inputs}: an error message is thrown if any -#' input is improper, otherwise \code{NULL}. -#' -#' @references -#' Venables, W. N. and B. D. Ripley. 2002. \emph{Modern and Applied -#' Statistics with S}. Fourth Edition. Springer, New York, NY, USA. -#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' weights <- document_weights(dtt) -#' check_multinom_TS_inputs(dct, timename = "newmoon") -#' mts <- multinom_TS(dct, formula = gamma ~ 1, changepoints = c(20,50), -#' timename = "newmoon", weights = weights) -#' -#' @export -#' -multinom_TS <- function(data, formula, changepoints = NULL, - timename = "time", weights = NULL, - control = list()){ - - check_multinom_TS_inputs(data, formula, changepoints, timename, weights, - control) - control <- do.call("TS_control", control) - if (!verify_changepoint_locations(data, changepoints, timename)){ - out <- list("chunk models" = NA, "logLik" = -Inf, "chunks" = NA) - class(out) <- c("multinom_TS_fit", "list") - return(out) - } - - TS_chunk_memo <- memoise_fun(multinom_TS_chunk, control$memoise) - - chunks <- prep_chunks(data, changepoints, timename) - nchunks <- nrow(chunks) - fits <- vector("list", length = nchunks) - for (i in 1:nchunks){ - fits[[i]] <- TS_chunk_memo(data, formula, chunks[i, ], timename, weights, - control) - } - package_chunk_fits(chunks, fits) -} - -#' @rdname multinom_TS -#' -#' @export -#' 
-check_multinom_TS_inputs <- function(data, formula = gamma~1, - changepoints = NULL, - timename = "time", weights = NULL, - control = list()){ - check_changepoints(changepoints) - check_weights(weights) - check_formula(data, formula) - check_timename(data, timename) - check_control(control) -} - -#' @title Check that a set of change point locations is proper -#' -#' @description Check that the change point locations are \code{numeric} -#' and conformable to \code{interger} values. -#' -#' @param changepoints Change point locations to evaluate. -#' -#' @return An error message is thrown if \code{changepoints} are not proper, -#' else \code{NULL}. -#' -#' @examples -#' check_changepoints(100) -#' -#' @export -#' -check_changepoints <- function(changepoints = NULL){ - if (is.null(changepoints)){ - return() - } - if (!is.numeric(changepoints) || any(changepoints %% 1 != 0)){ - stop("changepoints must be integer-valued") - } -} - -#' @title Log likelihood of a multinomial TS model -#' -#' @description Convenience function to simply extract the \code{logLik} -#' element (and \code{df} and \code{nobs}) from a \code{multinom_TS_fit} -#' object fit by \code{\link{multinom_TS}}. Extends -#' \code{\link[stats]{logLik}} from \code{\link[nnet]{multinom}} to -#' \code{multinom_TS_fit} objects. -#' -#' @param object A \code{multinom_TS_fit}-class object. -#' -#' @param ... Not used, simply included to maintain method compatibility. -#' -#' @return Log likelihood of the model, as class \code{logLik}, with -#' attributes \code{df} (degrees of freedom) and \code{nobs} (the number of -#' weighted observations, accounting for size differences among documents). 
-#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' weights <- document_weights(dtt) -#' mts <- multinom_TS(dct, formula = gamma ~ 1, changepoints = c(20,50), -#' timename = "newmoon", weights = weights) -#' logLik(mts) -#' -#' @export -#' -logLik.multinom_TS_fit <- function(object, ...){ - ll <- object$logLik - df <- NA - nobs <- NA - if (object$logLik != -Inf){ - nchunks <- nrow(object$chunks) - dfperchunk <- length(coef(object$"chunk models"[[1]])) - df <- nchunks - 1 + dfperchunk * nchunks - nobs <- 0 - for(i in 1:nchunks){ - nobs <- nobs + sum(object$"chunk models"[[i]]$weights) - } - } - structure(ll, df = df, nobs = nobs, class = "logLik") -} - -#' @title Package the output of the chunk-level multinomial models into a -#' multinom_TS_fit list -#' -#' @description Takes the list of fitted chunk-level models returned from -#' \code{TS_chunk_memo} (the memoised version of -#' \code{\link{multinom_TS_chunk}} and packages it as a -#' \code{multinom_TS_fit} object. This involves naming the model fits based -#' on the chunk time windows, combining the log likelihood values across the -#' chunks, and setting the class of the output object. -#' -#' @param chunks Data frame of \code{start} and \code{end} times for each -#' chunk (row). -#' -#' @param fits List of chunk-level fits returned by \code{TS_chunk_memo}, -#' the memoised version of \code{\link{multinom_TS_chunk}}. -#' -#' @return Object of class \code{multinom_TS_fit}, which is a list of [1] -#' chunk-level model fits, [2] the total log likelihood combined across -#' all chunks, and [3] the chunk time data table. 
-#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' weights <- document_weights(dtt) -#' formula <- gamma ~ 1 -#' changepoints <- c(20,50) -#' timename <- "newmoon" -#' TS_chunk_memo <- memoise_fun(multinom_TS_chunk, TRUE) -#' chunks <- prep_chunks(dct, changepoints, timename) -#' nchunks <- nrow(chunks) -#' fits <- vector("list", length = nchunks) -#' for (i in 1:nchunks){ -#' fits[[i]] <- TS_chunk_memo(dct, formula, chunks[i, ], timename, -#' weights, TS_control()) -#' } -#' package_chunk_fits(chunks, fits) -#' -#' @export -#' -#' -package_chunk_fits <- function(chunks, fits){ - nchunks <- nrow(chunks) - chunk_times <- paste0("(", chunks[ , "start"], " - ", chunks[ , "end"], ")") - names(fits) <- paste("chunk", 1:nchunks, chunk_times, "model") - ll <- sum(vapply(fits, logLik, 0)) - out <- list("chunk models" = fits, "logLik" = ll, "chunks" = chunks) - class(out) <- c("multinom_TS_fit", "list") - out -} - -#' @title Prepare the time chunk table for a multinomial change point -#' Time Series model -#' -#' @description Creates the table containing the start and end times for each -#' chunk within a time series, based on the change points (used to break up -#' the time series) and the range of the time series. If there are no -#' change points (i.e. \code{changepoints} is \code{NULL}, there is still a -#' single chunk defined by the start and end of the time series. -#' -#' @param data Class \code{data.frame} object including the predictor and -#' response variables, but specifically here containing the column indicated -#' by the \code{timename} input. -#' -#' @param changepoints Numeric vector indicating locations of the change -#' points. Must be conformable to \code{integer} values. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. 
Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @return \code{data.frame} of \code{start} and \code{end} times (columns) -#' for each chunk (rows). -#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' chunks <- prep_chunks(dct, changepoints = 100, timename = "newmoon") -#' -#' @export -#' -prep_chunks <- function(data, changepoints = NULL, - timename = "time"){ - start <- c(min(data[ , timename]), changepoints + 1) - end <- c(changepoints, max(data[ , timename])) - data.frame(start, end) -} - -#' @title Verify the change points of a multinomial time series model -#' -#' @description Verify that a time series can be broken into a set -#' of chunks based on input change points. -#' -#' @param data Class \code{data.frame} object including the predictor and -#' response variables. -#' -#' @param changepoints Numeric vector indicating locations of the change -#' points. Must be conformable to \code{integer} values. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @return Logical indicator of the check passing \code{TRUE} or failing -#' \code{FALSE}. 
-#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' verify_changepoint_locations(dct, changepoints = 100, -#' timename = "newmoon") -#' -#' @export -#' -verify_changepoint_locations <- function(data, changepoints = NULL, - timename = "time"){ - - if (is.null(changepoints)){ - return(TRUE) - } - - first_time <- min(data[ , timename]) - last_time <- max(data[ , timename]) - time_check <- any(changepoints <= first_time | changepoints >= last_time) - sort_check <- is.unsorted(changepoints, strictly = TRUE) - - !(time_check | sort_check) -} - -#' @title Fit a multinomial Time Series model chunk -#' -#' @description Fit a multinomial regression model (via -#' \code{\link[nnet]{multinom}}, Ripley 1996, Venables and Ripley 2002) -#' to a defined chunk of time (a.k.a. segment) -#' \code{[chunk$start, chunk$end]} within a time series. -#' -#' @param data Class \code{data.frame} object including the predictor and -#' response variables. -#' -#' @param formula Formula as a \code{\link[stats]{formula}} or -#' \code{\link[base]{character}} object describing the chunk. -#' -#' @param chunk Length-2 vector of times: [1] \code{start}, the start time -#' for the chunk and [2] \code{end}, the end time for the chunk. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using \code{document_weights}. 
-#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return Fitted model object for the chunk, of classes \code{multinom} and -#' \code{nnet}. -#' -#' @references -#' Ripley, B. D. 1996. Pattern Recognition and Neural Networks. Cambridge. -#' -#' Venables, W. N. and B. D. Ripley. 2002. Modern Applied Statistics with S. -#' Fourth edition. Springer. -#' -#' @examples -#' data(rodents) -#' dtt <- rodents$document_term_table -#' lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) -#' dct <- rodents$document_covariate_table -#' dct$gamma <- lda[[1]]@gamma -#' weights <- document_weights(dtt) -#' chunk <- c(start = 0, end = 100) -#' mtsc <- multinom_TS_chunk(dct, formula = gamma ~ 1, chunk = chunk, -#' timename = "newmoon", weights = weights) -#' -#' @export -#' -multinom_TS_chunk <- function(data, formula, chunk, timename = "time", - weights = NULL, control = list()){ - control <- do.call("TS_control", control) - formula <- as.formula(format(formula)) - time_obs <- data[ , timename] - chunk_start <- as.numeric(chunk["start"]) - chunk_end <- as.numeric(chunk["end"]) - in_chunk <- time_obs >= chunk_start & time_obs <= chunk_end - fit <- multinom(formula, data, weights, subset = in_chunk, trace = FALSE, - decay = control$lambda) - fit$timevals <- time_obs[which(in_chunk)] - fit -} diff --git a/R/ptMCMC.R b/R/ptMCMC.R deleted file mode 100644 index 6695312a..00000000 --- a/R/ptMCMC.R +++ /dev/null @@ -1,934 +0,0 @@ -#' 
@title Calculate ptMCMC summary diagnostics -#' -#' @description Summarize the step and swap acceptance rates as well as trip -#' metrics from the saved output of a ptMCMC estimation. -#' -#' @details Within-chain step acceptance rates are averaged for each of the -#' chains from the raw step acceptance histories -#' (\code{ptMCMCout$step_accepts}) and between-chain swap acceptance rates -#' are similarly averaged for each of the neighboring pairs of chains from -#' the raw swap acceptance histories (\code{ptMCMCout$swap_accepts}). -#' Trips are defined as movement from one extreme chain to the other and -#' back again (Katzgraber \emph{et al.} 2006). Trips are counted and turned -#' to per-iteration rates using \code{\link{count_trips}}. -#' \cr \cr -#' This function was first designed to work within \code{\link{TS}} and -#' process the output of \code{\link{est_changepoints}}, but has been -#' generalized and would work with any output from a ptMCMC as long as -#' \code{ptMCMCout} is formatted properly. -#' -#' @param ptMCMCout Named \code{list} of saved data objects from a ptMCMC -#' estimation including elements named \code{step_accepts} (matrix of -#' \code{logical} outcomes of each step; rows: chains, columns: iterations), -#' \code{swap_accepts} (matrix of \code{logical} outcomes of each swap; -#' rows: chain pairs, columns: iterations), and \code{ids} (matrix of -#' particle identifiers; rows: chains, columns: iterations). -#' \code{ptMCMCout = NULL} indicates no use of ptMCMC and so the function -#' returns \code{NULL}. -#' -#' @return \code{list} of [1] within-chain average step acceptance rates -#' (\code{$step_acceptance_rate}), [2] average between-chain swap acceptance -#' rates (\code{$swap_acceptance_rate}), [3] within particle trip counts -#' (\code{$trip_counts}), and [4] within-particle average trip rates -#' (\code{$trip_rates}). -#' -#' @references -#' Katzgraber, H. G., S. Trebst, D. A. Huse. And M. Troyer. 2006. 
-#' Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of -#' Statistical Mechanics: Theory and Experiment} \strong{3}:P03018 -#' \href{https://bit.ly/2LICGXh}{link}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' rho_dist <- est_changepoints(data, gamma ~ 1, 1, "newmoon", -#' weights, TS_control()) -#' diagnose_ptMCMC(rho_dist) -#' } -#' -#' @export -#' -diagnose_ptMCMC <- function(ptMCMCout){ - if(is.null(ptMCMCout)){ - return(NULL) - } - trips <- count_trips(ptMCMCout$ids) - list(step_acceptance_rate = rowMeans(ptMCMCout$step_accepts), - swap_acceptance_rate = rowMeans(ptMCMCout$swap_accepts), - trip_counts = trips$trip_counts, trip_rates = trips$trip_rates) -} - -#' @title Count trips of the ptMCMC particles -#' -#' @description Count the full trips (from one extreme temperature chain to -#' the other and back again; Katzgraber \emph{et al.} 2006) for each of the -#' ptMCMC particles, as identified by their id on initialization. -#' \cr \cr -#' This function was designed to work within \code{\link{TS}} and process -#' the output of \code{\link{est_changepoints}} as a component of -#' \code{\link{diagnose_ptMCMC}}, but has been generalized -#' and would work with any output from a ptMCMC as long as \code{ids} -#' is formatted properly. -#' -#' @param ids \code{matrix} of identifiers of the particles in each chain for -#' each iteration of the ptMCMC algorithm (rows: chains, -#' columns: iterations). -#' -#' @return \code{list} of [1] \code{vector} of within particle trip counts -#' (\code{$trip_counts}), and [2] \code{vector} of within-particle average -#' trip rates (\code{$trip_rates}). 
-#' -#' @references -#' Katzgraber, H. G., S. Trebst, D. A. Huse. And M. Troyer. 2006. -#' Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of -#' Statistical Mechanics: Theory and Experiment} \strong{3}:P03018 -#' \href{https://bit.ly/2LICGXh}{link}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' rho_dist <- est_changepoints(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' count_trips(rho_dist$ids) -#' } -#' -#' @export -#' -count_trips <- function(ids){ - nit <- ncol(ids) - ntemps <- nrow(ids) - last_extreme <- NA - last_extreme_vector <- numeric(nit) - trips <- numeric(ntemps) - for(i in 1:ntemps){ - for(j in 1:nit){ - if(ids[1, j] == i){ - last_extreme <- "bottom" - } - if(ids[ntemps, j] == i){ - last_extreme <- "top" - } - last_extreme_vector[j] <- last_extreme - } - first_top <- match("top", last_extreme_vector) - if (is.na(first_top)){ - trips[i] <- 0 - } else{ - last_pos <- rle(last_extreme_vector[first_top:nit])$values - trips[i] <- sum(last_pos == "bottom") - } - } - trip_rates <- trips / nit - list(trip_counts = trips, trip_rates = trip_rates) -} - - -#' @title Conduct a set of among-chain swaps for the ptMCMC algorithm -#' -#' @description This function handles the among-chain swapping based on -#' temperatures and likelihood differentials. -#' \cr \cr -#' This function was designed to work within \code{\link{TS}} and -#' specifically \code{\link{est_changepoints}}. It is still hardcoded to do -#' so, but has the capacity to be generalized to work with any estimation -#' via ptMCMC with additional coding work. 
-#' -#' @details The ptMCMC algorithm couples the chains (which are -#' taking their own walks on the distribution surface) through "swaps", -#' where neighboring chains exchange configurations (Geyer 1991, Falcioni -#' and Deem 1999) following the Metropolis criterion (Metropolis -#' \emph{et al.} 1953). This allows them to share information and search the -#' surface in combination (Earl and Deem 2005). -#' -#' @param chainsin Chain configuration to be evaluated for swapping. -#' -#' @param inputs Class \code{ptMCMC_inputs} list, containing the static inputs -#' for use within the ptMCMC algorithm. -#' -#' @param ids The vector of integer chain ids. -#' -#' @return \code{list} of updated change points, log-likelihoods, and chain -#' ids, as well as a vector of acceptance indicators for each swap. -#' -#' @references -#' Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory, -#' applications, and new perspectives. \emph{Physical Chemistry Chemical -#' Physics} \strong{7}: 3910-3916. -#' \href{https://rsc.li/2XkxPCm}{link}. -#' -#' Falcioni, M. and M. W. Deem. 1999. A biased Monte Carlo scheme for -#' zeolite structure solution. \emph{Journal of Chemical Physics} -#' \strong{110}: 1754-1766. -#' \href{https://aip.scitation.org/doi/10.1063/1.477812}{link}. -#' -#' Geyer, C. J. 1991. Markov Chain Monte Carlo maximum likelihood. \emph{In -#' Computing Science and Statistics: Proceedings of the 23rd Symposium on -#' the Interface}. pp 156-163. American Statistical Association, New York, -#' USA. \href{https://www.stat.umn.edu/geyer/f05/8931/c.pdf}{link}. -#' -#' Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E. -#' Teller. 1953. Equations of state calculations by fast computing machines. -#' \emph{Journal of Chemical Physics} \strong{21}: 1087-1092. -#' \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' ids <- prep_ids(TS_control()) -#' for(i in 1:TS_control()$nit){ -#' steps <- step_chains(i, cpts, inputs) -#' swaps <- swap_chains(steps, inputs, ids) -#' saves <- update_saves(i, saves, steps, swaps) -#' cpts <- update_cpts(cpts, swaps) -#' ids <- update_ids(ids, swaps) -#' } -#' } -#' -#' @export -#' -swap_chains <- function(chainsin, inputs, ids){ - temps <- inputs$temps - itemps <- 1/temps - ntemps <- length(temps) - revtemps <- seq(ntemps - 1, 1) - lls <- chainsin$lls - changepts <- chainsin$changepts - accept_swap <- rep(FALSE, ntemps - 1) - - for (j in revtemps){ - cutoff <- exp((itemps[j] - itemps[j + 1]) * (lls[j + 1] - lls[j])) - accept <- runif(1) < cutoff - if (accept) { - - accept_swap[j] <- TRUE - placeholder <- changepts[, j] - changepts[ , j] <- changepts[, j + 1] - changepts[ , j + 1] <- placeholder - - placeholder <- lls[j] - lls[j] <- lls[j + 1] - lls[j + 1] <- placeholder - - placeholder <- ids[j] - ids[j] <- ids[j + 1] - ids[j + 1] <- placeholder - } - } - list(changepts = changepts, lls = lls, ids = ids, accept_swap = accept_swap) -} - -#' @title Conduct a within-chain step of the ptMCMC algorithm -#' -#' @description This set of functions steps the chains forward one iteration -#' of the within-chain component of the ptMCMC algorithm. 
\code{step_chains} -#' is the main function, comprised of a proposal (made by \code{prop_step}), -#' an evaluation of that proposal (made by \code{eval_step}), and then an -#' update of the configuration (made by \code{take_step}). -#' \cr \cr -#' This set of functions was designed to work within \code{\link{TS}} and -#' specifically \code{\link{est_changepoints}}. They are still hardcoded to -#' do so, but have the capacity to be generalized to work with any -#' estimation via ptMCMC with additional coding work. -#' -#' @details For each iteration of the ptMCMC algorithm, all of the chains -#' have the potential to take a step. The possible step is proposed under -#' a proposal distribution (here for change points we use a symmetric -#' geometric distribution), the proposed step is then evaluated and either -#' accepted or not (following the Metropolis-Hastings rule; Metropolis, -#' \emph{et al.} 1953, Hasting 1960, Gupta \emph{et al.} 2018), and then -#' accordingly taken or not (the configurations are updated). -#' -#' @param i \code{integer} iteration index. -#' -#' @param cpts \code{matrix} of change point locations across chains. -#' -#' @param inputs Class \code{ptMCMC_inputs} \code{list}, containing the -#' static inputs for use within the ptMCMC algorithm. -#' -#' @param prop_step Proposed step output from \code{propose_step}. -#' -#' @param accept_step \code{logical} indicator of acceptance of each chain's -#' proposed step. -#' -#' @return -#' \code{step_chains}: \code{list} of change points, log-likelihoods, -#' and logical indicators of acceptance for each chain. \cr \cr -#' \code{propose_step}: \code{list} of change points and -#' log-likelihood values for the proposal. \cr \cr -#' \code{eval_step}: \code{logical} vector indicating if each -#' chain's proposal was accepted. \cr \cr -#' \code{take_step}: \code{list} of change points, log-likelihoods, -#' and logical indicators of acceptance for each chain. -#' -#' @references -#' Gupta, S., L. 
Hainsworth, J. S. Hogg, R. E. C. Lee, and J. R. Faeder. -#' 2018. Evaluation of parallel tempering to accelerate Bayesian parameter -#' estimation in systems biology. -#' \href{https://arxiv.org/abs/1801.09831}{link}. -#' -#' Hastings, W. K. 1970. Monte Carlo sampling methods using Markov Chains -#' and their applications. \emph{Biometrika} \strong{57}:97-109. -#' \href{https://doi.org/10.2307/2334940}{link}. -#' -#' Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E. -#' Teller. 1953. Equations of state calculations by fast computing machines. -#' \emph{Journal of Chemical Physics} \strong{21}: 1087-1092. -#' \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' ids <- prep_ids(TS_control()) -#' for(i in 1:TS_control()$nit){ -#' steps <- step_chains(i, cpts, inputs) -#' swaps <- swap_chains(steps, inputs, ids) -#' saves <- update_saves(i, saves, steps, swaps) -#' cpts <- update_cpts(cpts, swaps) -#' ids <- update_ids(ids, swaps) -#' } -#' # within step_chains() -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' i <- 1 -#' prop_step <- propose_step(i, cpts, inputs) -#' accept_step <- eval_step(i, cpts, prop_step, inputs) -#' take_step(cpts, prop_step, accept_step) -#' } -#' -#' @export -#' -step_chains <- function(i, cpts, inputs){ - prop_step <- propose_step(i, cpts, inputs) - accept_step <- 
eval_step(i, cpts, prop_step, inputs) - take_step(cpts, prop_step, accept_step) -} - -#' @rdname step_chains -#' -#' @export -#' -propose_step <- function(i, cpts, inputs){ - - pdist <- inputs$pdist - ntemps <- length(inputs$temps) - selection <- cbind(pdist$which_steps[i, ], 1:ntemps) - prop_changepts <- cpts$changepts - curr_changepts_s <- cpts$changepts[selection] - prop_changepts_s <- curr_changepts_s + pdist$steps[i, ] - if(all(is.na(prop_changepts_s))){ - prop_changepts_s <- NULL - } - prop_changepts[selection] <- prop_changepts_s - mods <- proposed_step_mods(prop_changepts, inputs) - lls <- vapply(mods, logLik, 0) - list(changepts = prop_changepts, lls = lls) -} - -#' @rdname step_chains -#' -#' @export -#' -eval_step <- function(i, cpts, prop_step, inputs){ - temps <- inputs$temps - ntemps <- length(temps) - itemps <- 1 / temps - runif(ntemps) < exp((prop_step$lls - cpts$lls) * itemps) -} - -#' @rdname step_chains -#' -#' @export -#' -take_step <- function(cpts, prop_step, accept_step){ - changepts <- cpts$changepts - lls <- cpts$lls - changepts[ , accept_step] <- prop_step$changepts[ , accept_step] - lls[accept_step] <- prop_step$lls[accept_step] - list(changepts = changepts, lls = lls, accept_step = accept_step) -} - -#' @title Fit the chunk-level models to a time series, given a set of -#' proposed change points within the ptMCMC algorithm -#' -#' @description This function wraps around \code{TS_memo} -#' (optionally memoised \code{\link{multinom_TS}}) to provide a -#' simpler interface within the ptMCMC algorithm and is implemented within -#' \code{\link{propose_step}}. -#' -#' @param prop_changepts \code{matrix} of proposed change points across -#' chains. -#' -#' @param inputs Class \code{ptMCMC_inputs} list, containing the static inputs -#' for use within the ptMCMC algorithm. -#' -#' @return List of models associated with the proposed step, with an element -#' for each chain. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' i <- 1 -#' pdist <- inputs$pdist -#' ntemps <- length(inputs$temps) -#' selection <- cbind(pdist$which_steps[i, ], 1:ntemps) -#' prop_changepts <- cpts$changepts -#' curr_changepts_s <- cpts$changepts[selection] -#' prop_changepts_s <- curr_changepts_s + pdist$steps[i, ] -#' if(all(is.na(prop_changepts_s))){ -#' prop_changepts_s <- NULL -#' } -#' prop_changepts[selection] <- prop_changepts_s -#' mods <- proposed_step_mods(prop_changepts, inputs) -#' } -#' -#' @export -#' -proposed_step_mods <- function(prop_changepts, inputs){ - - data <- inputs$data - formula <- inputs$formula - weights <- inputs$weights - TS_memo <- inputs$TS_memo - ntemps <- length(inputs$temps) - control <- inputs$control - timename <- inputs$timename - out <- vector("list", length = ntemps) - for (i in 1:ntemps){ - out[[i]] <- TS_memo(data, formula, prop_changepts[ , i], timename, - weights, control) - } - out -} - - -#' @title Initialize and update the chain ids throughout the ptMCMC algorithm -#' -#' @description \code{prep_ids} creates and \code{update_ids} updates -#' the active vector of identities (ids) for each of the chains in the -#' ptMCMC algorithm. These ids are used to track trips of the particles -#' among chains. 
-#' \cr \cr -#' These functions were designed to work within \code{\link{TS}} and -#' specifically \code{\link{est_changepoints}}, but have been generalized -#' and would work within any general ptMCMC as long as \code{control}, -#' \code{ids}, and \code{swaps} are formatted properly. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @param ids The existing vector of chain ids. -#' -#' @param swaps Chain configuration after among-temperature swaps. -#' -#' @return The vector of chain ids. -#' -#' @examples -#' prep_ids() -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' ids <- prep_ids(TS_control()) -#' for(i in 1:TS_control()$nit){ -#' steps <- step_chains(i, cpts, inputs) -#' swaps <- swap_chains(steps, inputs, ids) -#' saves <- update_saves(i, saves, steps, swaps) -#' cpts <- update_cpts(cpts, swaps) -#' ids <- update_ids(ids, swaps) -#' } -#' } -#' -#' @export -#' -prep_ids <- function(control = list()){ - control <- do.call("TS_control", control) - if (!is.numeric(control$ntemps) || any(control$ntemps %% 1 != 0)){ - stop("ntemps must be integer-valued") - } - 1:control$ntemps -} - -#' @rdname prep_ids -#' -#' @export -#' -update_ids <- function(ids, swaps){ - swaps$ids -} - -#' @title Prepare the inputs for the 
ptMCMC algorithm estimation of -#' change points -#' -#' @description Package the static inputs (controls and data structures) used -#' by the ptMCMC algorithm in the context of estimating change points. -#' \cr \cr -#' This function was designed to work within \code{\link{TS}} and -#' specifically \code{\link{est_changepoints}}. It is still hardcoded to do -#' so, but has the capacity to be generalized to work with any estimation -#' via ptMCMC with additional coding work. -#' -#' @param data Class \code{data.frame} object including [1] the time variable -#' (indicated in \code{control}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated -#' in \code{formula}). -#' -#' @param formula \code{formula} describing the continuous change. Any -#' predictor variable included must also be a column in the -#' \code{data}. Any (multinomial) response variable must also be a set of -#' columns in \code{data}. -#' -#' @param nchangepoints Integer corresponding to the number of -#' change points to include in the model. 0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current -#' implementation can reasonably include up to 6 change points. The -#' number of change points is used to dictate the segmentation of the data -#' for each continuous model and each LDA model. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. 
When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using -#' \code{\link{document_weights}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return Class \code{ptMCMC_inputs} \code{list}, containing the static -#' inputs for use within the ptMCMC algorithm for estimating change points. -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' } -#' @export -#' -prep_ptMCMC_inputs <- function(data, formula, nchangepoints, timename, - weights = NULL, control = list()){ - check_timename(data, timename) - check_formula(data, formula) - check_weights(weights) - check_nchangepoints(nchangepoints) - check_control(control) - control <- do.call("TS_control", control) - control$selector <- NULL - control$measurer <- NULL - out <- list(control = control, temps = prep_temp_sequence(control), - pdist = prep_proposal_dist(nchangepoints, control), - formula = formula, weights = weights, data = data, - TS_memo = memoise_fun(multinom_TS, control$memoise), - 
timename = timename) - class(out) <- c("ptMCMC_inputs", "list") - out -} - - -#' @title Pre-calculate the change point proposal distribution for the ptMCMC -#' algorithm -#' -#' @description Calculate the proposal distribution in advance of actually -#' running the ptMCMC algorithm in order to decrease computation time. -#' The proposal distribution is a joint of three distributions: -#' [1] a multinomial distribution selecting among the change points within -#' the chain, [2] a binomial distribution selecting the direction of the -#' step of the change point (earlier or later in the time series), and -#' [3] a geometric distribution selecting the magnitude of the step. -#' -#' @param nchangepoints Integer corresponding to the number of -#' change points to include in the model. 0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current -#' implementation can reasonably include up to 6 change points. The -#' number of change points is used to dictate the segmentation of the data -#' for each continuous model and each LDA model. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. Currently relevant here is -#' \code{magnitude}, which controls the magnitude of the step size (is the -#' average of the geometric distribution). -#' -#' @return \code{list} of two \code{matrix} elements: [1] the size of the -#' proposed step for each iteration of each chain and [2] the identity of -#' the change point location to be shifted by the step for each iteration of -#' each chain. 
-#' -#' @examples -#' prep_proposal_dist(nchangepoints = 2) -#' -#' @export -#' -prep_proposal_dist <- function(nchangepoints, control = list()){ - check_nchangepoints(nchangepoints) - check_control(control) - control <- do.call("TS_control", control) - ntemps <- control$ntemps - nit <- control$nit - if(nchangepoints == 0){ - steps <- matrix(0, nrow = nit, ncol = ntemps) - which_steps <- matrix(numeric(0), nrow = nit, ncol = ntemps) - } else{ - magnitude <- control$magnitude - step_signs <- sample(c(-1, 1), nit * ntemps, replace = TRUE) - step_magnitudes <- 1 + rgeom(nit * ntemps, 1 / magnitude) - steps <- matrix(step_signs * step_magnitudes, nrow = nit) - which_steps <- sample.int(nchangepoints, nit * ntemps, replace = TRUE) - which_steps <- matrix(which_steps, nrow = nit) - } - list(steps = steps, which_steps = which_steps) -} - -#' @title Prepare and update the data structures to save the ptMCMC output -#' -#' @description \code{prep_saves} creates the data structure used to save the -#' output from each iteration of the ptMCMC algorithm, which is added via -#' \code{update_saves}. Once the ptMCMC is complete, the saved data objects -#' are then processed (burn-in iterations are dropped and the remaining -#' iterations are thinned) via \code{process_saves}. -#' \cr \cr -#' This set of functions was designed to work within \code{\link{TS}} and -#' specifically \code{\link{est_changepoints}}. They are still hardcoded to -#' do so, but have the capacity to be generalized to work with any -#' estimation via ptMCMC with additional coding work. -#' -#' @param nchangepoints \code{integer} corresponding to the number of -#' change points to include in the model. 0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current -#' implementation can reasonably include up to 6 change points. The -#' number of change points is used to dictate the segmentation of the data -#' for each continuous model and each LDA model. 
-#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @param i \code{integer} iteration index. -#' -#' @param saves The existing list of saved data objects. -#' -#' @param steps Chain configuration after within-temperature steps. -#' -#' @param swaps Chain configuration after among-temperature swaps. -#' -#' @return \code{list} of ptMCMC objects: change points (\code{$cpts}), -#' log-likelihoods (\code{$lls}), chain ids (\code{$ids}), step acceptances -#' (\code{$step_accepts}), and swap acceptances (\code{$swap_accepts}). -#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' ids <- prep_ids(TS_control()) -#' for(i in 1:TS_control()$nit){ -#' steps <- step_chains(i, cpts, inputs) -#' swaps <- swap_chains(steps, inputs, ids) -#' saves <- update_saves(i, saves, steps, swaps) -#' cpts <- update_cpts(cpts, swaps) -#' ids <- update_ids(ids, swaps) -#' } -#' process_saves(saves, TS_control()) -#' } -#' -#' @export -#' -prep_saves <- function(nchangepoints, control = list()){ - check_nchangepoints(nchangepoints) - check_control(control) - control <- do.call("TS_control", control) - ntemps <- control$ntemps - nit <- control$nit - cpts <- array(NA, c(nchangepoints, ntemps, nit)) - lls <- matrix(NA, ntemps, nit) - ids 
<- matrix(NA, ntemps, nit) - step_accepts <- matrix(FALSE, ntemps, nit) - swap_accepts <- matrix(FALSE, ntemps - 1, nit) - list(cpts = cpts, lls = lls, ids = ids, step_accepts = step_accepts, - swap_accepts = swap_accepts) -} - -#' @rdname prep_saves -#' -#' @export -#' -update_saves <- function(i, saves, steps, swaps){ - saves$cpts[ , , i] <- swaps$changepts - saves$lls[ , i] <- swaps$lls - saves$ids[ , i] <- swaps$ids - saves$step_accepts[ , i] <- steps$accept_step - saves$swap_accepts[ , i] <- swaps$accept_swap - saves -} - -#' @rdname prep_saves -#' -#' @export -#' -process_saves <- function(saves, control = list()){ - control <- do.call("TS_control", control) - nit <- control$nit - iters <- 1:nit - if (control$burnin > 0){ - iters <- iters[-(1:control$burnin)] - } - niters <- length(iters) - thin_interval <- ceiling(1/control$thin_frac) - iters_thinned <- seq(1, niters, by = thin_interval) - dims <- c(dim(saves$cpts)[1:2], length(iters_thinned)) - saves$cpts <- array(saves$cpts[ , , iters_thinned], dim = dims) - saves$lls <- saves$lls[, iters_thinned] - saves$ids <- saves$ids[, iters_thinned] - saves$step_accepts <- saves$step_accepts[ , iters_thinned] - saves$swap_accepts <- saves$swap_accepts[ , iters_thinned] - saves -} - -#' @title Initialize and update the change point matrix used in the ptMCMC -#' algorithm -#' -#' @description Each of the chains is initialized by \code{prep_cpts} using a -#' draw from the available times (i.e. assuming a uniform prior), the best -#' fit (by likelihood) draw is put in the focal chain with each subsequently -#' worse fit placed into the subsequently hotter chain. \code{update_cpts} -#' updates the change points after every iteration in the ptMCMC algorithm. 
-#' -#' @param data \code{data.frame} including [1] the time variable (indicated -#' in \code{timename}), [2] the predictor variables (required by -#' \code{formula}) and [3], the multinomial response variable (indicated in -#' \code{formula}) as verified by \code{\link{check_timename}} and -#' \code{\link{check_formula}}. Note that the response variables should be -#' formatted as a \code{data.frame} object named as indicated by the -#' \code{response} entry in the \code{control} list, such as \code{gamma} -#' for a standard TS analysis on LDA output. -#' -#' @param formula \code{formula} defining the regression relationship between -#' the change points, see \code{\link[stats]{formula}}. Any -#' predictor variable included must also be a column in -#' \code{data} and any (multinomial) response variable must be a set of -#' columns in \code{data}, as verified by \code{\link{check_formula}}. -#' -#' @param nchangepoints \code{integer} corresponding to the number of -#' change points to include in the model. 0 is a valid input (corresponding -#' to no change points, so a singular time series model), and the current -#' implementation can reasonably include up to 6 change points. The -#' number of change points is used to dictate the segmentation of the data -#' for each continuous model and each LDA model. -#' -#' @param timename \code{character} element indicating the time variable -#' used in the time series. Defaults to \code{"time"}. The variable must be -#' integer-conformable or a \code{Date}. If the variable named -#' is a \code{Date}, the input is converted to an integer, resulting in the -#' timestep being 1 day, which is often not desired behavior. -#' -#' @param weights Optional class \code{numeric} vector of weights for each -#' document. Defaults to \code{NULL}, translating to an equal weight for -#' each document. 
When using \code{multinom_TS} in a standard LDATS -#' analysis, it is advisable to weight the documents by their total size, -#' as the result of \code{\link[topicmodels]{LDA}} is a matrix of -#' proportions, which does not account for size differences among documents. -#' For most models, a scaling of the weights (so that the average is 1) is -#' most appropriate, and this is accomplished using -#' \code{\link{document_weights}}. -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @param cpts The existing matrix of change points. -#' -#' @param swaps Chain configuration after among-temperature swaps. -#' -#' @return \code{list} of [1] \code{matrix} of change points (rows) for -#' each temperature (columns) and [2] \code{vector} of log-likelihood -#' values for each of the chains. 
-#' -#' @examples -#' \donttest{ -#' data(rodents) -#' document_term_table <- rodents$document_term_table -#' document_covariate_table <- rodents$document_covariate_table -#' LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] -#' data <- document_covariate_table -#' data$gamma <- LDA_models@gamma -#' weights <- document_weights(document_term_table) -#' data <- data[order(data[,"newmoon"]), ] -#' saves <- prep_saves(1, TS_control()) -#' inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, -#' TS_control()) -#' cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) -#' ids <- prep_ids(TS_control()) -#' for(i in 1:TS_control()$nit){ -#' steps <- step_chains(i, cpts, inputs) -#' swaps <- swap_chains(steps, inputs, ids) -#' saves <- update_saves(i, saves, steps, swaps) -#' cpts <- update_cpts(cpts, swaps) -#' ids <- update_ids(ids, swaps) -#' } -#' } -#' -#' @export -#' -prep_cpts <- function(data, formula, nchangepoints, timename, weights, - control = list()){ - - check_formula(data, formula) - check_nchangepoints(nchangepoints) - check_weights(weights) - check_timename(data, timename) - check_control(control) - control <- do.call("TS_control", control) - temps <- prep_temp_sequence(control) - ntemps <- length(temps) - min_time <- min(data[ , timename]) - max_time <- max(data[ , timename]) - times <- seq(min_time, max_time, 1) - avail_times <- times[-c(1, length(times))] - cps <- matrix(NA, nrow = nchangepoints, ncol = ntemps) - for (i in 1:ntemps){ - cp_times <- sort(sample(avail_times, nchangepoints, replace = FALSE)) - cps[ , i] <- cp_times - } - lls <- rep(NA, ntemps) - for (i in 1:ntemps){ - modfit <- multinom_TS(data, formula, cps[ ,i], timename, weights, control) - lls[i] <- modfit$logLik - } - cps <- cps[ , order(lls, decreasing = TRUE), drop = FALSE] - lls <- sort(lls, decreasing = TRUE) - - out <- list(cps, lls) - names(out) <- c("changepts", "lls") - out -} - -#' @rdname prep_cpts -#' -#' @export -#' -update_cpts <- 
function(cpts, swaps){ - list(changepts = swaps$changepts, lls = swaps$lls) -} - -#' @title Prepare the ptMCMC temperature sequence -#' -#' @description Create the series of temperatures used in the ptMCMC -#' algorithm. -#' \cr \cr -#' This function was designed to work within \code{\link{TS}} and -#' \code{\link{est_changepoints}} specifically, but has been generalized -#' and would work with any ptMCMC model as long as \code{control} -#' includes the relevant control parameters (and provided that the -#' \code{\link{check_control}} function and its use here are generalized). -#' -#' @param control A \code{list} of parameters to control the fitting of the -#' Time Series model including the parallel tempering Markov Chain -#' Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -#' \code{\link{TS_control}}. -#' -#' @return \code{vector} of temperatures. -#' -#' @examples -#' prep_temp_sequence() -#' -#' @export -#' -prep_temp_sequence <- function(control = list()){ - check_control(control) - control <- do.call("TS_control", control) - ntemps <- control$ntemps - penultimate_temp <- control$penultimate_temp - ultimate_temp <- control$ultimate_temp - q <- control$q - sequence <- seq(0, log2(penultimate_temp), length.out = ntemps - 1) - log_temps <- sequence^(1 + q) / log2(penultimate_temp)^q - c(2^(log_temps), ultimate_temp) -} diff --git a/R/simulate.R b/R/simulate.R index 869e2b2d..78bbc3b6 100644 --- a/R/simulate.R +++ b/R/simulate.R @@ -1,10 +1,25 @@ -#' @title Simulate LDA data from an LDA structure given parameters +#' @title Simulate LDA_TS data from LDA and TS model structures and parameters #' -#' @description For a given set of parameters \code{alpha} and \code{Beta} and -#' document-specific total word counts, simulate a document-by-term matrix. -#' Additional structuring variables (the numbers of topics (k), -#' documents (M), terms (V)) are inferred from input objects. 
+#' @description
+#' \code{simulate_LDA_data} for a given set of parameters \code{alpha} and
+#' \code{Beta} and document-specific total word counts, simulate a
+#' document-by-term matrix. \cr
+#' Additional structuring variables (the numbers of topics (k),
+#' documents (M), terms (V)) are inferred from input objects. \cr \cr
+#' \code{simulate_TS_data} for a given set of covariates \code{X};
+#' parameters \code{Eta}, \code{rho}, and \code{err}; and
+#' document-specific time stamps \code{tD}, simulate a document-by-topic
+#' matrix. Additional structuring variables (numbers of topics (k),
+#' documents (M), segments (S), and covariates per segment (C)) are
+#' inferred from input objects. \cr \cr
+#' \code{simulate_LDA_TS_data} for a given set of covariates \code{X};
+#' parameters \code{Beta}, \code{Eta}, \code{rho}, and \code{err}; and
+#' document-specific time stamps \code{tD} and lengths \code{N},
+#' simulate a document-by-term matrix. \cr
+#' Additional structuring variables (the numbers of topics (k), terms (V),
+#' documents (M), segments (S), and covariates per segment (C))
+#' are inferred from input objects.
 #'
 #' @param N A vector of document sizes (total word counts). Must be integer
 #' conformable. Is used to infer the total number of documents.
@@ -13,7 +28,28 @@
 #' terms within topics. Dimension: k x V (number of topics x number of
 #' terms). Used to infer both (k) and (V). Must be non-negative and sum to
 #' 1 within topics.
+#'
+#' @param X \code{matrix} of covariates, dimension M (number of documents) x
+#' C (number of covariates, including the intercept) (a.k.a. the design
+#' matrix).
+#'
+#' @param Eta \code{matrix} of regression parameters across the segments,
+#' dimension: SC (number of segments x number of covariates, including the
+#' intercept) x k (number of topics).
+#'
+#' @param rho Vector of integer-conformable time locations of changepoints or
+#' \code{NULL} if no changepoints. Used to determine the number of
+#' segments. 
Must exist within the bounds of the times of the documents,
+#' \code{tD}.
+#'
+#' @param tD Vector of integer-conformable times of the documents. Must be
+#' of length M (as determined by \code{X}).
 #'
+#' @param err Additive error on the link-scale. Must be a non-negative
+#' \code{numeric} value. Default value of \code{0} indicates no error.
+#'
+#' @param seed Input to \code{\link{set.seed}}.
+#'
 #' @param alpha Single positive numeric value for the Dirichlet distribution
 #' parameter defining topics within documents. To specifically define
 #' document topic probabilities, use \code{Theta}.
@@ -22,23 +58,32 @@
 #' documents. Dimension: M x k (documents x topics). Must be non-negative
 #' and sum to 1 within documents. To generally define document topic
 #' probabilities, use \code{alpha}.
-#'
-#' @param seed Input to \code{\link{set.seed}}.
 #'
-#' @return A document-by-term \code{matrix} of counts (dim: M x V).
+#' @param invlink \code{function} name for the inverse link function.
+#' Currently available are \code{\link{softmax}} and the inverses of
+#' the ILR, ALR, and CLR transforms (\code{\link[compositions]{ilrInv}},
+#' \code{\link[compositions]{alrInv}}, and
+#' \code{\link[compositions]{clrInv}}).
 #'
-#' @examples
-#' N <- c(10, 22, 15, 31)
-#' alpha <- 1.2
-#' Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE)
-#' sim_LDA_data(N, Beta, alpha = alpha)
-#' Theta <- matrix(c(0.2, 0.8, 0.8, 0.2, 0.5, 0.5, 0.9, 0.1), 4, 2,
-#' byrow = TRUE)
-#' sim_LDA_data(N, Beta, Theta = Theta)
+#' @return
+#' \code{simulate_LDA_data}: A document-by-term \code{matrix} of counts
+#' (dim: M x V). \cr \cr
+#' \code{simulate_TS_data}: A document-by-topic \code{matrix} of
+#' proportions (dim: M x k). \cr \cr
+#' \code{simulate_LDA_TS_data}: A document-by-term \code{matrix} of counts
+#' (dim: M x V). 
+#' +#' @name simulate_data +#' + + + +#' @rdname simulate_data #' #' @export #' -sim_LDA_data <- function(N, Beta, alpha = NULL, Theta = NULL, seed = NULL){ +simulate_LDA_data <- function(N, Beta, alpha = NULL, Theta = NULL, + seed = NULL){ if (length(dim(N)) > 1 | !is.numeric(N) || !all(N %% 1 == 0)){ stop("N must be a vector of integer conformable values") } @@ -88,47 +133,14 @@ sim_LDA_data <- function(N, Beta, alpha = NULL, Theta = NULL, seed = NULL){ w } -#' @title Simulate TS data from a TS model structure given parameters -#' -#' @description For a given set of covariates \code{X}; parameters \code{Eta}, -#' \code{rho}, and \code{err}; and document-specific time stamps \code{tD}, -#' simulate a document-by-topic matrix. Additional structuring variables -#' (numbers of topics (k), documents (M), segments (S), and -#' covariates per segment (C)) are inferred from input objects. -#' -#' @param X \code{matrix} of covariates, dimension M (number of documents) x -#' C (number of covariates, including the intercept) (a.k.a. the design -#' matrix). -#' -#' @param Eta \code{matrix} of regression parameters across the segments, -#' dimension: SC (number of segments x number of covariates, including the -#' intercept) x k (number of topics). -#' -#' @param rho Vector of integer-conformable time locations of changepoints or -#' \code{NULL} if no changepoints. Used to determine the number of -#' segments. Must exist within the bounds of the times of the documents, -#' \code{tD}. -#' -#' @param tD Vector of integer-conformable times of the documents. Must be -#' of length M (as determined by \code{X}). -#' -#' @param err Additive error on the link-scale. Must be a non-negative -#' \code{numeric} value. Default value of \code{0} indicates no error. -#' -#' @param seed Input to \code{\link{set.seed}}. + + +#' @rdname simulate_data #' -#' @return A document-by-topic \code{matrix} of probabilities (dim: M x k). 
-#' -#' @examples -#' tD <- c(1, 3, 4, 6) -#' rho <- 3 -#' X <- cbind(rep(1, 4), 1:4) -#' Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) -#' sim_TS_data(X, Eta, rho, tD, err = 1) -#' #' @export #' -sim_TS_data <- function(X, Eta, rho, tD, err = 0, seed = NULL){ +simulate_TS_data <- function(X, Eta, rho, tD, err = 0, seed = NULL, + invlink = softmax){ if (length(dim(tD)) > 1 | !is.numeric(tD) || !all(tD %% 1 == 0)){ stop("tD must be a vector of integer conformable values") @@ -152,74 +164,34 @@ sim_TS_data <- function(X, Eta, rho, tD, err = 0, seed = NULL){ C <- nrow(Eta) / S s_start <- c(min(tD), rho + 1) s_end <- c(rho, max(tD)) - EGamma <- matrix(NA, nrow = nrow(X), ncol = ncol(Eta)) + + in1 <- which(tD >= s_start[1] & tD <= s_end[1]) + in2 <- (C * (1 - 1) + 1):(C * 1) + X_Eta <- matrix(data = X[in1, ], nrow = length(in1)) %*% Eta[in2,] + eps <- rnorm(length(X_Eta), 0, err) + val <- X_Eta + eps + tester <- do.call(invlink, list(val)) + EGamma <- matrix(NA, nrow = nrow(X), ncol = ncol(tester)) for(s in 1:S){ in1 <- which(tD >= s_start[s] & tD <= s_end[s]) in2 <- (C * (s - 1) + 1):(C * s) X_Eta <- matrix(data = X[in1, ], nrow = length(in1)) %*% Eta[in2,] eps <- rnorm(length(X_Eta), 0, err) - EGamma[in1,] <- softmax(X_Eta + eps) + val <- X_Eta + eps + EGamma[in1,] <- do.call(invlink, list(val)) } EGamma } - -#' @title Simulate LDA_TS data from LDA and TS model structures and parameters -#' -#' @description For a given set of covariates \code{X}; parameters -#' \code{Beta}, \code{Eta}, \code{rho}, and \code{err}; and -#' document-specific time stamps \code{tD} and lengths \code{N}), -#' simulate a document-by-topic matrix. -#' Additional structuring variables (the numbers of topics (k), terms (V), -#' documents (M), segments (S), and covariates per segment (C)) -#' are inferred from input objects. +#' @rdname simulate_data #' -#' @param N A vector of document sizes (total word counts). Must be integer -#' conformable. 
Is used to infer the total number of documents. -#' -#' @param Beta \code{matrix} of categorical distribution parameters defining -#' terms within topics. Dimension: k x V (number of topics x number of -#' terms). Used to infer both (k) and (V). Must be non-negative and sum to -#' 1 within topics. -#' -#' @param X \code{matrix} of covariates, dimension M (number of documents) x -#' C (number of covariates, including the intercept) (a.k.a the design -#' matrix). -#' -#' @param Eta \code{matrix} of regression parameters across the segments, -#' dimension: SC (number of segments x number of covariates, including the -#' intercept) x k (number of topics). -#' -#' @param rho Vector of integer-conformable time locations of changepoints or -#' \code{NULL} if no changepoints. Used to determine the number of -#' segments. Must exist within the bounds of the times of the documents, -#' \code{tD}. -#' -#' @param tD Vector of integer-conformable times of the documents. Must be -#' of length M (as determined by \code{X}). -#' -#' @param err Additive error on the link-scale. Must be a non-negative -#' \code{numeric} value. Default value of \code{0} indicates no error. -#' -#' @param seed Input to \code{\link{set.seed}}. -#' -#' @return A document-by-term \code{matrix} of counts (dim: M x V). 
-#' -#' @examples -#' N <- c(10, 22, 15, 31) -#' tD <- c(1, 3, 4, 6) -#' rho <- 3 -#' X <- cbind(rep(1, 4), 1:4) -#' Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) -#' Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) -#' err <- 1 -#' sim_LDA_TS_data(N, Beta, X, Eta, rho, tD, err) -#' #' @export #' -sim_LDA_TS_data <- function(N, Beta, X, Eta, rho, tD, err = 0, seed = NULL){ - EGamma <- sim_TS_data(X, Eta, rho, tD, err, seed) - sim_LDA_data(N, Beta, Theta = EGamma, seed = seed) +simulate_LDA_TS_data <- function(N, Beta, X, Eta, rho, tD, err = 0, + seed = NULL, invlink = softmax){ + EGamma <- simulate_TS_data(X = X, Eta = Eta, rho = rho, tD = tD, err = err, + seed = seed, invlink = invlink) + simulate_LDA_data(N = N, Beta = Beta, Theta = EGamma, seed = seed) } diff --git a/R/utilities.R b/R/utilities.R index 667002d6..ba5f3309 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1,3 +1,204 @@ +#' @title Soften a flexible function call to errors +#' +#' @description Wrapping around \code{\link{do.call}}, this function allows +#' for a simple "softening" where errors are returned as text, rather +#' than causing a break in the encompassing pipeline. +#' +#' @details As this is a basic wrapper on \code{\link{do.call}}, the basic +#' rules of its usage still apply: \cr +#' If \code{quote = FALSE} (the default), then the arguments are evaluated +#' in the calling environment and not in \code{envir}. \cr +#' If \code{quote = TRUE}, each argument is \code{\link{quote}}d +#' so that the effect of argument evaluation is to remove the quotes +#' leaving the original arguments unevaluated when the call is built. \cr +#' The behavior of some functions, such as \code{\link{substitute}}, will +#' not be the same for functions evaluated using \code{\link{do.call}} +#' and thus \code{soft_call} as if they were evaluated from the +#' interpreter. The precise semantics are currently undefined and subject +#' to change. 
+#' +#' @param what \code{function} or a non-empty \code{character} string +#' naming the \code{function} to be called. \cr +#' See \code{\link{do.call}}. +#' +#' @param args \code{list} of arguments to the \code{what} call. The names +#' attribute of the \code{list} gives the argument names.\cr +#' See \code{\link{do.call}}. +#' +#' @param quote \code{logical} value indicating whether to quote the +#' arguments. \cr +#' See \code{\link{do.call}}. +#' +#' @param envir \code{environment} within which to evaluate the call. This +#' argument will be most useful if \code{what} is a \code{character} +#' string and the arguments are symbols or \code{quote}d expressions.\cr +#' See \code{\link{do.call}}. +#' +#' @param soften \code{logical} value indicating whether to soften any errors +#' in the running of \code{what}. \cr +#' +#' @return Either the result of the (evaluated) call of \code{what} or a +#' \code{list} of the error message that resulted (if \code{soften = TRUE}). +#' +#' @export +#' +soft_call <- function(what = function(x){invisible(NULL)}, args = list(NULL), + quote = FALSE, envir = parent.frame(), soften = FALSE){ + if(list_depth(args) == 0){ + args <- list(args) + } + if(soften){ + tryCatch(do.call(what = what, args = args, quote = quote, envir = envir), + warning = function(x){eval(x$call)}, + error = function(x = list()){list(error = x$message)}) + } else{ + do.call(what = what, args = args, quote = quote, envir = envir) + } +} + + + +#' @title Initialize and tick through the progress bar +#' +#' @description \code{prep_pbar} creates and \code{update_pbar} steps +#' through the progress bars (if desired) in, e.g., \code{\link{TS}}. +#' +#' @param control A \code{list} of parameters to control the fitting of the +#' iterative model. +#' +#' @param type \code{character} value of possible types of progress bars. +#' Currently available options are "rho" (for change point locations) and +#' "eta" (for time series regressors). 
+#' +#' @param nr \code{integer} number of unique realizations, needed when +#' \code{type} = "eta". +#' +#' @param pbar The progress bar object returned from \code{prep_pbar}. +#' +#' @return +#' \code{prep_pbar}: the initialized progress bar object. \cr \cr +#' \code{update_pbar}: the ticked-forward \code{pbar}. +#' +#' @export +#' +prep_pbar <- function(control = list(), type = "rho", nr = NULL){ + if (!(type %in% c("eta", "rho"))){ + stop("type must be eta or rho") + } + if (!is.null(nr)){ + if (!is.numeric(nr) || any(nr %% 1 != 0)){ + stop("nr must be integer-valued") + } + } + form <- " [:bar] :percent eta: :eta" + if (type == "rho"){ + msg <- " - estimating change point distribution" + out <- progress_bar$new(form, control$nit, width = 60) + } + if (type == "eta"){ + msg <- " - estimating regressor distribution" + out <- progress_bar$new(form, nr, width = 60) + } + messageq(msg, control$quiet) + out +} + +#' @rdname prep_pbar +#' +#' @export +#' +update_pbar <- function(pbar, control = list()){ + if (!("progress_bar" %in% class(pbar))){ + stop("pbar must be of class progress_bar") + } + if (control$quiet){ + return() + } + pbar$tick() +} + + +#' @title Determine the depth of a list +#' +#' @description Evaluate an input for the depth of its nesting. +#' +#' @details If \code{xlist = list()}, then technically the input value is a +#' list, but is empty (of length \code{0}), so depth is returned as \code{0}. +#' +#' @param xlist Focal input \code{list}. +#' +#' @return \code{integer} value of the depth of the list. 
+#' +#' @examples +#' list_depth("a") +#' list_depth(list()) +#' list_depth(list("a")) +#' list_depth(list(list("a"))) +#' +#' @export +#' +list_depth <- function(xlist){ + xx <- match.call() + xxx <- deparse(xx[[2]]) + if(xxx == "list()"){ + 0L + } else if (inherits(xlist, "data.frame")){ + 0L + } else if (is.list(xlist)){ + 1L + max(sapply(xlist, list_depth)) + } else { + 0L + } +} + + +#' @title Update a list's elements +#' +#' @description Update a list with new values for elements +#' +#' @param orig_list \code{list} to be updated with \code{...}. +#' +#' @param ... Named elements to update in \code{orig_list} +#' +#' @return Updated \code{list}. +#' +#' @examples +#' orig_list <- list(a = 1, b = 3, c = 4) +#' update_list(orig_list) +#' update_list(orig_list, a = "a") +#' update_list(orig_list, a = 10, b = NULL) +#' +#' @export +#' +update_list <- function(orig_list = list(), ...){ + if(!is.list(orig_list)){ + stop("orig_list must be a list", call. = FALSE) + } + update_elems <- list(...) + nupdate_elems <- length(update_elems) + norig_elems <- length(orig_list) + update_list <- vector("list", length = norig_elems) + names(update_list) <- names(orig_list) + if(norig_elems > 0){ + for(i in 1:norig_elems){ + if(!is.null(orig_list[[i]])){ + update_list[[i]] <- orig_list[[i]] + } + } + } + if(nupdate_elems > 0){ + names_update_elems <- names(update_elems) + for(i in 1:nupdate_elems){ + if(!is.null(update_elems[[i]])){ + update_list[[names_update_elems[i]]] <- update_elems[[i]] + } + } + } + update_list +} + + + #' @title Calculate the log-sum-exponential (LSE) of a vector #' #' @description Calculate the exponent of a vector (offset by the max), sum @@ -133,9 +334,8 @@ modalvalue <- function(x){ #' number of words in a document within the corpus (mean value = 1). #' #' @param document_term_table Table of observation count data (rows: -#' documents, columns: terms. 
May be a class \code{matrix} or -#' \code{data.frame} but must be conformable to a matrix of integers, -#' as verified by \code{\link{check_document_term_table}}. +#' documents, columns: terms. May be a \code{matrix} or +#' \code{data.frame} but must be conformable to a matrix of integers. #' #' @return Vector of weights, one for each document, with the average sample #' receiving a weight of 1.0. @@ -181,7 +381,8 @@ messageq <- function(msg = NULL, quiet = FALSE){ #' @title Create a properly symmetric variance covariance matrix #' #' @description A wrapper on \code{\link[stats]{vcov}} to produce a symmetric -#' matrix. If the default matrix returned by \code{\link[stats]{vcov}} is +#' matrix. \cr +#' If the default matrix returned by \code{\link[stats]{vcov}} is #' symmetric it is returned simply. If it is not, in fact, symmetric #' (as occurs occasionally with \code{\link[nnet]{multinom}} applied to #' proportions), the matrix is made symmetric by averaging the lower and @@ -266,122 +467,17 @@ memoise_fun <- function(fun, memoise_tf = TRUE){ if (!("function" %in% class(fun))){ stop("fun is not a function") } - if (!("logical" %in% class(memoise_tf))){ + if (!(is.null(memoise_tf) || "logical" %in% class(memoise_tf))){ stop("memoise_tf is not logical") } - if (memoise_tf){ + if (!is.null(memoise_tf) && memoise_tf){ fun <- memoise(fun) } fun } - -#' @title Check that a control list is proper -#' -#' @description Check that a list of controls is of the right class. -#' -#' @param control Control list to evaluate. -#' -#' @param eclass Expected class of the list to be evaluated. -#' -#' @return an error message is thrown if the input is improper, otherwise -#' \code{NULL}. 
-#' -#' @examples -#' check_control(list()) -#' -#' @export -#' -check_control <- function(control, eclass = "list"){ - if (!(eclass %in% class(control))){ - stop(paste0("control is not a ", eclass)) - } - return() -} - - -#' @title Check that document term table is proper -#' -#' @description Check that the table of observations is conformable to -#' a matrix of integers. -#' -#' @param document_term_table Table of observation count data (rows: -#' documents, columns: terms. May be a class \code{matrix} or -#' \code{data.frame} but must be conformable to a matrix of integers, -#' as verified by \code{\link{check_document_term_table}}. -#' -#' @return an error message is thrown if the input is improper, otherwise -#' \code{NULL}. -#' -#' @examples -#' data(rodents) -#' check_document_term_table(rodents$document_term_table) -#' -#' @export -#' -check_document_term_table <- function(document_term_table){ - document_term_table_m <- as.matrix(document_term_table) - if(any(document_term_table_m %% 1 != 0)){ - dtt <- "document_term_table" - msg <- paste0(dtt, " is not conformable to a matrix of integers") - stop(msg) - } - return() -} - -#' @title Check that topics vector is proper -#' -#' @description Check that the vector of numbers of topics is conformable to -#' integers greater than 1. -#' -#' @param topics Vector of the number of topics to evaluate for each model. -#' Must be conformable to \code{integer} values. -#' -#' @return an error message is thrown if the input is improper, otherwise -#' \code{NULL}. -#' -#' @examples -#' check_topics(2) -#' -#' @export -#' -check_topics <- function(topics){ - if (!is.numeric(topics) || any(topics %% 1 != 0)){ - stop("topics vector must be integers") - } - if (any(topics < 2)){ - stop("minimum number of topics currently allowed is 2") - } - return() -} - -#' @title Check that nseeds value or seeds vector is proper -#' -#' @description Check that the vector of numbers of seeds is conformable to -#' integers greater than 0. 
-#' -#' @param nseeds \code{integer} number of seeds (replicate starts) to use for -#' each value of \code{topics} in the LDAs. Must be conformable to a -#' positive \code{integer} value. -#' -#' @return an error message is thrown if the input is improper, otherwise -#' \code{NULL}. -#' -#' @examples -#' check_seeds(1) -#' check_seeds(2) -#' -#' @export -#' -check_seeds <- function(nseeds){ - if (!is.numeric(nseeds) || any(nseeds %% 1 != 0)){ - stop("nseeds vector must be integers") - } - return() -} - # provides a functionality that can be used in testing for non-symmetric # vcov matrix -vcov.dummy <- function(object, ...){ +vcov.test <- function(object, ...){ matrix(c(1, 2, 2.1, 3), 2, 2) } diff --git a/_pkgdown.yml b/_pkgdown.yml index f8ed41ce..d0d288db 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -11,9 +11,9 @@ navbar: text: "Vignettes" menu: - text: "Rodents example" - href: rodents-example.html + href: articles/rodents-example.html - text: "Comparison to Christensen et al." - href: paper-comparison.html + href: articles/paper-comparison.html - text: "LDATS codebase" href: articles/LDATS_codebase.html reference: @@ -31,7 +31,7 @@ navbar: reference: - title: "Joint LDA and Time Series" - desc: "Functions for top-level LDATS modeling" + desc: "Functions for LDA_TS-level modeling" contents: - LDA_TS - package_LDA_TS diff --git a/doc/LDATS_codebase.Rmd b/doc/LDATS_codebase.Rmd index 0c736efa..5771b982 100644 --- a/doc/LDATS_codebase.Rmd +++ b/doc/LDATS_codebase.Rmd @@ -23,13 +23,12 @@ today <- Sys.Date() ## Overview -This vignette outlines the code base for the **LDATS** package. It was -constructed using **LDATS** version `r vers` on `r today`. +This vignette outlines the code base for the **LDATS** package. +It was constructed using **LDATS** version `r vers` on `r today`. 
## Installation -To obtain the most recent version of **LDATS**, install the most recent -version from GitHub: +To obtain the most recent version of **LDATS**, install the most recent version from GitHub: ```{r, eval=FALSE} install.packages("devtools") @@ -38,100 +37,84 @@ devtools::install_github("weecology/LDATS") ## Analytical Pipeline -The full LDATS analysis can be executed using the `LDA_TS` function, -which is the top-level of the coding pipeline's hierarchy of -functions: +The full LDATS analysis can be executed using the `LDA_TS` function, which is the top-level of the coding pipeline's hierarchy of functions: * `LDA_TS()` - * `check_LDA_TS_inputs()` - * `check_timename()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_weights()` - * `check_control()` - * `check_document_term_table()` - * `check_document_covariate_table()` - * `check_topics()` - * `check_seeds()` - * `LDA_set()` - * `check_LDA_TS_inputs()` - * `check_control()` - * `check_document_term_table()` - * `check_topics()` - * `check_seeds()` - * loop over models - * `prep_LDA_control()` - * `LDA()` - * `select_LDA()` - * applies the `measurer` and `selector` functions supplied via `LDA_controls_list()` - * `TS_on_LDA()` - * `check_LDA_TS_inputs()` - * `check_LDA_models()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_timename()` - * `check_control()` - * `check_weights()` - * `check_document_covariate_table()` - * `expand_TS()` - * loop over models - * `prep_TS_data()` - * `TS()` - * `check_TS_inputs()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_control()` - * `check_weights()` - * `est_changepoints()` - * `prep_saves()` - * `prep_ptMCMC_inputs()` - * `prep_cpts()` - * loop over chains - * `multinom_TS()` - * `prep_ids()` - * `prep_pbar()` - * loop over iterations - * `update_pbar()` - * `step_chains()` - * `propose_step()` - * `proposed_step_mods()` - * loop over chains - * `multinom_TS()` - * `eval_step()` - * `take_step()` - * `swap_chains()` - 
* loop over chain neighbors - * `update_saves()` - * `update_cpts()` - * `update_ids()` - * `est_regressors()` - * loop over unique realizations of change point locations - * `multinom_TS()` - * loop over chunks - * `mirror_vcov()` - * `rmvnorm()` - * `summarize_TS()` - * `diagnose_ptMCMC()` - * `count_trips()` - * `summarize_rhos()` - * `measure_rho_vcov()` - * `summarize_etas()` - * `measure_eta_vcov()` - * `select_TS()` - * applies the `measurer` and `selector` functions supplied via `TS_controls_list()` - * `package_LDA_TS()` - -Each component model's function (`LDA_set()` and `TS()`) can be run -independently, as well. + * `LDA_TS_control()` + * `LDA_control()` + * `TS_control()` + * `LDA()` + * `prepare_LDA()` + * `check_LDA()` + * `LDA_control()` + * `conform_data()` + * `run_LDA()` + * `LDA_call()` (replicated for each model) + * `LDA_msg()` + * `soft_call()` using `LDA$control$model` with `LDA$control$model_args` + * `topicmodels::LDA()` (*default*) + * `package_LDA()` + * `select_LDA()` + * `soft_call()` using `LDA$control$selector with LDA$control$selector_args + * `min()` (*default*) + * `measure_LDA()` (replicated for each model) + * `soft_call()` using `LDA$control$measurer` with `LDA$control$measurer_args` + * `AIC()` (*default*) + + * `TS()` + * `prepare_TS()` + * `TS_control()` + * `run_TS()` + * `TS_call()` (replicated for each model) + * `TS_msg()` + * `TS$control$model with TS$control$model_args + * `sequential_TS_control()` + * `est_changepoints()` + * `TS$control$method` with `TS$control$method_args` + * `ldats_classic_control()` + * `prep_saves()` + * `prep_ptMCMC_inputs()` + * `prep_temp_sequence()` + * `prep_proposal_dist()` + * `prep_cpts()` + * `prep_temp_sequence()` + * `TS$control$response` with `TS$control$response_args` + * `prep_ids()` + * `prep_pbar()` + * `update_pbar()` + * `step_chains()` + * `propose_step()` + * `proposed_step_mods()` + * `TS$control$response` with `TS$control$response_args` + * `eval_step()` + * `accept_step()` 
+ * `swap_chains()` + * `update_saves()` + * `update_cpts()` + * `update_ids()` + * `process_saves()` + * `count_trips()` + * `est_regressors()` + * `TS$control$response` with `TS$control$response_args` + * `package_sequential_TS()` + * `summarize_rhos()` + * `measure_rho_vcov()` + * `summarize_eta()` + * `measure_eta_vcov()` + * `package_TS()` + * `select_TS()` + * `TS$control$selector` with `TS$control$selector_args` + * `measure_TS()` (replicated for each model) + * `TS$control$measurer` with `TS$control$measurer_args` + * `package_LDA_TS()` + + +Each component model's function (`LDA()` and `TS()`) can be run independently, as well. ## Controls Lists -To minimize the length of argument lists and facilitate simple default usage -throughout the pipeline, we implement an options/controls list approach, where -each of the main functions (`LDA_TS`, `LDA_set`, and `TS`) and its subfunctions -have a `control` argument that takes a `list` to replace the defaults returned -by its `_control` function: +To minimize the length of argument lists and facilitate simple default usage throughout the pipeline, we implement an options/controls list approach, where each of the main functions (`LDA_TS`, `LDA`, and `TS`) and their subfunctions have a `control` argument that takes a `list` to replace the defaults returned by its `_control` function: * `LDA_TS_control()` -* `LDA_set_control()` +* `LDA_control()` * `TS_control()` \ No newline at end of file diff --git a/doc/LDATS_codebase.html b/doc/LDATS_codebase.html index 1de2685f..7c620576 100644 --- a/doc/LDATS_codebase.html +++ b/doc/LDATS_codebase.html @@ -310,7 +310,7 @@

Juniper L. Simonis

Overview

-

This vignette outlines the code base for the LDATS package. It was constructed using LDATS version 0.3.0 on 2019-10-12.

+

This vignette outlines the code base for the LDATS package. It was constructed using LDATS version 0.2.5 on 2020-04-21.

Installation

@@ -324,145 +324,75 @@

Analytical Pipeline

  • LDA_TS()
      -
    • check_LDA_TS_inputs() +
    • LDA_TS_control()
        -
      • check_timename()
      • -
      • check_formulas()
      • -
      • check_nchangepoints()
      • -
      • check_weights()
      • -
      • check_control()
      • -
      • check_document_term_table()
      • -
      • check_document_covariate_table()
        -
      • -
      • check_topics()
      • -
      • check_seeds()
      • -
    • -
    • LDA_set() -
        -
      • check_LDA_TS_inputs() -
          -
        • check_control()
        • -
        • check_document_term_table()
        • -
        • check_topics()
        • -
        • check_seeds()
        • -
      • -
      • loop over models -
          -
        • prep_LDA_control()
        • -
        • LDA()
        • -
      • -
    • -
    • select_LDA() -
        -
      • applies the measurer and selector functions supplied via LDA_controls_list()
      • -
    • -
    • TS_on_LDA() -
        -
      • check_LDA_TS_inputs() -
          -
        • check_LDA_models()
        • -
        • check_formulas()
        • -
        • check_nchangepoints()
        • -
        • check_timename()
        • -
        • check_control()
        • -
        • check_weights()
        • -
        • check_document_covariate_table()
          -
        • +
        • LDA_control()
        • +
        • TS_control()
      • -
      • expand_TS()
      • -
      • loop over models -
          -
        • prep_TS_data()
        • -
        • TS() +
        • LDA()
            -
          • check_TS_inputs() +
          • prepare_LDA()
              -
            • check_formulas()
            • -
            • check_nchangepoints()
            • -
            • check_control()
            • -
            • check_weights()
            • +
            • check_LDA()
            • +
            • LDA_control()
            • +
            • conform_data()
          • -
          • est_changepoints() +
          • run_LDA()
              -
            • prep_saves()
            • -
            • prep_ptMCMC_inputs()
            • -
            • prep_cpts() +
            • LDA_call() (replicated for each model)
                -
              • loop over chains +
              • LDA_msg()
              • +
              • soft_call() using LDA$control$model with LDA$control$model_args
                  -
                • multinom_TS()
                • +
                • topicmodels::LDA() (default)
            • -
            • prep_ids()
            • -
            • prep_pbar()
            • -
            • loop over iterations +
            • package_LDA()
                -
              • update_pbar()
              • -
              • step_chains() -
                  -
                • propose_step() -
                    -
                  • proposed_step_mods() -
                      -
                    • loop over chains +
                    • select_LDA()
                        -
                      • multinom_TS()
                      • +
                      • soft_call() using LDA$control$selector with LDA$control$selector_args; min() (default); measure_LDA() (replicated for each model); soft_call() using LDA$control$measurer with LDA$control$measurer_args; AIC() (default)
                • -
                • eval_step()
                • -
                • take_step()
              • -
              • swap_chains() +
              • TS()
                  -
                • loop over chain neighbors
                • -
              • -
              • update_saves()
              • -
              • update_cpts()
              • -
              • update_ids()
              • -
            • -
          • -
          • est_regressors() +
          • prepare_TS()
              -
            • loop over unique realizations of change point locations +
            • TS_control()
            • +
            • run_TS()
                -
              • multinom_TS()
              • -
              • loop over chunks +
              • TS_call() (replicated for each model)
                  -
                • mirror_vcov()
                • -
                • rmvnorm()
                • -
              • +
              • TS_msg()
              • +
              • TS$control$model with TS$control$model_args; sequential_TS_control(); est_changepoints(); TS$control$method with TS$control$method_args; ldats_classic_control(); prep_saves(); prep_ptMCMC_inputs(); prep_temp_sequence(); prep_proposal_dist(); prep_cpts(); prep_temp_sequence(); TS$control$response with TS$control$response_args; prep_ids(); prep_pbar(); update_pbar(); step_chains(); propose_step(); proposed_step_mods(); TS$control$response with TS$control$response_args; eval_step(); accept_step(); swap_chains(); update_saves(); update_cpts(); update_ids(); process_saves(); count_trips(); est_regressors(); TS$control$response with TS$control$response_args; package_sequential_TS(); summarize_rhos(); measure_rho_vcov(); summarize_eta(); measure_eta_vcov()
          • -
          • summarize_TS() +
          • package_TS() +
              +
            • select_TS()
                -
              • diagnose_ptMCMC() +
              • TS$control$selector with TS$control$selector_args
              • +
              • measure_TS() (replicated for each model)
                  -
                • count_trips()
                • +
                • TS$control$measurer with TS$control$measurer_args
              • -
              • summarize_rhos()
              • -
              • measure_rho_vcov()
              • -
              • summarize_etas()
              • -
              • measure_eta_vcov()
        • -
        • select_TS() -
            -
          • applies the measurer and selector functions supplied via TS_controls_list()
        • package_LDA_TS()
      • -
    -

    Each component model’s function (LDA_set() and TS()) can be run independently, as well.

    +

    Each component model’s function (LDA() and TS()) can be run independently, as well.

Controls Lists

-

To minimize the length of argument lists and facilitate simple default usage throughout the pipeline, we implement an options/controls list approach, where each of the main functions (LDA_TS, LDA_set, and TS) and its subfunctions have a control argument that takes a list to replace the defaults returned by its _control function:

+

To minimize the length of argument lists and facilitate simple default usage throughout the pipeline, we implement an options/controls list approach, where each of the main functions (LDA_TS, LDA, and TS) and their subfunctions have a control argument that takes a list to replace the defaults returned by its _control function:

  • LDA_TS_control()
  • -
  • LDA_set_control()
  • +
  • LDA_control()
  • TS_control()
diff --git a/doc/rodents-example.R b/doc/rodents-example.R index 46f0fdbf..48f98d62 100644 --- a/doc/rodents-example.R +++ b/doc/rodents-example.R @@ -9,64 +9,74 @@ library(LDATS) vers <- packageVersion("LDATS") today <- Sys.Date() -## ---- eval=FALSE--------------------------------------------------------- +## ----download files, include = FALSE------------------------------------- + vignette_files <- tempdir() + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://github.com/weecology/LDATS-replications/raw/master/output/" + files_to_download <- c("rodents_example_lda_model_set.RDS", "rodents_example_ts_model_set.RDS", + "rodents_example_lda_ts_model_set.RDS") + + for (file in files_to_download) { + download.file(url = paste0(github_path, file), + destfile = file.path(vignette_files, "output", file), + mode = "wb") + } + +## ---- eval = FALSE------------------------------------------------------- # install.packages("devtools") # devtools::install_github("weecology/LDATS") -# library(LDATS) ## ------------------------------------------------------------------------ data(rodents) head(rodents$document_term_table, 10) head(rodents$document_covariate_table, 10) -## ----lda_set, eval =F---------------------------------------------------- -# lda_model_set <- LDA_set(document_term_table = rodents$document_term_table, -# topics = c(2:5), -# nseeds = 10, -# control = list(quiet = TRUE)) -# +## ----lda_set, eval = FALSE----------------------------------------------- +# lda_model_set <- LDA(data = rodents, topics = 2:5, replicates = 10, +# control = list(quiet = TRUE)) -## ----lda set not quiet, eval =F------------------------------------------ -# lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, -# topics = c(2:3), -# nseeds = 2) +## ----save lda model set, include = FALSE, eval = FALSE------------------- +# saveRDS(lda_model_set, file.path(vignette_files, "output", "rodents_example_lda_model_set.RDS")) 
-## ----load lda model set, include = F------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +## ----lda set not quiet, eval = FALSE------------------------------------- +# lda_model_set2 <- LDA(data = rodents, topics = c(2:3), replicates = 2) + +## ----load lda model set, include = FALSE--------------------------------- +lda_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_lda_model_set.Rds")) rm(lda_model_set2) ## ----select LDA---------------------------------------------------------- -selected_lda_model <- select_LDA(lda_model_set) +selected_lda_model <- select_LDA(lda_model_set$LDAs)[[1]] ## ----LDA results--------------------------------------------------------- # Number of topics: -selected_lda_model[[1]]@k +selected_lda_model$topics # Topic composition of communities at each time step # Columns are topics; rows are time steps -head(selected_lda_model[[1]]@gamma) - +head(selected_lda_model$document_topic_table) -## ----plot lda, fig.width=7, fig.height=6--------------------------------- -plot(selected_lda_model[[1]]) +## ----plot lda, fig.width = 7, fig.height = 6----------------------------- +plot(selected_lda_model) +## ----ts set, eval = FALSE------------------------------------------------ +# ts_model_set <- TS(LDAs = lda_model_set, +# formulas = ~ sin_year + cos_year, +# nchangepoints = 0:1, +# timename = "newmoon", +# weights = TRUE, +# control = list(method_args = +# list(control = ldats_classic_control(nit = 1000)))) -## ----ts on lda, eval = F------------------------------------------------- -# changepoint_models <- TS_on_LDA(LDA_models = selected_lda_model, -# document_covariate_table = rodents$document_covariate_table, -# formulas = ~ sin_year + cos_year, -# nchangepoints = c(0:1), -# timename = "newmoon", -# weights = document_weights(rodents$document_term_table), -# control = list(nit = 1000)) -# +## ----save ts model set, include = FALSE, eval = 
FALSE-------------------- +# saveRDS(ts_model_set, file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS")) -## ----reload ts, include = F---------------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds')) +## ----load ts model set, include = FALSE---------------------------------- +ts_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS")) ## ----select ts----------------------------------------------------------- -selected_changepoint_model <- select_TS(changepoint_models) +selected_changepoint_model <- select_TS(ts_model_set$TSs)[[1]] ## ----cpt results--------------------------------------------------------- # Number of changepoints @@ -77,35 +87,39 @@ selected_changepoint_model$rho_summary # Raw estimates for timesteps for each changepoint # Changepoints are columns -head(selected_changepoint_model$rhos) +head(selected_changepoint_model$focal_rhos) ## ----plot cpt, fig.width=7, fig.height=6--------------------------------- plot(selected_changepoint_model) -## ----lda_ts, eval = F---------------------------------------------------- +## ----lda_ts, eval = FALSE------------------------------------------------ # lda_ts_results <- LDA_TS(data = rodents, -# nseeds = 10, +# replicates = 10, # topics = 2:5, # formulas = ~ sin_year + cos_year, # nchangepoints= 0:1, # timename = "newmoon", -# control = list(nit = 1000)) +# control = list(TS_method_args = +# list(control = ldats_classic_control(nit = 1000)))) + +## ----save lda ts model set, include = FALSE, eval = FALSE---------------- +# saveRDS(lda_ts_results, file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) -## ----load ldats results, include = F------------------------------------- -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +## ----load lda ts model set, include = FALSE------------------------------ +lda_ts_results <- 
readRDS(file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) ## ----LDA_TS results------------------------------------------------------ names(lda_ts_results) # Number of topics -lda_ts_results$`Selected LDA model`$k@k +lda_ts_results$"LDA models"$selected_LDAs[[1]]$topics # Number of changepoints -lda_ts_results$`Selected TS model`$nchangepoints +lda_ts_results$"TS models"$selected_TSs[[1]]$nchangepoints # Summary of changepoint locations -lda_ts_results$`Selected TS model`$rho_summary +lda_ts_results$"TS models"$selected_TSs[[1]]$rho_summary ## ----plot LDA_TS results, fig.height = 16, fig.width = 7, echo = F------- plot(lda_ts_results) diff --git a/doc/rodents-example.Rmd b/doc/rodents-example.Rmd index ef390a6d..fa046ca9 100644 --- a/doc/rodents-example.Rmd +++ b/doc/rodents-example.Rmd @@ -21,26 +21,35 @@ vers <- packageVersion("LDATS") today <- Sys.Date() ``` -This vignette walks through an example of **`LDATS`** at the command line and -was constructed using **`LDATS`** version `r vers` on `r today`. +```{r download files, include = FALSE} + vignette_files <- tempdir() + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://github.com/weecology/LDATS-replications/raw/master/output/" + files_to_download <- c("rodents_example_lda_model_set.RDS", "rodents_example_ts_model_set.RDS", + "rodents_example_lda_ts_model_set.RDS") + + for (file in files_to_download) { + download.file(url = paste0(github_path, file), + destfile = file.path(vignette_files, "output", file), + mode = "wb") + } +``` + +This vignette walks through an example of **`LDATS`** at the command line and was constructed using **`LDATS`** version `r vers` on `r today`. 
## Installation -To obtain the most recent version of **LDATS**, install and load the most recent -version from GitHub: +To obtain the most recent version of **LDATS**, install and load the most recent version from GitHub: -```{r, eval=FALSE} +```{r, eval = FALSE} install.packages("devtools") devtools::install_github("weecology/LDATS") -library(LDATS) ``` ## Data -For this vignette, we will be using rodent data from the control plots of the -[Portal Project](https://github.com/weecology/portaldata), which come with -the **LDATS** package (`data(rodents)`). +For this vignette, we will be using rodent data from the control plots of the [Portal Project](https://github.com/weecology/portaldata), which come with the **LDATS** package (`data(rodents)`). `rodents` contains two data tables, a `document_term_table` and a `document_covariate_table`. @@ -57,36 +66,36 @@ head(rodents$document_covariate_table, 10) ## Stage 1: LDA models -We use `LDA_set()` to run replicate LDA models (each with its own seed) with varying numbers of topics (`2:5`) and `select_LDA()` to select the best model. +We use `LDA()` to run replicate LDA models (each with its own seed) with varying numbers of topics (`2:5`), which includes a run of `select_LDA()` to select the best model(s). We use the `control` argument to pass controls to the LDA function via a `list`. In this case, we can set `quiet = TRUE` to make the model run quietly. 
-```{r lda_set, eval =F} -lda_model_set <- LDA_set(document_term_table = rodents$document_term_table, - topics = c(2:5), - nseeds = 10, - control = list(quiet = TRUE)) - +```{r lda_set, eval = FALSE} +lda_model_set <- LDA(data = rodents, topics = 2:5, replicates = 10, + control = list(quiet = TRUE)) ``` +```{r save lda model set, include = FALSE, eval = FALSE} +saveRDS(lda_model_set, file.path(vignette_files, "output", "rodents_example_lda_model_set.RDS")) +``` If we do not pass any controls, by default, `quiet = FALSE` (here run with only `2:3` topics and `2` seeds, to keep output short): -```{r lda set not quiet, eval =F} -lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, - topics = c(2:3), - nseeds = 2) +```{r lda set not quiet, eval = FALSE} +lda_model_set2 <- LDA(data = rodents, topics = c(2:3), replicates = 2) ``` -`LDA_set()` returns a list of LDA models. We use `select_LDA()` to identify the best number of topics and choice of seed from our set of models. By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. +By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. 
+ + -```{r load lda model set, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +```{r load lda model set, include = FALSE} +lda_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_lda_model_set.Rds")) rm(lda_model_set2) ``` ```{r select LDA} -selected_lda_model <- select_LDA(lda_model_set) +selected_lda_model <- select_LDA(lda_model_set$LDAs)[[1]] ``` @@ -95,53 +104,58 @@ We can access the results of the model: ```{r LDA results} # Number of topics: -selected_lda_model[[1]]@k +selected_lda_model$topics # Topic composition of communities at each time step # Columns are topics; rows are time steps -head(selected_lda_model[[1]]@gamma) - +head(selected_lda_model$document_topic_table) ``` -`LDATS` includes flexible plot functionality for LDAs and time series. The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. For all the available plot options see `?plot.LDA_VEM`. - -```{r plot lda, fig.width=7, fig.height=6} -plot(selected_lda_model[[1]]) +`LDATS` includes flexible plot functionality for LDAs and time series. +The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. +For all the available plot options see `?plot.LDA`. +```{r plot lda, fig.width = 7, fig.height = 6} +plot(selected_lda_model) ``` ## Stage 2: TS changepoint models -We use `TS_on_LDA()` to run LDATS changepoint models with `0:6` changepoints, and then use `select_TS()` to find the best-fit model of these. - -Here, `TS_on_LDA()` predicts the `gamma` (the proportion of the community made of up each topic) from our LDA model(s) as a function of `sin_year` and `cos_year` in the `document_covariate_table`. We use `document_weights()` to weight the information from each time step according to the total number of rodents captured at that time step. 
+We use `TS()` to run LDATS changepoint models with `0:1` changepoints, which includes a call to `select_TS()` to find the best-fit model(s) of these.
+Here, `TS()` predicts the `gamma` (the proportion of the community made up of each topic) from our LDA model(s) as a function of `sin_year` and `cos_year` in the `document_covariate_table`.
+We use `document_weights()` to weight the information from each time step according to the total number of rodents captured at that time step.
 
-```{r ts on lda, eval = F}
-changepoint_models <- TS_on_LDA(LDA_models = selected_lda_model,
-                                document_covariate_table = rodents$document_covariate_table,
-                                formulas = ~ sin_year + cos_year,
-                                nchangepoints = c(0:1),
-                                timename = "newmoon",
-                                weights = document_weights(rodents$document_term_table),
-                                control = list(nit = 1000))
+```{r ts set, eval = FALSE}
+ts_model_set <- TS(LDAs = lda_model_set,
+                   formulas = ~ sin_year + cos_year,
+                   nchangepoints = 0:1,
+                   timename = "newmoon",
+                   weights = TRUE,
+                   control = list(method_args =
+                                    list(control = ldats_classic_control(nit = 1000))))
+```
+```{r save ts model set, include = FALSE, eval = FALSE}
+saveRDS(ts_model_set, file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS"))
 ```
 
+We can adjust options (default settings can be seen using `TS_control()`) for both TS functions by passing a list to the `control` argument.
+For a full list see `?TS_control`. Here we illustrate adjusting the number of ptMCMC iterations - the default is 10000, but it is convenient to use fewer iterations for code development.
 
-We can adjust options (default settings can be seen using `TS_control()`) for both TS functions by passing a list to the `control` argument. For a full list see `?TS_control`. Here we illustrate adjusting the number of ptMCMC iterations - the default is 10000, but it is convenient to use fewer iterations for code development.
-
-Also, it is important to note that by default the TS functions take the name of the time-step column from the `document_covariate_table` to be `"time"`. To pass a different column name, use the `timename` argument in `TS_on_LDA()`.
+Also, it is important to note that by default the TS functions take the name of the time-step column from the `document_covariate_table` to be `"time"`.
+To pass a different column name, use the `timename` argument in `TS()`.
 
-`select_TS()` will identify the best-fit changepoint model of the models from `TS_on_LDA()`. As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list.
+`select_TS()` will identify the best-fit changepoint model of the models from `TS()`.
+As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list.
 
-```{r reload ts, include = F}
-load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds'))
+```{r load ts model set, include = FALSE}
+ts_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS"))
 ```
 
 ```{r select ts}
-selected_changepoint_model <- select_TS(changepoint_models)
+selected_changepoint_model <- select_TS(ts_model_set$TSs)[[1]]
 ```
 
 We can access the results of the selected changepoint model:
@@ -155,7 +169,7 @@ selected_changepoint_model$rho_summary
 
 # Raw estimates for timesteps for each changepoint
 # Changepoints are columns
-head(selected_changepoint_model$rhos)
+head(selected_changepoint_model$focal_rhos)
 
 ```
 
@@ -167,20 +181,25 @@ plot(selected_changepoint_model)
 
 ## Full analysis with `LDA_TS`
 
-Finally, we can perform an entire LDATS analysis, including all of the above steps, using the `LDA_TS()` function and passing options to the LDA and TS functions as a `list` to the `control` argument. The default is for `LDA_TS` to weight the time series model based on the document sizes, so we do not need to tell it to do so.
+Finally, we can perform an entire LDATS analysis, including all of the above steps, using the `LDA_TS()` function and passing options to the LDA and TS functions as a `list` to the `control` argument. +The default is for `LDA_TS` to weight the time series model based on the document sizes, so we do not need to tell it to do so. -```{r lda_ts, eval = F} +```{r lda_ts, eval = FALSE} lda_ts_results <- LDA_TS(data = rodents, - nseeds = 10, + replicates = 10, topics = 2:5, formulas = ~ sin_year + cos_year, nchangepoints= 0:1, timename = "newmoon", - control = list(nit = 1000)) + control = list(TS_method_args = + list(control = ldats_classic_control(nit = 1000)))) ``` -```{r load ldats results, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +```{r save lda ts model set, include = FALSE, eval = FALSE} +saveRDS(lda_ts_results, file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) +``` +```{r load lda ts model set, include = FALSE} +lda_ts_results <- readRDS(file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) ``` `LDA_TS()` returns a list of all the model objects, and we can access their contents as above: @@ -189,13 +208,13 @@ load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) names(lda_ts_results) # Number of topics -lda_ts_results$`Selected LDA model`$k@k +lda_ts_results$"LDA models"$selected_LDAs[[1]]$topics # Number of changepoints -lda_ts_results$`Selected TS model`$nchangepoints +lda_ts_results$"TS models"$selected_TSs[[1]]$nchangepoints # Summary of changepoint locations -lda_ts_results$`Selected TS model`$rho_summary +lda_ts_results$"TS models"$selected_TSs[[1]]$rho_summary ``` Finally, we can plot the `LDA_TS` results. diff --git a/doc/rodents-example.html b/doc/rodents-example.html index 1e83d20e..c44b6a4b 100644 --- a/doc/rodents-example.html +++ b/doc/rodents-example.html @@ -308,13 +308,12 @@

Renata Diaz and Juniper L. Simonis

-

This vignette walks through an example of LDATS at the command line and was constructed using LDATS version 0.3.0 on 2019-10-12.

+

This vignette walks through an example of LDATS at the command line and was constructed using LDATS version 0.2.5 on 2020-04-21.

Installation

To obtain the most recent version of LDATS, install and load the most recent version from GitHub:

install.packages("devtools")
-devtools::install_github("weecology/LDATS")
-library(LDATS)
+devtools::install_github("weecology/LDATS")

Data

@@ -350,27 +349,23 @@

Data

Stage 1: LDA models

-

We use LDA_set() to run replicate LDA models (each with its own seed) with varying numbers of topics (2:5) and select_LDA() to select the best model.

+

We use LDA() to run replicate LDA models (each with its own seed) with varying numbers of topics (2:5), which includes a run of select_LDA() to select the best model(s).

We use the control argument to pass controls to the LDA function via a list. In this case, we can set quiet = TRUE to make the model run quietly.

-
lda_model_set <- LDA_set(document_term_table = rodents$document_term_table,
-                         topics = c(2:5),
-                         nseeds = 10,
-                         control = list(quiet = TRUE))
+
lda_model_set <- LDA(data = rodents, topics = 2:5, replicates = 10,
+                     control = list(quiet = TRUE))

If we do not pass any controls, by default, quiet = FALSE (here run with only 2:3 topics and 2 seeds, to keep output short):

-
lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table,
-                         topics = c(2:3),
-                         nseeds = 2)
-

LDA_set() returns a list of LDA models. We use select_LDA() to identify the best number of topics and choice of seed from our set of models. By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing list(measurer = [measurer function], selector = [max, min, etc]) to the control argument.

-
selected_lda_model <- select_LDA(lda_model_set)
+
lda_model_set2 <- LDA(data = rodents, topics = c(2:3), replicates = 2)
+

By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing list(measurer = [measurer function], selector = [max, min, etc]) to the control argument.

+
selected_lda_model <- select_LDA(lda_model_set$LDAs)[[1]]

We can access the results of the model:

# Number of topics:
 
-selected_lda_model[[1]]@k
+selected_lda_model$topics
 #> [1] 5
 
 # Topic composition of communities at each time step
 # Columns are topics; rows are time steps
-head(selected_lda_model[[1]]@gamma)
+head(selected_lda_model$document_topic_table)
 #>             [,1]        [,2]      [,3]        [,4]        [,5]
 #> [1,] 0.008303695 0.466475067 0.3431302 0.008585984 0.173505077
 #> [2,] 0.005769837 0.663030049 0.2480563 0.005808044 0.077335746
@@ -378,25 +373,25 @@ 

Stage 1: LDA models

#> [4,] 0.003986846 0.183762181 0.8042557 0.004034794 0.003960483 #> [5,] 0.005016414 0.005116156 0.9210663 0.063795689 0.005005489 #> [6,] 0.004157612 0.105779863 0.8817614 0.004170617 0.004130473
-

LDATS includes flexible plot functionality for LDAs and time series. The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. For all the available plot options see ?plot.LDA_VEM.

-
plot(selected_lda_model[[1]])
+

LDATS includes flexible plot functionality for LDAs and time series. The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. For all the available plot options see ?plot.LDA.

+
plot(selected_lda_model)

Stage 2: TS changepoint models

-

We use TS_on_LDA() to run LDATS changepoint models with 0:6 changepoints, and then use select_TS() to find the best-fit model of these.

-

Here, TS_on_LDA() predicts the gamma (the proportion of the community made of up each topic) from our LDA model(s) as a function of sin_year and cos_year in the document_covariate_table. We use document_weights() to weight the information from each time step according to the total number of rodents captured at that time step.

-
changepoint_models <- TS_on_LDA(LDA_models = selected_lda_model, 
-                                document_covariate_table = rodents$document_covariate_table,
-                                formulas = ~ sin_year + cos_year,
-                                nchangepoints = c(0:1), 
-                                timename = "newmoon",
-                                weights = document_weights(rodents$document_term_table),
-                                control = list(nit = 1000))
+

We use TS to run LDATS changepoint models with 0:6 changepoints, which includes a call to select_TS() to find the best-fit model(s) of these.

+

Here, TS() predicts the gamma (the proportion of the community made of up each topic) from our LDA model(s) as a function of sin_year and cos_year in the document_covariate_table. We use document_weights() to weight the information from each time step according to the total number of rodents captured at that time step.

+
ts_model_set <- TS(LDAs = lda_model_set, 
+                   formulas = ~ sin_year + cos_year,
+                   nchangepoints = 0:1, 
+                   timename = "newmoon",
+                   weights = TRUE,
+                   control = list(method_args = 
+                           list(control = ldats_classic_control(nit = 1000))))

We can adjust options (default settings can be seen using TS_control()) for both TS functions by passing a list to the control argument. For a full list see ?TS_control. Here we illustrate adjusting the number of ptMCMC iterations - the default is 10000, but it is convenient to use fewer iterations for code development.

-

Also, it is important to note that by default the TS functions take the name of the time-step column from the document_covariate_table to be "time". To pass a different column name, use the timename argument in TS_on_LDA().

+

Also, it is important to note that by default the TS functions take the name of the time-step column from the document_covariate_table to be "time". To pass a different column name, use the timename argument in TS().

select_TS() will identify the best-fit changepoint model of the models from TS_on_LDA(). As with select_LDA(), we can adjust the measurer and selector functions using the control argument list.

-
selected_changepoint_model <- select_TS(changepoint_models)
+
selected_changepoint_model <- select_TS(ts_model_set$TSs)[[1]]

We can access the results of the selected changepoint model:

# Number of changepoints
 selected_changepoint_model$nchangepoints
@@ -405,55 +400,55 @@ 

Stage 2: TS changepoint models

# Summary of timesteps (newmoon values) for each changepoint selected_changepoint_model$rho_summary #> Mean Median Mode Lower_95% Upper_95% SD MCMCerr AC10 -#> Changepoint_1 212.93 216 217 187 225 10.03 0.3172 0.0066 +#> Changepoint_1 208.11 211 204 187 225 16.27 0.5145 0.1138 #> ESS -#> Changepoint_1 360.2996 +#> Changepoint_1 161.9439 # Raw estimates for timesteps for each changepoint # Changepoints are columns -head(selected_changepoint_model$rhos) +head(selected_changepoint_model$focal_rhos) #> [,1] -#> [1,] 220 -#> [2,] 220 -#> [3,] 218 -#> [4,] 215 -#> [5,] 213 -#> [6,] 216
+#> [1,] 409 +#> [2,] 76 +#> [3,] 76 +#> [4,] 76 +#> [5,] 76 +#> [6,] 83

LDATS will plot the results of a changepoint model:

plot(selected_changepoint_model)
-

+

Full analysis with LDA_TS

Finally, we can perform an entire LDATS analysis, including all of the above steps, using the LDA_TS() function and passing options to the LDA and TS functions as a list to the control argument. The default is for LDA_TS to weight the time series model based on the document sizes, so we do not need to tell it to do so.

lda_ts_results <- LDA_TS(data = rodents,
-                         nseeds = 10, 
+                         replicates = 10, 
                          topics = 2:5,
                          formulas = ~ sin_year + cos_year,
                          nchangepoints= 0:1,
                          timename = "newmoon",
-                         control = list(nit = 1000))
+ control = list(TS_method_args = + list(control = ldats_classic_control(nit = 1000))))

LDA_TS() returns a list of all the model objects, and we can access their contents as above:

names(lda_ts_results)
-#> [1] "LDA models"         "Selected LDA model" "TS models"         
-#> [4] "Selected TS model"
-
-# Number of topics
-lda_ts_results$`Selected LDA model`$k@k
-#> [1] 5
-
-# Number of changepoints
-lda_ts_results$`Selected TS model`$nchangepoints
-#> [1] 1
-
-# Summary of changepoint locations
-lda_ts_results$`Selected TS model`$rho_summary
-#>                 Mean Median Mode Lower_95% Upper_95%  SD MCMCerr   AC10
-#> Changepoint_1 209.26    211  215       190       225 9.7  0.3067 0.0104
-#>                    ESS
-#> Changepoint_1 380.3491
+#> [1] "LDA models" "TS models" "control" + +# Number of topics +lda_ts_results$"LDA models"$selected_LDAs[[1]]$topics +#> [1] 5 + +# Number of changepoints +lda_ts_results$"TS models"$selected_TSs[[1]]$nchangepoints +#> [1] 1 + +# Summary of changepoint locations +lda_ts_results$"TS models"$selected_TSs[[1]]$rho_summary +#> Mean Median Mode Lower_95% Upper_95% SD MCMCerr AC10 +#> Changepoint_1 209.16 211 215 192 225 9.5 0.3004 0.0677 +#> ESS +#> Changepoint_1 441.4974

Finally, we can plot the LDA_TS results.

-

+

diff --git a/man/AIC.LDA.Rd b/man/AIC.LDA.Rd new file mode 100644 index 00000000..4c8ac35a --- /dev/null +++ b/man/AIC.LDA.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LDA.R +\name{AIC.LDA} +\alias{AIC.LDA} +\title{Determine the AIC of a Linguistic Decomposition Analysis + model} +\usage{ +\method{AIC}{LDA}(object, ..., k = 2) +} +\arguments{ +\item{object}{Class \code{LDA} object to be evaluated.} + +\item{...}{Not used, simply included to maintain method compatibility.} + +\item{k}{Per-parameter numeric penalty.} +} +\value{ +AIC of the model. +} +\description{ +Convenience function to extract and format the AIC + of a \code{LDA}-class object fit by \code{\link{LDA_call}}. +} diff --git a/man/LDA.Rd b/man/LDA.Rd new file mode 100644 index 00000000..0e137c0b --- /dev/null +++ b/man/LDA.Rd @@ -0,0 +1,164 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LDA.R +\name{LDA} +\alias{LDA} +\alias{check_LDA} +\alias{prepare_LDA} +\alias{run_LDA} +\alias{LDA_call} +\alias{LDA_msg} +\alias{package_LDA} +\alias{select_LDA} +\alias{measure_LDA} +\alias{LDA_control} +\title{Run a set of Linguistic Decomposition Analysis models} +\usage{ +LDA(data, topics = 2, replicates = 1, control = list()) + +check_LDA(topics = 2, replicates = 1, control = list()) + +prepare_LDA(data, topics = 2, replicates = 1, control = list()) + +run_LDA(LDAs) + +LDA_call(LDA) + +LDA_msg(LDA) + +package_LDA(LDAs) + +select_LDA(LDAs) + +measure_LDA(LDAs) + +LDA_control(model = topicmodels_LDA, model_args = list(method = "VEM", + seeded = TRUE), measurer = AIC, measurer_args = list(NULL), + selector = which.min, selector_args = list(NULL), nsubsets = 1, + subset_rule = NULL, soften = TRUE, quiet = FALSE, ...) 
+}
+\arguments{
+\item{data}{Any of the data structures allowable for LDATS analyses:
+\code{matrix} or \code{data.frame} document term table,
+\code{list} of document term and covariate tables, a \code{list} of
+training and test sets of the two tables, or a \code{list} of multiple
+replicate splits of training and test sets of the two tables. \cr
+See \code{\link{conform_data}}, which is used to ensure data structure
+validity for the desired model.}
+
+\item{topics}{Vector of the number of topics to evaluate for each model.
+Must be conformable to \code{integer} values.}
+
+\item{replicates}{Number of replicate starts to use for each
+value of \code{topics}. Must be conformable to \code{integer} value.}
+
+\item{control}{A \code{list} of parameters to control the fitting of the
+LDA model. Values not input assume defaults set by
+\code{\link{LDA_control}}.}
+
+\item{LDA, LDAs}{model \code{list} (\code{LDA}) or a \code{list} of LDA
+model \code{list}s (\code{LDAs}).}
+
+\item{model}{Main LDA \code{function}.}
+
+\item{model_args}{\code{list} of (named) arguments to be used in
+\code{model} via \code{\link{LDA_call}}.}
+
+\item{measurer}{\code{function} used in evaluation of the LDA
+models; \code{measurer} creates a value for each model.}
+
+\item{measurer_args}{\code{list} of (named) arguments to be used in
+\code{measurer} via \code{\link{do.call}}.}
+
+\item{selector}{\code{function} used in evaluation of the LDA
+models; \code{selector} operates on the values to choose the models.}
+
+\item{selector_args}{\code{list} of (named) arguments to be used in
+\code{selector} via \code{\link{do.call}}.}
+
+\item{nsubsets}{Number of data subsets.}
+
+\item{subset_rule}{\code{function} used to subset the data.}
+
+\item{soften}{\code{logical} indicator of whether the model should error
+softly or if errors should trigger a full-stop to the pipeline.}
+
+\item{quiet}{\code{logical} indicator of whether the model should run
+quietly (if \code{FALSE}, a progress bar and
notifications are printed).}
+
+\item{...}{Not passed along to the output, rather included to allow for
+automated removal of unneeded controls.}
+}
+\value{
+\code{LDA},\code{package_LDA}: class \code{LDA_set} \code{list} of
+  both selected and all results from \code{\link{LDA_call}} applied for
+  each model on each data input(s) as well as the control \code{list}
+  used to fit the model. \cr \cr
+  \code{prepare_LDA}: \code{list} of \code{list}s, each of which is a
+  preliminary model object for an LDA model fit. \cr \cr
+  \code{check_LDA}: an error message is thrown if any input is improper,
+  otherwise \code{NULL}.
+  \code{LDA_control}: \code{list} of controls for the LDA model, with
+  named elements corresponding to the arguments.
+  \code{run_LDA}: \code{LDA_set} \code{list} of model results from all
+  runs of a \code{model} function, such as
+  \code{\link{topicmodels_LDA}}. \cr \cr
+  \code{LDA_call}: \code{LDA} \code{list} of model results from a single
+  run of a \code{model} function, such as
+  \code{\link{topicmodels_LDA}}. \cr \cr
+  \code{measure_LDA}: \code{vector} of values corresponding to the model
+  evaluations. \cr \cr
+  \code{select_LDA}: \code{list} of selected models' \code{list}s.
+}
+\description{
+Conduct Linguistic Decomposition Analyses. \cr \cr
+  \code{LDA} provides the main interface for Linguistic Decomposition
+  Analysis conducted within the LDATS application of (Christensen
+  \emph{et al.} 2018). \cr \cr
+  \code{prepare_LDA} pre-prepares the LDA model objects for simpler
+  use within the subfunctions. \cr \cr
+  \code{check_LDA} ensures that the inputs are proper.
+  See \code{\link{check_topics}}, \code{\link{check_replicates}},
+  and \code{\link{check_control}} for specifics. \cr \cr
+  \code{LDA_control} defines and creates the control list used to fit
+  the LDA model. \cr \cr
+  \code{run_LDA} runs (via \code{\link{LDA_call}}) all LDA models
+  as set up by \code{prepare_LDA}.
\cr \cr
+  \code{LDA_call} runs (via \code{\link{do.call}}) a single LDA model
+  as set up by \code{prepare_LDA}. \cr \cr
+  \code{LDA_msg} produces a model-running message if desired. \cr \cr
+  \code{measure_LDA} determines the fit value used to select among the
+  models. \cr \cr
+  \code{select_LDA} chooses the best model(s) of interest based on their
+  measured values and the selector function. \cr \cr
+  \code{package_LDA} sets the class and names the elements of the results
+  \code{list} from \code{\link{LDA_call}} applied to the
+  combination of LDA models requested for the data input(s).
+}
+\details{
+For a (potentially subset) dataset consisting of counts of words
+  across multiple documents in a corpus,
+  \enumerate{
+    \item Conduct multiple Linguistic Decomposition Analysis (LDA) models
+    (e.g., Latent Dirichlet Allocation using the Variational Expectation
+    Maximization (VEM) algorithm; Blei \emph{et al.} 2003, Grun and
+    Hornik 2011),
+    \item Select from the LDA model results to pick those used in the Time
+    Series (TS) models, and
+    \item Package the results.
+  }
+}
+\references{
+Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet
+  Allocation. \emph{Journal of Machine Learning Research}
+  \strong{3}:993-1022.
+  \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}.
+
+  Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018.
+  Long-term community change through multiple rapid transitions in a
+  desert rodent community. \emph{Ecology} \strong{99}:1523-1529.
+  \href{https://doi.org/10.1002/ecy.2373}{link}.
+
+  Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic
+  Models. \emph{Journal of Statistical Software} \strong{40}:13.
+  \href{https://www.jstatsoft.org/article/view/v040i13}{link}.
+} diff --git a/man/LDA_TS.Rd b/man/LDA_TS.Rd index 21d8c4a0..0b97e938 100644 --- a/man/LDA_TS.Rd +++ b/man/LDA_TS.Rd @@ -2,121 +2,209 @@ % Please edit documentation in R/LDA_TS.R \name{LDA_TS} \alias{LDA_TS} -\alias{conform_LDA_TS_data} -\alias{check_LDA_TS_inputs} -\title{Run a full set of Latent Dirichlet Allocations and Time - Series models} +\alias{package_LDA_TS} +\alias{LDA_TS_control} +\title{Run a set of Linguistic Decomposition Analysis models coupled to + Bayesian Time Series models} \usage{ -LDA_TS(data, topics = 2, nseeds = 1, formulas = ~1, +LDA_TS(data, topics = 2, replicates = 1, formulas = ~1, nchangepoints = 0, timename = "time", weights = TRUE, control = list()) -conform_LDA_TS_data(data, quiet = FALSE) +package_LDA_TS(LDAs, TSs, control) -check_LDA_TS_inputs(data = NULL, topics = 2, nseeds = 1, - formulas = ~1, nchangepoints = 0, timename = "time", - weights = TRUE, control = list()) +LDA_TS_control(LDA_model = topicmodels_LDA, + LDA_model_args = list(method = "VEM", seeded = TRUE), + LDA_measurer = AIC, LDA_measurer_args = list(NULL), + LDA_selector = which.min, LDA_selector_args = list(NULL), + TS_model = sequential_TS, TS_model_args = list(control = + sequential_TS_control()), TS_response = multinom_TS, + TS_response_args = list(control = multinom_TS_control()), + TS_method = ldats_classic, TS_method_args = list(control = + ldats_classic_control()), TS_measurer = AIC, + TS_measurer_args = list(NULL), TS_selector = which.min, + TS_selector_args = list(NULL), summary_prob = 0.95, nsubsets = 1, + subset_rule = NULL, soften = TRUE, quiet = FALSE, ...) } \arguments{ -\item{data}{Either a document term table or a list including at least -a document term table (with the word "term" in the name of the element) -and optionally also a document covariate table (with the word -"covariate" in the name of the element). 
-\cr \cr -The document term table is a table of observation count data (rows: -documents, columns: terms) that may be a \code{matrix} or -\code{data.frame}, but must be conformable to a matrix of integers, -as verified by \code{\link{check_document_term_table}}. -\cr \cr -The document covariate table is a table of associated data (rows: -documents, columns: time index and covariate options) that may be a -\code{matrix} or \code{data.frame}, but must be a conformable to a data -table, as verified by \code{\link{check_document_covariate_table}}. Every -model needs a covariate to describe the time value for each document -(in whatever units and whose name in the table is input in -\code{timename}) that dictates the application of the change points. -\strong{\emph{If a covariate table is not provided, the model assumes the -observations were equi-spaced in time}}. All covariates named within -specific models in \code{formulas} must be included.} - -\item{topics}{Vector of the number of topics to evaluate for each model. -Must be conformable to \code{integer} values.} - -\item{nseeds}{\code{integer} number of seeds (replicate starts) to use for -each value of \code{topics} in the LDAs. Must be conformable to -\code{integer} value.} - -\item{formulas}{Vector of \code{\link[stats]{formula}}(s) for the -continuous (non-change point) component of the time series models. Any -predictor variable included in a formula must also be a column in the -\code{document_covariate_table}. Each element (formula) in the vector -is evaluated for each number of change points and each LDA model.} - -\item{nchangepoints}{Vector of \code{integer}s corresponding to the number -of change points to include in the time series models. 0 is a valid input -corresponding to no change points (\emph{i.e.}, a singular time series -model), and the current implementation can reasonably include up to 6 -change points. 
Each element in the vector is the number of change points +\item{data}{Any of the data structures allowable for LDATS analyses: +\code{matrix} or \code{data.frame} document term table, +\code{list} of document term and covariate tables, a \code{list} of +training and test sets of the two tables, or a \code{list} of multiple +replicate splits of training and test sets of the two tables. \cr +See \code{\link{conform_data}}, which is used to ensure data structure +validity for the desired model.} + +\item{topics}{\code{integer}-conformable \code{vector} of the number of +topics to evaluate for each model. \cr +(See \code{\link{LDA}}.)} + +\item{replicates}{\code{integer}-conformable number of replicate starts to +use for each value of \code{topics}. \cr +(See \code{\link{LDA}}.)} + +\item{formulas}{Vector of \code{\link[stats]{formula}}(s) defining the +regression between the change points. Any predictor variable included +must also be a column in \code{data} and any (compositional) response +variable must be a set of columns in \code{data}. \cr +Each element (formula) in the vector is evaluated for each number of +change points and each LDA model. \cr +(See \code{\link{TS}}.)} + +\item{nchangepoints}{\code{integer}-conformable vector corresponding to the +number of change points to include in the models. 0 is valid (corresponds +to no change points, so a singular time series model) and the current +implementation can reasonably include up to 6 change points. The +number of change points is used to dictate the segmentation of the +time series into chunks fit with separate models dictated by +\code{formula}. \cr +Each element in the vector is the number of change points used to segment the data for each formula (entry in \code{formulas}) -component of the TS model, for each selected LDA model.} +component of the TS model, for each selected LDA model. 
\cr +(See \code{\link{TS}}.)} \item{timename}{\code{character} element indicating the time variable used in the time series. Defaults to \code{"time"}. The variable must be integer-conformable or a \code{Date}. If the variable named is a \code{Date}, the input is converted to an integer, resulting in the -timestep being 1 day, which is often not desired behavior.} - -\item{weights}{Optional input for overriding standard weighting for -documents in the time series. Defaults to \code{TRUE}, -translating to an appropriate weighting of the documents -based on the size (number of words) each document (the result of -\code{\link[topicmodels]{LDA}} is a matrix of proportions, which does not -account for size differences among documents. Alternatively can be -\code{NULL} for an equal weighting among documents or a \code{numeric} -vector.} - -\item{control}{A \code{list} of parameters to control the running and -selecting of LDA and TS models. Values not input assume default values -set by \code{\link{LDA_TS_control}}.} - -\item{quiet}{\code{logical} indicator for \code{conform_LDA_TS_data} to -indicate if messages should be printed.} +timestep being 1 day, which is often not desired behavior. \cr +(See \code{\link{TS}}.)} + +\item{weights}{Optional class \code{numeric} vector of weights for each +document. Defaults to \code{NULL}, translating to an equal weight for +each document. When using \code{\link{sequential_TS}} in a standard LDATS +analysis, it is advisable to weight the documents by their total size, +as the result of, e.g., \code{\link[topicmodels]{LDA}} is a matrix of +proportions, which does not account for size differences among documents. +For most models, a scaling of the weights (so that the average is 1) is +most appropriate, and this is accomplished using \code{document_weights}. +\cr +(See \code{\link{TS}}.)} + +\item{control}{\code{list} of parameters to control the fitting of the +LDATS model. 
Values not input assume defaults set by +\code{\link{LDA_TS_control}}.} + +\item{LDAs}{\code{LDA_set} \code{list} of selected and all LDAs from +\code{\link{LDA}}.} + +\item{TSs}{\code{TS_set} \code{list} of selected and all TSs from +\code{\link{TS}}.} + +\item{LDA_model}{Main LDA \code{function}.} + +\item{LDA_model_args}{\code{list} of (named) arguments to be used in +\code{LDA_model} via \code{\link{LDA_call}}.} + +\item{LDA_measurer}{\code{function} used in evaluation of the LDA +models; \code{LDA_measurer} creates a value for each model.} + +\item{LDA_measurer_args}{\code{list} of (named) arguments to be used in +\code{LDA_measurer} via \code{\link{do.call}}.} + +\item{LDA_selector}{\code{function} usde in evaluation of the LDA +models; \code{LDA_selector} operates on the values to choose the models.} + +\item{LDA_selector_args}{\code{list} of (named) arguments to be used in +\code{LDA_selector} via \code{\link{do.call}}.} + +\item{TS_model}{Main TS \code{function}.} + +\item{TS_model_args}{\code{list} of (named) arguments to be used in +\code{TS_model}.} + +\item{TS_response}{\code{function} used to model the compositional +response.} + +\item{TS_response_args}{\code{list} of (named) arguments to be used in +\code{TS_response} via \code{\link{do.call}}. +\cr \cr +Could be managed via a \code{_TS_control} function like +\code{\link{multinom_TS_control}}.} + +\item{TS_method}{\code{function} used to drive the sampler of the TS +models; \code{TS_method} defines and operates the computational +procedure. \cr \cr +Current pre-built options include \code{\link{ldats_classic}}.} + +\item{TS_method_args}{\code{list} of (named) arguments to be used in +\code{TS_method} via \code{\link{do.call}}. 
+\cr \cr +Could be managed via a \code{_control} function like +\code{\link{ldats_classic_control}}.} + +\item{TS_measurer}{\code{function} used in evaluation of the TS +models; \code{measurer} creates a value for each model.} + +\item{TS_measurer_args}{\code{list} of (named) arguments to be used in +\code{TS_measurer} via \code{\link{do.call}}.} + +\item{TS_selector}{\code{function} usde in evaluation of the TS +models; \code{TS_selector} operates on the values to choose the models.} + +\item{TS_selector_args}{\code{list} of (named) arguments to be used in +\code{TS_selector} via \code{\link{do.call}}.} + +\item{summary_prob}{Probability used for summarizing the posterior +distributions (via the highest posterior density interval, see +\code{\link[coda]{HPDinterval}}).} + +\item{nsubsets}{Number of data subsets.} + +\item{subset_rule}{\code{function} used to subset the data.} + +\item{soften}{\code{logical} indicator of whether the model should error +softly or if errors should trigger a full-stop to the pipeline.} + +\item{quiet}{\code{logical} indicator of whether the model should run +quietly (if \code{FALSE}, a progress bar and notifications are printed).} + +\item{...}{Not passed along to the output, rather included to allow for +automated removal of unneeded controls.} } \value{ -\code{LDA_TS}: a class \code{LDA_TS} list object including all - fitted LDA and TS models and selected models specifically as elements - \code{"LDA models"} (from \code{\link{LDA_set}}), - \code{"Selected LDA model"} (from \code{\link{select_LDA}}), - \code{"TS models"} (from \code{\link{TS_on_LDA}}), and - \code{"Selected TS model"} (from \code{\link{select_TS}}). \cr \cr - \code{conform_LDA_TS_data}: a data \code{list} that is ready for analyses - using the stage-specific functions. \cr \cr - \code{check_LDA_TS_inputs}: an error message is thrown if any input is - improper, otherwise \code{NULL}. 
+\code{LDA_TS},\code{package_LDA_TS}: class-\code{LDA_TS} \code{list} + with all fitted LDA and TS models and selected models specifically + as elements named + \describe{ + \item{\code{LDA models}}{\code{list} of all and selected models as + well as controls from \code{\link{LDA}}} + \item{\code{TS models}}{\code{list} of all and selected models as + well as controls from \code{\link{TS}}} + \item{\code{control}}{\code{list} of overall model controls} + } \cr \cr + \code{LDA_TS_control}: \code{list} of \code{list}s and single elements + that control fitting of the LDATS model, with named elements + corresponding to the arguments. } \description{ -Conduct a complete LDATS analysis (Christensen - \emph{et al.} 2018), including running a suite of Latent Dirichlet - Allocation (LDA) models (Blei \emph{et al.} 2003, Grun and Hornik 2011) - via \code{\link{LDA_set}}, selecting LDA model(s) via - \code{\link{select_LDA}}, running a complete set of Bayesian Time Series - (TS) models (Western and Kleykamp 2004) via \code{\link{TS_on_LDA}} on - the chosen LDA model(s), and selecting the best TS model via - \code{\link{select_TS}}. \cr \cr - \code{conform_LDA_TS_data} converts the \code{data} input to - match internal and sub-function specifications. \cr \cr - \code{check_LDA_TS_inputs} checks that the inputs to - \code{LDA_TS} are of proper classes for a full analysis. +Analyze compositional time series using the Linguistic + Decomposition Analysis coupled to Bayesian Time Series models + generally following Christensen \emph{et al.} (2018). + \code{LDA_TS} is the primary model function. \cr \cr + \code{LDA_TS_control} defines the control \code{list} arguments for + \code{\link{LDA_TS}}. \cr \cr + \code{package_LDA_TS} combines the results from each model component. 
} -\examples{ - data(rodents) -\donttest{ - mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, - nchangepoints = 1, timename = "newmoon") -} - conform_LDA_TS_data(rodents) - check_LDA_TS_inputs(rodents, timename = "newmoon") - +\details{ +For a (potentially subset) dataset consisting of counts of words + across multiple documents in a corpus, + \enumerate{ + \item Conduct multiple Linguistic Decomposition Analysis (LDA) models + (e.g., Latent Dirichlet Allocation using the Variational Expectation + Maximization (VEM) algorithm; Blei \emph{et al.} 2003), + \item Select from the LDA model results to pick those used in the Time + Series (TS) models, + \item Conduct multiple compositional Bayesian TS models + (e.g., changepoint softmax regression; Ripley 1996, Venables + and Ripley 2002, Western and Kleykamp 2004, Bishop 2006, Ruggieri + 2013) via a generalized linear modeling approach (McCullagh and + Nelder 1989) and using parallel tempering Markov Chain Monte Carlo + (ptMCMC) methods (Earl and Deem 2005), + \item Select from the TS model results to pick those used to summarize + the whole model, and + \item Package the results. + } } \references{ Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet @@ -124,15 +212,37 @@ Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet \strong{3}:993-1022. \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. + Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. + Springer, New York, NY, USA. + Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018. Long-term community change through multiple rapid transitions in a desert rodent community. \emph{Ecology} \strong{99}:1523-1529. \href{https://doi.org/10.1002/ecy.2373}{link}. + Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory, + applications, and new perspectives. \emph{Physical Chemistry Chemical + Physics} \strong{7}: 3910-3916. + \href{https://doi.org/10.1039/B509983H}{link}. + Grun B. and K. Hornik. 
2011. topicmodels: An R Package for Fitting Topic Models. \emph{Journal of Statistical Software} \strong{40}:13. \href{https://www.jstatsoft.org/article/view/v040i13}{link}. + McCullagh, P. and J. A. Nelder. 1989. \emph{Generalized Linear Models}. + 2nd Edition. Chapman and Hall, New York, NY, USA. + + Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. + Cambridge University Press, Cambridge, UK. + + Ruggieri, E. 2013. A Bayesian approach to detecting change points in + climatic records. \emph{International Journal of Climatology} + \strong{33}:520-528. + \href{https://doi.org/10.1002/joc.3447}{link}. + + Venables, W. N. and B. D. Ripley. 2002. \emph{Modern Applied + Statistics with S}. Fourth Edition. Springer, New York, NY, USA. + + Western, B. and M. Kleykamp. 2004. A Bayesian change point model for historical time series analysis. \emph{Political Analysis} \strong{12}:354-374. diff --git a/man/LDA_TS_control.Rd b/man/LDA_TS_control.Rd deleted file mode 100644 index fee42e29..00000000 --- a/man/LDA_TS_control.Rd +++ /dev/null @@ -1,85 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA_TS.R -\name{LDA_TS_control} -\alias{LDA_TS_control} -\title{Create the controls list for the LDATS model} -\usage{ -LDA_TS_control(quiet = FALSE, measurer_LDA = AIC, selector_LDA = min, - iseed = 2, memoise = TRUE, response = "gamma", lambda = 0, - measurer_TS = AIC, selector_TS = min, ntemps = 6, - penultimate_temp = 2^6, ultimate_temp = 1e+10, q = 0, - nit = 10000, magnitude = 12, burnin = 0, thin_frac = 1, - summary_prob = 0.95, seed = NULL, ...) -} -\arguments{ -\item{quiet}{\code{logical} indicator of whether the model should run -quietly.} - -\item{measurer_LDA, selector_LDA}{Function names for use in evaluation of -the LDA models. 
\code{measurer_LDA} is used to create a value for each -model and \code{selector_LDA} operates on the values to choose the model.} - -\item{iseed}{\code{integer} initial seed for the LDA model set.} - -\item{memoise}{\code{logical} indicator of whether the multinomial -functions should be memoised (via \code{\link[memoise]{memoise}}). -Memoisation happens to both \code{\link{multinom_TS}} and -\code{\link{multinom_TS_chunk}}.} - -\item{response}{\code{character} element indicating the response variable -used in the time series. Should be set to \code{"gamma"} for LDATS.} - -\item{lambda}{\code{numeric} "weight" decay term used to set the prior -on the regressors within each chunk-level model. Defaults to 0, -corresponding to a fully vague prior.} - -\item{measurer_TS, selector_TS}{Function names for use in evaluation of the -TS models. \code{measurer_TS} is used to create a value for each model -and \code{selector_TS} operates on the values to choose the model.} - -\item{ntemps}{\code{integer} number of temperatures (chains) to use in the -ptMCMC algorithm.} - -\item{penultimate_temp}{Penultimate temperature in the ptMCMC sequence.} - -\item{ultimate_temp}{Ultimate temperature in the ptMCMC sequence.} - -\item{q}{Exponent controlling the ptMCMC temperature sequence from the -focal chain (reference with temperature = 1) to the penultimate chain. 0 -(default) implies a geometric sequence. 1 implies squaring before -exponentiating.} - -\item{nit}{\code{integer} number of iterations (steps) used in the ptMCMC -algorithm.} - -\item{magnitude}{Average magnitude (defining a geometric distribution) -for the proposed step size in the ptMCMC algorithm.} - -\item{burnin}{\code{integer} number of iterations to remove from the -beginning of the ptMCMC algorithm.} - -\item{thin_frac}{Fraction of iterations to retain, from the ptMCMC. 
Must be -\eqn{(0, 1]}, and the default value of 1 represents no thinning.} - -\item{summary_prob}{Probability used for summarizing the posterior -distributions (via the highest posterior density interval, see -\code{\link[coda]{HPDinterval}}) of the TS model.} - -\item{seed}{Input to \code{set.seed} in the time series model for -replication purposes.} - -\item{...}{Additional arguments to be passed to -\code{\link[topicmodels]{LDA}} as a \code{control} input.} -} -\value{ -\code{list} of control \code{lists}, with named elements - \code{LDAcontrol}, \code{TScontrol}, and \code{quiet}. -} -\description{ -Create and define a list of control options used to run the - LDATS model, as implemented by \code{\link{LDA_TS}}. -} -\examples{ - LDA_TS_control() - -} diff --git a/man/LDA_msg.Rd b/man/LDA_msg.Rd deleted file mode 100644 index 125195bd..00000000 --- a/man/LDA_msg.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{LDA_msg} -\alias{LDA_msg} -\title{Create the model-running-message for an LDA} -\usage{ -LDA_msg(mod_topics, mod_seeds, control = list()) -} -\arguments{ -\item{mod_topics}{\code{integer} value corresponding to the number of -topics in the model.} - -\item{mod_seeds}{\code{integer} value corresponding to the seed used for -the model.} - -\item{control}{Class \code{LDA_controls} list of control parameters to be -used in \code{LDA} (note that "seed" will be overwritten).} -} -\description{ -Produce and print the message for a given LDA model. 
-} -\examples{ - LDA_msg(mod_topics = 4, mod_seeds = 2) - -} diff --git a/man/LDA_set.Rd b/man/LDA_set.Rd deleted file mode 100644 index af15115e..00000000 --- a/man/LDA_set.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{LDA_set} -\alias{LDA_set} -\alias{check_LDA_set_inputs} -\title{Run a set of Latent Dirichlet Allocation models} -\usage{ -LDA_set(document_term_table, topics = 2, nseeds = 1, - control = list()) - -check_LDA_set_inputs(document_term_table, topics, nseeds, control) -} -\arguments{ -\item{document_term_table}{Table of observation count data (rows: -documents, columns: terms. May be a class \code{matrix} or -\code{data.frame} but must be conformable to a matrix of integers, -as verified by \code{\link{check_document_term_table}}.} - -\item{topics}{Vector of the number of topics to evaluate for each model. -Must be conformable to \code{integer} values.} - -\item{nseeds}{Number of seeds (replicate starts) to use for each -value of \code{topics}. Must be conformable to \code{integer} value.} - -\item{control}{A \code{list} of parameters to control the running and -selecting of LDA models. Values not input assume default values set -by \code{\link{LDA_set_control}}. Values for running the LDAs replace -defaults in (\code{LDAcontol}, see \code{\link[topicmodels]{LDA}} (but if - \code{seed} is given, it will be overwritten; use \code{iseed} instead).} -} -\value{ -\code{LDA_set}: \code{list} (class: \code{LDA_set}) of LDA models - (class: \code{LDA_VEM}). - \code{check_LDA_set_inputs}: an error message is thrown if any input is - improper, otherwise \code{NULL}. 
-} -\description{ -For a given dataset consisting of counts of words across - multiple documents in a corpus, conduct multiple Latent Dirichlet - Allocation (LDA) models (using the Variational Expectation - Maximization (VEM) algorithm; Blei \emph{et al.} 2003) to account for [1] - uncertainty in the number of latent topics and [2] the impact of initial - values in the estimation procedure. \cr \cr - \code{LDA_set} is a list wrapper of \code{\link[topicmodels]{LDA}} - in the \code{topicmodels} package (Grun and Hornik 2011). \cr \cr - \code{check_LDA_set_inputs} checks that all of the inputs - are proper for \code{LDA_set} (that the table of observations is - conformable to a matrix of integers, the number of topics is an integer, - the number of seeds is an integer and the controls list is proper). -} -\examples{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) - -} -\references{ -Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet - Allocation. \emph{Journal of Machine Learning Research} - \strong{3}:993-1022. - \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. - - Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic - Models. \emph{Journal of Statistical Software} \strong{40}:13. - \href{https://www.jstatsoft.org/article/view/v040i13}{link}. -} diff --git a/man/LDA_set_control.Rd b/man/LDA_set_control.Rd deleted file mode 100644 index 9fe5aa11..00000000 --- a/man/LDA_set_control.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{LDA_set_control} -\alias{LDA_set_control} -\title{Create control list for set of LDA models} -\usage{ -LDA_set_control(quiet = FALSE, measurer = AIC, selector = min, - iseed = 2, ...) 
-} -\arguments{ -\item{quiet}{\code{logical} indicator of whether the model should run -quietly.} - -\item{measurer, selector}{Function names for use in evaluation of the LDA -models. \code{measurer} is used to create a value for each model -and \code{selector} operates on the values to choose the model(s) to -pass on.} - -\item{iseed}{\code{integer} initial seed for the model set.} - -\item{...}{Additional arguments to be passed to -\code{\link[topicmodels]{LDA}} as a \code{control} input.} -} -\value{ -\code{list} for controlling the LDA model fit. -} -\description{ -This function provides a simple creation and definition of - the list used to control the set of LDA models. It is set up to be easy - to work with the existing control capacity of - \code{\link[topicmodels]{LDA}}. -} -\examples{ - LDA_set_control() - -} diff --git a/man/TS.Rd b/man/TS.Rd index 9fffb18f..7e318a8b 100644 --- a/man/TS.Rd +++ b/man/TS.Rd @@ -2,38 +2,67 @@ % Please edit documentation in R/TS.R \name{TS} \alias{TS} -\alias{check_TS_inputs} -\title{Conduct a single multinomial Bayesian Time Series analysis} +\alias{check_TS} +\alias{run_TS} +\alias{TS_call} +\alias{TS_msg} +\alias{prepare_TS} +\alias{package_TS} +\alias{select_TS} +\alias{measure_TS} +\alias{TS_control} +\title{Conduct a Bayesian compositional Time Series analysis} \usage{ -TS(data, formula = gamma ~ 1, nchangepoints = 0, timename = "time", +TS(LDAs, formulas = ~1, nchangepoints = 0, timename = "time", weights = NULL, control = list()) -check_TS_inputs(data, formula = gamma ~ 1, nchangepoints = 0, - timename = "time", weights = NULL, control = list()) +check_TS(LDAs, formulas = ~1, nchangepoints = 0, timename = "time", + weights = NULL, control = list()) + +run_TS(TSs) + +TS_call(TS) + +TS_msg(TS) + +prepare_TS(LDAs, formulas = ~1, nchangepoints = 0, timename = "time", + weights = NULL, control = list()) + +package_TS(TSs) + +select_TS(TSs) + +measure_TS(TSs) + +TS_control(model = sequential_TS, model_args = list(control = 
+ sequential_TS_control()), response = multinom_TS, + response_args = list(control = multinom_TS_control()), + method = ldats_classic, method_args = list(control = + ldats_classic_control()), summary_prob = 0.95, measurer = AIC, + measurer_args = list(NULL), selector = which.min, + selector_args = list(NULL), soften = TRUE, quiet = FALSE, ...) } \arguments{ -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output. See \code{Examples}.} - -\item{formula}{\code{\link[stats]{formula}} defining the regression between -relationship the change points. Any -predictor variable included must also be a column in -\code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} - -\item{nchangepoints}{\code{integer} corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current +\item{LDAs}{Class \code{LDA_set} \code{list} of both selected and all +results from \code{\link{LDA}}.} + +\item{formulas}{Vector of \code{\link[stats]{formula}}(s) defining the +regression between the change points. Any predictor variable included +must also be a column in \code{data} and any (compositional) response +variable must be a set of columns in \code{data}. 
\cr +Each element (formula) in the vector is evaluated for each number of +change points and each LDA model.} + +\item{nchangepoints}{\code{integer}-conformable vector corresponding to the +number of change points to include in the models. 0 is valid (corresponds +to no change points, so a singular time series model) and the current implementation can reasonably include up to 6 change points. The number of change points is used to dictate the segmentation of the time series into chunks fit with separate models dictated by -\code{formula}.} +\code{formula}. \cr +Each element in the vector is the number of change points +used to segment the data for each formula (entry in \code{formulas}) +component of the TS model, for each selected LDA model.} \item{timename}{\code{character} element indicating the time variable used in the time series. Defaults to \code{"time"}. The variable must be @@ -43,87 +72,136 @@ timestep being 1 day, which is often not desired behavior.} \item{weights}{Optional class \code{numeric} vector of weights for each document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS +each document. When using \code{\link{TS_call}} in a standard LDATS analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of +as the result of, e.g., \code{\link[topicmodels]{LDA}} is a matrix of proportions, which does not account for size differences among documents. For most models, a scaling of the weights (so that the average is 1) is most appropriate, and this is accomplished using \code{document_weights}.} \item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by +Time Series model. 
Values not input assume defaults set by \code{\link{TS_control}}.} + +\item{TS, TSs}{time series model \code{list} (\code{TS}) or a \code{list} +of many time series model \code{list}s (\code{TSs}).} + +\item{model}{Main Time Series \code{function}.} + +\item{model_args}{\code{list} of (named) arguments to be used in +\code{model} via \code{\link{TS_call}}.} + +\item{response}{\code{character} element indicating the response variable +used in the time series. \cr \cr +Must have a corresponding \code{_TS} function.} + +\item{response_args}{\code{list} of (named) arguments to be used in +\code{response} via \code{\link{do.call}}. +\cr \cr +Could be managed via a \code{_TS_control} function like +\code{\link{multinom_TS_control}}.} + +\item{method}{\code{function} used to drive the sampler of the TS +models; \code{method} defines and operates the computational procedure. +\cr \cr +Current pre-built options include \code{\link{ldats_classic}}.} + +\item{method_args}{\code{list} of (named) arguments to be used in +\code{method} via \code{\link{do.call}}. 
+\cr \cr +Could be managed via a \code{_control} function like +\code{\link{ldats_classic_control}}.} + +\item{summary_prob}{Probability used for summarizing the posterior +distributions (via the highest posterior density interval, see +\code{\link[coda]{HPDinterval}}).} + +\item{measurer}{\code{function} used in evaluation of the TS +models; \code{measurer} creates a value for each model.} + +\item{measurer_args}{\code{list} of (named) arguments to be used in +\code{measurer} via \code{\link{do.call}}.} + +\item{selector}{\code{function} used in evaluation of the TS +models; \code{selector} operates on the values to choose the models.} + +\item{selector_args}{\code{list} of (named) arguments to be used in +\code{selector} via \code{\link{do.call}}.} + +\item{soften}{\code{logical} indicator of whether the model should error +softly or if errors should trigger a full-stop to the pipeline.} + +\item{quiet}{\code{logical} indicator of whether the model should run +quietly (if \code{FALSE}, a progress bar and notifications are printed).} + +\item{...}{Not passed along to the output, rather included to allow for +automated removal of unneeded controls.} } \value{ -\code{TS}: \code{TS_fit}-class list containing the following - elements, many of - which are hidden for \code{print}ing, but are accessible: - \describe{ - \item{data}{\code{data} input to the function.} - \item{formula}{\code{\link[stats]{formula}} input to the function.} - \item{nchangepoints}{\code{nchangepoints} input to the function.} - \item{weights}{\code{weights} input to the function.} - \item{control}{\code{control} input to the function.} - \item{lls}{Iteration-by-iteration - \link[=logLik.multinom_TS_fit]{logLik} values for the - full time series fit by \code{\link{multinom_TS}}.} - \item{rhos}{Iteration-by-iteration change point estimates from - \code{\link{est_changepoints}}.} - \item{etas}{Iteration-by-iteration marginal regressor estimates from - \code{\link{est_regressors}}, which have been - 
unconditioned with respect to the change point locations.} - \item{ptMCMC_diagnostics}{ptMCMC diagnostics, - see \code{\link{diagnose_ptMCMC}}} - \item{rho_summary}{Summary table describing \code{rhos} (the change - point locations), - see \code{\link{summarize_rhos}}.} - \item{rho_vcov}{Variance-covariance matrix for the estimates of - \code{rhos} (the change point locations), see - \code{\link{measure_rho_vcov}}.} - \item{eta_summary}{Summary table describing \code{ets} (the - regressors), - see \code{\link{summarize_etas}}.} - \item{eta_vcov}{Variance-covariance matrix for the estimates of - \code{etas} (the regressors), see - \code{\link{measure_eta_vcov}}.} - \item{logLik}{Across-iteration average of log-likelihoods - (\code{lls}).} - \item{nparams}{Total number of parameters in the full model, - including the change point locations and regressors.} - \item{deviance}{Penalized negative log-likelihood, based on - \code{logLik} and \code{nparams}.} - } - \code{check_TS_inputs}: An error message is thrown if any input - is not proper, else \code{NULL}. +\code{TS},\code{package_TS}: class \code{TS_set} \code{list} of both + selected and all results from \code{\link{TS_call}} applied for + each model on each LDA model input as well as the control \code{list} + used to fit the model. \cr \cr + \code{prepare_TS}: \code{list} of \code{list}s, each of which is a + preliminary model object for a Time Series model fit. \cr \cr + \code{check_TS}: an error message is thrown if any input is improper, + otherwise \code{NULL}. + \code{TS_control}: \code{list} of named control elements for + model fitting. + \code{measure_TS}: \code{vector} of values corresponding to the model + evaluations. \cr \cr + \code{select_TS}: \code{list} of selected models' \code{list}s. \cr \cr + \code{run_TS}: \code{TS_set} \code{list} of model results from all + runs of a \code{TS} function, such as + \code{\link{sequential_TS}}. 
\cr \cr + \code{TS_call}: \code{TS} \code{list} of model results from a single + run of a \code{TS} function, such as + \code{\link{sequential_TS}}. \cr \cr + \code{TS_msg}: a message is produced. } \description{ -This is the main interface function for the LDATS application - of Bayesian change point Time Series analyses (Christensen \emph{et al.} - 2018), which extends the model of Western and Kleykamp (2004; - see also Ruggieri 2013) to multinomial (proportional) response data using - softmax regression (Ripley 1996, Venables and Ripley 2002, Bishop 2006) - using a generalized linear modeling approach (McCullagh and Nelder 1989). - The models are fit using parallel tempering Markov Chain Monte Carlo - (ptMCMC) methods (Earl and Deem 2005) to locate change points and - neural networks (Ripley 1996, Venables and Ripley 2002, Bishop 2006) to - estimate regressors. \cr \cr - \code{check_TS_inputs} checks that the inputs to - \code{TS} are of proper classes for a full analysis. -} -\examples{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) -\donttest{ - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) +Analyze compositional Time Series models using Bayesian + sampling methods. \cr \cr + \code{TS} is the main interface function for the LDATS application + of Bayesian change point Time Series analyses (Christensen + \emph{et al.} 2018). \cr \cr + \code{prepare_TS} pre-prepares the TS model objects for simpler + use within the subfunctions. \cr \cr + \code{check_TS} ensures that the inputs are proper. 
+ See \code{\link{check_LDAs}}, + \code{\link{check_document_covariate_table}}, + \code{\link{check_formulas}}, \code{\link{check_nchangepoints}}, + \code{\link{check_timename}}, \code{\link{check_weights}}, + and \code{\link{check_control}} for specifics. \cr \cr + \code{TS_control} defines and creates the control \code{list} for the TS + model running. \cr \cr + \code{run_TS} runs (via \code{\link{TS_call}}) all TS models + as set up by \code{\link{prepare_TS}}. \cr \cr + \code{TS_call} runs (via \code{\link{do.call}}) a single TS model + as set up by \code{\link{prepare_TS}}. \cr \cr + \code{TS_msg} produces a model-running message if desired. \cr \cr + \code{measure_TS} determines the fit value used to select among the + models. \cr \cr + \code{select_TS} chooses the best model(s) of interest based on their + measured values and the selector function. \cr \cr + \code{package_TS} sets the class and names the elements of the results + \code{list} from \code{\link{TS_call}} applied to the + combination of TS models requested for the LDA model(s) input. } - check_TS_inputs(data, timename = "newmoon") - +\details{ +For a (potentially subset) dataset consisting of proportions of + topics across multiple documents in a corpus + \enumerate{ + \item Conduct multiple compositional Bayesian TS models + (e.g., changepoint softmax regression; Ripley 1996, Venables + and Ripley 2002, Western and Kleykamp 2004, Bishop 2006, Ruggieri + 2013) via a generalized linear modeling approach (McCullagh and + Nelder 1989) and using parallel tempering Markov Chain Monte Carlo + (ptMCMC) methods (Earl and Deem 2005), + \item Select from the TS model results to pick those used to summarize + the whole model, and + \item Package the results. + } 
diff --git a/man/TS_control.Rd b/man/TS_control.Rd deleted file mode 100644 index 5f944c88..00000000 --- a/man/TS_control.Rd +++ /dev/null @@ -1,74 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{TS_control} -\alias{TS_control} -\title{Create the controls list for the Time Series model} -\usage{ -TS_control(memoise = TRUE, response = "gamma", lambda = 0, - measurer = AIC, selector = min, ntemps = 6, - penultimate_temp = 2^6, ultimate_temp = 1e+10, q = 0, - nit = 10000, magnitude = 12, quiet = FALSE, burnin = 0, - thin_frac = 1, summary_prob = 0.95, seed = NULL) -} -\arguments{ -\item{memoise}{\code{logical} indicator of whether the multinomial -functions should be memoised (via \code{\link[memoise]{memoise}}). -Memoisation happens to both \code{\link{multinom_TS}} and -\code{\link{multinom_TS_chunk}}.} - -\item{response}{\code{character} element indicating the response variable -used in the time series.} - -\item{lambda}{\code{numeric} "weight" decay term used to set the prior -on the regressors within each chunk-level model. Defaults to 0, -corresponding to a fully vague prior.} - -\item{measurer, selector}{Function names for use in evaluation of the TS -models. \code{measurer} is used to create a value for each model -and \code{selector} operates on the values to choose the model.} - -\item{ntemps}{\code{integer} number of temperatures (chains) to use in the -ptMCMC algorithm.} - -\item{penultimate_temp}{Penultimate temperature in the ptMCMC sequence.} - -\item{ultimate_temp}{Ultimate temperature in the ptMCMC sequence.} - -\item{q}{Exponent controlling the ptMCMC temperature sequence from the -focal chain (reference with temperature = 1) to the penultimate chain. 0 -(default) implies a geometric sequence. 
1 implies squaring before -exponentiating.} - -\item{nit}{\code{integer} number of iterations (steps) used in the ptMCMC -algorithm.} - -\item{magnitude}{Average magnitude (defining a geometric distribution) -for the proposed step size in the ptMCMC algorithm.} - -\item{quiet}{\code{logical} indicator of whether the model should run -quietly (if \code{FALSE}, a progress bar and notifications are printed).} - -\item{burnin}{\code{integer} number of iterations to remove from the -beginning of the ptMCMC algorithm.} - -\item{thin_frac}{Fraction of iterations to retain, must be \eqn{(0, 1]}, -and the default value of 1 represents no thinning.} - -\item{summary_prob}{Probability used for summarizing the posterior -distributions (via the highest posterior density interval, see -\code{\link[coda]{HPDinterval}}).} - -\item{seed}{Input to \code{set.seed} for replication purposes.} -} -\value{ -\code{list}, with named elements corresponding to the arguments. -} -\description{ -This function provides a simple creation and definition of a - list used to control the time series model fit occurring within - \code{\link{TS}}. -} -\examples{ - TS_control() - -} diff --git a/man/TS_diagnostics_plot.Rd b/man/TS_diagnostics_plot.Rd deleted file mode 100644 index dad973da..00000000 --- a/man/TS_diagnostics_plot.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{TS_diagnostics_plot} -\alias{TS_diagnostics_plot} -\alias{eta_diagnostics_plots} -\alias{rho_diagnostics_plots} -\title{Plot the diagnostics of the parameters fit in a TS model} -\usage{ -TS_diagnostics_plot(x, interactive = TRUE) - -eta_diagnostics_plots(x, interactive) - -rho_diagnostics_plots(x, interactive) -} -\arguments{ -\item{x}{Object of class \code{TS_fit}, generated by \code{\link{TS}} to -have its diagnostics plotted.} - -\item{interactive}{\code{logical} input, should be code{TRUE} unless -testing.} -} -\value{ -\code{NULL}. 
-} -\description{ -Plot 4-panel figures (showing trace plots, posterior ECDF, - posterior density, and iteration autocorrelation) for each of the - parameters (change point locations and regressors) fitted within a - multinomial time series model (fit by \code{\link{TS}}). \cr \cr - \code{eta_diagnostics_plots} creates the diagnostic plots - for the regressors (etas) of a time series model. \cr \cr - \code{rho_diagnostics_plots} creates the diagnostic plots - for the change point locations (rho) of a time series model. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - TS_diagnostics_plot(TSmod) -} - -} diff --git a/man/TS_on_LDA.Rd b/man/TS_on_LDA.Rd deleted file mode 100644 index 205739f2..00000000 --- a/man/TS_on_LDA.Rd +++ /dev/null @@ -1,96 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{TS_on_LDA} -\alias{TS_on_LDA} -\alias{check_TS_on_LDA_inputs} -\title{Conduct a set of Time Series analyses on a set of LDA models} -\usage{ -TS_on_LDA(LDA_models, document_covariate_table, formulas = ~1, - nchangepoints = 0, timename = "time", weights = NULL, - control = list()) - -check_TS_on_LDA_inputs(LDA_models, document_covariate_table, - formulas = ~1, nchangepoints = 0, timename = "time", - weights = NULL, control = list()) -} -\arguments{ -\item{LDA_models}{List of LDA models (class \code{LDA_set}, produced by -\code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -produced by \code{\link[topicmodels]{LDA}}).} - -\item{document_covariate_table}{Document covariate table (rows: documents, -columns: time index and covariate options). 
Every model needs a -covariate to describe the time value for each document (in whatever -units and whose name in the table is input in \code{timename}) -that dictates the application of the change points. -In addition, all covariates named within specific models in -\code{formula} must be included. Must be a conformable to a data table, -as verified by \code{\link{check_document_covariate_table}}.} - -\item{formulas}{Vector of \code{\link[stats]{formula}}(s) for the -continuous (non-change point) component of the time series models. Any -predictor variable included in a formula must also be a column in the -\code{document_covariate_table}. Each element (formula) in the vector -is evaluated for each number of change points and each LDA model.} - -\item{nchangepoints}{Vector of \code{integer}s corresponding to the number -of change points to include in the time series models. 0 is a valid input -corresponding to no change points (\emph{i.e.}, a singular time series -model), and the current implementation can reasonably include up to 6 -change points. Each element in the vector is the number of change points -used to segment the data for each formula (entry in \code{formulas}) -component of the TS model, for each selected LDA model.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series. Defaults to \code{"time"}. The variable must be -integer-conformable or a \code{Date}. If the variable named -is a \code{Date}, the input is converted to an integer, resulting in the -timestep being 1 day, which is often not desired behavior.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. 
When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using \code{document_weights}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -\code{TS_on_LDA}: \code{TS_on_LDA}-class \code{list} of results - from \code{\link{TS}} applied for each model on each LDA model input. - \cr \cr - \code{check_TS_inputs}: An error message is thrown if any input - is not proper, else \code{NULL}. -} -\description{ -This is a wrapper function that expands the main Time Series - analyses function (\code{\link{TS}}) across the LDA models (estimated - using \code{\link[topicmodels]{LDA}} or \code{\link{LDA_set}} and the - Time Series models, with respect to both continuous time formulas and the - number of discrete changepoints. This function allows direct passage of - the control parameters for the parallel tempering MCMC through to the - main Time Series function, \code{\link{TS}}, via the - \code{ptMCMC_controls} argument. \cr \cr - \code{check_TS_on_LDA_inputs} checks that the inputs to - \code{TS_on_LDA} are of proper classes for a full analysis. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints = 0:1, timename = "newmoon", weights) -} - -} diff --git a/man/TS_summary_plot.Rd b/man/TS_summary_plot.Rd deleted file mode 100644 index 23040bad..00000000 --- a/man/TS_summary_plot.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{TS_summary_plot} -\alias{TS_summary_plot} -\alias{pred_gamma_TS_plot} -\alias{rho_hist} -\title{Create the summary plot for a TS fit to an LDA model} -\usage{ -TS_summary_plot(x, cols = set_TS_summary_plot_cols(), bin_width = 1, - xname = NULL, border = NA, selection = "median", LDATS = FALSE) - -pred_gamma_TS_plot(x, selection = "median", cols = set_gamma_colors(x), - xname = NULL, together = FALSE, LDATS = FALSE) - -rho_hist(x, cols = set_rho_hist_colors(x$rhos), bin_width = 1, - xname = NULL, border = NA, together = FALSE, LDATS = FALSE) -} -\arguments{ -\item{x}{Object of class \code{TS_fit} produced by \code{\link{TS}}.} - -\item{cols}{\code{list} of elements used to define the colors for the two -panels, as generated simply using \code{\link{set_TS_summary_plot_cols}}. -Has two elements \code{rho} and \code{gamma}, each corresponding to the -related panel, and each containing default values for entries named -\code{cols}, \code{option}, and \code{alpha}. 
See -\code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} for -details on usage.} - -\item{bin_width}{Width of the bins used in the histograms, in units of the -x-axis (the time variable used to fit the model).} - -\item{xname}{Label for the x-axis in the summary time series plot. Defaults -to \code{NULL}, which results in usage of the \code{timename} element -of the control list (held in\code{control$TS_control$timename}). To have -no label printed, set \code{xname = ""}.} - -\item{border}{Border for the histogram, default is \code{NA}.} - -\item{selection}{Indicator of the change points to use. Currently only -defined for "median" and "mode".} - -\item{LDATS}{\code{logical} indicating if the plot is part of a larger -LDATS plot output.} - -\item{together}{\code{logical} indicating if the subplots are part of a -larger LDA plot output.} -} -\value{ -\code{NULL}. -} -\description{ -Produces a two-panel figure of [1] the change point - distributions as histograms over time and [2] the time series of the - fitted topic proportions over time, based on a selected set of - change point locations. \cr \cr - \code{pred_gamma_TS_plot} produces a time series of the - fitted topic proportions over time, based on a selected set of change - point locations. \cr \cr - \code{rho_hist}: make a plot of the change point - distributions as histograms over time. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - TS_summary_plot(TSmod) - pred_gamma_TS_plot(TSmod) - rho_hist(TSmod) -} - -} diff --git a/man/argument_checking.Rd b/man/argument_checking.Rd new file mode 100644 index 00000000..2d2eedec --- /dev/null +++ b/man/argument_checking.Rd @@ -0,0 +1,136 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/argument_checking.R +\name{argument_checking} +\alias{argument_checking} +\alias{check_class} +\alias{check_nonneg_integer} +\alias{check_nonneg_integer_matrix} +\alias{check_pos_integer} +\alias{check_control} +\alias{check_topics} +\alias{check_replicates} +\alias{check_nchangepoints} +\alias{check_document_term_table} +\alias{check_weights} +\alias{check_LDAs} +\alias{check_timename} +\alias{check_formulas} +\alias{check_document_covariate_table} +\title{Check that arguments are properly formatted for usage} +\usage{ +check_class(object, eclass = "list") + +check_nonneg_integer(object) + +check_nonneg_integer_matrix(object) + +check_pos_integer(object) + +check_control(control, eclass = "list") + +check_topics(topics) + +check_replicates(replicates) + +check_nchangepoints(nchangepoints) + +check_document_term_table(document_term_table) + +check_weights(weights) + +check_LDAs(LDAs) + +check_timename(LDAs, timename) + +check_formulas(LDAs, formulas) + +check_document_covariate_table(LDAs) +} +\arguments{ +\item{object}{An object whose class should be checked against +\code{eclass}.} + +\item{eclass}{Expected class of \code{object} to be checked. 
If more +than one option is included, any are sufficient (\code{object} only +needs to be one \code{eclass}, not all).} + +\item{control}{Control \code{list} to evaluate.} + +\item{topics}{\code{vector} of the number of topics to evaluate for each +model. Must be conformable to positive \code{integer} values.} + +\item{replicates}{\code{integer} number of replicate starts to use for +each value of \code{topics} in the LDAs. Must be conformable to +positive \code{integer} values.} + +\item{nchangepoints}{\code{integer}-conformable \code{vector} of the +number of changepoints to evaluate (must be non-negative).} + +\item{document_term_table}{Table of observation count data (rows: +documents, columns: terms). May be a \code{matrix} or +\code{data.frame} but must be conformable to a matrix of non-negative +\code{integers}.} + +\item{weights}{\code{numeric} \code{vector} of the document weights to +evaluate, or \code{TRUE} for triggering internal weighting by document +sizes.} + +\item{LDAs}{\code{LDA_models} \code{list} of LDA models or singular LDA +model (\code{LDA}) to evaluate.} + +\item{timename}{Column name for the time variable to evaluate in the +\code{document_covariate_table} if provided.} + +\item{formulas}{\code{vector} of the \code{\link[stats]{formula}}s +to evaluate.} +} +\value{ +an error message is thrown if the input is improper, otherwise + \code{NULL}. +} +\description{ +Verify the class, structure, and values of inputted arguments + to ensure proper LDATS modeling. \cr \cr + \code{check_class} is a general class-verifier. \cr \cr + \code{check_nonneg_integer} is a specified checking function for + \code{numeric} values that must be integer-conformable and non-negative + (0 is acceptable). \cr \cr + \code{check_nonneg_integer_matrix} is a specified checking function for + tables of \code{numeric} values that must be integer-conformable and + non-negative (0 is acceptable) and that must be conformable to + a \code{matrix}. 
\cr \cr + \code{check_pos_integer} is a specified checking function for + \code{numeric} values that must be integer-conformable and positive + (0 is not acceptable). \cr \cr + \code{check_control} verifies that a control \code{list} is a \code{list}. + \cr \cr + \code{check_topics} ensures that the vector of numbers of topics is + positive integer-conformable. \cr \cr + \code{check_replicates} ensures that the number of replicates is + positive integer-conformable. \cr \cr + \code{check_nchangepoints} ensures that the number of change points is + non-negative integer-conformable. \cr \cr + \code{check_document_term_table} ensures that the table of document + term counts is conformable to a \code{matrix} of non-negative integers. + \cr \cr + \code{check_LDAs} verifies that the argument is either an + \code{LDA} or \code{LDA_set} \code{list}. \cr \cr + \code{check_document_covariate_table} checks that the table of + document-level covariates in the \code{LDAs} data is + conformable to a data frame and of the right size (correct number of + documents) for the document-topic output from the LDA models. \cr \cr + \code{check_weights} ensures that the vector of document weights is + \code{numeric} and positive and informs the user if the average weight + isn't 1.\cr \cr + \code{check_timename} checks that the vector of time values is included + in the \code{document_covariate_table} and that it is either + \code{integer}-conformable or a \code{Date}. + If it is a \code{Date}, the input is converted to an + \code{integer}, resulting in the timestep being 1 day, which is often + not desired behavior. \cr \cr + \code{check_formulas} verifies that the input contains only + \code{\link[stats]{formula}}s and that the response and predictor + variables are all included in \code{LDAs} data sets. 
\cr \cr + \code{check_nchangepoints} checks that the \code{vector} of numbers of + changepoints is conformable to non-negative \code{integers}.\cr \cr +} diff --git a/man/autocorr_plot.Rd b/man/autocorr_plot.Rd deleted file mode 100644 index ac106320..00000000 --- a/man/autocorr_plot.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{autocorr_plot} -\alias{autocorr_plot} -\title{Produce the autocorrelation panel for the TS diagnostic plot of - a parameter} -\usage{ -autocorr_plot(x) -} -\arguments{ -\item{x}{Vector of parameter values drawn from the posterior distribution, -indexed to the iteration by the order of the vector.} -} -\value{ -\code{NULL}. -} -\description{ -Produce a vanilla ACF plot using \code{\link[stats]{acf}} for - the parameter of interest (rho or eta) as part of - \code{\link{TS_diagnostics_plot}}. -} -\examples{ - autocorr_plot(rnorm(100, 0, 1)) - -} diff --git a/man/check_LDA_models.Rd b/man/check_LDA_models.Rd deleted file mode 100644 index 7f2f700d..00000000 --- a/man/check_LDA_models.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_LDA_models} -\alias{check_LDA_models} -\title{Check that LDA model input is proper} -\usage{ -check_LDA_models(LDA_models) -} -\arguments{ -\item{LDA_models}{List of LDA models or singular LDA model to evaluate.} -} -\value{ -An error message is thrown if \code{LDA_models} is not proper, - else \code{NULL}. -} -\description{ -Check that the \code{LDA_models} input is either a set of - LDA models (class \code{LDA_set}, produced by - \code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, - produced by \code{\link[topicmodels]{LDA}}). 
-} -\examples{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2, nseeds = 1) - LDA_models <- select_LDA(LDAs) - check_LDA_models(LDA_models) - -} diff --git a/man/check_changepoints.Rd b/man/check_changepoints.Rd deleted file mode 100644 index 021d8bd2..00000000 --- a/man/check_changepoints.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R -\name{check_changepoints} -\alias{check_changepoints} -\title{Check that a set of change point locations is proper} -\usage{ -check_changepoints(changepoints = NULL) -} -\arguments{ -\item{changepoints}{Change point locations to evaluate.} -} -\value{ -An error message is thrown if \code{changepoints} are not proper, - else \code{NULL}. -} -\description{ -Check that the change point locations are \code{numeric} - and conformable to \code{interger} values. -} -\examples{ - check_changepoints(100) - -} diff --git a/man/check_control.Rd b/man/check_control.Rd deleted file mode 100644 index c950306d..00000000 --- a/man/check_control.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R -\name{check_control} -\alias{check_control} -\title{Check that a control list is proper} -\usage{ -check_control(control, eclass = "list") -} -\arguments{ -\item{control}{Control list to evaluate.} - -\item{eclass}{Expected class of the list to be evaluated.} -} -\value{ -an error message is thrown if the input is improper, otherwise - \code{NULL}. -} -\description{ -Check that a list of controls is of the right class. 
-} -\examples{ - check_control(list()) - -} diff --git a/man/check_document_covariate_table.Rd b/man/check_document_covariate_table.Rd deleted file mode 100644 index 59a49bc8..00000000 --- a/man/check_document_covariate_table.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_document_covariate_table} -\alias{check_document_covariate_table} -\title{Check that the document covariate table is proper} -\usage{ -check_document_covariate_table(document_covariate_table, - LDA_models = NULL, document_term_table = NULL) -} -\arguments{ -\item{document_covariate_table}{Document covariate table to evaluate.} - -\item{LDA_models}{Reference LDA model list (class \code{LDA_set}) that -includes as its first element a properly fitted \code{LDA} model with -a \code{gamma} slot with the document-topic distribution.} - -\item{document_term_table}{Optional input for checking when -\code{LDA_models} is \code{NULL}} -} -\value{ -An error message is thrown if \code{document_covariate_table} is - not proper, else \code{NULL}. -} -\description{ -Check that the table of document-level covariates is - conformable to a data frame and of the right size (correct number of - documents) for the document-topic output from the LDA models. -} -\examples{ - data(rodents) - check_document_covariate_table(rodents$document_covariate_table) - -} diff --git a/man/check_document_term_table.Rd b/man/check_document_term_table.Rd deleted file mode 100644 index 7a26419e..00000000 --- a/man/check_document_term_table.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R -\name{check_document_term_table} -\alias{check_document_term_table} -\title{Check that document term table is proper} -\usage{ -check_document_term_table(document_term_table) -} -\arguments{ -\item{document_term_table}{Table of observation count data (rows: -documents, columns: terms. 
May be a class \code{matrix} or -\code{data.frame} but must be conformable to a matrix of integers, -as verified by \code{\link{check_document_term_table}}.} -} -\value{ -an error message is thrown if the input is improper, otherwise - \code{NULL}. -} -\description{ -Check that the table of observations is conformable to - a matrix of integers. -} -\examples{ - data(rodents) - check_document_term_table(rodents$document_term_table) - -} diff --git a/man/check_formula.Rd b/man/check_formula.Rd deleted file mode 100644 index 4d2aead5..00000000 --- a/man/check_formula.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{check_formula} -\alias{check_formula} -\title{Check that a formula is proper} -\usage{ -check_formula(data, formula) -} -\arguments{ -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output.} - -\item{formula}{\code{formula} to evaluate.} -} -\value{ -An error message is thrown if \code{formula} is not proper, - else \code{NULL}. -} -\description{ -Check that \code{formula} is actually a - \code{\link[stats]{formula}} and that the - response and predictor variables are all included in \code{data}. 
-} -\examples{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - check_formula(data, gamma ~ 1) - -} diff --git a/man/check_formulas.Rd b/man/check_formulas.Rd deleted file mode 100644 index 20dc3b7d..00000000 --- a/man/check_formulas.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_formulas} -\alias{check_formulas} -\title{Check that formulas vector is proper and append the response - variable} -\usage{ -check_formulas(formulas, document_covariate_table, control = list()) -} -\arguments{ -\item{formulas}{Vector of the formulas to evaluate.} - -\item{document_covariate_table}{Document covariate table used to evaluate -the availability of the data required by the formula inputs.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -An error message is thrown if \code{formulas} is - not proper, else \code{NULL}. -} -\description{ -Check that the vector of formulas is actually formatted - as a vector of \code{\link[stats]{formula}} objects and that the - predictor variables are all included in the document covariate table. 
-} -\examples{ - data(rodents) - check_formulas(~ 1, rodents$document_covariate_table) - -} diff --git a/man/check_nchangepoints.Rd b/man/check_nchangepoints.Rd deleted file mode 100644 index 6615f714..00000000 --- a/man/check_nchangepoints.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_nchangepoints} -\alias{check_nchangepoints} -\title{Check that nchangepoints vector is proper} -\usage{ -check_nchangepoints(nchangepoints) -} -\arguments{ -\item{nchangepoints}{Vector of the number of changepoints to evaluate.} -} -\value{ -An error message is thrown if \code{nchangepoints} is not proper, - else \code{NULL}. -} -\description{ -Check that the vector of numbers of changepoints is - conformable to integers greater than 1. -} -\examples{ - check_nchangepoints(0) - check_nchangepoints(2) - -} diff --git a/man/check_seeds.Rd b/man/check_seeds.Rd deleted file mode 100644 index 82912be8..00000000 --- a/man/check_seeds.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R -\name{check_seeds} -\alias{check_seeds} -\title{Check that nseeds value or seeds vector is proper} -\usage{ -check_seeds(nseeds) -} -\arguments{ -\item{nseeds}{\code{integer} number of seeds (replicate starts) to use for -each value of \code{topics} in the LDAs. Must be conformable to a -positive \code{integer} value.} -} -\value{ -an error message is thrown if the input is improper, otherwise - \code{NULL}. -} -\description{ -Check that the vector of numbers of seeds is conformable to - integers greater than 0. 
-} -\examples{ - check_seeds(1) - check_seeds(2) - -} diff --git a/man/check_timename.Rd b/man/check_timename.Rd deleted file mode 100644 index d04a7763..00000000 --- a/man/check_timename.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_timename} -\alias{check_timename} -\title{Check that the time vector is proper} -\usage{ -check_timename(document_covariate_table, timename) -} -\arguments{ -\item{document_covariate_table}{Document covariate table used to query -for the time column.} - -\item{timename}{Column name for the time variable to evaluate.} -} -\value{ -An error message is thrown if \code{timename} is - not proper, else \code{NULL}. -} -\description{ -Check that the vector of time values is included in the - document covariate table and that it is either a integer-conformable or - a \code{date}. If it is a \code{date}, the input is converted to an - integer, resulting in the timestep being 1 day, which is often not - desired behavior. -} -\examples{ - data(rodents) - check_timename(rodents$document_covariate_table, "newmoon") - -} diff --git a/man/check_topics.Rd b/man/check_topics.Rd deleted file mode 100644 index 3f9415e1..00000000 --- a/man/check_topics.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utilities.R -\name{check_topics} -\alias{check_topics} -\title{Check that topics vector is proper} -\usage{ -check_topics(topics) -} -\arguments{ -\item{topics}{Vector of the number of topics to evaluate for each model. -Must be conformable to \code{integer} values.} -} -\value{ -an error message is thrown if the input is improper, otherwise - \code{NULL}. -} -\description{ -Check that the vector of numbers of topics is conformable to - integers greater than 1. 
-} -\examples{ - check_topics(2) - -} diff --git a/man/check_weights.Rd b/man/check_weights.Rd deleted file mode 100644 index 3d95c9ce..00000000 --- a/man/check_weights.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{check_weights} -\alias{check_weights} -\title{Check that weights vector is proper} -\usage{ -check_weights(weights) -} -\arguments{ -\item{weights}{Vector of the document weights to evaluate, or \code{TRUE} -for triggering internal weighting by document sizes.} -} -\value{ -An error message is thrown if \code{weights} is not proper, - else \code{NULL}. -} -\description{ -Check that the vector of document weights is numeric and - positive and inform the user if the average weight isn't 1. -} -\examples{ - check_weights(1) - wts <- runif(100, 0.1, 100) - check_weights(wts) - wts2 <- wts / mean(wts) - check_weights(wts2) - check_weights(TRUE) - -} diff --git a/man/conform_data.Rd b/man/conform_data.Rd new file mode 100644 index 00000000..d6dc6045 --- /dev/null +++ b/man/conform_data.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_preparation.R +\name{conform_data} +\alias{conform_data} +\title{Conform data for LDATS modeling} +\usage{ +conform_data(data, control = list()) +} +\arguments{ +\item{data}{A document term table, \code{list} of document term and +covariate tables, a \code{list} of training and test sets of the two +tables, or a \code{list} of multiple replicate splits of training and +test sets of the two tables.} + +\item{control}{\code{list} of control options for the data conforming.} +} +\value{ +\code{list} of properly formatted LDATS data. 
+} +\description{ +Given any of a variety of possible data input types + (\code{data.frame}/\code{matrix}, \code{list}, \code{list} of + \code{list}s, or \code{list} of \code{list} of \code{list}s) and + controls, this produces a properly formatted set of data (sets) for + LDATS modeling. +} +\details{ +This function makes use of the \code{\link{list_depth}} + utility that recursively works through an object to tell you + how nested a list is. \cr \cr + Working up from the most elemental version of \code{data} possible, + if it's not a \code{list}, but the data are a term table, the + covariate table is added with assumed equispersed data and + the data are now a \code{list}. \cr + Then, if it is a \code{list} but only of depth 1 (a \code{list} of two + tables), it is wrapped in a \code{list} to make it depth-2, + functionally a 1-subset data set. \cr + Then, if it is a \code{list} of depth two, it may need to be expanded to + a multiple-subset data set, to allow for cross validation methods, for + example. So, the \code{list} of depth 2 is replicated out to create a + longer \code{list} that is still depth 2 but is now of length + \code{control$nsubsets}. \cr + Then, the subsetting of the data occurs according to the + \code{control$subset_rule}, and each depth-2 \code{list} is split to a + final level of training and testing subsets of the data, making the + \code{list} depth 3. \cr \cr + The training and testing data are constructed as trimmed versions of the + two tables, even if no data are required for testing. 
+} diff --git a/man/count_trips.Rd b/man/count_trips.Rd deleted file mode 100644 index 32ae4394..00000000 --- a/man/count_trips.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{count_trips} -\alias{count_trips} -\title{Count trips of the ptMCMC particles} -\usage{ -count_trips(ids) -} -\arguments{ -\item{ids}{\code{matrix} of identifiers of the particles in each chain for -each iteration of the ptMCMC algorithm (rows: chains, -columns: iterations).} -} -\value{ -\code{list} of [1] \code{vector} of within particle trip counts - (\code{$trip_counts}), and [2] \code{vector} of within-particle average - trip rates (\code{$trip_rates}). -} -\description{ -Count the full trips (from one extreme temperature chain to - the other and back again; Katzgraber \emph{et al.} 2006) for each of the - ptMCMC particles, as identified by their id on initialization. - \cr \cr - This function was designed to work within \code{\link{TS}} and process - the output of \code{\link{est_changepoints}} as a component of - \code{\link{diagnose_ptMCMC}}, but has been generalized - and would work with any output from a ptMCMC as long as \code{ids} - is formatted properly. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - rho_dist <- est_changepoints(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - count_trips(rho_dist$ids) -} - -} -\references{ -Katzgraber, H. G., S. Trebst, D. A. Huse. And M. Troyer. 2006. - Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of - Statistical Mechanics: Theory and Experiment} \strong{3}:P03018 - \href{https://bit.ly/2LICGXh}{link}. 
-} diff --git a/man/data_subsetting.Rd b/man/data_subsetting.Rd new file mode 100644 index 00000000..570d045d --- /dev/null +++ b/man/data_subsetting.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_preparation.R +\name{data_subsetting} +\alias{data_subsetting} +\alias{null_rule} +\alias{systematic_loo} +\alias{random_loo} +\alias{leave_p_out} +\title{Subset data sets} +\usage{ +null_rule(data, iteration = 1) + +systematic_loo(data, iteration = 1) + +random_loo(data, iteration = 1) + +leave_p_out(data, p = 1, pre = 0, post = 0, random = TRUE, + locations = NULL) +} +\arguments{ +\item{data}{\code{data.frame} or \code{matrix} of data to be split.} + +\item{iteration}{\code{integer}-conformable value indicating which +iteration through the process the current implementation is.} + +\item{p}{\code{integer}-conformable value of how many samples to leave out.} + +\item{pre, post}{\code{integer}-conformable values of how many samples +to include in the buffer around the focal left out data. Can be +asymmetric.} + +\item{random}{\code{logical} indicator of if the left out data should be +randomly selected.} + +\item{locations}{\code{integer}-conformable values referencing which +data to hold out.} +} +\value{ +\code{character} \code{vector} of \code{"train"} and \code{"test"} + values. +} +\description{ +For use within, e.g., cross validation methods, these + functions subdivide the data into testing and training subsets. \cr \cr + \code{null_rule} places all data in the training set. \cr \cr + \code{random_loo} conducts randomized leave-one-out with no buffer. + \cr \cr + \code{systematic_loo} conducts systematic leave-one-out with no buffer. + Assumes 1:1 between iteration and datum location to drop. \cr \cr + \code{leave_p_out} is a fully flexible leave p out function allowing for + asymmetric buffers and randomization. If \code{random = TRUE}, the test + data are selected randomly, otherwise locations are used. 
+} diff --git a/man/diagnose_ptMCMC.Rd b/man/diagnose_ptMCMC.Rd deleted file mode 100644 index 9dfaa612..00000000 --- a/man/diagnose_ptMCMC.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{diagnose_ptMCMC} -\alias{diagnose_ptMCMC} -\title{Calculate ptMCMC summary diagnostics} -\usage{ -diagnose_ptMCMC(ptMCMCout) -} -\arguments{ -\item{ptMCMCout}{Named \code{list} of saved data objects from a ptMCMC -estimation including elements named \code{step_accepts} (matrix of -\code{logical} outcomes of each step; rows: chains, columns: iterations), -\code{swap_accepts} (matrix of \code{logical} outcomes of each swap; -rows: chain pairs, columns: iterations), and \code{ids} (matrix of -particle identifiers; rows: chains, columns: iterations). -\code{ptMCMCout = NULL} indicates no use of ptMCMC and so the function -returns \code{NULL}.} -} -\value{ -\code{list} of [1] within-chain average step acceptance rates - (\code{$step_acceptance_rate}), [2] average between-chain swap acceptance - rates (\code{$swap_acceptance_rate}), [3] within particle trip counts - (\code{$trip_counts}), and [4] within-particle average trip rates - (\code{$trip_rates}). -} -\description{ -Summarize the step and swap acceptance rates as well as trip - metrics from the saved output of a ptMCMC estimation. -} -\details{ -Within-chain step acceptance rates are averaged for each of the - chains from the raw step acceptance histories - (\code{ptMCMCout$step_accepts}) and between-chain swap acceptance rates - are similarly averaged for each of the neighboring pairs of chains from - the raw swap acceptance histories (\code{ptMCMCout$swap_accepts}). - Trips are defined as movement from one extreme chain to the other and - back again (Katzgraber \emph{et al.} 2006). Trips are counted and turned - to per-iteration rates using \code{\link{count_trips}}. 
- \cr \cr - This function was first designed to work within \code{\link{TS}} and - process the output of \code{\link{est_changepoints}}, but has been - generalized and would work with any output from a ptMCMC as long as - \code{ptMCMCout} is formatted properly. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - rho_dist <- est_changepoints(data, gamma ~ 1, 1, "newmoon", - weights, TS_control()) - diagnose_ptMCMC(rho_dist) -} - -} -\references{ -Katzgraber, H. G., S. Trebst, D. A. Huse. And M. Troyer. 2006. - Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of - Statistical Mechanics: Theory and Experiment} \strong{3}:P03018 - \href{https://bit.ly/2LICGXh}{link}. -} diff --git a/man/document_weights.Rd b/man/document_weights.Rd index 0f74fd19..2e57eee8 100644 --- a/man/document_weights.Rd +++ b/man/document_weights.Rd @@ -8,9 +8,8 @@ document_weights(document_term_table) } \arguments{ \item{document_term_table}{Table of observation count data (rows: -documents, columns: terms. May be a class \code{matrix} or -\code{data.frame} but must be conformable to a matrix of integers, -as verified by \code{\link{check_document_term_table}}.} +documents, columns: terms. 
May be a \code{matrix} or +\code{data.frame} but must be conformable to a matrix of integers.} } \value{ Vector of weights, one for each document, with the average sample diff --git a/man/ecdf_plot.Rd b/man/ecdf_plot.Rd deleted file mode 100644 index 22d2c84f..00000000 --- a/man/ecdf_plot.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{ecdf_plot} -\alias{ecdf_plot} -\title{Produce the posterior distribution ECDF panel for the TS - diagnostic plot of a parameter} -\usage{ -ecdf_plot(x, xlab = "parameter value") -} -\arguments{ -\item{x}{Vector of parameter values drawn from the posterior distribution, -indexed to the iteration by the order of the vector.} - -\item{xlab}{\code{character} value used to label the x axis.} -} -\value{ -\code{NULL}. -} -\description{ -Produce a vanilla ECDF (empirical cumulative distribution - function) plot using \code{ecdf} for the parameter of interest (rho or - eta) as part of \code{\link{TS_diagnostics_plot}}. A horizontal line - is added to show the median of the posterior. -} -\examples{ - ecdf_plot(rnorm(100, 0, 1)) - -} diff --git a/man/est_changepoints.Rd b/man/est_changepoints.Rd deleted file mode 100644 index f1d37798..00000000 --- a/man/est_changepoints.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{est_changepoints} -\alias{est_changepoints} -\title{Use ptMCMC to estimate the distribution of change point locations} -\usage{ -est_changepoints(data, formula, nchangepoints, timename, weights, - control = list()) -} -\arguments{ -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. 
Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output.} - -\item{formula}{\code{\link[stats]{formula}} defining the regression between -relationship the change points. Any -predictor variable included must also be a column in -\code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} - -\item{nchangepoints}{\code{integer} corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current -implementation can reasonably include up to 6 change points. The -number of change points is used to dictate the segmentation of the -time series into chunks fit with separate models dictated by -\code{formula}.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using \code{document_weights}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. 
Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -List of saved data objects from the ptMCMC estimation of - change point locations (unless \code{nchangepoints} is 0, then - \code{NULL} is returned). -} -\description{ -This function executes ptMCMC-based estimation of the - change point location distributions for multinomial Time Series analyses. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - formula <- gamma ~ 1 - nchangepoints <- 1 - control <- TS_control() - data <- data[order(data[,"newmoon"]), ] - rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", - weights, control) -} - -} diff --git a/man/est_regressors.Rd b/man/est_regressors.Rd deleted file mode 100644 index b394b8e4..00000000 --- a/man/est_regressors.Rd +++ /dev/null @@ -1,108 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{est_regressors} -\alias{est_regressors} -\title{Estimate the distribution of regressors, unconditional on the - change point locations} -\usage{ -est_regressors(rho_dist, data, formula, timename, weights, - control = list()) -} -\arguments{ -\item{rho_dist}{List of saved data objects from the ptMCMC estimation of -change point locations (unless \code{nchangepoints} is 0, then -\code{NULL}) returned from \code{\link{est_changepoints}}.} - -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. 
Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output.} - -\item{formula}{\code{\link[stats]{formula}} defining the regression between -relationship the change points. Any -predictor variable included must also be a column in -\code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using \code{document_weights}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -\code{matrix} of draws (rows) from the marginal posteriors of the - coefficients across the segments (columns). -} -\description{ -This function uses the marginal posterior distributions of - the change point locations (estimated by \code{\link{est_changepoints}}) - in combination with the conditional (on the change point locations) - posterior distributions of the regressors (estimated by - \code{\link{multinom_TS}}) to estimate the marginal posterior - distribution of the regressors, unconditional on the change point - locations. 
-} -\details{ -The general approach follows that of Western and Kleykamp - (2004), although we note some important differences. Our regression - models are fit independently for each chunk (segment of time), and - therefore the variance-covariance matrix for the full model - has \code{0} entries for covariances between regressors in different - chunks of the time series. Further, because the regression model here - is a standard (non-hierarchical) softmax (Ripley 1996, Venables and - Ripley 2002, Bishop 2006), there is no error term in the regression - (as there is in the normal model used by Western and Kleykamp 2004), - and so the posterior distribution used here is a multivariate normal, - as opposed to a multivariate t, as used by Western and Kleykamp (2004). -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - formula <- gamma ~ 1 - nchangepoints <- 1 - control <- TS_control() - data <- data[order(data[,"newmoon"]), ] - rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", - weights, control) - eta_dist <- est_regressors(rho_dist, data, formula, "newmoon", weights, - control) -} - -} -\references{ -Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}. - Springer, New York, NY, USA. - - Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. - Cambridge University Press, Cambridge, UK. - - Venables, W. N. and B. D. Ripley. 2002. \emph{Modern and Applied - Statistics with S}. Fourth Edition. Springer, New York, NY, USA. - - Western, B. and M. Kleykamp. 2004. A Bayesian change point model for - historical time series analysis. \emph{Political Analysis} - \strong{12}:354-374. - \href{https://doi.org/10.1093/pan/mph023}{link}. 
-} diff --git a/man/expand_TS.Rd b/man/expand_TS.Rd deleted file mode 100644 index 8a33d62e..00000000 --- a/man/expand_TS.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{expand_TS} -\alias{expand_TS} -\title{Expand the TS models across the factorial combination of - LDA models, formulas, and number of change points} -\usage{ -expand_TS(LDA_models, formulas, nchangepoints) -} -\arguments{ -\item{LDA_models}{List of LDA models (class \code{LDA_set}, produced by -\code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -produced by \code{\link[topicmodels]{LDA}}).} - -\item{formulas}{Vector of \code{\link[stats]{formula}}(s) for the -continuous (non-change point) component of the time series models. Any -predictor variable included in a formula must also be a column in the -\code{document_covariate_table}. Each element (formula) in the vector -is evaluated for each number of change points and each LDA model.} - -\item{nchangepoints}{Vector of \code{integer}s corresponding to the number -of change points to include in the time series models. 0 is a valid input -corresponding to no change points (\emph{i.e.}, a singular time series -model), and the current implementation can reasonably include up to 6 -change points. Each element in the vector is the number of change points -used to segment the data for each formula (entry in \code{formulas}) -component of the TS model, for each selected LDA model.} -} -\value{ -Expanded \code{data.frame} table of the three values (columns) for - each unique model run (rows): [1] the LDA model (indicated - as a numeric element reference to the \code{LDA_models} object), [2] the - regressor formula, and [3] the number of changepoints. -} -\description{ -Expand the completely crossed combination of model inputs: - LDA model results, formulas, and number of change points. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - nchangepoints <- 0:1 - expand_TS(LDA_models, formulas, nchangepoints) -} - -} diff --git a/man/identity_LDA.Rd b/man/identity_LDA.Rd new file mode 100644 index 00000000..7edbe401 --- /dev/null +++ b/man/identity_LDA.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LDA_models.R +\name{identity_LDA} +\alias{identity_LDA} +\title{Identity Linguistic Decomposition Analysis} +\usage{ +identity_LDA(LDA) +} +\arguments{ +\item{LDA}{A prepared (via \code{\link{prepare_LDA}}) LDA model +\code{list}.} +} +\value{ +\code{LDA} \code{list} with components (many of which are + placeholders): + \describe{ + \item{alpha}{parameter estimate.} + \item{beta}{parameter estimate.} + \item{terms}{\code{character} \code{vector} of term names.} + \item{document_topic_matrix}{estimated latent topic compositions.} + \item{test_document_topic_matrix}{estimated latent topic compositions + of the test data (not presently available for usage).} + \item{log_likelihood}{model log likelihood.} + \item{data}{data object used to fit the LDA model.} + \item{data_subset}{number of the data subset from the whole data set.} + \item{topics}{\code{integer} number of topics in the model.} + \item{replicat}{\code{integer} replicate number.} + \item{control}{\code{list} of controls used to fit the model. See + \code{\link{LDA_control}}.} + } +} +\description{ +This function acts as an "identity" model, wherein the + output is functionally the input. This allows for "single-topic" models + that do not actually decompose the data to be included in the model set. 
+} diff --git a/man/ldats_classic.Rd b/man/ldats_classic.Rd new file mode 100644 index 00000000..e6917f78 --- /dev/null +++ b/man/ldats_classic.Rd @@ -0,0 +1,275 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS_methods.R +\name{ldats_classic} +\alias{ldats_classic} +\alias{count_trips} +\alias{swap_chains} +\alias{ldats_classic_control} +\alias{prep_cpts} +\alias{update_cpts} +\alias{prep_temp_sequence} +\alias{prep_saves} +\alias{update_saves} +\alias{process_saves} +\alias{prep_ptMCMC_inputs} +\alias{prep_proposal_dist} +\alias{prep_ids} +\alias{update_ids} +\alias{step_chains} +\alias{propose_step} +\alias{eval_step} +\alias{take_step} +\alias{proposed_step_mods} +\title{Estimate changepoints using the LDATS classic ptMCMC method} +\usage{ +ldats_classic(TS, control = list()) + +count_trips(ids) + +swap_chains(chainsin, inputs, ids) + +ldats_classic_control(ntemps = 6, penultimate_temp = 2^6, + ultimate_temp = 1e+10, q = 0, nit = 10000, magnitude = 12, + burnin = 0, thin_frac = 1, memoise = TRUE, quiet = FALSE) + +prep_cpts(TS) + +update_cpts(cpts, swaps) + +prep_temp_sequence(TS) + +prep_saves(TS) + +update_saves(i, saves, steps, swaps) + +process_saves(saves, TS) + +prep_ptMCMC_inputs(TS) + +prep_proposal_dist(TS) + +prep_ids(TS) + +update_ids(ids, swaps) + +step_chains(TS, i, cpts, inputs) + +propose_step(TS, i, cpts, inputs) + +eval_step(i, cpts, prop_step, inputs) + +take_step(cpts, prop_step, accept_step) + +proposed_step_mods(TS, prop_changepts, inputs) +} +\arguments{ +\item{TS}{\code{list} time series model object.} + +\item{control}{A \code{list} of parameters to control the fitting of the +time series model via the LDATS classic ptMCMC method. 
Values not input +assume defaults set by \code{\link{ldats_classic_control}}.} + +\item{ids}{\code{vector} (for \code{update_ids}, \code{swap_chains}) +of the existing chain ids or \code{matrix} (rows: chains, +columns: iterations; for \code{count_trips}) +of identifiers of the particles in each chain for +each iteration of the ptMCMC algorithm.} + +\item{chainsin}{Chain configuration to be evaluated for swapping.} + +\item{inputs}{\code{list} of static inputs for use within the ptMCMC +algorithm.} + +\item{ntemps}{\code{integer} number of temperatures (chains) to use in the +ptMCMC algorithm.} + +\item{penultimate_temp}{Penultimate temperature in the ptMCMC sequence.} + +\item{ultimate_temp}{Ultimate temperature in the ptMCMC sequence.} + +\item{q}{Exponent controlling the ptMCMC temperature sequence from the +focal chain (reference with temperature = 1) to the penultimate chain. 0 +(default) implies a geometric sequence. 1 implies squaring before +exponentiating.} + +\item{nit}{\code{integer} number of iterations (steps) used in the ptMCMC +algorithm.} + +\item{magnitude}{Average magnitude (defining a geometric distribution) +for the proposed step size in the ptMCMC algorithm.} + +\item{burnin}{\code{integer} number of iterations to remove from the +beginning of the ptMCMC algorithm.} + +\item{thin_frac}{Fraction of iterations to retain, must be \eqn{(0, 1]}, +and the default value of 1 represents no thinning.} + +\item{memoise}{\code{logical} indicator of whether the response +function should be memoised (via \code{\link[memoise]{memoise}}).} + +\item{quiet}{\code{logical} indicator of whether the model should run +quietly (if \code{FALSE}, a progress bar and notifications are printed).} + +\item{cpts}{\code{matrix} of change point locations across chains.} + +\item{swaps}{Chain configuration after among-temperature swaps.} + +\item{i}{\code{integer} iteration index.} + +\item{saves}{The existing list of saved data objects.} + +\item{steps}{Chain configuration 
after within-temperature steps.} + +\item{prop_step}{Proposed step output from \code{propose_step}.} + +\item{accept_step}{\code{logical} indicator of acceptance of each chain's +proposed step.} + +\item{prop_changepts}{\code{matrix} of proposed change points across +chains.} +} +\value{ +\code{ldats_classic}: \code{list} of changepoint locations, log + likelihoods, and model diagnostics. \cr \cr + \code{ldats_classic_control}: \code{list} of named control elements + for use in \code{\link{ldats_classic}}. \cr \cr + \code{prep_ptMCMC_inputs}: \code{list} containing the static + inputs for use within the ptMCMC algorithm for estimating change + points. \cr \cr + \code{prep_temp_sequence}: \code{vector} of temperatures. + \code{prep_proposal_dist}: \code{list} of two \code{matrix} elements: + [1] the size of the proposed step for each iteration of each chain and + [2] the identity of the change point location to be shifted by the + step for each iteration of each chain. + \code{prep_ids},\code{update_ids}: \code{vector} of ids. \cr \cr + \code{prep_cpts},\code{update_cpts}: \code{list} of [1] \code{matrix} + of change points (rows) for each temperature (columns) and [2] + \code{vector} of log-likelihood values for each of the chains. \cr \cr + \code{prep_saves},\code{update_saves},\code{process_saves}: \code{list} + of ptMCMC objects: change points (\code{$cpts}), + log-likelihoods (\code{$lls}), chain ids (\code{$ids}), + step acceptances (\code{$step_accepts}), and swap acceptances + (\code{$swap_accepts}). \cr \cr + \code{step_chains}: \code{list} of change points, log-likelihoods, + and logical indicators of acceptance for each chain. \cr \cr + \code{propose_step}: \code{list} of change points and + log-likelihood values for the proposal. \cr \cr + \code{eval_step}: \code{logical} vector indicating if each + chain's proposal was accepted. 
\cr \cr + \code{take_step}: \code{list} of change points, log-likelihoods, + and logical indicators of acceptance for each chain. \cr \cr + \code{swap_chains}: \code{list} of updated change points, + log-likelihoods, and chain ids, as well as a \code{vector} of + acceptance indicators for each swap. \cr \cr + \code{proposed_step_mods}: \code{list} of models associated with the + proposed step, with an element for each chain. \cr \cr + \code{count_trips}: \code{list} of [1] \code{vector} of within particle + trip counts (\code{$trip_counts}), and [2] \code{vector} of + within-particle average trip rates (\code{$trip_rates}). +} +\description{ +Uses the LDATS classic parallel tempering Markov Chain Monte + Carlo (ptMCMC) methods (Earl and Deem 2005) to fit a changepoint + model, following Christensen \emph{et al.} (2018). \cr \cr + \code{ldats_classic} is the top-level function for the sampler. \cr \cr + \code{ldats_classic_control} defines and creates a control \code{list} + for use with \code{\link{ldats_classic}}. + \code{prep_ptMCMC_inputs} packages the static inputs (controls and + data structures) used by the ptMCMC algorithm in the context of + estimating change points. + \code{prep_proposal_dist} prep-calculates the proposal distribution + for the ptMCMC algorithm in order to decrease computation time. + The proposal distribution is a joint of three distributions: + [1] a multinomial distribution selecting among the change points within + the chain, [2] a binomial distribution selecting the direction of the + step of the change point (earlier or later in the time series), and + [3] a geometric distribution selecting the magnitude of the step. + \code{prep_ids} creates the active vector of identities (ids) for each + of the chains in the ptMCMC algorithm, which are used to track trips + of the particles among chains. \cr \cr + \code{update_ids} updates the active vector of identities (ids) for + each of the chains in the ptMCMC algorithm after each iteration. 
+ \cr \cr + \code{prep_cpts} initializes each chain using a draw from the available + times (i.e. assuming a uniform prior), the best fit (by likelihood) + draw is put in the focal chain with each subsequently worse fit placed + into the subsequently hotter chain. \cr \cr + \code{update_cpts} updates the change points after every iteration in + the ptMCMC algorithm. \cr \cr + \code{prep_saves} creates the data structure used to save the + output from each iteration of the ptMCMC algorithm. \cr \cr + \code{update_saves} adds to the data structure after each iteration. + \cr \cr + \code{process_saves} processes (burn-in iterations are dropped and the + remaining iterations are thinned) the saved data objects after the + ptMCMC is complete. \cr \cr + \code{prep_temp_sequence} creates the series of temperatures used in the + ptMCMC algorithm. \cr \cr + \code{step_chains} steps the chains forward one iteration + of the within-chain component of the ptMCMC algorithm. \cr \cr + \code{propose_step} makes the proposal for the next step. \cr \cr + \code{eval_step} evaluates the proposal. \cr \cr + \code{take_step} updates the configuration. \cr \cr + \code{proposed_step_mods} proposes the models for all chains in a given + step of the sampler. \cr \cr + \code{swap_chains} handles the among-chain swapping based on + temperatures and likelihood differentials. \cr \cr + \code{count_trips} counts the full trips (from one extreme temperature + chain to the other and back again; Katzgraber \emph{et al.} 2006) for + each of the ptMCMC particles, as identified by their id on + initialization. +} +\details{ +For each iteration of the ptMCMC algorithm, all of the chains + have the potential to take a step. 
The possible step is proposed under + a proposal distribution (here for change points we use a symmetric + geometric distribution), the proposed step is then evaluated and either + accepted or not (following the Metropolis-Hastings rule; Metropolis, + \emph{et al.} 1953, Hastings 1970, Gupta \emph{et al.} 2018), and then + accordingly taken or not (the configurations are updated). \cr \cr + The ptMCMC algorithm couples the chains (which are + taking their own walks on the distribution surface) through "swaps", + where neighboring chains exchange configurations (Geyer 1991, Falcioni + and Deem 1999) following the Metropolis criterion (Metropolis + \emph{et al.} 1953). This allows them to share information and search the + surface in combination (Earl and Deem 2005). +} +\references{ +Christensen, E., D. J. Harris, and S. K. M. Ernest. 2018. + Long-term community change through multiple rapid transitions in a + desert rodent community. \emph{Ecology} \strong{99}:1523-1529. + \href{https://doi.org/10.1002/ecy.2373}{link}. + + Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory, + applications, and new perspectives. \emph{Physical Chemistry Chemical + Physics} \strong{7}: 3910-3916. + \href{https://doi.org/10.1039/B509983H}{link}. + + Falcioni, M. and M. W. Deem. 1999. A biased Monte Carlo scheme for + zeolite structure solution. \emph{Journal of Chemical Physics} + \strong{110}: 1754-1766. + \href{https://aip.scitation.org/doi/10.1063/1.477812}{link}. + + Geyer, C. J. 1991. Markov Chain Monte Carlo maximum likelihood. \emph{In + Computing Science and Statistics: Proceedings of the 23rd Symposium on + the Interface}. pp 156-163. American Statistical Association, New York, + USA. \href{https://www.stat.umn.edu/geyer/f05/8931/c.pdf}{link}. + + Gupta, S., L. Hainsworth, J. S. Hogg, R. E. C. Lee, and J. R. Faeder. + 2018. Evaluation of parallel tempering to accelerate Bayesian parameter + estimation in systems biology. 
+ \href{https://arxiv.org/abs/1801.09831}{link}. + + Hastings, W. K. 1970. Monte Carlo sampling methods using Markov Chains + and their applications. \emph{Biometrika} \strong{57}:97-109. + \href{https://doi.org/10.2307/2334940}{link}. + + Katzgraber, H. G., S. Trebst, D. A. Huse, and M. Troyer. 2006. + Feedback-optimized parallel tempering Monte Carlo. \emph{Journal of + Statistical Mechanics: Theory and Experiment} \strong{3}:P03018 + \href{https://bit.ly/2LICGXh}{link}. + + Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E. + Teller. 1953. Equations of state calculations by fast computing machines. + \emph{Journal of Chemical Physics} \strong{21}: 1087-1092. + \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. +} diff --git a/man/list_depth.Rd b/man/list_depth.Rd new file mode 100644 index 00000000..19b052c4 --- /dev/null +++ b/man/list_depth.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{list_depth} +\alias{list_depth} +\title{Determine the depth of a list} +\usage{ +list_depth(xlist) +} +\arguments{ +\item{xlist}{Focal input \code{list}.} +} +\value{ +\code{integer} value of the depth of the list. +} +\description{ +Evaluate an input for the depth of its nesting. +} +\details{ +If \code{xlist = list()}, then technically the input value is a + list, but is empty (of length \code{0}), so depth is returned as \code{0}. +} +\examples{ + list_depth("a") + list_depth(list()) + list_depth(list("a")) + list_depth(list(list("a"))) + +} diff --git a/man/logLik.LDA.Rd b/man/logLik.LDA.Rd new file mode 100644 index 00000000..2541a012 --- /dev/null +++ b/man/logLik.LDA.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LDA.R +\name{logLik.LDA} +\alias{logLik.LDA} +\title{Determine the log likelihood of a Linguistic Decomposition Analysis + model} +\usage{ +\method{logLik}{LDA}(object, ...) 
+} +\arguments{ +\item{object}{Class \code{LDA} object to be evaluated.} + +\item{...}{Not used, simply included to maintain method compatibility.} +} +\value{ +Log likelihood of the model \code{logLik}, also with \code{df} + (degrees of freedom) and \code{nobs} (number of observations) values. +} +\description{ +Convenience function to extract and format the log likelihood + of a \code{LDA}-class object fit by \code{\link{LDA_call}}. +} diff --git a/man/logLik.LDA_VEM.Rd b/man/logLik.LDA_VEM.Rd deleted file mode 100644 index a0744fd1..00000000 --- a/man/logLik.LDA_VEM.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{logLik.LDA_VEM} -\alias{logLik.LDA_VEM} -\title{Calculate the log likelihood of a VEM LDA model fit} -\usage{ -\method{logLik}{LDA_VEM}(object, ...) -} -\arguments{ -\item{object}{A \code{LDA_VEM}-class object.} - -\item{...}{Not used, simply included to maintain method compatibility.} -} -\value{ -Log likelihood of the model \code{logLik}, also with \code{df} - (degrees of freedom) and \code{nobs} (number of observations) values. -} -\description{ -Imported but updated calculations from topicmodels package, as - applied to Latent Dirichlet Allocation fit with Variational Expectation - Maximization via \code{\link[topicmodels]{LDA}}. -} -\details{ -The number of degrees of freedom is 1 (for alpha) plus the number - of entries in the document-topic matrix. The number of observations is - the number of entries in the document-term matrix. -} -\examples{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 2) - logLik(r_LDA[[1]]) - -} -\references{ -Buntine, W. 2002. Variational extensions to EM and multinomial PCA. - \emph{European Conference on Machine Learning, Lecture Notes in Computer - Science} \strong{2430}:23-34. \href{https://bit.ly/327sltH}{link}. - - Grun B. and K. Hornik. 2011. 
topicmodels: An R Package for Fitting Topic - Models. \emph{Journal of Statistical Software} \strong{40}:13. - \href{https://www.jstatsoft.org/article/view/v040i13}{link}. - - Hoffman, M. D., D. M. Blei, and F. Bach. 2010. Online learning for - latent Dirichlet allocation. \emph{Advances in Neural Information - Processing Systems} \strong{23}:856-864. - \href{https://bit.ly/2LEr5sb}{link}. -} diff --git a/man/logLik.TS.Rd b/man/logLik.TS.Rd new file mode 100644 index 00000000..3977fbea --- /dev/null +++ b/man/logLik.TS.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS.R +\name{logLik.TS} +\alias{logLik.TS} +\title{Determine the log likelihood of a Time Series model} +\usage{ +\method{logLik}{TS}(object, ...) +} +\arguments{ +\item{object}{Class \code{TS} object to be evaluated.} + +\item{...}{Not used, simply included to maintain method compatibility.} +} +\value{ +Log likelihood of the model \code{logLik}, also with \code{df} + (degrees of freedom) and \code{nobs} (number of observations) values. +} +\description{ +Convenience function to extract and format the log likelihood + of a \code{TS}-class object fit by \code{\link{sequential_TS}}. +} diff --git a/man/logLik.TS_fit.Rd b/man/logLik.TS_fit.Rd index 6eb289c9..cb5ba1b4 100644 --- a/man/logLik.TS_fit.Rd +++ b/man/logLik.TS_fit.Rd @@ -1,35 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R +% Please edit documentation in R/TS_responses.R \name{logLik.TS_fit} \alias{logLik.TS_fit} -\title{Determine the log likelihood of a Time Series model} +\title{Log likelihood of a TS model (as a TS_fit-class list)} \usage{ \method{logLik}{TS_fit}(object, ...) 
} \arguments{ -\item{object}{Class \code{TS_fit} object to be evaluated.} +\item{object}{A \code{TS_fit}-class object.} \item{...}{Not used, simply included to maintain method compatibility.} } \value{ -Log likelihood of the model \code{logLik}, also with \code{df} - (degrees of freedom) and \code{nobs} (number of observations) values. +Log likelihood of the model, as class \code{logLik}, with + attributes \code{df} (degrees of freedom) and \code{nobs} (the number of + weighted observations, accounting for size differences among documents). } \description{ -Convenience function to extract and format the log likelihood - of a \code{TS_fit}-class object fit by \code{\link{multinom_TS}}. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - logLik(TSmod) -} - +Convenience function to simply extract the \code{logLik} + element (and \code{df} and \code{nobs}) from a \code{TS_fit} + object fit by a \code{_TS} function. } diff --git a/man/logLik.mlm.Rd b/man/logLik.mlm.Rd new file mode 100644 index 00000000..40e76c7c --- /dev/null +++ b/man/logLik.mlm.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS_responses.R +\name{logLik.mlm} +\alias{logLik.mlm} +\title{Determine the log likelihood of a multivariate linear regression + model} +\usage{ +\method{logLik}{mlm}(object, ...) +} +\arguments{ +\item{object}{multivariate linear regression model fit with + \code{\link{lm}}. 
+of class \code{mlm}} + +\item{...}{not used.} + +\item{object}{multivariate linear regression model fit using +\code{\link[stats]{lm}} and of class \code{mlm}} + +\item{...}{Not used, simply included to maintain method compatibility.} +} +\value{ +log-lik at (unrestricted) maximum with df as attribute. + +Log likelihood of the model \code{logLik}, also with \code{df} + (degrees of freedom) and \code{nobs} (number of observations) values. +} +\description{ +Convenience function to extract and format the log likelihood + of a multivariate linear regression, such as fitted by the + \code{\link{simplex_TS}} models. +} +\details{ +Log-likelihood of multivariate linear regression model + +Adapted from the function contained in the old R-Forge Atools + package (\href{https://rdrr.io/rforge/Atools/src/R/logLik.mlm.R}{see}). +} +\examples{ +y <- cbind(rnorm(10), rnorm(10)); x <- 1:10; +mod <- lm(y~x) +logLik(mod) +} +\author{ +Andi Boeck +} diff --git a/man/logLik.multinom_TS_fit.Rd b/man/logLik.multinom_TS_fit.Rd deleted file mode 100644 index e5f44187..00000000 --- a/man/logLik.multinom_TS_fit.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R -\name{logLik.multinom_TS_fit} -\alias{logLik.multinom_TS_fit} -\title{Log likelihood of a multinomial TS model} -\usage{ -\method{logLik}{multinom_TS_fit}(object, ...) -} -\arguments{ -\item{object}{A \code{multinom_TS_fit}-class object.} - -\item{...}{Not used, simply included to maintain method compatibility.} -} -\value{ -Log likelihood of the model, as class \code{logLik}, with - attributes \code{df} (degrees of freedom) and \code{nobs} (the number of - weighted observations, accounting for size differences among documents). -} -\description{ -Convenience function to simply extract the \code{logLik} - element (and \code{df} and \code{nobs}) from a \code{multinom_TS_fit} - object fit by \code{\link{multinom_TS}}. 
Extends - \code{\link[stats]{logLik}} from \code{\link[nnet]{multinom}} to - \code{multinom_TS_fit} objects. -} -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - weights <- document_weights(dtt) - mts <- multinom_TS(dct, formula = gamma ~ 1, changepoints = c(20,50), - timename = "newmoon", weights = weights) - logLik(mts) - -} diff --git a/man/mirror_vcov.Rd b/man/mirror_vcov.Rd index b5010a61..c6f2315d 100644 --- a/man/mirror_vcov.Rd +++ b/man/mirror_vcov.Rd @@ -15,7 +15,8 @@ Properly symmetric variance covariance \code{matrix}. } \description{ A wrapper on \code{\link[stats]{vcov}} to produce a symmetric - matrix. If the default matrix returned by \code{\link[stats]{vcov}} is + matrix. \cr + If the default matrix returned by \code{\link[stats]{vcov}} is symmetric it is returned simply. If it is not, in fact, symmetric (as occurs occasionally with \code{\link[nnet]{multinom}} applied to proportions), the matrix is made symmetric by averaging the lower and diff --git a/man/multinom_TS.Rd b/man/multinom_TS.Rd index 1598259c..ad4b406a 100644 --- a/man/multinom_TS.Rd +++ b/man/multinom_TS.Rd @@ -1,37 +1,38 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R +% Please edit documentation in R/TS_responses.R \name{multinom_TS} \alias{multinom_TS} -\alias{check_multinom_TS_inputs} +\alias{multinom_TS_chunk} +\alias{multinom_TS_control} \title{Fit a multinomial change point Time Series model} \usage{ multinom_TS(data, formula, changepoints = NULL, timename = "time", weights = NULL, control = list()) -check_multinom_TS_inputs(data, formula = gamma ~ 1, - changepoints = NULL, timename = "time", weights = NULL, - control = list()) +multinom_TS_chunk(data, formula, chunk, timename = "time", + weights = NULL, control = list()) + +multinom_TS_control(lambda = 0, quiet = FALSE, ...) 
} \arguments{ \item{data}{\code{data.frame} including [1] the time variable (indicated in \code{timename}), [2] the predictor variables (required by \code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the +\code{formula}). \cr \cr +Note that the response variables should be formatted as a +\code{data.frame} object named as indicated by the \code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output. See \code{Examples}.} +for a standard TS analysis on LDA output. \cr \cr +See \code{Examples}.} \item{formula}{\code{\link[stats]{formula}} defining the regression between relationship the change points. Any predictor variable included must also be a column in \code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} +columns in \code{data}.} \item{changepoints}{Numeric vector indicating locations of the change -points. Must be conformable to \code{integer} values. Validity -checked by \code{\link{check_changepoints}} and -\code{\link{verify_changepoint_locations}}.} +points. Must be conformable to \code{integer} values.} \item{timename}{\code{character} element indicating the time variable used in the time series. Defaults to \code{"time"}. The variable must be @@ -43,7 +44,7 @@ timestep being 1 day, which is often not desired behavior.} document. Defaults to \code{NULL}, translating to an equal weight for each document. 
When using \code{multinom_TS} in a standard LDATS analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of +as the result of \code{\link{topicmodels_LDA}} is a matrix of proportions, which does not account for size differences among documents. For most models, a scaling of the weights (so that the average is 1) is most appropriate, and this is accomplished using @@ -53,38 +54,47 @@ most appropriate, and this is accomplished using Time Series model including the parallel tempering Markov Chain Monte Carlo (ptMCMC) controls. Values not input assume defaults set by \code{\link{TS_control}}.} + +\item{chunk}{Length-2 vector of times: [1] \code{start}, the start time +for the chunk and [2] \code{end}, the end time for the chunk.} + +\item{lambda}{\code{numeric} "weight" decay term used to set the prior +on the regressors within each chunk-level model. Defaults to 0, +corresponding to a fully vague prior.} + +\item{quiet}{\code{logical} indicator of whether the model should run +quietly (if \code{FALSE}, a progress bar and notifications are printed).} + +\item{...}{Not passed along to the output, rather included to allow for +automated removal of unneeded controls.} } \value{ -\code{multinom_TS}: Object of class \code{multinom_TS_fit}, - which is a list of [1] - chunk-level model fits (\code{"chunk models"}), [2] the total log - likelihood combined across all chunks (\code{"logLik"}), and [3] a - \code{data.frame} of chunk beginning and ending times (\code{"logLik"} - with columns \code{"start"} and \code{"end"}). \cr \cr - \code{check_multinom_TS_inputs}: an error message is thrown if any - input is improper, otherwise \code{NULL}. 
+\code{multinom_TS}: Object of class \code{TS_fit}, which is a + \code{list} of [1] chunk-level model fits (\code{"chunk models"}), + [2] the total log likelihood combined across all chunks + (\code{"logLik"}), and [3] a \code{data.frame} of chunk beginning and + ending times (with columns \code{"start"} and \code{"end"}). \cr \cr + \code{multinom_TS_chunk}: fitted model object for the chunk, + of classes \code{multinom} and \code{nnet}. \cr \cr + \code{multinom_TS_control}: \code{list}, with named elements + corresponding to response function controls. } \description{ -Fit a set of multinomial regression models (via - \code{\link[nnet]{multinom}}, Venables and Ripley 2002) to a time series - of data divided into multiple segments (a.k.a. chunks) based on given - locations for a set of change points. \cr \cr - \code{check_multinom_TS_inputs} checks that the inputs to - \code{multinom_TS} are of proper classes for an analysis. -} -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - weights <- document_weights(dtt) - check_multinom_TS_inputs(dct, timename = "newmoon") - mts <- multinom_TS(dct, formula = gamma ~ 1, changepoints = c(20,50), - timename = "newmoon", weights = weights) - +\code{multinom_TS} fits a set of multinomial regression models (via + \code{\link[nnet]{multinom}}, Venables and Ripley 2002) to a time + series of data divided into multiple segments (a.k.a. chunks) based on + given locations for a set of change points. \cr \cr + \code{multinom_TS_chunk} fits a multinomial regression model (via + \code{\link[nnet]{multinom}}, Ripley 1996, Venables and Ripley 2002) + to a defined chunk of time (a.k.a. segment) + \code{[chunk$start, chunk$end]} within a time series. \cr \cr + \code{multinom_TS_control} defines and creates the control \code{list} + for fitting. } \references{ -Venables, W. N. and B. D. Ripley. 2002. 
\emph{Modern and Applied +Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}. + Cambridge University Press, Cambridge, UK. + + Venables, W. N. and B. D. Ripley. 2002. \emph{Modern Applied Statistics with S}. Fourth Edition. Springer, New York, NY, USA. } diff --git a/man/multinom_TS_chunk.Rd b/man/multinom_TS_chunk.Rd deleted file mode 100644 index a7147949..00000000 --- a/man/multinom_TS_chunk.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R -\name{multinom_TS_chunk} -\alias{multinom_TS_chunk} -\title{Fit a multinomial Time Series model chunk} -\usage{ -multinom_TS_chunk(data, formula, chunk, timename = "time", - weights = NULL, control = list()) -} -\arguments{ -\item{data}{Class \code{data.frame} object including the predictor and -response variables.} - -\item{formula}{Formula as a \code{\link[stats]{formula}} or -\code{\link[base]{character}} object describing the chunk.} - -\item{chunk}{Length-2 vector of times: [1] \code{start}, the start time -for the chunk and [2] \code{end}, the end time for the chunk.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series. Defaults to \code{"time"}. The variable must be -integer-conformable or a \code{Date}. If the variable named -is a \code{Date}, the input is converted to an integer, resulting in the -timestep being 1 day, which is often not desired behavior.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. 
-For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using \code{document_weights}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -Fitted model object for the chunk, of classes \code{multinom} and - \code{nnet}. -} -\description{ -Fit a multinomial regression model (via - \code{\link[nnet]{multinom}}, Ripley 1996, Venables and Ripley 2002) - to a defined chunk of time (a.k.a. segment) - \code{[chunk$start, chunk$end]} within a time series. -} -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - weights <- document_weights(dtt) - chunk <- c(start = 0, end = 100) - mtsc <- multinom_TS_chunk(dct, formula = gamma ~ 1, chunk = chunk, - timename = "newmoon", weights = weights) - -} -\references{ -Ripley, B. D. 1996. Pattern Recognition and Neural Networks. Cambridge. - - Venables, W. N. and B. D. Ripley. 2002. Modern Applied Statistics with S. - Fourth edition. Springer. -} diff --git a/man/package_LDA_TS.Rd b/man/package_LDA_TS.Rd deleted file mode 100644 index 2cb9418b..00000000 --- a/man/package_LDA_TS.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA_TS.R -\name{package_LDA_TS} -\alias{package_LDA_TS} -\title{Package the output of LDA_TS} -\usage{ -package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs) -} -\arguments{ -\item{LDAs}{List (class: \code{LDA_set}) of LDA models (class: -\code{LDA}), as returned by \code{\link{LDA_set}}.} - -\item{sel_LDA}{A reduced version of \code{LDAs} that only includes the -LDA model(s) selected by \code{\link{select_LDA}}. 
Still should be of -class \code{LDA_set}.} - -\item{TSs}{Class \code{TS_on_LDA} list of results from \code{\link{TS}} -applied for each model on each LDA model input, as returned by -\code{\link{TS_on_LDA}}.} - -\item{sel_TSs}{A reduced version of \code{TSs} (of class \code{TS_fit}) -that only includes the TS model chosen via \code{\link{select_TS}}.} -} -\value{ -Class \code{LDA_TS}-class object including all fitted models and - selected models specifically, ready to be returned from - \code{\link{LDA_TS}}. -} -\description{ -Combine the objects returned by \code{\link{LDA_set}}, - \code{\link{select_LDA}}, \code{\link{TS_on_LDA}}, and - \code{\link{select_TS}}, name them as elements of the list, and - set the class of the list as \code{LDA_TS}, for the return from - \code{\link{LDA_TS}}. -} -\examples{ -\donttest{ - data(rodents) - data <- rodents - control <- LDA_TS_control() - dtt <- data$document_term_table - dct <- data$document_covariate_table - weights <- document_weights(dtt) - LDAs <- LDA_set(dtt, 2, 1, control$LDA_set_control) - sel_LDA <- select_LDA(LDAs, control$LDA_set_control) - TSs <- TS_on_LDA(sel_LDA, dct, ~1, 1, "newmoon", weights, - control$TS_control) - sel_TSs <- select_TS(TSs, control$TS_control) - package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs) -} - -} diff --git a/man/package_LDA_set.Rd b/man/package_LDA_set.Rd deleted file mode 100644 index 5ecf98d2..00000000 --- a/man/package_LDA_set.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{package_LDA_set} -\alias{package_LDA_set} -\title{Package the output from LDA_set} -\usage{ -package_LDA_set(mods, mod_topics, mod_seeds) -} -\arguments{ -\item{mods}{Fitted models returned from \code{\link[topicmodels]{LDA}}.} - -\item{mod_topics}{Vector of \code{integer} values corresponding to the -number of topics in each model.} - -\item{mod_seeds}{Vector of \code{integer} values corresponding to the -seed used for each model.} -} 
-\value{ -\code{lis} (class: \code{LDA_set}) of LDA models (class: - \code{LDA_VEM}). -} -\description{ -Name the elements (LDA models) and set the class - (\code{LDA_set}) of the models returned by \code{\link{LDA_set}}. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - topics <- 2 - nseeds <- 2 - control <- LDA_set_control() - mod_topics <- rep(topics, each = length(seq(2, nseeds * 2, 2))) - iseed <- control$iseed - mod_seeds <- rep(seq(iseed, iseed + (nseeds - 1)* 2, 2), length(topics)) - nmods <- length(mod_topics) - mods <- vector("list", length = nmods) - for (i in 1:nmods){ - LDA_msg(mod_topics[i], mod_seeds[i], control) - control_i <- prep_LDA_control(seed = mod_seeds[i], control = control) - mods[[i]] <- topicmodels::LDA(document_term_table, k = mod_topics[i], - control = control_i) - } - package_LDA_set(mods, mod_topics, mod_seeds) -} - -} diff --git a/man/package_TS.Rd b/man/package_TS.Rd deleted file mode 100644 index c346fdc1..00000000 --- a/man/package_TS.Rd +++ /dev/null @@ -1,116 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{package_TS} -\alias{package_TS} -\title{Summarize the Time Series model} -\usage{ -package_TS(data, formula, timename, weights, control, rho_dist, eta_dist) -} -\arguments{ -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output.} - -\item{formula}{\code{\link[stats]{formula}} defining the regression between -relationship the change points. 
Any -predictor variable included must also be a column in -\code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using \code{document_weights}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. 
Values not input assume defaults set by -\code{\link{TS_control}}.} - -\item{rho_dist}{List of saved data objects from the ptMCMC estimation of -change point locations returned by \code{\link{est_changepoints}} -(unless \code{nchangepoints} is 0, then \code{NULL}).} - -\item{eta_dist}{Matrix of draws (rows) from the marginal posteriors of the -coefficients across the segments (columns), as estimated by -\code{\link{est_regressors}}.} -} -\value{ -\code{TS_fit}-class list containing the following elements, many of - which are hidden for \code{print}ing, but are accessible: - \describe{ - \item{data}{\code{data} input to the function.} - \item{formula}{\code{\link[stats]{formula}} input to the function.} - \item{nchangepoints}{\code{nchangepoints} input to the function.} - \item{weights}{\code{weights} input to the function.} - \item{timename}{\code{timename} input to the function.} - \item{control}{\code{control} input to the function.} - \item{lls}{Iteration-by-iteration - \link[=logLik.multinom_TS_fit]{logLik} values for the - full time series fit by \code{\link{multinom_TS}}.} - \item{rhos}{Iteration-by-iteration change point estimates from - \code{\link{est_changepoints}}.} - \item{etas}{Iteration-by-iteration marginal regressor estimates from - \code{\link{est_regressors}}, which have been - unconditioned with respect to the change point locations.} - \item{ptMCMC_diagnostics}{ptMCMC diagnostics, - see \code{\link{diagnose_ptMCMC}}} - \item{rho_summary}{Summary table describing \code{rhos} (the change - point locations), - see \code{\link{summarize_rhos}}.} - \item{rho_vcov}{Variance-covariance matrix for the estimates of - \code{rhos} (the change point locations), see - \code{\link{measure_rho_vcov}}.} - \item{eta_summary}{Summary table describing \code{ets} (the - regressors), - see \code{\link{summarize_etas}}.} - \item{eta_vcov}{Variance-covariance matrix for the estimates of - \code{etas} (the regressors), see - \code{\link{measure_eta_vcov}}.} - 
\item{logLik}{Across-iteration average of log-likelihoods - (\code{lls}).} - \item{nparams}{Total number of parameters in the full model, - including the change point locations and regressors.} - \item{AIC}{Penalized negative log-likelihood, based on - \code{logLik} and \code{nparams}.} - } -} -\description{ -Calculate relevant summaries for the run of a Time Series - model within \code{\link{TS}} and package the output as a - \code{TS_fit}-class object. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - formula <- gamma ~ 1 - nchangepoints <- 1 - control <- TS_control() - data <- data[order(data[,"newmoon"]), ] - rho_dist <- est_changepoints(data, formula, nchangepoints, "newmoon", - weights, control) - eta_dist <- est_regressors(rho_dist, data, formula, "newmoon", weights, - control) - package_TS(data, formula, "newmoon", weights, control, rho_dist, - eta_dist) -} - -} diff --git a/man/package_TS_on_LDA.Rd b/man/package_TS_on_LDA.Rd deleted file mode 100644 index a8d92ac8..00000000 --- a/man/package_TS_on_LDA.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{package_TS_on_LDA} -\alias{package_TS_on_LDA} -\title{Package the output of TS_on_LDA} -\usage{ -package_TS_on_LDA(TSmods, LDA_models, models) -} -\arguments{ -\item{TSmods}{list of results from \code{\link{TS}} applied for each model -on each LDA model input.} - -\item{LDA_models}{List of LDA models (class \code{LDA_set}, produced by -\code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -produced by \code{\link[topicmodels]{LDA}}).} - -\item{models}{\code{data.frame} object returned from -\code{\link{expand_TS}} that contains 
the combinations of LDA models, -and formulas and nchangepoints used in the TS models.} -} -\value{ -Class \code{TS_on_LDA} list of results from \code{\link{TS}} - applied for each model on each LDA model input. -} -\description{ -Set the class and name the elements of the results list - returned from applying \code{\link{TS}} to the combination of TS models - requested for the LDA model(s) input. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - mods <- expand_TS(LDA_models, c(~ 1, ~ newmoon), 0:1) - nmods <- nrow(mods) - TSmods <- vector("list", nmods) - for(i in 1:nmods){ - formula_i <- mods$formula[[i]] - nchangepoints_i <- mods$nchangepoints[i] - data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, i) - TSmods[[i]] <- TS(data_i, formula_i, nchangepoints_i, "newmoon", - weights, TS_control()) - } - package_TS_on_LDA(TSmods, LDA_models, mods) -} - -} diff --git a/man/package_chunk_fits.Rd b/man/package_chunk_fits.Rd index b9516cf7..32c4b117 100644 --- a/man/package_chunk_fits.Rd +++ b/man/package_chunk_fits.Rd @@ -1,9 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R +% Please edit documentation in R/TS_responses.R \name{package_chunk_fits} \alias{package_chunk_fits} -\title{Package the output of the chunk-level multinomial models into a - multinom_TS_fit list} +\title{Package the output of the chunk-level TS models into a TS_fit list} \usage{ package_chunk_fits(chunks, fits) } @@ -15,36 +14,14 @@ chunk (row).} the memoised version of \code{\link{multinom_TS_chunk}}.} } \value{ -Object of class \code{multinom_TS_fit}, which is a list of [1] +Object of class \code{TS_fit}, which is a list of [1] chunk-level model fits, [2] the total log 
likelihood combined across all chunks, and [3] the chunk time data table. } \description{ Takes the list of fitted chunk-level models returned from - \code{TS_chunk_memo} (the memoised version of - \code{\link{multinom_TS_chunk}} and packages it as a - \code{multinom_TS_fit} object. This involves naming the model fits based + a \code{_TS_chunk} function and packages it as a + \code{TS_fit} object. This involves naming the model fits based on the chunk time windows, combining the log likelihood values across the chunks, and setting the class of the output object. } -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - weights <- document_weights(dtt) - formula <- gamma ~ 1 - changepoints <- c(20,50) - timename <- "newmoon" - TS_chunk_memo <- memoise_fun(multinom_TS_chunk, TRUE) - chunks <- prep_chunks(dct, changepoints, timename) - nchunks <- nrow(chunks) - fits <- vector("list", length = nchunks) - for (i in 1:nchunks){ - fits[[i]] <- TS_chunk_memo(dct, formula, chunks[i, ], timename, - weights, TS_control()) - } - package_chunk_fits(chunks, fits) - -} diff --git a/man/plot.LDA_VEM.Rd b/man/plot.LDA.Rd similarity index 57% rename from man/plot.LDA_VEM.Rd rename to man/plot.LDA.Rd index 6bda8380..9a04badf 100644 --- a/man/plot.LDA_VEM.Rd +++ b/man/plot.LDA.Rd @@ -1,22 +1,28 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/LDA_plots.R -\name{plot.LDA_VEM} -\alias{plot.LDA_VEM} +\name{plot.LDA} +\alias{plot.LDA} +\alias{plot.LDA_set} \alias{LDA_plot_top_panel} \alias{LDA_plot_bottom_panel} +\alias{set_LDA_plot_colors} \title{Plot the results of an LDATS LDA model} \usage{ -\method{plot}{LDA_VEM}(x, ..., xtime = NULL, xname = NULL, - cols = NULL, option = "C", alpha = 0.8, LDATS = FALSE) +\method{plot}{LDA}(x, ..., xtime = NULL, xname = NULL, cols = NULL, + option = "C", alpha = 0.8, LDATS = FALSE) + 
+\method{plot}{LDA_set}(x, ..., selected = TRUE) LDA_plot_top_panel(x, cols = NULL, option = "C", alpha = 0.8, together = FALSE, LDATS = FALSE) LDA_plot_bottom_panel(x, xtime = NULL, xname = NULL, cols = NULL, option = "C", alpha = 0.8, together = FALSE, LDATS = FALSE) + +set_LDA_plot_colors(x, cols = NULL, option = "C", alpha = 0.8) } \arguments{ -\item{x}{Object of class \code{LDA_VEM}.} +\item{x}{Object of class \code{LDA}.} \item{...}{Not used, retained for alignment with base function.} @@ -44,32 +50,29 @@ colors used. Supported only on some devices, see \item{LDATS}{\code{logical} indicating if the LDA plot is part of a larger LDATS plot output.} +\item{selected}{\code{logical} indicator of if only the selected LDAs +(the first element in \code{x}) should be plotted or if all the LDAs +(the second element in \code{x}) should be plotted.} + \item{together}{\code{logical} indicating if the subplots are part of a larger LDA plot output.} } \value{ -\code{NULL}. +\code{plot.LDA},\code{plot.LDA_set},\code{LDA_plot_top_panel}, + \code{LDA_plot_bottom_panel}: \code{NULL}. \cr \cr + \code{set_LDA_plot_colors}: \code{vector} of \code{character} hex codes + indicating colors to use. } \description{ -Create an LDATS LDA summary plot, with a top panel showing - the topic proportions for each word and a bottom panel showing the topic - proportions of each document/over time. The plot function is defined for - class \code{LDA_VEM} specifically (see \code{\link[topicmodels]{LDA}}). - \cr \cr +\code{plot.LDA} creates an LDATS LDA summary plot, with a top panel + showing the topic proportions for each word and a bottom panel showing + the topic proportions of each document/over time. \cr \cr + \code{plot.LDA_set} plots either the \code{selected} model results or + all of them from a \code{LDA_set} of \code{LDA} models. \cr \cr \code{LDA_plot_top_panel} creates an LDATS LDA summary plot - top panel showing the topic proportions word-by-word. 
\cr \cr + top panel showing the topic proportions word-by-word. \cr \cr \code{LDA_plot_bottom_panel} creates an LDATS LDA summary plot - bottom panel showing the topic proportions over time/documents. -} -\examples{ -\donttest{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 4, nseeds = 10) - best_lda <- select_LDA(r_LDA)[[1]] - plot(best_lda, option = "cividis") - LDA_plot_top_panel(best_lda, option = "cividis") - LDA_plot_bottom_panel(best_lda, option = "cividis") -} - + bottom panel showing the topic proportions over time/documents. \cr \cr + \code{set_LDA_plot_colors} creates the set of colors to be used in + the LDA plots based on the variety of argument options. } diff --git a/man/plot.LDA_TS.Rd b/man/plot.LDA_TS.Rd index 91dd8822..66f1c809 100644 --- a/man/plot.LDA_TS.Rd +++ b/man/plot.LDA_TS.Rd @@ -2,10 +2,15 @@ % Please edit documentation in R/LDA_TS_plots.R \name{plot.LDA_TS} \alias{plot.LDA_TS} +\alias{set_LDA_TS_plot_colors} \title{Plot the key results from a full LDATS analysis} \usage{ -\method{plot}{LDA_TS}(x, ..., cols = set_LDA_TS_plot_cols(), +\method{plot}{LDA_TS}(x, ..., cols = set_LDA_TS_plot_colors(), bin_width = 1, xname = NULL, border = NA, selection = "median") + +set_LDA_TS_plot_colors(rho_cols = NULL, rho_option = "D", + rho_alpha = 0.4, gamma_cols = NULL, gamma_option = "C", + gamma_alpha = 0.8) } \arguments{ \item{x}{A \code{LDA_TS} object of a full LDATS model fit by @@ -16,7 +21,7 @@ used, just retained for alignment with \code{plot}.} \item{cols}{\code{list} of elements used to define the colors for the two panels of the summary plot, as generated simply using -\code{\link{set_LDA_TS_plot_cols}}. \code{cols} has two elements: +\code{\link{set_LDA_TS_plot_colors}}. \code{cols} has two elements: \code{LDA} and \code{TS}, each corresponding the set of plots for its stage in the full model. \code{LDA} contains entries \code{cols} and \code{option} (see \code{\link{set_LDA_plot_colors}}). 
\code{TS} @@ -40,21 +45,46 @@ no label printed, set \code{xname = ""}.} \item{selection}{Indicator of the change points to use in the time series summary plot. Currently only defined for \code{"median"} and \code{"mode"}.} + +\item{rho_cols, gamma_cols}{Colors to be used in the specific plot. Any +valid color values (\emph{e.g.}, see +\code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be +input as with a standard plot. The default (\code{NULL}) triggers use +of \code{\link[viridis]{viridis}} color options (see +\code{rho_option},\code{gamma_option}).} + +\item{rho_option, gamma_option}{A \code{character} string indicating +the color option from \code{\link[viridis]{viridis}} to use if +"cols == NULL". Four options are available: "magma" (or "A"), +"inferno" (or "B"), "plasma" (or "C"), "viridis" (or "D", the default +option) and "cividis" (or "E").} + +\item{rho_alpha, gamma_alpha}{Numeric value [0,1] that indicates the +transparency of the colors used. Supported only on some devices, see +\code{\link[grDevices]{rgb}}.} } \value{ -\code{NULL}. +\code{plot.LDA_TS}: \code{NULL}. \cr \cr + \code{set_LDA_TS_plot_colors}: \code{list} of elements used to define + the colors for the summary plots, which has two + elements: \code{LDA} and \code{TS}, each corresponding the set of + plots for its stage in the full model. \code{LDA} contains entries + \code{cols} and \code{options} (see \code{\link{set_LDA_plot_colors}}). + \code{TS} contains two entries, \code{rho} and \code{gamma}, each + corresponding to the related panel, and each containing default values + for entries named \code{cols}, \code{option}, and \code{alpha} (see + \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, + and \code{\link{set_rho_hist_colors}}). } \description{ -Generalization of the \code{\link[graphics]{plot}} function to - work on fitted LDA_TS model objects (class \code{LDA_TS}) returned by - \code{\link{LDA_TS}}). 
-} -\examples{ -\donttest{ - data(rodents) - mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, - nchangepoints = 1, timename = "newmoon") - plot(mod, binwidth = 5, xlab = "New moon") -} - +\code{plot.LDA_TS} generalizes the \code{\link[graphics]{plot}} function + to work on fitted LDA_TS model objects (class \code{LDA_TS}) returned + by \code{\link{LDA_TS}}. \cr \cr + \code{set_LDA_TS_plot_colors} produces the options for the colors + controlling the panels of the LDATS summary plots, needed because + the change point histogram panel should be in a different color scheme + than the LDA and fitted time series model panels, which should be + in a matching color scheme. See \code{\link{set_LDA_plot_colors}}, + \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, + and \code{\link{set_rho_hist_colors}} for specific details on usage. } diff --git a/man/plot.LDA_set.Rd b/man/plot.LDA_set.Rd deleted file mode 100644 index a05f5a3a..00000000 --- a/man/plot.LDA_set.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA_plots.R -\name{plot.LDA_set} -\alias{plot.LDA_set} -\title{Plot a set of LDATS LDA models} -\usage{ -\method{plot}{LDA_set}(x, ...) -} -\arguments{ -\item{x}{An \code{LDA_set} object of LDA topic models.} - -\item{...}{Additional arguments to be passed to subfunctions.} -} -\value{ -\code{NULL}. -} -\description{ -Generalization of the \code{\link[graphics]{plot}} function to - work on a list of LDA topic models (class \code{LDA_set}).
-} -\examples{ -\donttest{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) - plot(r_LDA) -} - -} diff --git a/man/plot.TS.Rd b/man/plot.TS.Rd new file mode 100644 index 00000000..390397f5 --- /dev/null +++ b/man/plot.TS.Rd @@ -0,0 +1,199 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS_plots.R +\name{plot.TS} +\alias{plot.TS} +\alias{plot.TS_set} +\alias{TS_diagnostics_plot} +\alias{eta_diagnostics_plots} +\alias{rho_diagnostics_plots} +\alias{trace_plot} +\alias{ecdf_plot} +\alias{posterior_plot} +\alias{autocorr_plot} +\alias{set_TS_summary_plot_cols} +\alias{TS_summary_plot} +\alias{pred_gamma_TS_plot} +\alias{rho_lines} +\alias{rho_hist} +\alias{set_rho_hist_colors} +\alias{set_gamma_colors} +\title{Plot an LDATS Time Series model} +\usage{ +\method{plot}{TS}(x, ..., plot_type = "summary", interactive = FALSE, + cols = set_TS_summary_plot_cols(), bin_width = 1, xname = NULL, + border = NA, selection = "median", LDATS = FALSE) + +\method{plot}{TS_set}(x, ..., selected = TRUE) + +TS_diagnostics_plot(x, interactive = TRUE) + +eta_diagnostics_plots(x, interactive) + +rho_diagnostics_plots(x, interactive) + +trace_plot(draw, ylab = "parameter value") + +ecdf_plot(draw, xlab = "parameter value") + +posterior_plot(draw, xlab = "parameter value") + +autocorr_plot(draw) + +set_TS_summary_plot_cols(rho_cols = NULL, rho_option = "D", + rho_alpha = 0.4, gamma_cols = NULL, gamma_option = "C", + gamma_alpha = 0.8) + +TS_summary_plot(x, cols = set_TS_summary_plot_cols(), bin_width = 1, + xname = NULL, border = NA, selection = "median", LDATS = FALSE) + +pred_gamma_TS_plot(x, selection = "median", cols = set_gamma_colors(x), + xname = NULL, together = FALSE, LDATS = FALSE) + +rho_lines(spec_rhos) + +rho_hist(x, cols = set_rho_hist_colors(x$rhos), bin_width = 1, + xname = NULL, border = NA, together = FALSE, LDATS = FALSE) + +set_rho_hist_colors(x = NULL, cols = NULL, option = 
"D", alpha = 1) + +set_gamma_colors(x, cols = NULL, option = "D", alpha = 1) +} +\arguments{ +\item{x}{In \code{plot.TS}, a \code{TS} object of a multinomial time +series model fit by \code{\link{TS}}. In \code{plot.TS_set}, a +\code{TS_set} \code{list} of \code{TS} objects.} + +\item{...}{Additional arguments to be passed to subfunctions. Not currently +used, just retained for alignment with \code{\link[graphics]{plot}}.} + +\item{plot_type}{"diagnostic" or "summary".} + +\item{interactive}{\code{logical} input, should be \code{TRUE} unless +testing.} + +\item{cols, rho_cols, gamma_cols}{In \code{plot.TS}, \code{cols} is a \code{list} of elements used to + define the colors for the two panels of the summary plot, as generated + simply using \code{\link{set_TS_summary_plot_cols}}. + \code{cols} has two elements \code{rho} and \code{gamma}, each + corresponding to the related panel, and each containing default values + for entries named \code{cols}, \code{option}, and \code{alpha}. \cr +For \code{rho_cols} and \code{gamma_cols} always and for \code{cols} in + \code{set_rho_hist_colors}, \code{set_gamma_colors}, + \code{rho_hist}, and \code{pred_gamma_TS_plot}, colors to be used in + the specific plot. Any valid color values (\emph{e.g.}, see + \code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be + input as with a standard plot. The default (\code{NULL}) triggers use + of \code{\link[viridis]{viridis}} color options (see + \code{option},\code{rho_option},\code{gamma_option}).} + +\item{bin_width}{Width of the bins used in the histograms of the summary +time series plot, in units of the x-axis (the time variable used to fit +the model).} + +\item{xname}{Label for the x-axis in the summary time series plot. Defaults +to \code{NULL}, which results in usage of the \code{timename} element +of the control list (held in \code{control$TS_control$timename}).
To have +no label printed, set \code{xname = ""}.} + +\item{border}{Border for the histogram, default is \code{NA}.} + +\item{selection}{Indicator of the change points to use in the time series +summary plot. Currently only defined for \code{"median"} and +\code{"mode"}.} + +\item{LDATS}{\code{logical} indicating if the plot is part of a larger +LDATS plot output.} + +\item{selected}{\code{logical} indicator of if only the selected TSs +(the first element in \code{x}) should be plotted or if all the TSs +(the second element in \code{x}) should be plotted.} + +\item{draw}{\code{vector} of parameter values drawn from the posterior +distribution, indexed to the iteration by the order of the vector.} + +\item{ylab}{\code{character} value used to label the y axis.} + +\item{xlab}{\code{character} value used to label the x axis.} + +\item{together}{\code{logical} indicating if the subplots are part of a +larger plot output.} + +\item{spec_rhos}{\code{numeric} vector indicating the locations along the +x axis where the specific change points being used are located.} + +\item{option, rho_option, gamma_option}{A \code{character} string indicating +the color option from \code{\link[viridis]{viridis}} to use if +"cols == NULL". Four options are available: "magma" (or "A"), +"inferno" (or "B"), "plasma" (or "C"), "viridis" (or "D", the default +option) and "cividis" (or "E").} + +\item{alpha, rho_alpha, gamma_alpha}{Numeric value [0,1] that indicates the +transparency of the colors used. Supported only on some devices, see +\code{\link[grDevices]{rgb}}.} +} +\value{ +\code{plot.TS},\code{plot.TS_set},\code{TS_diagnostics_plot}, + \code{eta_diagnostics_plots},\code{rho_diagnostics_plots}, + \code{trace_plot},\code{posterior_plot},\code{autocorr_plot}, + \code{ecdf_plot},\code{TS_summary_plot},\code{pred_gamma_TS_plot}, + \code{rho_hist},\code{rho_lines}:\code{NULL}. 
\cr \cr + \code{set_rho_hist_colors},\code{set_gamma_colors}: \code{vector} of + \code{character} hex codes indicating colors to use. \cr \cr + \code{set_TS_summary_plot_cols}: \code{list} of elements used to define + the colors for the two panels. Contains two elements \code{rho} and + \code{gamma}, each corresponding to the related panel, and each + containing default values for entries named \code{cols}, + \code{option}, and \code{alpha}. +} +\description{ +\code{plot.TS} is a generalization of the \code{\link[graphics]{plot}} + function to work on fitted TS model objects (class \code{TS}) + returned from \code{\link{TS}}. \cr \cr + \code{plot.TS_set} plots a \code{TS_set} of \code{TS} models, either + just the \code{selected} models or all. \cr \cr + \code{TS_diagnostics_plot} makes the 4-panel figures (showing trace + plots, posterior ECDF, posterior density, and iteration + autocorrelation) for each of the parameters (change point locations + and regressors) fitted within a compositional time series model (fit + by \code{\link{TS}}). \cr \cr + \code{eta_diagnostics_plots} creates the diagnostic plots + for the regressors (etas) of a time series model. \cr \cr + \code{rho_diagnostics_plots} creates the diagnostic plots + for the change point locations (rho) of a time series model. \cr \cr + \code{trace_plot} produces a trace plot for the parameter of interest + (rho or eta) as part of \code{\link{TS_diagnostics_plot}}. A + horizontal line is added to show the median of the posterior. \cr \cr + \code{ecdf_plot} makes a vanilla ECDF (empirical cumulative distribution + function) plot using \code{\link[stats]{ecdf}} for the parameter of + interest (rho or eta) as part of \code{\link{TS_diagnostics_plot}}. + A horizontal line is added to show the median of the posterior.
\cr \cr + \code{autocorr_plot} produces a vanilla ACF plot using + \code{\link[stats]{acf}} for the parameter of interest (rho or eta) + as part of \code{\link{TS_diagnostics_plot}}.\cr \cr + \code{posterior_plot} makes a vanilla histogram plot using + \code{\link[graphics]{hist}} for the parameter of interest (rho or eta) + as part of \code{\link{TS_diagnostics_plot}}. A vertical line is added + to show the median of the posterior. \cr \cr + \code{TS_summary_plot} produces a two-panel figure of [1] the change + point distributions as histograms over time and [2] the time series of + the fitted topic proportions over time, based on a selected set of + change point locations. \cr \cr + \code{pred_gamma_TS_plot} produces a time series of the + fitted topic proportions over time, based on a selected set of change + point locations. \cr \cr + \code{rho_hist}: make a plot of the change point distributions as + histograms over time. \cr \cr + \code{rho_lines} adds vertical lines to the plot of the time series of + fitted proportions associated with the change points of interest. + \cr \cr + \code{set_gamma_colors} creates the set of colors to be used in + the time series of the fitted gamma (topic proportion) values. \cr \cr + \code{set_rho_hist_colors} creates the set of colors to be used in + the change point histogram. \cr \cr + \code{set_TS_summary_plot_cols} acts as a default \code{list} + generator function that produces the options for the colors + controlling the panels of the TS summary plots, so needed + because the panels should be in different color schemes. See + \code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} + for specific details on usage. 
+} diff --git a/man/plot.TS_fit.Rd b/man/plot.TS_fit.Rd deleted file mode 100644 index c1487796..00000000 --- a/man/plot.TS_fit.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{plot.TS_fit} -\alias{plot.TS_fit} -\title{Plot an LDATS TS model} -\usage{ -\method{plot}{TS_fit}(x, ..., plot_type = "summary", - interactive = FALSE, cols = set_TS_summary_plot_cols(), - bin_width = 1, xname = NULL, border = NA, selection = "median", - LDATS = FALSE) -} -\arguments{ -\item{x}{A \code{TS_fit} object of a multinomial time series model fit by -\code{\link{TS}}.} - -\item{...}{Additional arguments to be passed to subfunctions. Not currently -used, just retained for alignment with \code{\link[graphics]{plot}}.} - -\item{plot_type}{"diagnostic" or "summary".} - -\item{interactive}{\code{logical} input, should be code{TRUE} unless -testing.} - -\item{cols}{\code{list} of elements used to define the colors for the two -panels of the summary plot, as generated simply using -\code{\link{set_TS_summary_plot_cols}}. \code{cols} has two elements -\code{rho} and \code{gamma}, each corresponding to the related panel, -and each containing default values for entries named \code{cols}, -\code{option}, and \code{alpha}. See \code{\link{set_gamma_colors}} and -\code{\link{set_rho_hist_colors}} for details on usage.} - -\item{bin_width}{Width of the bins used in the histograms of the summary -time series plot, in units of the x-axis (the time variable used to fit -the model).} - -\item{xname}{Label for the x-axis in the summary time series plot. Defaults -to \code{NULL}, which results in usage of the \code{timename} element -of the control list (held in\code{control$TS_control$timename}). To have -no label printed, set \code{xname = ""}.} - -\item{border}{Border for the histogram, default is \code{NA}.} - -\item{selection}{Indicator of the change points to use in the time series -summary plot. 
Currently only defined for \code{"median"} and -\code{"mode"}.} - -\item{LDATS}{\code{logical} indicating if the plot is part of a larger -LDATS plot output.} -} -\value{ -\code{NULL}. -} -\description{ -Generalization of the \code{\link[graphics]{plot}} function to - work on fitted TS model objects (class \code{TS_fit}) returned from - \code{\link{TS}}. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - plot(TSmod) -} - -} diff --git a/man/posterior_plot.Rd b/man/posterior_plot.Rd deleted file mode 100644 index 205b5493..00000000 --- a/man/posterior_plot.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{posterior_plot} -\alias{posterior_plot} -\title{Produce the posterior distribution histogram panel for the TS - diagnostic plot of a parameter} -\usage{ -posterior_plot(x, xlab = "parameter value") -} -\arguments{ -\item{x}{Vector of parameter values drawn from the posterior distribution, -indexed to the iteration by the order of the vector.} - -\item{xlab}{\code{character} value used to label the x axis.} -} -\value{ -\code{NULL}. -} -\description{ -Produce a vanilla histogram plot using \code{hist} for the - parameter of interest (rho or eta) as part of - \code{\link{TS_diagnostics_plot}}. A vertical line is added to show the - median of the posterior. 
-} -\examples{ - posterior_plot(rnorm(100, 0, 1)) - -} diff --git a/man/prep_LDA_control.Rd b/man/prep_LDA_control.Rd deleted file mode 100644 index 6fb0080d..00000000 --- a/man/prep_LDA_control.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{prep_LDA_control} -\alias{prep_LDA_control} -\title{Set the control inputs to include the seed} -\usage{ -prep_LDA_control(seed, control = list()) -} -\arguments{ -\item{seed}{\code{integer} used to set the seed of the specific model.} - -\item{control}{Named list of control parameters to be used in -\code{\link[topicmodels]{LDA}} Note that if \code{control} has an -element named \code{seed} it will be overwritten by the \code{seed} -argument of \code{prep_LDA_control}.} -} -\value{ -\code{list} of controls to be used in the LDA. -} -\description{ -Update the control list for the LDA model with the specific - seed as indicated. And remove controls not used within the LDA itself. -} -\examples{ - prep_LDA_control(seed = 1) - -} diff --git a/man/prep_TS_data.Rd b/man/prep_TS_data.Rd deleted file mode 100644 index 86b96e34..00000000 --- a/man/prep_TS_data.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{prep_TS_data} -\alias{prep_TS_data} -\title{Prepare the model-specific data to be used in the TS analysis - of LDA output} -\usage{ -prep_TS_data(document_covariate_table, LDA_models, mods, i = 1) -} -\arguments{ -\item{document_covariate_table}{Document covariate table (rows: documents, -columns: time index and covariate options). Every model needs a -covariate to describe the time value for each document (in whatever -units and whose name in the table is input in \code{timename}) -that dictates the application of the change points. -In addition, all covariates named within specific models in -\code{formula} must be included. 
Must be a conformable to a data table, -as verified by \code{\link{check_document_covariate_table}}.} - -\item{LDA_models}{List of LDA models (class \code{LDA_set}, produced by -\code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -produced by \code{\link[topicmodels]{LDA}}).} - -\item{mods}{The \code{data.table} created by \code{\link{expand_TS}} that -contains each of the models (defined by the LDA model to use and the and -formula number of changepoints for the TS model). Indexed here by -\code{i}.} - -\item{i}{\code{integer} index referencing the row in \code{mods} to use.} -} -\value{ -Class \code{data.frame} object including [1] the time variable - (indicated in \code{control}), [2] the predictor variables (required by - \code{formula}) and [3], the multinomial response variable (indicated - in \code{formula}), ready for input into \code{TS}. -} -\description{ -Append the estimated topic proportions from a fitted LDA model - to the document covariate table to create the data structure needed for - \code{\link{TS}}. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - mods <- expand_TS(LDA_models, formulas = ~1, nchangepoints = 0) - data1 <- prep_TS_data(document_covariate_table, LDA_models, mods) -} - -} diff --git a/man/prep_chunks.Rd b/man/prep_chunks.Rd index 152a6c1e..4772d133 100644 --- a/man/prep_chunks.Rd +++ b/man/prep_chunks.Rd @@ -1,9 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R +% Please edit documentation in R/TS.R \name{prep_chunks} \alias{prep_chunks} -\title{Prepare the time chunk table for a multinomial change point - Time Series model} +\title{Prepare the time chunk table for a change point Time Series model} \usage{ prep_chunks(data, changepoints = NULL, timename = "time") } @@ -28,16 +27,7 @@ timestep being 1 day, which is often not desired behavior.} \description{ Creates the table containing the start and end times for each chunk within a time series, based on the change points (used to break up - the time series) and the range of the time series. If there are no - change points (i.e. \code{changepoints} is \code{NULL}, there is still a - single chunk defined by the start and end of the time series. -} -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - chunks <- prep_chunks(dct, changepoints = 100, timename = "newmoon") - + the time series) and the range of the time series. \cr \cr + If there are no change points (i.e. \code{changepoints = NULL}), there is + still a single chunk defined by the start and end of the time series.
} diff --git a/man/prep_cpts.Rd b/man/prep_cpts.Rd deleted file mode 100644 index 2c46ad05..00000000 --- a/man/prep_cpts.Rd +++ /dev/null @@ -1,98 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_cpts} -\alias{prep_cpts} -\alias{update_cpts} -\title{Initialize and update the change point matrix used in the ptMCMC - algorithm} -\usage{ -prep_cpts(data, formula, nchangepoints, timename, weights, - control = list()) - -update_cpts(cpts, swaps) -} -\arguments{ -\item{data}{\code{data.frame} including [1] the time variable (indicated -in \code{timename}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated in -\code{formula}) as verified by \code{\link{check_timename}} and -\code{\link{check_formula}}. Note that the response variables should be -formatted as a \code{data.frame} object named as indicated by the -\code{response} entry in the \code{control} list, such as \code{gamma} -for a standard TS analysis on LDA output.} - -\item{formula}{\code{formula} defining the regression relationship between -the change points, see \code{\link[stats]{formula}}. Any -predictor variable included must also be a column in -\code{data} and any (multinomial) response variable must be a set of -columns in \code{data}, as verified by \code{\link{check_formula}}.} - -\item{nchangepoints}{\code{integer} corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current -implementation can reasonably include up to 6 change points. The -number of change points is used to dictate the segmentation of the data -for each continuous model and each LDA model.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series. Defaults to \code{"time"}. The variable must be -integer-conformable or a \code{Date}. 
If the variable named -is a \code{Date}, the input is converted to an integer, resulting in the -timestep being 1 day, which is often not desired behavior.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using -\code{\link{document_weights}}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} - -\item{cpts}{The existing matrix of change points.} - -\item{swaps}{Chain configuration after among-temperature swaps.} -} -\value{ -\code{list} of [1] \code{matrix} of change points (rows) for - each temperature (columns) and [2] \code{vector} of log-likelihood - values for each of the chains. -} -\description{ -Each of the chains is initialized by \code{prep_cpts} using a - draw from the available times (i.e. assuming a uniform prior), the best - fit (by likelihood) draw is put in the focal chain with each subsequently - worse fit placed into the subsequently hotter chain. \code{update_cpts} - updates the change points after every iteration in the ptMCMC algorithm. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - ids <- prep_ids(TS_control()) - for(i in 1:TS_control()$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } -} - -} diff --git a/man/prep_ids.Rd b/man/prep_ids.Rd deleted file mode 100644 index 24a05fd0..00000000 --- a/man/prep_ids.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_ids} -\alias{prep_ids} -\alias{update_ids} -\title{Initialize and update the chain ids throughout the ptMCMC algorithm} -\usage{ -prep_ids(control = list()) - -update_ids(ids, swaps) -} -\arguments{ -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} - -\item{ids}{The existing vector of chain ids.} - -\item{swaps}{Chain configuration after among-temperature swaps.} -} -\value{ -The vector of chain ids. -} -\description{ -\code{prep_ids} creates and \code{update_ids} updates - the active vector of identities (ids) for each of the chains in the - ptMCMC algorithm. These ids are used to track trips of the particles - among chains. 
- \cr \cr - These functions were designed to work within \code{\link{TS}} and - specifically \code{\link{est_changepoints}}, but have been generalized - and would work within any general ptMCMC as long as \code{control}, - \code{ids}, and \code{swaps} are formatted properly. -} -\examples{ - prep_ids() -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - ids <- prep_ids(TS_control()) - for(i in 1:TS_control()$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } -} - -} diff --git a/man/prep_pbar.Rd b/man/prep_pbar.Rd index e4d9f2f7..5ddb1d8d 100644 --- a/man/prep_pbar.Rd +++ b/man/prep_pbar.Rd @@ -1,28 +1,24 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R +% Please edit documentation in R/utilities.R \name{prep_pbar} \alias{prep_pbar} \alias{update_pbar} \title{Initialize and tick through the progress bar} \usage{ -prep_pbar(control = list(), bar_type = "rho", nr = NULL) +prep_pbar(control = list(), type = "rho", nr = NULL) update_pbar(pbar, control = list()) } \arguments{ \item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}. 
Of use here is \code{quiet} which is a -a \code{logical} indicator of whether there should be information -(i.e. the progress bar) printed during the run or not. Default is -\code{TRUE}.} +iterative model.} -\item{bar_type}{"rho" (for change point locations) or "eta" (for -regressors).} +\item{type}{\code{character} value of possible types of progress bars. +Currently available options are "rho" (for change point locations) and +"eta" (for time series regressors).} \item{nr}{\code{integer} number of unique realizations, needed when -\code{bar_type} = "eta".} +\code{type} = "eta".} \item{pbar}{The progress bar object returned from \code{prep_pbar}.} } @@ -32,11 +28,5 @@ regressors).} } \description{ \code{prep_pbar} creates and \code{update_pbar} steps - through the progress bars (if desired) in \code{\link{TS}} -} -\examples{ - pb <- prep_pbar(control = list(nit = 2)); pb - pb <- update_pbar(pb); pb - pb <- update_pbar(pb); pb - + through the progress bars (if desired) in, e.g., \code{\link{TS}}. } diff --git a/man/prep_proposal_dist.Rd b/man/prep_proposal_dist.Rd deleted file mode 100644 index 399972fa..00000000 --- a/man/prep_proposal_dist.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_proposal_dist} -\alias{prep_proposal_dist} -\title{Pre-calculate the change point proposal distribution for the ptMCMC - algorithm} -\usage{ -prep_proposal_dist(nchangepoints, control = list()) -} -\arguments{ -\item{nchangepoints}{Integer corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current -implementation can reasonably include up to 6 change points. 
The -number of change points is used to dictate the segmentation of the data -for each continuous model and each LDA model.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}. Currently relevant here is -\code{magnitude}, which controls the magnitude of the step size (is the -average of the geometric distribution).} -} -\value{ -\code{list} of two \code{matrix} elements: [1] the size of the - proposed step for each iteration of each chain and [2] the identity of - the change point location to be shifted by the step for each iteration of - each chain. -} -\description{ -Calculate the proposal distribution in advance of actually - running the ptMCMC algorithm in order to decrease computation time. - The proposal distribution is a joint of three distributions: - [1] a multinomial distribution selecting among the change points within - the chain, [2] a binomial distribution selecting the direction of the - step of the change point (earlier or later in the time series), and - [3] a geometric distribution selecting the magnitude of the step. 
-} -\examples{ - prep_proposal_dist(nchangepoints = 2) - -} diff --git a/man/prep_ptMCMC_inputs.Rd b/man/prep_ptMCMC_inputs.Rd deleted file mode 100644 index 3ffad215..00000000 --- a/man/prep_ptMCMC_inputs.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_ptMCMC_inputs} -\alias{prep_ptMCMC_inputs} -\title{Prepare the inputs for the ptMCMC algorithm estimation of - change points} -\usage{ -prep_ptMCMC_inputs(data, formula, nchangepoints, timename, - weights = NULL, control = list()) -} -\arguments{ -\item{data}{Class \code{data.frame} object including [1] the time variable -(indicated in \code{control}), [2] the predictor variables (required by -\code{formula}) and [3], the multinomial response variable (indicated -in \code{formula}).} - -\item{formula}{\code{formula} describing the continuous change. Any -predictor variable included must also be a column in the -\code{data}. Any (multinomial) response variable must also be a set of -columns in \code{data}.} - -\item{nchangepoints}{Integer corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current -implementation can reasonably include up to 6 change points. The -number of change points is used to dictate the segmentation of the data -for each continuous model and each LDA model.} - -\item{timename}{\code{character} element indicating the time variable -used in the time series. Defaults to \code{"time"}. The variable must be -integer-conformable or a \code{Date}. If the variable named -is a \code{Date}, the input is converted to an integer, resulting in the -timestep being 1 day, which is often not desired behavior.} - -\item{weights}{Optional class \code{numeric} vector of weights for each -document. Defaults to \code{NULL}, translating to an equal weight for -each document. 
When using \code{multinom_TS} in a standard LDATS -analysis, it is advisable to weight the documents by their total size, -as the result of \code{\link[topicmodels]{LDA}} is a matrix of -proportions, which does not account for size differences among documents. -For most models, a scaling of the weights (so that the average is 1) is -most appropriate, and this is accomplished using -\code{\link{document_weights}}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -Class \code{ptMCMC_inputs} \code{list}, containing the static - inputs for use within the ptMCMC algorithm for estimating change points. -} -\description{ -Package the static inputs (controls and data structures) used - by the ptMCMC algorithm in the context of estimating change points. - \cr \cr - This function was designed to work within \code{\link{TS}} and - specifically \code{\link{est_changepoints}}. It is still hardcoded to do - so, but has the capacity to be generalized to work with any estimation - via ptMCMC with additional coding work. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) -} -} diff --git a/man/prep_saves.Rd b/man/prep_saves.Rd deleted file mode 100644 index 176fd441..00000000 --- a/man/prep_saves.Rd +++ /dev/null @@ -1,78 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_saves} -\alias{prep_saves} -\alias{update_saves} -\alias{process_saves} -\title{Prepare and update the data structures to save the ptMCMC output} -\usage{ -prep_saves(nchangepoints, control = list()) - -update_saves(i, saves, steps, swaps) - -process_saves(saves, control = list()) -} -\arguments{ -\item{nchangepoints}{\code{integer} corresponding to the number of -change points to include in the model. 0 is a valid input (corresponding -to no change points, so a singular time series model), and the current -implementation can reasonably include up to 6 change points. The -number of change points is used to dictate the segmentation of the data -for each continuous model and each LDA model.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. 
Values not input assume defaults set by -\code{\link{TS_control}}.} - -\item{i}{\code{integer} iteration index.} - -\item{saves}{The existing list of saved data objects.} - -\item{steps}{Chain configuration after within-temperature steps.} - -\item{swaps}{Chain configuration after among-temperature swaps.} -} -\value{ -\code{list} of ptMCMC objects: change points (\code{$cpts}), - log-likelihoods (\code{$lls}), chain ids (\code{$ids}), step acceptances - (\code{$step_accepts}), and swap acceptances (\code{$swap_accepts}). -} -\description{ -\code{prep_saves} creates the data structure used to save the - output from each iteration of the ptMCMC algorithm, which is added via - \code{update_saves}. Once the ptMCMC is complete, the saved data objects - are then processed (burn-in iterations are dropped and the remaining - iterations are thinned) via \code{process_saves}. - \cr \cr - This set of functions was designed to work within \code{\link{TS}} and - specifically \code{\link{est_changepoints}}. They are still hardcoded to - do so, but have the capacity to be generalized to work with any - estimation via ptMCMC with additional coding work. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - ids <- prep_ids(TS_control()) - for(i in 1:TS_control()$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } - process_saves(saves, TS_control()) -} - -} diff --git a/man/prep_temp_sequence.Rd b/man/prep_temp_sequence.Rd deleted file mode 100644 index 264e13d9..00000000 --- a/man/prep_temp_sequence.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{prep_temp_sequence} -\alias{prep_temp_sequence} -\title{Prepare the ptMCMC temperature sequence} -\usage{ -prep_temp_sequence(control = list()) -} -\arguments{ -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -\code{vector} of temperatures. -} -\description{ -Create the series of temperatures used in the ptMCMC - algorithm. 
- \cr \cr - This function was designed to work within \code{\link{TS}} and - \code{\link{est_changepoints}} specifically, but has been generalized - and would work with any ptMCMC model as long as \code{control} - includes the relevant control parameters (and provided that the - \code{\link{check_control}} function and its use here are generalized). -} -\examples{ - prep_temp_sequence() - -} diff --git a/man/print.LDA_TS.Rd b/man/print.LDA_TS.Rd deleted file mode 100644 index 0a879ec2..00000000 --- a/man/print.LDA_TS.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA_TS.R -\name{print.LDA_TS} -\alias{print.LDA_TS} -\title{Print the selected LDA and TS models of LDA_TS object} -\usage{ -\method{print}{LDA_TS}(x, ...) -} -\arguments{ -\item{x}{Class \code{LDA_TS} object to be printed.} - -\item{...}{Not used, simply included to maintain method compatibility.} -} -\value{ -The selected models in \code{x} as a two-element \code{list} with - the TS component only returning the non-hidden components. -} -\description{ -Convenience function to print only the selected elements of a - \code{LDA_TS}-class object returned by \code{\link{LDA_TS}} -} -\examples{ -\donttest{ - data(rodents) - mod <- LDA_TS(data = rodents, topics = 2, nseeds = 1, formulas = ~1, - nchangepoints = 1, timename = "newmoon") - print(mod) -} - -} diff --git a/man/print.TS.Rd b/man/print.TS.Rd new file mode 100644 index 00000000..890b762b --- /dev/null +++ b/man/print.TS.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS.R +\name{print.TS} +\alias{print.TS} +\title{Print a Time Series model} +\usage{ +\method{print}{TS}(x, ...) +} +\arguments{ +\item{x}{Class \code{TS} object to be printed.} + +\item{...}{Not used, simply included to maintain method compatibility.} +} +\value{ +The non-hidden parts of \code{x} are printed and returned + invisibly as a \code{list}. 
+} +\description{ +Convenience function to print only the most important + components of a \code{TS}-class object fit by + \code{\link{sequential_TS}}. +} diff --git a/man/print.TS_fit.Rd b/man/print.TS_fit.Rd deleted file mode 100644 index 091d1c66..00000000 --- a/man/print.TS_fit.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{print.TS_fit} -\alias{print.TS_fit} -\title{Print a Time Series model fit} -\usage{ -\method{print}{TS_fit}(x, ...) -} -\arguments{ -\item{x}{Class \code{TS_fit} object to be printed.} - -\item{...}{Not used, simply included to maintain method compatibility.} -} -\value{ -The non-hidden parts of \code{x} as a \code{list}. -} -\description{ -Convenience function to print only the most important - components of a \code{TS_fit}-class object fit by - \code{\link{TS}}. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - print(TSmod) -} - -} diff --git a/man/print.TS_on_LDA.Rd b/man/print.TS_on_LDA.Rd deleted file mode 100644 index c30fc5c1..00000000 --- a/man/print.TS_on_LDA.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{print.TS_on_LDA} -\alias{print.TS_on_LDA} -\title{Print a set of Time Series models fit to LDAs} -\usage{ -\method{print}{TS_on_LDA}(x, ...) -} -\arguments{ -\item{x}{Class \code{TS_on_LDA} object to be printed.} - -\item{...}{Not used, simply included to maintain method compatibility.} -} -\value{ -\code{character} \code{vector} of the names of \code{x}'s models. 
-} -\description{ -Convenience function to print only the names of a - \code{TS_on_LDA}-class object generated by \code{\link{TS_on_LDA}}. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints = 0:1, timename = "newmoon", weights) - print(mods) -} - -} diff --git a/man/print_model_run_message.Rd b/man/print_model_run_message.Rd deleted file mode 100644 index 3241361b..00000000 --- a/man/print_model_run_message.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{print_model_run_message} -\alias{print_model_run_message} -\title{Print the message to the console about which combination of the - Time Series and LDA models is being run} -\usage{ -print_model_run_message(models, i, LDA_models, control) -} -\arguments{ -\item{models}{\code{data.frame} object returned from -\code{\link{expand_TS}} that contains the combinations of LDA models, -and formulas and nchangepoints used in the TS models.} - -\item{i}{\code{integer} index of the row to use from \code{models}.} - -\item{LDA_models}{List of LDA models (class \code{LDA_set}, produced by -\code{\link{LDA_set}}) or a singular LDA model (class \code{LDA}, -produced by \code{\link[topicmodels]{LDA}}).} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}. Of particular importance here is -the \code{logical}-class element named \code{quiet}.} -} -\value{ -\code{NULL}. 
-} -\description{ -If desired, print a message at the beginning of every model - combination stating the TS model and the LDA model being evaluated. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - nchangepoints <- 0:1 - mods <- expand_TS(LDA_models, formulas, nchangepoints) - print_model_run_message(mods, 1, LDA_models, TS_control()) -} - -} diff --git a/man/proposed_step_mods.Rd b/man/proposed_step_mods.Rd deleted file mode 100644 index 4fefc1ac..00000000 --- a/man/proposed_step_mods.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{proposed_step_mods} -\alias{proposed_step_mods} -\title{Fit the chunk-level models to a time series, given a set of - proposed change points within the ptMCMC algorithm} -\usage{ -proposed_step_mods(prop_changepts, inputs) -} -\arguments{ -\item{prop_changepts}{\code{matrix} of proposed change points across -chains.} - -\item{inputs}{Class \code{ptMCMC_inputs} list, containing the static inputs -for use within the ptMCMC algorithm.} -} -\value{ -List of models associated with the proposed step, with an element - for each chain. -} -\description{ -This function wraps around \code{TS_memo} - (optionally memoised \code{\link{multinom_TS}}) to provide a - simpler interface within the ptMCMC algorithm and is implemented within - \code{\link{propose_step}}. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - i <- 1 - pdist <- inputs$pdist - ntemps <- length(inputs$temps) - selection <- cbind(pdist$which_steps[i, ], 1:ntemps) - prop_changepts <- cpts$changepts - curr_changepts_s <- cpts$changepts[selection] - prop_changepts_s <- curr_changepts_s + pdist$steps[i, ] - if(all(is.na(prop_changepts_s))){ - prop_changepts_s <- NULL - } - prop_changepts[selection] <- prop_changepts_s - mods <- proposed_step_mods(prop_changepts, inputs) -} - -} diff --git a/man/rho_lines.Rd b/man/rho_lines.Rd deleted file mode 100644 index ea918728..00000000 --- a/man/rho_lines.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{rho_lines} -\alias{rho_lines} -\title{Add change point location lines to the time series plot} -\usage{ -rho_lines(spec_rhos) -} -\arguments{ -\item{spec_rhos}{\code{numeric} vector indicating the locations along the -x axis where the specific change points being used are located.} -} -\description{ -Adds vertical lines to the plot of the time series of fitted - proportions associated with the change points of interest. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - pred_gamma_TS_plot(TSmod) - rho_lines(200) -} - -} diff --git a/man/select_LDA.Rd b/man/select_LDA.Rd deleted file mode 100644 index 9400d7ce..00000000 --- a/man/select_LDA.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA.R -\name{select_LDA} -\alias{select_LDA} -\title{Select the best LDA model(s) for use in time series} -\usage{ -select_LDA(LDA_models = NULL, control = list()) -} -\arguments{ -\item{LDA_models}{An object of class \code{LDA_set} produced by -\code{\link{LDA_set}}.} - -\item{control}{A \code{list} of parameters to control the running and -selecting of LDA models. Values not input assume default values set -by \code{\link{LDA_set_control}}. Values for running the LDAs replace -defaults in (\code{LDAcontol}, see \code{\link[topicmodels]{LDA}} (but if - \code{seed} is given, it will be overwritten; use \code{iseed} instead).} -} -\value{ -A reduced version of \code{LDA_models} that only includes the - selected LDA model(s). The returned object is still an object of - class \code{LDA_set}. -} -\description{ -Select the best model(s) of interest from an - \code{LDA_set} object, based on a set of user-provided functions. The - functions default to choosing the model with the lowest AIC value. 
-} -\examples{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 2) - select_LDA(r_LDA) - -} diff --git a/man/select_TS.Rd b/man/select_TS.Rd deleted file mode 100644 index efd691a4..00000000 --- a/man/select_TS.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_on_LDA.R -\name{select_TS} -\alias{select_TS} -\title{Select the best Time Series model} -\usage{ -select_TS(TS_models, control = list()) -} -\arguments{ -\item{TS_models}{An object of class \code{TS_on_LDA} produced by -\code{\link{TS_on_LDA}}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -A reduced version of \code{TS_models} that only includes the - selected TS model. The returned object is a single TS model object of - class \code{TS_fit}. -} -\description{ -Select the best model of interest from an - \code{TS_on_LDA} object generated by \code{\link{TS_on_LDA}}, based on - a set of user-provided functions. The functions default to choosing the - model with the lowest AIC value. \cr \cr - Presently, the set of functions should result in a singular selected - model. If multiple models are chosen via the selection, only the first - is returned. 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDAs <- LDA_set(document_term_table, topics = 2:3, nseeds = 2) - LDA_models <- select_LDA(LDAs) - weights <- document_weights(document_term_table) - formulas <- c(~ 1, ~ newmoon) - mods <- TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints = 0:1, timename = "newmoon", weights) - select_TS(mods) -} - -} diff --git a/man/sequential_TS.Rd b/man/sequential_TS.Rd new file mode 100644 index 00000000..ad676350 --- /dev/null +++ b/man/sequential_TS.Rd @@ -0,0 +1,200 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS_models.R +\name{sequential_TS} +\alias{sequential_TS} +\alias{package_sequential_TS} +\alias{est_changepoints} +\alias{est_regressors} +\alias{sequential_TS_control} +\alias{summarize_etas} +\alias{measure_eta_vcov} +\alias{summarize_rhos} +\alias{measure_rho_vcov} +\title{Estimate a Time Series model sequentially} +\usage{ +sequential_TS(TS, control = list()) + +package_sequential_TS(TS, rho_dist, eta_dist) + +est_changepoints(TS) + +est_regressors(rho_dist, TS) + +sequential_TS_control(method = ldats_classic, + method_args = ldats_classic_control(), summary_prob = 0.95, + soften = TRUE, quiet = FALSE, ...) + +summarize_etas(etas, TS) + +measure_eta_vcov(etas) + +summarize_rhos(rhos, TS) + +measure_rho_vcov(rhos) +} +\arguments{ +\item{TS}{Time series model \code{list}.} + +\item{control}{A \code{list} of parameters to control the fitting of the +Time Series model. 
Values not input assume defaults set by +\code{\link{sequential_TS_control}}.} + +\item{rho_dist}{\code{list} of saved data objects from the estimation of +change point locations (unless \code{nchangepoints} is 0, then +\code{NULL}) returned from \code{\link{est_changepoints}}.} + +\item{eta_dist}{\code{matrix} of draws (rows) from the marginal posteriors +of the coefficients across the segments (columns), as estimated by +\code{\link{est_regressors}}.} + +\item{method}{\code{function} used to drive the sampler of the TS +models; \code{method} defines and operates the computational procedure. +\cr \cr +Current pre-built options include \code{\link{ldats_classic}}.} + +\item{method_args}{\code{list} of (named) arguments to be used in +\code{method} via \code{\link{do.call}}. +\cr \cr +Could be managed via a \code{_control} function like +\code{\link{ldats_classic_control}}.} + +\item{summary_prob}{Probability used for summarizing the posterior +distributions (via the highest posterior density interval, see +\code{\link[coda]{HPDinterval}}).} + +\item{soften}{\code{logical} indicator of whether the model should error +softly or if errors should trigger a full-stop to the pipeline.} + +\item{quiet}{\code{logical} indicator of whether the model should run +quietly (if \code{FALSE}, a progress bar and notifications are printed).} + +\item{...}{Not passed along to the output, rather included to allow for +automated removal of unneeded controls.} + +\item{etas}{\code{matrix} of regressors (columns) across iterations of the +sampler (rows), as returned from \code{\link{est_regressors}}.} + +\item{rhos}{\code{matrix} of change point locations (columns) across +iterations of the sampler (rows) or \code{NULL} if no change points are +in the model, as returned from \code{\link{est_changepoints}}.} +} +\value{ +\code{sequential_TS} and \code{package_sequential_TS}: + \code{TS}-class list containing the following elements, many of + which are hidden for \code{print}ing, but are 
accessible:
+  \describe{
+    \item{data}{\code{data} input to the function.}
+    \item{formula}{\code{\link[stats]{formula}} input to the function.}
+    \item{nchangepoints}{\code{nchangepoints} input to the function.}
+    \item{weights}{\code{weights} input to the function.}
+    \item{timename}{\code{timename} input to the function.}
+    \item{control}{\code{control} input to the function.}
+    \item{lls}{Iteration-by-iteration
+                \link[=logLik.TS_fit]{logLik} values for the
+                full time series fit by \code{\link{multinom_TS}}.}
+    \item{rhos}{Iteration-by-iteration change point estimates from
+                \code{\link{est_changepoints}} and diagnostics.}
+    \item{focal_rhos}{Simplified object of just the change point
+                locations of interest.}
+    \item{etas}{Iteration-by-iteration marginal regressor estimates from
+                \code{\link{est_regressors}}, which have been
+                unconditioned with respect to change point locations.}
+    \item{rho_summary}{Summary table describing \code{rhos} (the change
+                point locations), see
+                \code{\link{summarize_rhos}}.}
+    \item{rho_vcov}{Variance-covariance matrix for the estimates of
+                \code{rhos} (the change point locations), see
+                \code{\link{measure_rho_vcov}}.}
+    \item{eta_summary}{Summary table describing \code{etas} (the
+                regressors), see
+                \code{\link{summarize_etas}}.}
+    \item{eta_vcov}{Variance-covariance matrix for the estimates of
+                \code{etas} (the regressors), see
+                \code{\link{measure_eta_vcov}}.}
+    \item{logLik}{Across-iteration average of log-likelihoods
+                (\code{lls}).}
+    \item{nparams}{Total number of parameters in the full model,
+                including the change point locations and regressors.}
+  } \cr \cr
+  \code{sequential_TS_control}: \code{list} of named control elements for
+  sequential model fitting. \cr \cr
+  \code{est_changepoints}: \code{list} of saved data objects from the
+  estimation of change point locations, unless \code{nchangepoints}
+  is 0, then \code{NULL}.
\cr \cr
+  \code{est_regressors}: \code{matrix} of draws (rows) from the marginal
+  posteriors of the coefficients across the segments (columns). \cr \cr
+  \code{summarize_etas}: table of summary statistics for chunk-level
+  regressors including mean, median, mode, posterior interval, standard
+  deviation, MCMC error, autocorrelation, and effective sample size for
+  each regressor. \cr \cr
+  \code{measure_eta_vcov}: variance-covariance matrix for chunk-level
+  regressors. \cr \cr
+  \code{summarize_rhos}: table of summary statistics for change point
+  locations including mean, median, mode, posterior interval, standard
+  deviation, MCMC error, autocorrelation, and effective sample size for
+  each change point location. \cr \cr
+  \code{measure_rho_vcov}: variance-covariance matrix for change
+  point locations.
+}
+\description{
+This set of functions estimates the Time Series model
+  by sequential methods that first estimate the change point locations
+  with full flexibility of the regressor models between change points,
+  then estimate the regressors between the change points, initially
+  conditional on their locations, but with marginal estimation to produce
+  regressor values unconditional on change point locations. \cr \cr
+  \code{sequential_TS} combines each stage of the model estimation and
+  packages the model results in a consistent output. \cr \cr
+  \code{sequential_TS_control} defines and creates the control \code{list}
+  used to fit the sequential Time Series model. \cr \cr
+  \code{est_changepoints} estimates the change point location
+  distributions for multinomial Time Series analyses.
\cr \cr
+  \code{est_regressors} uses the marginal posterior distributions of
+  the change point locations (estimated by
+  \code{\link{est_changepoints}}) in combination with the conditional
+  (on the change point locations) posterior distributions of the
+  regressors (estimated by a \code{_TS} function) to
+  estimate the marginal posterior distribution of the regressors,
+  unconditional on the change point locations. \cr \cr
+  \code{package_sequential_TS} calculates relevant summaries for the run of
+  a sequential Time Series model within \code{\link{sequential_TS}} and
+  packages the output as a \code{TS}-class object. \cr \cr
+  \code{summarize_etas} calculates summary statistics for each
+  of the chunk-level regressors. \cr \cr
+  \code{measure_eta_vcov} generates the variance-covariance matrix for
+  the regressors. \cr \cr
+  \code{summarize_rhos} calculates summary statistics for each
+  of the change point locations. \cr \cr
+  \code{measure_rho_vcov} generates the variance-covariance matrix for the
+  change point locations.
+}
+\details{
+The general approach follows that of Western and Kleykamp
+  (2004), although we note some important differences. Our regression
+  models are fit independently for each chunk (segment of time), and
+  therefore the variance-covariance matrix for the full model
+  has \code{0} entries for covariances between regressors in different
+  chunks of the time series. \cr \cr
+  Further differences are model-specific. For example, the original softmax
+  multinomial regression model used here is a standard (non-hierarchical)
+  model (Ripley 1996, Venables and Ripley 2002, Bishop 2006), meaning that
+  there is no error term in the regression (as there is in the normal
+  model used by Western and Kleykamp 2004), and so the posterior
+  distribution used here is a multivariate normal, as opposed to a
+  multivariate t, as used by Western and Kleykamp (2004).
+}
+\references{
+Bishop, C. M. 2006. \emph{Pattern Recognition and Machine Learning}.
+  Springer, New York, NY, USA.
+
+  Ripley, B. D. 1996. \emph{Pattern Recognition and Neural Networks}.
+  Cambridge University Press, Cambridge, UK.
+
+  Venables, W. N. and B. D. Ripley. 2002. \emph{Modern Applied
+  Statistics with S}. Fourth Edition. Springer, New York, NY, USA.
+
+  Western, B. and M. Kleykamp. 2004. A Bayesian change point model for
+  historical time series analysis. \emph{Political Analysis}
+  \strong{12}:354-374.
+  \href{https://doi.org/10.1093/pan/mph023}{link}.
+}
diff --git a/man/set_LDA_TS_plot_cols.Rd b/man/set_LDA_TS_plot_cols.Rd
deleted file mode 100644
index 64b2d9d8..00000000
--- a/man/set_LDA_TS_plot_cols.Rd
+++ /dev/null
@@ -1,69 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/LDA_TS_plots.R
-\name{set_LDA_TS_plot_cols}
-\alias{set_LDA_TS_plot_cols}
-\title{Create the list of colors for the LDATS summary plot}
-\usage{
-set_LDA_TS_plot_cols(rho_cols = NULL, rho_option = "D",
-  rho_alpha = 0.4, gamma_cols = NULL, gamma_option = "C",
-  gamma_alpha = 0.8)
-}
-\arguments{
-\item{rho_cols}{Colors to be used to plot the histograms of change points.
-Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}},
-\code{\link[grDevices]{rgb}}) can be input as with a standard plot.
-The default (\code{rho_cols = NULL}) triggers use of
-\code{\link[viridis]{viridis}} color options (see \code{rho_option}).}
-
-\item{rho_option}{A \code{character} string indicating the color option
-from \code{\link[viridis]{viridis}} to use if `rho_cols == NULL`. Four
-options are available: "magma" (or "A"), "inferno" (or "B"), "plasma"
-(or "C"), "viridis" (or "D", the default option) and "cividis" (or "E").}
-
-\item{rho_alpha}{Numeric value [0,1] that indicates the transparency of the
-colors used.
Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} - -\item{gamma_cols}{Colors to be used to plot the LDA topic proportions, -time series of observed topic proportions, and time series of fitted -topic proportions. Any valid color values (\emph{e.g.}, see -\code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be -input as with a standard plot. The default (\code{gamma_cols = NULL}) -triggers use of \code{\link[viridis]{viridis}} color options (see -\code{gamma_option}).} - -\item{gamma_option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if gamma_cols == NULL`. Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C", the default option), "viridis" (or "D") and "cividis" (or "E").} - -\item{gamma_alpha}{Numeric value [0,1] that indicates the transparency of -the colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} -} -\value{ -\code{list} of elements used to define the colors for the two - panels of the summary plot, as generated simply using - \code{\link{set_LDA_TS_plot_cols}}. \code{cols} has two elements: - \code{LDA} and \code{TS}, each corresponding the set of plots for - its stage in the full model. \code{LDA} contains entries \code{cols} - and \code{options} (see \code{\link{set_LDA_plot_colors}}). \code{TS} - contains two entries, \code{rho} and \code{gamma}, each corresponding - to the related panel, and each containing default values for entries - named \code{cols}, \code{option}, and \code{alpha} (see - \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, - and \code{\link{set_rho_hist_colors}}). 
-} -\description{ -A default list generator function that produces the options - for the colors controlling the panels of the LDATS summary plots, needed - because the change point histogram panel should be in a different color - scheme than the LDA and fitted time series model panels, which should be - in a matching color scheme. See \code{\link{set_LDA_plot_colors}}, - \code{\link{set_TS_summary_plot_cols}}, \code{\link{set_gamma_colors}}, - and \code{\link{set_rho_hist_colors}} for specific details on usage. -} -\examples{ - set_LDA_TS_plot_cols() - -} diff --git a/man/set_LDA_plot_colors.Rd b/man/set_LDA_plot_colors.Rd deleted file mode 100644 index f92cc229..00000000 --- a/man/set_LDA_plot_colors.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LDA_plots.R -\name{set_LDA_plot_colors} -\alias{set_LDA_plot_colors} -\title{Prepare the colors to be used in the LDA plots} -\usage{ -set_LDA_plot_colors(x, cols = NULL, option = "C", alpha = 0.8) -} -\arguments{ -\item{x}{Object of class \code{LDA}.} - -\item{cols}{Colors to be used to plot the topics. -Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -\code{\link[grDevices]{rgb}}) can be input as with a standard plot. -The default (\code{cols = NULL}) triggers use of -\code{\link[viridis]{viridis}} color options (see \code{option}).} - -\item{option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if `cols == NULL`. Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C", the default option), "viridis" (or "D") and "cividis" (or "E").} - -\item{alpha}{Numeric value [0,1] that indicates the transparency of the -colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} -} -\value{ -\code{vector} of \code{character} hex codes indicating colors to - use. 
-} -\description{ -Based on the inputs, create the set of colors to be used in - the LDA plots made by \code{\link{plot.LDA_TS}}. -} -\examples{ -\donttest{ - data(rodents) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 4, nseeds = 10) - set_LDA_plot_colors(r_LDA[[1]]) -} - -} diff --git a/man/set_TS_summary_plot_cols.Rd b/man/set_TS_summary_plot_cols.Rd deleted file mode 100644 index 764bbdcc..00000000 --- a/man/set_TS_summary_plot_cols.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{set_TS_summary_plot_cols} -\alias{set_TS_summary_plot_cols} -\title{Create the list of colors for the TS summary plot} -\usage{ -set_TS_summary_plot_cols(rho_cols = NULL, rho_option = "D", - rho_alpha = 0.4, gamma_cols = NULL, gamma_option = "C", - gamma_alpha = 0.8) -} -\arguments{ -\item{rho_cols}{Colors to be used to plot the histograms of change points. -Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -\code{\link[grDevices]{rgb}}) can be input as with a standard plot. -The default (\code{rho_cols = NULL}) triggers use of -\code{\link[viridis]{viridis}} color options (see \code{rho_option}).} - -\item{rho_option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if `rho_cols == NULL`. Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C"), "viridis" (or "D", the default option) and "cividis" (or "E").} - -\item{rho_alpha}{Numeric value [0,1] that indicates the transparency of the -colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} - -\item{gamma_cols}{Colors to be used to plot the LDA topic proportions, -time series of observed topic proportions, and time series of fitted -topic proportions. Any valid color values (\emph{e.g.}, see -\code{\link[grDevices]{colors}}, \code{\link[grDevices]{rgb}}) can be -input as with a standard plot. 
The default (\code{gamma_cols = NULL}) -triggers use of \code{\link[viridis]{viridis}} color options (see -\code{gamma_option}).} - -\item{gamma_option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if gamma_cols == NULL`. Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C"), "viridis" (or "D", the default option) and "cividis" (or "E").} - -\item{gamma_alpha}{Numeric value [0,1] that indicates the transparency of -the colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} -} -\value{ -\code{list} of elements used to define the colors for the two - panels. Contains two elements \code{rho} and \code{gamma}, each - corresponding to the related panel, and each containing default values - for entries named \code{cols}, \code{option}, and \code{alpha}. -} -\description{ -A default list generator function that produces the options - for the colors controlling the panels of the TS summary plots, so needed - because the panels should be in different color schemes. See - \code{\link{set_gamma_colors}} and \code{\link{set_rho_hist_colors}} for - specific details on usage. -} -\examples{ - set_TS_summary_plot_cols() - -} diff --git a/man/set_gamma_colors.Rd b/man/set_gamma_colors.Rd deleted file mode 100644 index 45c5cdff..00000000 --- a/man/set_gamma_colors.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{set_gamma_colors} -\alias{set_gamma_colors} -\title{Prepare the colors to be used in the gamma time series} -\usage{ -set_gamma_colors(x, cols = NULL, option = "D", alpha = 1) -} -\arguments{ -\item{x}{Object of class \code{TS_fit}, fit by \code{\link{TS}}.} - -\item{cols}{Colors to be used to plot the time series of fitted topic -proportions.} - -\item{option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if "cols == NULL". 
Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C"), "viridis" (or "D", the default option) and "cividis" (or "E").} - -\item{alpha}{Numeric value [0,1] that indicates the transparency of the -colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} -} -\value{ -Vector of \code{character} hex codes indicating colors to use. -} -\description{ -Based on the inputs, create the set of colors to be used in - the time series of the fitted gamma (topic proportion) values. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - set_gamma_colors(TSmod) -} - -} diff --git a/man/set_rho_hist_colors.Rd b/man/set_rho_hist_colors.Rd deleted file mode 100644 index 368d5ff9..00000000 --- a/man/set_rho_hist_colors.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{set_rho_hist_colors} -\alias{set_rho_hist_colors} -\title{Prepare the colors to be used in the change point histogram} -\usage{ -set_rho_hist_colors(x = NULL, cols = NULL, option = "D", alpha = 1) -} -\arguments{ -\item{x}{\code{matrix} of change point locations (element \code{rhos}) -from an object of class \code{TS_fit}, fit by \code{\link{TS}}.} - -\item{cols}{Colors to be used to plot the histograms of change points. -Any valid color values (\emph{e.g.}, see \code{\link[grDevices]{colors}}, -\code{\link[grDevices]{rgb}}) can be input as with a standard plot. 
-The default (\code{rho_cols = NULL}) triggers use of -\code{\link[viridis]{viridis}} color options (see \code{rho_option}).} - -\item{option}{A \code{character} string indicating the color option -from \code{\link[viridis]{viridis}} to use if "cols == NULL". Four -options are available: "magma" (or "A"), "inferno" (or "B"), "plasma" -(or "C"), "viridis" (or "D", the default option) and "cividis" (or "E").} - -\item{alpha}{Numeric value [0,1] that indicates the transparency of the -colors used. Supported only on some devices, see -\code{\link[grDevices]{rgb}}.} -} -\value{ -Vector of \code{character} hex codes indicating colors to use. -} -\description{ -Based on the inputs, create the set of colors to be used in - the change point histogram. -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - TSmod <- TS(data, gamma ~ 1, nchangepoints = 1, "newmoon", weights) - set_rho_hist_colors(TSmod$rhos) -} - -} diff --git a/man/sim_LDA_TS_data.Rd b/man/sim_LDA_TS_data.Rd deleted file mode 100644 index 1e60330a..00000000 --- a/man/sim_LDA_TS_data.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simulate.R -\name{sim_LDA_TS_data} -\alias{sim_LDA_TS_data} -\title{Simulate LDA_TS data from LDA and TS model structures and parameters} -\usage{ -sim_LDA_TS_data(N, Beta, X, Eta, rho, tD, err = 0, seed = NULL) -} -\arguments{ -\item{N}{A vector of document sizes (total word counts). Must be integer -conformable. Is used to infer the total number of documents.} - -\item{Beta}{\code{matrix} of categorical distribution parameters defining -terms within topics. Dimension: k x V (number of topics x number of -terms). 
Used to infer both (k) and (V). Must be non-negative and sum to -1 within topics.} - -\item{X}{\code{matrix} of covariates, dimension M (number of documents) x -C (number of covariates, including the intercept) (a.k.a the design -matrix).} - -\item{Eta}{\code{matrix} of regression parameters across the segments, -dimension: SC (number of segments x number of covariates, including the -intercept) x k (number of topics).} - -\item{rho}{Vector of integer-conformable time locations of changepoints or -\code{NULL} if no changepoints. Used to determine the number of -segments. Must exist within the bounds of the times of the documents, -\code{tD}.} - -\item{tD}{Vector of integer-conformable times of the documents. Must be -of length M (as determined by \code{X}).} - -\item{err}{Additive error on the link-scale. Must be a non-negative -\code{numeric} value. Default value of \code{0} indicates no error.} - -\item{seed}{Input to \code{\link{set.seed}}.} -} -\value{ -A document-by-term \code{matrix} of counts (dim: M x V). -} -\description{ -For a given set of covariates \code{X}; parameters - \code{Beta}, \code{Eta}, \code{rho}, and \code{err}; and - document-specific time stamps \code{tD} and lengths \code{N}), - simulate a document-by-topic matrix. - Additional structuring variables (the numbers of topics (k), terms (V), - documents (M), segments (S), and covariates per segment (C)) - are inferred from input objects. 
-} -\examples{ - N <- c(10, 22, 15, 31) - tD <- c(1, 3, 4, 6) - rho <- 3 - X <- cbind(rep(1, 4), 1:4) - Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) - Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - err <- 1 - sim_LDA_TS_data(N, Beta, X, Eta, rho, tD, err) - -} diff --git a/man/sim_LDA_data.Rd b/man/sim_LDA_data.Rd deleted file mode 100644 index 23133448..00000000 --- a/man/sim_LDA_data.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simulate.R -\name{sim_LDA_data} -\alias{sim_LDA_data} -\title{Simulate LDA data from an LDA structure given parameters} -\usage{ -sim_LDA_data(N, Beta, alpha = NULL, Theta = NULL, seed = NULL) -} -\arguments{ -\item{N}{A vector of document sizes (total word counts). Must be integer -conformable. Is used to infer the total number of documents.} - -\item{Beta}{\code{matrix} of categorical distribution parameters defining -terms within topics. Dimension: k x V (number of topics x number of -terms). Used to infer both (k) and (V). Must be non-negative and sum to -1 within topics.} - -\item{alpha}{Single positive numeric value for the Dirichlet distribution -parameter defining topics within documents. To specifically define -document topic probabilities, use \code{Theta}.} - -\item{Theta}{\code{matrix} of probabilities defining topics within -documents. Dimension: M x k (documents x topics). Must be non-negative -and sum to 1 within documents. To generally define document topic -probabilities, use \code{alpha}.} - -\item{seed}{Input to \code{\link{set.seed}}.} -} -\value{ -A document-by-term \code{matrix} of counts (dim: M x V). -} -\description{ -For a given set of parameters \code{alpha} and \code{Beta} and - document-specific total word counts, simulate a document-by-term matrix. - Additional structuring variables (the numbers of topics (k), - documents (M), terms (V)) are inferred from input objects. 
-} -\examples{ - N <- c(10, 22, 15, 31) - alpha <- 1.2 - Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - sim_LDA_data(N, Beta, alpha = alpha) - Theta <- matrix(c(0.2, 0.8, 0.8, 0.2, 0.5, 0.5, 0.9, 0.1), 4, 2, - byrow = TRUE) - sim_LDA_data(N, Beta, Theta = Theta) - -} diff --git a/man/sim_TS_data.Rd b/man/sim_TS_data.Rd deleted file mode 100644 index f1f8fd45..00000000 --- a/man/sim_TS_data.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simulate.R -\name{sim_TS_data} -\alias{sim_TS_data} -\title{Simulate TS data from a TS model structure given parameters} -\usage{ -sim_TS_data(X, Eta, rho, tD, err = 0, seed = NULL) -} -\arguments{ -\item{X}{\code{matrix} of covariates, dimension M (number of documents) x -C (number of covariates, including the intercept) (a.k.a. the design -matrix).} - -\item{Eta}{\code{matrix} of regression parameters across the segments, -dimension: SC (number of segments x number of covariates, including the -intercept) x k (number of topics).} - -\item{rho}{Vector of integer-conformable time locations of changepoints or -\code{NULL} if no changepoints. Used to determine the number of -segments. Must exist within the bounds of the times of the documents, -\code{tD}.} - -\item{tD}{Vector of integer-conformable times of the documents. Must be -of length M (as determined by \code{X}).} - -\item{err}{Additive error on the link-scale. Must be a non-negative -\code{numeric} value. Default value of \code{0} indicates no error.} - -\item{seed}{Input to \code{\link{set.seed}}.} -} -\value{ -A document-by-topic \code{matrix} of probabilities (dim: M x k). -} -\description{ -For a given set of covariates \code{X}; parameters \code{Eta}, - \code{rho}, and \code{err}; and document-specific time stamps \code{tD}, - simulate a document-by-topic matrix. 
Additional structuring variables - (numbers of topics (k), documents (M), segments (S), and - covariates per segment (C)) are inferred from input objects. -} -\examples{ - tD <- c(1, 3, 4, 6) - rho <- 3 - X <- cbind(rep(1, 4), 1:4) - Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) - sim_TS_data(X, Eta, rho, tD, err = 1) - -} diff --git a/man/simplex_TS.Rd b/man/simplex_TS.Rd new file mode 100644 index 00000000..51278799 --- /dev/null +++ b/man/simplex_TS.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TS_responses.R +\name{simplex_TS} +\alias{simplex_TS} +\alias{simplex_TS_chunk} +\alias{simplex_TS_control} +\title{Fit a simplex-based change point Time Series model} +\usage{ +simplex_TS(data, formula, changepoints = NULL, timename = "time", + weights = NULL, control = list()) + +simplex_TS_chunk(data, formula, chunk, timename = "time", + weights = NULL, control = list()) + +simplex_TS_control(transformation = ilr, quiet = FALSE, ...) +} +\arguments{ +\item{data}{\code{data.frame} including [1] the time variable (indicated +in \code{timename}), [2] the predictor variables (required by +\code{formula}) and [3], the compositional response variable (indicated +in \code{formula}). \cr \cr +Note that the response variables should be formatted as a +\code{data.frame} object named as indicated by the +\code{response} entry in the \code{control} list, such as \code{gamma} +for a standard TS analysis on LDA output. \cr \cr +See \code{Examples}.} + +\item{formula}{\code{\link[stats]{formula}} defining the regression between +relationship the change points. Any +predictor variable included must also be a column in +\code{data} and any (compositional) response variable must be a set of +columns in \code{data}.} + +\item{changepoints}{Numeric vector indicating locations of the change +points. 
Must be conformable to \code{integer} values.} + +\item{timename}{\code{character} element indicating the time variable +used in the time series. Defaults to \code{"time"}. The variable must be +integer-conformable or a \code{Date}. If the variable named +is a \code{Date}, the input is converted to an integer, resulting in the +timestep being 1 day, which is often not desired behavior.} + +\item{weights}{Optional class \code{numeric} vector of weights for each +document. Defaults to \code{NULL}, translating to an equal weight for +each document. When using \code{simplex_TS} in a standard LDATS +analysis, it is advisable to weight the documents by their total size, +as the result of \code{\link{topicmodels_LDA}} is a matrix of +proportions, which does not account for size differences among documents. +For most models, a scaling of the weights (so that the average is 1) is +most appropriate, and this is accomplished using +\code{\link{document_weights}}.} + +\item{control}{A \code{list} of parameters to control the fitting of the +Time Series model. Values not input assume defaults set by +\code{\link{TS_control}}.} + +\item{chunk}{Length-2 vector of times: [1] \code{start}, the start time +for the chunk and [2] \code{end}, the end time for the chunk.} + +\item{transformation}{Ratio \code{function} to use for the transformation +to the simplex geometry. 
Options include \code{\link[compositions]{alr}},
+\code{\link[compositions]{clr}}, and \code{\link[compositions]{ilr}}.}

+\item{quiet}{\code{logical} indicator of whether the model should run
+quietly (if \code{FALSE}, a progress bar and notifications are printed).}

+\item{...}{Not passed along to the output, rather included to allow for
+automated removal of unneeded controls.}
+}
+\value{
+\code{simplex_TS}: \code{TS_fit} \code{list} of [1] chunk-level model
+  fits (\code{"chunk models"}), [2] the total log likelihood across
+  all chunks (\code{"logLik"}), and [3] a \code{data.frame} of chunk
+  beginning and ending times (with columns \code{"start"} and
+  \code{"end"}). \cr \cr
+  \code{simplex_TS_chunk}: fitted model object for the chunk,
+  of class \code{lm}. \cr \cr
+  \code{simplex_TS_control}: \code{list}, with named elements
+  corresponding to response function controls.
+}
+\description{
+\code{simplex_TS} fits a set of simplex regression models
+  (Aitchison 1986, Aitchison \emph{et al.} 2002) to a time
+  series of compositional data divided into multiple segments (a.k.a.
+  chunks) based on given locations for a set of change points, using
+  e.g., the isometric log ratio (ILR) transformation
+  (Egozcue \emph{et al.} 2003, Pawlowsky-Glahn 2003). \cr \cr
+  \code{simplex_TS_chunk} fits a simplex regression model using, e.g., the
+  ILR transformation to a defined chunk of time (a.k.a. segment)
+  \code{[chunk$start, chunk$end]} within a time series. \cr \cr
+  \code{simplex_TS_control} defines and creates the control \code{list} for
+  fitting.
+}
+\references{
+Aitchison, J. 1986. \emph{The Statistical Analysis of Compositional
+  Data}. Monographs on Statistics and Applied Probability. Chapman & Hall
+  Ltd., London, UK.
+
+  Aitchison, J, C. Barcelo-Vidal, J.J. Egozcue, and V. Pawlowsky-Glahn.
+  2002. 
A concise guide to the algebraic geometric structure of the
+  simplex, the sample space for compositional data analysis, Terra Nostra,
+  Schriften der Alfred Wegener-Stiftung, 03/2003.
+
+  Egozcue J.J., V. Pawlowsky-Glahn, G. Mateu-Figueras and C. Barcelo-Vidal.
+  2003. Isometric logratio transformations for compositional data analysis.
+  \emph{Mathematical Geology}, \strong{35}:279-300.
+
+  Pawlowsky-Glahn, V. 2003. Statistical modelling on coordinates. In:
+  Thio-Henestrosa, S. and J. A. Martin-Fernandez, Eds.
+  \emph{Proceedings of the 1st International Workshop on Compositional Data
+  Analysis}, Universitat de Girona, ISBN 84-8458-111-X.
+  \href{http://ima.udg.es/Activitats/CoDaWork03}{link}.
+} diff --git a/man/simulate_data.Rd b/man/simulate_data.Rd new file mode 100644 index 00000000..191b25d0 --- /dev/null +++ b/man/simulate_data.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate.R
+\name{simulate_data}
+\alias{simulate_data}
+\alias{simulate_LDA_data}
+\alias{simulate_TS_data}
+\alias{simulate_LDA_TS_data}
+\title{Simulate LDA_TS data from LDA and TS model structures and parameters}
+\usage{
+simulate_LDA_data(N, Beta, alpha = NULL, Theta = NULL, seed = NULL)
+
+simulate_TS_data(X, Eta, rho, tD, err = 0, seed = NULL,
+  invlink = softmax)
+
+simulate_LDA_TS_data(N, Beta, X, Eta, rho, tD, err = 0, seed = NULL,
+  invlink = softmax)
+}
+\arguments{
+\item{N}{A vector of document sizes (total word counts). Must be integer
+conformable. Is used to infer the total number of documents.}
+
+\item{Beta}{\code{matrix} of categorical distribution parameters defining
+terms within topics. Dimension: k x V (number of topics x number of
+terms). Used to infer both (k) and (V). Must be non-negative and sum to
+1 within topics.}
+
+\item{alpha}{Single positive numeric value for the Dirichlet distribution
+parameter defining topics within documents. 
To specifically define +document topic probabilities, use \code{Theta}.} + +\item{Theta}{\code{matrix} of probabilities defining topics within +documents. Dimension: M x k (documents x topics). Must be non-negative +and sum to 1 within documents. To generally define document topic +probabilities, use \code{alpha}.} + +\item{seed}{Input to \code{\link{set.seed}}.} + +\item{X}{\code{matrix} of covariates, dimension M (number of documents) x +C (number of covariates, including the intercept) (a.k.a the design +matrix).} + +\item{Eta}{\code{matrix} of regression parameters across the segments, +dimension: SC (number of segments x number of covariates, including the +intercept) x k (number of topics).} + +\item{rho}{Vector of integer-conformable time locations of changepoints or +\code{NULL} if no changepoints. Used to determine the number of +segments. Must exist within the bounds of the times of the documents, +\code{tD}.} + +\item{tD}{Vector of integer-conformable times of the documents. Must be +of length M (as determined by \code{X}).} + +\item{err}{Additive error on the link-scale. Must be a non-negative +\code{numeric} value. Default value of \code{0} indicates no error.} + +\item{invlink}{\code{function} name for the inverse link function. +Currently available are \code{\link{softmax}} and the inverses of +the ILR, ALR, and CLR transforms (\code{\link[compositions]{ilrInv}}, +\code{\link[compositions]{alrInv}}, and +\code{\link[compositions]{clrInv}}).} +} +\value{ +\code{simulate_LDA}: A document-by-term \code{matrix} of counts + (dim: M x V). \cr \cr + \code{simulate_TS}: document-by-topic \code{matrix} of + proportions (dim: M x k). \cr \cr + \code{simulate_LDA_TS}: A document-by-term \code{matrix} of counts + (dim: M x V). +} +\description{ +\code{simulate_LDA_data} for a given set of parameters \code{alpha} and + \code{Beta} and document-specific total word counts, simulate a + document-by-term matrix. 
\cr + Additional structuring variables (the numbers of topics (k), + documents (M), terms (V)) are inferred from input objects. \cr \cr + \code{simulate_TS_data} for a given set of covariates \code{X}; + parameters \code{Eta}, \code{rho}, and \code{err}; and + document-specific time stamps \code{tD}, simulate a document-by-topic + matrix. Additional structuring variables (numbers of topics (k), + documents (M), segments (S), and covariates per segment (C)) are + inferred from input objects. + \code{simulate_LDA_TS_data} for a given set of covariates \code{X}; + parameters \code{Beta}, \code{Eta}, \code{rho}, and \code{err}; and + document-specific time stamps \code{tD} and lengths \code{N}), + simulate a document-by-topic matrix. \cr + Additional structuring variables (the numbers of topics (k), terms (V), + documents (M), segments (S), and covariates per segment (C)) + are inferred from input objects. +} diff --git a/man/soft_call.Rd b/man/soft_call.Rd new file mode 100644 index 00000000..882da884 --- /dev/null +++ b/man/soft_call.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{soft_call} +\alias{soft_call} +\title{Soften a flexible function call to errors} +\usage{ +soft_call(what = function(x) { invisible(NULL) }, + args = list(NULL), quote = FALSE, envir = parent.frame(), + soften = FALSE) +} +\arguments{ +\item{what}{\code{function} or a non-empty \code{character} string +naming the \code{function} to be called. \cr +See \code{\link{do.call}}.} + +\item{args}{\code{list} of arguments to the \code{what} call. The names +attribute of the \code{list} gives the argument names.\cr +See \code{\link{do.call}}.} + +\item{quote}{\code{logical} value indicating whether to quote the +arguments. \cr +See \code{\link{do.call}}.} + +\item{envir}{\code{environment} within which to evaluate the call. 
This +argument will be most useful if \code{what} is a \code{character} +string and the arguments are symbols or \code{quote}d expressions.\cr +See \code{\link{do.call}}.} + +\item{soften}{\code{logical} value indicating whether to soften any errors +in the running of \code{what}. \cr} +} +\value{ +Either the result of the (evaluated) call of \code{what} or a + \code{list} of the error message that resulted (if \code{soften = TRUE}). +} +\description{ +Wrapping around \code{\link{do.call}}, this function allows + for a simple "softening" where errors are returned as text, rather + than causing a break in the encompassing pipeline. +} +\details{ +As this is a basic wrapper on \code{\link{do.call}}, the basic + rules of its usage still apply: \cr + If \code{quote = FALSE} (the default), then the arguments are evaluated + in the calling environment and not in \code{envir}. \cr + If \code{quote = TRUE}, each argument is \code{\link{quote}}d + so that the effect of argument evaluation is to remove the quotes + leaving the original arguments unevaluated when the call is built. \cr + The behavior of some functions, such as \code{\link{substitute}}, will + not be the same for functions evaluated using \code{\link{do.call}} + and thus \code{soft_call} as if they were evaluated from the + interpreter. The precise semantics are currently undefined and subject + to change. 
+} diff --git a/man/step_chains.Rd b/man/step_chains.Rd deleted file mode 100644 index 2be8099a..00000000 --- a/man/step_chains.Rd +++ /dev/null @@ -1,107 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{step_chains} -\alias{step_chains} -\alias{propose_step} -\alias{eval_step} -\alias{take_step} -\title{Conduct a within-chain step of the ptMCMC algorithm} -\usage{ -step_chains(i, cpts, inputs) - -propose_step(i, cpts, inputs) - -eval_step(i, cpts, prop_step, inputs) - -take_step(cpts, prop_step, accept_step) -} -\arguments{ -\item{i}{\code{integer} iteration index.} - -\item{cpts}{\code{matrix} of change point locations across chains.} - -\item{inputs}{Class \code{ptMCMC_inputs} \code{list}, containing the -static inputs for use within the ptMCMC algorithm.} - -\item{prop_step}{Proposed step output from \code{propose_step}.} - -\item{accept_step}{\code{logical} indicator of acceptance of each chain's -proposed step.} -} -\value{ -\code{step_chains}: \code{list} of change points, log-likelihoods, - and logical indicators of acceptance for each chain. \cr \cr - \code{propose_step}: \code{list} of change points and - log-likelihood values for the proposal. \cr \cr - \code{eval_step}: \code{logical} vector indicating if each - chain's proposal was accepted. \cr \cr - \code{take_step}: \code{list} of change points, log-likelihoods, - and logical indicators of acceptance for each chain. -} -\description{ -This set of functions steps the chains forward one iteration - of the within-chain component of the ptMCMC algorithm. \code{step_chains} - is the main function, comprised of a proposal (made by \code{prop_step}), - an evaluation of that proposal (made by \code{eval_step}), and then an - update of the configuration (made by \code{take_step}). - \cr \cr - This set of functions was designed to work within \code{\link{TS}} and - specifically \code{\link{est_changepoints}}. 
They are still hardcoded to - do so, but have the capacity to be generalized to work with any - estimation via ptMCMC with additional coding work. -} -\details{ -For each iteration of the ptMCMC algorithm, all of the chains - have the potential to take a step. The possible step is proposed under - a proposal distribution (here for change points we use a symmetric - geometric distribution), the proposed step is then evaluated and either - accepted or not (following the Metropolis-Hastings rule; Metropolis, - \emph{et al.} 1953, Hasting 1960, Gupta \emph{et al.} 2018), and then - accordingly taken or not (the configurations are updated). -} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - ids <- prep_ids(TS_control()) - for(i in 1:TS_control()$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } - # within step_chains() - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - i <- 1 - prop_step <- propose_step(i, cpts, inputs) - accept_step <- eval_step(i, cpts, prop_step, inputs) - take_step(cpts, prop_step, accept_step) -} - -} -\references{ -Gupta, S., L. Hainsworth, J. S. Hogg, R. E. C. Lee, and J. R. Faeder. - 2018. Evaluation of parallel tempering to accelerate Bayesian parameter - estimation in systems biology. - \href{https://arxiv.org/abs/1801.09831}{link}. 
- - Hastings, W. K. 1970. Monte Carlo sampling methods using Markov Chains - and their applications. \emph{Biometrika} \strong{57}:97-109. - \href{https://doi.org/10.2307/2334940}{link}. - - Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E. - Teller. 1953. Equations of state calculations by fast computing machines. - \emph{Journal of Chemical Physics} \strong{21}: 1087-1092. - \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. -} diff --git a/man/summarize_etas.Rd b/man/summarize_etas.Rd deleted file mode 100644 index 9e04bf09..00000000 --- a/man/summarize_etas.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{summarize_etas} -\alias{summarize_etas} -\alias{measure_eta_vcov} -\title{Summarize the regressor (eta) distributions} -\usage{ -summarize_etas(etas, control = list()) - -measure_eta_vcov(etas) -} -\arguments{ -\item{etas}{Matrix of regressors (columns) across iterations of the -ptMCMC (rows), as returned from \code{\link{est_regressors}}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -\code{summarize_etas}: table of summary statistics for chunk-level - regressors including mean, median, mode, posterior interval, standard - deviation, MCMC error, autocorrelation, and effective sample size for - each regressor. \cr \cr - \code{measure_eta_vcov}: variance-covariance matrix for chunk-level - regressors. -} -\description{ -\code{summarize_etas} calculates summary statistics for each - of the chunk-level regressors. - \cr \cr - \code{measure_ets_vcov} generates the variance-covariance matrix for - the regressors. 
-} -\examples{ - etas <- matrix(rnorm(100), 50, 2) - summarize_etas(etas) - measure_eta_vcov(etas) - -} diff --git a/man/summarize_rhos.Rd b/man/summarize_rhos.Rd deleted file mode 100644 index 0ac4d1c9..00000000 --- a/man/summarize_rhos.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS.R -\name{summarize_rhos} -\alias{summarize_rhos} -\alias{measure_rho_vcov} -\title{Summarize the rho distributions} -\usage{ -summarize_rhos(rhos, control = list()) - -measure_rho_vcov(rhos) -} -\arguments{ -\item{rhos}{Matrix of change point locations (columns) across iterations of -the ptMCMC (rows) or \code{NULL} if no change points are in the model, -as returned from \code{\link{est_changepoints}}.} - -\item{control}{A \code{list} of parameters to control the fitting of the -Time Series model including the parallel tempering Markov Chain -Monte Carlo (ptMCMC) controls. Values not input assume defaults set by -\code{\link{TS_control}}.} -} -\value{ -\code{summarize_rhos}: table of summary statistics for change point - locations including mean, median, mode, posterior interval, standard - deviation, MCMC error, autocorrelation, and effective sample size for - each change point location. \cr \cr - \code{measure_rho_vcov}: variance-covariance matrix for change - point locations. -} -\description{ -\code{summarize_rho} calculates summary statistics for each - of the change point locations. - \cr \cr - \code{measure_rho_vcov} generates the variance-covariance matrix for the - change point locations. 
-} -\examples{ - rhos <- matrix(sample(80:100, 100, TRUE), 50, 2) - summarize_rhos(rhos) - measure_rho_vcov(rhos) - -} diff --git a/man/swap_chains.Rd b/man/swap_chains.Rd deleted file mode 100644 index 6bb0263b..00000000 --- a/man/swap_chains.Rd +++ /dev/null @@ -1,83 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ptMCMC.R -\name{swap_chains} -\alias{swap_chains} -\title{Conduct a set of among-chain swaps for the ptMCMC algorithm} -\usage{ -swap_chains(chainsin, inputs, ids) -} -\arguments{ -\item{chainsin}{Chain configuration to be evaluated for swapping.} - -\item{inputs}{Class \code{ptMCMC_inputs} list, containing the static inputs -for use within the ptMCMC algorithm.} - -\item{ids}{The vector of integer chain ids.} -} -\value{ -\code{list} of updated change points, log-likelihoods, and chain - ids, as well as a vector of acceptance indicators for each swap. -} -\description{ -This function handles the among-chain swapping based on - temperatures and likelihood differentials. - \cr \cr - This function was designed to work within \code{\link{TS}} and - specifically \code{\link{est_changepoints}}. It is still hardcoded to do - so, but has the capacity to be generalized to work with any estimation - via ptMCMC with additional coding work. -} -\details{ -The ptMCMC algorithm couples the chains (which are - taking their own walks on the distribution surface) through "swaps", - where neighboring chains exchange configurations (Geyer 1991, Falcioni - and Deem 1999) following the Metropolis criterion (Metropolis - \emph{et al.} 1953). This allows them to share information and search the - surface in combination (Earl and Deem 2005). 
-} -\examples{ -\donttest{ - data(rodents) - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - LDA_models <- LDA_set(document_term_table, topics = 2)[[1]] - data <- document_covariate_table - data$gamma <- LDA_models@gamma - weights <- document_weights(document_term_table) - data <- data[order(data[,"newmoon"]), ] - saves <- prep_saves(1, TS_control()) - inputs <- prep_ptMCMC_inputs(data, gamma ~ 1, 1, "newmoon", weights, - TS_control()) - cpts <- prep_cpts(data, gamma ~ 1, 1, "newmoon", weights, TS_control()) - ids <- prep_ids(TS_control()) - for(i in 1:TS_control()$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } -} - -} -\references{ -Earl, D. J. and M. W. Deem. 2005. Parallel tempering: theory, - applications, and new perspectives. \emph{Physical Chemistry Chemical - Physics} \strong{7}: 3910-3916. - \href{https://rsc.li/2XkxPCm}{link}. - - Falcioni, M. and M. W. Deem. 1999. A biased Monte Carlo scheme for - zeolite structure solution. \emph{Journal of Chemical Physics} - \strong{110}: 1754-1766. - \href{https://aip.scitation.org/doi/10.1063/1.477812}{link}. - - Geyer, C. J. 1991. Markov Chain Monte Carlo maximum likelihood. \emph{In - Computing Science and Statistics: Proceedings of the 23rd Symposium on - the Interface}. pp 156-163. American Statistical Association, New York, - USA. \href{https://www.stat.umn.edu/geyer/f05/8931/c.pdf}{link}. - - Metropolis, N., A. W. Rosenbluth, M. N. Rosenbluth, A. H. Teller, and E. - Teller. 1953. Equations of state calculations by fast computing machines. - \emph{Journal of Chemical Physics} \strong{21}: 1087-1092. - \href{https://bayes.wustl.edu/Manual/EquationOfState.pdf}{link}. 
-} diff --git a/man/topicmodels_LDA.Rd b/man/topicmodels_LDA.Rd new file mode 100644 index 00000000..36018a64 --- /dev/null +++ b/man/topicmodels_LDA.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LDA_models.R +\name{topicmodels_LDA} +\alias{topicmodels_LDA} +\title{Latent Dirichlet Allocation Linguistic Decomposition Analysis + as conducted via the topicmodels package} +\usage{ +topicmodels_LDA(LDA, method = "VEM", seeded = TRUE, ...) +} +\arguments{ +\item{LDA}{A prepared (via \code{\link{prepare_LDA}} LDA model +\code{list}.} + +\item{method}{Fitting routine used in \code{\link[topicmodels]{LDA}}. +Currenlty, only \code{"VEM"} and \code{"Gibbs"} are supported.} + +\item{seeded}{\code{logical} indicator of if the LDA should be a seeded +replicate.} + +\item{...}{Additional arguments to be passed to +\code{\link[topicmodels]{LDA}} as a \code{control} input.} +} +\value{ +\code{LDA} \code{list} with components + \describe{ + \item{alpha}{parameter estimate.} + \item{beta}{parameter estimate.} + \item{terms}{\code{character} \code{vector} of term names.} + \item{document_topic_matrix}{estimated latent topic compositions.} + \item{test_document_topic_matrox}{estimated latent topic compositions + of the test data (not presently available for usage).} + \item{log_likelihood}{model log likelihood.} + \item{data}{data object used to fit the LDA model.} + \item{data_subset}{number of the data subset from the whole data set.} + \item{topics}{\code{integer} number of topics in the model.} + \item{replicat}{\code{integer} replicate number.} + \item{control}{\code{list} of controls used to fit the model. See + \code{\link{LDA_control}}.} + } +} +\description{ +Fit the standard LDATS LDA model (a true Latent Dirichlet + Allocation) using \code{\link[topicmodels]{LDA}} (Grun and Hornik 2011). 
+ Default methodology is the Variational Expectation Maximization routine + (VEM) as described by Blei \emph{et al.} (2003) and implemented by + Grun and Hornik (2011). \cr \cr + If the model is defined to only fit one topic, \code{\link{identity_LDA}} + is used by default. +} +\references{ +Blei, D. M., A. Y. Ng, and M. I. Jordan. 2003. Latent Dirichlet + Allocation. \emph{Journal of Machine Learning Research} + \strong{3}:993-1022. + \href{http://jmlr.csail.mit.edu/papers/v3/blei03a.html}{link}. + + Grun B. and K. Hornik. 2011. topicmodels: An R Package for Fitting Topic + Models. \emph{Journal of Statistical Software} \strong{40}:13. + \href{https://www.jstatsoft.org/article/view/v040i13}{link}. +} diff --git a/man/trace_plot.Rd b/man/trace_plot.Rd deleted file mode 100644 index 84c7d2e0..00000000 --- a/man/trace_plot.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TS_plots.R -\name{trace_plot} -\alias{trace_plot} -\title{Produce the trace plot panel for the TS diagnostic plot of a - parameter} -\usage{ -trace_plot(x, ylab = "parameter value") -} -\arguments{ -\item{x}{Vector of parameter values drawn from the posterior distribution, -indexed to the iteration by the order of the vector.} - -\item{ylab}{\code{character} value used to label the y axis.} -} -\value{ -\code{NULL}. -} -\description{ -Produce a trace plot for the parameter of interest (rho or - eta) as part of \code{\link{TS_diagnostics_plot}}. A horizontal line - is added to show the median of the posterior. -} -\examples{ - trace_plot(rnorm(100, 0, 1)) - -} diff --git a/man/update_list.Rd b/man/update_list.Rd new file mode 100644 index 00000000..83c2a2e1 --- /dev/null +++ b/man/update_list.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{update_list} +\alias{update_list} +\title{Update a list's elements} +\usage{ +update_list(orig_list = list(), ...) 
+} +\arguments{ +\item{orig_list}{\code{list} to be updated with \code{...}.} + +\item{...}{Named elements to update in \code{orig_list}} +} +\value{ +Updated \code{list}. +} +\description{ +Update a list with new values for elements +} +\examples{ + orig_list <- list(a = 1, b = 3, c = 4) + update_list(orig_list) + update_list(orig_list, a = "a") + update_list(orig_list, a = 10, b = NULL) + +} diff --git a/man/verify_changepoint_locations.Rd b/man/verify_changepoint_locations.Rd index 144128a8..a1fa6042 100644 --- a/man/verify_changepoint_locations.Rd +++ b/man/verify_changepoint_locations.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multinom_TS.R +% Please edit documentation in R/TS.R \name{verify_changepoint_locations} \alias{verify_changepoint_locations} -\title{Verify the change points of a multinomial time series model} +\title{Verify the change points of a time series model} \usage{ verify_changepoint_locations(data, changepoints = NULL, timename = "time") @@ -28,13 +28,3 @@ Logical indicator of the check passing \code{TRUE} or failing Verify that a time series can be broken into a set of chunks based on input change points. 
} -\examples{ - data(rodents) - dtt <- rodents$document_term_table - lda <- LDA_set(dtt, 2, 1, list(quiet = TRUE)) - dct <- rodents$document_covariate_table - dct$gamma <- lda[[1]]@gamma - verify_changepoint_locations(dct, changepoints = 100, - timename = "newmoon") - -} diff --git a/notes.R b/notes.R new file mode 100644 index 00000000..f29a5cc4 --- /dev/null +++ b/notes.R @@ -0,0 +1,66 @@ +to-do + +predict +tidying check functions and general usage +examples +tests +softcall message handling +tempering +vignettes + data + simplex response +allow TS to alternatively have a data input and then run through identityLDA +turn the vcov code in est_regressors into functions/methpds + +something weird with ren's data having a -Inf lm logLik +ugh looks like the vcv can sometimes be singular +but really a broader issue is that most of the cp locations give -Infs + which mean it's not a good fitting model, so how to address? + at least make sure all initial cp locations are not -Inf, otherwise breaks! 
+hmmm yeah, think on this---how best to set up a way to handle models that +are just not going to fit +also we should probably allow the inclusion of starting values for params, +esp change points here....that's the issue + +have a stop-gap patch that looks like it's working, but finding the root +cause of that issue will be helpful in the long run + +devtools::load_all() + data(rodents) +LDAs <- LDA(data = rodents, topics = 2, replicates = c(1)) + +rm(list=ls()) +LDAs <- LDA(data = rodents, topics = 4, replicates = c(1)) + +TSs <- TS(LDAs = LDAs, formulas = ~ 1, nchangepoints = 1, + timename = "newmoon", weights = TRUE, + control = list(response = simplex_TS, + response_args = list(control = + list(transformation = "ilr")), + method_args = list(control = list(nit = 100)))) + + + +TSs <- TS(LDAs = LDAs, formulas = ~ 1, nchangepoints = 0, + timename = "newmoon", #weights = TRUE, + control = list(response = simplex_TS, + response_args = list(control = + list(transformation = "clr")), + method_args = list(control = list(nit = 100)))) + +TSs <- TS(LDAs = LDAs, formulas = ~ newmoon, nchangepoints = 1, + timename = "newmoon", weights = TRUE, + control = list(method_args = list(control = list(nit = 100)))) + +plot(LDAs) +plot(TSs) +plot(TSs, plot_type="diagnostic") +plot(LDATSs) + + +not sure if needed: + +time_order_data <- function(x, timename = "time"){ + time_order <- order(x[ , timename]) + x[time_order , ] +} \ No newline at end of file diff --git a/tempering.R b/tempering.R new file mode 100644 index 00000000..84bef4ff --- /dev/null +++ b/tempering.R @@ -0,0 +1,114 @@ +a work in progress toy example of tempering + +i think the pseudo priors should be inverse temps? 
+basically how theyre set now +messing around, have the basic idea, i think +verify the math for the temps and such tho +library(mcmc) + +set.seed(1) +xx <- c(rnorm(2e5, 0, 0.5), rnorm(1e5, 4, 0.7)) +plot(density(xx)) +plot(density(xx)$x, log(density(xx)$y), type = "l") + +prixx <- c(rnorm(2e4, 0, 1), rnorm(1e4, 5, 2.5)) +plot(density(prixx)) +plot(density(prixx)$x, log(density(prixx)$y), type = "l") + + +temp_fun <- function(chain){ + c(1, 2, 5)[chain] +} + +lik_fun <- function(params){ + out <- approxfun(density(xx))(params) + out[is.na(out)] <- 1e-100 + out +} + +prior_fun <- function(params){ + out <- approxfun(density(prixx))(params) + out[is.na(out)] <- 1e-100 + out +} + +pseudoprior_fun <- function(chain){ + c(1, 0.5, 0.2)[chain] +} + +eval_fun <- function(inputs, lik_fun, prior_fun, temp_fun, pseudoprior_fun){ + chain <- inputs[1] + params <- inputs[-1] + temp <- temp_fun(chain) + lik <- lik_fun(params) + prior <- prior_fun(params) + pseudoprior <- pseudoprior_fun(chain) + 1/temp * log(lik) + log(prior) + log(pseudoprior) +} + + +xv<-rep(NA,512) +yv1<-rep(NA,512) +yv2<-rep(NA,512) +yv3<-rep(NA,512) + +for(i in 1:512){ + xv[i] <- density(xx)$x[i] + yv1[i] <- eval_fun(c(1, xv[i]), lik_fun, prior_fun, temp_fun, + pseudoprior_fun) + yv2[i] <- eval_fun(c(2, xv[i]), lik_fun, prior_fun, temp_fun, + pseudoprior_fun) + yv3[i] <- eval_fun(c(3, xv[i]), lik_fun, prior_fun, temp_fun, + pseudoprior_fun) +} + +yvs <- c(yv1, yv2, yv3) +plot(xv, yv1, type = "l", + ylim = c(min(yvs, na.rm = TRUE), max(yvs, na.rm = TRUE))) +points(xv, yv2, type = "l", lty = 2) +points(xv, yv3, type = "l", lty = 3) + + + +neighbors <- matrix(FALSE, 3, 3) +neighbors[row(neighbors) == col(neighbors) + 1] <- TRUE +neighbors[row(neighbors) == col(neighbors) - 1] <- TRUE + + +#serial tempering +initial <- c(3, 0) +tout <- temper(eval_fun, initial = initial, neighbors = neighbors, + nbatch = 1000, blen = 1, + temp_fun = temp_fun, lik_fun = lik_fun, prior_fun = prior_fun, + pseudoprior_fun = 
pseudoprior_fun) + +#parallel tempering +initial <- matrix(c(0,0,0), 3, 1) +ptout <- temper(eval_fun, initial = initial, neighbors = neighbors, + nbatch = 1000, blen = 1, parallel = TRUE, + temp_fun = temp_fun, lik_fun = lik_fun, prior_fun = prior_fun, + pseudoprior_fun = pseudoprior_fun) + + + + +# notes on temper + +so, in looking through the mcmc package's code, the main calculator function +logh takes 3 arguments: the objective function (returns log unnormalized +density), the state, the R environment at the time of the C call within +the temper function + +basically, the temper function creates a function that is an evaluation +of the objective function at the state variable, but it doesn't actually do +the evaluation, it just declares the relationship. this way, the function +(via the relationship) can be used on any state variable value, most +especially the proposal values. +it then creates an object that is the environment (including things passed +into temper via ...). this allows both more global environment variables and +generalized inputs via ... 
to be passed down to where the execution of the +function actually happens + +this is because mcmc uses R to call C to call R + + diff --git a/tests/figs/check-lda-plot-functions/base-lda-plot-bottom-panel.svg b/tests/figs/check-lda-plot-functions/base-lda-plot-bottom-panel.svg deleted file mode 100644 index ca522d20..00000000 --- a/tests/figs/check-lda-plot-functions/base-lda-plot-bottom-panel.svg +++ /dev/null @@ -1,64 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -0 -100 -200 -300 -400 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - -Document -Proportion - - - - - - - - - - - - - diff --git a/tests/figs/check-lda-plot-functions/base-lda-plot-top-panel.svg b/tests/figs/check-lda-plot-functions/base-lda-plot-top-panel.svg deleted file mode 100644 index 00c4ffee..00000000 --- a/tests/figs/check-lda-plot-functions/base-lda-plot-top-panel.svg +++ /dev/null @@ -1,142 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -0.0 -0.2 -0.4 -0.6 - - - - - - - - - - - -Total Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BA -DM -DO -DS -NA. -OL -OT -PB -PE -PF -PH -PI -PL -PM -PP -RF -RM -RO -SF -SH -SO - - - - - -Topic 1 - -Topic 2 - - - - - - - diff --git a/tests/figs/check-lda-plot-functions/base-lda-plot.svg b/tests/figs/check-lda-plot-functions/base-lda-plot.svg deleted file mode 100644 index 46cb6bc9..00000000 --- a/tests/figs/check-lda-plot-functions/base-lda-plot.svg +++ /dev/null @@ -1,196 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -0.0 -0.2 -0.4 -0.6 - - - - - - - - - - - -Total Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BA -DM -DO -DS -NA. 
-OL -OT -PB -PE -PF -PH -PI -PL -PM -PP -RF -RM -RO -SF -SH -SO - - - - - -Topic 1 - -Topic 2 - - - - - - - - - - - - - - - - - -0 -100 -200 -300 -400 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - - - - - - -Document -Proportion - - - - - - - - - - - - - diff --git a/tests/figs/check-lda-plot-functions/base-lda-set-selected-plot.svg b/tests/figs/check-lda-plot-functions/base-lda-set-selected-plot.svg deleted file mode 100644 index c2c9b056..00000000 --- a/tests/figs/check-lda-plot-functions/base-lda-set-selected-plot.svg +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 -1.2 - - - - - - - - - - - -Total Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BA -DM -DO -DS -NA. -OL -OT -PB -PE -PF -PH -PI -PL -PM -PP -RF -RM -RO -SF -SH -SO - - - - - -Topic 1 - -Topic 2 - -Topic 3 - -Topic 4 - - - - - - - - - - - - - - - - - -0 -100 -200 -300 -400 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - - - - - - -Document -Proportion - - - - - - - - - - - - - - - diff --git a/tests/figs/check-lda-plot-functions/lda-plot-with-time-x.svg b/tests/figs/check-lda-plot-functions/lda-plot-with-time-x.svg deleted file mode 100644 index 014a20c1..00000000 --- a/tests/figs/check-lda-plot-functions/lda-plot-with-time-x.svg +++ /dev/null @@ -1,196 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -0.0 -0.2 -0.4 -0.6 - - - - - - - - - - - -Total Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BA -DM -DO -DS -NA. 
-OL -OT -PB -PE -PF -PH -PI -PL -PM -PP -RF -RM -RO -SF -SH -SO - - - - - -Topic 1 - -Topic 2 - - - - - - - - - - - - - - - - - -0 -100 -200 -300 -400 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - - - - - - -New Moon -Proportion - - - - - - - - - - - - - diff --git a/tests/figs/check-lda-ts-plot-functions/base-lda-ts-non-interactive-plot.svg b/tests/figs/check-lda-ts-plot-functions/base-lda-ts-non-interactive-plot.svg deleted file mode 100644 index 91e43445..00000000 --- a/tests/figs/check-lda-ts-plot-functions/base-lda-ts-non-interactive-plot.svg +++ /dev/null @@ -1,765 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -0.0 -0.2 -0.4 -0.6 - - - - - - - - - - - -Total Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BA -DM -DO -DS -NA. -OL -OT -PB -PE -PF -PH -PI -PL -PM -PP -RF -RM -RO -SF -SH -SO - - - - - -Topic 1 - -Topic 2 - - - - - - - - - - - - - - - - - -0 -100 -200 -300 -400 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - - - - - - -Proportion - - - - - - - - - - - - - - - - - - - - - - - -1 - - - - - - - - - - - -0.00 -0.06 -0.12 -0.18 -0.24 - - - - - - -0 -100 -200 -300 -400 -Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0.00 -0.25 -0.50 -0.75 -1.00 - - - - - - -0 -100 -200 -300 -400 -Proportion -newmoon - - - - - - - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-autocorr-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-autocorr-plot.svg deleted file mode 100644 index ba7f6802..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-autocorr-plot.svg +++ /dev/null @@ -1,80 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0 -2 -4 -6 -8 -10 -12 - - - - - --0.5 -0.0 -0.5 -1.0 - -Lag -Autocorrelation - - - - - - - - - - - - - -Series x - diff --git a/tests/figs/check-ts-plot-functions/base-ts-diagnostic-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-diagnostic-plot.svg deleted file mode 100644 index 095203b3..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-diagnostic-plot.svg +++ /dev/null @@ -1,286 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -5 -10 -15 -20 - - - - - --0.1 -0.0 -0.1 -0.2 - - - - - - -Iteration -Segment 1 Topic 2 Intercept - - - - - - - - - - - - - - - - - - - - - - --0.1 -0.0 -0.1 -0.2 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - -Segment 1 Topic 2 Intercept -% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Segment 1 Topic 2 Intercept -Frequency - - - - - - - - - - --0.1 -0.0 -0.1 -0.2 - - - - - - - - -0 -1 -2 -3 -4 -5 -6 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0 -2 -4 -6 -8 -10 -12 - - - - - - - - - --0.4 --0.2 -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - -Lag -Autocorrelation - - - - - - - - - - - - - -Series x - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-ecdf-plot.svg 
b/tests/figs/check-ts-plot-functions/base-ts-ecdf-plot.svg deleted file mode 100644 index d22097b9..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-ecdf-plot.svg +++ /dev/null @@ -1,75 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -270 -280 -290 -300 -310 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - -parameter value -% - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-eta-diagnostic-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-eta-diagnostic-plot.svg deleted file mode 100644 index d3df3dfe..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-eta-diagnostic-plot.svg +++ /dev/null @@ -1,293 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -5 -10 -15 -20 - - - - - - --1.8 --1.6 --1.4 --1.2 --1.0 - - - - - - -Iteration -Segment 2 Topic 2 Intercept - - - - - - - - - - - - - - - - - - - - - - - --1.8 --1.6 --1.4 --1.2 --1.0 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - -Segment 2 Topic 2 Intercept -% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Segment 2 Topic 2 Intercept -Frequency - - - - - - - - - - - - --2.0 --1.8 --1.6 --1.4 --1.2 --1.0 - - - - - - - - - -0 -1 -2 -3 -4 -5 -6 -7 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0 -2 -4 -6 -8 -10 -12 - - - - - - - - - --0.4 --0.2 -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - -Lag -Autocorrelation - - - - - - - - - - - - - -Series x - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-gamma-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-gamma-plot.svg deleted file mode 100644 index f92c1ae3..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-gamma-plot.svg +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -0.00 -0.25 -0.50 -0.75 -1.00 - - - - - - -0 -100 -200 -300 -400 -Proportion -newmoon - - - - - - - - - - - - - - diff --git 
a/tests/figs/check-ts-plot-functions/base-ts-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-plot.svg deleted file mode 100644 index 1fe5c64d..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-plot.svg +++ /dev/null @@ -1,590 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -1 - - - - - - - - - - - -0.000 -0.088 -0.175 -0.262 -0.350 - - - - - - -0 -100 -200 -300 -400 -Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0.00 -0.25 -0.50 -0.75 -1.00 - - - - - - -0 -100 -200 -300 -400 -Proportion -newmoon - - - - - - - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-posterior-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-posterior-plot.svg deleted file mode 100644 index 35a70eba..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-posterior-plot.svg +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - - - - - - -parameter value -Frequency - - - - - - -270 -280 -290 -300 -310 - - - - - - -0 -2 -4 -6 -8 - - - - - - - - - - - - - - - diff --git 
a/tests/figs/check-ts-plot-functions/base-ts-rho-diagnostic-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-rho-diagnostic-plot.svg deleted file mode 100644 index f701ae6b..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-rho-diagnostic-plot.svg +++ /dev/null @@ -1,254 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -5 -10 -15 -20 - - - - - - -270 -280 -290 -300 -310 - - - - - - -Iteration -Change point 1 location - - - - - - - - - - - - - - - - - - - - - - - -270 -280 -290 -300 -310 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - - - - - - -Change point 1 location -% - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Change point 1 location -Frequency - - - - - - - - - - - -270 -280 -290 -300 -310 - - - - - - -0 -2 -4 -6 -8 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0 -2 -4 -6 -8 -10 -12 - - - - - --0.5 -0.0 -0.5 -1.0 - - - - - - -Lag -Autocorrelation - - - - - - - - - - - - - -Series x - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-rho-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-rho-plot.svg deleted file mode 100644 index 5d69c4fa..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-rho-plot.svg +++ /dev/null @@ -1,527 +0,0 @@ - - - - - - - - - - - - - - - - - -newmoon - - - - - - -0.000 -0.088 -0.175 -0.262 -0.350 - - - - - - -0 -100 -200 -300 -400 -Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-summary-plot.svg b/tests/figs/check-ts-plot-functions/base-ts-summary-plot.svg deleted file mode 100644 index 1fe5c64d..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-summary-plot.svg +++ /dev/null @@ -1,590 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -1 - - - - - - - - - - - -0.000 -0.088 -0.175 -0.262 -0.350 - - - - - - -0 -100 -200 -300 -400 -Proportion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -0.00 -0.25 -0.50 -0.75 -1.00 - - - - - - -0 -100 -200 -300 -400 -Proportion -newmoon - - - - - - - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/base-ts-trace-plot.svg 
b/tests/figs/check-ts-plot-functions/base-ts-trace-plot.svg deleted file mode 100644 index c5925f21..00000000 --- a/tests/figs/check-ts-plot-functions/base-ts-trace-plot.svg +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - -5 -10 -15 -20 - - - - - - -270 -280 -290 -300 -310 - -Iteration -parameter value - - - - - - - - diff --git a/tests/figs/check-ts-plot-functions/rho-line-plot.svg b/tests/figs/check-ts-plot-functions/rho-line-plot.svg deleted file mode 100644 index 219b91cb..00000000 --- a/tests/figs/check-ts-plot-functions/rho-line-plot.svg +++ /dev/null @@ -1,59 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - --10 --5 -0 -5 -10 - - - - - - - -0.0 -0.2 -0.4 -0.6 -0.8 -1.0 - -1 -1 - - - - - - - diff --git a/tests/static.csv b/tests/static.csv new file mode 100644 index 00000000..dee630fa --- /dev/null +++ b/tests/static.csv @@ -0,0 +1,31 @@ +"V1","V2","V3","V4","V5","V6","V7" +7,14,24,0,7,123,5 +10,11,23,1,6,103,5 +15,16,41,1,8,152,6 +9,14,29,1,6,134,4 +10,11,26,1,6,145,4 +12,14,31,1,6,149,5 +11,16,36,1,9,150,5 +6,6,18,0,5,86,3 +12,19,33,1,7,174,5 +10,13,32,1,7,128,5 +14,18,41,1,12,185,6 +7,11,25,1,6,85,4 +8,11,31,1,7,146,5 +5,7,21,0,4,97,3 +6,10,21,0,4,113,3 +6,8,22,0,5,96,2 +6,8,16,0,4,83,2 +11,16,34,1,8,177,6 +8,11,22,0,5,95,3 +11,15,36,1,7,151,6 +9,14,26,1,8,129,5 +8,13,28,0,6,142,4 +9,12,29,1,6,129,3 +6,7,20,0,3,71,3 +8,12,30,1,5,106,3 +5,6,17,0,4,73,2 +10,14,27,1,5,128,3 +10,16,31,1,7,122,5 +8,11,30,1,7,122,3 +7,9,19,0,4,113,3 diff --git a/tests/testthat/test-02-lda.R b/tests/testthat/test-02-lda.R deleted file mode 100644 index fb59c390..00000000 --- a/tests/testthat/test-02-lda.R +++ /dev/null @@ -1,72 +0,0 @@ -context("Check LDA functions") - -data(rodents) -lda_data <- rodents$document_term_table -lda <- LDA_set(lda_data, c(2, 4), nseeds = 2, list(quiet = TRUE)) - -test_that("check output from LDA_set", { - expect_equal(length(lda), 4) - expect_is(lda, "LDA_set") - expect_is(lda[[1]], "LDA") - expect_is(lda[[2]], "LDA") 
- expect_is(lda[[3]], "LDA") - expect_is(lda[[4]], "LDA") -}) - -test_that("check logLik for LDA_VEM", { - expect_is(logLik(lda[[1]]), "logLik") - expect_equal(round(as.numeric(logLik(lda[[1]]))), -47889) -}) - -test_that("check output from prep_LDA_control", { - expect_is(prep_LDA_control(1), "list") - expect_equal(prep_LDA_control(1)$seed, 1) - expect_equal(prep_LDA_control(1, list(seed = 10))$seed, 1) -}) - - -test_that("check selection via select_LDA", { - expect_is(select_LDA(lda), "LDA_set") - expect_equal(length(select_LDA(lda)), 1) - expect_equal(select_LDA(lda)[1], lda[3]) - expect_error(select_LDA("ok")) -}) - -test_that("check check_LDA_set_inputs", { - expect_silent(check_LDA_set_inputs(lda_data, 2, 1, list())) - expect_error(check_LDA_set_inputs(lda_data, 2, "ok", 2)) - expect_error(check_LDA_set_inputs(lda_data, "ok", 2, list())) - expect_error(check_LDA_set_inputs("ok", 2, 1, list())) -}) - - -test_that("check package_LDA_set", { - document_term_table <- lda_data - topics <- 2 - nseeds <- 1 - check_LDA_set_inputs(document_term_table, topics, nseeds, list()) - mod_topics <- rep(topics, each = length(seq(2, nseeds * 2, 2))) - mod_seeds <- rep(seq(2, nseeds * 2, 2), length(topics)) - nmods <- length(mod_topics) - mods <- vector("list", length = nmods) - for (i in 1:nmods){ - LDA_msg(mod_topics[i], mod_seeds[i], list()) - control_i <- prep_LDA_control(seed = mod_seeds[i], control = list()) - mods[[i]] <- LDA(document_term_table, k = mod_topics[i], - control = control_i) - } - expect_is(package_LDA_set(mods, mod_topics, mod_seeds), "LDA_set") - expect_error(package_LDA_set(mods, 0.2, mod_seeds)) - expect_error(package_LDA_set(mods, mod_topics, 0.2)) - expect_error(package_LDA_set("ok", mod_topics, mod_seeds)) -}) - -test_that("check LDA_msg", { - expect_message(LDA_msg(2, 1, list())) - expect_error(LDA_msg(2, 0.5, list())) -}) - -test_that("Check LDA_controls_list", { - expect_is(LDA_set_control(), "list") - expect_equal(length(LDA_set_control()), 4) -}) 
diff --git a/tests/testthat/test-03-lda_plot.R b/tests/testthat/test-03-lda_plot.R deleted file mode 100644 index 5405e3ed..00000000 --- a/tests/testthat/test-03-lda_plot.R +++ /dev/null @@ -1,66 +0,0 @@ -context("Check LDA plot functions") -tenv <- "cran" - -data(rodents) -lda_data <- rodents$document_term_table -ldas <- LDA_set(lda_data, c(2, 4), 2, list(quiet = TRUE)) -lda <- ldas[[1]] -xtime <- rodents$newmoon - -test_that("check output from set_LDA_plot_colors", { - col_default <- set_LDA_plot_colors(x = lda) - expect_equal(col_default, c("#0D0887CC", "#FCCE25CC")) - col_A <- set_LDA_plot_colors(x = lda, option = "A") - expect_equal(col_A, c("#000004CC", "#FECE91CC")) - col_grey <- set_LDA_plot_colors(x = lda, cols = "grey") - expect_equal(col_grey, c("#000000", "#CCCCCC")) - col_grey_and_A <- set_LDA_plot_colors(x = lda, cols = "grey", option = "A") - expect_equal(col_grey_and_A, c("#000000", "#CCCCCC")) - expect_error(set_LDA_plot_colors(x = lda, 1)) - col_extra <- set_LDA_plot_colors(x = lda, cols = 1:3) - expect_equal(col_extra, 1:2) -}) - -test_that("check plotting of plot.LDA", { - if (tenv == "cran"){ - expect_silent(plot(x = lda)) - expect_silent(plot(x = lda, xtime = xtime, xname = "New Moon")) - } else{ - plot(x = lda) - LDA_plot <- recordPlot() - vdiffr::expect_doppelganger("Base LDA plot", LDA_plot) - plot(x = lda, xtime = xtime, xname = "New Moon") - LDA_plot_xtime <- recordPlot() - vdiffr::expect_doppelganger("LDA plot with time x", LDA_plot_xtime) - } -}) - -test_that("check plotting of plot.LDA_set", { - sellda <- select_LDA(ldas) - if (tenv == "cran"){ - expect_silent(plot(x = sellda)) - } else{ - plot(x = sellda) - LDA_set_plot <- recordPlot() - vdiffr::expect_doppelganger("Base LDA_set selected plot", LDA_set_plot) - } -}) - -test_that("check LDA plot panels", { - cols <- set_LDA_plot_colors(x = lda, cols = NULL, option = "E") - - if (tenv == "cran"){ - expect_silent(LDA_plot_top_panel(x = lda, cols)) - expect_silent(LDA_plot_bottom_panel(x 
= lda, xtime = NULL, - xname = NULL, cols)) - } else{ - LDA_plot_top_panel(x = lda, cols) - LDA_top_plot <- recordPlot() - vdiffr::expect_doppelganger("Base LDA plot top panel", LDA_top_plot) - LDA_plot_bottom_panel(x = lda, xtime = NULL, xname = NULL, cols) - LDA_bottom_plot <- recordPlot() - vdiffr::expect_doppelganger("Base LDA plot bottom panel", LDA_bottom_plot) - } -}) - - diff --git a/tests/testthat/test-04-LDA_TS.R b/tests/testthat/test-04-LDA_TS.R deleted file mode 100644 index 2fb3d2c2..00000000 --- a/tests/testthat/test-04-LDA_TS.R +++ /dev/null @@ -1,74 +0,0 @@ -context("Check LDA_TS functions") - -data(rodents) -lda_data <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table - mod0 <- LDA_TS(rodents, - topics = 2, nseeds = 1, formulas = ~ 1, nchangepoints = 0, - timename = "newmoon", - control = list(nit = 10)) - mod1 <- LDA_TS(rodents, - topics = 2, nseeds = 1, formulas = ~ 1, nchangepoints = 1, - timename = "newmoon", - control = list(nit = 50)) - - -test_that("LDA_TS on 0 changepoints", { - expect_is(mod0, "LDA_TS") - expect_equal(length(names(mod0)), 4) - expect_is(mod0[[4]], "TS_fit") -}) - -test_that("LDA_TS on 1 changepoints", { - - expect_is(mod1, "LDA_TS") - expect_equal(length(names(mod1)), 4) - expect_is(mod1[[4]], "TS_fit") -}) - -test_that("check print on LDA_TS", { - expect_output(print(mod1)) -}) - -test_that("Check LDA_TS_controls_list", { - expect_is(LDA_TS_control(), "list") - expect_equal(length(LDA_TS_control()), 3) -}) - -test_that("Check conform_LDA_TS_data", { - expect_is(conform_LDA_TS_data(rodents), "list") - expect_is(conform_LDA_TS_data(rodents[[1]]), "list") - expect_message(conform_LDA_TS_data(rodents[[1]])) - expect_error(conform_LDA_TS_data(list(term1 = 1, term2 = 2))) - - expect_is(conform_LDA_TS_data(list(term = rodents[[1]])), "list") - expect_message(conform_LDA_TS_data(list(term = rodents[[1]]))) - - expect_error(conform_LDA_TS_data(list(term = rodents[[1]], covariate1 = 1, - 
covariate2 = 2))) - expect_error(conform_LDA_TS_data("ok")) -}) - -test_that("Check package_LDA_TS", { - topics <- 2 - nseeds <- 1 - formulas <- ~ 1 - nchangepoints <- 1 - weights <- document_weights(lda_data) - timename <- "newmoon" - control <- LDA_TS_control(nit = 50) - LDAs <- LDA_set(lda_data, topics, nseeds, - control$LDA_set_control) - sel_LDA <- select_LDA(LDAs, control$LDA_set_control) - TSs <- TS_on_LDA(sel_LDA, document_covariate_table, formulas, nchangepoints, - timename, weights, control$TS_control) - sel_TSs <- select_TS(TSs, control$TS_control) - - expect_is(package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs), "LDA_TS") - expect_equal(length(package_LDA_TS(LDAs, sel_LDA, TSs, sel_TSs)), 4) - expect_error(package_LDA_TS()) - expect_error(package_LDA_TS("ok", sel_LDA, TSs, sel_TSs)) - expect_error(package_LDA_TS(LDAs, "ok", TSs, sel_TSs)) - expect_error(package_LDA_TS(LDAs, sel_LDA, "ok", sel_TSs)) - expect_error(package_LDA_TS(LDAs, sel_LDA, TSs, "ok")) -}) diff --git a/tests/testthat/test-05-LDA_TS_plots.R b/tests/testthat/test-05-LDA_TS_plots.R deleted file mode 100644 index 44bd0ea3..00000000 --- a/tests/testthat/test-05-LDA_TS_plots.R +++ /dev/null @@ -1,36 +0,0 @@ -context("Check LDA_TS plot functions") -tenv <- "cran" - -data(rodents) -lda_data <- rodents$document_term_table -document_term_table <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table - -mod <- LDA_TS(rodents, - topics = 2, nseeds = 1, formulas = ~ 1, nchangepoints = 1, - timename = "newmoon", - control = list(nit = 100, seed = 1)) - -test_that("check plot for LDA_TS", { - if (tenv == "cran"){ - expect_silent(plot(mod, interactive = FALSE)) - } else{ - plot(mod, interactive = FALSE) - LDA_TS_set_plot <- recordPlot() - vdiffr::expect_doppelganger("Base LDA_TS non-interactive plot", - LDA_TS_set_plot) - } -}) - -test_that("check color list creation function", { - expect_equal(length(set_LDA_TS_plot_cols()), 2) - 
expect_equal(names(set_LDA_TS_plot_cols()), c("LDA", "TS")) - expect_equal(length(set_LDA_TS_plot_cols()[[1]]), 3) - expect_equal(length(set_LDA_TS_plot_cols()[[2]]), 2) - expect_equal(names(set_LDA_TS_plot_cols()[[2]]), c("rho", "gamma")) - expect_equal(length(set_LDA_TS_plot_cols()[[2]][[1]]), 3) - expect_equal(names(set_LDA_TS_plot_cols()[[2]][[1]]), - c("cols", "option", "alpha")) - expect_equal(names(set_LDA_TS_plot_cols()[[2]][[2]]), - c("cols", "option", "alpha")) -}) diff --git a/tests/testthat/test-06-multinom_TS.R b/tests/testthat/test-06-multinom_TS.R deleted file mode 100644 index be6416b6..00000000 --- a/tests/testthat/test-06-multinom_TS.R +++ /dev/null @@ -1,95 +0,0 @@ -context("Check multinomial TS functions") - -data(rodents) -lda_data <- rodents$document_term_table -lda <- LDA_set(lda_data, c(4), nseeds = 1, list(quiet = TRUE)) -dct <- rodents$document_covariate_table -mts_data <- data.frame(dct) -mts_data$gamma <- lda[[1]]@gamma -timename <- "newmoon" - -test_that("check packaging of chunk fits", { - TS_chunk_memo <- memoise_fun(multinom_TS_chunk, TRUE) - chunks <- prep_chunks(data = mts_data, changepoints = c(20,50), - timename = timename) - nchunks <- nrow(chunks) - fits <- vector("list", length = nchunks) - for (i in 1:nchunks){ - fits[[i]] <- TS_chunk_memo(data = mts_data, formula = gamma ~ 1, - chunk = chunks[i, ], timename = timename, - weights = NULL) - } - packaged <- package_chunk_fits(chunks, fits) - expect_is(packaged, "multinom_TS_fit") - expect_equal(round(packaged$logLik, 2), -516.58) -}) - -test_that("check logLik for multinom_TS_fit", { - mts <- multinom_TS(data = mts_data, formula = gamma~1, - changepoints = c(20,50), timename = "newmoon", weights = NULL) - expect_is(logLik(mts), "logLik") - expect_equal(round(as.numeric(logLik(mts))), -517) -}) - -test_that("check good output from multinom_TS", { - mts <- multinom_TS(data = mts_data, formula = gamma~1, - changepoints = c(20,50), timename = "newmoon", weights = NULL) - 
expect_is(mts, "list") - expect_is(mts, "multinom_TS_fit") - expect_equal(length(mts), 3) - expect_equal(names(mts), c("chunk models", "logLik", "chunks")) - expect_equal(length(mts$"chunk models"), 3) - expect_is(mts$logLik, "numeric") -}) - -test_that("check check_changepoints", { - expect_silent(check_changepoints()) - expect_silent(check_changepoints(1)) - expect_error(check_changepoints("ok")) - expect_error(check_changepoints(0.3)) -}) - -test_that("check failed output from multinom_TS", { - mts <- multinom_TS(data = mts_data, formula = gamma~1, - changepoints = c(50,40), timename = "newmoon", weights = NULL) - expect_is(mts, "list") - expect_equal(length(mts), 3) - expect_equal(names(mts), c("chunk models", "logLik", "chunks")) - expect_equal(mts$"chunk models", NA) - expect_equal(mts$logLik, -Inf) -}) - -test_that("check bad change point location catching of - verify_changepoint_locations", { - expect_equal(verify_changepoint_locations(mts_data, -1, "newmoon"), FALSE) - expect_equal(verify_changepoint_locations(mts_data, 1e5, "newmoon"), FALSE) - expect_equal(verify_changepoint_locations(mts_data, NULL, "newmoon"), TRUE) - expect_equal(verify_changepoint_locations(mts_data, 100, "newmoon"), TRUE) - expect_equal(verify_changepoint_locations( - mts_data, c(10, 50, 100), "newmoon"), TRUE) -}) - -test_that("check memoization of multinom_TS_chunk", { - expect_is(memoise_fun(multinom_TS_chunk, TRUE), "memoised") -}) - -chunk <- data.frame(start = 0, end = 40) -test_that("check multinom_TS_chunk", { - expect_is(multinom_TS_chunk(mts_data, "gamma ~ 1", chunk, timename, NULL), - "multinom") -}) - -test_that("check memoised multinom_TS_chunk", { - multinom_TS_chunk_memo <- memoise_fun(multinom_TS_chunk, TRUE) - expect_is( - multinom_TS_chunk_memo(mts_data, "gamma ~ 1", chunk, timename), - "multinom") -}) - -test_that("check prepping of chunks", { - expect_is(prep_chunks(mts_data, NULL, "newmoon"), "data.frame") - expect_equal(prep_chunks(mts_data, NULL, 
"newmoon")$start, 1) - expect_equal(prep_chunks(mts_data, NULL, "newmoon")$end, 467) - expect_equal(prep_chunks(mts_data, c(10), "newmoon")$start, c(1, 11)) - expect_equal(prep_chunks(mts_data, c(10), "newmoon")$end, c(10, 467)) -}) diff --git a/tests/testthat/test-07-ptMCMC.R b/tests/testthat/test-07-ptMCMC.R deleted file mode 100644 index 372fa947..00000000 --- a/tests/testthat/test-07-ptMCMC.R +++ /dev/null @@ -1,262 +0,0 @@ -context("Check ptMCMC functions") - -# use old RNG method for sample (for test reproducibility) -if ("sample.kind" %in% names(formals(RNGkind))){ - suppressWarnings(RNGkind(sample.kind = "Rounding")) -} - -data(rodents) -lda_data <- rodents$document_term_table -document_term_table <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table -topics <- 2 -nseeds <- 1 -formulas <- ~ 1 -nchangepoints <- 1 -weights <- document_weights(document_term_table) -timename <- "newmoon" -LDAs <- LDA_set(document_term_table, topics, nseeds) -LDA_models <- select_LDA(LDAs) -control <- list(nit = 20, seed = 1) -mods <- expand_TS(LDA_models, formulas, nchangepoints) -formula <- mods$formula[[1]] -nchangepoints <- mods$nchangepoints[1] -data <- prep_TS_data(document_covariate_table, LDA_models, mods, 1) - -set.seed(1) -rho_dist0 <- est_changepoints(data, formula, nchangepoints = 0, timename, - weights, control) -rho_dist <- est_changepoints(data, formula, nchangepoints, timename, - weights, control) -eta_dist <- est_regressors(rho_dist, data, formula, timename, weights, - control) - - -saves <- prep_saves(nchangepoints, control) -inputs <- prep_ptMCMC_inputs(data, formula, nchangepoints, timename, weights, - control) -cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) -ids <- prep_ids(control) - -test_that("check prep_proposal_dist", { - pd <- prep_proposal_dist(nchangepoints, control) - expect_equal(length(pd), 2) - expect_equal(dim(pd[[1]]), c(20, 6)) - pd2 <- prep_proposal_dist(0, control) - 
expect_equal(length(pd2), 2) - expect_equal(dim(pd2[[1]]), c(20, 6)) - expect_error(prep_proposal_dist("ok", control)) - expect_error(prep_proposal_dist(nchangepoints, "ok")) -}) - - - -test_that("check prep_ptMCMC_inputs", { - inpts <- prep_ptMCMC_inputs(data, formula, nchangepoints, timename, weights, - control) - expect_is(inpts, "ptMCMC_inputs") - expect_equal(length(inpts[[2]]), 6) - - expect_error( - prep_ptMCMC_inputs("ok", formula, nchangepoints, timename, weights, - control)) - expect_error( - prep_ptMCMC_inputs(data, "ok", nchangepoints, timename, weights, control)) - expect_error( - prep_ptMCMC_inputs(data, formula, "ok", timename, weights, control)) - expect_error( - prep_ptMCMC_inputs(data, formula, nchangepoints, timename, "ok", control)) - expect_error( - prep_ptMCMC_inputs(data, formula, nchangepoints, "ok", weights, control)) - expect_error(suppressWarnings( - prep_ptMCMC_inputs(data, formula, nchangepoints, timename, weights, - "ok"))) -}) - - - -test_that("check prep_ids", { - expect_equal(prep_ids(TS_control()), 1:6) - expect_error(prep_ids("ok")) - expect_error(prep_ids(list(ntemps = 0.3))) -}) -test_that("check update_ids", { - set.seed(123) - steps <- step_chains(1, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - ids <- update_ids(ids, swaps) - expect_equal(ids, c(1, 2, 4, 3, 5, 6)) -}) - - -test_that("check proposed_step_mods", { - pdist <- inputs$pdist - ntemps <- length(inputs$temps) - selection <- cbind(pdist$which_steps[1, ], 1:ntemps) - prop_changepts <- cpts$changepts - curr_changepts_s <- cpts$changepts[selection] - prop_changepts_s <- curr_changepts_s + pdist$steps[1, ] - if(all(is.na(prop_changepts_s))){ - prop_changepts_s <- NULL - } - prop_changepts[selection] <- prop_changepts_s - mods <- proposed_step_mods(prop_changepts, inputs) - - expect_is(mods, "list") - expect_is(mods[[1]], "multinom_TS_fit") - expect_is(mods[[1]][[1]], "list") - expect_is(mods[[1]][[1]][[1]], "multinom") - 
expect_equal(round(mods[[1]][[1]][[1]]$deviance, 1), 43.9) -}) - -test_that("check propose_step", { - prop_step <- propose_step(1, cpts, inputs) - expect_equal(length(prop_step), 2) - expect_equal(names(prop_step), c("changepts", "lls")) - expect_equal(prop_step[[1]][1,1], 198) -}) -test_that("check eval_step", { - set.seed(1) - prop_step <- propose_step(1, cpts, inputs) - accept_step <- eval_step(1, cpts, prop_step, inputs) - expect_equal(accept_step, c(T, F, T, T, T, T)) -}) -test_that("check take_step", { - prop_step <- propose_step(1, cpts, inputs) - accept_step <- eval_step(1, cpts, prop_step, inputs) - taken <- take_step(cpts, prop_step, accept_step) - expect_equal(length(taken), 3) - expect_equal(names(taken), c("changepts", "lls", "accept_step")) - expect_equal(taken[[3]][3], TRUE) -}) - - -test_that("check step_chains", { - steps <- step_chains(1, cpts, inputs) - expect_equal(length(steps), 3) - expect_equal(names(steps), c("changepts", "lls", "accept_step")) - expect_equal(steps[[3]][3], TRUE) -}) - -test_that("check swap_chains", { - steps <- step_chains(1, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - expect_equal(length(swaps), 4) - expect_equal(names(swaps), c("changepts", "lls", "ids", "accept_swap")) - expect_equal(swaps[[3]][3], 4) -}) - -test_that("check count_trips", { - set.seed(1) - expect_equal(length(count_trips(rho_dist$ids)), 2) - expect_equal(names(count_trips(rho_dist$ids)), - c("trip_counts", "trip_rates")) - expect_equal(count_trips(rho_dist$ids)[[1]][[3]], 0) - - expect_equal(count_trips(matrix(1, 6, 100))[[1]][3], 0) -}) - -test_that("check diagnose_ptMCMC", { - set.seed(1) - expect_equal(diagnose_ptMCMC(rho_dist0), NULL) - expect_equal(length(diagnose_ptMCMC(rho_dist)), 4) - expect_equal(names(diagnose_ptMCMC(rho_dist)), - c("step_acceptance_rate", "swap_acceptance_rate", "trip_counts", - "trip_rates")) - expect_equal(diagnose_ptMCMC(rho_dist)[[1]][1], 0.25) -}) - -test_that("check prep_saves", { - svs <- 
prep_saves(nchangepoints, control) - expect_is(svs, "list") - expect_equal(length(svs), 5) - expect_equal(dim(svs[[1]]), c(1, 6, 20)) - expect_error(prep_saves("ok", control)) - expect_error(prep_saves(nchangepoints, "ok")) -}) -test_that("check update_saves", { - set.seed(1) - saves <- prep_saves(nchangepoints, control) - inputs <- prep_ptMCMC_inputs(data, formula, nchangepoints, - timename, weights, control) - cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) - ids <- prep_ids(control) - steps <- step_chains(1, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(1, saves, steps, swaps) - expect_is(saves, "list") - expect_equal(length(saves), 5) - expect_equal(dim(saves[[1]]), c(1, 6, 20)) - expect_equal(saves[[1]][1, 1, 1], 309) -}) - -test_that("check process_saves", { - set.seed(1) - saves <- prep_saves(nchangepoints, control) - inputs <- prep_ptMCMC_inputs(data, formula, nchangepoints, - timename, weights, control) - cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) - ids <- prep_ids(control) - for(i in 1:control$nit){ - steps <- step_chains(i, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - saves <- update_saves(i, saves, steps, swaps) - cpts <- update_cpts(cpts, swaps) - ids <- update_ids(ids, swaps) - } - out <- process_saves(saves, control) - expect_is(out, "list") - expect_equal(length(out), 5) - expect_equal(dim(out[[1]]), c(1, 6, 20)) - expect_equal(out[[1]][1, 1, 1], 309) - expect_equal(out[[1]][1, 1, 20], 270) - out2 <- process_saves(saves, list(burnin = 10, nit = 20)) - expect_is(out2, "list") - expect_equal(length(out2), 5) - expect_equal(dim(out2[[1]]), c(1, 6, 10)) - expect_equal(out2[[1]][1, 1, 1], 309) - expect_equal(out2[[1]][1, 1, 3], 301) -}) - -test_that("check prep_cpts", { - set.seed(1) - cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) - expect_is(cpts, "list") - expect_equal(length(cpts), 2) - 
expect_equal(cpts[[1]][1,1], 268) - - expect_error(prep_cpts("ok", formula, nchangepoints, timename, weights, - control)) - expect_error(prep_cpts(data, "ok", nchangepoints, timename, weights, - control)) - expect_error(prep_cpts(data, formula, "ok", timename, weights, control)) - expect_error(prep_cpts(data, formula, nchangepoints, "ok", weights, - control)) - expect_error(prep_cpts(data, formula, nchangepoints, timename, "ok", - control)) - expect_error(prep_cpts(data, formula, nchangepoints, timename, weights, - "ok")) -}) -test_that("check update_cpts", { - set.seed(1) - saves <- prep_saves(nchangepoints, control) - inputs <- prep_ptMCMC_inputs(data, formula, nchangepoints, timename, - weights, control) - cpts <- prep_cpts(data, formula, nchangepoints, timename, weights, control) - ids <- prep_ids(control) - steps <- step_chains(1, cpts, inputs) - swaps <- swap_chains(steps, inputs, ids) - cpts <- update_cpts(cpts, swaps) - expect_is(cpts, "list") - expect_equal(length(cpts), 2) - expect_equal(cpts[[1]][1,1], 309) -}) - -test_that("check prep_temp_sequence", { - expect_equal(length(prep_temp_sequence()), 6) - expect_equal(length(prep_temp_sequence(list(ntemps = 9))), 9) - expect_equal(round(prep_temp_sequence()[3], 2), 8) - expect_equal(round(prep_temp_sequence(list(q = 1))[3], 1), 2.8) - expect_error(prep_temp_sequence(123)) -}) - diff --git a/tests/testthat/test-08-simulate.R b/tests/testthat/test-08-simulate.R deleted file mode 100644 index 4ab35466..00000000 --- a/tests/testthat/test-08-simulate.R +++ /dev/null @@ -1,92 +0,0 @@ -context("Check simulate functions") - -# use old RNG method for sample (for test reproducibility) -if ("sample.kind" %in% names(formals(RNGkind))){ - suppressWarnings(RNGkind(sample.kind = "Rounding")) -} - -test_that("check sim_LDA_data", { - N <- c(10, 22, 15, 31) - alpha <- 1.2 - Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - Beta2 <- matrix(c(0.1, 0.2, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - Beta3 <- 
matrix(c(0.3, -0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - Beta4 <- matrix(c(0.1, 0.1, 0.1, 0.1, 0.4, 0.2), 1, 6, byrow = TRUE) - Theta <- matrix(c(0.2, 0.8, 0.8, 0.2, 0.5, 0.5, 0.9, 0.1), 4, 2, - byrow = TRUE) - Theta2 <- matrix(c(1.8, -0.8, 0.8, 0.2, 0.5, 0.5, 0.9, 0.1), 4, 2, - byrow = TRUE) - Theta3 <- matrix(c(0.9, 0.8, 0.8, 0.2, 0.5, 0.5, 0.9, 0.1), 4, 2, - byrow = TRUE) - - expect_error(sim_LDA_data(N, Beta)) - expect_error(sim_LDA_data(N, Beta, Theta)) - - expect_is(sim_LDA_data(N, Beta, alpha), "matrix") - expect_is(sim_LDA_data(N, Beta, Theta = Theta), "matrix") - - expect_equal(dim(sim_LDA_data(N, Beta, alpha)), c(4,3)) - expect_equal(round(sim_LDA_data(N, Beta, alpha, seed = 1), 2)[1,1], 2) - expect_equal(round(sim_LDA_data(N, Beta, Theta = Theta, seed = 1), 2)[1,1], - 1) - - expect_error(sim_LDA_data("ok", Beta, alpha)) - expect_error(sim_LDA_data(N + 1.1, Beta, alpha)) - expect_error(sim_LDA_data(matrix(1, 2, 2), Beta, alpha)) - expect_error(sim_LDA_data(N, "ok", alpha)) - expect_error(sim_LDA_data(N, Beta2, alpha)) - expect_error(sim_LDA_data(N, Beta3, alpha)) - expect_error(sim_LDA_data(N, alpha, alpha)) - expect_error(sim_LDA_data(N, Beta, "ok")) - expect_error(sim_LDA_data(N, Beta, rep(alpha, 2))) - expect_error(sim_LDA_data(N, Beta, -1)) - expect_error(sim_LDA_data(N, Beta, Theta = "ok")) - expect_error(sim_LDA_data(N, Beta, Theta = Theta2)) - expect_error(sim_LDA_data(N, Beta, Theta = Theta3)) - - expect_is(sim_LDA_data(N, Beta4, alpha), "matrix") - expect_equal(dim(sim_LDA_data(N, Beta4, alpha)), c(4,6)) - expect_equal(round(sim_LDA_data(N, Beta4, alpha, seed = 1), 2)[1,1], 1) - -}) - -test_that("check sim_TS_data", { - tD <- c(1, 3, 4, 6) - rho <- 3 - X <- cbind(rep(1, 4), 1:4) - Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) - expect_is(sim_TS_data(X, Eta, rho, tD), "matrix") - expect_equal(dim(sim_TS_data(X, Eta, rho, tD)), c(4,2)) - expect_equal(round(sim_TS_data(X, Eta, rho, tD), 2)[1,1], 0.18) - - 
expect_error(sim_TS_data(X, Eta, rho, "ok")) - expect_error(sim_TS_data(X, Eta, rho, matrix(1, 2, 2))) - expect_error(sim_TS_data(X, Eta, rho, tD + 0.1)) - expect_error(sim_TS_data("ok", Eta, rho, tD)) - expect_error(sim_TS_data(array(1, dim=c(2,2,2)), Eta, rho, tD)) - expect_error(sim_TS_data(X, "ok", rho, tD)) - expect_error(sim_TS_data(X, array(1, dim=c(2,2,2)), rho, tD)) - expect_error(sim_TS_data(X, Eta, "ok", tD)) - expect_error(sim_TS_data(X, Eta, 1.1, tD)) - expect_error(sim_TS_data(X, Eta, matrix(1, 2, 1), tD)) - expect_is(sim_TS_data(X, Eta[1:2,], NULL, tD), "matrix") - - expect_error(sim_TS_data(X, Eta, rho, tD, -1)) - expect_error(sim_TS_data(X, Eta, rho, tD, c(1,2))) - expect_error(sim_TS_data(X, Eta, rho, tD, "ok")) -}) - -test_that("check sim_LDA_TS_data", { - - N <- c(10, 22, 15, 31) - tD <- c(1, 3, 4, 6) - rho <- 3 - X <- cbind(rep(1, 4), 1:4) - Eta <- cbind(c(0.5, 0.3, 0.9, 0.5), c(1.2, 1.1, 0.1, 0.5)) - Beta <- matrix(c(0.1, 0.1, 0.8, 0.2, 0.6, 0.2), 2, 3, byrow = TRUE) - err <- 1 - sims <- sim_LDA_TS_data(N, Beta, X, Eta, rho, tD, err, seed = 1) - expect_is(sims, "matrix") - expect_equal(dim(sims), c(4, 3)) - expect_equal(round(sims, 2)[1,1], 1) -}) \ No newline at end of file diff --git a/tests/testthat/test-09-TS.R b/tests/testthat/test-09-TS.R deleted file mode 100644 index 85535ca8..00000000 --- a/tests/testthat/test-09-TS.R +++ /dev/null @@ -1,227 +0,0 @@ -context("Check TS functions") - -data(rodents) -lda_data <- rodents$document_term_table -document_term_table <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table -topics <- 2 -nseeds <- 1 -formulas <- ~ 1 -nchangepoints <- 1 -weights <- document_weights(document_term_table) -LDAs <- LDA_set(document_term_table, topics, nseeds) -LDA_models <- select_LDA(LDAs) -control <- list(nit = 20, seed = 1) -timename <- "newmoon" -mods <- expand_TS(LDA_models, formulas, nchangepoints) -formula <- mods$formula[[1]] -nchangepoints <- mods$nchangepoints[1] -data <- 
prep_TS_data(document_covariate_table, LDA_models, mods, 1) -TSmod <- TS(data, formula, nchangepoints, timename, weights, control) - -rho_dist <- est_changepoints(data, formula, nchangepoints, timename, weights, - control) -eta_dist <- est_regressors(rho_dist, data, formula, timename, weights, - control) -set.seed(1) -test_that("check measure_eta_vcov", { - expect_is(measure_eta_vcov(eta_dist), "matrix") - expect_equal(round(measure_eta_vcov(eta_dist)[1, 1], 2), 0.32) - expect_error(measure_eta_vcov("ok")) -}) - -test_that("check measure_rho_vcov", { - set.seed(1) - nchangepoints <- dim(rho_dist$cpts)[1] - if (is.null(nchangepoints)){ - nchangepoints <- 0 - mod <- multinom_TS(data, formula, changepoints = NULL, - timename, weights, control) - mod <- mod[[1]][[1]] - lls <- as.numeric(logLik(mod)) - rhos <- NULL - } else{ - lls <- rho_dist$lls[1, ] - rhos <- t(array(rho_dist$cpts[ , 1, ], dim = dim(rho_dist$cpts)[c(1, 3)])) - } - expect_is(measure_rho_vcov(rhos), "matrix") - expect_equal(round(measure_rho_vcov(rhos)[1, 1], 1), 572.4) - expect_error(measure_rho_vcov("ok")) -}) - - -test_that("check summarize_etas", { - sum_e <- summarize_etas(eta_dist) - expect_is(sum_e, "data.frame") - expect_equal(round(sum_e[1, 1], 2), 2.52) - expect_equal(summarize_etas(eta_dist[1:3, ])$AC10[1], as.factor("-")) - expect_error(summarize_etas("ok")) - expect_error(summarize_etas(eta_dist, LDA_controls_list())) -}) - -test_that("check summarize_rhos", { - set.seed(1) - nchangepoints <- dim(rho_dist$cpts)[1] - if (is.null(nchangepoints)){ - nchangepoints <- 0 - mod <- multinom_TS(data, formula, changepoints = NULL, - timename, weights, control) - mod <- mod[[1]][[1]] - lls <- as.numeric(logLik(mod)) - rhos <- NULL - } else{ - lls <- rho_dist$lls[1, ] - rhos <- t(array(rho_dist$cpts[ , 1, ], dim = dim(rho_dist$cpts)[c(1, 3)])) - } - - sum_r <- summarize_rhos(rhos) - expect_is(sum_r, "data.frame") - expect_equal(round(sum_r[1, 1], 2), 253) - expect_error(summarize_rhos("ok")) - 
expect_error(summarize_rhos(rhos, LDA_controls_list())) -}) - - - -test_that("check est_changepoints", { - set.seed(1) - rhos <- est_changepoints(data, formula, nchangepoints, timename, weights, - control) - expect_is(rhos, "list") - expect_equal(length(rhos), 5) - expect_equal(names(rhos), - c("cpts", "lls", "ids", "step_accepts", "swap_accepts")) - expect_equal(dim(rhos[[1]]), c(1, 6, 20)) - expect_equal(round(sum(rhos$lls), 1), -29033.3) - - expect_error(est_changepoints(data, formula, nchangepoints, - timename, weights,"ok")) - expect_error(est_changepoints(data, formula, nchangepoints, "ok", weights, - control)) - expect_error(est_changepoints(data, formula, nchangepoints, "timename", - "ok", control)) - expect_error(est_changepoints(data, formula, "ok", - timename, weights, control)) - expect_error(est_changepoints(data, "ok", nchangepoints, - timename, weights, control)) - expect_error(est_changepoints("ok", formula, nchangepoints, - timename, weights, control)) -}) - -test_that("check est_regressors", { - set.seed(1) - rhos <- est_changepoints(data, formula, nchangepoints, - timename, weights, control) - etas <- est_regressors(rhos, data, formula, timename, weights, control) - set.seed(1) - rhos2 <- est_changepoints(data, formula, nchangepoints = 2, - timename, weights, list(nit = 20, seed = 1)) - etas2 <- est_regressors(rhos2, data, formula, timename, weights, - list(nit = 20, seed = 1)) - - expect_is(etas, "matrix") - expect_equal(colnames(etas), c("1_2:(Intercept)", "2_2:(Intercept)")) - expect_equal(dim(etas), c(20, 2)) - expect_equal(round(sum(etas[ , 1]), 1), 35.7) - expect_equal(round(sum(etas2[1:10 , 1]), 1), 15.2) - expect_error(est_regressors("ok", data, formula, timename,weights, - control)) - expect_error(est_regressors(rhos, data, formula, weights,timename, "ok")) - expect_error(est_regressors(rhos, data, formula, "ok", weights, control)) - expect_error(est_regressors(rhos, data, timename, "ok", - timename, control)) - 
expect_error(est_regressors(rhos, "ok", formula, timename, weights,control)) - expect_error(est_regressors(rhos, data, "ok", timename, weights,control)) - rhosx <- rhos - names(rhosx)[1] <- "ok" - expect_error(est_regressors(rhosx, data, formula, timename, weights, - control)) -}) - -test_that("check package_TS", { - summ <- package_TS(data, formula, timename, weights, control, - rho_dist, eta_dist) - expect_is(summ, "TS_fit") - expect_equal(length(summ), 17) - expect_equal(names(summ)[3], "nchangepoints") - expect_error( - package_TS("ok", formula, timename, weights, control, rho_dist, - eta_dist)) - expect_error(package_TS(data, formula, "ok", weights,control, rho_dist, - eta_dist)) - expect_error(package_TS(data, "ok", timename, weights, control, rho_dist, - eta_dist)) - - expect_error(package_TS(data, formula, timename, weights,"ok", rho_dist, - eta_dist)) - expect_error(package_TS(data, formula, timename,weights, control, "ok", - eta_dist)) - expect_error(package_TS(data, formula, timename,weights, control, - rho_dist, "ok")) - expect_error(package_TS("ok", formula, timename,weights, control, - rho_dist, eta_dist)) -}) - -test_that("check TS", { - expect_is(TSmod, "TS_fit") - expect_equal(length(TSmod), 17) - expect_equal(TSmod$nchangepoints, 1) - expect_error(TS(data, formula, nchangepoints = 0, timename, weights, "ok")) - expect_error(TS(data, formula, nchangepoints = 0, "ok", weights, control)) - expect_error(TS(data, "ok", nchangepoints = 0, timename, "ok", control)) - expect_error(TS("ok", formula, nchangepoints = 0, timename, weights, - control)) - expect_error(TS(data, formula, "ok", timename, weights, control)) -}) - -test_that("check check_TS_inputs", { - expect_silent(check_TS_inputs(data, formula, nchangepoints, - timename, weights, control)) - expect_error(check_TS_inputs(data, formula, nchangepoints, - timename, weights, "ok")) - expect_error(check_TS_inputs(data, formula, nchangepoints, - timename, "ok", control)) - 
expect_error(check_TS_inputs(data, formula, "ok", - timename, weights, control)) - expect_error(check_TS_inputs(data, "ok", nchangepoints, - timename, weights, control)) - expect_error(check_TS_inputs("ok", formula, nchangepoints, - timename, weights, control)) - expect_error(check_TS_inputs(data, formula, nchangepoints, - "ok",weights, control)) -}) - -test_that("check print for TS_fit", { - expect_output(print(TSmod)) -}) - -test_that("check progress bar functions", { - expect_is(prep_pbar(), "progress_bar") - expect_silent(prep_pbar(list(quiet = TRUE))) - expect_error(prep_pbar("ok")) - expect_error(prep_pbar(list(), "ok")) - expect_error(prep_pbar(nr = 0.5)) - pp <- prep_pbar() - expect_is(update_pbar(pp), "progress_bar") - expect_silent(update_pbar(pp, list(quiet = TRUE))) - - expect_error(update_pbar(pp, "ok")) - expect_error(update_pbar("ok", list())) -}) - - -test_that("check AIC for TS_fit", { - expect_equal(round(AIC(TSmod), 1), 404.2) -}) - -test_that("check check_formula", { - expect_silent(check_formula(data, formula)) - expect_error(check_formula(data, ~1)) - expect_error(check_formula(data, ~ok)) - expect_error(check_formula(document_covariate_table, formula)) -}) - -test_that("check TS_control", { - expect_is(TS_control(), "list") - expect_equal(length(TS_control()), 16) -}) diff --git a/tests/testthat/test-10-TS_on_LDA.R b/tests/testthat/test-10-TS_on_LDA.R deleted file mode 100644 index 17157cd5..00000000 --- a/tests/testthat/test-10-TS_on_LDA.R +++ /dev/null @@ -1,230 +0,0 @@ -context("Check TS_on_LDA functions") - -data(rodents) -lda_data <- rodents$document_term_table -document_term_table <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table -topics <- 2 -nseeds <- 1 -formulas <- ~ 1 -nchangepoints <- 0 -weights <- document_weights(document_term_table) -timename <- "newmoon" -LDAs <- LDA_set(document_term_table, topics, nseeds) -LDA_models <- select_LDA(LDAs) - mods <- TS_on_LDA(LDA_models, 
document_covariate_table, formulas, - nchangepoints = 0:1, timename, weights, - control = list(nit = 20)) - -test_that("check prep_TS_data", { - mods <- expand_TS(LDA_models, formulas=c(~1, ~sin_year, ~newmoon), - nchangepoints = 0:1) - nmods <- nrow(mods) - TSmods <- vector("list", nmods) - - for (i in 1:nmods){ - data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, i) - expect_is(data_i, "data.frame") - expect_equal(dim(data_i), c(436, 4)) - } - - expect_error(prep_TS_data(document_covariate_table, LDA_models, mods, "ok")) - expect_error(prep_TS_data(document_covariate_table, LDA_models, "ok", i)) - expect_error(prep_TS_data(document_covariate_table, "ok", mods, i)) - expect_error(prep_TS_data("ok", LDA_models, mods, i)) - - data_1 <- prep_TS_data(document_covariate_table, LDA_models[[1]], mods, 1) - data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, 1) - expect_equal(data_1, data_i) -}) - -test_that("check select_TS", { - - sel_mod <- select_TS(mods, list()) - - expect_equal(length(mods), 2) - expect_equal(length(sel_mod), 17) - expect_is(mods, "TS_on_LDA") - expect_is(sel_mod, "TS_fit") - expect_error(select_TS(mods, "ok")) - expect_error(select_TS("ok", list())) - - xxfun <- function(x){x} - expect_warning(select_TS(mods, list(selector = xxfun))) -}) - -test_that("check package_TS_on_LDA", { - mods <- expand_TS(LDA_models, formulas, nchangepoints = 0:1) - nmods <- nrow(mods) - TSmods <- vector("list", nmods) - - for(i in 1:nmods){ - print_model_run_message(mods, i, LDA_models, list()) - formula_i <- mods$formula[[i]] - nchangepoints_i <- mods$nchangepoints[i] - data_i <- prep_TS_data(document_covariate_table, LDA_models, mods, i) - TSmods[[i]] <- TS(data_i, formula_i, nchangepoints_i,timename, weights, - control = list(nit = 20)) - } - expect_is(package_TS_on_LDA(TSmods, LDA_models, mods), "TS_on_LDA") - expect_is(package_TS_on_LDA(TSmods, LDA_models[[1]], mods), "TS_on_LDA") - expect_error(package_TS_on_LDA(TSmods, LDA_models, 
"ok")) - expect_error(package_TS_on_LDA("ok", LDA_models, mods)) - expect_error(package_TS_on_LDA(TSmods, "ok", mods)) -}) - - -test_that("check TS_on_LDA", { - expect_is(mods, "TS_on_LDA") - expect_equal(length(mods), 2) - expect_is(mods[[1]], "TS_fit") - expect_is(mods[[2]], "TS_fit") - - expect_error(TS_on_LDA()) - expect_error(TS_on_LDA("ok", document_covariate_table, formulas, - nchangepoints, timename, weights, list())) - expect_error(TS_on_LDA(LDA_models, "ok", formulas, - nchangepoints, timename, weights, list())) - expect_error(TS_on_LDA(LDA_models, document_covariate_table, "ok", - nchangepoints, timename, weights, list())) - expect_error(TS_on_LDA(LDA_models, document_covariate_table, formulas, - "ok", timename, weights, list())) - expect_error(TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, "ok", list())) - expect_error(TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints, "ok", weights, list())) - expect_error(TS_on_LDA(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, weights, "ok")) -}) - - -test_that("check printing for TS_on_LDA", { - expect_output(print(mods)) -}) - -test_that("check print_model_run_message", { - mods <- expand_TS(LDA_models, formulas, nchangepoints) - expect_message(print_model_run_message(mods, 1, LDA_models, list())) - expect_silent(print_model_run_message(mods, 1, LDA_models, - control = list(quiet = TRUE))) -}) - -test_that("check expand_TS", { - exp_TS <- expand_TS(LDA_models, formulas, nchangepoints) - expect_is(exp_TS, "data.frame") - expect_equal(dim(exp_TS), c(1, 3)) - exp_TS <- expand_TS(LDA_models, c(~1, ~newmoon), 3:10) - expect_is(exp_TS, "data.frame") - expect_equal(dim(exp_TS), c(16, 3)) - exp_TS <- expand_TS(LDA_models[[1]], c(~1, ~newmoon), 3:10) - expect_is(exp_TS, "data.frame") - expect_equal(dim(exp_TS), c(16, 3)) - expect_error(expand_TS("ok", formulas, nchangepoints)) - expect_error(expand_TS(LDA_models, "ok", nchangepoints)) 
- expect_error(expand_TS(LDA_models, c("~1", "ok"), nchangepoints)) - expect_error(expand_TS(LDA_models, list("~1", "ok"), nchangepoints)) - expect_error(expand_TS(LDA_models, formulas, 2.5)) -}) - -test_that("check check_nchangepoints", { - expect_silent(check_nchangepoints(1)) - expect_silent(check_nchangepoints(0)) - expect_silent(check_nchangepoints(1:10)) - expect_error(check_nchangepoints(2.5)) - expect_error(check_nchangepoints(-1)) - expect_error(check_nchangepoints(NULL)) -}) - - -test_that("check check_weights", { - expect_equal(check_weights(TRUE), NULL) - expect_error(check_weights(FALSE)) - expect_silent(check_weights(weights)) - expect_silent(check_weights(1)) - expect_silent(check_weights(NULL)) - expect_error(check_weights("ok")) - expect_error(check_weights(-1)) - expect_warning(check_weights(100)) -}) - - -test_that("check check_LDA_models", { - expect_silent(check_LDA_models(LDA_models)) - expect_silent(check_LDA_models(LDA_models[[1]])) - expect_error(check_LDA_models("ok")) -}) - - -test_that("check check_document_covariate_table", { - expect_silent(check_document_covariate_table(document_covariate_table)) - expect_silent(check_document_covariate_table(document_covariate_table, - LDA_models = LDA_models)) - expect_silent(check_document_covariate_table(document_covariate_table, - LDA_models = LDAs)) - expect_silent(check_document_covariate_table(document_covariate_table, - document_term_table = document_term_table)) - expect_error(check_document_covariate_table(document_covariate_table, - LDA_models = 1)) - expect_error(check_document_covariate_table(document_covariate_table, - document_term_table = 1)) - expect_error(check_document_covariate_table(document_covariate_table = 1, - LDA_models = LDA_models)) - expect_error(check_document_covariate_table(document_covariate_table = 1, - document_term_table = document_term_table)) - expect_error(check_document_covariate_table(lm(1~1), LDA_models)) -}) - -test_that("check check_timename", { - 
expect_silent(check_timename(document_covariate_table, timename)) - expect_error(check_timename("ok", timename)) - expect_error(check_timename(document_covariate_table, "ok")) - expect_error(check_timename(document_covariate_table, 1)) - expect_error(check_timename(document_covariate_table, - rep(timename, 2))) - expect_error(check_timename(document_covariate_table, 1)) - expect_error(check_timename(data.frame(letters), "letters")) - - dct2 <- document_covariate_table - dct2[,timename] <- dct2[,timename] + 0.1 - expect_error(check_timename(dct2, timename)) -}) - -test_that("check check_formulas", { - expect_silent(check_formulas(formulas, document_covariate_table, list())) - expect_error(check_formulas("ok", document_covariate_table, list())) - expect_error(check_formulas(~newmoon, "ok", list())) - expect_error(check_formulas(c(~1, "ok"), - document_covariate_table, list())) - expect_error(check_formulas(list(~1, "ok"), - document_covariate_table, list())) - expect_error(check_formulas(formulas, document_covariate_table, "ok")) -}) - - -test_that("check check_TS_on_LDA_inputs", { - expect_silent( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, weights, list())) - expect_error( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, weights, "ok")) - expect_error( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - nchangepoints, timename, "ok", list())) - expect_error( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - "ok", weights, timename, weights, list())) - expect_error( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, "ok", - nchangepoints, timename, weights, list())) - expect_error( - check_TS_on_LDA_inputs(LDA_models, "ok", formulas, - nchangepoints, timename, weights, list())) - expect_error( - check_TS_on_LDA_inputs("ok", document_covariate_table, formulas, - nchangepoints, timename, weights, 
list())) - expect_error( - check_TS_on_LDA_inputs(LDA_models, document_covariate_table, formulas, - nchangepoints, "ok", weights, list())) - -}) diff --git a/tests/testthat/test-11-TS_plots.R b/tests/testthat/test-11-TS_plots.R deleted file mode 100644 index fbe92f54..00000000 --- a/tests/testthat/test-11-TS_plots.R +++ /dev/null @@ -1,193 +0,0 @@ -context("Check TS plot functions") -tenv <- "cran" - -data(rodents) -lda_data <- rodents$document_term_table -document_term_table <- rodents$document_term_table -document_covariate_table <- rodents$document_covariate_table -topics <- 2 -nseeds <- 1 -formulas <- ~ 1 -nchangepoints <- 1 -weights <- document_weights(document_term_table) -LDAs <- LDA_set(document_term_table, topics, nseeds) -LDA_models <- select_LDA(LDAs) -control <- list(nit = 20, seed = 1) -timename <- "newmoon" -mods <- expand_TS(LDA_models, formulas, nchangepoints) -formula <- mods$formula[[1]] -nchangepoints <- mods$nchangepoints[1] -data <- prep_TS_data(document_covariate_table, LDA_models, mods, 1) -TSmod <- TS(data, formula, nchangepoints, timename, weights, control) - - -test_that("check rho_hist color generator", { - rc <- set_TS_summary_plot_cols()$rho - rho_cols <- set_rho_hist_colors(TSmod$rhos, rc$cols, rc$option, rc$alpha) - expect_equal(rho_cols, "#44015466") -}) - -test_that("check pred_gamma color generator", { - gc <- set_TS_summary_plot_cols()$gamma - gamma_cols <- set_gamma_colors(TSmod, gc$cols, gc$option, gc$alpha) - expect_equal(gamma_cols, c("#0D0887CC", "#FCCE25CC")) -}) - -test_that("check pred_gamma plot", { - gc <- set_TS_summary_plot_cols()$gamma - gamma_cols <- set_gamma_colors(TSmod, gc$cols, gc$option, gc$alpha) - - if (tenv == "cran"){ - expect_silent(pred_gamma_TS_plot(TSmod, cols = gamma_cols)) - - expect_silent(pred_gamma_TS_plot(TSmod, selection = "mode", - cols = gamma_cols)) - } else{ - TS_gamma_plot <- pred_gamma_TS_plot(TSmod, cols = gamma_cols) - TS_gamma_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS 
gamma plot", TS_gamma_plot) - } - expect_equal(set_gamma_colors(NULL), NULL) - expect_error(pred_gamma_TS_plot(TSmod, selection = "ok", - cols = gamma_cols)) -}) - -test_that("check rho_lines", { - if (tenv == "cran"){ - expect_silent(plot(1, 1, xlim = c(-10, 10), ylim = c(0, 1))) - expect_silent(rho_lines(1)) - } else{ - plot(1, 1, xlim = c(-10, 10), ylim = c(0, 1)) - rho_lines(1) - TS_rho_line_plot <- recordPlot() - vdiffr::expect_doppelganger("rho line plot", TS_rho_line_plot) - } - expect_equal(rho_lines(NULL), NULL) -}) - - -test_that("check rho_hist plot", { - rc <- set_TS_summary_plot_cols()$rho - rho_cols <- set_rho_hist_colors(TSmod$rhos, rc$cols, rc$option, rc$alpha) - if (tenv == "cran"){ - expect_silent(rho_hist(TSmod, rho_cols)) - } else{ - TS_rho_plot <- rho_hist(TSmod, rho_cols) - TS_rho_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS rho plot", TS_rho_plot) - } - expect_equal(set_rho_hist_colors(NULL), NULL) -}) - - -test_that("check color list creation function", { - expect_equal(length(set_TS_summary_plot_cols()), 2) - expect_equal(names(set_TS_summary_plot_cols()), c("rho", "gamma")) - expect_equal(length(set_TS_summary_plot_cols()[[1]]), 3) - expect_equal(length(set_TS_summary_plot_cols()[[2]]), 3) - expect_equal(names(set_TS_summary_plot_cols()[[2]]), - c("cols", "option", "alpha")) - expect_equal(names(set_TS_summary_plot_cols()[[1]]), - c("cols", "option", "alpha")) -}) - - - - -test_that("check trace_plot", { - if (tenv == "cran"){ - expect_silent(trace_plot(TSmod$rhos[ , 1])) - } else{ - TS_trace_plot <- trace_plot(TSmod$rhos[ , 1]) - TS_trace_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS trace plot", TS_trace_plot) - } -}) - -test_that("check ecdf_plot", { - if (tenv == "cran"){ - expect_silent(ecdf_plot(TSmod$rhos[ , 1])) - } else{ - TS_ecdf_plot <- ecdf_plot(TSmod$rhos[ , 1]) - TS_ecdf_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS ecdf plot", TS_ecdf_plot) - } -}) - -test_that("check 
autocorr_plot", { - if (tenv == "cran"){ - expect_silent(autocorr_plot(TSmod$rhos[ , 1])) - } else{ - TS_autocorr_plot <- autocorr_plot(TSmod$rhos[ , 1]) - TS_autocorr_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS autocorr plot", TS_autocorr_plot) - } -}) - -test_that("check posterior_plot", { - if (tenv == "cran"){ - expect_silent(posterior_plot(TSmod$rhos[ , 1])) - } else{ - TS_posterior_plot <- posterior_plot(TSmod$rhos[ , 1]) - TS_posterior_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS posterior plot", TS_posterior_plot) - } -}) - - -test_that("check plotting of TS_fit", { - if (tenv == "cran"){ - expect_silent(plot(TSmod)) - expect_silent(plot(TSmod, plot_type = "diagnostic", interactive = FALSE)) - } else{ - plot(TSmod) - TS_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS plot", TS_plot) - } -}) - -test_that("check TS_diagnostics_plot", { - TSmod0 <- TS(data, formula, nchangepoints = 0, timename, weights, control) - if (tenv == "cran"){ - expect_silent(TS_diagnostics_plot(TSmod0, interactive = FALSE)) - } else{ - TS_diagnostics_plot(TSmod0, interactive = FALSE) - TS_diag_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS diagnostic plot", TS_diag_plot) - } -}) - - -test_that("check TS_summary_plot", { - if (tenv == "cran"){ - expect_silent(TS_summary_plot(TSmod, cols = set_TS_summary_plot_cols(), - bin_width = 1, xname = NULL, selection = "median")) - } else{ - TS_summary_plot(TSmod, cols = set_TS_summary_plot_cols(), - bin_width = 1, xname = NULL, selection = "median") - TS_summ_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS summary plot", TS_summ_plot) - } -}) - - -test_that("check rho_diagnostics_plots", { - if (tenv == "cran"){ - expect_silent(rho_diagnostics_plots(TSmod, interactive = FALSE)) - } else{ - rho_diagnostics_plots(TSmod, interactive = FALSE) - TS_rdiag_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS rho diagnostic plot", TS_rdiag_plot) - } -}) - -test_that("check 
eta_diagnostics_plots", { - expect_equal(eta_diagnostics_plots(NULL), NULL) - if (tenv == "cran"){ - expect_silent(eta_diagnostics_plots(TSmod, interactive = FALSE)) - } else{ - eta_diagnostics_plots(TSmod, interactive = FALSE) - TS_ediag_plot <- recordPlot() - vdiffr::expect_doppelganger("Base TS eta diagnostic plot", TS_ediag_plot) - } -}) diff --git a/tests/testthat/test-12-utilities.R b/tests/testthat/test-12-utilities.R deleted file mode 100644 index 629579b0..00000000 --- a/tests/testthat/test-12-utilities.R +++ /dev/null @@ -1,165 +0,0 @@ -context("Check utilities") - -# use old RNG method for sample (for test reproducibility) -if ("sample.kind" %in% names(formals(RNGkind))){ - suppressWarnings(RNGkind(sample.kind = "Rounding")) -} - -test_that("check logsumexp", { - expect_is(logsumexp(c(1,2)), "numeric") - expect_equal(length(logsumexp(c(1,2))), 1) - expect_equal(round(logsumexp(c(1,2)), 2), 2.31) - expect_error(logsumexp("ok")) -}) - -test_that("check softmax", { - expect_is(softmax(c(1,2)), "numeric") - expect_equal(length(softmax(c(1,2))), 2) - expect_equal(round(softmax(c(1,2)), 2)[1], 0.27) - expect_is(softmax(matrix(1, nrow = 2, ncol = 2)), "matrix") - expect_equal(dim(softmax(matrix(1, nrow = 2, ncol = 2))), c(2,2)) - expect_equal((softmax(matrix(1, nrow = 2, ncol = 2)))[1,1], 0.5) - expect_error(softmax("ok")) - expect_error(softmax(array(1, dim = c(2, 2, 2)))) -}) - -test_that("check iftrue", { - expect_equal(iftrue(TRUE,1), 1) - expect_equal(iftrue(1,2), 1) -}) - -test_that("check AICc", { - data(rodents) - set.seed(123) - lda_data <- rodents$document_term_table - r_LDA <- LDA_set(lda_data, topics = 2, nseeds = 1)[[1]] - expect_is(AICc(r_LDA), "numeric") - expect_equal(round(AICc(r_LDA)), 95865) - expect_error(AICc("ok")) -}) - - -test_that("check modalvalue", { - xx <- c(1, 2, 3, 4, 5, 4, 3, 2, 1, 2) - expect_equal(modalvalue(xx), 2) - expect_error(modalvalue("ok")) -}) - -test_that("check document_weights", { - data(rodents) - lda_data <- 
rodents$document_term_table - doc_weights <- document_weights(lda_data) - expect_equal(round(mean(doc_weights), 3), 1) - expect_equal(round(max(doc_weights), 3), 3.543) - expect_equal(round(min(doc_weights), 3), 0.151) - expect_equal(length(doc_weights), 436) - expect_error(document_weights("ok")) -}) - -test_that("check messageq", { - expect_message(messageq()) - expect_message(messageq("ok")) - expect_error(messageq("ok", "")) - expect_message(messageq("ok", quiet = FALSE)) - expect_silent(messageq("ok", quiet = TRUE)) -}) - -test_that("check mirror_vcov", { - - dummy <- "x" - class(dummy) <- "dummy" - - y <- 1:10 - x <- 101:110 + rnorm(length(y)) - mod <- lm(y ~ x) - vcv <- mirror_vcov(mod) - expect_equal(isSymmetric(vcv), TRUE) - expect_error(mirror_vcov("ok")) - expect_warning(mirror_vcov(dummy)) - data(rodents) - lda_data <- rodents$document_term_table - document_term_table <- rodents$document_term_table - document_covariate_table <- rodents$document_covariate_table - topics <- 2 - nseeds <- 1 - formulas <- ~ newmoon - nchangepoints <- 2 - weights <- document_weights(document_term_table) - control <- list() - timename <- "newmoon" - LDAs <- LDA_set(document_term_table, topics, nseeds, list()) - LDA_models <- select_LDA(LDAs, list()) - control <- list(nit = 50, seed = 1) - mods <- expand_TS(LDA_models, formulas, nchangepoints) - formula <- mods$formula[[1]] - nchangepoints <- mods$nchangepoints[1] - data <- prep_TS_data(document_covariate_table, LDA_models, mods, 1) - rho_dist <- est_changepoints(data, formula, nchangepoints, timename, - weights, control) - mod <- multinom_TS(data, formula, changepoints = NULL, timename, weights, - control) - - # doesnt work in 32 bit - #expect_equal(isSymmetric(vcov(mod[[1]][[1]])), FALSE) - - expect_equal(isSymmetric(mirror_vcov(mod[[1]][[1]])), TRUE) - -}) - -test_that("check normalize", { - xx <- c(1, 2, 3, 4, 5, 4, 3, 2, 1, 2) - expect_equal(mean(normalize(xx)), 0.425) - expect_equal(normalize(xx)[1], 0) - xx <- -1000:100 
- expect_equal(mean(normalize(xx)), 0.5) - expect_equal(round(sd(normalize(xx)), 3), 0.289) - expect_error(normalize("ok")) -}) - -test_that("check memoise_fun", { - expect_is(memoise_fun(sum, TRUE), "memoised") - expect_is(memoise_fun(sum, FALSE), "function") - expect_error(memoise_fun(1, TRUE)) - expect_error(memoise_fun(sum, 1)) -}) - -test_that("check check_control", { - expect_silent(check_control(list(), "list")) - expect_silent(check_control(list())) - expect_error(check_control(list(), 1)) - expect_error(check_control(1, "list")) -}) - - -test_that("check check_document_term_table", { - dtt <- "a" - expect_error(check_document_term_table(dtt)) - dtt <- matrix(1:100, 10, 10) - expect_silent(check_document_term_table(dtt)) - dtt[1,1] <- 1.1 - expect_error(check_document_term_table(dtt)) - dtt <- data.frame("dummy" = 1:100) - expect_silent(check_document_term_table(dtt)) - dtt[1,1] <- 1.1 - expect_error(check_document_term_table(dtt)) -}) - -test_that("check error catching of check_topics", { - expect_error(check_topics("a")) - expect_error(check_topics(1.5)) - expect_error(check_topics(1)) - expect_error(check_topics(2), NA) - expect_error(check_topics(c(2, 3, 4)), NA) - expect_silent(check_topics(5)) - expect_silent(check_topics(2:5)) -}) - -test_that("check error catching of check_seeds", { - expect_error(check_seeds("a")) - expect_error(check_seeds(1.5)) - expect_error(check_seeds(2), NA) - expect_error(check_seeds(c(2, 3, 4)), NA) - expect_silent(check_seeds(5)) - expect_silent(check_seeds(1:5)) -}) - diff --git a/vignettes/LDATS_codebase.Rmd b/vignettes/LDATS_codebase.Rmd index 0c736efa..99f48b8c 100644 --- a/vignettes/LDATS_codebase.Rmd +++ b/vignettes/LDATS_codebase.Rmd @@ -23,13 +23,12 @@ today <- Sys.Date() ## Overview -This vignette outlines the code base for the **LDATS** package. It was -constructed using **LDATS** version `r vers` on `r today`. +This vignette outlines the code base for the **LDATS** package. 
+It was constructed using **LDATS** version `r vers` on `r today`. ## Installation -To obtain the most recent version of **LDATS**, install the most recent -version from GitHub: +To obtain the most recent version of **LDATS**, install the most recent version from GitHub: ```{r, eval=FALSE} install.packages("devtools") @@ -38,100 +37,113 @@ devtools::install_github("weecology/LDATS") ## Analytical Pipeline -The full LDATS analysis can be executed using the `LDA_TS` function, -which is the top-level of the coding pipeline's hierarchy of -functions: +The full LDATS analysis can be executed using the `LDA_TS` function, which is the top-level of the coding pipeline's hierarchy of functions. +Each component model's function (`LDA()` and `TS()`) can be run independently, as well as many of the deeper component functions. +However, most users should only need to interact with the code at the top levels of the codebase. * `LDA_TS()` - * `check_LDA_TS_inputs()` - * `check_timename()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_weights()` - * `check_control()` - * `check_document_term_table()` - * `check_document_covariate_table()` - * `check_topics()` - * `check_seeds()` - * `LDA_set()` - * `check_LDA_TS_inputs()` - * `check_control()` - * `check_document_term_table()` - * `check_topics()` - * `check_seeds()` - * loop over models - * `prep_LDA_control()` - * `LDA()` - * `select_LDA()` - * applies the `measurer` and `selector` functions supplied via `LDA_controls_list()` - * `TS_on_LDA()` - * `check_LDA_TS_inputs()` - * `check_LDA_models()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_timename()` - * `check_control()` - * `check_weights()` - * `check_document_covariate_table()` - * `expand_TS()` - * loop over models - * `prep_TS_data()` - * `TS()` - * `check_TS_inputs()` - * `check_formulas()` - * `check_nchangepoints()` - * `check_control()` - * `check_weights()` - * `est_changepoints()` - * `prep_saves()` - * `prep_ptMCMC_inputs()` - * 
`prep_cpts()` - * loop over chains - * `multinom_TS()` - * `prep_ids()` - * `prep_pbar()` - * loop over iterations - * `update_pbar()` - * `step_chains()` - * `propose_step()` - * `proposed_step_mods()` - * loop over chains - * `multinom_TS()` - * `eval_step()` - * `take_step()` - * `swap_chains()` - * loop over chain neighbors - * `update_saves()` - * `update_cpts()` - * `update_ids()` - * `est_regressors()` - * loop over unique realizations of change point locations - * `multinom_TS()` - * loop over chunks - * `mirror_vcov()` - * `rmvnorm()` - * `summarize_TS()` - * `diagnose_ptMCMC()` - * `count_trips()` - * `summarize_rhos()` - * `measure_rho_vcov()` - * `summarize_etas()` - * `measure_eta_vcov()` - * `select_TS()` - * applies the `measurer` and `selector` functions supplied via `TS_controls_list()` - * `package_LDA_TS()` - -Each component model's function (`LDA_set()` and `TS()`) can be run -independently, as well. + * `LDA_TS_control()` + * `LDA_control()` + * `TS_control()` + * `LDA()` + * `prepare_LDA()` + * `check_LDA()` + * `LDA_control()` + * `conform_data()` + * `run_LDA()` + * `LDA_call()` (replicated for each model) + * `LDA_msg()` + * `soft_call()` using `LDA$control$model` with `LDA$control$model_args` + * `topicmodels::LDA()` (*default*) + * `package_LDA()` + * `select_LDA()` + * `soft_call()` using `LDA$control$selector` with LDA$control$selector_args` + * `min()` (*default*) + * `measure_LDA()` (replicated for each model) + * `soft_call()` using `LDA$control$measurer` with `LDA$control$measurer_args` + * `AIC()` (*default*) + * `TS()` + * `prepare_TS()` + * `check_TS()` + * `TS_control()` + * `run_TS()` + * `TS_call()` (replicated for each model) + * `TS_msg()` + * `soft_call()` using `TS$control$model with TS$control$model_args + * `sequential_TS()` (*default*) + * `sequential_TS_control()` + * `est_changepoints()` + * `soft_call()` using `TS$control$method` with `TS$control$method_args` + * `ldats_classic()` (*default*) + * 
`ldats_classic_control()` + * `prep_saves()` + * `prep_ptMCMC_inputs()` + * `prep_temp_sequence()` + * `prep_proposal_dist()` + * `prep_cpts()` + * `prep_temp_sequence()` + * `soft_call()` using `TS$control$response` with `TS$control$response_args` + * `multinom_TS()` (*default*) + * `multinom_TS_control()` + * `prep_chunks()` + * `multinom_TS_chunk()` + * `nnet::multinom()` + * `package_chunk_fits()` + * `prep_ids()` + * `prep_pbar()` + * `for` each iteration + * `update_pbar()` + * `step_chains()` + * `propose_step()` + * `proposed_step_mods()` + * `soft_call()` using `TS$control$response` with `TS$control$response_args` + * `multinom_TS()` (*default*) + * `multinom_TS_control()` + * `prep_chunks()` + * `multinom_TS_chunk()` + * `nnet::multinom()` + * `package_chunk_fits()` + * `eval_step()` + * `take_step()` + * `swap_chains()` + * `update_saves()` + * `update_cpts()` + * `update_ids()` + * `process_saves()` + * `count_trips()` + * `est_regressors()` + * `soft_call()` using `TS$control$response` with `TS$control$response_args` + * `multinom_TS()` (*default*) + * `multinom_TS_control()` + * `prep_chunks()` + * `multinom_TS_chunk()` + * `nnet::multinom()` + * `package_chunk_fits()` + * `package_sequential_TS()` + * `summarize_rhos()` + * `measure_rho_vcov()` + * `summarize_etas()` + * `measure_eta_vcov()` + * `package_TS()` + * `select_TS()` + * `measure_TS()` + * `soft_call()` using `TS$control$measurer` with `TS$control$measurer_args` + * `soft_call()` using `TS$control$selector` with `TS$control$selector_args` + * `package_LDA_TS()` + +*default* indicates that the specified function and its subfunctions are defined by the default arguments, but can be swapped-out with alternatives using the control lists (see below). +Presently, the only function which has LDATS-based alternatives is the `response`, which can be also set to `simplex_TS` to use the simplex-based response models within an otherwise idential LDATS setup. 
+In addition, the `measurer` and `selector` functions have defaults (`AIC()` and `min()`, respectively), which can be swapped (e.g., to `logLik()` and `max()`) based on algorithmic choices; method-functions like `AIC()` and `logLik()` have specifics defined as needed for LDATS model objects. + + ## Controls Lists -To minimize the length of argument lists and facilitate simple default usage -throughout the pipeline, we implement an options/controls list approach, where -each of the main functions (`LDA_TS`, `LDA_set`, and `TS`) and its subfunctions -have a `control` argument that takes a `list` to replace the defaults returned -by its `_control` function: +To minimize the length of argument lists and facilitate simple default usage throughout the pipeline, we implement an options/controls list approach, where each of the main functions (`LDA_TS`, `LDA`, and `TS`) and their subfunctions have a `control` argument that takes a `list` to replace the defaults returned by its `_control` function: * `LDA_TS_control()` -* `LDA_set_control()` -* `TS_control()` \ No newline at end of file +* `LDA_control()` +* `TS_control()` + * `sequential_TS_control()` + * `multinom_TS_control()`, `simplex_TS_control()` + * `ldats_classic_control()` diff --git a/vignettes/paper-comparison.Rmd b/vignettes/paper-comparison.Rmd deleted file mode 100644 index b5b34072..00000000 --- a/vignettes/paper-comparison.Rmd +++ /dev/null @@ -1,649 +0,0 @@ ---- -title: "Comparison to Christensen et al. 
2018" -author: "Renata Diaz and Hao Ye" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{paper-comparison} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} -params: - run_models: FALSE ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` -```{r, include=FALSE} -vers <- packageVersion("LDATS") -``` - -# Introduction - -This document provides a side-by-side comparison of **`LDATS`** (version `r vers`) results with analysis from [Christensen et al. 2018](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.2373). - -## Summary - -| Step | Changes from Christensen et al 2018 to `LDATS` | Effect on comparison | Recommendations for future users | -|:---------------|:------------------------|:-----------------------|:----------------------------| -|Data|Paper adjusts abundances according to unit effort, while `LDATS` uses raw capture numbers.|None: run comparison using adjusted data|Use raw, unweighted abundances.| -|LDA model selection|Paper conservatively overestimated the number of parameters for calculating AIC for model selection. `LDATS` calculates AIC appropriately.|Paper LDA selects 4 topics, while `LDATS` finds 6. Compare all combinations of paper and `LDATS` LDA models and changepoint models| Use `LDATS` AIC calculation.| -|Changepoint model document weights|Paper weighted all sampling periods equally regardless of abundance. `LDATS` by default weights the information from each sampling period according to the number of individuals captured (i.e. 
the amount of information gleaned about the underlying community composition).|None; use `weights = NULL` to set all weights equal to 1 for LDATS| Weight sampling periods according to abundance| -|Overall LDA + changepoint results| All combinations of LDA + changepoint model find 4 changepoints at approximately the same time steps|Choice of LDA model has more of an effect than choice of changepoint model|`LDATS` reflects best practices, but the paper methods will produce qualitatively similar results.| - -# Setup - -## LDATS Installation - -To obtain the most recent version of **`LDATS`**, install the most recent version from GitHub: - -```{r, eval = FALSE} -# install.packages("devtools") -devtools::install_github("weecology/LDATS") -``` - -Load in the **`LDATS`** package. -```{r} -library(LDATS) -set.seed(42) -nseeds <- 200 -nit <- 10000 -``` - -## Running the Models - -Because both the Latent Dirichlet Allocation (LDA) and time series components of the analysis can take a long time to run (especially with the settings above for the number of seeds and iterations), we will use pre-generated model outputs and turn off certain code chunks that run the models using a global `rmarkdown` parameter, `run_models = FALSE`. - -To change this functionality, you can re-render this file with: -```{r, eval = FALSE} -rmarkdown::render("paper-comparison.Rmd", params = list(run_models = TRUE)) -``` - -## Download Analysis Scripts and Data Files - -We're going to download analysis scripts, data files, and model objects, so we use a temporary location for storage: - -```{r set download location} -vignette_files <- tempdir() -``` - -To replicate the Christensen et al. 
2018 analysis, we download some of the original scripts & data files from [Extreme-events-LDA repo](https://github.com/emchristensen/Extreme-events-LDA), as well as some raw data files from the [PortalData repo](https://github.com/weecology/PortalData): - -Main Analysis Scripts: - -* `rodent_LDA_analysis.R` - - main script for analyzing rodent community change using LDA - -* `rodent_data_for_LDA.R` - - contains a function that creates the rodent data table used in analyses - -* `AIC_model_selection.R` - - contains functions for calculating AIC for different candidate LDA models - -* `changepointmodel.r` - - contains change-point model code - -* `LDA-distance.R` - - function for computing Hellinger distance analyses - -Data: - -* `Rodent_table_dat.csv` - - table of rodent data, created by rodent_data_for_LDA.R - -* `moon_dates.csv` - - table of census dates (downloaded from the PortalData repository) - -* `Portal_rodent_trapping.csv` - - table of trapping effort (downloaded from the PortalData repository) - -Figure scripts: - -* `LDA_figure_scripts.R` -- contains functions for making main plots in manuscript (Fig 1). 
Called from rodent_LDA_analysis.R - -```{r download scripts} -test_file <- file.path(vignette_files, "rodent_LDA_analysis.r") - -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/emchristensen/Extreme-events-LDA/master/" - files_to_download <- c("rodent_LDA_analysis.r", "rodent_data_for_LDA.r", - "AIC_model_selection.R", "changepointmodel.r", - "LDA-distance.R", "Rodent_table_dat.csv", - "LDA_figure_scripts.R") - - for (file in files_to_download) - { - download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) - } - - # from the PortalData repo - github_path <- "https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/" - files_to_download <- c("moon_dates.csv", "Portal_rodent_trapping.csv") - - for (file in files_to_download) - { - download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file)) - } -} -``` - -## Download Model Outputs - -We also have pre-generated model outputs that we download from the [LDATS-replications repo](https://github.com/weecology/LDATS-replications): - -LDA models: - -* `ldats_ldamodel.RDS` - - the best LDA model as selected by LDATS - -* `paper_ldamodel.RDS` - - the best LDA model as selected by the Christensen et al. 
analysis - -Changepoint outputs - -* `ldats_ldats.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the LDATS changepoint selection - -* `ldats_paper.RDS` - - the posterior distribution of changepoints, using the LDATS LDA model and the paper's changepoint selection - -* `paper_ldats.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the LDATS changepoint selection - -* `paper_paper.RDS` - - the posterior distribution of changepoints, using the paper LDA model and the paper's changepoint selection - -Figures - -* `lda_distances.png` - - figure showing the variance in the topics identified by the paper's LDA model code - -```{r download pre-generated model outputs, eval = !params$run_model} -test_file <- file.path(vignette_files, "ldats_ldamodel.RDS") - -if (!file.exists(test_file)) -{ - # from the Extreme-events-LDA repo - github_path <- "https://raw.githubusercontent.com/weecology/LDATS-replications/master/model_outputs/" - files_to_download <- c("ldats_ldamodel.RDS", "paper_ldamodel.RDS", - "ldats_ldats.RDS", "ldats_paper.RDS", - "paper_ldats.RDS", "paper_paper.RDS", - "lda_distances.png") - - for (file in files_to_download) - { - download.file(url = paste0(github_path, file), - destfile = file.path(vignette_files, file), - mode = "wb") - } -} -``` - -# Data Comparison - -The dataset of Portal rodents on control plots is included in the LDATS package: - -```{r LDATS data} -data(rodents) - -head(rodents[[1]]) -``` - -We can compare this against the data used in Christensen et al: - -```{r Paper data} -# parameters for subsetting the full Portal rodents data -periods <- 1:436 -control_plots <- c(2, 4, 8, 11, 12, 14, 17, 22) -species_list <- c("BA", "DM", "DO", "DS", "NA", "OL", "OT", "PB", "PE", "PF", - "PH", "PI", "PL", "PM", "PP", "RF", "RM", "RO", "SF", "SH", "SO") - -source(file.path(vignette_files, "rodent_data_for_LDA.r")) - -# assemble `paper_dat`, the data from Christensen et al. 
2018 -paper_dat <- create_rodent_table(period_first = min(periods), - period_last = max(periods), - selected_plots = control_plots, - selected_species = species_list) - -# assemble `paper_covariates`, the associated dates and covariate data -moondat <- read.csv(file.path(vignette_files, "moon_dates.csv"), stringsAsFactors = F) - -paper_dates <- moondat %>% - dplyr::filter(period %>% dplyr::between(min(periods), max(periods))) %>% - dplyr::pull(censusdate) %>% - as.Date() - -paper_covariates <- data.frame( - index = seq_along(paper_dates), - date = paper_dates, - year_continuous = lubridate::decimal_date(paper_dates)) %>% - dplyr::mutate( - sin_year = sin(year_continuous * 2 * pi), - cos_year = cos(year_continuous * 2 * pi) - ) -``` - -## Compare the data from Christensen et al. with the included data in `LDATS` - -```{r rodent data comparison} -compare <- rodents[[1]] == paper_dat - -length(which(rowSums(compare) < ncol(compare))) -``` - -There are 16 rows where the data included in LDATS differs from the paper data. This is because the LDATS data is not adjusted to account for trapping effort, while the paper data does, by dividing all census counts by the actual number of plots trapped and multiplying by 8 to account for incompletely-trapped censuses. 
- -To confirm this, refer to lines 36-46 in `rodent_data_for_LDA.r`: - -```{} - # retrieve data on number of plots trapped per month - trap_table = read.csv('https://raw.githubusercontent.com/weecology/PortalData/master/Rodents/Portal_rodent_trapping.csv') - trap_table_controls = filter(trap_table, plot %in% selected_plots) - nplots_controls = aggregate(trap_table_controls$sampled,by=list(period = trap_table_controls$period),FUN=sum) - - # adjust species counts by number of plots trapped that month - r_table_adjusted = as.data.frame.matrix(r_table) - for (n in 1:436) { - #divide by number of control plots actually trapped (should be 8) and multiply by 8 to estimate captures as if all plots were trapped - r_table_adjusted[n,] = round(r_table_adjusted[n,]/nplots_controls$x[n]*8) - } -``` - -We can run the same procedure on the LDATS data to verify that we obtain a data.frame that matches. - -```{r adjust LDATS data after Christensen et al, eval = TRUE} -# get the trapping effort for each sample -trap_table <- read.csv(file.path(vignette_files, "Portal_rodent_trapping.csv")) -trap_table_controls <- dplyr::filter(trap_table, plot %in% control_plots) -nplots_controls <- aggregate(trap_table_controls$sampled, - by = list(period = trap_table_controls$period), - FUN = sum) - -# adjust species counts by number of plots trapped that month -# divide by number of control plots actually trapped (should be 8) and -# multiply by 8 to estimate captures as if all plots were trapped -ldats_rodents_adjusted <- as.data.frame.matrix(rodents[[1]]) -ldats_rodents_adjusted[periods, ] <- round(ldats_rodents_adjusted[periods, ] / nplots_controls$x[periods] * 8) -``` - -Now we can compare the adjusted LDATS dataset with both the original ldats dataset and the dataset from the paper: - -```{r dataset comparisons} -compare_raw <- rodents[[1]] == ldats_rodents_adjusted -length(which(rowSums(compare_raw) < ncol(compare_raw))) - -compare_adjusted <- ldats_rodents_adjusted == paper_dat 
-length(which(rowSums(compare_adjusted) < ncol(compare_adjusted))) -``` - -Because the LDA procedure weights the information from documents (census periods) according to the number of words (rodents captured), we now believe it is most appropriate to run the LDA on _unadjusted_ trapping data, and we recommend that users of LDATS do so. However, to maintain consistency with Christensen et al 2018, we will proceed using the _adjusted_ rodent table in this vignette. - -```{r switch to adjusted rodents} -rodents[[1]] <- paper_dat -``` - -The LDATS rodent data comes with a `document_covariate_table`, which we will use later as the predictor variables for the changepoint models. In this table, time is expressed as new moon numbers. Later we will want to be able to interpret the results in terms of census dates. We will add a column to the `document_covariate_table` to convert new moon numbers to census dates. We will not reference this column in any of the formulas we pass to the changepoint models, so it will be ignored until we need it. - -```{r add dates to covariate table} -head(rodents$document_covariate_table) - -new_cov_table <- dplyr::left_join(rodents$document_covariate_table, - dplyr::select(moondat, newmoonnumber, censusdate), - by = c("newmoon" = "newmoonnumber")) %>% - dplyr::rename(date = censusdate) - -rodents$document_covariate_table <- new_cov_table -``` - -# Identify community groups using LDA - -While LDATS can run start-to-finish with `LDATS::LDA_TS`, here we will work through the process function-by-function to isolate differences with the paper. For a breakdown of the `LDA_TS` pipeline, see the [`codebase` vignette](https://weecology.github.io/LDATS/articles/LDATS_codebase.html). 
- -First, we run the LDA models from LDATS to identify the number of topics: - -```{r LDATS LDAs, eval = params$run_models} -ldats_ldas <- LDATS::LDA_set(document_term_table = rodents$document_term_table, - topics = 2:6, nseeds = nseeds) -ldats_ldamodel <- LDATS::select_LDA(LDA_models = ldats_ldas)[[1]] - -saveRDS(ldats_ldamodel, file = file.path(vignette_files, "ldats_ldamodel.RDS")) -``` - -Second, we run the LDA models from Christensen et al. to do the same task: - -```{r paper LDAs, eval = params$run_models} -source(file.path(vignette_files, "AIC_model_selection.R")) -source(file.path(vignette_files, "LDA-distance.R")) - -# Some of the functions require the data to be stored in the `dat` variable -dat <- paper_dat - -# Fit a bunch of LDA models with different seeds -# Only use even numbers for seeds because consecutive seeds give identical results -seeds <- 2 * seq(nseeds) - -# repeat LDA model fit and AIC calculation with a bunch of different seeds to test robustness of the analysis -best_ntopic <- repeat_VEM(paper_dat, - seeds, - topic_min = 2, - topic_max = 6) -hist(best_ntopic$k, breaks = seq(from = 0.5, to = 9.5), - xlab = "best # of topics", main = "") - -# 2b. how different is species composition of 4 community-types when LDA is run with different seeds? 
-# ================================================================== -# get the best 100 seeds where 4 topics was the best LDA model -seeds_4topics <- best_ntopic %>% - filter(k == 4) %>% - arrange(aic) %>% - head(min(100, nseeds)) %>% - pull(SEED) - -# choose seed with highest log likelihood for all following analyses -# (also produces plot of community composition for "best" run compared to "worst") - -png(file.path(vignette_files, "lda_distances.png"), width = 800, height = 400) -dat <- paper_dat # calculate_LDA_distance has some required named variables -best_seed <- calculate_LDA_distance(paper_dat, seeds_4topics) -dev.off() -mean_dist <- unlist(best_seed)[2] -max_dist <- unlist(best_seed)[3] - -# ================================================================== -# 3. run LDA model -# ================================================================== -ntopics <- 4 -SEED <- unlist(best_seed)[1] # For the paper, use seed 206 -ldamodel <- LDA(paper_dat, ntopics, control = list(seed = SEED), method = "VEM") - -saveRDS(ldamodel, file = file.path(vignette_files, "paper_ldamodel.RDS")) -``` - -```{r} -knitr::include_graphics(file.path(vignette_files, "lda_distances.png")) -``` - -## Plots - -To visualize the LDA assignment of species to topics, we load in the saved LDA models from previously: -```{r} -ldamodel <- readRDS(file.path(vignette_files, "paper_ldamodel.RDS")) -ldats_ldamodel <- readRDS(file.path(vignette_files, "ldats_ldamodel.RDS")) -``` - -How the paper LDA model assigns species to topics: -```{r plot paper LDA, fig.width = 7, fig.height = 6} -plot(ldamodel, cols = NULL, option = "D") -``` - -How the LDATS LDA model assigns species to topics: -```{r plot LDATS LDA, fig.width = 7, fig.height = 6} -plot(ldats_ldamodel, cols = NULL, option = "D") -``` - -The paper method finds 4 topics and LDATS finds 6. This is because of an update to the model selection procedure. 
The paper conservatively overestimates the number of parameters (by counting all of the variational parameters) and therefore overpenalizes the AIC for models with more topics. Comparatively, the LDATS method now uses the number of parameters remaining after the variational approximation, as returned by the LDA object. For this vignette, we will compare the results from using both LDA models. - -# Changepoint models - -We will compare four combinations of LDA + changepoint models: - -* LDATS LDA + LDATS changepoint -* LDATS LDA + paper changepoint -* Paper LDA + LDATS changepoint -* Paper LDA + paper changepoint - -Having divided the data to generate catch-per-effort, the paper changepoint model weighted all sample periods equally. In comparison, LDATS does not force an equal weighting, but assumes that as default, and can weight sample periods according to how many individuals were captured (controlled by the `weights` argument to `LDA_TS`, and easily calculated for a document-term matrix using `document_term_weights`. We now believe it is more appropriate to weight periods proportional to captures in the time series (despite the LDA function returning only proportions of each topic), and this is what we recommend for LDATS users. For the purposes of comparison, however, we will continue set all weights = 1 for both changepoint models. For an example of LDATS run with proportional weights, see the [rodents vignette](https://weecology.github.io/LDATS/articles/rodents-example.html). - -## Running paper changepoint models - -We define a few helper functions for running the changepoints model of Christensen et al. 
and processing the output to obtain the dates: - -```{r paper changepoint models} -#### Run changepoint #### -source(file.path(vignette_files, "changepointmodel.r")) - -find_changepoints <- function(lda_model, paper_covariates, n_changepoints = 1:6) -{ - # set up parameters for model - x <- dplyr::select(paper_covariates, - year_continuous, - sin_year, - cos_year) - - # run models with 1, 2, 3, 4, 5, 6 changepoints - cpt_results <- data.frame(n_changepoints = n_changepoints) - cpt_results$cpt_model <- lapply(cpt_results$n_changepoints, - function(n_changepoints) - { - changepoint_model(lda_model, x, n_changepoints, maxit = nit, - weights = rep(1, NROW(x))) - }) - return(cpt_results) -} - -# Among a selection of models with different # of changepoints, -# - compute AIC -# - select the model with the best AIC -# - get the posterior distributions for the changepoints -select_cpt_model <- function(cpt_results, ntopics) -{ - # compute log likelihood as the mean deviance - cpt_results$mean_deviances <- vapply(cpt_results$cpt_model, - function(cpt_model) {mean(cpt_model$saved_lls)}, - 0) - - # compute AIC = ( -2 * log likelihood) + 2 * (#parameters) - cpt_results$AIC <- cpt_results$mean_deviances * -2 + - 2 * (3 * (ntopics - 1) * (cpt_results$n_changepoints + 1) + - (cpt_results$n_changepoints)) - - # select the best model - cpt <- cpt_results$cpt_model[[which.min(cpt_results$AIC)]] - return(cpt) -} - -# transform the output from `compute_cpt` and match up the time indices with -# dates from the original data -get_dates <- function(cpt, covariates = paper_covariates) -{ - cpt$saved[,1,] %>% - t() %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(covariates, by = c("value" = "index")) -} -``` - -### LDATS LDA and paper changepoint - -Run the Christensen et al. 
time series model to identify changepoints on the LDA topics selected by LDATS: -```{r run LDATS LDA and paper cpt, eval = params$run_models} -ldats_paper_results <- find_changepoints(ldats_ldamodel, paper_covariates) - -saveRDS(ldats_paper_results, file = file.path(vignette_files, "ldats_paper.RDS")) -``` - -Extract the dates of the changepoints: -```{r compute changepoints for LDATS LDA and paper cpt} -ldats_paper_results <- readRDS(file.path(vignette_files, "ldats_paper.RDS")) - -ldats_paper_cpt <- select_cpt_model(ldats_paper_results, - ntopics = ldats_ldamodel@k) -ldats_paper_cpt_dates <- get_dates(ldats_paper_cpt) -``` - -### Paper LDA and paper changepoint - -Run the Christensen et al. time series model to identify changepoints on the LDA topics selected by Christensen et al.: -```{r run paper LDA and paper cpt, eval = params$run_models} -paper_paper_results <- find_changepoints(ldamodel, paper_covariates) - -saveRDS(paper_paper_results, file = file.path(vignette_files, "paper_paper.RDS")) -``` - -Extract the dates of the changepoints: -```{r compute changepoints for paper LDA and paper cpt} -paper_paper_results <- readRDS(file.path(vignette_files, "paper_paper.RDS")) - -paper_paper_cpt <- select_cpt_model(paper_paper_results, - ntopics = ldamodel@k) -paper_paper_cpt_dates <- get_dates(ldats_paper_cpt) -``` - -## Running LDATS changepoint models - -### LDATS LDA and LDATS changepoint - -Run the LDATS time series model to identify changepoints on the LDA topics selected by LDATS: - -```{r run LDATS LDA and LDATS cpt, eval = params$run_models} -ldats_ldats_results <- TS_on_LDA(LDA_models = ldats_ldamodel, - document_covariate_table = rodents$document_covariate_table, - formulas = ~ sin_year + cos_year, - nchangepoints = 1:6, - timename = "newmoon", - weights = NULL, - control = list(nit = nit)) - -saveRDS(ldats_ldats_results, file = file.path(vignette_files, "ldats_ldats.RDS")) -``` - -Unlike the paper changepoint model, LDATS can recognize that sampling 
periods may not be equidistant, and can place changepoint estimates at new moons if they fall between nonconsecutive sampling periods. We can estimate the dates corresponding to those new moons, extrapolating from the census dates for adjacent census periods. - -```{r construct lookup table for LDATS output for changepoint times} -# make the full sequence of possible newmoon values -full_index <- seq(min(rodents$document_covariate_table$newmoon), - max(rodents$document_covariate_table$newmoon)) - -# generate a lookup table with dates for the newmoons, using `approx` to -# linearly interpolate the missing values -ldats_dates <- approx(rodents$document_covariate_table$newmoon, - as.Date(rodents$document_covariate_table$date), - full_index) %>% - as.data.frame() %>% - mutate(index = x, - date = as.Date(y, origin = "1970-01-01")) %>% - select(index, date) -``` - -Select the best time series model and extract the dates of the changepoints: - -```{r compute changepoints for LDATS LDA and LDATS cpt} -ldats_ldats_results <- readRDS(file.path(vignette_files, "ldats_ldats.RDS")) - -ldats_ldats_cpt <- select_TS(ldats_ldats_results) - -ldats_ldats_cpt_dates <- ldats_ldats_cpt$rhos %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(ldats_dates, by = c("value" = "index")) -``` - -### Paper LDA and LDATS changepoint - -Run the LDATS time series model to identify changepoints on the LDA topics selected by Christensen et al.: - -```{r run paper LDA and LDATS cpt, eval = params$run_models} -paper_ldats_results <- TS_on_LDA(LDA_models = ldamodel, - document_covariate_table = rodents$document_covariate_table, - formulas = ~ sin_year + cos_year, - nchangepoints = 1:6, - - timename = "newmoon", - weights = NULL, - control = list(nit = nit)) - - -saveRDS(paper_ldats_results, file = file.path(vignette_files, "paper_ldats.RDS")) -``` - -Select the best time series model and extract the dates of the changepoints: - -```{r select paper lda + ldats cpt} -paper_ldats_results <- 
readRDS(file.path(vignette_files, "paper_ldats.RDS")) - -paper_ldats_cpt <- select_TS(paper_ldats_results) - -paper_ldats_cpt_dates <- paper_ldats_cpt$rhos %>% - as.data.frame() %>% - reshape::melt() %>% - dplyr::left_join(ldats_dates, by = c("value" = "index")) -``` - -## How many changepoints were identified? - -```{r} -nlevels(ldats_paper_cpt_dates$variable) -nlevels(paper_paper_cpt_dates$variable) -nlevels(ldats_ldats_cpt_dates$variable) -nlevels(paper_ldats_cpt_dates$variable) -``` - -All of the models find four changepoints. - -## Plot changepoint models - -### Paper LDA and LDATS changepoint - -```{r plot paper LDA and LDATS cpts, fig.width = 7, fig.height = 6} -plot(paper_ldats_cpt) -``` - -### LDATS LDA and LDATS changepoint -```{r plot ldats LDA and LDATS cpt, fig.width = 7, fig.height = 6} -plot(ldats_ldats_cpt) -``` - -### Paper LDA and paper changepoint -```{r plot paper LDA and paper cpt, fig.width = 7, fig.height = 6} -paper_cpts <- find_changepoint_location(paper_paper_cpt) -ntopics <- ldamodel@k - -paper_cpt_plot <- get_ll_non_memoized_plot(ldamodel, paper_covariates, paper_cpts, make_plot = TRUE, - weights = rep(1, NROW(paper_covariates))) - -annual_hist(paper_paper_cpt, paper_covariates$year_continuous) -paper_cpt_plot -``` - -### LDATS LDA and paper changepoint -```{r plot LDATS lda and paper cpt, fig.width = 7, fig.height = 6} -ldats_cpts <- find_changepoint_location(ldats_paper_cpt) -ntopics <- ldats_ldamodel@k - -ldats_cpt_plot <- get_ll_non_memoized_plot(ldats_ldamodel, paper_covariates, ldats_cpts, make_plot = TRUE, - weights = rep(1, NROW(paper_covariates))) - -annual_hist(ldats_paper_cpt, paper_covariates$year_continuous) -ldats_cpt_plot -``` - -The results of the changepoint model appear robust to both choice of LDA model and choice of changepoint model. 
- -## Report changepoint dates -```{r report cpt dates, include = F} -cpt_dates <- dplyr::bind_rows("paperLDA_paperCPT" = paper_paper_cpt_dates, - "ldatsLDA_paperCPT" = ldats_paper_cpt_dates, - "ldatsLDA_ldatsCPT" = ldats_ldats_cpt_dates, - "paperLDA_ldatsCPT" = paper_ldats_cpt_dates, - .id = "analysis") %>% - dplyr::group_by(analysis, variable) %>% - dplyr::summarize(date = mean(date)) %>% - dplyr::ungroup() %>% - dplyr::rename(changepoint = variable) %>% - tidyr::spread(analysis, date) -``` - -```{r print cpt dates} -knitr::kable(cpt_dates) -``` - -The choice of LDA has more influence on the changepoint locations than the choice of changepoint model - probably because the LDATS LDA has 6 topics, and the paper LDA has 4. However, all of the models agree to within 6 months in most cases, and a year for the broader early 1990s changepoint. diff --git a/vignettes/rodents-example-files/changepoint_models.Rds b/vignettes/rodents-example-files/changepoint_models.Rds deleted file mode 100644 index 7d832a99..00000000 Binary files a/vignettes/rodents-example-files/changepoint_models.Rds and /dev/null differ diff --git a/vignettes/rodents-example-files/lda_model_set.Rds b/vignettes/rodents-example-files/lda_model_set.Rds deleted file mode 100644 index 8ed7fb87..00000000 Binary files a/vignettes/rodents-example-files/lda_model_set.Rds and /dev/null differ diff --git a/vignettes/rodents-example-files/lda_ts_results.Rds b/vignettes/rodents-example-files/lda_ts_results.Rds deleted file mode 100644 index c80d5a42..00000000 Binary files a/vignettes/rodents-example-files/lda_ts_results.Rds and /dev/null differ diff --git a/vignettes/rodents-example.Rmd b/vignettes/rodents-example.Rmd index ef390a6d..fa046ca9 100644 --- a/vignettes/rodents-example.Rmd +++ b/vignettes/rodents-example.Rmd @@ -21,26 +21,35 @@ vers <- packageVersion("LDATS") today <- Sys.Date() ``` -This vignette walks through an example of **`LDATS`** at the command line and -was constructed using **`LDATS`** version 
`r vers` on `r today`. +```{r download files, include = FALSE} + vignette_files <- tempdir() + dir.create(file.path(vignette_files, "output"), showWarnings = FALSE) + github_path <- "https://github.com/weecology/LDATS-replications/raw/master/output/" + files_to_download <- c("rodents_example_lda_model_set.RDS", "rodents_example_ts_model_set.RDS", + "rodents_example_lda_ts_model_set.RDS") + + for (file in files_to_download) { + download.file(url = paste0(github_path, file), + destfile = file.path(vignette_files, "output", file), + mode = "wb") + } +``` + +This vignette walks through an example of **`LDATS`** at the command line and was constructed using **`LDATS`** version `r vers` on `r today`. ## Installation -To obtain the most recent version of **LDATS**, install and load the most recent -version from GitHub: +To obtain the most recent version of **LDATS**, install and load the most recent version from GitHub: -```{r, eval=FALSE} +```{r, eval = FALSE} install.packages("devtools") devtools::install_github("weecology/LDATS") -library(LDATS) ``` ## Data -For this vignette, we will be using rodent data from the control plots of the -[Portal Project](https://github.com/weecology/portaldata), which come with -the **LDATS** package (`data(rodents)`). +For this vignette, we will be using rodent data from the control plots of the [Portal Project](https://github.com/weecology/portaldata), which come with the **LDATS** package (`data(rodents)`). `rodents` contains two data tables, a `document_term_table` and a `document_covariate_table`. @@ -57,36 +66,36 @@ head(rodents$document_covariate_table, 10) ## Stage 1: LDA models -We use `LDA_set()` to run replicate LDA models (each with its own seed) with varying numbers of topics (`2:5`) and `select_LDA()` to select the best model. +We use `LDA()` to run replicate LDA models (each with its own seed) with varying numbers of topics (`2:5`), which includes a run of `select_LDA()` to select the best model(s). 
We use the `control` argument to pass controls to the LDA function via a `list`. In this case, we can set `quiet = TRUE` to make the model run quietly. -```{r lda_set, eval =F} -lda_model_set <- LDA_set(document_term_table = rodents$document_term_table, - topics = c(2:5), - nseeds = 10, - control = list(quiet = TRUE)) - +```{r lda_set, eval = FALSE} +lda_model_set <- LDA(data = rodents, topics = 2:5, replicates = 10, + control = list(quiet = TRUE)) ``` +```{r save lda model set, include = FALSE, eval = FALSE} +saveRDS(lda_model_set, file.path(vignette_files, "output", "rodents_example_lda_model_set.RDS")) +``` If we do not pass any controls, by default, `quiet = FALSE` (here run with only `2:3` topics and `2` seeds, to keep output short): -```{r lda set not quiet, eval =F} -lda_model_set2 <- LDA_set(document_term_table = rodents$document_term_table, - topics = c(2:3), - nseeds = 2) +```{r lda set not quiet, eval = FALSE} +lda_model_set2 <- LDA(data = rodents, topics = c(2:3), replicates = 2) ``` -`LDA_set()` returns a list of LDA models. We use `select_LDA()` to identify the best number of topics and choice of seed from our set of models. By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. +By default, we will choose models based on minimum AIC. To use different selection criteria, define the appropriate functions and specify them by passing `list(measurer = [measurer function], selector = [max, min, etc])` to the `control` argument. 
+ + -```{r load lda model set, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_model_set.Rds')) +```{r load lda model set, include = FALSE} +lda_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_lda_model_set.RDS")) rm(lda_model_set2) ``` ```{r select LDA} -selected_lda_model <- select_LDA(lda_model_set) +selected_lda_model <- select_LDA(lda_model_set$LDAs)[[1]] ``` @@ -95,53 +104,58 @@ We can access the results of the model: ```{r LDA results} # Number of topics: -selected_lda_model[[1]]@k +selected_lda_model$topics # Topic composition of communities at each time step # Columns are topics; rows are time steps -head(selected_lda_model[[1]]@gamma) - +head(selected_lda_model$document_topic_table) ``` -`LDATS` includes flexible plot functionality for LDAs and time series. The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. For all the available plot options see `?plot.LDA_VEM`. - -```{r plot lda, fig.width=7, fig.height=6} -plot(selected_lda_model[[1]]) +`LDATS` includes flexible plot functionality for LDAs and time series. +The top panel illustrates topic composition by species, and the bottom panel shows the proportion of the community made up of each topic over time. +For all the available plot options see `?plot.LDA`. +```{r plot lda, fig.width = 7, fig.height = 6} +plot(selected_lda_model) ``` ## Stage 2: TS changepoint models -We use `TS_on_LDA()` to run LDATS changepoint models with `0:6` changepoints, and then use `select_TS()` to find the best-fit model of these. - -Here, `TS_on_LDA()` predicts the `gamma` (the proportion of the community made of up each topic) from our LDA model(s) as a function of `sin_year` and `cos_year` in the `document_covariate_table`. We use `document_weights()` to weight the information from each time step according to the total number of rodents captured at that time step. 
+We use `TS()` to run LDATS changepoint models with `0:1` changepoints, which includes a call to `select_TS()` to find the best-fit model(s) of these. +Here, `TS()` predicts the `gamma` (the proportion of the community made up of each topic) from our LDA model(s) as a function of `sin_year` and `cos_year` in the `document_covariate_table`. +We use `document_weights()` to weight the information from each time step according to the total number of rodents captured at that time step. -```{r ts on lda, eval = F} -changepoint_models <- TS_on_LDA(LDA_models = selected_lda_model, - document_covariate_table = rodents$document_covariate_table, - formulas = ~ sin_year + cos_year, - nchangepoints = c(0:1), - timename = "newmoon", - weights = document_weights(rodents$document_term_table), - control = list(nit = 1000)) +```{r ts set, eval = FALSE} +ts_model_set <- TS(LDAs = lda_model_set, + formulas = ~ sin_year + cos_year, + nchangepoints = 0:1, + timename = "newmoon", + weights = TRUE, + control = list(method_args = + list(control = ldats_classic_control(nit = 1000)))) +```
+```{r save ts model set, include = FALSE, eval = FALSE} +saveRDS(ts_model_set, file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS")) ``` +We can adjust options (default settings can be seen using `TS_control()`) for both TS functions by passing a list to the `control` argument. +For a full list see `?TS_control`. Here we illustrate adjusting the number of ptMCMC iterations - the default is 10000, but it is convenient to use fewer iterations for code development. 
 - -Also, it is important to note that by default the TS functions take the name of the time-step column from the `document_covariate_table` to be `"time"`. To pass a different column name, use the `timename` argument in `TS_on_LDA()`. +Also, it is important to note that by default the TS functions take the name of the time-step column from the `document_covariate_table` to be `"time"`. +To pass a different column name, use the `timename` argument in `TS()`. -`select_TS()` will identify the best-fit changepoint model of the models from `TS_on_LDA()`. As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list. +`select_TS()` will identify the best-fit changepoint model of the models from `TS()`. +As with `select_LDA()`, we can adjust the `measurer` and `selector` functions using the `control` argument list. -```{r reload ts, include = F} -load(here::here('vignettes', 'rodents-example-files', 'changepoint_models.Rds')) +```{r load ts model set, include = FALSE} +ts_model_set <- readRDS(file.path(vignette_files, "output", "rodents_example_ts_model_set.RDS")) ``` ```{r select ts} -selected_changepoint_model <- select_TS(changepoint_models) +selected_changepoint_model <- select_TS(ts_model_set$TSs)[[1]] ``` We can access the results of the selected changepoint model: @@ -155,7 +169,7 @@ selected_changepoint_model$rho_summary # Raw estimates for timesteps for each changepoint # Changepoints are columns -head(selected_changepoint_model$rhos) +head(selected_changepoint_model$focal_rhos) ``` @@ -167,20 +181,25 @@ plot(selected_changepoint_model) ## Full analysis with `LDA_TS` -Finally, we can perform an entire LDATS analysis, including all of the above steps, using the `LDA_TS()` function and passing options to the LDA and TS functions as a `list` to the `control` argument. The default is for `LDA_TS` to weight the time series model based on the document sizes, so we do not need to tell it to do so. 
+Finally, we can perform an entire LDATS analysis, including all of the above steps, using the `LDA_TS()` function and passing options to the LDA and TS functions as a `list` to the `control` argument. +The default is for `LDA_TS` to weight the time series model based on the document sizes, so we do not need to tell it to do so. -```{r lda_ts, eval = F} +```{r lda_ts, eval = FALSE} lda_ts_results <- LDA_TS(data = rodents, - nseeds = 10, + replicates = 10, topics = 2:5, formulas = ~ sin_year + cos_year, nchangepoints= 0:1, timename = "newmoon", - control = list(nit = 1000)) + control = list(TS_method_args = + list(control = ldats_classic_control(nit = 1000)))) ``` -```{r load ldats results, include = F} -load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) +```{r save lda ts model set, include = FALSE, eval = FALSE} +saveRDS(lda_ts_results, file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) +``` +```{r load lda ts model set, include = FALSE} +lda_ts_results <- readRDS(file.path(vignette_files, "output", "rodents_example_lda_ts_model_set.RDS")) ``` `LDA_TS()` returns a list of all the model objects, and we can access their contents as above: @@ -189,13 +208,13 @@ load(here::here('vignettes', 'rodents-example-files', 'lda_ts_results.Rds')) names(lda_ts_results) # Number of topics -lda_ts_results$`Selected LDA model`$k@k +lda_ts_results$"LDA models"$selected_LDAs[[1]]$topics # Number of changepoints -lda_ts_results$`Selected TS model`$nchangepoints +lda_ts_results$"TS models"$selected_TSs[[1]]$nchangepoints # Summary of changepoint locations -lda_ts_results$`Selected TS model`$rho_summary +lda_ts_results$"TS models"$selected_TSs[[1]]$rho_summary ``` Finally, we can plot the `LDA_TS` results.