diff --git a/.gitignore b/.gitignore index 1a5d4154..c9fcae53 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ revdep/ # Claude Code local settings .claude/ .DS_Store +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 4da8a5cf..781abe3d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,6 +62,7 @@ Suggests: readxl, stringr, tidyr (>= 1.3.0), + factoextra, rgl Imports: MASS, @@ -78,7 +79,9 @@ Imports: methods, knitr, htmlwidgets, - webshot2 + webshot2, + forcats, + rlang Description: Provides additional data sets, methods and documentation to complement the 'vcd' package for Visualizing Categorical Data and the 'gnm' package for Generalized Nonlinear Models. In particular, 'vcdExtra' extends mosaic, assoc and sieve plots from 'vcd' to handle 'glm()' and 'gnm()' models and diff --git a/NAMESPACE b/NAMESPACE index 09ab11dd..c264acd0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,11 +52,13 @@ export(Summarise) export(as_array) export(as_caseform) export(as_freqform) +export(as_matrix) export(as_table) export(assoc_graph) export(blogits) export(center3d) export(collapse.table) +export(collapse_levels) export(color_table) export(conditional) export(cutfac) @@ -96,6 +98,10 @@ importFrom(ca,multilines) importFrom(dplyr,all_of) importFrom(dplyr,as_tibble) importFrom(dplyr,everything) +importFrom(dplyr,is.tbl) +importFrom(dplyr,rename) +importFrom(dplyr,summarise) +importFrom(forcats,fct_collapse) importFrom(gnm,meanResiduals) importFrom(grDevices,col2rgb) importFrom(grDevices,hsv) @@ -118,6 +124,8 @@ importFrom(htmlwidgets,saveWidget) importFrom(knitr,include_graphics) importFrom(knitr,is_html_output) importFrom(methods,is) +importFrom(rlang,.data) +importFrom(rlang,set_names) importFrom(stats,as.formula) importFrom(stats,chisq.test) importFrom(stats,deviance) diff --git a/R/as_array.R b/R/as_array.R index 09481c1d..11990735 100644 --- a/R/as_array.R +++ b/R/as_array.R @@ -4,16 +4,29 @@ #' column containing the frequencies (`freq`) must be supplied if `obj` is 
in #' frequency form. #' -#' @param obj object to be converted to an array -#' @param freq If `obj` is in frequency form, this is the name of the frequency column. Leave as `NULL` if `obj` is in any other form. -#' @param dims A character vector of dimensions. If not specified, all variables apart from `freq` will be used as dimensions +#' @param obj +#' Object to be converted to an array. +#' @param freq +#' If `obj` is in frequency form, this is the name of the frequency column. +#' Leave as `NULL` if `obj` is in any other form. +#' @param dims +#' A character vector of dimensions. If not specified, all variables apart from +#' `freq` will be used as dimensions. +#' @param prop +#' If set to `TRUE`, returns an array of proportions (that sum to 1). May also +#' be set to a character or numeric vector of dimensions to be used as margins +#' from which proportions will be computed. #' @return object in array form #' #' @details -#' Unclasses the \code{as_table()} function to return an object in array form. +#' Unclasses the \code{\link{as_table}} function to return an object in array form. #' #' @author Gavin M. 
Klorfine #' +#' @seealso +#' \code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_caseform}}, +#' \code{\link{as_matrix}} +#' #' @examples #' library(vcdExtra) #' @@ -38,9 +51,27 @@ #' # For specific dimensions #' as_array(tidy_freqForm, freq = "n", dims = c("Hair", "Eye")) |> str() #' +#' #-----For proportions-----# +#' +#' as_array(freqForm, freq = "Freq", prop = TRUE) |> # proportions relative to grand total +#' head(c(4,4,1)) +#' +#' # Marginalize proportions along "Sex" (i.e., male proportions sum to 1, female proportions sum to 1) +#' as_array(freqForm, freq = "Freq", prop = "Sex") |> head(c(4,4,1)) +#' +#' as_array(freqForm, freq = "Freq", prop = 3) |> head(c(4,4,1)) # Same as above +#' +#' # Marginalize proportions along multiple variables +#' as_array(freqForm, freq = "Freq", prop = c("Hair", "Sex")) |> head(c(4,4,1)) +#' +#' as_array(freqForm, freq = "Freq", prop = c(1, 3)) |> head(c(4,4,1)) # Same as above +#' +#' # Using dims and prop arguments in tandem +#' as_array(freqForm, freq = "Freq", dims = c("Hair", "Eye"), prop = TRUE) +#' #' #' @export -as_array <- function(obj, freq = NULL, dims = NULL){ - return(unclass(as_table(obj, freq, dims))) # Unclass as_table output +as_array <- function(obj, freq = NULL, dims = NULL, prop = NULL){ + return(unclass(as_table(obj, freq, dims, prop))) # Unclass as_table output } \ No newline at end of file diff --git a/R/as_caseform.R b/R/as_caseform.R index a9344a28..1de399ee 100644 --- a/R/as_caseform.R +++ b/R/as_caseform.R @@ -4,22 +4,32 @@ #' column containing the frequencies (`freq`) must be supplied if `obj` is in #' frequency form. Returns a tibble if `tidy` is set to `TRUE`. #' -#' @param obj object to be converted to case form -#' @param freq If `obj` is in frequency form, this is the name of the frequency column. If `obj` is in any other form, do not supply an argument (see "Details") -#' @param dims A character vector of dimensions. 
If not specified, all variables apart from `freq` will be used as dimensions -#' @param tidy returns a tibble if set to TRUE +#' @param obj +#' Object to be converted to case form. +#' @param freq +#' If `obj` is in frequency form, this is the name of the frequency column. If +#' `obj` is in any other form, do not supply an argument (see "Details"). +#' @param dims +#' A character vector of dimensions. If not specified, all variables apart from +#' `freq` will be used as dimensions. +#' @param tidy +#' Returns a tibble if set to `TRUE`. #' @return object in case form. #' #' @details -#' A wrapper for \code{expand.dft()} that is able to handle arrays. +#' A wrapper for \code{\link{expand.dft}} that is able to handle arrays. #' #' If a frequency column is not supplied, this function defaults to "Freq" -#' just like \code{expand.dft()}. Converts `obj` to a table using -#' \code{as_table()} before converting to case form. +#' just like \code{\link{expand.dft}}. Converts `obj` to a table using +#' \code{\link{as_table}} before converting to case form. #' #' @author Gavin M. 
Klorfine #' -#' @importFrom dplyr as_tibble +#' @seealso +#' \code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_array}}, +#' \code{\link{as_matrix}}, \code{\link{expand.dft}} +#' +#' @importFrom dplyr as_tibble is.tbl #' #' @examples #' library(vcdExtra) @@ -48,11 +58,21 @@ as_caseform <- function(obj, freq = "Freq", dims = NULL, tidy = TRUE){ - tab <- expand.dft(as_table(obj, freq = freq, dims = dims), freq = freq) + if ((dplyr::is.tbl(obj) || is.data.frame(obj))) + freqs <- obj[[freq]] + else + freqs <- as.numeric(obj) + + if (any(freqs %% 1 != 0)) + stop("Frequency column contains decimal values.") + if (any(freqs < 0)) + stop("Frequency column contains negative values.") + + tab <- as_table(obj, freq = freq, dims = dims) + cf <- expand.dft(tab, freq = freq) - if (tidy){ - tab <- dplyr::as_tibble(tab) - } + if (tidy) + cf <- dplyr::as_tibble(cf) - return(tab) + return(cf) } \ No newline at end of file diff --git a/R/as_freqform.R b/R/as_freqform.R index a9449612..d9da5680 100644 --- a/R/as_freqform.R +++ b/R/as_freqform.R @@ -1,24 +1,39 @@ #' Convert any form (case or table form) into frequency form. #' -#' A wrapper for \code{as.data.frame()} that is able to properly handle arrays. +#' A wrapper for \code{\link[base]{as.data.frame}} that is able to properly handle arrays. #' Converts object (`obj`) in case or table form into frequency form. The #' column containing the frequencies (`freq`) must be supplied if `obj` is #' already in frequency form (and you are using this function to select #' dimensions). Returns a tibble if `tidy` is set to `TRUE`. #' -#' @param obj object to be converted to frequency form -#' @param freq If `obj` is already in frequency form, this is the name of the frequency column. If `obj` is in any other form, do not supply an argument (see "Details") -#' @param dims A character vector of dimensions. 
If not specified, all variables apart from `freq` will be used as dimensions -#' @param tidy returns a tibble if set to TRUE -#' @return object in frequency form. +#' @param obj +#' Object to be converted to frequency form. +#' @param freq +#' If `obj` is already in frequency form, this is the name of the frequency +#' column. If `obj` is in any other form, do not supply an argument (see "Details"). +#' @param dims +#' A character vector of dimensions. If not specified, all variables apart from +#' `freq` will be used as dimensions. +#' @param prop +#' If set to `TRUE`, the resulting "frequency" column will contain proportions +#' (that sum to 1). May also be set to a character or numeric vector of +#' dimensions to be used as margins from which proportions will be computed. +#' The resulting "frequency" column is renamed to "Prop." +#' @param tidy +#' Returns a tibble if set to `TRUE`. +#' @return Object in frequency form. #' #' @details -#' Converts `obj` to a table using \code{as_table()} before converting to +#' Converts `obj` to a table using \code{\link{as_table}} before converting to #' frequency form #' #' @author Gavin M. 
Klorfine #' -#' @importFrom dplyr as_tibble +#' @seealso +#' \code{\link{as_table}}, \code{\link{as_caseform}}, \code{\link{as_array}}, +#' \code{\link{as_matrix}} +#' +#' @importFrom dplyr as_tibble rename all_of #' #' @examples #' library(vcdExtra) @@ -44,16 +59,38 @@ #' #' as_freqform(tableForm, dims = c("Hair", "Eye")) |> str() #' +#' #-----For proportions-----# +#' +#' as_freqform(tableForm, prop = TRUE) |> head() # print only Sex == Male rows +#' +#' # Marginalize proportions along "Sex" (i.e., male proportions sum to 1, female proportions sum to 1) +#' as_freqform(tableForm, prop = "Sex") |> head() +#' +#' as_freqform(tableForm, prop = 3) |> head() # Same as above +#' +#' # Marginalize proportions along multiple variables +#' as_freqform(tableForm, prop = c("Hair", "Sex")) |> head() +#' +#' as_freqform(tableForm, prop = c(1, 3)) |> head() # Same as above +#' +#' # Using dims and prop arguments in tandem +#' as_freqform(tableForm, dims = c("Hair", "Eye"), prop = TRUE) #' #' @export -as_freqform <- function(obj, freq = NULL, dims = NULL, tidy = TRUE){ +as_freqform <- function(obj, freq = NULL, dims = NULL, prop = NULL, tidy = TRUE){ - tab <- as.data.frame(as_table(obj, freq = freq, dims = dims)) + tab <- as.data.frame(as_table(obj, freq = freq, dims = dims, prop = prop)) - if (tidy){ + if (tidy) tab <- dplyr::as_tibble(tab) - } + + # as.data.frame() on a table always names the count column "Freq", + # whatever the frequency column was called in `obj` (e.g. freq = "n") + freq_col <- "Freq" + + if (!is.null(prop)) + tab <- tab |> dplyr::rename("Prop" = dplyr::all_of(freq_col)) return(tab) } \ No newline at end of file diff --git a/R/as_matrix.R b/R/as_matrix.R new file mode 100644 index 00000000..7944ee46 --- /dev/null +++ b/R/as_matrix.R @@ -0,0 +1,78 @@ +#' Convert frequency, case, or table form data into a matrix. +#' +#' Converts object (`obj`) in frequency, case or table form into a matrix of +#' specified dimensions (`dims`). The column containing the frequencies (`freq`) +#' must be supplied if `obj` is in frequency form. 
+#' +#' @param obj +#' Object to be converted into a matrix. +#' @param freq +#' If `obj` is in frequency form, this is the name of the frequency column. +#' Leave as `NULL` if `obj` is in any other form. +#' @param dims +#' A character vector of dimensions. If not specified, all variables apart from +#' `freq` will be used as dimensions. +#' @param prop +#' If set to `TRUE`, returns a matrix of proportions (that sum to 1). May also +#' be set to a character or numeric vector of dimensions to be used as margins +#' from which proportions will be computed. +#' @return Object in matrix form. +#' +#' @details +#' First converts `obj` into an array using \code{\link{as_array}}. Then a +#' check is made to ensure the user inputted a 2D `obj`. If `obj` is not 2D, an +#' error is returned. If `obj` is 2D, \code{\link[base]{as.matrix}} is applied. +#' +#' @author Gavin M. Klorfine +#' +#' @seealso +#' \code{\link{as_array}}, \code{\link{as_table}}, \code{\link{as_freqform}}, +#' \code{\link{as_caseform}} +#' +#' @examples +#' library(vcdExtra) +#' +#' data("HairEyeColor") +#' +#' freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +#' tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy frequency form data +#' caseForm <- expand.dft(freqForm) # Generate case form data +#' arrayDat <- as_array(HairEyeColor) # Generate an array +#' +#' # Table form -> matrix +#' as_matrix(HairEyeColor, dims = c("Hair", "Sex")) |> str() +#' +#' # Frequency form -> matrix +#' as_matrix(freqForm, freq = "Freq", dims = c("Hair", "Sex")) |> str() +#' +#' # Case form -> matrix form +#' as_matrix(caseForm, dims = c("Hair", "Sex")) |> str() +#' +#' # Frequency (tibble) form -> matrix form +#' as_matrix(tidy_freqForm, freq = "n", dims = c("Hair", "Sex")) |> str() +#' +#' #-----For proportions-----# +#' +#' # Proportions relative to grand total +#' as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = TRUE) +#' +#' # Marginalize proportions along "Sex" (i.e., male 
proportions sum to 1, +#' # female proportions sum to 1) +#' as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = "Sex") +#' +#' as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = 2) # Same as above +#' +#' +#' @export + +as_matrix <- function(obj, freq = NULL, dims = NULL, prop = NULL){ + + tab <- as_array(obj, freq = freq, dims = dims, prop = prop) + + if (length(dim(tab)) == 2){ # If number of dimensions equal 2 + return(as.matrix(tab)) + } + else{ + stop("Please supply an object with two dimensions. You may use the `dims` argument.") + } +} \ No newline at end of file diff --git a/R/as_table.R b/R/as_table.R index 369adaf2..4d9f97cf 100644 --- a/R/as_table.R +++ b/R/as_table.R @@ -4,11 +4,19 @@ #' column containing the frequencies (`freq`) must be supplied if `obj` is in #' frequency form. Optionally returns a table of proportions with (optionally) specified margins. #' -#' @param obj object to be converted to table form -#' @param freq If `obj` is in frequency form, this is the name of the frequency column. Leave as `NULL` if `obj` is in any other form. -#' @param dims A character vector of dimensions. If not specified, all variables apart from `freq` will be used as dimensions -#' @param prop If set to TRUE, returns a table of proportions. May also be set to a character or numeric vector of margins. -#' @return object in table form +#' @param obj +#' Object to be converted to table form. +#' @param freq +#' If `obj` is in frequency form, this is the name of the frequency column. +#' Leave as `NULL` if `obj` is in any other form. +#' @param dims +#' A character vector of dimensions. If not specified, all variables apart from +#' `freq` will be used as dimensions. +#' @param prop +#' If set to `TRUE`, returns a table of proportions (that sum to 1). May also +#' be set to a character or numeric vector of dimensions to be used as margins +#' from which proportions will be computed. +#' @return Object in table form. 
#' #' @details #' If `obj` was in table form to begin with, it is returned to the user as-is @@ -20,6 +28,10 @@ #' #' @author Gavin M. Klorfine #' +#' @seealso +#' \code{\link{as_freqform}}, \code{\link{as_caseform}}, \code{\link{as_array}}, +#' \code{\link{as_matrix}} +#' #' @importFrom stats reformulate xtabs #' @importFrom methods is #' @@ -83,7 +95,7 @@ as_table <- function(obj, freq = NULL, dims = NULL, prop = NULL){ tab_or_array <- TRUE } # If obj is a tibble, convert to data frame - else if (is(obj, "table")){ + else if (is(obj, "tbl")){ obj <- as.data.frame(obj) } diff --git a/R/collapse.table.R b/R/collapse.table.R index cc13a71a..12da20c4 100644 --- a/R/collapse.table.R +++ b/R/collapse.table.R @@ -1,124 +1,128 @@ -# collapse a contingency table or ftable by re-assigning levels of table variables -# revised to accept an array also - - - -#' Collapse Levels of a Table -#' -#' Collapse (or re-label) variables in a a contingency table, array or -#' `ftable` object by re-assigning levels of the table variables. -#' -#' Each of the \code{\dots{}} arguments must be of the form `variable = -#' levels`, where `variable` is the name of one of the table dimensions, -#' and `levels` is a character or numeric vector of length equal to the -#' corresponding dimension of the table. -#' -#' @param table A \code{\link[base]{table}}, \code{\link[base]{array}} or -#' \code{\link[stats]{ftable}} object -#' @param \dots A collection of one or more assignments of factors of the table -#' to a list of levels -#' @return A `xtabs` and `table` object, representing the original -#' table with one or more of its factors collapsed or rearranged into other -#' levels. -#' @author Michael Friendly -#' @seealso \code{\link{expand.dft}} expands a frequency data frame to case -#' form. -#' -#' \code{\link[base]{margin.table}} "collapses" a table in a different way, by -#' summing over table dimensions. 
-#' @keywords manip attribute -#' @examples -#' -#' # create some sample data in table form -#' sex <- c("Male", "Female") -#' age <- letters[1:6] -#' education <- c("low", 'med', 'high') -#' data <- expand.grid(sex=sex, age=age, education=education) -#' counts <- rpois(36, 100) -#' data <- cbind(data, counts) -#' t1 <- xtabs(counts ~ sex + age + education, data=data) -#' structable(t1) -#' -#' ## age a b c d e f -#' ## sex education -#' ## Male low 119 101 109 85 99 93 -#' ## med 94 98 103 108 84 84 -#' ## high 81 88 96 110 100 92 -#' ## Female low 107 104 95 86 103 96 -#' ## med 104 98 94 95 110 106 -#' ## high 93 85 90 109 99 86 -#' -#' -#' # collapse age to 3 levels -#' t2 <- collapse.table(t1, age=c("A", "A", "B", "B", "C", "C")) -#' structable(t2) -#' -#' ## age A B C -#' ## sex education -#' ## Male low 220 194 192 -#' ## med 192 211 168 -#' ## high 169 206 192 -#' ## Female low 211 181 199 -#' ## med 202 189 216 -#' ## high 178 199 185 -#' -#' -#' # collapse age to 3 levels and pool education: "low" and "med" to "low" -#' t3 <- collapse.table(t1, age=c("A", "A", "B", "B", "C", "C"), -#' education=c("low", "low", "high")) -#' structable(t3) -#' -#' ## age A B C -#' ## sex education -#' ## Male low 412 405 360 -#' ## high 169 206 192 -#' ## Female low 413 370 415 -#' ## high 178 199 185 -#' -#' -#' -#' # change labels for levels of education to 1:3 -#' t4 <- collapse.table(t1, education=1:3) -#' structable(t4) -#' -#' structable(t4) -#' ## age a b c d e f -#' ## sex education -#' ## Male 1 119 101 109 85 99 93 -#' ## 2 94 98 103 108 84 84 -#' ## 3 81 88 96 110 100 92 -#' ## Female 1 107 104 95 86 103 96 -#' ## 2 104 98 94 95 110 106 -#' ## 3 93 85 90 109 99 86 -#' -#' -#' -#' -#' @export collapse.table -collapse.table <- function(table, ...) 
{ - nargs <- length(args <- list(...)) - if (!nargs) - return(table) - if (inherits(table, "ftable")) - table <- as.table(table) - if (inherits(table, "array")) - table <- as.table(table) - if (inherits(table, "table")) { - tvars <- names(dimnames(table)) - table <- as.data.frame.table(table) - freq <- table[,"Freq"] - } - else stop("Argument must be a table, array or ftable object") - - names <- names(args) - for (i in 1:nargs) { - vals <- args[[i]] - nm <- names[[i]] - if(any(nm==tvars)) levels(table[[nm]]) <- vals - else warning(nm, " is not among the table variables.") - } -# term <- paste(tvars, collapse = '+') -# form <- as.formula(paste("freq ~", term)) -# cat("term: ", term, "\n") - xtabs(as.formula(paste("freq ~", paste(tvars, collapse = '+'))), data=table) -} +# collapse a contingency table or ftable by re-assigning levels of table variables +# revised to accept an array also + + + +#' Collapse Levels of a Table +#' +#' Collapse (or re-label) variables in a contingency table, array or +#' `ftable` object by re-assigning levels of the table variables. +#' +#' Each of the \code{\dots{}} arguments must be of the form `variable = +#' levels`, where `variable` is the name of one of the table dimensions, +#' and `levels` is a character or numeric vector of length equal to the +#' corresponding dimension of the table. +#' +#' @param table A \code{\link[base]{table}}, \code{\link[base]{array}} or +#' \code{\link[stats]{ftable}} object +#' @param \dots A collection of one or more assignments of factors of the table +#' to a list of levels +#' @return A `xtabs` and `table` object, representing the original +#' table with one or more of its factors collapsed or rearranged into other +#' levels. +#' @author Michael Friendly +#' @seealso +#' \code{\link{expand.dft}} and \code{\link{as_caseform}}: expand a frequency +#' data frame to case form. +#' +#' \code{\link[base]{margin.table}} "collapses" a table in a different way, by +#' summing over table dimensions. 
+#' +#' \code{\link{collapse_levels}} collapses in the same manner as +#' \code{collapse.table} but also works for frequency and case form data. +#' @keywords manip attribute +#' @examples +#' +#' # create some sample data in table form +#' sex <- c("Male", "Female") +#' age <- letters[1:6] +#' education <- c("low", 'med', 'high') +#' data <- expand.grid(sex=sex, age=age, education=education) +#' counts <- rpois(36, 100) +#' data <- cbind(data, counts) +#' t1 <- xtabs(counts ~ sex + age + education, data=data) +#' structable(t1) +#' +#' ## age a b c d e f +#' ## sex education +#' ## Male low 119 101 109 85 99 93 +#' ## med 94 98 103 108 84 84 +#' ## high 81 88 96 110 100 92 +#' ## Female low 107 104 95 86 103 96 +#' ## med 104 98 94 95 110 106 +#' ## high 93 85 90 109 99 86 +#' +#' +#' # collapse age to 3 levels +#' t2 <- collapse.table(t1, age=c("A", "A", "B", "B", "C", "C")) +#' structable(t2) +#' +#' ## age A B C +#' ## sex education +#' ## Male low 220 194 192 +#' ## med 192 211 168 +#' ## high 169 206 192 +#' ## Female low 211 181 199 +#' ## med 202 189 216 +#' ## high 178 199 185 +#' +#' +#' # collapse age to 3 levels and pool education: "low" and "med" to "low" +#' t3 <- collapse.table(t1, age=c("A", "A", "B", "B", "C", "C"), +#' education=c("low", "low", "high")) +#' structable(t3) +#' +#' ## age A B C +#' ## sex education +#' ## Male low 412 405 360 +#' ## high 169 206 192 +#' ## Female low 413 370 415 +#' ## high 178 199 185 +#' +#' +#' +#' # change labels for levels of education to 1:3 +#' t4 <- collapse.table(t1, education=1:3) +#' structable(t4) +#' +#' structable(t4) +#' ## age a b c d e f +#' ## sex education +#' ## Male 1 119 101 109 85 99 93 +#' ## 2 94 98 103 108 84 84 +#' ## 3 81 88 96 110 100 92 +#' ## Female 1 107 104 95 86 103 96 +#' ## 2 104 98 94 95 110 106 +#' ## 3 93 85 90 109 99 86 +#' +#' +#' +#' +#' @export collapse.table +collapse.table <- function(table, ...) 
{ + nargs <- length(args <- list(...)) + if (!nargs) + return(table) + if (inherits(table, "ftable")) + table <- as.table(table) + if (inherits(table, "array")) + table <- as.table(table) + if (inherits(table, "table")) { + tvars <- names(dimnames(table)) + table <- as.data.frame.table(table) + freq <- table[,"Freq"] + } + else stop("Argument must be a table, array or ftable object") + + names <- names(args) + for (i in 1:nargs) { + vals <- args[[i]] + nm <- names[[i]] + if(any(nm==tvars)) levels(table[[nm]]) <- vals + else warning(nm, " is not among the table variables.") + } +# term <- paste(tvars, collapse = '+') +# form <- as.formula(paste("freq ~", term)) +# cat("term: ", term, "\n") + xtabs(as.formula(paste("freq ~", paste(tvars, collapse = '+'))), data=table) +} diff --git a/R/collapse_levels.R b/R/collapse_levels.R new file mode 100644 index 00000000..cfb7f2a6 --- /dev/null +++ b/R/collapse_levels.R @@ -0,0 +1,152 @@ +#' Collapse the levels of a dataset +#' +#' Collapses the levels of a dataset (of any form) into those specified. May +#' also be used to re-name levels. Ensure argument \code{freq} is supplied +#' should your data be in frequency form (and the frequency column differs in +#' name from default, "Freq"). +#' +#' @param x The dataset to be collapsed. +#' @param freq Supply only if your data is in frequency form AND your frequency +#' column differs in name from the default ("Freq"). +#' @param \dots A collection of one or more assignments of dataset variables to +#' a list of levels in the format +#' \code{new_level = c("old_level_1", "old_level_2", ..., "old_level_n")}. +#' @return The collapsed dataset in its original form (i.e., the initial form of +#' \code{x}). +#' +#' @details +#' First converts the object \code{x} into a frequency form data frame. Then, +#' \code{\link[forcats]{fct_collapse}} is used to collapse variable levels. +#' Next, duplicate rows (an artefact of collapsing) are aggregated via +#' \code{\link[dplyr]{summarise}}. 
Last, the frequency form data frame is +#' converted back into the initial form of object \code{x}. +#' +#' The exceptions to this are objects in case form, which are passed directly +#' to \code{\link[forcats]{fct_collapse}} (and duplicate rows are not aggregated). +#' +#' @author Gavin M. Klorfine +#' +#' @seealso +#' \code{\link[forcats]{fct_collapse}}, \code{\link{collapse.table}} +#' +#' Tidy conversion functions: +#' \code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_caseform}}, +#' \code{\link{as_matrix}}, \code{\link{as_array}} +#' +#' @importFrom methods is +#' @importFrom rlang set_names .data +#' @importFrom dplyr as_tibble summarise all_of +#' @importFrom forcats fct_collapse +#' +#' @examples +#' data("HairEyeColor") # Table form data +#' str(HairEyeColor) +#' +#' collapse_levels( +#' HairEyeColor, # Dataset +#' Hair = list( # List of arguments for first variable +#' Dark = c("Black", "Brown"), # Collapse "Black" and "Brown" -> "Dark" +#' Light = c("Blond", "Red") # Collapse "Blond" and "Red" -> "Light" +#' ), +#' Eye = list( # List of arguments for second variable +#' Common = c("Brown"), # Collapse (rename) "Brown" -> "Common" +#' Uncommon = c("Blue", "Green", "Hazel") +#' ) +#' ) |> str() +#' +#' # To illustrate `freq` argument usage, convert Hoyt dataset to frequency form +#' # (ff) and then rename frequency column to "n" +#' +#' data("Hoyt", package = "vcdExtra") +#' ff_Hoyt <- as_freqform(Hoyt) +#' names(ff_Hoyt)[length(ff_Hoyt)] <- "n" +#' str(ff_Hoyt) +#' +#' collapse_levels( +#' ff_Hoyt, +#' +#' # Ensure to supply if data is in frequency form and frequency column name +#' # differs from "Freq" +#' freq = "n", +#' +#' Occupation = list( +#' High = c(1, 2), +#' Middle = 3, +#' Low = 4, +#' VeryLow = c(5, 6, 7) +#' ) +#' ) |> str() +#' +#' +#' @export + +collapse_levels <- function(x, freq = "Freq", ...){ + + argms <- list(...) 
+ # Record the input's form so the result can be converted back to it + x_class <- NULL + + if (is(x, "array")) + x_class <- "array" + else if (is(x, "matrix")) + x_class <- "matrix" + else if (is(x, "table")) + x_class <- "table" + else if (is(x, "tbl")) + x_class <- "tbl" + else if (is(x, "data.frame")) + x_class <- "data.frame" + + # Handle case form data + if (!(freq %in% names(x)) && (is(x, "data.frame") || is(x, "tbl"))) { + for (i in seq_along(argms)) { # no iterations when no collapse specs are given + var <- names(argms)[i] + x[[var]] <- forcats::fct_collapse( + x[[var]], + !!!rlang::set_names(argms[[i]], names(argms[[i]])) + ) + } + return(x) + } + else # If not case form, convert to data frame + coll_x <- as_freqform(x, freq = freq, tidy = FALSE) + + freq <- "Freq" + + fact_lvls <- list() # Initialize list + + # Iterate through and collapse factors according to specified levels + for (i in seq_along(argms)){ + + # Gather the specified levels + for (j in seq_along(argms[[i]])){ + fact_lvls <- append(fact_lvls, list(argms[[i]][[j]])) + } + + # Collapse factor according to specified levels + coll_x[[names(argms)[i]]] <- forcats::fct_collapse( + coll_x[[names(argms)[i]]], + !!!rlang::set_names(fact_lvls, names(argms[[i]])) + ) + + fact_lvls <- list() # Reset list for the next factor + } + + # Combine duplicate rows + groups <- setdiff(names(coll_x), freq) + coll_x <- coll_x |> + dplyr::summarise(Freq = sum(.data[[freq]]), .by = dplyr::all_of(groups)) + + # Return the user a collapsed object of same type as their input + if (!is.null(x_class)){ + if (x_class == "array") + coll_x <- as_array(coll_x, freq = "Freq") + else if (x_class == "matrix") + coll_x <- as_matrix(coll_x, freq = "Freq") + else if (x_class == "table") + coll_x <- as_table(coll_x, freq = "Freq") + else if (x_class == "tbl") + coll_x <- dplyr::as_tibble(coll_x) + } + return (coll_x) +} \ No newline at end of file diff --git a/R/expand.dft.R b/R/expand.dft.R index fa8f7774..90c873e8 100644 --- a/R/expand.dft.R +++ b/R/expand.dft.R @@ -1,96 +1,101 @@ -# Originally from Marc Schwarz -# Ref: 
http://tolstoy.newcastle.edu.au/R/e6/help/09/01/1873.html - -# 23 Feb 22: Fix warning from type.convert - - - -#' Expand a frequency table to case form -#' -#' Converts a frequency table, given either as a table object or a data frame -#' in frequency form to a data frame representing individual observations in -#' the table. -#' -#' `expand.table` is a synonym for `expand.dft`. -#' -#' @aliases expand.dft expand.table -#' @param x A table object, or a data frame in frequency form containing -#' factors and one numeric variable representing the cell frequency for that -#' combination of factors. -#' @param var.names A list of variable names for the factors, if you wish to -#' override those already in the table -#' @param freq The name of the frequency variable in the table -#' @param \dots Other arguments passed down to `type.convert`. In -#' particular, pay attention to `na.strings` (default: -#' `na.strings=NA` if there are missing cells) and `as.is` (default: -#' `as.is=FALSE`, converting character vectors to factors). -#' @return A data frame containing the factors in the table and as many -#' observations as are represented by the total of the `freq` variable. -#' @author Mark Schwarz -#' @seealso \code{\link[utils]{type.convert}}, -#' \code{\link[gnm]{expandCategorical}} -#' @references Originally posted on R-Help, Jan 20, 2009, -#' http://tolstoy.newcastle.edu.au/R/e6/help/09/01/1873.html -#' -#' Friendly, M. and Meyer, D. (2016). *Discrete Data Analysis with R: -#' Visualization and Modeling Techniques for Categorical and Count Data*. Boca -#' Raton, FL: Chapman & Hall/CRC. . -#' @keywords manip array -#' @examples -#' -#' library(vcd) -#' art <- xtabs(~Treatment + Improved, data = Arthritis) -#' art -#' artdf <- expand.dft(art) -#' str(artdf) -#' -#' # 1D case -#' (tab <- table(sample(head(letters), 20, replace=TRUE))) -#' expand.table(tab, var.names="letter") -#' -#' -#' @export -expand.dft <- function(x, var.names = NULL, freq = "Freq", ...) 
-{ - # allow: a table object, or a data frame in frequency form - if(inherits(x, "table")) - x <- as.data.frame.table(x, responseName = freq) - - freq.col <- which(colnames(x) == freq) - if (length(freq.col) == 0) - stop(paste(sQuote("freq"), "not found in column names")) - - DF <- sapply(1:nrow(x), - function(i) x[rep(i, each = x[i, freq.col, drop = TRUE]), ], - simplify = FALSE) - - DF <- do.call("rbind", DF)[, -freq.col, drop=FALSE] - - for (i in 1:ncol(DF)) - { - DF[[i]] <- type.convert(as.character(DF[[i]]), as.is=TRUE, ...) -## DONE ##: Generates warning: -## 1: In type.convert.default(as.character(DF[[i]]), ...) : -## 'as.is' should be specified by the caller; using TRUE - } - - rownames(DF) <- NULL - - if (!is.null(var.names)) - { - if (length(var.names) < dim(DF)[2]) - { - stop(paste("Too few", sQuote("var.names"), "given.")) - } else if (length(var.names) > dim(DF)[2]) { - stop(paste("Too many", sQuote("var.names"), "given.")) - } else { - names(DF) <- var.names - } - } - - DF -} - -# make this a synonym -#' @export -expand.table <- expand.dft +# Originally from Marc Schwarz +# Ref: http://tolstoy.newcastle.edu.au/R/e6/help/09/01/1873.html + +# 23 Feb 22: Fix warning from type.convert + + + +#' Expand a frequency table to case form +#' +#' Converts a frequency table, given either as a table object or a data frame +#' in frequency form to a data frame representing individual observations in +#' the table. +#' +#' `expand.table` is a synonym for `expand.dft`. +#' +#' @aliases expand.dft expand.table +#' @param x A table object, or a data frame in frequency form containing +#' factors and one numeric variable representing the cell frequency for that +#' combination of factors. +#' @param var.names A list of variable names for the factors, if you wish to +#' override those already in the table +#' @param freq The name of the frequency variable in the table +#' @param \dots Other arguments passed down to `type.convert`. 
In +#' particular, pay attention to `na.strings` (default: +#' `na.strings=NA` if there are missing cells). Do not pass `as.is`: it is +#' already fixed at `as.is=TRUE`, so character columns are not made factors. +#' @return A data frame containing the factors in the table and as many +#' observations as are represented by the total of the `freq` variable. +#' @author Mark Schwarz +#' @seealso +#' \code{\link[utils]{type.convert}}, +#' \code{\link[gnm]{expandCategorical}}, \code{\link{as_caseform}}, +#' \code{\link{as_table}}, \code{\link{as_freqform}}, +#' \code{\link{as_array}}, \code{\link{as_matrix}} +#' +#' @references Originally posted on R-Help, Jan 20, 2009, +#' http://tolstoy.newcastle.edu.au/R/e6/help/09/01/1873.html +#' +#' Friendly, M. and Meyer, D. (2016). *Discrete Data Analysis with R: +#' Visualization and Modeling Techniques for Categorical and Count Data*. Boca +#' Raton, FL: Chapman & Hall/CRC. <http://ddar.datavis.ca>. +#' @keywords manip array +#' @examples +#' +#' library(vcd) +#' art <- xtabs(~Treatment + Improved, data = Arthritis) +#' art +#' artdf <- expand.dft(art) +#' str(artdf) +#' +#' # 1D case +#' (tab <- table(sample(head(letters), 20, replace=TRUE))) +#' expand.table(tab, var.names="letter") +#' +#' +#' @export expand.dft +expand.dft <- function(x, var.names = NULL, freq = "Freq", ...) +{ + # allow: a table object, or a data frame in frequency form + if(inherits(x, "table")) + x <- as.data.frame.table(x, responseName = freq) + + freq.col <- which(colnames(x) == freq) + if (length(freq.col) == 0) + stop(paste(sQuote("freq"), "not found in column names")) + + DF <- sapply(1:nrow(x), + function(i) x[rep(i, each = x[i, freq.col, drop = TRUE]), ], + simplify = FALSE) + + DF <- do.call("rbind", DF)[, -freq.col, drop=FALSE] + + for (i in 1:ncol(DF)) + { + DF[[i]] <- type.convert(as.character(DF[[i]]), as.is=TRUE, ...) +## DONE ##: Generates warning: +## 1: In type.convert.default(as.character(DF[[i]]), ...) 
: +## 'as.is' should be specified by the caller; using TRUE + } + + rownames(DF) <- NULL + + if (!is.null(var.names)) + { + if (length(var.names) < dim(DF)[2]) + { + stop(paste("Too few", sQuote("var.names"), "given.")) + } else if (length(var.names) > dim(DF)[2]) { + stop(paste("Too many", sQuote("var.names"), "given.")) + } else { + names(DF) <- var.names + } + } + + DF +} + +# make this a synonym +#' @usage expand.table(x, var.names = NULL, freq = "Freq", ...) +#' @export expand.table +expand.table <- expand.dft diff --git a/_pkgdown.yml b/_pkgdown.yml index 49adfc9d..448eb743 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,174 +1,177 @@ -url: https://friendly.github.io/vcdExtra - -template: - bootstrap: 5 - # For pkgdown >= 2.1.0, can use: math-rendering: mathjax - math-rendering: mathjax - params: - bootswatch: cosmo - -articles: - - title: "Tutorials" - desc: "Vignettes introducing the package" - contents: - - a1-creating - - a2-tests - - a3-loglinear - - a4-mosaics - - a5-demo-housing - - a6-mobility - - a7-continuous - - datasets - - tidyCats - - - title: "Other" - desc: "Additional articles" - contents: - - articles/color_table - -reference: - - - title: Mosaics - desc: Extensions of `vcd::mosaic()` and related displays to `glm` and `gnm` objects and 3D displays - contents: - - assoc.glm - - mosaic.glm - - mosaic.glmlist - - glmlist - - sieve.glm - - mosaic3d - - center3d - - split3d - - range3d - - seq_mosaic - - labeling_points - - pairs_diagonal_mosaic - - - title: Loglinear models - desc: Functions for constructing and manipulating loglinear models - contents: - - conditional - - Crossings - - joint - - Kway - - loglin2formula - - loglin2string - - loglmlist - - markov - - mutual - - saturated - - seq_loglm - - get_models - - get_model - - - title: Other graphical methods - desc: Other functions for visualizing contingency tables - contents: - - assoc_graph - - plot.assoc_graph - - color_table - - knit_include - - mcaplot - - - - title: Statistical 
tests - desc: Statistical tests for categorical data - contents: - - CMHtest - - GKgamma - - print.GKgamma - - print.Kappa - - HLtest - - HosmerLemeshow - - logLik.loglm - - LRstats - - modFit - - modFit.glm - - modFit.loglm - - woolf_test - - zero.test - - - title: Distributions - desc: Logseries distributions - contents: - - dlogseries - - plogseries - - qlogseries - - rlogseries - - - title: Conversions - desc: converting between table, freq, case, array forms - contents: - - as_table - - as_array - - as_freqform - - as_caseform - - - title: Utility - desc: Utility functions in the package - contents: - - datasets - - blogits - - collapse.table - - cutfac - - expand.dft - - expand.table - - - title: Other - desc: Other functions in the package (not yet classified) - contents: - - Summarise - - update.xtabs - - vcdExtra-package - - - title: Data - desc: Some small data sets used in examples or in [DDAR](http://ddar.datavis.ca/) - contents: - - Abortion - - Accident - - AirCrash - - Alligator - - Asbestos - - Bartlett - - Burt - - Caesar - - Cancer - - Cormorants - - CrabSatellites - - CyclingDeaths - - DaytonSurvey - - Depends - - Detergent - - Donner - - Draft1970 - - Draft1970table - - Dyke - - Fungicide - - GSS - - Geissler - - Gilby - - Glass - - HairEyePlace - - Hauser79 - - Heart - - Heckman - - HospVisits - - HouseTasks - - Hoyt - - ICU - - JobSat - - Mammograms - - Mental - - Mice - - Mobility - - PhdPubs - - Reinis - - ShakeWords - - TV - - Titanicp - - Toxaemia - - Vietnam - - Vote1980 - - WorkerSat - - Yamaguchi87 - +url: https://friendly.github.io/vcdExtra + +template: + bootstrap: 5 + # For pkgdown >= 2.1.0, can use: math-rendering: mathjax + math-rendering: mathjax + params: + bootswatch: cosmo + +articles: + - title: "Tutorials" + desc: "Vignettes introducing the package" + contents: + - a1-creating + - a1a-convert-collapse + - a2-tests + - a3-loglinear + - a4-mosaics + - a5-demo-housing + - a6-mobility + - a7-continuous + - datasets + - tidyCats + 
+ - title: "Other" + desc: "Additional articles" + contents: + - articles/color_table + +reference: + + - title: Mosaics + desc: Extensions of `vcd::mosaic()` and related displays to `glm` and `gnm` objects and 3D displays + contents: + - assoc.glm + - mosaic.glm + - mosaic.glmlist + - glmlist + - sieve.glm + - mosaic3d + - center3d + - split3d + - range3d + - seq_mosaic + - labeling_points + - pairs_diagonal_mosaic + + - title: Loglinear models + desc: Functions for constructing and manipulating loglinear models + contents: + - conditional + - Crossings + - joint + - Kway + - loglin2formula + - loglin2string + - loglmlist + - markov + - mutual + - saturated + - seq_loglm + - get_models + - get_model + + - title: Other graphical methods + desc: Other functions for visualizing contingency tables + contents: + - assoc_graph + - plot.assoc_graph + - color_table + - knit_include + - mcaplot + + + - title: Statistical tests + desc: Statistical tests for categorical data + contents: + - CMHtest + - GKgamma + - print.GKgamma + - print.Kappa + - HLtest + - HosmerLemeshow + - logLik.loglm + - LRstats + - modFit + - modFit.glm + - modFit.loglm + - woolf_test + - zero.test + + - title: Distributions + desc: Logseries distributions + contents: + - dlogseries + - plogseries + - qlogseries + - rlogseries + + - title: Conversions + desc: Converting between table, freq, case, array, and matrix forms + contents: + - as_table + - as_array + - as_freqform + - as_caseform + - as_matrix + + - title: Utility + desc: Utility functions in the package + contents: + - datasets + - blogits + - collapse_levels + - collapse.table + - cutfac + - expand.dft + - expand.table + + - title: Other + desc: Other functions in the package (not yet classified) + contents: + - Summarise + - update.xtabs + - vcdExtra-package + + - title: Data + desc: Some small data sets used in examples or in [DDAR](http://ddar.datavis.ca/) + contents: + - Abortion + - Accident + - AirCrash + - Alligator + - Asbestos + - 
Bartlett + - Burt + - Caesar + - Cancer + - Cormorants + - CrabSatellites + - CyclingDeaths + - DaytonSurvey + - Depends + - Detergent + - Donner + - Draft1970 + - Draft1970table + - Dyke + - Fungicide + - GSS + - Geissler + - Gilby + - Glass + - HairEyePlace + - Hauser79 + - Heart + - Heckman + - HospVisits + - HouseTasks + - Hoyt + - ICU + - JobSat + - Mammograms + - Mental + - Mice + - Mobility + - PhdPubs + - Reinis + - ShakeWords + - TV + - Titanicp + - Toxaemia + - Vietnam + - Vote1980 + - WorkerSat + - Yamaguchi87 + diff --git a/man/as_array.Rd b/man/as_array.Rd index 49c8814b..78974a1b 100644 --- a/man/as_array.Rd +++ b/man/as_array.Rd @@ -4,14 +4,20 @@ \alias{as_array} \title{Convert frequency, case, or table form data into an array} \usage{ -as_array(obj, freq = NULL, dims = NULL) +as_array(obj, freq = NULL, dims = NULL, prop = NULL) } \arguments{ -\item{obj}{object to be converted to an array} +\item{obj}{Object to be converted to an array.} -\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. Leave as \code{NULL} if \code{obj} is in any other form.} +\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. +Leave as \code{NULL} if \code{obj} is in any other form.} -\item{dims}{A character vector of dimensions. If not specified, all variables apart from \code{freq} will be used as dimensions} +\item{dims}{A character vector of dimensions. If not specified, all variables apart from +\code{freq} will be used as dimensions.} + +\item{prop}{If set to \code{TRUE}, returns an array of proportions (that sum to 1). May also +be set to a character or numeric vector of dimensions to be used as margins +from which proportions will be computed.} } \value{ object in array form @@ -22,7 +28,7 @@ column containing the frequencies (\code{freq}) must be supplied if \code{obj} i frequency form. } \details{ -Unclasses the \code{as_table()} function to return an object in array form. 
+Unclasses the \code{\link{as_table}} function to return an object in array form. } \examples{ library(vcdExtra) @@ -48,7 +54,29 @@ as_array(tidy_freqForm, freq = "n") |> str() # For specific dimensions as_array(tidy_freqForm, freq = "n", dims = c("Hair", "Eye")) |> str() +#-----For proportions-----# + +as_array(freqForm, freq = "Freq", prop = TRUE) |> # proportions relative to grand total + head(c(4,4,1)) + +# Marginalize proportions along "Sex" (i.e., male proportions sum to 1, female proportions sum to 1) +as_array(freqForm, freq = "Freq", prop = "Sex") |> head(c(4,4,1)) + +as_array(freqForm, freq = "Freq", prop = 3) |> head(c(4,4,1)) # Same as above + +# Marginalize proportions along multiple variables +as_array(freqForm, freq = "Freq", prop = c("Hair", "Sex")) |> head(c(4,4,1)) +as_array(freqForm, freq = "Freq", prop = c(1, 3)) |> head(c(4,4,1)) # Same as above + +# Using dims and prop arguments in tandem +as_array(freqForm, freq = "Freq", dims = c("Hair", "Eye"), prop = TRUE) + + +} +\seealso{ +\code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_caseform}}, +\code{\link{as_matrix}} } \author{ Gavin M. Klorfine diff --git a/man/as_caseform.Rd b/man/as_caseform.Rd index b82e8790..b781bb89 100644 --- a/man/as_caseform.Rd +++ b/man/as_caseform.Rd @@ -7,13 +7,15 @@ as_caseform(obj, freq = "Freq", dims = NULL, tidy = TRUE) } \arguments{ -\item{obj}{object to be converted to case form} +\item{obj}{Object to be converted to case form.} -\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. If \code{obj} is in any other form, do not supply an argument (see "Details")} +\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. If +\code{obj} is in any other form, do not supply an argument (see "Details").} -\item{dims}{A character vector of dimensions. If not specified, all variables apart from \code{freq} will be used as dimensions} +\item{dims}{A character vector of dimensions. 
If not specified, all variables apart from +\code{freq} will be used as dimensions.} -\item{tidy}{returns a tibble if set to TRUE} +\item{tidy}{Returns a tibble if set to \code{TRUE}.} } \value{ object in case form. @@ -24,11 +26,11 @@ column containing the frequencies (\code{freq}) must be supplied if \code{obj} i frequency form. Returns a tibble if \code{tidy} is set to \code{TRUE}. } \details{ -A wrapper for \code{expand.dft()} that is able to handle arrays. +A wrapper for \code{\link{expand.dft}} that is able to handle arrays. If a frequency column is not supplied, this function defaults to "Freq" -just like \code{expand.dft()}. Converts \code{obj} to a table using -\code{as_table()} before converting to case form. +just like \code{\link{expand.dft}}. Converts \code{obj} to a table using +\code{\link{as_table}} before converting to case form. } \examples{ library(vcdExtra) @@ -53,6 +55,10 @@ as_caseform(arrayDat) |> str() as_caseform(tableForm, dims = c("Hair", "Eye")) |> str() +} +\seealso{ +\code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_array}}, +\code{\link{as_matrix}}, \code{\link{expand.dft}} } \author{ Gavin M. Klorfine diff --git a/man/as_freqform.Rd b/man/as_freqform.Rd index 5528babd..1e23dd67 100644 --- a/man/as_freqform.Rd +++ b/man/as_freqform.Rd @@ -4,29 +4,36 @@ \alias{as_freqform} \title{Convert any form (case or table form) into frequency form.} \usage{ -as_freqform(obj, freq = NULL, dims = NULL, tidy = TRUE) +as_freqform(obj, freq = NULL, dims = NULL, prop = NULL, tidy = TRUE) } \arguments{ -\item{obj}{object to be converted to frequency form} +\item{obj}{Object to be converted to frequency form.} -\item{freq}{If \code{obj} is already in frequency form, this is the name of the frequency column. If \code{obj} is in any other form, do not supply an argument (see "Details")} +\item{freq}{If \code{obj} is already in frequency form, this is the name of the frequency +column. 
If \code{obj} is in any other form, do not supply an argument (see "Details").} -\item{dims}{A character vector of dimensions. If not specified, all variables apart from \code{freq} will be used as dimensions} +\item{dims}{A character vector of dimensions. If not specified, all variables apart from +\code{freq} will be used as dimensions.} -\item{tidy}{returns a tibble if set to TRUE} +\item{prop}{If set to \code{TRUE}, the resulting "frequency" column will contain proportions +(that sum to 1). May also be set to a character or numeric vector of +dimensions to be used as margins from which proportions will be computed. +The resulting "frequency" column is renamed to "Prop."} + +\item{tidy}{Returns a tibble if set to \code{TRUE}.} } \value{ -object in frequency form. +Object in frequency form. } \description{ -A wrapper for \code{as.data.frame()} that is able to properly handle arrays. +A wrapper for \code{\link[base]{as.data.frame}} that is able to properly handle arrays. Converts object (\code{obj}) in case or table form into frequency form. The column containing the frequencies (\code{freq}) must be supplied if \code{obj} is already in frequency form (and you are using this function to select dimensions). Returns a tibble if \code{tidy} is set to \code{TRUE}. 
} \details{ -Converts \code{obj} to a table using \code{as_table()} before converting to +Converts \code{obj} to a table using \code{\link{as_table}} before converting to frequency form } \examples{ @@ -53,7 +60,27 @@ as_freqform(freqForm, freq = "Freq", dims = c("Hair", "Eye")) |> str() as_freqform(tableForm, dims = c("Hair", "Eye")) |> str() +#-----For proportions-----# + +as_freqform(tableForm, prop = TRUE) |> head() # print only Sex == Male rows + +# Marginalize proportions along "Sex" (i.e., male proportions sum to 1, female proportions sum to 1) +as_freqform(tableForm, prop = "Sex") |> head() + +as_freqform(tableForm, prop = 3) |> head() # Same as above +# Marginalize proportions along multiple variables +as_freqform(tableForm, prop = c("Hair", "Sex")) |> head() + +as_freqform(tableForm, prop = c(1, 3)) |> head() # Same as above + +# Using dims and prop arguments in tandem +as_freqform(tableForm, dims = c("Hair", "Eye"), prop = TRUE) + +} +\seealso{ +\code{\link{as_table}}, \code{\link{as_caseform}}, \code{\link{as_array}}, +\code{\link{as_matrix}} } \author{ Gavin M. Klorfine diff --git a/man/as_matrix.Rd b/man/as_matrix.Rd new file mode 100644 index 00000000..0da3d013 --- /dev/null +++ b/man/as_matrix.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_matrix.R +\name{as_matrix} +\alias{as_matrix} +\title{Convert frequency, case, or table form data into a matrix.} +\usage{ +as_matrix(obj, freq = NULL, dims = NULL, prop = NULL) +} +\arguments{ +\item{obj}{Object to be converted into a matrix.} + +\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. +Leave as \code{NULL} if \code{obj} is in any other form.} + +\item{dims}{A character vector of dimensions. If not specified, all variables apart from +\code{freq} will be used as dimensions.} + +\item{prop}{If set to \code{TRUE}, returns a matrix of proportions (that sum to 1). 
May also +be set to a character or numeric vector of dimensions to be used as margins +from which proportions will be computed.} +} +\value{ +Object in matrix form. +} +\description{ +Converts object (\code{obj}) in frequency, case or table form into a matrix of +specified dimensions (\code{dims}). The column containing the frequencies (\code{freq}) +must be supplied if \code{obj} is in frequency form. +} +\details{ +First converts \code{obj} into an array using \code{\link{as_array}}. Then a +check is made to ensure the user inputted a 2D \code{obj}. If \code{obj} is not 2D, an +error is returned. If \code{obj} is 2D, \code{\link[base]{as.matrix}} is applied. +} +\examples{ +library(vcdExtra) + +data("HairEyeColor") + +freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy frequency form data +caseForm <- expand.dft(freqForm) # Generate case form data +arrayDat <- as_array(HairEyeColor) # Generate an array + +# Table form -> matrix +as_matrix(HairEyeColor, dims = c("Hair", "Sex")) |> str() + +# Frequency form -> matrix +as_matrix(freqForm, freq = "Freq", dims = c("Hair", "Sex")) |> str() + +# Case form -> matrix form +as_matrix(caseForm, dims = c("Hair", "Sex")) |> str() + +# Frequency (tibble) form -> matrix form +as_matrix(tidy_freqForm, freq = "n", dims = c("Hair", "Sex")) |> str() + +#-----For proportions-----# + +# Proportions relative to grand total +as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = TRUE) + +# Marginalize proportions along "Sex" (i.e., male proportions sum to 1, +# female proportions sum to 1) +as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = "Sex") + +as_matrix(HairEyeColor, dims = c("Hair", "Sex"), prop = 2) # Same as above + + +} +\seealso{ +\code{\link{as_array}}, \code{\link{as_table}}, \code{\link{as_freqform}}, +\code{\link{as_caseform}} +} +\author{ +Gavin M. 
Klorfine +} diff --git a/man/as_table.Rd b/man/as_table.Rd index ec092718..2c55d4e5 100644 --- a/man/as_table.Rd +++ b/man/as_table.Rd @@ -7,16 +7,20 @@ as_table(obj, freq = NULL, dims = NULL, prop = NULL) } \arguments{ -\item{obj}{object to be converted to table form} +\item{obj}{Object to be converted to table form.} -\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. Leave as \code{NULL} if \code{obj} is in any other form.} +\item{freq}{If \code{obj} is in frequency form, this is the name of the frequency column. +Leave as \code{NULL} if \code{obj} is in any other form.} -\item{dims}{A character vector of dimensions. If not specified, all variables apart from \code{freq} will be used as dimensions} +\item{dims}{A character vector of dimensions. If not specified, all variables apart from +\code{freq} will be used as dimensions.} -\item{prop}{If set to TRUE, returns a table of proportions. May also be set to a character or numeric vector of margins.} +\item{prop}{If set to \code{TRUE}, returns a table of proportions (that sum to 1). May also +be set to a character or numeric vector of dimensions to be used as margins +from which proportions will be computed.} } \value{ -object in table form +Object in table form. } \description{ Converts object (\code{obj}) in frequency or case form into table form. The @@ -73,6 +77,10 @@ as_table(freqForm, freq = "Freq", prop = c(1, 3)) |> head(c(4,4,1)) # Same as ab as_table(freqForm, freq = "Freq", dims = c("Hair", "Eye"), prop = TRUE) +} +\seealso{ +\code{\link{as_freqform}}, \code{\link{as_caseform}}, \code{\link{as_array}}, +\code{\link{as_matrix}} } \author{ Gavin M. Klorfine diff --git a/man/collapse.table.Rd b/man/collapse.table.Rd index 684ada50..9c17dc6b 100644 --- a/man/collapse.table.Rd +++ b/man/collapse.table.Rd @@ -96,11 +96,14 @@ structable(t4) } \seealso{ -\code{\link{expand.dft}} expands a frequency data frame to case -form. 
+\code{\link{expand.dft}} and \code{\link{as_caseform}}: expands a frequency +data frame to case form. \code{\link[base]{margin.table}} "collapses" a table in a different way, by summing over table dimensions. + +\code{\link{collapse_levels}} collapses in the same manner as +\code{collapse.table} but also works for frequency and case form data. } \author{ Michael Friendly diff --git a/man/collapse_levels.Rd b/man/collapse_levels.Rd new file mode 100644 index 00000000..0a19e4c6 --- /dev/null +++ b/man/collapse_levels.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collapse_levels.R +\name{collapse_levels} +\alias{collapse_levels} +\title{Collapse the levels of a dataset} +\usage{ +collapse_levels(x, freq = "Freq", ...) +} +\arguments{ +\item{x}{The dataset to be collapsed.} + +\item{freq}{Supply only if your data is in frequency form AND your frequency +column differs in name from the default ("Freq").} + +\item{\dots}{A collection of one or more assignments of dataset variables to +a list of levels in the format +\code{new_level = c("old_level_1", "old_level_2", ..., "old_level_n")}.} +} +\value{ +The collapsed dataset in its original form (i.e., the initial form of +\code{x}). +} +\description{ +Collapses the levels of a dataset (of any form) into those specified. May +also be used to re-name levels. Ensure argument \code{freq} is supplied +should your data be in frequency form (and the frequency column differs in +name from default, "Freq"). +} +\details{ +First converts the object \code{x} into a frequency form data frame. Then, +\code{\link[forcats]{fct_collapse}} is used to collapse variable levels. +Next, duplicate rows (an artefact of collapsing) are aggregated via +\code{\link[dplyr]{summarise}}. Last, the frequency form data frame is +converted back into the initial form of object \code{x}. 
+ +The exceptions to this are objects in case form, which are passed directly +to \code{\link[forcats]{fct_collapse}} (and duplicate rows are not aggregated). +} +\examples{ +data("HairEyeColor") # Table form data +str(HairEyeColor) + +collapse_levels( + HairEyeColor, # Dataset + Hair = list( # List of arguments for first variable + Dark = c("Black", "Brown"), # Collapse "Black" and "Brown" -> "Dark" + Light = c("Blond", "Red") # Collapse "Blond" and "Red" -> "Light" + ), + Eye = list( # List of arguments for second variable + Common = c("Brown"), # Collapse (rename) "Brown" -> "Common" + Uncommon = c("Blue", "Green", "Hazel") + ) +) |> str() + +# To illustrate `freq` argument usage, convert Hoyt dataset to frequency form +# (ff) and then rename frequency column to "n" + +data("Hoyt", package = "vcdExtra") +ff_Hoyt <- as_freqform(Hoyt) +names(ff_Hoyt)[length(ff_Hoyt)] <- "n" +str(ff_Hoyt) + +collapse_levels( + ff_Hoyt, + + # Ensure to supply if data is in frequency form and frequency column name + # differs from "Freq" + freq = "n", + + Occupation = list( + High = c(1, 2), + Middle = 3, + Low = 4, + VeryLow = c(5, 6, 7) + ) +) |> str() + + +} +\seealso{ +\code{\link[forcats]{fct_collapse}}, \code{\link{collapse.table}} + +Tidy conversion functions: +\code{\link{as_table}}, \code{\link{as_freqform}}, \code{\link{as_caseform}}, +\code{\link{as_matrix}}, \code{\link{as_array}} +} +\author{ +Gavin M. Klorfine +} diff --git a/man/expand.dft.Rd b/man/expand.dft.Rd index dfe13da1..6a1a7a7f 100644 --- a/man/expand.dft.Rd +++ b/man/expand.dft.Rd @@ -58,7 +58,9 @@ Raton, FL: Chapman & Hall/CRC. \url{http://ddar.datavis.ca}. 
} \seealso{ \code{\link[utils]{type.convert}}, -\code{\link[gnm]{expandCategorical}} +\code{\link[gnm]{expandCategorical}}, \code{\link{as_caseform}}, +\code{\link{as_table}}, \code{\link{as_freqform}}, +\code{\link{as_array}}, \code{\link{as_matrix}} } \author{ Mark Schwarz diff --git a/tests/testthat/test-as_array.R b/tests/testthat/test-as_array.R new file mode 100644 index 00000000..67f14752 --- /dev/null +++ b/tests/testthat/test-as_array.R @@ -0,0 +1,37 @@ +data("HairEyeColor") + +arrayDat <- unclass(HairEyeColor) # Generate array form data +freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy frequency form data +caseForm <- expand.dft(freqForm) # Generate case form data + +test_that("Array form input returns an array", { + ary <- as_array(arrayDat) + expect_true(is.array(ary)) +}) + +test_that("Table form input returns an array", { + ary <- as_array(HairEyeColor) + expect_true(is.array(ary)) +}) + +test_that("Tibble frequency form input returns an array", { + ary <- as_array(tidy_freqForm, freq = "n") + expect_true(is.array(ary)) +}) + +test_that("Frequency form input returns an array", { + ary <- as_array(freqForm, freq = "Freq") + expect_true(is.array(ary)) +}) + +test_that("Case form input returns an array", { + ary <- as_array(caseForm) + expect_true(is.array(ary)) +}) + +test_that("Check if input was modified", { + ary <- sum(as_array(HairEyeColor)) + expect_equal(ary, sum(HairEyeColor)) +}) + diff --git a/tests/testthat/test-as_caseform.R b/tests/testthat/test-as_caseform.R new file mode 100644 index 00000000..1f7c1528 --- /dev/null +++ b/tests/testthat/test-as_caseform.R @@ -0,0 +1,17 @@ +data("HairEyeColor") + +freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy frequency form data +tableForm <- as_table(HairEyeColor) # Generate table form data +arrayDat <- as_array(HairEyeColor) # 
Generate an array + +test_that("Number of rows equal to number of entries", { + expect_equal( + sum(freqForm$Freq), + HairEyeColor |> as_caseform() |> nrow() + ) +}) + +test_that("Returns tibble by default", { + expect_s3_class(as_caseform(freqForm), "tbl") +}) \ No newline at end of file diff --git a/tests/testthat/test-as_freqform.R b/tests/testthat/test-as_freqform.R new file mode 100644 index 00000000..44e0d025 --- /dev/null +++ b/tests/testthat/test-as_freqform.R @@ -0,0 +1,37 @@ +data("HairEyeColor") + +arrayDat <- as_array(HairEyeColor) # Generate an array +freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy frequency form data +caseForm <- expand.dft(freqForm) # Generate case form data + +test_that("Array form input returns a tibble", { + ff <- as_freqform(arrayDat) + expect_s3_class(ff, "tbl") +}) + +test_that("Table form input returns a tibble", { + ff <- as_freqform(HairEyeColor) + expect_s3_class(ff, "tbl") +}) + +test_that("Tibble frequency form input returns a tibble", { + ff <- as_freqform(tidy_freqForm, freq = "n") + expect_s3_class(ff, "tbl") +}) + +test_that("Frequency form input returns a tibble", { + ff <- as_freqform(freqForm, freq = "Freq") + expect_s3_class(ff, "tbl") +}) + +test_that("Case form input returns a tibble", { + ff <- as_freqform(caseForm) + expect_s3_class(ff, "tbl") +}) + +test_that("Tibble frequency form input returns unmodified", { + ff <- as_freqform(tidy_freqForm, freq = "n") + expect_equal(sum(ff$Freq), sum(tidy_freqForm$n)) +}) + diff --git a/tests/testthat/test-as_matrix.R b/tests/testthat/test-as_matrix.R new file mode 100644 index 00000000..3c0e26ad --- /dev/null +++ b/tests/testthat/test-as_matrix.R @@ -0,0 +1,38 @@ +data("HairEyeColor") + +arrayDat <- unclass(HairEyeColor) # Generate array form data +freqForm <- as.data.frame(HairEyeColor) # Generate frequency form data +tidy_freqForm <- dplyr::as_tibble(HairEyeColor) # Generate tidy 
frequency form data +caseForm <- expand.dft(freqForm) # Generate case form data + +test_that("Matrix returns when array supplied", { + mat <- as_matrix(arrayDat, dim = c("Hair", "Eye")) + expect_true(inherits(mat, "matrix")) +}) + +test_that("Matrix returns when table supplied", { + mat <- as_matrix(HairEyeColor, dim = c("Hair", "Eye")) + expect_true(inherits(mat, "matrix")) +}) + +test_that("Matrix returns when frequency form supplied", { + mat <- as_matrix(freqForm, freq = "Freq", dim = c("Hair", "Eye")) + expect_true(inherits(mat, "matrix")) +}) + +test_that("Matrix returns when (tibble) frequency form supplied", { + mat <- as_matrix(tidy_freqForm, freq = "n", dim = c("Hair", "Eye")) + expect_true(inherits(mat, "matrix")) +}) + +test_that("Matrix returns when case form supplied", { + mat <- as_matrix(caseForm, dim = c("Hair", "Eye")) + expect_true(inherits(mat, "matrix")) +}) + +test_that("Correct dimensions returned", { + mat <- as_matrix(HairEyeColor, dim = c("Hair", "Eye")) + expect_true(all(names(dimnames(mat)) == c("Hair", "Eye"))) +}) + + \ No newline at end of file diff --git a/tests/testthat/test-collapse_levels.R b/tests/testthat/test-collapse_levels.R new file mode 100644 index 00000000..14dd16d8 --- /dev/null +++ b/tests/testthat/test-collapse_levels.R @@ -0,0 +1,34 @@ +data("HairEyeColor") +data("Hoyt", package = "vcdExtra") + +test_that("Equal sum produced via comparison with HairEyeColor data", { + coll.sum <- collapse_levels( + HairEyeColor, + Hair = list( + Dark = c("Black", "Brown"), + Light = c("Blond", "Red") + ), + Eye = list( + Common = c("Brown"), + Uncommon = c("Blue", "Green", "Hazel") + ) + ) |> sum() + expect_equal(sum(HairEyeColor), coll.sum) +}) + +test_that("Equal sum produced via comparison with Hoyt data", { + ff_Hoyt <- as_freqform(Hoyt) + names(ff_Hoyt)[length(ff_Hoyt)] <- "n" + + ff_coll <- collapse_levels( + ff_Hoyt, + freq = "n", + Occupation = list( + High = c(1, 2), + Middle = 3, + Low = 4, + VeryLow = c(5, 6, 7) + ) + ) + 
expect_equal(sum(ff_coll$Freq), sum(Hoyt)) +}) diff --git a/vignettes/a1-creating.Rmd b/vignettes/a1-creating.Rmd index 03d7cb8d..2a49b393 100644 --- a/vignettes/a1-creating.Rmd +++ b/vignettes/a1-creating.Rmd @@ -506,6 +506,11 @@ The `vcdExtra::Titanicp` data set contains information on 1309 passengers on the including `sibsp`, the number of (0:8) siblings or spouses aboard, and `parch` (0:6), the number of parents or children aboard, but the table is quite sparse. +```{r titanicSAVE, include=FALSE, echo=FALSE} +# Save unmodified Titanicp data for collapse_levels() example +saveDat <- Titanicp +``` + ```{r titanicp1} table(Titanicp$sibsp, Titanicp$parch) ``` @@ -543,6 +548,51 @@ or grouping categories according to their frequency: * `forcats::fct_collapse()`: Collapse factor levels into manually defined groups. * `forcats::fct_recode()`: Change factor levels by hand. +## Collapsing table levels: `collapse_levels()` + +As seen in the previous section, collapsing variables from datasets in frequency +or case form (in R) is not as intuitive/easy as it should be. Thus, the +`collapse_levels()` function was written to easily collapse variables contained +in datasets of *any form*. + +Note that when your object `x` is in frequency form and its frequency column is +not named `"Freq"` (the default), you must supply the column name via +`freq = "your frequency column name"`. + +The use of this function is illustrated below using the same `Titanicp` data. + +***Example*** +First, factorize the `sibsp` and `parch` variables. Then collapse variables into +categories of `0`, `1`, and `2+`. 
+ +```{r titanicLOAD, include=FALSE, echo=FALSE} +# Load unmodified Titanicp data for collapse_levels() example +Titanicp <- saveDat +``` + +```{r collapse_level-ex1} +str(Titanicp) # The original dataset + +Titanicp <- within(Titanicp, { + parchF <- factor(parch) + sibspF <- factor(sibsp) +}) + +Titanicp <- collapse_levels( + Titanicp, + parchF = list( + "2+" = c("2", "3", "4", "5", "6", "9") + ), + sibspF = list( + "2+" = c("2", "3", "4", "5", "8") + ) +) + +table(Titanicp$sibspF, Titanicp$parchF) +``` + +For a more thorough overview of `collapse_levels()`, see +[1a. Steps Toward Tidy Categorical Data Analysis](a1a-convert-collapse.html). ## Converting among frequency tables and data frames @@ -617,6 +667,47 @@ Art.df <- expand.dft(Art.tab) str(Art.df) ``` +### Tidy conversions + +To make conversions more intuitive, `vcdExtra` includes tidy conversion +functions `as_table()`, `as_matrix()`, `as_array()`, `as_freqform()`, and `as_caseform()` to easily convert any form to the form of the function's +namesake (i.e., `as_table()` converts any form into table form). Arguments in +these functions are of similar format, with conversions *from* frequency form +requiring the column containing frequencies to be specified via the `freq` +argument. Likewise, `dims` (dimensions) may be specified, with output summing +over excluded variables. For a more thorough overview of these functions, see +[1a. Steps Toward Tidy Categorical Data Analysis](a1a-convert-collapse.html). + +***Example***: +Convert the `GSStab` in table form to a `data.frame` in frequency form. +```{r, tidyconvert-ex1} +# tidy = TRUE (default) returns a tibble +GSS.ff <- as_freqform(GSStab, tidy = FALSE) +GSS.ff +``` + +***Example***: +Convert the `GSStab` data in frequency form (`GSS.ff`) back to table form. 
+```{r, tidyconvert-ex2} +as_table(GSS.ff, freq = "Freq") # When present, freq column must be specified +``` + +***Example***: Convert the `Arthritis` data in case form to a 3-way table of +`Treatment` $\times$ `Sex` $\times$ `Improved`. +```{r, tidyconvert-ex3} +tidy_Art.tab <- as_table(Arthritis, dims = c("Treatment", "Sex", "Improved")) +str(tidy_Art.tab) + +ftable(tidy_Art.tab) +``` + +***Example***: Convert the `Arthritis` data in table form (`tidy_Art.tab`) back +to a `data.frame` in case form, with factors `Treatment`, `Sex` and `Improved`. +```{r, tidyconvert-ex4} +tidy_Art.df <- as_caseform(tidy_Art.tab, tidy = FALSE) +str(tidy_Art.df) +``` + ## A complex example {#sec:complex} If you've followed so far, you're ready for a more complicated example. diff --git a/vignettes/a1a-convert-collapse.Rmd b/vignettes/a1a-convert-collapse.Rmd new file mode 100644 index 00000000..6bf5019e --- /dev/null +++ b/vignettes/a1a-convert-collapse.Rmd @@ -0,0 +1,428 @@ +--- +title: "1a. Steps Toward Tidy Categorical Data Analysis" +subtitle: "May the Forms Be with You: Novel Functions to Intuitively Convert Among Forms and Collapse Variable Levels Presented Using the `starwars` Data." +author: "Gavin M. Klorfine" +output: rmarkdown::html_vignette +package: vcdExtra +vignette: > + %\VignetteIndexEntry{1a. Steps Toward Tidy Categorical Data Analysis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + message = FALSE, + warning = FALSE, + fig.height = 6, + fig.width = 7, + dev = "png", + comment = "##" +) + +library(vcdExtra) +library(dplyr) +library(tidyr) +``` + +

+ +

+ +# Overview + +While R provides many intuitive facilities for the manipulation of continuous +variables (such as those in the +[`tidyverse`](https://CRAN.R-project.org/package=tidyverse) collection of +packages), it somewhat lacks the equivalent for categorical data. Two such areas +include the collapsing of variable levels (e.g., combining hair +colours of "Brown" and "Black" into a "Dark" category) and the conversion +between forms of categorical data (e.g., from a `table` of entries to a +`data.frame` containing frequencies for each combination of variable levels). + +## Tidy Collapsing + +In R, when trying to collapse levels of a variable in a dataset (e.g., combining +hair colours of "Brown" and "Black" into a "Dark" category), it was often the +case that one would need to first convert amongst forms, "collapse" their data, aggregate the duplicate rows, and finally convert back to the initial form. + +`collapse_levels()` simplifies this process, allowing for the intuitive +collapsing of variable levels for datasets of any form. One just needs to ensure +that an argument of `freq = "the frequency column name"` is supplied when the +inputted dataset is in frequency form. + +Functionality of `collapse_levels()` is demonstrated below +using the `starwars` data from the +[`dplyr`](https://CRAN.R-project.org/package=dplyr) package. This dataset +contains case form data on various characters in the Star Wars franchise. +Variables considered in this vignette are a character's `hair_color`, +`skin_color`, and `eye_color`. Taken as is, this would correspond to an +$11 \times 28 \times 15$ contingency table... Time to collapse! + +Here I load the `starwars` data and select the variables of interest. For +simplicity, I then remove rows containing `NA` values. 
+ +```{r overoll_loadselect} +data("starwars", package = "dplyr") + +star_case <- starwars |> + dplyr::select(c("hair_color", "skin_color", "eye_color")) |> + tidyr::drop_na() + +str(star_case) +``` + +First, taking a look at the levels of variable `hair_color`, there are many +ways one might want to collapse these categories: + +```{r overcoll_hairunique} +unique(star_case$hair_color) +``` + +***Example***: +Likely the most natural of these ways is the following: + +1. Collapse different spellings of `"blond"` (i.e., `"blond"` and `"blonde"` become `Blonde`). +1. Collapse different shades of `"brown"` (i.e., `"brown"` and `"brown, grey"` become `Brown`). +1. Collapse different shades of `"auburn"` (i.e., `"auburn, white"`, `"auburn, grey"`, and `"auburn"` become `Auburn`). +1. Keep `"none"` as-is. +1. Keep `"white"` as-is. +1. Keep `"grey"` as-is. +1. Keep `"black"` as-is. + +Here is how to do this using `collapse_levels()`: + +```{r overcoll_ex1} +collapsed.star_case <- collapse_levels( + star_case, # The dataset + hair_color = list( # Assign the variable to be collapsed to a list + + # Format the list as NewLevel = c("old1", "old2", ..., "oldn") + Blonde = c("blond", "blonde"), + Brown = c("brown", "brown, grey"), + Auburn = c("auburn, white", "auburn, grey", "auburn") + ) +) +str(collapsed.star_case) +unique(collapsed.star_case$hair_color) +``` + +Second, one might also want to collapse levels of variable `skin_color`: + +```{r overcoll_skinunique} +unique(star_case$skin_color) +``` + +***Example***: +I decided to arbitrarily collapse these as follows: + +1. Keep `"none"` as-is. +1. Keep `"unknown"` as-is. +1. `Shades`, comprising all levels that begin with `"white"`, `"grey"`, `"dark"`, `"light"`, and `"fair"`. +1. `Rainbows`, comprising all other levels. + +Note that when working with real data, arbitrary decisions involving the +collapsing of variable levels are a *VERY* bad idea. Collapses should be +grounded in strong, data-driven justification.
Arbitrary collapsing is employed +in this vignette purely for pedagogical and illustrative purposes. + +```{r overcoll_ex2} +collapsed.star_case <- collapse_levels( + collapsed.star_case, + skin_color = list( + Shades = c( + "fair", "white", "light", "dark", "grey", "grey, red", + "grey, blue", "white, blue", "grey, green, yellow", "fair, green, yellow" + ), + Rainbows = c( + "green", "pale", "metal", "brown mottle", "brown", "mottled green", + "orange", "blue, grey", "red", "blue", "yellow", "tan", "silver, red", + "green, grey", "red, blue, white", "brown, white" + ) + ) +) +str(collapsed.star_case) +unique(collapsed.star_case$skin_color) +``` + +Third, one may also want to collapse levels of variable `eye_color`: + +```{r overcoll_eyeunique} +unique(star_case$eye_color) +``` + +***Example***: +Again, I decided to arbitrarily collapse these as follows: + +1. `Normal`, with levels of typical human eye color (i.e., `"blue"`, `"blue-gray"`, `"brown"`, `"hazel"`, and `"dark"`). +1. `Abnormal`, with levels of eye colours that would be abnormal for humans (e.g., `"red"`, `"pink"`, `"gold"`, etc.). +1. Keep `unknown` as-is. + +```{r overcoll_ex3} +collapsed.star_case <- collapse_levels( + collapsed.star_case, + eye_color = list( + Normal = c("blue", "brown", "blue-gray", "hazel", "dark"), + Abnormal = c( + "yellow", "red", "orange", "black", "pink", "red, blue", "gold", + "green, yellow", "white" + ) + ) +) +str(collapsed.star_case) +unique(collapsed.star_case$eye_color) +``` + +In addition, one may want (and is able) to collapse levels of multiple variables +in a single call to `collapse_levels()`. + +***Example***: +To illustrate this (and to provide an easy working example for the following +"Tidy Conversions" section), the `collapsed.star_case` data is arbitrarily +collapsed as follows to correspond to a $3 \times 3 \times 3$ contingency table: + +1. Variable `hair_color`: + a. `Dark` corresponding to levels `"Brown"`, `"black"`, and `"Auburn"`. + b. 
`Light` corresponding to levels `"Blonde"`, `"white"`, and `"grey"`. + c. Keep `"none"` as-is. +1. Variable `skin_color`: + a. `Other` corresponding to levels `"none"` and `"unknown"`. + b. Keep `Rainbows` as-is. + c. Keep `Shades` as-is. +1. Variable `eye_color` kept as-is. + +```{r overcoll_ex4} +collapsed.star_case <- collapse_levels( + collapsed.star_case, + hair_color = list( # First variable + Dark = c("Brown", "black", "Auburn"), + Light = c("Blonde", "white", "grey") + ), + skin_color = list( # Second variable + Other = c("none", "unknown") + ) +) +unique(collapsed.star_case$hair_color) +unique(collapsed.star_case$skin_color) +str(collapsed.star_case) +``` + +## Tidy Conversions + +Until now, converting amongst forms of categorical data in R has been somewhat +onerous. As outlined in +[1. Creating and manipulating frequency tables]( a1-creating.html), +the below table shows the typical process for converting among forms +(`A`, `B`, and `C` represent categorical variables, `X` represents an R data +object): + +| **From this** | | **To this** | | +|:-----------------|:--------------------|:---------------------|-------------------| +| | _Case form_ | _Frequency form_ | _Table form_ | +| _Case form_ | noop | `xtabs(~A+B)` | `table(A,B)` | +| _Frequency form_ | `expand.dft(X)` | noop | `xtabs(count~A+B)`| +| _Table form_ | `expand.dft(X)` | `as.data.frame(X)` | noop | + +Instead, one may simply use `as_table(X)` to convert to table form, +`as_freqform(X)` to convert to frequency form, and `as_caseform(X)` to convert +to case form. These are illustrated in the network (node/edge) diagram below: + +

+ +

+ +Additionally, there are functions `as_array(X)` and `as_matrix(X)` +for converting to those respective types. + +Like `collapse_levels()`, the single thing to keep in mind when employing these functions is the following: +when your object `X` is in frequency form, an argument of +`freq = "your frequency column name"` must be supplied. Besides this, the rote +memory work of having to remember which function to use to convert form X to +form Y is now completely removed. + +Functionality of these "tidy" conversion functions is demonstrated below +using the `collapsed.star_case` data from the most recent example (i.e., the +data corresponding to a $3 \times 3 \times 3$ contingency table). + +***Example***: +Convert the `collapsed.star_case` data into frequency form. Name this data +`star_freqform`. + +```{r overconv-ex1} +star_freqform <- as_freqform(collapsed.star_case) + +str(star_freqform) +``` + +Note that if one would like a data frame instead of a tibble, an argument of +`tidy = FALSE` needs to be provided. Naturally, this `tidy` argument is present +only in functions `as_freqform()` and `as_caseform()`. + +***Example***: +Convert the `collapsed.star_case` data into a data frame in frequency form. + +```{r overconv-ex2} +as_freqform(collapsed.star_case, tidy = FALSE) |> str() +``` + +***Example***: +Convert the frequency form data, `star_freqform`, into table form. Name this +data `star_tab`. Because we are converting *from* frequency form, the +`freq = "frequency column name"` argument must be supplied. + +```{r overconv-ex3} +star_tab <- as_table(star_freqform, freq = "Freq") + +str(star_tab) +``` + +***Example***: +Convert the table form data, `star_tab`, into an array. Name this +data `star_array`. + +```{r overconv-ex4} +star_array <- as_array(star_tab) + +class(star_array) +str(star_array) +``` + +To convert to a matrix, one also needs to specify row and column dimensions.
+This is done using the `dims = c("dim1", "dim2", ..., "dim_n")` argument, which +works by summing over the dimensions excluded from this call. The first provided +dimension is taken as the row dimension, with the second dimension taken as the +column dimension. + +***Example***: +Convert the array form data, `star_array`, into a matrix with dimensions +`"hair_color"` and `"eye_color"`. Name this data `star_mat`. + +```{r overconv-ex5} +star_mat <- as_matrix(star_array, dims = c("hair_color", "eye_color")) + +class(star_mat) +str(star_mat) +``` + +Note that the `dims` argument works the same way for all other tidy conversion +functions. + +***Example***: +Convert the table form data, `star_tab`, into frequency form with dimensions +`"hair_color"` and `"eye_color"`. + +```{r overconv-ex6} +as_freqform(star_tab, dims = c("hair_color", "eye_color")) |> str() +``` + +#### Proportions + +The last piece of these conversion functions is the `prop` argument, allowing +users to convert cells/frequencies to proportions. Calculated proportions may +either be relative to the grand total (`prop = TRUE`) or to one or more margins +(`prop = c("margin1", "margin2", ..., "margin_n")`). + +Note that `as_caseform()` is the only one of the tidy conversion functions that does not +include a `prop` argument. Also, `as_caseform()` will not convert proportional +data.^[This was a deliberate choice, as once proportions are relative to +margins, it becomes unclear how to convert these proportions back to +the original entries.] + +***Example***: +Convert `star_mat` into a table of proportions that are relative to the grand +total. + +```{r propconv-ex1} +star_mat # To view the original + +as_table(star_mat, prop = TRUE) +``` + +***Example***: +Convert `star_mat` into a table of proportions that are relative to the marginal +sums of `hair_color`.
+ +```{r propconv-ex2} +as_table(star_mat, prop = "hair_color") +``` + +***Example***: +Convert `star_mat` into a table of proportions that are relative to the marginal +sums of both `hair_color` and `eye_color`. Since these are the only two +dimensions, cell proportions will all be equal to $1.0$ (except for cells where +no data exists). + +```{r propconv-ex3} +as_table(star_mat, prop = c("hair_color", "eye_color")) +``` + +# Taken Together + +Taking `collapse_levels()` and the tidy conversion functions together, one now +has an intuitive framework for manipulating categorical data. + +***Example***: +The `starwars` data also has a variable named `homeworld`, specifying the planet +that a given character was from. The below code does the following: + +1. Create data `home_star` from dataset `starwars`. The new data includes both `homeworld` and the previous variables of interest (`hair_color`, `eye_color`, and `skin_color`). Missing values are then omitted. +1. Sort `homeworld` alphabetically. +1. Collapse the first half of the sorted `homeworld`s into a level named `abc`. +1. Collapse the second half of the sorted `homeworld`s into a level named `xyz`. +1. Collapse `eye_color` according to the previous `Abnormal`, `Normal`, and `"unknown"` conventions. +1. Convert the collapsed data into a table with dimensions `homeworld` and +`eye_color`. Call this table `tab.home_star` and plot the result in a mosaic +display. +1. Convert `tab.home_star` into a matrix of proportions (relative to the grand total). 
+ +```{r tt-ex1} +home_star <- starwars |> + dplyr::select(c("hair_color", "skin_color", "eye_color", "homeworld")) |> + tidyr::drop_na() + +# Sort unique levels of homeworld +lvls <- home_star$homeworld |> unique() |> sort() +lvls + +# Collapse variable levels +collapsed.home_star <- collapse_levels( + home_star, + homeworld = list( + abc = lvls[1:(length(lvls)/2)], + xyz = lvls[(length(lvls)/2 + 1):length(lvls)] + ), + eye_color = list( + Normal = c("blue", "brown", "blue-gray", "hazel", "dark"), + Abnormal = c( + "yellow", "red", "orange", "black", "pink", "red, blue", "gold", + "green, yellow", "white" + ) + ) +) +# Convert to table of dimensions 'homeworld' and 'eye_color' +tab.home_star <- as_table(collapsed.home_star, dims = c("homeworld", "eye_color")) + +# Plot as mosaic display +mosaic(tab.home_star, shading = TRUE, gp = shading_Friendly) + +# Convert table into matrix of proportions. Note argument 'dims' was not supplied +# as we already know that there are exactly 2 dimensions. +as_matrix(tab.home_star, prop = TRUE) +``` + +Thus, this constitutes a pipeline for working with categorical data: + +1. Gather data and clean it. +1. Collapse levels when substantively necessary. +1. Convert forms, select dimensions, and/or take proportions if necessary. + +```{r ttpipeline, eval=FALSE} +dataset |> # Gather the data + select(...) |> drop_na() |> ... |> # Clean the data + collapse_levels(...) |> # Collapse levels as necessary + as_form(...) # Convert forms, select dimensions, take proportions +``` + +When viewed this way, these functions appear to be the start of a grammar of +categorical data analysis. 
\ No newline at end of file diff --git a/vignettes/fig/convnetwork.png b/vignettes/fig/convnetwork.png new file mode 100644 index 00000000..e64e4a98 Binary files /dev/null and b/vignettes/fig/convnetwork.png differ diff --git a/vignettes/fig/formhex.png b/vignettes/fig/formhex.png new file mode 100644 index 00000000..08005b62 Binary files /dev/null and b/vignettes/fig/formhex.png differ