diff --git a/DESCRIPTION b/DESCRIPTION index 1e3c5417..8dfd57d3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: jsmodule Title: 'RStudio' Addins and 'Shiny' Modules for Medical Research -Version: 2.0.0 -Date: 2025-12-05 +Version: 2.0.1 +Date: 2025-12-09 Authors@R: c( person("Jinseob", "Kim", email = "jinseob2kim@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9403-605X")), person("Zarathu", role = c("cph", "fnd")), diff --git a/NEWS.md b/NEWS.md index 0ef38f18..ad189657 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# jsmodule 2.0.1 +## Update +- Generalized all variable names in AI Assistant prompts to use placeholder patterns (`_var` suffix), making code examples applicable to any dataset instead of being tied to specific example data (e.g., `age`, `sex`, `time`, `status`, `rx`). +- Added `# IMPORTANT: Replace with your actual variable names` comments to all code examples in AI prompts to guide users in adapting generated code to their data. +- Improved AI Assistant prompt clarity by replacing dataset-specific examples with generic placeholders across all statistical analysis sections. + +## Bugfix +- Fixed AI Assistant result handling to properly support mixed result types (tables, plots, and flextables) returned from analysis code. + # jsmodule 2.0.0 ## New Features - Introduced the `jsmodule AI Assistant`, which connects to Anthropic, OpenAI, and Google LLMs so you can generate analyses, scripts, and export-ready visuals without leaving the app. diff --git a/R/aiAssistant.R b/R/aiAssistant.R index 0716acc5..af487fe5 100644 --- a/R/aiAssistant.R +++ b/R/aiAssistant.R @@ -335,7 +335,7 @@ aiAssistantUI <- function(id, show_api_config = TRUE) { shinyWidgets::actionBttn( ns("check_api_key"), "Check", - icon = icon("check"), + icon = icon("key"), style = "material-flat", color = "primary", size = "sm" @@ -674,7 +674,8 @@ aiAssistant <- function(input, output, session, data, data_label, allowed_packages <- c( "jstable", "jskm", "jsmodule", "survival", "ggplot2", "ggpubr", "pROC", "data.table", - "DT", "gridExtra", "GGally" + "DT", "gridExtra", "GGally", "forestploter", + "MatchIt", "timeROC" ) # Reactive values for model list and settings @@ -993,12 +994,13 @@ aiAssistant <- function(input, output, session, data, data_label, style = "btn-default" ) ), + tags$hr(style = "margin-top: 20px; margin-bottom: 15px;"), shinyWidgets::actionBttn( session$ns("apply_config"), "Apply Configuration", - icon = icon("check"), + icon = icon("save"), style = "material-flat", - color = "primary", + color = "success", size = "sm", block = TRUE ) @@ -1136,7 +1138,8 @@ aiAssistant <- function(input, output, session, data, data_label, display_history <- reactiveVal(list()) current_code <- reactiveVal("") execution_result <- reactiveVal(NULL) - result_type <- reactiveVal("none") # "plot", "table", "text", "error", "none", "loading" + result_type <- reactiveVal("none") # "plot", "table", "text", "error", "none", "loading", "mixed" + current_result_index <- reactiveVal(1) # For navigating through all result types # Token tracking token_usage <- reactiveVal(list( @@ -1151,19 +1154,108 @@ aiAssistant <- function(input, output, session, data, data_label, determine_result_type <- function(res, store_result = TRUE) { result_info <- list() - # Single plot - if (inherits(res, c("ggplot", "gg", "gtable", "grob", "recordedplot"))) { + # Check for NULL or empty result + if (is.null(res)) { + result_info$type <- "unknown" + result_info$value <- NULL + result_info$message <- "No result was generated. The code executed but did not return a value." + + if (store_result) { + execution_result(NULL) + result_type("unknown") + } + return(result_info) + } + + # Single plot (comprehensive plot class support) + # - ggplot/gg: ggplot2, ggpubr, jskm, pROC::ggroc + # - ggmatrix: GGally::ggpairs + # - gtable/gTree/grob: grid graphics, gridExtra::grid.arrange/arrangeGrob, forestploter::forest + # - recordedplot: base R plot() + if (inherits(res, c("ggplot", "gg", "gtable", "gTree", "grob", "recordedplot", "ggmatrix", "forestplot"))) { result_info$type <- "plot" result_info$value <- list(res) result_info$message <- "Plot generated successfully. The plot is now displayed in the Results panel." # Multiple plots } else if (is.list(res) && length(res) > 0 && - all(sapply(res, function(x) inherits(x, c("ggplot", "gg", "gtable", "grob", "recordedplot"))))) { + all(sapply(res, function(x) inherits(x, c("ggplot", "gg", "gtable", "gTree", "grob", "recordedplot", "ggmatrix", "forestplot"))))) { result_info$type <- "plot" result_info$value <- res result_info$message <- sprintf("Generated %d plots successfully. All plots are displayed in the Results panel.", length(res)) + # Multiple tables (list of data frames or matrices) + } else if (is.list(res) && length(res) > 0 && + all(sapply(res, function(x) is.data.frame(x) || is.matrix(x)))) { + result_info$type <- "table" + result_info$value <- res + result_info$message <- sprintf("Generated %d tables successfully. All tables are displayed in the Results panel.", length(res)) + + # Mixed plots and tables + } else if (is.list(res) && length(res) > 0) { + # Separate plots, tables, and other results + plots <- list() + tables <- list() + others <- list() + + for (item in res) { + if (inherits(item, c("ggplot", "gg", "gtable", "gTree", "grob", "recordedplot", "ggmatrix", "forestplot"))) { + plots <- c(plots, list(item)) + } else if (is.data.frame(item) || is.matrix(item)) { + tables <- c(tables, list(item)) + } else { + # Store other types (summary, text, etc.) + others <- c(others, list(item)) + } + } + + # Count total results + total_count <- length(plots) + length(tables) + length(others) + has_multiple_types <- sum(c(length(plots) > 0, length(tables) > 0, length(others) > 0)) > 1 + + # If we have multiple types, use mixed + if (has_multiple_types) { + result_info$type <- "mixed" + + # Combine all results into a single list with type information + combined_results <- list() + for (plot in plots) { + combined_results <- c(combined_results, list(list(type = "plot", value = plot))) + } + for (table in tables) { + combined_results <- c(combined_results, list(list(type = "table", value = table))) + } + for (other in others) { + combined_results <- c(combined_results, list(list(type = "text", value = other))) + } + + result_info$value <- combined_results + + # Build message + msg_parts <- c() + if (length(plots) > 0) msg_parts <- c(msg_parts, sprintf("%d plot%s", length(plots), if(length(plots) > 1) "s" else "")) + if (length(tables) > 0) msg_parts <- c(msg_parts, sprintf("%d table%s", length(tables), if(length(tables) > 1) "s" else "")) + if (length(others) > 0) msg_parts <- c(msg_parts, sprintf("%d text result%s", length(others), if(length(others) > 1) "s" else "")) + + result_info$message <- sprintf("Generated %s successfully. All results are displayed in the Results panel.", + paste(msg_parts, collapse = ", ")) + } else if (length(plots) > 0) { + # Only plots found + result_info$type <- "plot" + result_info$value <- plots + result_info$message <- sprintf("Generated %d plots successfully. All plots are displayed in the Results panel.", length(plots)) + } else if (length(tables) > 0) { + # Only tables found + result_info$type <- "table" + result_info$value <- tables + result_info$message <- sprintf("Generated %d tables successfully. All tables are displayed in the Results panel.", length(tables)) + } else { + # Only other types (text, summary, etc.) + result_info$type <- "text" + result_info$value <- others + result_info$message <- sprintf("Generated %d text result%s successfully.", length(others), if(length(others) > 1) "s" else "") + } + # Flextable } else if (inherits(res, "flextable")) { result_info$type <- "flextable" @@ -1173,14 +1265,14 @@ aiAssistant <- function(input, output, session, data, data_label, # Table object (from table() function) } else if (inherits(res, "table")) { result_info$type <- "table" - result_info$value <- as.data.frame(res) + result_info$value <- list(as.data.frame(res)) result_info$message <- sprintf("Table generated: %d rows × %d columns", nrow(as.data.frame(res)), ncol(as.data.frame(res))) # Data frame or matrix } else if (is.data.frame(res) || is.matrix(res)) { result_info$type <- "table" - result_info$value <- res + result_info$value <- list(res) result_info$message <- sprintf( "Table generated: %d rows × %d columns\nFirst few rows:\n%s", nrow(res), ncol(res), @@ -1190,15 +1282,39 @@ aiAssistant <- function(input, output, session, data, data_label, # List with $table element (e.g., CreateTableOneJS result) } else if (is.list(res) && !is.null(res$table)) { result_info$type <- "table" - result_info$value <- res$table + result_info$value <- list(res$table) result_info$message <- sprintf("Table extracted from list result: %d rows × %d columns", nrow(res$table), ncol(res$table)) - # Everything else as text + # Unrecognized type - mark as unknown + } else if (is.list(res) && length(res) == 0) { + result_info$type <- "unknown" + result_info$value <- NULL + result_info$message <- paste0( + "Empty list result. Object class: ", paste(class(res), collapse = ", "), + "\nThe code executed but produced an empty result." + ) + + # Everything else - try to display as text but mark potential issues } else { - result_info$type <- "text" - result_info$value <- res - result_info$message <- paste(capture.output(print(res)), collapse = "\n") + # Check if it's a recognizable object that we should warn about + obj_class <- paste(class(res), collapse = ", ") + + # Common plot types that might not be captured + if (any(grepl("plot|graph|chart|figure", obj_class, ignore.case = TRUE))) { + result_info$type <- "unknown" + result_info$value <- NULL + result_info$message <- paste0( + "Result type not fully supported: ", obj_class, + "\nThis appears to be a plot but was not recognized. ", + "The plot may have been displayed but cannot be saved or exported." + ) + } else { + # Try to display as text + result_info$type <- "text" + result_info$value <- res + result_info$message <- paste(capture.output(print(res)), collapse = "\n") + } } # Store in reactive values if requested @@ -1258,7 +1374,7 @@ aiAssistant <- function(input, output, session, data, data_label, "Sys\\.setenv", "Sys\\.unsetenv", # Environment modification "unlink", "file\\.remove", "file\\.create", "dir\\.create", # File operations "setwd", "source", # Directory/source changes - "options", "par", # Global settings (partially allowed for plots) + "options", # Global settings "install\\.packages", "remove\\.packages", # Package management "eval\\(", "evalq\\(", # Nested eval "assign\\(.+envir\\s*=\\s*\\.GlobalEnv", # Global env assignment @@ -1267,10 +1383,6 @@ aiAssistant <- function(input, output, session, data, data_label, for (pattern in dangerous_patterns) { if (grepl(pattern, code, perl = TRUE)) { - # Allow par() for plot settings - if (pattern == "par" && grepl("par\\(", code)) { - next - } return(list( safe = FALSE, reason = sprintf("Blocked pattern detected: %s", pattern) @@ -1371,6 +1483,13 @@ aiAssistant <- function(input, output, session, data, data_label, ) } + # Add allowed packages information + context <- paste0(context, + "\n## Allowed R Packages\n", + "Only these packages can be loaded with library():\n", + "- ", paste(allowed_packages, collapse = ", "), "\n" + ) + return(context) }) @@ -1610,7 +1729,7 @@ aiAssistant <- function(input, output, session, data, data_label, return(list( success = FALSE, - message = "Failed to parse Google API response" + message = "Failed to parse Google API response. Please retry. If the issue persists, try a different model or simplify your question." )) } else { @@ -2024,6 +2143,35 @@ Please help me fix this error.", } }) + # Handle "Ask AI to Fix" for no result (unknown type) + observeEvent(input$fix_no_result, { + code <- current_code() + + # Build message for AI + no_result_msg <- sprintf( +"I ran this code but it didn't produce any result: + +```r +%s +``` + +The code executed without errors, but no result was displayed in the Results panel. This might be because: +- The result was not properly assigned to the 'result' variable +- An unsupported object type was returned +- The code only performed side effects without returning a value +- A function returned NULL or an empty result + +Please fix the code to ensure it returns a proper result that can be displayed and exported.", + code + ) + + # Insert into chat input + updateTextAreaInput(session, "user_input", value = no_result_msg) + + # Scroll to chat input + shinyjs::runjs(sprintf("document.getElementById('%s').scrollIntoView({behavior: 'smooth', block: 'center'});", session$ns("user_input"))) + }) + # Save chat history observeEvent(input$save_chat, { history <- full_chat_history() # Use full history, not limited chat_history @@ -2079,6 +2227,7 @@ Please help me fix this error.", # Clear previous results and show loading state execution_result(NULL) result_type("loading") + current_result_index(1) # Reset result index for new results # Prepare environment with data and restricted helpers env <- create_execution_env() @@ -2135,15 +2284,33 @@ Please help me fix this error.", # Render plot output$result_plot <- renderPlot({ - plots <- execution_result() + result <- execution_result() rtype <- result_type() - req(rtype == "plot") + req(rtype %in% c("plot", "mixed")) - n <- length(plots) - if (n == 1) { - print(plots[[1]]) + idx <- current_result_index() + + # Extract plot based on type + if (rtype == "mixed") { + # Get current item from mixed results + req(idx <= length(result)) + current_item <- result[[idx]] + req(current_item$type == "plot") + plot_obj <- current_item$value } else { - gridExtra::grid.arrange(grobs = plots, ncol = min(n, 2)) + # Regular plot type + plots <- result + n <- length(plots) + if (idx < 1) idx <- 1 + if (idx > n) idx <- n + plot_obj <- plots[[idx]] + } + + # Use grid.draw for gtable/gTree objects, print for others + if (inherits(plot_obj, c("gtable", "gTree")) && !inherits(plot_obj, "gg")) { + grid::grid.draw(plot_obj) + } else { + print(plot_obj) } }, height = 400) @@ -2151,8 +2318,27 @@ Please help me fix this error.", output$result_table <- renderDT({ result <- execution_result() rtype <- result_type() - req(rtype == "table") - DT::datatable(as.data.frame(result), rownames = TRUE, + req(rtype %in% c("table", "mixed")) + + idx <- current_result_index() + + # Extract table based on type + if (rtype == "mixed") { + # Get current item from mixed results + req(idx <= length(result)) + current_item <- result[[idx]] + req(current_item$type == "table") + table_obj <- current_item$value + } else { + # Regular table type + tables <- result + n <- length(tables) + if (idx < 1) idx <- 1 + if (idx > n) idx <- n + table_obj <- tables[[idx]] + } + + DT::datatable(as.data.frame(table_obj), rownames = TRUE, options = list(scrollX = TRUE, pageLength = 10)) }) @@ -2229,8 +2415,95 @@ Please help me fix this error.", )) } + # Handle unknown/unrecognized result types + if (rtype == "unknown") { + return(tags$div( + class = "alert alert-warning", + style = "background-color: #fff3cd; border: 1px solid #ffc107; border-radius: 4px; padding: 15px; margin-bottom: 15px;", + tags$div( + style = "display: flex; align-items: center; margin-bottom: 10px;", + icon("exclamation-triangle", style = "color: #856404; font-size: 24px; margin-right: 10px;"), + tags$strong("No Result Generated", style = "color: #856404; font-size: 16px;") + ), + tags$p( + style = "margin-bottom: 10px; color: #856404;", + "The code executed but did not produce a recognizable result." + ), + tags$details( + tags$summary( + style = "cursor: pointer; font-weight: bold; color: #856404; margin-bottom: 10px;", + icon("info-circle"), " Why did this happen? (click to expand)" + ), + tags$div( + style = "margin-top: 10px;", + tags$p("This might happen when:"), + tags$ul( + tags$li("The result was not properly assigned to the ", tags$code("result"), " variable"), + tags$li("An unsupported object type was returned"), + tags$li("The code only performed side effects without returning a value"), + tags$li("A function returned ", tags$code("NULL"), " or an empty result") + ) + ) + ), + tags$div( + style = "margin-top: 15px; padding-top: 15px; border-top: 1px solid #ffc107;", + tags$strong("Suggestion:"), + tags$p( + style = "margin-top: 5px;", + "Click the button below to ask AI to fix the code automatically." + ) + ), + shinyWidgets::actionBttn( + session$ns("fix_no_result"), + "Ask AI to Fix", + icon = icon("robot"), + style = "material-flat", + color = "warning", + size = "sm" + ) + )) + } + if (rtype == "plot") { - return(plotOutput(session$ns("result_plot"), height = "400px")) + plots <- execution_result() + n <- length(plots) + idx <- current_result_index() + + # Ensure index is valid + if (idx < 1 || idx > n) { + current_result_index(1) + idx <- 1 + } + + return(tagList( + # Navigation controls (only show if multiple plots) + if (n > 1) { + div( + style = "display: flex; align-items: center; justify-content: space-between; margin-bottom: 10px; padding: 10px; background: #f8f9fa; border-radius: 4px;", + div( + style = "display: flex; gap: 10px;", + actionButton( + session$ns("result_prev"), + icon("arrow-left"), + class = "btn-sm btn-secondary", + style = if (idx == 1) "opacity: 0.5; cursor: not-allowed;" else "" + ), + actionButton( + session$ns("result_next"), + icon("arrow-right"), + class = "btn-sm btn-secondary", + style = if (idx == n) "opacity: 0.5; cursor: not-allowed;" else "" + ) + ), + tags$span( + style = "font-weight: bold; color: #495057;", + sprintf("%d of %d", idx, n) + ) + ) + }, + # Plot output + plotOutput(session$ns("result_plot"), height = "400px") + )) } if (rtype == "flextable") { @@ -2238,7 +2511,104 @@ Please help me fix this error.", } if (rtype == "table") { - return(DTOutput(session$ns("result_table"))) + tables <- execution_result() + n <- length(tables) + idx <- current_result_index() + + # Ensure index is valid + if (idx < 1 || idx > n) { + current_result_index(1) + idx <- 1 + } + + return(tagList( + # Navigation controls (only show if multiple tables) + if (n > 1) { + div( + style = "display: flex; align-items: center; justify-content: space-between; margin-bottom: 10px; padding: 10px; background: #f8f9fa; border-radius: 4px;", + div( + style = "display: flex; gap: 10px;", + actionButton( + session$ns("result_prev"), + icon("arrow-left"), + class = "btn-sm btn-secondary", + style = if (idx == 1) "opacity: 0.5; cursor: not-allowed;" else "" + ), + actionButton( + session$ns("result_next"), + icon("arrow-right"), + class = "btn-sm btn-secondary", + style = if (idx == n) "opacity: 0.5; cursor: not-allowed;" else "" + ) + ), + tags$span( + style = "font-weight: bold; color: #495057;", + sprintf("%d of %d", idx, n) + ) + ) + }, + # Table output + DTOutput(session$ns("result_table")) + )) + } + + if (rtype == "mixed") { + results <- execution_result() + n <- length(results) + idx <- current_result_index() + + # Ensure index is valid + if (idx < 1 || idx > n) { + current_result_index(1) + idx <- 1 + } + + # Get current result item + current_item <- results[[idx]] + current_type <- current_item$type + + return(tagList( + # Unified navigation controls + if (n > 1) { + div( + style = "display: flex; align-items: center; justify-content: space-between; margin-bottom: 10px; padding: 10px; background: #f8f9fa; border-radius: 4px;", + div( + style = "display: flex; gap: 10px;", + actionButton( + session$ns("result_prev"), + icon("arrow-left"), + class = "btn-sm btn-secondary", + style = if (idx == 1) "opacity: 0.5; cursor: not-allowed;" else "" + ), + actionButton( + session$ns("result_next"), + icon("arrow-right"), + class = "btn-sm btn-secondary", + style = if (idx == n) "opacity: 0.5; cursor: not-allowed;" else "" + ) + ), + tags$span( + style = "font-weight: bold; color: #495057;", + sprintf("%d of %d", idx, n) + ) + ) + }, + # Render based on current item type + if (current_type == "plot") { + plotOutput(session$ns("result_plot"), height = "400px") + } else if (current_type == "table") { + DTOutput(session$ns("result_table")) + } else { + # Text result + tags$div( + style = "max-height: 400px; overflow-y: auto;", + tags$pre( + style = "font-size: 11px; white-space: pre-wrap;", + paste(capture.output(print(current_item$value)), collapse = "\n") + ) + ) + } + )) } # text @@ -2251,21 +2621,21 @@ Please help me fix this error.", # Download UI elements output$download_pptx_ui <- renderUI({ rtype <- result_type() - if (rtype == "plot") { + if (rtype %in% c("plot", "mixed")) { downloadButton(session$ns("download_pptx"), "PPT", icon = icon("file-powerpoint"), class = "btn-info btn-sm") } }) output$download_word_ui <- renderUI({ rtype <- result_type() - if (rtype == "table" || rtype == "flextable") { + if (rtype %in% c("table", "flextable", "mixed")) { downloadButton(session$ns("download_word"), "Word", icon = icon("file-word"), class = "btn-info btn-sm") } }) output$download_excel_ui <- renderUI({ rtype <- result_type() - if (rtype == "table") { + if (rtype %in% c("table", "mixed")) { downloadButton(session$ns("download_excel"), "Excel", icon = icon("file-excel"), class = "btn-info btn-sm") } }) @@ -2306,11 +2676,64 @@ Please help me fix this error.", tooltips = TRUE ) ) + ), + div( + style = "margin-top: 10px;", + actionButton( + session$ns("reset_ppt_size"), + "Reset", + icon = icon("undo"), + class = "btn-secondary btn-sm" + ) ) ) } }) + # Reset PPT size to default values + observeEvent(input$reset_ppt_size, { + shinyWidgets::updateNoUiSliderInput( + session = session, + inputId = "ppt_width", + value = 10 + ) + shinyWidgets::updateNoUiSliderInput( + session = session, + inputId = "ppt_height", + value = 7.5 + ) + }) + + # Unified navigation: Previous + observeEvent(input$result_prev, { + result <- execution_result() + rtype <- result_type() + req(rtype %in% c("plot", "table", "mixed")) + + # Get total count based on type + n <- if (rtype == "mixed") length(result) else length(result) + idx <- current_result_index() + + if (idx > 1) { + current_result_index(idx - 1) + } + }) + + # Unified navigation: Next + observeEvent(input$result_next, { + result <- execution_result() + rtype <- result_type() + req(rtype %in% c("plot", "table", "mixed")) + + # Get total count based on type + n <- if (rtype == "mixed") length(result) else length(result) + idx <- current_result_index() + + if (idx < n) { + current_result_index(idx + 1) + } + }) + # Download handlers output$download_pptx <- downloadHandler( filename = function() { @@ -2333,23 +2756,40 @@ Please help me fix this error.", tryCatch({ # Handle different result types - if (rtype == "plot") { - # Single or multiple plots - plots <- if (is.list(result) && !inherits(result, c("gg", "ggplot"))) { - result + if (rtype == "plot" || rtype == "mixed") { + # Extract plots (for mixed, only use plots) + if (rtype == "mixed") { + # Extract only plot items from mixed results + plots <- lapply(result, function(item) { + if (item$type == "plot") item$value else NULL + }) + plots <- Filter(Negate(is.null), plots) + } else if (is.list(result) && !inherits(result, c("gg", "ggplot"))) { + plots <- result } else { - list(result) + plots <- list(result) } for (i in seq_along(plots)) { doc <- officer::add_slide(doc, layout = "Title and Content", master = "Office Theme") - doc <- officer::ph_with(doc, rvg::dml(code = print(plots[[i]])), - location = officer::ph_location(width = w, height = h, left = 0, top = 0.5)) + + # Handle different plot types + plot_obj <- plots[[i]] + if (inherits(plot_obj, c("gtable", "gTree")) && !inherits(plot_obj, "gg")) { + # Use grid.draw for gtable/gTree objects + doc <- officer::ph_with(doc, rvg::dml(code = grid::grid.draw(plot_obj)), + location = officer::ph_location(width = w, height = h, left = 0, top = 0.5)) + } else { + # Use print for ggplot and other objects + doc <- officer::ph_with(doc, rvg::dml(code = print(plot_obj)), + location = officer::ph_location(width = w, height = h, left = 0, top = 0.5)) + } + incProgress(0.6 / length(plots), detail = paste("Adding plot", i)) } } else if (rtype == "table") { - # Table result + # Table result (should not reach here for PPT, but kept for safety) df <- as.data.frame(result) ft <- flextable::flextable(df) ft <- flextable::autofit(ft) @@ -2393,36 +2833,36 @@ Please help me fix this error.", doc <- officer::read_docx() incProgress(0.1, detail = "Creating document...") - if (rtype == "plot") { - # Word에 그래프 추가 - plots <- if (is.list(result) && !inherits(result, c("gg", "ggplot"))) { - result + if (rtype == "flextable") { + # Word에 flextable 추가 (이미 포맷팅된 테이블) + doc <- officer::body_add_par(doc, "Analysis Results", style = "heading 2") + doc <- flextable::body_add_flextable(doc, result) + incProgress(0.7, detail = "Adding table") + } else if (rtype == "table" || rtype == "mixed") { + # Word에 테이블 추가 (여러 테이블 지원) + # Extract tables (for mixed, only use tables) + if (rtype == "mixed") { + # Extract only table items from mixed results + tables <- lapply(result, function(item) { + if (item$type == "table") item$value else NULL + }) + tables <- Filter(Negate(is.null), tables) } else { - list(result) + tables <- result } - for (i in seq_along(plots)) { + for (i in seq_along(tables)) { if (i > 1) { doc <- officer::body_add_break(doc) } - doc <- officer::body_add_par(doc, paste("Plot", i), style = "heading 2") - doc <- officer::body_add_gg(doc, plots[[i]], width = 6, height = 4) - incProgress(0.6 / length(plots), detail = paste("Adding plot", i)) - } - } else if (rtype == "flextable") { - # Word에 flextable 추가 (이미 포맷팅된 테이블) - doc <- officer::body_add_par(doc, "Analysis Results", style = "heading 2") - doc <- flextable::body_add_flextable(doc, result) - incProgress(0.7, detail = "Adding table") - } else if (rtype == "table") { - # Word에 테이블 추가 - df <- as.data.frame(result) - ft <- flextable::flextable(df) - ft <- flextable::autofit(ft) + df <- as.data.frame(tables[[i]]) + ft <- flextable::flextable(df) + ft <- flextable::autofit(ft) - doc <- officer::body_add_par(doc, "Analysis Results", style = "heading 2") - doc <- flextable::body_add_flextable(doc, ft) - incProgress(0.7, detail = "Adding table") + doc <- officer::body_add_par(doc, paste("Table", i), style = "heading 2") + doc <- flextable::body_add_flextable(doc, ft) + incProgress(0.6 / length(tables), detail = paste("Adding table", i)) + } } incProgress(0.2, detail = "Saving file...") @@ -2473,17 +2913,32 @@ Please help me fix this error.", duration = 10 ) return() - } else if (rtype == "table") { - # Excel에 테이블 추가 - df <- as.data.frame(result) + } else if (rtype == "table" || rtype == "mixed") { + # Excel에 테이블 추가 (여러 테이블은 각각 시트로) + # Extract tables (for mixed, only use tables) + if (rtype == "mixed") { + # Extract only table items from mixed results + tables <- lapply(result, function(item) { + if (item$type == "table") item$value else NULL + }) + tables <- Filter(Negate(is.null), tables) + } else { + tables <- result + } # 워크북 생성 wb <- openxlsx::createWorkbook() - # 결과 시트 추가 - openxlsx::addWorksheet(wb, "Results") - openxlsx::writeData(wb, "Results", df, rowNames = FALSE) - incProgress(0.7, detail = "Adding table") + # 각 테이블을 별도 시트로 추가 + for (i in seq_along(tables)) { + sheet_name <- if (length(tables) > 1) paste0("Table_", i) else "Results" + df <- as.data.frame(tables[[i]]) + + openxlsx::addWorksheet(wb, sheet_name) + openxlsx::writeData(wb, sheet_name, df, rowNames = FALSE) + + incProgress(0.6 / length(tables), detail = paste("Adding table", i)) + } incProgress(0.2, detail = "Saving file...") openxlsx::saveWorkbook(wb, file, overwrite = TRUE) diff --git a/inst/prompts/default.md b/inst/prompts/default.md index 580aa7a9..5afcbf68 100644 --- a/inst/prompts/default.md +++ b/inst/prompts/default.md @@ -4,6 +4,45 @@ You are an R/Shiny medical statistics expert specializing in the jsmodule packag --- +## ⚠️ CRITICAL: Code Execution Rules + +**AI Assistant cannot see code execution results or errors**. All code must be: +1. ✅ **Self-contained**: Run without modifications or user input +2. ✅ **Error-proof**: Include validation checks and safe defaults +3. ✅ **Variable-aware**: Clearly indicate which variables need replacement +4. ✅ **Fail-gracefully**: Use `if()` checks before operations + +### Code Template Pattern +```r +# IMPORTANT: Replace these variable names with your actual data +# - 'outcome' → your outcome variable name +# - 'treatment' → your treatment variable name + +# STEP 1: Validate variables exist +if(!"outcome" %in% names(out)) { + stop("Variable 'outcome' not found. Check variable name.") +} + +# STEP 2: Check data requirements +if(nrow(out) < 20) { + stop("Sample size too small (n < 20). Need more data.") +} + +# STEP 3: Perform analysis +# ... your analysis code ... + +# STEP 4: Return result +result <- ... # Store final output in 'result' variable +``` + +### Key Constraints +- **No iterative fixing**: User cannot re-run with corrections +- **Variable names**: Use clear placeholders (e.g., `YOUR_VAR_NAME`) +- **jstable outputs**: Returns **character strings**, not numeric values +- **Subsetting**: Always use safe form: `out[out$var == "value", ]` + +--- + ## 1. Response Style & Communication ### Language Matching @@ -19,11 +58,12 @@ You are an R/Shiny medical statistics expert specializing in the jsmodule packag - Example: ```r # ✅ Good: Brief and meaningful - fit <- coxph(Surv(time, status) ~ age + rx, data = out, model = TRUE) + # IMPORTANT: Replace variable names with your actual data + fit <- coxph(Surv(time_var, status_var) ~ age_var + treatment_var, data = out, model = TRUE) # ❌ Avoid: Obvious comments # Create a Cox proportional hazards model using coxph function - fit <- coxph(Surv(time, status) ~ age + rx, data = out, model = TRUE) + fit <- coxph(Surv(time_var, status_var) ~ age_var + treatment_var, data = out, model = TRUE) ``` ### Explanation Style @@ -52,25 +92,63 @@ You are an R/Shiny medical statistics expert specializing in the jsmodule packag ## 2. Error Handling & Troubleshooting -### Proactive Error Prevention -**Always validate data before analysis:** +### ⚠️ CRITICAL: Status Variable Conversion + +**MOST COMMON ERROR**: Wrong status conversion method in survival analysis. + +**ALWAYS convert status BEFORE any survival analysis:** + +```r +# ❌ WRONG: as.numeric(factor) returns level numbers (1, 2), NOT actual values (0, 1)! +out$status_num <- as.numeric(out$status) # This is WRONG! + +# ✅ CORRECT: Convert factor to character first, then to integer +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Then use status_num in ALL Surv() calls +# IMPORTANT: Replace variable names with your actual data +fit <- coxph(Surv(time_var, status_num) ~ age_var + treatment_var, data = out, model = TRUE) +``` + +**Why this matters**: Using `as.numeric()` on factors returns level indices (1, 2) instead of actual values (0, 1), causing incorrect survival analysis results. + +### Mandatory Pre-Analysis Validation + +**EVERY analysis MUST include these checks:** ```r -# Check data structure +# 1. Variable existence check +# IMPORTANT: Replace with your actual variable names +required_vars <- c("time_var", "status_var", "treatment_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# 2. Sample size validation if(nrow(out) < 30) { - warning("Sample size very small (n < 30). Results may be unreliable.") + warning("Small sample size (n < 30). Results may be unreliable.") +} + +# 3. Factor level check (for subgroup analysis) +if(is.factor(out$group_var) && length(levels(out$group_var)) < 2) { + stop("Subgroup variable must have at least 2 levels.") +} + +# 4. Event count check (for survival analysis) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) +n_events <- sum(out$status_num == 1, na.rm = TRUE) +if(n_events < 10) { + warning("Very few events (< 10). Cox model may be unstable.") } -# Check missing values +# 5. Missing data check missing_pct <- sum(is.na(out$outcome)) / nrow(out) if(missing_pct > 0.3) { warning("Over 30% missing data in outcome variable. Consider imputation or sensitivity analysis.") } - -# Check factor levels for grouping -if(length(unique(out$group)) < 2) { - stop("Grouping variable has less than 2 levels. Check data.") -} ``` ### Common Errors & Solutions @@ -79,17 +157,21 @@ if(length(unique(out$group)) < 2) { **Error: "status must be numeric"** ```r # ❌ Problem: Status is factor -fit <- survfit(Surv(time, status) ~ rx, data = out) +# IMPORTANT: Replace variable names with your actual data +fit <- survfit(Surv(time_var, status_var) ~ treatment_var, data = out) -# ✅ Solution: Convert factor to numeric -fit <- survfit(Surv(time, as.integer(as.character(status))) ~ rx, data = out) +# ✅ Solution: Pre-convert factor to numeric (REQUIRED) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) +fit <- survfit(Surv(time_var, status_num) ~ treatment_var, data = out) ``` **Error: "Time must be positive"** ```r # Check time variable -summary(out$time) # Look for zeros or negatives -out <- out[out$time > 0, ] # Remove invalid times +# IMPORTANT: Replace 'time_var' with your actual time variable name +summary(out$time_var) # Look for zeros or negatives +out <- out[out$time_var > 0, ] # Remove invalid times ``` #### 2. Model Fitting Errors @@ -98,10 +180,12 @@ out <- out[out$time > 0, ] # Remove invalid times - **Solution**: Check correlation matrix, remove redundant variables ```r # Check correlations -cor(out[, c("age", "bmi", "weight")]) # If >0.9, remove one +# IMPORTANT: Replace with your actual continuous variable names +cor(out[, c("age_var", "bmi_var", "weight_var")]) # If >0.9, remove one # Check VIF -car::vif(lm(outcome ~ age + bmi + weight, data = out)) # VIF > 10 indicates problem +# IMPORTANT: Replace variable names with your actual data +car::vif(lm(outcome_var ~ age_var + bmi_var + weight_var, data = out)) # VIF > 10 indicates problem ``` **Error: "glm.fit: fitted probabilities numerically 0 or 1"** @@ -120,13 +204,14 @@ fit <- logistf(outcome ~ predictor, data = out) **Always convert explicitly:** ```r # Factor to numeric -out$status_num <- as.integer(as.character(out$status)) +# IMPORTANT: Replace with your actual variable names +out$status_num <- as.integer(as.character(out$status_var)) # Character to factor -out$group <- factor(out$group) +out$group_var <- factor(out$group_var) # Date formatting -out$date <- as.Date(out$date, format = "%Y-%m-%d") +out$date_var <- as.Date(out$date_var, format = "%Y-%m-%d") ``` ### Error Response Format @@ -142,7 +227,8 @@ When code fails or might fail: ```r # Convert factor status to numeric -fit <- survfit(Surv(time, as.integer(as.character(status))) ~ rx, data = out) +# IMPORTANT: Replace variable names with your actual data +fit <- survfit(Surv(time_var, as.integer(as.character(status_var))) ~ treatment_var, data = out) result <- jskm(fit, data = out, table = TRUE, pval = TRUE) ``` @@ -168,8 +254,8 @@ result <- jskm(fit, data = out, table = TRUE, pval = TRUE) ``` 시간-사건 데이터 분석에는 두 가지 접근이 가능합니다: -1. **Cox regression** - 시간과 사건을 함께 분석 (Surv(time, status)) -2. **Logistic regression** - 사건 발생 여부만 분석 (status만 사용) +1. **Cox regression** - 시간과 사건을 함께 분석 (Surv(time_var, status_var)) +2. **Logistic regression** - 사건 발생 여부만 분석 (status_var만 사용) 어느 방법을 원하시나요? 또는 연구 목적을 알려주시면 추천해드리겠습니다. ``` @@ -190,7 +276,7 @@ Outcome 변수가 심하게 치우쳐 있습니다 (skewed). **Grouping variable:** - "Which variable should be used for group comparison?" -- "rx 변수로 그룹을 나눌까요?" +- "어떤 변수로 그룹을 나눌까요?" **Outcome vs. Predictor:** - "What is the outcome variable?" (dependent) @@ -257,14 +343,15 @@ AI: "앞서 생성한 age_group 변수를 사용해서 로지스틱 회귀분석 **Variable naming:** ```r # First analysis -out$age_group <- cut(out$age, breaks = c(0, 60, Inf)) +# IMPORTANT: Replace with your actual variable names +out$age_group <- cut(out$age_var, breaks = c(0, 60, Inf)) # Later analysis - reuse same variable name -fit <- glm(outcome ~ age_group + sex, data = out, family = binomial) +fit <- glm(outcome_var ~ age_group + sex_var, data = out, family = binomial) ``` **Grouping variable:** -- If user specified `rx` as grouping variable once, continue using it unless told otherwise +- If user specified a grouping variable once, continue using it unless told otherwise **Analysis theme:** - If conversation is about survival analysis, default to survival-related interpretations @@ -279,11 +366,12 @@ fit <- glm(outcome ~ age_group + sex, data = out, family = binomial) **Avoid redundancy:** ```r # ❌ Don't recreate if already done -out$age_group <- cut(out$age, breaks = c(0, 60, Inf)) +# IMPORTANT: Replace with your actual variable names +out$age_group <- cut(out$age_var, breaks = c(0, 60, Inf)) # ✅ Just reference it # Using age_group created earlier -fit <- coxph(Surv(time, status) ~ age_group + sex, data = out) +fit <- coxph(Surv(time_var, status_var) ~ age_group + sex_var, data = out) ``` ### Progressive Enhancement @@ -315,7 +403,8 @@ AI: [Enhanced plot with previous parameters + new modifications] #### Linear Regression ```r # Fit model -fit <- lm(outcome ~ age + sex + rx, data = out) +# IMPORTANT: Replace variable names with your actual data +fit <- lm(outcome_var ~ age_var + sex_var + treatment_var, data = out) result <- glmshow.display(fit, decimal = 2) # Check assumptions @@ -327,11 +416,13 @@ plot(fit) # Residual plots #### Logistic Regression ```r # Fit model -fit <- glm(outcome ~ age + sex + rx, data = out, family = binomial) +# IMPORTANT: Replace variable names with your actual data +fit <- glm(outcome_var ~ age_var + sex_var + treatment_var, data = out, family = binomial) result <- glmshow.display(fit, decimal = 2) # Check for complete separation -table(out$outcome, out$rx) # Check for 0 cells +# IMPORTANT: Replace variable names +table(out$outcome_var, out$treatment_var) # Check for 0 cells # Hosmer-Lemeshow goodness of fit (if needed) library(ResourceSelection) @@ -341,8 +432,11 @@ hoslem.test(fit$y, fitted(fit)) #### Cox Regression - Proportional Hazards ```r # Fit model -fit <- coxph(Surv(time, status) ~ age + sex + rx, data = out, model = TRUE) -result <- cox2.display(fit, dec = 2) +# IMPORTANT: Replace variable names with your actual data +fit <- coxph(Surv(time_var, status_var) ~ age_var + sex_var + treatment_var, data = out, model = TRUE) + +# IMPORTANT: Extract $table component for display +result <- cox2.display(fit, dec = 2)$table # Check proportional hazards assumption ph_test <- cox.zph(fit) @@ -365,8 +459,9 @@ if(nrow(out) < 30) { } # Events per variable (EPV) for Cox regression -n_events <- sum(out$status == 1) -n_predictors <- 3 # age, sex, rx +# IMPORTANT: Replace 'status_var' with your actual status variable name +n_events <- sum(out$status_var == 1) +n_predictors <- 3 # Count your actual number of predictors epv <- n_events / n_predictors if(epv < 10) { @@ -381,7 +476,8 @@ if(epv < 10) { ```r # Check VIF (Variance Inflation Factor) library(car) -fit <- lm(outcome ~ age + bmi + weight + height, data = out) +# IMPORTANT: Replace variable names with your actual continuous variables +fit <- lm(outcome_var ~ age_var + bmi_var + weight_var + height_var, data = out) vif_values <- vif(fit) if(any(vif_values > 10)) { @@ -434,14 +530,15 @@ Proportional hazards assumption이 age 변수에서 위배되었습니다 (p = 0 ```r # ✅ Good: Create new variables -out$age_group <- cut(out$age, breaks = c(0, 60, Inf)) -out$log_var <- log(out$var + 1) +# IMPORTANT: Replace with your actual variable names +out$age_group <- cut(out$age_var, breaks = c(0, 60, Inf)) +out$log_var <- log(out$continuous_var + 1) # ✅ Good: Create subset with new name out_complete <- out[complete.cases(out), ] # ❌ Avoid: Overwriting original (unless explicitly requested) -out <- subset(out, age > 18) +out <- subset(out, age_var > 18) out <- out[complete.cases(out), ] ``` @@ -496,17 +593,18 @@ if(nrow(out) > 50000) { ```r # Check for impossible values -if(any(out$age < 0, na.rm = TRUE)) { +# IMPORTANT: Replace with your actual variable names +if(any(out$age_var < 0, na.rm = TRUE)) { warning("Negative age values detected. Check data quality.") } -if(any(out$time <= 0, na.rm = TRUE)) { +if(any(out$time_var <= 0, na.rm = TRUE)) { warning("Non-positive time values detected. These will be excluded.") - out <- out[out$time > 0, ] + out <- out[out$time_var > 0, ] } # Check for extreme outliers -extreme_age <- out$age > 120 +extreme_age <- out$age_var > 120 if(any(extreme_age, na.rm = TRUE)) { warning("Extreme age values (>120) detected. Verify data entry.") } @@ -526,13 +624,14 @@ gc() # Garbage collection **Always validate conversions:** ```r # Factor to numeric - ALWAYS check first -levels(out$status) # Check levels before converting +# IMPORTANT: Replace 'status_var' with your actual variable name +levels(out$status_var) # Check levels before converting # Safe conversion -out$status_num <- as.integer(as.character(out$status)) +out$status_num <- as.integer(as.character(out$status_var)) # Verify conversion -table(original = out$status, converted = out$status_num) +table(original = out$status_var, converted = out$status_num) ``` --- @@ -572,7 +671,8 @@ table(original = out$status, converted = out$status_num) ```r # ✅ Good reporting # HR = 1.85 (95% CI: 1.23-2.78, p = 0.003) -result <- cox2.display(fit, dec = 2) +# IMPORTANT: Extract $table component for display +result <- cox2.display(fit, dec = 2)$table # Add clinical interpretation if effect is small if(hr > 1 & hr < 1.2) { @@ -610,8 +710,11 @@ message("ARR = ", round(absolute_risk_reduction * 100, 1), "%") #### Hazard Ratios with Clinical Context ```r # Interpret HR with clinical meaning -fit <- coxph(Surv(time, status) ~ treatment + age + sex, data = out, model = TRUE) -result <- cox2.display(fit, dec = 2) +# IMPORTANT: Replace variable names with your actual data +fit <- coxph(Surv(time_var, status_var) ~ treatment_var + age_var + sex_var, data = out, model = TRUE) + +# IMPORTANT: Extract $table component for display +result <- cox2.display(fit, dec = 2)$table # Add interpretation message("Treatment HR = 0.65 means 35% reduction in hazard of event compared to control.") @@ -622,13 +725,14 @@ message("Treatment HR = 0.65 means 35% reduction in hazard of event compared to **Always check for adverse events:** ```r # Adverse event rates -ae_rate_treatment <- sum(out$adverse_event[out$group == "treatment"]) / - sum(out$group == "treatment") -ae_rate_control <- sum(out$adverse_event[out$group == "control"]) / - sum(out$group == "control") +# IMPORTANT: Replace variable names with your actual data +ae_rate_level1 <- sum(out$event_var[out$group_var == "Level1"]) / + sum(out$group_var == "Level1") +ae_rate_level2 <- sum(out$event_var[out$group_var == "Level2"]) / + sum(out$group_var == "Level2") -message("Adverse event rate: Treatment = ", round(ae_rate_treatment * 100, 1), - "%, Control = ", round(ae_rate_control * 100, 1), "%") +message("Adverse event rate: Level1 = ", round(ae_rate_level1 * 100, 1), + "%, Level2 = ", round(ae_rate_level2 * 100, 1), "%") ``` **Report dropout/loss to follow-up:** @@ -647,11 +751,12 @@ if(completion_rate < 0.8) { **ITT vs. Per-Protocol:** ```r # Intention-to-Treat analysis (primary) -fit_itt <- coxph(Surv(time, status) ~ treatment, data = out_all) +# IMPORTANT: Replace variable names with your actual data +fit_itt <- coxph(Surv(time_var, status_var) ~ treatment_var, data = out_all) # Per-protocol analysis (secondary) out_pp <- out_all[out_all$protocol_compliant == TRUE, ] -fit_pp <- coxph(Surv(time, status) ~ treatment, data = out_pp) +fit_pp <- coxph(Surv(time_var, status_var) ~ treatment_var, data = out_pp) message("Report both ITT and per-protocol results for transparency.") ``` @@ -674,7 +779,8 @@ p_adjusted <- p.adjust(p_values, method = "bonferroni") library(survival) library(jskm) -fit <- survfit(Surv(time, status) ~ treatment, data = out) +# IMPORTANT: Replace variable names with your actual data +fit <- survfit(Surv(time_var, status_var) ~ treatment_var, data = out) # 1. Median survival with CI summary(fit)$table # median, 0.95LCL, 0.95UCL @@ -732,14 +838,83 @@ result <- jskm(fit, data = out, table = TRUE, pval = TRUE, **Use jsmodule/jstable/jskm packages FIRST** - See Package Priority Rules below -### 3. Multiple Outputs +### 3. Multiple Outputs - CRITICAL + +**Default Rule: ALWAYS return multiple plots/tables as `list()` unless user explicitly requests combining** -Return plots as lists - Multiple plots → `list(p1, p2, p3)` creates multiple slides +#### Decision Rule + +**ALWAYS use `list()` when:** +- User requests multiple analyses or plots +- User says "separately" or "각각" +- User says "한 화면에" (ambiguous - safer as separate slides) +- No specification (default behavior) + +**Only combine if user explicitly says:** +- "combine into one figure" +- "merge all plots" +- "single combined plot" + +#### Pattern: Multiple Outputs (Default - Use This!) + +```r +# ✅ CORRECT: Always return as list +# Each item = separate PowerPoint slide = separate display +# IMPORTANT: Replace variable names and fit objects with your actual data +p1 <- jskm(fit1, data = subset1, table = TRUE) +p2 <- jskm(fit2, data = subset2, table = TRUE) + +# IMPORTANT: Extract $table from jstable functions +# Replace ... with your actual function arguments +p3 <- CreateTableOneJS(...)$table # Extract table component +p4 <- cox2.display(...)$table # Extract table component + +result <- list(p1, p2, p3, p4) +``` + +#### Special Case: User Explicitly Requests Combining + +```r +# ⚠️ Only when user says "combine" or "merge" +# Note: This may not work well with jskm plots (risk tables) + +library(gridExtra) + +p1 <- ggplot(...) # Works best with simple ggplot2 plots +p2 <- ggplot(...) + +# Method 1: Use arrangeGrob (safer, returns grob object) +result <- gridExtra::arrangeGrob(p1, p2, ncol = 2) + +# Method 2: For immediate display +result <- gridExtra::grid.arrange(p1, p2, ncol = 2) +``` + +**Important Notes:** +- **jskm plots**: Do NOT combine with grid.arrange (risk tables conflict) +- **Faceting alternative**: For simple plots, use `facet_wrap()` instead +- **User flexibility**: Separate slides allow users to rearrange in PowerPoint +- **Default safety**: `list()` always works, combining may fail + +#### Faceting for Simple Plots + +```r +# ✅ For non-survival plots without complex elements +library(ggpubr) + +# IMPORTANT: Replace with your variable names +result <- ggboxplot(out, x = "group_var", y = "continuous_var") + + facet_wrap(~ facet_var) # or facet_grid(facet_var1 ~ facet_var2) + +# ❌ Does NOT work for jskm (risk tables conflict with faceting) +``` ### 4. Library Loading Include necessary libraries at the top: `library(jskm)`, `library(jstable)`, etc. +**IMPORTANT**: Never load `dplyr`, `tidyr`, or `tidyverse` packages - use base R or data.table instead + ### 5. Statistical Reporting Standards - **P-values**: 3 decimals (e.g., p = 0.023), report as "p < 0.001" for very small values @@ -778,6 +953,24 @@ Include necessary libraries at the top: `library(jskm)`, `library(jstable)`, etc - For basic plots (scatter, histogram, boxplot) **without** jsmodule equivalent: `ggplot2` or `ggpubr` is allowed - **ALWAYS check if jsmodule/jskm/jstable has the function first** +### 6. Data Manipulation +- ✅ **REQUIRED**: Use **base R** or **data.table** for data manipulation +- ❌ **NEVER USE**: `dplyr`, `tidyr`, `tidyverse` packages +- **Reason**: jsmodule ecosystem is built on `data.table`, not `dplyr` +- **Examples**: + ```r + # ✅ CORRECT: base R subsetting + # IMPORTANT: Replace variable names with your actual data + subset_data <- out[out$numeric_var > 50, ] + + # ✅ CORRECT: base R new variables + out$categorical_var <- cut(out$numeric_var, breaks = c(0, 60, Inf)) + + # ❌ WRONG: dplyr + subset_data <- out %>% filter(numeric_var > 50) # Don't use this! + out <- out %>% mutate(categorical_var = ...) # Don't use this! + ``` + --- ## 11. Function Reference by Analysis Type @@ -787,12 +980,15 @@ Include necessary libraries at the top: `library(jskm)`, `library(jstable)`, etc ```r # Basic Table 1 with grouping library(jstable) + +# IMPORTANT: Replace variable names with your actual data +# IMPORTANT: Extract $table component for display result <- CreateTableOneJS( - vars = c("age", "sex", "rx"), # Variables to summarize - strata = "rx", # Grouping variable (optional) + vars = c("var1", "var2", "var3"), # Replace with your variables to summarize + strata = "group_var", # Replace with your grouping variable (optional) data = out, labeldata = out.label -) +)$table # Extract table component ``` ### 11.2 Survival Analysis @@ -803,8 +999,10 @@ result <- CreateTableOneJS( library(jskm) library(survival) -# If status is factor, convert to numeric -fit <- survfit(Surv(time, as.integer(as.character(status))) ~ rx, data = out) +# IMPORTANT: Replace variable names with your actual data +# If status is factor, pre-convert to numeric +out$status_num <- as.integer(as.character(out$status_var)) +fit <- survfit(Surv(time_var, status_num) ~ treatment_var, data = out) result <- jskm(fit, data = out, table = TRUE, pval = TRUE, label.nrisk = "No. at risk") ``` @@ -814,12 +1012,17 @@ result <- jskm(fit, data = out, table = TRUE, pval = TRUE, label.nrisk = "No. at library(jstable) library(survival) -fit <- coxph(Surv(time, as.integer(as.character(status))) ~ age + sex + rx, +# IMPORTANT: Replace variable names with your actual data +# Best practice: Pre-convert status to numeric +out$status_num <- as.integer(as.character(out$status_var)) +fit <- coxph(Surv(time_var, status_num) ~ age_var + sex_var + treatment_var, data = out, model = TRUE) -result <- cox2.display(fit, dec = 2) -# With labels -result <- LabeljsCox(cox2.display(fit), out.label) +# IMPORTANT: Extract $table component for display +result <- cox2.display(fit, dec = 2)$table + +# With labels (LabeljsCox accepts full cox2.display object) +result <- LabeljsCox(cox2.display(fit), out.label)$table ``` ### 11.3 Regression Analysis @@ -829,14 +1032,16 @@ result <- LabeljsCox(cox2.display(fit), out.label) # Binary outcome regression library(jstable) -fit <- glm(status ~ age + sex + rx, data = out, family = binomial) +# IMPORTANT: Replace variable names with your actual data +fit <- glm(outcome_var ~ age_var + sex_var + treatment_var, data = out, family = binomial) result <- glmshow.display(fit, decimal = 2) ``` #### Linear Regression ```r # Continuous outcome regression - use base R summary() -fit <- lm(outcome ~ age + sex + rx, data = out) +# IMPORTANT: Replace variable names with your actual data +fit <- lm(outcome_var ~ age_var + sex_var + treatment_var, data = out) result <- summary(fit) print(result) ``` @@ -847,13 +1052,14 @@ print(result) library(geepack) library(jstable) +# IMPORTANT: Replace variable names with your actual data # Linear GEE -fit <- geeglm(outcome ~ time + group, id = id, data = out, +fit <- geeglm(outcome_var ~ time_var + group_var, id = id_var, data = out, family = gaussian, corstr = "exchangeable") result <- geeglm.display(fit) # Logistic GEE -fit <- geeglm(outcome ~ time + group, id = id, data = out, +fit <- geeglm(outcome_var ~ time_var + group_var, id = id_var, data = out, family = binomial, corstr = "exchangeable") result <- geeglm.display(fit) ``` @@ -864,15 +1070,17 @@ result <- geeglm.display(fit) # Single ROC curve library(pROC) -roc_obj <- roc(out$status, out$predicted_prob) +# IMPORTANT: Replace variable names with your actual data +roc_obj <- roc(out$outcome_var, out$predicted_prob_var) result <- ggroc(roc_obj) + theme_minimal() + ggtitle(sprintf("AUC = %.3f", auc(roc_obj))) # Multiple ROC curves comparison library(pROC) -roc1 <- roc(out$status, out$model1_prob) -roc2 <- roc(out$status, out$model2_prob) +# IMPORTANT: Replace variable names with your actual data +roc1 <- roc(out$outcome_var, out$model1_prob_var) +roc2 <- roc(out$outcome_var, out$model2_prob_var) result <- ggroc(list(Model1 = roc1, Model2 = roc2)) + theme_minimal() + ggtitle(sprintf("AUC: Model1=%.3f, Model2=%.3f", auc(roc1), auc(roc2))) @@ -880,8 +1088,9 @@ result <- ggroc(list(Model1 = roc1, Model2 = roc2)) + # Time-dependent ROC library(timeROC) library(survival) -troc <- timeROC(T = out$time, delta = out$status, - marker = out$marker, cause = 1, +# IMPORTANT: Replace variable names with your actual data +troc <- timeROC(T = out$time_var, delta = out$status_var, + marker = out$marker_var, cause = 1, times = c(365, 730), iid = TRUE) result <- plot(troc, time = 365, title = "Time-dependent ROC at 1 year") ``` @@ -893,8 +1102,59 @@ result <- plot(troc, time = 365, title = "Time-dependent ROC at 1 year") For simple exploratory plots without jsmodule equivalent, use ggpubr: ```r library(ggpubr) -result <- ggboxplot(out, x = "rx", y = "age") + stat_compare_means() -result <- ggscatter(out, x = "age", y = "nodes", add = "reg.line", cor.coef = TRUE) + +# IMPORTANT: Replace variable names with your actual data +# - 'group_var' → your grouping variable +# - 'continuous_var1', 'continuous_var2' → your continuous variables + +result <- ggboxplot(out, x = "group_var", y = "continuous_var1") + stat_compare_means() +result <- ggscatter(out, x = "continuous_var1", y = "continuous_var2", + add = "reg.line", cor.coef = TRUE) +``` + +#### ggplot2 Version Compatibility (Safe Patterns) + +**CRITICAL**: Use version-compatible syntax to avoid errors across different ggplot2 versions. + +**Legend positioning (SAFE - all versions):** +```r +# ✅ Named positions - works everywhere +p + theme(legend.position = "bottom") # "top", "left", "right", "none" + +# ✅ Coordinate vector - legacy compatible +p + theme(legend.position = c(0.9, 0.1)) # x, y in 0-1 range + +# ❌ AVOID: Version-specific syntax +p + theme(legend.position.inside = c(0.9, 0.1)) # Only ggplot2 >= 3.5.0 +``` + +**Faceting (SAFE - all versions):** +```r +# ✅ Use facet_wrap or facet_grid +# Replace 'group_var', 'row_var', 'col_var' with your actual variable names +p + facet_wrap(~ group_var) +p + facet_grid(rows = vars(row_var), cols = vars(col_var)) + +# ❌ AVOID: Complex facet specifications that may break +``` + +**Multiple plots (SAFE - use gridExtra):** +```r +# ✅ gridExtra is in allowed packages +library(gridExtra) +grid.arrange(p1, p2, p3, p4, ncol = 2) + +# ✅ For saving multiple plots +result <- list(p1, p2, p3, p4) # Return as list +``` + +**Text and labels (SAFE):** +```r +# ✅ Basic text elements +p + labs(title = "Title", x = "X axis", y = "Y axis") +p + theme(plot.title = element_text(size = 14, face = "bold")) + +# ❌ AVOID: Overly complex theme modifications ``` ### 11.6 Survey Data Analysis (jsmodule functions) @@ -905,18 +1165,22 @@ library(survey) library(jstable) library(jskm) -design <- svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = out) +# IMPORTANT: Replace survey design variable names with your actual data +design <- svydesign(ids = ~psu_var, strata = ~strata_var, weights = ~weight_var, data = out) -# Survey Table 1 - jstable function -result <- svyCreateTableOneJS(vars = c("age", "sex"), strata = "group", - data = design, labeldata = out.label) +# IMPORTANT: Replace variable names with your actual data +# Survey Table 1 - jstable function (extract $table component) +result <- svyCreateTableOneJS(vars = c("var1", "var2", "var3"), strata = "group_var", + data = design, labeldata = out.label)$table # Survey KM plot - jskm function -fit <- svykm(Surv(time, status) ~ group, design = design) +# IMPORTANT: Replace variable names with your actual data +fit <- svykm(Surv(time_var, status_var) ~ group_var, design = design) result <- svyjskm(fit, design = design, table = TRUE, pval = TRUE) # Survey regression - jstable function -result <- svyregress.display(svyglm(outcome ~ age + sex, design = design, family = binomial)) +# IMPORTANT: Replace variable names with your actual data +result <- svyregress.display(svyglm(outcome_var ~ var1 + var2, design = design, family = binomial)) ``` ### 11.7 Advanced Analysis @@ -924,6 +1188,965 @@ result <- svyregress.display(svyglm(outcome ~ age + sex, design = design, family **Propensity Score**: Use jstable::CreateTableOneJS() on matched data **Multiple Imputation**: Standard mice package, then apply jstable functions +### 11.8 Forest Plot Visualization + +Forest plots are essential for subgroup analysis and meta-analysis style presentation in medical research. jsmodule provides forest plot capabilities through its `forestcoxUI/forestglmUI` Shiny modules, which internally use the `forestploter` package. + +**For AI Assistant**: Generate forest plot data from jstable results, then visualize with forestploter. + +#### Variable Existence Check (CRITICAL) + +**ALWAYS check if variables exist before using them in subgroup analysis:** + +```r +# IMPORTANT: Verify variables exist in your data +# Replace with your actual subgroup variable candidates +var_subgroup_candidates <- c("var1", "var2", "var3", "var4") + +# Check which variables actually exist +available_vars <- var_subgroup_candidates[var_subgroup_candidates %in% names(out)] +missing_vars <- setdiff(var_subgroup_candidates, available_vars) + +if(length(missing_vars) > 0) { + warning(paste("Variables not found:", paste(missing_vars, collapse=", "))) + warning("Using only available variables for subgroup analysis.") +} + +if(length(available_vars) == 0) { + stop("No valid subgroup variables found. Check variable names.") +} + +# Use only available variables +var_subgroup <- available_vars +message(paste("Using subgroup variables:", paste(var_subgroup, collapse=", "))) +``` + +#### Cox Regression Forest Plot (Subgroup Analysis) + +**Recommended**: Use jstable's `TableSubgroupMultiCox()` function, which jsmodule uses internally. + +```r +library(jstable) +library(survival) +library(forestploter) + +# IMPORTANT: Replace variable names below with your actual data +# - 'status' → your event variable (must convert to numeric) +# - 'time' → your time-to-event variable +# - 'rx' → your treatment/exposure variable +# - 'age' → your subgroup variable + +# Step 0: Check variable existence (REQUIRED) +# IMPORTANT: Replace with your actual variable names +required_vars <- c("time_var", "status_var", "treatment_var", "subgroup_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Step 1: Pre-convert status variable (REQUIRED for survival functions) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Step 2: Create subgroup variable as factor (REQUIRED) +# IMPORTANT: Replace with your actual subgroup variable and appropriate breaks/labels +out$subgroup_var_factor <- factor(cut(out$subgroup_var, breaks = c(0, 50, 60, 70, Inf), + labels = c("<50", "50-60", "60-70", ">=70"))) + +# Step 2.5: Validate subgroup creation +# IMPORTANT: Replace 'subgroup_var_factor' with the actual variable name you created in Step 2 +if(!is.factor(out$subgroup_var_factor)) { + stop("subgroup_var_factor must be a factor variable.") +} + +if(length(levels(out$subgroup_var_factor)) < 2) { + stop("Subgroup variable must have at least 2 levels. Check subgroup_var_factor creation.") +} + +# Print subgroup distribution for validation +print("Subgroup Variable Distribution:") +print(table(out$subgroup_var_factor, useNA = "ifany")) + +# Step 3: Use jstable's TableSubgroupMultiCox (same as forestcoxServer uses) +# IMPORTANT: Replace variable names with your actual data +forest_result <- TableSubgroupMultiCox( + formula = Surv(time_var, status_num) ~ treatment_var, # Replace with your treatment variable + var_subgroup = "subgroup_var_factor", # Must match the factor variable name from Step 2 + data = out +) + +# Step 4: Extract data for forest plot +# IMPORTANT: TableSubgroupMultiCox returns a data.frame directly, NOT a list! +forest_df <- forest_result # Already a data.frame + +# Step 5: Convert character columns to numeric (TableSubgroupMultiCox returns characters) +# Column names from TableSubgroupMultiCox: "Point Estimate", "Lower", "Upper" +# Use suppressWarnings to avoid "NAs introduced by coercion" for "Reference" rows +forest_df$Point_Estimate <- suppressWarnings(as.numeric(forest_df$`Point Estimate`)) +forest_df$Lower_CI <- suppressWarnings(as.numeric(forest_df$Lower)) +forest_df$Upper_CI <- suppressWarnings(as.numeric(forest_df$Upper)) + +# Remove NA rows (reference group has NA values) +forest_df <- forest_df[!is.na(forest_df$Point_Estimate), ] + +# Step 6: Create forest plot with forestploter +result <- forestploter::forest( + forest_df, + est = forest_df$Point_Estimate, + lower = forest_df$Lower_CI, + upper = forest_df$Upper_CI, + ci_column = 5, # Column position for CI display + ref_line = 1, # Reference line at HR=1 + xlim = c(0.5, 2.0), # Adjust based on your HR range + xlab = "Hazard Ratio (95% CI)" +) + +# Common errors and solutions: +# Error: "var_subgroup must be factor" → Ensure step 2 creates factor +# Error: "subscript out of bounds" → Check column names in forest_df +# Error: "object 'status' not found" → Complete step 1 first +``` + +#### Logistic Regression Forest Plot + +**Recommended**: Use jstable's `TableSubgroupMultiGLM()` function, which jsmodule uses internally. + +```r +library(jstable) +library(forestploter) + +# IMPORTANT: Replace variable names below with your actual data +# - 'outcome' → your binary outcome variable (0/1 or factor) +# - 'treatment' → your treatment/exposure variable +# - 'sex' → your subgroup variable + +# Step 0: Check variable existence (REQUIRED) +# IMPORTANT: Replace with your actual variable names +required_vars <- c("outcome_var", "treatment_var", "subgroup_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Step 1: Ensure subgroup variable is factor (REQUIRED) +# IMPORTANT: Replace 'subgroup_var' with your actual subgroup variable name +out$subgroup_var <- factor(out$subgroup_var) + +# Step 1.5: Validate subgroup creation +# IMPORTANT: Replace 'subgroup_var' with the actual variable name you used in Step 1 +if(!is.factor(out$subgroup_var)) { + stop("subgroup_var must be a factor variable.") +} + +if(length(levels(out$subgroup_var)) < 2) { + stop("Subgroup variable must have at least 2 levels. Check subgroup_var variable.") +} + +# Print subgroup distribution for validation +print("Subgroup Variable Distribution:") +print(table(out$subgroup_var, useNA = "ifany")) + +# Step 2: Use jstable's TableSubgroupMultiGLM (same as forestglmServer uses) +# IMPORTANT: Replace variable names with your actual data +forest_result <- TableSubgroupMultiGLM( + formula = outcome_var ~ treatment_var + covariate_var, # Replace with your variables + var_subgroup = "subgroup_var", # Must match the factor variable name from Step 1 + data = out, + family = binomial # Use binomial for logistic regression +) + +# Step 3: Extract data for forest plot +# IMPORTANT: TableSubgroupMultiGLM returns a data.frame directly, NOT a list! +forest_df <- forest_result # Already a data.frame + +# Step 4: Convert character columns to numeric (TableSubgroupMultiGLM returns characters) +# Use suppressWarnings to avoid "NAs introduced by coercion" for "Reference" rows +forest_df$Point_Estimate <- suppressWarnings(as.numeric(forest_df$`Point Estimate`)) +forest_df$Lower_CI <- suppressWarnings(as.numeric(forest_df$Lower)) +forest_df$Upper_CI <- suppressWarnings(as.numeric(forest_df$Upper)) + +# Remove NA rows (reference group has NA values) +forest_df <- forest_df[!is.na(forest_df$Point_Estimate), ] + +# Step 5: Create forest plot +result <- forestploter::forest( + forest_df, + est = forest_df$Point_Estimate, + lower = forest_df$Lower_CI, + upper = forest_df$Upper_CI, + ci_column = 5, # Column position for CI display + ref_line = 1, # Reference line at OR=1 + xlab = "Odds Ratio (95% CI)" +) + +# Common errors and solutions: +# Error: "var_subgroup must be factor" → Ensure step 1 creates factor +# Error: "outcome must be binary" → Check outcome is 0/1 or factor with 2 levels +# Error: "object not found" → Check all variable names in formula +``` + +**When to use**: +- **Subgroup analysis**: Treatment effects across different patient populations +- **Multiple model comparison**: Comparing effect sizes from different models +- **Meta-analysis style**: Publication-ready figures for medical journals +- **Effect modification**: Visualizing heterogeneity of treatment effects + +**Important notes**: +- Always use **jstable functions** (cox2.display, glmshow.display) to generate regression results +- Include sample size (N) and number of events for transparency +- Forest plots must show both point estimates and confidence intervals +- In Shiny apps, use jsmodule's `forestcoxUI/forestglmUI` modules for interactive forest plots + +### 11.9 Correlation Analysis and Scatter Plot Matrix + +Correlation analysis is essential for exploratory data analysis and checking multicollinearity before regression modeling. jsmodule includes `ggpairsModule` for interactive correlation matrices in Shiny apps, which uses the `GGally` package. + +**For AI Assistant**: Use `GGally::ggpairs()` directly for correlation visualization. + +#### Variable Existence Check + +**ALWAYS verify variables exist and are numeric before correlation analysis:** + +```r +# IMPORTANT: Check which variables exist and are numeric +# IMPORTANT: Replace with your actual continuous variable names +var_candidates <- c("continuous_var1", "continuous_var2", "continuous_var3", "continuous_var4", "continuous_var5") + +# Check existence +available_vars <- var_candidates[var_candidates %in% names(out)] +missing_vars <- setdiff(var_candidates, available_vars) + +if(length(missing_vars) > 0) { + warning(paste("Variables not found:", paste(missing_vars, collapse=", "))) +} + +if(length(available_vars) == 0) { + stop("No valid variables found for correlation analysis.") +} + +# Check which are numeric +numeric_check <- sapply(out[, available_vars], is.numeric) +numeric_vars <- available_vars[numeric_check] +non_numeric <- available_vars[!numeric_check] + +if(length(non_numeric) > 0) { + warning(paste("Non-numeric variables excluded:", paste(non_numeric, collapse=", "))) +} + +if(length(numeric_vars) < 2) { + stop("Need at least 2 numeric variables for correlation analysis.") +} + +message(paste("Using numeric variables:", paste(numeric_vars, collapse=", "))) +``` + +#### Basic Correlation Matrix + +```r +library(GGally) + +# IMPORTANT: First check variable existence (see above) +# Select numeric variables for correlation +# IMPORTANT: Replace with your actual continuous variable names +numeric_vars <- c("continuous_var1", "continuous_var2", "continuous_var3", "continuous_var4") + +# Verify all variables exist and are numeric +missing_vars <- setdiff(numeric_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Check all are numeric +non_numeric <- numeric_vars[!sapply(out[, numeric_vars], is.numeric)] +if(length(non_numeric) > 0) { + stop(paste("Non-numeric variables found:", paste(non_numeric, collapse=", "))) +} + +result <- ggpairs(out[, numeric_vars], + title = "Correlation Matrix", + upper = list(continuous = wrap("cor", size = 3)), + lower = list(continuous = wrap("points", alpha = 0.3, size = 0.5))) +``` + +#### Correlation Matrix with Grouping + +```r +# Correlation by group variable +# IMPORTANT: Replace 'group_var' with your actual grouping variable +result <- ggpairs(out[, c(numeric_vars, "group_var")], + aes(color = group_var, alpha = 0.5), + title = "Correlation Matrix by Group", + upper = list(continuous = wrap("cor", size = 3)), + lower = list(continuous = wrap("points", alpha = 0.3, size = 0.5))) +``` + +#### Multicollinearity Check Before Regression + +```r +# Check predictor correlations before multivariable modeling +# IMPORTANT: Replace with your actual predictor variable names +predictor_vars <- c("predictor_var1", "predictor_var2", "predictor_var3", "predictor_var4") +result <- ggpairs(out[, predictor_vars], + title = "Predictor Variable Correlations", + upper = list(continuous = wrap("cor", size = 4, color = "blue")), + lower = list(continuous = wrap("smooth", alpha = 0.3))) + +# Guidelines: +# - Correlation > 0.8 indicates potential multicollinearity +# - Consider removing one of highly correlated variables +# - Use VIF for formal multicollinearity assessment (see Section 5) +``` + +**When to use**: +- **Exploratory data analysis**: Understand relationships between variables +- **Before regression**: Check multicollinearity (|r| > 0.8 problematic) +- **Variable selection**: Identify redundant predictors +- **Publication figures**: Comprehensive correlation visualization + +**Multicollinearity guidelines**: +- |Correlation| > 0.8: Consider removing one variable +- VIF > 10: Severe multicollinearity (use `car::vif()`) +- For medical research: Prioritize clinically meaningful variables over statistical criteria + +### 11.10 Non-parametric Tests + +When data violates parametric assumptions (normality, equal variance), use non-parametric alternatives. Always check normality first using jstable's CreateTableOneJS. + +#### Variable Existence Check + +**ALWAYS verify variables exist before non-parametric tests:** + +```r +# IMPORTANT: Check variable existence +# IMPORTANT: Replace with your actual variable names +required_vars <- c("outcome_var", "group_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Check group has multiple levels +if(is.factor(out$group_var)) { + if(length(levels(out$group_var)) < 2) { + stop("Group variable must have at least 2 levels.") + } +} else { + unique_groups <- length(unique(out$group_var)) + if(unique_groups < 2) { + stop("Group variable must have at least 2 unique values.") + } +} + +# Check sufficient sample size per group +group_sizes <- table(out$group_var) +if(any(group_sizes < 5)) { + warning("Some groups have <5 observations. Results may be unreliable.") + print(group_sizes) +} +``` + +#### Check Normality with jstable + +```r +library(jstable) + +# IMPORTANT: First check variable existence (see above) +# IMPORTANT: Replace with your actual continuous variable names +vars_to_test <- c("continuous_var1", "continuous_var2", "continuous_var3") + +# Verify variables exist +missing_vars <- setdiff(vars_to_test, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Verify strata variable exists +# IMPORTANT: Replace 'group_var' with your actual grouping variable name +if(!"group_var" %in% names(out)) { + stop("Strata variable 'group_var' not found.") +} + +# CreateTableOneJS includes normality test +table1 <- CreateTableOneJS( + vars = vars_to_test, + strata = "group_var", # Replace with your grouping variable + data = out, + labeldata = out.label, + testNormal = "shapiro" # Shapiro-Wilk test +) + +# Check normality test p-values in output +# If p < 0.05 → non-normal distribution → use non-parametric tests +``` + +#### Mann-Whitney U Test (Two Groups) + +```r +# Non-parametric alternative to t-test +# When: n < 30, non-normal distribution, or ordinal data + +# IMPORTANT: Check variable existence first (see above) +# IMPORTANT: Replace with your actual variable names +required_vars <- c("outcome_var", "group_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Check group has exactly 2 levels +unique_groups <- length(unique(out$group_var)) +if(unique_groups != 2) { + stop(paste("Mann-Whitney test requires exactly 2 groups. Found:", unique_groups)) +} + +result <- wilcox.test(outcome_var ~ group_var, data = out) + +# Report median (IQR) instead of mean (SD) +by(out$outcome_var, out$group_var, function(x) { + paste0("Median = ", median(x), " (IQR: ", + quantile(x, 0.25), "-", quantile(x, 0.75), ")") +}) +``` + +#### Kruskal-Wallis Test (Multiple Groups) + +```r +# Non-parametric alternative to one-way ANOVA +# When: >2 groups with non-normal distribution + +# IMPORTANT: Replace with your actual variable names +result <- kruskal.test(outcome_var ~ group_var, data = out) + +# Post-hoc pairwise comparisons +pairwise.wilcox.test(out$outcome_var, out$group_var, p.adjust.method = "bonferroni") +``` + +#### Friedman Test (Repeated Measures) + +```r +# Non-parametric alternative to repeated measures ANOVA +# Data must be in long format + +# IMPORTANT: Replace with your actual variable names +result <- friedman.test(value_var ~ timepoint_var | id_var, data = out_long) +``` + +**When to use**: +- Small sample size (n < 30 per group) +- Non-normal distribution (Shapiro-Wilk p < 0.05 in CreateTableOneJS) +- Ordinal data (e.g., pain scores 1-10, Likert scales) +- Extreme outliers present + +**Reporting**: +- Report median (IQR) not mean (SD) +- Include test name in methods section +- Example: "Median age 65 (IQR: 58-72) vs 70 (IQR: 63-78), Mann-Whitney U test p = 0.032" + +### 11.11 Multiple Testing Correction + +When performing multiple statistical tests (subgroup analyses, multiple endpoints), adjust p-values to control Type I error. + +#### Variable Existence and Validation + +**ALWAYS check variables and subgroup structure before multiple testing:** + +```r +# IMPORTANT: Verify required variables exist +# IMPORTANT: Replace with your actual variable names +required_vars <- c("time_var", "status_var", "treatment_var", "subgroup_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Pre-convert status variable (REQUIRED) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Check subgroup variable has valid values +if(is.factor(out$subgroup_var)) { + subgroup_levels <- levels(out$subgroup_var) + if(length(subgroup_levels) < 2) { + stop("Subgroup variable must have at least 2 levels.") + } +} else { + subgroup_levels <- unique(out$subgroup_var) + if(length(subgroup_levels) < 2) { + stop("Subgroup variable must have at least 2 unique values.") + } +} + +# Check sample size per subgroup +subgroup_sizes <- table(out$subgroup_var) +print("Subgroup Sizes:") +print(subgroup_sizes) + +if(any(subgroup_sizes < 20)) { + warning("Some subgroups have <20 observations. Consider combining categories.") +} +``` + +#### Extract p-values from Multiple Models + +**IMPORTANT**: jstable functions return p-values as **characters** (e.g., "0.023" or "< 0.001"). +For multiple testing correction, extract p-values directly from model objects, not jstable output. + +```r +library(jstable) +library(survival) + +# IMPORTANT: Replace variable names with your actual data +# - 'subgroup_variable' → your grouping variable (e.g., 'sex', 'age_group') +# - 'treatment' → your treatment/exposure variable + +# Step 0: Check variable existence (REQUIRED - see above) +# IMPORTANT: Replace with your actual variable names +required_vars <- c("time_var", "status_var", "treatment_var", "subgroup_var") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Pre-convert status variable +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Define subgroups (IMPORTANT: Replace with your actual subgroup levels) +# Get unique levels from your subgroup variable +subgroups <- unique(out$subgroup_var) +p_values <- numeric(length(subgroups)) + +for (i in seq_along(subgroups)) { + # Safe subsetting (works for both data.frame and data.table) + subset_data <- out[out$subgroup_var == subgroups[i], ] + + # Skip if subset is too small + if (nrow(subset_data) < 20) { + warning(paste("Subgroup", subgroups[i], "has <20 observations. Skipping.")) + p_values[i] <- NA + next + } + + # Fit model + # IMPORTANT: Replace 'treatment_var' with your actual treatment variable name + fit <- coxph(Surv(time_var, status_num) ~ treatment_var, data = subset_data, model = TRUE) + + # Extract p-value directly from model summary (NOT from jstable) + # This gives numeric p-value, not character string + fit_summary <- summary(fit) + p_values[i] <- fit_summary$coefficients["treatment", "Pr(>|z|)"] +} + +# Remove NA values +valid_idx <- !is.na(p_values) +subgroups <- subgroups[valid_idx] +p_values <- p_values[valid_idx] + +# Check: At least 2 tests needed for correction +if (length(p_values) < 2) { + stop("Need at least 2 valid p-values for multiple testing correction.") +} +``` + +#### Apply Correction Methods + +```r +# Bonferroni (most conservative) +p_bonferroni <- p.adjust(p_values, method = "bonferroni") + +# Holm (less conservative, more powerful) +p_holm <- p.adjust(p_values, method = "holm") + +# Benjamini-Hochberg (controls False Discovery Rate) +p_bh <- p.adjust(p_values, method = "BH") + +# Create summary table +correction_results <- data.frame( + Subgroup = subgroups, + Original_p = round(p_values, 4), + Bonferroni = round(p_bonferroni, 4), + Holm = round(p_holm, 4), + BH_FDR = round(p_bh, 4), + Sig_Bonf = p_bonferroni < 0.05, + Sig_BH = p_bh < 0.05 +) +print(correction_results) +``` + +#### Multiple Outcomes Example + +```r +# IMPORTANT: Replace outcome names with your actual variables +# All outcome variables must be binary (0/1) for logistic regression + +# Test multiple outcomes +# IMPORTANT: Replace with your actual outcome variable names +outcomes <- c("outcome1_var", "outcome2_var", "outcome3_var", "outcome4_var") +p_values <- numeric(length(outcomes)) + +for (i in seq_along(outcomes)) { + # Check if outcome variable exists + if (!outcomes[i] %in% names(out)) { + warning(paste("Outcome variable", outcomes[i], "not found. Skipping.")) + p_values[i] <- NA + next + } + + # Fit model + # IMPORTANT: Replace 'treatment_var', 'age_var', 'sex_var' with your actual variable names + formula_str <- paste0(outcomes[i], " ~ treatment_var + age_var + sex_var") + fit <- glm(as.formula(formula_str), data = out, family = binomial) + + # Extract p-value from model summary (NOT from glmshow.display) + # IMPORTANT: Replace 'treatment_var' with your actual treatment variable name + fit_summary <- summary(fit) + p_values[i] <- fit_summary$coefficients["treatment_var", "Pr(>|z|)"] +} + +# Remove NA values +valid_idx <- !is.na(p_values) +outcomes <- outcomes[valid_idx] +p_values <- p_values[valid_idx] + +# Apply Holm correction (recommended) +p_adjusted <- p.adjust(p_values, method = "holm") + +# Create results table +results_summary <- data.frame( + Outcome = outcomes, + Original_p = round(p_values, 4), + Adjusted_p = round(p_adjusted, 4), + Significant = p_adjusted < 0.05 +) + +# Display results +print(results_summary) + +# Common errors and solutions: +# Error: "object not found" → Check all outcome variable names exist in data +# Error: "non-numeric argument" → Ensure outcomes are binary (0/1) +# All NA p-values → Check variable names and model fitting +``` + +**Choice of correction method**: +- **Bonferroni**: Most conservative, confirmatory analyses (pre-specified comparisons) +- **Holm**: Good balance, recommended default for multiple testing +- **Benjamini-Hochberg (BH)**: Exploratory analyses, controls False Discovery Rate +- **Benjamini-Yekutieli (BY)**: When tests are correlated/dependent + +**When to use**: +- Multiple subgroup analyses (Cox/GLM models for different populations) +- Multiple endpoints (primary + secondary outcomes) +- Post-hoc comparisons (after ANOVA/Kruskal-Wallis) +- Exploratory variable screening + +**Reporting**: +- State correction method in methods +- Report both original and adjusted p-values +- Example: "After Holm correction, treatment effect remained significant in females (adjusted p = 0.012) but not males (adjusted p = 0.089)" + +### 11.12 Interaction Analysis (Effect Modification) + +Test whether treatment effects differ across subgroups using interaction terms. Essential for precision medicine and identifying patients who benefit most from treatment. + +#### Variable Existence Check + +**ALWAYS verify all variables exist before interaction analysis:** + +```r +# IMPORTANT: Check variable existence for interaction analysis +# IMPORTANT: Replace variable names with your actual data +required_vars <- c("time_var", "status_var", "treatment_var", "subgroup_var1", "subgroup_var2") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Pre-convert status (REQUIRED) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Check interaction variables are factors +# IMPORTANT: Replace 'subgroup_var1' and 'subgroup_var2' with your actual variable names +if(!is.factor(out$subgroup_var1)) { + out$subgroup_var1 <- factor(out$subgroup_var1) + message("Converted subgroup_var1 to factor") +} + +if(!is.factor(out$subgroup_var2)) { + out$subgroup_var2 <- factor(out$subgroup_var2) + message("Converted subgroup_var2 to factor") +} + +# Check factor levels +if(length(levels(out$subgroup_var1)) < 2) { + stop("subgroup_var1 must have at least 2 levels for interaction analysis.") +} + +if(length(levels(out$subgroup_var2)) < 2) { + stop("subgroup_var2 must have at least 2 levels for interaction analysis.") +} + +# Check sample sizes in cross-tabulation +# IMPORTANT: Replace 'treatment_var' and 'subgroup_var1' with your actual variable names +crosstab <- table(out$treatment_var, out$subgroup_var1) +print("Treatment x Subgroup Cross-tabulation:") +print(crosstab) + +if(any(crosstab < 10)) { + warning("Some treatment x subgroup combinations have <10 observations. Interaction test may be unreliable.") +} +``` + +#### Stratified Analysis with jstable + +```r +library(jstable) + +# IMPORTANT: First check variable existence (see above) +# IMPORTANT: Replace with your actual variable names +vars_to_summarize <- c("var1", "var2", "var3") +strata_vars <- c("treatment_var", "subgroup_var1") + +# Verify all variables exist +missing_vars <- setdiff(c(vars_to_summarize, strata_vars), names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Check baseline characteristics by subgroups +# Two-way stratification +table1_stratified <- CreateTableOneJS( + vars = vars_to_summarize, + strata = strata_vars, + data = out, + labeldata = out.label +) +``` + +#### Cox Model with Interaction Term + +```r +library(jstable) +library(survival) + +# IMPORTANT: First check variable existence (REQUIRED - see above) +# IMPORTANT: Replace with your actual variable names +required_vars <- c("time_var", "status_var", "treatment_var", "subgroup_var1", "subgroup_var2") +missing_vars <- setdiff(required_vars, names(out)) +if(length(missing_vars) > 0) { + stop(paste("Missing variables:", paste(missing_vars, collapse=", "))) +} + +# Pre-convert status +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) + +# Model with interaction: treatment * subgroup +# IMPORTANT: Replace variable names with your actual data +fit_interaction <- coxph(Surv(time_var, status_num) ~ treatment_var * subgroup_var1 + subgroup_var2, + data = out, model = TRUE) +# IMPORTANT: Extract $table component for display +result_interaction <- cox2.display(fit_interaction, dec = 2)$table + +# Model with main effects only +fit_main <- coxph(Surv(time_var, status_num) ~ treatment_var + subgroup_var1 + subgroup_var2, + data = out, model = TRUE) +# IMPORTANT: Extract $table component for display +result_main <- cox2.display(fit_main, dec = 2)$table + +# Likelihood ratio test for interaction +lr_test <- anova(fit_main, fit_interaction) +# IMPORTANT: Column name is "Pr(>|Chi|)" with "Pr" not "P" +interaction_pval <- lr_test[2, "Pr(>|Chi|)"] + +if (interaction_pval < 0.05) { + message("Significant interaction detected (p = ", round(interaction_pval, 3), ")") + message("Perform stratified analyses by subgroup") +} +``` + +#### Stratified Cox Analysis (if interaction significant) + +```r +# IMPORTANT: Only run this if interaction p-value < 0.05 +# IMPORTANT: Replace variable names with your actual data + +# Separate analysis for each subgroup +# IMPORTANT: Use safe subsetting (works for both data.frame and data.table) + +# First subgroup level +# IMPORTANT: Replace 'subgroup_var1' and level names with your actual data +subset_level1 <- out[out$subgroup_var1 == "Level1", ] +if (nrow(subset_level1) < 20) { + stop("Level1 subgroup too small (n < 20). Cannot perform analysis.") +} + +# IMPORTANT: Replace variable names with your actual data +fit_level1 <- coxph(Surv(time_var, status_num) ~ treatment_var + subgroup_var2, + data = subset_level1, model = TRUE) + +# Second subgroup level +subset_level2 <- out[out$subgroup_var1 == "Level2", ] +if (nrow(subset_level2) < 20) { + stop("Level2 subgroup too small (n < 20). Cannot perform analysis.") +} + +fit_level2 <- coxph(Surv(time_var, status_num) ~ treatment_var + subgroup_var2, + data = subset_level2, model = TRUE) + +# Extract HRs from model summary (NOT from jstable) +# jstable returns character strings, we need numeric values +summary_level1 <- summary(fit_level1) +summary_level2 <- summary(fit_level2) + +# IMPORTANT: Replace 'treatment_var' with your actual treatment variable name +hr_level1 <- summary_level1$conf.int["treatment_var", "exp(coef)"] +ci_level1_lower <- summary_level1$conf.int["treatment_var", "lower .95"] +ci_level1_upper <- summary_level1$conf.int["treatment_var", "upper .95"] + +hr_level2 <- summary_level2$conf.int["treatment_var", "exp(coef)"] +ci_level2_lower <- summary_level2$conf.int["treatment_var", "lower .95"] +ci_level2_upper <- summary_level2$conf.int["treatment_var", "upper .95"] + +# Display results +message("Treatment HR in level1: ", round(hr_level1, 2), + " (95% CI: ", round(ci_level1_lower, 2), "-", round(ci_level1_upper, 2), ")") +message("Treatment HR in level2: ", round(hr_level2, 2), + " (95% CI: ", round(ci_level2_lower, 2), "-", round(ci_level2_upper, 2), ")") + +# For display with jstable (optional) - extract $table component +result_level1 <- cox2.display(fit_level1, dec = 2)$table +result_level2 <- cox2.display(fit_level2, dec = 2)$table +``` + +#### Logistic Regression with Interaction + +```r +library(jstable) + +# IMPORTANT: Replace variable names with your actual data +# - 'outcome_var' → your binary outcome (0/1) +# - 'treatment_var' → your treatment variable +# - 'subgroup_var' → your interaction variable + +# Test treatment-by-subgroup interaction +# IMPORTANT: Replace variable names with your actual data +fit_interaction <- glm(outcome_var ~ treatment_var * subgroup_var + covariate_var, + data = out, family = binomial) + +# Extract interaction p-value from model summary (NOT from glmshow.display) +summary_interaction <- summary(fit_interaction) + +# Interaction term name depends on variable types +# For factor variables: "treatment_var:subgroup_varLevel" or similar +# Check: print(rownames(summary_interaction$coefficients)) +# IMPORTANT: Replace 'treatment_var' and 'subgroup_var' with your actual variable names +interaction_term_name <- grep("treatment_var.*subgroup_var|subgroup_var.*treatment_var", + rownames(summary_interaction$coefficients), + value = TRUE)[1] + +if (is.na(interaction_term_name)) { + stop("Could not find interaction term. Check variable names and types.") +} + +interaction_pval <- summary_interaction$coefficients[interaction_term_name, "Pr(>|z|)"] + +message("Interaction p-value: ", round(interaction_pval, 4)) + +if (interaction_pval < 0.05) { + message("Significant interaction detected. Performing stratified analyses.") + + # Stratified analysis by subgroup (safe subsetting) + # IMPORTANT: Replace 'subgroup_var' and level names with your actual data + subset_level1 <- out[out$subgroup_var == "Level1", ] + subset_level2 <- out[out$subgroup_var == "Level2", ] + + # Check sufficient sample size + if (nrow(subset_level1) < 20 | nrow(subset_level2) < 20) { + warning("One or more subgroups has <20 observations.") + } + + # Fit stratified models + # IMPORTANT: Replace variable names with your actual data + fit_level1 <- glm(outcome_var ~ treatment_var + covariate_var, data = subset_level1, family = binomial) + fit_level2 <- glm(outcome_var ~ treatment_var + covariate_var, data = subset_level2, family = binomial) + + # Display with jstable + result_level1 <- glmshow.display(fit_level1, decimal = 2) + result_level2 <- glmshow.display(fit_level2, decimal = 2) + + # Extract ORs from model summaries for comparison + summary_level1 <- summary(fit_level1) + summary_level2 <- summary(fit_level2) + + # IMPORTANT: Replace 'treatment_var' with your actual treatment variable name + or_level1 <- exp(summary_level1$coefficients["treatment_var", "Estimate"]) + or_level2 <- exp(summary_level2$coefficients["treatment_var", "Estimate"]) + + message("Treatment OR in level1: ", round(or_level1, 2)) + message("Treatment OR in level2: ", round(or_level2, 2)) +} + +# Common errors and solutions: +# Error: "subscript out of bounds" → Check interaction term name with grep +# Error: "object not found" → Verify all variable names exist in data +``` + +#### Visualize Interaction with Forest Plot + +```r +# Combine stratified results for forest plot visualization +# IMPORTANT: Replace subgroup names with your actual subgroup levels +subgroups <- c("Subgroup1", "Subgroup2", "Subgroup3", "Subgroup4", "Subgroup5", "Subgroup6") +forest_data <- data.frame() + +for (subgroup in subgroups) { + # IMPORTANT: Replace 'subgroup_filter_var' with your actual subgroup filter variable + subset_data <- out[subgroup_filter_var == subgroup] + # IMPORTANT: Replace 'status_var' with your actual status variable name + subset_data$status_num <- as.integer(as.character(subset_data$status_var)) + + if (nrow(subset_data) >= 20) { + # IMPORTANT: Replace 'time_var' and 'treatment_var' with your actual variable names + fit <- coxph(Surv(time_var, status_num) ~ treatment_var, data = subset_data, model = TRUE) + cox_result <- cox2.display(fit, dec = 2) + + # IMPORTANT: Replace 'treatment_var' with your actual treatment variable name + forest_data <- rbind(forest_data, data.frame( + Subgroup = subgroup, + N = nrow(subset_data), + HR = cox_result$table["treatment_var", "HR"], + Lower = cox_result$table["treatment_var", "lower .95"], + Upper = cox_result$table["treatment_var", "upper .95"], + P = cox_result$table["treatment_var", "p.value"] + )) + } +} + +# Create forest plot (see Section 11.8) +library(forestploter) +result <- forestploter::forest(forest_data, + est = forest_data$HR, + lower = forest_data$Lower, + upper = forest_data$Upper, + ci_column = 3, + ref_line = 1, + xlab = "Hazard Ratio (95% CI)") +``` + +**Interpretation**: +- **Interaction p < 0.05**: Treatment effect differs significantly by subgroup +- **Interaction p ≥ 0.05**: No strong evidence of effect modification (report overall effect) +- **Significant interaction**: Report stratified results with jstable for each subgroup +- **Forest plots**: Visualize effect heterogeneity across subgroups (Section 11.8) + +**Important notes**: +- Test for interaction before performing subgroup analyses +- Specify subgroups a priori (avoid data-driven subgroup selection) +- Adjust for multiple testing when exploring many interactions (Section 11.11) +- Always use jstable functions (cox2.display, glmshow.display) for effect estimates + --- ## 12. Code Best Practices @@ -940,19 +2163,22 @@ library(ggpubr) ### Data Preparation ```r # Handle factor conversion for survival analysis -out$status_num <- as.integer(as.character(out$status)) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) # Create categorical variables -out$age_group <- cut(out$age, breaks = c(0, 60, Inf), +# IMPORTANT: Replace 'age_var' with your actual age variable name +out$age_group <- cut(out$age_var, breaks = c(0, 60, Inf), labels = c("<60", "≥60")) ``` ### Multiple Outputs ```r +# IMPORTANT: Replace variable names with your actual data # Create multiple plots p1 <- jskm(fit1, data = out, table = TRUE, pval = TRUE) p2 <- jskm(fit2, data = out, table = TRUE, pval = TRUE) -p3 <- ggboxplot(out, x = "rx", y = "age") +p3 <- ggboxplot(out, x = "group_var", y = "continuous_var") # Return as list → creates multiple slides in PowerPoint result <- list(p1, p2, p3) @@ -964,7 +2190,8 @@ result <- list(p1, p2, p3) library(flextable) library(officer) -fit <- glm(status ~ sex + age, data = out, family = binomial) +# IMPORTANT: Replace variable names with your actual data +fit <- glm(outcome_var ~ covariate1_var + covariate2_var, data = out, family = binomial) glm_res <- glmshow.display(fit, decimal = 2) # Create formatted table @@ -1000,11 +2227,17 @@ result <- ft # Also display in app - ✅ Always include: `jskm(fit, data = out, table = TRUE, pval = TRUE)` **Survival functions:** -- ❌ Using factor status directly: `Surv(time, status)` -- ✅ Convert to numeric: `Surv(time, as.integer(as.character(status)))` +- ❌ Using factor status directly: `Surv(time_var, status_var)` +- ❌ Inline conversion (can cause scoping errors): `Surv(time_var, as.integer(as.character(status_var)))` +- ✅ Pre-convert to numeric (recommended): + ```r + # IMPORTANT: Replace variable names with your actual data + out$status_num <- as.integer(as.character(out$status_var)) + fit <- survfit(Surv(time_var, status_num) ~ treatment_var, data = out) + ``` **cox2.display:** -- ❌ Forgetting `model = TRUE`: `coxph(Surv(time, status) ~ age, data = out)` +- ❌ Forgetting `model = TRUE`: `coxph(Surv(time_var, status_var) ~ covariate_var, data = out)` - ✅ Required for display: `coxph(..., model = TRUE)` **Regression display:** @@ -1017,10 +2250,12 @@ result <- ft # Also display in app **Always convert explicitly:** ```r # Factor to numeric (for survival analysis) -out$status_num <- as.integer(as.character(out$status)) +# IMPORTANT: Replace 'status_var' with your actual status variable name +out$status_num <- as.integer(as.character(out$status_var)) # Numeric to factor (for grouping) -out$group <- factor(out$group) +# IMPORTANT: Replace 'group_var' with your actual grouping variable name +out$group_factor <- factor(out$group_var) ``` ### Statistical Best Practices