From 67871a19654230208acfced9c3eba067071c3cbc Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 08:09:30 -0600 Subject: [PATCH 1/8] update github action --- .github/workflows/R-CMD-check.yaml | 5 +---- .github/workflows/test-coverage.yaml | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 32c3e56..e5003bf 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -39,10 +39,7 @@ jobs: - name: Install R dependencies uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: rcmdcheck, keras3 # Add keras to the list of extra packages - - - name: Install Keras Python dependencies - run: R -e 'keras3::install_keras()' # This runs the install_keras function from R + extra-packages: rcmdcheck - uses: r-lib/actions/check-r-package@v2 with: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 9ee1f3d..de06a69 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -25,12 +25,9 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr, any::xml2, rcmdcheck, keras3 + extra-packages: any::covr, any::xml2, rcmdcheck needs: coverage - - name: Install Keras Python dependencies - run: R -e 'keras3::install_keras()' # This runs the install_keras function from R - - name: Test coverage run: | cov <- covr::package_coverage( From a4cafc397c6f4e32a1f1d80b5d46e5dbbcaf62ae Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 08:36:15 -0600 Subject: [PATCH 2/8] removing basilisk --- .github/workflows/R-CMD-check.yaml | 12 +++++++----- DESCRIPTION | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index e5003bf..ef68b86 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,5 +1,4 
@@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples on: push: branches: [main, master] @@ -24,7 +23,6 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes - steps: - uses: actions/checkout@v3 @@ -39,8 +37,12 @@ jobs: - name: Install R dependencies uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: rcmdcheck - + extra-packages: any::rcmdcheck + needs: check + - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true + # If the vignette continues to hang, uncomment the lines below to skip it temporarily + # args: 'c("--no-vignettes", "--no-build-vignettes")' + # build_args: 'c("--no-build-vignettes")' \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0592d67..fee8c90 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,6 @@ Suggests: testthat, tidygraph, viridis -SystemRequirements: Python (via basilisk) LinkingTo: Rcpp VignetteBuilder: knitr From e723113f569057b370775e9d51df1173988a7f80 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 08:46:40 -0600 Subject: [PATCH 3/8] no eval of getIMGT --- vignettes/immApex.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index afe09c1..6f1883f 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -58,7 +58,7 @@ Parameters for ```getIMGT()``` Here, we will use the ```getIMGT()``` function to get the amino acid sequences for the TRBV region to get all the sequences by V gene allele. 
-```{r, eval=knitr::is_html_output()} +```{r, eval=F} # Function to check IMGT website availability is_imgt_available <- function() { tryCatch({ From 15b192600641d8e238c413f261d4aa0f7f4c9319 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 08:46:50 -0600 Subject: [PATCH 4/8] Update immApex.Rmd --- vignettes/immApex.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index 6f1883f..6bbabb3 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -121,7 +121,7 @@ Parameters for ```inferCDR``` * **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide * **sequences** The specific regions of the CDR loop to get from the data. -```{r } +```{r eval = FALSE} if (is_imgt_available()) { Adaptive_example <- inferCDR(Adaptive_example, chain = "TRB", From 045cc5a2c83ca864667a96c39d1662eecf741b8a Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 09:46:18 -0600 Subject: [PATCH 5/8] updating vignette for eval --- man/buildNetwork.Rd | 4 +--- vignettes/immApex.Rmd | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/man/buildNetwork.Rd b/man/buildNetwork.Rd index fa6d7c8..4a26d5e 100644 --- a/man/buildNetwork.Rd +++ b/man/buildNetwork.Rd @@ -60,9 +60,7 @@ to use for alignment-based metrics (`"nw"`, `"sw"`). 
Options include: \item{`"PAM70"`} - PAM70 matrix \item{`"PAM120"`} - PAM120 matrix \item{`"PAM250"`} - PAM250 matrix (distantly related) - \item{`"identity"`} - Simple identity matrix (match=1, mismatch=-1) - } -Or provide a custom numeric matrix with row/column names as amino acid codes.} + }} \item{normalize}{Character string specifying how to normalize distances: \itemize{ diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index 6bbabb3..73f6c4d 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -58,7 +58,7 @@ Parameters for ```getIMGT()``` Here, we will use the ```getIMGT()``` function to get the amino acid sequences for the TRBV region to get all the sequences by V gene allele. -```{r, eval=F} +```{r, eval=FALSE} # Function to check IMGT website availability is_imgt_available <- function() { tryCatch({ @@ -121,7 +121,7 @@ Parameters for ```inferCDR``` * **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide * **sequences** The specific regions of the CDR loop to get from the data. -```{r eval = FALSE} +```{r eval=FALSE} if (is_imgt_available()) { Adaptive_example <- inferCDR(Adaptive_example, chain = "TRB", From 3880a391ba6d881534bd4aaf535a35ead32bb521 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 10:10:51 -0600 Subject: [PATCH 6/8] Update immApex.Rmd --- vignettes/immApex.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index 73f6c4d..dfe0f46 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -58,11 +58,11 @@ Parameters for ```getIMGT()``` Here, we will use the ```getIMGT()``` function to get the amino acid sequences for the TRBV region to get all the sequences by V gene allele. 
-```{r, eval=FALSE} +```{r} # Function to check IMGT website availability is_imgt_available <- function() { tryCatch({ - r <- httr::HEAD("https://www.imgt.org", timeout(5)) + r <- httr::HEAD("https://www.imgt.org", httr::timeout(5)) httr::status_code(r) == 200 }, error = function(e) { FALSE @@ -121,7 +121,7 @@ Parameters for ```inferCDR``` * **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide * **sequences** The specific regions of the CDR loop to get from the data. -```{r eval=FALSE} +```{r} if (is_imgt_available()) { Adaptive_example <- inferCDR(Adaptive_example, chain = "TRB", From 79c3db83180904b07e652a320d3309fd75bb6805 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 12:16:22 -0600 Subject: [PATCH 7/8] Update immApex.Rmd removing imgt-dependent vignette --- vignettes/immApex.Rmd | 69 ------------------------------------------- 1 file changed, 69 deletions(-) diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index dfe0f46..620325c 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -44,47 +44,6 @@ suppressMessages(library(ggraph)) # Acquiring and Preparing Repertoire Data -## getIMGT - -Depending on the sequencing technology and the version, we might want to expand the length of our sequence embedding approach. The first step in the process is pulling the reference sequences from the ImMunoGeneTics (IMGT) system using ```getIMGT()```. More information for IMGT can be found at [imgt.org](https://www.imgt.org/). Data from IMGT is under a CC BY-NC-ND 4.0 license. Please be aware that attribution is required for usage and should not be used to create commercial or derivative work. - -Parameters for ```getIMGT()``` - -* **species** One or two word designation of species. 
Currently supporting: "human", "mouse", "rat", "rabbit", "rhesus monkey", "sheep", "pig", "platypus", "alpaca", "dog", "chicken", and "ferret" -* **chain** Sequence chain to access -* **frame** Designation for "all", "inframe" or "inframe+gap" -* **region** Sequence gene loci to access -* **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide - -Here, we will use the ```getIMGT()``` function to get the amino acid sequences for the TRBV region to get all the sequences by V gene allele. - -```{r} -# Function to check IMGT website availability -is_imgt_available <- function() { - tryCatch({ - r <- httr::HEAD("https://www.imgt.org", httr::timeout(5)) - httr::status_code(r) == 200 - }, error = function(e) { - FALSE - }) -} - -# Run getIMGT only if the website is available -if (is_imgt_available()) { - TRBV_aa <- getIMGT(species = "human", - chain = "TRB", - frame = "inframe", - region = "v", - sequence.type = "aa") - - # Display first sequence as an example - TRBV_aa[[1]][1] -} else { - # Display a message if IMGT is not available - "IMGT website is not accessible at the moment." -} -``` - ## formatGenes Immune receptor nomenclature can be highly variable across sequencing platforms. When preparing data for models, we can use ```formatGenes()``` to universalize the gene formats into IMGT nomenclature. @@ -109,34 +68,6 @@ Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]], head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ``` -## inferCDR - -We can now use ```inferCDR()``` to add additional sequence elements to our example data using the outputs of ```formatGenes()``` and ```getIMGT()```. Here, we will use the function to isolate the complementarity-determining regions (CDR) 1 and 2. If the gene nomenclature does not match the IMGT the result will be NA for the given sequences. Likewise, if the IMGT nomenclature has been simplified, the first allelic match will be used for sequence extraction. 
- -Parameters for ```inferCDR``` - -* **input.data** Data frame of sequencing data or output from formatGenes(). -* **reference** IMGT sequences from ```getIMGT()``` -* **technology** The sequencing technology employed - 'TenX', "Adaptive', or 'AIRR', -* **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide -* **sequences** The specific regions of the CDR loop to get from the data. - -```{r} -if (is_imgt_available()) { - Adaptive_example <- inferCDR(Adaptive_example, - chain = "TRB", - reference = TRBV_aa, - technology = "Adaptive", - sequence.type = "aa", - sequences = c("CDR1", "CDR2")) - - Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")] -} else { - # Display a message if IMGT is not available - "IMGT website is not accessible at the moment." -} -``` - # Generating and Augmenting Sequence Sets ## generateSequences From cc2948b63c305064e8fcd05d8e95295572825682 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 24 Nov 2025 12:32:05 -0600 Subject: [PATCH 8/8] reduce computational footprint of vignette --- vignettes/immApex.Rmd | 79 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/vignettes/immApex.Rmd b/vignettes/immApex.Rmd index 620325c..9f12090 100644 --- a/vignettes/immApex.Rmd +++ b/vignettes/immApex.Rmd @@ -44,6 +44,47 @@ suppressMessages(library(ggraph)) # Acquiring and Preparing Repertoire Data +## getIMGT + +Depending on the sequencing technology and the version, we might want to expand the length of our sequence embedding approach. The first step in the process is pulling the reference sequences from the ImMunoGeneTics (IMGT) system using ```getIMGT()```. More information for IMGT can be found at [imgt.org](https://www.imgt.org/). Data from IMGT is under a CC BY-NC-ND 4.0 license. Please be aware that attribution is required for usage and should not be used to create commercial or derivative work. 
+ +Parameters for ```getIMGT()``` + +* **species** One or two word designation of species. Currently supporting: "human", "mouse", "rat", "rabbit", "rhesus monkey", "sheep", "pig", "platypus", "alpaca", "dog", "chicken", and "ferret" +* **chain** Sequence chain to access +* **frame** Designation for "all", "inframe" or "inframe+gap" +* **region** Sequence gene loci to access +* **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide + +Here, we will use the ```getIMGT()``` function to retrieve the amino acid sequences for the TRBV region, returning all the sequences by V gene allele. + +```{r} +# Function to check IMGT website availability +is_imgt_available <- function() { + tryCatch({ + r <- httr::HEAD("https://www.imgt.org", httr::timeout(5)) + httr::status_code(r) == 200 + }, error = function(e) { + FALSE + }) +} + +# Run getIMGT only if the website is available +if (is_imgt_available()) { + TRBV_aa <- getIMGT(species = "human", + chain = "TRB", + frame = "inframe", + region = "v", + sequence.type = "aa") + + # Display first sequence as an example + TRBV_aa[[1]][1] +} else { + # Display a message if IMGT is not available + "IMGT website is not accessible at the moment." +} +``` + ## formatGenes Immune receptor nomenclature can be highly variable across sequencing platforms. When preparing data for models, we can use ```formatGenes()``` to universalize the gene formats into IMGT nomenclature. @@ -68,6 +109,34 @@ Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]], head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ``` +## inferCDR + +We can now use ```inferCDR()``` to add additional sequence elements to our example data using the outputs of ```formatGenes()``` and ```getIMGT()```. Here, we will use the function to isolate the complementarity-determining regions (CDR) 1 and 2. If the gene nomenclature does not match the IMGT nomenclature, the result will be NA for the given sequences. 
Likewise, if the IMGT nomenclature has been simplified, the first allelic match will be used for sequence extraction. + +Parameters for ```inferCDR()``` + +* **input.data** Data frame of sequencing data or output from ```formatGenes()```. +* **reference** IMGT sequences from ```getIMGT()``` +* **technology** The sequencing technology employed - "TenX", "Adaptive", or "AIRR" +* **sequence.type** Type of sequence - "aa" for amino acid or "nt" for nucleotide +* **sequences** The specific regions of the CDR loop to get from the data. + +```{r} +if (is_imgt_available()) { + Adaptive_example <- inferCDR(Adaptive_example, + chain = "TRB", + reference = TRBV_aa, + technology = "Adaptive", + sequence.type = "aa", + sequences = c("CDR1", "CDR2")) + + Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")] +} else { + # Display a message if IMGT is not available + "IMGT website is not accessible at the moment." +} +``` + # Generating and Augmenting Sequence Sets ## generateSequences @@ -86,7 +155,7 @@ Parameters for ```generateSequences()``` ```{r } sequences <- generateSequences(prefix.motif = "CAS", suffix.motif = "YF", - number.of.sequences = 1000, + number.of.sequences = 200, min.length = 8, max.length = 16) sequences <- unique(sequences) @@ -96,7 +165,7 @@ head(sequences) If we want to generate nucleotide sequences instead of amino acids, we must to change the **sequence.dictionary**. 
```{r } -nucleotide.sequences <- generateSequences(number.of.sequences = 1000, +nucleotide.sequences <- generateSequences(number.of.sequences = 200, min.length = 8, max.length = 16, sequence.dictionary = c("A", "C", "T", "G")) @@ -506,15 +575,15 @@ First, we'll simulate two distinct classes of sequences using ```generateSequenc # Step 1a: Generate two distinct classes of sequences class1.sequences <- generateSequences(prefix.motif = "CAS", min.length = 3, - number.of.sequences = 500) + number.of.sequences = 250) class2.sequences <- generateSequences(prefix.motif = "CSG", min.length = 3, - number.of.sequences = 500) + number.of.sequences = 250) # Combine sequences and create labels all.sequences <- c(class1.sequences, class2.sequences) -labels <- as.factor(c(rep("Class1", 500), rep("Class2", 500))) +labels <- as.factor(c(rep("Class1", 250), rep("Class2", 250))) # Step 1b: Use propertyEncoder to create a feature matrix from Atchley factors feature.matrix <- propertyEncoder(all.sequences,