From ca80aad374f29293048a4c69216839d2151d97c4 Mon Sep 17 00:00:00 2001 From: Hakon G Date: Thu, 26 Jun 2025 11:10:05 +0000 Subject: [PATCH 1/3] feat(ENGKNOW-2566): support for more GT values --- documentation/src/command/CSVCC.rst | 71 ++++++++++++++++++- .../main/scala/gorsat/Analysis/GorCsvCC.scala | 66 +++++++++++++---- .../scala/gorsat/Analysis/GtGenAnalysis.scala | 4 +- 3 files changed, 125 insertions(+), 16 deletions(-) diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst index 93daa7149..e9c6adfb7 100644 --- a/documentation/src/command/CSVCC.rst +++ b/documentation/src/command/CSVCC.rst @@ -7,13 +7,18 @@ ===== CSVCC ===== -The **CSVCC** command is used aggregate or count genotypes stored in horizontal CSV format. See :ref:`CSVSEL` for more -description of the requirements for the data input. +The **CSVCC** command is used aggregate or count genotypes stored in horizontal values column in a CSV or fixed element size format. +See :ref:`CSVSEL` for more description of the requirements for the data input. The aggregation is grouped according to the phenotype relation, which can be of the form (tag,status) or (tag,pheno,status). Typically, the relation is a binary relation with PN and case vs control status. Affection status for multiple phenotypes can be represented by using the class. +Note, only non-zero GTcounts are in the output, hence when this output is used with :ref:`PIVOT` the -e option should be used with zero, +e.g. see examples below. + +Also, CSVCC is slower when GT values outside the range 0-6 are used and when a separator is used instead of fixed value size. + Usage ===== @@ -89,6 +94,68 @@ It is also possible to perform case-control statistics for multiple phenotypes, | SELECT 1,2,reference,call,pheno,pVal_mm | PIVOT -gc reference,call pheno -v 'PhenoA,PhenoB' -e 'NA' +Below are unit tests that show equivalence queries that use GROUP count and compare it with CSVCC: + +.. code-block:: gor + + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'vs' + | hide cc + ) + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'x',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 3 + +An example showing GT values that are of different lengths: + +.. code-block:: gor + + /* same creates as in previous example * + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | replace value value*2 + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'int(x)*2',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_scour != 2 + + Related commands ---------------- diff --git a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala index 06be500df..0f6fdbe8a 100644 --- a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala +++ b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala @@ -34,6 +34,7 @@ import org.gorpipe.gor.session.GorSession import org.gorpipe.model.gor.RowObj import org.gorpipe.model.gor.iterators.LineIterator +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer object GorCsvCC { @@ -77,6 +78,8 @@ object GorCsvCC { var splitArr: Array[SaHolder] = _ var phenoStatusCounter: Array[Array[Int]] = _ var phenoStatusFloatCounter: Array[Array[Double]] = _ + var phenoExtraValues : Array[scala.collection.mutable.HashMap[String,Int]] = _ + var hashUsed : Array[Boolean] = _ } val useGroup: Boolean = if (grCols.nonEmpty) true else false @@ -104,23 +107,28 @@ object GorCsvCC { sh.splitArr = new Array[SaHolder](maxUsedBuckets) sh.phenoStatusCounter = new Array[Array[Int]](maxPhenoStats) sh.phenoStatusFloatCounter = new Array[Array[Double]](maxPhenoStats) + sh.phenoExtraValues = new Array[mutable.HashMap[String, Int]](maxPhenoStats) + sh.hashUsed = new Array[Boolean](maxPhenoStats) + var i = 0 while (i < maxPhenoStats) { - sh.phenoStatusCounter(i) = new Array[Int](5) - sh.phenoStatusFloatCounter(i) = new Array[Double](5) + sh.phenoStatusCounter(i) = new Array[Int](7) + sh.phenoStatusFloatCounter(i) = new Array[Double](7) + sh.hashUsed(i) = false + sh.phenoExtraValues(i) = scala.collection.mutable.HashMap.empty[String, Int] i += 1 } } var i = 0 while (i < sh.buckRows.length) { sh.buckRows(i) = null - if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](100)) + if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](1000)) i += 1 } i = 0 while (i < maxPhenoStats) { var j = 0 - while (j < 5) { + while (j < 7) { sh.phenoStatusCounter(i)(j) = 0 sh.phenoStatusFloatCounter(i)(j) = 0.0 j += 1 @@ -189,19 +197,45 @@ object GorCsvCC { } else throw new RuntimeException("Problem with input data when generating row: " + line + "\n\n") } else { var start = 0 + var stop = 0 if (valSize == -1) { val sah = sh.splitArr(buckNo) if (buckPos > 0) start = sah.seps(buckPos - 1) + 1 + stop = sah.seps(buckPos) } else { if (buckPos > 0) start = buckPos * (valSize + sepSize) } if (!use_prob) { - val gt = rstr.charAt(start) - if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1 - else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1 - else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1 - else sh.phenoStatusCounter(phenostatus)(3) += 1 + if (valSize == 1) { + val gt = rstr.charAt(start) + if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1 + else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1 + else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1 + else if (gt == '3') sh.phenoStatusCounter(phenostatus)(3) += 1 + else if (gt == '4') sh.phenoStatusCounter(phenostatus)(4) += 1 + else if (gt == '5') sh.phenoStatusCounter(phenostatus)(5) += 1 + else { + sh.hashUsed(phenostatus) = true + val key = gt.toString + val newCount = sh.phenoExtraValues(phenostatus).getOrElse(key, 0) + 1 + sh.phenoExtraValues(phenostatus).update(key, newCount) + } + } else { + val gt = rstr.subSequence(start,stop).toString + if (gt == "0") sh.phenoStatusCounter(phenostatus)(0) += 1 + else if (gt == "1") sh.phenoStatusCounter(phenostatus)(1) += 1 + else if (gt == "2") sh.phenoStatusCounter(phenostatus)(2) += 1 + else if (gt == "3") sh.phenoStatusCounter(phenostatus)(3) += 1 + else if (gt == "4") sh.phenoStatusCounter(phenostatus)(4) += 1 + else if (gt == "5") sh.phenoStatusCounter(phenostatus)(5) += 1 + else { + sh.hashUsed(phenostatus) = true + val newCount = sh.phenoExtraValues(phenostatus).getOrElse(gt, 0) + 1 + sh.phenoExtraValues(phenostatus).update(gt, newCount) + } + } + } else { if (rstr.charAt(start) == ' ') { sh.phenoStatusCounter(phenostatus)(3) += 1 @@ -242,21 +276,29 @@ object GorCsvCC { } phenorow += 1 } + var i = 0 while (i < maxPhenoStats) { val pheno = tbpi.phenoMap(i) var j = 0 - while (j < 4.max(uc + 1)) { + while (j < 6.max(uc + 1)) { if (nextProcessor.wantsNoMore) return if (!use_prob || use_threshold) { val counts = sh.phenoStatusCounter(i)(j) - nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts)) + if (counts > 0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts)) } else { val counts = sh.phenoStatusFloatCounter(i)(j) - nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts))) + if (counts > 0.0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts))) } j += 1 } + + if (sh.hashUsed(i)) { + for (key <- sh.phenoExtraValues(i).keys.toList.sorted) { + val counts : Int = sh.phenoExtraValues(i)(key) + nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + key + '\t' + counts)) + } + } i += 1 } } catch { diff --git a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala index 3e8b03379..c61b02ee5 100644 --- a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala +++ b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala @@ -61,7 +61,7 @@ object GtGenAnalysis { sh.buckValueCols(i).setLength(buckSize) var j = 0 while (j < buckSize) { - sh.buckValueCols(i).setCharAt(j, '4') // Unspecified, only 4 will possibly be changed to 0 and not 3. + sh.buckValueCols(i).setCharAt(j, '3') // Missing j += 1 } i += 1 @@ -291,7 +291,7 @@ object GtGenAnalysis { } //####### - super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString().replace('4','3')}")) + super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString() }")) if ((lr.chr == lastLeftChr && maxLeftStop < leftStop) || lr.chr != lastLeftChr) maxLeftStop = leftStop lastLeftChr = lr.chr From 3df010aac4504fecf1a5481d33100afb6b79d60e Mon Sep 17 00:00:00 2001 From: Hakon G Date: Thu, 26 Jun 2025 11:16:14 +0000 Subject: [PATCH 2/3] feat(ENGKNOW-2566): support for more GT values --- documentation/src/command/CSVCC.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst index e9c6adfb7..823f4b6de 100644 --- a/documentation/src/command/CSVCC.rst +++ b/documentation/src/command/CSVCC.rst @@ -153,7 +153,7 @@ An example showing GT values that are of different lengths: | hide cc ) | group 1 -gc 2-source[-1] -set -dis -sc source - | throwif dis_scour != 2 + | throwif dis_source != 2 Related commands From 293d96db08bbdd18f8cac22dde55acbe623c7a53 Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 26 Jun 2025 18:16:27 +0000 Subject: [PATCH 3/3] fix(ENGKNOW-2566): Add tests. --- gortools/src/test/java/gorsat/UTestCSVCC.java | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/gortools/src/test/java/gorsat/UTestCSVCC.java b/gortools/src/test/java/gorsat/UTestCSVCC.java index 35b9f7686..06c9f7d9b 100644 --- a/gortools/src/test/java/gorsat/UTestCSVCC.java +++ b/gortools/src/test/java/gorsat/UTestCSVCC.java @@ -91,4 +91,76 @@ public void automaticUnzipOfValues() { Assert.assertEquals(normalLines, inflatedLines); } + + @Test + public void testCSVCCvsGroup() { + TestUtils.runGorPipe(""" + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'vs' + | hide cc + ) + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'x',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 3 + """ ); + } + + @Test + public void testGTValuesOfDifferentLengths() { + TestUtils.runGorPipe(""" + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | replace value value*2 + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'int(x)*2',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 2 + + """ ); + } + }