From ca80aad374f29293048a4c69216839d2151d97c4 Mon Sep 17 00:00:00 2001
From: Hakon G <hgudbjartsson@genedx.com>
Date: Thu, 26 Jun 2025 11:10:05 +0000
Subject: [PATCH 1/3] feat(ENGKNOW-2566): support for more GT values

---
 documentation/src/command/CSVCC.rst           | 71 ++++++++++++++++++-
 .../main/scala/gorsat/Analysis/GorCsvCC.scala | 66 +++++++++++++----
 .../scala/gorsat/Analysis/GtGenAnalysis.scala |  4 +-
 3 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst
index 93daa7149..e9c6adfb7 100644
--- a/documentation/src/command/CSVCC.rst
+++ b/documentation/src/command/CSVCC.rst
@@ -7,13 +7,18 @@
 =====
 CSVCC
 =====
-The **CSVCC** command is used aggregate or count genotypes stored in horizontal CSV format.  See :ref:`CSVSEL` for more
-description of the requirements for the data input.
+The **CSVCC** command is used aggregate or count genotypes stored in horizontal values column in a CSV or fixed element size format.
+See :ref:`CSVSEL` for more description of the requirements for the data input.
 
 The aggregation is grouped according to the phenotype relation, which can be of the form (tag,status) or (tag,pheno,status).
 Typically, the relation is a binary relation with PN and case vs control status.  Affection status for multiple phenotypes
 can be represented by using the class.
 
+Note, only non-zero GTcounts are in the output, hence when this output is used with :ref:`PIVOT` the -e option should be used with zero,
+e.g. see examples below.
+
+Also, CSVCC is slower when GT values outside the range 0-6 are used and when a separator is used instead of fixed value size.
+
 Usage
 =====
 
@@ -89,6 +94,68 @@ It is also possible to perform case-control statistics for multiple phenotypes,
     | SELECT 1,2,reference,call,pheno,pVal_mm
     | PIVOT -gc reference,call pheno -v 'PhenoA,PhenoB' -e 'NA'
 
+Below are unit tests that show equivalence queries that use GROUP count and compare it with CSVCC:
+
+.. code-block:: gor
+
+    create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+
+    create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+    | multimap -cartesian <(nor [#bucket#] | rownum)
+    | calc gt 1+mod(rownum*4+pos,10)
+    | select 1,2,ref,alt,gt,pn
+    | where PN != 'PN0'
+    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+
+    gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+    | group 1 -gc ref,alt,value -count
+    | calc source 'group'
+    | rename value GT | rename allcount GTcount
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'vs'
+    | hide cc
+    )
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | replace values fsvmap(values,1,'x',',')
+    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'sep'
+    | hide cc
+    )
+    | group 1 -gc 2-source[-1] -set -dis -sc source
+    | throwif dis_source != 3
+
+An example showing GT values that are of different lengths:
+
+.. code-block:: gor
+
+    /* same creates as in previous example *
+    gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+    | replace value value*2
+    | group 1 -gc ref,alt,value -count
+    | calc source 'group'
+    | rename value GT | rename allcount GTcount
+
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | replace values fsvmap(values,1,'int(x)*2',',')
+    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'sep'
+    | hide cc
+    )
+    | group 1 -gc 2-source[-1] -set -dis -sc source
+    | throwif dis_scour != 2
+
+
 Related commands
 ----------------
 
diff --git a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala
index 06be500df..0f6fdbe8a 100644
--- a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala
+++ b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala
@@ -34,6 +34,7 @@ import org.gorpipe.gor.session.GorSession
 import org.gorpipe.model.gor.RowObj
 import org.gorpipe.model.gor.iterators.LineIterator
 
+import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
 object GorCsvCC {
@@ -77,6 +78,8 @@ object GorCsvCC {
       var splitArr: Array[SaHolder] = _
       var phenoStatusCounter: Array[Array[Int]] = _
       var phenoStatusFloatCounter: Array[Array[Double]] = _
+      var phenoExtraValues : Array[scala.collection.mutable.HashMap[String,Int]] = _
+      var hashUsed : Array[Boolean] = _
     }
 
     val useGroup: Boolean = if (grCols.nonEmpty) true else false
@@ -104,23 +107,28 @@ object GorCsvCC {
         sh.splitArr = new Array[SaHolder](maxUsedBuckets)
         sh.phenoStatusCounter = new Array[Array[Int]](maxPhenoStats)
         sh.phenoStatusFloatCounter = new Array[Array[Double]](maxPhenoStats)
+        sh.phenoExtraValues = new Array[mutable.HashMap[String, Int]](maxPhenoStats)
+        sh.hashUsed = new Array[Boolean](maxPhenoStats)
+
         var i = 0
         while (i < maxPhenoStats) {
-          sh.phenoStatusCounter(i) = new Array[Int](5)
-          sh.phenoStatusFloatCounter(i) = new Array[Double](5)
+          sh.phenoStatusCounter(i) = new Array[Int](7)
+          sh.phenoStatusFloatCounter(i) = new Array[Double](7)
+          sh.hashUsed(i) = false
+          sh.phenoExtraValues(i) = scala.collection.mutable.HashMap.empty[String, Int]
           i += 1
         }
       }
       var i = 0
       while (i < sh.buckRows.length) {
         sh.buckRows(i) = null
-        if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](100))
+        if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](1000))
         i += 1
       }
       i = 0
       while (i < maxPhenoStats) {
         var j = 0
-        while (j < 5) {
+        while (j < 7) {
           sh.phenoStatusCounter(i)(j) = 0
           sh.phenoStatusFloatCounter(i)(j) = 0.0
           j += 1
@@ -189,19 +197,45 @@ object GorCsvCC {
               } else throw new RuntimeException("Problem with input data when generating row: " + line + "\n\n")
             } else {
               var start = 0
+              var stop = 0
 
               if (valSize == -1) {
                 val sah = sh.splitArr(buckNo)
                 if (buckPos > 0) start = sah.seps(buckPos - 1) + 1
+                stop = sah.seps(buckPos)
               } else {
                 if (buckPos > 0) start = buckPos * (valSize + sepSize)
               }
               if (!use_prob) {
-                val gt = rstr.charAt(start)
-                if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1
-                else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1
-                else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1
-                else sh.phenoStatusCounter(phenostatus)(3) += 1
+                if (valSize == 1) {
+                  val gt = rstr.charAt(start)
+                  if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1
+                  else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1
+                  else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1
+                  else if (gt == '3') sh.phenoStatusCounter(phenostatus)(3) += 1
+                  else if (gt == '4') sh.phenoStatusCounter(phenostatus)(4) += 1
+                  else if (gt == '5') sh.phenoStatusCounter(phenostatus)(5) += 1
+                  else {
+                    sh.hashUsed(phenostatus) = true
+                    val key = gt.toString
+                    val newCount = sh.phenoExtraValues(phenostatus).getOrElse(key, 0) + 1
+                    sh.phenoExtraValues(phenostatus).update(key, newCount)
+                  }
+                } else {
+                  val gt = rstr.subSequence(start,stop).toString
+                  if (gt == "0") sh.phenoStatusCounter(phenostatus)(0) += 1
+                  else if (gt == "1") sh.phenoStatusCounter(phenostatus)(1) += 1
+                  else if (gt == "2") sh.phenoStatusCounter(phenostatus)(2) += 1
+                  else if (gt == "3") sh.phenoStatusCounter(phenostatus)(3) += 1
+                  else if (gt == "4") sh.phenoStatusCounter(phenostatus)(4) += 1
+                  else if (gt == "5") sh.phenoStatusCounter(phenostatus)(5) += 1
+                  else {
+                    sh.hashUsed(phenostatus) = true
+                    val newCount = sh.phenoExtraValues(phenostatus).getOrElse(gt, 0) + 1
+                    sh.phenoExtraValues(phenostatus).update(gt, newCount)
+                  }
+                }
+
               } else {
                 if (rstr.charAt(start) == ' ') {
                   sh.phenoStatusCounter(phenostatus)(3) += 1
@@ -242,21 +276,29 @@ object GorCsvCC {
             }
             phenorow += 1
           }
+
           var i = 0
           while (i < maxPhenoStats) {
             val pheno = tbpi.phenoMap(i)
             var j = 0
-            while (j < 4.max(uc + 1)) {
+            while (j < 6.max(uc + 1)) {
               if (nextProcessor.wantsNoMore) return
               if (!use_prob || use_threshold) {
                 val counts = sh.phenoStatusCounter(i)(j)
-                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts))
+                if (counts > 0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts))
               } else {
                 val counts = sh.phenoStatusFloatCounter(i)(j)
-                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts)))
+                if (counts > 0.0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts)))
               }
               j += 1
             }
+
+            if (sh.hashUsed(i)) {
+              for (key <- sh.phenoExtraValues(i).keys.toList.sorted) {
+                val counts : Int = sh.phenoExtraValues(i)(key)
+                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + key + '\t' + counts))
+              }
+            }
             i += 1
           }
         } catch {
diff --git a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala
index 3e8b03379..c61b02ee5 100644
--- a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala
+++ b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala
@@ -61,7 +61,7 @@ object GtGenAnalysis {
         sh.buckValueCols(i).setLength(buckSize)
         var j = 0
         while (j < buckSize) {
-          sh.buckValueCols(i).setCharAt(j, '4') // Unspecified, only 4 will possibly be changed to 0 and not 3.
+          sh.buckValueCols(i).setCharAt(j, '3') // Missing
           j += 1
         }
         i += 1
@@ -291,7 +291,7 @@ object GtGenAnalysis {
       }
 
       //#######
-      super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString().replace('4','3')}"))
+      super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString() }"))
 
       if ((lr.chr == lastLeftChr && maxLeftStop < leftStop) || lr.chr != lastLeftChr) maxLeftStop = leftStop
       lastLeftChr = lr.chr

From 3df010aac4504fecf1a5481d33100afb6b79d60e Mon Sep 17 00:00:00 2001
From: Hakon G <hgudbjartsson@genedx.com>
Date: Thu, 26 Jun 2025 11:16:14 +0000
Subject: [PATCH 2/3] feat(ENGKNOW-2566): support for more GT values

---
 documentation/src/command/CSVCC.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst
index e9c6adfb7..823f4b6de 100644
--- a/documentation/src/command/CSVCC.rst
+++ b/documentation/src/command/CSVCC.rst
@@ -153,7 +153,7 @@ An example showing GT values that are of different lengths:
     | hide cc
     )
     | group 1 -gc 2-source[-1] -set -dis -sc source
-    | throwif dis_scour != 2
+    | throwif dis_source != 2
 
 
 Related commands

From 293d96db08bbdd18f8cac22dde55acbe623c7a53 Mon Sep 17 00:00:00 2001
From: Gisli Magnusson <gmagnusson@genedx.com>
Date: Thu, 26 Jun 2025 18:16:27 +0000
Subject: [PATCH 3/3] fix(ENGKNOW-2566):  Add tests.

---
 gortools/src/test/java/gorsat/UTestCSVCC.java | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/gortools/src/test/java/gorsat/UTestCSVCC.java b/gortools/src/test/java/gorsat/UTestCSVCC.java
index 35b9f7686..06c9f7d9b 100644
--- a/gortools/src/test/java/gorsat/UTestCSVCC.java
+++ b/gortools/src/test/java/gorsat/UTestCSVCC.java
@@ -91,4 +91,76 @@ public void automaticUnzipOfValues() {
 
         Assert.assertEquals(normalLines, inflatedLines);
     }
+
+    @Test
+    public void testCSVCCvsGroup() {
+        TestUtils.runGorPipe("""
+                create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+                
+                create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+                    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+                    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+                    | multimap -cartesian <(nor [#bucket#] | rownum)
+                    | calc gt 1+mod(rownum*4+pos,10)
+                    | select 1,2,ref,alt,gt,pn
+                    | where PN != 'PN0'
+                    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+                    
+                    gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+                    | group 1 -gc ref,alt,value -count
+                    | calc source 'group'
+                    | rename value GT | rename allcount GTcount
+                    | merge <(gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                    | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                    | calc source 'vs'
+                    | hide cc
+                    )
+                    
+                    | merge <(gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                    | replace values fsvmap(values,1,'x',',')
+                    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                    | calc source 'sep'
+                    | hide cc
+                    )
+                    | group 1 -gc 2-source[-1] -set -dis -sc source
+                    | throwif dis_source != 3
+                """ );
+    }
+
+    @Test
+    public void testGTValuesOfDifferentLengths() {
+        TestUtils.runGorPipe("""
+                 create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+                
+                create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+                    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+                    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+                    | multimap -cartesian <(nor [#bucket#] | rownum)
+                    | calc gt 1+mod(rownum*4+pos,10)
+                    | select 1,2,ref,alt,gt,pn
+                    | where PN != 'PN0'
+                    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+                    
+                gor [#gt#]
+                | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+                | replace value value*2
+                | group 1 -gc ref,alt,value -count
+                | calc source 'group'
+                | rename value GT | rename allcount GTcount
+                | merge <(gor [#gt#]
+                | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                | replace values fsvmap(values,1,'int(x)*2',',')
+                | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                | calc source 'sep'
+                | hide cc
+                )
+                | group 1 -gc 2-source[-1] -set -dis -sc source
+                | throwif dis_source != 2                    
+                    
+                """ );
+    }
+
 }