gorpipe · gmagnu · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025
diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst
@@ -7,13 +7,18 @@
 =====
 CSVCC
 =====
-The **CSVCC** command is used aggregate or count genotypes stored in horizontal CSV format.  See :ref:`CSVSEL` for more
-description of the requirements for the data input.
+The **CSVCC** command is used aggregate or count genotypes stored in horizontal values column in a CSV or fixed element size format.
+See :ref:`CSVSEL` for more description of the requirements for the data input.
 
 The aggregation is grouped according to the phenotype relation, which can be of the form (tag,status) or (tag,pheno,status).
 Typically, the relation is a binary relation with PN and case vs control status.  Affection status for multiple phenotypes
 can be represented by using the class.
 
+Note, only non-zero GTcounts are in the output, hence when this output is used with :ref:`PIVOT` the -e option should be used with zero,
+e.g. see examples below.
+
+Also, CSVCC is slower when GT values outside the range 0-6 are used and when a separator is used instead of fixed value size.
+
 Usage
 =====
 
@@ -89,6 +94,68 @@ It is also possible to perform case-control statistics for multiple phenotypes,
     | SELECT 1,2,reference,call,pheno,pVal_mm
     | PIVOT -gc reference,call pheno -v 'PhenoA,PhenoB' -e 'NA'
 
+Below are unit tests that show equivalence queries that use GROUP count and compare it with CSVCC:
+
+.. code-block:: gor
+
+    create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+
+    create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+    | multimap -cartesian <(nor [#bucket#] | rownum)
+    | calc gt 1+mod(rownum*4+pos,10)
+    | select 1,2,ref,alt,gt,pn
+    | where PN != 'PN0'
+    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+
+    gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+    | group 1 -gc ref,alt,value -count
+    | calc source 'group'
+    | rename value GT | rename allcount GTcount
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'vs'
+    | hide cc
+    )
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | replace values fsvmap(values,1,'x',',')
+    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'sep'
+    | hide cc
+    )
+    | group 1 -gc 2-source[-1] -set -dis -sc source
+    | throwif dis_source != 3
+
+An example showing GT values that are of different lengths:
+
+.. code-block:: gor
+
+    /* same creates as in previous example *
+    gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+    | replace value value*2
+    | group 1 -gc ref,alt,value -count
+    | calc source 'group'
+    | rename value GT | rename allcount GTcount
+
+
+    | merge <(gor [#gt#]
+    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+    | replace values fsvmap(values,1,'int(x)*2',',')
+    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+    | calc source 'sep'
+    | hide cc
+    )
+    | group 1 -gc 2-source[-1] -set -dis -sc source
+    | throwif dis_source != 2
+
+
 Related commands
 ----------------
 

diff --git a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala
@@ -34,6 +34,7 @@ import org.gorpipe.gor.session.GorSession
 import org.gorpipe.model.gor.RowObj
 import org.gorpipe.model.gor.iterators.LineIterator
 
+import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
 object GorCsvCC {
@@ -77,6 +78,8 @@ object GorCsvCC {
       var splitArr: Array[SaHolder] = _
       var phenoStatusCounter: Array[Array[Int]] = _
       var phenoStatusFloatCounter: Array[Array[Double]] = _
+      var phenoExtraValues : Array[scala.collection.mutable.HashMap[String,Int]] = _
+      var hashUsed : Array[Boolean] = _
     }
 
     val useGroup: Boolean = if (grCols.nonEmpty) true else false
@@ -104,23 +107,28 @@ object GorCsvCC {
         sh.splitArr = new Array[SaHolder](maxUsedBuckets)
         sh.phenoStatusCounter = new Array[Array[Int]](maxPhenoStats)
         sh.phenoStatusFloatCounter = new Array[Array[Double]](maxPhenoStats)
+        sh.phenoExtraValues = new Array[mutable.HashMap[String, Int]](maxPhenoStats)
+        sh.hashUsed = new Array[Boolean](maxPhenoStats)
+
         var i = 0
         while (i < maxPhenoStats) {
-          sh.phenoStatusCounter(i) = new Array[Int](5)
-          sh.phenoStatusFloatCounter(i) = new Array[Double](5)
+          sh.phenoStatusCounter(i) = new Array[Int](7)
+          sh.phenoStatusFloatCounter(i) = new Array[Double](7)
+          sh.hashUsed(i) = false
+          sh.phenoExtraValues(i) = scala.collection.mutable.HashMap.empty[String, Int]
           i += 1
         }
       }
       var i = 0
       while (i < sh.buckRows.length) {
         sh.buckRows(i) = null
-        if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](100))
+        if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](1000))
         i += 1
       }
       i = 0
       while (i < maxPhenoStats) {
         var j = 0
-        while (j < 5) {
+        while (j < 7) {
           sh.phenoStatusCounter(i)(j) = 0
           sh.phenoStatusFloatCounter(i)(j) = 0.0
           j += 1
@@ -189,19 +197,45 @@ object GorCsvCC {
               } else throw new RuntimeException("Problem with input data when generating row: " + line + "\n\n")
             } else {
               var start = 0
+              var stop = 0
 
               if (valSize == -1) {
                 val sah = sh.splitArr(buckNo)
                 if (buckPos > 0) start = sah.seps(buckPos - 1) + 1
+                stop = sah.seps(buckPos)
               } else {
                 if (buckPos > 0) start = buckPos * (valSize + sepSize)
               }
               if (!use_prob) {
-                val gt = rstr.charAt(start)
-                if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1
-                else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1
-                else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1
-                else sh.phenoStatusCounter(phenostatus)(3) += 1
+                if (valSize == 1) {
+                  val gt = rstr.charAt(start)
+                  if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1
+                  else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1
+                  else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1
+                  else if (gt == '3') sh.phenoStatusCounter(phenostatus)(3) += 1
+                  else if (gt == '4') sh.phenoStatusCounter(phenostatus)(4) += 1
+                  else if (gt == '5') sh.phenoStatusCounter(phenostatus)(5) += 1
+                  else {
+                    sh.hashUsed(phenostatus) = true
+                    val key = gt.toString
+                    val newCount = sh.phenoExtraValues(phenostatus).getOrElse(key, 0) + 1
+                    sh.phenoExtraValues(phenostatus).update(key, newCount)
+                  }
+                } else {
+                  val gt = rstr.subSequence(start,stop).toString
+                  if (gt == "0") sh.phenoStatusCounter(phenostatus)(0) += 1
+                  else if (gt == "1") sh.phenoStatusCounter(phenostatus)(1) += 1
+                  else if (gt == "2") sh.phenoStatusCounter(phenostatus)(2) += 1
+                  else if (gt == "3") sh.phenoStatusCounter(phenostatus)(3) += 1
+                  else if (gt == "4") sh.phenoStatusCounter(phenostatus)(4) += 1
+                  else if (gt == "5") sh.phenoStatusCounter(phenostatus)(5) += 1
+                  else {
+                    sh.hashUsed(phenostatus) = true
+                    val newCount = sh.phenoExtraValues(phenostatus).getOrElse(gt, 0) + 1
+                    sh.phenoExtraValues(phenostatus).update(gt, newCount)
+                  }
+                }
+
               } else {
                 if (rstr.charAt(start) == ' ') {
                   sh.phenoStatusCounter(phenostatus)(3) += 1
@@ -242,21 +276,29 @@ object GorCsvCC {
             }
             phenorow += 1
           }
+
           var i = 0
           while (i < maxPhenoStats) {
             val pheno = tbpi.phenoMap(i)
             var j = 0
-            while (j < 4.max(uc + 1)) {
+            while (j < 6.max(uc + 1)) {
               if (nextProcessor.wantsNoMore) return
               if (!use_prob || use_threshold) {
                 val counts = sh.phenoStatusCounter(i)(j)
-                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts))
+                if (counts > 0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts))
               } else {
                 val counts = sh.phenoStatusFloatCounter(i)(j)
-                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts)))
+                if (counts > 0.0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts)))
               }
               j += 1
             }
+
+            if (sh.hashUsed(i)) {
+              for (key <- sh.phenoExtraValues(i).keys.toList.sorted) {
+                val counts : Int = sh.phenoExtraValues(i)(key)
+                nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + key + '\t' + counts))
+              }
+            }
             i += 1
           }
         } catch {

diff --git a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala
@@ -61,7 +61,7 @@ object GtGenAnalysis {
         sh.buckValueCols(i).setLength(buckSize)
         var j = 0
         while (j < buckSize) {
-          sh.buckValueCols(i).setCharAt(j, '4') // Unspecified, only 4 will possibly be changed to 0 and not 3.
+          sh.buckValueCols(i).setCharAt(j, '3') // Missing
           j += 1
         }
         i += 1
@@ -291,7 +291,7 @@ object GtGenAnalysis {
       }
 
       //#######
-      super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString().replace('4','3')}"))
+      super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString() }"))
 
       if ((lr.chr == lastLeftChr && maxLeftStop < leftStop) || lr.chr != lastLeftChr) maxLeftStop = leftStop
       lastLeftChr = lr.chr

diff --git a/gortools/src/test/java/gorsat/UTestCSVCC.java b/gortools/src/test/java/gorsat/UTestCSVCC.java
@@ -91,4 +91,76 @@ public void automaticUnzipOfValues() {
 
         Assert.assertEquals(normalLines, inflatedLines);
     }
+
+    @Test
+    public void testCSVCCvsGroup() {
+        TestUtils.runGorPipe("""
+                create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+
+                create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+                    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+                    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+                    | multimap -cartesian <(nor [#bucket#] | rownum)
+                    | calc gt 1+mod(rownum*4+pos,10)
+                    | select 1,2,ref,alt,gt,pn
+                    | where PN != 'PN0'
+                    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+
+                    gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+                    | group 1 -gc ref,alt,value -count
+                    | calc source 'group'
+                    | rename value GT | rename allcount GTcount
+                    | merge <(gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                    | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                    | calc source 'vs'
+                    | hide cc
+                    )
+
+                    | merge <(gor [#gt#]
+                    | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                    | replace values fsvmap(values,1,'x',',')
+                    | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                    | calc source 'sep'
+                    | hide cc
+                    )
+                    | group 1 -gc 2-source[-1] -set -dis -sc source
+                    | throwif dis_source != 3
+                """ );
+    }
+
+    @Test
+    public void testGTValuesOfDifferentLengths() {
+        TestUtils.runGorPipe("""
+                 create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket;
+
+                create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C'
+                    | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T')
+                    | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T')
+                    | multimap -cartesian <(nor [#bucket#] | rownum)
+                    | calc gt 1+mod(rownum*4+pos,10)
+                    | select 1,2,ref,alt,gt,pn
+                    | where PN != 'PN0'
+                    | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn);
+
+                gor [#gt#]
+                | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN
+                | replace value value*2
+                | group 1 -gc ref,alt,value -count
+                | calc source 'group'
+                | rename value GT | rename allcount GTcount
+                | merge <(gor [#gt#]
+                | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn)
+                | replace values fsvmap(values,1,'int(x)*2',',')
+                | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno')
+                | calc source 'sep'
+                | hide cc
+                )
+                | group 1 -gc 2-source[-1] -set -dis -sc source
+                | throwif dis_source != 2                    
+
+                """ );
+    }
+
 }