diff --git a/documentation/src/command/CSVCC.rst b/documentation/src/command/CSVCC.rst index 93daa714..823f4b6d 100644 --- a/documentation/src/command/CSVCC.rst +++ b/documentation/src/command/CSVCC.rst @@ -7,13 +7,18 @@ ===== CSVCC ===== -The **CSVCC** command is used aggregate or count genotypes stored in horizontal CSV format. See :ref:`CSVSEL` for more -description of the requirements for the data input. +The **CSVCC** command is used aggregate or count genotypes stored in horizontal values column in a CSV or fixed element size format. +See :ref:`CSVSEL` for more description of the requirements for the data input. The aggregation is grouped according to the phenotype relation, which can be of the form (tag,status) or (tag,pheno,status). Typically, the relation is a binary relation with PN and case vs control status. Affection status for multiple phenotypes can be represented by using the class. +Note, only non-zero GTcounts are in the output, hence when this output is used with :ref:`PIVOT` the -e option should be used with zero, +e.g. see examples below. + +Also, CSVCC is slower when GT values outside the range 0-6 are used and when a separator is used instead of fixed value size. + Usage ===== @@ -89,6 +94,68 @@ It is also possible to perform case-control statistics for multiple phenotypes, | SELECT 1,2,reference,call,pheno,pVal_mm | PIVOT -gc reference,call pheno -v 'PhenoA,PhenoB' -e 'NA' +Below are unit tests that show equivalence queries that use GROUP count and compare it with CSVCC: + +.. code-block:: gor + + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'vs' + | hide cc + ) + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'x',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 3 + +An example showing GT values that are of different lengths: + +.. code-block:: gor + + /* same creates as in previous example * + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | replace value value*2 + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'int(x)*2',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 2 + + Related commands ---------------- diff --git a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala index 06be500d..0f6fdbe8 100644 --- a/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala +++ b/gortools/src/main/scala/gorsat/Analysis/GorCsvCC.scala @@ -34,6 +34,7 @@ import org.gorpipe.gor.session.GorSession import org.gorpipe.model.gor.RowObj import org.gorpipe.model.gor.iterators.LineIterator +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer object GorCsvCC { @@ -77,6 +78,8 @@ object GorCsvCC { var splitArr: Array[SaHolder] = _ var phenoStatusCounter: Array[Array[Int]] = _ var phenoStatusFloatCounter: Array[Array[Double]] = _ + var phenoExtraValues : Array[scala.collection.mutable.HashMap[String,Int]] = _ + var hashUsed : Array[Boolean] = _ } val useGroup: Boolean = if (grCols.nonEmpty) true else false @@ -104,23 +107,28 @@ object GorCsvCC { sh.splitArr = new Array[SaHolder](maxUsedBuckets) sh.phenoStatusCounter = new Array[Array[Int]](maxPhenoStats) sh.phenoStatusFloatCounter = new Array[Array[Double]](maxPhenoStats) + sh.phenoExtraValues = new Array[mutable.HashMap[String, Int]](maxPhenoStats) + sh.hashUsed = new Array[Boolean](maxPhenoStats) + var i = 0 while (i < maxPhenoStats) { - sh.phenoStatusCounter(i) = new Array[Int](5) - sh.phenoStatusFloatCounter(i) = new Array[Double](5) + sh.phenoStatusCounter(i) = new Array[Int](7) + sh.phenoStatusFloatCounter(i) = new Array[Double](7) + sh.hashUsed(i) = false + sh.phenoExtraValues(i) = scala.collection.mutable.HashMap.empty[String, Int] i += 1 } } var i = 0 while (i < sh.buckRows.length) { sh.buckRows(i) = null - if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](100)) + if (valSize == -1) sh.splitArr(i) = SaHolder(new scala.collection.mutable.ArrayBuffer[Int](1000)) i += 1 } i = 0 while (i < maxPhenoStats) { var j = 0 - while (j < 5) { + while (j < 7) { sh.phenoStatusCounter(i)(j) = 0 sh.phenoStatusFloatCounter(i)(j) = 0.0 j += 1 @@ -189,19 +197,45 @@ object GorCsvCC { } else throw new RuntimeException("Problem with input data when generating row: " + line + "\n\n") } else { var start = 0 + var stop = 0 if (valSize == -1) { val sah = sh.splitArr(buckNo) if (buckPos > 0) start = sah.seps(buckPos - 1) + 1 + stop = sah.seps(buckPos) } else { if (buckPos > 0) start = buckPos * (valSize + sepSize) } if (!use_prob) { - val gt = rstr.charAt(start) - if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1 - else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1 - else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1 - else sh.phenoStatusCounter(phenostatus)(3) += 1 + if (valSize == 1) { + val gt = rstr.charAt(start) + if (gt == '0') sh.phenoStatusCounter(phenostatus)(0) += 1 + else if (gt == '1') sh.phenoStatusCounter(phenostatus)(1) += 1 + else if (gt == '2') sh.phenoStatusCounter(phenostatus)(2) += 1 + else if (gt == '3') sh.phenoStatusCounter(phenostatus)(3) += 1 + else if (gt == '4') sh.phenoStatusCounter(phenostatus)(4) += 1 + else if (gt == '5') sh.phenoStatusCounter(phenostatus)(5) += 1 + else { + sh.hashUsed(phenostatus) = true + val key = gt.toString + val newCount = sh.phenoExtraValues(phenostatus).getOrElse(key, 0) + 1 + sh.phenoExtraValues(phenostatus).update(key, newCount) + } + } else { + val gt = rstr.subSequence(start,stop).toString + if (gt == "0") sh.phenoStatusCounter(phenostatus)(0) += 1 + else if (gt == "1") sh.phenoStatusCounter(phenostatus)(1) += 1 + else if (gt == "2") sh.phenoStatusCounter(phenostatus)(2) += 1 + else if (gt == "3") sh.phenoStatusCounter(phenostatus)(3) += 1 + else if (gt == "4") sh.phenoStatusCounter(phenostatus)(4) += 1 + else if (gt == "5") sh.phenoStatusCounter(phenostatus)(5) += 1 + else { + sh.hashUsed(phenostatus) = true + val newCount = sh.phenoExtraValues(phenostatus).getOrElse(gt, 0) + 1 + sh.phenoExtraValues(phenostatus).update(gt, newCount) + } + } + } else { if (rstr.charAt(start) == ' ') { sh.phenoStatusCounter(phenostatus)(3) += 1 @@ -242,21 +276,29 @@ object GorCsvCC { } phenorow += 1 } + var i = 0 while (i < maxPhenoStats) { val pheno = tbpi.phenoMap(i) var j = 0 - while (j < 4.max(uc + 1)) { + while (j < 6.max(uc + 1)) { if (nextProcessor.wantsNoMore) return if (!use_prob || use_threshold) { val counts = sh.phenoStatusCounter(i)(j) - nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts)) + if (counts > 0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + counts)) } else { val counts = sh.phenoStatusFloatCounter(i)(j) - nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts))) + if (counts > 0.0) nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + j + '\t' + fd3.format(counts))) } j += 1 } + + if (sh.hashUsed(i)) { + for (key <- sh.phenoExtraValues(i).keys.toList.sorted) { + val counts : Int = sh.phenoExtraValues(i)(key) + nextProcessor.process(RowObj(line + '\t' + pheno + '\t' + key + '\t' + counts)) + } + } i += 1 } } catch { diff --git a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala index 3e8b0337..c61b02ee 100644 --- a/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala +++ b/gortools/src/main/scala/gorsat/Analysis/GtGenAnalysis.scala @@ -61,7 +61,7 @@ object GtGenAnalysis { sh.buckValueCols(i).setLength(buckSize) var j = 0 while (j < buckSize) { - sh.buckValueCols(i).setCharAt(j, '4') // Unspecified, only 4 will possibly be changed to 0 and not 3. + sh.buckValueCols(i).setCharAt(j, '3') // Missing j += 1 } i += 1 @@ -291,7 +291,7 @@ object GtGenAnalysis { } //####### - super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString().replace('4','3')}")) + super.process(RowObj(s"${lr.colsSlice(0, bucketCol+1)}\t${lSeg.values.toString() }")) if ((lr.chr == lastLeftChr && maxLeftStop < leftStop) || lr.chr != lastLeftChr) maxLeftStop = leftStop lastLeftChr = lr.chr diff --git a/gortools/src/test/java/gorsat/UTestCSVCC.java b/gortools/src/test/java/gorsat/UTestCSVCC.java index 35b9f768..06c9f7d9 100644 --- a/gortools/src/test/java/gorsat/UTestCSVCC.java +++ b/gortools/src/test/java/gorsat/UTestCSVCC.java @@ -91,4 +91,76 @@ public void automaticUnzipOfValues() { Assert.assertEquals(normalLines, inflatedLines); } + + @Test + public void testCSVCCvsGroup() { + TestUtils.runGorPipe(""" + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | csvcc -gc ref,alt -vs 1 [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'vs' + | hide cc + ) + + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'x',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 3 + """ ); + } + + @Test + public void testGTValuesOfDifferentLengths() { + TestUtils.runGorPipe(""" + create #bucket# = norrows 10 | calc bucket 'b1' | calc PN 'PN'+str(#1) | select pn,bucket; + + create #gt# = gorrow chr1,1 | calc alt 'A' | calc ref 'C' + | merge <(gorrow chr1,2 | calc alt 'G' | calc ref 'T') + | merge <(gorrow chr1,3 | calc alt 'A' | calc ref 'T') + | multimap -cartesian <(nor [#bucket#] | rownum) + | calc gt 1+mod(rownum*4+pos,10) + | select 1,2,ref,alt,gt,pn + | where PN != 'PN0' + | gtgen -gc ref,alt [#bucket#] <(gorrow chr1,0,1 | multimap -cartesian [#bucket#] | select 1-3,pn); + + gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) -tag PN + | replace value value*2 + | group 1 -gc ref,alt,value -count + | calc source 'group' + | rename value GT | rename allcount GTcount + | merge <(gor [#gt#] + | csvsel -gc ref,alt,bucket -vs 1 [#bucket#] <(nor [#bucket#] | select pn) + | replace values fsvmap(values,1,'int(x)*2',',') + | csvcc -gc ref,alt -s ',' [#bucket#] <(nor [#bucket#] | select pn | calc pheno 'pheno') + | calc source 'sep' + | hide cc + ) + | group 1 -gc 2-source[-1] -set -dis -sc source + | throwif dis_source != 2 + + """ ); + } + }