kevlar-dev · standage · Aug 8, 2017 · Aug 4, 2017 · Aug 7, 2017 · Aug 8, 2017
diff --git a/kevlar/cli/count.py b/kevlar/cli/count.py
@@ -60,9 +60,10 @@ def subparser(subparsers):
 
     mem_desc = """\
     Specify how much memory to allocate for the sketch data structures
-    used to store k-mer counts. The first control sample will be
-    allocated the full amount of specifed `--memory`, and all subsequent
-    samples will be allocated a fraction thereof.
+    used to store k-mer counts. If `--mem-frac` is not set, all samples will be
+    allocated `MEM` bytes. If `--mem-frac` is set, then the first control
+    sample will be allocated `MEM` bytes, and all other samples will be
+    allocated `MEM * F` bytes.
     """
     mem_desc = textwrap.dedent(mem_desc)
     memory_args = subparser.add_argument_group('Memory allocation', mem_desc)
@@ -72,9 +73,9 @@ def subparser(subparsers):
         'the initial control sample; default is 1M'
     )
     memory_args.add_argument(
-        '-f', '--mem-frac', type=float, default=0.1, metavar='F',
+        '-f', '--mem-frac', type=float, default=None, metavar='F',
         help='fraction of the total memory to allocate to subsequent samples; '
-        'default is 0.1'
+        'should be between 0.0 and 1.0'
     )
     memory_args.add_argument(
         '--max-fpr', type=float, default=0.2, metavar='FPR',

diff --git a/kevlar/count.py b/kevlar/count.py
@@ -15,19 +15,27 @@
 import kevlar
 
 
+def split_infiles_outfiles(filelist):
+    outfiles = [flist[0] for flist in filelist]
+    infilelists = [flist[1:] for flist in filelist]
+    return outfiles, infilelists
+
+
 def main(args):
     if (args.num_bands is None) is not (args.band is None):
         raise ValueError('Must specify --num-bands and --band together')
+    myband = args.band - 1 if args.band else None
 
     timer = kevlar.Timer()
     timer.start()
 
     timer.start('loadctrl')
     print('[kevlar::count] Loading control samples', file=args.logfile)
-    controls = kevlar.counting.load_samples_with_dilution(
-        args.control, args.ksize, args.memory, memfraction=args.mem_frac,
-        maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=None,
-        numbands=args.num_bands, band=args.band, logfile=args.logfile
+    outfiles, infilelists = split_infiles_outfiles(args.control)
+    controls = kevlar.counting.load_samples(
+        infilelists, args.ksize, args.memory, outfiles=outfiles,
+        memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max,
+        mask=None, numbands=args.num_bands, band=myband, logfile=args.logfile
     )
     elapsed = timer.stop('loadctrl')
     numcontrols = len(controls)
@@ -36,10 +44,13 @@ def main(args):
 
     print('[kevlar::count] Loading case samples', file=args.logfile)
     timer.start('loadcase')
-    cases = kevlar.counting.load_samples_with_dilution(
-        args.case, args.ksize, args.memory, memfraction=args.mem_frac,
-        maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=controls,
-        numbands=args.num_bands, band=args.band, logfile=args.logfile
+    outfiles, infilelists = split_infiles_outfiles(args.case)
+    casemask = outfiles[0] if args.mem_frac else None
+    cases = kevlar.counting.load_samples(
+        infilelists, args.ksize, args.memory, outfiles=outfiles,
+        memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max,
+        mask=casemask, numbands=args.num_bands, band=myband,
+        logfile=args.logfile
     )
     elapsed = timer.stop('loadcase')
     numcases = len(cases)

diff --git a/kevlar/counting.py b/kevlar/counting.py
@@ -19,38 +19,43 @@ class KevlarSampleIOError(ValueError):
     pass
 
 
+class KevlarOutfileMismatchError(ValueError):
+    pass
+
+
 def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
-                        masks=None, maskmaxabund=1, numbands=None, band=None,
+                        mask=None, maskmaxabund=1, numbands=None, band=None,
                         outfile=None, logfile=sys.stderr):
     """
     Compute k-mer abundances for the specified sequence input.
 
     Expected input is a list of one or more FASTA/FASTQ files corresponding
     to a single sample. A counttable is created and populated with abundances
-    of all k-mers observed in the input.
+    of all k-mers observed in the input. If `mask` is provided, only k-mers not
+    present in the mask will be loaded.
     """
     message = 'loading sample from ' + ','.join(seqfiles)
     print('[kevlar::counting]    ', message, file=logfile)
 
     sketch = khmer.Counttable(ksize, memory / 4, 4)
     n, nkmers = 0, 0
-    for n, read in enumerate(kevlar.multi_file_iter_khmer(seqfiles), 1):
-        for subseq in kevlar.clean_subseqs(read.sequence, ksize):
-            for kmer in sketch.get_kmers(subseq):
-                if numbands:
-                    khash = sketch.hash(kmer)
-                    if khash & (numbands - 1) != band - 1:
-                        continue
-                if masks:
-                    for mask in masks:
-                        if mask.get(kmer) > maskmaxabund:
-                            break
-                    else:
-                        sketch.add(kmer)
-                        nkmers += 1
-                else:
-                    sketch.add(kmer)
-                    nkmers += 1
+    for seqfile in seqfiles:
+        if mask:
+            if numbands:
+                nr, nk = sketch.consume_seqfile_banding_with_mask(
+                    seqfile, numbands, band, mask
+                )
+            else:
+                nr, nk = sketch.consume_seqfile_with_mask(seqfile, mask)
+        else:
+            if numbands:
+                nr, nk = sketch.consume_seqfile_banding(
+                    seqfile, numbands, band
+                )
+            else:
+                nr, nk = sketch.consume_seqfile(seqfile)
+        n += nr
+        nkmers += nk
 
     message = 'done loading reads'
     if numbands:
@@ -62,77 +67,55 @@ def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
     if fpr > maxfpr:
         message += ' (FPR too high, bailing out!!!)'
         raise SystemExit(message)
-    else:
-        if outfile:
-            if not outfile.endswith(('.ct', '.counttable')):
-                outfile += '.counttable'
-            sketch.save(outfile)
-            message += '; saved to "{:s}"'.format(outfile)
-        print('[kevlar::counting]    ', message, file=logfile)
+
+    if outfile:
+        if not outfile.endswith(('.ct', '.counttable')):
+            outfile += '.counttable'
+        sketch.save(outfile)
+        message += '; saved to "{:s}"'.format(outfile)
+    print('[kevlar::counting]    ', message, file=logfile)
 
     return sketch
 
 
-def load_samples(samplelists, ksize, memory, maxfpr=0.2, numbands=None,
-                 band=None, logfile=sys.stderr):
+def load_samples(samplelists, ksize, memory, mask=None, memfraction=None,
+                 maxfpr=0.2, maxabund=1, numbands=None, band=None,
+                 outfiles=None, logfile=sys.stderr):
     """
     Load a group of related samples using a memory-efficient strategy.
 
-    Samples loaded initially are used as masks for subsequently loaded samples.
-    The first sample is allocated the full amount of memory, while subsequent
-    samples require only a fraction since they are first checked against the
-    mask(s).
-    """
-    numsamples = len(samplelists)
-    message = 'computing k-mer abundances for {:d} samples'.format(numsamples)
-    print('[kevlar::counting]    ', message, file=logfile)
-
-    sketches = list()
-    for seqfiles in samplelists:
-        sketch = load_sample_seqfile(
-            seqfiles, ksize, memory, maxfpr=maxfpr, numbands=numbands,
-            band=band, outfile=None, logfile=logfile
-        )
-        sketches.append(sketch)
-    return sketches
+    By default, each sample is loaded into a dedicated counttable, which occupy
+    `memory` bytes of memory each. Setting `memfraction` to a value between 0.0
+    and 1.0 will activate "masked" mode.
 
+    If `mask` is provided, it serves as a mask for all other samples. If it is
+    not provided, the first sample is loaded normally and then serves as a mask
+    for all subsequent samples.
 
-def load_samples_with_dilution(samplelists, ksize, memory, memfraction=0.1,
-                               maxfpr=0.2, maxabund=1, masks=None,
-                               numbands=None, band=None, skipsave=False,
-                               logfile=sys.stderr):
-    """
-    Load a group of related samples using a memory-efficient strategy.
-
-    Samples loaded initially are used as masks for subsequently loaded samples.
-    The first sample is allocated the full amount of memory, while subsequent
-    samples require only a fraction since they are first checked against the
-    mask(s).
+    In "masked mode", sample uses only `memory * memfraction` bytes of memory,
+    and any k-mer present in the mask (above a given threshold `maxabund`) is
+    ignored. In this way, we avoid taking up space storing abundances for
+    k-mers we know we're not interested in.
     """
     numsamples = len(samplelists)
+    if outfiles is None:
+        outfiles = [None] * numsamples
+    if numsamples != len(outfiles):
+        message = '# of samples ({:d}) '.format(numsamples)
+        message += 'does not match # of outfiles ({:d})'.format(len(outfiles))
+        raise KevlarOutfileMismatchError(message)
     message = 'computing k-mer abundances for {:d} samples'.format(numsamples)
     print('[kevlar::counting]    ', message, file=logfile)
 
     sketches = list()
-    for samplelist in samplelists:
-        if len(samplelist) < 2:
-            message = 'must specify an output file and at least one input file'
-            raise KevlarSampleIOError(message)
-        outfile = samplelist[0]
-        seqfiles = samplelist[1:]
-        if masks:
-            mymasks = masks
-            sketchmem = memory * memfraction
-        elif len(sketches) == 0:
-            mymasks = None
-            sketchmem = memory
-        else:
-            mymasks = sketches
-            sketchmem = memory * memfraction
+    for seqfiles, outfile in zip(samplelists, outfiles):
+        sketchmem = memory if memfraction is None else memory * memfraction
+        mymask = mask
+        if memfraction is not None and len(sketches) == 0 and mask is None:
+            mymask = sketches[0]
         sketch = load_sample_seqfile(
-            seqfiles, ksize, sketchmem, maxfpr=maxfpr, masks=mymasks,
-            maskmaxabund=maxabund, numbands=numbands, band=band,
-            outfile=outfile, logfile=logfile
+            seqfiles, ksize, sketchmem, maxfpr=maxfpr, mask=mymask,
+            numbands=numbands, band=band, outfile=outfile, logfile=logfile
         )
         sketches.append(sketch)
     return sketches
@@ -150,7 +133,6 @@ def load_samples_sketchfiles(sketchfiles, maxfpr=0.2, logfile=sys.stderr):
         if fpr > maxfpr:
             message += ' (FPR too high, bailing out!!!)'
             raise SystemExit(message)
-        else:
-            print(message, file=logfile)
+        print(message, file=logfile)
         sketches.append(sketch)
     return sketches
diff --git a/kevlar/novel.py b/kevlar/novel.py
@@ -58,6 +58,7 @@ def kmer_is_interesting(kmer, casecounts, controlcounts, case_min=5,
 def main(args):
     if (not args.num_bands) is not (not args.band):
         raise ValueError('Must specify --num-bands and --band together')
+    myband = args.band - 1 if args.band else None
 
     timer = kevlar.Timer()
     timer.start()
@@ -77,7 +78,8 @@ def main(args):
     else:
         controls = kevlar.counting.load_samples(
             args.control, args.ksize, args.memory, maxfpr=args.max_fpr,
-            numbands=args.num_bands, band=args.band, logfile=args.logfile
+            memfraction=None, numbands=args.num_bands, band=myband,
+            logfile=args.logfile
         )
     elapsed = timer.stop('loadctrl')
     message = 'Control samples loaded in {:.2f} sec'.format(elapsed)
@@ -101,7 +103,8 @@ def main(args):
     else:
         cases = kevlar.counting.load_samples(
             args.case, args.ksize, args.memory, maxfpr=args.max_fpr,
-            numbands=args.num_bands, band=args.band, logfile=args.logfile
+            memfraction=None, numbands=args.num_bands, band=myband,
+            logfile=args.logfile
         )
     elapsed = timer.stop('loadcases')
     print('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(elapsed),

diff --git a/kevlar/tests/test_count.py b/kevlar/tests/test_count.py
@@ -18,8 +18,8 @@
 
 @pytest.mark.parametrize('numbands,band,kmers_stored', [
     (0, 0, 15600),
-    (2, 1, 7992),
-    (16, 7, 1218),
+    (2, 1, 7937),
+    (16, 7, 1068),
 ])
 def test_count_simple(numbands, band, kmers_stored, capsys):
     with NamedTemporaryFile(suffix='.counttable') as ctrl1out, \

diff --git a/kevlar/tests/test_pipe.py b/kevlar/tests/test_pipe.py
@@ -27,14 +27,14 @@ def test_trio2(capsys):
             'novel', '--case', case,
             '--control', controls[0], '--control', controls[1],
             '--band', str(i+1), '--num-bands', '4', '--out', novelouts[i],
-            '--memory', '200K', '--ksize', '31',
+            '--memory', '400K', '--ksize', '31',
             '--case-min', '8', '--ctrl-max', '1'
         ]
         args = kevlar.cli.parser().parse_args(arglist)
         kevlar.novel.main(args)
 
     arglist = [
-        'collect', '--memory', '5K', '--ksize', '31', '--minabund', '8',
+        'collect', '--memory', '10K', '--ksize', '31', '--minabund', '8',
         '--collapse'
     ] + novelouts
     args = kevlar.cli.parser().parse_args(arglist)