From 5d9ad6089ce80759012acbf7a79d32d7e0d64e22 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Fri, 4 Aug 2017 15:59:23 -0700 Subject: [PATCH 1/5] Refactor loading code, memfrac, dilution, etc. Closes #101. --- kevlar/cli/count.py | 11 +-- kevlar/count.py | 27 +++++--- kevlar/counting.py | 134 ++++++++++++++++--------------------- kevlar/novel.py | 6 +- kevlar/tests/test_count.py | 4 +- 5 files changed, 89 insertions(+), 93 deletions(-) diff --git a/kevlar/cli/count.py b/kevlar/cli/count.py index c28bc48..b659ca8 100644 --- a/kevlar/cli/count.py +++ b/kevlar/cli/count.py @@ -60,9 +60,10 @@ def subparser(subparsers): mem_desc = """\ Specify how much memory to allocate for the sketch data structures - used to store k-mer counts. The first control sample will be - allocated the full amount of specifed `--memory`, and all subsequent - samples will be allocated a fraction thereof. + used to store k-mer counts. If `--mem-frac` is not set, all samples will be + allocated `MEM` bytes. If `--mem-frac` is set, then the first control + sample will be allocated `MEM` bytes, and all other samples will be + allocated `MEM * F` bytes. """ mem_desc = textwrap.dedent(mem_desc) memory_args = subparser.add_argument_group('Memory allocation', mem_desc) @@ -72,9 +73,9 @@ def subparser(subparsers): 'the initial control sample; default is 1M' ) memory_args.add_argument( - '-f', '--mem-frac', type=float, default=0.1, metavar='F', + '-f', '--mem-frac', type=float, default=None, metavar='F', help='fraction of the total memory to allocate to subsequent samples; ' - 'default is 0.1' + 'should be between 0.0 and 1.0' ) memory_args.add_argument( '--max-fpr', type=float, default=0.2, metavar='FPR', diff --git a/kevlar/count.py b/kevlar/count.py index a947d60..f33d4aa 100644 --- a/kevlar/count.py +++ b/kevlar/count.py @@ -15,6 +15,12 @@ import kevlar +def split_infiles_outfiles(filelist): + outfiles = [flist[0] for flist in filelist] + infilelists = [flist[1:] for flist in filelist] + return outfiles, infilelists + + def main(args): if (args.num_bands is None) is not (args.band is None): raise ValueError('Must specify --num-bands and --band together') @@ -24,10 +30,12 @@ def main(args): timer.start('loadctrl') print('[kevlar::count] Loading control samples', file=args.logfile) - controls = kevlar.counting.load_samples_with_dilution( - args.control, args.ksize, args.memory, memfraction=args.mem_frac, - maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=None, - numbands=args.num_bands, band=args.band, logfile=args.logfile + outfiles, infilelists = split_infiles_outfiles(args.control) + controls = kevlar.counting.load_samples( + infilelists, args.ksize, args.memory, outfiles=outfiles, + memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max, + mask=None, numbands=args.num_bands, band=args.band, + logfile=args.logfile ) elapsed = timer.stop('loadctrl') numcontrols = len(controls) @@ -36,10 +44,13 @@ def main(args): print('[kevlar::count] Loading case samples', file=args.logfile) timer.start('loadcase') - cases = kevlar.counting.load_samples_with_dilution( - args.case, args.ksize, args.memory, memfraction=args.mem_frac, - maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=controls, - numbands=args.num_bands, band=args.band, logfile=args.logfile + outfiles, infilelists = split_infiles_outfiles(args.case) + casemask = outfiles[0] if args.mem_frac else None + cases = kevlar.counting.load_samples( + infilelists, args.ksize, args.memory, outfiles=outfiles, + memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max, + mask=casemask, numbands=args.num_bands, band=args.band, + logfile=args.logfile ) elapsed = timer.stop('loadcase') numcases = len(cases) diff --git a/kevlar/counting.py b/kevlar/counting.py index 4cf164c..896975e 100644 --- a/kevlar/counting.py +++ b/kevlar/counting.py @@ -19,38 +19,43 @@ class KevlarSampleIOError(ValueError): pass +class KevlarOutfileMismatchError(ValueError): + pass + + def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, - masks=None, maskmaxabund=1, numbands=None, band=None, + mask=None, maskmaxabund=1, numbands=None, band=None, outfile=None, logfile=sys.stderr): """ Compute k-mer abundances for the specified sequence input. Expected input is a list of one or more FASTA/FASTQ files corresponding to a single sample. A counttable is created and populated with abundances - of all k-mers observed in the input. + of all k-mers observed in the input. If `mask` is provided, only k-mers not + present in the mask will be loaded. """ message = 'loading sample from ' + ','.join(seqfiles) print('[kevlar::counting] ', message, file=logfile) sketch = khmer.Counttable(ksize, memory / 4, 4) n, nkmers = 0, 0 - for n, read in enumerate(kevlar.multi_file_iter_khmer(seqfiles), 1): - for subseq in kevlar.clean_subseqs(read.sequence, ksize): - for kmer in sketch.get_kmers(subseq): - if numbands: - khash = sketch.hash(kmer) - if khash & (numbands - 1) != band - 1: - continue - if masks: - for mask in masks: - if mask.get(kmer) > maskmaxabund: - break - else: - sketch.add(kmer) - nkmers += 1 - else: - sketch.add(kmer) - nkmers += 1 + for seqfile in seqfiles: + if mask: + if numbands: + nr, nk = sketch.consume_seqfile_banding_with_mask( + seqfile, numbands, band, mask + ) + else: + nr, nk = sketch.consume_seqfile_with_mask(seqfile, mask) + else: + if numbands: + nr, nk = sketch.consume_seqfile_banding( + seqfile, numbands, band + ) + else: + nr, nk = sketch.consume_seqfile(seqfile) + n += nr + nkmers += nk message = 'done loading reads' if numbands: @@ -62,77 +67,55 @@ def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, if fpr > maxfpr: message += ' (FPR too high, bailing out!!!)' raise SystemExit(message) - else: - if outfile: - if not outfile.endswith(('.ct', '.counttable')): - outfile += '.counttable' - sketch.save(outfile) - message += '; saved to "{:s}"'.format(outfile) - print('[kevlar::counting] ', message, file=logfile) + + if outfile: + if not outfile.endswith(('.ct', '.counttable')): + outfile += '.counttable' + sketch.save(outfile) + message += '; saved to "{:s}"'.format(outfile) + print('[kevlar::counting] ', message, file=logfile) return sketch -def load_samples(samplelists, ksize, memory, maxfpr=0.2, numbands=None, - band=None, logfile=sys.stderr): +def load_samples(samplelists, ksize, memory, mask=None, memfraction=None, + maxfpr=0.2, maxabund=1, numbands=None, band=None, + outfiles=None, logfile=sys.stderr): """ Load a group of related samples using a memory-efficient strategy. - Samples loaded initially are used as masks for subsequently loaded samples. - The first sample is allocated the full amount of memory, while subsequent - samples require only a fraction since they are first checked against the - mask(s). - """ - numsamples = len(samplelists) - message = 'computing k-mer abundances for {:d} samples'.format(numsamples) - print('[kevlar::counting] ', message, file=logfile) - - sketches = list() - for seqfiles in samplelists: - sketch = load_sample_seqfile( - seqfiles, ksize, memory, maxfpr=maxfpr, numbands=numbands, - band=band, outfile=None, logfile=logfile - ) - sketches.append(sketch) - return sketches + By default, each sample is loaded into a dedicated counttable, which occupy + `memory` bytes of memory each. Setting `memfraction` to a value between 0.0 + and 1.0 will activate "masked" mode. + If `mask` is provided, it serves as a mask for all other samples. If it is + not provided, the first sample is loaded normally and then serves as a mask + for all subsequent samples. -def load_samples_with_dilution(samplelists, ksize, memory, memfraction=0.1, - maxfpr=0.2, maxabund=1, masks=None, - numbands=None, band=None, skipsave=False, - logfile=sys.stderr): - """ - Load a group of related samples using a memory-efficient strategy. - - Samples loaded initially are used as masks for subsequently loaded samples. - The first sample is allocated the full amount of memory, while subsequent - samples require only a fraction since they are first checked against the - mask(s). + In "masked mode", sample uses only `memory * memfraction` bytes of memory, + and any k-mer present in the mask (above a given threshold `maxabund`) is + ignored. In this way, we avoid taking up space storing abundances for + k-mers we know we're not interested in. """ numsamples = len(samplelists) + if outfiles is None: + outfiles = [None] * numsamples + if numsamples != len(outfiles): + message = '# of samples ({:d}) '.format(numsamples) + message += 'does not match # of outfiles ({:d})'.format(len(outfiles)) + raise KevlarOutfileMismatchError(message) message = 'computing k-mer abundances for {:d} samples'.format(numsamples) print('[kevlar::counting] ', message, file=logfile) sketches = list() - for samplelist in samplelists: - if len(samplelist) < 2: - message = 'must specify an output file and at least one input file' - raise KevlarSampleIOError(message) - outfile = samplelist[0] - seqfiles = samplelist[1:] - if masks: - mymasks = masks - sketchmem = memory * memfraction - elif len(sketches) == 0: - mymasks = None - sketchmem = memory - else: - mymasks = sketches - sketchmem = memory * memfraction + for seqfiles, outfile in zip(samplelists, outfiles): + sketchmem = memory if memfraction is None else memory * memfraction + mymask = mask + if memfraction is not None and len(sketches) == 0 and mask is None: + mymask = sketches[0] sketch = load_sample_seqfile( - seqfiles, ksize, sketchmem, maxfpr=maxfpr, masks=mymasks, - maskmaxabund=maxabund, numbands=numbands, band=band, - outfile=outfile, logfile=logfile + seqfiles, ksize, sketchmem, maxfpr=maxfpr, mask=mymask, + numbands=numbands, band=band, outfile=outfile, logfile=logfile ) sketches.append(sketch) return sketches @@ -150,7 +133,6 @@ def load_samples_sketchfiles(sketchfiles, maxfpr=0.2, logfile=sys.stderr): if fpr > maxfpr: message += ' (FPR too high, bailing out!!!)' raise SystemExit(message) - else: - print(message, file=logfile) + print(message, file=logfile) sketches.append(sketch) return sketches diff --git a/kevlar/novel.py b/kevlar/novel.py index 86b4df7..e82e64c 100644 --- a/kevlar/novel.py +++ b/kevlar/novel.py @@ -68,7 +68,8 @@ def main(args): else: controls = kevlar.counting.load_samples( args.control, args.ksize, args.memory, maxfpr=args.max_fpr, - numbands=args.num_bands, band=args.band, logfile=args.logfile + memfraction=None, numbands=args.num_bands, band=args.band, + logfile=args.logfile ) elapsed = timer.stop('loadctrl') message = 'Control samples loaded in {:.2f} sec'.format(elapsed) @@ -92,7 +93,8 @@ def main(args): else: cases = kevlar.counting.load_samples( args.case, args.ksize, args.memory, maxfpr=args.max_fpr, - numbands=args.num_bands, band=args.band, logfile=args.logfile + memfraction=None, numbands=args.num_bands, band=args.band, + logfile=args.logfile ) elapsed = timer.stop('loadcases') print('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(elapsed), diff --git a/kevlar/tests/test_count.py b/kevlar/tests/test_count.py index 895b3eb..a8281bb 100644 --- a/kevlar/tests/test_count.py +++ b/kevlar/tests/test_count.py @@ -18,8 +18,8 @@ @pytest.mark.parametrize('numbands,band,kmers_stored', [ (0, 0, 15600), - (2, 1, 7992), - (16, 7, 1218), + (2, 1, 7663), + (16, 7, 859), ]) def test_count_simple(numbands, band, kmers_stored, capsys): with NamedTemporaryFile(suffix='.counttable') as ctrl1out, \ From 38d7d70716cb3c3136e5e679bf797f1c96cd4c1e Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 7 Aug 2017 13:16:04 -0700 Subject: [PATCH 2/5] Increase memory --- kevlar/tests/test_pipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kevlar/tests/test_pipe.py b/kevlar/tests/test_pipe.py index ce53ee7..b6e820a 100644 --- a/kevlar/tests/test_pipe.py +++ b/kevlar/tests/test_pipe.py @@ -27,14 +27,14 @@ def test_trio2(capsys): 'novel', '--case', case, '--control', controls[0], '--control', controls[1], '--band', str(i+1), '--num-bands', '4', '--out', novelouts[i], - '--memory', '200K', '--ksize', '31', + '--memory', '400K', '--ksize', '31', '--case-min', '8', '--ctrl-max', '1' ] args = kevlar.cli.parser().parse_args(arglist) kevlar.novel.main(args) arglist = [ - 'collect', '--memory', '5K', '--ksize', '31', '--minabund', '8', + 'collect', '--memory', '10K', '--ksize', '31', '--minabund', '8', '--collapse' ] + novelouts args = kevlar.cli.parser().parse_args(arglist) From e9c28afbcfc1ba05cf3f0783700bf3245020a88a Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 7 Aug 2017 20:52:35 -0700 Subject: [PATCH 3/5] Memory --- kevlar/tests/test_pipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kevlar/tests/test_pipe.py b/kevlar/tests/test_pipe.py index b6e820a..f9e372a 100644 --- a/kevlar/tests/test_pipe.py +++ b/kevlar/tests/test_pipe.py @@ -27,14 +27,14 @@ def test_trio2(capsys): 'novel', '--case', case, '--control', controls[0], '--control', controls[1], '--band', str(i+1), '--num-bands', '4', '--out', novelouts[i], - '--memory', '400K', '--ksize', '31', + '--memory', '350K', '--ksize', '31', '--case-min', '8', '--ctrl-max', '1' ] args = kevlar.cli.parser().parse_args(arglist) kevlar.novel.main(args) arglist = [ - 'collect', '--memory', '10K', '--ksize', '31', '--minabund', '8', + 'collect', '--memory', '5K', '--ksize', '31', '--minabund', '8', '--collapse' ] + novelouts args = kevlar.cli.parser().parse_args(arglist) From 8b7c4f725440c7a3549ea413fc36b6c5b78d87fb Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 7 Aug 2017 21:48:37 -0700 Subject: [PATCH 4/5] Fix banding --- kevlar/count.py | 6 +++--- kevlar/novel.py | 5 +++-- kevlar/tests/test_pipe.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kevlar/count.py b/kevlar/count.py index f33d4aa..fd71833 100644 --- a/kevlar/count.py +++ b/kevlar/count.py @@ -24,6 +24,7 @@ def split_infiles_outfiles(filelist): def main(args): if (args.num_bands is None) is not (args.band is None): raise ValueError('Must specify --num-bands and --band together') + myband = args.band - 1 if args.band else None timer = kevlar.Timer() timer.start() @@ -34,8 +35,7 @@ def main(args): controls = kevlar.counting.load_samples( infilelists, args.ksize, args.memory, outfiles=outfiles, memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max, - mask=None, numbands=args.num_bands, band=args.band, - logfile=args.logfile + mask=None, numbands=args.num_bands, band=myband, logfile=args.logfile ) elapsed = timer.stop('loadctrl') numcontrols = len(controls) @@ -49,7 +49,7 @@ def main(args): cases = kevlar.counting.load_samples( infilelists, args.ksize, args.memory, outfiles=outfiles, memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max, - mask=casemask, numbands=args.num_bands, band=args.band, + mask=casemask, numbands=args.num_bands, band=myband, logfile=args.logfile ) elapsed = timer.stop('loadcase') diff --git a/kevlar/novel.py b/kevlar/novel.py index 644fe65..c8a5ee0 100644 --- a/kevlar/novel.py +++ b/kevlar/novel.py @@ -58,6 +58,7 @@ def kmer_is_interesting(kmer, casecounts, controlcounts, case_min=5, def main(args): if (not args.num_bands) is not (not args.band): raise ValueError('Must specify --num-bands and --band together') + myband = args.band - 1 if args.band else None timer = kevlar.Timer() timer.start() @@ -77,7 +78,7 @@ def main(args): else: controls = kevlar.counting.load_samples( args.control, args.ksize, args.memory, maxfpr=args.max_fpr, - memfraction=None, numbands=args.num_bands, band=args.band, + memfraction=None, numbands=args.num_bands, band=myband, logfile=args.logfile ) elapsed = timer.stop('loadctrl') @@ -102,7 +103,7 @@ def main(args): else: cases = kevlar.counting.load_samples( args.case, args.ksize, args.memory, maxfpr=args.max_fpr, - memfraction=None, numbands=args.num_bands, band=args.band, + memfraction=None, numbands=args.num_bands, band=myband, logfile=args.logfile ) elapsed = timer.stop('loadcases') diff --git a/kevlar/tests/test_pipe.py b/kevlar/tests/test_pipe.py index f9e372a..b6e820a 100644 --- a/kevlar/tests/test_pipe.py +++ b/kevlar/tests/test_pipe.py @@ -27,14 +27,14 @@ def test_trio2(capsys): 'novel', '--case', case, '--control', controls[0], '--control', controls[1], '--band', str(i+1), '--num-bands', '4', '--out', novelouts[i], - '--memory', '350K', '--ksize', '31', + '--memory', '400K', '--ksize', '31', '--case-min', '8', '--ctrl-max', '1' ] args = kevlar.cli.parser().parse_args(arglist) kevlar.novel.main(args) arglist = [ - 'collect', '--memory', '5K', '--ksize', '31', '--minabund', '8', + 'collect', '--memory', '10K', '--ksize', '31', '--minabund', '8', '--collapse' ] + novelouts args = kevlar.cli.parser().parse_args(arglist) From 9d3a89118cc4bfe9886f30dc9a529b1f14e80e5d Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 7 Aug 2017 22:38:12 -0700 Subject: [PATCH 5/5] Changes to compensate for banding bugfix --- kevlar/tests/test_count.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kevlar/tests/test_count.py b/kevlar/tests/test_count.py index a8281bb..e55f420 100644 --- a/kevlar/tests/test_count.py +++ b/kevlar/tests/test_count.py @@ -18,8 +18,8 @@ @pytest.mark.parametrize('numbands,band,kmers_stored', [ (0, 0, 15600), - (2, 1, 7663), - (16, 7, 859), + (2, 1, 7937), + (16, 7, 1068), ]) def test_count_simple(numbands, band, kmers_stored, capsys): with NamedTemporaryFile(suffix='.counttable') as ctrl1out, \