Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions kevlar/cli/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ def subparser(subparsers):

mem_desc = """\
Specify how much memory to allocate for the sketch data structures
used to store k-mer counts. The first control sample will be
allocated the full amount of specifed `--memory`, and all subsequent
samples will be allocated a fraction thereof.
used to store k-mer counts. If `--mem-frac` is not set, all samples will be
allocated `MEM` bytes. If `--mem-frac` is set, then the first control
sample will be allocated `MEM` bytes, and all other samples will be
allocated `MEM * F` bytes.
"""
mem_desc = textwrap.dedent(mem_desc)
memory_args = subparser.add_argument_group('Memory allocation', mem_desc)
Expand All @@ -72,9 +73,9 @@ def subparser(subparsers):
'the initial control sample; default is 1M'
)
memory_args.add_argument(
'-f', '--mem-frac', type=float, default=0.1, metavar='F',
'-f', '--mem-frac', type=float, default=None, metavar='F',
help='fraction of the total memory to allocate to subsequent samples; '
'default is 0.1'
'should be between 0.0 and 1.0'
)
memory_args.add_argument(
'--max-fpr', type=float, default=0.2, metavar='FPR',
Expand Down
27 changes: 19 additions & 8 deletions kevlar/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,27 @@
import kevlar


def split_infiles_outfiles(filelist):
    """
    Separate each sample's output file from its input files.

    Each entry of `filelist` describes one sample: the first element is the
    output file for that sample, and all remaining elements are its input
    files.

    Returns a tuple `(outfiles, infilelists)`, where `outfiles[i]` is the
    first element of `filelist[i]` and `infilelists[i]` holds the rest.

    Raises ValueError if any entry does not contain an output file followed
    by at least one input file.
    """
    outfiles, infilelists = [], []
    for flist in filelist:
        # Without this check an entry holding only an output file would
        # silently produce a sample with no input at all.
        if len(flist) < 2:
            message = 'must specify an output file and at least one input file'
            raise ValueError(message)
        outfiles.append(flist[0])
        infilelists.append(flist[1:])
    return outfiles, infilelists


def main(args):
if (args.num_bands is None) is not (args.band is None):
raise ValueError('Must specify --num-bands and --band together')
myband = args.band - 1 if args.band else None

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The human interface (CLI) expects the band to be 1-based (band \in {1..numbands}), whereas the Python and C++ APIs expect band to be 0-based (band \in {0..numbands-1}).


timer = kevlar.Timer()
timer.start()

timer.start('loadctrl')
print('[kevlar::count] Loading control samples', file=args.logfile)
controls = kevlar.counting.load_samples_with_dilution(
args.control, args.ksize, args.memory, memfraction=args.mem_frac,
maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=None,
numbands=args.num_bands, band=args.band, logfile=args.logfile
outfiles, infilelists = split_infiles_outfiles(args.control)
controls = kevlar.counting.load_samples(
infilelists, args.ksize, args.memory, outfiles=outfiles,
memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max,
mask=None, numbands=args.num_bands, band=myband, logfile=args.logfile
)
elapsed = timer.stop('loadctrl')
numcontrols = len(controls)
Expand All @@ -36,10 +44,13 @@ def main(args):

print('[kevlar::count] Loading case samples', file=args.logfile)
timer.start('loadcase')
cases = kevlar.counting.load_samples_with_dilution(
args.case, args.ksize, args.memory, memfraction=args.mem_frac,
maxfpr=args.max_fpr, maxabund=args.ctrl_max, masks=controls,
numbands=args.num_bands, band=args.band, logfile=args.logfile
outfiles, infilelists = split_infiles_outfiles(args.case)
casemask = outfiles[0] if args.mem_frac else None
cases = kevlar.counting.load_samples(
infilelists, args.ksize, args.memory, outfiles=outfiles,
memfraction=args.mem_frac, maxfpr=args.max_fpr, maxabund=args.ctrl_max,
mask=casemask, numbands=args.num_bands, band=myband,
logfile=args.logfile
)
elapsed = timer.stop('loadcase')
numcases = len(cases)
Expand Down
134 changes: 58 additions & 76 deletions kevlar/counting.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,43 @@ class KevlarSampleIOError(ValueError):
pass


class KevlarOutfileMismatchError(ValueError):
    """Signal that the number of samples and output files disagree."""


def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
masks=None, maskmaxabund=1, numbands=None, band=None,
mask=None, maskmaxabund=1, numbands=None, band=None,
outfile=None, logfile=sys.stderr):
"""
Compute k-mer abundances for the specified sequence input.

Expected input is a list of one or more FASTA/FASTQ files corresponding
to a single sample. A counttable is created and populated with abundances
of all k-mers observed in the input.
of all k-mers observed in the input. If `mask` is provided, only k-mers not
present in the mask will be loaded.
"""
message = 'loading sample from ' + ','.join(seqfiles)
print('[kevlar::counting] ', message, file=logfile)

sketch = khmer.Counttable(ksize, memory / 4, 4)
n, nkmers = 0, 0
for n, read in enumerate(kevlar.multi_file_iter_khmer(seqfiles), 1):
for subseq in kevlar.clean_subseqs(read.sequence, ksize):
for kmer in sketch.get_kmers(subseq):
if numbands:
khash = sketch.hash(kmer)
if khash & (numbands - 1) != band - 1:
continue
if masks:
for mask in masks:
if mask.get(kmer) > maskmaxabund:
break
else:
sketch.add(kmer)
nkmers += 1
else:
sketch.add(kmer)
nkmers += 1
for seqfile in seqfiles:
if mask:
if numbands:
nr, nk = sketch.consume_seqfile_banding_with_mask(

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bulk loading with a mask is now available from khmer master.

seqfile, numbands, band, mask
)
else:
nr, nk = sketch.consume_seqfile_with_mask(seqfile, mask)
else:
if numbands:
nr, nk = sketch.consume_seqfile_banding(
seqfile, numbands, band
)
else:
nr, nk = sketch.consume_seqfile(seqfile)
n += nr
nkmers += nk

message = 'done loading reads'
if numbands:
Expand All @@ -62,77 +67,55 @@ def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
if fpr > maxfpr:
message += ' (FPR too high, bailing out!!!)'
raise SystemExit(message)
else:
if outfile:
if not outfile.endswith(('.ct', '.counttable')):
outfile += '.counttable'
sketch.save(outfile)
message += '; saved to "{:s}"'.format(outfile)
print('[kevlar::counting] ', message, file=logfile)

if outfile:
if not outfile.endswith(('.ct', '.counttable')):

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behavior (with respect to filename extensions for saved sketches) should be documented somewhere. A single function to handle reading/writing sketches would help. One already exists, but it isn't suited for all cases IIRC.

outfile += '.counttable'
sketch.save(outfile)
message += '; saved to "{:s}"'.format(outfile)
print('[kevlar::counting] ', message, file=logfile)

return sketch


def load_samples(samplelists, ksize, memory, mask=None, memfraction=None,
                 maxfpr=0.2, maxabund=1, numbands=None, band=None,
                 outfiles=None, logfile=sys.stderr):
    """
    Load a group of related samples using a memory-efficient strategy.

    By default, each sample is loaded into a dedicated counttable, each
    occupying `memory` bytes of memory. Setting `memfraction` to a value
    between 0.0 and 1.0 will activate "masked" mode.

    If `mask` is provided, it serves as a mask for all other samples. If it is
    not provided, the first sample is loaded normally and then serves as a mask
    for all subsequent samples.

    In "masked" mode, each masked sample uses only `memory * memfraction`
    bytes of memory, and any k-mer present in the mask (above a given
    threshold `maxabund`) is ignored. In this way, we avoid taking up space
    storing abundances for k-mers we know we're not interested in.

    `samplelists` holds one list of input files per sample. `outfiles`, if
    given, must hold one output filename (or None) per sample; a length
    mismatch raises KevlarOutfileMismatchError. The sketches are returned as
    a list, in the same order as `samplelists`.
    """
    numsamples = len(samplelists)
    if outfiles is None:
        outfiles = [None] * numsamples
    if numsamples != len(outfiles):
        message = '# of samples ({:d}) '.format(numsamples)
        message += 'does not match # of outfiles ({:d})'.format(len(outfiles))
        raise KevlarOutfileMismatchError(message)
    message = 'computing k-mer abundances for {:d} samples'.format(numsamples)
    print('[kevlar::counting] ', message, file=logfile)

    maskedmode = memfraction is not None
    sketches = list()
    for seqfiles, outfile in zip(samplelists, outfiles):
        mymask = mask
        sketchmem = memory
        if maskedmode:
            # With no explicit mask, the first sample loaded becomes the mask
            # for every later sample; it cannot mask itself.
            if mymask is None and sketches:
                mymask = sketches[0]
            # Only samples that are actually masked can get by with reduced
            # memory; an unmasked first sample needs the full allocation.
            if mymask is not None:
                sketchmem = memory * memfraction
        sketch = load_sample_seqfile(
            seqfiles, ksize, sketchmem, maxfpr=maxfpr, mask=mymask,
            maskmaxabund=maxabund, numbands=numbands, band=band,
            outfile=outfile, logfile=logfile
        )
        sketches.append(sketch)
    return sketches
Expand All @@ -150,7 +133,6 @@ def load_samples_sketchfiles(sketchfiles, maxfpr=0.2, logfile=sys.stderr):
if fpr > maxfpr:
message += ' (FPR too high, bailing out!!!)'
raise SystemExit(message)
else:
print(message, file=logfile)
print(message, file=logfile)
sketches.append(sketch)
return sketches
7 changes: 5 additions & 2 deletions kevlar/novel.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def kmer_is_interesting(kmer, casecounts, controlcounts, case_min=5,
def main(args):
if (not args.num_bands) is not (not args.band):
raise ValueError('Must specify --num-bands and --band together')
myband = args.band - 1 if args.band else None

timer = kevlar.Timer()
timer.start()
Expand All @@ -77,7 +78,8 @@ def main(args):
else:
controls = kevlar.counting.load_samples(
args.control, args.ksize, args.memory, maxfpr=args.max_fpr,
numbands=args.num_bands, band=args.band, logfile=args.logfile
memfraction=None, numbands=args.num_bands, band=myband,
logfile=args.logfile
)
elapsed = timer.stop('loadctrl')
message = 'Control samples loaded in {:.2f} sec'.format(elapsed)
Expand All @@ -101,7 +103,8 @@ def main(args):
else:
cases = kevlar.counting.load_samples(
args.case, args.ksize, args.memory, maxfpr=args.max_fpr,
numbands=args.num_bands, band=args.band, logfile=args.logfile
memfraction=None, numbands=args.num_bands, band=myband,
logfile=args.logfile
)
elapsed = timer.stop('loadcases')
print('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(elapsed),
Expand Down
4 changes: 2 additions & 2 deletions kevlar/tests/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

@pytest.mark.parametrize('numbands,band,kmers_stored', [
(0, 0, 15600),
(2, 1, 7992),
(16, 7, 1218),
(2, 1, 7937),
(16, 7, 1068),
])
def test_count_simple(numbands, band, kmers_stored, capsys):
with NamedTemporaryFile(suffix='.counttable') as ctrl1out, \
Expand Down
4 changes: 2 additions & 2 deletions kevlar/tests/test_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ def test_trio2(capsys):
'novel', '--case', case,
'--control', controls[0], '--control', controls[1],
'--band', str(i+1), '--num-bands', '4', '--out', novelouts[i],
'--memory', '200K', '--ksize', '31',
'--memory', '400K', '--ksize', '31',
'--case-min', '8', '--ctrl-max', '1'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)

arglist = [
'collect', '--memory', '5K', '--ksize', '31', '--minabund', '8',
'collect', '--memory', '10K', '--ksize', '31', '--minabund', '8',
'--collapse'
] + novelouts
args = kevlar.cli.parser().parse_args(arglist)
Expand Down