diff --git a/README.bio b/README.bio index 64f0c98..caf20bc 100644 --- a/README.bio +++ b/README.bio @@ -2,39 +2,42 @@ Bioawk is an extension to Brian Kernighan's awk acquired from [1], adding the support of several common biological data formats, including optionally gzip'ed BED, GFF, SAM, VCF, FASTA/Q and TAB-delimited formats with the column names. It also adds a few built-in functions including, as of now, and(), or(), xor(), -reverse() and revcomp(). The following are a few examples demonstrating the new -functionality: +reverse(), revcomp(), and fastx(). The following are a few examples demonstrating +the new functionality: 1. Extract unmapped reads without header: - awk -c sam 'and($flag,4)' aln.sam.gz + awk -c sam 'and($flag, 4)' aln.sam.gz 2. Extract mapped reads with header: - awk -c sam -H '!and($flag,4)' + awk -c sam -H '!and($flag, 4)' 3. Reverse complement FASTA: - awk -c fastx '{print ">"$name;print revcomp($seq)}' seq.fa.gz + awk -c fastx '{print fastx($name, revcomp($seq))}' seq.fa.gz 4. Create FASTA from SAM (uses revcomp if FLAG & 16) samtools view aln.bam | \ - awk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print ">"$qname"\n"s}' + awk -c sam '{s=$seq; if(and($flag, 16)) {s=revcomp($seq)} print fastx($qname, s)}' -5. Get the %GC from FASTA: +5. Get the %GC table from FASTA: - awk -c fastx '{print ">"$name;print gc($seq)}' seq.fa.gz + awk -c fastx '{print $name, gc($seq)}' seq.fa.gz -6. Get the mean Phred quality score from FASTQ: +6. Get the mean Phred quality score table from FASTQ: - awk -c fastx '{print ">"$name;print meanqual($qual)}' seq.fq.gz + awk -c fastx '{print $name, meanqual($qual)}' seq.fq.gz 7. Take column name from the first line (where "age" appears in the first line of input.txt): awk -c header '{print $age}' input.txt +8. Use awk's redirection to split a FASTA file by sequence lengths. + + awk -c fastx '{if (length($seq) < 35) {print fastx($name, $seq) > "short.fasta"} else {print fastx($name, $seq) > "long.fasta"}}' seq.fq.gz Note that when "-c" is not specified and the new built-in functions are not used, bioawk should behave exactly the same as the original BWK awk. At least diff --git a/addon.c b/addon.c index 1b31aba..4ac468b 100644 --- a/addon.c +++ b/addon.c @@ -230,6 +230,34 @@ Cell *bio_func(int f, Cell *x, Node **a) if (buf[i] - 33 >= thres) ++cnt; setfval(y, (Awkfloat)cnt); } + } else if (f == BIO_FFASTX) { + if (a[1]->nnext == 0) { + FATAL("fastx requires at least two arguments"); + } else { + char *buf, *name, *seq, *qual; + int bufsz=3*recsize; + int has_qual; + z = execute(a[1]->nnext); + if ((has_qual = a[1]->nnext->nnext != 0)) { + y = execute(a[1]->nnext->nnext); + qual = getsval(y); + } + + if ((buf = (char *) malloc(bufsz)) == NULL) + FATAL("out of memory in fastx"); + + name = getsval(x); + seq = getsval(z); + if (!has_qual) { + sprintf(buf, ">%s\n%s", name, seq); + } else { + if (strlen(seq) != strlen(qual)) + WARNING("fastx arguments seq and qual are not same length"); + sprintf(buf, "@%s\n%s\n+\n%s", name, seq, qual); + } + setsval(y, buf); + free(buf); + } } /* else: never happens */ return y; } diff --git a/addon.h b/addon.h index 948c7b0..06dfc50 100644 --- a/addon.h +++ b/addon.h @@ -41,6 +41,7 @@ int bio_getrec(char **pbuf, int *psize, int isrecord); #define BIO_FMEANQUAL 204 #define BIO_FQUALCOUNT 205 #define BIO_FTRIMQ 206 +#define BIO_FFASTX 207 struct Cell; diff --git a/lex.c b/lex.c index d470364..3b96d74 100644 --- a/lex.c +++ b/lex.c @@ -58,11 +58,12 @@ Keyword keywords[] ={ /* keep sorted: binary searched */ { "else", ELSE, ELSE }, { "exit", EXIT, EXIT }, { "exp", FEXP, BLTIN }, + { "fastx", BIO_FFASTX, BLTIN }, { "fflush", FFLUSH, BLTIN }, { "for", FOR, FOR }, { "func", FUNC, FUNC }, { "function", FUNC, FUNC }, - { "gc", BIO_FGC, BLTIN }, + { "gc", BIO_FGC, BLTIN }, { "getline", GETLINE, GETLINE }, { "gsub", GSUB, GSUB }, { "if", IF, IF }, @@ -72,7 +73,7 @@ Keyword keywords[] ={ /* keep sorted: binary searched */ { "length", FLENGTH, BLTIN }, { "log", FLOG, BLTIN }, { "match", MATCHFCN, MATCHFCN }, - { "meanqual", BIO_FMEANQUAL, BLTIN }, + { "meanqual", BIO_FMEANQUAL, BLTIN }, { "next", NEXT, NEXT }, { "nextfile", NEXTFILE, NEXTFILE }, { "or", BIO_FOR, BLTIN }, @@ -81,8 +82,8 @@ Keyword keywords[] ={ /* keep sorted: binary searched */ { "qualcount", BIO_FQUALCOUNT, BLTIN }, { "rand", FRAND, BLTIN }, { "return", RETURN, RETURN }, - { "revcomp",BIO_FREVCOMP, BLTIN }, - { "reverse",BIO_FREVERSE, BLTIN }, + { "revcomp", BIO_FREVCOMP, BLTIN }, + { "reverse", BIO_FREVERSE, BLTIN }, { "sin", FSIN, BLTIN }, { "split", SPLIT, SPLIT }, { "sprintf", SPRINTF, SPRINTF },