From 34ae7d23303b5bd8717b4b0b0f70d760d43045d5 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 14:38:12 -0500 Subject: [PATCH 1/8] Support pre-defined col. defns' for genomics --- nawk/addon.c | 96 ++++++++++++++++++++++++++++++++++++++++++---------- nawk/addon.h | 13 +++++-- nawk/main.c | 18 ++++++++-- nawk/run.c | 2 +- 4 files changed, 107 insertions(+), 22 deletions(-) diff --git a/nawk/addon.c b/nawk/addon.c index e7a8566..49fc7c5 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -1,9 +1,11 @@ #include #include +#include #include "awk.h" #include "addon.h" -int lh3_has_colnm = 0; +const char *lh3_col_defn = NULL; +const char *valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", NULL}; static void set_colnm_aux(const char *p, int col) { @@ -12,25 +14,85 @@ static void set_colnm_aux(const char *p, int col) for (q = p; *q; ++q) if (!isdigit(*q)) break; if (*q == 0) return; /* do not set if string p is an integer */ - if ((x = lookup(p, symtab)) != NULL) + if ((x = lookup(p, symtab)) != NULL) { x->tval = NUM, x->fval = col; + } +} + +int isvalid_coldef(const char *request) +{ + int i; + for (i = 0; valid_coldefs[i] != NULL; ++i) + if (strcmp(request, valid_coldefs[i]) == 0) + return 1; + return 0; +} + +void print_valid_coldefs() +{ + printf("valid -c options include:\n"); + int i; + for (i = 0; valid_coldefs[i] != NULL; ++i) + printf(" %s\n", valid_coldefs[i]); } void lh3_set_colnm() { - char *p, *q, c; - int i; - if (lh3_has_colnm == 0) return; - for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */ - for (i = 1, q = p; *q; ++q) { - if (!isspace(*q)) continue; - c = *q; /* backup the space */ - *q = 0; /* terminate the field */ - set_colnm_aux(p, i); - *q = c; /* change back */ - ++i; - for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */ - q = p; - } - set_colnm_aux(p, i); /* the last column */ + if (lh3_col_defn == NULL) return; + + if (strcmp(lh3_col_defn, "header") == 0) { + char *p, *q, c; + int i; + 
for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */ + for (i = 1, q = p; *q; ++q) { + if (!isspace(*q)) continue; + c = *q; /* backup the space */ + *q = 0; /* terminate the field */ + set_colnm_aux(p, i); + *q = c; /* change back */ + ++i; + for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */ + q = p; + } + set_colnm_aux(p, i); /* the last column */ + } + else if (strcmp(lh3_col_defn, "bed") == 0) { + set_colnm_aux("chrom", 1); + set_colnm_aux("start", 2); + set_colnm_aux("end", 3); + set_colnm_aux("name", 4); + set_colnm_aux("score", 5); + set_colnm_aux("strand", 6); + } + else if (strcmp(lh3_col_defn, "bedgraph") == 0) { + set_colnm_aux("chrom", 1); + set_colnm_aux("start", 2); + set_colnm_aux("end", 3); + set_colnm_aux("score", 4); + } + else if (strcmp(lh3_col_defn, "sam") == 0) { + set_colnm_aux("qname", 1); + set_colnm_aux("flag", 2); + set_colnm_aux("rname", 3); + set_colnm_aux("pos", 4); + set_colnm_aux("mapq", 5); + set_colnm_aux("cigar", 6); + set_colnm_aux("rnext", 7); + set_colnm_aux("pnext", 8); + set_colnm_aux("tlen", 9); + set_colnm_aux("seq", 10); + set_colnm_aux("qual", 11); + // todo: any intellignet way to handle tags? + } + else if (strcmp(lh3_col_defn, "vcf") == 0) { + set_colnm_aux("chrom", 1); + set_colnm_aux("pos", 2); + set_colnm_aux("is", 3); + set_colnm_aux("ref", 4); + set_colnm_aux("alt", 5); + set_colnm_aux("qual", 6); + set_colnm_aux("filter", 7); + set_colnm_aux("info", 8); + // todo: any intellignet way to handle genotypes? + } } diff --git a/nawk/addon.h b/nawk/addon.h index c97ae00..2685105 100644 --- a/nawk/addon.h +++ b/nawk/addon.h @@ -1,7 +1,16 @@ #ifndef ADDON_H #define ADDON_H -extern int lh3_has_colnm; -void lh3_set_colnm(); +//extern int lh3_has_colnm; + +/* column definition, "header" reads frist line + other current options include: + bed, bedgraph, sam, vcf +*/ +extern const char *lh3_col_defn; + +void lh3_set_colnm(); /* assoc. 
col names w/ numbers */ +int isvalid_coldef(const char *); /* is the req. col defn. valid? */ +void print_valid_coldefs(); /* print a list of supported col defns. */ #endif diff --git a/nawk/main.c b/nawk/main.c index 3ee3952..ccb30d8 100644 --- a/nawk/main.c +++ b/nawk/main.c @@ -58,7 +58,7 @@ int safe = 0; /* 1 => "safe" mode */ int main(int argc, char *argv[]) { const char *fs = NULL; - + setlocale(LC_CTYPE, ""); setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */ cmdname = argv[0]; @@ -144,7 +144,21 @@ int main(int argc, char *argv[]) printf("awk %s\n", version); break; case 'c': - lh3_has_colnm = 1; + if (argv[1][2] != 0) { /* arg is -csomething */ + lh3_col_defn = &argv[1][2]; + } else { /* arg is -v something */ + argc--; argv++; + if (argc <= 1) + FATAL("no variable name"); + if (isvalid_coldef(argv[1])) + lh3_col_defn = argv[1]; + else { + //printf("yep\n"); + print_valid_coldefs(); + exit(1); + //FATAL("invalid -v option argument: %s", argv[1]); + } + } break; default: WARNING("unknown option %s ignored", argv[1]); diff --git a/nawk/run.c b/nawk/run.c index 821c431..564ca95 100644 --- a/nawk/run.c +++ b/nawk/run.c @@ -189,7 +189,7 @@ Cell *program(Node **a, int n) /* execute an awk program */ } if (a[1] || a[2]) while (getrec(&record, &recsize, 1) > 0) { - if (lh3_has_colnm && (int)(*NR + .499) == 1) lh3_set_colnm(); + if (lh3_col_defn != NULL && (int)(*NR + .499) == 1) lh3_set_colnm(); x = execute(a[1]); if (isexit(x)) break; From 9397c7ac349b518cb0009b342fe267a1f80bdb47 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 14:42:35 -0500 Subject: [PATCH 2/8] cleanup --- nawk/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nawk/main.c b/nawk/main.c index ccb30d8..cfd48cd 100644 --- a/nawk/main.c +++ b/nawk/main.c @@ -146,17 +146,15 @@ int main(int argc, char *argv[]) case 'c': if (argv[1][2] != 0) { /* arg is -csomething */ lh3_col_defn = &argv[1][2]; - } else { /* arg is -v something */ + } else { /* 
arg is -c something */ argc--; argv++; if (argc <= 1) FATAL("no variable name"); if (isvalid_coldef(argv[1])) lh3_col_defn = argv[1]; else { - //printf("yep\n"); print_valid_coldefs(); exit(1); - //FATAL("invalid -v option argument: %s", argv[1]); } } break; From 838d545748de6fdaa68642206d0352422dd2aaa8 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 15:09:39 -0500 Subject: [PATCH 3/8] fixed VCF id column. --- nawk/addon.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nawk/addon.c b/nawk/addon.c index 49fc7c5..3a1da8b 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -14,9 +14,8 @@ static void set_colnm_aux(const char *p, int col) for (q = p; *q; ++q) if (!isdigit(*q)) break; if (*q == 0) return; /* do not set if string p is an integer */ - if ((x = lookup(p, symtab)) != NULL) { + if ((x = lookup(p, symtab)) != NULL) x->tval = NUM, x->fval = col; - } } int isvalid_coldef(const char *request) @@ -87,7 +86,7 @@ void lh3_set_colnm() else if (strcmp(lh3_col_defn, "vcf") == 0) { set_colnm_aux("chrom", 1); set_colnm_aux("pos", 2); - set_colnm_aux("is", 3); + set_colnm_aux("id", 3); set_colnm_aux("ref", 4); set_colnm_aux("alt", 5); set_colnm_aux("qual", 6); From fb4de49a394e6514ea0e8a6e8b62768f9313d223 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 18:50:51 -0500 Subject: [PATCH 4/8] report headers for sam (^@) and vcf (^#) --- nawk/addon.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nawk/addon.c b/nawk/addon.c index 3a1da8b..6a9576c 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -82,6 +82,11 @@ void lh3_set_colnm() set_colnm_aux("seq", 10); set_colnm_aux("qual", 11); // todo: any intellignet way to handle tags? 
+ + // auto-report any header lines + while (getrec(&record, &recsize, 1) > 0 && record[0] == '@') { + printf("%s\n", record); + } } else if (strcmp(lh3_col_defn, "vcf") == 0) { set_colnm_aux("chrom", 1); @@ -93,5 +98,10 @@ void lh3_set_colnm() set_colnm_aux("filter", 7); set_colnm_aux("info", 8); // todo: any intellignet way to handle genotypes? + + // auto-report any header lines + while (getrec(&record, &recsize, 1) > 0 && record[0] == '#') { + printf("%s\n", record); + } } } From 6f4a2988b2377257e67d8342ce07042ab4766cfb Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 20:11:31 -0500 Subject: [PATCH 5/8] force FS and OFS to tab iff [bed\|bedgraph\|vcf\|sam\|etc] --- nawk/addon.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nawk/addon.c b/nawk/addon.c index 6a9576c..f1f764a 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -62,12 +62,18 @@ void lh3_set_colnm() set_colnm_aux("name", 4); set_colnm_aux("score", 5); set_colnm_aux("strand", 6); + // force tab delimited input and output + *FS = "\t"; + *OFS = "\t"; } else if (strcmp(lh3_col_defn, "bedgraph") == 0) { set_colnm_aux("chrom", 1); set_colnm_aux("start", 2); set_colnm_aux("end", 3); set_colnm_aux("score", 4); + // force tab delimited input and output + *FS = "\t"; + *OFS = "\t"; } else if (strcmp(lh3_col_defn, "sam") == 0) { set_colnm_aux("qname", 1); @@ -81,8 +87,10 @@ void lh3_set_colnm() set_colnm_aux("tlen", 9); set_colnm_aux("seq", 10); set_colnm_aux("qual", 11); - // todo: any intellignet way to handle tags? - + // todo: any intelligent way to handle tags? + // force tab delimited input and output + *FS = "\t"; + *OFS = "\t"; // auto-report any header lines while (getrec(&record, &recsize, 1) > 0 && record[0] == '@') { printf("%s\n", record); @@ -97,8 +105,10 @@ void lh3_set_colnm() set_colnm_aux("qual", 6); set_colnm_aux("filter", 7); set_colnm_aux("info", 8); - // todo: any intellignet way to handle genotypes? 
- + // todo: any intelligent way to handle genotypes? + // force tab delimited input and output + *FS = "\t"; + *OFS = "\t"; // auto-report any header lines while (getrec(&record, &recsize, 1) > 0 && record[0] == '#') { printf("%s\n", record); From ef3dc692523399d52842983c94720e371c7b835c Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 20:23:46 -0500 Subject: [PATCH 6/8] support GFF and GFF, allowing for group or attribute as the 9th column --- nawk/addon.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/nawk/addon.c b/nawk/addon.c index f1f764a..51f1fb5 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -5,7 +5,7 @@ #include "addon.h" const char *lh3_col_defn = NULL; -const char *valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", NULL}; +const char *valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", "gff", "gtf", NULL}; static void set_colnm_aux(const char *p, int col) { @@ -114,4 +114,24 @@ void lh3_set_colnm() printf("%s\n", record); } } + else if (strcmp(lh3_col_defn, "gff") == 0 || strcmp(lh3_col_defn, "gtf") == 0) { + set_colnm_aux("seqname", 1); + set_colnm_aux("source", 2); + set_colnm_aux("feature", 3); + set_colnm_aux("start", 4); + set_colnm_aux("end", 5); + set_colnm_aux("score", 6); + set_colnm_aux("filter", 7); + set_colnm_aux("strand", 8); + set_colnm_aux("group", 9); // allow group or attribute, as + set_colnm_aux("attribute", 9); // GFF v1 used group, v2 uses attribute + // todo: any intelligent way to handle genotypes? 
+ // force tab delimited input and output + *FS = "\t"; + *OFS = "\t"; + // auto-report any header lines + while (getrec(&record, &recsize, 1) > 0 && record[0] == '#') { + printf("%s\n", record); + } + } } From 4b45165242ffd33a1a7dddf4651830016e6848f7 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan Date: Wed, 28 Dec 2011 20:37:04 -0500 Subject: [PATCH 7/8] remove comment --- nawk/addon.c | 1 - 1 file changed, 1 deletion(-) diff --git a/nawk/addon.c b/nawk/addon.c index 51f1fb5..7227523 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -125,7 +125,6 @@ void lh3_set_colnm() set_colnm_aux("strand", 8); set_colnm_aux("group", 9); // allow group or attribute, as set_colnm_aux("attribute", 9); // GFF v1 used group, v2 uses attribute - // todo: any intelligent way to handle genotypes? // force tab delimited input and output *FS = "\t"; *OFS = "\t"; From 7a9244a8b21ff111624778b6be49bd246fe36e2f Mon Sep 17 00:00:00 2001 From: arq5x Date: Mon, 2 Jan 2012 14:53:55 -0500 Subject: [PATCH 8/8] full bed def'n & "-c help" --- nawk/addon.c | 164 +++++++++++++++++++++++++++++++-------------------- nawk/main.c | 8 +-- 2 files changed, 104 insertions(+), 68 deletions(-) diff --git a/nawk/addon.c b/nawk/addon.c index 7227523..802299b 100644 --- a/nawk/addon.c +++ b/nawk/addon.c @@ -5,17 +5,40 @@ #include "addon.h" const char *lh3_col_defn = NULL; -const char *valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", "gff", "gtf", NULL}; +const char *valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", "gff", NULL}; + +/*BED*/ +const char *bed_coldefs[] = {"chrom", "start", "end", + "name", "score", "strand", + "thickstart", "thickend", "rgb", + "blockcount", "blocksizes", "blockstarts", + NULL}; +/*BEDGRAPH*/ +const char *bedgraph_coldefs[] = {"chrom", "start", "end", "score", NULL}; +/*SAM*/ +const char *sam_coldefs[] = {"qname", "flag", "rname", + "pos", "mapq", "cigar", + "rnext", "pnext", "tlen", + "seq", "qual", NULL}; +/*VCF*/ +const char *vcf_coldefs[] = {"chrom", "pos", 
"id", + "ref", "alt", "qual", + "filter", "info", NULL}; +/*GFF/GTF: "group" (GFF v1) and "attribute" (GFF v2) both name column 9*/ +const char *gff_coldefs[] = {"seqname", "source", "feature", + "start", "end", "score", + "strand", "frame", "group", + "attribute", NULL}; static void set_colnm_aux(const char *p, int col) { - const char *q; - Cell *x; - for (q = p; *q; ++q) - if (!isdigit(*q)) break; - if (*q == 0) return; /* do not set if string p is an integer */ - if ((x = lookup(p, symtab)) != NULL) - x->tval = NUM, x->fval = col; + const char *q; + Cell *x; + for (q = p; *q; ++q) + if (!isdigit(*q)) break; + if (*q == 0) return; /* do not set if string p is an integer */ + if ((x = lookup(p, symtab)) != NULL) + x->tval = NUM, x->fval = col; } int isvalid_coldef(const char *request) @@ -31,8 +54,42 @@ void print_valid_coldefs() { printf("valid -c options include:\n"); int i; - for (i = 0; valid_coldefs[i] != NULL; ++i) - printf(" %s\n", valid_coldefs[i]); + for (i = 0; valid_coldefs[i] != NULL; ++i) + { + const char *option = valid_coldefs[i]; + printf(" %d. \"%s\"\n", i+1,option); + + if (strcmp(option, "header") == 0) + printf(" input should contain a col. 
defn header as first line\n"); + else if (strcmp(option, "bed") == 0) { + int j; + for (j = 0; bed_coldefs[j] != NULL; ++j) + printf(" %s: column $%d\n", bed_coldefs[j], j+1); + } + else if (strcmp(option, "bedgraph") == 0) { + int j; + for (j = 0; bedgraph_coldefs[j] != NULL; ++j) + printf(" %s: column $%d\n", bedgraph_coldefs[j], j+1); + } + else if (strcmp(option, "sam") == 0) { + int j; + for (j = 0; sam_coldefs[j] != NULL; ++j) + printf(" %s: column $%d\n", sam_coldefs[j], j+1); + } + else if (strcmp(option, "vcf") == 0) { + int j; + for (j = 0; vcf_coldefs[j] != NULL; ++j) + printf(" %s: column $%d\n", vcf_coldefs[j], j+1); + } + else if (strcmp(option, "gff") == 0) { + int j; + for (j = 0; gff_coldefs[j] != NULL; ++j) + if (strcmp(gff_coldefs[j], "attribute") != 0) + printf(" %s: column $%d\n", gff_coldefs[j], j+1); + else + printf(" %s: column $%d\n", gff_coldefs[j], j); + } + } } void lh3_set_colnm() @@ -40,54 +97,41 @@ void lh3_set_colnm() if (lh3_col_defn == NULL) return; if (strcmp(lh3_col_defn, "header") == 0) { - char *p, *q, c; - int i; - for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */ - for (i = 1, q = p; *q; ++q) { - if (!isspace(*q)) continue; - c = *q; /* backup the space */ - *q = 0; /* terminate the field */ - set_colnm_aux(p, i); - *q = c; /* change back */ - ++i; - for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */ - q = p; - } - set_colnm_aux(p, i); /* the last column */ + char *p, *q, c; + int i; + for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */ + for (i = 1, q = p; *q; ++q) { + if (!isspace(*q)) continue; + c = *q; /* backup the space */ + *q = 0; /* terminate the field */ + set_colnm_aux(p, i); + *q = c; /* change back */ + ++i; + for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */ + q = p; + } + set_colnm_aux(p, i); /* the last column */ } else if (strcmp(lh3_col_defn, "bed") == 0) { - set_colnm_aux("chrom", 1); - set_colnm_aux("start", 2); - 
set_colnm_aux("end", 3); - set_colnm_aux("name", 4); - set_colnm_aux("score", 5); - set_colnm_aux("strand", 6); + int i; + for (i = 0; bed_coldefs[i] != NULL; ++i) + set_colnm_aux(bed_coldefs[i], i+1); // force tab delimited input and output *FS = "\t"; *OFS = "\t"; } else if (strcmp(lh3_col_defn, "bedgraph") == 0) { - set_colnm_aux("chrom", 1); - set_colnm_aux("start", 2); - set_colnm_aux("end", 3); - set_colnm_aux("score", 4); + int i; + for (i = 0; bedgraph_coldefs[i] != NULL; ++i) + set_colnm_aux(bedgraph_coldefs[i], i+1); // force tab delimited input and output *FS = "\t"; *OFS = "\t"; } else if (strcmp(lh3_col_defn, "sam") == 0) { - set_colnm_aux("qname", 1); - set_colnm_aux("flag", 2); - set_colnm_aux("rname", 3); - set_colnm_aux("pos", 4); - set_colnm_aux("mapq", 5); - set_colnm_aux("cigar", 6); - set_colnm_aux("rnext", 7); - set_colnm_aux("pnext", 8); - set_colnm_aux("tlen", 9); - set_colnm_aux("seq", 10); - set_colnm_aux("qual", 11); - // todo: any intelligent way to handle tags? + int i; + for (i = 0; sam_coldefs[i] != NULL; ++i) + set_colnm_aux(sam_coldefs[i], i+1); // force tab delimited input and output *FS = "\t"; *OFS = "\t"; @@ -97,14 +141,9 @@ void lh3_set_colnm() } } else if (strcmp(lh3_col_defn, "vcf") == 0) { - set_colnm_aux("chrom", 1); - set_colnm_aux("pos", 2); - set_colnm_aux("id", 3); - set_colnm_aux("ref", 4); - set_colnm_aux("alt", 5); - set_colnm_aux("qual", 6); - set_colnm_aux("filter", 7); - set_colnm_aux("info", 8); + int i; + for (i = 0; vcf_coldefs[i] != NULL; ++i) + set_colnm_aux(vcf_coldefs[i], i+1); // todo: any intelligent way to handle genotypes? 
// force tab delimited input and output *FS = "\t"; @@ -115,16 +154,13 @@ void lh3_set_colnm() } } else if (strcmp(lh3_col_defn, "gff") == 0 || strcmp(lh3_col_defn, "gtf") == 0) { - set_colnm_aux("seqname", 1); - set_colnm_aux("source", 2); - set_colnm_aux("feature", 3); - set_colnm_aux("start", 4); - set_colnm_aux("end", 5); - set_colnm_aux("score", 6); - set_colnm_aux("filter", 7); - set_colnm_aux("strand", 8); - set_colnm_aux("group", 9); // allow group or attribute, as - set_colnm_aux("attribute", 9); // GFF v1 used group, v2 uses attribute + int i; + for (i = 0; gff_coldefs[i] != NULL; ++i) + // "group" and "attribute" both name the ninth column, so "attribute" reuses index i + if (strcmp(gff_coldefs[i], "attribute") != 0) + set_colnm_aux(gff_coldefs[i], i+1); + else + set_colnm_aux(gff_coldefs[i], i); // force tab delimited input and output *FS = "\t"; *OFS = "\t"; diff --git a/nawk/main.c b/nawk/main.c index cfd48cd..2a757cc 100644 --- a/nawk/main.c +++ b/nawk/main.c @@ -150,12 +150,12 @@ int main(int argc, char *argv[]) argc--; argv++; if (argc <= 1) FATAL("no variable name"); - if (isvalid_coldef(argv[1])) - lh3_col_defn = argv[1]; - else { - print_valid_coldefs(); + if (!isvalid_coldef(argv[1]) || strcmp(argv[1], "help") == 0) { + print_valid_coldefs(); exit(1); } + else + lh3_col_defn = argv[1]; } break; default: