diff --git a/nawk/addon.c b/nawk/addon.c
index e7a8566..802299b 100644
--- a/nawk/addon.c
+++ b/nawk/addon.c
@@ -1,36 +1,144 @@
 #include <ctype.h>
 #include <stdio.h>
+#include <string.h>
 #include "awk.h"
 #include "addon.h"
 
-int lh3_has_colnm = 0;
+/* column definition selected with -c; NULL disables column names */
+const char *lh3_col_defn = NULL;
+/* every definition accepted by -c, NULL-terminated */
+static const char *const valid_coldefs[] = {"header", "bed", "bedgraph", "sam", "vcf", "gff", "gtf", NULL};
+
+/* preset tables, NULL-terminated: entry k names column k+1 (but see preset_column) */
+/*BED*/
+static const char *const bed_coldefs[] = {"chrom", "start", "end",
+	"name", "score", "strand",
+	"thickstart", "thickend", "rgb",
+	"blockcount", "blocksizes", "blockstarts",
+	NULL};
+/*BEDGRAPH*/
+static const char *const bedgraph_coldefs[] = {"chrom", "start", "end", "score", NULL};
+/*SAM*/
+static const char *const sam_coldefs[] = {"qname", "flag", "rname",
+	"pos", "mapq", "cigar",
+	"rnext", "pnext", "tlen",
+	"seq", "qual", NULL};
+/*VCF*/
+static const char *const vcf_coldefs[] = {"chrom", "pos", "id",
+	"ref", "alt", "qual",
+	"filter", "info", NULL};
+/*GFF/GTF: "group" and "attribute" both name the ninth column*/
+static const char *const gff_coldefs[] = {"seqname", "source", "feature",
+	"start", "end", "score",
+	"strand", "frame", "group",
+	"attribute", NULL};
 
 static void set_colnm_aux(const char *p, int col)
 {
 	const char *q;
 	Cell *x;
 	for (q = p; *q; ++q)
 		if (!isdigit(*q)) break;
 	if (*q == 0) return; /* do not set if string p is an integer */
 	if ((x = lookup(p, symtab)) != NULL)
 		x->tval = NUM, x->fval = col;
 }
 
+/* column-name table of a preset definition; NULL for "header" or an unknown name */
+static const char *const *preset_coldefs(const char *defn)
+{
+	if (strcmp(defn, "bed") == 0) return bed_coldefs;
+	if (strcmp(defn, "bedgraph") == 0) return bedgraph_coldefs;
+	if (strcmp(defn, "sam") == 0) return sam_coldefs;
+	if (strcmp(defn, "vcf") == 0) return vcf_coldefs;
+	if (strcmp(defn, "gff") == 0 || strcmp(defn, "gtf") == 0) return gff_coldefs;
+	return NULL;
+}
+
+/* 1-based column number named by names[idx] */
+static int preset_column(const char *const *names, int idx)
+{
+	/* "attribute" sits right after "group" in the GFF table and shares its column */
+	if (names == gff_coldefs && strcmp(names[idx], "attribute") == 0)
+		return idx;
+	return idx + 1;
+}
+
+/* first character of the header lines a preset format carries, or 0 if it has none */
+static char preset_header_char(const char *defn)
+{
+	if (strcmp(defn, "sam") == 0) return '@';
+	if (strcmp(defn, "vcf") == 0 || strcmp(defn, "gff") == 0 || strcmp(defn, "gtf") == 0)
+		return '#';
+	return 0;
+}
+
+/* return 1 if request is a supported -c column definition, 0 otherwise (NULL included) */
+int isvalid_coldef(const char *request)
+{
+	int i;
+	if (request == NULL) return 0;
+	for (i = 0; valid_coldefs[i] != NULL; ++i)
+		if (strcmp(request, valid_coldefs[i]) == 0)
+			return 1;
+	return 0;
+}
+
+/* print every supported -c column definition and the columns it names on stdout */
+void print_valid_coldefs()
+{
+	int i, j;
+	printf("valid -c options include:\n");
+	for (i = 0; valid_coldefs[i] != NULL; ++i) {
+		const char *const *names = preset_coldefs(valid_coldefs[i]);
+		printf(" %d. \"%s\"\n", i+1, valid_coldefs[i]);
+		if (names == NULL) { /* "header" is the only entry without a preset table */
+			printf(" input should contain a col. defn header as first line\n");
+			continue;
+		}
+		for (j = 0; names[j] != NULL; ++j)
+			printf(" %s: column $%d\n", names[j], preset_column(names, j));
+	}
+}
+
+/* apply a preset definition: name the columns, force tabs, echo leading header lines */
+static void set_colnm_preset(const char *defn)
+{
+	const char *const *names = preset_coldefs(defn);
+	char hdr = preset_header_char(defn);
+	int i;
+	if (names == NULL) return; /* unknown definition: leave everything untouched */
+	for (i = 0; names[i] != NULL; ++i)
+		set_colnm_aux(names[i], preset_column(names, i));
+	/* force tab delimited input and output */
+	*FS = "\t";
+	*OFS = "\t";
+	/* auto-report header lines, starting with the record that is already loaded */
+	while (hdr != 0 && record[0] == hdr) {
+		printf("%s\n", record);
+		if (getrec(&record, &recsize, 1) <= 0) break; /* input ended inside the header */
+	}
+}
+
 void lh3_set_colnm()
 {
 	char *p, *q, c;
 	int i;
-	if (lh3_has_colnm == 0) return;
+	if (lh3_col_defn == NULL) return;
+	if (strcmp(lh3_col_defn, "header") != 0) { /* preset format: no name line to parse */
+		set_colnm_preset(lh3_col_defn);
+		return;
+	}
 	for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */
 	for (i = 1, q = p; *q; ++q) {
 		if (!isspace(*q)) continue;
 		c = *q; /* backup the space */
 		*q = 0; /* terminate the field */
 		set_colnm_aux(p, i);
 		*q = c; /* change back */
 		++i;
 		for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */
 		q = p;
 	}
 	set_colnm_aux(p, i); /* the last column */
 }
diff --git a/nawk/addon.h b/nawk/addon.h
index c97ae00..2685105 100644
--- a/nawk/addon.h
+++ b/nawk/addon.h
@@ -1,7 +1,14 @@
 #ifndef ADDON_H
 #define ADDON_H
 
-extern int lh3_has_colnm;
-void lh3_set_colnm();
+/* column definition selected with -c, or NULL when column names are off:
+     "header"  reads the column names from the first input line
+     "bed", "bedgraph", "sam", "vcf", "gff", "gtf"  use the preset names
+*/
+extern const char *lh3_col_defn;
+
+void lh3_set_colnm(); /* assoc. col names w/ numbers */
+int isvalid_coldef(const char *request); /* is the req. col defn. valid? */
+void print_valid_coldefs(); /* print a list of supported col defns. */
 
 #endif
diff --git a/nawk/main.c b/nawk/main.c
index 3ee3952..2a757cc 100644
--- a/nawk/main.c
+++ b/nawk/main.c
@@ -144,7 +144,19 @@ int main(int argc, char *argv[])
 			printf("awk %s\n", version);
 			break;
 		case 'c':
-			lh3_has_colnm = 1;
+			if (argv[1][2] != 0) {	/* arg is -csomething */
+				lh3_col_defn = &argv[1][2];
+			} else {		/* arg is -c something */
+				argc--; argv++;
+				if (argc <= 1)
+					FATAL("no column definition given for -c");
+				lh3_col_defn = argv[1];
+			}
+			/* reject unknown definitions (e.g. "help") in both spellings */
+			if (!isvalid_coldef(lh3_col_defn)) {
+				print_valid_coldefs();
+				exit(1);
+			}
 			break;
 		default:
 			WARNING("unknown option %s ignored", argv[1]);
diff --git a/nawk/run.c b/nawk/run.c
index 821c431..564ca95 100644
--- a/nawk/run.c
+++ b/nawk/run.c
@@ -189,7 +189,7 @@ Cell *program(Node **a, int n)	/* execute an awk program */
 	}
 	if (a[1] || a[2])
 		while (getrec(&record, &recsize, 1) > 0) {
-			if (lh3_has_colnm && (int)(*NR + .499) == 1) lh3_set_colnm();
+			if (lh3_col_defn != NULL && (int)(*NR + .499) == 1) lh3_set_colnm();
 			x = execute(a[1]);
 			if (isexit(x))
 				break;