From ddc5c10352946e4e6ec979edd3ee3db2519d6e57 Mon Sep 17 00:00:00 2001 From: dnbaker Date: Thu, 3 Oct 2019 16:27:56 -0400 Subject: [PATCH 1/5] updates --- .gitmodules | 3 +++ Makefile | 2 +- hts_util.hpp | 4 ++-- htslib | 1 + varcount.cpp | 14 +++++--------- 5 files changed, 12 insertions(+), 12 deletions(-) create mode 100644 .gitmodules create mode 160000 htslib diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..b6e1f14 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "htslib"] + path = htslib + url = https://github.com/samtools/htslib diff --git a/Makefile b/Makefile index 86a35c0..ac0e96d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CXX=g++ -CXX_FLAGS=-std=c++11 -O3 -Wall -Wextra +CXX_FLAGS=-std=c++11 -O3 -Wall -Wextra -Ihtslib LIBS=-lhts CPP=$(wildcard *.cpp) diff --git a/hts_util.hpp b/hts_util.hpp index d4ac5df..4eb3159 100644 --- a/hts_util.hpp +++ b/hts_util.hpp @@ -28,7 +28,7 @@ namespace hts_util { std::string ref = ""; // the following WON'T be used in comparators! std::string id = ""; - std::array ad = {0, 0}; + alignas(8) std::array ad = {0, 0}; // int32_t rc = 0; // int32_t ac = 0; bool rec_start = 0; @@ -182,7 +182,7 @@ namespace hts_util { int32_t* get_genotype(bcf_hdr_t* hdr, bcf1_t* rec, int* ngt) { int32_t *gt_arr = NULL, ngt_arr = 0; *ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt_arr); - if ( ngt<=0 ) return NULL; // GT not present + if ( *ngt<=0 ) return NULL; // GT not present else return gt_arr; } diff --git a/htslib b/htslib new file mode 160000 index 0000000..7fc833d --- /dev/null +++ b/htslib @@ -0,0 +1 @@ +Subproject commit 7fc833d07462ff59dfbad30e8f517f4301eed433 diff --git a/varcount.cpp b/varcount.cpp index efda991..d721015 100644 --- a/varcount.cpp +++ b/varcount.cpp @@ -52,14 +52,10 @@ std::array gt_by_threshold(const VcntArgs& args, const hts_util::Var gts[0] = bcf_gt_missing; if (args.thres < 0 && v.ad[1]) { gts[0] = bcf_gt_phased(v.ad[1] != 0); - } else if ((v.ad[1] || v.ad[0]) && std::abs(v.ad[0] - v.ad[1]) >= args.thres) { - if (v.ad[0] < v.ad[1]) { - gts[0] = bcf_gt_phased(1); - } else if (v.ad[0] > v.ad[1]) { - gts[0] = bcf_gt_phased(0); - } else { // v.ad[0] == v.ad[1] - gts[0] = bcf_gt_phased(args.alt_default); - } + } else if (*(reinterpret_cast(&v.ad[0])) && std::abs(v.ad[0] - v.ad[1]) >= args.thres) { + gts[0] = v.ad[0] < v.ad[1] ? bcf_gt_phased(1) : + v.ad[0] > v.ad[1] ? bcf_gt_phased(0) : + bcf_gt_phased(args.alt_default); } gts[1] = gts[0]; return gts; @@ -149,7 +145,7 @@ void varcount(const VcntArgs& args) { x |= var_match(vv, av); } // x ? ++(v_cached->ad[1]) : ++(v_cached->ad[0]); - x ? ++(vv.ad[1]) : ++(vv.ad[0]); + ++vv.ad[x != 0]; } if (args.verbose) { fprintf(stderr, "v %s ", bam_get_qname(aln)); From e857eb2b117ecd452de35fa61bc771f071817c04 Mon Sep 17 00:00:00 2001 From: dnbaker Date: Thu, 3 Oct 2019 17:11:14 -0400 Subject: [PATCH 2/5] Eliminate heap allocation for string splitting. --- hts_util.hpp | 6 +++--- util.hpp | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/hts_util.hpp b/hts_util.hpp index 4eb3159..e354621 100644 --- a/hts_util.hpp +++ b/hts_util.hpp @@ -44,11 +44,11 @@ namespace hts_util { std::string id = ids.size() == (b->n_allele - 1) ? ids[i-1] : std::string(b->d.id); if (alt[0] == '.') continue; if (strlen(alt) < strlen(ref)) { // DEL - vs.push_back(Var(b->pos, VTYPE::V_DEL, alt, ref, id)); // don't need alt here + vs.push_back(Var(b->pos, VTYPE::V_DEL, alt, ref, std::move(id))); // don't need alt here } else if (strlen(alt) > strlen(ref)) { // INS - vs.push_back(Var(b->pos, VTYPE::V_INS, alt, ref, id)); // don't need ref here + vs.push_back(Var(b->pos, VTYPE::V_INS, alt, ref, std::move(id))); // don't need ref here } else { // SNP - vs.push_back(Var(b->pos, VTYPE::V_SNP, alt, ref, id)); // don't need ref here + vs.push_back(Var(b->pos, VTYPE::V_SNP, alt, ref, std::move(id))); // don't need ref here } } vs[0].rec_start = 1; diff --git a/util.hpp b/util.hpp index df66058..03e2104 100644 --- a/util.hpp +++ b/util.hpp @@ -9,6 +9,7 @@ std::vector parse_ids(const char* s) { std::vector ss; +#if TAHER char* s__ = (char*) malloc(sizeof(char) * (strlen(s) + 1)); strcpy(s__, s); char* s_ = std::strtok(s__, ";"); @@ -17,6 +18,18 @@ std::vector parse_ids(const char* s) { s_ = std::strtok(NULL, ";"); } free(s__); +#else + const char *p1 = s, *p2 = std::strchr(p1, ';'); + for(;;) { + ss.emplace_back(p1, p2 - p1); + p1 = p2 + 1; + if((p2 = std::strchr(p1, ';')) == nullptr) { + p2 = std::strchr(p1, '\0'); + ss.emplace_back(p1, p2 - p1); + break; + } + } +#endif return ss; } From c7fc901bcd2be50bbae42457ae46bcab620fc7ad Mon Sep 17 00:00:00 2001 From: dnbaker Date: Fri, 4 Oct 2019 01:28:58 -0400 Subject: [PATCH 3/5] Playing around. --- Makefile | 16 ++++++++++++++-- hts_util.hpp | 45 +++++++++++++++++++++++++++++--------------- mdparse.hpp | 3 ++- next_gt.cpp | 13 ++++++++++++- util.hpp | 20 +++++++------------- varcount.cpp | 52 ++++++++++++++++++++++++++------------------------- vcf_score.cpp | 6 +++--- 7 files changed, 95 insertions(+), 60 deletions(-) diff --git a/Makefile b/Makefile index 6451e11..0af78a1 100644 --- a/Makefile +++ b/Makefile @@ -9,20 +9,32 @@ LIBS=-lz -lbz2 -llzma -lcurl CPP=$(wildcard *.cpp) OBJ=$(CPP:%.cpp=%.o) +all: varcount next_gt test unit + htslib/libhts.a: cd htslib && make libhts.a -all: varcount compare_vcfs - varcount: varcount.o hts_util.hpp $(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS) compare_vcfs: compare_vcfs.o hts_util.hpp $(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS) +vcf_score: vcf_score.o hts_util.hpp + $(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS) + +next_gt: next_gt.o hts_util.hpp + $(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS) + test: test.o hts_util.hpp $(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS) +unit: testutil.o + $(CXX) $< -o $@ $(CXX_FLAGS) $(LIBS) + +testutil: testutil.o + $(CXX) $< -o $@ $(CXX_FLAGS) $(LIBS) + %.o: %.cpp $(CXX) $(CXX_FLAGS) -c $< -o $@ $(LIBS) diff --git a/hts_util.hpp b/hts_util.hpp index e354621..b2598d5 100644 --- a/hts_util.hpp +++ b/hts_util.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,7 @@ namespace hts_util { std::string alt = ""; std::string ref = ""; // the following WON'T be used in comparators! - std::string id = ""; + std::string id = ""; alignas(8) std::array ad = {0, 0}; // int32_t rc = 0; // int32_t ac = 0; @@ -38,14 +39,15 @@ namespace hts_util { std::vector vs; char* ref = b->d.allele[0]; // to deal with multi-allele ids: - std::vector ids = parse_ids(b->d.id); + std::vector ids = util::parse_ids(b->d.id); for (uint32_t i = 1; i < b->n_allele; ++i) { char* alt = b->d.allele[i]; - std::string id = ids.size() == (b->n_allele - 1) ? ids[i-1] : std::string(b->d.id); - if (alt[0] == '.') continue; - if (strlen(alt) < strlen(ref)) { // DEL + std::string id = ids.size() == static_cast((b->n_allele - 1)) ? ids[i-1] : std::string(b->d.id); + if (*alt == '.') continue; + const size_t alen = std::strlen(alt); + if (alen < strlen(ref)) { // DEL vs.push_back(Var(b->pos, VTYPE::V_DEL, alt, ref, std::move(id))); // don't need alt here - } else if (strlen(alt) > strlen(ref)) { // INS + } else if (alen > strlen(ref)) { // INS vs.push_back(Var(b->pos, VTYPE::V_INS, alt, ref, std::move(id))); // don't need ref here } else { // SNP vs.push_back(Var(b->pos, VTYPE::V_SNP, alt, ref, std::move(id))); // don't need ref here @@ -79,18 +81,24 @@ namespace hts_util { uint32_t* cs = bam_get_cigar(aln); int32_t qpos = 0; int32_t rpos = aln->core.pos; - int32_t rlen, qlen; + int32_t qlen, rlen, oplen, optype; for (uint32_t i = 0; i < aln->core.n_cigar; ++i) { - rlen = (bam_cigar_type(cs[i]) & 2) ? bam_cigar_oplen(cs[i]) : 0; - qlen = (bam_cigar_type(cs[i]) & 1) ? bam_cigar_oplen(cs[i]) : 0; - if (bam_cigar_op(cs[i]) == BAM_CINS) { - std::string ins_seq = ""; + uint32_t csv = cs[i]; + oplen = bam_cigar_oplen(csv); + optype = bam_cigar_type(csv); + rlen = (optype & 2) ? oplen : 0; + qlen = (optype & 1) ? oplen : 0; + switch(bam_cigar_op(csv)) { + case BAM_CINS: { + std::string ins_seq; ins_seq += seq_nt16_str[bam_seqi(bam_get_seq(aln), qpos-1)]; // adding the previous character here for compatibility with VCF format for (uint32_t j = 0; j < bam_cigar_oplen(cs[i]); ++j) { ins_seq += seq_nt16_str[bam_seqi(bam_get_seq(aln), qpos+j)]; } vs.push_back(Var(rpos-1, VTYPE::V_INS, ins_seq, ins_seq.substr(0,1))); - } else if (bam_cigar_op(cs[i]) == BAM_CMATCH) { + } + break; + case BAM_CMATCH: { // check potential snps here for (auto it = snps.begin(); it != snps.end(); ) { int32_t s = it->second; @@ -101,7 +109,9 @@ namespace hts_util { it = snps.erase(it); // we do this so we don't recheck snps that have already been determined, but... maybe deleting it would actually be more expensive? } else ++it; } - } else if (bam_cigar_op(cs[i]) == BAM_CDEL) { + } + break; + case BAM_CDEL: { for (auto it = dels.begin(); it != dels.end(); ) { int32_t d = it->second; if (d == rpos - 1) { @@ -112,13 +122,14 @@ namespace hts_util { } else ++it; } } + } // switch(bam_cigar_type(csv)) rpos += rlen; qpos += qlen; } return vs; } - }; + }; // struct Var @@ -212,6 +223,10 @@ namespace hts_util { } return pls; } -}; + struct htslib_error: public std::runtime_error { + htslib_error(std::string msg): std::runtime_error(std::string("htslib error: ") + msg) {} + }; + +} // hts_util #endif // VARCOUNT_HPP diff --git a/mdparse.hpp b/mdparse.hpp index ed16267..e912d37 100644 --- a/mdparse.hpp +++ b/mdparse.hpp @@ -11,7 +11,8 @@ struct MDPos { uint32_t p = 0; - std::string str = ""; // maybe we can get away with char* instead? + std::string str; // maybe we can get away with char* instead? + // maybe leave it -- it's probably < 16, so it should stay on the stack through SSO. int st = MD_END; }; diff --git a/next_gt.cpp b/next_gt.cpp index 69c4d4b..6f3e1d4 100644 --- a/next_gt.cpp +++ b/next_gt.cpp @@ -36,6 +36,17 @@ struct VarGT { bool rec_start = 0; }; +static inline bool operator==(const VarGT& lv, const VarGT& rv) { + bcf1_t &l = *lv.rec, &r = *rv.rec; + return l.pos == r.pos && std::strcmp(r.d.allele[0], l.d.allele[0]) == 0 && + std::strcmp(r.d.allele[1], l.d.allele[1]) == 0 && + lv.gt1 == rv.gt1 && + lv.gt2 == rv.gt2; +} +static inline bool operator!=(const VarGT& lv, const VarGT& rv) { + return !(lv == rv); +} + static inline bool var_match(const VarGT& lv, const VarGT& rv) { bcf1_t* l = lv.rec; bcf1_t* r = rv.rec; @@ -79,7 +90,7 @@ void compare_vcfs(CmpVcfArgs args) { bool match = 0; if (vars1_iter != vmap->end()) { for (const auto& v1: vars1_iter->second) { - if (var_match(v1, v2)) { + if (v1 == v2) { match = 1; break; } diff --git a/util.hpp b/util.hpp index 03e2104..944aa61 100644 --- a/util.hpp +++ b/util.hpp @@ -6,19 +6,10 @@ #include #include - -std::vector parse_ids(const char* s) { +namespace util { +template +std::vector parse_dsv(const char* s) { std::vector ss; -#if TAHER - char* s__ = (char*) malloc(sizeof(char) * (strlen(s) + 1)); - strcpy(s__, s); - char* s_ = std::strtok(s__, ";"); - while (s_ != NULL) { - ss.push_back(s_); - s_ = std::strtok(NULL, ";"); - } - free(s__); -#else const char *p1 = s, *p2 = std::strchr(p1, ';'); for(;;) { ss.emplace_back(p1, p2 - p1); @@ -29,8 +20,11 @@ std::vector parse_ids(const char* s) { break; } } -#endif return ss; } +std::vector parse_ids(const char* s) { + return parse_dsv<';'>(s); +} +} // util #endif diff --git a/varcount.cpp b/varcount.cpp index a918e1a..7a61d93 100644 --- a/varcount.cpp +++ b/varcount.cpp @@ -6,6 +6,7 @@ #include #include #include "hts_util.hpp" +using hts_util::htslib_error; struct VcntArgs { enum GT_TYPE {GTN, GTL, GTT, GTA}; @@ -55,7 +56,7 @@ std::array gt_by_threshold(const VcntArgs& args, const hts_util::Var gts[0] = v.ad[0] < v.ad[1] ? bcf_gt_phased(1) : v.ad[0] > v.ad[1] ? bcf_gt_phased(0) : bcf_gt_phased(args.alt_default); - } + } gts[1] = gts[0]; return gts; } @@ -69,7 +70,7 @@ std::array gt_by_alt_evidence(const VcntArgs& args, const hts_util:: } else { gts[0] = bcf_gt_missing; } - gts[1] = gts[0]; // force homozygous + gts[1] = gts[0]; // force homozygous return gts; } @@ -106,7 +107,7 @@ void varcount(const VcntArgs& args) { if (bcf_hdr_set_samples(vcf_hdr, NULL, 0)) { fprintf(stderr, "error setting samples\n"); exit(1); - } + } hts_util::contig2map_map contig2vars(hts_util::bcf_to_map(vcf_fp, vcf_hdr)); @@ -179,7 +180,7 @@ void varcount(const VcntArgs& args) { bcf_hdr_append(out_vcf_hdr, "##FORMAT="); if (args.gt) bcf_hdr_append(out_vcf_hdr, "##FORMAT="); - if (args.gl) + if (args.gl) bcf_hdr_append(out_vcf_hdr, "##FORMAT="); if (bcf_hdr_write(out_vcf_fp, out_vcf_hdr)) { fprintf(stderr, "bcf_hdr_write error\n"); @@ -192,43 +193,43 @@ void varcount(const VcntArgs& args) { // TODO: sort the output? for (const auto& vs: contig2vars) { // vs: {seq, map} for (const auto& v: vs.second) { // v: {pos, Varlist} - auto it = v.second.begin(); - for (auto it = v.second.begin(); it != v.second.end(); ++it) { - std::string allele_str(it->ref + "," + it->alt); + for(const auto var: v.second) { + //for (auto it = v.second.begin(); it != v.second.end(); ++it) { + std::string allele_str(var.ref + "," + var.alt); out_vcf_rec->rid = bcf_hdr_name2id(out_vcf_hdr, vs.first.data()); - out_vcf_rec->pos = it->pos; - out_vcf_rec->rlen = it->ref.size(); + out_vcf_rec->pos = var.pos; + out_vcf_rec->rlen = var.ref.size(); bcf_update_alleles_str(out_vcf_hdr, out_vcf_rec, allele_str.data()); - bcf_update_id(out_vcf_hdr, out_vcf_rec, it->id.data()); - bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "ALTCNT", &(it->ad[1]), 1); - bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "REFCNT", &(it->ad[0]), 1); + bcf_update_id(out_vcf_hdr, out_vcf_rec, var.id.data()); + bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "ALTCNT", &(var.ad[1]), 1); + bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "REFCNT", &(var.ad[0]), 1); std::array pls; std::array gts; if (args.gl || (args.gt == VcntArgs::GT_TYPE::GTL)) { /* TODO: handle indels smarter. Use individual base qualities */ - pls = hts_util::get_pls_naive_normalized(it->ad[0], it->ad[1], args.e); + pls = hts_util::get_pls_naive_normalized(var.ad[0], var.ad[1], args.e); } int gt_pass = 1; - int count_pass = (it->ad[0] + it->ad[1] >= args.min_c && - it->ad[1] >= args.min_ac && - it->ad[0] >= args.min_rc); + int count_pass = (var.ad[0] + var.ad[1] >= args.min_c && + var.ad[1] >= args.min_ac && + var.ad[0] >= args.min_rc); if (count_pass && args.gt) { // naive genotyping - if (args.gt == VcntArgs::GT_TYPE::GTA) { - gts = gt_by_alt_evidence(args, *it); - } else if (args.gt == VcntArgs::GT_TYPE::GTT) { - gts = gt_by_threshold(args, *it); - } else { // default: genotype by likelihood - gts = gt_by_likelihood(args, pls); + switch(args.gt) { + case VcntArgs::GT_TYPE::GTA: + gts = gt_by_alt_evidence(args, var); break; + case VcntArgs::GT_TYPE::GTT: + gts = gt_by_threshold(args, var); break; + default: + gts = gt_by_likelihood(args, pls); } bcf_update_genotypes(out_vcf_hdr, out_vcf_rec, gts.data(), 2); gt_pass = !bcf_gt_is_missing(gts[0]); } if (args.keep || ( count_pass && gt_pass )) { if (args.gl) bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "PL", pls.data(), 3); - bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "AD", it->ad.data(), 2); + bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "AD", var.ad.data(), 2); if (bcf_write(out_vcf_fp, out_vcf_hdr, out_vcf_rec)) { - fprintf(stderr, "bcf_write error\n"); - exit(1); + throw htslib_error("bcf_write failed"); } } bcf_clear(out_vcf_rec); @@ -240,6 +241,7 @@ void varcount(const VcntArgs& args) { bcf_hdr_destroy(vcf_hdr); bcf_close(vcf_fp); + bcf_destroy(out_vcf_rec); } void print_help() { diff --git a/vcf_score.cpp b/vcf_score.cpp index bd3b0fc..d0dd56b 100644 --- a/vcf_score.cpp +++ b/vcf_score.cpp @@ -20,7 +20,7 @@ static inline int gta(int i) { } struct VarGT { - VarGT(hts_util::Var v, int g1, int g2) : var(v), gt1(g1), gt2(g2) {} + VarGT(hts_util::Var v, int g1, int g2) : gt1(g1), gt2(g2), var(v) {} int gt1 = 0; int gt2 = 0; hts_util::Var var; @@ -31,10 +31,10 @@ struct VarGT { int* gts = hts_util::get_genotype(hdr, b, &ngt); if (ngt < 2) { fprintf(stderr, "not diploid!\n"); exit(1); } auto vs = hts_util::Var::from_bcf(hdr, b); - for (int i = 0; i < vs.size(); ++i) { + for (unsigned i = 0; i < vs.size(); ++i) { /* this hack allows compatibility with vcfs that don't allow multi-allelic lines */ for (int j = 0; j < 2; ++j) { - if (bcf_gt_allele(gts[j]) == i+1) { + if (static_cast(bcf_gt_allele(gts[j])) == i+1) { gts[j] = bcf_gt_is_phased(gts[j]) ? bcf_gt_phased(1) : bcf_gt_unphased(1); } else if (bcf_gt_allele(gts[j]) != 0) { gts[j] = bcf_gt_is_phased(gts[j]) ? bcf_gt_phased(2) : bcf_gt_unphased(2); From d24b70202b972b1c09f7cbea35066ab1c4c4eabd Mon Sep 17 00:00:00 2001 From: dnbaker Date: Fri, 4 Oct 2019 10:16:37 -0400 Subject: [PATCH 4/5] template --- util.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util.hpp b/util.hpp index 944aa61..7e5fb20 100644 --- a/util.hpp +++ b/util.hpp @@ -10,11 +10,11 @@ namespace util { template std::vector parse_dsv(const char* s) { std::vector ss; - const char *p1 = s, *p2 = std::strchr(p1, ';'); + const char *p1 = s, *p2 = std::strchr(p1, SEP); for(;;) { ss.emplace_back(p1, p2 - p1); p1 = p2 + 1; - if((p2 = std::strchr(p1, ';')) == nullptr) { + if((p2 = std::strchr(p1, SEP)) == nullptr) { p2 = std::strchr(p1, '\0'); ss.emplace_back(p1, p2 - p1); break; From ed2e3073bbbe4d19a421e201147ad99bddfbee3f Mon Sep 17 00:00:00 2001 From: dnbaker Date: Fri, 4 Oct 2019 13:49:49 -0400 Subject: [PATCH 5/5] move Makefile so it isn't clobbered. --- Makefile => manual_makefile/Makefile | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Makefile => manual_makefile/Makefile (100%) diff --git a/Makefile b/manual_makefile/Makefile similarity index 100% rename from Makefile rename to manual_makefile/Makefile