Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "htslib"]
path = htslib
url = https://github.com/samtools/htslib
55 changes: 35 additions & 20 deletions hts_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <array>
#include <list>
#include <string>
#include <stdexcept>
#include <htslib/vcf.h>
#include <htslib/sam.h>
#include <htslib/hts.h>
Expand All @@ -27,8 +28,8 @@ namespace hts_util {
std::string alt = "";
std::string ref = "";
// the following WON'T be used in comparators!
std::string id = "";
std::array<int32_t, 2> ad = {0, 0};
std::string id = "";
alignas(8) std::array<int32_t, 2> ad = {0, 0};
// int32_t rc = 0;
// int32_t ac = 0;
bool rec_start = 0;
Expand All @@ -38,17 +39,18 @@ namespace hts_util {
std::vector<Var> vs;
char* ref = b->d.allele[0];
// to deal with multi-allele ids:
std::vector<std::string> ids = parse_ids(b->d.id);
std::vector<std::string> ids = util::parse_ids(b->d.id);
for (uint32_t i = 1; i < b->n_allele; ++i) {
char* alt = b->d.allele[i];
std::string id = ids.size() == (b->n_allele - 1) ? ids[i-1] : std::string(b->d.id);
if (alt[0] == '.') continue;
if (strlen(alt) < strlen(ref)) { // DEL
vs.push_back(Var(b->pos, VTYPE::V_DEL, alt, ref, id)); // don't need alt here
} else if (strlen(alt) > strlen(ref)) { // INS
vs.push_back(Var(b->pos, VTYPE::V_INS, alt, ref, id)); // don't need ref here
std::string id = ids.size() == static_cast<unsigned>((b->n_allele - 1)) ? ids[i-1] : std::string(b->d.id);
if (*alt == '.') continue;
const size_t alen = std::strlen(alt);
if (alen < strlen(ref)) { // DEL
vs.push_back(Var(b->pos, VTYPE::V_DEL, alt, ref, std::move(id))); // don't need alt here
} else if (alen > strlen(ref)) { // INS
vs.push_back(Var(b->pos, VTYPE::V_INS, alt, ref, std::move(id))); // don't need ref here
} else { // SNP
vs.push_back(Var(b->pos, VTYPE::V_SNP, alt, ref, id)); // don't need ref here
vs.push_back(Var(b->pos, VTYPE::V_SNP, alt, ref, std::move(id))); // don't need ref here
}
}
vs[0].rec_start = 1;
Expand Down Expand Up @@ -79,18 +81,24 @@ namespace hts_util {
uint32_t* cs = bam_get_cigar(aln);
int32_t qpos = 0;
int32_t rpos = aln->core.pos;
int32_t rlen, qlen;
int32_t qlen, rlen, oplen, optype;
for (uint32_t i = 0; i < aln->core.n_cigar; ++i) {
rlen = (bam_cigar_type(cs[i]) & 2) ? bam_cigar_oplen(cs[i]) : 0;
qlen = (bam_cigar_type(cs[i]) & 1) ? bam_cigar_oplen(cs[i]) : 0;
if (bam_cigar_op(cs[i]) == BAM_CINS) {
std::string ins_seq = "";
uint32_t csv = cs[i];
oplen = bam_cigar_oplen(csv);
optype = bam_cigar_type(csv);
rlen = (optype & 2) ? oplen : 0;
qlen = (optype & 1) ? oplen : 0;
switch(bam_cigar_op(csv)) {
case BAM_CINS: {
std::string ins_seq;
ins_seq += seq_nt16_str[bam_seqi(bam_get_seq(aln), qpos-1)]; // adding the previous character here for compatibility with VCF format
for (uint32_t j = 0; j < bam_cigar_oplen(cs[i]); ++j) {
ins_seq += seq_nt16_str[bam_seqi(bam_get_seq(aln), qpos+j)];
}
vs.push_back(Var(rpos-1, VTYPE::V_INS, ins_seq, ins_seq.substr(0,1)));
} else if (bam_cigar_op(cs[i]) == BAM_CMATCH) {
}
break;
case BAM_CMATCH: {
// check potential snps here
for (auto it = snps.begin(); it != snps.end(); ) {
int32_t s = it->second;
Expand All @@ -101,7 +109,9 @@ namespace hts_util {
it = snps.erase(it); // we do this so we don't recheck snps that have already been determined, but... maybe deleting it would actually be more expensive?
} else ++it;
}
} else if (bam_cigar_op(cs[i]) == BAM_CDEL) {
}
break;
case BAM_CDEL: {
for (auto it = dels.begin(); it != dels.end(); ) {
int32_t d = it->second;
if (d == rpos - 1) {
Expand All @@ -112,13 +122,14 @@ namespace hts_util {
} else ++it;
}
}
} // switch(bam_cigar_type(csv))
rpos += rlen;
qpos += qlen;
}
return vs;
}

};
}; // struct Var



Expand Down Expand Up @@ -182,7 +193,7 @@ namespace hts_util {
int32_t* get_genotype(bcf_hdr_t* hdr, bcf1_t* rec, int* ngt) {
int32_t *gt_arr = NULL, ngt_arr = 0;
*ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt_arr);
if ( ngt<=0 ) return NULL; // GT not present
if ( *ngt<=0 ) return NULL; // GT not present

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this was a bug, since pointers are always nonnegative.

else return gt_arr;
}

Expand Down Expand Up @@ -212,6 +223,10 @@ namespace hts_util {
}
return pls;
}
};
struct htslib_error: public std::runtime_error {
htslib_error(std::string msg): std::runtime_error(std::string("htslib error: ") + msg) {}
};

} // hts_util

#endif // VARCOUNT_HPP
1 change: 1 addition & 0 deletions htslib
Submodule htslib added at 7fc833
44 changes: 44 additions & 0 deletions manual_makefile/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
CXX=g++
WARNINGS=-Wall -Wextra -Wno-char-subscripts \
-Wpointer-arith -Wwrite-strings -Wdisabled-optimization \
-Wformat -Wcast-align -Wno-unused-function -Wno-unused-parameter \
-pedantic -Wunused-variable -Wno-attributes -Wno-pedantic -Wno-ignored-attributes
CXX_FLAGS=-std=c++11 -O3 -Wall -Wextra -Ihtslib $(WARNINGS)

LIBS=-lz -lbz2 -llzma -lcurl
CPP=$(wildcard *.cpp)
OBJ=$(CPP:%.cpp=%.o)

all: varcount next_gt test unit

htslib/libhts.a:
cd htslib && make libhts.a

varcount: varcount.o hts_util.hpp
$(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS)

compare_vcfs: compare_vcfs.o hts_util.hpp
$(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS)

vcf_score: vcf_score.o hts_util.hpp
$(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS)

next_gt: next_gt.o hts_util.hpp
$(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS)

test: test.o hts_util.hpp
$(CXX) $(CXX_FLAGS) $^ htslib/libhts.a -o $@ $(LIBS)

unit: testutil.o
$(CXX) $< -o $@ $(CXX_FLAGS) $(LIBS)

testutil: testutil.o
$(CXX) $< -o $@ $(CXX_FLAGS) $(LIBS)

%.o: %.cpp
$(CXX) $(CXX_FLAGS) -c $< -o $@ $(LIBS)

.PHONY: clean

clean:
-rm varcount test $(OBJ)
3 changes: 2 additions & 1 deletion mdparse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

struct MDPos {
uint32_t p = 0;
std::string str = ""; // maybe we can get away with char* instead?
std::string str; // maybe we can get away with char* instead?
// maybe leave it -- it's probably < 16, so it should stay on the stack through SSO.
int st = MD_END;
};

Expand Down
13 changes: 12 additions & 1 deletion next_gt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ struct VarGT {
bool rec_start = 0;
};

static inline bool operator==(const VarGT& lv, const VarGT& rv) {
bcf1_t &l = *lv.rec, &r = *rv.rec;
return l.pos == r.pos && std::strcmp(r.d.allele[0], l.d.allele[0]) == 0 &&
std::strcmp(r.d.allele[1], l.d.allele[1]) == 0 &&
lv.gt1 == rv.gt1 &&
lv.gt2 == rv.gt2;
}
static inline bool operator!=(const VarGT& lv, const VarGT& rv) {
return !(lv == rv);
}

static inline bool var_match(const VarGT& lv, const VarGT& rv) {
bcf1_t* l = lv.rec;
bcf1_t* r = rv.rec;
Expand Down Expand Up @@ -79,7 +90,7 @@ void compare_vcfs(CmpVcfArgs args) {
bool match = 0;
if (vars1_iter != vmap->end()) {
for (const auto& v1: vars1_iter->second) {
if (var_match(v1, v2)) {
if (v1 == v2) {
match = 1;
break;
}
Expand Down
25 changes: 16 additions & 9 deletions util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,25 @@
#include <cstring>
#include <string>


std::vector<std::string> parse_ids(const char* s) {
namespace util {
template<char SEP=';'>
std::vector<std::string> parse_dsv(const char* s) {
std::vector<std::string> ss;
char* s__ = (char*) malloc(sizeof(char) * (strlen(s) + 1));
strcpy(s__, s);
char* s_ = std::strtok(s__, ";");
while (s_ != NULL) {
ss.push_back(s_);
s_ = std::strtok(NULL, ";");
const char *p1 = s, *p2 = std::strchr(p1, SEP);
for(;;) {
ss.emplace_back(p1, p2 - p1);
p1 = p2 + 1;
if((p2 = std::strchr(p1, SEP)) == nullptr) {
p2 = std::strchr(p1, '\0');
ss.emplace_back(p1, p2 - p1);
break;
}
}
free(s__);
return ss;
}
std::vector<std::string> parse_ids(const char* s) {
return parse_dsv<';'>(s);
}
} // util

#endif
66 changes: 32 additions & 34 deletions varcount.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <htslib/hts.h>
#include <getopt.h>
#include "hts_util.hpp"
using hts_util::htslib_error;

struct VcntArgs {
enum GT_TYPE {GTN, GTL, GTT, GTA};
Expand Down Expand Up @@ -51,15 +52,11 @@ std::array<int32_t, 2> gt_by_threshold(const VcntArgs& args, const hts_util::Var
gts[0] = bcf_gt_missing;
if (args.thres < 0 && v.ad[1]) {
gts[0] = bcf_gt_phased(v.ad[1] != 0);
} else if ((v.ad[1] || v.ad[0]) && std::abs(v.ad[0] - v.ad[1]) >= args.thres) {
if (v.ad[0] < v.ad[1]) {
gts[0] = bcf_gt_phased(1);
} else if (v.ad[0] > v.ad[1]) {
gts[0] = bcf_gt_phased(0);
} else { // v.ad[0] == v.ad[1]
gts[0] = bcf_gt_phased(args.alt_default);
}
}
} else if (*(reinterpret_cast<const uint64_t *>(&v.ad[0])) && std::abs(v.ad[0] - v.ad[1]) >= args.thres) {
gts[0] = v.ad[0] < v.ad[1] ? bcf_gt_phased(1) :
v.ad[0] > v.ad[1] ? bcf_gt_phased(0) :
bcf_gt_phased(args.alt_default);
}
gts[1] = gts[0];
return gts;
}
Expand All @@ -73,7 +70,7 @@ std::array<int32_t, 2> gt_by_alt_evidence(const VcntArgs& args, const hts_util::
} else {
gts[0] = bcf_gt_missing;
}
gts[1] = gts[0]; // force homozygous
gts[1] = gts[0]; // force homozygous
return gts;
}

Expand Down Expand Up @@ -110,7 +107,7 @@ void varcount(const VcntArgs& args) {
if (bcf_hdr_set_samples(vcf_hdr, NULL, 0)) {
fprintf(stderr, "error setting samples\n");
exit(1);
}
}

hts_util::contig2map_map<hts_util::Var> contig2vars(hts_util::bcf_to_map(vcf_fp, vcf_hdr));

Expand Down Expand Up @@ -149,7 +146,7 @@ void varcount(const VcntArgs& args) {
x |= var_match(vv, av);
}
// x ? ++(v_cached->ad[1]) : ++(v_cached->ad[0]);
x ? ++(vv.ad[1]) : ++(vv.ad[0]);
++vv.ad[x != 0];
}
if (args.verbose) {
fprintf(stderr, "v %s ", bam_get_qname(aln));
Expand Down Expand Up @@ -183,7 +180,7 @@ void varcount(const VcntArgs& args) {
bcf_hdr_append(out_vcf_hdr, "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic Depth\">");
if (args.gt)
bcf_hdr_append(out_vcf_hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
if (args.gl)
if (args.gl)
bcf_hdr_append(out_vcf_hdr, "##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
if (bcf_hdr_write(out_vcf_fp, out_vcf_hdr)) {
fprintf(stderr, "bcf_hdr_write error\n");
Expand All @@ -196,43 +193,43 @@ void varcount(const VcntArgs& args) {
// TODO: sort the output?
for (const auto& vs: contig2vars) { // vs: {seq, map}
for (const auto& v: vs.second) { // v: {pos, Varlist}
auto it = v.second.begin();
for (auto it = v.second.begin(); it != v.second.end(); ++it) {
std::string allele_str(it->ref + "," + it->alt);
for(const auto var: v.second) {
//for (auto it = v.second.begin(); it != v.second.end(); ++it) {
std::string allele_str(var.ref + "," + var.alt);
out_vcf_rec->rid = bcf_hdr_name2id(out_vcf_hdr, vs.first.data());
out_vcf_rec->pos = it->pos;
out_vcf_rec->rlen = it->ref.size();
out_vcf_rec->pos = var.pos;
out_vcf_rec->rlen = var.ref.size();
bcf_update_alleles_str(out_vcf_hdr, out_vcf_rec, allele_str.data());
bcf_update_id(out_vcf_hdr, out_vcf_rec, it->id.data());
bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "ALTCNT", &(it->ad[1]), 1);
bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "REFCNT", &(it->ad[0]), 1);
bcf_update_id(out_vcf_hdr, out_vcf_rec, var.id.data());
bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "ALTCNT", &(var.ad[1]), 1);
bcf_update_info_int32(out_vcf_hdr, out_vcf_rec, "REFCNT", &(var.ad[0]), 1);
std::array<int32_t, 3> pls;
std::array<int32_t, 2> gts;
if (args.gl || (args.gt == VcntArgs::GT_TYPE::GTL)) {
/* TODO: handle indels smarter. Use individual base qualities */
pls = hts_util::get_pls_naive_normalized(it->ad[0], it->ad[1], args.e);
pls = hts_util::get_pls_naive_normalized(var.ad[0], var.ad[1], args.e);
}
int gt_pass = 1;
int count_pass = (it->ad[0] + it->ad[1] >= args.min_c &&
it->ad[1] >= args.min_ac &&
it->ad[0] >= args.min_rc);
int count_pass = (var.ad[0] + var.ad[1] >= args.min_c &&
var.ad[1] >= args.min_ac &&
var.ad[0] >= args.min_rc);
if (count_pass && args.gt) { // naive genotyping
if (args.gt == VcntArgs::GT_TYPE::GTA) {
gts = gt_by_alt_evidence(args, *it);
} else if (args.gt == VcntArgs::GT_TYPE::GTT) {
gts = gt_by_threshold(args, *it);
} else { // default: genotype by likelihood
gts = gt_by_likelihood(args, pls);
switch(args.gt) {
case VcntArgs::GT_TYPE::GTA:
gts = gt_by_alt_evidence(args, var); break;
case VcntArgs::GT_TYPE::GTT:
gts = gt_by_threshold(args, var); break;
default:
gts = gt_by_likelihood(args, pls);
}
bcf_update_genotypes(out_vcf_hdr, out_vcf_rec, gts.data(), 2);
gt_pass = !bcf_gt_is_missing(gts[0]);
}
if (args.keep || ( count_pass && gt_pass )) {
if (args.gl) bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "PL", pls.data(), 3);
bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "AD", it->ad.data(), 2);
bcf_update_format_int32(out_vcf_hdr, out_vcf_rec, "AD", var.ad.data(), 2);
if (bcf_write(out_vcf_fp, out_vcf_hdr, out_vcf_rec)) {
fprintf(stderr, "bcf_write error\n");
exit(1);
throw htslib_error("bcf_write failed");
}
}
bcf_clear(out_vcf_rec);
Expand All @@ -244,6 +241,7 @@ void varcount(const VcntArgs& args) {

bcf_hdr_destroy(vcf_hdr);
bcf_close(vcf_fp);
bcf_destroy(out_vcf_rec);
}

void print_help() {
Expand Down
Loading