Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deps/libbdsg
Submodule libbdsg updated 39 files
+5 −3 CMakeLists.txt
+3 −1 Makefile
+2 −2 bdsg/cmake_bindings/bdsg/graph_proxy.cpp
+2 −2 bdsg/cmake_bindings/bdsg/graph_proxy_1.cpp
+2 −2 bdsg/cmake_bindings/bdsg/internal/base_packed_graph.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/eades_algorithm.cpp
+1 −6 bdsg/cmake_bindings/bdsg/internal/hash_map.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/is_single_stranded.cpp
+12 −10 bdsg/cmake_bindings/bdsg/internal/mapped_structs_1.cpp
+5 −5 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay.cpp
+46 −44 bdsg/cmake_bindings/bdsg/overlays/packed_reference_path_overlay.cpp
+5 −6 bdsg/cmake_bindings/bdsg/overlays/path_position_overlays.cpp
+3 −1 bdsg/cmake_bindings/bdsg/overlays/path_subgraph_overlay.cpp
+3 −3 bdsg/cmake_bindings/bdsg/overlays/reference_path_overlay.cpp
+1 −1 bdsg/cmake_bindings/bdsg/overlays/strand_split_overlay.cpp
+1 −1 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays.cpp
+2 −2 bdsg/cmake_bindings/bdsg/packed_graph.cpp
+17 −30 bdsg/cmake_bindings/bdsg/snarl_distance_index.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/expanding_overlay_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/mutable_path_metadata.cpp
+2 −2 bdsg/cmake_bindings/handlegraph/mutable_path_mutable_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_metadata.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_position_handle_graph.cpp
+2 −2 bdsg/cmake_bindings/handlegraph/trivially_serializable.cpp
+3 −1 bdsg/cmake_bindings/handlegraph/types.cpp
+0 −1 bdsg/cmake_bindings/std/bdsg/internal/binder_hook_bind.cpp
+27 −7 bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp
+24 −21 bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp
+23 −6 bdsg/include/bdsg/overlays/path_position_overlays.hpp
+10 −19 bdsg/include/bdsg/overlays/reference_path_overlay.hpp
+1 −1 bdsg/include/bdsg/snarl_distance_index.hpp
+35 −8 bdsg/src/packed_path_position_overlay.cpp
+33 −20 bdsg/src/packed_reference_path_overlay.cpp
+128 −69 bdsg/src/path_position_overlays.cpp
+18 −7 bdsg/src/reference_path_overlay.cpp
+39 −2 bdsg/src/test_libbdsg.cpp
+6 −3 make_and_run_binder.py
75 changes: 54 additions & 21 deletions src/subcommand/clip_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,23 +278,59 @@ int main_clip(int argc, char** argv) {
// need snarls if input regions are provided, or doing snarl based clipping
bool need_snarls = snarl_option || !bed_path.empty();

// TodO: FIX!! shouldn't need pp without BED coordinates
// TODO: FIX!! shouldn't need pp without BED coordinates
need_pp = need_pp || need_snarls;

if (!bed_path.empty()) {
// load the BED file
parse_bed_regions(bed_path, bed_regions);
if (verbose) {
logger.info() << "Loaded " << bed_regions.size() << " BED regions" << endl;
}
}

// It's going to be a little expensive to find the paths with basenames
// matching the prefixes (algorithmically, if not in practice), so we fill
// them in once and then re-use them.
std::vector<std::string> graph_paths_matched;
if (need_pp) {
pp_graph = overlay_helper.apply(graph.get());
// Figure out the paths we're going to need regions on.
std::unordered_set<std::string> position_path_names;

for (const Region& region : bed_regions) {
// For each region already defined (from the BED), we need its sequence indexed
position_path_names.insert(region.seq);
}

if (!ref_prefixes.empty()) {
// If we want all paths matching some prefixes
graph->for_each_path_handle([&](path_handle_t path_handle) {
// Look at all ther paths
std::string path_name = graph->get_path_name(path_handle);
subrange_t subrange;
// And get their base path names
path_name = Paths::strip_subrange(path_name, &subrange);
for (const string& ref_prefix : ref_prefixes) {
if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) {
// And make sure they're indexed if they match the prefix.
position_path_names.insert(path_name);
// And remember them to then index all of
graph_paths_matched.emplace_back(std::move(path_name));
break;
}
}
});
}

// Get the overlay, indexing all the paths we will need to care about
pp_graph = overlay_helper.apply(graph.get(), position_path_names);
if (verbose) {
logger.info() << "Computed path position overlay of input graph" << endl;
}
}

if (need_snarls) {
// load the BED file
if (!bed_path.empty()) {
parse_bed_regions(bed_path, bed_regions);
if (verbose) {
logger.info() << "Loaded " << bed_regions.size() << " BED regions" << endl;
}
// contig names left in this set are *not* in the graph
unordered_set<string> contig_set;
for (const Region& region : bed_regions) {
Expand Down Expand Up @@ -328,20 +364,17 @@ int main_clip(int argc, char** argv) {
} else {
assert(need_pp);
assert(!ref_prefixes.empty());
// load the BED regions from the reference path prefix
pp_graph->for_each_path_handle([&](path_handle_t path_handle) {
string path_name = pp_graph->get_path_name(path_handle);
subrange_t subrange;
path_name = Paths::strip_subrange(path_name, &subrange);
int64_t offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first;
for (const string& ref_prefix : ref_prefixes) {
if (path_name.compare(0, ref_prefix.length(), ref_prefix) == 0) {
Region region = {path_name, offset, offset + (int64_t)pp_graph->get_path_length(path_handle) - 1};
bed_regions.push_back(region);
break;
}
}
});

for (auto& path_name : graph_paths_matched) {
path_handle_t path_handle = pp_graph->get_path_handle(path_name);
// Fill in the BED regions from the paths matching the
// prefixes, now that we can get the lengths.
std::string base_name = get_path_base_name(*pp_graph, path_handle);
int64_t offset = get_path_base_offset(*pp_graph, path_handle);
Region region = {base_name, offset, offset + (int64_t)pp_graph->get_path_length(path_handle) - 1};
bed_regions.push_back(region);
}

if (verbose) {
logger.info() << "Inferred " << bed_regions.size()
<< " BED regions from paths in the graph" << endl;
Expand Down
38 changes: 26 additions & 12 deletions src/subcommand/find_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ void help_find(char** argv) {
<< " -Z, --min-mem N minimum length of the MEM [1]" << endl
<< " -D, --distance return distance on path between pair of nodes (-n)" << endl
<< " if -P not used, best path chosen heurstically" << endl
<< " -Q, --paths-named STR return all paths with name prefix STR (may repeat)" << endl;
<< " -Q, --paths-named STR return all paths with name prefix STR (may repeat)" << endl
<< " (deprecated)" << endl;

}

Expand Down Expand Up @@ -400,14 +401,36 @@ int main_find(int argc, char** argv) {
}
}

// Parse any targets
// handle targets from BED
if (!bed_targets_file.empty()) {
parse_bed_regions(bed_targets_file, targets);
}
// those given on the command line
for (auto& target : targets_str) {
Region region;
parse_region(target, region);
targets.push_back(region);
}

// Find out paths we will need to make position queries on, in case they
// aren't already the right sense.
std::unordered_set<std::string> required_position_paths;
for (const Region& r : targets) {
required_position_paths.insert(r.seq);
}
if (!path_name.empty()) {
required_position_paths.insert(path_name);
}

PathPositionHandleGraph* xindex = nullptr;
unique_ptr<PathHandleGraph> path_handle_graph;
bdsg::PathPositionOverlayHelper overlay_helper;
bool input_gfa = false;
if (!xg_name.empty()) {
path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(xg_name);
input_gfa = dynamic_cast<GFAHandleGraph*>(path_handle_graph.get()) != nullptr;
xindex = overlay_helper.apply(path_handle_graph.get());
xindex = overlay_helper.apply(path_handle_graph.get(), required_position_paths);

// Remove node ids that do not exist in the graph.
std::vector<nid_t> final_ids;
Expand Down Expand Up @@ -617,16 +640,6 @@ int main_find(int argc, char** argv) {
cout << xindex->get_path_name(path_handle) << endl;
});
}
// handle targets from BED
if (!bed_targets_file.empty()) {
parse_bed_regions(bed_targets_file, targets);
}
// those given on the command line
for (auto& target : targets_str) {
Region region;
parse_region(target, region);
targets.push_back(region);
}
if (!targets.empty()) {
auto output_graph = get_output_graph();
auto& graph = *output_graph;
Expand Down Expand Up @@ -807,6 +820,7 @@ int main_find(int argc, char** argv) {
vg::io::save_handle_graph(&graph, cout);
}
if (extract_paths) {
logger.warn() << "vg paths -Q/--paths-named is deprecated due to the partial Protobuf graph output format. Consider vg paths --extract-fasta instead." << std::endl;
for (auto& pattern : extract_path_patterns) {

// We want to write uncompressed protobuf Graph objects containing our paths.
Expand Down
19 changes: 15 additions & 4 deletions src/subcommand/gampcompare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,7 @@ int main_gampcompare(int argc, char** argv) {
}
path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(graph_stream);
}

bdsg::PathPositionOverlayHelper overlay_helper;
PathPositionHandleGraph* path_position_handle_graph = overlay_helper.apply(path_handle_graph.get());


// We will collect all the truth positions
string_hash_map<string, map<string ,vector<pair<size_t, bool> > > > true_positions;
function<void(Alignment&)> record_truth = [&true_positions](Alignment& aln) {
Expand All @@ -152,6 +149,20 @@ int main_gampcompare(int argc, char** argv) {
}
vg::io::for_each_parallel(truth_file_in, record_truth);
}

// Once we know the truth positions we know the paths we need to index
std::unordered_set<std::string> truth_paths;
for (auto& aln_positions : true_positions) {
for (auto& path_and_positions : aln_positions.second) {
truth_paths.insert(path_and_positions.first);
}
}


bdsg::PathPositionOverlayHelper overlay_helper;
// Index the reference and generic paths, plus any paths that positions are on, for position queries.
// TODO: Can we actually end up using any non-reference/non-generic paths in the comparison?
PathPositionHandleGraph* path_position_handle_graph = overlay_helper.apply(path_handle_graph.get(), truth_paths);

// A buffer we use for the TSV output
vector<vector<tuple<int64_t, bool, int64_t, int64_t, string>>> buffers(get_thread_count());
Expand Down
4 changes: 4 additions & 0 deletions src/subcommand/inject_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ int main_inject(int argc, char** argv) {
logger.error() << "Graph (-x) is required" << endl;
}
unique_ptr<PathHandleGraph> path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(xg_name);

// Path position index the graph.
// TODO: We can't index the paths the input BAM is actually on; it needs to
// be on reference paths in the graph.
bdsg::PathPositionOverlayHelper overlay_helper;
PathPositionHandleGraph* xgidx = overlay_helper.apply(path_handle_graph.get());

Expand Down
21 changes: 14 additions & 7 deletions src/subcommand/mcmc_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,13 @@ int main_mcmc(int argc, char** argv) {
if(vg_graph == nullptr || vg_graph == 0) {
logger.error() << "Graph is NULL" << endl;
}
PathPositionHandleGraph* graph = nullptr;
graph = overlay_helper.apply(vg_graph);


// Check our paths
for (const string& ref_path : ref_paths) {
if (!graph->has_path(ref_path)) {
if (!vg_graph->has_path(ref_path)) {
logger.error() << "Reference path \"" << ref_path << "\" not found in graph" << endl;
}
}

// Check our offsets
if (ref_path_offsets.size() != 0 && ref_path_offsets.size() != ref_paths.size()) {
logger.error() << "when using -o, the same number paths must be given with -p" << endl;
Expand All @@ -185,7 +181,18 @@ int main_mcmc(int argc, char** argv) {
logger.error() << "when using -l, the same number paths must be given with -p" << endl;
}

// No paths specified: use them all
PathPositionHandleGraph* graph = nullptr;
{
// Path-position index all the extra paths we need to work on, plus the
// reference and generic paths.
std::unordered_set<std::string> target_paths;
for (auto& name : ref_paths) {
target_paths.insert(name);
}
graph = overlay_helper.apply(vg_graph, target_paths);
}

// No paths specified: use all the reference and generic paths
if (ref_paths.empty()) {
graph->for_each_path_of_sense({PathSense::REFERENCE, PathSense::GENERIC}, [&](path_handle_t path_handle) {
const string& name = graph->get_path_name(path_handle);
Expand All @@ -195,7 +202,7 @@ int main_mcmc(int argc, char** argv) {
});

}

// Check if VCF output file is specified
ofstream vcf_file_out;
if(!vcf_out.empty()){
Expand Down
32 changes: 21 additions & 11 deletions src/subcommand/stats_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1179,30 +1179,38 @@ int main_stats(int argc, char** argv) {
std::unordered_map<nid_t, size_t> extra_node_weight;
constexpr size_t EXTRA_WEIGHT = 10000000000;

// Collect paths in our assigned snarl coordinate sample
std::vector<std::string> ref_path_names;
if (snarl_stats && !snarl_sample.empty()) {
graph->for_each_path_of_sample(snarl_sample, [&](path_handle_t path_handle) {
ref_path_names.push_back(graph->get_path_name(path_handle));
});
if (ref_path_names.empty()) {
logger.error() << "unable to find any paths of --snarl-sample " << snarl_sample << endl;
}
}

// Turn them into an unordered_set so we can make sure they're all positionally indexed
std::unordered_set<std::string> target_paths(ref_path_names.begin(), ref_path_names.end());

// additional indexes only needed when finding --snarl-sample coordinates
unique_ptr<PathTraversalFinder> path_trav_finder;
bdsg::PathPositionOverlayHelper overlay_helper;
PathPositionHandleGraph* pp_graph = dynamic_cast<PathPositionHandleGraph*>(graph);;
unordered_map<const Snarl*, PathInterval> snarl_to_ref;
PathPositionHandleGraph* pp_graph = dynamic_cast<PathPositionHandleGraph*>(graph);
if (pp_graph == nullptr) {
pp_graph = overlay_helper.apply(graph, target_paths);
}

if (snarl_stats) {
// TSV header
if (!snarl_sample.empty()) {
// optionally prefix with BED-like refpath coordinates if --snarl-sample given
cout <<"Contig\tStartPos\tEndPos\t";

if (pp_graph == nullptr) {
pp_graph = overlay_helper.apply(graph);
}
vector<string> ref_path_names;
pp_graph->for_each_path_of_sample(snarl_sample, [&](path_handle_t path_handle) {
ref_path_names.push_back(graph->get_path_name(path_handle));
// Try and pin down the tips of the sample
extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))] += EXTRA_WEIGHT;
extra_node_weight[graph->get_id(graph->get_handle_of_step(graph->path_back(path_handle)))] += EXTRA_WEIGHT;
});
if (ref_path_names.empty()) {
logger.error() << "unable to find any paths of --snarl-sample" << endl;
}
path_trav_finder = unique_ptr<PathTraversalFinder>(new PathTraversalFinder(*pp_graph, ref_path_names));
}
cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes"
Expand All @@ -1212,6 +1220,8 @@ int main_stats(int argc, char** argv) {

// First compute the snarls
manager = IntegratedSnarlFinder(*graph, extra_node_weight).find_snarls_parallel();

std::unordered_map<const Snarl*, PathInterval> snarl_to_ref;

manager.for_each_snarl_preorder([&](const Snarl* snarl) {
// Loop over all the snarls and print stats.
Expand Down
8 changes: 7 additions & 1 deletion test/t/05_vg_find.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 30
plan tests 31

vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg
is $? 0 "construction"
Expand Down Expand Up @@ -142,3 +142,9 @@ is $? 0 "find nodes that map to the provided node ids"

rm -f x.vg x.gbwt x.mapping x.unfolded.vg
rm -f expected.gfa found.gfa

# We wish we could test specifically for which paths are indexed, but we can't.
# So we test to make sure at least paths that shouldn't normally be indexed are
# indexed when asked about.
is "$(vg find -n 5 -P sample1#1#chr1#0 -x graphs/gfa_with_reference.gfa | cut -f2)" "4" "vg find can find positions along non-reference paths asked about specifically"

Loading
Loading