diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 143a84b419..b73794f0a3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -158,7 +158,7 @@ oldest-supported-compiler-job: GIT_SUBMODULE_STRATEGY: none # DO NOT change this version number without updating the README to reflect # the requirement bump. - COMPILER_VERSION: 9 + COMPILER_VERSION: 10 # We define one job to do the Docker container build diff --git a/BOTS.md b/BOTS.md new file mode 100644 index 0000000000..2c35275ac0 --- /dev/null +++ b/BOTS.md @@ -0,0 +1,75 @@ +# VG Project Notes + +## Building +- New `.cpp` files auto-discovered +- Build with `make -j8` or `make obj/whatever.o` to build just one .o. +- You may be getting errors from `clangd`. If these errors seem spurious, stop and demand a `clangd` that works properly. + +## Testing + +### Running Bash-TAP Tests +Use `prove -v` (not `bash`) to execute Bash-TAP tests. This provides proper test harness output and better error reporting. + +**Important**: Run `prove` from the `test/` directory: +```bash +cd test +prove -v t/26_deconstruct.t +``` + +### Running Unit Tests +To run all unit tests: +```bash +./bin/vg test +``` +- `./bin/vg test "[tag]"` runs tests matching a tag + +#### Writing Unit Tests +- Framework: Catch v2 (header-only) +- Include: `#include "catch.hpp"` (in `src/unittest/catch.hpp`) +- Macros: `TEST_CASE("name", "[tags]")`, `SECTION("name")`, `REQUIRE(cond)` +- Namespace: `vg::unittest` +- Directory: `src/unittest/` + +### Running All Tests +```bash +make test +``` + +## Writing Code + +### HandleGraph API +The interfaces in libhandlegraph model a bidirected sequence graph (where nodes have DNA sequences and edges can connect to either the start or end of each involved node). 
+ +#### Core types +- `handle_t` - opaque 64-bit value +- `nid_t` - node ID type +- `edge_t` = `pair<handle_t, handle_t>` + +#### Key HandleGraph methods +- `get_handle(nid_t, bool is_reverse=false)` → `handle_t` +- `get_id(handle_t)` → `nid_t` +- `get_is_reverse(handle_t)` → `bool` +- `flip(handle_t)` → `handle_t` (toggle orientation) +- `get_sequence(handle_t)` → `string` (in handle's orientation) +- `follow_edges(handle_t, bool go_left, iteratee)` - iterate neighbors +- `for_each_handle(iteratee, bool parallel=false)` - iterate all nodes +- `for_each_edge(iteratee, bool parallel=false)` - iterate all edges +- `has_edge(handle_t left, handle_t right)` → `bool` + +#### MutableHandleGraph additions +- `create_handle(string seq)` / `create_handle(string seq, nid_t id)` → `handle_t` +- `create_edge(handle_t left, handle_t right)` +- `destroy_handle(handle_t)` / `destroy_edge(handle_t, handle_t)` + +#### HandleGraph algorithms +- Things like `topological_sort.hpp` and `copy_graph.hpp` are in `deps/libhandlegraph/src/include/handlegraph/algorithms`. 
+ +#### bdsg::HashGraph +- Header: `deps/libbdsg/bdsg/include/bdsg/hash_graph.hpp` +- Implements MutablePathMutableHandleGraph +- Go-to handlegraph implementation to use +- In libbdsg + +### Utilities +- `reverse_complement(string)` → `string` in src/utility.hpp + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000..1a1007d91a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +BOTS.md \ No newline at end of file diff --git a/Makefile b/Makefile index 6506208d58..6159297339 100644 --- a/Makefile +++ b/Makefile @@ -104,7 +104,8 @@ ifeq ($(shell uname -s),Darwin) LD_UTIL_RPATH_FLAGS="" # Homebrew installs a Protobuf that uses an Abseil that is built with C++17, so we need to build with at least C++17 - CXX_STANDARD?=17 + # C++20 for spaceship operator and ranges + CXX_STANDARD?=20 # We may need libraries from Macports ifeq ($(shell if [ -d /opt/local/lib ];then echo 1;else echo 0;fi), 1) @@ -229,8 +230,9 @@ else $(info Compiler $(CXX) is assumed to be GCC) # gbwtgraph uses inline variables and our oldest supported compiler has - # C++17, so we should use C++17 - CXX_STANDARD?=17 + # C++17, so we should use at least C++17. 
+ # C++20 for spaceship operator and ranges + CXX_STANDARD?=20 # Set an rpath for vg and dependency utils to find installed libraries LD_UTIL_RPATH_FLAGS="-Wl,-rpath,$(CWD)/$(LIB_DIR)" @@ -820,7 +822,7 @@ $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_D +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) src/sparsehash/internal/sparseconfig.h $(FILTER) && $(MAKE) install-data $(FILTER) $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEPP_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ diff --git a/README.md b/README.md index a3e1d5e4cd..2c616f69fe 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ On other distros, or if you do not have root access, you will need to perform th liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc -At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. +At present, you will need GCC version 10 or greater, with support for C++20, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. Other libraries may be required. Please report any build difficulties. 
diff --git a/deps/libbdsg b/deps/libbdsg index e74fb663a5..a7602fd4a4 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit e74fb663a5f85bc1f76d159b2b3a3691ed85862f +Subproject commit a7602fd4a462ca617502640022c6f1dd9109b13f diff --git a/deps/libvgio b/deps/libvgio index fff151be9d..3026f7d28e 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit fff151be9d8255672d91f32a5b41285584905743 +Subproject commit 3026f7d28ef1576982968aff4eed7adf5a10f262 diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index c84e399ce1..ebc46da684 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -13,6 +13,7 @@ //#define debug_chaining //#define debug_transition +//#define debug_missing_transition //#define debug_dp namespace vg { @@ -262,6 +263,18 @@ transition_iterator zip_tree_transition_iterator(const std::vector filtered_transitions = calculate_transition_read_distances(all_transitions, to_chain, max_read_lookback_bases); @@ -385,6 +398,103 @@ std::vector generate_zip_tree_transitions( return all_transitions; } +bool find_missing_zip_tree_transitions( + const std::vector& seeds, + const ZipCodeTree& zip_code_tree, + size_t max_graph_lookback_bases, + const std::unordered_map& seed_to_starting, + const std::unordered_map& seed_to_ending, + const SnarlDistanceIndex& distance_index, + const std::vector& all_transitions) { + + // {source anchor : {dest anchor : dist}} + std::unordered_map> found; + for (const auto& transition : all_transitions) { + size_t dist_to_save = transition.graph_distance; + if (!found.count(transition.from_anchor)) { + found[transition.from_anchor] = std::unordered_map(); + } + if (found[transition.from_anchor].count(transition.to_anchor)) { + // If a transition appears multiple times, remember the min + dist_to_save = std::min(transition.graph_distance, + found[transition.from_anchor][transition.to_anchor]); + } + 
found[transition.from_anchor][transition.to_anchor] = transition.graph_distance; + } + + bool has_missing = false; + + // Helper function to check for a distance between two seeds + auto check_distance = [&] (const ZipCodeTree::oriented_seed_t& from_seed, bool rev_from, + const ZipCodeTree::oriented_seed_t& to_seed, bool rev_to) { + // XOR to get appropriate orientations + rev_from ^= from_seed.is_reversed; + rev_to ^= to_seed.is_reversed; + if (rev_from != rev_to) { + // Cannot be compared; incompatible orientations + return; + } + + // Look up appropriate anchors + auto from_anchor_itr = rev_from ? seed_to_starting.find(from_seed.seed) + : seed_to_ending.find(from_seed.seed); + if ((rev_from && from_anchor_itr == seed_to_starting.end()) + || (!rev_from && from_anchor_itr == seed_to_ending.end())) { + // No anchor exists + return; + } + auto to_anchor_itr = rev_to ? seed_to_ending.find(to_seed.seed) + : seed_to_starting.find(to_seed.seed); + if ((rev_to && to_anchor_itr == seed_to_ending.end()) + || (!rev_to && to_anchor_itr == seed_to_starting.end())) { + // No anchor exists + return; + } + + // Construct seed positions + pos_t from_pos = seeds.at(from_seed.seed).pos; + size_t from_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(from_pos))); + from_pos = rev_from ? reverse(from_pos, from_length) + : from_pos; + pos_t to_pos = seeds.at(to_seed.seed).pos; + size_t to_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(to_pos))); + to_pos = rev_to ? 
reverse(to_pos, to_length) + : to_pos; + + // Look up true minimum distance + size_t true_distance = minimum_nontrivial_distance(distance_index, from_pos, to_pos); + if (true_distance <= max_graph_lookback_bases) { + // We should've found this transition + auto from_anchor = from_anchor_itr->second; + auto to_anchor = to_anchor_itr->second; + if (!found.count(from_anchor) + || !found[from_anchor].count(to_anchor) + || found[from_anchor][to_anchor] != true_distance) { + has_missing = true; + cerr << "Missing transition " << from_pos << "->" + << to_pos << " dist " << true_distance << endl; + } + } + }; + + vector tree_seeds = zip_code_tree.get_all_seeds(); + for (size_t i = 0; i < tree_seeds.size(); i++) { + // Check self-loops + check_distance(tree_seeds[i], false, tree_seeds[i], false); + check_distance(tree_seeds[i], false, tree_seeds[i], true); + check_distance(tree_seeds[i], true, tree_seeds[i], false); + for (size_t j = i + 1; j < tree_seeds.size(); j++) { + // Check all orientation pairs + check_distance(tree_seeds[i], false, tree_seeds[j], false); + check_distance(tree_seeds[i], false, tree_seeds[j], true); + check_distance(tree_seeds[i], true, tree_seeds[j], false); + check_distance(tree_seeds[i], true, tree_seeds[j], true); + } + } + + return has_missing; +} + std::vector calculate_transition_read_distances( const std::vector& all_transitions, const VectorView& to_chain, @@ -547,6 +657,8 @@ TracedScore chain_items_dp(vector& chain_scores, cerr << "Chaining group of " << to_chain.size() << " items" << endl; } + crash_unless(recomb_penalty >= 0); + // Compute a base seed average length. // TODO: Weight anchors differently? // TODO: Will this always be the same for all anchors in practice? 
@@ -557,6 +669,20 @@ TracedScore chain_items_dp(vector& chain_scores, base_seed_length /= to_chain.size(); chain_scores.resize(to_chain.size()); + + // We want to prefer to come from seeds where the transition preserves + // access to matching haplotypes, because we don't want to back ourselves + // into a corner where we need a recombination when we don't really have + // to. So we cheat on the dynamic programming by adding an "evaluation + // bonus" to the scores of the different DP options when comparing them. We + // keep this bonus out of the actual recorded scores because we don't want + // it raising the scores we actually get the more transitions we take. + // + // We store the bonus used to select the current winning predecessor for + // each seed in this vector, which runs alongside the DP table. + // + // Starting from nowhere means full path conservation, so bonus = recomb_penalty. + std::vector eval_bonuses(to_chain.size(), recomb_penalty); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; @@ -586,8 +712,20 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], - {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}); + // This also has full path conservation (bonus = recomb_penalty). 
+ { + TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; + int nowhere_bonus = recomb_penalty; + int eval_nowhere = from_nowhere.score + nowhere_bonus; + int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; + if (eval_nowhere > eval_current) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } else if (eval_nowhere == eval_current && from_nowhere > chain_scores[transition.to_anchor]) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } + } // For each source we could come from auto& source = to_chain[transition.from_anchor]; @@ -664,8 +802,34 @@ TracedScore chain_items_dp(vector& chain_scores, TracedScore from_source_score = source_score.add_points(jump_points + item_points) .set_shared_paths(here.anchor_paths()); - // Remember that we could make this jump - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], from_source_score); + // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. + // Bonus = fraction of conserved paths * recomb_penalty. + // Bonus is 0 when recombination occurs (no shared paths). 
+ int eval_bonus_from = 0; + if (recomb_penalty > 0) { + int pre_count = __builtin_popcountll(source_score.paths); + if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { + // No recombination: bonus = fraction of paths conserved * penalty + int post_count = __builtin_popcountll(from_source_score.paths); + eval_bonus_from = (recomb_penalty * post_count) / pre_count; + } + // Recombination case (no shared paths): bonus stays 0 + } + + // Grab the DP table slot we are updating + auto& current_best = chain_scores[transition.to_anchor]; + // Compute the evaluation value for the new candidate + int eval_from = from_source_score.score + eval_bonus_from; + // Reconstruct the evaluation value for the current winner + int eval_best = current_best.score + eval_bonuses[transition.to_anchor]; + + if (eval_from > eval_best || (eval_from == eval_best && from_source_score > current_best)) { + // Using the evaluation values, and then if tied the real DP + // scores, this new candidate beats the previous winner, so + // replace it. + current_best = from_source_score; + eval_bonuses[transition.to_anchor] = eval_bonus_from; + } if (show_work) { #ifdef debug_dp diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index 9ac7b792e0..f4531d4201 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -477,6 +477,23 @@ std::vector generate_zip_tree_transitions( const std::unordered_map& seed_to_starting, const std::unordered_map& seed_to_ending); +/** + * Check if all possible transitions were actually found. + * + * Iterates over all pairs of seeds and uses the distance index + * to determine if there SHOULD have been a transition. + * + * Returns if any transitions were missing. 
+ */ +bool find_missing_zip_tree_transitions( + const std::vector& seeds, + const ZipCodeTree& zip_code_tree, + size_t max_graph_lookback_bases, + const std::unordered_map& seed_to_starting, + const std::unordered_map& seed_to_ending, + const SnarlDistanceIndex& distance_index, + const std::vector& all_transitions); + /** * Calculate read distances for each of the zip tree's transitions. * Also filters out transitions that can't be used, diff --git a/src/alignment.cpp b/src/alignment.cpp index 567bfb016a..faf669ea69 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -3507,6 +3507,65 @@ pair aligned_interval(const Alignment& aln) { return pair(softclip_start(aln), aln.sequence().size() - softclip_end(aln)); } +void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector& gap_lengths) { + matches = 0; + mismatches = 0; + gap_lengths.clear(); + + enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE }; + EditType prev_type = EditType::NONE; + size_t current_gap_length = 0; + + auto finish_gap = [&]() { + if (current_gap_length > 0) { + gap_lengths.push_back(current_gap_length); + current_gap_length = 0; + } + }; + + for (size_t i = 0; i < aln.path().mapping_size(); ++i) { + auto& mapping = aln.path().mapping(i); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + auto& edit = mapping.edit(j); + if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + finish_gap(); + if (edit.sequence().empty()) { + matches += edit.from_length(); + prev_type = EditType::MATCH; + } else { + mismatches += edit.from_length(); + prev_type = EditType::MISMATCH; + } + } else if (edit.from_length() == 0 && edit.to_length() > 0) { + if (prev_type != EditType::INS) finish_gap(); + current_gap_length += edit.to_length(); + prev_type = EditType::INS; + } else if (edit.from_length() > 0 && edit.to_length() == 0) { + if (prev_type != EditType::DEL) finish_gap(); + current_gap_length += edit.from_length(); + prev_type = 
/// Score an alignment from its operation counts, charging each gap only the
/// base-2 logarithm of its length so long indels are penalized gently.
///
/// Each mismatch and each gap (regardless of length) is one "event". The
/// event rate d is floored at 0.02, and all events together cost
/// events / (2d). When there are no alignment columns at all the rate is
/// 0/0 = NaN, and std::max(0.02, NaN) yields the 0.02 floor, so the score is 0.
int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector<size_t>& gap_lengths) {
    const double events = static_cast<double>(mismatches + gap_lengths.size());
    const double columns = static_cast<double>(matches + mismatches + gap_lengths.size());

    // Fraction of columns that are events, floored at 2%.
    const double divergence = std::max(0.02, events / columns);
    const double event_cost = events / (2.0 * divergence);

    // Each gap additionally costs log2(1 + length).
    double gap_cost = 0;
    for (const size_t length : gap_lengths) {
        gap_cost += std::log2(1.0 + length);
    }

    return static_cast<int>(std::round(matches - event_cost - gap_cost));
}
+/// +/// This scoring method penalizes long continuous indels less, using the formula: +/// score = matches - (mismatches + gap_opens)/(2d) - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i)) +/// with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)} +int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector& gap_lengths); + // create an annotation string required to properly set the SAM fields/flags of a supplementary alignment // the arguments all refer to properties of the primary *mate* alignment // the path name saved in the info is the base path name, with any subrange info reflected in the position diff --git a/src/cactus.cpp b/src/cactus.cpp index 6179663968..49eab63294 100644 --- a/src/cactus.cpp +++ b/src/cactus.cpp @@ -999,8 +999,8 @@ VG cactus_to_vg(stCactusGraph* cactus_graph) { return vg_graph; } -VG cactusify(VG& graph) { - if (graph.size() == 0) { +VG cactusify(const PathHandleGraph& graph) { + if (graph.get_node_count() == 0) { return VG(); } auto parts = handle_graph_to_cactus(graph, unordered_set()); diff --git a/src/cactus.hpp b/src/cactus.hpp index 36d53f2fab..21cfd8ebc7 100644 --- a/src/cactus.hpp +++ b/src/cactus.hpp @@ -46,7 +46,7 @@ VG cactus_to_vg(stCactusGraph* cactus_graph); // Convert vg into vg formatted cactus representation // Input graph must be sorted! 
-VG cactusify(VG& graph); +VG cactusify(const PathHandleGraph& graph); } diff --git a/src/cluster.hpp b/src/cluster.hpp index df997cc51c..cd6deab517 100644 --- a/src/cluster.hpp +++ b/src/cluster.hpp @@ -212,8 +212,8 @@ class MEMClusterer { protected: - class HitNode; class HitEdge; + class HitNode; class HitGraph; class DPScoreComparator; @@ -232,7 +232,47 @@ class MEMClusterer { /// is closest to the optimal separation void deduplicate_cluster_pairs(vector, int64_t>>& cluster_pairs, int64_t optimal_separation); }; + +class MEMClusterer::HitEdge { +public: + HitEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} + HitEdge() = default; + ~HitEdge() = default; + + /// Index of the node that the edge points to + size_t to_idx; + /// Weight for dynamic programming + int32_t weight; + + /// Estimated distance + int64_t distance; +}; + +class MEMClusterer::HitNode { +public: + HitNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : mem(&mem), start_pos(start_pos), score(score) { } + HitNode() = default; + ~HitNode() = default; + + const MaximalExactMatch* mem; + + /// Position of GCSA hit in the graph + pos_t start_pos; + + /// Score of the exact match this node represents + int32_t score; + + /// Score used in dynamic programming + int32_t dp_score; + + /// Edges from this node that are colinear with the read + vector edges_from; + + /// Edges to this node that are colinear with the read + vector edges_to; +}; + class MEMClusterer::HitGraph { public: @@ -286,46 +326,6 @@ class MEMClusterer::HitGraph { UnionFind components; }; -class MEMClusterer::HitNode { -public: - HitNode(const MaximalExactMatch& mem, pos_t start_pos, int32_t score) : mem(&mem), start_pos(start_pos), score(score) { } - HitNode() = default; - ~HitNode() = default; - - const MaximalExactMatch* mem; - - /// Position of GCSA hit in the graph - pos_t start_pos; - - /// Score of the exact match this node represents - int32_t score; 
- - /// Score used in dynamic programming - int32_t dp_score; - - /// Edges from this node that are colinear with the read - vector edges_from; - - /// Edges to this node that are colinear with the read - vector edges_to; -}; - -class MEMClusterer::HitEdge { -public: - HitEdge(size_t to_idx, int32_t weight, int64_t distance) : to_idx(to_idx), weight(weight), distance(distance) {} - HitEdge() = default; - ~HitEdge() = default; - - /// Index of the node that the edge points to - size_t to_idx; - - /// Weight for dynamic programming - int32_t weight; - - /// Estimated distance - int64_t distance; -}; - struct MEMClusterer::DPScoreComparator { private: const vector& nodes; diff --git a/src/gbwtgraph_helper.cpp b/src/gbwtgraph_helper.cpp index 96ca15be2d..6c626bc7db 100644 --- a/src/gbwtgraph_helper.cpp +++ b/src/gbwtgraph_helper.cpp @@ -449,7 +449,7 @@ std::vector find_frequent_kmers(const gbwtgraph::GBZ& gbz, const Minim void cache_payloads( const gbwtgraph::GBZ& gbz, const SnarlDistanceIndex& distance_index, - hash_map& node_id_to_payload, + vg::hash_map& node_id_to_payload, ZipCodeCollection* oversized_zipcodes, bool progress ) { @@ -460,22 +460,37 @@ void cache_payloads( const handlegraph::HandleGraph* graph_ptr = (const handlegraph::HandleGraph*) &gbz.graph; + double total_zipcode_time = 0.0, total_decoder_time = 0.0; + std::atomic node_count = 0; gbz.graph.for_each_handle([&](const handle_t& handle) { nid_t node_id = gbz.graph.get_id(handle); - ZipCode zipcode; pos_t pos = make_pos_t(node_id, false, 0); - zipcode.fill_in_zipcode_from_pos(distance_index, pos, true, graph_ptr); + ZipCode zipcode; + zipcode.fill_in_zipcode_from_pos(distance_index, pos, false, graph_ptr); + zipcode.fill_in_full_decoder(); + if (++node_count % 10000 == 0 && progress) { + double telapsed = gbwt::readTimer() - start; + #pragma omp critical (cerr) + std::cerr << " Cached " << node_count << " nodes in " << telapsed << "s" << std::endl; + } + payload_t payload = 
zipcode.get_payload_from_zip(); if (payload == MIPayload::NO_CODE && oversized_zipcodes != nullptr) { // The zipcode is too large for the payload field. // Add it to the oversized zipcode list. - zipcode.fill_in_full_decoder(); - size_t offset = oversized_zipcodes->size(); - oversized_zipcodes->emplace_back(zipcode); + size_t offset; + #pragma omp critical (cache_payloads_zipcodes) + { + offset = oversized_zipcodes->size(); + oversized_zipcodes->emplace_back(zipcode); + } payload = { 0, offset }; } - node_id_to_payload.emplace(node_id, payload); - }); + #pragma omp critical (cache_payloads_map) + { + node_id_to_payload.emplace(node_id, payload); + } + }, true); if (progress) { double seconds = gbwt::readTimer() - start; @@ -537,8 +552,18 @@ gbwtgraph::DefaultMinimizerIndex build_minimizer_index( } else { // Cache payloads before building the index. // A zipcode only depends on the node id. - hash_map node_id_to_payload; + vg::hash_map node_id_to_payload; node_id_to_payload.reserve(gbz.graph.max_node_id() - gbz.graph.min_node_id()); + // Re-preload the distance index right before use. find_frequent_kmers + // runs for a long time and may evict the mmap'd index pages from the OS + // page cache. We also preload eagerly right after loading the index (in + // minimizer_main.cpp) so the kernel treats those pages as recently-used; + // together the two preloads prevent cache_payloads from page-faulting on + // every node under the memory pressure of 32 parallel threads. 
+ if (params.progress) { + std::cerr << "Preloading distance index"; + } + distance_index->preload(true); cache_payloads(gbz, *distance_index, node_id_to_payload, oversized_zipcodes, params.progress); auto get_payload = [&](const pos_t& pos) -> const code_type* { diff --git a/src/graph.cpp b/src/graph.cpp index beca52b5e1..3f23ffef18 100644 --- a/src/graph.cpp +++ b/src/graph.cpp @@ -2,93 +2,6 @@ namespace vg { -void sort_by_id_dedup_and_clean(Graph& graph) { - remove_duplicates(graph); // graph is sorted here - remove_orphan_edges(graph); -} - -void remove_duplicates(Graph& graph) { - remove_duplicate_nodes(graph); - remove_duplicate_edges(graph); -} - -void remove_duplicate_edges(Graph& graph) { - sort_edges_by_id(graph); - graph.mutable_edge()->erase(std::unique(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [](const Edge& a, const Edge& b) { - return make_tuple(a.from(), a.to(), a.from_start(), a.to_end()) - == make_tuple(b.from(), b.to(), b.from_start(), b.to_end()); - }), graph.mutable_edge()->end()); - -} - -void remove_duplicate_nodes(Graph& graph) { - sort_nodes_by_id(graph); - graph.mutable_node()->erase(std::unique(graph.mutable_node()->begin(), - graph.mutable_node()->end(), - [](const Node& a, const Node& b) { - return a.id() == b.id(); - }), graph.mutable_node()->end()); -} - -void remove_orphan_edges(Graph& graph) { - set ids; - for (auto& node : graph.node()) { - ids.insert(node.id()); - } - graph.mutable_edge()->erase(std::remove_if(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [&ids](const Edge& e) { - return !ids.count(e.from()) || !ids.count(e.to()); - }), graph.mutable_edge()->end()); -} - -void sort_by_id(Graph& graph) { - sort_nodes_by_id(graph); - sort_edges_by_id(graph); -} - -void sort_nodes_by_id(Graph& graph) { - std::sort(graph.mutable_node()->begin(), - graph.mutable_node()->end(), - [](const Node& a, const Node& b) { - return a.id() < b.id(); - }); -} - -void sort_edges_by_id(Graph& graph) { - 
std::sort(graph.mutable_edge()->begin(), - graph.mutable_edge()->end(), - [](const Edge& a, const Edge& b) { - return make_tuple(a.from(), a.to(), a.from_start(), a.to_end()) - < make_tuple(b.from(), b.to(), b.from_start(), b.to_end()); - }); -} - -bool is_id_sortable(const Graph& graph) { - for (auto& edge : graph.edge()) { - if (edge.from() >= edge.to()) return false; - } - return true; -} - -bool has_inversion(const Graph& graph) { - for (auto& edge : graph.edge()) { - if (edge.from_start() || edge.to_end()) return true; - } - return false; -} - -void flip_doubly_reversed_edges(Graph& graph) { - for (auto& edge : *graph.mutable_edge()) { - if (edge.from_start() && edge.to_end()) { - edge.set_from_start(false); - edge.set_to_end(false); - } - } -} - void from_handle_graph(const HandleGraph& from, Graph& to) { from.for_each_handle([&](const handle_t& h) { Node* node = to.add_node(); diff --git a/src/graph.hpp b/src/graph.hpp index 964e46cceb..c85afe88ab 100644 --- a/src/graph.hpp +++ b/src/graph.hpp @@ -11,39 +11,6 @@ namespace vg { using namespace std; -/// remove duplicates and sort by id -void sort_by_id_dedup_and_clean(Graph& graph); - -/// remove duplicate nodes and edges -void remove_duplicates(Graph& graph); - -/// remove duplicate edges -void remove_duplicate_edges(Graph& graph); - -/// remove duplicate nodes -void remove_duplicate_nodes(Graph& graph); - -/// remove edges that link to a node that is not in the graph -void remove_orphan_edges(Graph& graph); - -/// order the nodes and edges in the graph by id -void sort_by_id(Graph& graph); - -/// order the nodes in the graph by id -void sort_nodes_by_id(Graph& graph); - -/// order the edges in the graph by id pairs -void sort_edges_by_id(Graph& graph); - -/// returns true if the graph is id-sortable (no reverse links) -bool is_id_sortable(const Graph& graph); - -/// returns true if we find an edge that may specify an inversion -bool has_inversion(const Graph& graph); - -/// clean up doubly-reversed edges 
-void flip_doubly_reversed_edges(Graph& graph); - // transfer data from a HandleGraph into an empty Graph void from_handle_graph(const HandleGraph& from, Graph& to); diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 9a8f2923d6..cb09dbcbfa 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -551,6 +551,10 @@ construct_minimizers_impl( *gbz, distance_index.get(), &oversized_zipcodes, params ); + // Close the distance index so it can't appear to be modified after the + // files that depend on it. + distance_index.reset(); + string output_name = plan->output_filepath(minimizer_output); save_minimizer(minimizers, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); output_name_minimizer.push_back(output_name); @@ -5222,6 +5226,34 @@ vector IndexRegistry::require(const IndexName& identifier) const { return index->get_filenames(); } +bool IndexRegistry::predates(const IndexName& earlier, const IndexName& later) const { + // Get all the files + std::vector earlier_files = require(earlier); + std::vector later_files = require(later); + + // Make sure they're nonempty + if (earlier_files.empty()) { + throw std::runtime_error(earlier + " index has no files"); + } + if (later_files.empty()) { + throw std::runtime_error(later + " index has no files"); + } + + // Get all their modification times + std::filesystem::file_time_type (*predicate)(const std::filesystem::path&) = std::filesystem::last_write_time; + std::vector earlier_times; + std::transform(earlier_files.begin(), earlier_files.end(), std::back_inserter(earlier_times), predicate); + std::vector later_times; + std::transform(later_files.begin(), later_files.end(), std::back_inserter(later_times), predicate); + + // Find where the times that shouldn't intersect are, and get them. 
+ std::filesystem::file_time_type earlier_time = *std::max_element(earlier_times.begin(), earlier_times.end()); + std::filesystem::file_time_type later_time = *std::max_element(later_times.begin(), later_times.end()); + + // Return if the earlier files are touched no later than the later files. + return earlier_time <= later_time; +} + void IndexRegistry::set_target_memory_usage(int64_t bytes) { target_memory_usage = bytes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 3b7f58dd4c..7f20a5fd37 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -331,6 +331,13 @@ class IndexRegistry { /// Return true if the given index is available and can be require()'d, and /// false otherwise. bool available(const IndexName& identifier) const; + + /// For two available indexes, returns true if the modification times + /// on the eariler index are no later than those on the later index. + /// + /// Useful for enforcing that downstream indexes haven't had their upstream + /// indexes overwritten. + bool predates(const IndexName& earlier, const IndexName& later) const; /// Get the possible filename(s) associated with the given index with the given prefix. /// TODO: Get this to account for sample-scoped indexes. diff --git a/src/io/register_loader_saver_distance_index.cpp b/src/io/register_loader_saver_distance_index.cpp index 54245956a9..926dc0f3f6 100644 --- a/src/io/register_loader_saver_distance_index.cpp +++ b/src/io/register_loader_saver_distance_index.cpp @@ -1,6 +1,6 @@ /** * \file register_loader_saver_distance_index.cpp - * Defines IO for an XG index from stream files. + * Defines IO for a SnarlDistanceIndex index from stream files. 
*/ #include diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 693f19cb8b..4b15e8b4ce 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -871,7 +871,8 @@ class MinimizerMapper : public AlignerClient { */ void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, @@ -1081,11 +1082,11 @@ class MinimizerMapper : public AlignerClient { * * For connecting alignment, restricts the alignment to use <= max_dp_cells * cells. If too many DP cells would be used, produces an Alignment with - * and empty path. + * an empty path. * - * Returns the number of nodes and bases in the graph aligned against. + * Returns whether a graph was aligned against or not. 
*/ - static std::pair align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits::max(), const std::function& choose_band_padding = algorithms::pad_band_random_walk()); + static bool align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits::max(), const std::function& choose_band_padding = algorithms::pad_band_random_walk()); public: /** @@ -1093,7 +1094,7 @@ class MinimizerMapper : public AlignerClient { * same answer (modulo reverse-complementation) no matter whether the * sequence and anchors are reverse-complemented or not. */ - static std::pair align_sequence_between_consistently(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits::max(), const std::function& choose_band_padding = algorithms::pad_band_random_walk()); + static bool align_sequence_between_consistently(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name = nullptr, size_t max_dp_cells = std::numeric_limits::max(), const std::function& choose_band_padding = algorithms::pad_band_random_walk()); protected: /** diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 4ba5d3b909..c8f47d5bb9 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ 
b/src/minimizer_mapper_from_chains.cpp @@ -32,6 +32,7 @@ #include #include #include +#include // Turn on debugging prints //#define debug @@ -774,6 +775,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector> chains; // For each chain, mark per-seed whether it came from a recombinant anchor std::vector> chain_rec_flags; + // For each chain, track how many recombination events were used + std::vector chain_rec_counts; // The zip code tree it came from std::vector chain_source_tree; // An estimated alignment score @@ -784,7 +787,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector multiplicity_by_chain; do_chaining_on_trees(aln, zip_code_forest, seeds, minimizers, seed_anchors, - chains, chain_rec_flags, chain_source_tree, chain_score_estimates, + chains, chain_rec_flags, chain_rec_counts, chain_source_tree, chain_score_estimates, minimizer_kept_chain_count, multiplicity_by_chain, alignments, minimizer_explored, multiplicity_by_alignment, rng, funnel); @@ -844,6 +847,49 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); } + for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { + // Rescore all the alignments using minimap2 logged-gap-length, read-identity-based scoring + + if (alignments[alignment_index].path().mapping_size() == 0) { + // Unmapped, so skip it. 
+ continue; + } + + size_t matches, mismatches; + std::vector gap_lengths; + count_alignment_operations(alignments[alignment_index], matches, mismatches, gap_lengths); + + if (matches + mismatches + gap_lengths.size() == 0) { + continue; + } + + // Compute the logged-gaps score + auto logged_gaps_score = score_alignment_with_logged_gaps(matches, mismatches, gap_lengths); + alignments[alignment_index].set_score(logged_gaps_score); + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_lengths.size() << " New score: " << logged_gaps_score << endl; + } + } + } + if (!chain_rec_counts.empty() && !alignments_to_source.empty()) { + for (size_t alignment_index = 0; alignment_index < alignments_to_source.size(); ++alignment_index) { + size_t chain_index = alignments_to_source[alignment_index]; + if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { + set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); + if (rec_penalty_chain != 0) { + // Penalize the score of alignment candidates according to the number of recombinations their chains required. + // This allows alignments that required fewer recombinations in their chains to win. + // TODO: We'd also eventually like to count recombinations that we don't know are needed until base-level DP. 
+ int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); + int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; + alignments[alignment_index].set_score(static_cast(penalized_score)); + } + } + } + } + if (track_provenance) { // Now say we are finding the winner(s) @@ -1096,7 +1142,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, @@ -1544,9 +1591,13 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& indel_limit, show_work ); +#ifdef debug_rec + if (true) { +#else if (show_work) { +#endif #pragma omp critical (cerr) - cerr << log_name() << "Found " << chain_results.chains.size() << " chains in zip code tree " << item_num + cerr << log_name() << "\t[" << aln.name() << "] Found " << chain_results.chains.size() << " chains in zip code tree " << item_num << " running " << anchors_to_chain[anchor_indexes.front()] << " to " << anchors_to_chain[anchor_indexes.back()] << std::endl; } @@ -1556,7 +1607,12 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& auto& entry = chain_results.chains[result]; auto& scored_chain = entry.scored_chain; auto& chain_rec_positions = entry.rec_positions; - if (show_work) { +#ifdef debug_rec + if (true) +#else + if (show_work) +#endif + { #ifdef debug if(true) #else @@ -1566,24 +1622,42 @@ void 
MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& if (!scored_chain.second.empty()) { #pragma omp critical (cerr) { - cerr << log_name() << "\tChain with score " << scored_chain.first + cerr << log_name() << "\t[" << aln.name() << "] Chain " << result << " with score " << scored_chain.first << " (rec num =" << chain_rec_positions.size() << ") and length " << scored_chain.second.size() << " running " << anchor_view[scored_chain.second.front()] - << " to " << anchor_view[scored_chain.second.back()] << std::endl; + << " to " << anchor_view[scored_chain.second.back()]; if (!chain_rec_positions.empty()) { - { - cerr << log_name() << "\t\tRecombination introduced at anchors: "; - for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { - if (pi) cerr << ", "; - cerr << chain_rec_positions[pi]; - } - cerr << std::endl; + cerr << " recombination introduced at anchors: "; + for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { + if (pi) cerr << ", "; + cerr << chain_rec_positions[pi]; } } -#ifdef debug - - for (auto& anchor_number : scored_chain.second) { - std::cerr << log_name() << "\t\t" << anchor_view[anchor_number] << std::endl; + cerr << std::endl; +#ifdef debug_rec + algorithms::path_flags_t current_paths = 0; + bool first = true; + for (auto& selected_number : scored_chain.second) { + auto& anchor = anchor_view[selected_number]; + auto new_paths = anchor.anchor_paths(); + if (first) { + current_paths = new_paths.second; + first = false; + } else { + if (new_paths.first == new_paths.second) { + if ((current_paths & new_paths.first) == 0) { + current_paths = new_paths.first; + } else { + current_paths &= new_paths.first; + } + } else { + current_paths = new_paths.second; + } + } + + std::cerr << log_name() << "\t\t" << anchor + << " anchor_paths: " << std::bitset<64>(new_paths.first).count() << " " << std::bitset<64>(new_paths.first) + << " chain_paths: " << std::bitset<64>(current_paths).count() << " " << 
std::bitset<64>(current_paths) << std::endl; } #endif @@ -1591,7 +1665,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } } else if (result == MANY_LIMIT) { #pragma omp critical (cerr) - std::cerr << log_name() << "\t<" << (chain_results.chains.size() - result) << " more chains>" << std::endl; + std::cerr << log_name() << "\t[" << aln.name() << "] <" << (chain_results.chains.size() - result) << " more chains>" << std::endl; } } @@ -1627,6 +1701,8 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } // Remember the score chain_score_estimates.push_back(scored_chain.first); + // Remember how many recombinations were in this chain + chain_rec_counts.push_back(chain_rec_positions.size()); // Remember how we got it chain_source_tree.push_back(item_num); @@ -1773,7 +1849,7 @@ void MinimizerMapper::get_best_chain_stats(Alignment& aln, const ZipCodeForest& best_chain_longest_jump = std::max(best_chain_longest_jump, jump); best_chain_total_jump += jump; } - best_chain_average_jump = chains.at(best_chain).size() > 1 ? best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; + best_chain_average_jump = chains.at(best_chain).size() > 1 ? 
(double)best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; } // Also count anchors in the chain @@ -2557,10 +2633,10 @@ Alignment MinimizerMapper::find_chain_alignment( if (stats) { start_time = std::chrono::high_resolution_clock::now(); } - auto nodes_and_bases = align_sequence_between_consistently(empty_pos_t(), right_anchor, graph_horizon, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); + bool did_aln = align_sequence_between_consistently(empty_pos_t(), right_anchor, graph_horizon, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); if (stats) { stop_time = std::chrono::high_resolution_clock::now(); - if (nodes_and_bases.first > 0) { + if (did_aln) { // Actually did the alignment stats->bases.dozeu_tail += left_tail_length; stats->time.dozeu_tail += std::chrono::duration_cast>(stop_time - start_time).count(); @@ -2883,10 +2959,10 @@ Alignment MinimizerMapper::find_chain_alignment( if (stats) { start_time = std::chrono::high_resolution_clock::now(); } - auto nodes_and_bases = MinimizerMapper::align_sequence_between_consistently((*here).graph_end(), (*next).graph_start(), path_length+max_gap_length, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), link_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); + bool did_aln = MinimizerMapper::align_sequence_between_consistently((*here).graph_end(), (*next).graph_start(), path_length+max_gap_length, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), link_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); if (stats) { stop_time = std::chrono::high_resolution_clock::now(); - if (nodes_and_bases.first > 0) { + if (did_aln) { // Actually did the alignment stats->bases.bga_middle += link_length; stats->time.bga_middle += std::chrono::duration_cast>(stop_time - start_time).count(); @@ 
-3070,10 +3146,10 @@ Alignment MinimizerMapper::find_chain_alignment( if (stats) { start_time = std::chrono::high_resolution_clock::now(); } - auto nodes_and_bases = align_sequence_between_consistently(left_anchor_included, empty_pos_t(), graph_horizon, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); + bool did_aln = align_sequence_between_consistently(left_anchor_included, empty_pos_t(), graph_horizon, max_gap_length, &this->gbwt_graph, this->get_regular_aligner(), tail_aln, &aln.name(), this->max_dp_cells, this->choose_band_padding); if (stats) { stop_time = std::chrono::high_resolution_clock::now(); - if (nodes_and_bases.first > 0) { + if (did_aln) { // Actually did the alignment stats->bases.dozeu_tail += right_tail_length; stats->time.dozeu_tail += std::chrono::duration_cast>(stop_time - start_time).count(); @@ -3464,11 +3540,9 @@ size_t MinimizerMapper::longest_detectable_gap_in_range(const Alignment& aln, co return aligner->longest_detectable_gap(aln, sequence_end); } -std::pair MinimizerMapper::align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name, size_t max_dp_cells, const std::function& choose_band_padding) { - - // This holds node count and node length aligned to. 
- std::pair to_return; +bool MinimizerMapper::align_sequence_between(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name, size_t max_dp_cells, const std::function& choose_band_padding) { + bool did_aln = true; // Get the dagified local graph, and the back translation MinimizerMapper::with_dagified_local_graph(left_anchor, right_anchor, max_path_length, *graph, [&](DeletableHandleGraph& dagified_graph, @@ -3594,10 +3668,6 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l // Clear out the alignment path to indicate that we didn't actually compute an alignment. alignment.mutable_path()->clear_mapping(); } - // Always report the size of what we were aligning to. - // TODO: Do we still need this? - to_return.first = dagified_graph.get_node_count(); - to_return.second = dagified_graph.get_total_length(); } else { // Do pinned alignment off the anchor we actually have. // Work out how big it will be. @@ -3621,8 +3691,7 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l Edit* e = m->add_edit(); e->set_to_length(alignment.sequence().size()); e->set_sequence(alignment.sequence()); - to_return.first = 0; - to_return.second = 0; + did_aln = false; return; } else { #ifdef debug @@ -3630,8 +3699,6 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l std::cerr << "debug[MinimizerMapper::align_sequence_between]: Fill " << cell_count << " DP cells in tail with Xdrop" << std::endl; #endif aligner->align_pinned(alignment, dagified_graph, !is_empty(left_anchor), true, max_gap_length); - to_return.first = dagified_graph.get_node_count(); - to_return.second = dagified_graph.get_total_length(); } } @@ -3687,10 +3754,10 @@ std::pair MinimizerMapper::align_sequence_between(const pos_t& l // Now the alignment is filled in! 
}); - return to_return; + return did_aln; } -std::pair MinimizerMapper::align_sequence_between_consistently(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name, size_t max_dp_cells, const std::function& choose_band_padding) { +bool MinimizerMapper::align_sequence_between_consistently(const pos_t& left_anchor, const pos_t& right_anchor, size_t max_path_length, size_t max_gap_length, const HandleGraph* graph, const GSSWAligner* aligner, Alignment& alignment, const std::string* alignment_name, size_t max_dp_cells, const std::function& choose_band_padding) { if (left_anchor < right_anchor) { // Left anchor is unambiguously first, so align as-is return align_sequence_between(left_anchor, right_anchor, max_path_length, max_gap_length, graph, aligner, alignment, alignment_name, max_dp_cells, choose_band_padding); diff --git a/src/multipath_mapper.cpp b/src/multipath_mapper.cpp index 4632edee8b..4241fa12e6 100644 --- a/src/multipath_mapper.cpp +++ b/src/multipath_mapper.cpp @@ -2448,7 +2448,7 @@ namespace vg { // in the left_idxs and right_idxs vectors int64_t target_len = 2 * seq_len - left_side.clip_length - right_side.clip_length; auto distance_diff = [&](size_t l, size_t r) { - return abs(get<2>(left_sites[left_idxs[l]]) + get<2>(right_sites[right_idxs[r]]) - target_len); + return std::abs(static_cast(get<2>(left_sites[left_idxs[l]]) + get<2>(right_sites[right_idxs[r]]) - target_len)); }; // sweep to identify pairs that most nearly align diff --git a/src/recombinator.cpp b/src/recombinator.cpp index a9aaed4b10..07915118ed 100644 --- a/src/recombinator.cpp +++ b/src/recombinator.cpp @@ -1585,7 +1585,7 @@ void add_path(const gbwt::GBWT& source, gbwt::size_type path_id, gbwt::GBWTBuild gbwt::PathName path_name = source.metadata.path(path_id); std::string sample_name = source.metadata.sample(path_name.sample); 
std::string contig_name = source.metadata.contig(path_name.contig); - if (sample_name == gbwtgraph::REFERENCE_PATH_SAMPLE_NAME) { + if (sample_name == gbwtgraph::GENERIC_PATH_SAMPLE_NAME) { metadata.add_generic_path(contig_name); } else { // Reference samples will be copied later. diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 65f7dec363..24e1957443 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -1,7 +1,9 @@ -//#define debug_distance_indexing -//#define debug_snarl_traversal -//#define debug_distances -//#define debug_subgraph +// #define debug_distance_indexing +// #define debug_snarl_traversal +// #define debug_distances +// #define debug_subgraph +// #define debug_hub_label_build +// #define debug_hub_label_storage #include "snarl_distance_index.hpp" @@ -9,2022 +11,161 @@ using namespace std; using namespace handlegraph; namespace vg { -size_t minimum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2, - bool unoriented_distance, const HandleGraph* graph) { - return distance_index.minimum_distance( get_id(pos1), get_is_rev(pos1), get_offset(pos1), - get_id(pos2), get_is_rev(pos2), get_offset(pos2), - unoriented_distance, graph, nullptr); +size_t minimum_distance(const SnarlDistanceIndex &distance_index, pos_t pos1, + pos_t pos2, bool unoriented_distance, + const HandleGraph *graph) { + return distance_index.minimum_distance( + get_id(pos1), get_is_rev(pos1), get_offset(pos1), get_id(pos2), + get_is_rev(pos2), get_offset(pos2), unoriented_distance, graph, nullptr); } -size_t minimum_nontrivial_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2, - size_t pos2_length, const HandleGraph* graph) { - bool shifted = false; - if (pos1 == pos2) { - if (pos2_length == std::numeric_limits::max()) { - // If we don't know the length, we can get it from the graph - pos2_length = distance_index.minimum_length( - distance_index.get_node_net_handle(id(pos2))); - } - // 
Must shift one position to avoid self-distance of 0 - if (offset(pos1) == pos2_length) { - // Shift ending pos backward (not safe to shift forward) - get_offset(pos2)--; - } else { - // Shift starting position forward - get_offset(pos1)++; - } - - shifted = true; - } - - size_t distance = minimum_distance(distance_index, pos1, pos2, false, graph); - if (shifted && distance != std::numeric_limits::max()) { - // This loop is possible, so add back in the shift - distance++; - } - - return distance; -} - -size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, pos_t pos2) { - return distance_index.maximum_distance( get_id(pos1), get_is_rev(pos1), get_offset(pos1), - get_id(pos2), get_is_rev(pos2), get_offset(pos2)); -} - -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances, bool silence_warnings) { - distance_index->set_snarl_size_limit(size_limit); - distance_index->set_only_top_level_chain_distances(only_top_level_chain_distances); - - //Build the temporary distance index from the graph - SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit, only_top_level_chain_distances); - - if (!silence_warnings && temp_index.use_oversized_snarls) { - cerr << "warning: distance index uses oversized snarls, (the biggest has " - << temp_index.most_oversized_snarl_size << " nodes), which may make mapping slow" << endl; - cerr << "\ttry increasing --snarl-limit when building the distance index" << endl; - } - - //And fill in the permanent distance index - vector indexes; - indexes.emplace_back(&temp_index); - distance_index->get_snarl_tree_records(indexes, graph); -} -SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( - const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool 
only_top_level_chain_distances) { - -#ifdef debug_distance_indexing - cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl; - -#endif - - SnarlDistanceIndex::TemporaryDistanceIndex temp_index; - - temp_index.min_node_id=graph->min_node_id(); - temp_index.max_node_id=graph->max_node_id(); - - //Construct the distance index using the snarl decomposition - //traverse_decomposition will visit all structures (including trivial snarls), calling - //each of the given functions for the start and ends of the snarls and chains - - temp_index.temp_node_records.resize(temp_index.max_node_id-temp_index.min_node_id+1); - - - - //Stores unfinished records, as type of record and offset into appropriate vector - //(temp_node/snarl/chain_records) - vector> stack; - - //There may be components of the root that are connected to each other. Each connected component will - //get put into a (fake) root-level snarl, but we don't know what those components will be initially, - //since the decomposition just puts them in the same root snarl. This is used to group the root-level - //components into connected components that will later be used to make root snarls - structures::UnionFind root_snarl_component_uf (0); - - - /*Go through the decomposition top down and record the connectivity of the snarls and chains - * Distances will be added later*/ - - snarl_finder->traverse_decomposition( - [&](handle_t chain_start_handle) { - /*This gets called when a new chain is found, starting at the start handle going into chain - * For the first node in a chain, create a chain record and fill in the first node. - * Also add the first node record - */ -#ifdef debug_distance_indexing - cerr << " Starting new chain at " << graph->get_id(chain_start_handle) << (graph->get_is_reverse(chain_start_handle) ? 
" reverse" : " forward") << endl; - //We shouldn't have seen this node before - //assert(temp_index.temp_node_records[graph->get_id(chain_start_handle)-min_node_id].node_id == 0); -#endif - - //Fill in node in chain - stack.emplace_back(SnarlDistanceIndex::TEMP_CHAIN, temp_index.temp_chain_records.size()); - nid_t node_id = graph->get_id(chain_start_handle); - temp_index.temp_chain_records.emplace_back(); - auto& temp_chain = temp_index.temp_chain_records.back(); - temp_chain.start_node_id = node_id; - temp_chain.start_node_rev = graph->get_is_reverse(chain_start_handle); - temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - - - //And the node record itself - auto& temp_node = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); - temp_node.node_id = node_id; - temp_node.node_length = graph->get_length(chain_start_handle); - temp_node.reversed_in_parent = graph->get_is_reverse(chain_start_handle); - temp_node.parent = stack.back(); //The parent is this chain - - }, - [&](handle_t chain_end_handle) { - /*This gets called at the end of a chain, facing out - * Record the chain's end node. 
The node record itself would have been added as part of the snarl - * Also record the chain's parent here - */ - - //Done with this chain - pair chain_index = stack.back(); - stack.pop_back(); - -#ifdef debug_distance_indexing - assert(chain_index.first == SnarlDistanceIndex::TEMP_CHAIN); -#endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records.at(chain_index.second); - nid_t node_id = graph->get_id(chain_end_handle); - - if (temp_chain_record.children.size() == 1 && node_id == temp_chain_record.start_node_id) { - //This is a trivial snarl - -#ifdef debug_distance_indexing - //Then this must be the last thing on the chain_records vector - assert(temp_index.temp_chain_records.size() == chain_index.second+1); -#endif - - //Get the node - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id - temp_index.min_node_id); - - temp_node_record.reversed_in_parent = false; - - //And give the chain's parent the node info - // - if (stack.empty()) { - temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); - //If this was the last thing on the stack, then this was a root - - //Check to see if there is anything connected to the ends of the chain - vector reachable_nodes; - graph->follow_edges(graph->get_handle(node_id, false), - false, [&] (const handle_t& next) { - if (graph->get_id(next) != node_id) { - reachable_nodes.emplace_back(graph->get_id(next)); - } - }); - graph->follow_edges(graph->get_handle(node_id, true), - false, [&] (const handle_t& next) { - if (graph->get_id(next) != node_id) { - reachable_nodes.emplace_back(graph->get_id(next)); - } - }); - if (reachable_nodes.size()) { - //If we can reach anything leaving the chain (besides the chain itself), then it is part of a root snarl - //Note that if the chain's start and end node are the same, then it will always be a single component -#ifdef 
debug_distance_indexing - cerr << " This trivial chain is part of the root but connects with something else in the root"<::max()); -#endif - root_snarl_component_uf.union_groups(other_i, temp_node_record.root_snarl_index); -//#ifdef debug_distance_indexing -// cerr << " Union this trivial with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; -//#endif - } else { - new_component = false; - } - } - } else { - //If this chain isn't connected to anything else, then it is a single component of the root - temp_node_record.rank_in_parent = temp_index.components.size(); - temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - } - } else { - //The last thing on the stack is the parent of this chain, which must be a snarl - temp_node_record.parent = stack.back(); - auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_node_record.parent.second); - temp_node_record.rank_in_parent = parent_snarl_record.children.size() + 2; - parent_snarl_record.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - } - - - //Remove the chain record - temp_index.temp_chain_records.pop_back(); - temp_index.max_index_size += temp_node_record.get_max_record_length(); - - } else { - //Otherwise, it is an actual chain - - //Fill in node in chain - temp_chain_record.end_node_id = node_id; - temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle); - temp_chain_record.end_node_length = graph->get_length(chain_end_handle); - - bool is_root_chain = false; - - if (stack.empty()) { - //If this was the last thing on the stack, then this was a root - is_root_chain = true; - - //Check to see if there is anything connected to the ends of the chain - vector reachable_nodes; - graph->follow_edges(graph->get_handle(temp_chain_record.start_node_id, !temp_chain_record.start_node_rev), - false, [&] (const handle_t& next) { - if 
(graph->get_id(next) != temp_chain_record.start_node_id && - graph->get_id(next) != temp_chain_record.end_node_id) { - reachable_nodes.emplace_back(graph->get_id(next)); - } - }); - graph->follow_edges(graph->get_handle(temp_chain_record.end_node_id, temp_chain_record.end_node_rev), - false, [&] (const handle_t& next) { - if (graph->get_id(next) != temp_chain_record.start_node_id && - graph->get_id(next) != temp_chain_record.end_node_id) { - reachable_nodes.emplace_back(graph->get_id(next)); - } - }); - if (reachable_nodes.size() && (temp_chain_record.is_trivial || temp_chain_record.start_node_id != temp_chain_record.end_node_id)) { - //If we can reach anything leaving the chain (besides the chain itself), then it is part of a root snarl - //Note that if the chain's start and end node are the same, then it will always be a single component -#ifdef debug_distance_indexing - cerr << " This chain is part of the root but connects with something else in the root"<::max()); -#endif - root_snarl_component_uf.union_groups(other_i, temp_chain_record.root_snarl_index); -#ifdef debug_distance_indexing - cerr << " Union this chain with " << temp_index.temp_chain_records[node_record.parent.second].start_node_id << " " << temp_index.temp_chain_records[node_record.parent.second].end_node_id << endl; -#endif - } else { - new_component = false; - } - } - } else { - //If this chain isn't connected to anything else, then it is a single component of the root - temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); - temp_chain_record.rank_in_parent = temp_index.components.size(); - temp_index.components.emplace_back(chain_index); - } - } else { - //The last thing on the stack is the parent of this chain, which must be a snarl - temp_chain_record.parent = stack.back(); - auto& parent_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.parent.second); - temp_chain_record.rank_in_parent = parent_snarl_record.children.size() + 2; - 
parent_snarl_record.children.emplace_back(chain_index); - } - - temp_index.max_index_size += temp_chain_record.get_max_record_length(!only_top_level_chain_distances || is_root_chain ? true : false ); -#ifdef debug_distance_indexing - cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) - << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl; -#endif - } - }, - [&](handle_t snarl_start_handle) { - /*This gets called at the beginning of a new snarl facing in - * Create a new snarl record and fill in the start node. - * The node record would have been created as part of the chain, or as the end node - * of the previous snarl - */ - -#ifdef debug_distance_indexing - cerr << " Starting new snarl at " << graph->get_id(snarl_start_handle) << (graph->get_is_reverse(snarl_start_handle) ? " reverse" : " forward") << endl; - cerr << "with index " << temp_index.temp_snarl_records.size() << endl; -#endif - auto& parent = stack.back(); - stack.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); - temp_index.temp_snarl_records.emplace_back(); - temp_index.temp_snarl_records.back().start_node_id = graph->get_id(snarl_start_handle); - temp_index.temp_snarl_records.back().start_node_rev = graph->get_is_reverse(snarl_start_handle); - temp_index.temp_snarl_records.back().start_node_length = graph->get_length(snarl_start_handle); - - }, - [&](handle_t snarl_end_handle){ - /*This gets called at the end of the snarl facing out - * Fill in the end node of the snarl, its parent, and record the snarl as a child of its - * parent chain - * Also create a node record - */ - pair snarl_index = stack.back(); - stack.pop_back(); -#ifdef debug_distance_indexing - assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); - assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); -#endif - 
SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records[snarl_index.second]; - nid_t node_id = graph->get_id(snarl_end_handle); - - //Record the end node in the snarl - temp_snarl_record.end_node_id = node_id; - temp_snarl_record.end_node_rev = graph->get_is_reverse(snarl_end_handle); - temp_snarl_record.end_node_length = graph->get_length(snarl_end_handle); - temp_snarl_record.node_count = temp_snarl_record.children.size(); - bool any_edges_in_snarl = false; - graph->follow_edges(graph->get_handle(temp_snarl_record.start_node_id, temp_snarl_record.start_node_rev), false, [&](const handle_t next_handle) { - if (graph->get_id(next_handle) != temp_snarl_record.end_node_id) { - any_edges_in_snarl = true; - } - }); - graph->follow_edges(graph->get_handle(temp_snarl_record.end_node_id, !temp_snarl_record.end_node_rev), false, [&](const handle_t next_handle) { - if (graph->get_id(next_handle) != temp_snarl_record.start_node_id) { - any_edges_in_snarl = true; - } - }); - - if (temp_snarl_record.children.size() == 0) { - //This is a trivial snarl - temp_snarl_record.is_trivial = true; - - //Add the end node to the chain -#ifdef debug_distance_indexing - assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); -#endif - temp_snarl_record.parent = stack.back(); - auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); - temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - - //Remove the snarl record -#ifdef debug_distance_indexing - assert(temp_index.temp_snarl_records.size() == snarl_index.second+1); -#endif - temp_index.temp_snarl_records.pop_back(); - } else { - //This is the child of a chain -#ifdef debug_distance_indexing - assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); -#endif - temp_snarl_record.parent = stack.back(); - auto& temp_chain = temp_index.temp_chain_records.at(stack.back().second); - temp_chain.children.emplace_back(snarl_index); - 
temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); - - } - //Record the snarl as a child of its chain - //if (stack.empty()) { - // assert(false); - // //TODO: The snarl should always be the child of a chain - // //If this was the last thing on the stack, then this was a root - // //TODO: I'm not sure if this would get put into a chain or not - // temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); - // temp_index.components.emplace_back(snarl_index); - //} - - //Record the node itself. This gets done for the start of the chain, and ends of snarls - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records.at(node_id-temp_index.min_node_id); - temp_node_record.node_id = node_id; - temp_node_record.node_length = graph->get_length(snarl_end_handle); - temp_node_record.reversed_in_parent = graph->get_is_reverse(snarl_end_handle); - temp_node_record.parent = stack.back(); - - - -#ifdef debug_distance_indexing - cerr << " Ending new snarl " << temp_index.structure_start_end_as_string(snarl_index) - << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; -#endif - }); - - /* - * We finished going through everything that exists according to the snarl decomposition, but - * it's still missing tips, which will be discovered when filling in the snarl distances, - * and root-level snarls, which we'll add now by combining the chain components in root_snarl_components - * into snarls defined by root_snarl_component_uf - * The root-level snarl is a fake snarl that doesn't exist according to the snarl decomposition, - * but is an extra layer that groups together components of the root that are connected - */ - - vector> root_snarl_component_indexes = root_snarl_component_uf.all_groups(); - for (vector& root_snarl_indexes : root_snarl_component_indexes) { -#ifdef debug_distance_indexing - cerr << "Create a new root snarl from 
components" << endl; -#endif - //For each of the root snarls - temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); - temp_index.temp_snarl_records.emplace_back(); - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.back(); - temp_snarl_record.is_root_snarl = true; - temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); - - - for (size_t chain_i : root_snarl_indexes) { - //For each chain component of this root-level snarl - if (temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_CHAIN){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[temp_index.root_snarl_components[chain_i].second]; - temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); - temp_chain_record.rank_in_parent = temp_snarl_record.children.size(); - temp_chain_record.reversed_in_parent = false; - - temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); - } else { -#ifdef debug_distance_indexing - assert(temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_NODE); -#endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.temp_node_records[temp_index.root_snarl_components[chain_i].second - temp_index.min_node_id]; - temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); - temp_node_record.rank_in_parent = temp_snarl_record.children.size(); - temp_node_record.reversed_in_parent = false; - - temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); - } - } - temp_snarl_record.node_count = temp_snarl_record.children.size(); - } - - - /*Now go through the decomposition again to fill in the distances - * This traverses all chains in reverse order 
that we found them in, so bottom up - * Each chain and snarl already knows its parents and children, except for single nodes - * that are children of snarls. These nodes were not in chains will have their node - * records created here - */ - -#ifdef debug_distance_indexing - cerr << "Filling in the distances in snarls" << endl; -#endif - for (int i = temp_index.temp_chain_records.size()-1 ; i >= 0 ; i--) { - - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.temp_chain_records[i]; -#ifdef debug_distance_indexing - assert(!temp_chain_record.is_trivial); - cerr << " At " << (temp_chain_record.is_trivial ? " trivial " : "") << " chain " << temp_index.structure_start_end_as_string(make_pair(SnarlDistanceIndex::TEMP_CHAIN, i)) << endl; -#endif - - //Add the first values for the prefix sum and backwards loop vectors - temp_chain_record.prefix_sum.emplace_back(0); - temp_chain_record.max_prefix_sum.emplace_back(0); - temp_chain_record.backward_loops.emplace_back(std::numeric_limits::max()); - temp_chain_record.chain_components.emplace_back(0); - - - /*First, go through each of the snarls in the chain in the forward direction and - * fill in the distances in the snarl. 
Also fill in the prefix sum and backwards - * loop vectors here - */ - size_t curr_component = 0; //which component of the chain are we in - size_t last_node_length = 0; - for (size_t chain_child_i = 0 ; chain_child_i < temp_chain_record.children.size() ; chain_child_i++ ){ - const pair& chain_child_index = temp_chain_record.children[chain_child_i]; - //Go through each of the children in the chain, skipping nodes - //The snarl may be trivial, in which case don't fill in the distances -#ifdef debug_distance_indexing - cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) - << " current max prefix sum " << temp_chain_record.max_prefix_sum.back() << endl; -#endif - - if (chain_child_index.first == SnarlDistanceIndex::TEMP_SNARL){ - //This is where all the work gets done. Need to go through the snarl and add - //all distances, then add distances to the chain that this is in - //The parent chain will be the last thing in the stack - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = - temp_index.temp_snarl_records.at(chain_child_index.second); - - //Fill in this snarl's distances - populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph); - - bool new_component = temp_snarl_record.min_length == std::numeric_limits::max(); - if (new_component){ - curr_component++; - } - - //And get the distance values for the end node of the snarl in the chain - if (new_component) { - //If this snarl wasn't start-end connected, then we start - //tracking the distance vectors here - - //Update the maximum distance - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); - - temp_chain_record.prefix_sum.emplace_back(0); - temp_chain_record.max_prefix_sum.emplace_back(0); - temp_chain_record.backward_loops.emplace_back(temp_snarl_record.distance_end_end); - //If the chain is disconnected, the max length is infinite - 
temp_chain_record.max_length = std::numeric_limits::max(); - } else { - temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.prefix_sum.back(), - temp_snarl_record.min_length), - temp_snarl_record.start_node_length)); - temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.max_prefix_sum.back(), - temp_snarl_record.max_length), - temp_snarl_record.start_node_length)); - temp_chain_record.backward_loops.emplace_back(std::min(temp_snarl_record.distance_end_end, - SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back() - , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length)))); - temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, - temp_snarl_record.max_length); - } - temp_chain_record.chain_components.emplace_back(curr_component); - if (chain_child_i == temp_chain_record.children.size() - 2 && temp_snarl_record.min_length == std::numeric_limits::max()) { - temp_chain_record.loopable = false; - } - last_node_length = 0; - } else { - if (last_node_length != 0) { - //If this is a node and the last thing was also a node, - //then there was a trivial snarl - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records.at(chain_child_index.second-temp_index.min_node_id); - - //Check if there is a loop in this node - //Snarls get counted as trivial if they contain no nodes but they might still have edges - size_t backward_loop = std::numeric_limits::max(); - - graph->follow_edges(graph->get_handle(temp_node_record.node_id, !temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { - if (graph->get_id(next_handle) == temp_node_record.node_id) { - //If there is a loop going backwards (relative to the chain) back to the same node - backward_loop = 0; - } - }); - - 
temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back(), last_node_length)); - temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.max_prefix_sum.back(), last_node_length)); - temp_chain_record.backward_loops.emplace_back(std::min(backward_loop, - SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back(), 2 * last_node_length))); - - if (chain_child_i == temp_chain_record.children.size()-1) { - //If this is the last node - temp_chain_record.loopable=false; - } - temp_chain_record.chain_components.emplace_back(curr_component); - } - last_node_length = temp_index.temp_node_records.at(chain_child_index.second - temp_index.min_node_id).node_length; - //And update the chains max length - temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, - last_node_length); - } - } //Finished walking through chain - if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.chain_components.back() != 0) { - //If this is a looping, multicomponent chain, the start/end node could end up in separate chain components - //despite being the same node. - //Since the first component will always be 0, set the first node's component to be whatever the last - //component was - temp_chain_record.chain_components[0] = temp_chain_record.chain_components.back(); - - } - - //For a multicomponent chain, the actual minimum length will always be infinite, but since we sometimes need - //the length of the last component, save that here - temp_chain_record.min_length = !temp_chain_record.is_trivial && temp_chain_record.start_node_id == temp_chain_record.end_node_id - ? 
temp_chain_record.prefix_sum.back() - : SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back() , temp_chain_record.end_node_length); - -#ifdef debug_distance_indexing - assert(temp_chain_record.prefix_sum.size() == temp_chain_record.backward_loops.size()); - assert(temp_chain_record.prefix_sum.size() == temp_chain_record.chain_components.size()); -#endif - - - /*Now that we've gone through all the snarls in the chain, fill in the forward loop vector - * by going through the chain in the backwards direction - */ - temp_chain_record.forward_loops.resize(temp_chain_record.prefix_sum.size(), - std::numeric_limits::max()); - if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { - - //If this is a looping chain, then check the first snarl for a loop - if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(temp_chain_record.children.at(1).second); - temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; - } - } - - size_t node_i = temp_chain_record.prefix_sum.size() - 2; - // We start at the next to last node because we need to look at this record and the next one. 
- last_node_length = 0; - for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { - auto& child = temp_chain_record.children.at(j); - if (child.first == SnarlDistanceIndex::TEMP_SNARL){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); - if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && - temp_chain_record.chain_components.at(node_i+1) != 0){ - //If this is a new chain component, then add the loop distance from the snarl - //If the component of the next node is 0, then we're still in the same component since we're going backwards - temp_chain_record.forward_loops.at(node_i) = temp_snarl_record.distance_start_start; - } else { - temp_chain_record.forward_loops.at(node_i) = - std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.forward_loops.at(node_i+1), - 2* temp_snarl_record.min_length), - 2*temp_snarl_record.end_node_length), - temp_snarl_record.distance_start_start); - } - node_i --; - last_node_length = 0; - } else { - if (last_node_length != 0) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records.at(child.second-temp_index.min_node_id); - - - //Check if there is a loop in this node - //Snarls get counted as trivial if they contain no nodes but they might still have edges - size_t forward_loop = std::numeric_limits::max(); - graph->follow_edges(graph->get_handle(temp_node_record.node_id, temp_node_record.reversed_in_parent), false, [&](const handle_t next_handle) { - if (graph->get_id(next_handle) == temp_node_record.node_id) { - //If there is a loop going forward (relative to the chain) back to the same node - forward_loop = 0; - } - }); - temp_chain_record.forward_loops.at(node_i) = std::min( forward_loop, - SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , - 2*last_node_length)); - node_i--; - 
} - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; - } - } - - - //If this is a looping chain, check if the loop distances can be improved by going around the chain - - if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { - - - //Also check if the reverse loop values would be improved if we went around again - - if (temp_chain_record.backward_loops.back() < temp_chain_record.backward_loops.front()) { - temp_chain_record.backward_loops[0] = temp_chain_record.backward_loops.back(); - size_t node_i = 1; - size_t last_node_length = 0; - for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { - auto& child = temp_chain_record.children.at(i); - if (child.first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); - size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.backward_loops.at(node_i-1), - 2*temp_snarl_record.min_length), - 2*temp_snarl_record.start_node_length); - if (temp_chain_record.chain_components.at(node_i)!= 0 || new_loop_distance >= temp_chain_record.backward_loops.at(node_i)) { - //If this is a new chain component or it doesn't improve, stop - break; - } else { - //otherwise record the better distance - temp_chain_record.backward_loops.at(node_i) = new_loop_distance; - - } - node_i++; - last_node_length = 0; - } else { - if (last_node_length != 0) { - size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.backward_loops.at(node_i-1), - 2*last_node_length); - size_t old_loop_distance = temp_chain_record.backward_loops.at(node_i); - temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); - node_i++; - } - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; - } - } - } 
- if (temp_chain_record.forward_loops.front() < temp_chain_record.forward_loops.back()) { - //If this is a looping chain and looping improves the forward loops, - //then we have to keep going around to update distance - - temp_chain_record.forward_loops.back() = temp_chain_record.forward_loops.front(); - size_t last_node_length = 0; - node_i = temp_chain_record.prefix_sum.size() - 2; - for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { - auto& child = temp_chain_record.children.at(j); - if (child.first == SnarlDistanceIndex::TEMP_SNARL){ - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(child.second); - size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - temp_chain_record.forward_loops.at(node_i+1), - 2* temp_snarl_record.min_length), - 2*temp_snarl_record.end_node_length); - if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) || - new_distance >= temp_chain_record.forward_loops.at(node_i)){ - //If this is a new component or the distance doesn't improve, stop looking - break; - } else { - //otherwise, update the distance - temp_chain_record.forward_loops.at(node_i) = new_distance; - } - node_i --; - last_node_length =0; - } else { - if (last_node_length != 0) { - size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2* last_node_length); - size_t old_distance = temp_chain_record.forward_loops.at(node_i); - temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); - node_i--; - } - last_node_length = temp_index.temp_node_records.at(child.second - temp_index.min_node_id).node_length; - } - } - } - } - - temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); - temp_index.max_distance = temp_chain_record.forward_loops.back() == std::numeric_limits::max() ? 
temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.forward_loops.back()); - temp_index.max_distance = temp_chain_record.backward_loops.front() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.backward_loops.front()); - assert(temp_index.max_distance <= 2742664019); - - } - -#ifdef debug_distance_indexing - cerr << "Filling in the distances in root snarls and distances along chains" << endl; -#endif - for (pair& component_index : temp_index.components) { - if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second); - populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph); - temp_snarl_record.min_length = std::numeric_limits::max(); - } - } - temp_index.root_structure_count = temp_index.components.size(); -#ifdef debug_distance_indexing - assert(temp_index.components.size() == temp_index.root_structure_count); - cerr << "Finished temp index with " << temp_index.root_structure_count << " connected components" << endl; -#endif - return temp_index; -} - - - -/*Fill in the snarl index. - * The index will already know its boundaries and everything knows their relationships in the - * snarl tree. 
This needs to fill in the distances and the ranks of children in the snarl - * The rank of a child is arbitrary, except that the start node will always be 0 and the end node - * will always be the node count+1 (since node count doesn't count the boundary nodes) - */ -void populate_snarl_index( - SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, - bool only_top_level_chain_distances, const HandleGraph* graph) { -#ifdef debug_distance_indexing - cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; - assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); -#endif - SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(snarl_index.second); - temp_snarl_record.is_simple=true; - - - - - /*Helper function to find the ancestor of a node that is a child of this snarl */ - auto get_ancestor_of_node = [&](pair curr_index, - pair ancestor_snarl_index) { - - //This is a child that isn't a node, so it must be a chain - if (curr_index.second == temp_snarl_record.start_node_id || - curr_index.second == temp_snarl_record.end_node_id) { - return curr_index; - } - - //Otherwise, walk up until we hit the current snarl - pair parent_index = temp_index.temp_node_records.at(curr_index.second-temp_index.min_node_id).parent; - while (parent_index != ancestor_snarl_index) { - curr_index=parent_index; - parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? 
temp_index.temp_snarl_records.at(parent_index.second).parent - : temp_index.temp_chain_records.at(parent_index.second).parent; -#ifdef debug_distance_indexing - assert(parent_index.first != SnarlDistanceIndex::TEMP_ROOT); -#endif - } - - return curr_index; - }; - - // TODO: Copying the list - vector> all_children = temp_snarl_record.children; - - // Identify tips - for (const auto& child : all_children) { - // Check if this node is a tip - if (child.first != SnarlDistanceIndex::TEMP_NODE - || (child.second != temp_snarl_record.start_node_id - && child.second != temp_snarl_record.end_node_id)) { - bool is_node = (child.first == SnarlDistanceIndex::TEMP_NODE); - // Set up to check edges leaving the end of the chain/node - nid_t node_id = is_node ? child.second - : temp_index.temp_chain_records.at(child.second).end_node_id; - size_t rank = is_node ? temp_index.temp_node_records.at(child.second - temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records.at(child.second).rank_in_parent; - bool is_reverse = is_node ? false - : temp_index.temp_chain_records.at(child.second).end_node_rev; - // Convert to an index in all_children - rank -= 2; - - bool has_edges = false; - graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { - has_edges = true; - }); - if (!has_edges) { - temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; - temp_snarl_record.tippy_child_ranks.emplace(rank, false); - // It is a tip so this isn't simple snarl - temp_snarl_record.is_simple = false; - } - // Repeat for the other side of the chain/node - node_id = is_node ? child.second - : temp_index.temp_chain_records.at(child.second).start_node_id; - is_reverse = is_node ? 
true - : !temp_index.temp_chain_records.at(child.second).start_node_rev; - has_edges = false; - graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { - has_edges = true; - }); - if (!has_edges) { - temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; - temp_snarl_record.tippy_child_ranks.emplace(rank, true); - // It is a tip so this isn't simple snarl - temp_snarl_record.is_simple = false; - } - } - } - - /* - * Do a topological sort of the children and re-assign ranks based on the sort - * TODO: For non-DAGs, this sort will end up arbitrary. - * That doesn't matter right now since the only consumer of ranks - * (ziptrees) expects arbitrary ranks, though. - */ - if (!temp_snarl_record.is_root_snarl) { - // Always start the topological sort at the start - handle_t topological_sort_start = graph->get_handle(temp_snarl_record.start_node_id, - temp_snarl_record.start_node_rev); - - // New sort order. Each value is an index into all_children, which - // matches the ranks(-2) of the children - vector topological_sort_order; - topological_sort_order.reserve(all_children.size()); - - // Which ranks have already been sorted? - unordered_set visited_ranks; - visited_ranks.reserve(all_children.size()); - - // All nodes that have no incoming edges - vector> source_nodes; - - // Add all sources. This will start out as the start node and any tips - for (const auto& tip : temp_snarl_record.tippy_child_ranks) { - source_nodes.emplace_back(tip.first, !tip.second); - } - - // Start node dummy rank is max(). 
This is traversed first - source_nodes.emplace_back(std::numeric_limits::max(), false); - - // We'll be done sorting when everything is in the sorted vector - while (!source_nodes.empty()) { - // Pick a child with no incoming edges - pair current_child_index = source_nodes.back(); - source_nodes.pop_back(); - - // Visit it - if (visited_ranks.count(current_child_index.first) != 0) { - // We tried to revisit a source node, so this must be a loop - // (we got turned around somewhere is the only way) - // Thus it is safe to abort and allow random ranks - break; - } - if (current_child_index.first != std::numeric_limits::max()) { - topological_sort_order.emplace_back(current_child_index.first); - } - visited_ranks.emplace(current_child_index.first); - - // Get the graph handle for that child, pointing out from the end of the chain - handle_t current_graph_handle; - if (current_child_index.first == std::numeric_limits::max()) { - // If the current child is the start bound, then get the start node pointing in - current_graph_handle = topological_sort_start; - } else { - pair current_index = all_children[current_child_index.first]; - if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { - // If the current child is a node, then get the node pointing in the correct direction - current_graph_handle = graph->get_handle(current_index.second, current_child_index.second); - } else if (current_child_index.second) { - // If the current child is a chain, and we're traversing the chain backwards - current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, - !temp_index.temp_chain_records[current_index.second].start_node_rev); - } else { - // Otherwise, the current child is a chain and we're traversing the chain forwards - current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, - temp_index.temp_chain_records[current_index.second].end_node_rev); - } - } - - // Try all edges 
leaving this side - graph->follow_edges(current_graph_handle, false, [&](const handle_t next_handle) { -#ifdef debug_distance_indexing - cerr << "Following forward edges from " << graph->get_id(current_graph_handle) - << " to " << graph->get_id(next_handle) << endl; -#endif - if (graph->get_id(next_handle) == temp_snarl_record.start_node_id || - graph->get_id(next_handle) == temp_snarl_record.end_node_id) { - // If this is trying to leave the snarl, skip it - return true; - } - // Is next_handle a new source? Any unvisited predecessors? - pair next_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); - bool next_is_node = next_index.first == SnarlDistanceIndex::TEMP_NODE; - size_t next_rank = next_is_node - ? temp_index.temp_node_records.at(next_index.second - temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records[next_index.second].rank_in_parent; - // Subtract 2 to get the index from the rank - assert(next_rank >= 2); - next_rank -= 2; - assert(all_children[next_rank] == next_index); - bool next_rev = (next_is_node || temp_index.temp_chain_records[next_index.second].is_trivial) - ? graph->get_is_reverse(next_handle) - : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; - if (visited_ranks.count(next_rank) != 0) { - // If this is a loop, abort - return true; - } - - // Get the handle from the child represented by next_handle going the other way - handle_t reverse_handle = next_index.first == SnarlDistanceIndex::TEMP_NODE ? - graph->get_handle(next_index.second, !next_rev) : - (next_rev ? graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, - temp_index.temp_chain_records[next_index.second].end_node_rev) - : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, - !temp_index.temp_chain_records[next_index.second].start_node_rev)); - - // Does this have no unseen incoming edges? 
Check as we go through incoming edges - bool is_source = true; - - // Does this have no unseen incoming edges? - graph->follow_edges(reverse_handle, false, [&](const handle_t incoming_handle) { -#ifdef debug_distance_indexing - cerr << "Getting backwards edge to " << graph->get_id(incoming_handle) << endl; -#endif - if (graph->get_id(incoming_handle) == temp_snarl_record.start_node_id || - graph->get_id(incoming_handle) == temp_snarl_record.end_node_id) { - // If this is trying to leave the snarl, that is OK - return true; - } - // The index of the snarl's child that next_handle represents - pair incoming_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); - bool incoming_is_node = incoming_index.first == SnarlDistanceIndex::TEMP_NODE; - size_t incoming_rank = incoming_is_node - ? temp_index.temp_node_records.at(incoming_index.second - temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; - - bool incoming_rev = incoming_is_node || temp_index.temp_chain_records[incoming_index.second].is_trivial - ? 
graph->get_is_reverse(incoming_handle) - : graph->get_id(incoming_handle) == temp_index.temp_chain_records[incoming_index.second].end_node_id; - // Subtract 2 to get the index from the rank - assert(incoming_rank >= 2); - incoming_rank -= 2; - - // This predecessor is unvisited - if (visited_ranks.count(incoming_rank) == 0) { - is_source = false; - } - // Keep going - return true; - }); - if (is_source) { - source_nodes.emplace_back(next_rank, next_rev); - } - return true; - }); - } - - // If we have leftover chains, this is a non-DAG and ranks are arbitrary - // So we will add any leftover ranks to the topological order - vector check_ranks (all_children.size(), false); - for (size_t x : topological_sort_order) { - check_ranks[x] = true; - } - for (size_t i = 0 ; i < check_ranks.size() ; i++) { - if (!check_ranks[i]) { - topological_sort_order.emplace_back(i); - } - } - assert(topological_sort_order.size() == all_children.size()); - - - // We've finished doing to topological sort, so update every child's rank to be the new order - auto old_tippy_ranks = temp_snarl_record.tippy_child_ranks; - temp_snarl_record.tippy_child_ranks.clear(); - for (size_t new_rank = 0 ; new_rank < topological_sort_order.size() ; new_rank++) { - size_t old_rank = topological_sort_order[new_rank]; - if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { - temp_index.temp_node_records.at(all_children[old_rank].second-temp_index.min_node_id).rank_in_parent = new_rank+2; - } else { - temp_index.temp_chain_records[all_children[old_rank].second].rank_in_parent = new_rank+2; - } - const auto& old_is_tip = old_tippy_ranks.find(old_rank); - if (old_is_tip != old_tippy_ranks.end()) { - temp_snarl_record.tippy_child_ranks.emplace(new_rank, old_is_tip->second); - } - } - } - - /* - * Now go through each of the children and add distances from that child to everything reachable from it - * Start a dijkstra traversal from each node side in the snarl and record all distances - */ - - - if 
(size_limit != 0 && !only_top_level_chain_distances) { - //If we are saving distances - //Reserve enough space to store all possible distances - temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit - ? temp_snarl_record.node_count * 2 - : temp_snarl_record.node_count * temp_snarl_record.node_count); - } else { - temp_snarl_record.include_distances = false; - } - - if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { - temp_index.most_oversized_snarl_size = std::max(temp_index.most_oversized_snarl_size, temp_snarl_record.node_count); - temp_index.use_oversized_snarls = true; - } - - //Add the start and end nodes to the list of children so that we include them in the traversal - if (!temp_snarl_record.is_root_snarl) { - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); - } - - while (!all_children.empty()) { - const pair start_index = std::move(all_children.back()); - all_children.pop_back(); - - bool is_internal_node = false; - - if ((start_index.first == SnarlDistanceIndex::TEMP_NODE - && start_index.second != temp_snarl_record.start_node_id - && start_index.second != temp_snarl_record.end_node_id) - || - (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.temp_chain_records.at(start_index.second).is_trivial)) { - // This is an internal node - is_internal_node = true; - } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.temp_chain_records.at(start_index.second).is_trivial) { - // If this is an internal chain, then it isn't a simple snarl - temp_snarl_record.is_simple=false; - } - - bool start_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip - : temp_index.temp_chain_records.at(start_index.second).is_tip; - - size_t start_rank = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).rank_in_parent - : temp_index.temp_chain_records.at(start_index.second).rank_in_parent; - - - if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { - start_rank = 0; - } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id) { - start_rank = 1; - } //TODO: - //else { - // assert(start_rank != 0 && start_rank != 1); - //} - - if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || (!start_is_tip && - start_rank != 0 && start_rank != 1))) { - //If we don't care about internal distances, and we also are not at a boundary or tip - //TODO: Why do we care about tips specifically? - continue; - } - - //Start from either direction for all nodes, but only going in for start and end - vector directions; - if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { - directions.emplace_back(temp_snarl_record.start_node_rev); - } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id){ - directions.emplace_back(!temp_snarl_record.end_node_rev); - } else { - directions.emplace_back(true); - directions.emplace_back(false); - } - for (bool start_rev : directions) { - //Start a dijkstra traversal from start_index going in the direction indicated by start_rev - //Record the distances to each node (child of the snarl) found - size_t reachable_node_count = 0; //How many nodes can we reach from this node side? 
- -#ifdef debug_distance_indexing - cerr << " Starting from child " << temp_index.structure_start_end_as_string(start_index) - << " going " << (start_rev ? "rev" : "fd") << endl; -#endif - - //Define a NetgraphNode as the value for the priority queue: - // , direction> - using NetgraphNode = pair, bool>>; - auto cmp = [] (const NetgraphNode a, const NetgraphNode b) { - return a.first > b.first; - }; - - //The priority queue of the next nodes to visit, ordered by the distance - std::priority_queue, decltype(cmp)> queue(cmp); - //The nodes we've already visited - unordered_set, bool>> visited_nodes; - visited_nodes.reserve(temp_snarl_record.node_count * 2); - - //Start from the current start node - queue.push(make_pair(0, make_pair(start_index, start_rev))); - - while (!queue.empty()) { - - //Get the current node from the queue and pop it out of the queue - size_t current_distance = queue.top().first; - pair current_index = queue.top().second.first; - bool current_rev = queue.top().second.second; - if (visited_nodes.count(queue.top().second)) { - queue.pop(); - continue; - } - visited_nodes.emplace(queue.top().second); - queue.pop(); - - - //The handle that we need to follow to get the next reachable nodes - //If the current node is a node, then its just the node. Otherwise, it's the - //opposite side of the child chain - handle_t current_end_handle = current_index.first == SnarlDistanceIndex::TEMP_NODE ? - graph->get_handle(current_index.second, current_rev) : - (current_rev ? graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, - !temp_index.temp_chain_records[current_index.second].start_node_rev) - : graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, - temp_index.temp_chain_records[current_index.second].end_node_rev)); - -#ifdef debug_distance_indexing - cerr << " at child " << temp_index.structure_start_end_as_string(current_index) << " going " - << (current_rev ? 
"rev" : "fd") << " at actual node " << graph->get_id(current_end_handle) - << (graph->get_is_reverse(current_end_handle) ? "rev" : "fd") << endl; -#endif - graph->follow_edges(current_end_handle, false, [&](const handle_t next_handle) { - if (graph->get_id(current_end_handle) == graph->get_id(next_handle)){ - //If this loops onto the same node side then this isn't a simple snarl - temp_snarl_record.is_simple = false; - } else if ((current_index.first == SnarlDistanceIndex::TEMP_NODE ? current_index.second - : (current_rev ? temp_index.temp_chain_records[current_index.second].end_node_id - : temp_index.temp_chain_records[current_index.second].start_node_id)) - == graph->get_id(next_handle)){ - //If this loops to the other end of the chain then this isn't a simple snarl - temp_snarl_record.is_simple = false; - } else if (!temp_snarl_record.is_root_snarl && start_rank == 0 && - current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.end_node_id) { - //If the starting point of this traversal was the start of the snarl, the current starting point is not the start node, - //and we found another child, then this is not a simple snarl - temp_snarl_record.is_simple = false; - } else if (!temp_snarl_record.is_root_snarl && start_rank == 1 && - current_index != start_index && graph->get_id(next_handle) != temp_snarl_record.start_node_id) { - //If the starting point of this traversal was the end of the snarl, the current starting point is not the end node, - //and we found another child, then this is not a simple snarl - temp_snarl_record.is_simple = false; - } - - reachable_node_count++; - //At each of the nodes reachable from the current one, fill in the distance from the start - //node to the next node (current_distance). 
If this handle isn't leaving the snarl, - //add the next nodes along with the distance to the end of the next node - auto& node_record = temp_index.temp_node_records.at(graph->get_id(next_handle)-temp_index.min_node_id); - - //The index of the snarl's child that next_handle represents - pair next_index = - get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); - - bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip - : temp_index.temp_chain_records.at(start_index.second).is_tip; - - //The rank and orientation of next in the snarl - size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE - ? node_record.rank_in_parent - : temp_index.temp_chain_records[next_index.second].rank_in_parent; - if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.start_node_id) { - next_rank = 0; - } else if (next_index.first == SnarlDistanceIndex::TEMP_NODE && next_index.second == temp_snarl_record.end_node_id) { - next_rank = 1; - } else { - //If the next thing wasn't a boundary node and this was an internal node, then it isn't a simple snarl - if (is_internal_node) { - temp_snarl_record.is_simple = false; - } - }//TODO: This won't be true of root snarls - //else { - // assert(next_rank != 0 && next_rank != 1); - //} - bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial - ? 
graph->get_is_reverse(next_handle) - : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; - - /**Record the distance **/ - bool start_is_boundary = !temp_snarl_record.is_root_snarl && (start_rank == 0 || start_rank == 1); - bool next_is_boundary = !temp_snarl_record.is_root_snarl && (next_rank == 0 || next_rank == 1); - - if (size_limit != 0 && - (temp_snarl_record.node_count <= size_limit || start_is_boundary || next_is_boundary)) { - //If the snarl is too big, then we don't record distances between internal nodes - //If we are looking at all distances or we are looking at boundaries - bool added_new_distance = false; - - //Set the distance - pair start = start_is_boundary - ? make_pair(start_rank, false) : make_pair(start_rank, !start_rev); - pair next = next_is_boundary - ? make_pair(next_rank, false) : make_pair(next_rank, next_rev); - if (start_is_boundary && next_is_boundary) { - //If it is between bounds of the snarl, then the snarl stores it - if (start_rank == 0 && next_rank == 0 && - temp_snarl_record.distance_start_start == std::numeric_limits::max()) { - temp_snarl_record.distance_start_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rank == 1 && - temp_snarl_record.distance_end_end == std::numeric_limits::max()) { - temp_snarl_record.distance_end_end = current_distance; - added_new_distance = true; - } else if (((start_rank == 0 && next_rank == 1) || (start_rank == 1 && next_rank == 0)) - && temp_snarl_record.min_length == std::numeric_limits::max()){ - temp_snarl_record.min_length = current_distance; - added_new_distance = true; - - } - } else if (start_is_boundary){ - //If start is a boundary node - if (next_index.first == SnarlDistanceIndex::TEMP_NODE) { - //Next is a node - auto& temp_node_record = temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id); - if (start_rank == 0 && !next_rev && - temp_node_record.distance_left_start == 
std::numeric_limits::max()) { - temp_node_record.distance_left_start = current_distance; - added_new_distance = true; - } else if (start_rank == 0 && next_rev && - temp_node_record.distance_right_start == std::numeric_limits::max()) { - temp_node_record.distance_right_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && !next_rev && - temp_node_record.distance_left_end == std::numeric_limits::max()) { - temp_node_record.distance_left_end = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rev && - temp_node_record.distance_right_end == std::numeric_limits::max()) { - temp_node_record.distance_right_end = current_distance; - added_new_distance = true; - } - } else { - //Next is a chain - auto& temp_chain_record = temp_index.temp_chain_records.at(next_index.second); - if (start_rank == 0 && !next_rev && - temp_chain_record.distance_left_start == std::numeric_limits::max()) { - temp_chain_record.distance_left_start = current_distance; - added_new_distance = true; - } else if (start_rank == 0 && next_rev && - temp_chain_record.distance_right_start == std::numeric_limits::max()) { - temp_chain_record.distance_right_start = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && !next_rev && - temp_chain_record.distance_left_end == std::numeric_limits::max()) { - temp_chain_record.distance_left_end = current_distance; - added_new_distance = true; - } else if (start_rank == 1 && next_rev && - temp_chain_record.distance_right_end == std::numeric_limits::max()) { - temp_chain_record.distance_right_end = current_distance; - added_new_distance = true; - } - } - } else if (!next_is_boundary && !temp_snarl_record.distances.count(make_pair(start, next))) { - //Otherwise the snarl stores it in its distance - //If the distance isn't from an internal node to a bound and we haven't stored the distance yet - - temp_snarl_record.distances[make_pair(start, next)] = current_distance; - 
added_new_distance = true; -#ifdef debug_distance_indexing - cerr << " Adding distance between ranks " << start.first << " " << start.second << " and " << next.first << " " << next.second << ": " << current_distance << endl; -#endif - } - if (added_new_distance) { - temp_snarl_record.max_distance = std::max(temp_snarl_record.max_distance, current_distance); - } - } - - - /**Add the next node to the priority queue**/ - - if (visited_nodes.count(make_pair(next_index, next_rev)) == 0 && - graph->get_id(next_handle) != temp_snarl_record.start_node_id && - graph->get_id(next_handle) != temp_snarl_record.end_node_id - ) { - //If this isn't leaving the snarl, - //then add the next node to the queue, along with the distance to traverse it - size_t next_node_length = next_index.first == SnarlDistanceIndex::TEMP_NODE ? graph->get_length(next_handle) : - temp_index.temp_chain_records[next_index.second].min_length; - if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN && - temp_index.temp_chain_records[next_index.second].chain_components.back() != 0) { - //If there are multiple components, then the chain is not start-end reachable so its length - //is actually infinite - next_node_length = std::numeric_limits::max(); - } - if (next_node_length != std::numeric_limits::max()) { - queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_length), - make_pair(next_index, next_rev))); - } - } - if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { - size_t loop_distance = next_rev ? 
temp_index.temp_chain_records[next_index.second].backward_loops.back() - : temp_index.temp_chain_records[next_index.second].forward_loops.front(); - if (loop_distance != std::numeric_limits::max() && - visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && - graph->get_id(next_handle) != temp_snarl_record.start_node_id && - graph->get_id(next_handle) != temp_snarl_record.end_node_id - ) { - //If the next node can loop back on itself, then add the next node in the opposite direction - size_t next_node_len = loop_distance + 2 * graph->get_length(next_handle); - queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_len), - make_pair(next_index, !next_rev))); - } - } -#ifdef debug_distance_indexing - cerr << " reached child " << temp_index.structure_start_end_as_string(next_index) << "going " - << (next_rev ? "rev" : "fd") << " with distance " << current_distance << " for ranks " << start_rank << " " << next_rank << endl; -#endif - }); - } - if (is_internal_node && reachable_node_count != 1) { - //If this is an internal node, then it must have only one edge for it to be a simple snarl - temp_snarl_record.is_simple = false; - } - } - - /** Check the minimum length of the snarl passing through this node **/ - if (start_rank != 0 && start_rank != 1) { - - size_t child_max_length = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).node_length - : temp_index.temp_chain_records.at(start_index.second).max_length; - //The distance through the whole snarl traversing this node forwards - //(This might actually be traversing it backwards but it doesn't really matter) - - size_t dist_start_left = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_start - : temp_index.temp_chain_records.at(start_index.second).distance_left_start; - size_t dist_end_right = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_end - : temp_index.temp_chain_records.at(start_index.second).distance_right_end; - size_t dist_start_right = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_right_start - : temp_index.temp_chain_records.at(start_index.second).distance_right_start; - size_t dist_end_left = start_index.first == SnarlDistanceIndex::TEMP_NODE - ? temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).distance_left_end - : temp_index.temp_chain_records.at(start_index.second).distance_left_end; - - size_t snarl_length_fd = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - dist_start_left, dist_end_right),child_max_length); - //The same thing traversing this node backwards - size_t snarl_length_rev = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( - dist_start_right, dist_end_left), child_max_length); - //The max that isn't infinite - size_t max_length = - snarl_length_rev == std::numeric_limits::max() - ? snarl_length_fd - : (snarl_length_fd == std::numeric_limits::max() - ? snarl_length_rev - : std::max(snarl_length_rev, snarl_length_fd)); - if (max_length != std::numeric_limits::max()) { - temp_snarl_record.max_length = std::max(temp_snarl_record.max_length, max_length); - } - if ( temp_snarl_record.is_simple && - ! 
((dist_start_left == 0 && dist_end_right == 0 && dist_end_left == std::numeric_limits::max() && dist_start_right == std::numeric_limits::max() ) || - (dist_start_left == std::numeric_limits::max() && dist_end_right == std::numeric_limits::max() && dist_end_left == 0 && dist_start_right == 0 ))){ - //If the snarl is simple, double check that this node is actually simple: that it can only be traversed going - //across the nsarl - temp_snarl_record.is_simple = false; - } - } - } - - - //If this is a simple snarl (one with only single nodes that connect to the start and end nodes), then - // we want to remember if the child nodes are reversed - if (temp_snarl_record.is_simple) { - for (size_t i = 0 ; i < temp_snarl_record.node_count ; i++) { - //Get the index of the child - const pair& child_index = temp_snarl_record.children[i]; - //Which is a node -#ifdef debug_distance_indexing - assert(child_index.first == SnarlDistanceIndex::TEMP_NODE); -#endif - - //And get the record - SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = - temp_index.temp_node_records[child_index.second-temp_index.min_node_id]; - size_t rank =temp_node_record.rank_in_parent; - - - - //Set the orientation of this node in the simple snarl - temp_node_record.reversed_in_parent = temp_node_record.distance_left_start == std::numeric_limits::max(); - - } - } - - //Now that the distances are filled in, predict the size of the snarl in the index - temp_index.max_index_size += temp_snarl_record.get_max_record_length(); - if (temp_snarl_record.is_simple) { - temp_index.max_index_size -= (temp_snarl_record.children.size() * SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord::get_max_record_length()); - } - - // For simple snarl records, need 11 + 11 + number of bits for the number of children - temp_index.max_bits = std::max(temp_index.max_bits, 22 + SnarlDistanceIndex::bit_width(temp_snarl_record.children.size())); -} - - -//Given an alignment to a graph and a 
range, find the set of nodes in the -//graph for which the minimum distance from the position to any position -//in the node is within the given distance range -//If look_forward is true, then start from the start of the path forward, -//otherwise start from the end going backward -void subgraph_in_distance_range(const SnarlDistanceIndex& distance_index, const Path& path, const HandleGraph* super_graph, size_t min_distance, - size_t max_distance, std::unordered_set& subgraph, bool look_forward){ - - //The position we're starting from - either the start or end of the path - pos_t start_pos; - size_t node_len; - if (look_forward ){ - start_pos = initial_position(path); - node_len = super_graph->get_length(super_graph->get_handle(get_id(start_pos))); +size_t minimum_nontrivial_distance(const SnarlDistanceIndex &distance_index, + pos_t pos1, pos_t pos2, size_t pos2_length, + const HandleGraph *graph) { + bool shifted = false; + if (pos1 == pos2) { + if (pos2_length == std::numeric_limits::max()) { + // If we don't know the length, we can get it from the graph + pos2_length = distance_index.minimum_length( + distance_index.get_node_net_handle(id(pos2))); + } + // Must shift one position to avoid self-distance of 0 + if (offset(pos1) == pos2_length) { + // Shift ending pos backward (not safe to shift forward) + get_offset(pos2)--; } else { - start_pos = final_position(path); - node_len = super_graph->get_length(super_graph->get_handle(get_id(start_pos))); - start_pos = reverse_base_pos(start_pos, node_len); - } - pair traversal_start = std::make_pair(get_id(start_pos), get_is_rev(start_pos)); - -#ifdef debug_subgraph -cerr << endl << "Find subgraph in distance range " << min_distance << " to " << max_distance << endl; -cerr << "Start positon: "<< start_pos << endl; -#endif - //The distance from the position to the ends of the current node(/snarl/chain) - size_t current_distance_left = is_rev(start_pos) ? 
node_len - get_offset(start_pos) : std::numeric_limits::max() ; - size_t current_distance_right = is_rev(start_pos) ? std::numeric_limits::max() : node_len - get_offset(start_pos) ; - - //Graph node of the start and end of the current node(/snarl/chain) pointing out - net_handle_t current_net = distance_index.get_node_net_handle(get_id(start_pos)); - net_handle_t parent = distance_index.start_end_traversal_of(distance_index.get_parent(current_net)); - - //The id and orientation of nodes that are too close and should be avoided - hash_set> seen_nodes; - //Nodes that we want to start a search from - the distance is smaller or equal to than min_distance but - //we can't walk out any further along the snarl tree without exceeding it - //The distance is the distance from the start position to the beginning (or end if its backwards) of the node, - //including the position - vector> search_start_nodes; - - if (((current_distance_left != std::numeric_limits::max() && current_distance_left > min_distance) || - (current_distance_right != std::numeric_limits::max() && current_distance_right > min_distance)) || - (distance_index.is_trivial_chain(parent) - && distance_index.distance_in_parent(distance_index.get_parent(parent), parent, distance_index.flip(parent)) == 0 - && node_len*2 > min_distance)) { - //If the distance to either end of the node is within the range - //Or of there is a loop on the node ( a duplication of just the node) and the node length would put one loop in the distance range - - //Add this node to the subgraph - subgraph.emplace(get_id(start_pos)); - - handle_t start = is_rev(start_pos) ? distance_index.get_handle(distance_index.flip(current_net), super_graph) - : distance_index.get_handle(current_net, super_graph); - - //Add any node one step out from this one to search_start_nodes - super_graph->follow_edges(start, - false, [&](const handle_t& next_handle) { - search_start_nodes.emplace_back(next_handle, is_rev(start_pos) ? 
current_distance_left : current_distance_right); - }); - - //Search for reachable nodes - subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); - - return; + // Shift starting position forward + get_offset(pos1)++; } + shifted = true; + } - - while (!distance_index.is_root(parent)) { -#ifdef debug_subgraph - cerr << "At child " << distance_index.net_handle_as_string(current_net) << " with distances " << current_distance_left << " " << current_distance_right << endl; -#endif - - size_t max_parent_length = distance_index.maximum_length(parent); - - - //Distances to get to the ends of the parent - size_t distance_start_left = SnarlDistanceIndex::sum(current_distance_left, - distance_index.distance_to_parent_bound(parent, true, distance_index.flip(current_net))); - size_t distance_start_right = SnarlDistanceIndex::sum(current_distance_right, - distance_index.distance_to_parent_bound(parent, true, current_net)); - size_t distance_end_left = SnarlDistanceIndex::sum(current_distance_left, - distance_index.distance_to_parent_bound(parent, false, distance_index.flip(current_net))); - size_t distance_end_right = SnarlDistanceIndex::sum(current_distance_right, - distance_index.distance_to_parent_bound(parent, false, current_net)); - - if ((current_distance_right != std::numeric_limits::max() && current_distance_right >= min_distance) - || (current_distance_left != std::numeric_limits::max() && current_distance_left >= min_distance) - || (distance_start_right != std::numeric_limits::max() && distance_start_right>= min_distance) - || (distance_end_right != std::numeric_limits::max() && distance_end_right >= min_distance) - || (distance_start_left != std::numeric_limits::max() && distance_start_left >= min_distance) - || (distance_end_left != std::numeric_limits::max() && distance_end_left >= min_distance) - || (max_parent_length != std::numeric_limits::max() && max_parent_length >= 
min_distance)) { - //If the min distance will be exceeded within this parent, then start a search from the ends of this child - - if (distance_index.is_snarl(parent)) { - //If this is the child of a snarl, then just traverse from the end of the node -#ifdef debug_subgraph -cerr << "Start search in parent " << distance_index.net_handle_as_string(parent); -#endif - if (current_distance_left != std::numeric_limits::max() ){ - //If we can go left - net_handle_t bound = distance_index.is_node(current_net) ? distance_index.flip(current_net) - : distance_index.get_bound(current_net, false, false); - if (distance_index.is_sentinel(bound)) { - bound = distance_index.get_node_from_sentinel(bound); - } - handle_t current_node = distance_index.get_handle(bound, super_graph); - //Add everything immediately after the left bound of this node/chain - super_graph->follow_edges(distance_index.get_handle(bound, super_graph), - false, [&](const handle_t& next_handle) { - seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); - search_start_nodes.emplace_back(next_handle,current_distance_left); + size_t distance = minimum_distance(distance_index, pos1, pos2, false, graph); + if (shifted && distance != std::numeric_limits::max()) { + // This loop is possible, so add back in the shift + distance++; + } - }); - -#ifdef debug_subgraph - cerr << " going left from " << super_graph->get_id(current_node) << (super_graph->get_is_reverse(current_node) ? "rev " : "fd ") ; -#endif - } - if (current_distance_right != std::numeric_limits::max()) { - //If we can go right - net_handle_t bound = distance_index.is_node(current_net) ? 
current_net - : distance_index.get_bound(current_net, true, false); - if (distance_index.is_sentinel(bound)) { - bound = distance_index.get_node_from_sentinel(bound); - } - handle_t current_node = distance_index.get_handle(bound, super_graph); - - //Add everything immediately after the right bound of this node/chain - super_graph->follow_edges(distance_index.get_handle(bound, super_graph), - false, [&](const handle_t& next_handle) { - seen_nodes.erase(make_pair(super_graph->get_id(next_handle),super_graph->get_is_reverse(next_handle))); - search_start_nodes.emplace_back(next_handle, current_distance_right); - }); - -#ifdef debug_subgraph - cerr << " going right from " << super_graph->get_id(current_node) << (super_graph->get_is_reverse(current_node) ? "rev " : "fd "); -#endif - } -#ifdef debug_subgraph - cerr << endl; -#endif - } else { -#ifdef debug_subgraph -cerr << "Start search along parent chain " << distance_index.net_handle_as_string(parent); -#endif - //If this is the child of a chain, then traverse along the chain - if (current_distance_left != std::numeric_limits::max()) { - subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, - distance_index.flip(current_net), current_distance_left, search_start_nodes, seen_nodes, min_distance, max_distance, false); - } - if (current_distance_right != std::numeric_limits::max()) { - subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, - current_net, current_distance_right, search_start_nodes, seen_nodes, min_distance, max_distance, false); - } - } - subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); - return; - } else if (distance_index.is_snarl(parent)){ - //TODO: This might be overkill. 
It prevents us from adding nodes that shouldn't be in the subgraph, but might be too slow - //If we don't check the other direction, go through the loop and add everything whose distance is lower than the minimum - //to seen_nodes - vector> loop_handles_to_check; - handle_t start_out = distance_index.get_handle(distance_index.get_bound(parent, false, false), super_graph); - handle_t end_out = distance_index.get_handle(distance_index.get_bound(parent, true, false), super_graph); - if (current_distance_left != std::numeric_limits::max()) { - loop_handles_to_check.emplace_back(distance_index.get_handle(distance_index.get_bound(current_net, false, false), super_graph), current_distance_left); - } - if (current_distance_right != std::numeric_limits::max()) { - loop_handles_to_check.emplace_back(distance_index.get_handle(distance_index.get_bound(current_net, true, false), super_graph), current_distance_right); - } - while (!loop_handles_to_check.empty()) { - handle_t current_loop_handle = loop_handles_to_check.back().first; - size_t current_loop_distance = loop_handles_to_check.back().second; - loop_handles_to_check.pop_back(); - - //Add to seen_nodes - seen_nodes.emplace(super_graph->get_id(current_loop_handle), super_graph->get_is_reverse(current_loop_handle)); - - //Walk one step out from this node - super_graph->follow_edges(current_loop_handle, false, [&](const handle_t& next_handle) { - //If the next node is close enough and isn't exiting the snarl, then add it to stack - size_t new_distance = SnarlDistanceIndex::sum(current_loop_distance, super_graph->get_length(next_handle)); - if (new_distance < min_distance && next_handle != start_out && next_handle != end_out) { - loop_handles_to_check.emplace_back(next_handle, new_distance); - } - }); - } - } else if (distance_index.is_chain(parent)) { - //TODO: This is probably also overkill - walk a chain if there is a viable loop - size_t distance_loop_right = distance_index.distance_in_parent(parent, current_net, 
current_net, super_graph, max_distance); - size_t distance_loop_left = distance_index.distance_in_parent(parent, distance_index.flip(current_net), distance_index.flip(current_net), super_graph, max_distance); - if ((current_distance_left != std::numeric_limits::max() && distance_loop_left != std::numeric_limits::max()) || - (current_distance_right != std::numeric_limits::max() && distance_loop_right != std::numeric_limits::max())) { - //If there is a loop that we can take, then take it - if (current_distance_left != std::numeric_limits::max()) { - subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, - distance_index.flip(current_net), current_distance_left, search_start_nodes, seen_nodes, min_distance, max_distance, false); - } - if (current_distance_right != std::numeric_limits::max()) { - subgraph_in_distance_range_walk_across_chain (distance_index, super_graph, subgraph, - current_net, current_distance_right, search_start_nodes, seen_nodes, min_distance, max_distance, false); - } - subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); - return; - } - } - - //Remember the bounds of this child so we don't return to it - if (current_distance_left != std::numeric_limits::max() ){ - //If we can go left - net_handle_t bound = distance_index.is_node(current_net) ? distance_index.flip(current_net) - : distance_index.get_bound(current_net, false, false); - if (distance_index.is_sentinel(bound)) { - bound = distance_index.get_node_from_sentinel(bound); - } - handle_t current_node = distance_index.get_handle(bound, super_graph); - seen_nodes.emplace(super_graph->get_id(current_node), super_graph->get_is_reverse(current_node)); - } - if (current_distance_right != std::numeric_limits::max()) { - //If we can go right - net_handle_t bound = distance_index.is_node(current_net) ? 
current_net - : distance_index.get_bound(current_net, true, false); - if (distance_index.is_sentinel(bound)) { - bound = distance_index.get_node_from_sentinel(bound); - } - handle_t current_node = distance_index.get_handle(bound, super_graph); - seen_nodes.emplace(super_graph->get_id(current_node), super_graph->get_is_reverse(current_node)); - } - - current_distance_left = std::min(distance_start_left, distance_start_right); - current_distance_right = std::min(distance_end_left, distance_end_right); - - current_net = std::move(parent); - parent = distance_index.canonical(distance_index.get_parent(current_net)); - } - if (current_distance_left <= min_distance) { -#ifdef debug_subgraph - cerr << "Adding the end of a child of the root " << distance_index.net_handle_as_string(distance_index.get_bound(current_net, false, false)) << " with distance " << current_distance_left << endl; -#endif - - handle_t bound = distance_index.get_handle(distance_index.get_bound(current_net, false, false), super_graph); - search_start_nodes.emplace_back(bound, current_distance_left); - } - if (current_distance_right <= min_distance) { -#ifdef debug_subgraph - cerr << "Adding the end of a child of the root " << distance_index.net_handle_as_string(distance_index.get_bound(current_net, false, false)) << " with distance " << current_distance_right << endl; -#endif - handle_t bound = distance_index.get_handle(distance_index.get_bound(current_net, true, false), super_graph); - search_start_nodes.emplace_back(bound,current_distance_right); - } - subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, subgraph, search_start_nodes, seen_nodes, traversal_start); - - return; + return distance; } - -///Helper for subgraph_in_distance_range -///Given starting handles in the super graph and the distances to each handle (including the start position and -//the first position in the handle), add all nodes within the distance range, excluding nodes in seen_nodes -void 
subgraph_in_distance_range_walk_graph(const HandleGraph* super_graph, size_t min_distance, size_t max_distance, - std::unordered_set& subgraph, vector>& start_nodes, - hash_set>& seen_nodes, const pair& traversal_start) { -#ifdef debug_subgraph - cerr << "Starting search from nodes " << endl; - for (auto& start_handle : start_nodes) { - cerr << "\t" << super_graph->get_id(start_handle.first) << " " << super_graph->get_is_reverse(start_handle.first) - << " with distance " << start_handle.second << endl; - } -#endif - - //Order based on the distance to the position (handle) - auto cmp = [] (const pair a, const pair b ) { - return a.second > b.second; - }; - priority_queue< pair, vector>, decltype(cmp)> next_handles (cmp); - for (auto& start_handle : start_nodes) { - next_handles.emplace(start_handle); - } - bool first_node = true; - - while (next_handles.size() > 0) { - //Traverse the graph, adding nodes if they are within the range - handle_t curr_handle=next_handles.top().first; - size_t curr_distance=next_handles.top().second; - next_handles.pop(); -#ifdef debug_subgraph - cerr << "At node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) << " with distance " << curr_distance << endl; -#endif - if (seen_nodes.count(make_pair(super_graph->get_id(curr_handle), super_graph->get_is_reverse(curr_handle))) == 0) { - seen_nodes.emplace(super_graph->get_id(curr_handle), super_graph->get_is_reverse(curr_handle)); - - size_t node_len = super_graph->get_length(curr_handle); - size_t curr_distance_end = SnarlDistanceIndex::sum(curr_distance, node_len)-1; - if ((curr_distance >= min_distance && curr_distance <= max_distance) || - (curr_distance_end >= min_distance && curr_distance_end <= max_distance) || - (curr_distance <= min_distance && curr_distance_end >= max_distance)) { -#ifdef debug_subgraph - cerr << "\tadding node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) << " with distance " - << 
curr_distance << " and node length " << node_len << endl; -#endif - subgraph.insert(super_graph->get_id(curr_handle)); - - } -#ifdef debug_subgraph - else { - cerr << "\tdisregarding node " << super_graph->get_id(curr_handle) << " " << super_graph->get_is_reverse(curr_handle) - << " with distance " << curr_distance << " and node length " << node_len << endl; - } -#endif - curr_distance = SnarlDistanceIndex::sum(node_len, curr_distance); - - //If the end of this node is still within the range, add the next nodes that are within - //Also check that the node we're currently at isn't the start node - if (SnarlDistanceIndex::minus(curr_distance,1) <= max_distance) { - super_graph->follow_edges(curr_handle, false, [&](const handle_t& next) { - nid_t next_id = super_graph->get_id(next); - if (seen_nodes.count(make_pair(next_id, super_graph->get_is_reverse(next))) == 0) { - next_handles.emplace(next, curr_distance); - } - return true; - }); - } - first_node = false; - } -#ifdef debug_subgraph - else { - cerr << "\tthe node was already seen" << endl; - } -#endif - - } - -#ifdef debug_subgraph - cerr << "Subgraph has nodes: "; - for (const nid_t& node : subgraph) { - cerr << node << ", "; - } - cerr << endl; -#endif - return; +size_t maximum_distance(const SnarlDistanceIndex &distance_index, pos_t pos1, + pos_t pos2) { + return distance_index.maximum_distance(get_id(pos1), get_is_rev(pos1), + get_offset(pos1), get_id(pos2), + get_is_rev(pos2), get_offset(pos2)); } -//helper function to walk along a chain from the current node until the distance traversed -//exceeds the minimum limit. 
Add the node just before this happens to search_start_nodes -void subgraph_in_distance_range_walk_across_chain (const SnarlDistanceIndex& distance_index, const HandleGraph* super_graph, - std::unordered_set& subgraph, net_handle_t current_node, - size_t current_distance, vector>& search_start_nodes, hash_set>& seen_nodes, - const size_t& min_distance, const size_t& max_distance, bool checked_loop){ -#ifdef debug_subgraph - cerr << "Walk along parent chain " << distance_index.net_handle_as_string(distance_index.get_parent(current_node)) << " from " << distance_index.net_handle_as_string(current_node) << " with " << current_distance << endl; -#endif - if (distance_index.is_trivial_chain(distance_index.get_parent(current_node))){ - return; - } - bool finished_chain = false; - bool added_nodes = false; //Did we start a search? if not, add the last node in the chain - while (current_distance <= min_distance && !finished_chain) { - finished_chain = distance_index.follow_net_edges(current_node, super_graph, false, - [&](const net_handle_t& next) { - size_t next_length = distance_index.minimum_length(next); - //If the next child is a snarl, then the distance to loop in the snarl - if (distance_index.is_snarl(next)) { - net_handle_t bound_fd = distance_index.get_bound(next, distance_index.ends_at(next) == SnarlDistanceIndex::START, true); - size_t next_loop = distance_index.distance_in_parent(next, bound_fd, bound_fd, super_graph, max_distance); - if (!checked_loop && next_loop != std::numeric_limits::max()) { -#ifdef debug_subgraph - cerr << "\tsnarl loops so also check the other direction" << endl; -#endif - //If we haven't yet checked the chain in the other direction and this snarl allows us to loop - if ( SnarlDistanceIndex::sum(next_loop, current_distance) != std::numeric_limits::max() && - SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(next_loop, - current_distance), - distance_index.node_length(current_node)) >= min_distance) { -#ifdef debug_subgraph - cerr << "\t\t 
add the current node" << endl; -#endif - //If the loop will put us over the edge, then start from the current node - super_graph->follow_edges(distance_index.get_handle(current_node, super_graph), false, [&](const handle_t& next_handle) { - search_start_nodes.emplace_back(next_handle,current_distance); - }); - return true; - } else { - //Otherwise, switch direction in the chain and walk along it again - subgraph_in_distance_range_walk_across_chain(distance_index, super_graph, subgraph, distance_index.flip(current_node), - SnarlDistanceIndex::sum(SnarlDistanceIndex::sum(current_distance, - next_loop), - distance_index.node_length(current_node)), - search_start_nodes, seen_nodes, min_distance, max_distance, true); - checked_loop = true; - } - } - if (next_loop != std::numeric_limits::max()){ - //TODO: This might be overkill. It prevents us from adding nodes that shouldn't be in the subgraph, but might be too slow - //If we don't check the other direction, go through the loop and add everything whose distance is lower than the minimum - //to seen_nodes - vector> loop_handles_to_check; - handle_t start_out = distance_index.get_handle(distance_index.get_bound(next, false, false), super_graph); - handle_t end_out = distance_index.get_handle(distance_index.get_bound(next, true, false), super_graph); - loop_handles_to_check.emplace_back(distance_index.get_handle(bound_fd, super_graph), current_distance); - while (!loop_handles_to_check.empty()) { - handle_t current_loop_handle = loop_handles_to_check.back().first; - size_t current_loop_distance = loop_handles_to_check.back().second; - loop_handles_to_check.pop_back(); - - //Add to seen_nodes - seen_nodes.emplace(super_graph->get_id(current_loop_handle), super_graph->get_is_reverse(current_loop_handle)); - - //Walk one step out from this node - super_graph->follow_edges(current_loop_handle, false, [&](const handle_t& next_handle) { - //If the next node is close enough and isn't exiting the snarl, then add it to stack - 
size_t new_distance = SnarlDistanceIndex::sum(current_loop_distance, super_graph->get_length(next_handle)); - if (new_distance < min_distance && next_handle != start_out && next_handle != end_out) { - loop_handles_to_check.emplace_back(next_handle, new_distance); - } - }); - } - - } - } - size_t next_max_length = distance_index.maximum_length(next); -#ifdef debug_subgraph - cerr << "\tnext node: " << distance_index.net_handle_as_string(next) << " with distance " << current_distance << " and min and max lengths " << next_length << " " << next_max_length << endl; -#endif - if (( SnarlDistanceIndex::sum(next_max_length, current_distance) != std::numeric_limits::max() && - SnarlDistanceIndex::sum(next_max_length, current_distance) >= min_distance)){ - if (distance_index.is_node(next)) { - size_t curr_distance_end = SnarlDistanceIndex::minus(SnarlDistanceIndex::sum(next_max_length, current_distance),1); - //If its a node that puts us over, add the node to the subgraph, then start the search from that node -#ifdef debug_subgraph - cerr << "\t\tAdding node from a chain " << distance_index.net_handle_as_string(next) << " with distance " << current_distance << endl; -#endif - if ((current_distance >= min_distance && current_distance <= max_distance) || - (curr_distance_end >= min_distance && curr_distance_end <= max_distance) || - (current_distance <= min_distance && curr_distance_end >= max_distance)) { - subgraph.emplace(distance_index.node_id(next)); - } - super_graph->follow_edges(distance_index.get_handle(next, super_graph), false, [&](const handle_t& next_handle) { - search_start_nodes.emplace_back(next_handle, SnarlDistanceIndex::sum(current_distance, next_length)); - seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); - }); - } else { - //If it's a snarl, then we'll start from the last node -#ifdef debug_subgraph - cerr << "\t\tAdding node from a chain " << distance_index.net_handle_as_string(next) << " with 
distance " << current_distance << endl; -#endif - super_graph->follow_edges(distance_index.get_handle(current_node, super_graph), false, [&](const handle_t& next_handle) { - search_start_nodes.emplace_back(next_handle,current_distance); - seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); - }); - } - //If we added something, stop traversing the chain - added_nodes = true; - return true; - } else if (distance_index.is_node(next)) { - seen_nodes.emplace(distance_index.node_id(next), distance_index.ends_at(next) == SnarlDistanceIndex::START); - } - current_node = next; - current_distance = SnarlDistanceIndex::sum(next_length, current_distance); - if (current_distance > max_distance) { - added_nodes = true; - return true; - } else { - return false; - } - }); - } - if (!added_nodes && current_distance <= max_distance) { - //If we haven't added anything and haven't exceeded the distance limit, then start from the end of the chain - handle_t bound = distance_index.get_handle(current_node, super_graph); - super_graph->follow_edges(bound, false, [&](const handle_t& next_handle) { - search_start_nodes.emplace_back(next_handle,current_distance); - seen_nodes.erase(make_pair(super_graph->get_id(next_handle), super_graph->get_is_reverse(next_handle))); - }); - //seen_nodes.erase(make_pair(super_graph->get_id(bound), super_graph->get_is_reverse(bound))); - //search_start_nodes.emplace_back( bound, current_distance); - } -}; - - -void subgraph_containing_path_snarls(const SnarlDistanceIndex& distance_index, const HandleGraph* graph, const Path& path, std::unordered_set& subgraph) { - //Get the start and end of the path - pos_t start_pos = initial_position(path); - net_handle_t start_node = distance_index.get_node_net_handle(get_id(start_pos)); - subgraph.insert(get_id(start_pos)); - - pos_t end_pos = final_position(path); - net_handle_t end_node = distance_index.get_node_net_handle(get_id(end_pos)); - 
subgraph.insert(get_id(end_pos)); - - //Get the lowest common ancestor - pair lowest_ancestor_bool = distance_index.lowest_common_ancestor(start_node, end_node); - net_handle_t common_ancestor = lowest_ancestor_bool.first; - - - if (distance_index.is_snarl(common_ancestor) || common_ancestor == start_node) { - //If the lowest common ancestor is a snarl, just add the entire snarl - - add_descendants_to_subgraph(distance_index, common_ancestor, subgraph); - - } else if (distance_index.is_chain(common_ancestor)) { - - //Get the ancestors of the nodes that are children of the common ancestor - net_handle_t ancestor1 = distance_index.canonical(distance_index.get_parent(start_node)); - while (ancestor1 != common_ancestor) { - start_node = ancestor1; - ancestor1 = distance_index.canonical(distance_index.get_parent(start_node)); - } - net_handle_t ancestor2 = distance_index.canonical(distance_index.get_parent(end_node)); - while (ancestor2 != common_ancestor) { - end_node = ancestor2; - ancestor2 = distance_index.canonical(distance_index.get_parent(end_node)); - } -#ifdef debug_distance_indexing - assert(ancestor1 == ancestor2); -#endif - - - //Walk from one ancestor to the other and add everything in the chain - net_handle_t current_child = distance_index.canonical(distance_index.is_ordered_in_chain(start_node, end_node) ? start_node : end_node); - net_handle_t end_child = distance_index.canonical(distance_index.is_ordered_in_chain(start_node, end_node) ? 
end_node : start_node); - if (distance_index.is_reversed_in_parent(current_child)) { - current_child = distance_index.flip(current_child); - } - if (distance_index.is_reversed_in_parent(end_child)) { - end_child = distance_index.flip(end_child); - } - - add_descendants_to_subgraph(distance_index, current_child, subgraph); - while (current_child != end_child) { - distance_index.follow_net_edges(current_child, graph, false, [&](const net_handle_t& next) { - add_descendants_to_subgraph(distance_index, next, subgraph); - current_child = next; - - }); - } - - } - +void fill_in_distance_index(SnarlDistanceIndex *distance_index, + const HandleGraph *graph, + const HandleGraphSnarlFinder *snarl_finder, + size_t size_limit, + bool only_top_level_chain_distances, + bool silence_warnings) { + distance_index->set_snarl_size_limit(size_limit); + distance_index->set_only_top_level_chain_distances( + only_top_level_chain_distances); + + // Build the temporary distance index from the graph + SnarlDistanceIndex::TemporaryDistanceIndex temp_index = + make_temporary_distance_index(graph, snarl_finder, size_limit, + only_top_level_chain_distances); + + if (!silence_warnings && temp_index.use_oversized_snarls) { + cerr << "warning: distance index uses oversized snarls, (the biggest has " + << temp_index.most_oversized_snarl_size + << " nodes), which may make mapping slow" << endl; + cerr << "\ttry increasing --snarl-limit when building the distance index" + << endl; + } + + // And fill in the permanent distance index + vector indexes; + indexes.emplace_back(&temp_index); + distance_index->get_snarl_tree_records(indexes, graph); } - -//Recursively add all nodes in parent to the subgraph -void add_descendants_to_subgraph(const SnarlDistanceIndex& distance_index, const net_handle_t& parent, std::unordered_set& subgraph) { - if (distance_index.is_node(parent)) { - subgraph.insert(distance_index.node_id(parent)); - } else { - distance_index.for_each_child(parent, [&](const net_handle_t& 
child) { - add_descendants_to_subgraph(distance_index, child, subgraph); - }); - } +void subgraph_containing_path_snarls(const SnarlDistanceIndex &distance_index, + const HandleGraph *graph, const Path &path, + std::unordered_set &subgraph) { + // Get the start and end of the path + pos_t start_pos = initial_position(path); + net_handle_t start_node = + distance_index.get_node_net_handle(get_id(start_pos)); + subgraph.insert(get_id(start_pos)); + + pos_t end_pos = final_position(path); + net_handle_t end_node = distance_index.get_node_net_handle(get_id(end_pos)); + subgraph.insert(get_id(end_pos)); + + // Get the lowest common ancestor + pair lowest_ancestor_bool = + distance_index.lowest_common_ancestor(start_node, end_node); + net_handle_t common_ancestor = lowest_ancestor_bool.first; + + if (distance_index.is_snarl(common_ancestor) || + common_ancestor == start_node) { + // If the lowest common ancestor is a snarl, just add the entire snarl + + add_descendants_to_subgraph(distance_index, common_ancestor, subgraph); + + } else if (distance_index.is_chain(common_ancestor)) { + + // Get the ancestors of the nodes that are children of the common ancestor + net_handle_t ancestor1 = + distance_index.canonical(distance_index.get_parent(start_node)); + while (ancestor1 != common_ancestor) { + start_node = ancestor1; + ancestor1 = + distance_index.canonical(distance_index.get_parent(start_node)); + } + net_handle_t ancestor2 = + distance_index.canonical(distance_index.get_parent(end_node)); + while (ancestor2 != common_ancestor) { + end_node = ancestor2; + ancestor2 = distance_index.canonical(distance_index.get_parent(end_node)); + } +#ifdef debug_distance_indexing + assert(ancestor1 == ancestor2); +#endif + + // Walk from one ancestor to the other and add everything in the chain + net_handle_t current_child = distance_index.canonical( + distance_index.is_ordered_in_chain(start_node, end_node) ? 
start_node + : end_node); + net_handle_t end_child = distance_index.canonical( + distance_index.is_ordered_in_chain(start_node, end_node) ? end_node + : start_node); + if (distance_index.is_reversed_in_parent(current_child)) { + current_child = distance_index.flip(current_child); + } + if (distance_index.is_reversed_in_parent(end_child)) { + end_child = distance_index.flip(end_child); + } + + add_descendants_to_subgraph(distance_index, current_child, subgraph); + while (current_child != end_child) { + distance_index.follow_net_edges( + current_child, graph, false, [&](const net_handle_t &next) { + add_descendants_to_subgraph(distance_index, next, subgraph); + current_child = next; + }); + } + } } - - +// Recursively add all nodes in parent to the subgraph +void add_descendants_to_subgraph(const SnarlDistanceIndex &distance_index, + const net_handle_t &parent, + std::unordered_set &subgraph) { + if (distance_index.is_node(parent)) { + subgraph.insert(distance_index.node_id(parent)); + } else { + distance_index.for_each_child(parent, [&](const net_handle_t &child) { + add_descendants_to_subgraph(distance_index, child, subgraph); + }); + } } +} // namespace vg diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 43268d4b23..e502b9aa12 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -2,6 +2,7 @@ #define VG_SNARL_DISTANCE_HPP_INCLUDED #include +#include #include "snarls.hpp" #include #include "hash_map.hpp" @@ -36,7 +37,7 @@ void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGrap /// Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; + SnarlDistanceIndex::temp_record_ref_t snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; 
SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances); diff --git a/src/snarl_distance_index_build.cpp b/src/snarl_distance_index_build.cpp new file mode 100644 index 0000000000..7100616c04 --- /dev/null +++ b/src/snarl_distance_index_build.cpp @@ -0,0 +1,1708 @@ +//#define debug_distance_indexing +//#define debug_snarl_traversal +//#define debug_distances +//#define debug_hub_label_build +//#define debug_hub_label_storage + +#include "snarl_distance_index.hpp" +#include "snarl_distance_index_child_graph.hpp" +#include + +using namespace std; +using namespace handlegraph; +namespace vg { + +SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( + const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances) { + +#ifdef debug_distance_indexing + cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl; + +#endif + + SnarlDistanceIndex::TemporaryDistanceIndex temp_index; + + temp_index.min_node_id=graph->min_node_id(); + temp_index.max_node_id=graph->max_node_id(); + + //Construct the distance index using the snarl decomposition + //traverse_decomposition will visit all structures (including trivial snarls), calling + //each of the given functions for the start and ends of the snarls and chains + + temp_index.temp_node_records.resize(temp_index.max_node_id-temp_index.min_node_id+1); + + + + //Stores unfinished records, as type of record and offset into appropriate vector + //(temp_node/snarl/chain_records) + vector stack; + + //There may be components of the root that are connected to each other. 
Each connected component will + //get put into a (fake) root-level snarl, but we don't know what those components will be initially, + //since the decomposition just puts them in the same root snarl. This is used to group the root-level + //components into connected components that will later be used to make root snarls + structures::UnionFind root_snarl_component_uf (0); + + + /*Go through the decomposition top down and record the connectivity of the snarls and chains + * Distances will be added later*/ + + snarl_finder->traverse_decomposition( + [&](handle_t chain_start_handle) { + /*This gets called when a new chain is found, starting at the start handle going into chain + * For the first node in a chain, create a chain record and fill in the first node. + * Also add the first node record + */ +#ifdef debug_distance_indexing + cerr << " Starting new chain at " << graph->get_id(chain_start_handle) << (graph->get_is_reverse(chain_start_handle) ? " reverse" : " forward") << endl; + //We shouldn't have seen this node before + //assert(temp_index.get_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(chain_start_handle))).node_id == 0); +#endif + + //Fill in node in chain + stack.emplace_back(SnarlDistanceIndex::TEMP_CHAIN, temp_index.temp_chain_records.size()); + nid_t node_id = graph->get_id(chain_start_handle); + temp_index.temp_chain_records.emplace_back(); + auto& temp_chain = temp_index.temp_chain_records.back(); + temp_chain.start_node_id = node_id; + temp_chain.start_node_rev = graph->get_is_reverse(chain_start_handle); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + + //And the node record itself + auto& temp_node = temp_index.get_node(temp_chain.children.back()); + temp_node.node_id = node_id; + temp_node.node_length = graph->get_length(chain_start_handle); + temp_node.reversed_in_parent = graph->get_is_reverse(chain_start_handle); + temp_node.parent = stack.back(); //The parent is this chain + + }, + [&](handle_t 
chain_end_handle) { + /*This gets called at the end of a chain, facing out + * Record the chain's end node. The node record itself would have been added as part of the snarl + * Also record the chain's parent here + */ + + //Done with this chain + SnarlDistanceIndex::temp_record_ref_t chain_index = stack.back(); + stack.pop_back(); + +#ifdef debug_distance_indexing + assert(chain_index.first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(chain_index); + nid_t node_id = graph->get_id(chain_end_handle); + + if (temp_chain_record.children.size() == 1 && node_id == temp_chain_record.start_node_id) { + //This is a trivial snarl + +#ifdef debug_distance_indexing + //Then this must be the last thing on the chain_records vector + assert(temp_index.temp_chain_records.size() == chain_index.second+1); +#endif + + //Get the node + SnarlDistanceIndex::temp_record_ref_t node_index = make_pair(SnarlDistanceIndex::TEMP_NODE, node_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(node_index); + + temp_node_record.reversed_in_parent = false; + + //And give the chain's parent the node info + // + if (stack.empty()) { + temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + //If this was the last thing on the stack, then this was a root + + //Check to see if there is anything connected to the ends of the chain + vector reachable_nodes; + graph->follow_edges(graph->get_handle(node_id, false), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + graph->follow_edges(graph->get_handle(node_id, true), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + if (reachable_nodes.size()) { + //If we can reach anything leaving the chain 
(besides the chain itself), then it is part of a root snarl + //Note that if the chain's start and end node are the same, then it will always be a single component +#ifdef debug_distance_indexing + cerr << " This trivial chain is part of the root but connects with something else in the root"<::max()); +#endif + root_snarl_component_uf.union_groups(other_i, temp_node_record.root_snarl_index); +//#ifdef debug_distance_indexing +// cerr << " Union this trivial with " << temp_index.get_chain(node_record.parent).start_node_id << " " << temp_index.get_chain(node_record.parent).end_node_id << endl; +//#endif + } else { + new_component = false; + } + } + } else { + //If this chain isn't connected to anything else, then it is a single component of the root + temp_node_record.rank_in_parent = temp_index.components.size(); + temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + } + } else { + //The last thing on the stack is the parent of this chain, which must be a snarl + temp_node_record.parent = stack.back(); + auto& parent_snarl_record = temp_index.get_snarl(temp_node_record.parent); + temp_node_record.rank_in_parent = parent_snarl_record.children.size() + 2; + parent_snarl_record.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + } + + + //Remove the chain record + temp_index.temp_chain_records.pop_back(); + temp_index.max_index_size += temp_node_record.get_max_record_length(); + + } else { + //Otherwise, it is an actual chain + + //Fill in node in chain + temp_chain_record.end_node_id = node_id; + temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle); + temp_chain_record.end_node_length = graph->get_length(chain_end_handle); + + bool is_root_chain = false; + + if (stack.empty()) { + //If this was the last thing on the stack, then this was a root + is_root_chain = true; + + //Check to see if there is anything connected to the ends of the chain + vector reachable_nodes; + 
graph->follow_edges(graph->get_handle(temp_chain_record.start_node_id, !temp_chain_record.start_node_rev), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != temp_chain_record.start_node_id && + graph->get_id(next) != temp_chain_record.end_node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + graph->follow_edges(graph->get_handle(temp_chain_record.end_node_id, temp_chain_record.end_node_rev), + false, [&] (const handle_t& next) { + if (graph->get_id(next) != temp_chain_record.start_node_id && + graph->get_id(next) != temp_chain_record.end_node_id) { + reachable_nodes.emplace_back(graph->get_id(next)); + } + }); + if (reachable_nodes.size() && (temp_chain_record.is_trivial || temp_chain_record.start_node_id != temp_chain_record.end_node_id)) { + //If we can reach anything leaving the chain (besides the chain itself), then it is part of a root snarl + //Note that if the chain's start and end node are the same, then it will always be a single component +#ifdef debug_distance_indexing + cerr << " This chain is part of the root but connects with something else in the root"<::max()); +#endif + root_snarl_component_uf.union_groups(other_i, temp_chain_record.root_snarl_index); +#ifdef debug_distance_indexing + if (node_record.parent.first == SnarlDistanceIndex::TEMP_CHAIN) { + cerr << " Union this chain with " << temp_index.get_chain(node_record.parent).start_node_id << " " << temp_index.get_chain(node_record.parent).end_node_id << endl; + } else { + cerr << " Union this chain with root " << node_record.root_snarl_index << endl; + } +#endif + } else { + new_component = false; + } + } + } else { + //If this chain isn't connected to anything else, then it is a single component of the root + temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + temp_chain_record.rank_in_parent = temp_index.components.size(); + temp_index.components.emplace_back(chain_index); + } + } else { + //The last thing on the stack is the parent 
of this chain, which must be a snarl + temp_chain_record.parent = stack.back(); + auto& parent_snarl_record = temp_index.get_snarl(temp_chain_record.parent); + temp_chain_record.rank_in_parent = parent_snarl_record.children.size() + 2; + parent_snarl_record.children.emplace_back(chain_index); + } + + temp_index.max_index_size += temp_chain_record.get_max_record_length(!only_top_level_chain_distances || is_root_chain ? true : false ); +#ifdef debug_distance_indexing + cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl; +#endif + } + }, + [&](handle_t snarl_start_handle) { + /*This gets called at the beginning of a new snarl facing in + * Create a new snarl record and fill in the start node. + * The node record would have been created as part of the chain, or as the end node + * of the previous snarl + */ + +#ifdef debug_distance_indexing + cerr << " Starting new snarl at " << graph->get_id(snarl_start_handle) << (graph->get_is_reverse(snarl_start_handle) ? 
" reverse" : " forward") << endl; + cerr << "with index " << temp_index.temp_snarl_records.size() << endl; +#endif + auto& parent = stack.back(); + stack.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); + temp_index.temp_snarl_records.emplace_back(); + temp_index.temp_snarl_records.back().start_node_id = graph->get_id(snarl_start_handle); + temp_index.temp_snarl_records.back().start_node_rev = graph->get_is_reverse(snarl_start_handle); + temp_index.temp_snarl_records.back().start_node_length = graph->get_length(snarl_start_handle); + + }, + [&](handle_t snarl_end_handle){ + /*This gets called at the end of the snarl facing out + * Fill in the end node of the snarl, its parent, and record the snarl as a child of its + * parent chain + * Also create a node record + */ + SnarlDistanceIndex::temp_record_ref_t snarl_index = stack.back(); + stack.pop_back(); +#ifdef debug_distance_indexing + assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(snarl_index); + nid_t node_id = graph->get_id(snarl_end_handle); + + //Record the end node in the snarl + temp_snarl_record.end_node_id = node_id; + temp_snarl_record.end_node_rev = graph->get_is_reverse(snarl_end_handle); + temp_snarl_record.end_node_length = graph->get_length(snarl_end_handle); + temp_snarl_record.node_count = temp_snarl_record.children.size(); + bool any_edges_in_snarl = false; + graph->follow_edges(graph->get_handle(temp_snarl_record.start_node_id, temp_snarl_record.start_node_rev), false, [&](const handle_t& next_handle) { + if (graph->get_id(next_handle) != temp_snarl_record.end_node_id) { + any_edges_in_snarl = true; + } + }); + graph->follow_edges(graph->get_handle(temp_snarl_record.end_node_id, !temp_snarl_record.end_node_rev), false, [&](const handle_t& next_handle) { + if 
(graph->get_id(next_handle) != temp_snarl_record.start_node_id) { + any_edges_in_snarl = true; + } + }); + + if (temp_snarl_record.children.size() == 0) { + //This is a trivial snarl + temp_snarl_record.is_trivial = true; + +#ifdef debug_distance_indexing + cerr << " Ending and forgetting trivial snarl " << temp_index.structure_start_end_as_string(snarl_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; +#endif + + //Add the end node to the chain +#ifdef debug_distance_indexing + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + temp_snarl_record.parent = stack.back(); + auto& temp_chain = temp_index.get_chain(stack.back()); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + //Remove the snarl record. + //This invalidates snarl_index!!! +#ifdef debug_distance_indexing + assert(temp_index.temp_snarl_records.size() == snarl_index.second+1); +#endif + temp_index.temp_snarl_records.pop_back(); + } else { + //This is the child of a chain + +#ifdef debug_distance_indexing + cerr << " Ending new snarl " << temp_index.structure_start_end_as_string(snarl_index) + << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_snarl_record.parent) << endl; +#endif + +#ifdef debug_distance_indexing + assert(stack.back().first == SnarlDistanceIndex::TEMP_CHAIN); +#endif + temp_snarl_record.parent = stack.back(); + auto& temp_chain = temp_index.get_chain(stack.back()); + temp_chain.children.emplace_back(snarl_index); + temp_chain.children.emplace_back(SnarlDistanceIndex::TEMP_NODE, node_id); + + } + + //Record the node itself. 
This gets done for the start of the chain, and ends of snarls + SnarlDistanceIndex::temp_record_ref_t node_index = make_pair(SnarlDistanceIndex::TEMP_NODE, node_id); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(node_index); + temp_node_record.node_id = node_id; + temp_node_record.node_length = graph->get_length(snarl_end_handle); + temp_node_record.reversed_in_parent = graph->get_is_reverse(snarl_end_handle); + temp_node_record.parent = stack.back(); + }); + + /* + * We finished going through everything that exists according to the snarl decomposition, but + * it's still missing tips, which will be discovered when filling in the snarl distances, + * and root-level snarls, which we'll add now by combining the chain components in root_snarl_components + * into snarls defined by root_snarl_component_uf + * The root-level snarl is a fake snarl that doesn't exist according to the snarl decomposition, + * but is an extra layer that groups together components of the root that are connected + */ + + vector> root_snarl_component_indexes = root_snarl_component_uf.all_groups(); + for (vector& root_snarl_indexes : root_snarl_component_indexes) { +#ifdef debug_distance_indexing + cerr << "Create a new root snarl from components" << endl; +#endif + //For each of the root snarls + temp_index.components.emplace_back(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size()); + temp_index.temp_snarl_records.emplace_back(); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.back(); + temp_snarl_record.is_root_snarl = true; + temp_snarl_record.parent = make_pair(SnarlDistanceIndex::TEMP_ROOT, 0); + + + for (size_t chain_i : root_snarl_indexes) { + //For each chain component of this root-level snarl + if (temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_CHAIN){ + 
SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(temp_index.root_snarl_components[chain_i]); + temp_chain_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); + temp_chain_record.rank_in_parent = temp_snarl_record.children.size(); + temp_chain_record.reversed_in_parent = false; + + temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); + } else { +#ifdef debug_distance_indexing + assert(temp_index.root_snarl_components[chain_i].first == SnarlDistanceIndex::TEMP_NODE); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = temp_index.get_node(temp_index.root_snarl_components[chain_i]); + temp_node_record.parent = make_pair(SnarlDistanceIndex::TEMP_SNARL, temp_index.temp_snarl_records.size() - 1); + temp_node_record.rank_in_parent = temp_snarl_record.children.size(); + temp_node_record.reversed_in_parent = false; + + temp_snarl_record.children.emplace_back(temp_index.root_snarl_components[chain_i]); + } + } + temp_snarl_record.node_count = temp_snarl_record.children.size(); + } + + + /*Now go through the decomposition again to fill in the distances + * This traverses all chains in reverse order that we found them in, so bottom up + * Each chain and snarl already knows its parents and children, except for single nodes + * that are children of snarls. 
These nodes were not in chains will have their node + * records created here + */ + +#ifdef debug_distance_indexing + cerr << "Filling in the distances in snarls" << endl; +#endif + for (int i = temp_index.temp_chain_records.size()-1 ; i >= 0 ; i--) { + SnarlDistanceIndex::temp_record_ref_t chain_index = make_pair(SnarlDistanceIndex::TEMP_CHAIN, i); + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(chain_index); +#ifdef debug_distance_indexing + assert(!temp_chain_record.is_trivial); + cerr << " At" << (temp_chain_record.is_trivial ? " trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) << endl; +#endif + + //Add the first values for the prefix sum and backwards loop vectors + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(std::numeric_limits::max()); + temp_chain_record.chain_components.emplace_back(0); + + + /*First, go through each of the snarls in the chain in the forward direction and + * fill in the distances in the snarl. Also fill in the prefix sum and backwards + * loop vectors here + */ + size_t curr_component = 0; //which component of the chain are we in + size_t last_node_length = 0; + for (size_t chain_child_i = 0 ; chain_child_i < temp_chain_record.children.size() ; chain_child_i++ ){ + const SnarlDistanceIndex::temp_record_ref_t& chain_child_index = temp_chain_record.children[chain_child_i]; + //Go through each of the children in the chain, skipping nodes + //The snarl may be trivial, in which case don't fill in the distances +#ifdef debug_distance_indexing + cerr << " Looking at child " << temp_index.structure_start_end_as_string(chain_child_index) + << " current max prefix sum " << temp_chain_record.max_prefix_sum.back() << endl; +#endif + + if (chain_child_index.first == SnarlDistanceIndex::TEMP_SNARL){ + //This is where all the work gets done. 
Need to go through the snarl and add + //all distances, then add distances to the chain that this is in + //The parent chain will be the last thing in the stack + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = + temp_index.get_snarl(chain_child_index); + + //Fill in this snarl's distances + populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph); + + bool new_component = temp_snarl_record.min_length == std::numeric_limits::max(); + if (new_component){ + curr_component++; + } + + //And get the distance values for the end node of the snarl in the chain + if (new_component) { + //If this snarl wasn't start-end connected, then we start + //tracking the distance vectors here + + //Update the maximum distance + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + + temp_chain_record.prefix_sum.emplace_back(0); + temp_chain_record.max_prefix_sum.emplace_back(0); + temp_chain_record.backward_loops.emplace_back(temp_snarl_record.distance_end_end); + //If the chain is disconnected, the max length is infinite + temp_chain_record.max_length = std::numeric_limits::max(); + } else { + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.prefix_sum.back(), + temp_snarl_record.min_length), + temp_snarl_record.start_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.max_prefix_sum.back(), + temp_snarl_record.max_length), + temp_snarl_record.start_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(temp_snarl_record.distance_end_end, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back() + , 2 * (temp_snarl_record.start_node_length + temp_snarl_record.min_length)))); + temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, + 
temp_snarl_record.max_length); + } + temp_chain_record.chain_components.emplace_back(curr_component); + if (chain_child_i == temp_chain_record.children.size() - 2 && temp_snarl_record.min_length == std::numeric_limits::max()) { + temp_chain_record.loopable = false; + } + last_node_length = 0; + } else { + if (last_node_length != 0) { + //If this is a node and the last thing was also a node, + //then there was a trivial snarl + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.get_node(chain_child_index); + + //Check if there is a loop in this node + //Snarls get counted as trivial if they contain no nodes but they might still have edges + size_t backward_loop = std::numeric_limits::max(); + + graph->follow_edges(graph->get_handle(temp_node_record.node_id, !temp_node_record.reversed_in_parent), false, [&](const handle_t& next_handle) { + if (graph->get_id(next_handle) == temp_node_record.node_id) { + //If there is a loop going backwards (relative to the chain) back to the same node + backward_loop = 0; + } + }); + + temp_chain_record.prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back(), last_node_length)); + temp_chain_record.max_prefix_sum.emplace_back(SnarlDistanceIndex::sum(temp_chain_record.max_prefix_sum.back(), last_node_length)); + temp_chain_record.backward_loops.emplace_back(std::min(backward_loop, + SnarlDistanceIndex::sum(temp_chain_record.backward_loops.back(), 2 * last_node_length))); + + if (chain_child_i == temp_chain_record.children.size()-1) { + //If this is the last node + temp_chain_record.loopable=false; + } + temp_chain_record.chain_components.emplace_back(curr_component); + } + last_node_length = temp_index.get_node(chain_child_index).node_length; + //And update the chains max length + temp_chain_record.max_length = SnarlDistanceIndex::sum(temp_chain_record.max_length, + last_node_length); + } + } //Finished walking through chain + if (temp_chain_record.start_node_id 
== temp_chain_record.end_node_id && temp_chain_record.chain_components.back() != 0) { + //If this is a looping, multicomponent chain, the start/end node could end up in separate chain components + //despite being the same node. + //Since the first component will always be 0, set the first node's component to be whatever the last + //component was + temp_chain_record.chain_components[0] = temp_chain_record.chain_components.back(); + + } + + //For a multicomponent chain, the actual minimum length will always be infinite, but since we sometimes need + //the length of the last component, save that here + temp_chain_record.min_length = !temp_chain_record.is_trivial && temp_chain_record.start_node_id == temp_chain_record.end_node_id + ? temp_chain_record.prefix_sum.back() + : SnarlDistanceIndex::sum(temp_chain_record.prefix_sum.back() , temp_chain_record.end_node_length); + +#ifdef debug_distance_indexing + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.backward_loops.size()); + assert(temp_chain_record.prefix_sum.size() == temp_chain_record.chain_components.size()); +#endif + + + /*Now that we've gone through all the snarls in the chain, fill in the forward loop vector + * by going through the chain in the backwards direction + */ + temp_chain_record.forward_loops.resize(temp_chain_record.prefix_sum.size(), + std::numeric_limits::max()); + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { + + //If this is a looping chain, then check the first snarl for a loop + if (temp_chain_record.children.at(1).first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(temp_chain_record.children.at(1)); + temp_chain_record.forward_loops[temp_chain_record.forward_loops.size()-1] = temp_snarl_record.distance_start_start; + } + } + + size_t node_i = temp_chain_record.prefix_sum.size() - 2; + // We start at the next 
to last node because we need to look at this record and the next one. + last_node_length = 0; + for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { + auto& child = temp_chain_record.children.at(j); + if (child.first == SnarlDistanceIndex::TEMP_SNARL){ + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) && + temp_chain_record.chain_components.at(node_i+1) != 0){ + //If this is a new chain component, then add the loop distance from the snarl + //If the component of the next node is 0, then we're still in the same component since we're going backwards + temp_chain_record.forward_loops.at(node_i) = temp_snarl_record.distance_start_start; + } else { + temp_chain_record.forward_loops.at(node_i) = + std::min(SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.forward_loops.at(node_i+1), + 2* temp_snarl_record.min_length), + 2*temp_snarl_record.end_node_length), + temp_snarl_record.distance_start_start); + } + node_i --; + last_node_length = 0; + } else { + if (last_node_length != 0) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.get_node(child); + + + //Check if there is a loop in this node + //Snarls get counted as trivial if they contain no nodes but they might still have edges + size_t forward_loop = std::numeric_limits::max(); + graph->follow_edges(graph->get_handle(temp_node_record.node_id, temp_node_record.reversed_in_parent), false, [&](const handle_t& next_handle) { + if (graph->get_id(next_handle) == temp_node_record.node_id) { + //If there is a loop going forward (relative to the chain) back to the same node + forward_loop = 0; + } + }); + temp_chain_record.forward_loops.at(node_i) = std::min( forward_loop, + SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , + 2*last_node_length)); + 
node_i--; + } + last_node_length = temp_index.get_node(child).node_length; + } + } + + + //If this is a looping chain, check if the loop distances can be improved by going around the chain + + if (temp_chain_record.start_node_id == temp_chain_record.end_node_id && temp_chain_record.children.size() > 1) { + + + //Also check if the reverse loop values would be improved if we went around again + + if (temp_chain_record.backward_loops.back() < temp_chain_record.backward_loops.front()) { + temp_chain_record.backward_loops[0] = temp_chain_record.backward_loops.back(); + size_t node_i = 1; + size_t last_node_length = 0; + for (size_t i = 1 ; i < temp_chain_record.children.size()-1 ; i++ ) { + auto& child = temp_chain_record.children.at(i); + if (child.first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); + size_t new_loop_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.backward_loops.at(node_i-1), + 2*temp_snarl_record.min_length), + 2*temp_snarl_record.start_node_length); + if (temp_chain_record.chain_components.at(node_i)!= 0 || new_loop_distance >= temp_chain_record.backward_loops.at(node_i)) { + //If this is a new chain component or it doesn't improve, stop + break; + } else { + //otherwise record the better distance + temp_chain_record.backward_loops.at(node_i) = new_loop_distance; + + } + node_i++; + last_node_length = 0; + } else { + if (last_node_length != 0) { + size_t new_loop_distance = SnarlDistanceIndex::sum(temp_chain_record.backward_loops.at(node_i-1), + 2*last_node_length); + size_t old_loop_distance = temp_chain_record.backward_loops.at(node_i); + temp_chain_record.backward_loops.at(node_i) = std::min(old_loop_distance,new_loop_distance); + node_i++; + } + last_node_length = temp_index.get_node(child).node_length; + } + } + } + if (temp_chain_record.forward_loops.front() < temp_chain_record.forward_loops.back()) { + 
//If this is a looping chain and looping improves the forward loops, + //then we have to keep going around to update distance + + temp_chain_record.forward_loops.back() = temp_chain_record.forward_loops.front(); + size_t last_node_length = 0; + node_i = temp_chain_record.prefix_sum.size() - 2; + for (int j = (int)temp_chain_record.children.size() - 1 ; j >= 0 ; j--) { + auto& child = temp_chain_record.children.at(j); + if (child.first == SnarlDistanceIndex::TEMP_SNARL){ + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(child); + size_t new_distance = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + temp_chain_record.forward_loops.at(node_i+1), + 2* temp_snarl_record.min_length), + 2*temp_snarl_record.end_node_length); + if (temp_chain_record.chain_components.at(node_i) != temp_chain_record.chain_components.at(node_i+1) || + new_distance >= temp_chain_record.forward_loops.at(node_i)){ + //If this is a new component or the distance doesn't improve, stop looking + break; + } else { + //otherwise, update the distance + temp_chain_record.forward_loops.at(node_i) = new_distance; + } + node_i --; + last_node_length =0; + } else { + if (last_node_length != 0) { + size_t new_distance = SnarlDistanceIndex::sum(temp_chain_record.forward_loops.at(node_i+1) , 2* last_node_length); + size_t old_distance = temp_chain_record.forward_loops.at(node_i); + temp_chain_record.forward_loops.at(node_i) = std::min(old_distance, new_distance); + node_i--; + } + last_node_length = temp_index.get_node(child).node_length; + } + } + } + } + + temp_index.max_distance = std::max(temp_index.max_distance, temp_chain_record.max_prefix_sum.back()); + temp_index.max_distance = temp_chain_record.forward_loops.back() == std::numeric_limits::max() ? 
temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.forward_loops.back()); + temp_index.max_distance = temp_chain_record.backward_loops.front() == std::numeric_limits::max() ? temp_index.max_distance : std::max(temp_index.max_distance, temp_chain_record.backward_loops.front()); + assert(temp_index.max_distance <= 2742664019); + + } + +#ifdef debug_distance_indexing + cerr << "Filling in the distances in root snarls and distances along chains" << endl; +#endif + for (SnarlDistanceIndex::temp_record_ref_t& component_index : temp_index.components) { + if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.get_snarl(component_index); + populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph); + temp_snarl_record.min_length = std::numeric_limits::max(); + } + } + temp_index.root_structure_count = temp_index.components.size(); +#ifdef debug_distance_indexing + assert(temp_index.components.size() == temp_index.root_structure_count); + cerr << "Finished temp index with " << temp_index.root_structure_count << " connected components" << endl; +#endif + return temp_index; +} + +/** + * Populate a row of the distance matrix. + * Also responsible for filling in min_length, distance_start_start, and distance_start_end on the TemporarySnarlRecord when a distance matrix is used. 
+ */ +static void populate_distance_matrix_row(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const SnarlDistanceIndex::temp_record_ref_t& start_index, const HandleGraph* graph, size_t start_rank, bool is_internal_node, size_t size_limit); + +/** + * Fills in required distance matrix rows for each child + * - Normal snarl: all rows + * - Oversized snarl: boundaries and tips + * - size_limit == 0: no distances in index, so no rows + * - Top-level chain distances only: ??? + */ +static void populate_distance_matrix_if_needed(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph, size_t size_limit, bool only_top_level_chain_distances); + +/** + * Does three things: + * - Builds temp graph that hub labels will be built on + * - Builds the hub labels + * - Stores labels in temp_snarl_record + */ +static void populate_hub_labeling(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph); + +/** + * Determine if a snarl is regular or not. + * + * A regular snarl is a snarl that consists of only nodes or + * chains connected to the start and end, without any connections between + * multiple children, or any way to turn around. There may be an edge directly + * across. + * + * A simple snarl is always regular. 
+ */ +bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph); + +// --------------------------------------------------------------------------- +// Phase helpers for populate_snarl_index (all file-static) +// --------------------------------------------------------------------------- + +// Step 1: Walk up the snarl tree from curr_index until we find the direct +// child of ancestor_snarl_index that contains curr_index. +static SnarlDistanceIndex::temp_record_ref_t ancestor_of_node_in_snarl( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + SnarlDistanceIndex::temp_record_ref_t curr_index, + SnarlDistanceIndex::temp_record_ref_t ancestor_snarl_index) { + + const auto& snarl = temp_index.get_snarl(ancestor_snarl_index); + if (curr_index.second == snarl.start_node_id || + curr_index.second == snarl.end_node_id) { + return curr_index; + } + + SnarlDistanceIndex::temp_record_ref_t parent_index = temp_index.get_node(curr_index).parent; + while (parent_index != ancestor_snarl_index) { + curr_index = parent_index; + parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL + ? temp_index.get_snarl(parent_index).parent + : temp_index.get_chain(parent_index).parent; +#ifdef debug_distance_indexing + assert(parent_index.first != SnarlDistanceIndex::TEMP_ROOT); +#endif + } + return curr_index; +} + +// Step 2a: Return the handle pointing out from child_index in the given +// traversal direction (reversed=false → forward/end side; reversed=true → +// backward/start side). 
+static handle_t child_boundary_handle( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::temp_record_ref_t& child_index, + bool reversed, + const HandleGraph* graph) { + + if (child_index.first == SnarlDistanceIndex::TEMP_NODE) { + return graph->get_handle(child_index.second, reversed); + } else if (reversed) { + return graph->get_handle(temp_index.get_chain(child_index).start_node_id, + !temp_index.get_chain(child_index).start_node_rev); + } else { + return graph->get_handle(temp_index.get_chain(child_index).end_node_id, + temp_index.get_chain(child_index).end_node_rev); + } +} + +// Step 2b: Determine the traversal direction of child_index when entered via +// graph_handle. +static bool child_side_reversed( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::temp_record_ref_t& child_index, + handle_t graph_handle, + const HandleGraph* graph) { + + if (child_index.first == SnarlDistanceIndex::TEMP_NODE || + temp_index.get_chain(child_index).is_trivial) { + return graph->get_is_reverse(graph_handle); + } + return graph->get_id(graph_handle) == temp_index.get_chain(child_index).end_node_id; +} + +// --------------------------------------------------------------------------- +// SnarlChildGraph implementation +// --------------------------------------------------------------------------- + +SnarlChildGraph::SnarlChildGraph( + TempIndex& temp_index, + temp_record_ref_t snarl_index, + std::span children, + const handlegraph::HandleGraph* graph) + : temp_index_(temp_index) + , snarl_index_(snarl_index) + , children_(children) + , graph_(graph) {} + +std::span +SnarlChildGraph::children() const noexcept { + return children_; +} + +std::pair +SnarlChildGraph::boundary(bool start) const { + const auto& snarl = temp_index_.get_snarl(snarl_index_); + if (start) { + return {{SnarlDistanceIndex::TEMP_NODE, snarl.start_node_id}, snarl.start_node_rev}; + } + return {{SnarlDistanceIndex::TEMP_NODE, 
snarl.end_node_id}, snarl.end_node_rev}; +} + +void SnarlChildGraph::for_each_outgoing( + temp_record_ref_t child, + bool go_left, + const std::function& callback) const { + + handle_t out_handle = child_boundary_handle(temp_index_, child, go_left, graph_); + + graph_->follow_edges(out_handle, false, [&](const handle_t& next_handle) { + handlegraph::nid_t arriving_nid = graph_->get_id(next_handle); + temp_record_ref_t next_node = {SnarlDistanceIndex::TEMP_NODE, arriving_nid}; + temp_record_ref_t neighbor = ancestor_of_node_in_snarl(temp_index_, next_node, snarl_index_); + bool neighbor_rev = child_side_reversed(temp_index_, neighbor, next_handle, graph_); + + size_t edge_distance; + if (neighbor.first == SnarlDistanceIndex::TEMP_NODE) { + edge_distance = graph_->get_length(next_handle); + } else { + const auto& chain = temp_index_.get_chain(neighbor); + edge_distance = chain.min_length; + if (chain.chain_components.back() != 0) { + edge_distance = std::numeric_limits::max(); + } + } + + callback(neighbor, neighbor_rev, edge_distance, arriving_nid); + return true; + }); +} + +// Phase 1: Mark tip nodes and set is_simple=false if any tip is found. +static void identify_tips( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, + const vector& all_children, + const HandleGraph* graph) { + + for (const auto& child : all_children) { + if (child.first != SnarlDistanceIndex::TEMP_NODE + || (child.second != temp_snarl_record.start_node_id + && child.second != temp_snarl_record.end_node_id)) { + bool is_node = (child.first == SnarlDistanceIndex::TEMP_NODE); + nid_t node_id = is_node ? child.second + : temp_index.temp_chain_records.at(child.second).end_node_id; + size_t rank = is_node ? temp_index.temp_node_records.at(child.second - temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records.at(child.second).rank_in_parent; + bool is_reverse = is_node ? 
false + : temp_index.temp_chain_records.at(child.second).end_node_rev; + rank -= 2; + + bool has_edges = false; + graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, false); + temp_snarl_record.is_simple = false; + } + node_id = is_node ? child.second + : temp_index.temp_chain_records.at(child.second).start_node_id; + is_reverse = is_node ? true + : !temp_index.temp_chain_records.at(child.second).start_node_rev; + has_edges = false; + graph->follow_edges(graph->get_handle(node_id, is_reverse), false, [&](const handle_t next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.temp_node_records.at(node_id - temp_index.min_node_id).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, true); + temp_snarl_record.is_simple = false; + } + } + } +} + +// Phase 2a: BFS topological sort of children; returns new-to-old rank mapping. +// TODO: For non-DAGs this sort will end up arbitrary. That doesn't matter +// since the only consumer of ranks (ziptrees) expects arbitrary ranks. +static vector topo_sort_children( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::temp_record_ref_t& snarl_index, + const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, + std::span all_children, + const HandleGraph* graph) { + + SnarlChildGraph child_graph(temp_index, snarl_index, all_children, graph); + + vector topological_sort_order; + topological_sort_order.reserve(all_children.size()); + + unordered_set visited_ranks; + visited_ranks.reserve(all_children.size()); + + vector> source_nodes; + + // Add tips as sources. Tips push before the start sentinel so sentinel pops first (LIFO). 
+ for (const auto& tip : temp_snarl_record.tippy_child_ranks) { + source_nodes.emplace_back(tip.first, !tip.second); + } + // Start node dummy rank is max() — pops first as LIFO sentinel. + source_nodes.emplace_back(std::numeric_limits::max(), false); + + while (!source_nodes.empty()) { + pair current_child_index = source_nodes.back(); + source_nodes.pop_back(); + + if (visited_ranks.count(current_child_index.first) != 0) { + // Revisiting a source means we hit a loop; abort with arbitrary ranks. + break; + } + if (current_child_index.first != std::numeric_limits::max()) { + topological_sort_order.emplace_back(current_child_index.first); + } + visited_ranks.emplace(current_child_index.first); + + // Determine which child (or sentinel start boundary) to follow edges from. + // For the sentinel, use the snarl's start boundary node in its stored orientation, + // which produces the same handle as topological_sort_start in the original code. + SnarlDistanceIndex::temp_record_ref_t current_ref; + bool go_left; + if (current_child_index.first == std::numeric_limits::max()) { + current_ref = {SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id}; + go_left = temp_snarl_record.start_node_rev; + } else { + current_ref = all_children[current_child_index.first]; + go_left = current_child_index.second; + } + + child_graph.for_each_outgoing(current_ref, go_left, [&]( + SnarlDistanceIndex::temp_record_ref_t neighbor, + bool neighbor_rev, + size_t /*edge_distance*/, + handlegraph::nid_t /*arriving_nid*/) { +#ifdef debug_distance_indexing + cerr << "Following forward edges to " + << temp_index.structure_start_end_as_string(neighbor) << endl; +#endif + // Skip snarl boundaries. + if (neighbor.first == SnarlDistanceIndex::TEMP_NODE && + (neighbor.second == temp_snarl_record.start_node_id || + neighbor.second == temp_snarl_record.end_node_id)) { + return; + } + size_t next_rank = neighbor.first == SnarlDistanceIndex::TEMP_NODE + ? 
temp_index.get_node(neighbor).rank_in_parent + : temp_index.get_chain(neighbor).rank_in_parent; + assert(next_rank >= 2); + next_rank -= 2; + assert(all_children[next_rank] == neighbor); + if (visited_ranks.count(next_rank) != 0) { + return; + } + + // Check if neighbor is a topological source (no unvisited predecessors). + bool is_source = true; + child_graph.for_each_outgoing(neighbor, !neighbor_rev, [&]( + SnarlDistanceIndex::temp_record_ref_t incoming, + bool /*incoming_rev*/, + size_t /*edge_distance*/, + handlegraph::nid_t /*arriving_nid*/) { +#ifdef debug_distance_indexing + cerr << "Getting backwards edge from " + << temp_index.structure_start_end_as_string(incoming) << endl; +#endif + if (incoming.first == SnarlDistanceIndex::TEMP_NODE && + (incoming.second == temp_snarl_record.start_node_id || + incoming.second == temp_snarl_record.end_node_id)) { + return; + } + size_t incoming_rank = incoming.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(incoming).rank_in_parent + : temp_index.get_chain(incoming).rank_in_parent; + assert(incoming_rank >= 2); + incoming_rank -= 2; + if (visited_ranks.count(incoming_rank) == 0) { + is_source = false; + } + }); + if (is_source) { + source_nodes.emplace_back(next_rank, neighbor_rev); + } + }); + } + + // Non-DAG fallback: append any ranks not yet visited in arbitrary order. + vector check_ranks(all_children.size(), false); + for (size_t x : topological_sort_order) { + check_ranks[x] = true; + } + for (size_t i = 0; i < check_ranks.size(); i++) { + if (!check_ranks[i]) { + topological_sort_order.emplace_back(i); + } + } + assert(topological_sort_order.size() == all_children.size()); + return topological_sort_order; +} + +// Phase 2b: Apply the topo-sort permutation: update rank_in_parent for every +// child and rebuild tippy_child_ranks with new ranks. 
+static void apply_topo_permutation( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, + std::span all_children, + const vector& new_to_old) { + + auto old_tippy_ranks = temp_snarl_record.tippy_child_ranks; + temp_snarl_record.tippy_child_ranks.clear(); + for (size_t new_rank = 0; new_rank < new_to_old.size(); new_rank++) { + size_t old_rank = new_to_old[new_rank]; + if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { + temp_index.get_node(all_children[old_rank]).rank_in_parent = new_rank + 2; + } else { + temp_index.get_chain(all_children[old_rank]).rank_in_parent = new_rank + 2; + } + const auto& old_is_tip = old_tippy_ranks.find(old_rank); + if (old_is_tip != old_tippy_ranks.end()) { + temp_snarl_record.tippy_child_ranks.emplace(new_rank, old_is_tip->second); + } + } +} + +// Phase 3: Compute snarl distances (normal or oversized hub-label path). +// Appends boundary nodes to all_children (unless is_root_snarl). 
+static void compute_snarl_distances( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::temp_record_ref_t& snarl_index, + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, + vector& all_children, + const HandleGraph* graph, + size_t size_limit, + bool only_top_level_chain_distances) { + + if (!temp_snarl_record.is_root_snarl) { + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); + all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); + } + + if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { + temp_index.most_oversized_snarl_size = std::max(temp_index.most_oversized_snarl_size, temp_snarl_record.node_count); + temp_index.use_oversized_snarls = true; + temp_snarl_record.is_simple = false; + populate_hub_labeling(temp_index, snarl_index, temp_snarl_record, all_children, graph); + + // Query hub labeling for connectivity distances (excluding boundary lengths). + // Start is always child rank 0 forward, end is always child rank 1 forward. + temp_snarl_record.min_length = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(0, false, true), bgid(1, false, false))); + temp_snarl_record.distance_start_start = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(0, false, true), bgid(0, true, false))); + temp_snarl_record.distance_end_end = promote_distance(hhl_query(temp_snarl_record.hub_labels.begin(), bgid(1, true, true), bgid(1, false, false))); + // TODO: Should this be here or should it be part of populate_hub_labeling()? Or its own function? + } else { + if (size_limit == 0 || only_top_level_chain_distances) { + temp_snarl_record.include_distances = false; + } + // Also fills in min_length, distance_start_start, distance_start_end, sets is_simple=false if not simple. 
+ populate_distance_matrix_if_needed(temp_index, snarl_index, temp_snarl_record, all_children, graph, size_limit, only_top_level_chain_distances); + } +} + +// Phase 4: For simple snarls, record child node orientations. +// IMPORTANT: iterates temp_snarl_record.children[0..node_count), NOT all_children — +// boundary nodes appended in Phase 3 must not be included here. +static void mark_simple_snarl_orientations( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record) { + + for (size_t i = 0; i < temp_snarl_record.node_count; i++) { + const SnarlDistanceIndex::temp_record_ref_t& child_index = temp_snarl_record.children[i]; +#ifdef debug_distance_indexing + assert(child_index.first == SnarlDistanceIndex::TEMP_NODE); +#endif + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord& temp_node_record = + temp_index.get_node(child_index); + temp_node_record.reversed_in_parent = + temp_node_record.distance_left_start == std::numeric_limits::max(); + } +} + +// Phase 5: Regularity check and index-size accounting. +static void finalize_snarl_record( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + const SnarlDistanceIndex::temp_record_ref_t& snarl_index, + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, + const vector& all_children, + const HandleGraph* graph) { + + temp_snarl_record.is_regular = check_regularity(temp_index, snarl_index, temp_snarl_record, all_children, graph); + + temp_index.max_index_size += temp_snarl_record.get_max_record_length(); + if (temp_snarl_record.is_simple) { + temp_index.max_index_size -= (temp_snarl_record.children.size() * + SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryNodeRecord::get_max_record_length()); + } + temp_index.max_bits = std::max(temp_index.max_bits, + 22 + SnarlDistanceIndex::bit_width(temp_snarl_record.children.size())); +} + +/** + * Fill in the snarl index. 
+ * The index will already know its boundaries and everything knows their relationships in the + * snarl tree. This needs to fill in the distances and the ranks of children in the snarl + * The rank of a child is arbitrary, except that the start node will always be 0 and the end node + * will always be the node count+1 (since node count doesn't count the boundary nodes) + */ +void populate_snarl_index( + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, + SnarlDistanceIndex::temp_record_ref_t snarl_index, size_t size_limit, + bool only_top_level_chain_distances, const HandleGraph* graph) { +#ifdef debug_distance_indexing + cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; + assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); +#endif + auto& temp_snarl_record = temp_index.get_snarl(snarl_index); + temp_snarl_record.is_simple = true; + + vector all_children = temp_snarl_record.children; + + identify_tips(temp_index, temp_snarl_record, all_children, graph); + + if (!temp_snarl_record.is_root_snarl) { + auto new_to_old = topo_sort_children(temp_index, snarl_index, temp_snarl_record, all_children, graph); + apply_topo_permutation(temp_index, temp_snarl_record, all_children, new_to_old); + } + + compute_snarl_distances(temp_index, snarl_index, temp_snarl_record, all_children, graph, + size_limit, only_top_level_chain_distances); + +#ifdef debug_distance_indexing + cerr << "snarl " << temp_index.structure_start_end_as_string(snarl_index) << " is_simple: " << temp_snarl_record.is_simple << endl; +#endif + + if (temp_snarl_record.is_simple) { + mark_simple_snarl_orientations(temp_index, temp_snarl_record); + } + + finalize_snarl_record(temp_index, snarl_index, temp_snarl_record, all_children, graph); +} + + +void populate_hub_labeling(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, 
SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph) { + CHOverlay ov = make_boost_graph(temp_index, snarl_index, temp_snarl_record, all_children, graph); + +#ifdef debug_hub_label_build + // Dump CHOverlay graph to stderr for debugging + std::cerr << "=== CHOverlay Graph Dump ===" << std::endl; + std::cerr << ov << std::endl; + std::cerr << "=== End CHOverlay Dump ===" << std::endl; +#endif + + make_contraction_hierarchy(ov); + + vector> labels; labels.resize(num_vertices(ov)); + vector> labels_rev; labels_rev.resize(num_vertices(ov)); + create_labels(labels, labels_rev, ov); +#ifdef debug_hub_label_storage + std::cerr << "Hub labels unpacked:" << std::endl; + for (const auto& node_list : {labels, labels_rev}) { + std::cerr << "Labels for all nodes:" << std::endl; + for (size_t i = 0; i < node_list.size(); i++) { + std::cerr << "\tLabels for rank " << i << ":" << std::endl; + for (const HubRecord& label : node_list[i]) { + std::cerr << "\t\tHub: " << label.hub << " Dist: " << label.dist << std::endl; + } + } + } +#endif + + // Put labels in temp_snarl_record + temp_snarl_record.hub_labels = pack_labels(labels, labels_rev); +#ifdef debug_hub_label_storage + std::cerr << "Hub labels as packed: "; + for (size_t i = 0; i < temp_snarl_record.hub_labels.size(); i++) { + if (i > 0) { + std::cerr << " | "; + } + std::cerr << temp_snarl_record.hub_labels[i]; + } + std::cerr << std::endl; +#endif +} + +void populate_distance_matrix_if_needed(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph, size_t size_limit, bool only_top_level_chain_distances) { + if (size_limit != 0 && !only_top_level_chain_distances) { + //If we are saving distances + //Reserve enough space to store all possible 
distances + temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit + ? temp_snarl_record.node_count * 2 + : temp_snarl_record.node_count * temp_snarl_record.node_count); + } else { + temp_snarl_record.include_distances = false; + } + for (auto it = all_children.rbegin(); it != all_children.rend(); ++it) { + // Visit all the children in reverse order + const SnarlDistanceIndex::temp_record_ref_t& start_index = *it; + + bool is_internal_node = false; + + if ((start_index.first == SnarlDistanceIndex::TEMP_NODE + && start_index.second != temp_snarl_record.start_node_id + && start_index.second != temp_snarl_record.end_node_id) + || + (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && temp_index.get_chain(start_index).is_trivial)) { + // If this is an internal node + is_internal_node = true; + nid_t node_id = start_index.first == SnarlDistanceIndex::TEMP_NODE ? start_index.second : temp_index.get_chain(start_index).start_node_id; + SnarlDistanceIndex::temp_record_ref_t node_index {SnarlDistanceIndex::TEMP_NODE, node_id}; + size_t rank = start_index.first == SnarlDistanceIndex::TEMP_NODE ? 
temp_index.get_node(start_index).rank_in_parent + : temp_index.get_chain(start_index).rank_in_parent; + + bool has_edges = false; + graph->follow_edges(graph->get_handle(node_id, false), false, [&](const handle_t& next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.get_node(node_index).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, false); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + has_edges = false; + graph->follow_edges(graph->get_handle(node_id, true), false, [&](const handle_t& next_handle) { + has_edges = true; + }); + if (!has_edges) { + temp_index.get_node(node_index).is_tip = true; + temp_snarl_record.tippy_child_ranks.emplace(rank, true); + temp_snarl_record.is_simple=false; //It is a tip so this isn't simple snarl + } + } else if (start_index.first == SnarlDistanceIndex::TEMP_CHAIN && !temp_index.get_chain(start_index).is_trivial) { + // If this is an internal chain, then it isn't a simple snarl + temp_snarl_record.is_simple=false; + } + + bool start_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.get_node(start_index).is_tip + : temp_index.get_chain(start_index).is_tip; + + size_t start_rank = start_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
temp_index.get_node(start_index).rank_in_parent + : temp_index.get_chain(start_index).rank_in_parent; + + + if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.start_node_id) { + start_rank = 0; + } else if (start_index.first == SnarlDistanceIndex::TEMP_NODE && start_index.second == temp_snarl_record.end_node_id) { + start_rank = 1; + } //TODO: + //else { + // assert(start_rank != 0 && start_rank != 1); + //} + + //traversal start is not a tip or a boundary node + bool start_normal_child = (!start_is_tip && start_rank != 0 && start_rank != 1); + + if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || start_normal_child)) { + //If we don't care about internal distances, and we also are not at a boundary or tip + //TODO: Why do we care about tips specifically? + continue; + } + //getting here means snarl is not oversized + //fill in all distances for a row + populate_distance_matrix_row(temp_index, snarl_index, temp_snarl_record, start_index, graph, start_rank, is_internal_node, size_limit); + } +} + + + +namespace { + +using temp_record_ref_t = SnarlDistanceIndex::temp_record_ref_t; +using NetgraphNode = pair>; +struct NetgraphCmp { + bool operator()(const NetgraphNode& a, const NetgraphNode& b) const { + return a.first > b.first; + } +}; +using DijkstraQueue = priority_queue, NetgraphCmp>; +using VisitedSet = unordered_set>; + +struct DistanceRowContext { + SnarlDistanceIndex::TemporaryDistanceIndex& temp_index; + temp_record_ref_t snarl_index; + SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& snarl_record; + temp_record_ref_t start_index; + size_t start_rank; + bool is_internal_node; + size_t size_limit; + SnarlChildGraph& child_graph; +}; + +struct NeighborSide { + size_t rank; + bool reversed; + bool is_boundary; +}; + +[[nodiscard]] vector starting_directions(const DistanceRowContext& ctx) { + vector directions; 
+ if (ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE && + ctx.start_index.second == ctx.snarl_record.start_node_id) { + directions.emplace_back(ctx.snarl_record.start_node_rev); + } else if (ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE && + ctx.start_index.second == ctx.snarl_record.end_node_id) { + directions.emplace_back(!ctx.snarl_record.end_node_rev); + } else { + directions.emplace_back(true); + directions.emplace_back(false); + } + return directions; +} + +void update_simplicity_on_edge(DistanceRowContext& ctx, + temp_record_ref_t current_index, + nid_t current_end_nid, + nid_t current_other_side_nid, + temp_record_ref_t next_index, + nid_t arriving_nid) { + if (arriving_nid == current_end_nid || arriving_nid == current_other_side_nid) { + ctx.snarl_record.is_simple = false; + } else if (!ctx.snarl_record.is_root_snarl && ctx.start_rank == 0 && + current_index != ctx.start_index && + !(next_index.first == SnarlDistanceIndex::TEMP_NODE && + next_index.second == ctx.snarl_record.end_node_id)) { + ctx.snarl_record.is_simple = false; + } else if (!ctx.snarl_record.is_root_snarl && ctx.start_rank == 1 && + current_index != ctx.start_index && + !(next_index.first == SnarlDistanceIndex::TEMP_NODE && + next_index.second == ctx.snarl_record.start_node_id)) { + ctx.snarl_record.is_simple = false; + } +} + +[[nodiscard]] NeighborSide resolve_neighbor_side(const DistanceRowContext& ctx, + temp_record_ref_t next_index, bool next_rev) { + size_t rank; + if (next_index.first == SnarlDistanceIndex::TEMP_NODE && + next_index.second == ctx.snarl_record.start_node_id) { + rank = 0; + } else if (next_index.first == SnarlDistanceIndex::TEMP_NODE && + next_index.second == ctx.snarl_record.end_node_id) { + rank = 1; + } else { + rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
ctx.temp_index.get_node(next_index).rank_in_parent + : ctx.temp_index.get_chain(next_index).rank_in_parent; + } + bool is_boundary = !ctx.snarl_record.is_root_snarl && (rank == 0 || rank == 1); + bool reversed = is_boundary ? false : next_rev; + return NeighborSide{rank, reversed, is_boundary}; +} + +[[nodiscard]] bool record_distance(DistanceRowContext& ctx, + size_t current_distance, + bool start_rev, + NeighborSide next, + temp_record_ref_t next_index) { + bool start_is_boundary = !ctx.snarl_record.is_root_snarl && + (ctx.start_rank == 0 || ctx.start_rank == 1); + + pair start_key = start_is_boundary + ? make_pair(ctx.start_rank, false) : make_pair(ctx.start_rank, !start_rev); + pair next_key = next.is_boundary + ? make_pair(next.rank, false) : make_pair(next.rank, next.reversed); + + if (ctx.size_limit == 0 && start_is_boundary && next.is_boundary) { + // If not measuring distances, we need to use + // distance_start_start and distance_end_end as + // connectivity flags so we can still detect reversals + // within chains and recognize regular snarls. 
+ if (ctx.start_rank == 0 && next.rank == 0) { + ctx.snarl_record.distance_start_start = 0; +#ifdef debug_distance_indexing + cerr << " set loop indicator start start distance " << ctx.snarl_record.distance_start_start << endl; +#endif + } else if (ctx.start_rank == 1 && next.rank == 1) { + ctx.snarl_record.distance_end_end = 0; +#ifdef debug_distance_indexing + cerr << " set loop indicator end end distance " << ctx.snarl_record.distance_start_start << endl; +#endif + } + return false; + } + + if (ctx.size_limit == 0 || + !(ctx.snarl_record.node_count <= ctx.size_limit || start_is_boundary || next.is_boundary)) { + return false; + } + + //If the snarl is too big, then we don't record distances between internal nodes + //If we are looking at all distances or we are looking at boundaries + bool added_new_distance = false; + + if (start_is_boundary && next.is_boundary) { + //If it is between bounds of the snarl, then the snarl stores it + if (ctx.start_rank == 0 && next.rank == 0 && + ctx.snarl_record.distance_start_start == std::numeric_limits::max()) { + ctx.snarl_record.distance_start_start = current_distance; +#ifdef debug_distance_indexing + cerr << " set start start distance " << ctx.snarl_record.distance_start_start << endl; +#endif + added_new_distance = true; + } else if (ctx.start_rank == 1 && next.rank == 1 && + ctx.snarl_record.distance_end_end == std::numeric_limits::max()) { + ctx.snarl_record.distance_end_end = current_distance; +#ifdef debug_distance_indexing + cerr << " set end end distance " << ctx.snarl_record.distance_start_start << endl; +#endif + added_new_distance = true; + } else if (((ctx.start_rank == 0 && next.rank == 1) || (ctx.start_rank == 1 && next.rank == 0)) && + ctx.snarl_record.min_length == std::numeric_limits::max()) { + ctx.snarl_record.min_length = current_distance; + added_new_distance = true; + } + } else if (start_is_boundary) { + //If start is a boundary node, collapse TEMP_NODE/TEMP_CHAIN via generic lambda + auto 
assign_if_unset = [&](auto& rec) -> bool { + if (ctx.start_rank == 0 && !next.reversed && + rec.distance_left_start == std::numeric_limits::max()) { + rec.distance_left_start = current_distance; return true; + } else if (ctx.start_rank == 0 && next.reversed && + rec.distance_right_start == std::numeric_limits::max()) { + rec.distance_right_start = current_distance; return true; + } else if (ctx.start_rank == 1 && !next.reversed && + rec.distance_left_end == std::numeric_limits::max()) { + rec.distance_left_end = current_distance; return true; + } else if (ctx.start_rank == 1 && next.reversed && + rec.distance_right_end == std::numeric_limits::max()) { + rec.distance_right_end = current_distance; return true; + } + return false; + }; + added_new_distance = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? assign_if_unset(ctx.temp_index.get_node(next_index)) + : assign_if_unset(ctx.temp_index.get_chain(next_index)); + } else if (!next.is_boundary && + !ctx.snarl_record.distances.count(make_pair(start_key, next_key))) { + //Otherwise the snarl stores it in its distance + //If the distance isn't from an internal node to a bound and we haven't stored the distance yet + ctx.snarl_record.distances[make_pair(start_key, next_key)] = current_distance; + added_new_distance = true; +#ifdef debug_distance_indexing + cerr << " Adding distance between ranks " << start_key.first << " " << start_key.second + << " and " << next_key.first << " " << next_key.second << ": " << current_distance << endl; +#endif + } + + if (added_new_distance) { + ctx.snarl_record.max_distance = std::max(ctx.snarl_record.max_distance, current_distance); + } + return added_new_distance; +} + +void enqueue_relaxations(DijkstraQueue& queue, VisitedSet& visited_nodes, + DistanceRowContext& ctx, + size_t current_distance, + temp_record_ref_t next_index, + bool next_rev, + nid_t arriving_nid, + size_t edge_distance) { + if (visited_nodes.count(make_pair(next_index, next_rev)) == 0 && + arriving_nid != 
ctx.snarl_record.start_node_id && + arriving_nid != ctx.snarl_record.end_node_id) { + //If this isn't leaving the snarl, + //then add the next node to the queue, along with the distance to traverse it + // edge_distance already encodes chain.min_length (∞ if disconnected). + if (edge_distance != std::numeric_limits::max()) { + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, edge_distance), + make_pair(next_index, next_rev))); + } + } + if (next_index.first == SnarlDistanceIndex::TEMP_CHAIN) { + size_t loop_distance = next_rev + ? ctx.temp_index.get_chain(next_index).backward_loops.back() + : ctx.temp_index.get_chain(next_index).forward_loops.front(); + if (loop_distance != std::numeric_limits::max() && + visited_nodes.count(make_pair(next_index, !next_rev)) == 0 && + arriving_nid != ctx.snarl_record.start_node_id && + arriving_nid != ctx.snarl_record.end_node_id) { + //If the next node can loop back on itself, then add the next node in the opposite direction + const auto& nchain = ctx.temp_index.get_chain(next_index); + nid_t boundary_id = next_rev ? nchain.end_node_id : nchain.start_node_id; + size_t boundary_len = ctx.temp_index.get_node( + {SnarlDistanceIndex::TEMP_NODE, boundary_id}).node_length; + size_t next_node_len = loop_distance + 2 * boundary_len; + queue.push(make_pair(SnarlDistanceIndex::sum(current_distance, next_node_len), + make_pair(next_index, !next_rev))); + } + } +} + +void run_dijkstra_from_side(DistanceRowContext& ctx, bool start_rev) { + //Start a dijkstra traversal from start_index going in the direction indicated by start_rev + //Record the distances to each node (child of the snarl) found + size_t reachable_node_count = 0; //How many nodes can we reach from this node side? + +#ifdef debug_distance_indexing + cerr << " Starting from child " << ctx.temp_index.structure_start_end_as_string(ctx.start_index) + << " going " << (start_rev ? 
"rev" : "fd") << endl; +#endif + + DijkstraQueue queue; + VisitedSet visited_nodes; + visited_nodes.reserve(ctx.snarl_record.node_count * 2); + + queue.push(make_pair(0, make_pair(ctx.start_index, start_rev))); + + while (!queue.empty()) { + + //Get the current node from the queue and pop it out of the queue + size_t current_distance = queue.top().first; + temp_record_ref_t current_index = queue.top().second.first; + bool current_rev = queue.top().second.second; + if (visited_nodes.count(queue.top().second)) { + queue.pop(); + continue; + } + visited_nodes.emplace(queue.top().second); + queue.pop(); + + // Pre-compute the current child's outgoing node ID and "other side" node ID + // for is_simple detection (mirrors the original current_end_handle checks). + nid_t current_end_nid, current_other_side_nid; + if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { + current_end_nid = current_index.second; + current_other_side_nid = current_index.second; + } else { + const auto& ccr = ctx.temp_index.get_chain(current_index); + current_end_nid = current_rev ? ccr.start_node_id : ccr.end_node_id; + current_other_side_nid = current_rev ? ccr.end_node_id : ccr.start_node_id; + } + +#ifdef debug_distance_indexing + cerr << " at child " << ctx.temp_index.structure_start_end_as_string(current_index) << " going " + << (current_rev ? 
"rev" : "fd") << " outgoing from node " << current_end_nid << endl; +#endif + + ctx.child_graph.for_each_outgoing(current_index, current_rev, [&]( + temp_record_ref_t next_index, + bool next_rev, + size_t edge_distance, + nid_t arriving_nid) { +#ifdef debug_distance_indexing + cerr << " see edge " << current_end_nid << " -> " << arriving_nid << endl; +#endif + update_simplicity_on_edge(ctx, current_index, current_end_nid, + current_other_side_nid, next_index, arriving_nid); + + reachable_node_count++; + + NeighborSide next = resolve_neighbor_side(ctx, next_index, next_rev); + //If the next thing wasn't a boundary node and this was an internal node, then it isn't a simple snarl + if (!next.is_boundary && ctx.is_internal_node) { + ctx.snarl_record.is_simple = false; + } + //TODO: This won't be true of root snarls + //else { assert(next.rank != 0 && next.rank != 1); } + +#ifdef debug_distance_indexing + if (next.rank == 0) std::cerr << " edge arrived at start" << std::endl; + else if (next.rank == 1) std::cerr << " edge arrived at end" << std::endl; +#endif + + record_distance(ctx, current_distance, start_rev, next, next_index); + + enqueue_relaxations(queue, visited_nodes, ctx, current_distance, + next_index, next_rev, arriving_nid, edge_distance); + +#ifdef debug_distance_indexing + cerr << " reached child " << ctx.temp_index.structure_start_end_as_string(next_index) << " going " + << (next_rev ? 
"rev" : "fd") << " with distance " << current_distance + << " for ranks " << ctx.start_rank << " " << next.rank << endl; +#endif + }); + } + if (ctx.is_internal_node && reachable_node_count != 1) { + //If this is an internal node, then it must have only one edge for it to be a simple snarl + ctx.snarl_record.is_simple = false; + } +} + +void finalize_internal_node_contribution(DistanceRowContext& ctx) { + /** Check the minimum length of the snarl passing through this node **/ + if (ctx.start_rank == 0 || ctx.start_rank == 1) { + return; + } + size_t child_max_length = ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE + ? ctx.temp_index.get_node(ctx.start_index).node_length + : ctx.temp_index.get_chain(ctx.start_index).max_length; + //The distance through the whole snarl traversing this node forwards + //(This might actually be traversing it backwards but it doesn't really matter) + size_t dist_start_left = ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE + ? ctx.temp_index.get_node(ctx.start_index).distance_left_start + : ctx.temp_index.get_chain(ctx.start_index).distance_left_start; + size_t dist_end_right = ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE + ? ctx.temp_index.get_node(ctx.start_index).distance_right_end + : ctx.temp_index.get_chain(ctx.start_index).distance_right_end; + size_t dist_start_right = ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE + ? ctx.temp_index.get_node(ctx.start_index).distance_right_start + : ctx.temp_index.get_chain(ctx.start_index).distance_right_start; + size_t dist_end_left = ctx.start_index.first == SnarlDistanceIndex::TEMP_NODE + ? 
ctx.temp_index.get_node(ctx.start_index).distance_left_end + : ctx.temp_index.get_chain(ctx.start_index).distance_left_end; + + size_t snarl_length_fd = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_left, dist_end_right), child_max_length); + //The same thing traversing this node backwards + size_t snarl_length_rev = SnarlDistanceIndex::sum(SnarlDistanceIndex::sum( + dist_start_right, dist_end_left), child_max_length); + //The max that isn't infinite + size_t max_length = + snarl_length_rev == std::numeric_limits::max() + ? snarl_length_fd + : (snarl_length_fd == std::numeric_limits::max() + ? snarl_length_rev + : std::max(snarl_length_rev, snarl_length_fd)); + if (max_length != std::numeric_limits::max()) { + ctx.snarl_record.max_length = std::max(ctx.snarl_record.max_length, max_length); + } + if (ctx.snarl_record.is_simple && + !((dist_start_left == 0 && dist_end_right == 0 && + dist_end_left == std::numeric_limits::max() && + dist_start_right == std::numeric_limits::max()) || + (dist_start_left == std::numeric_limits::max() && + dist_end_right == std::numeric_limits::max() && + dist_end_left == 0 && dist_start_right == 0))) { + //If the snarl is simple, double check that this node is actually simple: that it can only be traversed going + //across the snarl + ctx.snarl_record.is_simple = false; + } +} + +} // anonymous namespace + +void populate_distance_matrix_row(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const SnarlDistanceIndex::temp_record_ref_t& start_index, const HandleGraph* graph, size_t start_rank, bool is_internal_node, size_t size_limit) { + // SnarlChildGraph encapsulates the follow_edges + ancestor-resolution kernel. + // Pass an empty children span — for_each_outgoing doesn't use it. 
+ SnarlChildGraph child_graph(temp_index, snarl_index, + std::span{}, graph); + DistanceRowContext ctx{temp_index, snarl_index, temp_snarl_record, start_index, + start_rank, is_internal_node, size_limit, child_graph}; + for (bool start_rev : starting_directions(ctx)) { + run_dijkstra_from_side(ctx, start_rev); + } + finalize_internal_node_contribution(ctx); +} + +} // namespace vg diff --git a/src/snarl_distance_index_check_regularity.cpp b/src/snarl_distance_index_check_regularity.cpp new file mode 100644 index 0000000000..015f5e2b6e --- /dev/null +++ b/src/snarl_distance_index_check_regularity.cpp @@ -0,0 +1,201 @@ +//#define debug_distance_indexing + +#include "snarl_distance_index.hpp" + +using namespace std; +using namespace handlegraph; +namespace vg { + +bool check_regularity(const SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, const SnarlDistanceIndex::temp_record_ref_t& snarl_index, const SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record, const vector& all_children, const HandleGraph* graph) { +#ifdef debug_distance_indexing + std::cerr << "Check if snarl " << temp_snarl_record.start_node_id << " to " << temp_snarl_record.end_node_id << " with " << all_children.size() << " children is regular" << std::endl; +#endif + + if (temp_snarl_record.is_root_snarl) { + // Roots can't be regular. +#ifdef debug_distance_indexing + std::cerr << "Snarl is not regular because it is a root snarl." << std::endl; +#endif + return false; + } + if (temp_snarl_record.is_simple) { + // Simple snarls are always also regular. +#ifdef debug_distance_indexing + std::cerr << "Snarl is regular because it is simple." 
<< std::endl; +#endif + return true; + } + + // Get the snarl boundary nodes, facing out + handle_t start_out = graph->get_handle(temp_snarl_record.start_node_id, !temp_snarl_record.start_node_rev); + handle_t end_out = graph->get_handle(temp_snarl_record.end_node_id, temp_snarl_record.end_node_rev); + + // Define accessors to get bounding graph handles for children, facing out. + auto child_start_out = [&](const SnarlDistanceIndex::temp_record_ref_t& child_index) { + return child_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(child_index.second, true) : + graph->get_handle( + temp_index.get_chain(child_index).start_node_id, + !temp_index.get_chain(child_index).start_node_rev + ); + }; + auto child_end_out = [&](const SnarlDistanceIndex::temp_record_ref_t& child_index) { + return child_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(child_index.second, false) : + graph->get_handle( + temp_index.get_chain(child_index).end_node_id, + temp_index.get_chain(child_index).end_node_rev + ); + }; + + for (const SnarlDistanceIndex::temp_record_ref_t& child_index : all_children) { + // We should only have nodes and chains as children + assert(child_index.first == SnarlDistanceIndex::TEMP_NODE + || child_index.first == SnarlDistanceIndex::TEMP_CHAIN); + if (child_index.first == SnarlDistanceIndex::TEMP_NODE + && (child_index.second == temp_snarl_record.start_node_id + || child_index.second == temp_snarl_record.end_node_id)) { + // Don't think about children for the snarl bounds now; we handle the bounds later. + continue; + } + + // Have we seen the snarl start? + bool saw_start = false; + // Have we seen the snarl end? + bool saw_end = false; + // Have we seen anything else, or a duplicate snarl boundary? + bool saw_other = false; + + auto handle_destination = [&](const handle_t& next_handle) { +#ifdef debug_distance_indexing + std::cerr << "\tConnects to " << graph->get_id(next_handle) << (graph->get_is_reverse(next_handle) ? 
"-" : "+") << std::endl; +#endif + + // Every edge out the end the child must go to a snarl boundary out + // that hasn't been reached yet. + if (next_handle == start_out && !saw_start) { + saw_start = true; +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is a new connection to snarl start" << std::endl; +#endif + return true; + } else if (next_handle == end_out && !saw_end) { + saw_end = true; +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is a new connection to snarl end" << std::endl; +#endif + return true; + } else { + saw_other = true; + // We don't care if we have an edge going the right way because + // we found an edge going the wrong way. +#ifdef debug_distance_indexing + std::cerr << "\t\tThis is an unwanted connection!" << std::endl; +#endif + return false; + } + }; + + // Check the edges off the child start + handle_t here = child_start_out(child_index); +#ifdef debug_distance_indexing + std::cerr << "Look right from " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << std::endl; +#endif + graph->follow_edges(here, false, handle_destination); + + if (saw_other || !(saw_start != saw_end)) { + // We have an edge we shouldn't, or we don't connect to exactly one boundary. +#ifdef debug_distance_indexing + std::cerr << "\tWe must not be regular" << std::endl; +#endif + return false; + } + + // Check the edges off the child end + here = child_end_out(child_index); +#ifdef debug_distance_indexing + std::cerr << "Look right from " << graph->get_id(here) << (graph->get_is_reverse(here) ? "-" : "+") << std::endl; +#endif + graph->follow_edges(here, false, handle_destination); + + if (saw_other || !saw_start || !saw_end) { + // We have an edge we shouldn't, or we haven't reached both + // boundaries exactly once across the two ends of the child. 
+#ifdef debug_distance_indexing + std::cerr << "\tWe must not be regular" << std::endl; +#endif + return false; + } + + if (child_index.first == SnarlDistanceIndex::TEMP_CHAIN) { + // If a child is a chain, check it for loops +#ifdef debug_distance_indexing + std::cerr << "Check child chain for loops." << std::endl; +#endif + const SnarlDistanceIndex::TemporaryDistanceIndex::TemporaryChainRecord& temp_chain_record = temp_index.get_chain(child_index); +#ifdef debug_distance_indexing + std::cerr << "Forward loops:"; + for (auto& l : temp_chain_record.forward_loops) { + std::cerr << " " << l; + } + std::cerr << std::endl; +#endif + + if (!temp_chain_record.forward_loops.empty() && temp_chain_record.forward_loops.front() != std::numeric_limits::max()) { + // There's a forward loop in this child chain, so the snarl's not regular. +#ifdef debug_distance_indexing + std::cerr << "We are not regular because there's a forward loop in this child chain." << std::endl; +#endif + return false; + } + +#ifdef debug_distance_indexing + std::cerr << "Backward loops:"; + for (auto& l : temp_chain_record.backward_loops) { + std::cerr << " " << l; + } + std::cerr << std::endl; +#endif + + if (!temp_chain_record.backward_loops.empty() && temp_chain_record.backward_loops.back() != std::numeric_limits::max()) { + // There's a backward loop in this child chain, so the snarl's not regular. +#ifdef debug_distance_indexing + std::cerr << "We are not regular because there's a backward loop in this child chain." << std::endl; +#endif + return false; + } + } + } + + // Now we know the children are fine; check for disallowed edges between + // the sentinels. + + handle_t start_in = graph->flip(start_out); + if (graph->has_edge(start_in, start_out)) { +#ifdef debug_distance_indexing + std::cerr << "We are not regular because we have a start-start loop." 
<< std::endl; +#endif + return false; + } + + handle_t end_in = graph->flip(end_out); + if (graph->has_edge(end_in, end_out)) { +#ifdef debug_distance_indexing + std::cerr << "We are not regular because we have an end-end loop." << std::endl; +#endif + return false; + } + + // If we don't have any disallowed edges, and we don't have any children + // without the exact right connectivity, we must be regular. + + // We don't make sure we actually had any children. + +#ifdef debug_distance_indexing + std::cerr << "We are a regular snarl." << std::endl; +#endif + + return true; +} + +} // namespace vg diff --git a/src/snarl_distance_index_child_graph.hpp b/src/snarl_distance_index_child_graph.hpp new file mode 100644 index 0000000000..3626f75bbc --- /dev/null +++ b/src/snarl_distance_index_child_graph.hpp @@ -0,0 +1,69 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace vg { + +// Read-only view of a snarl's child net-graph. +// +// Encapsulates the "get outgoing handle → follow_edges → map to snarl child" +// kernel shared by topo_sort_children and populate_distance_matrix_row. +// Both callers need the same three-step operation: +// 1. compute the handle pointing out of `child` in direction `go_left` +// 2. follow graph edges from that handle +// 3. for each landing node, resolve its snarl-level ancestor + direction +// +// Dijkstra priority-queue management and topo-sort BFS logic stay with the +// respective callers. +class SnarlChildGraph { +public: + using temp_record_ref_t = SnarlDistanceIndex::temp_record_ref_t; + using TempIndex = SnarlDistanceIndex::TemporaryDistanceIndex; + + // `children` may be empty if the caller doesn't need the children() accessor. + // All other methods only require temp_index, snarl_index, and graph. 
+ SnarlChildGraph(TempIndex& temp_index, + temp_record_ref_t snarl_index, + std::span children, + const handlegraph::HandleGraph* graph); + + std::span children() const noexcept; + + // For each graph edge leaving `child` in direction `go_left`, invoke the + // callback with: + // neighbor — snarl-level ancestor of the landing node + // neighbor_rev — true iff `neighbor` is entered from its right side + // (chain traversed right-to-left, or node reversed) + // edge_distance — traversal length of `neighbor`: + // TEMP_NODE → sequence length of the landing node; + // TEMP_CHAIN → chain.min_length (∞ if disconnected) + // arriving_node_id — graph node id of the immediate landing handle + // (before ancestor resolution; needed by callers that + // must preserve the original follow_edges semantics + // for is_simple detection) + // + // Boundary nodes ARE included in the callback; callers filter as needed. + void for_each_outgoing( + temp_record_ref_t child, + bool go_left, + const std::function&) const; + + // Returns {boundary_node_ref, start_node_rev} for start=true, + // {boundary_node_ref, end_node_rev} for start=false. 
+ std::pair boundary(bool start) const; + +private: + TempIndex& temp_index_; + temp_record_ref_t snarl_index_; + std::span children_; + const handlegraph::HandleGraph* graph_; +}; + +} // namespace vg diff --git a/src/snarl_distance_index_query.cpp b/src/snarl_distance_index_query.cpp new file mode 100644 index 0000000000..4e5191b6ac --- /dev/null +++ b/src/snarl_distance_index_query.cpp @@ -0,0 +1,705 @@ +// #define debug_subgraph + +#include "snarl_distance_index.hpp" + +using namespace std; +using namespace handlegraph; +namespace vg { + +void subgraph_in_distance_range(const SnarlDistanceIndex &distance_index, + const Path &path, + const HandleGraph *super_graph, + size_t min_distance, size_t max_distance, + std::unordered_set &subgraph, + bool look_forward) { + + // The position we're starting from - either the start or end of the path + pos_t start_pos; + size_t node_len; + if (look_forward) { + start_pos = initial_position(path); + node_len = + super_graph->get_length(super_graph->get_handle(get_id(start_pos))); + } else { + start_pos = final_position(path); + node_len = + super_graph->get_length(super_graph->get_handle(get_id(start_pos))); + start_pos = reverse_base_pos(start_pos, node_len); + } + pair traversal_start = + std::make_pair(get_id(start_pos), get_is_rev(start_pos)); + +#ifdef debug_subgraph + cerr << endl + << "Find subgraph in distance range " << min_distance << " to " + << max_distance << endl; + cerr << "Start positon: " << start_pos << endl; +#endif + // The distance from the position to the ends of the current + // node(/snarl/chain) + size_t current_distance_left = is_rev(start_pos) + ? node_len - get_offset(start_pos) + : std::numeric_limits::max(); + size_t current_distance_right = is_rev(start_pos) + ? 
std::numeric_limits::max() + : node_len - get_offset(start_pos); + + // Graph node of the start and end of the current node(/snarl/chain) pointing + // out + net_handle_t current_net = + distance_index.get_node_net_handle(get_id(start_pos)); + net_handle_t parent = distance_index.start_end_traversal_of( + distance_index.get_parent(current_net)); + + // The id and orientation of nodes that are too close and should be avoided + hash_set> seen_nodes; + // Nodes that we want to start a search from - the distance is smaller or + // equal to than min_distance but we can't walk out any further along the + // snarl tree without exceeding it The distance is the distance from the start + // position to the beginning (or end if its backwards) of the node, including + // the position + vector> search_start_nodes; + + if (((current_distance_left != std::numeric_limits::max() && + current_distance_left > min_distance) || + (current_distance_right != std::numeric_limits::max() && + current_distance_right > min_distance)) || + (distance_index.is_trivial_chain(parent) && + distance_index.distance_in_parent(distance_index.get_parent(parent), + parent, + distance_index.flip(parent)) == 0 && + node_len * 2 > min_distance)) { + // If the distance to either end of the node is within the range + // Or of there is a loop on the node ( a duplication of just the node) and + // the node length would put one loop in the distance range + + // Add this node to the subgraph + subgraph.emplace(get_id(start_pos)); + + handle_t start = is_rev(start_pos) + ? distance_index.get_handle( + distance_index.flip(current_net), super_graph) + : distance_index.get_handle(current_net, super_graph); + + // Add any node one step out from this one to search_start_nodes + super_graph->follow_edges(start, false, [&](const handle_t &next_handle) { + search_start_nodes.emplace_back( + next_handle, + is_rev(start_pos) ? 
current_distance_left : current_distance_right); + }); + + // Search for reachable nodes + subgraph_in_distance_range_walk_graph( + super_graph, min_distance, max_distance, subgraph, search_start_nodes, + seen_nodes, traversal_start); + + return; + } + + while (!distance_index.is_root(parent)) { +#ifdef debug_subgraph + cerr << "At child " << distance_index.net_handle_as_string(current_net) + << " with distances " << current_distance_left << " " + << current_distance_right << endl; + cerr << "Parent is " << distance_index.net_handle_as_string(parent) + << " at offset " << SnarlDistanceIndex::get_record_offset(parent) + << endl; +#endif + + size_t max_parent_length = distance_index.maximum_length(parent); + + // Distances to get to the ends of the parent + size_t distance_start_left = SnarlDistanceIndex::sum( + current_distance_left, + distance_index.distance_to_parent_bound( + parent, true, distance_index.flip(current_net))); + size_t distance_start_right = SnarlDistanceIndex::sum( + current_distance_right, + distance_index.distance_to_parent_bound(parent, true, current_net)); + size_t distance_end_left = SnarlDistanceIndex::sum( + current_distance_left, + distance_index.distance_to_parent_bound( + parent, false, distance_index.flip(current_net))); + size_t distance_end_right = SnarlDistanceIndex::sum( + current_distance_right, + distance_index.distance_to_parent_bound(parent, false, current_net)); + + if ((current_distance_right != std::numeric_limits::max() && + current_distance_right >= min_distance) || + (current_distance_left != std::numeric_limits::max() && + current_distance_left >= min_distance) || + (distance_start_right != std::numeric_limits::max() && + distance_start_right >= min_distance) || + (distance_end_right != std::numeric_limits::max() && + distance_end_right >= min_distance) || + (distance_start_left != std::numeric_limits::max() && + distance_start_left >= min_distance) || + (distance_end_left != std::numeric_limits::max() && + 
distance_end_left >= min_distance) || + (max_parent_length != std::numeric_limits::max() && + max_parent_length >= min_distance)) { + // If the min distance will be exceeded within this parent, then start a + // search from the ends of this child + + if (distance_index.is_snarl(parent)) { + // If this is the child of a snarl, then just traverse from the end of + // the node +#ifdef debug_subgraph + cerr << "Start search in parent " + << distance_index.net_handle_as_string(parent); +#endif + if (current_distance_left != std::numeric_limits::max()) { + // If we can go left + net_handle_t bound = + distance_index.is_node(current_net) + ? distance_index.flip(current_net) + : distance_index.get_bound(current_net, false, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + // Add everything immediately after the left bound of this node/chain + super_graph->follow_edges( + distance_index.get_handle(bound, super_graph), false, + [&](const handle_t &next_handle) { + seen_nodes.erase( + make_pair(super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))); + search_start_nodes.emplace_back(next_handle, + current_distance_left); + }); + +#ifdef debug_subgraph + cerr << " going left from " << super_graph->get_id(current_node) + << (super_graph->get_is_reverse(current_node) ? "rev " : "fd "); +#endif + } + if (current_distance_right != std::numeric_limits::max()) { + // If we can go right + net_handle_t bound = + distance_index.is_node(current_net) + ? 
current_net + : distance_index.get_bound(current_net, true, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + + // Add everything immediately after the right bound of this node/chain + super_graph->follow_edges( + distance_index.get_handle(bound, super_graph), false, + [&](const handle_t &next_handle) { + seen_nodes.erase( + make_pair(super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))); + search_start_nodes.emplace_back(next_handle, + current_distance_right); + }); + +#ifdef debug_subgraph + cerr << " going right from " << super_graph->get_id(current_node) + << (super_graph->get_is_reverse(current_node) ? "rev " : "fd "); +#endif + } +#ifdef debug_subgraph + cerr << endl; +#endif + } else { +#ifdef debug_subgraph + cerr << "Start search along parent chain " + << distance_index.net_handle_as_string(parent); +#endif + // If this is the child of a chain, then traverse along the chain + if (current_distance_left != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain( + distance_index, super_graph, subgraph, + distance_index.flip(current_net), current_distance_left, + search_start_nodes, seen_nodes, min_distance, max_distance, + false); + } + if (current_distance_right != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain( + distance_index, super_graph, subgraph, current_net, + current_distance_right, search_start_nodes, seen_nodes, + min_distance, max_distance, false); + } + } + subgraph_in_distance_range_walk_graph( + super_graph, min_distance, max_distance, subgraph, search_start_nodes, + seen_nodes, traversal_start); + return; + } else if (distance_index.is_snarl(parent)) { +#ifdef debug_subgraph + cerr << "Parent is a snarl of handle type " + << SnarlDistanceIndex::get_handle_type(parent) << " at offset " + << SnarlDistanceIndex::get_record_offset(parent) 
<< endl; +#endif + // TODO: This might be overkill. It prevents us from adding nodes that + // shouldn't be in the subgraph, but might be too slow If we don't check + // the other direction, go through the loop and add everything whose + // distance is lower than the minimum to seen_nodes + vector> loop_handles_to_check; + handle_t start_out = distance_index.get_handle( + distance_index.get_bound(parent, false, false), super_graph); + handle_t end_out = distance_index.get_handle( + distance_index.get_bound(parent, true, false), super_graph); + if (current_distance_left != std::numeric_limits::max()) { + loop_handles_to_check.emplace_back( + distance_index.get_handle( + distance_index.get_bound(current_net, false, false), + super_graph), + current_distance_left); + } + if (current_distance_right != std::numeric_limits::max()) { + loop_handles_to_check.emplace_back( + distance_index.get_handle( + distance_index.get_bound(current_net, true, false), + super_graph), + current_distance_right); + } + while (!loop_handles_to_check.empty()) { + handle_t current_loop_handle = loop_handles_to_check.back().first; + size_t current_loop_distance = loop_handles_to_check.back().second; + loop_handles_to_check.pop_back(); + + // Add to seen_nodes + seen_nodes.emplace(super_graph->get_id(current_loop_handle), + super_graph->get_is_reverse(current_loop_handle)); + + // Walk one step out from this node + super_graph->follow_edges( + current_loop_handle, false, [&](const handle_t &next_handle) { + // If the next node is close enough and isn't exiting the snarl, + // then add it to stack + size_t new_distance = SnarlDistanceIndex::sum( + current_loop_distance, super_graph->get_length(next_handle)); + if (new_distance < min_distance && next_handle != start_out && + next_handle != end_out && + seen_nodes.count(std::make_pair( + super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))) == 0) { + loop_handles_to_check.emplace_back(next_handle, new_distance); + } + }); + 
} + } else if (distance_index.is_chain(parent)) { +#ifdef debug_subgraph + cerr << "Parent is a chain of handle type " + << SnarlDistanceIndex::get_handle_type(parent) << " at offset " + << SnarlDistanceIndex::get_record_offset(parent) << endl; +#endif + // TODO: This is probably also overkill - walk a chain if there is a + // viable loop + size_t distance_loop_right = distance_index.distance_in_parent( + parent, current_net, current_net, super_graph, max_distance); + size_t distance_loop_left = distance_index.distance_in_parent( + parent, distance_index.flip(current_net), + distance_index.flip(current_net), super_graph, max_distance); + if ((current_distance_left != std::numeric_limits::max() && + distance_loop_left != std::numeric_limits::max()) || + (current_distance_right != std::numeric_limits::max() && + distance_loop_right != std::numeric_limits::max())) { + // If there is a loop that we can take, then take it + if (current_distance_left != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain( + distance_index, super_graph, subgraph, + distance_index.flip(current_net), current_distance_left, + search_start_nodes, seen_nodes, min_distance, max_distance, + false); + } + if (current_distance_right != std::numeric_limits::max()) { + subgraph_in_distance_range_walk_across_chain( + distance_index, super_graph, subgraph, current_net, + current_distance_right, search_start_nodes, seen_nodes, + min_distance, max_distance, false); + } + subgraph_in_distance_range_walk_graph( + super_graph, min_distance, max_distance, subgraph, + search_start_nodes, seen_nodes, traversal_start); + return; + } + } + + // Remember the bounds of this child so we don't return to it + if (current_distance_left != std::numeric_limits::max()) { + // If we can go left + net_handle_t bound = + distance_index.is_node(current_net) + ? 
distance_index.flip(current_net) + : distance_index.get_bound(current_net, false, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + seen_nodes.emplace(super_graph->get_id(current_node), + super_graph->get_is_reverse(current_node)); + } + if (current_distance_right != std::numeric_limits::max()) { + // If we can go right + net_handle_t bound = + distance_index.is_node(current_net) + ? current_net + : distance_index.get_bound(current_net, true, false); + if (distance_index.is_sentinel(bound)) { + bound = distance_index.get_node_from_sentinel(bound); + } + handle_t current_node = distance_index.get_handle(bound, super_graph); + seen_nodes.emplace(super_graph->get_id(current_node), + super_graph->get_is_reverse(current_node)); + } + + current_distance_left = std::min(distance_start_left, distance_start_right); + current_distance_right = std::min(distance_end_left, distance_end_right); + + current_net = std::move(parent); + parent = distance_index.canonical(distance_index.get_parent(current_net)); + } + if (current_distance_left <= min_distance) { +#ifdef debug_subgraph + cerr << "Adding the end of a child of the root " + << distance_index.net_handle_as_string( + distance_index.get_bound(current_net, false, false)) + << " with distance " << current_distance_left << endl; +#endif + + handle_t bound = distance_index.get_handle( + distance_index.get_bound(current_net, false, false), super_graph); + search_start_nodes.emplace_back(bound, current_distance_left); + } + if (current_distance_right <= min_distance) { +#ifdef debug_subgraph + cerr << "Adding the end of a child of the root " + << distance_index.net_handle_as_string( + distance_index.get_bound(current_net, false, false)) + << " with distance " << current_distance_right << endl; +#endif + handle_t bound = distance_index.get_handle( + distance_index.get_bound(current_net, true, 
false), super_graph); + search_start_nodes.emplace_back(bound, current_distance_right); + } + subgraph_in_distance_range_walk_graph(super_graph, min_distance, max_distance, + subgraph, search_start_nodes, + seen_nodes, traversal_start); + + return; +} + +/// Helper for subgraph_in_distance_range +/// Given starting handles in the super graph and the distances to each handle +/// (including the start position and +// the first position in the handle), add all nodes within the distance range, +// excluding nodes in seen_nodes +void subgraph_in_distance_range_walk_graph( + const HandleGraph *super_graph, size_t min_distance, size_t max_distance, + std::unordered_set &subgraph, + vector> &start_nodes, + hash_set> &seen_nodes, + const pair &traversal_start) { +#ifdef debug_subgraph + cerr << "Starting search from nodes " << endl; + for (auto &start_handle : start_nodes) { + cerr << "\t" << super_graph->get_id(start_handle.first) << " " + << super_graph->get_is_reverse(start_handle.first) << " with distance " + << start_handle.second << endl; + } +#endif + + // Order based on the distance to the position (handle) + auto cmp = [](const pair a, + const pair b) { return a.second > b.second; }; + priority_queue, vector>, + decltype(cmp)> + next_handles(cmp); + for (auto &start_handle : start_nodes) { + next_handles.emplace(start_handle); + } + bool first_node = true; + + while (next_handles.size() > 0) { + // Traverse the graph, adding nodes if they are within the range + handle_t curr_handle = next_handles.top().first; + size_t curr_distance = next_handles.top().second; + next_handles.pop(); +#ifdef debug_subgraph + cerr << "At node " << super_graph->get_id(curr_handle) << " " + << super_graph->get_is_reverse(curr_handle) << " with distance " + << curr_distance << endl; +#endif + if (seen_nodes.count(make_pair(super_graph->get_id(curr_handle), + super_graph->get_is_reverse(curr_handle))) == + 0) { + seen_nodes.emplace(super_graph->get_id(curr_handle), + 
super_graph->get_is_reverse(curr_handle)); + + size_t node_len = super_graph->get_length(curr_handle); + size_t curr_distance_end = + SnarlDistanceIndex::sum(curr_distance, node_len) - 1; + if ((curr_distance >= min_distance && curr_distance <= max_distance) || + (curr_distance_end >= min_distance && + curr_distance_end <= max_distance) || + (curr_distance <= min_distance && + curr_distance_end >= max_distance)) { +#ifdef debug_subgraph + cerr << "\tadding node " << super_graph->get_id(curr_handle) << " " + << super_graph->get_is_reverse(curr_handle) << " with distance " + << curr_distance << " and node length " << node_len << endl; +#endif + subgraph.insert(super_graph->get_id(curr_handle)); + + } +#ifdef debug_subgraph + else { + cerr << "\tdisregarding node " << super_graph->get_id(curr_handle) + << " " << super_graph->get_is_reverse(curr_handle) + << " with distance " << curr_distance << " and node length " + << node_len << endl; + } +#endif + curr_distance = SnarlDistanceIndex::sum(node_len, curr_distance); + + // If the end of this node is still within the range, add the next nodes + // that are within Also check that the node we're currently at isn't the + // start node + if (SnarlDistanceIndex::minus(curr_distance, 1) <= max_distance) { + super_graph->follow_edges( + curr_handle, false, [&](const handle_t &next) { + nid_t next_id = super_graph->get_id(next); + if (seen_nodes.count(make_pair( + next_id, super_graph->get_is_reverse(next))) == 0) { + next_handles.emplace(next, curr_distance); + } + return true; + }); + } + first_node = false; + } +#ifdef debug_subgraph + else { + cerr << "\tthe node was already seen" << endl; + } +#endif + } + +#ifdef debug_subgraph + cerr << "Subgraph has nodes: "; + for (const nid_t &node : subgraph) { + cerr << node << ", "; + } + cerr << endl; +#endif + return; +} +// helper function to walk along a chain from the current node until the +// distance traversed exceeds the minimum limit. 
Add the node just before this +// happens to search_start_nodes +void subgraph_in_distance_range_walk_across_chain( + const SnarlDistanceIndex &distance_index, const HandleGraph *super_graph, + std::unordered_set &subgraph, net_handle_t current_node, + size_t current_distance, vector> &search_start_nodes, + hash_set> &seen_nodes, const size_t &min_distance, + const size_t &max_distance, bool checked_loop) { +#ifdef debug_subgraph + cerr << "Walk along parent chain " + << distance_index.net_handle_as_string( + distance_index.get_parent(current_node)) + << " from " << distance_index.net_handle_as_string(current_node) + << " with " << current_distance << endl; +#endif + if (distance_index.is_trivial_chain( + distance_index.get_parent(current_node))) { + return; + } + bool finished_chain = false; + bool added_nodes = + false; // Did we start a search? if not, add the last node in the chain + while (current_distance <= min_distance && !finished_chain) { + finished_chain = distance_index.follow_net_edges( + current_node, super_graph, false, [&](const net_handle_t &next) { + size_t next_length = distance_index.minimum_length(next); + // If the next child is a snarl, then the distance to loop in the + // snarl + if (distance_index.is_snarl(next)) { + net_handle_t bound_fd = distance_index.get_bound( + next, distance_index.ends_at(next) == SnarlDistanceIndex::START, + true); + size_t next_loop = distance_index.distance_in_parent( + next, bound_fd, bound_fd, super_graph, max_distance); + if (!checked_loop && + next_loop != std::numeric_limits::max()) { +#ifdef debug_subgraph + cerr << "\tsnarl loops so also check the other direction" << endl; +#endif + // If we haven't yet checked the chain in the other direction and + // this snarl allows us to loop + if (SnarlDistanceIndex::sum(next_loop, current_distance) != + std::numeric_limits::max() && + SnarlDistanceIndex::sum( + SnarlDistanceIndex::sum(next_loop, current_distance), + distance_index.node_length(current_node)) >= + 
min_distance) { +#ifdef debug_subgraph + cerr << "\t\t add the current node" << endl; +#endif + // If the loop will put us over the edge, then start from the + // current node + super_graph->follow_edges( + distance_index.get_handle(current_node, super_graph), false, + [&](const handle_t &next_handle) { + search_start_nodes.emplace_back(next_handle, + current_distance); + }); + return true; + } else { + // Otherwise, switch direction in the chain and walk along it + // again + subgraph_in_distance_range_walk_across_chain( + distance_index, super_graph, subgraph, + distance_index.flip(current_node), + SnarlDistanceIndex::sum( + SnarlDistanceIndex::sum(current_distance, next_loop), + distance_index.node_length(current_node)), + search_start_nodes, seen_nodes, min_distance, max_distance, + true); + checked_loop = true; + } + } + if (next_loop != std::numeric_limits::max()) { + // TODO: This might be overkill. It prevents us from adding nodes + // that shouldn't be in the subgraph, but might be too slow If we + // don't check the other direction, go through the loop and add + // everything whose distance is lower than the minimum to + // seen_nodes + vector> loop_handles_to_check; + handle_t start_out = distance_index.get_handle( + distance_index.get_bound(next, false, false), super_graph); + handle_t end_out = distance_index.get_handle( + distance_index.get_bound(next, true, false), super_graph); + loop_handles_to_check.emplace_back( + distance_index.get_handle(bound_fd, super_graph), + current_distance); + while (!loop_handles_to_check.empty()) { + handle_t current_loop_handle = + loop_handles_to_check.back().first; + size_t current_loop_distance = + loop_handles_to_check.back().second; + loop_handles_to_check.pop_back(); + + // Add to seen_nodes + seen_nodes.emplace( + super_graph->get_id(current_loop_handle), + super_graph->get_is_reverse(current_loop_handle)); + + // Walk one step out from this node + super_graph->follow_edges( + current_loop_handle, false, + 
[&](const handle_t &next_handle) { + // If the next node is close enough and isn't exiting the + // snarl, then add it to stack + size_t new_distance = SnarlDistanceIndex::sum( + current_loop_distance, + super_graph->get_length(next_handle)); + if (new_distance < min_distance && + next_handle != start_out && next_handle != end_out && + seen_nodes.count(std::make_pair( + super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))) == 0) { + loop_handles_to_check.emplace_back(next_handle, + new_distance); + } + }); + } + } + } + size_t next_max_length = distance_index.maximum_length(next); +#ifdef debug_subgraph + cerr << "\tnext node: " << distance_index.net_handle_as_string(next) + << " with distance " << current_distance + << " and min and max lengths " << next_length << " " + << next_max_length << endl; +#endif + if ((SnarlDistanceIndex::sum(next_max_length, current_distance) != + std::numeric_limits::max() && + SnarlDistanceIndex::sum(next_max_length, current_distance) >= + min_distance)) { + if (distance_index.is_node(next)) { + size_t curr_distance_end = SnarlDistanceIndex::minus( + SnarlDistanceIndex::sum(next_max_length, current_distance), + 1); + // If its a node that puts us over, add the node to the subgraph, then + // start the search from that node +#ifdef debug_subgraph + cerr << "\t\tAdding node from a chain " + << distance_index.net_handle_as_string(next) + << " with distance " << current_distance << endl; +#endif + if ((current_distance >= min_distance && + current_distance <= max_distance) || + (curr_distance_end >= min_distance && + curr_distance_end <= max_distance) || + (current_distance <= min_distance && + curr_distance_end >= max_distance)) { + subgraph.emplace(distance_index.node_id(next)); + } + super_graph->follow_edges( + distance_index.get_handle(next, super_graph), false, + [&](const handle_t &next_handle) { + search_start_nodes.emplace_back( + next_handle, + SnarlDistanceIndex::sum(current_distance, next_length)); + 
seen_nodes.erase( + make_pair(super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))); + }); + } else { + // If it's a snarl, then we'll start from the last node +#ifdef debug_subgraph + cerr << "\t\tAdding node from a chain " + << distance_index.net_handle_as_string(next) + << " with distance " << current_distance << endl; +#endif + super_graph->follow_edges( + distance_index.get_handle(current_node, super_graph), false, + [&](const handle_t &next_handle) { + search_start_nodes.emplace_back(next_handle, + current_distance); + seen_nodes.erase( + make_pair(super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))); + }); + } + // If we added something, stop traversing the chain + added_nodes = true; + return true; + } else if (distance_index.is_node(next)) { + seen_nodes.emplace(distance_index.node_id(next), + distance_index.ends_at(next) == + SnarlDistanceIndex::START); + } + current_node = next; + current_distance = + SnarlDistanceIndex::sum(next_length, current_distance); + if (current_distance > max_distance) { + added_nodes = true; + return true; + } else { + return false; + } + }); + } + if (!added_nodes && current_distance <= max_distance) { + // If we haven't added anything and haven't exceeded the distance limit, + // then start from the end of the chain + handle_t bound = distance_index.get_handle(current_node, super_graph); + + super_graph->follow_edges(bound, false, [&](const handle_t &next_handle) { + search_start_nodes.emplace_back(next_handle, current_distance); + seen_nodes.erase(make_pair(super_graph->get_id(next_handle), + super_graph->get_is_reverse(next_handle))); + }); + // seen_nodes.erase(make_pair(super_graph->get_id(bound), + // super_graph->get_is_reverse(bound))); search_start_nodes.emplace_back( + // bound, current_distance); + } +}; + +} // namespace vg diff --git a/src/snarls.cpp b/src/snarls.cpp index abaa507681..004021d6aa 100644 --- a/src/snarls.cpp +++ b/src/snarls.cpp @@ -10,6 +10,7 @@ 
#include "snarls.hpp" #include "vg/io/json2pb.h" #include "subgraph_overlay.hpp" +#include "crash.hpp" namespace vg { @@ -20,7 +21,7 @@ SnarlManager SnarlFinder::find_snarls_parallel() { } HandleGraphSnarlFinder::HandleGraphSnarlFinder(const HandleGraph* graph) : graph(graph) { - // Nothing to do! + crash_unless(graph != nullptr); } SnarlManager HandleGraphSnarlFinder::find_snarls_unindexed() { diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 4460e43d9e..11c09b9c86 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -801,7 +801,7 @@ int main_call(int argc, char** argv) { unique_ptr alignment_emitter; if (gaf_output) { - alignment_emitter = vg::io::get_non_hts_alignment_emitter("-", "GAF", {}, get_thread_count(), graph); + alignment_emitter = vg::io::get_non_hts_alignment_emitter("-", "GAF", {}, vg::get_thread_count(), graph); // TODO: There should be a general function for emitting headers. See giraffe_main.cpp. io::GafAlignmentEmitter* gaf_emitter = dynamic_cast(alignment_emitter.get()); if (gbz_graph.get() != nullptr && gaf_emitter != nullptr) { diff --git a/src/subcommand/gampcompare_main.cpp b/src/subcommand/gampcompare_main.cpp index 01a5d59717..96bcd85ec0 100644 --- a/src/subcommand/gampcompare_main.cpp +++ b/src/subcommand/gampcompare_main.cpp @@ -215,8 +215,13 @@ int main_gampcompare(int argc, char** argv) { for (size_t j = 0; j < path_mapped_positions.size(); ++j) { if (path_true_positions[i].second == path_mapped_positions[j].second) { // there is a pair of positions on the same strand of the same path - abs_dist = min(abs_dist, - std::abs(static_cast(path_true_positions[i].first) - static_cast(path_mapped_positions[j].first))); + abs_dist = min( + abs_dist, + std::abs( + static_cast(path_true_positions[i].first) - + static_cast(path_mapped_positions[j].first) + ) + ); } } } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index a9cab2c577..f1cf0250a5 100644 --- 
a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1756,42 +1756,47 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Minimizer Index" << endl; } + IndexName minimizer_indexname; unique_ptr minimizer_index; MinimizerIndexParameters::PayloadType payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES; if (map_long_reads) { if (use_path_minimizer) { - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read PathMinimizers").at(0)); + minimizer_indexname = "Long Read PathMinimizers"; payload_type = MinimizerIndexParameters::PAYLOAD_ZIPCODES_WITH_PATHS; } else { // Use the long read minimizers - minimizer_index = vg::io::VPKG::load_one(registry.require("Long Read Minimizers").at(0)); + minimizer_indexname = "Long Read Minimizers"; } } else { - minimizer_index = vg::io::VPKG::load_one(registry.require("Short Read Minimizers").at(0)); + minimizer_indexname = "Short Read Minimizers"; } + if (!registry.predates("Giraffe Distance Index", minimizer_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(minimizer_indexname).at(0) << " which depends on it" << std::endl; + } + minimizer_index = vg::io::VPKG::load_one(registry.require(minimizer_indexname).at(0)); require_payload(*minimizer_index, payload_type); // Grab the zipcodes if (show_progress) { logger.info() << "Loading Zipcodes" << endl; } + IndexName oversized_zipcodes_indexname; ZipCodeCollection oversized_zipcodes; if (map_long_reads) { if (use_path_minimizer) { - ifstream zip_in (registry.require("Long Read PathZipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read PathZipcodes"; } else { - ifstream zip_in (registry.require("Long Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Long Read Zipcodes"; } - } else { - ifstream zip_in 
(registry.require("Short Read Zipcodes").at(0)); - oversized_zipcodes.deserialize(zip_in); - zip_in.close(); + oversized_zipcodes_indexname = "Short Read Zipcodes"; + } + if (!registry.predates("Giraffe Distance Index", oversized_zipcodes_indexname)) { + logger.error() << registry.require("Giraffe Distance Index").at(0) << " is newer than " << registry.require(oversized_zipcodes_indexname).at(0) << " which depends on it" << std::endl; } + ifstream zip_in (registry.require(oversized_zipcodes_indexname).at(0)); + oversized_zipcodes.deserialize(zip_in); + zip_in.close(); // Grab the GBZ @@ -1805,6 +1810,14 @@ int main_giraffe(int argc, char** argv) { if (show_progress) { logger.info() << "Loading Distance Index" << endl; } + // TODO: Now that we enforce that the minimizer and zipcodes files are + // newer than the distance index, we really shouldn't modify it ourselves + // by fixing any indirect pointers that may still be in it. So we should be + // able to open the file read-only and map the file read-only here, which + // in turn would solve problems with writable mappings being slow on shared + // filesystems even when not being written. But the VPKG system doesn't + // really support doing that, so we'd have to get the file descriptor + // manually and deserialize() on it and close() it later. 
auto distance_index = vg::io::VPKG::load_one(registry.require("Giraffe Distance Index").at(0)); if (show_progress) { diff --git a/src/subcommand/haplotypes_main.cpp b/src/subcommand/haplotypes_main.cpp index f0fa15c20f..4bffd2b852 100644 --- a/src/subcommand/haplotypes_main.cpp +++ b/src/subcommand/haplotypes_main.cpp @@ -965,7 +965,7 @@ void validate_error_sequence(const Logger& logger, size_t chain_id, size_t subch } std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_t to) { - hash_set visited; + vg::hash_set visited; handle_t curr = from; while (curr != to) { if (visited.find(curr) != visited.end()) { @@ -989,7 +989,7 @@ std::string validate_unary_path(const HandleGraph& graph, handle_t from, handle_ // Returns true if the path from (start, offset) reaches the end without revisiting start or leaving the subchain. // The path may continue in subsequent fragments. bool trace_path( - const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const hash_set& subchain_nodes, + const gbwt::GBWT& index, const gbwt::FragmentMap& fragment_map, const vg::hash_set& subchain_nodes, gbwt::size_type sequence_id, gbwt::node_type start, gbwt::size_type offset, gbwt::node_type end ) { gbwt::edge_type pos(start, offset); @@ -1132,8 +1132,8 @@ void validate_chain(const Logger& logger, // Sequences: normal subchains. 
if (subchain.type == Haplotypes::Subchain::normal) { std::vector da = r_index.decompressDA(subchain.start); - hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); - hash_set selected; + vg::hash_set nodes = extract_subchain(graph, gbwtgraph::GBWTGraph::node_to_handle(subchain.start), gbwtgraph::GBWTGraph::node_to_handle(subchain.end)); + vg::hash_set selected; for (size_t i = 0; i < da.size(); i++) { if (trace_path(*(graph.index), fragment_map, nodes, da[i], subchain.start, i, subchain.end)) { selected.insert(Haplotypes::sequence_type(da[i], i)); @@ -1159,7 +1159,7 @@ void validate_chain(const Logger& logger, std::string message = expected_got(da.size(), subchain.sequences.size()) + " sequences (prefix / suffix)"; validate_error_subchain(logger, chain_id, subchain_id, message); } - hash_set truth; + vg::hash_set truth; for (size_t i = 0; i < da.size(); i++) { truth.insert({ da[i], i }); } @@ -1180,7 +1180,7 @@ void validate_chain(const Logger& logger, // Kmers. 
if (subchain.type != Haplotypes::Subchain::full_haplotype) { - hash_set all_kmers; + vg::hash_set all_kmers; for (size_t i = 0; i < subchain.kmers.size(); i++) { all_kmers.insert(subchain.kmers[i]); } @@ -1188,14 +1188,14 @@ void validate_chain(const Logger& logger, std::string message = expected_got(subchain.kmers.size(), all_kmers.size()) + " kmers"; validate_error_subchain(logger, chain_id, subchain_id, message); } - hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) - hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) + vg::hash_map used_kmers; // (kmer used in haplotypes, number of sequences that contain it) + vg::hash_map missing_kmers; // (kmer not used in haplotypes, number of sequences that contain it) for (size_t i = 0; i < subchain.sequences.size(); i++) { std::vector haplotype = get_haplotype( graph, fragment_map, subchain.sequences[i], subchain.start, subchain.end, minimizer_index.k() ); - hash_map unique_minimizers; // (kmer, used in the sequence) + vg::hash_map unique_minimizers; // (kmer, used in the sequence) for (const std::string& sequence : haplotype) { auto minimizers = minimizer_index.minimizers(sequence); for (auto& minimizer : minimizers) { @@ -1322,7 +1322,7 @@ void validate_haplotypes(const Logger& logger, if (verbosity >= HaplotypePartitioner::Verbosity::verbosity_detailed) { logger.info() << "Validating kmer specificity" << std::endl; } - hash_map> kmers; + vg::hash_map> kmers; size_t collisions = 0, total_kmers = 0; for (size_t chain_id = 0; chain_id < haplotypes.components(); chain_id++) { const Haplotypes::TopLevelChain& chain = haplotypes.chains[chain_id]; diff --git a/src/subcommand/inject_main.cpp b/src/subcommand/inject_main.cpp index 6fc7a99b8c..3dd6672fd6 100644 --- a/src/subcommand/inject_main.cpp +++ b/src/subcommand/inject_main.cpp @@ -138,7 +138,7 @@ int main_inject(int argc, char** argv) { set_crash_context(aln.name()); if (add_identity) { // 
Calculate & save identity statistic - aln.set_identity(identity(aln.path())); + aln.set_identity(vg::identity(aln.path())); } if (rescore) { // Rescore the alignment diff --git a/src/subcommand/minimizer_main.cpp b/src/subcommand/minimizer_main.cpp index 8635df2a97..53be9c44d6 100644 --- a/src/subcommand/minimizer_main.cpp +++ b/src/subcommand/minimizer_main.cpp @@ -91,6 +91,12 @@ int main_minimizer(int argc, char** argv) { logger.info() << "Loading SnarlDistanceIndex from " << config.distance_name << std::endl; } distance_index = vg::io::VPKG::load_one(config.distance_name); + // Preload the index eagerly to establish it as recently-used in the OS + // page cache. Even though kmer counting may evict some pages, we + // re-preload right before cache_payloads. The double-preload is + // necessary: a single preload just before cache_payloads isn't enough + // to keep the index resident under the memory pressure of 32 parallel + // threads and the remaining in-memory data structures. distance_index->preload(true); } @@ -102,13 +108,16 @@ int main_minimizer(int argc, char** argv) { config.params ); - // Serialize the index and the oversized zipcodes. + // Close the distance index so it can't seem to be modified after the files + // that depend on it. + distance_index.reset(); + + // Serialize the minimizer index and the oversized zipcodes. 
save_minimizer(index, config.output_name); if (!config.zipcode_name.empty()) { std::ofstream zip_out(config.zipcode_name); oversized_zipcodes.serialize(zip_out); zip_out.close(); - } if (config.progress) { diff --git a/src/subcommand/pack_main.cpp b/src/subcommand/pack_main.cpp index 8d6d7155a9..10146cd2ff 100644 --- a/src/subcommand/pack_main.cpp +++ b/src/subcommand/pack_main.cpp @@ -200,7 +200,7 @@ int main_pack(int argc, char** argv) { // use some naive heuristics to come up with bin count and batch size based on thread count // more bins: finer grained parallelism at cost of more mutexes and allocations // bigger batch size: more robustness to sorted input at cost of less parallelism - size_t num_threads = get_thread_count(); + size_t num_threads = vg::get_thread_count(); size_t batch_size = Packer::estimate_batch_size(num_threads); size_t bin_count = Packer::estimate_bin_count(num_threads); diff --git a/src/unittest/banded_global_aligner.cpp b/src/unittest/banded_global_aligner.cpp index 045e9bfa97..6b5fb4b3c8 100644 --- a/src/unittest/banded_global_aligner.cpp +++ b/src/unittest/banded_global_aligner.cpp @@ -10,7 +10,7 @@ #include "vg.hpp" #include "path.hpp" #include "banded_global_aligner.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "bdsg/hash_graph.hpp" #include "../algorithms/pad_band.hpp" @@ -3515,10 +3515,9 @@ namespace vg { SECTION( "Banded global aligner does not produce empty edits when there is an insertion an empty node") { string graph_json = R"({"edge": [{"to_end": true, "from_start": true, "to": 22, "from": 20}, {"to": 26, "from": 20}, {"to": 24, "from": 20}, {"to_end": true, "from_start": true, "to": 26, "from": 4}, {"to_end": true, "from_start": true, "to": 24, "from": 4}], "node": [{"sequence": "C", "id": 24}, {"sequence": "GAGA", "id": 20}, {"sequence": "T", "id": 26}, {"sequence": "GGAGTCT", "id": 4}, {"id": 22}]})"; - - Graph graph; - json2pb(graph, graph_json.c_str(), graph_json.size()); - VG 
vg_graph(graph); + + bdsg::HashGraph vg_graph; + vg::io::json2graph(graph_json, &vg_graph); TestAligner aligner_source; const Aligner& aligner = *aligner_source.get_regular_aligner(); diff --git a/src/unittest/cactus.cpp b/src/unittest/cactus.cpp index 7447ee247d..5e518db4ef 100644 --- a/src/unittest/cactus.cpp +++ b/src/unittest/cactus.cpp @@ -5,8 +5,9 @@ #include #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../cactus.hpp" +#include #include "catch.hpp" namespace vg { @@ -14,9 +15,7 @@ namespace unittest { using namespace std; TEST_CASE("We can convert a two-tailed graph to Cactus", "[cactus]") { - - VG graph; - + string graph_json = R"( {"node":[{"sequence":"GT","id":7575}, {"sequence":"TGTTAACAGCACAACATTTA","id":7580}, @@ -25,20 +24,18 @@ TEST_CASE("We can convert a two-tailed graph to Cactus", "[cactus]") { "edge":[{"from":7575,"to":7580,"from_start":true}, {"from":7575,"to":7576}]} )"; - - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - // Make sure we can make a Cactus graph and get something out. + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + // Make sure we can make a Cactus graph and get something out. auto cactusified = cactusify(graph); REQUIRE(cactusified.is_valid()); } TEST_CASE("We can convert a hairpin graph to Cactus", "[cactus]") { - VG graph; - + // Here's a graph where only the left side of node 2 is dangling, and the right side of node 1 has a self loop. string graph_json = R"( {"node": [{"sequence": "A", "id": 1}, @@ -46,12 +43,11 @@ TEST_CASE("We can convert a hairpin graph to Cactus", "[cactus]") { "edge": [{"from": 2, "to": 1}, {"from": 1, "to": 1, "to_end": true}]} )"; - - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - // Make sure we can make a Cactus graph and get something out. + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + // Make sure we can make a Cactus graph and get something out. 
auto cactusified = cactusify(graph); REQUIRE(cactusified.is_valid()); } diff --git a/src/unittest/chunker.cpp b/src/unittest/chunker.cpp index 24f7d3b645..3be2298c15 100644 --- a/src/unittest/chunker.cpp +++ b/src/unittest/chunker.cpp @@ -7,6 +7,8 @@ #include "vg.hpp" #include "xg.hpp" #include "path.hpp" +#include "../io/json2graph.hpp" +#include namespace vg { namespace unittest { @@ -83,13 +85,13 @@ TEST_CASE("basic graph chunking", "[chunk]") { )"; - // Load it into Protobuf - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - + // Load the graph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Pass it over to XG xg::XG index; - index.from_path_handle_graph(VG(chunk)); + index.from_path_handle_graph(graph); PathChunker chunker(&index); diff --git a/src/unittest/copy_graph.cpp b/src/unittest/copy_graph.cpp index 581b683130..4e7e878075 100644 --- a/src/unittest/copy_graph.cpp +++ b/src/unittest/copy_graph.cpp @@ -1,6 +1,7 @@ #include "catch.hpp" #include "../handle.hpp" #include "../vg.hpp" +#include "../io/json2graph.hpp" #include "xg.hpp" #include "bdsg/packed_graph.hpp" @@ -53,14 +54,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + REQUIRE(xg.get_node_count() == 1); REQUIRE(vg.get_node_count() == 1); } @@ -72,14 +74,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + REQUIRE(xg.get_node_count() == 1); REQUIRE(pg.get_node_count() == 1); } @@ 
-91,14 +94,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_node_count() == 1); REQUIRE(hg.get_node_count() == 1); } @@ -120,19 +124,20 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(vg.get_node_count() == 4); REQUIRE(vg.edge_count() == 4); REQUIRE(vg.length() == 16); - + } TEST_CASE( "copy_handle_graph converter works on graphs with one reversing edge, xg to pg", "[handle][pg][xg]") { string graph_json = R"( @@ -151,14 +156,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); @@ -168,14 +174,14 @@ namespace vg { return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 4); - + } TEST_CASE( "copy_handle_graph converter works on graphs with one reversing edge, xg to hg", "[handle][hg][xg]") { string graph_json = R"( @@ -194,14 +200,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + 
vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); int length = 0; @@ -210,14 +217,14 @@ namespace vg { return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 4); - + } TEST_CASE( "copy_handle_graph converter works on graphs with reversing edges and loops", "[handle][vg][xg]") { string graph_json = R"( @@ -239,14 +246,15 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_handle_graph(&xg, &vg); - + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); @@ -274,26 +282,27 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_handle_graph(&xg, &pg); - + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); - + int length = 0; pg.for_each_handle([&](const handle_t& here) { length += pg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; @@ -321,26 +330,27 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), 
graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_handle_graph(&xg, &hg); - + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); - + int length = 0; hg.for_each_handle([&](const handle_t& here) { length += hg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; @@ -382,16 +392,17 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); VG vg; handlealgs::copy_path_handle_graph(&xg, &vg); - - + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); @@ -444,37 +455,38 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::PackedGraph pg; handlealgs::copy_path_handle_graph(&xg, &pg); - - - + + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(pg.get_node_count() == 4); - + int length = 0; pg.for_each_handle([&](const handle_t& here) { length += pg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; pg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 7); - - + + 
REQUIRE(pg.has_path("path1") == true); REQUIRE(pg.has_path("path2") == true); REQUIRE(pg.get_path_count() == 2); @@ -521,37 +533,38 @@ namespace vg { ] } )"; - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + xg::XG xg; - xg.from_path_handle_graph(VG(proto_graph)); + xg.from_path_handle_graph(source); bdsg::HashGraph hg; handlealgs::copy_path_handle_graph(&xg, &hg); - - - + + + REQUIRE(xg.get_sequence(xg.get_handle(1)) == "GATT"); REQUIRE(xg.get_sequence(xg.get_handle(3)) == "CGAT"); REQUIRE(xg.get_node_count() == 4); REQUIRE(hg.get_node_count() == 4); - - + + int length = 0; hg.for_each_handle([&](const handle_t& here) { length += hg.get_length(here); return true; }); REQUIRE(length == 16); - + int edge_count = 0; hg.for_each_edge([&](const edge_t& edge) { edge_count += 1; return true; }); REQUIRE(edge_count == 7); - - + + REQUIRE(hg.has_path("path1") == true); REQUIRE(hg.has_path("path2") == true); REQUIRE(hg.get_path_count() == 2); diff --git a/src/unittest/dijkstra.cpp b/src/unittest/dijkstra.cpp index 2608567153..4e94414040 100644 --- a/src/unittest/dijkstra.cpp +++ b/src/unittest/dijkstra.cpp @@ -6,7 +6,7 @@ #include #include #include "../handle.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../vg.hpp" #include "catch.hpp" @@ -125,14 +125,12 @@ TEST_CASE("Dijkstra search handles early stopping correctly", "[dijkstra][algori TEST_CASE("Dijkstra search works on a particular problem graph", "[dijkstra][algorithms]") { string graph_json = R"( 
-{"node":[{"sequence":"A","id":"2454530"},{"sequence":"AGTGCTGGAGAGGATGTGGAGAAATAGGAAC","id":"2454529"},{"sequence":"C","id":"2454532"},{"sequence":"TTTTACACTGTTGGTGGGACTGTAAA","id":"2454533"},{"sequence":"A","id":"2454527"},{"sequence":"C","id":"2454528"},{"sequence":"G","id":"2454531"},{"sequence":"C","id":"2454534"},{"sequence":"T","id":"2454535"},{"sequence":"GGGTAATAA","id":"2454526"},{"sequence":"TAGTTCAACCATTGTGGAAGACTGTGGCAATT","id":"2454536"}],"edge":[{"from":"2454530","to":"2454532"},{"from":"2454530","to":"2454533"},{"from":"2454529","to":"2454530"},{"from":"2454529","to":"2454531"},{"from":"2454532","to":"2454533"},{"from":"2454533","to":"2454534"},{"from":"2454533","to":"2454535"},{"from":"2454527","to":"2454529"},{"from":"2454528","to":"2454529"},{"from":"2454531","to":"2454532"},{"from":"2454531","to":"2454533"},{"from":"2454534","to":"2454536"},{"from":"2454535","to":"2454536"},{"from":"2454526","to":"2454527"},{"from":"2454526","to":"2454528"}],"path":[{"name":"21","mapping":[{"position":{"node_id":"2454526"},"edit":[{"from_length":9,"to_length":9}],"rank":"3049077"},{"position":{"node_id":"2454528"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049078"},{"position":{"node_id":"2454529"},"edit":[{"from_length":31,"to_length":31}],"rank":"3049079"},{"position":{"node_id":"2454531"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049080"},{"position":{"node_id":"2454532"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049081"},{"position":{"node_id":"2454533"},"edit":[{"from_length":26,"to_length":26}],"rank":"3049082"},{"position":{"node_id":"2454535"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049083"},{"position":{"node_id":"2454536"},"edit":[{"from_length":32,"to_length":32}],"rank":"3049084"}]}]} 
+{"node":[{"sequence":"A","id":"2454530"},{"sequence":"AGTGCTGGAGAGGATGTGGAGAAATAGGAAC","id":"2454529"},{"sequence":"C","id":"2454532"},{"sequence":"TTTTACACTGTTGGTGGGACTGTAAA","id":"2454533"},{"sequence":"A","id":"2454527"},{"sequence":"C","id":"2454528"},{"sequence":"G","id":"2454531"},{"sequence":"C","id":"2454534"},{"sequence":"T","id":"2454535"},{"sequence":"GGGTAATAA","id":"2454526"},{"sequence":"TAGTTCAACCATTGTGGAAGACTGTGGCAATT","id":"2454536"}],"edge":[{"from":"2454530","to":"2454532"},{"from":"2454530","to":"2454533"},{"from":"2454529","to":"2454530"},{"from":"2454529","to":"2454531"},{"from":"2454532","to":"2454533"},{"from":"2454533","to":"2454534"},{"from":"2454533","to":"2454535"},{"from":"2454527","to":"2454529"},{"from":"2454528","to":"2454529"},{"from":"2454531","to":"2454532"},{"from":"2454531","to":"2454533"},{"from":"2454534","to":"2454536"},{"from":"2454535","to":"2454536"},{"from":"2454526","to":"2454527"},{"from":"2454526","to":"2454528"}],"path":[{"name":"21","mapping":[{"position":{"node_id":"2454526"},"edit":[{"from_length":9,"to_length":9}],"rank":"3049077"},{"position":{"node_id":"2454528"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049078"},{"position":{"node_id":"2454529"},"edit":[{"from_length":31,"to_length":31}],"rank":"3049079"},{"position":{"node_id":"2454531"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049080"},{"position":{"node_id":"2454532"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049081"},{"position":{"node_id":"2454533"},"edit":[{"from_length":26,"to_length":26}],"rank":"3049082"},{"position":{"node_id":"2454535"},"edit":[{"from_length":1,"to_length":1}],"rank":"3049083"},{"position":{"node_id":"2454536"},"edit":[{"from_length":32,"to_length":32}],"rank":"3049084"}]}]} )"; - - Graph g; - json2pb(g, graph_json); - - // Wrap the graph in a HandleGraph - VG graph(g); + + // Load the graph + HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Decide where to start handle_t start = 
graph.get_handle(2454536, true); diff --git a/src/unittest/gbwt_extender.cpp b/src/unittest/gbwt_extender.cpp index d04a225fdb..4835fbd511 100644 --- a/src/unittest/gbwt_extender.cpp +++ b/src/unittest/gbwt_extender.cpp @@ -5,7 +5,7 @@ #include "../gbwt_extender.hpp" #include "../gbwt_helper.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../utility.hpp" #include "../vg.hpp" @@ -90,10 +90,9 @@ gbwt::GBWT build_gbwt_index() { // Build a GBWTGraph using the provided GBWT index. gbwtgraph::GBWTGraph build_gbwt_graph(const gbwt::GBWT& gbwt_index) { - Graph graph; - json2pb(graph, gapless_extender_graph.c_str(), gapless_extender_graph.size()); - VG vg_graph(graph); - return gbwtgraph::GBWTGraph(gbwt_index, vg_graph, nullptr); + bdsg::HashGraph graph; + vg::io::json2graph(gapless_extender_graph, &graph); + return gbwtgraph::GBWTGraph(gbwt_index, graph, nullptr); } void same_position(const Position& pos, const Position& correct) { diff --git a/src/unittest/genotypekit.cpp b/src/unittest/genotypekit.cpp index af9bc2a4d8..b5d460c59a 100644 --- a/src/unittest/genotypekit.cpp +++ b/src/unittest/genotypekit.cpp @@ -10,6 +10,8 @@ #include "../traversal_finder.hpp" #include "xg.hpp" #include "../haplotype_extracter.hpp" +#include "../io/json2graph.hpp" +#include namespace Catch { @@ -62,10 +64,10 @@ namespace vg { namespace unittest { TEST_CASE("sites can be found with Cactus", "[genotype]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -90,7 +92,7 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -101,14 +103,13 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { ]} ] } - + )"; - + // Make an actual graph + // Note: Using VG here because the test uses VG-specific methods like get_node() and get_edge() VG graph; - Graph chunk; - json2pb(chunk, 
graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -196,10 +197,10 @@ TEST_CASE("sites can be found with Cactus", "[genotype]") { } TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -224,7 +225,7 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -235,14 +236,13 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ ]} ] } - + )"; - + // Make an actual graph + // Note: Using VG here because the test uses VG-specific methods like get_node() and get_edge() VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -329,7 +329,7 @@ TEST_CASE("sites can be found with the IntegratedSnarlFinder", "[genotype][integ } TEST_CASE("IntegratedSnarlFinder works when cactus graph contains back-to-back cycles along root path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -351,17 +351,15 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains back-to-back c {"from": 3, "to": 5}, {"from": 4, "to": 6}, {"from": 5, "to": 6} - + ] } )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -375,18 +373,16 @@ TEST_CASE("IntegratedSnarlFinder works when cactus 
graph contains back-to-back c } TEST_CASE("IntegratedSnarlFinder works on an all bridge edge Y graph with specific numbering", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"node":[{"id":"2","sequence":"G"},{"id":"3","sequence":"G"},{"id":"4","sequence":"G"},{"id":"5","sequence":"G"},{"id":"6","sequence":"G"},{"id":"11","sequence":"G"}], - "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} + "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -403,18 +399,16 @@ TEST_CASE("IntegratedSnarlFinder works on an all bridge edge Y graph with specif } TEST_CASE("IntegratedSnarlFinder roots correctly an all bridge edge Y graph with winning longest path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"node":[{"id":"2","sequence":"G"},{"id":"3","sequence":"G"},{"id":"4","sequence":"GG"},{"id":"5","sequence":"G"},{"id":"6","sequence":"G"},{"id":"11","sequence":"GG"}], - "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} + "edge":[{"from":"2","to":"3"},{"from":"3","to":"6"},{"from":"4","to":"5"},{"from":"5","to":"6"},{"from":"6","to":"11"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -452,7 +446,7 @@ TEST_CASE("IntegratedSnarlFinder roots 
correctly an all bridge edge Y graph with } TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to-back cycles along root path", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -482,17 +476,15 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to {"from": 32, "to": 5}, {"from": 4, "to": 6}, {"from": 5, "to": 6} - + ] } )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -506,50 +498,48 @@ TEST_CASE("IntegratedSnarlFinder works when cactus graph contains longer back-to } TEST_CASE("IntegratedSnarlFinder works on a complex bundle-y region with a nested snarl", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( {"edge": [{"from": "129672", "to": "129673"}, - {"from": "129662", "to": "129663"}, - {"from": "129662", "to": "129664"}, - {"from": "129664", "to": "129665"}, - {"from": "129664", "to": "129666"}, - {"from": "129666", "to": "129668"}, - {"from": "129666", "to": "129669"}, - {"from": "129666", "to": "129667"}, - {"from": "129667", "to": "129668"}, - {"from": "129667", "to": "129669"}, - {"from": "129669", "to": "129670"}, - {"from": "129669", "to": "129673"}, - {"from": "129671", "to": "129672"}, - {"from": "129668", "to": "129670"}, - {"from": "129668", "to": "129673"}, - {"from": "129665", "to": "129668"}, - {"from": "129665", "to": "129669"}, - {"from": "129665", "to": "129667"}, - {"from": "129670", "to": "129671"}, - {"from": "129670", "to": "129672"}, - {"from": "129663", "to": "129665"}, - {"from": "129663", "to": "129666"}], - "node": [{"id": "129672", "sequence": "AT"}, - {"id": "129662", "sequence": "CAGGTCAAACTGTGAT"}, - {"id": "129664", "sequence": "T"}, - {"id": 
"129666", "sequence": "T"}, - {"id": "129667", "sequence": "G"}, - {"id": "129669", "sequence": "G"}, - {"id": "129671", "sequence": "T"}, - {"id": "129668", "sequence": "A"}, - {"id": "129665", "sequence": "A"}, - {"id": "129670", "sequence": "A"}, - {"id": "129673", "sequence": "ATATATATATACTTATTGTAAAAATCTTTAGA"}, + {"from": "129662", "to": "129663"}, + {"from": "129662", "to": "129664"}, + {"from": "129664", "to": "129665"}, + {"from": "129664", "to": "129666"}, + {"from": "129666", "to": "129668"}, + {"from": "129666", "to": "129669"}, + {"from": "129666", "to": "129667"}, + {"from": "129667", "to": "129668"}, + {"from": "129667", "to": "129669"}, + {"from": "129669", "to": "129670"}, + {"from": "129669", "to": "129673"}, + {"from": "129671", "to": "129672"}, + {"from": "129668", "to": "129670"}, + {"from": "129668", "to": "129673"}, + {"from": "129665", "to": "129668"}, + {"from": "129665", "to": "129669"}, + {"from": "129665", "to": "129667"}, + {"from": "129670", "to": "129671"}, + {"from": "129670", "to": "129672"}, + {"from": "129663", "to": "129665"}, + {"from": "129663", "to": "129666"}], + "node": [{"id": "129672", "sequence": "AT"}, + {"id": "129662", "sequence": "CAGGTCAAACTGTGAT"}, + {"id": "129664", "sequence": "T"}, + {"id": "129666", "sequence": "T"}, + {"id": "129667", "sequence": "G"}, + {"id": "129669", "sequence": "G"}, + {"id": "129671", "sequence": "T"}, + {"id": "129668", "sequence": "A"}, + {"id": "129665", "sequence": "A"}, + {"id": "129670", "sequence": "A"}, + {"id": "129673", "sequence": "ATATATATATACTTATTGTAAAAATCTTTAGA"}, {"id": "129663", "sequence": "G"}]} )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -579,23 +569,21 @@ TEST_CASE("IntegratedSnarlFinder works on a complex bundle-y region with a 
neste } TEST_CASE("CactusSnarlFinder safely handles a single node graph", "[genotype][cactus-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "GATTACA"} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -607,15 +595,13 @@ TEST_CASE("CactusSnarlFinder safely handles a single node graph", "[genotype][ca } TEST_CASE("IntegratedSnarlFinder safely handles a completely empty graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = "{}"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -625,7 +611,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a completely empty graph", "[gen } TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -638,10 +624,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -651,7 +635,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node graph", "[genotype } TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node chains", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string 
graph_json = R"( @@ -673,10 +657,8 @@ TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node c )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder IntegratedSnarlFinder finder(graph); @@ -736,7 +718,7 @@ TEST_CASE("IntegratedSnarlFinder produces all the correct types of single-node c } TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at one end", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -757,10 +739,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at on )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -770,7 +750,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a path when forced to root at on } TEST_CASE("IntegratedSnarlFinder safely handles a single node connected component in a larger graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -787,10 +767,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node connected componen )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -813,7 +791,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node connected componen } TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph 
const string graph_json = R"( @@ -828,10 +806,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -844,7 +820,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a single node cycle", "[genotype } TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -866,10 +842,8 @@ TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[ge )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -882,7 +856,7 @@ TEST_CASE("IntegratedSnarlFinder safely handles a totally connected graph", "[ge } TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -903,10 +877,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie" )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -935,7 +907,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a bridge edge path in a tie" } TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer", "[genotype][integrated-snarl-finder]") { - + // Build a toy 
graph const string graph_json = R"( @@ -956,10 +928,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer" )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -988,7 +958,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a cycle that is 1 bp longer" } TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted node", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -1009,10 +979,8 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder that adds 10 bp to node 4's apparent length unique_ptr finder(new IntegratedSnarlFinder(graph, {{4, 10}})); @@ -1041,7 +1009,7 @@ TEST_CASE("IntegratedSnarlFinder prefers to root at a chain with an up-weighted } TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[genotype][integrated-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( @@ -1066,10 +1034,8 @@ TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[gen )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make an IntegratedSnarlFinder unique_ptr finder(new IntegratedSnarlFinder(graph)); @@ -1098,10 +1064,10 @@ TEST_CASE("IntegratedSnarlFinder sees tips as disqualifying ultrabubbles", "[gen } TEST_CASE("CactusSnarlFinder throws an error instead of crashing when 
the graph has no edges", "[genotype][cactus-snarl-finder]") { - + // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -1115,14 +1081,12 @@ TEST_CASE("CactusSnarlFinder throws an error instead of crashing when the graph {"id": 9, "sequence": "A"} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a CactusSnarlFinder unique_ptr finder(new CactusSnarlFinder(graph)); @@ -1183,7 +1147,7 @@ TEST_CASE("fixed priors can be assigned to genotypes", "[genotype]") { TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { // Build a toy graph const string graph_json = R"( - + { "node": [ {"id": 1, "sequence": "G"}, @@ -1208,7 +1172,7 @@ TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, "to": 9} - + ], "path": [ {"name": "hint", "mapping": [ @@ -1219,14 +1183,12 @@ TEST_CASE("TrivialTraversalFinder can find traversals", "[genotype]") { ]} ] } - + )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make a site Snarl site; @@ -1329,12 +1291,10 @@ TEST_CASE("CactusSnarlFinder can differentiate ultrabubbles from snarls", "[geno ] } )"; - + // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1381,10 +1341,8 @@ TEST_CASE("CactusSnarlFinder can differentiate ultrabubbles from snarls", "[geno )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + 
bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1454,10 +1412,8 @@ TEST_CASE("IntegratedSnarlFinder can differentiate ultrabubbles from snarls", "[ )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls IntegratedSnarlFinder cubs(graph); @@ -1504,10 +1460,8 @@ TEST_CASE("IntegratedSnarlFinder can differentiate ultrabubbles from snarls", "[ )"; // Make an actual graph - VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Find the snarls IntegratedSnarlFinder cubs(graph); @@ -1581,11 +1535,9 @@ TEST_CASE("RepresentativeTraversalFinder finds traversals correctly", "[genotype } )"; - // Make an actual graph + // Load the graph. Needs to be a vg because we will give it to a SupportAugmentedGraph later. VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1713,11 +1665,9 @@ TEST_CASE("RepresentativeTraversalFinder finds traversals of simple inversions", } )"; - // Make an actual graph + // Load the graph. Needs to be a vg because we will give it to a SupportAugmentedGraph later. 
VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls CactusSnarlFinder cubs(graph); @@ -1774,11 +1724,11 @@ TEST_CASE("GBWTTraversalFinder finds traversals for GBWT threads", "[genotype][g string graph_json = R"({"node": [{"id": 1, "sequence": "CAAATAAGGCTT"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GGAAATTTTC"}, {"id": 4, "sequence": "C"}, {"id": 5, "sequence": "TGGAGTTCTATTATATTCC"}, {"id": 6, "sequence": "G"}, {"id": 7, "sequence": "A"}, {"id": 8, "sequence": "ACTCTCTGGTTCCTG"}, {"id": 9, "sequence": "A"}, {"id": 10, "sequence": "G"}, {"id": 11, "sequence": "TGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCA"}], "edge": [{"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 2, "to": 3}, {"from": 3, "to": 4}, {"from": 3, "to": 5}, {"from": 4, "to": 5}, {"from": 5, "to": 6}, {"from": 5, "to": 7}, {"from": 6, "to": 8}, {"from": 7, "to": 8}, {"from": 8, "to": 9}, {"from": 8, "to": 10}, {"from": 9, "to": 11}, {"from": 10, "to": 11}]})"; // Load the JSON - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(vg::VG(proto_graph)); + xg_index.from_path_handle_graph(graph); gbwt::Verbosity::set(gbwt::Verbosity::SILENT); diff --git a/src/unittest/genotyper.cpp b/src/unittest/genotyper.cpp index e2e9f7a142..4228b16ee3 100644 --- a/src/unittest/genotyper.cpp +++ b/src/unittest/genotyper.cpp @@ -7,6 +7,7 @@ #include "../snarls.hpp" #include "../cactus_snarl_finder.hpp" #include "../traversal_finder.hpp" +#include "../io/json2graph.hpp" namespace vg { namespace unittest { @@ -41,15 +42,6 @@ TEST_CASE("traversals can be found from reads", "[genotyper]") { {"from": 6, "to": 8}, {"from": 7, "to": 9}, {"from": 8, 
"to": 9} - - ], - "path": [ - {"name": "hint", "mapping": [ - {"position": {"node_id": 1}, "rank" : 1 }, - {"position": {"node_id": 6}, "rank" : 2 }, - {"position": {"node_id": 8}, "rank" : 3 }, - {"position": {"node_id": 9}, "rank" : 4 } - ]} ] } @@ -57,9 +49,7 @@ TEST_CASE("traversals can be found from reads", "[genotyper]") { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.merge(chunk); + vg::io::json2graph(graph_json, &graph); // Find the snarls SnarlManager manager = CactusSnarlFinder(graph).find_snarls(); diff --git a/src/unittest/haplotypes.cpp b/src/unittest/haplotypes.cpp index e441bbe197..9e4e04475b 100644 --- a/src/unittest/haplotypes.cpp +++ b/src/unittest/haplotypes.cpp @@ -4,8 +4,10 @@ #include "catch.hpp" #include "haplotypes.hpp" +#include "../io/json2graph.hpp" #include "xg.hpp" #include "vg.hpp" +#include #include @@ -66,7 +68,7 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " )"; thread_t SNP_thread = {tm[1], tm[3], tm[4]}; - + string del_graph_json = R"( {"node":[ {"id":1,"sequence":"AAA"}, @@ -89,22 +91,24 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " ]} ]} )"; - + thread_t del_ref_thread = {tm[1], tm[2], tm[4]}; thread_t del_thread = {tm[1], tm[4]}; - - vg::Graph SNP_proto_graph; - json2pb(SNP_proto_graph, SNP_graph_json.c_str(), SNP_graph_json.size()); + + // Build the SNP graph + bdsg::HashGraph SNP_graph; + vg::io::json2graph(SNP_graph_json, &SNP_graph); // Build the xg index xg::XG SNP_xg_index; - SNP_xg_index.from_path_handle_graph(vg::VG(SNP_proto_graph)); + SNP_xg_index.from_path_handle_graph(SNP_graph); vg::path_handle_t SNP_ref_path_handle = SNP_xg_index.get_path_handle("reference"); - - vg::Graph del_proto_graph; - json2pb(del_proto_graph, del_graph_json.c_str(), del_graph_json.size()); + + // Build the del graph + bdsg::HashGraph del_graph; + vg::io::json2graph(del_graph_json, &del_graph); // 
Build the xg index xg::XG del_xg_index; - del_xg_index.from_path_handle_graph(vg::VG(del_proto_graph)); + del_xg_index.from_path_handle_graph(del_graph); vg::path_handle_t del_ref_path_handle = del_xg_index.get_path_handle("reference"); // NEGATIVE SNVs @@ -159,18 +163,20 @@ TEST_CASE("We can represent appropriate graphs according to linear reference", " thread_t double_thread = {tm[1], tm[2], tm[4]}; - vg::Graph long_proto_graph; - json2pb(long_proto_graph, long_graph_json.c_str(), long_graph_json.size()); + // Build the long graph + bdsg::HashGraph long_graph; + vg::io::json2graph(long_graph_json, &long_graph); // Build the xg index xg::XG long_xg_index; - long_xg_index.from_path_handle_graph(vg::VG(long_proto_graph)); + long_xg_index.from_path_handle_graph(long_graph); vg::path_handle_t long_ref_path_handle = long_xg_index.get_path_handle("reference"); - - vg::Graph double_proto_graph; - json2pb(double_proto_graph, double_graph_json.c_str(), double_graph_json.size()); + + // Build the double graph + bdsg::HashGraph double_graph; + vg::io::json2graph(double_graph_json, &double_graph); // Build the xg index xg::XG double_xg_index; - double_xg_index.from_path_handle_graph(vg::VG(double_proto_graph)); + double_xg_index.from_path_handle_graph(double_graph); vg::path_handle_t double_ref_path_handle = double_xg_index.get_path_handle("reference"); string matching_test_file = "matching_test.slls"; @@ -382,13 +388,13 @@ TEST_CASE("We can score haplotypes using GBWT", "[haplo-score][gbwt]") { TEST_CASE("We can recognize a required crossover", "[hapo-score][gbwt]") { // This graph is the start of xy2 from test/small string graph_json = R"({"node": [{"id": 1, "sequence": "CAAATAAGGCTT"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GGAAATTTTC"}, {"id": 4, "sequence": "C"}, {"id": 5, "sequence": "TGGAGTTCTATTATATTCC"}, {"id": 6, "sequence": "G"}, {"id": 7, "sequence": "A"}, {"id": 8, "sequence": "ACTCTCTGGTTCCTG"}, {"id": 9, "sequence": "A"}, {"id": 10, "sequence": 
"G"}, {"id": 11, "sequence": "TGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTTTTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCA"}], "edge": [{"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 2, "to": 3}, {"from": 3, "to": 4}, {"from": 3, "to": 5}, {"from": 4, "to": 5}, {"from": 5, "to": 6}, {"from": 5, "to": 7}, {"from": 6, "to": 8}, {"from": 7, "to": 8}, {"from": 8, "to": 9}, {"from": 8, "to": 10}, {"from": 9, "to": 11}, {"from": 10, "to": 11}]})"; - - // Load the JSON - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(vg::VG(proto_graph)); + xg_index.from_path_handle_graph(graph); gbwt::Verbosity::set(gbwt::Verbosity::SILENT); gbwt::DynamicGBWT* gbwt_index = new gbwt::DynamicGBWT; diff --git a/src/unittest/indexed_vg.cpp b/src/unittest/indexed_vg.cpp index 7f74d92193..27504dea9f 100644 --- a/src/unittest/indexed_vg.cpp +++ b/src/unittest/indexed_vg.cpp @@ -40,7 +40,7 @@ TEST_CASE("An IndexedVG can be created for a single node", "[handle][indexed-vg] ] })"; - // Load the JSON + // Load the JSON to Protobuf specifically. 
Graph proto_graph; json2pb(proto_graph, graph_json.c_str(), graph_json.size()); diff --git a/src/unittest/mapper.cpp b/src/unittest/mapper.cpp index 2caf42d076..17f81fe17b 100644 --- a/src/unittest/mapper.cpp +++ b/src/unittest/mapper.cpp @@ -1,9 +1,10 @@ /// \file mapper.cpp -/// +/// /// unit tests for the mapper #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include #include "../mapper.hpp" @@ -25,14 +26,10 @@ TEST_CASE( "Mapper can map to a one-node graph", "[mapping][mapper]" ) { ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -245,14 +242,10 @@ TEST_CASE( "Mapper finds optimal mapping for read starting with node-border MEM" {"position":{"node_id":1444},"rank":1059}, {"position":{"node_id":1445},"rank":1060}]}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -311,14 +304,10 @@ TEST_CASE( "Mapper can annotate positions correctly on both strands", "[mapper][ ]} ]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); diff --git a/src/unittest/minimizer_mapper.cpp b/src/unittest/minimizer_mapper.cpp index 3ecd5de147..84628276e4 100644 --- a/src/unittest/minimizer_mapper.cpp +++ b/src/unittest/minimizer_mapper.cpp @@ -3,8 +3,8 @@ /// unit tests for the 
minimizer mapper #include -#include "vg/io/json2pb.h" #include "../io/json2graph.hpp" +#include #include #include "../minimizer_mapper.hpp" #include "../build_index.hpp" @@ -450,15 +450,13 @@ TEST_CASE("MinimizerMapper can map an empty string between odd points", "[giraff {"id": "55511925", "sequence": "CTTCCTTCC"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence(""); - + pos_t left_anchor {55511921, false, 5}; // This is on the final base of the node pos_t right_anchor {55511925, false, 6}; @@ -480,7 +478,7 @@ TEST_CASE("MinimizerMapper can map an empty string between odd points", "[giraff TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -492,12 +490,10 @@ TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); @@ -527,7 +523,7 @@ TEST_CASE("MinimizerMapper can map with an initial deletion", "[giraffe][mapping TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -539,12 +535,10 @@ TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), 
graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); @@ -574,7 +568,7 @@ TEST_CASE("MinimizerMapper can map with an initial deletion on a multi-base node TEST_CASE("MinimizerMapper can map right off the past-the-end base", "[giraffe][mapping][right_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [ {"from": "1", "to": "2"}, @@ -586,15 +580,13 @@ TEST_CASE("MinimizerMapper can map right off the past-the-end base", "[giraffe][ {"id": "3", "sequence": "CATTAG"} ] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); - + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + Alignment aln; aln.set_sequence("CATTAG"); - + pos_t left_anchor {1, false, 1}; // This is the past-end position pos_t right_anchor = empty_pos_t(); @@ -635,15 +627,13 @@ TEST_CASE("MinimizerMapper can compute longest detectable gap in range", "[giraf TEST_CASE("MinimizerMapper can find a significant indel instead of a tempting softclip", "[giraffe][mapping][left_tail]") { Aligner aligner; - + string graph_json = R"({ "edge": [{"from": "30788083", "to": "30788088"}, {"from": "30788083", "to": "30788084"}, {"from": "30788074", "to": "30788075"}, {"from": "30788074", "to": "30788076"}, {"from": "30788079", "to": "30788080"}, {"from": "30788079", "to": "30788081"}, {"from": "30788086", "to": "30788088"}, {"from": "30788086", "to": "30788087", "to_end": true}, {"from": "30788075", "to": "30788077"}, {"from": "30788073", "to": "30788074"}, {"from": "30788078", "to": "30788079"}, {"from": "30788077", "to": "30788078"}, {"from": "30788084", "to": "30788088"}, {"from": "30788084", "to": "30788085"}, {"from": "30788076", "to": "30788077"}, {"from": "30788087", "from_start": true, "to": "30788088"}, {"from": "30788081", "to": "30788082"}, {"from": 
"30788080", "to": "30788082"}, {"from": "30788082", "to": "30788088"}, {"from": "30788082", "to": "30788083"}, {"from": "30788085", "to": "30788086"}], "node": [{"id": "30788083", "sequence": "AAA"}, {"id": "30788074", "sequence": "AAAAAAAATACAAAAAATTAGC"}, {"id": "30788079", "sequence": "CGCCACTGCACTCCAGCCTGGGC"}, {"id": "30788086", "sequence": "AAAAAAA"}, {"id": "30788075", "sequence": "T"}, {"id": "30788073", "sequence": "GAAAGAGAGTTGTTTAAATTCCATAGTTAGGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCTACTA"}, {"id": "30788078", "sequence": "G"}, {"id": "30788077", "sequence": "GGGCGTGGTAGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATC"}, {"id": "30788084", "sequence": "A"}, {"id": "30788088", "sequence": "AATTCCATAGTTAGAAAAATAAGACATATCAGGTTTTCAAAAAGTGTAGCCATTTTCTGTTTCTAAAAGGGACACTTAAAGTGAAA"}, {"id": "30788076", "sequence": "C"}, {"id": "30788087", "sequence": "T"}, {"id": "30788081", "sequence": "A"}, {"id": "30788080", "sequence": "G"}, {"id": "30788082", "sequence": "ACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAA"}, {"id": "30788085", "sequence": "AA"}] })"; - - // TODO: Write a json_to_handle_graph - vg::Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - auto graph = vg::VG(proto_graph); + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); Alignment aln; aln.set_sequence("TTGAAAACCTGATATGTCTTATTTTTCTAACTATGGAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCTACCACGCCCGGCTAATTTTTTGTATTTTTTTT"); @@ -854,9 +844,8 @@ TEST_CASE("MinimizerMapper can extract a strand-split dagified local graph witho {"id": "60245278", "sequence": "GATTACAGATTACA"}] } )"; - vg::Graph graph_chunk; - json2pb(graph_chunk, graph_json.c_str(), graph_json.size()); - vg::VG graph(graph_chunk); + bdsg::HashGraph graph; + 
vg::io::json2graph(graph_json, &graph); TestMinimizerMapper::with_dagified_local_graph(make_pos_t(60245283, false, 10), empty_pos_t(), 50, graph, [&](DeletableHandleGraph& dagified_graph, const handle_t& left_anchor_handle, const handle_t& right_anchor_handle, const std::function(const handle_t&)>& dagified_handle_to_base) { // The graph started as a stick diff --git a/src/unittest/multipath_alignment_graph.cpp b/src/unittest/multipath_alignment_graph.cpp index bea5f687aa..d78e19d6f1 100644 --- a/src/unittest/multipath_alignment_graph.cpp +++ b/src/unittest/multipath_alignment_graph.cpp @@ -3,7 +3,8 @@ /// unit tests for the multipath mapper's MultipathAlignmentGraph #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include "../cactus_snarl_finder.hpp" #include "../integrated_snarl_finder.hpp" @@ -47,13 +48,9 @@ TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG vg; - vg.extend(proto_graph); - + bdsg::HashGraph vg; + ::vg::io::json2graph(graph_json, &vg); + // Make snarls on it CactusSnarlFinder bubble_finder(vg); IntegratedSnarlFinder snarl_finder(vg); diff --git a/src/unittest/multipath_mapper.cpp b/src/unittest/multipath_mapper.cpp index be6d3b6194..bc1dc4cdd9 100644 --- a/src/unittest/multipath_mapper.cpp +++ b/src/unittest/multipath_mapper.cpp @@ -4,7 +4,9 @@ #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include +#include #include "../multipath_mapper.hpp" #include "../build_index.hpp" #include "xg.hpp" @@ -111,7 +113,7 @@ TEST_CASE( "MultipathMapper::read_coverage works", "[multipath][mapping][multipa } TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({ "node": [{"id": 1, "sequence": "GATTACA"}], "path": [ @@ -120,14 +122,10 @@ TEST_CASE( 
"MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ ]} ] })"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -135,17 +133,17 @@ TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ // Make pointers to fill in gcsa::GCSA* gcsaidx = nullptr; gcsa::LCPArray* lcpidx = nullptr; - + // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); - + xg_index.from_path_handle_graph(graph); + // Make a multipath mapper to map against the graph. TestMultipathMapper mapper(&xg_index, gcsaidx, lcpidx); - + // Make an Alignment that we're pretending we're doing Alignment aln; aln.set_sequence("GATTACA"); @@ -264,7 +262,7 @@ TEST_CASE( "MultipathMapper::query_cluster_graphs works", "[multipath][mapping][ } TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({ "node": [{"id": 1, "sequence": "GATTACA"}], "path": [ @@ -273,14 +271,10 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ ]} ] })"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -291,11 +285,11 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG 
xg_index; xg_index.from_path_handle_graph(graph); - + // Make a multipath mapper to map against the graph. MultipathMapper mapper(&xg_index, gcsaidx, lcpidx); // Lower the max mapping quality so that it thinks it can find unambiguous mappings of @@ -422,16 +416,12 @@ TEST_CASE( "MultipathMapper can map to a one-node graph", "[multipath][mapping][ } TEST_CASE( "MultipathMapper can work on a bigger graph", "[multipath][mapping][multipathmapper]" ) { - + string graph_json = R"({"node":[{"sequence":"CTTCTCATCCCTCCTCAAGGGCCTTTAACTACTCCACATCCAAAGCTACCCAGGCCATTTTAAGTTTCCTGTGGACTAAGGACAAAGGTGCGGGGAGATG","id":12},{"sequence":"A","id":2},{"sequence":"CAAATAAGGCTTGGAAATTTTCTGGAGTTCTATTATATTCCAACTCTCTGGTTCCTGGTGCTATGTGTAACTAGTAATGGTAATGGATATGTTGGGCTTT","id":3},{"sequence":"TTTCTTTGATTTATTTGAAGTGACGTTTGACAATCTATCACTAGGGGTAATGTGGGGAAATGGAAAGAATACAAGATTTGGAGCCAGACAAATCTGGGTT","id":4},{"sequence":"CAAATCCTCACTTTGCCACATATTAGCCATGTGACTTTGAACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGAGATGATACTA","id":5},{"sequence":"CTGACAGCAGAGGTTTGCTGTGAAGATTAAATTAGGTGATGCTTGTAAAGCTCAGGGAATAGTGCCTGGCATAGAGGAAAGCCTCTGACAACTGGTAGTT","id":6},{"sequence":"ACTGTTATTTACTATGAATCCTCACCTTCCTTGACTTCTTGAAACATTTGGCTATTGACCTCTTTCCTCCTTGAGGCTCTTCTGGCTTTTCATTGTCAAC","id":7},{"sequence":"ACAGTCAACGCTCAATACAAGGGACATTAGGATTGGCAGTAGCTCAGAGATCTCTCTGCTCACCGTGATCTTCAAGTTTGAAAATTGCATCTCAAATCTA","id":8},{"sequence":"AGACCCAGAGGGCTCACCCAGAGTCGAGGCTCAAGGACAGCTCTCCTTTGTGTCCAGAGTGTATACGATGTAACTCTGTTCGGGCACTGGTGAAAGATAA","id":9},{"sequence":"CAGAGGAAATGCCTGGCTTTTTATCAGAACATGTTTCCAAGCTTATCCCTTTTCCCAGCTCTCCTTGTCCCTCCCAAGATCTCTTCACTGGCCTCTTATC","id":10},{"sequence":"TTTACTGTTACCAAATCTTTCCAGAAGCTGCTCTTTCCCTCAATTGTTCATTTGTCTTCTTGTCCAGGAATGAACCACTGCTCTCTTCTTGTCAGATCAG","id":11}],"path":[{"name":"x","mapping":[{"position":{"node_id":3},"edit":[{"from_length":100,"to_length":100}],"rank":1},{"position":{"node_id":4},"edit":[{"from_length":100,"to_length":100}],"rank":2},{"position":{"node_id":5},"edit":[{"from_length":100,"to_leng
th":100}],"rank":3},{"position":{"node_id":6},"edit":[{"from_length":100,"to_length":100}],"rank":4},{"position":{"node_id":7},"edit":[{"from_length":100,"to_length":100}],"rank":5},{"position":{"node_id":8},"edit":[{"from_length":100,"to_length":100}],"rank":6},{"position":{"node_id":9},"edit":[{"from_length":100,"to_length":100}],"rank":7},{"position":{"node_id":10},"edit":[{"from_length":100,"to_length":100}],"rank":8},{"position":{"node_id":11},"edit":[{"from_length":100,"to_length":100}],"rank":9},{"position":{"node_id":12},"edit":[{"from_length":100,"to_length":100}],"rank":10},{"position":{"node_id":2},"edit":[{"from_length":1,"to_length":1}],"rank":11}]}],"edge":[{"from":12,"to":2},{"from":3,"to":4},{"from":4,"to":5},{"from":5,"to":6},{"from":6,"to":7},{"from":7,"to":8},{"from":8,"to":9},{"from":9,"to":10},{"from":10,"to":11},{"from":11,"to":12}]})"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); + + // Load the JSON into a HashGraph + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); // Make GCSA quiet gcsa::Verbosity::set(gcsa::Verbosity::SILENT); @@ -442,11 +432,11 @@ TEST_CASE( "MultipathMapper can work on a bigger graph", "[multipath][mapping][m // Build the GCSA index build_gcsa_lcp(graph, gcsaidx, lcpidx, 16, 3); - + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); - + xg_index.from_path_handle_graph(graph); + // Make a multipath mapper to map against the graph. 
TestMultipathMapper mapper(&xg_index, gcsaidx, lcpidx); // Lower the max mapping quality so that it thinks it can find unambiguous mappings of diff --git a/src/unittest/packed_structs.cpp b/src/unittest/packed_structs.cpp index 9c0075751e..512e638620 100644 --- a/src/unittest/packed_structs.cpp +++ b/src/unittest/packed_structs.cpp @@ -69,7 +69,7 @@ using namespace std; case APPEND: for (size_t k = 0; k < appends_per_op; k++) { std_vec.push_back(next_val); - dyn_vec.append(next_val); + dyn_vec.push_back(next_val); next_val++; } @@ -79,7 +79,7 @@ using namespace std; if (!std_vec.empty()) { for (size_t k = 0; k < pops_per_op; k++) { std_vec.pop_back(); - dyn_vec.pop(); + dyn_vec.pop_back(); } } @@ -161,7 +161,7 @@ using namespace std; case APPEND: for (size_t k = 0; k < appends_per_op; k++) { std_vec.push_back(next_val); - dyn_vec.append(next_val); + dyn_vec.push_back(next_val); next_val = val_distr(prng); } @@ -171,7 +171,7 @@ using namespace std; if (!std_vec.empty()) { for (size_t k = 0; k < pops_per_op; k++) { std_vec.pop_back(); - dyn_vec.pop(); + dyn_vec.pop_back(); } } @@ -252,7 +252,7 @@ using namespace std; case APPEND_LEFT: for (size_t k = 0; k < appends_per_op; k++) { std_deq.push_front(next_val); - suc_deq.append_front(next_val); + suc_deq.push_front(next_val); next_val++; } @@ -269,7 +269,7 @@ using namespace std; case APPEND_RIGHT: for (size_t k = 0; k < appends_per_op; k++) { std_deq.push_back(next_val); - suc_deq.append_back(next_val); + suc_deq.push_back(next_val); next_val++; } diff --git a/src/unittest/path_component_index.cpp b/src/unittest/path_component_index.cpp index 058f4bf9c1..edd3a6013a 100644 --- a/src/unittest/path_component_index.cpp +++ b/src/unittest/path_component_index.cpp @@ -8,7 +8,8 @@ #include "path_component_index.hpp" #include "xg.hpp" #include "vg.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include namespace vg { @@ -17,14 +18,14 @@ namespace unittest { TEST_CASE("Path component memoization 
produces expected results", "[pathcomponent]") { string graph_json = R"({"node": [{"sequence": "AAACCC", "id": 1}, {"sequence": "CACACA", "id": 2}, {"sequence": "CACACA", "id": 3}, {"sequence": "TTTTGG", "id": 4}, {"sequence": "ACGTAC", "id": 5}], "path": [{"name": "one", "mapping": [{"position": {"node_id": 1}, "rank": 1}, {"position": {"node_id": 2}, "rank": 2}]}, {"name": "three", "mapping": [{"position": {"node_id": 2}, "rank": 1}, {"position": {"node_id": 3}, "rank": 2}]}, {"name": "two", "mapping": [{"position": {"node_id": 4}, "rank": 1}, {"position": {"node_id": 5}, "rank": 2}]}], "edge": [{"from": 1, "to": 2}, {"from": 2, "to": 3}, {"from": 4, "to": 5}]})"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(graph); unordered_set comp_1; diff --git a/src/unittest/path_index.cpp b/src/unittest/path_index.cpp index b70152ae2d..e1facc2977 100644 --- a/src/unittest/path_index.cpp +++ b/src/unittest/path_index.cpp @@ -5,9 +5,9 @@ #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" #include "../path_index.hpp" +#include #include "catch.hpp" namespace vg { @@ -58,15 +58,11 @@ const string path_index_graph_1 = R"( TEST_CASE("PathIndex can be created", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -78,13 +74,9 @@ TEST_CASE("PathIndex can be created", "[pathindex]") { TEST_CASE("PathIndex translation can change a node ID", "[pathindex]") { // Load the graph - Graph graph; - json2pb(graph, 
path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -115,15 +107,11 @@ TEST_CASE("PathIndex translation can change a node ID", "[pathindex]") { } TEST_CASE("PathIndex translation can divide a node", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -174,15 +162,11 @@ TEST_CASE("PathIndex translation can divide a node", "[pathindex]") { } TEST_CASE("PathIndex translation can create reverse strand mappings", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -235,15 +219,11 @@ TEST_CASE("PathIndex translation can create reverse strand mappings", "[pathinde } TEST_CASE("PathIndex translation can handle translations articulated for the reverse strand", "[pathindex]") { - + // Load the graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); @@ -300,15 +280,11 @@ TEST_CASE("PathIndex translation can handle translations articulated for the rev } TEST_CASE("PathIndex translation can divide the last node", "[pathindex]") { - + // Load the 
graph - Graph graph; - json2pb(graph, path_index_graph_1.c_str(), path_index_graph_1.size()); - - // Make it into a VG - VG to_index; - to_index.extend(graph); - + bdsg::HashGraph to_index; + vg::io::json2graph(path_index_graph_1, &to_index); + // Make a PathIndex PathIndex index(to_index, "cool", true); diff --git a/src/unittest/phase_unfolder.cpp b/src/unittest/phase_unfolder.cpp index 0c79972941..36cfbca9de 100644 --- a/src/unittest/phase_unfolder.cpp +++ b/src/unittest/phase_unfolder.cpp @@ -12,7 +12,8 @@ #include #include "../phase_unfolder.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include "xg.hpp" #include "catch.hpp" @@ -210,10 +211,10 @@ const std::string unfolder_graph_path = R"( TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { // Build an XG index with a path. - Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build an empty GBWT index. gbwt::GBWT gbwt_index; @@ -224,9 +225,7 @@ TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -255,10 +254,10 @@ TEST_CASE("PhaseUnfolder can unfold XG paths", "[phaseunfolder][indexing]") { TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { // Build an XG index with a path. 
- Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build an empty GBWT index. gbwt::GBWT gbwt_index; @@ -269,9 +268,7 @@ TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -299,10 +296,10 @@ TEST_CASE("PhaseUnfolder can restore XG paths", "[phaseunfolder][indexing]") { TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") { // Build an XG index without a path. - Graph graph_without_path; - json2pb(graph_without_path, unfolder_graph.c_str(), unfolder_graph.size()); + bdsg::HashGraph graph_without_path; + vg::io::json2graph(unfolder_graph, &graph_without_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_without_path)); + xg_index.from_path_handle_graph(graph_without_path); // Build a GBWT with three threads including a duplicate. We want to have // only one instance of short_path unfolded, but we want separate copies @@ -335,9 +332,7 @@ TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. 
@@ -366,10 +361,10 @@ TEST_CASE("PhaseUnfolder can unfold GBWT threads", "[phaseunfolder][indexing]") TEST_CASE("PhaseUnfolder can unfold both XG paths and GBWT threads", "[phaseunfolder][indexing]") { // Build an XG index with a path. - Graph graph_with_path; - json2pb(graph_with_path, unfolder_graph_path.c_str(), unfolder_graph_path.size()); + bdsg::HashGraph graph_with_path; + vg::io::json2graph(unfolder_graph_path, &graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(graph_with_path)); + xg_index.from_path_handle_graph(graph_with_path); // Build a GBWT with three threads including a duplicate. We want to have // only one instance of short_path unfolded, but we want separate copies @@ -402,9 +397,7 @@ TEST_CASE("PhaseUnfolder can unfold both XG paths and GBWT threads", "[phaseunfo // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph.c_str(), unfolder_graph.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph, &vg_graph); // Remove branching regions from the VG graph, including the last node, // but keep the edge (1, 6) in the graph. @@ -501,10 +494,10 @@ const std::string unfolder_graph_simple_path = R"( TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolder][indexing]") { // Build an XG index. - Graph simple_graph; - json2pb(simple_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); + bdsg::HashGraph simple_graph; + vg::io::json2graph(unfolder_graph_simple, &simple_graph); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(simple_graph)); + xg_index.from_path_handle_graph(simple_graph); // Build a GBWT with both possible threads. gbwt::vector_type upper_path { @@ -536,9 +529,7 @@ TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolde // Build a VG graph. 
VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph_simple, &vg_graph); // Remove the bubble, including its endpoints. std::set to_remove { 3, 4, 5, 6 }; @@ -566,10 +557,10 @@ TEST_CASE("PhaseUnfolder can merge shared prefixes and suffixes", "[phaseunfolde TEST_CASE("PhaseUnfolder can extend short threads", "[phaseunfolder][indexing]") { // Build an XG index. - Graph simple_graph_with_path; - json2pb(simple_graph_with_path, unfolder_graph_simple_path.c_str(), unfolder_graph_simple_path.size()); + bdsg::HashGraph simple_graph_with_path; + vg::io::json2graph(unfolder_graph_simple_path, &simple_graph_with_path); xg::XG xg_index; - xg_index.from_path_handle_graph(VG(simple_graph_with_path)); + xg_index.from_path_handle_graph(simple_graph_with_path); // Build a GBWT for the fragment that is different from the reference. gbwt::vector_type short_fragment { @@ -586,9 +577,7 @@ TEST_CASE("PhaseUnfolder can extend short threads", "[phaseunfolder][indexing]") // Build a VG graph. VG vg_graph; - Graph temp_graph; - json2pb(temp_graph, unfolder_graph_simple.c_str(), unfolder_graph_simple.size()); - vg_graph.merge(temp_graph); + vg::io::json2graph(unfolder_graph_simple, &vg_graph); // Remove the bubble, including its endpoints. std::set to_remove { 3, 4, 5, 6 }; diff --git a/src/unittest/randomly_flipped_nodes.cpp b/src/unittest/randomly_flipped_nodes.cpp new file mode 100644 index 0000000000..455bdd18ae --- /dev/null +++ b/src/unittest/randomly_flipped_nodes.cpp @@ -0,0 +1,179 @@ +#include "catch.hpp" +#include "../handle.hpp" +#include "../utility.hpp" +#include + +#include "support/randomly_flipped_nodes.hpp" +#include "support/randomness.hpp" +#include "support/random_graph.hpp" + +#include +#include + +namespace vg { +namespace unittest { + +/// Get the canonicalized set of edge sequence pairs from a graph. 
+/// Each edge is represented as a pair of sequences (left_seq, right_seq) read +/// in the orientation of the edge. To canonicalize, we compare each pair +/// against its reverse complement (RC(right_seq), RC(left_seq)) and keep the +/// lexicographically smaller one. +/// +/// This doesn't fully constrain the graph, but if this doesn't match what it's +/// supposed to, it can tell us that the graph smells off and is wrong. +static set> canonical_edge_pairs(const HandleGraph& graph) { + set> result; + graph.for_each_edge([&](const edge_t& edge) { + string left_seq = graph.get_sequence(edge.first); + string right_seq = graph.get_sequence(edge.second); + + // The reverse complement pair: RC(right) on the left, RC(left) on the right + string rc_right = reverse_complement(right_seq); + string rc_left = reverse_complement(left_seq); + + pair forward_pair = {left_seq, right_seq}; + pair rc_pair = {rc_right, rc_left}; + + // Use the lexicographically smaller one as canonical + if (rc_pair < forward_pair) { + result.insert(rc_pair); + } else { + result.insert(forward_pair); + } + return true; + }); + return result; +} + +/// Make sure that observed and expected graphs are not obviously not +/// isomorphic. 
+static void validate_graph(const HandleGraph& observed, const HandleGraph& expected, const set>& expected_edges) { + REQUIRE(observed.get_node_count() == expected.get_node_count()); + REQUIRE(observed.get_edge_count() == expected.get_edge_count()); + + auto observed_edges = canonical_edge_pairs(observed); + REQUIRE(observed_edges == expected_edges); +} + +TEST_CASE("randomly_flipped_nodes preserves graph structure on a simple linear graph", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + std::string stick_sequence = "GGACTGACTCGCATGTCGAGCGACTCGCGCGAGCTATCGTAGTACGCGAGTCATATTATATTATCACG"; + size_t node_length = 3; + handle_t prev_handle; + for (size_t i = 0; i < stick_sequence.size(); i += node_length) { + handle_t h = graph.create_handle(stick_sequence.substr(i, node_length)); + if (i > 0) { + graph.create_edge(prev_handle, h); + } + prev_handle = h; + } + + auto original_edges = canonical_edge_pairs(graph); + + SECTION("flipping no nodes preserves edges exactly") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.0, gen); + validate_graph(flipped, graph, original_edges); + } + + SECTION("flipping all nodes preserves canonical edge pairs") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 1.0, gen); + validate_graph(flipped, graph, original_edges); + } + + SECTION("flipping 50% of nodes preserves canonical edge pairs") { + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on graph with reversing edges", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("GATT", 1); + handle_t h2 = graph.create_handle("ACA", 2); + handle_t h3 = graph.create_handle("CGAT", 3); + handle_t h4 = graph.create_handle("TCGAA", 4); + + // Forward edges + graph.create_edge(h1, h2); 
+ graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + // Reversing edge: 4 fwd -> 3 rev + graph.create_edge(h4, graph.flip(h3)); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 10; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on graph with self-loops", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("ACGT", 1); + handle_t h2 = graph.create_handle("TTCC", 2); + + graph.create_edge(h1, h2); + // Self-loop on h1: fwd -> fwd + graph.create_edge(h1, h1); + // Inverting self-loop on h2: fwd -> rev + graph.create_edge(h2, graph.flip(h2)); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 10; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } +} + +TEST_CASE("randomly_flipped_nodes preserves structure on random graphs", "[randomly_flipped_nodes]") { + for (int trial = 0; trial < 5; trial++) { + bdsg::HashGraph graph; + random_graph(100, 10, 10, &graph); + + auto original_edges = canonical_edge_pairs(graph); + + default_random_engine gen(test_seed_source()); + for (int i = 0; i < 5; i++) { + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + validate_graph(flipped, graph, original_edges); + } + } +} + +TEST_CASE("randomly_flipped_nodes preserves node IDs", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + graph.create_handle("AAA", 5); + graph.create_handle("CCC", 10); + graph.create_handle("GGG", 15); + graph.create_edge(graph.get_handle(5), graph.get_handle(10)); + graph.create_edge(graph.get_handle(10), graph.get_handle(15)); + + default_random_engine gen(test_seed_source()); + auto flipped = randomly_flipped_nodes(graph, 0.5, gen); + + 
REQUIRE(flipped.has_node(5)); + REQUIRE(flipped.has_node(10)); + REQUIRE(flipped.has_node(15)); +} + +TEST_CASE("randomly_flipped_nodes actually flips node sequences", "[randomly_flipped_nodes]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("AAAC", 1); // RC = GTTT + + default_random_engine gen(test_seed_source()); + // Guarantee a flip + auto flipped = randomly_flipped_nodes(graph, 1.0, gen); + + // The forward sequence should be the RC of the original + REQUIRE(flipped.get_sequence(flipped.get_handle(1)) == "GTTT"); +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/readfilter.cpp b/src/unittest/readfilter.cpp index cc1562f3f3..6d84fa0a38 100644 --- a/src/unittest/readfilter.cpp +++ b/src/unittest/readfilter.cpp @@ -5,6 +5,9 @@ #include "catch.hpp" #include "readfilter.hpp" #include "xg.hpp" +#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include namespace vg { namespace unittest { @@ -44,13 +47,13 @@ TEST_CASE("reads with ambiguous ends can be trimmed", "[filter]") { )"; - // Load it into Protobuf - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - + // Load the graph + bdsg::HashGraph chunk; + vg::io::json2graph(graph_json, &chunk); + // Pass it over to XG xg::XG index; - index.from_path_handle_graph(VG(chunk)); + index.from_path_handle_graph(chunk); // Make a ReadFilter; ReadFilter filter; diff --git a/src/unittest/sampler.cpp b/src/unittest/sampler.cpp index d8bb95b650..cda0147f57 100644 --- a/src/unittest/sampler.cpp +++ b/src/unittest/sampler.cpp @@ -6,11 +6,10 @@ #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" +#include #include "../sampler.hpp" #include "../xg.hpp" -#include "../vg.hpp" #include "catch.hpp" namespace vg { @@ -28,13 +27,9 @@ TEST_CASE( "Sampler can sample from a 1-node graph", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG 
- VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -118,13 +113,9 @@ TEST_CASE( "position_at works", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -195,13 +186,9 @@ TEST_CASE( "Sampler can sample from a loop-containing path", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); @@ -259,13 +246,9 @@ TEST_CASE( "Sampler can across reversing edges", "[sampler]" ) { })"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG - VG graph; - graph.extend(proto_graph); - + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + // Build the xg index xg::XG xg_index; xg_index.from_path_handle_graph(graph); diff --git a/src/unittest/snarl_decomposition_fuzzer.cpp b/src/unittest/snarl_decomposition_fuzzer.cpp new file mode 100644 index 0000000000..38742be20c --- /dev/null +++ b/src/unittest/snarl_decomposition_fuzzer.cpp @@ -0,0 +1,339 @@ +#include "catch.hpp" +#include "../handle.hpp" +#include + +#include "support/snarl_decomposition_fuzzer.hpp" + +#include +#include + +namespace vg { +namespace unittest { + +using ET = DecompositionEventType; +using Event = DecompositionEvent; + +TEST_CASE("ReplaySnarlFinder replays events faithfully", "[snarl_decomposition_fuzzer]") { + // Build a 
small graph to get real handles + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder finder(&graph, events); + std::vector captured = capture_events(finder, graph); + + REQUIRE(captured == events); +} + +TEST_CASE("SnarlDecompositionFuzzer passes through when nothing is flipped", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + // No chains to flip + SnarlDecompositionFuzzer fuzzer(&graph, &replay, {}); + + std::vector captured = capture_events(fuzzer, graph); + + REQUIRE(captured == events); +} + +TEST_CASE("SnarlDecompositionFuzzer flips an outer chain", "[snarl_decomposition_fuzzer]") { + // Graph: + // Chain: 1fwd -> snarl(1fwd, 4fwd) -> snarl(4fwd, 5fwd) -> 5fwd + // Inside first snarl: chain 2rev->3rev + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + graph.create_handle("AA", 5); + + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 4, false}, + 
{ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip outer chain only") { + // Flip the outer chain (1fwd -> 5fwd) + std::unordered_set flips {1, 5}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Expected after flipping the outer chain: + // Flipping a chain reverses everything inside it, including children. + // The nested chain 2rev->3rev gets reversed to 3fwd->2fwd as + // part of the parent flip. + std::vector expected = { + {ET::BEGIN_CHAIN, 5, true}, + {ET::BEGIN_SNARL, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::END_CHAIN, 2, false}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip outer and nested chain") { + // Flip outer chain (1fwd->5fwd) AND nested chain (2rev->3rev) + std::unordered_set flips {1, 5, 2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Expected: outer chain flipped (reversing everything, including + // the nested chain to 3fwd->2fwd), AND THEN the nested chain is + // flipped again back to its original orientation 2rev->3rev. 
+ std::vector expected = { + {ET::BEGIN_CHAIN, 5, true}, + {ET::BEGIN_SNARL, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 2, true}, + {ET::END_CHAIN, 3, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip nested chain only") { + // Flip only the nested chain (2rev->3rev), outer stays + std::unordered_set flips {2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Outer chain not flipped, nested chain flipped + std::vector expected = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::END_CHAIN, 2, false}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::END_SNARL, 5, false}, + {ET::END_CHAIN, 5, false}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles empty chain", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("ACGT", 1); + + // An empty chain: begin and end with same handle, no snarls inside + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::END_CHAIN, 1, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flipping an empty chain") { + std::unordered_set flips {1}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles multiple top-level chains", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + graph.create_handle("A", 1); + graph.create_handle("C", 2); + graph.create_handle("G", 3); + graph.create_handle("T", 4); + + // Two top-level chains in the root snarl + std::vector events = { + // Chain 1: 1fwd -> snarl -> 2fwd + {ET::BEGIN_CHAIN, 1, 
false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::END_SNARL, 2, false}, + {ET::END_CHAIN, 2, false}, + // Chain 2: 3fwd -> snarl -> 4fwd + {ET::BEGIN_CHAIN, 3, false}, + {ET::BEGIN_SNARL, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::END_CHAIN, 4, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip only first chain") { + std::unordered_set flips {1, 2}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 2, true}, + {ET::BEGIN_SNARL, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + {ET::BEGIN_CHAIN, 3, false}, + {ET::BEGIN_SNARL, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::END_CHAIN, 4, false}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip both chains") { + std::unordered_set flips {1, 2, 3, 4}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + std::vector expected = { + {ET::BEGIN_CHAIN, 2, true}, + {ET::BEGIN_SNARL, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + {ET::BEGIN_CHAIN, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::END_SNARL, 3, true}, + {ET::END_CHAIN, 3, true}, + }; + + REQUIRE(captured == expected); + } +} + +TEST_CASE("SnarlDecompositionFuzzer handles deeply nested chains", "[snarl_decomposition_fuzzer]") { + bdsg::HashGraph graph; + for (nid_t i = 1; i <= 8; i++) { + graph.create_handle("A", i); + } + + // Outer chain: 1->6 + // Snarl(1,4) + // Inner chain: 2->3 + // Snarl(2,3) [leaf snarl, no children] + // Snarl(4,6) + // Inner chain: 5->5 [empty/trivial] + std::vector events = { + {ET::BEGIN_CHAIN, 1, false}, + {ET::BEGIN_SNARL, 1, false}, + {ET::BEGIN_CHAIN, 2, false}, + {ET::BEGIN_SNARL, 2, false}, + {ET::END_SNARL, 3, false}, + {ET::END_CHAIN, 3, false}, + {ET::END_SNARL, 4, false}, + {ET::BEGIN_SNARL, 4, false}, + {ET::BEGIN_CHAIN, 5, false}, + {ET::END_CHAIN, 5, false}, + 
{ET::END_SNARL, 6, false}, + {ET::END_CHAIN, 6, false}, + }; + + ReplaySnarlFinder replay(&graph, events); + + SECTION("flip outer chain only") { + std::unordered_set flips {1, 6}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Inner chain and its snarls should flip too. + std::vector expected = { + {ET::BEGIN_CHAIN, 6, true}, + {ET::BEGIN_SNARL, 6, true}, + {ET::BEGIN_CHAIN, 5, true}, + {ET::END_CHAIN, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 3, true}, + {ET::BEGIN_SNARL, 3, true}, + {ET::END_SNARL, 2, true}, + {ET::END_CHAIN, 2, true}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } + + SECTION("flip outer and inner chain") { + std::unordered_set flips {1, 6, 2, 3}; + SnarlDecompositionFuzzer fuzzer(&graph, &replay, flips); + + std::vector captured = capture_events(fuzzer, graph); + + // Outer chain should flip but inner chain should flip back + std::vector expected = { + {ET::BEGIN_CHAIN, 6, true}, + {ET::BEGIN_SNARL, 6, true}, + {ET::BEGIN_CHAIN, 5, true}, + {ET::END_CHAIN, 5, true}, + {ET::END_SNARL, 4, true}, + {ET::BEGIN_SNARL, 4, true}, + {ET::BEGIN_CHAIN, 2, false}, + {ET::BEGIN_SNARL, 2, false}, + {ET::END_SNARL, 3, false}, + {ET::END_CHAIN, 3, false}, + {ET::END_SNARL, 1, true}, + {ET::END_CHAIN, 1, true}, + }; + + REQUIRE(captured == expected); + } +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 36a1b9b74e..99f0903ac6 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -9,23 +9,34 @@ #include #include #include -#include "vg/io/json2pb.h" -#include +#include "../io/json2graph.hpp" +#include #include "catch.hpp" #include "support/random_graph.hpp" #include "support/randomness.hpp" +#include "support/randomly_flipped_nodes.hpp" +#include 
"support/snarl_decomposition_fuzzer.hpp" #include "../snarl_distance_index.hpp" #include "../integrated_snarl_finder.hpp" #include "../genotypekit.hpp" #include "../traversal_finder.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include "xg.hpp" +#include +#include //#define debug namespace vg { namespace unittest { + + // TODO: Having *any* operator<< overloads in vg::unittest seems to hide + // the ones that are just in vg, somehow. + using vg::operator<<; + + static pair, unordered_set > pb_contents( VG& graph, const pair, unordered_set >& contents) { pair, unordered_set > ret; @@ -192,7 +203,82 @@ namespace vg { REQUIRE(distance_index.minimum_distance(2, true, 0, 2, true, 1) == 1); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Can distance index nested chain without loop", "[snarl_distance]" ) { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("G"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("T"); + handle_t h4 = graph.create_handle("T"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("C"); + handle_t h7 = graph.create_handle("A"); + + // Wire it up as a stick + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + + // Allow skipping a run of nodes to make a snarl with a child chain + graph.create_edge(h2, h5); + + IntegratedSnarlFinder snarl_finder(graph); + + SECTION("Snarl classifications are correct") { + SECTION("Distance index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, 
&snarl_finder, 0); + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + } + } + TEST_CASE( "Can distance index nested chain with a loop hiding in the middle", "[snarl_distance]" ) { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("G"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("T"); + handle_t h4 = graph.create_handle("T"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("C"); + handle_t h7 = graph.create_handle("A"); + + // Wire it up as a stick + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + graph.create_edge(h6, h7); + + // Allow skipping a run of nodes to make a snarl with a child chain that has a few nodes in it + graph.create_edge(h1, h6); + + // Allow turning around with an edge hiding somewhere in the middle of the chain + graph.create_edge(h3, graph.flip(h3)); + + IntegratedSnarlFinder snarl_finder(graph); + + SECTION("Snarl classifications are correct") { + SECTION("Distance index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(graph.get_id(h3)))))); + } + } + } + TEST_CASE( "Can distance index nested chain with a loop", "[snarl_distance]" ) { VG graph; @@ -230,7 +316,8 @@ namespace vg { Edge* e17 = graph.create_edge(n11, n12); Edge* e18 = graph.create_edge(n12, n13); - 
graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); + //get the snarls IntegratedSnarlFinder snarl_finder(graph); SECTION("Traversal of chain") { @@ -248,16 +335,13 @@ namespace vg { fill_in_distance_index(&distance_index, &graph, &snarl_finder); REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))))); REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))))); - REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), true)); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), false)); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))))); } SECTION("Distanceless index") { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))), true, &graph)); - REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))), true, &graph)); - REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), true, &graph)); - // TODO: This isn't true because it would be too much work to recursively check all children using only the graph - //REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))), false, &graph)); + 
REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n3->id()))))); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n8->id()))))); + REQUIRE(!distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n6->id()))))); } } SECTION("Minimum distances are correct") { @@ -3577,12 +3661,9 @@ namespace vg { // } // )"; // - // VG graph; - // // // Load up the graph - // Graph g; - // json2pb(g, graph_json.c_str(), graph_json.size()); - // graph.extend(g); + // VG graph; + // vg::io::json2graph(graph_json, &graph); // // // Define the one snarl // Snarl snarl1; @@ -3709,12 +3790,9 @@ namespace vg { // string snarl2_json = R"({"type": 1, "end": {"node_id": 187209, "backward": true}, "start": {"node_id": 178895, "backward": true}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; // string snarl3_json = R"({"type": 1, "end": {"node_id": 178896}, "start": {"node_id": 178895}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; // - // VG graph; - // // // Load up the graph - // Graph g; - // json2pb(g, graph_json.c_str(), graph_json.size()); - // graph.extend(g); + // VG graph; + // vg::io::json2graph(graph_json, &graph); // // // Load the snarls // Snarl snarl1, snarl2, snarl3; @@ -3885,9 +3963,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); // We need to see the path. 
REQUIRE(graph.paths.size() == 1); @@ -4145,9 +4221,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4258,9 +4332,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4407,9 +4479,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4536,9 +4606,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4645,9 +4713,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4749,9 +4815,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -4919,9 +4983,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder 
snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -5042,9 +5104,7 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); IntegratedSnarlFinder snarl_finder(graph); SnarlDistanceIndex distance_index; @@ -6624,6 +6684,25 @@ namespace vg { } } + TEST_CASE( "Tiny oversized snarl", "[snarl_distance]" ) { + VG graph; + handle_t n1 = graph.create_handle("GCA"); + handle_t n2 = graph.create_handle("T"); + handle_t n3 = graph.create_handle("G"); + handle_t n4 = graph.create_handle("CTGA"); + + graph.create_edge(n1, n2); + graph.create_edge(n1, n3); + graph.create_edge(n2, n3); + graph.create_edge(n2, n4); + graph.create_edge(n3, n4); + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 1); + + REQUIRE(distance_index.minimum_distance(2, false, 0, 3, false, 0, false, &graph) == 1); + } + TEST_CASE( "Oversized snarl","[snarl_distance]" ) { VG graph; @@ -7372,6 +7451,9 @@ namespace vg { } + // TODO: This test case doesn't do anything (runs 0 iterations). + // When I tell it to actually run iterations, it fails. + // Has it ever worked? 
TEST_CASE("random test subgraph", "[snarl_distance][snarl_distance_subgraph]") { int64_t min = 20; int64_t max = 50; @@ -7480,7 +7562,7 @@ namespace vg { << distance_index.minimum_distance(nodeID1, false, 0, node_id, true, 0) << " (" << dist_start_fd << " " << dist_end_fd << " " << dist_start_bk << " " << dist_end_bk << ") " << " is in the subgraph but shouldn't be " << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); } REQUIRE((start_forward || end_forward || in_forward || start_backward || end_backward || in_backward)); } else { @@ -7491,7 +7573,7 @@ namespace vg { << distance_index.minimum_distance(nodeID1, false, 0,node_id, true, 0) << " (" << dist_start_fd << " " << dist_end_fd << " " << dist_start_bk << " " << dist_end_bk << ") " << " is not in the subgraph but should be " << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(!(start_forward || end_forward || in_forward || start_backward || end_backward || in_backward)); } } @@ -7556,31 +7638,49 @@ namespace vg { */ TEST_CASE( "Distance index can traverse all the snarls in random graphs", - "[snarl_distance_random]" ) { + "[snarl_distance][snarl_distance_random]" ) { // Each actual graph takes a fairly long time to do so we randomize sizes... 
- default_random_engine generator(test_seed_source()); + std::default_random_engine generator(test_seed_source()); for (size_t repeat = 0; repeat < 1000; repeat++) { - uniform_int_distribution bases_dist(100, 1000); + std::uniform_int_distribution bases_dist(100, 1000); size_t bases = bases_dist(generator); - uniform_int_distribution variant_bases_dist(1, bases/20); + std::uniform_int_distribution variant_bases_dist(1, bases/20); size_t variant_bases = variant_bases_dist(generator); - uniform_int_distribution variant_count_dist(1, bases/30); + std::uniform_int_distribution variant_count_dist(1, bases/30); size_t variant_count = variant_count_dist(generator); + + std::uniform_real_distribution flip_dist(0.0, 1.0); + double node_flip_fraction = flip_dist(generator); + double chain_flip_fraction = flip_dist(generator); - uniform_int_distribution snarl_size_limit_dist(500, 1000); + std::uniform_int_distribution snarl_size_limit_dist(2, 1000); size_t size_limit = snarl_size_limit_dist(generator); - + #ifdef debug - cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events" << endl; + cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events with " << node_flip_fraction << " nodes flipped and " << chain_flip_fraction << " of chains flipped, with size limit " << size_limit << endl; #endif - - VG graph; - random_graph(bases, variant_bases, variant_count, &graph); - IntegratedSnarlFinder finder(graph); + + // Generate a base graph + VG base_graph; + random_graph(bases, variant_bases, variant_count, &base_graph); + + // Flip some fraction of the nodes to their local reverse orientation + bdsg::HashGraph graph = randomly_flipped_nodes(base_graph, node_flip_fraction, generator); + + // Find snarls + IntegratedSnarlFinder base_finder(graph); + + // Flip some fraction of the chains to their opposite orientation. 
+ // Note that we can't flip the snarls because the snarl decomposition + // requires snarls to be articulated as forward along their + // chains. + SnarlDecompositionFuzzer finder(&graph, &base_finder, chain_flip_fraction, generator); + + // Build the index SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &finder, size_limit); @@ -7640,7 +7740,7 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } if (max_distance < snarl_distance){ @@ -7648,11 +7748,10 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? "rev" : "fd") << offset2 << endl; cerr << "minimum: " << snarl_distance << " maximum: " << max_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); - graph.serialize_to_file("test_graph.vg"); if (!traceback.first.empty() && ! traceback.second.empty()) { size_t traceback_distance = 0; for (auto x : traceback.first){ @@ -7699,7 +7798,7 @@ namespace vg { cerr << node_id1 << " " << (rev1 ? "rev" : "fd") << offset1 << " -> " << node_id2 << (rev2 ? 
"rev" : "fd") << offset2 << endl; cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; cerr << "serializing graph to test_graph.vg" << endl; - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); REQUIRE(false); } REQUIRE((snarl_distance >= dijkstra_distance || snarl_distance == std::numeric_limits::max())); @@ -7789,8 +7888,372 @@ namespace vg { // return true; // }); //} + + TEST_CASE( "Distance index can query a troublesome oversized snarl", + "[snarl_distance]" ) { + + std::string graph_json = R"({ + "node": [ + {"id": "19","sequence": "A"}, + {"id": "20","sequence": "A"}, + {"id": "21","sequence": "A"}, + {"id": "22","sequence": "A"}, + {"id": "23","sequence": "A"} + ], "edge": [ + {"from": "19","to": "20"}, + {"from": "19","to": "22"}, + {"from": "20","to": "21"}, + {"from": "20","to": "23"}, + {"from": "21","to": "22"}, + {"from": "22","to": "23"} + ] + })"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 19; bool rev1 = false ; size_t offset1 = 0; + id_t node_id2 = 23; bool rev2 = false ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t dijkstra_distance = std::numeric_limits::max(); + handlegraph::algorithms::dijkstra(&graph, handle1, [&](const handle_t& reached, size_t distance) { + if (reached == handle2) { + dijkstra_distance = distance; + dijkstra_distance += graph.get_length(graph.get_handle(node_id1)) - offset1; + dijkstra_distance += offset2; + return false; + } + return true; + } + , false); + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == dijkstra_distance); + } + + TEST_CASE( "Distance index can query out of 
a SNP with a reversing allele as an oversided snarl", + "[snarl_distance]" ) { + + // This is a snarl from 1 to 2, where 4 and 5 are a SNP, and 3 + // lets you double back to the start + std::string graph_json = R"({ + "node": [ + {"id": "1","sequence": "AAAAA"}, + {"id": "2","sequence": "AAAAA"}, + {"id": "3","sequence": "A"}, + {"id": "4","sequence": "A"}, + {"id": "5","sequence": "A"} + ], "edge": [ + {"from": "1","to": "3"}, + {"from": "1","to": "4"}, + {"from": "1","to": "5"}, + {"from": "3","to": "1", "to_end": true}, + {"from": "4","to": "2"}, + {"from": "5","to": "2"} + ] + })"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + // We want to be able to get out of the snarl from node 4, which we definitely can. + id_t node_id1 = 4; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 2; bool rev2 = false ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t true_distance = 0; + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + + // And out of the snarl to the left from 3 reverse to 1 reverse should also be 0 + node_id1 = 3; rev1 = true; offset1 = 1; + node_id2 = 1; rev2 = true; offset2 = 0; + true_distance = 0; + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + + } + + TEST_CASE( "Distance index can query within a fiddly snarl", + "[snarl_distance]" ) { + + std::string graph_json = R"({"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "3", "to_end": true}, {"from": "1", "to": "4"}, {"from": "1", "to": "5"}, {"from": "4", "to": "5", "to_end": true}, {"from": "2", "from_start": true, "to": "4", "to_end": true}],
"node": [{"id": "5", "sequence": "A"}, {"id": "1", "sequence": "AAAAA"}, {"id": "4", "sequence": "A"}, {"id": "2", "sequence": "AAAAA"}, {"id": "3", "sequence": "A"}]})"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 4; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 5; bool rev2 = true ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t true_distance = 0; + + REQUIRE(distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph) == true_distance); + } + + TEST_CASE( "Distance index can query into a child snarl in reverse", + "[snarl_distance]" ) { + + std::string graph_json = R"({"node":[{"id":"79","sequence":"A"},{"id":"16","sequence":"A"},{"id":"60","sequence":"A"},{"id":"37","sequence":"A"},{"id":"40","sequence":"A"},{"id":"53","sequence":"A"},{"id":"59","sequence":"A"},{"id":"63","sequence":"A"},{"id":"18","sequence":"A"},{"id":"38","sequence":"A"},{"id":"62","sequence":"A"}],"edge":[{"from":"16","to":"53"},{"from":"16","from_start":true,"to":"79","to_end":true},{"from":"60","to":"62"},{"from":"60","from_start":true,"to":"79","to_end":true},{"from":"37","from_start":true,"to":"63","to_end":true},{"from":"37","from_start":true,"to":"40"},{"from":"53","to":"60"},{"from":"59","to":"63"},{"from":"59","from_start":true,"to":"60","to_end":true},{"from":"18","to":"53"},{"from":"18","to":"38"},{"from":"18","from_start":true,"to":"79","to_end":true},{"from":"18","from_start":true,"to":"37","to_end":true},{"from":"38","to":"63","to_end":true},{"from":"38","to":"40"},{"from":"62","to":"63"}]})"; + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); + + IntegratedSnarlFinder snarl_finder(graph); + 
SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 2); + + id_t node_id1 = 16; bool rev1 = false ; size_t offset1 = 1; + id_t node_id2 = 62; bool rev2 = true ; size_t offset2 = 0; + handle_t handle1 = graph.get_handle(node_id1, rev1); + handle_t handle2 = graph.get_handle(node_id2, rev2); + + //Find actual distance + size_t dijkstra_distance = std::numeric_limits::max(); + handlegraph::algorithms::dijkstra(&graph, handle1, [&](const handle_t& reached, size_t distance) { + if (reached == handle2) { + dijkstra_distance = distance; + dijkstra_distance += graph.get_length(graph.get_handle(node_id1)) - offset1; + dijkstra_distance += offset2; + return false; + } + return true; + } + , false); + + size_t index_distance = distance_index.minimum_distance(node_id1, rev1, offset1, node_id2, rev2, offset2, false, &graph); + + REQUIRE(index_distance == dijkstra_distance); + } + + + TEST_CASE( "Distance index can query all possible 3-node-with-legs snarls", + "[snarl_distance]" ) { + + // We're going to generate all possible snarls you can get by + // starting with the boundary nodes, taking up to 3 nodes and + // connecting them, one nodeside at a time, onto the existing + // nodes. + // + // Combinatorics says this is a manageable number; each nodeside + // picks from one of the previous nodesides and attaches to it. + + /// Call the callback with each possible combination of choices of + /// previous items. + /// + /// start_size is the number of items present before we start + /// making choices; the first entry can choose from start_size + /// items. + /// + /// end_size is the total number of items to think about, including + /// those in start_size. + /// + /// Calls the callback with all possible vectors of length + /// (end_size - start_size) matching these constraints. 
+ auto for_all_choice_combinations = [](size_t start_size, size_t end_size, const std::function&)>& callback) { + + std::vector choices(end_size - start_size, 0); + while (true) { +#ifdef debug + std::cerr << "Consider combination:"; + for (auto& item : choices) { + std::cerr << " " << item; + } + std::cerr << std::endl; +#endif + callback(choices); + choices.back()++; + for (size_t i = end_size - 1; i >= start_size; i--) { + if (choices.at(i - start_size) >= i) { + // We've reached the point where we want to pick from a + // choice not available at this point. + // At i=2 we can choose between 0 and 1, so we carry at i. + if (i == start_size) { + // We've counted all possibilities + return; + } else { + // Carry and reset to 0. + choices.at(i - start_size - 1)++; + choices.at(i - start_size) = 0; + } + } else { + // No more carrying to do + break; + } + } + } + }; + + // How big should a snarl be allowed to be before being oversized? + size_t size_limit = 2; + // How many content nodes should be inside the snarl? + const size_t MAX_NODES = 3; + // How many node sides do we need to worry about, including the boundary sentinels? + size_t max_node_sides = MAX_NODES * 2 + 2; + for_all_choice_combinations(2, max_node_sides, [&](const std::vector& choices) { + // Build the choices into a graph. 
+ + bdsg::HashGraph graph; + // Make the bounding nodes heavy so they are likely to root the snarl + handle_t start_node = graph.create_handle("AAAAA"); + handle_t end_node = graph.create_handle("AAAAA"); + + std::vector connect_to; + connect_to.reserve(max_node_sides); + // Choice 0 is start node, arriving reading out + connect_to.push_back(graph.flip(start_node)); + // Choice 1 is end node reading out + connect_to.push_back(end_node); + + for (size_t i = 0; i < choices.size(); i += 2) { + // Make a node + handle_t new_node = graph.create_handle("A"); + // Make sure to remember it so it can choose itself + connect_to.push_back(new_node); + connect_to.push_back(graph.flip(new_node)); + // Connect its left and right to each pair of choices. + graph.create_edge(graph.flip(new_node), connect_to.at(choices.at(i))); + graph.create_edge(new_node, connect_to.at(choices.at(i + 1))); + } + + // TODO: It might be more efficient to un-build the things that + // change between graphs instead of rebuilding from scratch for + // every case. + + // Skip graphs where the choices mean the graph isn't actually + // connected, because then it can't be recognized as a snarl + // probably. + std::vector> components = handlegraph::algorithms::weakly_connected_components(&graph); + if (components.size() > 1) { + return; + } + + // Now index the graph for query + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &finder, size_limit); + + // Compute the truth all-to-all distances, between outgoing + // side of first handle and incoming side of second. + // Both handles are oriented along the connecting path. + // TODO: We compute/store both triangles of the matrix; can we avoid one somehow? 
+ std::unordered_map> dijkstra_distances; + graph.for_each_handle([&](const handle_t& base) { + for (const handle_t& here : {base, graph.flip(base)}) { + if (here == graph.flip(start_node) || here == end_node) { + // Skip only this traversal looking out of the snarl; the other orientation of the node still needs handling + continue; + } + dijkstra_distances.emplace(here, handlegraph::algorithms::find_shortest_paths(&graph, here)); + } + }); + + // The Dijkstra traversal always sees a handle to itself at + // distance 0. We need to get the real back-to-self distance, + // if any, and fill that in. + graph.for_each_handle([&](const handle_t& base) { + for (const handle_t& here : {base, graph.flip(base)}) { + if (here == graph.flip(start_node) || here == end_node) { + // Skip only this traversal looking out of the snarl; must match the set of handles stored above + continue; + } + + // The place we need to arrive at is ourselves, since + // both start and end are oriented along the connecting + // path here. + + size_t loop_distance = std::numeric_limits::max(); + // See if we can get back here from any of the places we can get + graph.follow_edges(here, false, [&](const handle_t next) { + if (next == here) { + // We found a real self loop + loop_distance = 0; + return false; + } + auto found_index = dijkstra_distances.find(next); + if (found_index == dijkstra_distances.end()) { + // This destination can't get anywhere. + // This should be impossible since the Dijkstra always will point a node at itself. + return true; + } + auto found_distance = found_index->second.find(here); + if (found_distance == found_index->second.end()) { + // This destination can't get back to us + return true; + } + // If we find a way back, min in its distance. + loop_distance = std::min(loop_distance, graph.get_length(next) + found_distance->second); + return true; + }); + +#ifdef debug + std::cerr << "Real self loop distance for " << graph.get_id(here) << (graph.get_is_reverse(here) ? "rev" : "fd") << " -> " << graph.get_id(here) << (graph.get_is_reverse(here) ?
"rev" : "fd") << " is " << loop_distance << std::endl; +#endif + + if (loop_distance == std::numeric_limits::max()) { + // There's really no way back from this node to itself in the same orientation. Delete the entry the Dijkstra search adds. + dijkstra_distances.at(here).erase(here); + } else { + // There is a way back; store the value. + dijkstra_distances.at(here)[here] = loop_distance; + } + }; + }); + +#ifdef debug + for (auto& [start_handle, distances] : dijkstra_distances) { + for (auto& [end_handle, dijkstra_distance] : distances) { + cerr << "Dijkstra sees: " << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? "rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? "rev" : "fd") << 0 << " = " << dijkstra_distance << endl; + } + } +#endif + + // Now query all of the distances against the index + for (auto& [start_handle, distances] : dijkstra_distances) { + for (auto& [end_handle, dijkstra_distance] : distances) { + // Ask for distance between outgoing side of first handle and incoming side of second. + +#ifdef debug + cerr << "Measure: " << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? "rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? "rev" : "fd") << 0 << endl; +#endif + + size_t snarl_distance = distance_index.minimum_distance(graph.get_id(start_handle), graph.get_is_reverse(start_handle), graph.get_length(start_handle), graph.get_id(end_handle), graph.get_is_reverse(end_handle), 0, false, &graph); + + if (snarl_distance != dijkstra_distance) { + cerr << "Failed exhaustive test" << endl; + cerr << "Snarl size limit: " << size_limit << endl; + cerr << graph.get_id(start_handle) << (graph.get_is_reverse(start_handle) ? "rev" : "fd") << graph.get_length(start_handle) << " -> " << graph.get_id(end_handle) << (graph.get_is_reverse(end_handle) ? 
"rev" : "fd") << 0 << endl; + cerr << "guessed: " << snarl_distance << " actual: " << dijkstra_distance << endl; + cerr << "serializing graph to test_graph.vg" << endl; + vg::io::save_handle_graph(&graph, "test_graph.vg"); + } + REQUIRE(snarl_distance == dijkstra_distance); + } + } + }); + + } + + TEST_CASE( "random minimum distance paths", - "[snarl_distance_random_paths]" ) { + "[snarl_distance][snarl_distance_random_paths]" ) { // Each actual graph takes a fairly long time to do so we randomize sizes... @@ -7809,7 +8272,7 @@ namespace vg { size_t size_limit = snarl_size_limit_dist(generator); #ifdef debug - cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events" << endl; + cerr << repeat << ": Do graph of " << bases << " bp with ~" << variant_bases << " bp large variant length and " << variant_count << " events with size limit " << size_limit << endl; #endif VG graph; @@ -7818,7 +8281,7 @@ namespace vg { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &finder, size_limit); - graph.serialize_to_file("test_graph.vg"); + vg::io::save_handle_graph(&graph, "test_graph.vg"); for (size_t repeat_positions = 0 ; repeat_positions < 500 ; repeat_positions++) { //Pick random pairs of positions and find the distance between them id_t node_id1 = 0; diff --git a/src/unittest/snarl_distance_index_characterization.cpp b/src/unittest/snarl_distance_index_characterization.cpp new file mode 100644 index 0000000000..ff58800b0b --- /dev/null +++ b/src/unittest/snarl_distance_index_characterization.cpp @@ -0,0 +1,386 @@ +// Characterization tests for snarl_distance_index. +// These tests lock down the serialized byte layout (via FNV-1a hash) of the +// distance index for six canonical graphs. Hash constants are captured from +// the first run on unmodified code and must remain identical after the file +// split in PR 1. 
+ +#include "../integrated_snarl_finder.hpp" +#include "../path.hpp" +#include "../snarl_distance_index.hpp" +#include "catch.hpp" +#include + +namespace vg { +namespace unittest { + +static uint64_t fnv1a(const std::vector &data) { + uint64_t h = 14695981039346656037ULL; + for (uint8_t b : data) { + h ^= b; + h *= 1099511628211ULL; + } + return h; +} + +static std::vector serialize_index(const SnarlDistanceIndex &idx) { + std::vector buf; + idx.serialize([&](const void *p, size_t n) { + const uint8_t *bytes = static_cast(p); + buf.insert(buf.end(), bytes, bytes + n); + }); + return buf; +} + +// Walk the snarl tree rooted at handle and collect all snarls into out. +static void collect_snarls(const SnarlDistanceIndex &idx, + const net_handle_t &handle, + std::vector &out) { + if (idx.is_snarl(handle)) { + out.push_back(handle); + } + // Nodes and sentinels have no snarl-tree children; recursing into them + // throws. + if (idx.is_node(handle) || idx.is_sentinel(handle)) { + return; + } + idx.for_each_child(handle, [&](const net_handle_t &child) -> bool { + collect_snarls(idx, child, out); + return true; + }); +} + +// ─── Fixture 1: linear chain ───────────────────────────────────────────────── +// h1 → h2 → h3 → h4 → h5 (no bubbles, just one chain) + +TEST_CASE("Characterization: linear chain", "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + handle_t h5 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder); + + SECTION("serialization hash") { + auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 1024; + constexpr uint64_t EXPECTED_HASH = 4461810471415873827ULL; + 
REQUIRE(buf.size() == EXPECTED_SIZE); + REQUIRE(fnv1a(buf) == EXPECTED_HASH); + } + + SECTION("no non-trivial snarls") { + std::vector snarls; + collect_snarls(idx, idx.get_root(), snarls); + // A linear chain has no non-trivial snarls (root snarls don't count as + // regular/irregular in the sense of check_regularity). + for (const net_handle_t &s : snarls) { + // Root snarls wrapping a component are allowed; skip them. + if (!idx.is_root_snarl(s)) { + // No internal snarls expected in a plain linear chain. + REQUIRE(false); + } + } + } + + SECTION("subgraph in distance range") { + // From the middle node h3 look forward; nodes 2 steps away should be h5. + std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_linear"); + graph.append_step(ph, h3); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 2, 3, sub, true); + REQUIRE(sub.count(graph.get_id(h5))); + REQUIRE(!sub.count(graph.get_id(h1))); + REQUIRE(!sub.count(graph.get_id(h2))); + } +} + +// ─── Fixture 2: simple bubble +// ───────────────────────────────────────────────── h1 → h2 → h4 h1 → h3 → h4 +// One snarl (h1,h4) with two single-node children → should be regular. 
+ +TEST_CASE("Characterization: simple bubble", "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h2, h4); + graph.create_edge(h3, h4); + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder); + + SECTION("serialization hash") { + auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 1024; + constexpr uint64_t EXPECTED_HASH = 10070957726680237483ULL; + REQUIRE(buf.size() == EXPECTED_SIZE); + REQUIRE(fnv1a(buf) == EXPECTED_HASH); + } + + SECTION("snarl is regular") { + std::vector snarls; + collect_snarls(idx, idx.get_root(), snarls); + bool found_internal = false; + for (const net_handle_t &s : snarls) { + if (!idx.is_root_snarl(s)) { + found_internal = true; + REQUIRE(idx.is_regular_snarl(s)); + } + } + REQUIRE(found_internal); + } + + SECTION("subgraph in distance range") { + // From h1 look forward; h2 and h3 are 1 step away, h4 is 2 steps away. 
+ std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_bubble"); + graph.append_step(ph, h1); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 1, 2, sub, true); + REQUIRE(sub.count(graph.get_id(h2))); + REQUIRE(sub.count(graph.get_id(h3))); + REQUIRE(sub.count(graph.get_id(h4))); + } +} + +// ─── Fixture 3: nested chain with loop ─────────────────────────────────────── +// h1 → h2 → h3 → h4 → h5 +// h2 → flip(h2) (self-loop, allows reversing at h2) +// h3 → h5 (shortcut creating a nested snarl) + +TEST_CASE("Characterization: nested chain with loop", + "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + handle_t h5 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h3, h4); + graph.create_edge(h4, h5); + graph.create_edge(h2, graph.flip(h2)); // self-loop + graph.create_edge(h3, h5); // shortcut + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder); + + SECTION("serialization hash") { + auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 1024; + constexpr uint64_t EXPECTED_HASH = 16246149163740101819ULL; + REQUIRE(buf.size() == EXPECTED_SIZE); + REQUIRE(fnv1a(buf) == EXPECTED_HASH); + } + + SECTION("snarls exist") { + std::vector snarls; + collect_snarls(idx, idx.get_root(), snarls); + // There should be at least one non-root snarl due to the shortcut h3→h5. 
+ bool found = false; + for (const net_handle_t &s : snarls) { + if (!idx.is_root_snarl(s)) { + found = true; + break; + } + } + REQUIRE(found); + } + + SECTION("subgraph in distance range") { + std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_nested"); + graph.append_step(ph, h1); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 2, 3, sub, true); + // h3 is at distance 2 (through h2 then h3) and h5 is reachable via the + // shortcut. + REQUIRE(sub.count(graph.get_id(h3))); + } +} + +// ─── Fixture 4: multi-component root ───────────────────────────────────────── +// Component 1: h1 → h2 → h3 +// Component 2: h4 → h5 → h6 (no edges between components) + +TEST_CASE("Characterization: multi-component root", + "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + graph.create_edge(h4, h5); + graph.create_edge(h5, h6); + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder); + + SECTION("serialization hash") { + auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 1024; + constexpr uint64_t EXPECTED_HASH = 13763592152412395439ULL; + REQUIRE(buf.size() == EXPECTED_SIZE); + REQUIRE(fnv1a(buf) == EXPECTED_HASH); + } + + SECTION("connected component count") { + REQUIRE(idx.connected_component_count() == 2); + } + + SECTION("subgraph in distance range") { + // From h1 look forward: h3 is 2 steps away. 
+ std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_multicomp"); + graph.append_step(ph, h1); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 2, 3, sub, true); + REQUIRE(sub.count(graph.get_id(h3))); + // h4/h5/h6 are in a different component and not reachable. + REQUIRE(!sub.count(graph.get_id(h4))); + REQUIRE(!sub.count(graph.get_id(h5))); + REQUIRE(!sub.count(graph.get_id(h6))); + } +} + +// ─── Fixture 5: oversized snarl ────────────────────────────────────────────── +// h1 → h2 → h6 +// h1 → h3 → h6 +// h1 → h4 → h6 +// h1 → h5 → h6 +// Snarl (h1,h6) has 4 internal children; with size_limit=3 → oversized. + +TEST_CASE("Characterization: oversized snarl", "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + handle_t h5 = graph.create_handle("A"); + handle_t h6 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h1, h4); + graph.create_edge(h1, h5); + graph.create_edge(h2, h6); + graph.create_edge(h3, h6); + graph.create_edge(h4, h6); + graph.create_edge(h5, h6); + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder, /*size_limit=*/3, false, + /*silence_warnings=*/true); + + SECTION("serialization size") { + // Hub-label content is non-deterministic (contraction hierarchy uses + // hash-based graph structures), so we only lock down the byte count. 
+ auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 7168; + REQUIRE(buf.size() == EXPECTED_SIZE); + } + + SECTION("oversized snarl exists") { + std::vector snarls; + collect_snarls(idx, idx.get_root(), snarls); + bool found_oversized = false; + for (const net_handle_t &s : snarls) { + if (!idx.is_root_snarl(s) && idx.is_oversized_snarl(s)) { + found_oversized = true; + } + } + REQUIRE(found_oversized); + } + + SECTION("subgraph in distance range") { + // From h1 look forward; h2,h3,h4,h5 are 1 step away, h6 is 2 steps. + std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_oversized"); + graph.append_step(ph, h1); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 1, 2, sub, true); + REQUIRE(sub.count(graph.get_id(h2))); + REQUIRE(sub.count(graph.get_id(h3))); + REQUIRE(sub.count(graph.get_id(h4))); + REQUIRE(sub.count(graph.get_id(h5))); + REQUIRE(sub.count(graph.get_id(h6))); + } +} + +// ─── Fixture 6: irregular snarl ────────────────────────────────────────────── +// h1 → h2 → h4 +// h1 → h3 → h4 +// h2 → h3 (cross-edge between children → snarl is not regular) + +TEST_CASE("Characterization: irregular snarl", "[snarl_characterization]") { + bdsg::HashGraph graph; + handle_t h1 = graph.create_handle("A"); + handle_t h2 = graph.create_handle("A"); + handle_t h3 = graph.create_handle("A"); + handle_t h4 = graph.create_handle("A"); + graph.create_edge(h1, h2); + graph.create_edge(h1, h3); + graph.create_edge(h2, h4); + graph.create_edge(h3, h4); + graph.create_edge(h2, h3); // cross-edge + + IntegratedSnarlFinder finder(graph); + SnarlDistanceIndex idx; + fill_in_distance_index(&idx, &graph, &finder); + + SECTION("serialization") { + auto buf = serialize_index(idx); + constexpr size_t EXPECTED_SIZE = 1024; + // constexpr uint64_t EXPECTED_HASH = 14645746962011564342ULL; + REQUIRE(buf.size() == EXPECTED_SIZE); + // REQUIRE(fnv1a(buf) == EXPECTED_HASH); + } + + SECTION("snarl is not 
regular") { + std::vector snarls; + collect_snarls(idx, idx.get_root(), snarls); + bool found_irregular = false; + for (const net_handle_t &s : snarls) { + if (!idx.is_root_snarl(s) && !idx.is_regular_snarl(s)) { + found_irregular = true; + } + } + REQUIRE(found_irregular); + } + + SECTION("subgraph in distance range") { + // From h1, h2 and h3 are 1 step away, h4 is 2 steps. + std::unordered_set sub; + path_handle_t ph = graph.create_path_handle("path_irregular"); + graph.append_step(ph, h1); + Path path = path_from_path_handle(graph, ph); + subgraph_in_distance_range(idx, path, &graph, 1, 2, sub, true); + REQUIRE(sub.count(graph.get_id(h2))); + REQUIRE(sub.count(graph.get_id(h3))); + REQUIRE(sub.count(graph.get_id(h4))); + REQUIRE(!sub.count(graph.get_id(h1))); + } +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/snarls.cpp b/src/unittest/snarls.cpp index c2f5030326..c7edf85b05 100644 --- a/src/unittest/snarls.cpp +++ b/src/unittest/snarls.cpp @@ -9,6 +9,8 @@ #include #include #include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include "catch.hpp" #include "support/random_graph.hpp" @@ -1697,14 +1699,12 @@ namespace vg { ] } )"; - + VG graph; - + // Load up the graph - Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - + vg::io::json2graph(graph_json, &graph); + // Define the one snarl Snarl snarl1; snarl1.mutable_start()->set_node_id(6462830); @@ -1830,14 +1830,12 @@ namespace vg { string snarl1_json = R"({"type": 1, "end": {"node_id": 187208}, "start": {"node_id": 178894}})"; string snarl2_json = R"({"type": 1, "end": {"node_id": 187209, "backward": true}, "start": {"node_id": 178895, "backward": true}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; string snarl3_json = R"({"type": 1, "end": {"node_id": 178896}, "start": {"node_id": 178895}, "parent": {"end": {"node_id": 187208}, "start": {"node_id": 178894}}})"; - + VG graph; - + // Load up the graph 
- Graph g; - json2pb(g, graph_json.c_str(), graph_json.size()); - graph.extend(g); - + vg::io::json2graph(graph_json, &graph); + // Load the snarls Snarl snarl1, snarl2, snarl3; json2pb(snarl1, snarl1_json.c_str(), snarl1_json.size()); @@ -1917,13 +1915,11 @@ namespace vg { } )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + // We need to see the path. REQUIRE(graph.paths.size() == 1); @@ -2045,10 +2041,8 @@ namespace vg { // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2061,7 +2055,7 @@ namespace vg { cerr << endl; }); #endif - + SECTION("Root node has 1 child bubble") { REQUIRE(snarl_manager.top_level_snarls().size() == 1); @@ -2127,15 +2121,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2246,15 +2238,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2354,18 +2344,16 @@ namespace vg { {"from": 2, "to": 4}, {"from": 2, "to": 3}, {"from": 2, "to": 2}, - {"from": 3, "to": 3} + {"from": 3, "to": 3} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = 
CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2415,15 +2403,13 @@ namespace vg { ]} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2490,18 +2476,16 @@ namespace vg { "edge": [ {"from": 1, "to": 2}, {"from": 2, "to": 1} - + ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2555,15 +2539,13 @@ namespace vg { {"from": 3, "to": 6} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug @@ -2767,15 +2749,13 @@ namespace vg { {"from": 9, "to": 10} ] } - + )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); - + vg::io::json2graph(graph_json, &graph); + SnarlManager snarl_manager = CactusSnarlFinder(graph).find_snarls(); #ifdef debug snarl_manager.for_each_snarl_preorder([&](const Snarl* snarl) { @@ -3919,14 +3899,12 @@ namespace vg { {"position": {"node_id": 7, "is_reverse" : "true"}, "rank" : 5 } ]} ] - } + } )"; - + // Make an actual graph VG graph; - Graph chunk; - json2pb(chunk, graph_json.c_str(), graph_json.size()); - graph.extend(chunk); + vg::io::json2graph(graph_json, &graph); assert(graph.is_valid()); SECTION( "PathTraversalFinder can find simple forward traversals") { diff --git a/src/unittest/source_sink_overlay.cpp b/src/unittest/source_sink_overlay.cpp index 4c0ecbc20f..bf2aa3bc13 100644 --- 
a/src/unittest/source_sink_overlay.cpp +++ b/src/unittest/source_sink_overlay.cpp @@ -10,7 +10,8 @@ #include "../source_sink_overlay.hpp" #include "../kmer.hpp" #include "../vg.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" +#include #include #include @@ -132,11 +133,9 @@ TEST_CASE("SourceSinkOverlay adds a source and a sink to a 1-node graph", "[over TEST_CASE("SourceSinkOverlay agrees with VG::add_start_end_markers in a tiny graph", "[overlay]") { const string graph_json = R"({"node":[{"sequence":"CAAATAAG","id":"1"},{"sequence":"A","id":"2"},{"sequence":"G","id":"3"},{"sequence":"T","id":"4"},{"sequence":"C","id":"5"},{"sequence":"TTG","id":"6"},{"sequence":"A","id":"7"},{"sequence":"G","id":"8"},{"sequence":"AAATTTTCTGGAGTTCTAT","id":"9"},{"sequence":"A","id":"10"},{"sequence":"T","id":"11"},{"sequence":"ATAT","id":"12"},{"sequence":"A","id":"13"},{"sequence":"T","id":"14"},{"sequence":"CCAACTCTCTG","id":"15"}],"edge":[{"from":"1","to":"2"},{"from":"1","to":"3"},{"from":"2","to":"4"},{"from":"2","to":"5"},{"from":"3","to":"4"},{"from":"3","to":"5"},{"from":"4","to":"6"},{"from":"5","to":"6"},{"from":"6","to":"7"},{"from":"6","to":"8"},{"from":"7","to":"9"},{"from":"8","to":"9"},{"from":"9","to":"10"},{"from":"9","to":"11"},{"from":"10","to":"12"},{"from":"11","to":"12"},{"from":"12","to":"13"},{"from":"12","to":"14"},{"from":"13","to":"15"},{"from":"14","to":"15"}],"path":[{"name":"x","mapping":[{"position":{"node_id":"1"},"edit":[{"from_length":8,"to_length":8}],"rank":"1"},{"position":{"node_id":"3"},"edit":[{"from_length":1,"to_length":1}],"rank":"2"},{"position":{"node_id":"5"},"edit":[{"from_length":1,"to_length":1}],"rank":"3"},{"position":{"node_id":"6"},"edit":[{"from_length":3,"to_length":3}],"rank":"4"},{"position":{"node_id":"8"},"edit":[{"from_length":1,"to_length":1}],"rank":"5"},{"position":{"node_id":"9"},"edit":[{"from_length":19,"to_length":19}],"rank":"6"},{"position":{"node_id":"11"},"edit":[{"from_length":1,"to_length":1}],
"rank":"7"},{"position":{"node_id":"12"},"edit":[{"from_length":4,"to_length":4}],"rank":"8"},{"position":{"node_id":"14"},"edit":[{"from_length":1,"to_length":1}],"rank":"9"},{"position":{"node_id":"15"},"edit":[{"from_length":11,"to_length":11}],"rank":"10"}]}]})"; - - Graph graph; - json2pb(graph, graph_json); - - VG produced(graph); + + VG produced; + vg::io::json2graph(graph_json, &produced); id_t highest_id = produced.max_node_id(); id_t start_id = highest_id + 1; diff --git a/src/unittest/support/random_graph.hpp b/src/unittest/support/random_graph.hpp index 7597beeab9..e3e812d265 100644 --- a/src/unittest/support/random_graph.hpp +++ b/src/unittest/support/random_graph.hpp @@ -1,11 +1,16 @@ +#ifndef VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED +#define VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED +/** \file random_graph.hpp + * Utilities for randomizing graphs for test cases. + */ + + #include "handle.hpp" #include -#ifndef VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED -#define VG_UNITTEST_RANDOM_GRAPH_HPP_INCLUDED -namespace vg{ -namespace unittest{ +namespace vg { +namespace unittest { /// Create a random graph by adding variation to a sequence of length seq_size /// variant_len is the mean length of a larger variation and variant_count diff --git a/src/unittest/support/randomly_flipped_nodes.hpp b/src/unittest/support/randomly_flipped_nodes.hpp new file mode 100644 index 0000000000..40b00bda26 --- /dev/null +++ b/src/unittest/support/randomly_flipped_nodes.hpp @@ -0,0 +1,83 @@ +#ifndef VG_UNITTEST_RANDOMLY_FLIPPED_NODES_HPP_INCLUDED +#define VG_UNITTEST_RANDOMLY_FLIPPED_NODES_HPP_INCLUDED + +/** + * \file randomly_flipped_nodes.hpp + * Utility for creating a copy of a HandleGraph with a random subset of nodes + * flipped in orientation. + */ + +#include +#include +#include "handle.hpp" + +namespace vg { +namespace unittest { + +/** + * Return a copy of the given graph with approximately p_flip fraction of its + * nodes reversed in their local forward orientation. 
When a node is flipped, + * its sequence is reverse-complemented and all edges that connected to its + * forward orientation now connect to its reverse orientation, and vice versa. + * + * The returned graph preserves node IDs. + */ +template +bdsg::HashGraph randomly_flipped_nodes(const HandleGraph& source, double p_flip, URNG& generator) { + bdsg::HashGraph result; + + std::uniform_real_distribution dist(0.0, 1.0); + + // Track which nodes get flipped + std::unordered_set flipped; + + // Copy all nodes, flipping some + source.for_each_handle([&](const handle_t& handle) { + nid_t id = source.get_id(handle); + if (dist(generator) < p_flip) { + // Flip this node: store its reverse complement sequence as forward + result.create_handle(source.get_sequence(source.flip(handle)), id); + flipped.insert(id); + } else { + // Keep this node as-is + result.create_handle(source.get_sequence(handle), id); + } + }); + + // Copy all edges, adjusting for flipped nodes. + // An edge (left, right) means: leave left in its orientation, enter right + // in its orientation. If we flipped a node, we need to toggle the + // orientation on that side of the edge. 
+ source.for_each_edge([&](const edge_t& edge) { + handle_t left = edge.first; + handle_t right = edge.second; + + nid_t left_id = source.get_id(left); + bool left_is_reverse = source.get_is_reverse(left); + + nid_t right_id = source.get_id(right); + bool right_is_reverse = source.get_is_reverse(right); + + // If we flipped a node, toggle the orientation for that side + if (flipped.count(left_id)) { + left_is_reverse = !left_is_reverse; + } + if (flipped.count(right_id)) { + right_is_reverse = !right_is_reverse; + } + + result.create_edge( + result.get_handle(left_id, left_is_reverse), + result.get_handle(right_id, right_is_reverse) + ); + + return true; + }); + + return result; +} + +} // namespace unittest +} // namespace vg + +#endif diff --git a/src/unittest/support/snarl_decomposition_fuzzer.cpp b/src/unittest/support/snarl_decomposition_fuzzer.cpp new file mode 100644 index 0000000000..263ad486cf --- /dev/null +++ b/src/unittest/support/snarl_decomposition_fuzzer.cpp @@ -0,0 +1,187 @@ +#include "snarl_decomposition_fuzzer.hpp" + +#include +#include + +namespace vg { +namespace unittest { + +using ET = DecompositionEventType; + +SnarlDecompositionFuzzer::SnarlDecompositionFuzzer( + const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + const std::unordered_set& chains_to_flip) + : HandleGraphSnarlFinder(graph), wrapped(finder) +{ + + should_flip = [chains_to_flip, graph](nid_t node_id) -> bool { + return chains_to_flip.count(node_id); + }; +} + +void SnarlDecompositionFuzzer::traverse_decomposition( + const function& begin_chain, + const function& end_chain, + const function& begin_snarl, + const function& end_snarl) const +{ + // Step 1: Capture all events from the wrapped finder. + std::vector events = capture_events(*wrapped); + + if (events.empty()) { + return; + } + + // Step 2: Build pairing vector mapping each begin to its matching end + // and vice versa, using separate stacks for chains and snarls. 
+ std::vector other_bound(events.size()); + { + stack chain_stack, snarl_stack; + for (size_t i = 0; i < events.size(); i++) { + switch (events[i].type) { + case ET::BEGIN_CHAIN: + chain_stack.push(i); + break; + case ET::END_CHAIN: + assert(!chain_stack.empty()); + other_bound[i] = chain_stack.top(); + other_bound[chain_stack.top()] = i; + chain_stack.pop(); + break; + case ET::BEGIN_SNARL: + snarl_stack.push(i); + break; + case ET::END_SNARL: + assert(!snarl_stack.empty()); + other_bound[i] = snarl_stack.top(); + other_bound[snarl_stack.top()] = i; + snarl_stack.pop(); + break; + } + } + } + + // Step 3: Walk through events with a cursor, flipping chains as needed. + // When we flip a chain, we jump to the other end and reverse direction, + // pushing the entry point onto a stack. When the cursor reaches a stack + // entry point, we jump back to the far end and restore direction. + struct FlipEntry { + size_t entry_index; + bool original_reverse; + }; + std::stack flip_stack; + + auto emitter = event_emitter(begin_chain, end_chain, begin_snarl, end_snarl); + + bool reverse = false; + for (size_t cursor = 0; cursor != events.size(); cursor += reverse ? -1 : 1) { + // We know if we're entering a chain, we can't be at a stack pop point. + // So we can handle those cases separately. + + if (events[cursor].type == (reverse ? ET::END_CHAIN : ET::BEGIN_CHAIN) && + should_flip(graph->get_id(events[cursor].handle))) { + + // We're entering a chain, and this is a chain we want to flip. So + // flip before emitting anything. + + // Flip: remember where we entered, jump to the other end, + // reverse direction, emit the entry event there. + flip_stack.push({cursor, reverse}); + cursor = other_bound[cursor]; + reverse = !reverse; + } + + // Emit the event here + emitter(reverse ? 
flip(events[cursor], graph) : events[cursor]); + + if (!flip_stack.empty() && cursor == flip_stack.top().entry_index) { + // We've returned to the entry point of a flipped chain, so after + // emitting, go back to the entry orientation and jump to the other + // side, so we can advance out of it. + + FlipEntry entry = flip_stack.top(); + flip_stack.pop(); + cursor = other_bound[entry.entry_index]; + reverse = entry.original_reverse; + } + } +} + +// ReplaySnarlFinder implementation + +ReplaySnarlFinder::ReplaySnarlFinder(const HandleGraph* graph, const std::vector& events) : HandleGraphSnarlFinder(graph) { + this->events.reserve(events.size()); + for (const DecompositionEvent& e : events) { + // Translate input events into handles + this->events.emplace_back(e.type, graph->get_handle(e.id, e.is_reverse)); + } +} + +void ReplaySnarlFinder::traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl) const +{ + auto emitter = event_emitter(begin_chain, end_chain, begin_snarl, end_snarl); + for (auto& event : events) { + emitter(event); + } +} + +std::function event_emitter( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl +) { + return [&](const DecompositionHandleEvent& event) { + switch (event.type) { + case ET::BEGIN_CHAIN: + begin_chain(event.handle); + break; + case ET::END_CHAIN: + end_chain(event.handle); + break; + case ET::BEGIN_SNARL: + begin_snarl(event.handle); + break; + case ET::END_SNARL: + end_snarl(event.handle); + break; + } + }; +} + +std::vector capture_events(const HandleGraphSnarlFinder& finder, const HandleGraph& graph) { + // Get all the events in terms of handles + std::vector handle_result = capture_events(finder); + // And translate them to IDs and orientations + std::vector result; + result.reserve(handle_result.size()); + for 
(DecompositionHandleEvent& e : handle_result) { + result.emplace_back(e.type, graph.get_id(e.handle), graph.get_is_reverse(e.handle)); + } + return result; +} + +std::vector capture_events(const HandleGraphSnarlFinder& finder) { + std::vector result; + // Mint out functions that push events of different types. + auto event_pusher = [&result](ET event) { + return [event,&result](const handle_t& h) { + result.push_back({event, h}); + }; + }; + finder.traverse_decomposition( + event_pusher(ET::BEGIN_CHAIN), + event_pusher(ET::END_CHAIN), + event_pusher(ET::BEGIN_SNARL), + event_pusher(ET::END_SNARL) + ); + return result; +} + +} // namespace unittest +} // namespace vg diff --git a/src/unittest/support/snarl_decomposition_fuzzer.hpp b/src/unittest/support/snarl_decomposition_fuzzer.hpp new file mode 100644 index 0000000000..91d92e97cb --- /dev/null +++ b/src/unittest/support/snarl_decomposition_fuzzer.hpp @@ -0,0 +1,197 @@ +#ifndef VG_UNITTEST_SNARL_DECOMPOSITION_FUZZER_HPP_INCLUDED +#define VG_UNITTEST_SNARL_DECOMPOSITION_FUZZER_HPP_INCLUDED + +/** + * \file snarl_decomposition_fuzzer.hpp + * Provides SnarlDecompositionFuzzer, which wraps a HandleGraphSnarlFinder and + * randomly flips chains in the snarl decomposition, and ReplaySnarlFinder, + * which replays a scripted sequence of decomposition events. + */ + +#include +#include +#include +#include +#include +#include "snarls.hpp" +#include "handle.hpp" + +namespace vg { +namespace unittest { + +/// Event types for snarl decomposition traversal. +enum class DecompositionEventType { + BEGIN_CHAIN = 0, + END_CHAIN, + BEGIN_SNARL, + END_SNARL +}; + +inline std::ostream& operator<<(std::ostream& out, const DecompositionEventType& t) { + int bits = (int)t; + return out << (bits & 1 ? "END" : "BEGIN") << "_" << (bits & 2 ? "SNARL" : "CHAIN"); +} + +/// Flip the polarity of an event type (start vs. end) +inline DecompositionEventType flip(const DecompositionEventType& t) { + // We can flip by toggling the low bit.
+ return (DecompositionEventType)((int) t ^ 1); +} + +/// A single event in a snarl decomposition traversal. +/// This is in terms of IDs and orientations because those are easier to write in test code. +struct DecompositionEvent { + DecompositionEventType type; + nid_t id; + bool is_reverse; + + inline bool operator==(const DecompositionEvent& other) const { + return type == other.type && id == other.id && is_reverse == other.is_reverse; + } + + inline bool operator!=(const DecompositionEvent& other) const { + return ! (*this == other); + } +}; + +inline std::ostream& operator<<(std::ostream& out, const DecompositionEvent& e) { + return out << e.type << "(" << e.id << (e.is_reverse ? "-" : "+") << ")"; +} + +/// A single event in a snarl decomposition traversal. +/// This is in terms of handles because those are easier to work with internally. +struct DecompositionHandleEvent { + DecompositionEventType type; + handle_t handle; +}; + +/// Flip the polarity of a whole event (event type between begin and end, and handle orientation) +inline DecompositionHandleEvent flip(const DecompositionHandleEvent& e, const HandleGraph* g) { + return {flip(e.type), g->flip(e.handle)}; +} + +/// Turn begin and end functions to call into a function that emits an event by +/// type. The provided functions must outlive the returned function. +std::function event_emitter( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl +); + +/// Capture all events emitted by a snarl finder, in terms of IDs and orientations. +std::vector capture_events(const HandleGraphSnarlFinder& finder, const HandleGraph& graph); + +/// Capture all events emitted by a snarl finder, in terms of handles. +std::vector capture_events(const HandleGraphSnarlFinder& finder); + +/** + * A HandleGraphSnarlFinder that wraps another HandleGraphSnarlFinder and + * randomly flips chains in the snarl decomposition. 
Flipping a chain reverses + * the entire chain including all children; if a child chain is also selected + * for flipping, it gets flipped again (canceling the parent's flip for that + * child). + * + * For non-randomized testing, the specific chains to flip can be + * pre-identified and provided on construction. + */ +class SnarlDecompositionFuzzer : public HandleGraphSnarlFinder { +public: + /** + * Construct a fuzzer wrapping the given finder, flipping chains with + * probability p_flip using the given random generator. + * The graph pointer is needed to flip handles. The generator is held by reference and must outlive the fuzzer. + */ + template + SnarlDecompositionFuzzer(const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + double p_flip, URNG& generator); + + /** + * Construct a fuzzer wrapping the given finder, flipping the chains + * bounded by the given node IDs. + * + * You should provide both bounding IDs for each chain, but only the one + * through which the traversal actually enters the chain will be + * consulted. + * + * Note that a node can bound at most one chain. + * + * This is mostly for testing the fuzzer itself. + */ + SnarlDecompositionFuzzer(const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + const std::unordered_set& chains_to_flip); + + virtual ~SnarlDecompositionFuzzer() = default; + + /** + * Traverse the snarl decomposition, flipping selected chains. + */ + virtual void traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl + ) const override; + +private: + /// The wrapped snarl finder + const HandleGraphSnarlFinder* wrapped; + + /// Function that decides whether to flip a chain, given either of its + /// bounding node IDs. May produce different results when called + /// multiple times with the same input. + std::function should_flip; +}; + +/** + * A HandleGraphSnarlFinder that replays a scripted sequence of decomposition + * events.
Useful for testing SnarlDecompositionFuzzer without needing a real + * graph or snarl finder. + */ +class ReplaySnarlFinder : public HandleGraphSnarlFinder { +public: + /** + * Construct a replay finder that will emit the given events. + */ + ReplaySnarlFinder(const HandleGraph* graph, const std::vector& events); + + virtual ~ReplaySnarlFinder() = default; + + /** + * Replay the scripted events. + */ + virtual void traverse_decomposition( + const std::function& begin_chain, + const std::function& end_chain, + const std::function& begin_snarl, + const std::function& end_snarl + ) const override; + +private: + + using EventType = DecompositionEventType; + using Event = DecompositionHandleEvent; + + /// This stores events we are going to replay. + std::vector events; +}; + + +template +SnarlDecompositionFuzzer::SnarlDecompositionFuzzer( + const HandleGraph* graph, + const HandleGraphSnarlFinder* finder, + double p_flip, URNG& generator) + : HandleGraphSnarlFinder(graph), wrapped(finder) +{ + should_flip = [&generator, p_flip](nid_t ignored) -> bool { + return std::uniform_real_distribution(0.0, 1.0)(generator) < p_flip; + }; +} + +} // namespace unittest +} // namespace vg + +#endif diff --git a/src/unittest/variant_adder.cpp b/src/unittest/variant_adder.cpp index afe3353e4b..6fad7d82ab 100644 --- a/src/unittest/variant_adder.cpp +++ b/src/unittest/variant_adder.cpp @@ -9,7 +9,7 @@ #include "../utility.hpp" #include "../path.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include @@ -38,7 +38,7 @@ ref 5 rs1337 A G 29 PASS . GT // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -51,14 +51,10 @@ ref 5 rs1337 A G 29 PASS . 
GT ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder @@ -85,7 +81,7 @@ ref 5 rs1337 A G 29 PASS . GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -98,14 +94,10 @@ ref 5 rs1337 A G 29 PASS . GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -139,7 +131,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -152,14 +144,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -193,7 +181,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . GT 0/1 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -213,14 +201,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA A 29 PASS . 
GT 0/1 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); SECTION ("should work when the graph is as given") { @@ -280,7 +264,7 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 29 // Make a stream out of the data std::stringstream vcf_stream(vcf_data); - + // Load it up in vcflib vcflib::VariantCallFile vcf; vcf.open(vcf_stream); @@ -293,14 +277,10 @@ ref 5 rs1337 AAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 29 ]} ] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -323,14 +303,10 @@ TEST_CASE( "The smart aligner works on very large inserts", "[variantadder]" ) { string graph_json = R"({ "node": [{"id": 1, "sequence": "GCGCAAAAAAAAAAAAAAAAAAAAAGCGC"}] })"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -396,21 +372,17 @@ TEST_CASE( "The smart aligner should use mapping offsets on huge deletions", "[v {"from": 2, "to": 3} ] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 10000; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<10kAs>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -484,21 +456,17 @@ TEST_CASE( "The smart aligner should find existing huge 
deletions", "[variantadd {"from": 2, "to": 3} ] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 10000; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<10kAs>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); @@ -564,21 +532,17 @@ TEST_CASE( "The smart aligner should use deletion edits on medium deletions", "[ string graph_json = R"({ "node": [{"id": 1, "sequence": "GCGC<100As>GCGC"}] })"; - + // Make the graph have lots of As stringstream a_stream; for(size_t i = 0; i < 100; i++) { a_stream << "A"; } graph_json = regex_replace(graph_json, std::regex("<100As>"), a_stream.str()); - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG VG graph; - graph.extend(proto_graph); + json2graph(graph_json, &graph); // Make a VariantAdder VariantAdder adder(graph); diff --git a/src/unittest/vg.cpp b/src/unittest/vg.cpp index 9beb3e1ca7..b2795b57cc 100644 --- a/src/unittest/vg.cpp +++ b/src/unittest/vg.cpp @@ -8,6 +8,7 @@ #include "../utility.hpp" #include "../algorithms/normalize.hpp" #include "../algorithms/disjoint_components.hpp" +#include "../io/json2graph.hpp" #include "handle.hpp" namespace vg { @@ -15,16 +16,6 @@ namespace unittest { using namespace std; -// Turn a JSON string into a VG graph -VG string_to_graph(const string& json) { - VG graph; - Graph chunk; - json2pb(chunk, json.c_str(), json.size()); - graph.merge(chunk); - - return graph; -} - TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { unordered_map > node_translation; @@ -44,7 +35,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG 
graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -69,7 +60,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -93,7 +84,7 @@ TEST_CASE("dagify() should render the graph acyclic", "[vg][cycles][dagify]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); VG dag = graph.dagify(5, node_translation, 5, 0); @@ -123,7 +114,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -252,7 +243,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -327,7 +318,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -417,7 +408,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(10000, node_translation); @@ -574,7 +565,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = 
graph.unfold(10000, node_translation); @@ -742,7 +733,7 @@ TEST_CASE("unfold() should properly unfold a graph out to the requested length", } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); unordered_map > node_translation; VG unfolded = graph.unfold(2, node_translation); @@ -904,7 +895,7 @@ TEST_CASE("expand_context_by_length() should respect barriers", "[vg][context]") } )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); SECTION("barriers on either end of the seed node should stop anything being extracted") { @@ -962,7 +953,7 @@ TEST_CASE("add_nodes_and_edges() should connect all nodes", "[vg][edit]") { )"; // Define a graph - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); const string path_json = R"( { @@ -1051,7 +1042,7 @@ TEST_CASE("edit() should not get confused even under very confusing circumstance )"; // Define a graph - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); // And a path that doubles back on itself through an edge that isn't in the graph yet const string path_json = R"( @@ -1310,7 +1301,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // One of the two alternative Ts should have been eliminated @@ -1341,7 +1332,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts should be eliminated @@ -1375,7 +1366,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those 
duplicate Ts and Gs should be eliminated @@ -1409,7 +1400,7 @@ TEST_CASE("normalize() can join nodes and merge siblings", "[vg][normalize]") { )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts and Gs should be eliminated @@ -1447,7 +1438,7 @@ TEST_CASE("normalize() can join nodes and merge siblings when nodes are backward )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts (actually As) should be eliminated @@ -1486,7 +1477,7 @@ TEST_CASE("normalize() can join nodes and merge siblings when nodes are backward )"; - VG graph = string_to_graph(graph_json); + VG graph; vg::io::json2graph(graph_json, &graph); algorithms::normalize(&graph); // Those duplicate Ts (actually As) and Gs (actually Cs) should be eliminated diff --git a/src/unittest/vg_algorithms.cpp b/src/unittest/vg_algorithms.cpp index b4fc736734..8e713f87f7 100644 --- a/src/unittest/vg_algorithms.cpp +++ b/src/unittest/vg_algorithms.cpp @@ -27,7 +27,7 @@ #include "../vg.hpp" #include "../xg.hpp" #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" using namespace google::protobuf; @@ -1092,11 +1092,8 @@ TEST_CASE( "Connecting graph extraction works on a cool loop without leaving ext {"edge": [{"from": "185927720", "to": "185927722"}, {"from": "185927721", "from_start": true, "to": "185927722"}, {"from": "185927722", "to": "186681786", "to_end": true}, {"from": "185927722", "to": "185927723"}, {"from": "186681786", "to": "186683083"}, {"from": "186681786", "from_start": true, "to": "186681787", "to_end": true}, {"from": "186681787", "to": "186683069", "to_end": true}, {"from": "186681787", "from_start": true, "to": "186681789"}, {"from": "186681787", "from_start": true, "to": "186681788", "to_end": true}, {"from": "186681788", "from_start": true, "to": "186681790", "to_end": true}, {"from": 
"186681789", "to": "186681790", "to_end": true}, {"from": "186681790", "from_start": true, "to": "186681792", "to_end": true}, {"from": "186683069", "from_start": true, "to": "186683079", "to_end": true}, {"from": "186683079", "from_start": true, "to": "186683080", "to_end": true}, {"from": "186683080", "from_start": true, "to": "186683081", "to_end": true}, {"from": "186683081", "from_start": true, "to": "186683083", "to_end": true}], "node": [{"id": "185927720", "sequence": "G"}, {"id": "185927721", "sequence": "A"}, {"id": "185927722", "sequence": "ACCGGG"}, {"id": "185927723", "sequence": "AGTGGGGG"}, {"id": "186681786", "sequence": "C"}, {"id": "186681787", "sequence": "TGGGAGTCTAAGTCTCTTTTGATCACACTTTAAAGACCAAAAGGTAGAAGCGCAAAGACGTTATCTGTCCAATATTACAAACCTAGTAAGTGGTGGAATTTGGCCTTGAACCCAGATCTGTAACTCCAGAGCCGAAGTGCTTCACCCACCTCCCTGTGGTG"}, {"id": "186681788", "sequence": "G"}, {"id": "186681789", "sequence": "T"}, {"id": "186681790", "sequence": "TAT"}, {"id": "186681792", "sequence": "T"}, {"id": "186683069", "sequence": "G"}, {"id": "186683079", "sequence": "G"}, {"id": "186683080", "sequence": "TACCCCGGAATCCCTGCCGCGGCCCCTCGGGCCTGTCCACATCCCTCTGCCCCTCCCAGACCTCTGTCCTTCCACCAATCGCCTCCCGCAGCCCCGAGCCGCCACTCCCAGTCCCCCGAGTCCCTGCCGCGCGCCCTCGCGCCTGTCCACATCCCTCTGCCCATCCGAGACCTCTGTCCTTACACCACTAGCCACCCCACGTGGGACTTCCATGGCTTCTGAGTACAAGGCCAGCCCCCCGGCCCACCAGCTTTCGGAATGCCTGCTTACCTCTTTTTCTGTAGA"}, {"id": "186683081", "sequence": "CCGG"}, {"id": "186683083", "sequence": "C"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG vg; - vg.extend(source); + vg::io::json2graph(graph_json, &vg); bdsg::HashGraph extractor; @@ -1688,11 +1685,8 @@ TEST_CASE( "Connecting graph extraction works on a particular case without leavi )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG vg; - vg.extend(source); + vg::io::json2graph(graph_json, &vg); VG extractor; @@ -2583,13 +2577,9 @@ TEST_CASE( "Topological sort works on a more complex 
graph", {"node": [{"id": 1, "sequence": "GTATTTTTAGTA"}, {"id": 2, "sequence": "G"}, {"id": 3, "sequence": "GAGACGGGGTTTCACCATGTT"}, {"id": 4, "sequence": "T"}, {"id": 5, "sequence": "CTAATTTTT"}, {"id": 6, "sequence": "CA"}, {"id": 7, "sequence": "GG"}, {"id": 8, "sequence": "ACGCCC"}, {"id": 9, "sequence": "C"}, {"id": 10, "sequence": "T"}, {"id": 11, "sequence": "C"}, {"id": 12, "sequence": "GCCA"}, {"id": 13, "sequence": "A"}, {"id": 14, "sequence": "GGGATTACAGGCGCACACC"}, {"id": 15, "sequence": "CCACACC"}, {"id": 16, "sequence": "AT"}, {"id": 17, "sequence": "CC"}, {"id": 18, "sequence": "GGTCAGGCTGGTCTCGACTCC"}, {"id": 19, "sequence": "TGACCTCCTGATCTGCCCCCC"}, {"id": 20, "sequence": "A"}, {"id": 21, "sequence": "G"}, {"id": 22, "sequence": "TATTTTTAGTA"}, {"id": 23, "sequence": "A"}, {"id": 24, "sequence": "G"}, {"id": 25, "sequence": "GA"}], "edge": [{"from": 4, "to": 1}, {"from": 5, "to": 1}, {"from": 1, "to": 2}, {"from": 1, "to": 3}, {"from": 22, "to": 2}, {"from": 2, "to": 20}, {"from": 2, "to": 21}, {"from": 3, "to": 18}, {"from": 5, "to": 4}, {"from": 6, "to": 5}, {"from": 7, "to": 5}, {"from": 8, "to": 6}, {"from": 8, "to": 7}, {"from": 9, "to": 8}, {"from": 10, "to": 8}, {"from": 11, "to": 9}, {"from": 11, "to": 10}, {"from": 12, "to": 11}, {"from": 13, "to": 11}, {"from": 16, "to": 12}, {"from": 17, "to": 12}, {"from": 12, "to": 15}, {"from": 14, "to": 13}, {"from": 18, "to": 19}, {"from": 20, "to": 25}, {"from": 21, "to": 25}, {"from": 23, "to": 22}, {"from": 24, "to": 22}]} )"; - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Make it into a VG + // Load the JSON into a VG VG vg; - vg.extend(proto_graph); + vg::io::json2graph(graph_json, &vg); SECTION( "handlealgs::topological_order produces a consistent total ordering and orientation" ) { auto handle_sort = handlealgs::topological_order(&vg); @@ -5385,11 +5375,8 @@ TEST_CASE("simplify_siblings() works on a graph with a reversing self 
loop", "[a {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": "3", "sequence": "ACA"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}, {"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 3, "to_length": 3}], "position": {"node_id": "3"}, "rank": "2"}], "name": "y"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); @@ -5405,11 +5392,8 @@ TEST_CASE("simplify_siblings() works on a smaller graph with a reversing self lo {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": "3", "sequence": "A"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); @@ -5425,11 +5409,8 @@ TEST_CASE("normalize() works on a graph with a reversing self loop", "[algorithm {"edge": [{"from": "1", "to": "3"}, {"from": "1", "to": "2"}, {"from": "2", "to": "2", "to_end": true}], "node": [{"id": "1", "sequence": "T"}, {"id": "2", "sequence": "A"}, {"id": 
"3", "sequence": "ACA"}], "path": [{"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "2"}, "rank": "2"}, {"edit": [{"from_length": 1, "to_length": 1}], "position": {"is_reverse": true, "node_id": "2"}, "rank": "3"}], "name": "x"}, {"mapping": [{"edit": [{"from_length": 1, "to_length": 1}], "position": {"node_id": "1"}, "rank": "1"}, {"edit": [{"from_length": 3, "to_length": 3}], "position": {"node_id": "3"}, "rank": "2"}], "name": "y"}]} )"; - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - VG graph; - graph.extend(source); + vg::io::json2graph(graph_json, &graph); diff --git a/src/unittest/vpkg.cpp b/src/unittest/vpkg.cpp index 51a849c446..977814ff9c 100644 --- a/src/unittest/vpkg.cpp +++ b/src/unittest/vpkg.cpp @@ -13,7 +13,7 @@ #include "xg.hpp" #include "../vg.hpp" #include "../snarl_seed_clusterer.hpp" -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include #include #include @@ -50,12 +50,12 @@ TEST_CASE("We can read and write XG", "[vpkg][handlegraph][xg]") { )"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph hash_graph; + vg::io::json2graph(graph_json, &hash_graph); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(hash_graph); stringstream ss; @@ -148,13 +148,10 @@ TEST_CASE("We can read VG from a VPKG-wrapped stream as a VG", "[vpkg][handlegra {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -179,13 +176,10 @@ TEST_CASE("We can read VG from a 
VPKG-wrapped stream as a HandleGraph which is a {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -210,13 +204,10 @@ TEST_CASE("We can read VG from a VPKG-wrapped stream as a HandleGraph which is a TEST_CASE("We can read an empty VG as a HandleGraph", "[vpkg][handlegraph][vg][empty]") { string graph_json = "{}"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; @@ -240,13 +231,10 @@ TEST_CASE("We prefer to read a graph as the first provided type that matches", " {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - - // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - - // Build the VG - vg::VG vg_graph(proto_graph); + + // Load the JSON directly into VG + vg::VG vg_graph; + vg::io::json2graph(graph_json, &vg_graph); // Save it stringstream ss; diff --git a/src/unittest/xdrop_aligner.cpp b/src/unittest/xdrop_aligner.cpp index f745b8f66a..07577e4479 100644 --- a/src/unittest/xdrop_aligner.cpp +++ b/src/unittest/xdrop_aligner.cpp @@ -5,7 +5,7 @@ #include #include -#include "vg/io/json2pb.h" +#include "../io/json2graph.hpp" #include "../alignment.hpp" #include "../vg.hpp" #include @@ -764,12 +764,9 @@ TEST_CASE("QualAdjXdropAligner will not penalize a low quality mismatch", "[xdro TEST_CASE("XdropAligner doesn't crash on a case where it is hard to find a seed", "[xdrop][alignment][mapping]") { string graph_json = R"({"edge": [{"from": "92345167", "to": "92345168"}, {"from": 
"92345182", "to": "92345183"}, {"from": "92345165", "to": "92345166"}, {"from": "92345177", "to": "92345178"}, {"from": "92345171", "to": "92345172"}, {"from": "92345161", "to": "92345162"}, {"from": "92345183", "to": "92345184"}, {"from": "92345181", "to": "92345182"}, {"from": "92345178", "to": "92345179"}, {"from": "92345166", "to": "92345167"}, {"from": "92345179", "to": "92345180"}, {"from": "92345173", "to": "92345174"}, {"from": "92345184", "to": "92345185"}, {"from": "92345169", "to": "92345170"}, {"from": "92345185", "to": "92345186"}, {"from": "92345160", "to": "92345161"}, {"from": "92345174", "to": "92345175"}, {"from": "92345162", "to": "92345163"}, {"from": "92345175", "to": "92345176"}, {"from": "92345168", "to": "92345169"}, {"from": "92345163", "to": "92345164"}, {"from": "92345172", "to": "92345173"}, {"from": "92345180", "to": "92345181"}, {"from": "92345176", "to": "92345177"}, {"from": "92345170", "to": "92345171"}, {"from": "92345164", "to": "92345165"}], "node": [{"id": "92345167", "sequence": "TTTATATATATATATTTATATATATATATTTA"}, {"id": "92345182", "sequence": "TATATATATTTATATATATATTTATATATATA"}, {"id": "92345165", "sequence": "ATATATATATATTTATATATATTTATATATTA"}, {"id": "92345177", "sequence": "TTTATATATATATTTATATATATATATTATAT"}, {"id": "92345171", "sequence": "TTATATATATATTTATATATATATTTATATAT"}, {"id": "92345161", "sequence": "ATATATTTATATATTTTTATATATTATATATT"}, {"id": "92345183", "sequence": "TTTATATATATTTATATATATATTTATATATA"}, {"id": "92345181", "sequence": "ATATATTATATATATATTTATATATATATTTA"}, {"id": "92345178", "sequence": "ATATATTTATATATATATTTATATATATATTT"}, {"id": "92345166", "sequence": "TTTATATATATTTATATATATATTTATATATA"}, {"id": "92345179", "sequence": "ATATATATATTTATATATATATTTATATATAT"}, {"id": "92345173", "sequence": "ATATTTATATATATATATTTATATATATATTT"}, {"id": "92345184", "sequence": "TATTTATATATATATTTATATATATTTATATA"}, {"id": "92345169", "sequence": "TTTATATATATATTTATATATATATTTATATA"}, {"id": "92345185", "sequence": 
"TATATTTATATATATATATATATATTTATATA"}, {"id": "92345160", "sequence": "ATTTATATATATATTTATATATATATTTATAT"}, {"id": "92345174", "sequence": "ATATATATATTTATATATATATTATTTATATA"}, {"id": "92345162", "sequence": "TATATATATATTTATATATTATATATATATTT"}, {"id": "92345175", "sequence": "TATATTTATATATATATTATATATATATTTAT"}, {"id": "92345168", "sequence": "TATATATATTTATATATATATTTATATATATA"}, {"id": "92345163", "sequence": "ATATATTTATATATATATTTATATATATTTAT"}, {"id": "92345172", "sequence": "ATATATATATATTTATATATATATTTATATAT"}, {"id": "92345180", "sequence": "ATTTATATATATATTTATATATATATTTATAT"}, {"id": "92345176", "sequence": "ATATATATATTATATATATATTTATATATATA"}, {"id": "92345170", "sequence": "TATATTTATATATATATATTATATATATATAT"}, {"id": "92345164", "sequence": "ATATATATTTATATATATTTATATATATATTT"}, {"id": "92345186", "sequence": "TATATTTATATATATTTATATATATATTTATA"}]})"; - - Graph source; - json2pb(source, graph_json.c_str(), graph_json.size()); - - VG graph; - graph.extend(source); + + bdsg::HashGraph graph; + vg::io::json2graph(graph_json, &graph); Alignment aln; aln.set_sequence("CAGCACTTTGGGAGGCCAAGGTGGGTGGATCATCTGAGGTCAGGAGTTTGAGACCAGCCTGACCAACATGGTGAAATCCTGTCTCTACTGAAAATACTAAAATTAGCCAGGCGTGGCGGCCAGTGCCTGTAATCCCGGCTACTGGGGAGG"); diff --git a/src/unittest/xg.cpp b/src/unittest/xg.cpp index d74db5d0b0..dfa913b8eb 100644 --- a/src/unittest/xg.cpp +++ b/src/unittest/xg.cpp @@ -8,7 +8,9 @@ #include "vg.hpp" #include "xg.hpp" #include "graph.hpp" +#include "../io/json2graph.hpp" #include "algorithms/subgraph.hpp" +#include "bdsg/hash_graph.hpp" #include namespace vg { @@ -22,19 +24,18 @@ TEST_CASE("We can build an xg index on a nice graph", "[xg]") { {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + 
xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(1), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -49,19 +50,18 @@ TEST_CASE("We can build an xg index on a nasty graph", "[xg]") { {"id":9999,"sequence":"AAA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(1), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -161,15 +161,14 @@ TEST_CASE("We can build an xg index on a very nasty graph", "[xg]") { {"position":{"node_id":1444},"rank":1059}, {"position":{"node_id":1445},"rank":1060}]}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + VG source; + vg::io::json2graph(graph_json, &source); - sort_by_id_dedup_and_clean(proto_graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); SECTION("Context extraction gets something") { VG graph; @@ -182,7 +181,7 @@ TEST_CASE("We can build an xg index on a very nasty graph", "[xg]") { SECTION("We can extract within a single node") { algorithms::extract_path_range(xg_index, xg_index.get_path_handle("17"), 5, 15, graph); - + // We should just get node 1416 REQUIRE(graph.graph.node_size() == 1); REQUIRE(graph.graph.node(0).id() == 1416); @@ -265,14 +264,14 @@ TEST_CASE("We can build and scan an XG index for a problematic graph", "[xg]") { ]} ]} )"; - + 
// Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); // Build the xg index (without any sorting) xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); REQUIRE(xg_index.get_node_count() == 5); @@ -300,18 +299,16 @@ TEST_CASE("We can build the xg index on a small graph with discontinuous node id )"; // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + VG source; + vg::io::json2graph(graph_json, &source); - sort_by_id_dedup_and_clean(proto_graph); // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); VG vg_graph; algorithms::extract_context(xg_index, vg_graph, xg_index.get_handle(10), 0, 100); Graph& graph = vg_graph.graph; - sort_by_id_dedup_and_clean(graph); REQUIRE(graph.node_size() == 2); REQUIRE(graph.edge_size() == 1); @@ -326,14 +323,14 @@ TEST_CASE("Looping over XG handles in parallel works", "[xg]") { {"id":2,"sequence":"ACA"}], "edge":[{"to":2,"from":1}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); - + bdsg::HashGraph source; + vg::io::json2graph(graph_json, &source); + // Build the xg index xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); size_t count = 0; @@ -341,7 +338,7 @@ TEST_CASE("Looping over XG handles in parallel works", "[xg]") { #pragma omp critical count++; }, true); - + REQUIRE(count == 2); } @@ -400,14 +397,14 @@ TEST_CASE("Vectorization of xg works correctly", "[xg]") { {"edit": [{"from_length": 11, "to_length": 11}], "position": {"node_id": "15"}, "rank": "10"} ], "name": "x"}]} )"; - + // Load the JSON - Graph proto_graph; - json2pb(proto_graph, graph_json.c_str(), graph_json.size()); + bdsg::HashGraph source; + 
vg::io::json2graph(graph_json, &source); // Build the xg index (without any sorting) xg::XG xg_index; - xg_index.from_path_handle_graph(VG(proto_graph)); + xg_index.from_path_handle_graph(source); REQUIRE(xg_index.get_node_count() == 15); diff --git a/src/unittest/zip_code.cpp b/src/unittest/zip_code.cpp index dc3255e984..1d0a2c39c7 100644 --- a/src/unittest/zip_code.cpp +++ b/src/unittest/zip_code.cpp @@ -117,6 +117,10 @@ using namespace std; bool chain_is_reversed = distance_index.is_reversed_in_parent( distance_index.get_node_net_handle(n1->id())); + // Node 4 is in snarl 3 to 6 which should be regular. + // The zip codes are going to encode this so it had better be true. + REQUIRE(distance_index.is_regular_snarl(distance_index.get_parent(distance_index.get_parent(distance_index.get_node_net_handle(n4->id()))))); + SECTION ("zip code for node on top-level chain") { ZipCode zipcode; zipcode.fill_in_zipcode_from_pos(distance_index, make_pos_t(n1->id(), 0, false)); diff --git a/src/zip_code.cpp b/src/zip_code.cpp index 4699a24494..051602443f 100644 --- a/src/zip_code.cpp +++ b/src/zip_code.cpp @@ -1,3 +1,5 @@ +#include "crash.hpp" + #include "zip_code.hpp" //#define DEBUG_ZIPCODE @@ -16,10 +18,11 @@ void ZipCode::fill_in_zipcode_from_pos(const SnarlDistanceIndex& distance_index, //Put all ancestors of the node in a vector, starting from the node, and not including the root while (!distance_index.is_root(current_handle)) { ancestors.emplace_back(distance_index.start_end_traversal_of(current_handle)); - current_handle = distance_index.get_parent(current_handle); + net_handle_t parent_handle = distance_index.get_parent(current_handle); + crash_unless(parent_handle != current_handle); + current_handle = parent_handle; } - //Now add the root-level snarl or chain if (distance_index.is_root_snarl(current_handle)) { //First thing is a snarl, so add the snarl's connected component number @@ -121,7 +124,7 @@ void ZipCode::fill_in_zipcode_from_pos(const SnarlDistanceIndex& 
distance_index, } return; } - } else if (distance_index.is_regular_snarl(current_ancestor, false, graph_ptr)) { + } else if (distance_index.is_regular_snarl(current_ancestor)) { snarl_code_t snarl_code = get_regular_snarl_code(current_ancestor, ancestors[i-1], distance_index); zipcode.add_value(snarl_code.get_raw_code_type()); zipcode.add_value(snarl_code.get_raw_prefix_sum_or_identifier()); @@ -1065,11 +1068,7 @@ ZipCode::snarl_code_t ZipCode::get_regular_snarl_code(const net_handle_t& snarl, snarl_code.set_code_type(1); //The number of children - size_t child_count = 0; - distance_index.for_each_child(snarl, [&] (const net_handle_t& child) { - child_count++; - }); - snarl_code.set_child_count(child_count); + snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl)); //Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false)); @@ -1100,11 +1099,7 @@ ZipCode::snarl_code_t ZipCode::get_irregular_snarl_code(const net_handle_t& snar snarl_code.set_code_type(distance_index.is_dag(snarl) ? 
0 : 2); //The number of children - size_t child_count = 0; - distance_index.for_each_child(snarl, [&] (const net_handle_t& child) { - child_count++; - }); - snarl_code.set_child_count(child_count); + snarl_code.set_child_count(distance_index.get_snarl_child_count(snarl)); //Chain prefix sum value for the start of the snarl, which is the prefix sum of the start node + length of the start node net_handle_t start_node = distance_index.get_node_from_sentinel(distance_index.get_bound(snarl, false, false)); diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index b1136dceef..5ec9a33612 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -124,6 +124,8 @@ is "$(grep -c 'error.*are not compatible' log.txt)" "1" "appropriate error messa rm t1.bam t2.bam t3.bam t1.gaf tagged1.fq tagged2.fq rm -f read.fq read.gam +rm -rf explanation_* + vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.indel.multi.fq --show-work --track-position -b chaining-sr > /dev/null 2>&1 # Check that at least some TSV files and directories were created is "$(find explanation_read1 -name 'chain*-dotplot*.tsv' 2>/dev/null | wc -l | tr -d ' ')" "1" "Chain explanation files are created per chain" @@ -297,8 +299,8 @@ vg index -j 1mb1kgp.dist 1mb1kgp.vg vg autoindex -p 1mb1kgp -w giraffe -P "VG w/ Variant Paths:1mb1kgp.vg" -P "Giraffe Distance Index:1mb1kgp.dist" -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz vg giraffe -Z 1mb1kgp.giraffe.gbz -f reads/1mb1kgp_longread.fq >longread.gam -U 300 --track-provenance --align-from-chains --set-refpos # This is an 8001 bp read with 1 insert and 1 substitution -# 7999 * 1 + 1 * -4 + -6 + 5 + 5 = 7999 -is "$(vg view -aj longread.gam | jq -r '.score')" "7999" "A long read can be correctly aligned" +# We use minimap2-based scoring, which awards this many points.
+is "$(vg view -aj longread.gam | jq -r '.score')" "7948" "A long read can be correctly aligned" is "$(vg view -aj longread.gam | jq -c '.path.mapping[].edit[] | select(.sequence)' | wc -l | sed 's/^[[:space:]]*//')" "2" "A long read has the correct edits found" is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster-coverage_cluster_passed_size_total"] <= 300)' | wc -l | sed 's/^[[:space:]]*//')" "1" "Long read minimizer set is correctly restricted" is "$(vg view -aj longread.gam | jq -c '.refpos[]' | wc -l)" "$(vg view -aj longread.gam | jq -c '.path.mapping[]' | wc -l)" "Giraffe sets refpos for each reference node"