diff --git a/merge.go b/merge.go index 0b79df1f..8526e9d1 100644 --- a/merge.go +++ b/merge.go @@ -647,13 +647,13 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, return 0, nil, seg.ErrClosed } // get the edgeList for this segment - edgeList := segment.EdgeList() + edgeList := segment.edgeList() // if no edgeList, nothing to do if edgeList == nil { continue } newSegDocNums := rv[segI] - edgeList.Iterate(func(oldChild uint64, oldParent uint64) bool { + edgeList.iterate(func(oldChild uint64, oldParent uint64) bool { newParent := newSegDocNums[oldParent] newChild := newSegDocNums[oldChild] if newParent != docDropped && diff --git a/nested_cache.go b/nested_cache.go index 2d273ba2..741eeaf5 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -52,11 +52,12 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me return fmt.Errorf("error reading number of edges in nested edge list") } pos += uint64(read) - // if no documents or edges/nested documents, return - if numDocs == 0 || numEdges == 0 { + // if no documents or edges/nested documents or invalid state, return + if numDocs == 0 || numEdges == 0 || numDocs <= numEdges { return nil } - edgeList := NewEdgeList(numDocs, numEdges) + // create and cache our edge list + edgeList := newEdgeList(numDocs, numEdges) for i := uint64(0); i < numEdges; i++ { child, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) if read <= 0 { @@ -68,17 +69,49 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me return fmt.Errorf("error reading parent doc id in nested edge list") } pos += uint64(read) - edgeList.AddEdge(child, parent) + edgeList.addEdge(child, parent) + } + // create and cache our descendant store + numRoots := numDocs - numEdges + descendantStore := newDescendantStore(numRoots) + // populate the descendant store using the following invariants: + // Invariant: child docNums is always > parent docNums + // Invariant: descendants 
of root docNum R is always a + // contiguous range of docNums [R + 1 : R + 1 + N) + // where N is the number of descendants of R + roots := make([]uint64, 0, numRoots) + for docNum := uint64(0); docNum < numDocs; docNum++ { + if _, ok := edgeList.parent(docNum); !ok { + roots = append(roots, docNum) + } + } + // descendants of each root are the contiguous range of + // docNums between it and the next root + for idx, root := range roots { + start := root + 1 + end := numDocs + nextIdx := idx + 1 + if nextIdx < len(roots) { + end = roots[nextIdx] + } + numDescendants := end - start + if numDescendants > 0 { + bitmap := roaring.New() + bitmap.AddRange(start, end) + descendantStore.add(root, bitmap) + } } + nc.cache = &nestedCacheEntry{ el: edgeList, + ds: descendantStore, } return nil } type nestedCacheEntry struct { - // edgeList[child] = parent - el EdgeList + el edgeList + ds descendantStore } func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { @@ -90,7 +123,7 @@ func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) } current := docNum for { - parent, ok := cache.el.Parent(current) + parent, ok := cache.el.parent(current) if !ok { break } @@ -100,7 +133,15 @@ func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) return prealloc } -func (nc *nestedIndexCache) edgeList() EdgeList { +func (nc *nestedIndexCache) descendants(root uint64) (*roaring.Bitmap, bool) { + cache := nc.cache + if cache == nil || cache.ds == nil { + return nil, false + } + return cache.ds.descendants(root) +} + +func (nc *nestedIndexCache) edgeList() edgeList { cache := nc.cache if cache == nil || cache.el == nil { return nil @@ -108,36 +149,47 @@ func (nc *nestedIndexCache) edgeList() EdgeList { return cache.el } +func (nc *nestedIndexCache) descendantStore() descendantStore { + cache := nc.cache + if cache == nil || cache.ds == nil { + return nil + } + return cache.ds +} + func (nc 
*nestedIndexCache) countNested() uint64 { cache := nc.cache if cache == nil || cache.el == nil { return 0 } - return cache.el.Count() + return cache.el.count() } -// countRoot returns the number of root documents in the given bitmap -func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { - var totalDocs uint64 - if bm == nil { - // if bitmap is empty, return 0 +// countRootDeleted returns the number of root documents in the given bitmap that are deleted +func (nc *nestedIndexCache) countRootDeleted(bm *roaring.Bitmap) uint64 { + // empty bitmap means no root documents + if bm == nil || bm.IsEmpty() { + return 0 + } + totalDocs := bm.GetCardinality() + // if no nested documents, all documents in the bitmap are root documents + if nc.countNested() == 0 { return totalDocs } - totalDocs = bm.GetCardinality() - cache := nc.cache - if cache == nil || cache.el == nil { - // if cache is nil, no nested docs, so all docs are root docs - // so just return the cardinality of the bitmap + // get the edgeList for this segment + el := nc.edgeList() + if el == nil { return totalDocs } // count nested documents in the bitmap, a nested doc is one that has a parent in the edge list var nestedDocCount uint64 - bm.Iterate(func(docNum uint32) bool { - if _, ok := cache.el.Parent(uint64(docNum)); ok { + bmItr := bm.Iterator() + for bmItr.HasNext() { + docNum := bmItr.Next() + if _, ok := el.parent(uint64(docNum)); ok { nestedDocCount++ } - return true - }) + } // root docs = total docs - nested docs if totalDocs < nestedDocCount { // should not happen, but just in case @@ -148,21 +200,21 @@ func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { // ------------------------------------------------------- -// EdgeList provides an interface to access parent of a child document -type EdgeList interface { - // Parent returns the parent of the given child document ID, +// edgeList provides an interface to access parent of a child document +type edgeList interface { + // 
parent returns the parent of the given child document ID, // and a boolean indicating if the parent exists. - Parent(child uint64) (uint64, bool) + parent(child uint64) (uint64, bool) - // AddEdge adds an edge from child to parent in the edge list. - AddEdge(child uint64, parent uint64) + // addEdge adds an edge from child to parent in the edge list. + addEdge(child uint64, parent uint64) - // Count returns the number of edges in the edge list. - Count() uint64 + // count returns the number of edges in the edge list. + count() uint64 - // Iterate iterates over all edges in the edge list, calling the provided function + // iterate iterates over all edges in the edge list, calling the provided function // with each child-parent pair. If the function returns false, iteration stops. - Iterate(func(child uint64, parent uint64) bool) + iterate(func(child uint64, parent uint64) bool) } type edgeListMap struct { @@ -175,20 +227,20 @@ func newEdgeListMap(numEdges uint64) *edgeListMap { } } -func (elm *edgeListMap) Parent(child uint64) (uint64, bool) { +func (elm *edgeListMap) parent(child uint64) (uint64, bool) { parent, ok := elm.edges[child] return parent, ok } -func (elm *edgeListMap) AddEdge(child uint64, parent uint64) { +func (elm *edgeListMap) addEdge(child uint64, parent uint64) { elm.edges[child] = parent } -func (elm *edgeListMap) Count() uint64 { +func (elm *edgeListMap) count() uint64 { return uint64(len(elm.edges)) } -func (elm *edgeListMap) Iterate(f func(child uint64, parent uint64) bool) { +func (elm *edgeListMap) iterate(f func(child uint64, parent uint64) bool) { for child, parent := range elm.edges { if !f(child, parent) { return @@ -197,7 +249,7 @@ func (elm *edgeListMap) Iterate(f func(child uint64, parent uint64) bool) { } type edgeListSlice struct { - count uint64 + numEdges uint64 sentinel uint64 edges []uint64 } @@ -209,13 +261,13 @@ func newEdgeListSlice(numDocs uint64, numEdges uint64) *edgeListSlice { edges[i] = sentinel } return &edgeListSlice{ 
- count: numEdges, + numEdges: numEdges, sentinel: sentinel, edges: edges, } } -func (els *edgeListSlice) Parent(child uint64) (uint64, bool) { +func (els *edgeListSlice) parent(child uint64) (uint64, bool) { if child >= uint64(len(els.edges)) { return 0, false } @@ -226,21 +278,21 @@ func (els *edgeListSlice) Parent(child uint64) (uint64, bool) { return parent, true } -func (el *edgeListSlice) AddEdge(child uint64, parent uint64) { - if child >= uint64(len(el.edges)) { +func (els *edgeListSlice) addEdge(child uint64, parent uint64) { + if child >= uint64(len(els.edges)) { // out of bounds, ignore as this should not happen return } - el.edges[child] = parent + els.edges[child] = parent } -func (el *edgeListSlice) Count() uint64 { - return el.count +func (els *edgeListSlice) count() uint64 { + return els.numEdges } -func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { - for child, parent := range el.edges { - if parent != el.sentinel { +func (els *edgeListSlice) iterate(f func(child uint64, parent uint64) bool) { + for child, parent := range els.edges { + if parent != els.sentinel { if !f(uint64(child), parent) { return } @@ -248,7 +300,7 @@ func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { } } -// nestedCacheRatio defines the threshold ratio of nested documents to total documents. +// edgeListMapThreshold defines the threshold ratio of nested documents to total documents. // It is derived using the following reasoning: // // Let N = number of nested documents (i.e., edges in the edge list) @@ -273,9 +325,9 @@ func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { // we use a map for the edge list; otherwise, we use a slice. var edgeListMapThreshold = 8.0 / 30.0 -// NewEdgeList creates a new EdgeList instance based on the provided +// newEdgeList creates a new edgeList instance based on the provided // constants, the total number of documents and the number of nested documents/edges. 
-func NewEdgeList(numDocs uint64, numEdges uint64) EdgeList { +func newEdgeList(numDocs uint64, numEdges uint64) edgeList { if numDocs == 0 || numEdges == 0 { // no edges, return nil return nil @@ -288,3 +340,36 @@ func NewEdgeList(numDocs uint64, numEdges uint64) EdgeList { // use slice representation return newEdgeListSlice(numDocs, numEdges) } + +// ------------------------------------------------------- + +// descendantStore provides an interface to access precomputed descendant bitmaps for root documents +type descendantStore interface { + // add a descendant bitmap for a root document + add(root uint64, descendants *roaring.Bitmap) + // returns the descendant bitmap for a root document, with an indication of its existence + descendants(root uint64) (*roaring.Bitmap, bool) +} + +type descendantStoreMap struct { + m map[uint64]*roaring.Bitmap +} + +func newDescendantStoreMap(numRoots uint64) *descendantStoreMap { + return &descendantStoreMap{ + m: make(map[uint64]*roaring.Bitmap, numRoots), + } +} + +func (dsm *descendantStoreMap) add(root uint64, descendants *roaring.Bitmap) { + dsm.m[root] = descendants +} + +func (dsm *descendantStoreMap) descendants(root uint64) (*roaring.Bitmap, bool) { + bm, ok := dsm.m[root] + return bm, ok +} + +func newDescendantStore(numRoots uint64) descendantStore { + return newDescendantStoreMap(numRoots) +} diff --git a/nested_test.go b/nested_test.go index 2c1b3a84..198651c6 100644 --- a/nested_test.go +++ b/nested_test.go @@ -337,12 +337,12 @@ func TestNestedSegment(t *testing.T) { } // Verify edge list exists - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } - if el.Count() != expectedNestedDocs { - t.Fatalf("expected edge list count %d, got %d", expectedNestedDocs, el.Count()) + if el.count() != expectedNestedDocs { + t.Fatalf("expected edge list count %d, got %d", expectedNestedDocs, el.count()) } // Test ancestry lookups @@ -543,12 +543,12 @@ func TestNestedSegmentMerge(t 
*testing.T) { } // Verify edge list in merged segment - el := mergedSb.EdgeList() + el := mergedSb.edgeList() if el == nil { t.Fatal("expected non-nil edge list in merged segment") } - if el.Count() != expectedNested { - t.Fatalf("merged edge list: expected %d edges, got %d", expectedNested, el.Count()) + if el.count() != expectedNested { + t.Fatalf("merged edge list: expected %d edges, got %d", expectedNested, el.count()) } // Verify ancestry in merged segment @@ -669,13 +669,13 @@ func TestNestedSegmentMergeWithDeletes(t *testing.T) { t.Fatalf("expected 1 root doc in merged segment, got %d", rootCount) } // Should have 1 edge in edge list - el := mergedSb.EdgeList() + el := mergedSb.edgeList() if el == nil { t.Fatal("expected non-nil edge list in merged segment") } - if el.Count() != 1 { - t.Fatalf("merged edge list: expected 1 edge, got %d", el.Count()) + if el.count() != 1 { + t.Fatalf("merged edge list: expected 1 edge, got %d", el.count()) } // Verify ancestry testAncestry := []struct { @@ -928,7 +928,7 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { }() sb := seg.(*Segment) - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } @@ -948,13 +948,13 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { } // Verify edge count - if el.Count() != uint64(len(expectedEdges)) { - t.Fatalf("expected %d edges, got %d", len(expectedEdges), el.Count()) + if el.count() != uint64(len(expectedEdges)) { + t.Fatalf("expected %d edges, got %d", len(expectedEdges), el.count()) } // Verify Parent lookups for child, expectedParent := range expectedEdges { - parent, ok := el.Parent(child) + parent, ok := el.parent(child) if !ok { t.Errorf("expected parent for child %d", child) continue @@ -965,13 +965,13 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { } // Verify root has no parent - if _, ok := el.Parent(0); ok { + if _, ok := el.parent(0); ok { t.Error("root should have no parent") } // Test iteration 
foundEdges := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { foundEdges[child] = parent return true }) @@ -1020,7 +1020,7 @@ func TestNestedSegmentNoNesting(t *testing.T) { } // Edge list should be nil for flat documents - el := sb.EdgeList() + el := sb.edgeList() if el != nil { t.Fatal("expected nil edge list for flat documents") } @@ -1178,18 +1178,18 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { } // Edge list should exist - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } // Verify edge count - if el.Count() != 1 { - t.Fatalf("expected 1 edge, got %d", el.Count()) + if el.count() != 1 { + t.Fatalf("expected 1 edge, got %d", el.count()) } // Test Parent lookup - child1 is doc 10, parent is root10 which is doc 9 - parent, ok := el.Parent(10) + parent, ok := el.parent(10) if !ok { t.Fatal("expected parent for child1") } @@ -1198,13 +1198,13 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { } // Verify root has no parent - if _, ok := el.Parent(0); ok { + if _, ok := el.parent(0); ok { t.Error("root should have no parent") } // Test iteration foundEdges := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { foundEdges[child] = parent return true }) @@ -1243,35 +1243,35 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { func TestEdgeList(t *testing.T) { t.Run("edgeListMap", func(t *testing.T) { el := newEdgeListMap(3) - el.AddEdge(1, 0) - el.AddEdge(2, 0) - el.AddEdge(3, 1) + el.addEdge(1, 0) + el.addEdge(2, 0) + el.addEdge(3, 1) - if el.Count() != 3 { - t.Fatalf("expected count 3, got %d", el.Count()) + if el.count() != 3 { + t.Fatalf("expected count 3, got %d", el.count()) } // Test Parent - parent, ok := el.Parent(1) + parent, ok := el.parent(1) if !ok || parent != 0 { t.Errorf("expected parent 0 for child 1, got %d, ok=%v", parent, ok) } - 
parent, ok = el.Parent(3) + parent, ok = el.parent(3) if !ok || parent != 1 { t.Errorf("expected parent 1 for child 3, got %d, ok=%v", parent, ok) } - _, ok = el.Parent(0) + _, ok = el.parent(0) if ok { t.Error("expected no parent for root") } - _, ok = el.Parent(99) + _, ok = el.parent(99) if ok { t.Error("expected no parent for non-existent doc") } // Test Iterate found := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { found[child] = parent return true }) @@ -1283,36 +1283,36 @@ func TestEdgeList(t *testing.T) { // Test edgeListSlice t.Run("edgeListSlice", func(t *testing.T) { el := newEdgeListSlice(10, 3) - el.AddEdge(1, 0) - el.AddEdge(2, 0) - el.AddEdge(3, 1) + el.addEdge(1, 0) + el.addEdge(2, 0) + el.addEdge(3, 1) - if el.Count() != 3 { - t.Fatalf("expected count 3, got %d", el.Count()) + if el.count() != 3 { + t.Fatalf("expected count 3, got %d", el.count()) } // Test Parent - parent, ok := el.Parent(1) + parent, ok := el.parent(1) if !ok || parent != 0 { t.Errorf("expected parent 0 for child 1, got %d, ok=%v", parent, ok) } - parent, ok = el.Parent(3) + parent, ok = el.parent(3) if !ok || parent != 1 { t.Errorf("expected parent 1 for child 3, got %d, ok=%v", parent, ok) } - _, ok = el.Parent(0) + _, ok = el.parent(0) if ok { t.Error("expected no parent for root") } // Out of bounds - _, ok = el.Parent(99) + _, ok = el.parent(99) if ok { t.Error("expected no parent for out of bounds doc") } // Test Iterate found := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { found[child] = parent return true }) @@ -1321,29 +1321,29 @@ func TestEdgeList(t *testing.T) { } // Test AddEdge out of bounds (should be silently ignored) - el.AddEdge(100, 0) // out of bounds, should not panic + el.addEdge(100, 0) // out of bounds, should not panic }) // Test NewEdgeList threshold selection t.Run("NewEdgeList", 
func(t *testing.T) { // Ratio < 8/30 should use map - el := NewEdgeList(100, 10) // ratio = 0.1 < 0.267 + el := newEdgeList(100, 10) // ratio = 0.1 < 0.267 if _, ok := el.(*edgeListMap); !ok { t.Error("expected edgeListMap for low ratio") } // Ratio >= 8/30 should use slice - el = NewEdgeList(100, 50) // ratio = 0.5 > 0.267 + el = newEdgeList(100, 50) // ratio = 0.5 > 0.267 if _, ok := el.(*edgeListSlice); !ok { t.Error("expected edgeListSlice for high ratio") } // Zero cases - el = NewEdgeList(0, 0) + el = newEdgeList(0, 0) if el != nil { t.Error("expected nil for 0 docs") } - el = NewEdgeList(10, 0) + el = newEdgeList(10, 0) if el != nil { t.Error("expected nil for 0 edges") } diff --git a/segment.go b/segment.go index 7da09d57..5c6a2e50 100644 --- a/segment.go +++ b/segment.go @@ -886,51 +886,65 @@ func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { // dR = D - dS // Therefore, the count of root docs excluding deleted ones is: // R - dR = (T - S) - (D - dS) - return (sb.Count() - sb.countNested()) - (sb.nstIndexCache.countRoot(deleted)) + return sb.countRoot() - sb.countRootDeleted(deleted) } -// AddNestedDocuments returns a bitmap containing the original document numbers in drops, -// plus any descendant document numbers for each dropped document. The drops -// parameter represents a set of document numbers to be dropped, and the returned +// AddNestedDocuments returns a bitmap containing the original root document numbers in drops, +// plus any descendant document numbers for each dropped root document. The drops +// parameter represents a set of root document numbers to be dropped, and the returned // bitmap includes both the original drops and all their descendants (if any). +// NOTE: This method MODIFIES the drops bitmap in place. +// NOTE: This method EXPECTS that the drops bitmap contains ONLY root document numbers. 
func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap { - // If no drops or no subDocs, nothing to do + // If no drops or no nested documents, nothing to do if drops == nil || drops.GetCardinality() == 0 || sb.countNested() == 0 { return drops } - // Get the edge list for this segment - el := sb.EdgeList() - // Algorithm => iterate through each child->parent mapping in the edge list, - // and for each pair, check if the parent is in the drops bitmap. - // If it is, and the child is also not already in the drops bitmap, - // add the child to the drops. Repeat this process until no - // new additions are made in an iteration. - changed := true - for changed { - changed = false - el.Iterate(func(child uint64, parent uint64) bool { - if drops.Contains(uint32(parent)) && !drops.Contains(uint32(child)) { - drops.Add(uint32(child)) - changed = true - } - return true - }) + ds := sb.descendantStore() + if ds == nil { + return drops + } + descendantBM := roaring.New() + dropsItr := drops.Iterator() + for dropsItr.HasNext() { + rootDoc := uint64(dropsItr.Next()) + if bm, ok := ds.descendants(rootDoc); ok { + descendantBM.Or(bm) + } } + drops.Or(descendantBM) return drops } -// EdgeList returns an EdgeList interface representing the parent-child relationships between documents in the segment. -// The EdgeList interface allows iteration over child-parent document pairs, enabling navigation of document hierarchies. +// edgeList returns an edgeList interface representing the parent-child relationships between documents in the segment. +// The edgeList interface allows iteration over child-parent document pairs, enabling navigation of document hierarchies. // The underlying implementation may use a map or a slice, but callers should rely on the interface methods. 
-func (sb *SegmentBase) EdgeList() EdgeList { +func (sb *SegmentBase) edgeList() edgeList { return sb.nstIndexCache.edgeList() } +// descendantStore returns a descendantStore interface that provides access to the descendants of documents in the segment. +// The descendantStore interface allows retrieval of descendant document numbers for a given root document number. +// The underlying implementation may use a map or a slice, but callers should rely on the interface methods. +func (sb *SegmentBase) descendantStore() descendantStore { + return sb.nstIndexCache.descendantStore() +} + // Utility method to count the number of nested documents in the segment, not exported. func (sb *SegmentBase) countNested() uint64 { return sb.nstIndexCache.countNested() } +// Utility method to count the number of root documents in the segment, not exported. +func (sb *SegmentBase) countRoot() uint64 { + return sb.Count() - sb.countNested() +} + +// Utility method to count the number of root documents that are marked as deleted in the given bitmap, not exported. +func (sb *SegmentBase) countRootDeleted(deleted *roaring.Bitmap) uint64 { + return sb.nstIndexCache.countRootDeleted(deleted) +} + func (sb *SegmentBase) CallbackId() string { return sb.fileReader.id }