From 8d6f6a8449d16e57299e2a36dc22cddf0e372bb4 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 25 Jun 2026 01:45:53 +0530 Subject: [PATCH 1/8] add countRoot optimization --- nested_cache.go | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/nested_cache.go b/nested_cache.go index 2d273ba2..a4d8e0ba 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -118,26 +118,24 @@ func (nc *nestedIndexCache) countNested() uint64 { // countRoot returns the number of root documents in the given bitmap func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { - var totalDocs uint64 - if bm == nil { - // if bitmap is empty, return 0 - return totalDocs + // empty bitmap means no root documents + if bm == nil || bm.IsEmpty() { + return 0 } - totalDocs = bm.GetCardinality() - cache := nc.cache - if cache == nil || cache.el == nil { - // if cache is nil, no nested docs, so all docs are root docs - // so just return the cardinality of the bitmap + totalDocs := bm.GetCardinality() + // if no nested documents, all documents in the bitmap are root documents + if nc.countNested() == 0 { return totalDocs } // count nested documents in the bitmap, a nested doc is one that has a parent in the edge list var nestedDocCount uint64 - bm.Iterate(func(docNum uint32) bool { - if _, ok := cache.el.Parent(uint64(docNum)); ok { + bmItr := bm.Iterator() + for bmItr.HasNext() { + docNum := bmItr.Next() + if _, ok := nc.cache.el.Parent(uint64(docNum)); ok { nestedDocCount++ } - return true - }) + } // root docs = total docs - nested docs if totalDocs < nestedDocCount { // should not happen, but just in case From a7f3a2ab9a5c6e0795376f815821fa2b1e0915de Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 25 Jun 2026 18:16:33 +0530 Subject: [PATCH 2/8] optimize --- segment.go | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/segment.go b/segment.go index 7da09d57..e1e18877 100644 --- a/segment.go +++ b/segment.go @@ -893,6 +893,8 @@ func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { // plus any descendant document numbers for each dropped document. The drops // parameter represents a set of document numbers to be dropped, and the returned // bitmap includes both the original drops and all their descendants (if any). +// NOTE: This method MODIFIES the drops bitmap in place. +// NOTE: This method EXPECTS that the drops bitmap contains ONLY root document numbers. func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap { // If no drops or no subDocs, nothing to do if drops == nil || drops.GetCardinality() == 0 || sb.countNested() == 0 { @@ -900,22 +902,24 @@ func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap } // Get the edge list for this segment el := sb.EdgeList() - // Algorithm => iterate through each child->parent mapping in the edge list, - // and for each pair, check if the parent is in the drops bitmap. - // If it is, and the child is also not already in the drops bitmap, - // add the child to the drops. Repeat this process until no - // new additions are made in an iteration. - changed := true - for changed { - changed = false - el.Iterate(func(child uint64, parent uint64) bool { - if drops.Contains(uint32(parent)) && !drops.Contains(uint32(child)) { - drops.Add(uint32(child)) - changed = true + descendants := roaring.New() + total := sb.Count() + dropsItr := drops.Iterator() + for dropsItr.HasNext() { + droppedDoc := uint64(dropsItr.Next()) + // every document from droppedDoc+1 to the next + // root document is a descendant of droppedDoc + start := droppedDoc + 1 + cur := start + for cur < total { + if _, ok := el.Parent(cur); !ok { + break } - return true - }) + cur++ + } + descendants.AddRange(start, cur) } + drops.Or(descendants) return drops } From dbf35201ee375fa523fca197346268673891e6e1 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 12:42:55 +0530 Subject: [PATCH 3/8] fix --- merge.go | 4 +- nested_cache.go | 239 +++++++++++++++++++++++++++++++++++++----------- nested_test.go | 94 +++++++++---------- segment.go | 51 ++++++----- 4 files changed, 266 insertions(+), 122 deletions(-) diff --git a/merge.go b/merge.go index f12321d7..dafc661d 100644 --- a/merge.go +++ b/merge.go @@ -609,13 +609,13 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, return 0, nil, seg.ErrClosed } // get the edgeList for this segment - edgeList := segment.EdgeList() + edgeList := segment.edgeList() // if no edgeList, nothing to do if edgeList == nil { continue } newSegDocNums := rv[segI] - edgeList.Iterate(func(oldChild uint64, oldParent uint64) bool { + edgeList.iterate(func(oldChild uint64, oldParent uint64) bool { newParent := newSegDocNums[oldParent] newChild := newSegDocNums[oldChild] if newParent != docDropped && diff --git a/nested_cache.go b/nested_cache.go index 2d273ba2..513168a4 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -52,11 +52,12 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me return fmt.Errorf("error reading number of edges in nested edge list") } pos += uint64(read) - // if no documents or edges/nested documents, return - if numDocs == 0 || numEdges == 0 { + // if no documents or edges/nested documents or invalid state, return + if numDocs == 0 || numEdges == 0 || numDocs <= numEdges { return nil } - edgeList := NewEdgeList(numDocs, numEdges) + // create and cache our edge list + edgeList := newEdgeList(numDocs, numEdges) for i := uint64(0); i < numEdges; i++ { child, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) if read <= 0 { @@ -68,17 +69,41 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me return fmt.Errorf("error reading parent doc id in nested edge list") } pos += uint64(read) - edgeList.AddEdge(child, parent) + edgeList.addEdge(child, parent) + } + // create and cache our descendant store + descendantStore := newDescendantStore(numDocs, numDocs-numEdges) + // populate the descendant store using the following invariants: + // Invariant: child docNums is always > parent docNums + // Invariant: descendants of root docNum R is always a contiguous range of docNums [R+1, R+N] where N is the number of descendants of R + currentRoot := uint64(0) + currentBitmap := roaring.New() + for i := uint64(0); i < numDocs; i++ { + _, ok := edgeList.parent(i) + if !ok { + if currentBitmap.GetCardinality() > 0 { + descendantStore.add(currentRoot, currentBitmap) + currentBitmap = roaring.New() + } + currentRoot = i + } else { + currentBitmap.Add(uint32(i)) + } } + if currentBitmap.GetCardinality() > 0 { + descendantStore.add(currentRoot, currentBitmap) + } + nc.cache = &nestedCacheEntry{ el: edgeList, + ds: descendantStore, } return nil } type nestedCacheEntry struct { - // edgeList[child] = parent - el EdgeList + el edgeList + ds descendantStore } func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { @@ -90,7 +115,7 @@ func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) } current := docNum for { - parent, ok := cache.el.Parent(current) + parent, ok := cache.el.parent(current) if !ok { break } @@ -100,7 +125,15 @@ func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) return prealloc } -func (nc *nestedIndexCache) edgeList() EdgeList { +func (nc *nestedIndexCache) descendants(root uint64) (*roaring.Bitmap, bool) { + cache := nc.cache + if cache == nil || cache.ds == nil { + return nil, false + } + return cache.ds.descendants(root) +} + +func (nc *nestedIndexCache) edgeList() edgeList { cache := nc.cache if cache == nil || cache.el == nil { return nil @@ -108,36 +141,42 @@ func (nc *nestedIndexCache) edgeList() EdgeList { return cache.el } +func (nc *nestedIndexCache) descendantStore() descendantStore { + cache := nc.cache + if cache == nil || cache.ds == nil { + return nil + } + return cache.ds +} + func (nc *nestedIndexCache) countNested() uint64 { cache := nc.cache if cache == nil || cache.el == nil { return 0 } - return cache.el.Count() + return cache.el.count() } -// countRoot returns the number of root documents in the given bitmap -func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { - var totalDocs uint64 - if bm == nil { - // if bitmap is empty, return 0 - return totalDocs +// countRootDeleted returns the number of root documents in the given bitmap that are deleted +func (nc *nestedIndexCache) countRootDeleted(bm *roaring.Bitmap) uint64 { + // empty bitmap means no root documents + if bm == nil || bm.IsEmpty() { + return 0 } - totalDocs = bm.GetCardinality() - cache := nc.cache - if cache == nil || cache.el == nil { - // if cache is nil, no nested docs, so all docs are root docs - // so just return the cardinality of the bitmap + totalDocs := bm.GetCardinality() + // if no nested documents, all documents in the bitmap are root documents + if nc.countNested() == 0 { return totalDocs } // count nested documents in the bitmap, a nested doc is one that has a parent in the edge list var nestedDocCount uint64 - bm.Iterate(func(docNum uint32) bool { - if _, ok := cache.el.Parent(uint64(docNum)); ok { + bmItr := bm.Iterator() + for bmItr.HasNext() { + docNum := bmItr.Next() + if _, ok := nc.cache.el.parent(uint64(docNum)); ok { nestedDocCount++ } - return true - }) + } // root docs = total docs - nested docs if totalDocs < nestedDocCount { // should not happen, but just in case @@ -148,21 +187,21 @@ func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { // ------------------------------------------------------- -// EdgeList provides an interface to access parent of a child document -type EdgeList interface { - // Parent returns the parent of the given child document ID, +// edgeList provides an interface to access parent of a child document +type edgeList interface { + // parent returns the parent of the given child document ID, // and a boolean indicating if the parent exists. - Parent(child uint64) (uint64, bool) + parent(child uint64) (uint64, bool) - // AddEdge adds an edge from child to parent in the edge list. - AddEdge(child uint64, parent uint64) + // addEdge adds an edge from child to parent in the edge list. + addEdge(child uint64, parent uint64) - // Count returns the number of edges in the edge list. - Count() uint64 + // count returns the number of edges in the edge list. + count() uint64 - // Iterate iterates over all edges in the edge list, calling the provided function + // iterate iterates over all edges in the edge list, calling the provided function // with each child-parent pair. If the function returns false, iteration stops. - Iterate(func(child uint64, parent uint64) bool) + iterate(func(child uint64, parent uint64) bool) } type edgeListMap struct { @@ -175,20 +214,20 @@ func newEdgeListMap(numEdges uint64) *edgeListMap { } } -func (elm *edgeListMap) Parent(child uint64) (uint64, bool) { +func (elm *edgeListMap) parent(child uint64) (uint64, bool) { parent, ok := elm.edges[child] return parent, ok } -func (elm *edgeListMap) AddEdge(child uint64, parent uint64) { +func (elm *edgeListMap) addEdge(child uint64, parent uint64) { elm.edges[child] = parent } -func (elm *edgeListMap) Count() uint64 { +func (elm *edgeListMap) count() uint64 { return uint64(len(elm.edges)) } -func (elm *edgeListMap) Iterate(f func(child uint64, parent uint64) bool) { +func (elm *edgeListMap) iterate(f func(child uint64, parent uint64) bool) { for child, parent := range elm.edges { if !f(child, parent) { return @@ -197,7 +236,7 @@ func (elm *edgeListMap) Iterate(f func(child uint64, parent uint64) bool) { } type edgeListSlice struct { - count uint64 + numEdges uint64 sentinel uint64 edges []uint64 } @@ -209,13 +248,13 @@ func newEdgeListSlice(numDocs uint64, numEdges uint64) *edgeListSlice { edges[i] = sentinel } return &edgeListSlice{ - count: numEdges, + numEdges: numEdges, sentinel: sentinel, edges: edges, } } -func (els *edgeListSlice) Parent(child uint64) (uint64, bool) { +func (els *edgeListSlice) parent(child uint64) (uint64, bool) { if child >= uint64(len(els.edges)) { return 0, false } @@ -226,21 +265,21 @@ func (els *edgeListSlice) Parent(child uint64) (uint64, bool) { return parent, true } -func (el *edgeListSlice) AddEdge(child uint64, parent uint64) { - if child >= uint64(len(el.edges)) { +func (els *edgeListSlice) addEdge(child uint64, parent uint64) { + if child >= uint64(len(els.edges)) { // out of bounds, ignore as this should not happen return } - el.edges[child] = parent + els.edges[child] = parent } -func (el *edgeListSlice) Count() uint64 { - return el.count +func (els *edgeListSlice) count() uint64 { + return els.numEdges } -func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { - for child, parent := range el.edges { - if parent != el.sentinel { +func (els *edgeListSlice) iterate(f func(child uint64, parent uint64) bool) { + for child, parent := range els.edges { + if parent != els.sentinel { if !f(uint64(child), parent) { return } @@ -248,7 +287,7 @@ func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { } } -// nestedCacheRatio defines the threshold ratio of nested documents to total documents. +// edgeListMapThreshold defines the threshold ratio of nested documents to total documents. // It is derived using the following reasoning: // // Let N = number of nested documents (i.e., edges in the edge list) @@ -273,9 +312,9 @@ func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { // we use a map for the edge list; otherwise, we use a slice. var edgeListMapThreshold = 8.0 / 30.0 -// NewEdgeList creates a new EdgeList instance based on the provided +// newEdgeList creates a new edgeList instance based on the provided // constants, the total number of documents and the number of nested documents/edges. -func NewEdgeList(numDocs uint64, numEdges uint64) EdgeList { +func newEdgeList(numDocs uint64, numEdges uint64) edgeList { if numDocs == 0 || numEdges == 0 { // no edges, return nil return nil @@ -288,3 +327,99 @@ func NewEdgeList(numDocs uint64, numEdges uint64) EdgeList { // use slice representation return newEdgeListSlice(numDocs, numEdges) } + +// ------------------------------------------------------- + +// descendantStore provides an interface to access precomputed descendant bitmaps for root documents +type descendantStore interface { + // add a descendant bitmap for a root document + add(root uint64, descendants *roaring.Bitmap) + // returns the descendant bitmap for a root document, with an indication of its existence + descendants(root uint64) (*roaring.Bitmap, bool) +} + +type descendantStoreMap struct { + m map[uint64]*roaring.Bitmap +} + +func newDescendantStoreMap(numRoots uint64) *descendantStoreMap { + return &descendantStoreMap{ + m: make(map[uint64]*roaring.Bitmap, numRoots), + } +} + +func (dsm *descendantStoreMap) add(root uint64, descendants *roaring.Bitmap) { + dsm.m[root] = descendants +} + +func (dsm *descendantStoreMap) descendants(root uint64) (*roaring.Bitmap, bool) { + bm, ok := dsm.m[root] + return bm, ok +} + +type descendantStoreSlice struct { + numDocs uint64 + descBitmaps []*roaring.Bitmap +} + +func newDescendantStoreSlice(numDocs uint64, numRoots uint64) *descendantStoreSlice { + return &descendantStoreSlice{ + numDocs: numDocs, + descBitmaps: make([]*roaring.Bitmap, numDocs), + } +} + +func (dss *descendantStoreSlice) add(root uint64, descendants *roaring.Bitmap) { + if root >= dss.numDocs { + // out of bounds, ignore as this should not happen + return + } + dss.descBitmaps[root] = descendants +} + +func (dss *descendantStoreSlice) descendants(root uint64) (*roaring.Bitmap, bool) { + if root >= dss.numDocs { + return nil, false + } + bm := dss.descBitmaps[root] + if bm == nil { + return nil, false + } + return bm, true +} + +// descendantStoreMapThreshold defines the threshold ratio of root documents to total documents. +// It is derived using the following reasoning: +// +// Let R = number of root documents +// Let T = total number of documents +// +// Memory usage if the descendant store is stored as a map[uint64]*roaring.Bitmap: +// +// ~30 bytes per entry (key + value + map overhead) +// Total ≈ 30 * R bytes +// +// Memory usage if the descendant store is stored as a []*roaring.Bitmap: +// +// 8 bytes per entry +// Total ≈ 8 * T bytes +// +// We want the threshold at which a map becomes more memory-efficient than a slice: +// +// 30R < 8T +// R/T < 8/30 +// +// Therefore, if the ratio of root documents to total documents is less than 8/30, +// we use a map for the descendant store; otherwise, we use a slice. +var descendantStoreMapThreshold = 8.0 / 30.0 + +func newDescendantStore(numDocs uint64, numRoots uint64) descendantStore { + if numDocs == 0 || numRoots == 0 { + return nil + } + ratio := float64(numRoots) / float64(numDocs) + if ratio < descendantStoreMapThreshold { + return newDescendantStoreMap(numRoots) + } + return newDescendantStoreSlice(numDocs, numRoots) +} diff --git a/nested_test.go b/nested_test.go index 7a99f66e..390800cd 100644 --- a/nested_test.go +++ b/nested_test.go @@ -337,12 +337,12 @@ func TestNestedSegment(t *testing.T) { } // Verify edge list exists - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } - if el.Count() != expectedNestedDocs { - t.Fatalf("expected edge list count %d, got %d", expectedNestedDocs, el.Count()) + if el.count() != expectedNestedDocs { + t.Fatalf("expected edge list count %d, got %d", expectedNestedDocs, el.count()) } // Test ancestry lookups @@ -543,12 +543,12 @@ func TestNestedSegmentMerge(t *testing.T) { } // Verify edge list in merged segment - el := mergedSb.EdgeList() + el := mergedSb.edgeList() if el == nil { t.Fatal("expected non-nil edge list in merged segment") } - if el.Count() != expectedNested { - t.Fatalf("merged edge list: expected %d edges, got %d", expectedNested, el.Count()) + if el.count() != expectedNested { + t.Fatalf("merged edge list: expected %d edges, got %d", expectedNested, el.count()) } // Verify ancestry in merged segment @@ -669,13 +669,13 @@ func TestNestedSegmentMergeWithDeletes(t *testing.T) { t.Fatalf("expected 1 root doc in merged segment, got %d", rootCount) } // Should have 1 edge in edge list - el := mergedSb.EdgeList() + el := mergedSb.edgeList() if el == nil { t.Fatal("expected non-nil edge list in merged segment") } - if el.Count() != 1 { - t.Fatalf("merged edge list: expected 1 edge, got %d", el.Count()) + if el.count() != 1 { + t.Fatalf("merged edge list: expected 1 edge, got %d", el.count()) } // Verify ancestry testAncestry := []struct { @@ -928,7 +928,7 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { }() sb := seg.(*Segment) - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } @@ -948,13 +948,13 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { } // Verify edge count - if el.Count() != uint64(len(expectedEdges)) { - t.Fatalf("expected %d edges, got %d", len(expectedEdges), el.Count()) + if el.count() != uint64(len(expectedEdges)) { + t.Fatalf("expected %d edges, got %d", len(expectedEdges), el.count()) } // Verify Parent lookups for child, expectedParent := range expectedEdges { - parent, ok := el.Parent(child) + parent, ok := el.parent(child) if !ok { t.Errorf("expected parent for child %d", child) continue @@ -965,13 +965,13 @@ func TestNestedSegmentEdgeListIteration(t *testing.T) { } // Verify root has no parent - if _, ok := el.Parent(0); ok { + if _, ok := el.parent(0); ok { t.Error("root should have no parent") } // Test iteration foundEdges := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { foundEdges[child] = parent return true }) @@ -1020,7 +1020,7 @@ func TestNestedSegmentNoNesting(t *testing.T) { } // Edge list should be nil for flat documents - el := sb.EdgeList() + el := sb.edgeList() if el != nil { t.Fatal("expected nil edge list for flat documents") } @@ -1178,18 +1178,18 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { } // Edge list should exist - el := sb.EdgeList() + el := sb.edgeList() if el == nil { t.Fatal("expected non-nil edge list") } // Verify edge count - if el.Count() != 1 { - t.Fatalf("expected 1 edge, got %d", el.Count()) + if el.count() != 1 { + t.Fatalf("expected 1 edge, got %d", el.count()) } // Test Parent lookup - child1 is doc 10, parent is root10 which is doc 9 - parent, ok := el.Parent(10) + parent, ok := el.parent(10) if !ok { t.Fatal("expected parent for child1") } @@ -1198,13 +1198,13 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { } // Verify root has no parent - if _, ok := el.Parent(0); ok { + if _, ok := el.parent(0); ok { t.Error("root should have no parent") } // Test iteration foundEdges := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { foundEdges[child] = parent return true }) @@ -1243,35 +1243,35 @@ func TestNestedSegmentEdgeListMap(t *testing.T) { func TestEdgeList(t *testing.T) { t.Run("edgeListMap", func(t *testing.T) { el := newEdgeListMap(3) - el.AddEdge(1, 0) - el.AddEdge(2, 0) - el.AddEdge(3, 1) + el.addEdge(1, 0) + el.addEdge(2, 0) + el.addEdge(3, 1) - if el.Count() != 3 { - t.Fatalf("expected count 3, got %d", el.Count()) + if el.count() != 3 { + t.Fatalf("expected count 3, got %d", el.count()) } // Test Parent - parent, ok := el.Parent(1) + parent, ok := el.parent(1) if !ok || parent != 0 { t.Errorf("expected parent 0 for child 1, got %d, ok=%v", parent, ok) } - parent, ok = el.Parent(3) + parent, ok = el.parent(3) if !ok || parent != 1 { t.Errorf("expected parent 1 for child 3, got %d, ok=%v", parent, ok) } - _, ok = el.Parent(0) + _, ok = el.parent(0) if ok { t.Error("expected no parent for root") } - _, ok = el.Parent(99) + _, ok = el.parent(99) if ok { t.Error("expected no parent for non-existent doc") } // Test Iterate found := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { found[child] = parent return true }) @@ -1283,36 +1283,36 @@ func TestEdgeList(t *testing.T) { // Test edgeListSlice t.Run("edgeListSlice", func(t *testing.T) { el := newEdgeListSlice(10, 3) - el.AddEdge(1, 0) - el.AddEdge(2, 0) - el.AddEdge(3, 1) + el.addEdge(1, 0) + el.addEdge(2, 0) + el.addEdge(3, 1) - if el.Count() != 3 { - t.Fatalf("expected count 3, got %d", el.Count()) + if el.count() != 3 { + t.Fatalf("expected count 3, got %d", el.count()) } // Test Parent - parent, ok := el.Parent(1) + parent, ok := el.parent(1) if !ok || parent != 0 { t.Errorf("expected parent 0 for child 1, got %d, ok=%v", parent, ok) } - parent, ok = el.Parent(3) + parent, ok = el.parent(3) if !ok || parent != 1 { t.Errorf("expected parent 1 for child 3, got %d, ok=%v", parent, ok) } - _, ok = el.Parent(0) + _, ok = el.parent(0) if ok { t.Error("expected no parent for root") } // Out of bounds - _, ok = el.Parent(99) + _, ok = el.parent(99) if ok { t.Error("expected no parent for out of bounds doc") } // Test Iterate found := make(map[uint64]uint64) - el.Iterate(func(child uint64, parent uint64) bool { + el.iterate(func(child uint64, parent uint64) bool { found[child] = parent return true }) @@ -1321,29 +1321,29 @@ func TestEdgeList(t *testing.T) { } // Test AddEdge out of bounds (should be silently ignored) - el.AddEdge(100, 0) // out of bounds, should not panic + el.addEdge(100, 0) // out of bounds, should not panic }) // Test NewEdgeList threshold selection t.Run("NewEdgeList", func(t *testing.T) { // Ratio < 8/30 should use map - el := NewEdgeList(100, 10) // ratio = 0.1 < 0.267 + el := newEdgeList(100, 10) // ratio = 0.1 < 0.267 if _, ok := el.(*edgeListMap); !ok { t.Error("expected edgeListMap for low ratio") } // Ratio >= 8/30 should use slice - el = NewEdgeList(100, 50) // ratio = 0.5 > 0.267 + el = newEdgeList(100, 50) // ratio = 0.5 > 0.267 if _, ok := el.(*edgeListSlice); !ok { t.Error("expected edgeListSlice for high ratio") } // Zero cases - el = NewEdgeList(0, 0) + el = newEdgeList(0, 0) if el != nil { t.Error("expected nil for 0 docs") } - el = NewEdgeList(10, 0) + el = newEdgeList(10, 0) if el != nil { t.Error("expected nil for 0 edges") } diff --git a/segment.go b/segment.go index 27ed7154..dcf910b6 100644 --- a/segment.go +++ b/segment.go @@ -849,47 +849,56 @@ func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { // dR = D - dS // Therefore, the count of root docs excluding deleted ones is: // R - dR = (T - S) - (D - dS) - return (sb.Count() - sb.countNested()) - (sb.nstIndexCache.countRoot(deleted)) + return sb.countRoot() - sb.countRootDeleted(deleted) } // AddNestedDocuments returns a bitmap containing the original document numbers in drops, // plus any descendant document numbers for each dropped document. The drops // parameter represents a set of document numbers to be dropped, and the returned // bitmap includes both the original drops and all their descendants (if any). +// NOTE: This method MODIFIES the drops bitmap in place. +// NOTE: This method EXPECTS that the drops bitmap contains ONLY root document numbers. func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap { // If no drops or no subDocs, nothing to do if drops == nil || drops.GetCardinality() == 0 || sb.countNested() == 0 { return drops } - // Get the edge list for this segment - el := sb.EdgeList() - // Algorithm => iterate through each child->parent mapping in the edge list, - // and for each pair, check if the parent is in the drops bitmap. - // If it is, and the child is also not already in the drops bitmap, - // add the child to the drops. Repeat this process until no - // new additions are made in an iteration. - changed := true - for changed { - changed = false - el.Iterate(func(child uint64, parent uint64) bool { - if drops.Contains(uint32(parent)) && !drops.Contains(uint32(child)) { - drops.Add(uint32(child)) - changed = true - } - return true - }) + ds := sb.descendantStore() + dropsItr := drops.Iterator() + for dropsItr.HasNext() { + rootDoc := uint64(dropsItr.Next()) + if bm, ok := ds.descendants(rootDoc); ok { + drops.Or(bm) + } } return drops } -// EdgeList returns an EdgeList interface representing the parent-child relationships between documents in the segment. -// The EdgeList interface allows iteration over child-parent document pairs, enabling navigation of document hierarchies. +// edgeList returns an edgeList interface representing the parent-child relationships between documents in the segment. +// The edgeList interface allows iteration over child-parent document pairs, enabling navigation of document hierarchies. // The underlying implementation may use a map or a slice, but callers should rely on the interface methods. -func (sb *SegmentBase) EdgeList() EdgeList { +func (sb *SegmentBase) edgeList() edgeList { return sb.nstIndexCache.edgeList() } +// descendantStore returns a descendantStore interface that provides access to the descendants of documents in the segment. +// The descendantStore interface allows retrieval of descendant document numbers for a given root document number. +// The underlying implementation may use a map or a slice, but callers should rely on the interface methods. +func (sb *SegmentBase) descendantStore() descendantStore { + return sb.nstIndexCache.descendantStore() +} + // Utility method to count the number of nested documents in the segment, not exported. func (sb *SegmentBase) countNested() uint64 { return sb.nstIndexCache.countNested() } + +// Utility method to count the number of root documents in the given bitmap, not exported. +func (sb *SegmentBase) countRoot() uint64 { + return sb.Count() - sb.countNested() +} + +// Utility method to count the number of root documents that are marked as deleted in the given bitmap, not exported. +func (sb *SegmentBase) countRootDeleted(deleted *roaring.Bitmap) uint64 { + return sb.nstIndexCache.countRootDeleted(deleted) +} From fb98b321fd4e822f5bd22eaa939310b76ef0f449 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 12:55:20 +0530 Subject: [PATCH 4/8] fix bug --- segment.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/segment.go b/segment.go index dcf910b6..5696610e 100644 --- a/segment.go +++ b/segment.go @@ -859,18 +859,20 @@ func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { // NOTE: This method MODIFIES the drops bitmap in place. // NOTE: This method EXPECTS that the drops bitmap contains ONLY root document numbers. func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap { - // If no drops or no subDocs, nothing to do + // If no drops or no nested documents, nothing to do if drops == nil || drops.GetCardinality() == 0 || sb.countNested() == 0 { return drops } ds := sb.descendantStore() + descendantBM := roaring.New() dropsItr := drops.Iterator() for dropsItr.HasNext() { rootDoc := uint64(dropsItr.Next()) if bm, ok := ds.descendants(rootDoc); ok { - drops.Or(bm) + descendantBM.Or(bm) } } + drops.Or(descendantBM) return drops } From c29dedae7fdbfd134dfe2f861cea9044bf19fd0f Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 15:29:52 +0530 Subject: [PATCH 5/8] optimize --- nested_cache.go | 102 ++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 77 deletions(-) diff --git a/nested_cache.go b/nested_cache.go index 513168a4..3093db92 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -72,26 +72,34 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me edgeList.addEdge(child, parent) } // create and cache our descendant store - descendantStore := newDescendantStore(numDocs, numDocs-numEdges) + numRoots := numDocs - numEdges + descendantStore := newDescendantStore(numDocs, numRoots) // populate the descendant store using the following invariants: // Invariant: child docNums is always > parent docNums - // Invariant: descendants of root docNum R is always a contiguous range of docNums [R+1, R+N] where N is the number of descendants of R - currentRoot := uint64(0) - currentBitmap := roaring.New() - for i := uint64(0); i < numDocs; i++ { - _, ok := edgeList.parent(i) - if !ok { - if currentBitmap.GetCardinality() > 0 { - descendantStore.add(currentRoot, currentBitmap) - currentBitmap = roaring.New() - } - currentRoot = i - } else { - currentBitmap.Add(uint32(i)) + // Invariant: descendants of root docNum R is always a + // contiguous range of docNums [R + 1 : R + 1 + N) + // where N is the number of descendants of R + roots := make([]uint64, 0, numRoots) + for docNum := uint64(0); docNum < numDocs; docNum++ { + if _, ok := edgeList.parent(docNum); !ok { + roots = append(roots, docNum) } } - if currentBitmap.GetCardinality() > 0 { - descendantStore.add(currentRoot, currentBitmap) + // descendants of each root are the contiguous range of + // docNums between it and the next root + for idx, root := range roots { + start := root + 1 + end := numDocs + nextIdx := idx + 1 + if nextIdx < len(roots) { + end = roots[nextIdx] + } + numDescendants := end - start + if numDescendants > 0 { + bitmap := roaring.New() + bitmap.AddRange(start, end) + descendantStore.add(root, bitmap) + } } nc.cache = &nestedCacheEntry{ @@ -357,69 +365,9 @@ func (dsm *descendantStoreMap) descendants(root uint64) (*roaring.Bitmap, bool) return bm, ok } -type descendantStoreSlice struct { - numDocs uint64 - descBitmaps []*roaring.Bitmap -} - -func newDescendantStoreSlice(numDocs uint64, numRoots uint64) *descendantStoreSlice { - return &descendantStoreSlice{ - numDocs: numDocs, - descBitmaps: make([]*roaring.Bitmap, numDocs), - } -} - -func (dss *descendantStoreSlice) add(root uint64, descendants *roaring.Bitmap) { - if root >= dss.numDocs { - // out of bounds, ignore as this should not happen - return - } - dss.descBitmaps[root] = descendants -} - -func (dss *descendantStoreSlice) descendants(root uint64) (*roaring.Bitmap, bool) { - if root >= dss.numDocs { - return nil, false - } - bm := dss.descBitmaps[root] - if bm == nil { - return nil, false - } - return bm, true -} - -// descendantStoreMapThreshold defines the threshold ratio of root documents to total documents. -// It is derived using the following reasoning: -// -// Let R = number of root documents -// Let T = total number of documents -// -// Memory usage if the descendant store is stored as a map[uint64]*roaring.Bitmap: -// -// ~30 bytes per entry (key + value + map overhead) -// Total ≈ 30 * R bytes -// -// Memory usage if the descendant store is stored as a []*roaring.Bitmap: -// -// 8 bytes per entry -// Total ≈ 8 * T bytes -// -// We want the threshold at which a map becomes more memory-efficient than a slice: -// -// 30R < 8T -// R/T < 8/30 -// -// Therefore, if the ratio of root documents to total documents is less than 8/30, -// we use a map for the descendant store; otherwise, we use a slice. -var descendantStoreMapThreshold = 8.0 / 30.0 - func newDescendantStore(numDocs uint64, numRoots uint64) descendantStore { if numDocs == 0 || numRoots == 0 { return nil } - ratio := float64(numRoots) / float64(numDocs) - if ratio < descendantStoreMapThreshold { - return newDescendantStoreMap(numRoots) - } - return newDescendantStoreSlice(numDocs, numRoots) + return newDescendantStoreMap(numRoots) } From 6fff86fb66f3cf8e649271f617ff73683151c9af Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 15:38:50 +0530 Subject: [PATCH 6/8] fix --- segment.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/segment.go b/segment.go index 46c096b5..7ca1537d 100644 --- a/segment.go +++ b/segment.go @@ -941,3 +941,7 @@ func (sb *SegmentBase) countRoot() uint64 { func (sb *SegmentBase) countRootDeleted(deleted *roaring.Bitmap) uint64 { return sb.nstIndexCache.countRootDeleted(deleted) } + +func (sb *SegmentBase) CallbackId() string { + return sb.fileReader.id +} From 7924ed163bedfec8e9cbbcef3620dbb5331468fc Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 15:45:55 +0530 Subject: [PATCH 7/8] fix --- nested_cache.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/nested_cache.go b/nested_cache.go index 3093db92..6d6fbe3b 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -73,7 +73,7 @@ func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, me } // create and cache our descendant store numRoots := numDocs - numEdges - descendantStore := newDescendantStore(numDocs, numRoots) + descendantStore := newDescendantStore(numRoots) // populate the descendant store using the following invariants: // Invariant: child docNums is always > parent docNums // Invariant: descendants of root docNum R is always a @@ -365,9 +365,6 @@ func (dsm *descendantStoreMap) descendants(root uint64) (*roaring.Bitmap, bool) return bm, ok } -func newDescendantStore(numDocs uint64, numRoots uint64) descendantStore { - if numDocs == 0 || numRoots == 0 { - return nil - } +func newDescendantStore(numRoots uint64) descendantStore { return newDescendantStoreMap(numRoots) } From 1bfe82ced636725ff58fe1e8ec73b9a73f93b2f3 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 26 Jun 2026 16:10:59 +0530 Subject: [PATCH 8/8] apply copilot suggestions --- nested_cache.go | 7 ++++++- segment.go | 9 ++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/nested_cache.go b/nested_cache.go index 6d6fbe3b..741eeaf5 100644 --- a/nested_cache.go +++ b/nested_cache.go @@ -176,12 +176,17 @@ func (nc *nestedIndexCache) countRootDeleted(bm *roaring.Bitmap) uint64 { if nc.countNested() == 0 { return totalDocs } + // get the edgeList for this segment + el := nc.edgeList() + if el == nil { + return totalDocs + } // count nested documents in the bitmap, a nested doc is one that has a parent in the edge list var nestedDocCount uint64 bmItr := bm.Iterator() for bmItr.HasNext() { docNum := bmItr.Next() - if _, ok := nc.cache.el.parent(uint64(docNum)); ok { + if _, ok := el.parent(uint64(docNum)); ok { nestedDocCount++ } } diff --git a/segment.go b/segment.go index 7ca1537d..5c6a2e50 100644 --- a/segment.go +++ b/segment.go @@ -889,9 +889,9 @@ func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { return sb.countRoot() - sb.countRootDeleted(deleted) } -// AddNestedDocuments returns a bitmap containing the original document numbers in drops, -// plus any descendant document numbers for each dropped document. The drops -// parameter represents a set of document numbers to be dropped, and the returned +// AddNestedDocuments returns a bitmap containing the original root document numbers in drops, +// plus any descendant document numbers for each dropped root document. The drops +// parameter represents a set of root document numbers to be dropped, and the returned // bitmap includes both the original drops and all their descendants (if any). // NOTE: This method MODIFIES the drops bitmap in place. // NOTE: This method EXPECTS that the drops bitmap contains ONLY root document numbers. @@ -901,6 +901,9 @@ func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap return drops } ds := sb.descendantStore() + if ds == nil { + return drops + } descendantBM := roaring.New() dropsItr := drops.Iterator() for dropsItr.HasNext() {