Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 259 additions & 0 deletions lib/rag/chunking.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
import { describe, expect, it, vi } from "vitest";
import { chunkNoteDocument } from "@/lib/rag/chunking";
import type { NoteDocument, NoteListItem } from "@/lib/notes/types";

/**
 * Word-based token counter handed to the chunker in these tests.
 *
 * @param value - Text to measure.
 * @returns The number of runs of ASCII letters, digits, or underscores found
 *   in `value`, or 0 when there are none.
 */
const countWords = (value: string): number => {
  const words = value.match(/[A-Za-z0-9_]+/g);
  return words === null ? 0 : words.length;
};

/**
 * Fixture builder for a `paragraph` block.
 *
 * @param id - Block id to assign.
 * @param text - Paragraph text stored under `data.text`.
 * @returns A block object of type `"paragraph"`.
 */
function paragraph(id: string, text: string): NoteDocument["blocks"][number] {
  const block: NoteDocument["blocks"][number] = { id, type: "paragraph", data: { text } };
  return block;
}

/**
 * Fixture builder for a `header` block.
 *
 * @param id - Block id to assign.
 * @param level - Heading level (1 through 6) stored under `data.level`.
 * @param text - Heading text stored under `data.text`.
 * @returns A block object of type `"header"`.
 */
function header(
  id: string,
  level: 1 | 2 | 3 | 4 | 5 | 6,
  text: string,
): NoteDocument["blocks"][number] {
  const block: NoteDocument["blocks"][number] = { id, type: "header", data: { level, text } };
  return block;
}

/**
 * Fixture builder for a `code` block.
 *
 * @param id - Block id to assign.
 * @param source - Source text stored under `data.code`.
 * @returns A block object of type `"code"`.
 */
function code(id: string, source: string): NoteDocument["blocks"][number] {
  const block: NoteDocument["blocks"][number] = { id, type: "code", data: { code: source } };
  return block;
}

/**
 * Builds a single-branch chain of nested list items.
 *
 * Each level from `depth` down to 1 is labelled `depth N` and holds exactly
 * one child; the innermost item (depth 0) is labelled `leaf detail` and has
 * no children.
 *
 * @param depth - Number of wrapper levels above the leaf.
 * @returns The outermost item of the chain.
 */
function nestedListItem(depth: number): NoteListItem {
  const isLeaf = depth === 0;
  const children: NoteListItem[] = isLeaf ? [] : [nestedListItem(depth - 1)];

  return {
    content: isLeaf ? "leaf detail" : `depth ${depth}`,
    items: children,
  };
}

describe("chunkNoteDocument", () => {
// H1, H2 and H3 headings each start a new chunk and are carried as heading
// context; the H4 heading and its paragraph are expected inside the H3 chunk.
it("creates semantic chunks from H1-H3 sections and carries heading context", () => {
  const blocks: NoteDocument["blocks"] = [
    header("h1", 1, "Biology"),
    paragraph("p1", "Cells regulate transport."),
    header("h2", 2, "Photosynthesis"),
    paragraph("p2", "Chloroplasts convert light energy."),
    header("h3", 3, "Light reactions"),
    paragraph("p3", "Photosystems split water."),
    header("h4", 4, "Tiny detail"),
    paragraph("p4", "H4 stays inside the active H3 chunk."),
  ];
  const document: NoteDocument = { time: 1, blocks };

  const chunks = chunkNoteDocument(document, {
    maxTokens: 80,
    countTokens: countWords,
    noteId: "note-1",
  });

  expect(chunks).toHaveLength(3);

  // Each chunk reports the full heading trail that leads to it.
  const headingTrails = chunks.map(({ headingPath }) => headingPath.map(({ text }) => text));
  expect(headingTrails).toEqual([
    ["Biology"],
    ["Biology", "Photosynthesis"],
    ["Biology", "Photosynthesis", "Light reactions"],
  ]);

  // The H4 header shows up as a source block of the third chunk.
  const sourceIdsPerChunk = chunks.map(({ sourceBlockIds }) => sourceBlockIds);
  expect(sourceIdsPerChunk).toEqual([["p1"], ["p2"], ["p3", "h4", "p4"]]);

  expect(chunks[0]).toMatchObject({
    id: "note-1:chunk:0",
    content: "# Biology\n\nCells regulate transport.",
    tokenCount: 4,
    isOversized: false,
  });

  const expectedThirdChunkContent = [
    "# Biology",
    "## Photosynthesis",
    "### Light reactions",
    "",
    "Photosystems split water.",
    "",
    "#### Tiny detail",
    "",
    "H4 stays inside the active H3 chunk.",
  ].join("\n");
  expect(chunks[2].content).toBe(expectedThirdChunkContent);
});

// With maxTokens = 6, each paragraph (4 words) plus the two heading words
// fills a chunk exactly, so the second paragraph must land in its own chunk.
it("splits chunks within a section when adding another block would exceed the token limit", () => {
  const blocks: NoteDocument["blocks"] = [
    header("h1", 1, "RAG"),
    header("h2", 2, "Retrieval"),
    paragraph("p1", "semantic search ranks context"),
    paragraph("p2", "query embeddings guide generation"),
  ];
  const document: NoteDocument = { time: 1, blocks };

  const chunks = chunkNoteDocument(document, { maxTokens: 6, countTokens: countWords });

  expect(chunks).toHaveLength(2);

  const contents = chunks.map(({ content }) => content);
  expect(contents).toEqual([
    "# RAG\n## Retrieval\n\nsemantic search ranks context",
    "# RAG\n## Retrieval\n\nquery embeddings guide generation",
  ]);

  const tokenCounts = chunks.map(({ tokenCount }) => tokenCount);
  expect(tokenCounts).toEqual([6, 6]);

  // Both chunks keep the same heading trail.
  const sharesHeadingTrail = chunks.every(({ headingPath }) => {
    const trail = headingPath.map(({ text }) => text).join(" > ");
    return trail === "RAG > Retrieval";
  });
  expect(sharesHeadingTrail).toBe(true);
});

// Expected token count: 1 heading word + 8 "depth N" items (2 words each)
// + the 2-word leaf = 19.
it("serializes deeply nested lists in document order without dropping nested content", () => {
  const blocks: NoteDocument["blocks"] = [
    header("h1", 1, "Algorithms"),
    {
      id: "list-1",
      type: "list",
      data: {
        style: "unordered",
        items: [nestedListItem(8)],
      },
    },
  ];
  const document: NoteDocument = { time: 1, blocks };

  const chunks = chunkNoteDocument(document, { maxTokens: 80, countTokens: countWords });

  expect(chunks).toHaveLength(1);

  const onlyChunk = chunks[0];
  expect(onlyChunk.sourceBlockIds).toEqual(["list-1"]);
  expect(onlyChunk.content).toContain("- depth 8");
  expect(onlyChunk.content).toContain(" - leaf detail");
  expect(onlyChunk.tokenCount).toBe(19);
});

// A document made only of blank content is expected to yield no chunks.
it("skips empty headings, empty blocks, and blank list items without creating empty chunks", () => {
  const blocks: NoteDocument["blocks"] = [
    header("empty-heading", 1, " "),
    paragraph("empty-paragraph", " <br> "),
    {
      id: "empty-list",
      type: "list",
      data: {
        style: "unordered",
        items: [{ content: " ", items: [] }],
      },
    },
  ];
  const document: NoteDocument = { time: 1, blocks };

  const chunks = chunkNoteDocument(document, { maxTokens: 20, countTokens: countWords });

  expect(chunks).toEqual([]);
});

// The code block alone is 10 words against maxTokens = 5: it is expected to be
// emitted whole and flagged oversized, with the paragraph in a separate chunk.
it("keeps an oversized code block intact and isolates following content into a new chunk", () => {
  const blocks: NoteDocument["blocks"] = [
    header("h1", 1, "Runtime"),
    code("code-1", "const alpha = 1;\nconst beta = 2;\nconst gamma = alpha + beta;"),
    paragraph("p1", "The code initializes values."),
  ];
  const document: NoteDocument = { time: 1, blocks };

  const chunks = chunkNoteDocument(document, { maxTokens: 5, countTokens: countWords });

  expect(chunks).toHaveLength(2);

  expect(chunks[0]).toMatchObject({
    sourceBlockIds: ["code-1"],
    isOversized: true,
    splitReason: "oversized-block",
  });

  // The code is expected verbatim inside a fence, under the heading context.
  const expectedCodeChunkContent = [
    "# Runtime",
    "",
    "```",
    "const alpha = 1;",
    "const beta = 2;",
    "const gamma = alpha + beta;",
    "```",
  ].join("\n");
  expect(chunks[0].content).toBe(expectedCodeChunkContent);

  expect(chunks[1]).toMatchObject({
    sourceBlockIds: ["p1"],
    content: "# Runtime\n\nThe code initializes values.",
    isOversized: false,
  });
});

// One heading plus three paragraphs should produce exactly four tokenizer
// calls, each receiving the raw heading or block text.
it("counts each non-empty heading or block once so token accounting stays linear", () => {
  // Mock wrapper around the word counter so its calls can be inspected.
  const countTokens = vi.fn(countWords);
  const blocks: NoteDocument["blocks"] = [
    header("h1", 1, "Linear accounting"),
    paragraph("p1", "first block"),
    paragraph("p2", "second block"),
    paragraph("p3", "third block"),
  ];
  const document: NoteDocument = { time: 1, blocks };

  chunkNoteDocument(document, { maxTokens: 6, countTokens });

  expect(countTokens).toHaveBeenCalledTimes(4);

  const countedTexts = countTokens.mock.calls.map((callArgs) => callArgs[0]);
  expect(countedTexts).toEqual([
    "Linear accounting",
    "first block",
    "second block",
    "third block",
  ]);
});
});
Loading
Loading