From 9196e6e73c708811b05d5e0ea2d965637f24bbf7 Mon Sep 17 00:00:00 2001 From: aabdi Date: Mon, 22 Jun 2026 13:11:13 +0700 Subject: [PATCH 1/5] feat: [sampling-overview-concept-page-001] - [Align the existing sampling overview record for canonical concept discovery] --- .../registry/concepts/sampling-overview.json | 7 ++-- .../sampling-overview-glossary.test.ts | 33 +++++++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/content/registry/concepts/sampling-overview.json b/src/content/registry/concepts/sampling-overview.json index fbb4331e..095a819c 100644 --- a/src/content/registry/concepts/sampling-overview.json +++ b/src/content/registry/concepts/sampling-overview.json @@ -8,16 +8,19 @@ "sampling overview", "token sampling", "next-token sampling", - "sampling basics" + "sampling basics", + "decoding strategy" ], "tags": ["foundations", "token-to-probability-chain"], "relatedIds": [ "concept.temperature", "concept.softmax", "concept.autoregressive-generation", + "concept.decode", "concept.greedy-decoding", "concept.top-k-sampling", - "concept.top-p-sampling" + "concept.top-p-sampling", + "paper.gpt-2-report" ], "citationIds": ["citation.curious-case-neural-text-degeneration"], "status": "published", diff --git a/src/lib/content/sampling-overview-glossary.test.ts b/src/lib/content/sampling-overview-glossary.test.ts index 302b34ec..685dff7a 100644 --- a/src/lib/content/sampling-overview-glossary.test.ts +++ b/src/lib/content/sampling-overview-glossary.test.ts @@ -36,14 +36,17 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis "token sampling", "next-token sampling", "sampling basics", + "decoding strategy", ]); expect(record?.relatedIds).toEqual([ "concept.temperature", "concept.softmax", "concept.autoregressive-generation", + "concept.decode", "concept.greedy-decoding", "concept.top-k-sampling", "concept.top-p-sampling", + "paper.gpt-2-report", ]); expect(PUBLISHED_DOCS_REGISTRY_IDS.has("concept.sampling-overview")).toBe( true, @@ -65,7 +68,7 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis ).toContain("concept.sampling-overview"); }); - test("curated related docs preserve published links backward and expose the completed decoding path", () => { + test("curated related docs preserve published links backward and expose the completed decoding and GPT-family path", () => { const source = getConceptById("concept.sampling-overview"); if (!source) { throw new Error("expected concept.sampling-overview in registry"); @@ -102,6 +105,14 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis ), ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.decode" && + item.href === "/docs/glossary/decode" && + item.isPlanned === false, + ), + ).toBe(true); expect( items.some( (item) => @@ -127,6 +138,14 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis item.isPlanned === false, ), ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "paper.gpt-2-report" && + item.href === "/docs/papers/gpt-2-report" && + item.isPlanned === false, + ), + ).toBe(true); }); test("messages explain the final next-token choice and distinguish greedy, top-k, and top-p in plain language", () => { @@ -195,18 +214,22 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis expect(html).toContain('href="/docs/glossary/temperature"'); expect(html).toContain('href="/docs/glossary/softmax"'); expect(html).toContain('href="/docs/glossary/autoregressive-generation"'); + expect(html).toContain('href="/docs/glossary/decode"'); expect(html).toContain('href="/docs/glossary/greedy-decoding"'); expect(html).toContain('href="/docs/glossary/top-k-sampling"'); expect(html).toContain('href="/docs/glossary/top-p-sampling"'); + expect(html).toContain('href="/docs/papers/gpt-2-report"'); expect(html).toContain('data-testid="curated-related-docs"'); expect(html).not.toContain('data-planned="true"'); + expect(html).toContain("Decode"); expect(html).toContain("Greedy Decoding"); + expect(html).toContain("GPT-2 report"); expect(html).toContain("Top K Sampling"); expect(html).toContain("Top-P Sampling"); expect(html).not.toContain("Reader Shortcut"); }); - test("search index records sampling overview as a glossary page with aliases", async () => { + test("search index records sampling overview as a glossary page with aliases and shared chain tags", async () => { const registry = await loadRegistry(); const pages = await loadPublishedDocsPages("en"); const documents = buildSearchDocuments(pages, registry); @@ -222,9 +245,12 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis "token sampling", "next-token sampling", "sampling basics", + "decoding strategy", ]), ); - expect(document?.tags).toEqual(expect.arrayContaining(["foundations"])); + expect(document?.tags).toEqual( + expect.arrayContaining(["foundations", "token-to-probability-chain"]), + ); }); test("search finds sampling overview by title, aliases, and next-token choice terms", async () => { @@ -232,6 +258,7 @@ describe("Phase 5 sampling overview glossary page (phase-5-sampling-basics-decis "Sampling Overview", "token sampling", "next-token sampling", + "decoding strategy", "choose the next token from a probability distribution", ] as const) { const results = await docsSearchApi.search(query); From c089306449220365fe55399e5ef37c98f5afe32a Mon Sep 17 00:00:00 2001 From: aabdi Date: Mon, 22 Jun 2026 13:23:26 +0700 Subject: [PATCH 2/5] feat: [sampling-overview-concept-page-002] - [Publish the canonical sampling overview concept page] --- .../concepts/sampling-overview/assets.json | 1 + .../sampling-overview/messages/en.json | 45 ++++ .../docs/concepts/sampling-overview/page.mdx | 90 +++++++ src/lib/content/content-paths.ts | 6 + .../content/greedy-decoding-glossary.test.ts | 4 +- .../content/sampling-overview-concept.test.ts | 239 ++++++++++++++++++ .../content/top-k-sampling-glossary.test.ts | 4 +- .../content/top-p-sampling-glossary.test.ts | 4 +- 8 files changed, 387 insertions(+), 6 deletions(-) create mode 100644 src/content/docs/concepts/sampling-overview/assets.json create mode 100644 src/content/docs/concepts/sampling-overview/messages/en.json create mode 100644 src/content/docs/concepts/sampling-overview/page.mdx create mode 100644 src/lib/content/sampling-overview-concept.test.ts diff --git a/src/content/docs/concepts/sampling-overview/assets.json b/src/content/docs/concepts/sampling-overview/assets.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/src/content/docs/concepts/sampling-overview/assets.json @@ -0,0 +1 @@ +{} diff --git a/src/content/docs/concepts/sampling-overview/messages/en.json b/src/content/docs/concepts/sampling-overview/messages/en.json new file mode 100644 index 00000000..6588b194 --- /dev/null +++ b/src/content/docs/concepts/sampling-overview/messages/en.json @@ -0,0 +1,45 @@ +{ + "title": "Sampling Overview", + "description": "The post-probability next-token decision step that turns one distribution into one chosen token, shaping how predictable or varied generated text feels.", + "openingSummary": "After a model produces logits and those scores become probabilities, one more step still remains: choosing the next token. Sampling is that choice step, and its settings trade repeatable predictability against diversity without changing what the model already knows.", + "sections": { + "whatItIs": { + "title": "What It Is", + "body": "Sampling is the final next-token decision step in autoregressive generation. The model has already scored every token with logits, and those scores have been turned into probabilities. Sampling decides what to do next with that distribution: always take the highest-probability token, or keep several plausible options and choose among them under a rule." + }, + "whyItMatters": { + "title": "Why It Matters", + "body": "This step strongly shapes how the output feels to a reader. Stricter rules are usually more repeatable, which helps when you want the same answer each time. Looser rules usually allow more diversity, which helps when you want variation or brainstorming. These settings change the selection process, not the model's underlying knowledge; they control how the model uses its probabilities, not what the model has learned." + }, + "simpleExample": { + "title": "Simple Example", + "body": "Imagine the next-token probabilities heavily favor \"Paris,\" give some weight to \"Lyon,\" and leave a smaller chance for \"Marseille.\" Greedy decoding always picks \"Paris.\" Top-k sampling keeps only the top few candidates before drawing one. Top-p sampling keeps the smallest candidate set whose total probability passes a threshold, then draws from that set. All three strategies start from the same probabilities, but they make different tradeoffs between stability and variety." + }, + "readerPath": { + "title": "Where To Go Next", + "body": "Use this page as the broad bridge through the generation path. Autoregressive generation and decode explain where the choice happens in the full loop. Temperature explains how probabilities can be reshaped before selection. Greedy decoding, top-k sampling, and top-p sampling go deeper into specific token-choice rules, while GPT-2 is a nearby model page that places the sampling step inside a familiar decoder-only system." + }, + "commonConfusions": { + "title": "Common Confusions", + "body": "Sampling is not the same as temperature. Temperature changes the shape of the probability distribution before a token is chosen, while sampling rules decide how to choose after probabilities exist. Sampling also does not mean pure randomness: greedy decoding is a sampling rule too, but it is deterministic. This overview is broader than the deeper pages for temperature, greedy decoding, top-k sampling, or top-p sampling; those pages focus on one control or method, while this page explains the overall choice step they all belong to." + }, + "related": { + "title": "Related Concepts And Modules" + }, + "tags": { + "title": "Tags" + }, + "references": { + "title": "References" + } + }, + "links": { + "autoregressiveGeneration": "Autoregressive generation", + "decode": "Decode", + "temperature": "Temperature", + "greedyDecoding": "Greedy decoding", + "topKSampling": "Top-k sampling", + "topPSampling": "Top-p sampling", + "gpt2Report": "GPT-2 report" + } +} diff --git a/src/content/docs/concepts/sampling-overview/page.mdx b/src/content/docs/concepts/sampling-overview/page.mdx new file mode 100644 index 00000000..46d984c1 --- /dev/null +++ b/src/content/docs/concepts/sampling-overview/page.mdx @@ -0,0 +1,90 @@ +--- +title: "Sampling Overview" +description: "The post-probability next-token decision step that turns one distribution into one chosen token, shaping how predictable or varied generated text feels." +kind: "concept" +registryId: "concept.sampling-overview" +messageNamespace: "local" +assetNamespace: "local" +status: "published" +tags: + - foundations + - token-to-probability-chain +aliases: + - "token sampling" + - "next-token sampling" + - "sampling basics" + - "decoding strategy" +updatedAt: "2026-06-22" +--- + +import { CitationList } from "@/features/docs/components/CitationList"; +import { DerivedRelatedDocs } from "@/features/docs/components/DerivedRelatedDocs"; +import { LocalizedLinkList } from "@/features/docs/components/LocalizedLinkList"; +import { RelatedDocs } from "@/features/docs/components/RelatedDocs"; +import { Section } from "@/features/docs/components/Section"; +import { T } from "@/features/docs/components/T"; +import { TagPillList } from "@/features/docs/components/TagPillList"; + +
+ +
+ +
+ +
+ +
+ +
+ +
+ + +
+ +
+ +
+ + + +
+ +
+ +
+ +
diff --git a/src/lib/content/content-paths.ts b/src/lib/content/content-paths.ts index 7d07b767..4f21c0ef 100644 --- a/src/lib/content/content-paths.ts +++ b/src/lib/content/content-paths.ts @@ -293,6 +293,12 @@ export const HIDDEN_SIZE_GLOSSARY_PAGE_DIR = join( /** Prefill concept page directory. */ export const PREFILL_CONCEPT_PAGE_DIR = join(CONCEPTS_DOCS_ROOT, "prefill"); +/** Sampling overview concept page directory. */ +export const SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR = join( + CONCEPTS_DOCS_ROOT, + "sampling-overview", +); + /** Vocabulary size glossary page directory. */ export const VOCABULARY_SIZE_GLOSSARY_PAGE_DIR = join( GLOSSARY_DOCS_ROOT, diff --git a/src/lib/content/greedy-decoding-glossary.test.ts b/src/lib/content/greedy-decoding-glossary.test.ts index a8453ed3..9f3f37d1 100644 --- a/src/lib/content/greedy-decoding-glossary.test.ts +++ b/src/lib/content/greedy-decoding-glossary.test.ts @@ -79,7 +79,7 @@ describe("Phase 5 greedy decoding glossary page (phase-5-sampling-basics-decisio items.some( (item) => item.registryId === "concept.sampling-overview" && - item.href === "/docs/glossary/sampling-overview" && + item.href === "/docs/concepts/sampling-overview" && item.isPlanned === false, ), ).toBe(true); @@ -176,7 +176,7 @@ describe("Phase 5 greedy decoding glossary page (phase-5-sampling-basics-decisio html, "The same prompt and model state produce the same next token each time.", ); - expect(html).toContain('href="/docs/glossary/sampling-overview"'); + expect(html).toContain('href="/docs/concepts/sampling-overview"'); expect(html).toContain('href="/docs/glossary/temperature"'); expect(html).toContain('href="/docs/glossary/autoregressive-generation"'); expect(html).toContain('href="/docs/glossary/top-k-sampling"'); diff --git a/src/lib/content/sampling-overview-concept.test.ts b/src/lib/content/sampling-overview-concept.test.ts new file mode 100644 index 00000000..c388d8cb --- /dev/null +++ b/src/lib/content/sampling-overview-concept.test.ts @@ -0,0 +1,239 @@ +import { describe, expect, test } from "bun:test"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import { createElement } from "react"; +import { renderToStaticMarkup } from "react-dom/server"; +import { ModulePageProviders } from "@/features/docs/components/ModulePageProviders"; +import { loadConceptPage } from "@/lib/content/concept-page"; +import { SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR } from "@/lib/content/content-paths"; +import { loadPublishedDocsPages } from "@/lib/content/pages"; +import { + PUBLISHED_CONCEPT_SECTION_REGISTRY_IDS, + PUBLISHED_DOCS_REGISTRY_IDS, +} from "@/lib/content/published-docs-registry-ids"; +import { loadRegistry } from "@/lib/content/registry"; +import { + getConceptById, + listRelatedRegistryRecords, +} from "@/lib/content/registry-runtime"; +import { deriveCuratedRelatedItems } from "@/lib/content/related-docs"; +import { pageMessagesSchema } from "@/lib/content/schemas"; +import { buildSearchDocuments } from "@/lib/search/build-documents"; +import { docsSearchApi } from "@/lib/search/search-server"; + +const pageDir = SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR; +const messagesPath = join(pageDir, "messages/en.json"); + +describe("sampling overview concept page (sampling-overview-concept-page-002)", () => { + test("registry record stays published and now resolves in the canonical concept section", () => { + const record = getConceptById("concept.sampling-overview"); + expect(record?.status).toBe("published"); + expect(record?.aliases).toEqual([ + "sampling overview", + "token sampling", + "next-token sampling", + "sampling basics", + "decoding strategy", + ]); + expect(record?.relatedIds).toEqual([ + "concept.temperature", + "concept.softmax", + "concept.autoregressive-generation", + "concept.decode", + "concept.greedy-decoding", + "concept.top-k-sampling", + "concept.top-p-sampling", + "paper.gpt-2-report", + ]); + expect(PUBLISHED_DOCS_REGISTRY_IDS.has("concept.sampling-overview")).toBe( + true, + ); + expect( + PUBLISHED_CONCEPT_SECTION_REGISTRY_IDS.has("concept.sampling-overview"), + ).toBe(true); + }); + + test("curated related links resolve to the generation path, sampling methods, and nearby GPT-family context", () => { + const source = getConceptById("concept.sampling-overview"); + if (!source) { + throw new Error("expected concept.sampling-overview in registry"); + } + + const items = deriveCuratedRelatedItems( + source, + listRelatedRegistryRecords(), + PUBLISHED_DOCS_REGISTRY_IDS, + ); + + expect( + items.some( + (item) => + item.registryId === "concept.temperature" && + item.href === "/docs/glossary/temperature", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.autoregressive-generation" && + item.href === "/docs/glossary/autoregressive-generation", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.decode" && + item.href === "/docs/glossary/decode", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.greedy-decoding" && + item.href === "/docs/glossary/greedy-decoding", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.top-k-sampling" && + item.href === "/docs/glossary/top-k-sampling", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.top-p-sampling" && + item.href === "/docs/glossary/top-p-sampling", + ), + ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "paper.gpt-2-report" && + item.href === "/docs/papers/gpt-2-report", + ), + ).toBe(true); + }); + + test("messages teach the post-probability decision step and distinguish overview from deeper controls", () => { + const messages = pageMessagesSchema.parse( + JSON.parse(readFileSync(messagesPath, "utf8")), + ); + + expect(messages.title).toBe("Sampling Overview"); + expect(messages.openingSummary?.length).toBeGreaterThan(0); + expect(messages.sections?.whatItIs.body?.toLowerCase()).toContain( + "final next-token decision step", + ); + expect(messages.sections?.whatItIs.body?.toLowerCase()).toContain( + "probabilities", + ); + expect(messages.sections?.whyItMatters.body?.toLowerCase()).toContain( + "repeatable", + ); + expect(messages.sections?.whyItMatters.body?.toLowerCase()).toContain( + "diversity", + ); + expect(messages.sections?.whyItMatters.body?.toLowerCase()).toContain( + "underlying knowledge", + ); + expect(messages.sections?.commonConfusions.body?.toLowerCase()).toContain( + "temperature", + ); + expect(messages.sections?.commonConfusions.body?.toLowerCase()).toContain( + "greedy decoding", + ); + expect(messages.sections?.commonConfusions.body?.toLowerCase()).toContain( + "top-k sampling", + ); + expect(messages.sections?.commonConfusions.body?.toLowerCase()).toContain( + "top-p sampling", + ); + }); + + test("page renders the canonical concept route with generation-path links and no hard-coded summary block", async () => { + const page = await loadConceptPage("sampling-overview"); + + expect(page.frontmatter.kind).toBe("concept"); + expect(page.frontmatter.status).toBe("published"); + expect(page.frontmatter.registryId).toBe("concept.sampling-overview"); + expect(page.messages.openingSummary?.length).toBeGreaterThan(0); + expect(page.toc.some((item) => item.url === "#reader-path")).toBe(true); + + const html = renderToStaticMarkup( + createElement(ModulePageProviders, { + messages: page.messages, + assets: page.assets, + // biome-ignore lint/correctness/noChildrenProp: third createElement arg conflicts with strict props typing + children: page.content, + }), + ); + + expect(html).toContain("What It Is"); + expect(html).toContain("Why It Matters"); + expect(html).toContain("Where To Go Next"); + expect(html).toContain("Common Confusions"); + expect(html).toContain( + "These settings change the selection process, not the model's underlying knowledge", + ); + expect(html).toContain('href="/docs/glossary/autoregressive-generation"'); + expect(html).toContain('href="/docs/glossary/decode"'); + expect(html).toContain('href="/docs/glossary/temperature"'); + expect(html).toContain('href="/docs/glossary/greedy-decoding"'); + expect(html).toContain('href="/docs/glossary/top-k-sampling"'); + expect(html).toContain('href="/docs/glossary/top-p-sampling"'); + expect(html).toContain('href="/docs/papers/gpt-2-report"'); + expect(html).toContain('data-testid="curated-related-docs"'); + expect(html).toContain('data-testid="citation-list"'); + expect(html).not.toContain("Reader Shortcut"); + expect(html).not.toContain("Phase"); + }); + + test("published pages and search documents expose the canonical concept route alongside the existing glossary entry", async () => { + const registry = await loadRegistry(); + const pages = await loadPublishedDocsPages("en"); + const documents = buildSearchDocuments(pages, registry); + + expect( + pages.some((page) => page.docsSlug === "concepts/sampling-overview"), + ).toBe(true); + expect( + pages.some((page) => page.docsSlug === "glossary/sampling-overview"), + ).toBe(true); + + const conceptDocument = documents.find( + (entry) => entry.url === "/docs/concepts/sampling-overview", + ); + expect(conceptDocument?.kind).toBe("concept"); + expect(conceptDocument?.facets.kind).toBe("concept"); + expect(conceptDocument?.aliases).toEqual( + expect.arrayContaining([ + "sampling overview", + "token sampling", + "next-token sampling", + "sampling basics", + "decoding strategy", + ]), + ); + expect(conceptDocument?.tags).toEqual( + expect.arrayContaining(["foundations", "token-to-probability-chain"]), + ); + }); + + test("search returns the canonical concept route for title and sampling-basics queries", async () => { + for (const query of [ + "Sampling Overview", + "sampling basics", + "token sampling", + "next-token sampling", + ] as const) { + const results = await docsSearchApi.search(query); + expect( + results.some( + (result) => result.url === "/docs/concepts/sampling-overview", + ), + ).toBe(true); + } + }); +}); diff --git a/src/lib/content/top-k-sampling-glossary.test.ts b/src/lib/content/top-k-sampling-glossary.test.ts index 7619ed6e..1b4f3fb3 100644 --- a/src/lib/content/top-k-sampling-glossary.test.ts +++ b/src/lib/content/top-k-sampling-glossary.test.ts @@ -73,7 +73,7 @@ describe("Phase 5 top-k sampling glossary page (phase-5-sampling-basics-decision ); for (const publishedId of [ - ["concept.sampling-overview", "/docs/glossary/sampling-overview"], + ["concept.sampling-overview", "/docs/concepts/sampling-overview"], ["concept.greedy-decoding", "/docs/glossary/greedy-decoding"], ["concept.temperature", "/docs/glossary/temperature"], ] as const) { @@ -162,7 +162,7 @@ describe("Phase 5 top-k sampling glossary page (phase-5-sampling-basics-decision html, "Top-p sampling would instead keep however many tokens are needed to cross a cumulative probability threshold, so its candidate count can change from one step to the next.", ); - expect(html).toContain('href="/docs/glossary/sampling-overview"'); + expect(html).toContain('href="/docs/concepts/sampling-overview"'); expect(html).toContain('href="/docs/glossary/greedy-decoding"'); expect(html).toContain('href="/docs/glossary/temperature"'); expect(html).toContain('href="/docs/glossary/top-p-sampling"'); diff --git a/src/lib/content/top-p-sampling-glossary.test.ts b/src/lib/content/top-p-sampling-glossary.test.ts index 60bfa1a8..af7a9dcd 100644 --- a/src/lib/content/top-p-sampling-glossary.test.ts +++ b/src/lib/content/top-p-sampling-glossary.test.ts @@ -75,7 +75,7 @@ describe("Phase 5 top-p sampling glossary page (phase-5-sampling-basics-decision ); for (const publishedId of [ - ["concept.sampling-overview", "/docs/glossary/sampling-overview"], + ["concept.sampling-overview", "/docs/concepts/sampling-overview"], ["concept.greedy-decoding", "/docs/glossary/greedy-decoding"], ["concept.top-k-sampling", "/docs/glossary/top-k-sampling"], ["concept.temperature", "/docs/glossary/temperature"], @@ -154,7 +154,7 @@ describe("Phase 5 top-p sampling glossary page (phase-5-sampling-basics-decision html, "Top-k sampling would keep a fixed number of candidates even if the distribution became much sharper or flatter on the next step.", ); - expect(html).toContain('href="/docs/glossary/sampling-overview"'); + expect(html).toContain('href="/docs/concepts/sampling-overview"'); expect(html).toContain('href="/docs/glossary/greedy-decoding"'); expect(html).toContain('href="/docs/glossary/top-k-sampling"'); expect(html).toContain('href="/docs/glossary/temperature"'); From e96aaa24beaca362f09d7903434c3cc12c71b897 Mon Sep 17 00:00:00 2001 From: aabdi Date: Mon, 22 Jun 2026 13:32:14 +0700 Subject: [PATCH 3/5] feat: [sampling-overview-concept-page-003] - [Connect the overview page to the generation reader path] --- src/content/registry/concepts/decode.json | 1 + src/content/registry/papers/gpt-2-report.json | 3 ++- src/lib/content/decode-glossary.test.ts | 9 +++++++++ src/lib/content/gpt-2-report-paper-page.test.tsx | 1 + src/lib/content/gpt-2-report-paper-record.test.ts | 2 ++ src/lib/source.test.ts | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/content/registry/concepts/decode.json b/src/content/registry/concepts/decode.json index e68c8246..421cad19 100644 --- a/src/content/registry/concepts/decode.json +++ b/src/content/registry/concepts/decode.json @@ -18,6 +18,7 @@ "concept.prefill-decode-split", "system.batching", "concept.autoregressive-generation", + "concept.sampling-overview", "module.attention", "module.multi-query-attention", "module.grouped-query-attention", diff --git a/src/content/registry/papers/gpt-2-report.json b/src/content/registry/papers/gpt-2-report.json index 84d82413..a98d6444 100644 --- a/src/content/registry/papers/gpt-2-report.json +++ b/src/content/registry/papers/gpt-2-report.json @@ -15,7 +15,8 @@ "relatedIds": [ "module.byte-level-tokenization", "concept.transformer-architecture", - "concept.scaling-law" + "concept.scaling-law", + "concept.sampling-overview" ], "citationIds": ["citation.gpt-2-report"], "status": "published", diff --git a/src/lib/content/decode-glossary.test.ts b/src/lib/content/decode-glossary.test.ts index 6af9138f..02f8f0c3 100644 --- a/src/lib/content/decode-glossary.test.ts +++ b/src/lib/content/decode-glossary.test.ts @@ -46,6 +46,7 @@ describe("Phase 5 decode glossary page (US-003)", () => { "concept.prefill-decode-split", "system.batching", "concept.autoregressive-generation", + "concept.sampling-overview", "module.attention", "module.multi-query-attention", "module.grouped-query-attention", @@ -82,6 +83,13 @@ describe("Phase 5 decode glossary page (US-003)", () => { item.href === "/docs/concepts/kv-cache", ), ).toBe(true); + expect( + items.some( + (item) => + item.registryId === "concept.sampling-overview" && + item.href === "/docs/concepts/sampling-overview", + ), + ).toBe(true); expect( items.some( (item) => @@ -153,6 +161,7 @@ describe("Phase 5 decode glossary page (US-003)", () => { expectHtmlToContainProse(html, "memory bandwidth"); expect(html).toContain('href="/docs/concepts/kv-cache"'); expect(html).toContain('href="/docs/concepts/prefill"'); + expect(html).toContain('href="/docs/concepts/sampling-overview"'); expect(html).toContain('href="/docs/glossary/prefill-decode-split"'); expect(html).toContain('href="/docs/glossary/autoregressive-generation"'); expect(html).toContain('href="/docs/modules/attention"'); diff --git a/src/lib/content/gpt-2-report-paper-page.test.tsx b/src/lib/content/gpt-2-report-paper-page.test.tsx index 307f84ec..17827f73 100644 --- a/src/lib/content/gpt-2-report-paper-page.test.tsx +++ b/src/lib/content/gpt-2-report-paper-page.test.tsx @@ -75,6 +75,7 @@ describe("GPT-2 report paper page", () => { expect(html).toContain("Byte-level BPE tokenization"); expect(html).toContain("Broad next-token pretraining"); expect(html).toContain('href="/docs/concepts/transformer-architecture"'); + expect(html).toContain('href="/docs/concepts/sampling-overview"'); expect(html).toContain('href="/docs/modules/byte-level-tokenization"'); expect(html).toContain('href="/docs/glossary/scaling-law"'); expect(html).toContain('data-testid="curated-related-docs"'); diff --git a/src/lib/content/gpt-2-report-paper-record.test.ts b/src/lib/content/gpt-2-report-paper-record.test.ts index 455f36b5..e728d7e5 100644 --- a/src/lib/content/gpt-2-report-paper-record.test.ts +++ b/src/lib/content/gpt-2-report-paper-record.test.ts @@ -13,6 +13,7 @@ const publishedRegistryIds = new Set([ "module.byte-level-tokenization", "concept.transformer-architecture", "concept.scaling-law", + "concept.sampling-overview", ]); function requirePaperRecord() { @@ -82,6 +83,7 @@ describe("gpt-2 report paper registry record", () => { "module.byte-level-tokenization", "concept.transformer-architecture", "concept.scaling-law", + "concept.sampling-overview", ]); }); }); diff --git a/src/lib/source.test.ts b/src/lib/source.test.ts index a962599c..cc7fba7d 100644 --- a/src/lib/source.test.ts +++ b/src/lib/source.test.ts @@ -112,6 +112,7 @@ const CONCEPT_INDEX_URLS = [ "/docs/concepts/prefill", "/docs/concepts/page-spec-workflow-sample", "/docs/concepts/positional-encodings", + "/docs/concepts/sampling-overview", "/docs/concepts/transformer-architecture", "/docs/concepts/why-long-context-is-hard", ] as const; From 164fd2b2d84aceca3fada50e075a1789dfe38016 Mon Sep 17 00:00:00 2001 From: aabdi Date: Mon, 22 Jun 2026 13:37:35 +0700 Subject: [PATCH 4/5] feat: [sampling-overview-concept-page-004] - [Add focused validation for the new concept route and touched discovery surfaces] --- ...mpling-overview-concept-validation.test.ts | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/lib/content/sampling-overview-concept-validation.test.ts diff --git a/src/lib/content/sampling-overview-concept-validation.test.ts b/src/lib/content/sampling-overview-concept-validation.test.ts new file mode 100644 index 00000000..492b8928 --- /dev/null +++ b/src/lib/content/sampling-overview-concept-validation.test.ts @@ -0,0 +1,71 @@ +import { describe, expect, test } from "bun:test"; +import { loadConceptPage } from "@/lib/content/concept-page"; +import { + CONCEPTS_DOCS_ROOT, + getDocsPageDir, + SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR, +} from "@/lib/content/content-paths"; +import { loadPublishedDocsPages } from "@/lib/content/pages"; +import { + getPublishedDocsEntriesBySlug, + getPublishedDocsEntryByRegistryId, +} from "@/lib/content/published-docs-registry-ids"; +import { loadSearchResultMetaMap } from "@/lib/search/search-result-meta"; + +describe("sampling overview canonical validation (sampling-overview-concept-page-004)", () => { + test("canonical concept path helpers resolve the sampling overview page directory", () => { + expect(getDocsPageDir("concepts", "sampling-overview")).toBe( + SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR, + ); + expect(SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR).toBe( + `${CONCEPTS_DOCS_ROOT}/sampling-overview`, + ); + }); + + test("published docs registry keeps one shared record while preferring the canonical concept route", () => { + const entries = getPublishedDocsEntriesBySlug("sampling-overview"); + + expect(entries).toHaveLength(2); + expect(entries.map((entry) => entry.docsSlug).sort()).toEqual([ + "concepts/sampling-overview", + "glossary/sampling-overview", + ]); + + const canonicalEntry = getPublishedDocsEntryByRegistryId( + "concept.sampling-overview", + ); + expect(canonicalEntry?.docsSlug).toBe("concepts/sampling-overview"); + expect(canonicalEntry?.url).toBe("/docs/concepts/sampling-overview"); + expect(canonicalEntry?.pageKind).toBe("concept"); + }); + + test("default English page loading and search metadata expose the canonical route", async () => { + const page = await loadConceptPage("sampling-overview"); + const pages = await loadPublishedDocsPages("en"); + const searchMeta = await loadSearchResultMetaMap("en"); + + expect(page.frontmatter.registryId).toBe("concept.sampling-overview"); + expect(page.messages.title).toBe("Sampling Overview"); + + expect( + pages.some( + (entry) => + entry.docsSlug === "concepts/sampling-overview" && + entry.url === "/docs/concepts/sampling-overview" && + entry.frontmatter.registryId === "concept.sampling-overview", + ), + ).toBe(true); + + const meta = searchMeta.get("/docs/concepts/sampling-overview"); + expect(meta?.title).toBe("Sampling Overview"); + expect(meta?.kind).toBe("concept"); + expect(meta?.aliases).toEqual( + expect.arrayContaining([ + "sampling overview", + "token sampling", + "next-token sampling", + "sampling basics", + ]), + ); + }); +}); From 8858bd2a8bbab2a696a0b195f5772abcd8a767b6 Mon Sep 17 00:00:00 2001 From: aabdi Date: Mon, 22 Jun 2026 13:58:30 +0700 Subject: [PATCH 5/5] feat: sampling-overview-concept-page-004 - Add focused validation for the new concept route and touched discovery surfaces --- .../sampling-overview/messages/en.json | 4 ++-- ...mpling-overview-concept-validation.test.ts | 24 +++++++------------ src/lib/source.test.ts | 1 - 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/content/docs/concepts/sampling-overview/messages/en.json b/src/content/docs/concepts/sampling-overview/messages/en.json index 6588b194..1c77f7e5 100644 --- a/src/content/docs/concepts/sampling-overview/messages/en.json +++ b/src/content/docs/concepts/sampling-overview/messages/en.json @@ -17,11 +17,11 @@ }, "readerPath": { "title": "Where To Go Next", - "body": "Use this page as the broad bridge through the generation path. Autoregressive generation and decode explain where the choice happens in the full loop. Temperature explains how probabilities can be reshaped before selection. Greedy decoding, top-k sampling, and top-p sampling go deeper into specific token-choice rules, while GPT-2 is a nearby model page that places the sampling step inside a familiar decoder-only system." + "body": "Autoregressive generation and decode explain where this choice happens in the full loop. Temperature explains how probabilities can be reshaped before selection. Greedy decoding, top-k sampling, and top-p sampling each apply a different token-choice rule, while GPT-2 shows the same step inside a familiar decoder-only system." }, "commonConfusions": { "title": "Common Confusions", - "body": "Sampling is not the same as temperature. Temperature changes the shape of the probability distribution before a token is chosen, while sampling rules decide how to choose after probabilities exist. Sampling also does not mean pure randomness: greedy decoding is a sampling rule too, but it is deterministic. This overview is broader than the deeper pages for temperature, greedy decoding, top-k sampling, or top-p sampling; those pages focus on one control or method, while this page explains the overall choice step they all belong to." + "body": "Sampling is not the same as temperature. Temperature changes the shape of the probability distribution before a token is chosen, while sampling rules decide how to choose after probabilities exist. Sampling also does not mean pure randomness: greedy decoding is a sampling rule too, but it is deterministic. Temperature, greedy decoding, top-k sampling, and top-p sampling each isolate one control or method inside the larger next-token choice step that sampling describes." }, "related": { "title": "Related Concepts And Modules" diff --git a/src/lib/content/sampling-overview-concept-validation.test.ts b/src/lib/content/sampling-overview-concept-validation.test.ts index 492b8928..22e5c358 100644 --- a/src/lib/content/sampling-overview-concept-validation.test.ts +++ b/src/lib/content/sampling-overview-concept-validation.test.ts @@ -1,27 +1,15 @@ import { describe, expect, test } from "bun:test"; import { loadConceptPage } from "@/lib/content/concept-page"; -import { - CONCEPTS_DOCS_ROOT, - getDocsPageDir, - SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR, -} from "@/lib/content/content-paths"; import { loadPublishedDocsPages } from "@/lib/content/pages"; import { getPublishedDocsEntriesBySlug, getPublishedDocsEntryByRegistryId, } from "@/lib/content/published-docs-registry-ids"; import { loadSearchResultMetaMap } from "@/lib/search/search-result-meta"; +import { docsSearchApi } from "@/lib/search/search-server"; +import { source } from "@/lib/source"; describe("sampling overview canonical validation (sampling-overview-concept-page-004)", () => { - test("canonical concept path helpers resolve the sampling overview page directory", () => { - expect(getDocsPageDir("concepts", "sampling-overview")).toBe( - SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR, - ); - expect(SAMPLING_OVERVIEW_CONCEPT_PAGE_DIR).toBe( - `${CONCEPTS_DOCS_ROOT}/sampling-overview`, - ); - }); - test("published docs registry keeps one shared record while preferring the canonical concept route", () => { const entries = getPublishedDocsEntriesBySlug("sampling-overview"); @@ -39,10 +27,12 @@ describe("sampling overview canonical validation (sampling-overview-concept-page expect(canonicalEntry?.pageKind).toBe("concept"); }); - test("default English page loading and search metadata expose the canonical route", async () => { + test("default English page loading, sidebar discovery, and search expose the canonical route", async () => { const page = await loadConceptPage("sampling-overview"); const pages = await loadPublishedDocsPages("en"); const searchMeta = await loadSearchResultMetaMap("en"); + const searchResults = await docsSearchApi.search("sampling basics"); + const sidebarEntry = source.getPage(["concepts", "sampling-overview"]); expect(page.frontmatter.registryId).toBe("concept.sampling-overview"); expect(page.messages.title).toBe("Sampling Overview"); @@ -56,6 +46,8 @@ describe("sampling overview canonical validation (sampling-overview-concept-page ), ).toBe(true); + expect(sidebarEntry?.url).toBe("/docs/concepts/sampling-overview"); + const meta = searchMeta.get("/docs/concepts/sampling-overview"); expect(meta?.title).toBe("Sampling Overview"); expect(meta?.kind).toBe("concept"); @@ -67,5 +59,7 @@ describe("sampling overview canonical validation (sampling-overview-concept-page "sampling basics", ]), ); + + expect(searchResults[0]?.url).toBe("/docs/concepts/sampling-overview"); }); }); diff --git a/src/lib/source.test.ts b/src/lib/source.test.ts index cc7fba7d..a962599c 100644 --- a/src/lib/source.test.ts +++ b/src/lib/source.test.ts @@ -112,7 +112,6 @@ const CONCEPT_INDEX_URLS = [ "/docs/concepts/prefill", "/docs/concepts/page-spec-workflow-sample", "/docs/concepts/positional-encodings", - "/docs/concepts/sampling-overview", "/docs/concepts/transformer-architecture", "/docs/concepts/why-long-context-is-hard", ] as const;