diff --git a/src/app/docs/docs-slug-renderer.test.tsx b/src/app/docs/docs-slug-renderer.test.tsx index 448a2f14..28c31774 100644 --- a/src/app/docs/docs-slug-renderer.test.tsx +++ b/src/app/docs/docs-slug-renderer.test.tsx @@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test"; import { buildDocsPageMetadata, renderDocsSlugPage, + resolveLocalDocsShellDescription, } from "@/app/docs/docs-slug-renderer"; describe("docs slug renderer locale gating", () => { @@ -243,4 +244,21 @@ describe("docs slug renderer locale gating", () => { ); } }); + + test("local non-glossary docs shell prefers openingSummary when present", () => { + const shellDescription = resolveLocalDocsShellDescription({ + description: + "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.", + openingSummary: + "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.", + section: "training", + }); + + expect(shellDescription).toContain( + "Group Relative Policy Optimization, usually shortened to GRPO", + ); + expect(shellDescription).not.toContain( + "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.", + ); + }); }); diff --git a/src/app/docs/docs-slug-renderer.tsx b/src/app/docs/docs-slug-renderer.tsx index 437a0027..9b9678bd 100644 --- a/src/app/docs/docs-slug-renderer.tsx +++ b/src/app/docs/docs-slug-renderer.tsx @@ -20,6 +20,19 @@ import { localizedRouteAlternates } from "@/lib/i18n/route-locale"; import { source } from "@/lib/source"; import { getMDXComponents } from "../../../mdx-components"; +export function resolveLocalDocsShellDescription(options: { + description: string; + openingSummary?: string; + section: string; +}) { + const { description, openingSummary, section } = options; + if (section === "glossary") { + return description; + } + + return openingSummary ?? description; +} + function buildDocsPageAlternates(docsSlug: string) { const alternates = localizedRouteAlternates({ surface: "docs-page", @@ -53,11 +66,16 @@ async function renderLocalDocsPage( const loadedPage = await loadLocalDocsPage(localRef, locale); const uiMessages = await loadUiMessages(locale); + const shellDescriptionText = resolveLocalDocsShellDescription({ + description: loadedPage.messages.description, + openingSummary: loadedPage.messages.openingSummary, + section: localRef.section, + }); const description = localRef.section === "glossary" ? ( ) : ( - loadedPage.messages.description + shellDescriptionText ); return ( diff --git a/src/content/docs/training/grpo/assets.json b/src/content/docs/training/grpo/assets.json new file mode 100644 index 00000000..04ee7b59 --- /dev/null +++ b/src/content/docs/training/grpo/assets.json @@ -0,0 +1,10 @@ +{ + "trainingFlow": { + "type": "graph", + "graphId": "graph.grpo-training-flow", + "webRenderer": "react-flow", + "printRenderer": "vertical-svg", + "altKey": "assets.trainingFlow.alt", + "captionKey": "assets.trainingFlow.caption" + } +} diff --git a/src/content/docs/training/grpo/messages/en.json b/src/content/docs/training/grpo/messages/en.json new file mode 100644 index 00000000..7998b9c8 --- /dev/null +++ b/src/content/docs/training/grpo/messages/en.json @@ -0,0 +1,94 @@ +{ + "title": "Group Relative Policy Optimization", + "description": "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.", + "openingSummary": "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.", + "sections": { + "whatItIs": { + "title": "What It Is", + "body": "Group Relative Policy Optimization is a policy-update method used after pretraining or supervised fine-tuning. Instead of judging one answer in isolation, it samples a small group of candidate answers for the same prompt and uses their relative ranking to decide which behaviors should be reinforced." + }, + "whyItExists": { + "title": "Why It Exists", + "body": "A single reward score can be noisy, and PPO-style language-model training often adds the extra cost of a learned critic to estimate a baseline. GRPO tries to keep the signal useful while simplifying the loop: the group itself supplies the local baseline, so the model can learn from which samples look better than their neighbors." + }, + "howItWorks": { + "title": "How It Works", + "body": "For one prompt, the policy generates several completions, a reward function scores them, and those scores are normalized within that group. Answers above the group average get a positive learning signal, answers below it get a negative one, and the policy is updated so future samples are more likely to resemble the stronger members of the group." + }, + "comparedToNearbyRegimes": { + "title": "Compared To Nearby Regimes", + "body": "Group Relative Policy Optimization still sits inside the broader alignment family, but it is narrower than Reinforcement Learning from Human Feedback as a full pipeline. Reinforcement Learning from Human Feedback often means collecting preference data, training a reward model, and then running a reinforcement-learning update such as Proximal Policy Optimization. GRPO keeps the reinforcement-learning loop, but it replaces the learned critic-style baseline with relative ranking inside one sampled group. Direct Preference Optimization moves in a different direction: it stays closer to supervised-style optimization on chosen-versus-rejected pairs, while GRPO uses rewards over several sampled answers and updates the policy from that within-group ordering instead of from one pairwise objective alone." + }, + "limitationsAndFailureModes": { + "title": "Limitations And Failure Modes", + "body": "The method still depends on reward quality. If the reward function prefers shallow tricks, the whole group can drift in the wrong direction together. Relative scoring also means a weak group can still produce a misleading winner if every sampled answer is bad." + }, + "related": { + "title": "Related To" + }, + "tags": { + "title": "Tags" + }, + "references": { + "title": "References" + } + }, + "callouts": { + "trainingFlowGraph": { + "title": "GRPO training flow", + "body": "A visual walkthrough of one prompt, grouped sampling, relative scoring, and the policy update." + }, + "trainingFlowLegend": { + "title": "Graph legend", + "body": "How to read each stage of the GRPO training flow." + } + }, + "links": { + "trainingFlowLegendPrompt": "One prompt anchors the whole local comparison.", + "trainingFlowLegendSampling": "The policy samples several candidate answers for that same prompt.", + "trainingFlowLegendRelativeScoring": "Those answers are scored relative to each other inside the sampled group.", + "trainingFlowLegendPolicyUpdate": "The policy update reinforces answers that beat the group baseline and discourages weaker ones." + }, + "assets": { + "trainingFlow": { + "alt": "A training flow from one prompt to a group of sampled answers, then to relative scoring inside the group, and finally to a policy update.", + "caption": "GRPO learns from which answers win inside each sampled group instead of relying on one separate critic-estimated baseline." + } + }, + "math": { + "grpoAdvantage": { + "label": "Grouped relative advantage sketch", + "formula": "A_i = \\frac{r_i - \\operatorname{mean}(r_{1:G})}{\\operatorname{std}(r_{1:G})}", + "variableDefinitions": { + "advantage": { + "term": "A_i", + "definition": "normalized advantage for sampled answer i" + }, + "reward": { + "term": "r_i", + "definition": "reward score assigned to sampled answer i" + }, + "groupSize": { + "term": "G", + "definition": "number of sampled answers in the comparison group" + } + } + } + }, + "graph": { + "nodes": { + "prompt": { + "label": "One prompt" + }, + "sampleGroup": { + "label": "Sample a group of answers" + }, + "relativeScore": { + "label": "Score answers relative to the group" + }, + "policyUpdate": { + "label": "Update the policy" + } + } + } +} diff --git a/src/content/docs/training/grpo/page.mdx b/src/content/docs/training/grpo/page.mdx new file mode 100644 index 00000000..e75d83ff --- /dev/null +++ b/src/content/docs/training/grpo/page.mdx @@ -0,0 +1,124 @@ +--- +title: "Group Relative Policy Optimization" +description: "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality." +kind: "training-regime" +registryId: "training-regime.grpo" +messageNamespace: "local" +assetNamespace: "local" +status: "published" +tags: + - foundations +aliases: + - "GRPO" + - "Group Relative Preference Optimization" +updatedAt: "2026-06-19" +--- + +import { CitationList } from "@/features/docs/components/CitationList"; +import { BlockMath } from "@/features/docs/components/Math"; +import { RelatedDocs } from "@/features/docs/components/RelatedDocs"; +import { Section } from "@/features/docs/components/Section"; +import { T } from "@/features/docs/components/T"; +import { TagPillList } from "@/features/docs/components/TagPillList"; +import { TrainingRegimeAtAGlance } from "@/features/models/components/TrainingRegimeAtAGlance"; +import { TrainingRegimeFlow } from "@/features/models/components/TrainingRegimeFlow"; + + + + +
+ +
+ +
+ +
+ +
+ +

+ +

+ +
+

+ +

+
    +
  • + +
  • +
  • + +
  • +
  • + +
  • +
  • + +
  • +
+
+ +
+ +
+ + + +
+ +
+ +
+ + + +
+ +
+ +
+ +
diff --git a/src/content/registry/citations/deepseek-r1-paper.json b/src/content/registry/citations/deepseek-r1-paper.json new file mode 100644 index 00000000..d8c810d7 --- /dev/null +++ b/src/content/registry/citations/deepseek-r1-paper.json @@ -0,0 +1,20 @@ +{ + "id": "citation.deepseek-r1-paper", + "slug": "deepseek-r1-paper", + "kind": "citation", + "defaultTitleKey": "title", + "defaultSummaryKey": "summary", + "aliases": ["DeepSeek-R1 paper"], + "tags": ["foundations"], + "relatedIds": [], + "citationIds": [], + "status": "published", + "createdAt": "2026-06-19T00:00:00.000Z", + "updatedAt": "2026-06-19T00:00:00.000Z", + "citationType": "paper", + "authors": ["DeepSeek-AI"], + "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", + "year": 2025, + "url": "https://arxiv.org/abs/2501.12948", + "mla": "DeepSeek-AI. \"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.\" arXiv, 2025, https://arxiv.org/abs/2501.12948." +} diff --git a/src/content/registry/citations/deepseekmath-paper.json b/src/content/registry/citations/deepseekmath-paper.json new file mode 100644 index 00000000..c4a55291 --- /dev/null +++ b/src/content/registry/citations/deepseekmath-paper.json @@ -0,0 +1,32 @@ +{ + "id": "citation.deepseekmath-paper", + "slug": "deepseekmath-paper", + "kind": "citation", + "defaultTitleKey": "title", + "defaultSummaryKey": "summary", + "aliases": ["DeepSeekMath paper", "GRPO paper"], + "tags": ["foundations"], + "relatedIds": [], + "citationIds": [], + "status": "published", + "createdAt": "2026-06-19T00:00:00.000Z", + "updatedAt": "2026-06-19T00:00:00.000Z", + "citationType": "paper", + "authors": [ + "Zhihong Shao", + "Peiyi Wang", + "Qihao Zhu", + "Runxin Xu", + "Junxiao Song", + "Xiao Bi", + "Haowei Zhang", + "Mingchuan Zhang", + "Y. K. Li", + "Yu Wu", + "Daya Guo" + ], + "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + "year": 2024, + "url": "https://arxiv.org/abs/2402.03300", + "mla": "Shao, Zhihong, et al. \"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models.\" arXiv, 2024, https://arxiv.org/abs/2402.03300." +} diff --git a/src/content/registry/concepts/alignment.json b/src/content/registry/concepts/alignment.json index ace37f0f..6fcafc7d 100644 --- a/src/content/registry/concepts/alignment.json +++ b/src/content/registry/concepts/alignment.json @@ -7,6 +7,7 @@ "aliases": ["Alignment", "RLHF", "preference alignment", "safety alignment"], "tags": ["foundations", "taxonomy"], "relatedIds": [ + "training-regime.grpo", "concept.model-capacity", "concept.overfitting", "concept.generalization", diff --git a/src/content/registry/graphs/grpo-training-flow.json b/src/content/registry/graphs/grpo-training-flow.json new file mode 100644 index 00000000..b5164d02 --- /dev/null +++ b/src/content/registry/graphs/grpo-training-flow.json @@ -0,0 +1,84 @@ +{ + "id": "graph.grpo-training-flow", + "slug": "grpo-training-flow", + "kind": "graph", + "defaultTitleKey": "title", + "defaultSummaryKey": "description", + "aliases": ["GRPO training flow"], + "tags": ["foundations"], + "relatedIds": [], + "citationIds": ["citation.deepseekmath-paper"], + "status": "published", + "createdAt": "2026-06-19T00:00:00.000Z", + "updatedAt": "2026-06-19T00:00:00.000Z", + "subjectId": "training-regime.grpo", + "graphType": "recursive-module-graph", + "rootNodeId": "prompt", + "layout": "vertical-expandable", + "defaultExpandedDepth": 1, + "supportedRenderers": ["react-flow", "vertical-svg"], + "nodes": [ + { + "id": "prompt", + "labelKey": "graph.nodes.prompt.label", + "moduleKind": "input", + "position": { "x": 250, "y": 0 }, + "size": { "width": 220, "height": 70 }, + "visualRole": "process-node", + "childNodeIds": ["sample-group"] + }, + { + "id": "sample-group", + "labelKey": "graph.nodes.sampleGroup.label", + "moduleKind": "block", + "position": { "x": 250, "y": 110 }, + "size": { "width": 220, "height": 80 }, + "visualRole": "summary-node", + "childNodeIds": ["relative-score"] + }, + { + "id": "relative-score", + "labelKey": "graph.nodes.relativeScore.label", + "moduleKind": "block", + "position": { "x": 250, "y": 240 }, + "size": { "width": 220, "height": 80 }, + "visualRole": "summary-node", + "childNodeIds": ["policy-update"] + }, + { + "id": "policy-update", + "labelKey": "graph.nodes.policyUpdate.label", + "moduleKind": "output", + "position": { "x": 250, "y": 370 }, + "size": { "width": 220, "height": 70 }, + "visualRole": "process-node", + "childNodeIds": [] + } + ], + "edges": [ + { + "id": "prompt-sample-group", + "source": "prompt", + "target": "sample-group", + "edgeKind": "data-flow", + "sourceHandleSide": "bottom", + "targetHandleSide": "top" + }, + { + "id": "sample-group-relative-score", + "source": "sample-group", + "target": "relative-score", + "edgeKind": "data-flow", + "sourceHandleSide": "bottom", + "targetHandleSide": "top" + }, + { + "id": "relative-score-policy-update", + "source": "relative-score", + "target": "policy-update", + "edgeKind": "data-flow", + "sourceHandleSide": "bottom", + "targetHandleSide": "top" + } + ] +} diff --git a/src/content/registry/training-regimes/grpo.json b/src/content/registry/training-regimes/grpo.json new file mode 100644 index 00000000..4da4b3f0 --- /dev/null +++ b/src/content/registry/training-regimes/grpo.json @@ -0,0 +1,41 @@ +{ + "id": "training-regime.grpo", + "slug": "grpo", + "kind": "training-regime", + "defaultTitleKey": "title", + "defaultSummaryKey": "description", + "aliases": [ + "GRPO", + "group relative policy optimization", + "group relative preference optimization", + "group-relative policy optimization", + "group-relative preference optimization" + ], + "tags": ["foundations"], + "relatedIds": ["concept.alignment"], + "citationIds": ["citation.deepseekmath-paper", "citation.deepseek-r1-paper"], + "status": "published", + "createdAt": "2026-06-19T00:00:00.000Z", + "updatedAt": "2026-06-19T00:00:00.000Z", + "releaseDate": "2024-02-05", + "authors": [ + "Zhihong Shao", + "Peiyi Wang", + "Qihao Zhu", + "Runxin Xu", + "Junxiao Song", + "Xiao Bi", + "Haowei Zhang", + "Mingchuan Zhang", + "Y. K. Li", + "Yu Wu", + "Daya Guo" + ], + "sourceId": "citation.deepseekmath-paper", + "regimeType": "optimization", + "usedByModelIds": [], + "relatedModuleIds": [], + "paperIds": [], + "conceptType": "training", + "variantGroup": "group-relative-reinforcement-learning" +} diff --git a/src/features/docs/components/Math.tsx b/src/features/docs/components/Math.tsx index 91335658..9ea4ae4c 100644 --- a/src/features/docs/components/Math.tsx +++ b/src/features/docs/components/Math.tsx @@ -1,11 +1,19 @@ +"use client"; + import katex from "katex"; +import { MissingMessageKey } from "@/features/docs/components/MissingMessageKey"; +import { ProseAutoLinkText } from "@/features/docs/components/ProseAutoLinkText"; +import { useOptionalPageMessagesContext } from "@/features/docs/components/page-messages-context"; +import { lookupMessage } from "@/lib/content/messages"; type MathProps = { - formula: string; + formula?: string; + label?: string; + mathId?: string; }; export function InlineMath({ formula }: MathProps) { - const html = katex.renderToString(formula, { + const html = katex.renderToString(formula ?? "", { throwOnError: false, displayMode: false, }); @@ -19,20 +27,108 @@ export function InlineMath({ formula }: MathProps) { ); } -export function BlockMath({ formula }: MathProps) { - const html = katex.renderToString(formula, { +function MathVariableDefinitions({ mathId }: { mathId: string }) { + const context = useOptionalPageMessagesContext(); + + if (!context) { + return null; + } + + const { messages, isDev } = context; + const definitionsKey = `math.${mathId}.variableDefinitions`; + const definitions = messages.math?.[mathId]?.variableDefinitions; + + if (!definitions || Object.keys(definitions).length === 0) { + if (isDev) { + return ; + } + return null; + } + + return ( +
+
+ {Object.entries(definitions).map(([id, row]) => ( +
+
+ +
+
+ +
+
+ ))} +
+
+ ); +} + +export function BlockMath({ formula, label, mathId }: MathProps) { + const context = useOptionalPageMessagesContext(); + const resolvedFormula = + mathId && context + ? lookupMessage(context.messages, `math.${mathId}.formula`) + : null; + const resolvedLabel = + mathId && context + ? lookupMessage(context.messages, `math.${mathId}.label`) + : null; + + if (mathId && context?.isDev) { + if (!resolvedFormula?.ok) { + return ( + + ); + } + + if (!resolvedLabel?.ok) { + return ( + + ); + } + } + + const displayFormula = resolvedFormula?.ok ? resolvedFormula.value : formula; + + if (!displayFormula) { + return null; + } + const displayLabel = resolvedLabel?.ok ? resolvedLabel.value : label; + const html = katex.renderToString(displayFormula, { throwOnError: false, displayMode: true, }); return ( -
+
+ {displayLabel ? ( +

+ {displayLabel} +

+ ) : null} +
+ {mathId ? : null} +
); } diff --git a/src/lib/content/graph-registry-runtime.generated.ts b/src/lib/content/graph-registry-runtime.generated.ts index f003fadb..18313738 100644 --- a/src/lib/content/graph-registry-runtime.generated.ts +++ b/src/lib/content/graph-registry-runtime.generated.ts @@ -21,6 +21,7 @@ import groupedQueryAttentionComputeFlowGraphRecord from "@/content/registry/grap import groupedQueryAttentionComputeSchemaGraphRecord from "@/content/registry/graphs/grouped-query-attention-compute-schema.json"; import groupedQueryAttentionGqaComparisonGraphRecord from "@/content/registry/graphs/grouped-query-attention-gqa-comparison.json"; import groupedQueryAttentionMhaComparisonGraphRecord from "@/content/registry/graphs/grouped-query-attention-mha-comparison.json"; +import grpoTrainingFlowGraphRecord from "@/content/registry/graphs/grpo-training-flow.json"; import heavilyCompressedAttentionFlowGraphRecord from "@/content/registry/graphs/heavily-compressed-attention-flow.json"; import layerNormComputeFlowGraphRecord from "@/content/registry/graphs/layer-norm-compute-flow.json"; import leakyReluActivationFlowGraphRecord from "@/content/registry/graphs/leaky-relu-activation-flow.json"; @@ -73,6 +74,7 @@ export const graphRecords: GraphRecord[] = [ graphRecordSchema.parse(groupedQueryAttentionComputeSchemaGraphRecord), graphRecordSchema.parse(groupedQueryAttentionGqaComparisonGraphRecord), graphRecordSchema.parse(groupedQueryAttentionMhaComparisonGraphRecord), + graphRecordSchema.parse(grpoTrainingFlowGraphRecord), graphRecordSchema.parse(heavilyCompressedAttentionFlowGraphRecord), graphRecordSchema.parse(layerNormComputeFlowGraphRecord), graphRecordSchema.parse(leakyReluActivationFlowGraphRecord), diff --git a/src/lib/content/graph-registry-runtime.test.ts b/src/lib/content/graph-registry-runtime.test.ts index 97a26ab1..97ac587d 100644 --- a/src/lib/content/graph-registry-runtime.test.ts +++ b/src/lib/content/graph-registry-runtime.test.ts @@ -161,10 +161,13 @@ describe("graph-registry-runtime", () => { test("lists all bundled graph records", () => { const records = listGraphRecords(); - expect(records.length).toBe(49); + expect(records.length).toBe(50); expect(records.map((record) => record.id)).toContain( "graph.bpe-compute-flow", ); + expect(records.map((record) => record.id)).toContain( + "graph.grpo-training-flow", + ); expect(records.map((record) => record.id)).toContain( "graph.sentencepiece-compute-flow", ); diff --git a/src/lib/content/grpo-training-regime-comparisons.test.tsx b/src/lib/content/grpo-training-regime-comparisons.test.tsx new file mode 100644 index 00000000..b3b3a490 --- /dev/null +++ b/src/lib/content/grpo-training-regime-comparisons.test.tsx @@ -0,0 +1,67 @@ +import { describe, expect, test } from "bun:test"; +import { createElement } from "react"; +import { renderToReadableStream } from "react-dom/server"; +import { ModulePageProviders } from "@/features/docs/components/ModulePageProviders"; +import { loadTrainingRegimePageFromDisk } from "@/lib/content/training-regime-page-load"; + +describe("grpo training regime comparisons", () => { + test("page explains nearby alignment methods and renders stable reader links", async () => { + const page = await loadTrainingRegimePageFromDisk("grpo"); + const stream = await renderToReadableStream( + createElement( + ModulePageProviders, + { + messages: page.messages, + assets: page.assets, + }, + page.content, + ), + ); + await stream.allReady; + const html = await new Response(stream).text(); + const normalizedHtml = html.toLowerCase(); + + expect(page.frontmatter.registryId).toBe("training-regime.grpo"); + expect(normalizedHtml).toContain( + "reinforcement learning from human feedback", + ); + expect(html).toContain("Proximal Policy Optimization"); + expect(html).toContain("Direct Preference Optimization"); + expect(html).toContain("pairwise objective"); + expect(html).toContain("relative ranking inside one sampled group"); + expect(html).toContain(">Alignment<"); + expect(html).toContain(">RLHF<"); + expect(html).toContain(">PPO<"); + expect(html).toContain(">DPO<"); + expect(html).toContain('href="/docs/glossary/alignment"'); + expect(html).toContain('href="/search?q=ppo"'); + expect(html).toContain('href="/search?q=dpo"'); + }); + + test("page renders the graph title, legend, and symbol-only math definitions for the GRPO loop", async () => { + const page = await loadTrainingRegimePageFromDisk("grpo"); + const stream = await renderToReadableStream( + createElement( + ModulePageProviders, + { + messages: page.messages, + assets: page.assets, + }, + page.content, + ), + ); + await stream.allReady; + const html = await new Response(stream).text(); + + expect(html).toContain("GRPO training flow"); + expect(html).toContain("Graph legend"); + expect(html).toContain("One prompt anchors the whole local comparison."); + expect(html).toContain('data-page-math-formula="grpoAdvantage"'); + expect(html).toContain( + 'data-page-math-variable-definitions="grpoAdvantage"', + ); + expect(html).toContain("normalized advantage for sampled answer i"); + expect(html).toContain("reward score assigned to sampled answer i"); + expect(html).toContain("number of sampled answers in the comparison group"); + }); +}); diff --git a/src/lib/content/grpo-training-regime-contract.test.ts b/src/lib/content/grpo-training-regime-contract.test.ts new file mode 100644 index 00000000..2ec35f73 --- /dev/null +++ b/src/lib/content/grpo-training-regime-contract.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, test } from "bun:test"; +import { loadLocalDocsPage } from "@/lib/content/local-docs-page"; +import { getTrainingRegimeById } from "@/lib/content/registry-runtime.generated"; +import { docsSearchApi } from "@/lib/search/search-server"; +import { source } from "@/lib/source"; + +describe("grpo training regime contract", () => { + test("canonical route, localized content, registry metadata, and discovery query resolve together", async () => { + const [page, searchResults] = await Promise.all([ + loadLocalDocsPage({ + section: "training", + slug: "grpo", + }), + docsSearchApi.search("group relative preference optimization"), + ]); + + const route = source.getPage(["training", "grpo"]); + const record = getTrainingRegimeById("training-regime.grpo"); + + expect(route?.url).toBe("/docs/training/grpo"); + expect(record).toBeDefined(); + if (!record) { + throw new Error("Expected training-regime.grpo registry record to exist"); + } + expect(page.frontmatter.registryId).toBe(record.id); + expect(page.messages.title).toBe("Group Relative Policy Optimization"); + expect(page.messages.openingSummary).toContain( + "samples several answers to one prompt", + ); + expect(page.messages.sections?.howItWorks?.body).toContain( + "normalized within that group", + ); + + expect(record.kind).toBe("training-regime"); + expect(record.slug).toBe("grpo"); + expect(record.aliases).toEqual( + expect.arrayContaining([ + "GRPO", + "group relative preference optimization", + ]), + ); + expect(record.relatedIds).toContain("concept.alignment"); + expect(record.variantGroup).toBe("group-relative-reinforcement-learning"); + + expect(searchResults[0]?.url).toBe("/docs/training/grpo"); + }); +}); diff --git a/src/lib/content/local-docs-page.test.ts b/src/lib/content/local-docs-page.test.ts index d945b054..2c8468f3 100644 --- a/src/lib/content/local-docs-page.test.ts +++ b/src/lib/content/local-docs-page.test.ts @@ -162,6 +162,18 @@ describe("docs source local pages", () => { expect(page.toc.some(hasTocUrl("#what-it-is"))).toBe(true); }); + test("loadLocalDocsPage resolves the canonical GRPO training page through the shared route contract", async () => { + const page = await loadLocalDocsPage({ + section: "training", + slug: "grpo", + }); + + expect(page.messages.title).toBe("Group Relative Policy Optimization"); + expect(page.messages.sections?.howItWorks?.title).toBe("How It Works"); + expect(page.frontmatter.registryId).toBe("training-regime.grpo"); + expect(page.toc.some(hasTocUrl("#what-it-is"))).toBe(true); + }); + test("loadLocalDocsPage resolves shipped vietnamese canonical page messages without changing the shared MDX route contract", async () => { const page = await loadLocalDocsPage( { diff --git a/src/lib/content/published-docs-registry-manifest.ts b/src/lib/content/published-docs-registry-manifest.ts index b462163e..f2b527f9 100644 --- a/src/lib/content/published-docs-registry-manifest.ts +++ b/src/lib/content/published-docs-registry-manifest.ts @@ -34,6 +34,14 @@ export const GENERATED_PUBLISHED_DOCS_ENTRIES = [ pageKind: "training-regime", section: "training", }, + { + registryId: "training-regime.grpo", + slug: "grpo", + docsSlug: "training/grpo", + url: "/docs/training/grpo", + pageKind: "training-regime", + section: "training", + }, { registryId: "training-regime.on-policy-distillation", slug: "on-policy-distillation", @@ -1161,6 +1169,7 @@ export const GENERATED_PUBLISHED_DOCS_REGISTRY_IDS = [ "system.routing", "training-regime.dpo", "training-regime.fp4-quantization-aware-training", + "training-regime.grpo", "training-regime.on-policy-distillation", "training-regime.specialist-training", ] as const; diff --git a/src/lib/content/published-docs-routing-contract.test.ts b/src/lib/content/published-docs-routing-contract.test.ts index bb7ef5a7..a8aaf7d0 100644 --- a/src/lib/content/published-docs-routing-contract.test.ts +++ b/src/lib/content/published-docs-routing-contract.test.ts @@ -86,6 +86,14 @@ describe("published docs routing contract", () => { ), href: "/docs/training/dpo", }, + { + label: "grpo training regime", + record: requireRecord( + getTrainingRegimeById("training-regime.grpo"), + "grpo training regime", + ), + href: "/docs/training/grpo", + }, { label: "system", record: requireRecord( diff --git a/src/lib/content/registry-runtime.generated.ts b/src/lib/content/registry-runtime.generated.ts index e00eafde..c60c2919 100644 --- a/src/lib/content/registry-runtime.generated.ts +++ b/src/lib/content/registry-runtime.generated.ts @@ -178,64 +178,67 @@ import registryRecord_143 from "../../content/registry/models/gpt-3.json"; import registryRecord_144 from "../../content/registry/papers/deepseek-v4.json"; import registryRecord_145 from "../../content/registry/training-regimes/dpo.json"; import registryRecord_146 from "../../content/registry/training-regimes/fp4-quantization-aware-training.json"; -import registryRecord_147 from "../../content/registry/training-regimes/on-policy-distillation.json"; -import registryRecord_148 from "../../content/registry/training-regimes/specialist-training.json"; -import registryRecord_149 from "../../content/registry/systems/expert-parallel-overlap.json"; -import registryRecord_150 from "../../content/registry/systems/on-disk-kv-cache.json"; -import registryRecord_151 from "../../content/registry/systems/routing.json"; -import registryRecord_152 from "../../content/registry/datasets/deepseek-v4-specialist-corpus.json"; -import registryRecord_153 from "../../content/registry/organizations/deepseek-ai.json"; -import registryRecord_154 from "../../content/registry/citations/attention-is-all-you-need.json"; -import registryRecord_155 from "../../content/registry/citations/awq.json"; -import registryRecord_156 from "../../content/registry/citations/batch-normalization.json"; -import registryRecord_157 from "../../content/registry/citations/brown-gpt-3.json"; -import registryRecord_158 from "../../content/registry/citations/chen-positional-interpolation.json"; -import registryRecord_159 from "../../content/registry/citations/classifier-free-diffusion-guidance.json"; -import registryRecord_160 from "../../content/registry/citations/curious-case-neural-text-degeneration.json"; -import registryRecord_161 from "../../content/registry/citations/deepseek-v2-mla-paper.json"; -import registryRecord_162 from "../../content/registry/citations/deepseek-v4-paper.json"; -import registryRecord_163 from "../../content/registry/citations/denoising-diffusion-probabilistic-models.json"; -import registryRecord_164 from "../../content/registry/citations/ding-longrope.json"; -import registryRecord_165 from "../../content/registry/citations/direct-preference-optimization.json"; -import registryRecord_166 from "../../content/registry/citations/flamingo-visual-language-model.json"; -import registryRecord_167 from "../../content/registry/citations/glu-variants-improve-transformer.json"; -import registryRecord_168 from "../../content/registry/citations/goodfellow-deep-learning.json"; -import registryRecord_169 from "../../content/registry/citations/gpt-2-report.json"; -import registryRecord_170 from "../../content/registry/citations/gqa-paper.json"; -import registryRecord_171 from "../../content/registry/citations/group-normalization.json"; -import registryRecord_172 from "../../content/registry/citations/image-is-worth-16x16-words.json"; -import registryRecord_173 from "../../content/registry/citations/kaiokendev-superhot.json"; -import registryRecord_174 from "../../content/registry/citations/kaplan-scaling-laws.json"; -import registryRecord_175 from "../../content/registry/citations/katharopoulos-linear-attention-paper.json"; -import registryRecord_176 from "../../content/registry/citations/kingma-adam.json"; -import registryRecord_177 from "../../content/registry/citations/kivi-kv-cache-quantization.json"; -import registryRecord_178 from "../../content/registry/citations/kudo-sentencepiece.json"; -import registryRecord_179 from "../../content/registry/citations/layer-normalization.json"; -import registryRecord_180 from "../../content/registry/citations/learning-transferable-visual-models-from-natural-language-supervision.json"; -import registryRecord_181 from "../../content/registry/citations/longformer.json"; -import registryRecord_182 from "../../content/registry/citations/multilayer-feedforward-networks-are-universal-approximators.json"; -import registryRecord_183 from "../../content/registry/citations/on-policy-distillation-of-language-models.json"; -import registryRecord_184 from "../../content/registry/citations/peng-yarn.json"; -import registryRecord_185 from "../../content/registry/citations/press-alibi.json"; -import registryRecord_186 from "../../content/registry/citations/qlora.json"; -import registryRecord_187 from "../../content/registry/citations/quantization-integer-only-inference.json"; -import registryRecord_188 from "../../content/registry/citations/query-key-normalization-for-transformers.json"; -import registryRecord_189 from "../../content/registry/citations/raffel-t5.json"; -import registryRecord_190 from "../../content/registry/citations/rectified-linear-units-improve-restricted-boltzmann-machines.json"; -import registryRecord_191 from "../../content/registry/citations/rectifier-nonlinearities-improve-neural-network-acoustic-models.json"; -import registryRecord_192 from "../../content/registry/citations/root-mean-square-layer-normalization.json"; -import registryRecord_193 from "../../content/registry/citations/self-attention-with-relative-position-representations.json"; -import registryRecord_194 from "../../content/registry/citations/sennrich-bpe.json"; -import registryRecord_195 from "../../content/registry/citations/shazeer-mqa-paper.json"; -import registryRecord_196 from "../../content/registry/citations/sigmoid-weighted-linear-units.json"; -import registryRecord_197 from "../../content/registry/citations/smoothquant.json"; -import registryRecord_198 from "../../content/registry/citations/sparse-transformers.json"; -import registryRecord_199 from "../../content/registry/citations/sparsely-gated-mixture-of-experts-layer.json"; -import registryRecord_200 from "../../content/registry/citations/su-roformer-rope.json"; -import registryRecord_201 from "../../content/registry/citations/training-language-models-to-follow-instructions-with-human-feedback.json"; -import registryRecord_202 from "../../content/registry/citations/transformer-lms-without-positional-encodings.json"; -import registryRecord_203 from "../../content/registry/citations/wei-emergent-abilities.json"; -import registryRecord_204 from "../../content/registry/citations/world-models.json"; +import registryRecord_147 from "../../content/registry/training-regimes/grpo.json"; +import registryRecord_148 from "../../content/registry/training-regimes/on-policy-distillation.json"; +import registryRecord_149 from "../../content/registry/training-regimes/specialist-training.json"; +import registryRecord_150 from "../../content/registry/systems/expert-parallel-overlap.json"; +import registryRecord_151 from "../../content/registry/systems/on-disk-kv-cache.json"; +import registryRecord_152 from "../../content/registry/systems/routing.json"; +import registryRecord_153 from "../../content/registry/datasets/deepseek-v4-specialist-corpus.json"; +import registryRecord_154 from "../../content/registry/organizations/deepseek-ai.json"; +import registryRecord_155 from "../../content/registry/citations/attention-is-all-you-need.json"; +import registryRecord_156 from "../../content/registry/citations/awq.json"; +import registryRecord_157 from "../../content/registry/citations/batch-normalization.json"; +import registryRecord_158 from "../../content/registry/citations/brown-gpt-3.json"; +import registryRecord_159 from "../../content/registry/citations/chen-positional-interpolation.json"; +import registryRecord_160 from "../../content/registry/citations/classifier-free-diffusion-guidance.json"; +import registryRecord_161 from "../../content/registry/citations/curious-case-neural-text-degeneration.json"; +import registryRecord_162 from "../../content/registry/citations/deepseek-r1-paper.json"; +import registryRecord_163 from "../../content/registry/citations/deepseek-v2-mla-paper.json"; +import registryRecord_164 from "../../content/registry/citations/deepseek-v4-paper.json"; +import registryRecord_165 from "../../content/registry/citations/deepseekmath-paper.json"; +import registryRecord_166 from "../../content/registry/citations/denoising-diffusion-probabilistic-models.json"; +import registryRecord_167 from "../../content/registry/citations/ding-longrope.json"; +import registryRecord_168 from "../../content/registry/citations/direct-preference-optimization.json"; +import registryRecord_169 from "../../content/registry/citations/flamingo-visual-language-model.json"; +import registryRecord_170 from "../../content/registry/citations/glu-variants-improve-transformer.json"; +import registryRecord_171 from "../../content/registry/citations/goodfellow-deep-learning.json"; +import registryRecord_172 from "../../content/registry/citations/gpt-2-report.json"; +import registryRecord_173 from "../../content/registry/citations/gqa-paper.json"; +import registryRecord_174 from "../../content/registry/citations/group-normalization.json"; +import registryRecord_175 from "../../content/registry/citations/image-is-worth-16x16-words.json"; +import registryRecord_176 from "../../content/registry/citations/kaiokendev-superhot.json"; +import registryRecord_177 from "../../content/registry/citations/kaplan-scaling-laws.json"; +import registryRecord_178 from "../../content/registry/citations/katharopoulos-linear-attention-paper.json"; +import registryRecord_179 from "../../content/registry/citations/kingma-adam.json"; +import registryRecord_180 from "../../content/registry/citations/kivi-kv-cache-quantization.json"; +import registryRecord_181 from "../../content/registry/citations/kudo-sentencepiece.json"; +import registryRecord_182 from "../../content/registry/citations/layer-normalization.json"; +import registryRecord_183 from "../../content/registry/citations/learning-transferable-visual-models-from-natural-language-supervision.json"; +import registryRecord_184 from "../../content/registry/citations/longformer.json"; +import registryRecord_185 from "../../content/registry/citations/multilayer-feedforward-networks-are-universal-approximators.json"; +import registryRecord_186 from "../../content/registry/citations/on-policy-distillation-of-language-models.json"; +import registryRecord_187 from "../../content/registry/citations/peng-yarn.json"; +import registryRecord_188 from "../../content/registry/citations/press-alibi.json"; +import registryRecord_189 from "../../content/registry/citations/qlora.json"; +import registryRecord_190 from "../../content/registry/citations/quantization-integer-only-inference.json"; +import registryRecord_191 from "../../content/registry/citations/query-key-normalization-for-transformers.json"; +import registryRecord_192 from "../../content/registry/citations/raffel-t5.json"; +import registryRecord_193 from "../../content/registry/citations/rectified-linear-units-improve-restricted-boltzmann-machines.json"; +import registryRecord_194 from "../../content/registry/citations/rectifier-nonlinearities-improve-neural-network-acoustic-models.json"; +import registryRecord_195 from "../../content/registry/citations/root-mean-square-layer-normalization.json"; +import registryRecord_196 from "../../content/registry/citations/self-attention-with-relative-position-representations.json"; +import registryRecord_197 from "../../content/registry/citations/sennrich-bpe.json"; +import registryRecord_198 from "../../content/registry/citations/shazeer-mqa-paper.json"; +import registryRecord_199 from "../../content/registry/citations/sigmoid-weighted-linear-units.json"; +import registryRecord_200 from "../../content/registry/citations/smoothquant.json"; +import registryRecord_201 from "../../content/registry/citations/sparse-transformers.json"; +import registryRecord_202 from "../../content/registry/citations/sparsely-gated-mixture-of-experts-layer.json"; +import registryRecord_203 from "../../content/registry/citations/su-roformer-rope.json"; +import registryRecord_204 from "../../content/registry/citations/training-language-models-to-follow-instructions-with-human-feedback.json"; +import registryRecord_205 from "../../content/registry/citations/transformer-lms-without-positional-encodings.json"; +import registryRecord_206 from "../../content/registry/citations/wei-emergent-abilities.json"; +import registryRecord_207 from "../../content/registry/citations/world-models.json"; const moduleRecords: ModuleRecord[] = [ moduleRecordSchema.parse(registryRecord_0), @@ -399,24 +402,24 @@ const trainingRegimeRecords: TrainingRegimeRecord[] = [ trainingRegimeRecordSchema.parse(registryRecord_146), trainingRegimeRecordSchema.parse(registryRecord_147), trainingRegimeRecordSchema.parse(registryRecord_148), + trainingRegimeRecordSchema.parse(registryRecord_149), ]; const systemRecords: SystemRecord[] = [ - systemRecordSchema.parse(registryRecord_149), systemRecordSchema.parse(registryRecord_150), systemRecordSchema.parse(registryRecord_151), + systemRecordSchema.parse(registryRecord_152), ]; const datasetRecords: DatasetRecord[] = [ - datasetRecordSchema.parse(registryRecord_152), + datasetRecordSchema.parse(registryRecord_153), ]; const organizationRecords: OrganizationRecord[] = [ - organizationRecordSchema.parse(registryRecord_153), + organizationRecordSchema.parse(registryRecord_154), ]; const citationRecords: CitationRecord[] = [ - citationRecordSchema.parse(registryRecord_154), citationRecordSchema.parse(registryRecord_155), citationRecordSchema.parse(registryRecord_156), citationRecordSchema.parse(registryRecord_157), @@ -467,6 +470,9 @@ const citationRecords: CitationRecord[] = [ citationRecordSchema.parse(registryRecord_202), citationRecordSchema.parse(registryRecord_203), citationRecordSchema.parse(registryRecord_204), + citationRecordSchema.parse(registryRecord_205), + citationRecordSchema.parse(registryRecord_206), + citationRecordSchema.parse(registryRecord_207), ]; const modulesById = new Map( diff --git a/src/lib/content/training-behavior-glossary.test.ts b/src/lib/content/training-behavior-glossary.test.ts index 6f256b5e..bf680f74 100644 --- a/src/lib/content/training-behavior-glossary.test.ts +++ b/src/lib/content/training-behavior-glossary.test.ts @@ -77,6 +77,7 @@ describe("Phase 2 training behavior glossary pages (US-004)", () => { test("alignment links to training peers and published token-chain glossary pages", async () => { const html = await renderGlossaryHtml("alignment"); + expect(html).toContain('href="/docs/training/grpo"'); expect(html).toContain('href="/docs/glossary/model-capacity"'); expect(html).toContain('href="/docs/glossary/overfitting"'); expect(html).toContain('href="/docs/glossary/generalization"'); @@ -112,6 +113,7 @@ describe("Phase 2 training behavior glossary pages (US-004)", () => { ) as ConceptRecord; expect(alignment.conceptType).toBe("training"); + expect(alignment.relatedIds).toContain("training-regime.grpo"); expect(modelCapacity.conceptType).toBe("training"); expect(overfitting.conceptType).toBe("training"); expect(generalization.conceptType).toBe("evaluation"); diff --git a/src/lib/navigation/generated-docs-page-tree.test.ts b/src/lib/navigation/generated-docs-page-tree.test.ts index 4bcc649b..32515788 100644 --- a/src/lib/navigation/generated-docs-page-tree.test.ts +++ b/src/lib/navigation/generated-docs-page-tree.test.ts @@ -129,6 +129,13 @@ describe("generated docs page tree", () => { url: "/docs/training/on-policy-distillation", }), ); + expect( + findNodeIndex(trainingChildren, { name: "Optimization" }), + ).toBeLessThan( + findNodeIndex(trainingChildren, { + url: "/docs/training/grpo", + }), + ); expect( findNodeIndex(trainingChildren, { name: "Optimization" }), ).toBeLessThan( diff --git a/src/lib/source.test.ts b/src/lib/source.test.ts index cf6b7843..f66a1c4a 100644 --- a/src/lib/source.test.ts +++ b/src/lib/source.test.ts @@ -128,6 +128,7 @@ const PAPER_INDEX_URLS = ["/docs/papers/deepseek-v4"] as const; const TRAINING_INDEX_URLS = [ "/docs/training/dpo", "/docs/training/fp4-quantization-aware-training", + "/docs/training/grpo", "/docs/training/on-policy-distillation", "/docs/training/specialist-training", ] as const; diff --git a/src/tests/content/section-indexes.test.tsx b/src/tests/content/section-indexes.test.tsx index 7242528a..53724b49 100644 --- a/src/tests/content/section-indexes.test.tsx +++ b/src/tests/content/section-indexes.test.tsx @@ -57,6 +57,7 @@ describe("section index page render", () => { const html = renderToStaticMarkup(await TrainingIndexPage()); expect(html).toContain("Training"); + expect(html).toContain('href="/docs/training/grpo"'); expect(html).toContain('href="/docs/training/on-policy-distillation"'); expect(html).toContain('href="/docs/training/specialist-training"'); }); diff --git a/src/tests/discovery/search-discovery.test.tsx b/src/tests/discovery/search-discovery.test.tsx index 04023fcf..ba849c39 100644 --- a/src/tests/discovery/search-discovery.test.tsx +++ b/src/tests/discovery/search-discovery.test.tsx @@ -75,6 +75,15 @@ function expectRouteRendersOk( } describe("Phase 1 search discovery", () => { + test("group relative preference optimization query routes readers to the canonical GRPO page", async () => { + const results = await docsSearchApi.search( + "group relative preference optimization", + ); + expect(results.length).toBeGreaterThan(0); + expect(assertCanonicalPageLevelApiResults(results)).toBeNull(); + expect(results[0]?.url).toBe("/docs/training/grpo"); + }); + test("GQA query ranks grouped-query attention first", async () => { const results = await docsSearchApi.search("GQA"); expect(results.length).toBeGreaterThan(0); diff --git a/src/tests/search/build-documents.test.ts b/src/tests/search/build-documents.test.ts index 23ca4b34..f54c3cbe 100644 --- a/src/tests/search/build-documents.test.ts +++ b/src/tests/search/build-documents.test.ts @@ -5,6 +5,7 @@ import { buildSearchDocuments } from "@/lib/search/build-documents"; const SAMPLE_URL = "/docs/modules/grouped-query-attention"; const TOKEN_GLOSSARY_URL = "/docs/glossary/token"; +const GRPO_URL = "/docs/training/grpo"; describe("buildSearchDocuments", () => { test("indexes only published docs pages for the default locale", async () => { @@ -63,4 +64,29 @@ describe("buildSearchDocuments", () => { expect(token?.bodyText).toContain("tokenizer"); expect(token?.bodyText).toContain("token IDs"); }); + + test("indexes GRPO training page with search aliases and training facets", async () => { + const registry = await loadRegistry(); + const pages = await loadPublishedDocsPages("en"); + const documents = buildSearchDocuments(pages, registry); + const grpo = documents.find((document) => document.url === GRPO_URL); + + expect(grpo).toBeDefined(); + expect(grpo?.kind).toBe("training-regime"); + expect(grpo?.registryId).toBe("training-regime.grpo"); + expect(grpo?.aliases).toEqual( + expect.arrayContaining([ + "GRPO", + "group relative policy optimization", + "group relative preference optimization", + "group-relative policy optimization", + "group-relative preference optimization", + ]), + ); + expect(grpo?.tags).toEqual(expect.arrayContaining(["foundations"])); + expect(grpo?.facets.conceptType).toBe("training"); + expect(grpo?.facets.variantGroup).toBe( + "group-relative-reinforcement-learning", + ); + }); });