Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/app/docs/docs-slug-renderer.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test";
import {
buildDocsPageMetadata,
renderDocsSlugPage,
resolveLocalDocsShellDescription,
} from "@/app/docs/docs-slug-renderer";

describe("docs slug renderer locale gating", () => {
Expand Down Expand Up @@ -243,4 +244,21 @@ describe("docs slug renderer locale gating", () => {
);
}
});

// Non-glossary sections should surface the opening summary, not the short description.
test("local non-glossary docs shell prefers openingSummary when present", () => {
  const shortDescription =
    "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.";
  const longSummary =
    "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.";
  const expectedLead =
    "Group Relative Policy Optimization, usually shortened to GRPO";

  const resolved = resolveLocalDocsShellDescription({
    section: "training",
    description: shortDescription,
    openingSummary: longSummary,
  });

  // The summary's lead phrase must be present, and the short description must not leak in.
  expect(resolved).toContain(expectedLead);
  expect(resolved).not.toContain(shortDescription);
});
});
20 changes: 19 additions & 1 deletion src/app/docs/docs-slug-renderer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ import { localizedRouteAlternates } from "@/lib/i18n/route-locale";
import { source } from "@/lib/source";
import { getMDXComponents } from "../../../mdx-components";

/**
 * Chooses the plain-text description for a local docs page.
 *
 * Glossary pages always use `description`. Every other section prefers
 * `openingSummary` and falls back to `description` when the summary is
 * missing or blank.
 *
 * @param options.description - Short description of the page; always available.
 * @param options.openingSummary - Optional longer summary of the page.
 * @param options.section - Docs section the page belongs to (e.g. "glossary").
 * @returns The text to use as the page description.
 */
export function resolveLocalDocsShellDescription(options: {
  description: string;
  openingSummary?: string;
  section: string;
}): string {
  const { description, openingSummary, section } = options;
  if (section === "glossary") {
    return description;
  }

  // A bare `??` would accept an empty or whitespace-only summary and yield a
  // blank description, so treat blank summaries the same as a missing one.
  if (openingSummary == null || openingSummary.trim() === "") {
    return description;
  }

  return openingSummary;
}

function buildDocsPageAlternates(docsSlug: string) {
const alternates = localizedRouteAlternates({
surface: "docs-page",
Expand Down Expand Up @@ -53,11 +66,16 @@ async function renderLocalDocsPage(

const loadedPage = await loadLocalDocsPage(localRef, locale);
const uiMessages = await loadUiMessages(locale);
const shellDescriptionText = resolveLocalDocsShellDescription({
description: loadedPage.messages.description,
openingSummary: loadedPage.messages.openingSummary,
section: localRef.section,
});
const description =
localRef.section === "glossary" ? (
<DocsAutoLinkedDescription text={loadedPage.messages.description} />
) : (
loadedPage.messages.description
shellDescriptionText
);

return (
Expand Down
10 changes: 10 additions & 0 deletions src/content/docs/training/grpo/assets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"trainingFlow": {
"type": "graph",
"graphId": "graph.grpo-training-flow",
"webRenderer": "react-flow",
"printRenderer": "vertical-svg",
"altKey": "assets.trainingFlow.alt",
"captionKey": "assets.trainingFlow.caption"
}
}
94 changes: 94 additions & 0 deletions src/content/docs/training/grpo/messages/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"title": "Group Relative Policy Optimization",
"description": "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.",
"openingSummary": "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.",
"sections": {
"whatItIs": {
"title": "What It Is",
"body": "Group Relative Policy Optimization is a policy-update method used after pretraining or supervised fine-tuning. Instead of judging one answer in isolation, it samples a small group of candidate answers for the same prompt and uses their relative ranking to decide which behaviors should be reinforced."
},
"whyItExists": {
"title": "Why It Exists",
"body": "A single reward score can be noisy, and PPO-style language-model training often adds the extra cost of a learned critic to estimate a baseline. GRPO tries to keep the signal useful while simplifying the loop: the group itself supplies the local baseline, so the model can learn from which samples look better than their neighbors."
},
"howItWorks": {
"title": "How It Works",
"body": "For one prompt, the policy generates several completions, a reward function scores them, and those scores are normalized within that group. Answers above the group average get a positive learning signal, answers below it get a negative one, and the policy is updated so future samples are more likely to resemble the stronger members of the group."
},
"comparedToNearbyRegimes": {
"title": "Compared To Nearby Regimes",
"body": "Group Relative Policy Optimization still sits inside the broader alignment family, but it is narrower than Reinforcement Learning from Human Feedback as a full pipeline. Reinforcement Learning from Human Feedback often means collecting preference data, training a reward model, and then running a reinforcement-learning update such as Proximal Policy Optimization. GRPO keeps the reinforcement-learning loop, but it replaces the learned critic-style baseline with relative ranking inside one sampled group. Direct Preference Optimization moves in a different direction: it stays closer to supervised-style optimization on chosen-versus-rejected pairs, while GRPO uses rewards over several sampled answers and updates the policy from that within-group ordering instead of from one pairwise objective alone."
},
"limitationsAndFailureModes": {
"title": "Limitations And Failure Modes",
"body": "The method still depends on reward quality. If the reward function prefers shallow tricks, the whole group can drift in the wrong direction together. Relative scoring also means a weak group can still produce a misleading winner if every sampled answer is bad."
},
"related": {
"title": "Related To"
},
"tags": {
"title": "Tags"
},
"references": {
"title": "References"
}
},
"callouts": {
"trainingFlowGraph": {
"title": "GRPO training flow",
"body": "A visual walkthrough of one prompt, grouped sampling, relative scoring, and the policy update."
},
"trainingFlowLegend": {
"title": "Graph legend",
"body": "How to read each stage of the GRPO training flow."
}
},
"links": {
"trainingFlowLegendPrompt": "One prompt anchors the whole local comparison.",
"trainingFlowLegendSampling": "The policy samples several candidate answers for that same prompt.",
"trainingFlowLegendRelativeScoring": "Those answers are scored relative to each other inside the sampled group.",
"trainingFlowLegendPolicyUpdate": "The policy update reinforces answers that beat the group baseline and discourages weaker ones."
},
"assets": {
"trainingFlow": {
"alt": "A training flow from one prompt to a group of sampled answers, then to relative scoring inside the group, and finally to a policy update.",
"caption": "GRPO learns from which answers win inside each sampled group instead of relying on one separate critic-estimated baseline."
}
},
"math": {
"grpoAdvantage": {
"label": "Grouped relative advantage sketch",
"formula": "A_i = \\frac{r_i - \\operatorname{mean}(r_{1:G})}{\\operatorname{std}(r_{1:G})}",
"variableDefinitions": {
"advantage": {
"term": "A_i",
"definition": "normalized advantage for sampled answer i"
},
"reward": {
"term": "r_i",
"definition": "reward score assigned to sampled answer i"
},
"groupSize": {
"term": "G",
"definition": "number of sampled answers in the comparison group"
}
}
}
},
"graph": {
"nodes": {
"prompt": {
"label": "One prompt"
},
"sampleGroup": {
"label": "Sample a group of answers"
},
"relativeScore": {
"label": "Score answers relative to the group"
},
"policyUpdate": {
"label": "Update the policy"
}
}
}
}
124 changes: 124 additions & 0 deletions src/content/docs/training/grpo/page.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
---
title: "Group Relative Policy Optimization"
description: "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality."
kind: "training-regime"
registryId: "training-regime.grpo"
messageNamespace: "local"
assetNamespace: "local"
status: "published"
tags:
- foundations
aliases:
- "GRPO"
- "Group Relative Preference Optimization"
updatedAt: "2026-06-19"
---

import { CitationList } from "@/features/docs/components/CitationList";
import { BlockMath } from "@/features/docs/components/Math";
import { RelatedDocs } from "@/features/docs/components/RelatedDocs";
import { Section } from "@/features/docs/components/Section";
import { T } from "@/features/docs/components/T";
import { TagPillList } from "@/features/docs/components/TagPillList";
import { TrainingRegimeAtAGlance } from "@/features/models/components/TrainingRegimeAtAGlance";
import { TrainingRegimeFlow } from "@/features/models/components/TrainingRegimeFlow";


<TrainingRegimeAtAGlance registryId="training-regime.grpo" />

<Section id="what-it-is" titleKey="sections.whatItIs.title">
<T k="sections.whatItIs.body" />
</Section>

<Section id="why-it-exists" titleKey="sections.whyItExists.title">
<T k="sections.whyItExists.body" />
</Section>

<Section id="how-it-works" titleKey="sections.howItWorks.title">
{/* Mechanism section: body text, the training-flow graph with its heading, a legend card, then the advantage formula. */}
<T k="sections.howItWorks.body" />
<h3 className="text-base font-semibold text-foreground">
<T k="callouts.trainingFlowGraph.title" />
</h3>
<TrainingRegimeFlow registryId="training-regime.grpo" assetId="trainingFlow" />
<div className="rounded-lg border border-border bg-card p-4">
<p className="text-sm font-medium text-foreground">
<T k="callouts.trainingFlowLegend.title" />
</p>
{/* NOTE(review): legend entries are read from `links.*` message keys although they are plain text, not hyperlinks — confirm the namespace is intended. */}
<ul className="mt-3 space-y-2 pl-5 text-sm text-muted-foreground">
<li>
<T k="links.trainingFlowLegendPrompt" />
</li>
<li>
<T k="links.trainingFlowLegendSampling" />
</li>
<li>
<T k="links.trainingFlowLegendRelativeScoring" />
</li>
<li>
<T k="links.trainingFlowLegendPolicyUpdate" />
</li>
</ul>
</div>
{/* Presumably resolves to the `math.grpoAdvantage` message entry — verify against BlockMath. */}
<BlockMath mathId="grpoAdvantage" />
</Section>

<Section
id="compared-to-nearby-regimes"
titleKey="sections.comparedToNearbyRegimes.title"
>
<T k="sections.comparedToNearbyRegimes.body" />

{/* Quick links to the neighbouring concepts named in the body text above. */}
{/* NOTE(review): these link labels are hard-coded English, unlike the message-key lookups used for the rest of this page — confirm whether they should be localized. */}
{/* Alignment and RLHF both point at /docs/glossary/alignment; PPO and DPO go to site search rather than a docs page. */}
<ul className="mt-4 space-y-2 list-none pl-0">
<li>
<a
className="text-primary underline underline-offset-4"
href="/docs/glossary/alignment"
>
Alignment
</a>
</li>
<li>
<a
className="text-primary underline underline-offset-4"
href="/docs/glossary/alignment"
>
RLHF
</a>
</li>
<li>
<a
className="text-primary underline underline-offset-4"
href="/search?q=ppo"
>
PPO
</a>
</li>
<li>
<a
className="text-primary underline underline-offset-4"
href="/search?q=dpo"
>
DPO
</a>
</li>
</ul>
</Section>

<Section
id="limitations-and-failure-modes"
titleKey="sections.limitationsAndFailureModes.title"
>
<T k="sections.limitationsAndFailureModes.body" />
</Section>

<Section id="related" titleKey="sections.related.title">
<RelatedDocs registryId="training-regime.grpo" />
</Section>

<Section id="tags" titleKey="sections.tags.title">
<TagPillList registryId="training-regime.grpo" showDescriptions />
</Section>

<Section id="references" titleKey="sections.references.title">
<CitationList registryId="training-regime.grpo" />
</Section>
20 changes: 20 additions & 0 deletions src/content/registry/citations/deepseek-r1-paper.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"id": "citation.deepseek-r1-paper",
"slug": "deepseek-r1-paper",
"kind": "citation",
"defaultTitleKey": "title",
"defaultSummaryKey": "summary",
"aliases": ["DeepSeek-R1 paper"],
"tags": ["foundations"],
"relatedIds": [],
"citationIds": [],
"status": "published",
"createdAt": "2026-06-19T00:00:00.000Z",
"updatedAt": "2026-06-19T00:00:00.000Z",
"citationType": "paper",
"authors": ["DeepSeek-AI"],
"title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
"year": 2025,
"url": "https://arxiv.org/abs/2501.12948",
"mla": "DeepSeek-AI. \"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.\" arXiv, 2025, https://arxiv.org/abs/2501.12948."
}
32 changes: 32 additions & 0 deletions src/content/registry/citations/deepseekmath-paper.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"id": "citation.deepseekmath-paper",
"slug": "deepseekmath-paper",
"kind": "citation",
"defaultTitleKey": "title",
"defaultSummaryKey": "summary",
"aliases": ["DeepSeekMath paper", "GRPO paper"],
"tags": ["foundations"],
"relatedIds": [],
"citationIds": [],
"status": "published",
"createdAt": "2026-06-19T00:00:00.000Z",
"updatedAt": "2026-06-19T00:00:00.000Z",
"citationType": "paper",
"authors": [
"Zhihong Shao",
"Peiyi Wang",
"Qihao Zhu",
"Runxin Xu",
"Junxiao Song",
"Xiao Bi",
"Haowei Zhang",
"Mingchuan Zhang",
"Y. K. Li",
"Yu Wu",
"Daya Guo"
],
"title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
"year": 2024,
"url": "https://arxiv.org/abs/2402.03300",
"mla": "Shao, Zhihong, et al. \"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models.\" arXiv, 2024, https://arxiv.org/abs/2402.03300."
}
1 change: 1 addition & 0 deletions src/content/registry/concepts/alignment.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"aliases": ["Alignment", "RLHF", "preference alignment", "safety alignment"],
"tags": ["foundations", "taxonomy"],
"relatedIds": [
"training-regime.grpo",
"concept.model-capacity",
"concept.overfitting",
"concept.generalization",
Expand Down
Loading
Loading