portpowered · AndreasAbdi · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/src/app/docs/docs-slug-renderer.test.tsx b/src/app/docs/docs-slug-renderer.test.tsx
@@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test";
 import {
   buildDocsPageMetadata,
   renderDocsSlugPage,
+  resolveLocalDocsShellDescription,
 } from "@/app/docs/docs-slug-renderer";
 
 describe("docs slug renderer locale gating", () => {
@@ -243,4 +244,21 @@ describe("docs slug renderer locale gating", () => {
       );
     }
   });
+
+  test("local non-glossary docs shell prefers openingSummary when present", () => {
+    const description =
+      "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.";
+    const openingSummary =
+      "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.";
+
+    // Pin the exact return value: substring checks would still pass if the
+    // resolver truncated or decorated the summary.
+    expect(
+      resolveLocalDocsShellDescription({
+        description,
+        openingSummary,
+        section: "training",
+      }),
+    ).toBe(openingSummary);
+  });
 });
diff --git a/src/app/docs/docs-slug-renderer.tsx b/src/app/docs/docs-slug-renderer.tsx
@@ -20,6 +20,19 @@ import { localizedRouteAlternates } from "@/lib/i18n/route-locale";
 import { source } from "@/lib/source";
 import { getMDXComponents } from "../../../mdx-components";
 
+/**
+ * Shell description text for a local docs page: glossary pages keep
+ * `description`; others use `openingSummary` unless it is missing or blank.
+ */
+export function resolveLocalDocsShellDescription(options: {
+  description: string;
+  openingSummary?: string;
+  section: string;
+}): string {
+  const { description, openingSummary: summary, section } = options;
+  return section !== "glossary" && summary?.trim() ? summary : description;
+}
+
 function buildDocsPageAlternates(docsSlug: string) {
   const alternates = localizedRouteAlternates({
     surface: "docs-page",
@@ -53,11 +66,16 @@ async function renderLocalDocsPage(
 
   const loadedPage = await loadLocalDocsPage(localRef, locale);
   const uiMessages = await loadUiMessages(locale);
+  const shellDescriptionText = resolveLocalDocsShellDescription({
+    description: loadedPage.messages.description,
+    openingSummary: loadedPage.messages.openingSummary,
+    section: localRef.section,
+  });
   const description =
     localRef.section === "glossary" ? (
       <DocsAutoLinkedDescription text={loadedPage.messages.description} />
     ) : (
-      loadedPage.messages.description
+      shellDescriptionText
     );
 
   return (

diff --git a/src/content/docs/training/grpo/assets.json b/src/content/docs/training/grpo/assets.json
@@ -0,0 +1,10 @@
+{
+  "trainingFlow": {
+    "type": "graph",
+    "graphId": "graph.grpo-training-flow",
+    "webRenderer": "react-flow",
+    "printRenderer": "vertical-svg",
+    "altKey": "assets.trainingFlow.alt",
+    "captionKey": "assets.trainingFlow.caption"
+  }
+}
diff --git a/src/content/docs/training/grpo/messages/en.json b/src/content/docs/training/grpo/messages/en.json
@@ -0,0 +1,94 @@
+{
+  "title": "Group Relative Policy Optimization",
+  "description": "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality.",
+  "openingSummary": "Group Relative Policy Optimization, usually shortened to GRPO, is a reinforcement-learning post-training method where the model samples several answers to one prompt, scores them as a group, and learns from which answers look better inside that local set.",
+  "sections": {
+    "whatItIs": {
+      "title": "What It Is",
+      "body": "Group Relative Policy Optimization is a policy-update method used after pretraining or supervised fine-tuning. Instead of judging one answer in isolation, it samples a small group of candidate answers for the same prompt and uses their relative ranking to decide which behaviors should be reinforced."
+    },
+    "whyItExists": {
+      "title": "Why It Exists",
+      "body": "A single reward score can be noisy, and PPO-style language-model training often adds the extra cost of a learned critic to estimate a baseline. GRPO tries to keep the signal useful while simplifying the loop: the group itself supplies the local baseline, so the model can learn from which samples look better than their neighbors."
+    },
+    "howItWorks": {
+      "title": "How It Works",
+      "body": "For one prompt, the policy generates several completions, a reward function scores them, and those scores are normalized within that group. Answers above the group average get a positive learning signal, answers below it get a negative one, and the policy is updated so future samples are more likely to resemble the stronger members of the group."
+    },
+    "comparedToNearbyRegimes": {
+      "title": "Compared To Nearby Regimes",
+      "body": "Group Relative Policy Optimization still sits inside the broader alignment family, but it is narrower than Reinforcement Learning from Human Feedback as a full pipeline. Reinforcement Learning from Human Feedback often means collecting preference data, training a reward model, and then running a reinforcement-learning update such as Proximal Policy Optimization. GRPO keeps the reinforcement-learning loop, but it replaces the learned critic-style baseline with relative ranking inside one sampled group. Direct Preference Optimization moves in a different direction: it stays closer to supervised-style optimization on chosen-versus-rejected pairs, while GRPO uses rewards over several sampled answers and updates the policy from that within-group ordering instead of from one pairwise objective alone."
+    },
+    "limitationsAndFailureModes": {
+      "title": "Limitations And Failure Modes",
+      "body": "The method still depends on reward quality. If the reward function prefers shallow tricks, the whole group can drift in the wrong direction together. Relative scoring also means a weak group can still produce a misleading winner if every sampled answer is bad."
+    },
+    "related": {
+      "title": "Related To"
+    },
+    "tags": {
+      "title": "Tags"
+    },
+    "references": {
+      "title": "References"
+    }
+  },
+  "callouts": {
+    "trainingFlowGraph": {
+      "title": "GRPO training flow",
+      "body": "A visual walkthrough of one prompt, grouped sampling, relative scoring, and the policy update."
+    },
+    "trainingFlowLegend": {
+      "title": "Graph legend",
+      "body": "How to read each stage of the GRPO training flow."
+    }
+  },
+  "links": {
+    "trainingFlowLegendPrompt": "One prompt anchors the whole local comparison.",
+    "trainingFlowLegendSampling": "The policy samples several candidate answers for that same prompt.",
+    "trainingFlowLegendRelativeScoring": "Those answers are scored relative to each other inside the sampled group.",
+    "trainingFlowLegendPolicyUpdate": "The policy update reinforces answers that beat the group baseline and discourages weaker ones."
+  },
+  "assets": {
+    "trainingFlow": {
+      "alt": "A training flow from one prompt to a group of sampled answers, then to relative scoring inside the group, and finally to a policy update.",
+      "caption": "GRPO learns from which answers win inside each sampled group instead of relying on one separate critic-estimated baseline."
+    }
+  },
+  "math": {
+    "grpoAdvantage": {
+      "label": "Grouped relative advantage sketch",
+      "formula": "A_i = \\frac{r_i - \\operatorname{mean}(r_{1:G})}{\\operatorname{std}(r_{1:G})}",
+      "variableDefinitions": {
+        "advantage": {
+          "term": "A_i",
+          "definition": "normalized advantage for sampled answer i"
+        },
+        "reward": {
+          "term": "r_i",
+          "definition": "reward score assigned to sampled answer i"
+        },
+        "groupSize": {
+          "term": "G",
+          "definition": "number of sampled answers in the comparison group"
+        }
+      }
+    }
+  },
+  "graph": {
+    "nodes": {
+      "prompt": {
+        "label": "One prompt"
+      },
+      "sampleGroup": {
+        "label": "Sample a group of answers"
+      },
+      "relativeScore": {
+        "label": "Score answers relative to the group"
+      },
+      "policyUpdate": {
+        "label": "Update the policy"
+      }
+    }
+  }
+}
diff --git a/src/content/docs/training/grpo/page.mdx b/src/content/docs/training/grpo/page.mdx
@@ -0,0 +1,124 @@
+---
+title: "Group Relative Policy Optimization"
+description: "A post-training reinforcement-learning method that compares several sampled answers for the same prompt and updates the model from their relative quality."
+kind: "training-regime"
+registryId: "training-regime.grpo"
+messageNamespace: "local"
+assetNamespace: "local"
+status: "published"
+tags:
+  - foundations
+aliases:
+  - "GRPO"
+  - "Group Relative Preference Optimization"
+updatedAt: "2026-06-19"
+---
+
+import { CitationList } from "@/features/docs/components/CitationList";
+import { BlockMath } from "@/features/docs/components/Math";
+import { RelatedDocs } from "@/features/docs/components/RelatedDocs";
+import { Section } from "@/features/docs/components/Section";
+import { T } from "@/features/docs/components/T";
+import { TagPillList } from "@/features/docs/components/TagPillList";
+import { TrainingRegimeAtAGlance } from "@/features/models/components/TrainingRegimeAtAGlance";
+import { TrainingRegimeFlow } from "@/features/models/components/TrainingRegimeFlow";
+
+
+<TrainingRegimeAtAGlance registryId="training-regime.grpo" />
+
+<Section id="what-it-is" titleKey="sections.whatItIs.title">
+  <T k="sections.whatItIs.body" />
+</Section>
+
+<Section id="why-it-exists" titleKey="sections.whyItExists.title">
+  <T k="sections.whyItExists.body" />
+</Section>
+
+<Section id="how-it-works" titleKey="sections.howItWorks.title">
+  <T k="sections.howItWorks.body" />
+  <h3 className="text-base font-semibold text-foreground">
+    <T k="callouts.trainingFlowGraph.title" />
+  </h3>
+  <TrainingRegimeFlow registryId="training-regime.grpo" assetId="trainingFlow" />
+  <div className="rounded-lg border border-border bg-card p-4">
+    <p className="text-sm font-medium text-foreground">
+      <T k="callouts.trainingFlowLegend.title" />
+    </p>
+    <ul className="mt-3 space-y-2 pl-5 text-sm text-muted-foreground">
+      <li>
+        <T k="links.trainingFlowLegendPrompt" />
+      </li>
+      <li>
+        <T k="links.trainingFlowLegendSampling" />
+      </li>
+      <li>
+        <T k="links.trainingFlowLegendRelativeScoring" />
+      </li>
+      <li>
+        <T k="links.trainingFlowLegendPolicyUpdate" />
+      </li>
+    </ul>
+  </div>
+  <BlockMath mathId="grpoAdvantage" />
+</Section>
+
+<Section
+  id="compared-to-nearby-regimes"
+  titleKey="sections.comparedToNearbyRegimes.title"
+>
+  <T k="sections.comparedToNearbyRegimes.body" />
+
+  <ul className="mt-4 space-y-2 list-none pl-0">
+    <li>
+      <a
+        className="text-primary underline underline-offset-4"
+        href="/docs/glossary/alignment"
+      >
+        Alignment
+      </a>
+    </li>
+    <li>
+      <a
+        className="text-primary underline underline-offset-4"
+        href="/docs/glossary/alignment"
+      >
+        RLHF
+      </a>
+    </li>
+    <li>
+      <a
+        className="text-primary underline underline-offset-4"
+        href="/search?q=ppo"
+      >
+        PPO
+      </a>
+    </li>
+    <li>
+      <a
+        className="text-primary underline underline-offset-4"
+        href="/search?q=dpo"
+      >
+        DPO
+      </a>
+    </li>
+  </ul>
+</Section>
+
+<Section
+  id="limitations-and-failure-modes"
+  titleKey="sections.limitationsAndFailureModes.title"
+>
+  <T k="sections.limitationsAndFailureModes.body" />
+</Section>
+
+<Section id="related" titleKey="sections.related.title">
+  <RelatedDocs registryId="training-regime.grpo" />
+</Section>
+
+<Section id="tags" titleKey="sections.tags.title">
+  <TagPillList registryId="training-regime.grpo" showDescriptions />
+</Section>
+
+<Section id="references" titleKey="sections.references.title">
+  <CitationList registryId="training-regime.grpo" />
+</Section>
diff --git a/src/content/registry/citations/deepseek-r1-paper.json b/src/content/registry/citations/deepseek-r1-paper.json
@@ -0,0 +1,20 @@
+{
+  "id": "citation.deepseek-r1-paper",
+  "slug": "deepseek-r1-paper",
+  "kind": "citation",
+  "defaultTitleKey": "title",
+  "defaultSummaryKey": "summary",
+  "aliases": ["DeepSeek-R1 paper"],
+  "tags": ["foundations"],
+  "relatedIds": [],
+  "citationIds": [],
+  "status": "published",
+  "createdAt": "2026-06-19T00:00:00.000Z",
+  "updatedAt": "2026-06-19T00:00:00.000Z",
+  "citationType": "paper",
+  "authors": ["DeepSeek-AI"],
+  "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
+  "year": 2025,
+  "url": "https://arxiv.org/abs/2501.12948",
+  "mla": "DeepSeek-AI. \"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.\" arXiv, 2025, https://arxiv.org/abs/2501.12948."
+}
diff --git a/src/content/registry/citations/deepseekmath-paper.json b/src/content/registry/citations/deepseekmath-paper.json
@@ -0,0 +1,32 @@
+{
+  "id": "citation.deepseekmath-paper",
+  "slug": "deepseekmath-paper",
+  "kind": "citation",
+  "defaultTitleKey": "title",
+  "defaultSummaryKey": "summary",
+  "aliases": ["DeepSeekMath paper", "GRPO paper"],
+  "tags": ["foundations"],
+  "relatedIds": [],
+  "citationIds": [],
+  "status": "published",
+  "createdAt": "2026-06-19T00:00:00.000Z",
+  "updatedAt": "2026-06-19T00:00:00.000Z",
+  "citationType": "paper",
+  "authors": [
+    "Zhihong Shao",
+    "Peiyi Wang",
+    "Qihao Zhu",
+    "Runxin Xu",
+    "Junxiao Song",
+    "Xiao Bi",
+    "Haowei Zhang",
+    "Mingchuan Zhang",
+    "Y. K. Li",
+    "Yu Wu",
+    "Daya Guo"
+  ],
+  "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
+  "year": 2024,
+  "url": "https://arxiv.org/abs/2402.03300",
+  "mla": "Shao, Zhihong, et al. \"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models.\" arXiv, 2024, https://arxiv.org/abs/2402.03300."
+}
diff --git a/src/content/registry/concepts/alignment.json b/src/content/registry/concepts/alignment.json
@@ -7,6 +7,7 @@
   "aliases": ["Alignment", "RLHF", "preference alignment", "safety alignment"],
   "tags": ["foundations", "taxonomy"],
   "relatedIds": [
+    "training-regime.grpo",
     "concept.model-capacity",
     "concept.overfitting",
     "concept.generalization",