portpowered · AndreasAbdi · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/src/content/docs/concepts/decode/assets.json b/src/content/docs/concepts/decode/assets.json
@@ -0,0 +1 @@
+{}
diff --git a/src/content/docs/concepts/decode/messages/en.json b/src/content/docs/concepts/decode/messages/en.json
@@ -0,0 +1,47 @@
+{
+  "title": "Decode",
+  "description": "The repeated next-token stage that reuses saved prefix state, extends the KV cache one position at a time, and makes inter-token latency the main reader-visible cost after prefill.",
+  "openingSummary": "After prefill has processed the prompt, decode is the repeated next-token loop that keeps reusing saved key-value state, adds one new position at a time, and usually makes inter-token latency and cache growth the dominant costs of a long response.",
+  "sections": {
+    "whatItIs": {
+      "title": "What It Is",
+      "body": "Decode is the stage of autoregressive generation that starts after prefill has already read the prompt and built the first key-value cache state. Each turn computes the next token from the current prefix, appends that token, writes one more cache position, and then repeats until generation stops. It advances one position at a time because each new token depends on every token before it; the saved cache already holds the keys and values for those earlier positions, so the model only has to process the newest step instead of rereading the whole prefix on every turn."
+    },
+    "whyItMatters": {
+      "title": "Why It Matters",
+      "body": "Decode often controls the feel of a streamed answer because readers experience it as one token arriving after another. The model no longer has to reread the whole prompt from scratch, but each step still depends on moving through a growing cache and running another forward pass for the newest position. That is why inter-token latency, cache growth, memory bandwidth, and serving density show up so often in decode discussions."
+    },
+    "simpleExample": {
+      "title": "Simple Example",
+      "body": "Suppose a prompt has already been processed and the model is about to generate a 50-token reply. Decode produces token 1, extends the prefix, updates the cache for that new position, then repeats for token 2, token 3, and the rest of the answer. Each turn is smaller than the prompt pass, but dozens of repeated turns can dominate the total user-visible wait."
+    },
+    "commonConfusions": {
+      "title": "Common Confusions",
+      "body": "Decode is not the same as the decoder module. The decoder is the model stack; decode is the runtime loop that calls that stack again and again during generation. Decode is also not the same as prefill: prefill reads the existing prompt once, while decode handles one newly generated token at a time. Finally, decode is not the same as sampling. Decode produces the next-step hidden state and logits, while sampling decides how one token is chosen from those scores."
+    },
+    "servingPath": {
+      "title": "Serving Path",
+      "body": "Use the nearby pages below to trace what decode depends on, what it changes in the serving stack, and where to go next if you want either the systems view or the token-choice view."
+    },
+    "related": {
+      "title": "Related Concepts And Systems"
+    },
+    "tags": {
+      "title": "Tags"
+    },
+    "references": {
+      "title": "References"
+    }
+  },
+  "links": {
+    "prefill": "Prefill",
+    "prefillDecodeSplit": "Prefill/decode split",
+    "kvCache": "KV cache",
+    "autoregressiveGeneration": "Autoregressive generation",
+    "batching": "Batching",
+    "continuousBatching": "Continuous batching",
+    "memory": "Memory",
+    "speculativeDecoding": "Speculative decoding",
+    "samplingOverview": "Sampling overview"
+  }
+}
diff --git a/src/content/docs/concepts/decode/page.mdx b/src/content/docs/concepts/decode/page.mdx
@@ -0,0 +1,95 @@
+---
+title: Decode
+description: The repeated next-token stage that reuses saved prefix state, extends the KV cache one position at a time, and makes inter-token latency the main reader-visible cost after prefill.
+kind: "concept"
+registryId: "concept.decode"
+messageNamespace: "local"
+assetNamespace: "local"
+status: "published"
+tags:
+  - foundations
+  - attention
+  - kv-cache
+aliases:
+  - "Decode"
+  - "decoding"
+  - "token-by-token generation"
+  - "next-token step"
+  - "inter-token generation"
+  - "inter-token latency"
+updatedAt: "2026-06-22"
+---
+
+import { CitationList } from "@/features/docs/components/CitationList";
+import { DerivedRelatedDocs } from "@/features/docs/components/DerivedRelatedDocs";
+import { LocalizedLinkList } from "@/features/docs/components/LocalizedLinkList";
+import { RelatedDocs } from "@/features/docs/components/RelatedDocs";
+import { Section } from "@/features/docs/components/Section";
+import { T } from "@/features/docs/components/T";
+import { TagPillList } from "@/features/docs/components/TagPillList";
+
+<Section id="what-it-is" titleKey="sections.whatItIs.title">
+  <T k="sections.whatItIs.body" />
+</Section>
+
+<Section id="why-it-matters" titleKey="sections.whyItMatters.title">
+  <T k="sections.whyItMatters.body" />
+</Section>
+
+<Section id="simple-example" titleKey="sections.simpleExample.title">
+  <T k="sections.simpleExample.body" />
+</Section>
+
+<Section id="common-confusions" titleKey="sections.commonConfusions.title">
+  <T k="sections.commonConfusions.body" />
+</Section>
+
+<Section id="serving-path" titleKey="sections.servingPath.title">
+  <T k="sections.servingPath.body" />
+  <LocalizedLinkList
+    items={[
+      { href: "/docs/concepts/prefill", labelKey: "links.prefill" },
+      { href: "/docs/glossary/prefill-decode-split", labelKey: "links.prefillDecodeSplit" },
+      { href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
+      {
+        href: "/docs/glossary/autoregressive-generation",
+        labelKey: "links.autoregressiveGeneration",
+      },
+      { href: "/docs/systems/batching", labelKey: "links.batching" },
+      {
+        href: "/docs/systems/continuous-batching",
+        labelKey: "links.continuousBatching",
+      },
+      { href: "/docs/systems/memory", labelKey: "links.memory" },
+      {
+        href: "/docs/systems/speculative-decoding",
+        labelKey: "links.speculativeDecoding",
+      },
+      {
+        href: "/docs/glossary/sampling-overview",
+        labelKey: "links.samplingOverview",
+      },
+    ]}
+  />
+</Section>
+
+<Section id="related" titleKey="sections.related.title">
+  <DerivedRelatedDocs
+    registryId="concept.decode"
+    groups={[
+      "same-concept-type",
+      "shared-tags",
+      "curated-related"
+    ]}
+  />
+
+  <RelatedDocs registryId="concept.decode" />
+</Section>
+
+<Section id="tags" titleKey="sections.tags.title">
+  <TagPillList registryId="concept.decode" showDescriptions />
+</Section>
+
+<Section id="references" titleKey="sections.references.title">
+  <CitationList registryId="concept.decode" />
+</Section>
diff --git a/src/content/docs/concepts/kv-cache/page.mdx b/src/content/docs/concepts/kv-cache/page.mdx
@@ -41,7 +41,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
   <LocalizedLinkList
     items={[
       { href: "/docs/glossary/prefill", labelKey: "links.prefill" },
-      { href: "/docs/glossary/decode", labelKey: "links.decode" },
+      { href: "/docs/concepts/decode", labelKey: "links.decode" },
       {
         href: "/docs/glossary/prefill-decode-split",
         labelKey: "links.prefillDecodeSplit",

diff --git a/src/content/docs/concepts/prefill/page.mdx b/src/content/docs/concepts/prefill/page.mdx
@@ -49,7 +49,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
   <LocalizedLinkList
     items={[
       { href: "/docs/glossary/kv-cache", labelKey: "links.kvCache" },
-      { href: "/docs/glossary/decode", labelKey: "links.decode" },
+      { href: "/docs/concepts/decode", labelKey: "links.decode" },
       {
         href: "/docs/glossary/prefill-decode-split",
         labelKey: "links.prefillDecodeSplit",

diff --git a/src/content/docs/glossary/decode/page.mdx b/src/content/docs/glossary/decode/page.mdx
@@ -15,6 +15,8 @@ aliases:
   - "decoding"
   - "token-by-token generation"
   - "next-token step"
+  - "inter-token generation"
+  - "inter-token latency"
 updatedAt: "2026-06-18"
 ---
 

diff --git a/src/content/docs/glossary/kv-cache/page.mdx b/src/content/docs/glossary/kv-cache/page.mdx
@@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
   <LocalizedLinkList
     items={[
       { href: "/docs/concepts/prefill", labelKey: "links.prefill" },
-      { href: "/docs/glossary/decode", labelKey: "links.decode" },
+      { href: "/docs/concepts/decode", labelKey: "links.decode" },
     ]}
   />
 </Section>

diff --git a/src/content/docs/glossary/prefill-decode-split/page.mdx b/src/content/docs/glossary/prefill-decode-split/page.mdx
@@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
     items={[
       { href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
       { href: "/docs/concepts/prefill", labelKey: "links.prefill" },
-      { href: "/docs/glossary/decode", labelKey: "links.decode" },
+      { href: "/docs/concepts/decode", labelKey: "links.decode" },
       {
         href: "/search?q=paged%20attention",
         labelKey: "links.pagedAttention",

diff --git a/src/content/docs/glossary/prefill/page.mdx b/src/content/docs/glossary/prefill/page.mdx
@@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
   <LocalizedLinkList
     items={[
       { href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
-      { href: "/docs/glossary/decode", labelKey: "links.decode" },
+      { href: "/docs/concepts/decode", labelKey: "links.decode" },
       {
         href: "/docs/glossary/prefill-decode-split",
         labelKey: "links.prefillDecodeSplit",

diff --git a/src/content/registry/concepts/decode.json b/src/content/registry/concepts/decode.json
@@ -9,15 +9,20 @@
     "decoding",
     "token-by-token generation",
     "next-token step",
-    "inter-token generation"
+    "inter-token generation",
+    "inter-token latency"
   ],
   "tags": ["foundations", "attention", "kv-cache"],
   "relatedIds": [
     "concept.prefill",
     "concept.kv-cache",
     "concept.prefill-decode-split",
     "system.batching",
+    "system.continuous-batching",
+    "system.memory",
+    "system.speculative-decoding",
     "concept.autoregressive-generation",
+    "concept.sampling-overview",
     "module.attention",
     "module.multi-query-attention",
     "module.grouped-query-attention",
@@ -30,7 +35,7 @@
   ],
   "status": "published",
   "createdAt": "2026-06-18T00:00:00.000Z",
-  "updatedAt": "2026-06-18T00:00:00.000Z",
+  "updatedAt": "2026-06-22T00:00:00.000Z",
   "conceptType": "inference",
   "sidebarGrouping": {
     "glossary": "sequence-and-attention"

diff --git a/src/lib/content/architecture.ts b/src/lib/content/architecture.ts
@@ -11,6 +11,49 @@ export type ArchitectureEntry = {
   slug: string;
 };
 
+/** Keeps `current` unless `candidate` is the only concept page of the two. */
+function preferArchitecturePageCandidate(
+  current: DocsPageSource,
+  candidate: DocsPageSource,
+): DocsPageSource {
+  // A page that is already a concept page is never displaced.
+  if (current.frontmatter.kind === "concept") {
+    return current;
+  }
+
+  const candidateIsConcept = candidate.frontmatter.kind === "concept";
+  return candidateIsConcept ? candidate : current;
+}
+
+/**
+ * Merges pages sharing a registry id into their first-seen slot via
+ * preferArchitecturePageCandidate; pages without an id are all kept.
+ */
+function dedupeArchitecturePagesByRegistryId(
+  pages: DocsPageSource[],
+): DocsPageSource[] {
+  const result: DocsPageSource[] = [];
+  const slotByRegistryId = new Map<string, number>();
+
+  for (const page of pages) {
+    const id = page.frontmatter.registryId;
+    const slot = id ? slotByRegistryId.get(id) : undefined;
+
+    if (slot !== undefined) {
+      // Duplicate id: keep whichever page the preference rule selects.
+      result[slot] = preferArchitecturePageCandidate(result[slot], page);
+      continue;
+    }
+
+    if (id) {
+      slotByRegistryId.set(id, result.length);
+    }
+    result.push(page);
+  }
+
+  return result;
+}
+
 function isConceptRecord(
   record: ReturnType<typeof getRegistryRecord>,
 ): record is ConceptRecord {
@@ -112,5 +155,8 @@ export async function loadPublishedArchitectureEntries(
   const pages = (await loadShippedLocalizedDocsPages(locale)).filter((page) =>
     isArchitectureRelatedPage(page, indexes),
   );
-  return sortArchitectureEntriesByTitle(pages.map(toArchitectureEntry), locale);
+  return sortArchitectureEntriesByTitle(
+    dedupeArchitecturePagesByRegistryId(pages).map(toArchitectureEntry),
+    locale,
+  );
 }
diff --git a/src/lib/content/batching-system-page.test.ts b/src/lib/content/batching-system-page.test.ts
@@ -165,7 +165,7 @@ describe("batching docs route render", () => {
     expect(html).toContain("Legend:");
     expect(html).toContain("Queue requests");
     expect(html).toContain('href="/docs/concepts/prefill"');
-    expect(html).toContain('href="/docs/glossary/decode"');
+    expect(html).toContain('href="/docs/concepts/decode"');
     expect(html).toContain('href="/docs/glossary/prefill-decode-split"');
     expect(html).toContain('href="/docs/concepts/kv-cache"');
     expect(html).toContain('href="/docs/systems/continuous-batching"');

diff --git a/src/lib/content/content-paths.ts b/src/lib/content/content-paths.ts
@@ -293,6 +293,9 @@ export const HIDDEN_SIZE_GLOSSARY_PAGE_DIR = join(
 /** Prefill concept page directory. */
 export const PREFILL_CONCEPT_PAGE_DIR = join(CONCEPTS_DOCS_ROOT, "prefill");
 
+/** Decode concept page directory. */
+export const DECODE_CONCEPT_PAGE_DIR = join(CONCEPTS_DOCS_ROOT, "decode");
+
 /** Vocabulary size glossary page directory. */
 export const VOCABULARY_SIZE_GLOSSARY_PAGE_DIR = join(
   GLOSSARY_DOCS_ROOT,

diff --git a/src/lib/content/content-reconciliation-attention-tag.test.ts b/src/lib/content/content-reconciliation-attention-tag.test.ts
@@ -97,6 +97,7 @@ describe("Phase 2/3 reconciliation attention tag landing (US-007)", () => {
     const conceptGroup = groups.find((group) => group.kind === "concept");
     expect(conceptGroup?.kindLabel).toBe("Concept");
     expect(conceptGroup?.resources.map((resource) => resource.url)).toEqual([
+      "/docs/concepts/decode",
       "/docs/concepts/kv-cache",
       "/docs/concepts/prefill",
     ]);
@@ -164,7 +165,7 @@ describe("Phase 2/3 reconciliation attention tag page render (US-007)", () => {
     expect(html).toContain("Linear Attention");
     expect(html).toContain('href="/docs/concepts/kv-cache"');
     expect(html).toContain('href="/docs/glossary/autoregressive-generation"');
-    expect(html).toContain('href="/docs/glossary/decode"');
+    expect(html).toContain('href="/docs/concepts/decode"');
     expect(html).toContain('href="/docs/glossary/kv-cache"');
     expect(html).toContain('href="/docs/concepts/prefill"');
     expect(html).toContain('href="/docs/glossary/token"');

diff --git a/src/lib/content/content-reconciliation-browse-index.test.ts b/src/lib/content/content-reconciliation-browse-index.test.ts
@@ -21,7 +21,7 @@ describe("Phase 2/3 reconciliation browse indexes (US-004)", () => {
     for (const url of [
       "/docs/concepts/transformer-architecture",
       "/docs/concepts/positional-encodings",
-      "/docs/glossary/kv-cache",
+      "/docs/concepts/kv-cache",
       "/docs/glossary/token",
     ]) {
       expect(architectureHtml).toContain(`href="${url}"`);

diff --git a/src/lib/content/continuous-batching-system-page.test.ts b/src/lib/content/continuous-batching-system-page.test.ts
@@ -171,7 +171,7 @@ describe("continuous batching docs route render", () => {
       ),
     );
 
-    expect(html).toContain('href="/docs/glossary/decode"');
+    expect(html).toContain('href="/docs/concepts/decode"');
     expect(html).toContain('href="/docs/glossary/prefill-decode-split"');
     expect(html).toContain('href="/docs/concepts/kv-cache"');
     expect(html).toContain('href="/docs/systems/batching"');