Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/content/docs/concepts/decode/assets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
47 changes: 47 additions & 0 deletions src/content/docs/concepts/decode/messages/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"title": "Decode",
"description": "The repeated next-token stage that reuses saved prefix state, extends the KV cache one position at a time, and makes inter-token latency the main reader-visible cost after prefill.",
"openingSummary": "After prefill has processed the prompt, decode is the repeated next-token loop that keeps reusing saved key-value state, adds one new position at a time, and usually turns inter-token latency plus cache growth into the dominant feel of a long response.",
"sections": {
"whatItIs": {
"title": "What It Is",
"body": "Decode is the stage of autoregressive generation that starts after prefill has already read the prompt and built the first key-value cache state. Each turn computes the next token from the current prefix, appends that token, writes one more cache position, and then repeats until generation stops. Each turn can process just one position because the saved cache already holds the keys and values for every earlier position, so the model only has to process the newest step instead of rereading the whole prefix on every turn."
},
"whyItMatters": {
"title": "Why It Matters",
"body": "Decode often controls the feel of a streamed answer because readers experience it as one token arriving after another. The model no longer has to reread the whole prompt from scratch, but each step still depends on moving through a growing cache and running another forward pass for the newest position. That is why inter-token latency, cache growth, memory bandwidth, and serving density show up so often in decode discussions."
},
"simpleExample": {
"title": "Simple Example",
"body": "Suppose a prompt has already been processed and the model is about to generate a 50-token reply. Decode produces token 1, extends the prefix, updates the cache for that new position, then repeats for token 2, token 3, and the rest of the answer. Each turn is smaller than the prompt pass, but dozens of repeated turns can dominate the total user-visible wait."
},
"commonConfusions": {
"title": "Common Confusions",
"body": "Decode is not the same as the decoder module. The decoder is the model stack; decode is the runtime loop that calls that stack again and again during generation. Decode is also not the same as prefill: prefill reads the existing prompt once, while decode handles one newly generated token at a time. Finally, decode is not the same as sampling. Decode produces the next-step hidden state and logits, while sampling decides how one token is chosen from those scores."
},
"servingPath": {
"title": "Serving Path",
"body": "Use the nearby pages below to trace what decode depends on, what it changes in the serving stack, and where to go next if you want either the systems view or the token-choice view."
},
"related": {
"title": "Related Concepts And Systems"
},
"tags": {
"title": "Tags"
},
"references": {
"title": "References"
}
},
"links": {
"prefill": "Prefill",
"prefillDecodeSplit": "Prefill/decode split",
"kvCache": "KV cache",
"autoregressiveGeneration": "Autoregressive generation",
"batching": "Batching",
"continuousBatching": "Continuous batching",
"memory": "Memory",
"speculativeDecoding": "Speculative decoding",
"samplingOverview": "Sampling overview"
}
}
95 changes: 95 additions & 0 deletions src/content/docs/concepts/decode/page.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
---
title: Decode
description: The repeated next-token stage that reuses saved prefix state, extends the KV cache one position at a time, and makes inter-token latency the main reader-visible cost after prefill.
kind: "concept"
registryId: "concept.decode"
messageNamespace: "local"
assetNamespace: "local"
status: "published"
tags:
- foundations
- attention
- kv-cache
aliases:
- "Decode"
- "decoding"
- "token-by-token generation"
- "next-token step"
- "inter-token generation"
- "inter-token latency"
updatedAt: "2026-06-22"
---

import { CitationList } from "@/features/docs/components/CitationList";
import { DerivedRelatedDocs } from "@/features/docs/components/DerivedRelatedDocs";
import { LocalizedLinkList } from "@/features/docs/components/LocalizedLinkList";
import { RelatedDocs } from "@/features/docs/components/RelatedDocs";
import { Section } from "@/features/docs/components/Section";
import { T } from "@/features/docs/components/T";
import { TagPillList } from "@/features/docs/components/TagPillList";

<Section id="what-it-is" titleKey="sections.whatItIs.title">
<T k="sections.whatItIs.body" />
</Section>

<Section id="why-it-matters" titleKey="sections.whyItMatters.title">
<T k="sections.whyItMatters.body" />
</Section>

<Section id="simple-example" titleKey="sections.simpleExample.title">
<T k="sections.simpleExample.body" />
</Section>

<Section id="common-confusions" titleKey="sections.commonConfusions.title">
<T k="sections.commonConfusions.body" />
</Section>

<Section id="serving-path" titleKey="sections.servingPath.title">
<T k="sections.servingPath.body" />
<LocalizedLinkList
items={[
{ href: "/docs/concepts/prefill", labelKey: "links.prefill" },
{ href: "/docs/glossary/prefill-decode-split", labelKey: "links.prefillDecodeSplit" },
{ href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
{
href: "/docs/glossary/autoregressive-generation",
labelKey: "links.autoregressiveGeneration",
},
{ href: "/docs/systems/batching", labelKey: "links.batching" },
{
href: "/docs/systems/continuous-batching",
labelKey: "links.continuousBatching",
},
{ href: "/docs/systems/memory", labelKey: "links.memory" },
{
href: "/docs/systems/speculative-decoding",
labelKey: "links.speculativeDecoding",
},
{
href: "/docs/glossary/sampling-overview",
labelKey: "links.samplingOverview",
},
]}
/>
</Section>

<Section id="related" titleKey="sections.related.title">
<DerivedRelatedDocs
registryId="concept.decode"
groups={[
"same-concept-type",
"shared-tags",
"curated-related"
]}
/>

<RelatedDocs registryId="concept.decode" />
</Section>

<Section id="tags" titleKey="sections.tags.title">
<TagPillList registryId="concept.decode" showDescriptions />
</Section>

<Section id="references" titleKey="sections.references.title">
<CitationList registryId="concept.decode" />
</Section>
2 changes: 1 addition & 1 deletion src/content/docs/concepts/kv-cache/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
<LocalizedLinkList
items={[
{ href: "/docs/glossary/prefill", labelKey: "links.prefill" },
{ href: "/docs/glossary/decode", labelKey: "links.decode" },
{ href: "/docs/concepts/decode", labelKey: "links.decode" },
{
href: "/docs/glossary/prefill-decode-split",
labelKey: "links.prefillDecodeSplit",
Expand Down
2 changes: 1 addition & 1 deletion src/content/docs/concepts/prefill/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
<LocalizedLinkList
items={[
{ href: "/docs/glossary/kv-cache", labelKey: "links.kvCache" },
{ href: "/docs/glossary/decode", labelKey: "links.decode" },
{ href: "/docs/concepts/decode", labelKey: "links.decode" },
{
href: "/docs/glossary/prefill-decode-split",
labelKey: "links.prefillDecodeSplit",
Expand Down
2 changes: 2 additions & 0 deletions src/content/docs/glossary/decode/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ aliases:
- "decoding"
- "token-by-token generation"
- "next-token step"
- "inter-token generation"
- "inter-token latency"
updatedAt: "2026-06-18"
---

Expand Down
2 changes: 1 addition & 1 deletion src/content/docs/glossary/kv-cache/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
<LocalizedLinkList
items={[
{ href: "/docs/concepts/prefill", labelKey: "links.prefill" },
{ href: "/docs/glossary/decode", labelKey: "links.decode" },
{ href: "/docs/concepts/decode", labelKey: "links.decode" },
]}
/>
</Section>
Expand Down
2 changes: 1 addition & 1 deletion src/content/docs/glossary/prefill-decode-split/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
items={[
{ href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
{ href: "/docs/concepts/prefill", labelKey: "links.prefill" },
{ href: "/docs/glossary/decode", labelKey: "links.decode" },
{ href: "/docs/concepts/decode", labelKey: "links.decode" },
{
href: "/search?q=paged%20attention",
labelKey: "links.pagedAttention",
Expand Down
2 changes: 1 addition & 1 deletion src/content/docs/glossary/prefill/page.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ import { TagPillList } from "@/features/docs/components/TagPillList";
<LocalizedLinkList
items={[
{ href: "/docs/concepts/kv-cache", labelKey: "links.kvCache" },
{ href: "/docs/glossary/decode", labelKey: "links.decode" },
{ href: "/docs/concepts/decode", labelKey: "links.decode" },
{
href: "/docs/glossary/prefill-decode-split",
labelKey: "links.prefillDecodeSplit",
Expand Down
9 changes: 7 additions & 2 deletions src/content/registry/concepts/decode.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,20 @@
"decoding",
"token-by-token generation",
"next-token step",
"inter-token generation"
"inter-token generation",
"inter-token latency"
],
"tags": ["foundations", "attention", "kv-cache"],
"relatedIds": [
"concept.prefill",
"concept.kv-cache",
"concept.prefill-decode-split",
"system.batching",
"system.continuous-batching",
"system.memory",
"system.speculative-decoding",
"concept.autoregressive-generation",
"concept.sampling-overview",
"module.attention",
"module.multi-query-attention",
"module.grouped-query-attention",
Expand All @@ -30,7 +35,7 @@
],
"status": "published",
"createdAt": "2026-06-18T00:00:00.000Z",
"updatedAt": "2026-06-18T00:00:00.000Z",
"updatedAt": "2026-06-22T00:00:00.000Z",
"conceptType": "inference",
"sidebarGrouping": {
"glossary": "sequence-and-attention"
Expand Down
48 changes: 47 additions & 1 deletion src/lib/content/architecture.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,49 @@ export type ArchitectureEntry = {
slug: string;
};

/**
 * Chooses which of two pages to keep when both compete for the same slot.
 *
 * The candidate wins only when its frontmatter kind is "concept" and the
 * current page's kind is not. In every other combination the current page
 * is returned unchanged.
 *
 * @param current - The page already holding the slot.
 * @param candidate - The later page competing for the slot.
 * @returns The page that should occupy the slot.
 */
function preferArchitecturePageCandidate(
  current: DocsPageSource,
  candidate: DocsPageSource,
): DocsPageSource {
  // A concept page that already holds the slot is never displaced.
  if (current.frontmatter.kind === "concept") {
    return current;
  }

  return candidate.frontmatter.kind === "concept" ? candidate : current;
}

/**
 * Collapses pages that share a `frontmatter.registryId` into a single entry.
 *
 * The first page seen for a registry id fixes that id's position in the
 * output. Later pages with the same id never move the slot; they only
 * replace its contents when `preferArchitecturePageCandidate` selects them.
 * Pages whose registry id is falsy are always kept, in input order.
 *
 * @param pages - Pages to dedupe; the array itself is not modified.
 * @returns A new array with at most one page per registry id.
 */
function dedupeArchitecturePagesByRegistryId(
  pages: DocsPageSource[],
): DocsPageSource[] {
  const slotByRegistryId = new Map<string, number>();
  const result: DocsPageSource[] = [];

  pages.forEach((page) => {
    const id = page.frontmatter.registryId;
    const slot = id ? slotByRegistryId.get(id) : undefined;

    if (slot !== undefined) {
      // Duplicate registry id: keep the slot position, maybe swap the page.
      result[slot] = preferArchitecturePageCandidate(result[slot], page);
      return;
    }

    if (id) {
      // First sighting of this id: remember where it lands in the output.
      slotByRegistryId.set(id, result.length);
    }
    result.push(page);
  });

  return result;
}

function isConceptRecord(
record: ReturnType<typeof getRegistryRecord>,
): record is ConceptRecord {
Expand Down Expand Up @@ -112,5 +155,8 @@ export async function loadPublishedArchitectureEntries(
const pages = (await loadShippedLocalizedDocsPages(locale)).filter((page) =>
isArchitectureRelatedPage(page, indexes),
);
return sortArchitectureEntriesByTitle(pages.map(toArchitectureEntry), locale);
return sortArchitectureEntriesByTitle(
dedupeArchitecturePagesByRegistryId(pages).map(toArchitectureEntry),
locale,
);
}
2 changes: 1 addition & 1 deletion src/lib/content/batching-system-page.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ describe("batching docs route render", () => {
expect(html).toContain("Legend:");
expect(html).toContain("Queue requests");
expect(html).toContain('href="/docs/concepts/prefill"');
expect(html).toContain('href="/docs/glossary/decode"');
expect(html).toContain('href="/docs/concepts/decode"');
expect(html).toContain('href="/docs/glossary/prefill-decode-split"');
expect(html).toContain('href="/docs/concepts/kv-cache"');
expect(html).toContain('href="/docs/systems/continuous-batching"');
Expand Down
3 changes: 3 additions & 0 deletions src/lib/content/content-paths.ts
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,9 @@ export const HIDDEN_SIZE_GLOSSARY_PAGE_DIR = join(
/** Prefill concept page directory. */
export const PREFILL_CONCEPT_PAGE_DIR = join(CONCEPTS_DOCS_ROOT, "prefill");

/** Decode concept page directory. */
export const DECODE_CONCEPT_PAGE_DIR = join(CONCEPTS_DOCS_ROOT, "decode");

/** Vocabulary size glossary page directory. */
export const VOCABULARY_SIZE_GLOSSARY_PAGE_DIR = join(
GLOSSARY_DOCS_ROOT,
Expand Down
3 changes: 2 additions & 1 deletion src/lib/content/content-reconciliation-attention-tag.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ describe("Phase 2/3 reconciliation attention tag landing (US-007)", () => {
const conceptGroup = groups.find((group) => group.kind === "concept");
expect(conceptGroup?.kindLabel).toBe("Concept");
expect(conceptGroup?.resources.map((resource) => resource.url)).toEqual([
"/docs/concepts/decode",
"/docs/concepts/kv-cache",
"/docs/concepts/prefill",
]);
Expand Down Expand Up @@ -164,7 +165,7 @@ describe("Phase 2/3 reconciliation attention tag page render (US-007)", () => {
expect(html).toContain("Linear Attention");
expect(html).toContain('href="/docs/concepts/kv-cache"');
expect(html).toContain('href="/docs/glossary/autoregressive-generation"');
expect(html).toContain('href="/docs/glossary/decode"');
expect(html).toContain('href="/docs/concepts/decode"');
expect(html).toContain('href="/docs/glossary/kv-cache"');
expect(html).toContain('href="/docs/concepts/prefill"');
expect(html).toContain('href="/docs/glossary/token"');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ describe("Phase 2/3 reconciliation browse indexes (US-004)", () => {
for (const url of [
"/docs/concepts/transformer-architecture",
"/docs/concepts/positional-encodings",
"/docs/glossary/kv-cache",
"/docs/concepts/kv-cache",
"/docs/glossary/token",
]) {
expect(architectureHtml).toContain(`href="${url}"`);
Expand Down
2 changes: 1 addition & 1 deletion src/lib/content/continuous-batching-system-page.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ describe("continuous batching docs route render", () => {
),
);

expect(html).toContain('href="/docs/glossary/decode"');
expect(html).toContain('href="/docs/concepts/decode"');
expect(html).toContain('href="/docs/glossary/prefill-decode-split"');
expect(html).toContain('href="/docs/concepts/kv-cache"');
expect(html).toContain('href="/docs/systems/batching"');
Expand Down
Loading
Loading