Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion src/app/docs/docs-slug-renderer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
import { createRelativeLink } from "fumadocs-ui/mdx";
import { notFound } from "next/navigation";
import { DocsAutoLinkedDescription } from "@/features/docs/components/DocsAutoLinkedDescription";
import { DocsFoldedSummary } from "@/features/docs/components/DocsFoldedSummary";
import { DocsPageBreadcrumb } from "@/features/docs/components/DocsPageBreadcrumb";
import { ModulePageProviders } from "@/features/docs/components/ModulePageProviders";
import {
Expand All @@ -17,9 +18,13 @@ import { isDocsPageShippedForLocale } from "@/lib/content/pages";
import { loadUiMessages } from "@/lib/content/ui-messages";
import { defaultLocale, type SiteLocale } from "@/lib/i18n/locale-routing";
import { localizedRouteAlternates } from "@/lib/i18n/route-locale";
import { source } from "@/lib/source";
import { getMDXComponents } from "../../../mdx-components";

/**
 * Loads the docs content source via a dynamic import of `@/lib/source`.
 *
 * @returns The `source` export of the `@/lib/source` module.
 */
async function loadDocsSource() {
  // Dynamic import: the module is only requested when this function is called.
  const docsSourceModule = await import("@/lib/source");
  return docsSourceModule.source;
}

function buildDocsPageAlternates(docsSlug: string) {
const alternates = localizedRouteAlternates({
surface: "docs-page",
Expand All @@ -46,6 +51,7 @@ async function renderLocalDocsPage(
return null;
}

const source = await loadDocsSource();
const page = source.getPage(slug);
if (!page) {
return null;
Expand Down Expand Up @@ -75,6 +81,12 @@ async function renderLocalDocsPage(
/>
<DocsTitle>{loadedPage.messages.title}</DocsTitle>
<DocsDescription>{description}</DocsDescription>
{localRef.section !== "glossary" &&
loadedPage.messages.openingSummary?.length ? (
<DocsFoldedSummary>
{loadedPage.messages.openingSummary}
</DocsFoldedSummary>
) : null}
<DocsBody>
<article data-registry-id={loadedPage.frontmatter.registryId}>
{loadedPage.content}
Expand All @@ -99,6 +111,7 @@ export async function renderDocsSlugPage(
return localPage;
}

const source = await loadDocsSource();
const page = source.getPage(slug);

if (!page) {
Expand Down Expand Up @@ -145,6 +158,7 @@ export async function buildDocsPageMetadata(
const localRef = parseLocalDocsPageRef(slug);

if (localRef) {
const source = await loadDocsSource();
const page = source.getPage(slug);
if (page && docsSlug) {
const loadedPage = await loadLocalDocsPage(localRef, locale);
Expand All @@ -156,6 +170,7 @@ export async function buildDocsPageMetadata(
}
}

const source = await loadDocsSource();
const page = source.getPage(slug);

if (!page) {
Expand Down
26 changes: 26 additions & 0 deletions src/content/docs/modules/cross-attention/assets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"computeFlow": {
"type": "attention-variant-graph",
"defaultVariantId": "cross",
"variants": [
{
"variantId": "self",
"graphId": "graph.multi-head-attention-time-pattern",
"labelKey": "assets.computeFlow.variants.self.label"
},
{
"variantId": "cross",
"graphId": "graph.cross-attention-memory-pattern",
"labelKey": "assets.computeFlow.variants.cross.label"
}
],
"webRenderer": "react-flow",
"printRenderer": "mermaid",
"altKey": "assets.computeFlow.alt",
"captionKey": "assets.computeFlow.caption"
},
"comparisonTable": {
"type": "table",
"tableId": "table.cross-attention-comparison"
}
}
199 changes: 199 additions & 0 deletions src/content/docs/modules/cross-attention/messages/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
{
"title": "Cross-Attention",
"description": "An attention pattern where queries come from one stream while keys and values come from a different memory source.",
"openingSummary": "Cross-attention is the attention pattern that lets one sequence ask questions of a different sequence or memory bank, which is why decoder layers can read encoder outputs and multimodal models can fuse text with image or audio features.",
"sections": {
"whatItIs": {
"title": "What It Is",
"body": "Cross-attention is an attention variant in which the query vectors come from one stream, but the key and value vectors come from somewhere else. The model still performs a weighted lookup, but it no longer reads only from the same sequence that produced the query."
},
"whyItExists": {
"title": "Why It Exists",
"body": "Self-attention is enough when one sequence only needs to mix information inside itself. Cross-attention solves a different problem: conditioning one stream on information stored elsewhere, such as a decoder reading an encoder memory or a text stack reading image features."
},
"howItWorks": {
"title": "How It Works",
"body": "A target stream produces queries, while a separate source stream produces keys and values. Each query scores the external memory slots, turns those scores into weights with softmax, and blends the matching value vectors into a context vector for the target stream. The critical difference from self-attention is the memory source: the target tokens ask, but a different representation answers."
},
"mathOrComputeSchema": {
"title": "Math Or Compute Schema",
"body": "The formulas below contrast self-attention with cross-attention. The weighted lookup is still scaled dot-product attention, but cross-attention changes which hidden states create the queries versus the keys and values."
},
"comparedToNearbyModules": {
"title": "Compared To Nearby Modules",
"body": "Compared with the broader attention overview, cross-attention is one specific memory-source pattern rather than the whole family. Compared with multi-head attention, it can still use multiple heads but changes where keys and values come from. Compared with causal attention, the key difference is not next-token masking: causal attention usually still reads the same growing sequence, while cross-attention reads a different memory and can appear inside a causal decoder block. Compared with bidirectional attention, cross-attention again changes the memory source rather than simply opening left and right context inside one sequence."
},
"exampleArchitectures": {
"title": "Example Architectures",
"body": "Cross-attention appears in encoder-decoder Transformers where decoder states read encoder outputs, in multimodal models where text tokens query image or audio features, and in retrieval-style systems where the active stream reads an external memory representation."
},
"limitationsAndTradeoffs": {
"title": "Limitations And Tradeoffs",
"body": "Cross-attention adds another memory interface, which means extra projections, more tensors to keep available, and more places where weak source representations can hurt the result. If the external memory is noisy or badly aligned with the target stream, the lookup can pull in the wrong evidence."
},
"whyItStillMatters": {
"title": "Why It Still Matters",
"body": "Many important model designs depend on one stream reading another without collapsing both into one shared sequence. A dedicated cross-attention page makes that bridge mechanism clear instead of leaving it implied inside larger architecture diagrams."
},
"related": {
"title": "Related"
},
"tags": {
"title": "Tags"
},
"references": {
"title": "References"
}
},
"assets": {
"computeFlow": {
"alt": "Self-attention versus cross-attention memory source comparison",
"caption": "Cross-attention keeps the query on the active target stream while keys and values stay on a separate memory source, unlike self-attention where all three come from the same sequence.",
"variants": {
"self": {
"label": "Self-attention"
},
"cross": {
"label": "Cross-attention"
}
}
},
"comparisonTable": {}
},
"tables": {
"comparison": {
"columns": {
"cross": {
"title": "Cross-Attention"
},
"attention": {
"title": "Attention Overview"
},
"mha": {
"title": "Multi-Head Attention"
},
"causal": {
"title": "Causal Attention"
},
"bidirectional": {
"title": "Bidirectional Attention"
}
},
"dimensions": {
"querySource": "Where queries come from",
"memorySource": "Where keys and values come from",
"mainUse": "Main use context"
},
"values": {
"cross": {
"querySource": "The active target stream that needs outside information",
"memorySource": "A different source sequence or memory bank",
"mainUse": "Conditioning one stream on another, such as decoder-on-encoder or text-on-image"
},
"attention": {
"querySource": "Depends on the attention pattern being discussed",
"memorySource": "May be the same sequence or a separate memory depending on the design",
"mainUse": "General weighted lookup across Transformer modules"
},
"mha": {
"querySource": "The same sequence that also produces keys and values in standard self-attention",
"memorySource": "The same sequence, split across multiple heads",
"mainUse": "Baseline dense attention inside Transformer blocks"
},
"causal": {
"querySource": "The active generated prefix up to the current token",
"memorySource": "The same growing sequence, but future positions are masked out",
"mainUse": "Autoregressive decoding where each token predicts the next one without seeing future tokens"
},
"bidirectional": {
"querySource": "The visible sequence being encoded",
"memorySource": "The same visible sequence with left and right context available",
"mainUse": "Encoder-style full-context understanding"
}
}
}
},
"math": {
"selfAttentionSchema": {
"label": "Self-attention",
"formula": "\\mathrm{Attention}(Q(X), K(X), V(X)) = \\mathrm{softmax}\\!\\left(\\frac{Q(X) K(X)^{\\top}}{\\sqrt{d_k}}\\right) V(X)",
"variableDefinitions": {
"x": {
"term": "X",
"definition": "Hidden states from the same source sequence."
},
"q": {
"term": "Q(X)",
"definition": "Query projection of X."
},
"k": {
"term": "K(X)",
"definition": "Key projection of X."
},
"v": {
"term": "V(X)",
"definition": "Value projection of X."
},
"dk": {
"term": "d_k",
"definition": "Key dimension per head."
}
}
},
"crossAttentionSchema": {
"label": "Cross-attention",
"formula": "\\mathrm{Attention}(Q(Y), K(X), V(X)) = \\mathrm{softmax}\\!\\left(\\frac{Q(Y) K(X)^{\\top}}{\\sqrt{d_k}}\\right) V(X)",
"variableDefinitions": {
"x": {
"term": "X",
"definition": "Hidden states from the external memory source."
},
"y": {
"term": "Y",
"definition": "Hidden states from the active target stream."
},
"q": {
"term": "Q(Y)",
"definition": "Query projection of Y."
},
"k": {
"term": "K(X)",
"definition": "Key projection of X."
},
"v": {
"term": "V(X)",
"definition": "Value projection of X."
},
"dk": {
"term": "d_k",
"definition": "Key dimension per head."
}
}
}
},
"graph": {
"nodes": {
"timeCurrentQuery": {
"label": "q_t"
},
"timeKv0": {
"label": "KV^X_0"
},
"timeKv1": {
"label": "KV^X_1"
},
"timeKvEllipsis": {
"label": "\\cdots"
},
"timeKvT3": {
"label": "KV^X_{s-3}"
},
"timeKvT2": {
"label": "KV^X_{s-2}"
},
"timeKvT1": {
"label": "KV^X_{s-1}"
}
}
}
}
85 changes: 85 additions & 0 deletions src/content/docs/modules/cross-attention/page.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
title: Cross-Attention
description: How cross-attention lets one sequence query a different memory source instead of reading only from itself.
kind: "module"
registryId: "module.cross-attention"
messageNamespace: "local"
assetNamespace: "local"
status: "published"
tags:
- attention
aliases:
- "cross attention"
- "cross-attention"
- "encoder-decoder attention"
updatedAt: "2026-06-21"
---

import { CitationList } from "@/features/docs/components/CitationList";
import { RelatedDocs } from "@/features/docs/components/RelatedDocs";
import { Section } from "@/features/docs/components/Section";
import { T } from "@/features/docs/components/T";
import { TagPillList } from "@/features/docs/components/TagPillList";
import { ModuleComparisonTable } from "@/features/models/components/ModuleComparisonTable";
import { ModuleGraph } from "@/features/models/components/ModuleGraph";
import { ModuleAtAGlance } from "@/features/models/components/ModuleAtAGlance";
import { ModuleAttentionSchemaComparison } from "@/features/models/components/ModuleAttentionSchemaComparison";
import { ModelsUsingModule } from "@/features/models/components/ModelsUsingModule";


<ModuleAtAGlance registryId="module.cross-attention" />

<Section id="what-it-is" titleKey="sections.whatItIs.title">
<T k="sections.whatItIs.body" />
</Section>

<Section id="why-it-exists" titleKey="sections.whyItExists.title">
<T k="sections.whyItExists.body" />
</Section>

<Section id="how-it-works" titleKey="sections.howItWorks.title">
<T k="sections.howItWorks.body" />

<ModuleGraph registryId="module.cross-attention" assetId="computeFlow" />
</Section>

<Section id="math-or-compute-schema" titleKey="sections.mathOrComputeSchema.title">
<T k="sections.mathOrComputeSchema.body" />
<ModuleAttentionSchemaComparison
schemaIds={["selfAttention", "crossAttention"]}
/>
</Section>

<Section id="compared-to-nearby-modules" titleKey="sections.comparedToNearbyModules.title">
<T k="sections.comparedToNearbyModules.body" />

<ModuleComparisonTable
registryId="module.cross-attention"
assetId="comparisonTable"
/>
</Section>

<Section id="example-architectures" titleKey="sections.exampleArchitectures.title">
<T k="sections.exampleArchitectures.body" />
<ModelsUsingModule registryId="module.cross-attention" />
</Section>

<Section id="limitations-and-tradeoffs" titleKey="sections.limitationsAndTradeoffs.title">
<T k="sections.limitationsAndTradeoffs.body" />
</Section>

<Section id="why-it-still-matters" titleKey="sections.whyItStillMatters.title">
<T k="sections.whyItStillMatters.body" />
</Section>

<Section id="related" titleKey="sections.related.title">
<RelatedDocs registryId="module.cross-attention" />
</Section>

<Section id="tags" titleKey="sections.tags.title">
<TagPillList registryId="module.cross-attention" showDescriptions />
</Section>

<Section id="references" titleKey="sections.references.title">
<CitationList registryId="module.cross-attention" />
</Section>
Loading
Loading