diff --git a/acestep/streaming/config.py b/acestep/streaming/config.py
index 65fadf56..c66ab167 100644
--- a/acestep/streaming/config.py
+++ b/acestep/streaming/config.py
@@ -52,6 +52,14 @@ class SessionConfig:
fixture_name: str | None = None
use_server_fixture: bool = False
stem_source_mode: str | None = None
+ # Text-to-music mode: no input audio at all. The server synthesizes a
+ # silent source of ``text2music_duration_s`` seconds and conditions the
+ # diffusion purely on the prompt (canonical silence latent = the
+ # model's trained "no reference" signal), so generation is text-only.
+ # The client sends NO binary PCM frame during the handshake when this
+ # is set (mirrors ``use_server_fixture``).
+ text2music: bool = False
+ text2music_duration_s: float = 60.0
enabled_loras: list = field(default_factory=list)
lora_strengths: dict = field(default_factory=dict)
lora_paths: list = field(default_factory=list)
diff --git a/acestep/streaming/session.py b/acestep/streaming/session.py
index 0814b207..14333203 100644
--- a/acestep/streaming/session.py
+++ b/acestep/streaming/session.py
@@ -114,6 +114,7 @@
_resolve_bpm_key_source,
_try_load_sidecar,
SAMPLE_RATE,
+ resolve_text2music_source,
)
from acestep.streaming.state import SessionState
from acestep.streaming.stems import (
@@ -935,11 +936,19 @@ def _apply_swap_if_pending(self) -> None:
tags = state.swap_pending.get("tags")
requested_key = state.swap_pending.get("key")
requested_time_sig = state.swap_pending.get("time_signature")
- new_fixture_name = state.swap_pending.get("fixture_name")
- new_stem_source_mode = resolve_upload_stem_source_mode(
- new_fixture_name,
- state.swap_pending.get("stem_source_mode"),
- known_fixtures=KNOWN_FIXTURES,
+ new_text2music = bool(state.swap_pending.get("text2music"))
+ # Text-to-music swap: the placeholder waveform carries no
+ # fixture identity and the stem machinery doesn't apply.
+ new_fixture_name = (
+ None if new_text2music
+ else state.swap_pending.get("fixture_name")
+ )
+ new_stem_source_mode = None if new_text2music else (
+ resolve_upload_stem_source_mode(
+ new_fixture_name,
+ state.swap_pending.get("stem_source_mode"),
+ known_fixtures=KNOWN_FIXTURES,
+ )
)
if new_wf is None:
return
@@ -949,6 +958,7 @@ def _apply_swap_if_pending(self) -> None:
state.swap_pending["time_signature"] = None
state.swap_pending["fixture_name"] = None
state.swap_pending["stem_source_mode"] = None
+ state.swap_pending["text2music"] = False
# Initialized to None so the finally below can None-guard
# cleanly in the (rare) case an exception fires between the
@@ -1028,16 +1038,27 @@ def _apply_swap_if_pending(self) -> None:
return
new_audio_in = Audio(waveform=new_wf, sample_rate=SAMPLE_RATE)
- new_source, new_bpm, new_key, new_time_sig = (
- _resolve_bpm_key_source(
- self.session,
- audio_in=new_audio_in,
- fixture_name=new_fixture_name,
- samples=int(new_wf.shape[1]),
- key_override=requested_key,
- time_signature_override=requested_time_sig,
+ if new_text2music:
+ new_source, new_bpm, new_key, new_time_sig = (
+ resolve_text2music_source(
+ self.session, samples=int(new_wf.shape[1]),
+ )
+ )
+ if requested_key:
+ new_key = requested_key
+ if requested_time_sig:
+ new_time_sig = requested_time_sig
+ else:
+ new_source, new_bpm, new_key, new_time_sig = (
+ _resolve_bpm_key_source(
+ self.session,
+ audio_in=new_audio_in,
+ fixture_name=new_fixture_name,
+ samples=int(new_wf.shape[1]),
+ key_override=requested_key,
+ time_signature_override=requested_time_sig,
+ )
)
- )
new_upload_stems, new_stem_error, new_source, new_wf = (
extract_and_select_upload_stem(
new_wf,
@@ -1858,11 +1879,16 @@ def swap_source(
time_signature: str | None = None,
fixture_name: str | None = None,
stem_source_mode: str | None = None,
+ text2music: bool = False,
origin: CommandOrigin = CommandOrigin.PRIMARY,
) -> None:
"""Stage a source swap. The runner applies it inside
``before_tick``; publishes :class:`SwapReady` or
- :class:`SwapFailed` when the swap completes."""
+ :class:`SwapFailed` when the swap completes.
+
+ ``text2music=True`` marks ``audio`` as a synthesized silent
+ placeholder: the swap resolves the canonical-silence source
+ (prompt-only conditioning) instead of encoding the waveform."""
state = self.state
state.last_activity_ts = time.monotonic()
effective_tags = tags or state.prompt_text
@@ -1877,6 +1903,7 @@ def swap_source(
state.swap_pending["stem_source_mode"] = normalize_stem_source_mode(
stem_source_mode,
)
+ state.swap_pending["text2music"] = bool(text2music)
@requires_capability("write_audio", "write_audio")
def write_audio(
@@ -2090,8 +2117,12 @@ def create(
fast_vae = config.fast_vae
walk_window = config.walk_window
walk_window_s = config.walk_window_s
- fixture_name = config.fixture_name
- stem_source_mode = resolve_upload_stem_source_mode(
+ # Text-to-music: no input audio. The waveform is the synthesized
+ # silence placeholder; fixture / stem machinery doesn't apply
+ # (running Mel-Band RoFormer on zeros would be pure waste).
+ text2music = config.text2music
+ fixture_name = None if text2music else config.fixture_name
+ stem_source_mode = None if text2music else resolve_upload_stem_source_mode(
fixture_name,
normalize_stem_source_mode(config.stem_source_mode),
known_fixtures=KNOWN_FIXTURES,
@@ -2242,14 +2273,21 @@ def create(
audio_in = Audio(waveform=waveform, sample_rate=SAMPLE_RATE)
- source, detected_bpm, detected_key, detected_time_signature = (
- _resolve_bpm_key_source(
- engine_session,
- audio_in=audio_in,
- fixture_name=fixture_name,
- samples=int(waveform.shape[1]),
+ if text2music:
+ source, detected_bpm, detected_key, detected_time_signature = (
+ resolve_text2music_source(
+ engine_session, samples=int(waveform.shape[1]),
+ )
+ )
+ else:
+ source, detected_bpm, detected_key, detected_time_signature = (
+ _resolve_bpm_key_source(
+ engine_session,
+ audio_in=audio_in,
+ fixture_name=fixture_name,
+ samples=int(waveform.shape[1]),
+ )
)
- )
upload_stems, stem_error, source, waveform = (
extract_and_select_upload_stem(
diff --git a/acestep/streaming/source.py b/acestep/streaming/source.py
index a9ffef81..2d2f7111 100644
--- a/acestep/streaming/source.py
+++ b/acestep/streaming/source.py
@@ -133,6 +133,79 @@ def _load_clip_waveform(name: str) -> torch.Tensor:
return _load_waveform_from_path(str(resolve_audio_clip(name)))
+# ---------------------------------------------------------------------------
+# Text-to-music (no input audio)
+# ---------------------------------------------------------------------------
+
+# Conditioning defaults baked into the text encoder when there is no audio
+# to detect them from. 120 BPM / C major / 4 are the model family's
+# most-supported values; the operator can re-steer key / time signature
+# live via the ``prompt`` command.
+# ``resolve_text2music_source`` returns these three values verbatim in
+# place of detected ones. The time signature is kept as a string.
+TEXT2MUSIC_BPM = 120
+TEXT2MUSIC_KEY = "C major"
+TEXT2MUSIC_TIME_SIGNATURE = "4"
+
+# Floor for the synthesized silent source. Below ~10 s the loop seam
+# dominates the listening experience; the ceiling is the caller's TRT
+# profile cap (same clamp every uploaded source gets).
+# ``text2music_waveform`` applies the floor first and the cap second, so
+# a cap below this floor wins and the result is shorter than the floor.
+TEXT2MUSIC_MIN_DURATION_S = 10.0
+
+# Samples per latent frame at the 48 kHz / 25 fps latent rate.
+# Integer division: exact only when SAMPLE_RATE is a multiple of 25
+# (1920 samples per frame at 48 kHz).
+_SAMPLES_PER_LATENT_FRAME = SAMPLE_RATE // 25
+
+
+def text2music_waveform(duration_s: float, *, max_seconds: float) -> torch.Tensor:
+ """Synthesize the silent stereo placeholder for a text-to-music source.
+
+ The waveform only seeds the playback ring buffer (the user hears
+ silence until generated slices land) and sets the session length;
+ callers run it through the same TRT-cap / pool-alignment trim as an
+ uploaded source.
+
+ ``duration_s`` is the requested length in seconds. It is first raised
+ to ``TEXT2MUSIC_MIN_DURATION_S`` and then capped at ``max_seconds``;
+ because the cap is applied last, a ``max_seconds`` below the floor
+ wins and the result is shorter than the floor.
+
+ Returns an all-zero tensor of shape ``(2, int(dur * SAMPLE_RATE))``
+ where ``dur`` is the clamped duration. The sample count is truncated
+ toward zero, not rounded.
+
+ NOTE(review): a NaN ``duration_s`` passes through both ``max`` and
+ ``min`` unchanged and makes ``int()`` raise ``ValueError``; confirm
+ that callers validate the value before it reaches this function.
+ """
+ # Floor first, ceiling second, so the result never exceeds max_seconds.
+ dur = min(
+ max(float(duration_s), TEXT2MUSIC_MIN_DURATION_S), float(max_seconds),
+ )
+ return torch.zeros(2, int(dur * SAMPLE_RATE))
+
+
+def resolve_text2music_source(
+ session: Session, *, samples: int,
+) -> tuple[PreparedSource, int, str, str]:
+ """Text-to-music analog of :func:`_resolve_bpm_key_source`.
+
+ Both the source latent and the context latent are the CANONICAL
+ silence latent from the checkpoint (``EmptyLatent``), not a VAE
+ encode of digital zeros: ``silence_latent`` is what the model was
+ trained to read as "no reference audio" (its forward uses it to
+ simulate text2music mode), so structure conditioning is genuinely
+ absent rather than "semantic hints of an all-zero clip". This also
+ skips the VAE encode + semantic extract entirely, along with
+ librosa beat-tracking and CNN key detection (both meaningless on
+ silence — beat_track returns 0 BPM, which would poison the text
+ conditioning).
+
+ ``session`` is only read for ``session.model``. ``samples`` is the
+ source length in audio samples; it is floor-divided by
+ ``_SAMPLES_PER_LATENT_FRAME`` to get the latent frame count, so a
+ trailing partial frame is dropped.
+
+ Returns ``(source, bpm, key, time_signature)``. The last three are
+ the fixed ``TEXT2MUSIC_BPM`` / ``TEXT2MUSIC_KEY`` /
+ ``TEXT2MUSIC_TIME_SIGNATURE`` defaults, not values detected from
+ audio; any caller-side override has to be applied by the caller.
+ """
+ # NOTE(review): imported at call time rather than at module level,
+ # presumably to avoid an import cycle or an eager heavy import; confirm.
+ from acestep.nodes.vae_nodes import EmptyLatent
+
+ # Floor division: samples that do not fill a whole latent frame are ignored.
+ frames = samples // _SAMPLES_PER_LATENT_FRAME
+ latent = EmptyLatent().execute(
+ model=session.model, frames=frames,
+ )["latent"]
+ # The context latent wraps a clone of the tensor, so it does not alias
+ # the source latent's tensor object.
+ source = PreparedSource(
+ latent=latent,
+ context_latent=Latent(tensor=latent.tensor.clone()),
+ )
+ # duration_s is logged from the raw sample count, not from the
+ # frame-rounded length.
+ logger.info(
+ "text2music_source_ready frames={} duration_s={:.1f}",
+ frames, samples / SAMPLE_RATE,
+ )
+ return (
+ source,
+ TEXT2MUSIC_BPM,
+ TEXT2MUSIC_KEY,
+ TEXT2MUSIC_TIME_SIGNATURE,
+ )
+
+
_VALID_TIME_SIG_STRS = frozenset(str(s) for s in VALID_TIME_SIGNATURES)
diff --git a/acestep/streaming/state.py b/acestep/streaming/state.py
index 3e294e88..c55bf9e2 100644
--- a/acestep/streaming/state.py
+++ b/acestep/streaming/state.py
@@ -38,6 +38,7 @@ def _default_swap_pending() -> dict:
"time_signature": None,
"fixture_name": None,
"stem_source_mode": None,
+ "text2music": False,
}
diff --git a/demos/realtime_motion_graph_web/protocol.py b/demos/realtime_motion_graph_web/protocol.py
index 2a7f8e45..f52a9083 100644
--- a/demos/realtime_motion_graph_web/protocol.py
+++ b/demos/realtime_motion_graph_web/protocol.py
@@ -380,13 +380,19 @@ class EventSpec:
description="When true, the server loads the named source "
"off its own disk and NO binary frame is "
"sent."),
+ FieldSpec("text2music", "bool",
+ description="When true, the server swaps to a synthesized "
+ "silent source (text-to-music mode: generation "
+ "is conditioned on the prompt alone) and NO "
+ "binary frame is sent. fixture_name is "
+ "ignored."),
),
binary=True,
binary_optional=True,
requires="swap",
description="Replace the playback source in-flight. A binary PCM frame "
- "follows UNLESS use_server_source is set. Acked by "
- "swap_ready (+ binary buffer) / swap_failed.",
+ "follows UNLESS use_server_source or text2music is set. "
+ "Acked by swap_ready (+ binary buffer) / swap_failed.",
),
CommandSpec(
"write_audio",
diff --git a/demos/realtime_motion_graph_web/web/app/globals.css b/demos/realtime_motion_graph_web/web/app/globals.css
index 7ffb55e8..9560a501 100644
--- a/demos/realtime_motion_graph_web/web/app/globals.css
+++ b/demos/realtime_motion_graph_web/web/app/globals.css
@@ -7102,6 +7102,45 @@ body.curve-open #install-video-area #graph {
opacity: 1;
}
+/* Pinned text-to-music sleeve — prompt-only generation, no input audio.
+ Sits in the pinned region above the upload sleeve, separated from the
+ scroll list by the same thin hairline the upload sleeve uses. */
+.audio-source-sleeve--text2music {
+ flex-shrink: 0;
+ /* Positioning context for the absolutely positioned ::before hairline. */
+ position: relative;
+ margin-top: 2px;
+ padding-top: 9px;
+ color: var(--accent-hover);
+ text-transform: uppercase;
+ letter-spacing: var(--tracking-wide);
+ font-size: 10px;
+}
+/* Hairline: a 1px rule along the top edge, inset 8px from each side,
+ fading to transparent over the outer quarter at both ends. */
+.audio-source-sleeve--text2music::before {
+ content: "";
+ position: absolute;
+ top: 0;
+ left: 8px;
+ right: 8px;
+ height: 1px;
+ background: linear-gradient(90deg,
+ transparent,
+ var(--frame-line) 25%,
+ var(--frame-line) 75%,
+ transparent);
+}
+/* Hover adds the accent background; the text colour is restated with the
+ same value as the resting state. NOTE(review): presumably restated so a
+ generic sleeve :hover colour cannot override it; confirm. */
+.audio-source-sleeve--text2music:hover {
+ background: var(--accent-medium);
+ color: var(--accent-hover);
+}
+/* Icon slot for the text-to-music sleeve: borderless 14px box at full
+ opacity, tinted with the accent colour. */
+.audio-source-sleeve-art--text2music {
+ border: none;
+ color: var(--accent);
+ width: 14px;
+ height: 14px;
+ opacity: 1;
+}
+/* Hide the ::after pseudo-element. NOTE(review): presumably a decoration
+ supplied by the base sleeve-art rule; confirm. */
+.audio-source-sleeve-art--text2music::after { display: none; }
+
/* Pinned upload sleeve — always rendered, separated from the scroll list
above by a thin hairline so it reads as a distinct "you" slot. */
.audio-source-sleeve--upload {
diff --git a/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx b/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx
index e0ed79d0..9c66703f 100644
--- a/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx
+++ b/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx
@@ -16,6 +16,11 @@ import { useUploadOnboardingHint } from "@/hooks/useUploadOnboardingHint";
import { trimAudioBuffer } from "@/lib/audio/trimAudioBuffer";
import { useConfig } from "@/lib/config";
import { LOCAL_MODE } from "@/lib/runtime";
+import {
+ TEXT2MUSIC_LABEL,
+ TEXT2MUSIC_SOURCE,
+ isText2Music,
+} from "@/lib/text2music";
import { useCustomTracksStore } from "@/store/useCustomTracksStore";
import { usePerformanceStore } from "@/store/usePerformanceStore";
import { useSessionStore } from "@/store/useSessionStore";
@@ -85,6 +90,28 @@ function MicIcon({ size = 14 }: { size?: number }) {
);
}
+function TextPromptIcon({ size = 14 }: { size?: number }) {
+ return (
+
+ );
+}
+
function NoteIcon({ size = 16 }: { size?: number }) {
return (