daydreamlive · leszko · Jun 12, 2026
diff --git a/acestep/streaming/config.py b/acestep/streaming/config.py
@@ -52,6 +52,14 @@ class SessionConfig:
     fixture_name: str | None = None
     use_server_fixture: bool = False
     stem_source_mode: str | None = None
+    # Text-to-music mode: no input audio at all. The server synthesizes a
+    # silent source of ``text2music_duration_s`` seconds and conditions the
+    # diffusion purely on the prompt (canonical silence latent = the
+    # model's trained "no reference" signal), so generation is text-only.
+    # The client sends NO binary PCM frame during the handshake when this
+    # is set (mirrors ``use_server_fixture``).
+    text2music: bool = False
+    text2music_duration_s: float = 60.0
     enabled_loras: list = field(default_factory=list)
     lora_strengths: dict = field(default_factory=dict)
     lora_paths: list = field(default_factory=list)

diff --git a/acestep/streaming/session.py b/acestep/streaming/session.py
@@ -114,6 +114,7 @@
     _resolve_bpm_key_source,
     _try_load_sidecar,
     SAMPLE_RATE,
+    resolve_text2music_source,
 )
 from acestep.streaming.state import SessionState
 from acestep.streaming.stems import (
@@ -935,11 +936,19 @@ def _apply_swap_if_pending(self) -> None:
             tags = state.swap_pending.get("tags")
             requested_key = state.swap_pending.get("key")
             requested_time_sig = state.swap_pending.get("time_signature")
-            new_fixture_name = state.swap_pending.get("fixture_name")
-            new_stem_source_mode = resolve_upload_stem_source_mode(
-                new_fixture_name,
-                state.swap_pending.get("stem_source_mode"),
-                known_fixtures=KNOWN_FIXTURES,
+            new_text2music = bool(state.swap_pending.get("text2music"))
+            # Text-to-music swap: the placeholder waveform carries no
+            # fixture identity and the stem machinery doesn't apply.
+            new_fixture_name = (
+                None if new_text2music
+                else state.swap_pending.get("fixture_name")
+            )
+            new_stem_source_mode = None if new_text2music else (
+                resolve_upload_stem_source_mode(
+                    new_fixture_name,
+                    state.swap_pending.get("stem_source_mode"),
+                    known_fixtures=KNOWN_FIXTURES,
+                )
             )
             if new_wf is None:
                 return
@@ -949,6 +958,7 @@ def _apply_swap_if_pending(self) -> None:
             state.swap_pending["time_signature"] = None
             state.swap_pending["fixture_name"] = None
             state.swap_pending["stem_source_mode"] = None
+            state.swap_pending["text2music"] = False
 
         # Initialized to None so the finally below can None-guard
         # cleanly in the (rare) case an exception fires between the
@@ -1028,16 +1038,27 @@ def _apply_swap_if_pending(self) -> None:
                     return
 
             new_audio_in = Audio(waveform=new_wf, sample_rate=SAMPLE_RATE)
-            new_source, new_bpm, new_key, new_time_sig = (
-                _resolve_bpm_key_source(
-                    self.session,
-                    audio_in=new_audio_in,
-                    fixture_name=new_fixture_name,
-                    samples=int(new_wf.shape[1]),
-                    key_override=requested_key,
-                    time_signature_override=requested_time_sig,
+            if new_text2music:
+                new_source, new_bpm, new_key, new_time_sig = (
+                    resolve_text2music_source(
+                        self.session, samples=int(new_wf.shape[1]),
+                    )
+                )
+                if requested_key:
+                    new_key = requested_key
+                if requested_time_sig:
+                    new_time_sig = requested_time_sig
+            else:
+                new_source, new_bpm, new_key, new_time_sig = (
+                    _resolve_bpm_key_source(
+                        self.session,
+                        audio_in=new_audio_in,
+                        fixture_name=new_fixture_name,
+                        samples=int(new_wf.shape[1]),
+                        key_override=requested_key,
+                        time_signature_override=requested_time_sig,
+                    )
                 )
-            )
             new_upload_stems, new_stem_error, new_source, new_wf = (
                 extract_and_select_upload_stem(
                     new_wf,
@@ -1858,11 +1879,16 @@ def swap_source(
         time_signature: str | None = None,
         fixture_name: str | None = None,
         stem_source_mode: str | None = None,
+        text2music: bool = False,
         origin: CommandOrigin = CommandOrigin.PRIMARY,
     ) -> None:
         """Stage a source swap. The runner applies it inside
         ``before_tick``; publishes :class:`SwapReady` or
-        :class:`SwapFailed` when the swap completes."""
+        :class:`SwapFailed` when the swap completes.
+
+        ``text2music=True`` marks ``audio`` as a synthesized silent
+        placeholder: the swap resolves the canonical-silence source
+        (prompt-only conditioning) instead of encoding the waveform."""
         state = self.state
         state.last_activity_ts = time.monotonic()
         effective_tags = tags or state.prompt_text
@@ -1877,6 +1903,7 @@ def swap_source(
             state.swap_pending["stem_source_mode"] = normalize_stem_source_mode(
                 stem_source_mode,
             )
+            state.swap_pending["text2music"] = bool(text2music)
 
     @requires_capability("write_audio", "write_audio")
     def write_audio(
@@ -2090,8 +2117,12 @@ def create(
         fast_vae = config.fast_vae
         walk_window = config.walk_window
         walk_window_s = config.walk_window_s
-        fixture_name = config.fixture_name
-        stem_source_mode = resolve_upload_stem_source_mode(
+        # Text-to-music: no input audio. The waveform is the synthesized
+        # silence placeholder; fixture / stem machinery doesn't apply
+        # (running Mel-Band RoFormer on zeros would be pure waste).
+        text2music = config.text2music
+        fixture_name = None if text2music else config.fixture_name
+        stem_source_mode = None if text2music else resolve_upload_stem_source_mode(
             fixture_name,
             normalize_stem_source_mode(config.stem_source_mode),
             known_fixtures=KNOWN_FIXTURES,
@@ -2242,14 +2273,21 @@ def create(
 
             audio_in = Audio(waveform=waveform, sample_rate=SAMPLE_RATE)
 
-            source, detected_bpm, detected_key, detected_time_signature = (
-                _resolve_bpm_key_source(
-                    engine_session,
-                    audio_in=audio_in,
-                    fixture_name=fixture_name,
-                    samples=int(waveform.shape[1]),
+            if text2music:
+                source, detected_bpm, detected_key, detected_time_signature = (
+                    resolve_text2music_source(
+                        engine_session, samples=int(waveform.shape[1]),
+                    )
+                )
+            else:
+                source, detected_bpm, detected_key, detected_time_signature = (
+                    _resolve_bpm_key_source(
+                        engine_session,
+                        audio_in=audio_in,
+                        fixture_name=fixture_name,
+                        samples=int(waveform.shape[1]),
+                    )
                 )
-            )
 
             upload_stems, stem_error, source, waveform = (
                 extract_and_select_upload_stem(

diff --git a/acestep/streaming/source.py b/acestep/streaming/source.py
@@ -133,6 +133,79 @@ def _load_clip_waveform(name: str) -> torch.Tensor:
     return _load_waveform_from_path(str(resolve_audio_clip(name)))
 
 
+# ---------------------------------------------------------------------------
+# Text-to-music (no input audio)
+# ---------------------------------------------------------------------------
+
+# Conditioning defaults returned by ``resolve_text2music_source`` when there
+# is no audio to detect them from. 120 BPM / C major / time signature "4"
+# are the model family's most-supported values; the operator can re-steer
+# key / time signature live via the ``prompt`` command.
+TEXT2MUSIC_BPM = 120
+TEXT2MUSIC_KEY = "C major"
+TEXT2MUSIC_TIME_SIGNATURE = "4"
+
+# Floor (seconds) for the synthesized silent source: below ~10 s the loop seam
+# dominates the listening experience. The ceiling is the caller's TRT profile
+# cap (same clamp every uploaded source gets), which wins if below the floor.
+TEXT2MUSIC_MIN_DURATION_S = 10.0
+
+# Audio samples per latent frame (25 latent frames/s; 1920 at 48 kHz).
+_SAMPLES_PER_LATENT_FRAME = SAMPLE_RATE // 25
+
+
+def text2music_waveform(duration_s: float, *, max_seconds: float) -> torch.Tensor:
+    """Build the silent stereo stand-in waveform for a text-to-music source.
+
+    ``duration_s`` is floored at ``TEXT2MUSIC_MIN_DURATION_S`` and then capped
+    at ``max_seconds`` (the cap wins when it is below the floor). Callers are
+    expected to apply the same TRT-cap / pool-alignment trim as for an upload.
+    """
+    floored_s = max(float(duration_s), TEXT2MUSIC_MIN_DURATION_S)
+    # Cap last so ``max_seconds`` always bounds the result.
+    clamped_s = min(floored_s, float(max_seconds))
+    n_samples = int(clamped_s * SAMPLE_RATE)
+    return torch.zeros(2, n_samples)
+
+
+def resolve_text2music_source(
+    session: Session, *, samples: int,
+) -> tuple[PreparedSource, int, str, str]:
+    """Prepare the source for a prompt-only (no input audio) session.
+
+    Counterpart of :func:`_resolve_bpm_key_source`; returns
+    ``(source, bpm, key, time_signature)``. The source latent is the
+    CANONICAL silence latent from the checkpoint (``EmptyLatent``),
+    which the model was trained to read as "no reference audio",
+    rather than a VAE encode of digital zeros, so structure
+    conditioning is genuinely absent; the context latent is a clone of
+    it. The VAE encode, semantic extract, librosa beat tracking and CNN
+    key detection are all skipped (meaningless on silence: beat_track
+    returns 0 BPM, which would poison the text conditioning) and the
+    fixed ``TEXT2MUSIC_*`` defaults are returned in their place.
+    ``samples`` is floored to a whole number of latent frames.
+    """
+    from acestep.nodes.vae_nodes import EmptyLatent
+
+    # Integer division: a trailing partial frame is dropped.
+    n_frames = samples // _SAMPLES_PER_LATENT_FRAME
+    node_out = EmptyLatent().execute(model=session.model, frames=n_frames)
+    silence = node_out["latent"]
+    # Clone so the context latent gets its own copy of the tensor.
+    context = Latent(tensor=silence.tensor.clone())
+    prepared = PreparedSource(latent=silence, context_latent=context)
+
+    logger.info(
+        "text2music_source_ready frames={} duration_s={:.1f}",
+        n_frames,
+        samples / SAMPLE_RATE,
+    )
+
+    # Fixed defaults stand in for detected BPM / key / time signature.
+    defaults = (TEXT2MUSIC_BPM, TEXT2MUSIC_KEY, TEXT2MUSIC_TIME_SIGNATURE)
+    return (prepared, *defaults)
+
+
 _VALID_TIME_SIG_STRS = frozenset(str(s) for s in VALID_TIME_SIGNATURES)
 
 

diff --git a/acestep/streaming/state.py b/acestep/streaming/state.py
@@ -38,6 +38,7 @@ def _default_swap_pending() -> dict:
         "time_signature": None,
         "fixture_name": None,
         "stem_source_mode": None,
+        "text2music": False,
     }
 
 

diff --git a/demos/realtime_motion_graph_web/protocol.py b/demos/realtime_motion_graph_web/protocol.py
@@ -380,13 +380,19 @@ class EventSpec:
                       description="When true, the server loads the named source "
                                   "off its own disk and NO binary frame is "
                                   "sent."),
+            FieldSpec("text2music", "bool",
+                      description="When true, the server swaps to a synthesized "
+                                  "silent source (text-to-music mode: generation "
+                                  "is conditioned on the prompt alone) and NO "
+                                  "binary frame is sent. fixture_name is "
+                                  "ignored."),
         ),
         binary=True,
         binary_optional=True,
         requires="swap",
         description="Replace the playback source in-flight. A binary PCM frame "
-                    "follows UNLESS use_server_source is set. Acked by "
-                    "swap_ready (+ binary buffer) / swap_failed.",
+                    "follows UNLESS use_server_source or text2music is set. "
+                    "Acked by swap_ready (+ binary buffer) / swap_failed.",
     ),
     CommandSpec(
         "write_audio",

diff --git a/demos/realtime_motion_graph_web/web/app/globals.css b/demos/realtime_motion_graph_web/web/app/globals.css
@@ -7102,6 +7102,45 @@ body.curve-open #install-video-area #graph {
   opacity: 1;
 }
 
+/* Pinned text-to-music sleeve — prompt-only generation, no input audio.
+   Sits in the pinned region above the upload sleeve, separated from the
+   scroll list by the same hairline treatment (the ::before rule below). */
+.audio-source-sleeve--text2music {
+  flex-shrink: 0; /* do not shrink when laid out as a flex item */
+  position: relative; /* containing block for the absolute ::before */
+  margin-top: 2px;
+  padding-top: 9px;
+  color: var(--accent-hover);
+  text-transform: uppercase;
+  letter-spacing: var(--tracking-wide);
+  font-size: 10px;
+}
+.audio-source-sleeve--text2music::before { /* 1px top hairline, inset 8px, faded at both ends */
+  content: "";
+  position: absolute;
+  top: 0;
+  left: 8px;
+  right: 8px;
+  height: 1px;
+  background: linear-gradient(90deg,
+    transparent,
+    var(--frame-line) 25%,
+    var(--frame-line) 75%,
+    transparent);
+}
+.audio-source-sleeve--text2music:hover {
+  background: var(--accent-medium);
+  color: var(--accent-hover); /* same value as the non-hover rule above */
+}
+.audio-source-sleeve-art--text2music {
+  border: none;
+  color: var(--accent);
+  width: 14px;
+  height: 14px;
+  opacity: 1;
+}
+.audio-source-sleeve-art--text2music::after { display: none; } /* NOTE(review): presumably hides a base-class ::after decoration — confirm */
+
 /* Pinned upload sleeve — always rendered, separated from the scroll list
    above by a thin hairline so it reads as a distinct "you" slot. */
 .audio-source-sleeve--upload {