diff --git a/acestep/streaming/config.py b/acestep/streaming/config.py index 65fadf56..c66ab167 100644 --- a/acestep/streaming/config.py +++ b/acestep/streaming/config.py @@ -52,6 +52,14 @@ class SessionConfig: fixture_name: str | None = None use_server_fixture: bool = False stem_source_mode: str | None = None + # Text-to-music mode: no input audio at all. The server synthesizes a + # silent source of ``text2music_duration_s`` seconds and conditions the + # diffusion purely on the prompt (canonical silence latent = the + # model's trained "no reference" signal), so generation is text-only. + # The client sends NO binary PCM frame during the handshake when this + # is set (mirrors ``use_server_fixture``). + text2music: bool = False + text2music_duration_s: float = 60.0 enabled_loras: list = field(default_factory=list) lora_strengths: dict = field(default_factory=dict) lora_paths: list = field(default_factory=list) diff --git a/acestep/streaming/session.py b/acestep/streaming/session.py index 0814b207..14333203 100644 --- a/acestep/streaming/session.py +++ b/acestep/streaming/session.py @@ -114,6 +114,7 @@ _resolve_bpm_key_source, _try_load_sidecar, SAMPLE_RATE, + resolve_text2music_source, ) from acestep.streaming.state import SessionState from acestep.streaming.stems import ( @@ -935,11 +936,19 @@ def _apply_swap_if_pending(self) -> None: tags = state.swap_pending.get("tags") requested_key = state.swap_pending.get("key") requested_time_sig = state.swap_pending.get("time_signature") - new_fixture_name = state.swap_pending.get("fixture_name") - new_stem_source_mode = resolve_upload_stem_source_mode( - new_fixture_name, - state.swap_pending.get("stem_source_mode"), - known_fixtures=KNOWN_FIXTURES, + new_text2music = bool(state.swap_pending.get("text2music")) + # Text-to-music swap: the placeholder waveform carries no + # fixture identity and the stem machinery doesn't apply. 
+ new_fixture_name = ( + None if new_text2music + else state.swap_pending.get("fixture_name") + ) + new_stem_source_mode = None if new_text2music else ( + resolve_upload_stem_source_mode( + new_fixture_name, + state.swap_pending.get("stem_source_mode"), + known_fixtures=KNOWN_FIXTURES, + ) ) if new_wf is None: return @@ -949,6 +958,7 @@ def _apply_swap_if_pending(self) -> None: state.swap_pending["time_signature"] = None state.swap_pending["fixture_name"] = None state.swap_pending["stem_source_mode"] = None + state.swap_pending["text2music"] = False # Initialized to None so the finally below can None-guard # cleanly in the (rare) case an exception fires between the @@ -1028,16 +1038,27 @@ def _apply_swap_if_pending(self) -> None: return new_audio_in = Audio(waveform=new_wf, sample_rate=SAMPLE_RATE) - new_source, new_bpm, new_key, new_time_sig = ( - _resolve_bpm_key_source( - self.session, - audio_in=new_audio_in, - fixture_name=new_fixture_name, - samples=int(new_wf.shape[1]), - key_override=requested_key, - time_signature_override=requested_time_sig, + if new_text2music: + new_source, new_bpm, new_key, new_time_sig = ( + resolve_text2music_source( + self.session, samples=int(new_wf.shape[1]), + ) + ) + if requested_key: + new_key = requested_key + if requested_time_sig: + new_time_sig = requested_time_sig + else: + new_source, new_bpm, new_key, new_time_sig = ( + _resolve_bpm_key_source( + self.session, + audio_in=new_audio_in, + fixture_name=new_fixture_name, + samples=int(new_wf.shape[1]), + key_override=requested_key, + time_signature_override=requested_time_sig, + ) ) - ) new_upload_stems, new_stem_error, new_source, new_wf = ( extract_and_select_upload_stem( new_wf, @@ -1858,11 +1879,16 @@ def swap_source( time_signature: str | None = None, fixture_name: str | None = None, stem_source_mode: str | None = None, + text2music: bool = False, origin: CommandOrigin = CommandOrigin.PRIMARY, ) -> None: """Stage a source swap. 
The runner applies it inside ``before_tick``; publishes :class:`SwapReady` or - :class:`SwapFailed` when the swap completes.""" + :class:`SwapFailed` when the swap completes. + + ``text2music=True`` marks ``audio`` as a synthesized silent + placeholder: the swap resolves the canonical-silence source + (prompt-only conditioning) instead of encoding the waveform.""" state = self.state state.last_activity_ts = time.monotonic() effective_tags = tags or state.prompt_text @@ -1877,6 +1903,7 @@ def swap_source( state.swap_pending["stem_source_mode"] = normalize_stem_source_mode( stem_source_mode, ) + state.swap_pending["text2music"] = bool(text2music) @requires_capability("write_audio", "write_audio") def write_audio( @@ -2090,8 +2117,12 @@ def create( fast_vae = config.fast_vae walk_window = config.walk_window walk_window_s = config.walk_window_s - fixture_name = config.fixture_name - stem_source_mode = resolve_upload_stem_source_mode( + # Text-to-music: no input audio. The waveform is the synthesized + # silence placeholder; fixture / stem machinery doesn't apply + # (running Mel-Band RoFormer on zeros would be pure waste). 
+ text2music = config.text2music + fixture_name = None if text2music else config.fixture_name + stem_source_mode = None if text2music else resolve_upload_stem_source_mode( fixture_name, normalize_stem_source_mode(config.stem_source_mode), known_fixtures=KNOWN_FIXTURES, @@ -2242,14 +2273,21 @@ def create( audio_in = Audio(waveform=waveform, sample_rate=SAMPLE_RATE) - source, detected_bpm, detected_key, detected_time_signature = ( - _resolve_bpm_key_source( - engine_session, - audio_in=audio_in, - fixture_name=fixture_name, - samples=int(waveform.shape[1]), + if text2music: + source, detected_bpm, detected_key, detected_time_signature = ( + resolve_text2music_source( + engine_session, samples=int(waveform.shape[1]), + ) + ) + else: + source, detected_bpm, detected_key, detected_time_signature = ( + _resolve_bpm_key_source( + engine_session, + audio_in=audio_in, + fixture_name=fixture_name, + samples=int(waveform.shape[1]), + ) ) - ) upload_stems, stem_error, source, waveform = ( extract_and_select_upload_stem( diff --git a/acestep/streaming/source.py b/acestep/streaming/source.py index a9ffef81..2d2f7111 100644 --- a/acestep/streaming/source.py +++ b/acestep/streaming/source.py @@ -133,6 +133,79 @@ def _load_clip_waveform(name: str) -> torch.Tensor: return _load_waveform_from_path(str(resolve_audio_clip(name))) +# --------------------------------------------------------------------------- +# Text-to-music (no input audio) +# --------------------------------------------------------------------------- + +# Conditioning defaults baked into the text encoder when there is no audio +# to detect them from. 120 BPM / C major / 4 are the model family's +# most-supported values; the operator can re-steer key / time signature +# live via the ``prompt`` command. +TEXT2MUSIC_BPM = 120 +TEXT2MUSIC_KEY = "C major" +TEXT2MUSIC_TIME_SIGNATURE = "4" + +# Floor for the synthesized silent source. 
# Below ~10 s the loop seam dominates the listening experience, so this
# is the floor for the synthesized silent source; the ceiling is the
# caller's TRT profile cap (same clamp every uploaded source gets).
TEXT2MUSIC_MIN_DURATION_S = 10.0

# Samples per latent frame at the 48 kHz / 25 fps latent rate.
_SAMPLES_PER_LATENT_FRAME = SAMPLE_RATE // 25


def text2music_waveform(duration_s: float, *, max_seconds: float) -> torch.Tensor:
    """Synthesize the silent stereo placeholder for a text-to-music source.

    The waveform only seeds the playback ring buffer (the user hears
    silence until generated slices land) and sets the session length;
    callers run it through the same TRT-cap / pool-alignment trim as an
    uploaded source.

    Args:
        duration_s: Requested length in seconds. Values below
            ``TEXT2MUSIC_MIN_DURATION_S`` are raised to that floor.
        max_seconds: Upper bound in seconds. When it is smaller than the
            floor, the cap wins.

    Returns:
        A zero-filled tensor of shape ``(2, int(seconds * SAMPLE_RATE))``
        where ``seconds`` is the clamped duration.

    Raises:
        ValueError: If ``duration_s`` or ``max_seconds`` is NaN, if
            ``max_seconds`` is negative, or if both are unbounded so no
            finite length can be derived.
    """
    import math

    requested = float(duration_s)
    cap = float(max_seconds)

    # NaN defeats the clamp below: ``max(nan, floor)`` stays NaN and
    # ``min(x, nan)`` returns ``x``, so a NaN duration used to surface as
    # an opaque ``int(nan)`` failure and a NaN cap silently disabled the
    # ceiling. Reject both up front with an explicit message.
    if math.isnan(requested):
        raise ValueError(
            f"text2music duration_s must be a number, got {duration_s!r}"
        )
    if math.isnan(cap) or cap < 0.0:
        raise ValueError(
            f"text2music max_seconds must be a non-negative number, "
            f"got {max_seconds!r}"
        )

    seconds = min(max(requested, TEXT2MUSIC_MIN_DURATION_S), cap)
    if math.isinf(seconds):
        # Only reachable when both the request and the cap are infinite.
        raise ValueError(
            "text2music duration is unbounded: duration_s and max_seconds "
            "are both infinite"
        )
    return torch.zeros(2, int(seconds * SAMPLE_RATE))


def resolve_text2music_source(
    session: Session, *, samples: int,
) -> tuple[PreparedSource, int, str, str]:
    """Text-to-music analog of :func:`_resolve_bpm_key_source`.

    Both the source latent and the context latent are the CANONICAL
    silence latent from the checkpoint (``EmptyLatent``), not a VAE
    encode of digital zeros: ``silence_latent`` is what the model was
    trained to read as "no reference audio" (its forward uses it to
    simulate text2music mode), so structure conditioning is genuinely
    absent rather than "semantic hints of an all-zero clip". This also
    skips the VAE encode + semantic extract entirely, along with
    librosa beat-tracking and CNN key detection (both meaningless on
    silence — beat_track returns 0 BPM, which would poison the text
    conditioning).

    Args:
        session: Engine session; only ``session.model`` is read here.
        samples: Length of the placeholder waveform in samples. It is
            floor-divided by ``_SAMPLES_PER_LATENT_FRAME`` to get the
            latent frame count.

    Returns:
        ``(source, bpm, key, time_signature)`` where ``source`` wraps the
        silence latent and the other three are the fixed
        ``TEXT2MUSIC_BPM`` / ``TEXT2MUSIC_KEY`` /
        ``TEXT2MUSIC_TIME_SIGNATURE`` defaults.

    Raises:
        ValueError: If ``samples`` covers less than one latent frame.
    """
    # Function-scope import, kept as in the original (presumably to avoid
    # an import cycle or a heavy import at module load — not verifiable
    # from this file alone).
    from acestep.nodes.vae_nodes import EmptyLatent

    frames = samples // _SAMPLES_PER_LATENT_FRAME
    if frames < 1:
        # A zero (or negative) frame count cannot describe a source;
        # fail here with context instead of inside the latent node.
        raise ValueError(
            f"text2music source needs at least {_SAMPLES_PER_LATENT_FRAME} "
            f"samples (one latent frame), got {samples}"
        )

    latent = EmptyLatent().execute(
        model=session.model, frames=frames,
    )["latent"]
    source = PreparedSource(
        latent=latent,
        # Clone so the context latent does not alias the source tensor.
        context_latent=Latent(tensor=latent.tensor.clone()),
    )
    logger.info(
        "text2music_source_ready frames={} duration_s={:.1f}",
        frames, samples / SAMPLE_RATE,
    )
    return (
        source,
        TEXT2MUSIC_BPM,
        TEXT2MUSIC_KEY,
        TEXT2MUSIC_TIME_SIGNATURE,
    )


_VALID_TIME_SIG_STRS = frozenset(str(s) for s in VALID_TIME_SIGNATURES)
" + "Acked by swap_ready (+ binary buffer) / swap_failed.", ), CommandSpec( "write_audio", diff --git a/demos/realtime_motion_graph_web/web/app/globals.css b/demos/realtime_motion_graph_web/web/app/globals.css index 7ffb55e8..9560a501 100644 --- a/demos/realtime_motion_graph_web/web/app/globals.css +++ b/demos/realtime_motion_graph_web/web/app/globals.css @@ -7102,6 +7102,45 @@ body.curve-open #install-video-area #graph { opacity: 1; } +/* Pinned text-to-music sleeve — prompt-only generation, no input audio. + Sits in the pinned region above the upload sleeve, separated from the + scroll list by the same hairline treatment. */ +.audio-source-sleeve--text2music { + flex-shrink: 0; + position: relative; + margin-top: 2px; + padding-top: 9px; + color: var(--accent-hover); + text-transform: uppercase; + letter-spacing: var(--tracking-wide); + font-size: 10px; +} +.audio-source-sleeve--text2music::before { + content: ""; + position: absolute; + top: 0; + left: 8px; + right: 8px; + height: 1px; + background: linear-gradient(90deg, + transparent, + var(--frame-line) 25%, + var(--frame-line) 75%, + transparent); +} +.audio-source-sleeve--text2music:hover { + background: var(--accent-medium); + color: var(--accent-hover); +} +.audio-source-sleeve-art--text2music { + border: none; + color: var(--accent); + width: 14px; + height: 14px; + opacity: 1; +} +.audio-source-sleeve-art--text2music::after { display: none; } + /* Pinned upload sleeve — always rendered, separated from the scroll list above by a thin hairline so it reads as a distinct "you" slot. 
*/ .audio-source-sleeve--upload { diff --git a/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx b/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx index e0ed79d0..9c66703f 100644 --- a/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx +++ b/demos/realtime_motion_graph_web/web/components/Performance/AudioSourceCrate.tsx @@ -16,6 +16,11 @@ import { useUploadOnboardingHint } from "@/hooks/useUploadOnboardingHint"; import { trimAudioBuffer } from "@/lib/audio/trimAudioBuffer"; import { useConfig } from "@/lib/config"; import { LOCAL_MODE } from "@/lib/runtime"; +import { + TEXT2MUSIC_LABEL, + TEXT2MUSIC_SOURCE, + isText2Music, +} from "@/lib/text2music"; import { useCustomTracksStore } from "@/store/useCustomTracksStore"; import { usePerformanceStore } from "@/store/usePerformanceStore"; import { useSessionStore } from "@/store/useSessionStore"; @@ -85,6 +90,28 @@ function MicIcon({ size = 14 }: { size?: number }) { ); } +function TextPromptIcon({ size = 14 }: { size?: number }) { + return ( + + ); +} + function NoteIcon({ size = 16 }: { size?: number }) { return ( ({ name, kind: "fixture" as const })), ...customNames.map((name) => ({ name, kind: "custom" as const })), ]; - const displayedName = fixture || (tracks[0]?.name ?? "—"); + const displayedName = isText2Music(fixture) + ? TEXT2MUSIC_LABEL + : fixture || (tracks[0]?.name ?? "—"); return ( <> @@ -424,6 +453,36 @@ export function AudioSourceCrate() { ); })} + {/* Text-to-music sleeve — generation from the prompt alone, no + input audio. Pinned with the upload sleeve so it's always + visible regardless of fixture count. */} + + + {TEXT2MUSIC_LABEL} + + {/* Upload sleeve is pinned outside the scroll region so it stays visible regardless of fixture count. Always rendered. */}