Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions acestep/streaming/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ class SessionConfig:
fixture_name: str | None = None
use_server_fixture: bool = False
stem_source_mode: str | None = None
# Text-to-music mode: no input audio at all. The server synthesizes a
# silent source of ``text2music_duration_s`` seconds and conditions the
# diffusion purely on the prompt (canonical silence latent = the
# model's trained "no reference" signal), so generation is text-only.
# The client sends NO binary PCM frame during the handshake when this
# is set (mirrors ``use_server_fixture``).
text2music: bool = False
text2music_duration_s: float = 60.0
enabled_loras: list = field(default_factory=list)
lora_strengths: dict = field(default_factory=dict)
lora_paths: list = field(default_factory=list)
Expand Down
86 changes: 62 additions & 24 deletions acestep/streaming/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
_resolve_bpm_key_source,
_try_load_sidecar,
SAMPLE_RATE,
resolve_text2music_source,
)
from acestep.streaming.state import SessionState
from acestep.streaming.stems import (
Expand Down Expand Up @@ -935,11 +936,19 @@ def _apply_swap_if_pending(self) -> None:
tags = state.swap_pending.get("tags")
requested_key = state.swap_pending.get("key")
requested_time_sig = state.swap_pending.get("time_signature")
new_fixture_name = state.swap_pending.get("fixture_name")
new_stem_source_mode = resolve_upload_stem_source_mode(
new_fixture_name,
state.swap_pending.get("stem_source_mode"),
known_fixtures=KNOWN_FIXTURES,
new_text2music = bool(state.swap_pending.get("text2music"))
# Text-to-music swap: the placeholder waveform carries no
# fixture identity and the stem machinery doesn't apply.
new_fixture_name = (
None if new_text2music
else state.swap_pending.get("fixture_name")
)
new_stem_source_mode = None if new_text2music else (
resolve_upload_stem_source_mode(
new_fixture_name,
state.swap_pending.get("stem_source_mode"),
known_fixtures=KNOWN_FIXTURES,
)
)
if new_wf is None:
return
Expand All @@ -949,6 +958,7 @@ def _apply_swap_if_pending(self) -> None:
state.swap_pending["time_signature"] = None
state.swap_pending["fixture_name"] = None
state.swap_pending["stem_source_mode"] = None
state.swap_pending["text2music"] = False

# Initialized to None so the finally below can None-guard
# cleanly in the (rare) case an exception fires between the
Expand Down Expand Up @@ -1028,16 +1038,27 @@ def _apply_swap_if_pending(self) -> None:
return

new_audio_in = Audio(waveform=new_wf, sample_rate=SAMPLE_RATE)
new_source, new_bpm, new_key, new_time_sig = (
_resolve_bpm_key_source(
self.session,
audio_in=new_audio_in,
fixture_name=new_fixture_name,
samples=int(new_wf.shape[1]),
key_override=requested_key,
time_signature_override=requested_time_sig,
if new_text2music:
new_source, new_bpm, new_key, new_time_sig = (
resolve_text2music_source(
self.session, samples=int(new_wf.shape[1]),
)
)
if requested_key:
new_key = requested_key
if requested_time_sig:
new_time_sig = requested_time_sig
else:
new_source, new_bpm, new_key, new_time_sig = (
_resolve_bpm_key_source(
self.session,
audio_in=new_audio_in,
fixture_name=new_fixture_name,
samples=int(new_wf.shape[1]),
key_override=requested_key,
time_signature_override=requested_time_sig,
)
)
)
new_upload_stems, new_stem_error, new_source, new_wf = (
extract_and_select_upload_stem(
new_wf,
Expand Down Expand Up @@ -1858,11 +1879,16 @@ def swap_source(
time_signature: str | None = None,
fixture_name: str | None = None,
stem_source_mode: str | None = None,
text2music: bool = False,
origin: CommandOrigin = CommandOrigin.PRIMARY,
) -> None:
"""Stage a source swap. The runner applies it inside
``before_tick``; publishes :class:`SwapReady` or
:class:`SwapFailed` when the swap completes."""
:class:`SwapFailed` when the swap completes.

``text2music=True`` marks ``audio`` as a synthesized silent
placeholder: the swap resolves the canonical-silence source
(prompt-only conditioning) instead of encoding the waveform."""
state = self.state
state.last_activity_ts = time.monotonic()
effective_tags = tags or state.prompt_text
Expand All @@ -1877,6 +1903,7 @@ def swap_source(
state.swap_pending["stem_source_mode"] = normalize_stem_source_mode(
stem_source_mode,
)
state.swap_pending["text2music"] = bool(text2music)

@requires_capability("write_audio", "write_audio")
def write_audio(
Expand Down Expand Up @@ -2090,8 +2117,12 @@ def create(
fast_vae = config.fast_vae
walk_window = config.walk_window
walk_window_s = config.walk_window_s
fixture_name = config.fixture_name
stem_source_mode = resolve_upload_stem_source_mode(
# Text-to-music: no input audio. The waveform is the synthesized
# silence placeholder; fixture / stem machinery doesn't apply
# (running Mel-Band RoFormer on zeros would be pure waste).
text2music = config.text2music
fixture_name = None if text2music else config.fixture_name
stem_source_mode = None if text2music else resolve_upload_stem_source_mode(
fixture_name,
normalize_stem_source_mode(config.stem_source_mode),
known_fixtures=KNOWN_FIXTURES,
Expand Down Expand Up @@ -2242,14 +2273,21 @@ def create(

audio_in = Audio(waveform=waveform, sample_rate=SAMPLE_RATE)

source, detected_bpm, detected_key, detected_time_signature = (
_resolve_bpm_key_source(
engine_session,
audio_in=audio_in,
fixture_name=fixture_name,
samples=int(waveform.shape[1]),
if text2music:
source, detected_bpm, detected_key, detected_time_signature = (
resolve_text2music_source(
engine_session, samples=int(waveform.shape[1]),
)
)
else:
source, detected_bpm, detected_key, detected_time_signature = (
_resolve_bpm_key_source(
engine_session,
audio_in=audio_in,
fixture_name=fixture_name,
samples=int(waveform.shape[1]),
)
)
)

upload_stems, stem_error, source, waveform = (
extract_and_select_upload_stem(
Expand Down
73 changes: 73 additions & 0 deletions acestep/streaming/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,79 @@ def _load_clip_waveform(name: str) -> torch.Tensor:
return _load_waveform_from_path(str(resolve_audio_clip(name)))


# ---------------------------------------------------------------------------
# Text-to-music (no input audio)
# ---------------------------------------------------------------------------

# Conditioning defaults baked into the text encoder when there is no audio
# to detect them from. 120 BPM / C major / 4 are the model family's
# most-supported values; the operator can re-steer key / time signature
# live via the ``prompt`` command.
# ``resolve_text2music_source`` returns these three values verbatim as its
# (bpm, key, time_signature) results; note the time signature is a string.
TEXT2MUSIC_BPM = 120
TEXT2MUSIC_KEY = "C major"
TEXT2MUSIC_TIME_SIGNATURE = "4"

# Floor for the synthesized silent source. Below ~10 s the loop seam
# dominates the listening experience; the ceiling is the caller's TRT
# profile cap (same clamp every uploaded source gets).
# Applied in ``text2music_waveform``: the floor is taken first and the
# ceiling second, so a ceiling below this floor wins.
TEXT2MUSIC_MIN_DURATION_S = 10.0

# Samples per latent frame at the 48 kHz / 25 fps latent rate.
# Integer division; ``resolve_text2music_source`` floor-divides a sample
# count by this to get a whole number of latent frames.
_SAMPLES_PER_LATENT_FRAME = SAMPLE_RATE // 25


def text2music_waveform(duration_s: float, *, max_seconds: float) -> torch.Tensor:
    """Synthesize the silent stereo placeholder for a text-to-music source.

    The waveform only seeds the playback ring buffer (the user hears
    silence until generated slices land) and sets the session length;
    callers run it through the same TRT-cap / pool-alignment trim as an
    uploaded source.

    ``duration_s`` is the requested length in seconds. It is raised to
    ``TEXT2MUSIC_MIN_DURATION_S`` when smaller (or NaN) and then capped at
    ``max_seconds``; the cap is applied last, so it wins over the floor.

    Returns a zero-filled tensor of shape ``(2, n)`` where
    ``n = int(seconds * SAMPLE_RATE)``.
    """
    import math

    requested = float(duration_s)
    # NaN slips through max()/min() unchanged (every comparison against it
    # is False), and int(nan) then raises ValueError. The duration is a
    # client-supplied config value, so fall back to the floor instead.
    if math.isnan(requested):
        requested = TEXT2MUSIC_MIN_DURATION_S
    dur = min(
        max(requested, TEXT2MUSIC_MIN_DURATION_S), float(max_seconds),
    )
    return torch.zeros(2, int(dur * SAMPLE_RATE))


def resolve_text2music_source(
    session: Session, *, samples: int,
) -> tuple[PreparedSource, int, str, str]:
    """Build the prompt-only source; counterpart of :func:`_resolve_bpm_key_source`.

    No audio is encoded here. The source latent and the context latent
    are both the checkpoint's CANONICAL silence latent (``EmptyLatent``),
    i.e. the signal the model was trained to read as "no reference
    audio", rather than a VAE encode of digital zeros. Structure
    conditioning is therefore genuinely absent instead of carrying
    "semantic hints of an all-zero clip".

    Skipping the encode also skips semantic extraction, librosa
    beat-tracking and CNN key detection, none of which mean anything on
    silence (beat_track would report 0 BPM and poison the text
    conditioning). The fixed ``TEXT2MUSIC_*`` defaults are returned in
    their place.

    ``samples`` is the placeholder waveform length; it is floor-divided
    by ``_SAMPLES_PER_LATENT_FRAME`` to get the latent frame count.

    Returns ``(source, bpm, key, time_signature)``.
    """
    from acestep.nodes.vae_nodes import EmptyLatent

    n_frames = samples // _SAMPLES_PER_LATENT_FRAME
    node_out = EmptyLatent().execute(model=session.model, frames=n_frames)
    silence = node_out["latent"]

    # The context latent gets its own copy of the tensor so the two
    # latents never alias the same storage.
    context = Latent(tensor=silence.tensor.clone())
    prepared = PreparedSource(latent=silence, context_latent=context)

    logger.info(
        "text2music_source_ready frames={} duration_s={:.1f}",
        n_frames, samples / SAMPLE_RATE,
    )

    defaults = (TEXT2MUSIC_BPM, TEXT2MUSIC_KEY, TEXT2MUSIC_TIME_SIGNATURE)
    return (prepared, *defaults)


_VALID_TIME_SIG_STRS = frozenset(str(s) for s in VALID_TIME_SIGNATURES)


Expand Down
1 change: 1 addition & 0 deletions acestep/streaming/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def _default_swap_pending() -> dict:
"time_signature": None,
"fixture_name": None,
"stem_source_mode": None,
"text2music": False,
}


Expand Down
10 changes: 8 additions & 2 deletions demos/realtime_motion_graph_web/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,13 +380,19 @@ class EventSpec:
description="When true, the server loads the named source "
"off its own disk and NO binary frame is "
"sent."),
FieldSpec("text2music", "bool",
description="When true, the server swaps to a synthesized "
"silent source (text-to-music mode: generation "
"is conditioned on the prompt alone) and NO "
"binary frame is sent. fixture_name is "
"ignored."),
),
binary=True,
binary_optional=True,
requires="swap",
description="Replace the playback source in-flight. A binary PCM frame "
"follows UNLESS use_server_source is set. Acked by "
"swap_ready (+ binary buffer) / swap_failed.",
"follows UNLESS use_server_source or text2music is set. "
"Acked by swap_ready (+ binary buffer) / swap_failed.",
),
CommandSpec(
"write_audio",
Expand Down
39 changes: 39 additions & 0 deletions demos/realtime_motion_graph_web/web/app/globals.css
Original file line number Diff line number Diff line change
Expand Up @@ -7102,6 +7102,45 @@ body.curve-open #install-video-area #graph {
opacity: 1;
}

/* Pinned text-to-music sleeve — prompt-only generation, no input audio.
Sits in the pinned region above the upload sleeve, separated from the
scroll list by the same hairline treatment. */
/* Row styling for the text-to-music entry. `position: relative` anchors
   the absolutely-positioned ::before hairline below; `flex-shrink: 0`
   keeps the row from shrinking (assumes the parent is a flex container —
   confirm). */
.audio-source-sleeve--text2music {
flex-shrink: 0;
position: relative;
margin-top: 2px;
padding-top: 9px;
color: var(--accent-hover);
text-transform: uppercase;
letter-spacing: var(--tracking-wide);
font-size: 10px;
}
/* Hairline separator: a 1px strip along the top edge, inset 8px on each
   side, with a gradient that fades to transparent at both ends. */
.audio-source-sleeve--text2music::before {
content: "";
position: absolute;
top: 0;
left: 8px;
right: 8px;
height: 1px;
background: linear-gradient(90deg,
transparent,
var(--frame-line) 25%,
var(--frame-line) 75%,
transparent);
}
/* Hover state: adds the accent background; the text colour is the same
   value the base rule already sets. */
.audio-source-sleeve--text2music:hover {
background: var(--accent-medium);
color: var(--accent-hover);
}
/* Art/icon slot for this row: borderless 14px square, fully opaque,
   drawn in the accent colour. */
.audio-source-sleeve-art--text2music {
border: none;
color: var(--accent);
width: 14px;
height: 14px;
opacity: 1;
}
/* Suppress the ::after pseudo-element for this variant (presumably a
   decoration defined on the base art class — confirm). */
.audio-source-sleeve-art--text2music::after { display: none; }

/* Pinned upload sleeve — always rendered, separated from the scroll list
above by a thin hairline so it reads as a distinct "you" slot. */
.audio-source-sleeve--upload {
Expand Down
Loading