feat(ml): generación de audio en ComfyUI (ACE-Step) — builder comfyui_build_audio_workflow + fetch_output_audio
Soporte nativo de audio texto->música/SFX en ComfyUI 0.26.0 capitalizado como funciones del registry:

- comfyui_build_audio_workflow (pura): builder ACE-Step en API format. Cadena CheckpointLoaderSimple -> TextEncodeAceStepAudio + ConditioningZeroOut + EmptyAceStepLatentAudio -> ModelSamplingSD3 -> KSampler -> VAEDecodeAudio -> SaveAudio. Params seconds/seed/steps/cfg/shift/lyrics. Tags comfyui,audio,ace-step.
- comfyui_fetch_output_audio (impura): baja el .flac/.wav/.mp3 del output (clave 'audio'). Hermana de comfyui_fetch_output_video, que no sirve para audio.

Modelo ACE-Step v1 3.5B (Apache 2.0, abierto). Stable Audio Open 1.0 descartado por estar gated (HTTP 403) en HuggingFace. Cabe en 8GB con --lowvram.

Verificado e2e: 2 .flac reales generados desde texto (4.0s y 8.0s, seeds distintos), duración exacta confirmada con ffprobe. Tests 6+5 verdes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
"""Tests de estructura para comfyui_build_audio_workflow (funcion pura, ACE-Step)."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from ml.comfyui_build_audio_workflow import comfyui_build_audio_workflow
|
||||
from _comfyui_wf_assert import assert_api_format, class_types, node_by_ct
|
||||
|
||||
|
||||
def test_estructura_y_nodos_acestep():
    """Check the default workflow is API-format and has the eight ACE-Step nodes.

    Every expected class type must be present, and the workflow must hold
    eight entries in total.
    """
    expected_nodes = (
        "CheckpointLoaderSimple",
        "TextEncodeAceStepAudio",
        "ConditioningZeroOut",
        "EmptyAceStepLatentAudio",
        "ModelSamplingSD3",
        "KSampler",
        "VAEDecodeAudio",
        "SaveAudio",
    )

    workflow = comfyui_build_audio_workflow(
        "AUDIO_ace_step_v1_3.5b.safetensors",
        "retro coin sfx",
    )
    assert_api_format(workflow)

    present = class_types(workflow)
    for node_type in expected_nodes:
        assert node_type in present, f"falta nodo {node_type}"

    # The total count guards against extra nodes beyond the expected ones.
    assert len(workflow) == 8
|
||||
|
||||
|
||||
def test_ckpt_y_prompt_reflejados():
    """Check the checkpoint name and prompt reach their nodes; lyrics default to empty."""
    workflow = comfyui_build_audio_workflow("AUDIO_x.safetensors", "magic spell whoosh")

    loader_inputs = node_by_ct(workflow, "CheckpointLoaderSimple")["inputs"]
    assert loader_inputs["ckpt_name"] == "AUDIO_x.safetensors"

    encoder = node_by_ct(workflow, "TextEncodeAceStepAudio")
    encoder_inputs = encoder["inputs"]
    # The prompt is carried in the encoder's "tags" input.
    assert encoder_inputs["tags"] == "magic spell whoosh"
    assert encoder_inputs["lyrics"] == ""
|
||||
|
||||
|
||||
def test_cableado_ksampler():
    """Check the node-to-node links around the KSampler.

    Links are ``[node_id, output_index]`` pairs as compared by the assertions
    below.
    """
    workflow = comfyui_build_audio_workflow("AUDIO_x.safetensors", "p")
    sampler_inputs = node_by_ct(workflow, "KSampler")["inputs"]

    expected_sampler_inputs = (
        # model comes from ModelSamplingSD3 ("11"), not straight from the checkpoint
        ("model", ["11", 0]),
        ("positive", ["6", 0]),
        # negative goes through ConditioningZeroOut ("10")
        ("negative", ["10", 0]),
        ("latent_image", ["5", 0]),
        ("denoise", 1.0),
    )
    for input_name, expected in expected_sampler_inputs:
        assert sampler_inputs[input_name] == expected

    # ModelSamplingSD3 takes the MODEL output of the checkpoint
    sampling_inputs = node_by_ct(workflow, "ModelSamplingSD3")["inputs"]
    assert sampling_inputs["model"] == ["4", 0]

    # VAEDecodeAudio uses the VAE output of the checkpoint
    decoder_inputs = node_by_ct(workflow, "VAEDecodeAudio")["inputs"]
    assert decoder_inputs["vae"] == ["4", 2]

    # ConditioningZeroOut derives from the positive conditioning
    zero_out_inputs = node_by_ct(workflow, "ConditioningZeroOut")["inputs"]
    assert zero_out_inputs["conditioning"] == ["6", 0]
|
||||
|
||||
|
||||
def test_edge_seconds_y_seed_variables():
    """Check that ``seconds`` and ``seed`` are carried into the latent and sampler nodes."""
    cases = ((4.0, 42), (8.0, 99))

    # Build every workflow before asserting anything.
    workflows = [
        comfyui_build_audio_workflow("c", "p", seconds=seconds, seed=seed)
        for seconds, seed in cases
    ]

    for workflow, (seconds, _) in zip(workflows, cases):
        latent_inputs = node_by_ct(workflow, "EmptyAceStepLatentAudio")["inputs"]
        assert latent_inputs["seconds"] == seconds

    for workflow, (_, seed) in zip(workflows, cases):
        sampler_inputs = node_by_ct(workflow, "KSampler")["inputs"]
        assert sampler_inputs["seed"] == seed
|
||||
|
||||
|
||||
def test_params_reflejados():
    """Check each optional keyword argument shows up in the matching node input."""
    overrides = {
        "lyrics": "la la la",
        "steps": 30,
        "cfg": 4.0,
        "sampler_name": "dpmpp_2m",
        "scheduler": "karras",
        "shift": 3.5,
        "lyrics_strength": 0.7,
        "filename_prefix": "audio/mio",
    }
    workflow = comfyui_build_audio_workflow("c", "p", **overrides)

    encoder_inputs = node_by_ct(workflow, "TextEncodeAceStepAudio")["inputs"]
    assert encoder_inputs["lyrics"] == "la la la"
    assert encoder_inputs["lyrics_strength"] == 0.7

    sampler_inputs = node_by_ct(workflow, "KSampler")["inputs"]
    assert sampler_inputs["steps"] == 30
    assert sampler_inputs["cfg"] == 4.0
    assert sampler_inputs["sampler_name"] == "dpmpp_2m"
    assert sampler_inputs["scheduler"] == "karras"

    sampling_inputs = node_by_ct(workflow, "ModelSamplingSD3")["inputs"]
    assert sampling_inputs["shift"] == 3.5

    save_inputs = node_by_ct(workflow, "SaveAudio")["inputs"]
    assert save_inputs["filename_prefix"] == "audio/mio"
|
||||
|
||||
|
||||
def test_determinismo():
    """Check that two builds with identical arguments compare equal."""
    first, second = (
        comfyui_build_audio_workflow("c", "p", seconds=5.0, seed=7)
        for _ in range(2)
    )
    assert first == second
|
||||
Reference in New Issue
Block a user