68f0ce0dae
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
327 lines
11 KiB
Python
327 lines
11 KiB
Python
"""Garantiza que ComfyUI corre como servicio systemd-user resiliente y sano.
|
|
|
|
Funcion impura: instala/actualiza el unit systemd-user `comfyui.service`, lo
|
|
habilita y arranca, y comprueba la salud del backend HTTP. Idempotente: si el
|
|
servicio ya esta gestionado por systemd, activo y respondiendo, no toca nada.
|
|
|
|
Migracion limpia: si ComfyUI ya corre a mano (puerto ocupado por un proceso
|
|
`main.py` que systemd NO gestiona), lo para con SIGTERM y lo levanta via
|
|
systemd, para que a partir de ese momento se reinicie solo (Restart=always).
|
|
|
|
Solo depende de la stdlib (subprocess, urllib, os, signal, time, re). No lanza
|
|
excepciones: siempre devuelve un dict de estado.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import signal
|
|
import subprocess
|
|
import time
|
|
import urllib.request
|
|
|
|
|
|
def _default_runner(cmd):
    """Run ``cmd`` and return the finished process with its output captured.

    Output is captured as text and the call is abandoned after 30 seconds.
    This is the production runner; tests inject their own callable instead.
    """
    options = {"capture_output": True, "text": True, "timeout": 30}
    return subprocess.run(cmd, **options)
|
|
|
|
|
|
def _detect_lowvram(vram_mib):
    """Decide whether ``--lowvram`` is advisable for the given total VRAM (MiB).

    GPUs reporting 8200 MiB or less (8 GB cards) are more stable with
    ``--lowvram`` on large models (Flux, video). When the VRAM figure is
    unknown (``None``) the answer is False, so that large GPUs are not
    penalised without evidence.
    """
    if vram_mib is None:
        return False
    return vram_mib <= 8200
|
|
|
|
|
|
def _query_vram_mib(runner):
    """Return the total VRAM (MiB) of the first GPU listed by nvidia-smi.

    ``runner`` is called with the nvidia-smi command line. The result is
    ``None`` when the command fails, prints nothing, prints something that is
    not an integer, or when the runner itself raises.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        proc = runner(query)
        if proc.returncode != 0:
            return None
        report = proc.stdout.strip()
        if not report:
            return None
        # One line per GPU; only the first one is considered.
        first_gpu = report.splitlines()[0]
        return int(first_gpu.strip())
    except Exception:
        # Best effort: any failure simply means "VRAM unknown".
        return None
|
|
|
|
|
|
def _render_unit(python_bin, main_py, working_dir, port, lowvram, description):
    """Build the text of the systemd user unit. Pure (no I/O).

    Args:
        python_bin: interpreter used to launch ComfyUI.
        main_py: path of ComfyUI's ``main.py``.
        working_dir: value for ``WorkingDirectory=``.
        port: value passed to ``--port``.
        lowvram: when truthy, ``--lowvram`` is appended to the command line.
        description: value for ``Description=``.

    Returns:
        The complete unit file as a string ending in a newline.

    systemd splits ``ExecStart=`` on whitespace and expands ``%`` specifiers,
    so command words containing whitespace, quotes or backslashes are wrapped
    in double quotes, and a literal ``%`` is doubled in ``ExecStart=`` and
    ``WorkingDirectory=``. For paths without such characters the output is
    identical to plain interpolation.
    """

    def literal(value):
        # "%%" is how a unit file spells a literal percent sign.
        return str(value).replace("%", "%%")

    def exec_word(value):
        word = literal(value)
        if word and not any(ch.isspace() or ch in "\"'\\" for ch in word):
            return word
        # Inside double quotes only backslash and the quote itself need escaping.
        escaped = word.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    words = [exec_word(python_bin), exec_word(main_py), "--port", exec_word(port)]
    if lowvram:
        words.append("--lowvram")
    exec_start = " ".join(words)

    return (
        "[Unit]\n"
        f"Description={description}\n"
        "After=network-online.target\n"
        "Wants=network-online.target\n"
        "\n"
        "[Service]\n"
        "Type=simple\n"
        f"WorkingDirectory={literal(working_dir)}\n"
        f"ExecStart={exec_start}\n"
        # Restart=always (NOT on-failure): a clean SIGTERM counts as a
        # successful exit, and with on-failure the service would not come
        # back. See .claude/rules/function_tags.md.
        "Restart=always\n"
        "RestartSec=5\n"
        "\n"
        "[Install]\n"
        "WantedBy=default.target\n"
    )
|
|
|
|
|
|
def _health(port, path="/system_stats", timeout=3):
    """Return True when GET http://127.0.0.1:<port><path> answers with 2xx.

    Any exception while opening or reading the response (refused connection,
    timeout, HTTP error status, malformed URL, ...) is reported as False.
    """
    endpoint = "http://127.0.0.1:{}{}".format(port, path)
    try:
        with urllib.request.urlopen(endpoint, timeout=timeout) as reply:
            code = reply.status
            return 200 <= code <= 299
    except Exception:
        return False
|
|
|
|
|
|
def _wait_health(port, timeout, interval=2.0):
    """Poll the health endpoint until it answers 2xx or ``timeout`` elapses.

    The endpoint is probed, then the function sleeps ``interval`` seconds,
    and so on while the monotonic deadline has not passed. Once the deadline
    is reached one last probe decides the result.
    """
    give_up_at = time.monotonic() + timeout
    while True:
        if time.monotonic() >= give_up_at:
            # Deadline reached: the answer is whatever a final probe says.
            return _health(port)
        if _health(port):
            return True
        time.sleep(interval)
|
|
|
|
|
|
def _systemctl(runner, *args):
    """Invoke ``systemctl --user`` with ``args`` through ``runner``.

    Returns whatever ``runner`` returns for the assembled command list.
    """
    command = ["systemctl", "--user"]
    command.extend(args)
    return runner(command)
|
|
|
|
|
|
def _unit_active_state(runner, unit_name):
    """Return what ``systemctl --user is-active <unit_name>`` printed.

    The stripped stdout is preferred (active, inactive, failed, ...). When
    stdout is empty the stripped stderr is returned instead, and an empty
    string when both are empty.
    """
    probe = _systemctl(runner, "is-active", unit_name)
    text = probe.stdout
    if not text:
        text = probe.stderr
    if not text:
        return ""
    return text.strip()
|
|
|
|
|
|
def _pid_listening_on_port(port, runner):
    """Return the PID of a process listening on TCP ``port``, or None.

    Runs ``ss -ltnpH`` filtered by source port through ``runner`` and takes
    the first ``pid=<n>`` found in its output. The result is None when the
    command fails, shows no PID, or when the runner raises.
    """
    try:
        listing = runner(["ss", "-ltnpH", "sport = :{}".format(port)])
        if listing.returncode != 0:
            return None
        hit = re.search(r"pid=(\d+)", listing.stdout or "")
        return int(hit.group(1)) if hit else None
    except Exception:
        # Best effort: an unusable `ss` is treated like "nobody is listening".
        return None
|
|
|
|
|
|
def _is_comfy_process(pid):
    """Return True when the command line of ``pid`` contains ``main.py``.

    The command line is read from ``/proc/<pid>/cmdline``, with its NUL
    separators turned into spaces. This is how a hand-started ComfyUI is
    recognised. Any failure to read the file yields False.
    """
    try:
        with open("/proc/{}/cmdline".format(pid), "rb") as handle:
            raw = handle.read()
        command_line = raw.replace(b"\0", b" ").decode(errors="replace")
        return "main.py" in command_line
    except Exception:
        return False
|
|
|
|
|
|
def _terminate_manual(pid, port, runner, wait_s=25.0):
    """Send SIGTERM to ``pid`` and wait for ``port`` to be released.

    Returns True if the process was already gone or the port was freed, and
    False if the signal could not be delivered or the port is still taken
    after the waiting period. SIGKILL is never used.
    """

    def port_released():
        return _pid_listening_on_port(port, runner) is None

    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        # Nothing to stop: the process no longer exists.
        return True
    except Exception:
        return False

    give_up_at = time.monotonic() + wait_s
    while time.monotonic() < give_up_at:
        if port_released():
            return True
        time.sleep(1.0)

    # One more gentle SIGTERM before giving up (never SIGKILL: non-destructive).
    try:
        os.kill(pid, signal.SIGTERM)
    except Exception:
        pass
    time.sleep(3.0)
    return port_released()
|
|
|
|
|
|
def _guarded_runner(runner):
    """Return a runner that reports failures instead of raising them.

    The returned callable forwards ``cmd`` to ``runner``. If ``runner``
    raises (for example because the executable is missing or the call timed
    out), the exception is turned into a ``subprocess.CompletedProcess`` with
    return code 1, empty stdout and the exception text in stderr, so callers
    can handle it like any other failed command.
    """

    def run(cmd):
        try:
            return runner(cmd)
        except Exception as exc:
            return subprocess.CompletedProcess(
                cmd, 1, stdout="", stderr=f"{type(exc).__name__}: {exc}"
            )

    return run


def comfyui_ensure_server(
    *,
    port=8188,
    lowvram=None,
    health_timeout=60,
    comfyui_dir="~/ComfyUI",
    unit_name="comfyui",
    runner=None,
):
    """Ensure ComfyUI is running and healthy as a systemd user service.

    The unit file is rendered and rewritten only when its content differs,
    then the unit is enabled. If the unit is already active and the HTTP
    backend answers, nothing else is touched. If the backend answers but the
    unit is not active, the process holding the port is stopped with SIGTERM
    (only when its command line looks like ComfyUI) before the unit is
    started. Finally the function waits for the backend to become healthy.

    Args:
        port: HTTP port of the ComfyUI backend (default 8188).
        lowvram: True/False forces the --lowvram flag; None autodetects it
            from the GPU's VRAM.
        health_timeout: maximum seconds to wait for /system_stats to answer
            after starting the service.
        comfyui_dir: root of the ComfyUI install (holding .venv/ and main.py).
        unit_name: name of the systemd user unit (without .service).
        runner: callable(cmd: list) -> CompletedProcess, injectable for tests.
            Exceptions it raises are converted into failed results.

    Returns:
        dict with: ok, active (ActiveState), port, health (bool),
        error (str | None), lowvram (bool), unit_path, migrated (bool),
        reloaded (bool), idempotent (bool).
    """
    # Commands may fail by raising (missing binary, timeout); guard the runner
    # so every such failure flows through the normal error reporting below.
    runner = _guarded_runner(runner or _default_runner)
    result = {
        "ok": False,
        "active": None,
        "port": port,
        "health": False,
        "error": None,
        "lowvram": None,
        "unit_path": None,
        "migrated": False,
        "reloaded": False,
        "idempotent": False,
    }

    comfyui_dir = os.path.abspath(os.path.expanduser(comfyui_dir))
    python_bin = os.path.join(comfyui_dir, ".venv", "bin", "python")
    main_py = os.path.join(comfyui_dir, "main.py")
    if not os.path.exists(python_bin):
        result["error"] = f"venv python no encontrado: {python_bin}"
        return result
    if not os.path.exists(main_py):
        result["error"] = f"main.py no encontrado: {main_py}"
        return result

    # 1. Resolve lowvram (autodetect from VRAM when None).
    lv = lowvram if lowvram is not None else _detect_lowvram(_query_vram_mib(runner))
    result["lowvram"] = bool(lv)

    # 2. Render and install the unit (rewritten only if the content changed).
    content = _render_unit(
        python_bin, main_py, comfyui_dir, port, lv,
        "ComfyUI (Stable Diffusion / Flux backend) gestionado por el registry",
    )
    unit_dir = os.path.expanduser("~/.config/systemd/user")
    try:
        os.makedirs(unit_dir, exist_ok=True)
    except Exception as e:
        result["error"] = f"no se pudo crear {unit_dir}: {e}"
        return result
    unit_path = os.path.join(unit_dir, f"{unit_name}.service")
    result["unit_path"] = unit_path

    try:
        with open(unit_path, "r", encoding="utf-8") as f:
            existing = f.read()
    except Exception:
        # Missing or unreadable unit: treat it as different so it is rewritten.
        existing = None
    changed = existing != content
    if changed:
        tmp = unit_path + ".tmp"
        try:
            # Write to a sibling file and rename it over the unit so a partial
            # write never replaces a working unit.
            with open(tmp, "w", encoding="utf-8") as f:
                f.write(content)
            os.replace(tmp, unit_path)
        except Exception as e:
            try:
                os.remove(tmp)  # do not leave a half-written temp file behind
            except OSError:
                pass
            result["error"] = f"no se pudo escribir el unit: {e}"
            return result
        rl = _systemctl(runner, "daemon-reload")
        result["reloaded"] = rl.returncode == 0
        if rl.returncode != 0:
            result["error"] = f"daemon-reload fallo: {(rl.stderr or '').strip()}"
            return result

    # 3. Enable (idempotent; the user's linger must already be on).
    en = _systemctl(runner, "enable", unit_name)
    if en.returncode != 0:
        result["error"] = (
            f"systemctl --user enable {unit_name} fallo: "
            f"{(en.stderr or '').strip()}. "
            "Si es por falta de linger: `loginctl enable-linger $USER`."
        )
        return result

    # 4. Current state: HTTP health + whether systemd already manages it.
    active_state = _unit_active_state(runner, unit_name)
    health_now = _health(port)
    result["active"] = active_state

    if health_now and active_state == "active":
        # Already managed by systemd and healthy -> leave it alone. A unit
        # rewritten above is not applied to the running process here, so a
        # healthy server is never interrupted; `idempotent` is False then.
        result["ok"] = True
        result["health"] = True
        result["active"] = "active"
        result["idempotent"] = not changed
        return result

    if health_now and active_state != "active":
        # A hand-started process holds the port and systemd does NOT manage
        # it -> migrate cleanly.
        pid = _pid_listening_on_port(port, runner)
        if pid and _is_comfy_process(pid):
            if not _terminate_manual(pid, port, runner):
                result["error"] = (
                    f"no se pudo liberar el puerto {port} (PID {pid}) con SIGTERM; "
                    "no arranco el servicio para no duplicar el bind."
                )
                return result
            result["migrated"] = True
        elif pid:
            result["error"] = (
                f"puerto {port} ocupado por PID {pid} que no parece ComfyUI; "
                "no lo toco ni arranco el servicio."
            )
            return result
        # pid is None although health_now was True: rare race; go on to start.

    # 5. Start through systemd and wait for health. `start` does nothing on
    # an active unit, so an active but unhealthy unit whose file was just
    # rewritten needs `restart` to pick up the new configuration.
    verb = "restart" if changed and active_state == "active" else "start"
    st = _systemctl(runner, verb, unit_name)
    if st.returncode != 0:
        result["active"] = _unit_active_state(runner, unit_name)
        result["error"] = (
            f"systemctl --user {verb} {unit_name} fallo: "
            f"{(st.stderr or '').strip()}. Diagnostica con "
            f"`journalctl --user -u {unit_name} -n 50`."
        )
        return result

    healthy = _wait_health(port, health_timeout)
    result["active"] = _unit_active_state(runner, unit_name)
    result["health"] = healthy
    result["ok"] = healthy
    if not healthy:
        result["error"] = (
            f"el unit arranco pero /system_stats no respondio 2xx en "
            f"{health_timeout}s. Revisa `journalctl --user -u {unit_name} -n 50`."
        )
    return result
|
|
|
|
|
|
if __name__ == "__main__":
    import json
    import sys

    # Command-line front end: translate the recognised flags into keyword
    # arguments and print the resulting status dict as JSON. Arguments that
    # match none of the flags are ignored.
    int_options = {"--port": "port", "--health-timeout": "health_timeout"}
    switches = {"--lowvram": True, "--no-lowvram": False}

    kwargs = {}
    for arg in sys.argv[1:]:
        if arg in switches:
            kwargs["lowvram"] = switches[arg]
            continue
        flag, sep, value = arg.partition("=")
        if not sep:
            continue
        if flag in int_options:
            kwargs[int_options[flag]] = int(value)
        elif flag == "--comfyui-dir":
            kwargs["comfyui_dir"] = value
    print(json.dumps(comfyui_ensure_server(**kwargs), indent=2))
|