feat(infra): auto-commit con 3 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,326 @@
|
||||
"""Garantiza que ComfyUI corre como servicio systemd-user resiliente y sano.
|
||||
|
||||
Funcion impura: instala/actualiza el unit systemd-user `comfyui.service`, lo
|
||||
habilita y arranca, y comprueba la salud del backend HTTP. Idempotente: si el
|
||||
servicio ya esta gestionado por systemd, activo y respondiendo, no toca nada.
|
||||
|
||||
Migracion limpia: si ComfyUI ya corre a mano (puerto ocupado por un proceso
|
||||
`main.py` que systemd NO gestiona), lo para con SIGTERM y lo levanta via
|
||||
systemd, para que a partir de ese momento se reinicie solo (Restart=always).
|
||||
|
||||
Solo depende de la stdlib (subprocess, urllib, os, signal, time, re). No lanza
|
||||
excepciones: siempre devuelve un dict de estado.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
|
||||
def _default_runner(cmd):
    """Run `cmd` and capture its output as text.

    This is the default command runner; callers can inject a different
    callable with the same shape (list of args in, completed-process-like
    object out), which is what makes the module testable.

    Args:
        cmd: argument list handed to `subprocess.run` (no shell involved).

    Returns:
        The `subprocess.CompletedProcess` produced by `subprocess.run`, with
        `stdout`/`stderr` decoded to `str`.

    Raises:
        Whatever `subprocess.run` raises, e.g. `subprocess.TimeoutExpired`
        once the 30 second limit is exceeded.
    """
    run_options = {
        "capture_output": True,
        "text": True,
        "timeout": 30,  # seconds; keeps a stuck command from blocking forever
    }
    return subprocess.run(cmd, **run_options)
|
||||
|
||||
|
||||
def _detect_lowvram(vram_mib):
    """Decide whether `--lowvram` should be used for a given amount of VRAM.

    The threshold is 8200 MiB, i.e. it covers cards sold as "8 GB". Those
    benefit from `--lowvram` when running large models (Flux, video).

    Args:
        vram_mib: total VRAM in MiB, or None when it could not be determined.

    Returns:
        True when `vram_mib` is known and at most 8200; False otherwise. An
        unknown amount (None) yields False so that large GPUs are not
        penalised needlessly.
    """
    if vram_mib is None:
        return False
    return vram_mib <= 8200
|
||||
|
||||
|
||||
def _query_vram_mib(runner):
    """Read the total VRAM (MiB) of the first GPU listed by `nvidia-smi`.

    Args:
        runner: callable taking an argument list and returning an object with
            `returncode` and `stdout` attributes.

    Returns:
        The integer on the first output line, or None when the command fails,
        prints nothing, prints something that is not an integer, or the
        runner raises. This is a best-effort probe, so every failure is
        reported as "unknown" rather than as an error.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        proc = runner(query)
        if proc.returncode != 0:
            return None
        output = proc.stdout.strip()
        if not output:
            return None
        # One line per GPU; only the first one is considered.
        first_line = output.splitlines()[0]
        return int(first_line.strip())
    except Exception:
        return None
|
||||
|
||||
|
||||
def _systemd_quote(arg):
    """Escape one ExecStart argument for systemd's command-line parser.

    `%` is doubled so it is not read as a unit specifier. Arguments that are
    empty or contain whitespace, quotes or backslashes are wrapped in double
    quotes with `\\` and `"` backslash-escaped. Anything else is returned
    unchanged, so ordinary paths render exactly as before.

    NOTE(review): `$` is left untouched; a path containing `$` is still
    subject to systemd's environment-variable substitution.
    """
    text = str(arg).replace("%", "%%")
    needs_quotes = not text or any(
        ch.isspace() or ch in "\"'\\" for ch in text
    )
    if not needs_quotes:
        return text
    escaped = text.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _render_unit(python_bin, main_py, working_dir, port, lowvram, description):
    """Build the text of the systemd-user unit. Pure: performs no I/O.

    Args:
        python_bin: interpreter used to launch ComfyUI.
        main_py: path of ComfyUI's `main.py`.
        working_dir: value for `WorkingDirectory=`.
        port: value passed to `--port`.
        lowvram: when truthy, `--lowvram` is appended to the command line.
        description: value for `Description=` (written verbatim).

    Returns:
        The complete unit file content as a string, ending in a newline.

    The interpreter and script paths are quoted for systemd when they contain
    whitespace or quote characters; without that, systemd would split a path
    such as `/home/my user/ComfyUI/main.py` into several arguments. A literal
    `%` in either path or in the working directory is written as `%%` so it
    is not expanded as a specifier.
    """
    argv = [_systemd_quote(python_bin), _systemd_quote(main_py), "--port", str(port)]
    if lowvram:
        argv.append("--lowvram")
    exec_start = " ".join(argv)
    # WorkingDirectory= takes the rest of the line as a single path, so it is
    # not quoted; only the specifier character needs escaping.
    safe_working_dir = str(working_dir).replace("%", "%%")
    return (
        "[Unit]\n"
        f"Description={description}\n"
        "After=network-online.target\n"
        "Wants=network-online.target\n"
        "\n"
        "[Service]\n"
        "Type=simple\n"
        f"WorkingDirectory={safe_working_dir}\n"
        f"ExecStart={exec_start}\n"
        # Restart=always (NOT on-failure): a clean SIGTERM counts as a
        # successful exit, and with on-failure the service would not come
        # back. See .claude/rules/function_tags.md.
        "Restart=always\n"
        "RestartSec=5\n"
        "\n"
        "[Install]\n"
        "WantedBy=default.target\n"
    )
|
||||
|
||||
|
||||
def _health(port, path="/system_stats", timeout=3):
    """Return True when GET http://127.0.0.1:<port><path> answers with 2xx.

    Args:
        port: TCP port of the local backend.
        path: request path appended to the base URL.
        timeout: seconds allowed for the request.

    Returns:
        True for a 2xx response; False for any other status or any failure
        (connection refused, timeout, malformed URL, ...). Never raises.
    """
    url = f"http://127.0.0.1:{port}{path}"
    # The probe targets loopback, so it must never go through a proxy. The
    # default opener honours http_proxy from the environment, which would
    # make a healthy local server look down; an empty ProxyHandler disables
    # proxy lookup for this request only.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
    try:
        with opener.open(url, timeout=timeout) as resp:
            return 200 <= resp.status < 300
    except Exception:
        return False
|
||||
|
||||
|
||||
def _wait_health(port, timeout, interval=2.0):
    """Poll the health endpoint until it succeeds or the time budget runs out.

    Args:
        port: port passed to `_health`.
        timeout: seconds to keep polling, measured on the monotonic clock.
        interval: seconds slept between unsuccessful probes.

    Returns:
        True as soon as a probe succeeds. Once the deadline has passed, one
        last probe is made and its result is returned, so even a zero
        timeout performs a single check.
    """
    stop_at = time.monotonic() + timeout
    while True:
        if time.monotonic() >= stop_at:
            break
        if _health(port):
            return True
        time.sleep(interval)
    # Final attempt after the deadline.
    return _health(port)
|
||||
|
||||
|
||||
def _systemctl(runner, *args):
    """Invoke `systemctl --user` with the given arguments through `runner`.

    Args:
        runner: callable that receives the full argument list.
        *args: systemctl sub-command and its arguments, e.g. "start", "comfyui".

    Returns:
        Whatever `runner` returns for the assembled command.
    """
    command = ["systemctl", "--user"]
    command.extend(args)
    return runner(command)
|
||||
|
||||
|
||||
def _unit_active_state(runner, unit_name):
    """Return what `systemctl --user is-active <unit_name>` reports.

    Args:
        runner: command runner forwarded to `_systemctl`.
        unit_name: name of the unit to query.

    Returns:
        The stripped stdout of the command (normally a state word such as
        "active", "inactive" or "failed"). When stdout is empty the stripped
        stderr is returned instead, and when both are empty the result is ''.
        The exit code is not inspected.
    """
    proc = _systemctl(runner, "is-active", unit_name)
    text = proc.stdout
    if not text:
        # Only consult stderr when stdout produced nothing.
        text = proc.stderr
    if not text:
        text = ""
    return text.strip()
|
||||
|
||||
|
||||
def _pid_listening_on_port(port, runner):
    """Find the PID of a process listening on TCP `port`, using `ss`.

    Runs `ss -ltnpH "sport = :<port>"` and extracts the first `pid=<n>` that
    appears in its output.

    Args:
        port: local TCP port to look up.
        runner: callable taking an argument list and returning an object with
            `returncode` and `stdout`.

    Returns:
        The PID as an int, or None when `ss` exits non-zero, prints no
        `pid=` field, or the runner raises. None therefore means "no
        listener found", which includes the case where the lookup itself
        failed.
    """
    command = ["ss", "-ltnpH", f"sport = :{port}"]
    try:
        proc = runner(command)
        if proc.returncode != 0:
            return None
        found = re.search(r"pid=(\d+)", proc.stdout or "")
        return int(found.group(1)) if found else None
    except Exception:
        return None
|
||||
|
||||
|
||||
def _is_comfy_process(pid):
    """Tell whether the command line of `pid` mentions `main.py`.

    This is the heuristic used to recognise a ComfyUI instance that was
    started by hand: `/proc/<pid>/cmdline` is read and searched for the
    substring "main.py".

    Args:
        pid: process id to inspect.

    Returns:
        True when the substring is present; False when it is absent or the
        command line cannot be read (process gone, no permission, no /proc).
    """
    cmdline_path = f"/proc/{pid}/cmdline"
    try:
        with open(cmdline_path, "rb") as handle:
            raw = handle.read()
        # Arguments are NUL-separated in /proc; turn them into one string.
        text = raw.replace(b"\0", b" ").decode(errors="replace")
    except Exception:
        return False
    return "main.py" in text
|
||||
|
||||
|
||||
def _terminate_manual(pid, port, runner, wait_s=25.0):
    """Send SIGTERM to a hand-started process and wait for the port to free up.

    SIGKILL is never used: the shutdown is meant to be non-destructive.

    Args:
        pid: process to signal.
        port: TCP port that the process is expected to release.
        runner: command runner forwarded to `_pid_listening_on_port`.
        wait_s: seconds to wait for the port after the first SIGTERM.

    Returns:
        True when the process was already gone or no listener is found on the
        port afterwards; False when the signal could not be sent or a
        listener is still found after the retry.
    """

    def port_is_free():
        return _pid_listening_on_port(port, runner) is None

    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        # Nothing to stop: the process no longer exists.
        return True
    except Exception:
        return False

    give_up_at = time.monotonic() + wait_s
    while time.monotonic() < give_up_at:
        if port_is_free():
            return True
        time.sleep(1.0)

    # Gentle second SIGTERM before giving up (never SIGKILL).
    try:
        os.kill(pid, signal.SIGTERM)
    except Exception:
        pass
    time.sleep(3.0)
    return port_is_free()
|
||||
|
||||
|
||||
def _write_unit_if_changed(unit_path, content):
    """Atomically (re)write the unit file when it differs; return True if rewritten.

    An existing file that cannot be read is treated as different. On a failed
    write the temporary file is removed and the exception is re-raised.
    """
    existing = None
    if os.path.exists(unit_path):
        try:
            with open(unit_path, "r") as f:
                existing = f.read()
        except Exception:
            existing = None
    if existing == content:
        return False
    tmp = unit_path + ".tmp"
    try:
        with open(tmp, "w") as f:
            f.write(content)
        os.replace(tmp, unit_path)
    except Exception:
        # Do not leave a half-written temp file next to the real unit.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
    return True


def _release_port_from_manual(result, port, runner):
    """Stop a hand-started ComfyUI holding `port`; return True if start may proceed.

    On False, `result["error"]` explains why the service is not started.
    """
    pid = _pid_listening_on_port(port, runner)
    if not pid:
        # Health answered but no listener PID was found: rare race, go on.
        return True
    if not _is_comfy_process(pid):
        result["error"] = (
            f"puerto {port} ocupado por PID {pid} que no parece ComfyUI; "
            "no lo toco ni arranco el servicio."
        )
        return False
    if not _terminate_manual(pid, port, runner):
        result["error"] = (
            f"no se pudo liberar el puerto {port} (PID {pid}) con SIGTERM; "
            "no arranco el servicio para no duplicar el bind."
        )
        return False
    result["migrated"] = True
    return True


def _ensure_server_steps(
    result, port, lowvram, health_timeout, comfyui_dir, unit_name, runner
):
    """Run the install/enable/start pipeline, recording the outcome in `result`.

    Returns None; every early exit leaves `result["error"]` set. May raise:
    the public entry point is responsible for catching.
    """
    comfyui_dir = os.path.abspath(os.path.expanduser(comfyui_dir))
    python_bin = os.path.join(comfyui_dir, ".venv", "bin", "python")
    main_py = os.path.join(comfyui_dir, "main.py")
    if not os.path.exists(python_bin):
        result["error"] = f"venv python no encontrado: {python_bin}"
        return
    if not os.path.exists(main_py):
        result["error"] = f"main.py no encontrado: {main_py}"
        return

    # 1. Resolve lowvram (autodetect from VRAM when None).
    lv = lowvram if lowvram is not None else _detect_lowvram(_query_vram_mib(runner))
    result["lowvram"] = bool(lv)

    # 2. Render and install the unit (rewritten only when the content changed).
    content = _render_unit(
        python_bin, main_py, comfyui_dir, port, lv,
        "ComfyUI (Stable Diffusion / Flux backend) gestionado por el registry",
    )
    unit_dir = os.path.expanduser("~/.config/systemd/user")
    try:
        os.makedirs(unit_dir, exist_ok=True)
    except Exception as e:
        result["error"] = f"no se pudo crear {unit_dir}: {e}"
        return
    unit_path = os.path.join(unit_dir, f"{unit_name}.service")
    result["unit_path"] = unit_path

    try:
        changed = _write_unit_if_changed(unit_path, content)
    except Exception as e:
        result["error"] = f"no se pudo escribir el unit: {e}"
        return
    if changed:
        rl = _systemctl(runner, "daemon-reload")
        result["reloaded"] = rl.returncode == 0
        if rl.returncode != 0:
            result["error"] = f"daemon-reload fallo: {(rl.stderr or '').strip()}"
            return

    # 3. Enable (idempotent; user lingering must already be turned on).
    en = _systemctl(runner, "enable", unit_name)
    if en.returncode != 0:
        result["error"] = (
            f"systemctl --user enable {unit_name} fallo: "
            f"{(en.stderr or '').strip()}. "
            "Si es por falta de linger: `loginctl enable-linger $USER`."
        )
        return

    # 4. Current state: HTTP health + whether systemd already manages it.
    active_state = _unit_active_state(runner, unit_name)
    health_now = _health(port)

    if health_now and active_state == "active":
        # Already managed by systemd and healthy: leave it alone.
        # NOTE(review): when the unit file was just rewritten, the running
        # process keeps its old command line until the next restart; this is
        # reported through idempotent=False but not acted upon.
        result["ok"] = True
        result["health"] = True
        result["active"] = "active"
        result["idempotent"] = not changed
        return

    if health_now and not _release_port_from_manual(result, port, runner):
        # Something answers on the port that systemd does not manage and it
        # could not be (or must not be) stopped.
        return

    # 5. Start through systemd and wait for health.
    st = _systemctl(runner, "start", unit_name)
    if st.returncode != 0:
        result["active"] = _unit_active_state(runner, unit_name)
        result["error"] = (
            f"systemctl --user start {unit_name} fallo: "
            f"{(st.stderr or '').strip()}. Diagnostica con "
            f"`journalctl --user -u {unit_name} -n 50`."
        )
        return

    healthy = _wait_health(port, health_timeout)
    result["active"] = _unit_active_state(runner, unit_name)
    result["health"] = healthy
    result["ok"] = healthy
    if not healthy:
        result["error"] = (
            f"el unit arranco pero /system_stats no respondio 2xx en "
            f"{health_timeout}s. Revisa `journalctl --user -u {unit_name} -n 50`."
        )


def comfyui_ensure_server(
    *,
    port=8188,
    lowvram=None,
    health_timeout=60,
    comfyui_dir="~/ComfyUI",
    unit_name="comfyui",
    runner=None,
):
    """Ensure ComfyUI is running and healthy as a systemd-user service.

    Installs or updates `<unit_name>.service` under
    `~/.config/systemd/user`, enables it, starts it and waits for the HTTP
    backend to answer. If the service is already active and answering,
    nothing is started or stopped. If a hand-started `main.py` process holds
    the port, it is stopped with SIGTERM and the service is started in its
    place.

    Args:
        port: HTTP port of the ComfyUI backend (default 8188).
        lowvram: True/False forces the `--lowvram` flag; None autodetects it
            from the GPU's VRAM (<= 8 GB cards -> True).
        health_timeout: maximum seconds to wait for `/system_stats` to
            answer after starting the service.
        comfyui_dir: root of the ComfyUI install (holding `.venv/` and
            `main.py`).
        unit_name: systemd-user unit name, without `.service`.
        runner: callable(cmd: list) -> CompletedProcess-like object,
            injectable for tests. Defaults to `_default_runner`.

    Returns:
        dict with: ok, active (ActiveState), port, health (bool),
        error (str | None), lowvram (bool), unit_path, migrated (bool),
        reloaded (bool), idempotent (bool).

    Never raises: any unexpected exception (missing `systemctl`, a command
    timeout, an invalid argument, ...) is reported through `error` with
    `ok` set to False.
    """
    runner = runner or _default_runner
    result = {
        "ok": False,
        "active": None,
        "port": port,
        "health": False,
        "error": None,
        "lowvram": None,
        "unit_path": None,
        "migrated": False,
        "reloaded": False,
        "idempotent": False,
    }
    try:
        _ensure_server_steps(
            result, port, lowvram, health_timeout, comfyui_dir, unit_name, runner
        )
    except Exception as exc:
        # Top-level boundary: honour the "always returns a status dict"
        # contract instead of leaking the exception to the caller.
        result["ok"] = False
        result["error"] = f"fallo inesperado: {type(exc).__name__}: {exc}"
    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import json
    import sys

    # Command-line entry point: translate the recognised arguments into
    # keyword arguments, run the ensure routine and print its status dict as
    # indented JSON. Unrecognised arguments are ignored; when an option is
    # repeated the last occurrence wins.

    # "--name=value" options: (prefix, keyword argument, value converter).
    valued_options = (
        ("--port=", "port", int),
        ("--health-timeout=", "health_timeout", int),
        ("--comfyui-dir=", "comfyui_dir", str),
    )
    # Bare switches that force the lowvram setting on or off.
    lowvram_switches = {"--lowvram": True, "--no-lowvram": False}

    kwargs = {}
    for arg in sys.argv[1:]:
        if arg in lowvram_switches:
            kwargs["lowvram"] = lowvram_switches[arg]
            continue
        for prefix, key, convert in valued_options:
            if arg.startswith(prefix):
                kwargs[key] = convert(arg[len(prefix):])
                break

    print(json.dumps(comfyui_ensure_server(**kwargs), indent=2))
|
||||
Reference in New Issue
Block a user