feat(infra): auto-commit con 3 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-24 11:45:31 +02:00
parent c0b2dce3b0
commit 68f0ce0dae
3 changed files with 572 additions and 0 deletions
@@ -0,0 +1,326 @@
"""Garantiza que ComfyUI corre como servicio systemd-user resiliente y sano.
Funcion impura: instala/actualiza el unit systemd-user `comfyui.service`, lo
habilita y arranca, y comprueba la salud del backend HTTP. Idempotente: si el
servicio ya esta gestionado por systemd, activo y respondiendo, no toca nada.
Migracion limpia: si ComfyUI ya corre a mano (puerto ocupado por un proceso
`main.py` que systemd NO gestiona), lo para con SIGTERM y lo levanta via
systemd, para que a partir de ese momento se reinicie solo (Restart=always).
Solo depende de la stdlib (subprocess, urllib, os, signal, time, re). No lanza
excepciones: siempre devuelve un dict de estado.
"""
import os
import re
import signal
import subprocess
import time
import urllib.request
def _default_runner(cmd):
"""Ejecuta un comando capturando salida. Inyectable para tests."""
return subprocess.run(cmd, capture_output=True, text=True, timeout=30)
def _detect_lowvram(vram_mib):
"""Decide si conviene --lowvram segun la VRAM total en MiB.
GPUs con <= 8200 MiB (tarjetas de 8 GB) ganan estabilidad con --lowvram para
modelos grandes (Flux, video). Si no hay dato de VRAM (None), NO asume
lowvram: devuelve False para no penalizar GPUs grandes sin necesidad.
"""
return vram_mib is not None and vram_mib <= 8200
def _query_vram_mib(runner):
"""Lee la VRAM total (MiB) de la primera GPU via nvidia-smi. None si falla."""
try:
r = runner(
[
"nvidia-smi",
"--query-gpu=memory.total",
"--format=csv,noheader,nounits",
]
)
if r.returncode == 0 and r.stdout.strip():
return int(r.stdout.strip().splitlines()[0].strip())
except Exception:
pass
return None
def _render_unit(python_bin, main_py, working_dir, port, lowvram, description):
"""Construye el texto del unit systemd-user. Pura (sin I/O)."""
exec_start = f"{python_bin} {main_py} --port {port}"
if lowvram:
exec_start += " --lowvram"
return (
"[Unit]\n"
f"Description={description}\n"
"After=network-online.target\n"
"Wants=network-online.target\n"
"\n"
"[Service]\n"
"Type=simple\n"
f"WorkingDirectory={working_dir}\n"
f"ExecStart={exec_start}\n"
# Restart=always (NO on-failure): un SIGTERM limpio es exit success y
# con on-failure el servicio no reviviria. Ver .claude/rules/function_tags.md.
"Restart=always\n"
"RestartSec=5\n"
"\n"
"[Install]\n"
"WantedBy=default.target\n"
)
def _health(port, path="/system_stats", timeout=3):
"""True si GET http://127.0.0.1:<port><path> responde 2xx."""
url = f"http://127.0.0.1:{port}{path}"
try:
with urllib.request.urlopen(url, timeout=timeout) as resp:
return 200 <= resp.status < 300
except Exception:
return False
def _wait_health(port, timeout, interval=2.0):
    """Poll the health endpoint until it answers 2xx or *timeout* elapses.

    At least one probe is always made, and one last probe is made once the
    deadline has been reached. The pause between probes is clamped to the time
    left, so the wait does not run past *timeout* by a whole *interval*.

    Args:
        port: TCP port probed via ``_health``.
        timeout: maximum number of seconds to keep polling.
        interval: seconds to sleep between probes.

    Returns:
        ``True`` as soon as a probe succeeds, ``False`` if none does in time.
    """
    deadline = time.monotonic() + timeout
    while True:
        if _health(port):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        time.sleep(min(interval, remaining))
def _systemctl(runner, *args):
return runner(["systemctl", "--user", *args])
def _unit_active_state(runner, unit_name):
    """Return what ``systemctl --user is-active <unit_name>`` printed.

    Args:
        runner: callable used to execute the command.
        unit_name: name of the systemd user unit to query.

    Returns:
        The stripped stdout of the command (e.g. ``active``, ``inactive``,
        ``failed``); when stdout is empty, the stripped stderr; and an empty
        string when both are empty.
    """
    probe = _systemctl(runner, "is-active", unit_name)
    for stream in (probe.stdout, probe.stderr):
        if stream:
            return stream.strip()
    return ""
def _pid_listening_on_port(port, runner):
"""PID del proceso que escucha en 127.0.0.1:<port>, o None. Via `ss`."""
try:
r = runner(["ss", "-ltnpH", f"sport = :{port}"])
if r.returncode == 0:
m = re.search(r"pid=(\d+)", r.stdout or "")
if m:
return int(m.group(1))
except Exception:
pass
return None
def _is_comfy_process(pid):
"""True si la cmdline del PID contiene 'main.py' (proceso ComfyUI a mano)."""
try:
with open(f"/proc/{pid}/cmdline", "rb") as f:
cmd = f.read().replace(b"\0", b" ").decode(errors="replace")
return "main.py" in cmd
except Exception:
return False
def _terminate_manual(pid, port, runner, wait_s=25.0):
"""SIGTERM al proceso a mano y espera a que libere el puerto. No usa SIGKILL."""
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
return True
except Exception:
return False
deadline = time.monotonic() + wait_s
while time.monotonic() < deadline:
if _pid_listening_on_port(port, runner) is None:
return True
time.sleep(1.0)
# Reintento suave de SIGTERM antes de rendirse (nunca SIGKILL: no destructivo).
try:
os.kill(pid, signal.SIGTERM)
except Exception:
pass
time.sleep(3.0)
return _pid_listening_on_port(port, runner) is None
def comfyui_ensure_server(
    *,
    port=8188,
    lowvram=None,
    health_timeout=60,
    comfyui_dir="~/ComfyUI",
    unit_name="comfyui",
    runner=None,
):
    """Ensure ComfyUI is running and healthy as a systemd user service.

    Steps: validate the installation, render and install the unit file
    (rewritten only when its content changed, followed by ``daemon-reload``),
    enable the unit, then inspect the current state:

    * unit active and HTTP healthy: nothing else is touched;
    * HTTP healthy but unit not active: the process listening on the port is
      stopped with SIGTERM if it looks like ComfyUI, and the unit is started;
      a listener that does not look like ComfyUI is left alone and reported;
    * otherwise the unit is started (restarted when the unit file has just
      changed and the unit is already active), and health is awaited.

    Args:
        port: HTTP port of the ComfyUI backend.
        lowvram: ``True``/``False`` forces the ``--lowvram`` flag; ``None``
            autodetects it from the GPU's total VRAM.
        health_timeout: maximum seconds to wait for ``/system_stats`` after
            starting the service.
        comfyui_dir: root of the ComfyUI installation; must contain
            ``.venv/bin/python`` and ``main.py``.
        unit_name: name of the systemd user unit (without ``.service``).
        runner: callable ``(cmd: list) -> CompletedProcess``-like object,
            injectable for tests; defaults to ``_default_runner``.

    Returns:
        A dict with the keys ``ok``, ``active`` (unit state string),
        ``port``, ``health`` (bool), ``error`` (str or None), ``lowvram``
        (bool), ``unit_path``, ``migrated`` (bool), ``reloaded`` (bool) and
        ``idempotent`` (bool). The function never raises ``Exception``: any
        unexpected failure is reported through ``error`` with ``ok`` False.
    """
    result = {
        "ok": False,
        "active": None,
        "port": port,
        "health": False,
        "error": None,
        "lowvram": None,
        "unit_path": None,
        "migrated": False,
        "reloaded": False,
        "idempotent": False,
    }
    try:
        _ensure_server_steps(
            result,
            port=port,
            lowvram=lowvram,
            health_timeout=health_timeout,
            comfyui_dir=comfyui_dir,
            unit_name=unit_name,
            runner=runner,
        )
    except Exception as exc:
        # Contract: always hand back a status dict. An injected runner or the
        # filesystem may still raise somewhere the steps do not anticipate.
        result["ok"] = False
        result["error"] = f"error inesperado: {type(exc).__name__}: {exc}"
    return result


def _ensure_server_steps(
    result, *, port, lowvram, health_timeout, comfyui_dir, unit_name, runner
):
    """Carry out the ensure-server workflow, recording the outcome in *result*."""
    runner = runner or _default_runner

    comfyui_dir = os.path.abspath(os.path.expanduser(comfyui_dir))
    python_bin = os.path.join(comfyui_dir, ".venv", "bin", "python")
    main_py = os.path.join(comfyui_dir, "main.py")
    if not os.path.exists(python_bin):
        result["error"] = f"venv python no encontrado: {python_bin}"
        return
    if not os.path.exists(main_py):
        result["error"] = f"main.py no encontrado: {main_py}"
        return

    # 1. Resolve lowvram (autodetect from VRAM when None).
    lv = lowvram if lowvram is not None else _detect_lowvram(_query_vram_mib(runner))
    result["lowvram"] = bool(lv)

    # 2. Render and install the unit (rewritten only if the content changed).
    content = _render_unit(
        python_bin, main_py, comfyui_dir, port, lv,
        "ComfyUI (Stable Diffusion / Flux backend) gestionado por el registry",
    )
    unit_dir = os.path.expanduser("~/.config/systemd/user")
    try:
        os.makedirs(unit_dir, exist_ok=True)
    except Exception as e:
        result["error"] = f"no se pudo crear {unit_dir}: {e}"
        return
    unit_path = os.path.join(unit_dir, f"{unit_name}.service")
    result["unit_path"] = unit_path
    try:
        changed = _write_unit_if_changed(unit_path, content)
    except Exception as e:
        result["error"] = f"no se pudo escribir el unit: {e}"
        return
    if changed:
        rl = _systemctl(runner, "daemon-reload")
        result["reloaded"] = rl.returncode == 0
        if rl.returncode != 0:
            result["error"] = f"daemon-reload fallo: {(rl.stderr or '').strip()}"
            return

    # 3. Enable (idempotent; user lingering is expected to be on already).
    en = _systemctl(runner, "enable", unit_name)
    if en.returncode != 0:
        result["error"] = (
            f"systemctl --user enable {unit_name} fallo: "
            f"{(en.stderr or '').strip()}. "
            "Si es por falta de linger: `loginctl enable-linger $USER`."
        )
        return

    # 4. Current state: HTTP health + whether systemd already manages it.
    active_state = _unit_active_state(runner, unit_name)
    health_now = _health(port)
    if health_now and active_state == "active":
        # Managed by systemd and healthy -> leave the running service alone.
        # A unit file rewritten above only applies from its next restart,
        # which is why "idempotent" is False in that case.
        result["ok"] = True
        result["health"] = True
        result["active"] = "active"
        result["idempotent"] = not changed
        return
    if health_now and active_state != "active":
        # Something answers on the port but systemd does not manage it.
        pid = _pid_listening_on_port(port, runner)
        if pid and _is_comfy_process(pid):
            if not _terminate_manual(pid, port, runner):
                result["error"] = (
                    f"no se pudo liberar el puerto {port} (PID {pid}) con SIGTERM; "
                    "no arranco el servicio para no duplicar el bind."
                )
                return
            result["migrated"] = True
        elif pid:
            result["error"] = (
                f"puerto {port} ocupado por PID {pid} que no parece ComfyUI; "
                "no lo toco ni arranco el servicio."
            )
            return
        # pid is None although health_now was True: odd race; go on to start.

    # 5. Start through systemd and wait for health. `start` is a no-op on a
    # unit that is already active, so when the unit file has just changed and
    # the running instance is not answering on the requested port, a restart
    # is needed for the new definition to take effect.
    verb = "restart" if (changed and active_state == "active") else "start"
    st = _systemctl(runner, verb, unit_name)
    if st.returncode != 0:
        result["active"] = _unit_active_state(runner, unit_name)
        result["error"] = (
            f"systemctl --user {verb} {unit_name} fallo: "
            f"{(st.stderr or '').strip()}. Diagnostica con "
            f"`journalctl --user -u {unit_name} -n 50`."
        )
        return
    healthy = _wait_health(port, health_timeout)
    result["active"] = _unit_active_state(runner, unit_name)
    result["health"] = healthy
    result["ok"] = healthy
    if not healthy:
        result["error"] = (
            f"el unit arranco pero /system_stats no respondio 2xx en "
            f"{health_timeout}s. Revisa `journalctl --user -u {unit_name} -n 50`."
        )


def _write_unit_if_changed(unit_path, content):
    """Write *content* to *unit_path* atomically if it differs; return True if written."""
    try:
        with open(unit_path, "r", encoding="utf-8") as fh:
            if fh.read() == content:
                return False
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file: treat as changed.
        pass
    tmp_path = unit_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            fh.write(content)
        os.replace(tmp_path, unit_path)
    except Exception:
        # Do not leave a stray temporary file behind on failure.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return True
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check and print the
    # resulting status dict as indented JSON on stdout.
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser(
        description="Garantiza ComfyUI como servicio systemd-user y muestra el estado en JSON.",
        allow_abbrev=False,
    )
    parser.add_argument("--port", type=int, help="puerto HTTP del backend")
    parser.add_argument(
        "--lowvram", dest="lowvram", action="store_const", const=True,
        default=None, help="fuerza --lowvram",
    )
    parser.add_argument(
        "--no-lowvram", dest="lowvram", action="store_const", const=False,
        help="desactiva --lowvram",
    )
    parser.add_argument(
        "--health-timeout", dest="health_timeout", type=int,
        help="segundos maximos esperando la salud",
    )
    parser.add_argument(
        "--comfyui-dir", dest="comfyui_dir", help="raiz de la instalacion de ComfyUI",
    )
    parser.add_argument(
        "--unit-name", dest="unit_name", help="nombre del unit systemd-user",
    )

    # Unknown arguments are still tolerated, as before, but no longer pass
    # unnoticed: a typo such as "--prot=9000" now produces a warning.
    cli, unknown = parser.parse_known_args(sys.argv[1:])
    if unknown:
        print(
            f"aviso: argumentos no reconocidos ignorados: {' '.join(unknown)}",
            file=sys.stderr,
        )

    # Only forward what was given so the function's own defaults apply.
    kwargs = {key: value for key, value in vars(cli).items() if value is not None}
    print(json.dumps(comfyui_ensure_server(**kwargs), indent=2))