"""Garantiza que ComfyUI corre como servicio systemd-user resiliente y sano. Funcion impura: instala/actualiza el unit systemd-user `comfyui.service`, lo habilita y arranca, y comprueba la salud del backend HTTP. Idempotente: si el servicio ya esta gestionado por systemd, activo y respondiendo, no toca nada. Migracion limpia: si ComfyUI ya corre a mano (puerto ocupado por un proceso `main.py` que systemd NO gestiona), lo para con SIGTERM y lo levanta via systemd, para que a partir de ese momento se reinicie solo (Restart=always). Solo depende de la stdlib (subprocess, urllib, os, signal, time, re). No lanza excepciones: siempre devuelve un dict de estado. """ import os import re import signal import subprocess import time import urllib.request def _default_runner(cmd): """Ejecuta un comando capturando salida. Inyectable para tests.""" return subprocess.run(cmd, capture_output=True, text=True, timeout=30) def _detect_lowvram(vram_mib): """Decide si conviene --lowvram segun la VRAM total en MiB. GPUs con <= 8200 MiB (tarjetas de 8 GB) ganan estabilidad con --lowvram para modelos grandes (Flux, video). Si no hay dato de VRAM (None), NO asume lowvram: devuelve False para no penalizar GPUs grandes sin necesidad. """ return vram_mib is not None and vram_mib <= 8200 def _query_vram_mib(runner): """Lee la VRAM total (MiB) de la primera GPU via nvidia-smi. None si falla.""" try: r = runner( [ "nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits", ] ) if r.returncode == 0 and r.stdout.strip(): return int(r.stdout.strip().splitlines()[0].strip()) except Exception: pass return None def _render_unit(python_bin, main_py, working_dir, port, lowvram, description): """Construye el texto del unit systemd-user. Pura (sin I/O).""" exec_start = f"{python_bin} {main_py} --port {port}" if lowvram: exec_start += " --lowvram" return ( "[Unit]\n" f"Description={description}\n" "After=network-online.target\n" "Wants=network-online.target\n" "\n" "[Service]\n" "Type=simple\n" f"WorkingDirectory={working_dir}\n" f"ExecStart={exec_start}\n" # Restart=always (NO on-failure): un SIGTERM limpio es exit success y # con on-failure el servicio no reviviria. Ver .claude/rules/function_tags.md. "Restart=always\n" "RestartSec=5\n" "\n" "[Install]\n" "WantedBy=default.target\n" ) def _health(port, path="/system_stats", timeout=3): """True si GET http://127.0.0.1: responde 2xx.""" url = f"http://127.0.0.1:{port}{path}" try: with urllib.request.urlopen(url, timeout=timeout) as resp: return 200 <= resp.status < 300 except Exception: return False def _wait_health(port, timeout, interval=2.0): """Sondea la salud hasta que responda 2xx o se agote el timeout.""" deadline = time.monotonic() + timeout while time.monotonic() < deadline: if _health(port): return True time.sleep(interval) return _health(port) def _systemctl(runner, *args): return runner(["systemctl", "--user", *args]) def _unit_active_state(runner, unit_name): """Devuelve el ActiveState del unit: active|inactive|failed|... o '' si no existe.""" r = _systemctl(runner, "is-active", unit_name) return (r.stdout or r.stderr or "").strip() def _pid_listening_on_port(port, runner): """PID del proceso que escucha en 127.0.0.1:, o None. Via `ss`.""" try: r = runner(["ss", "-ltnpH", f"sport = :{port}"]) if r.returncode == 0: m = re.search(r"pid=(\d+)", r.stdout or "") if m: return int(m.group(1)) except Exception: pass return None def _is_comfy_process(pid): """True si la cmdline del PID contiene 'main.py' (proceso ComfyUI a mano).""" try: with open(f"/proc/{pid}/cmdline", "rb") as f: cmd = f.read().replace(b"\0", b" ").decode(errors="replace") return "main.py" in cmd except Exception: return False def _terminate_manual(pid, port, runner, wait_s=25.0): """SIGTERM al proceso a mano y espera a que libere el puerto. No usa SIGKILL.""" try: os.kill(pid, signal.SIGTERM) except ProcessLookupError: return True except Exception: return False deadline = time.monotonic() + wait_s while time.monotonic() < deadline: if _pid_listening_on_port(port, runner) is None: return True time.sleep(1.0) # Reintento suave de SIGTERM antes de rendirse (nunca SIGKILL: no destructivo). try: os.kill(pid, signal.SIGTERM) except Exception: pass time.sleep(3.0) return _pid_listening_on_port(port, runner) is None def comfyui_ensure_server( *, port=8188, lowvram=None, health_timeout=60, comfyui_dir="~/ComfyUI", unit_name="comfyui", runner=None, ): """Garantiza ComfyUI corriendo y sano como servicio systemd-user. Args: port: puerto HTTP del backend ComfyUI (default 8188). lowvram: True/False fuerza el flag --lowvram; None autodetecta por VRAM (GPUs <= 8 GB -> True). health_timeout: segundos maximos esperando a que /system_stats responda tras arrancar el servicio. comfyui_dir: raiz de la instalacion de ComfyUI (con .venv/ y main.py). unit_name: nombre del unit systemd-user (sin .service). runner: callable(cmd:list)->CompletedProcess inyectable para tests. Returns: dict con: ok, active (ActiveState), port, health (bool), error (str|None), lowvram (bool), unit_path, migrated (bool), reloaded (bool), idempotent (bool). """ runner = runner or _default_runner result = { "ok": False, "active": None, "port": port, "health": False, "error": None, "lowvram": None, "unit_path": None, "migrated": False, "reloaded": False, "idempotent": False, } comfyui_dir = os.path.abspath(os.path.expanduser(comfyui_dir)) python_bin = os.path.join(comfyui_dir, ".venv", "bin", "python") main_py = os.path.join(comfyui_dir, "main.py") if not os.path.exists(python_bin): result["error"] = f"venv python no encontrado: {python_bin}" return result if not os.path.exists(main_py): result["error"] = f"main.py no encontrado: {main_py}" return result # 1. Resolver lowvram (autodetect por VRAM si es None). lv = lowvram if lowvram is not None else _detect_lowvram(_query_vram_mib(runner)) result["lowvram"] = bool(lv) # 2. Renderizar e instalar el unit (solo reescribe si cambio el contenido). content = _render_unit( python_bin, main_py, comfyui_dir, port, lv, "ComfyUI (Stable Diffusion / Flux backend) gestionado por el registry", ) unit_dir = os.path.expanduser("~/.config/systemd/user") try: os.makedirs(unit_dir, exist_ok=True) except Exception as e: result["error"] = f"no se pudo crear {unit_dir}: {e}" return result unit_path = os.path.join(unit_dir, f"{unit_name}.service") result["unit_path"] = unit_path existing = None if os.path.exists(unit_path): try: with open(unit_path, "r") as f: existing = f.read() except Exception: existing = None changed = existing != content if changed: tmp = unit_path + ".tmp" try: with open(tmp, "w") as f: f.write(content) os.replace(tmp, unit_path) except Exception as e: result["error"] = f"no se pudo escribir el unit: {e}" return result rl = _systemctl(runner, "daemon-reload") result["reloaded"] = rl.returncode == 0 if rl.returncode != 0: result["error"] = f"daemon-reload fallo: {(rl.stderr or '').strip()}" return result # 3. Habilitar (idempotente; el linger del usuario ya debe estar activo). en = _systemctl(runner, "enable", unit_name) if en.returncode != 0: result["error"] = ( f"systemctl --user enable {unit_name} fallo: " f"{(en.stderr or '').strip()}. " "Si es por falta de linger: `loginctl enable-linger $USER`." ) return result # 4. Estado actual: salud HTTP + si systemd ya lo gestiona. active_state = _unit_active_state(runner, unit_name) health_now = _health(port) if health_now and active_state == "active": # Ya gestionado por systemd y sano -> idempotente, no tocar. result["ok"] = True result["health"] = True result["active"] = "active" result["idempotent"] = not changed return result if health_now and active_state != "active": # Proceso a mano ocupa el puerto y systemd NO lo gestiona -> migrar limpio. pid = _pid_listening_on_port(port, runner) if pid and _is_comfy_process(pid): if not _terminate_manual(pid, port, runner): result["error"] = ( f"no se pudo liberar el puerto {port} (PID {pid}) con SIGTERM; " "no arranco el servicio para no duplicar el bind." ) return result result["migrated"] = True elif pid: result["error"] = ( f"puerto {port} ocupado por PID {pid} que no parece ComfyUI; " "no lo toco ni arranco el servicio." ) return result # Si pid es None pero health_now True: race raro; seguimos a start. # 5. Arrancar via systemd y esperar salud. st = _systemctl(runner, "start", unit_name) if st.returncode != 0: result["active"] = _unit_active_state(runner, unit_name) result["error"] = ( f"systemctl --user start {unit_name} fallo: " f"{(st.stderr or '').strip()}. Diagnostica con " f"`journalctl --user -u {unit_name} -n 50`." ) return result healthy = _wait_health(port, health_timeout) result["active"] = _unit_active_state(runner, unit_name) result["health"] = healthy result["ok"] = healthy if not healthy: result["error"] = ( f"el unit arranco pero /system_stats no respondio 2xx en " f"{health_timeout}s. Revisa `journalctl --user -u {unit_name} -n 50`." ) return result if __name__ == "__main__": import json import sys kwargs = {} for arg in sys.argv[1:]: if arg.startswith("--port="): kwargs["port"] = int(arg.split("=", 1)[1]) elif arg == "--lowvram": kwargs["lowvram"] = True elif arg == "--no-lowvram": kwargs["lowvram"] = False elif arg.startswith("--health-timeout="): kwargs["health_timeout"] = int(arg.split("=", 1)[1]) elif arg.startswith("--comfyui-dir="): kwargs["comfyui_dir"] = arg.split("=", 1)[1] print(json.dumps(comfyui_ensure_server(**kwargs), indent=2))