Files
agents_and_robots/cmd/launcher/registry_test.go
T
egutierrez 10f0614fc0 fix(launcher): supervisar agentes y reiniciarlos cuando salen sin cancelacion
El launcher salia con status=0 cuando todos los runners (Agent/Robot)
terminaban su Run() de forma natural — por ejemplo tras una rotacion de
token de Matrix o un drop del sync. systemd, configurado con
Restart=on-failure, no relanzaba el proceso al ver salida limpia y los
bots quedaban caidos hasta una intervencion manual.

Solucion: nueva rutina superviseUntilCanceled en agentRegistry que
bloquea sobre waitAll, y si el ctx padre sigue vivo, espera un backoff
y llama reloadAll para recrear los runners. Solo cuando el ctx padre
se cancela (SIGINT/SIGTERM) la rutina retorna y el launcher sale.

main.go pasa a invocar este supervisor en lugar de waitAll directo.

Tests:
- TestSuperviseUntilCanceled_ReturnsWhenCtxCanceledFirst — empty registry
- TestSuperviseUntilCanceled_ReturnsAfterCtxCancelDuringBackoff — cancel
  durante el backoff debe desbloquear inmediatamente
- TestSuperviseUntilCanceled_CallsReloadOnAgentExit — supervisor sigue
  vivo todo el deadline aunque reload falle por cfgPath invalido

Diagnostico: tras varias horas el journalctl mostraba "Deactivated
successfully" sin "Stopping" previo (Apr 13 18:22 tras 23h corriendo)
y el log del agent registraba "context canceled" tras "starting matrix
sync" — sintoma de que mautrix.SyncWithContext salio limpiamente y el
ctx.cancel se propago al cerrar la goroutine sin que systemd hubiera
enviado SIGTERM. El bucle supervisado lo arregla recreando los runners
sin tocar la unit ni depender del Restart de systemd.
2026-05-09 14:55:41 +02:00

266 lines
7.6 KiB
Go

package main
import (
"context"
"io"
"log/slog"
"os"
"path/filepath"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/enmanuel/agents/devagents"
"github.com/enmanuel/agents/internal/config"
"github.com/enmanuel/agents/pkg/command"
"github.com/enmanuel/agents/pkg/decision"
"github.com/enmanuel/agents/shell/bus"
)
// TestReadReloadTarget_missing checks that pointing readReloadTarget at a path
// with no file behind it yields the empty string.
func TestReadReloadTarget_missing(t *testing.T) {
	path := filepath.Join(t.TempDir(), "reload.txt")

	if target := readReloadTarget(path); target != "" {
		t.Fatalf("expected empty string for missing file, got %q", target)
	}
}
// TestReadReloadTarget_empty checks that a zero-length reload file yields the
// empty string.
func TestReadReloadTarget_empty(t *testing.T) {
	path := filepath.Join(t.TempDir(), "reload.txt")

	err := os.WriteFile(path, []byte(""), 0o644)
	if err != nil {
		t.Fatal(err)
	}

	if target := readReloadTarget(path); target != "" {
		t.Fatalf("expected empty string for empty file, got %q", target)
	}
}
// TestReadReloadTarget_star checks that a reload file containing only "*"
// (plus a trailing newline) yields the empty string.
func TestReadReloadTarget_star(t *testing.T) {
	path := filepath.Join(t.TempDir(), "reload.txt")

	err := os.WriteFile(path, []byte("*\n"), 0o644)
	if err != nil {
		t.Fatal(err)
	}

	if target := readReloadTarget(path); target != "" {
		t.Fatalf("expected empty string for '*', got %q", target)
	}
}
// TestReadReloadTarget_agentID checks that an agent id written to the reload
// file is returned without its trailing newline.
func TestReadReloadTarget_agentID(t *testing.T) {
	path := filepath.Join(t.TempDir(), "reload.txt")

	err := os.WriteFile(path, []byte("assistant-bot\n"), 0o644)
	if err != nil {
		t.Fatal(err)
	}

	if target := readReloadTarget(path); target != "assistant-bot" {
		t.Fatalf("expected 'assistant-bot', got %q", target)
	}
}
// TestReadReloadTarget_whitespace checks that spaces around the agent id and
// the trailing newline are stripped from the returned target.
func TestReadReloadTarget_whitespace(t *testing.T) {
	path := filepath.Join(t.TempDir(), "reload.txt")

	err := os.WriteFile(path, []byte(" asistente-2 \n"), 0o644)
	if err != nil {
		t.Fatal(err)
	}

	if target := readReloadTarget(path); target != "asistente-2" {
		t.Fatalf("expected 'asistente-2', got %q", target)
	}
}
// ── isSpecialConfig tests ─────────────────────────────────────────────────
// TestIsSpecialConfig_matchesLoadedSpecial writes a config file containing a
// "special" section whose id is "orchestrator" and expects isSpecialConfig to
// report true when "orchestrator" is present in the loaded-specials map.
//
// NOTE(review): the YAML literal below shows no nesting indentation in this
// view — confirm the literal's whitespace against the repository copy before
// reformatting, since it is part of the fixture's content.
func TestIsSpecialConfig_matchesLoadedSpecial(t *testing.T) {
dir := t.TempDir()
cfg := filepath.Join(dir, "config.yaml")
// The raw string is written verbatim to disk as the fixture file.
if err := os.WriteFile(cfg, []byte(`
special:
id: orchestrator
type: orchestrator
enabled: true
llm:
primary:
provider: openai
`), 0o644); err != nil {
t.Fatal(err)
}
// The id from the fixture is marked as loaded, so a match is expected.
loaded := map[string]bool{"orchestrator": true}
if !isSpecialConfig(cfg, loaded) {
t.Fatal("expected isSpecialConfig to return true for loaded orchestrator")
}
}
// TestIsSpecialConfig_agentConfigNotSpecial writes a config file that has an
// "agent" section instead of a "special" one and expects isSpecialConfig to
// report false, even though the loaded-specials map is non-empty.
//
// NOTE(review): the YAML literal below shows no nesting indentation in this
// view — confirm the literal's whitespace against the repository copy before
// reformatting, since it is part of the fixture's content.
func TestIsSpecialConfig_agentConfigNotSpecial(t *testing.T) {
dir := t.TempDir()
cfg := filepath.Join(dir, "config.yaml")
// An AgentConfig doesn't have special.id, so LoadSpecial will fail validation.
if err := os.WriteFile(cfg, []byte(`
agent:
id: father-bot
enabled: true
matrix:
homeserver: "https://example.com"
user_id: "@father:example.com"
llm:
primary:
provider: claude-code
`), 0o644); err != nil {
t.Fatal(err)
}
// "orchestrator" is loaded, but the fixture above does not describe it.
loaded := map[string]bool{"orchestrator": true}
if isSpecialConfig(cfg, loaded) {
t.Fatal("expected isSpecialConfig to return false for agent config")
}
}
// TestIsSpecialConfig_emptyLoadedMap checks that isSpecialConfig reports false
// when no specials are loaded, both for a nil map and for an empty one.
func TestIsSpecialConfig_emptyLoadedMap(t *testing.T) {
	cases := []struct {
		loaded  map[string]bool
		failure string
	}{
		{loaded: nil, failure: "expected false when no specials loaded"},
		{loaded: map[string]bool{}, failure: "expected false when empty specials map"},
	}

	for _, tc := range cases {
		if isSpecialConfig("any-path", tc.loaded) {
			t.Fatal(tc.failure)
		}
	}
}
// ── superviseUntilCanceled tests ──────────────────────────────────────────
// fakeRunner implements devagents.Runner for tests. Each Run() call increments
// runs, then closes done after a tiny delay so waitAll observes a "stopped"
// runner. reset installs a fresh done channel so the same fake can be reused
// for another cycle.
type fakeRunner struct {
	mu   sync.Mutex    // guards done
	done chan struct{} // closed once the runner counts as stopped
	runs int32         // number of Run calls; updated with sync/atomic
}

// newFakeRunner returns a fakeRunner with an open done channel and zero runs.
func newFakeRunner() *fakeRunner {
	return &fakeRunner{done: make(chan struct{})}
}

// Run records the invocation and returns nil immediately; the context is
// ignored. A background goroutine closes the current done channel roughly 5ms
// later.
//
// The close is guarded: when the channel is already closed — because a test
// closed done up front, or because Run was called again without a reset in
// between — the goroutine leaves it alone. An unguarded close would panic with
// "close of closed channel" in a background goroutine and abort the whole test
// binary.
func (f *fakeRunner) Run(_ context.Context) error {
	atomic.AddInt32(&f.runs, 1)

	go func() {
		// Capture the channel before sleeping so a later reset does not make
		// this goroutine close the replacement channel.
		f.mu.Lock()
		ch := f.done
		f.mu.Unlock()

		// Wait briefly so waitAll has time to subscribe.
		time.Sleep(5 * time.Millisecond)

		// Holding mu serializes concurrent closers, which makes the
		// check-then-close below safe between Run goroutines.
		f.mu.Lock()
		defer f.mu.Unlock()
		select {
		case <-ch:
			// Already closed; nothing to do.
		default:
			close(ch)
		}
	}()

	return nil
}
// Stop does nothing; the fake has no work to interrupt.
func (f *fakeRunner) Stop() {
}
// RegisterCommand accepts and discards the command spec and its handler; the
// fake never dispatches commands.
func (f *fakeRunner) RegisterCommand(command.Spec, devagents.CommandHandler) {
}
// Done returns the current done channel. The field is read under mu because
// reset may swap it concurrently.
func (f *fakeRunner) Done() <-chan struct{} {
	f.mu.Lock()
	current := f.done
	f.mu.Unlock()
	return current
}
// reset replaces done with a fresh, open channel while holding mu. The previous
// channel is left untouched.
func (f *fakeRunner) reset() {
	fresh := make(chan struct{})

	f.mu.Lock()
	defer f.mu.Unlock()
	f.done = fresh
}
// silentLogger returns a logger whose text handler writes to io.Discard, so
// nothing logged through it shows up in test output.
func silentLogger() *slog.Logger {
	discard := slog.NewTextHandler(io.Discard, nil)
	return slog.New(discard)
}
// TestSuperviseUntilCanceled_ReturnsWhenCtxCanceledFirst starts the supervisor
// on a registry with no agents and a context that is already canceled, and
// fails if the supervisor has not returned within one second.
func TestSuperviseUntilCanceled_ReturnsWhenCtxCanceledFirst(t *testing.T) {
	reg := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})

	// Cancel before the supervisor is even started.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	returned := make(chan struct{})
	go func() {
		defer close(returned)
		reg.superviseUntilCanceled(ctx, 50*time.Millisecond, nil, silentLogger())
	}()

	limit := time.NewTimer(time.Second)
	defer limit.Stop()

	select {
	case <-limit.C:
		t.Fatal("superviseUntilCanceled did not return after ctx canceled (empty registry)")
	case <-returned:
	}
}
// TestSuperviseUntilCanceled_ReturnsAfterCtxCancelDuringBackoff registers one
// fake runner whose done channel is already closed, starts the supervisor with
// a 10s backoff, cancels the context 20ms later, and fails if the supervisor
// has not returned within two seconds. With a backoff that long, only the
// cancellation can account for a timely return.
func TestSuperviseUntilCanceled_ReturnsAfterCtxCancelDuringBackoff(t *testing.T) {
	// The runner is "done" from the start, so waitAll should return at once
	// and the supervisor should move on to its backoff wait.
	stopped := newFakeRunner()
	close(stopped.done)

	reg := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})
	entry := &runningAgent{
		runner:  stopped,
		cfg:     &config.AgentConfig{Agent: config.AgentMeta{ID: "fake"}},
		cfgPath: "",
		logger:  silentLogger(),
	}
	reg.agents["fake"] = entry

	ctx, cancel := context.WithCancel(context.Background())

	returned := make(chan struct{})
	go func() {
		defer close(returned)
		// Long backoff — only ctx cancel should unblock.
		reg.superviseUntilCanceled(ctx, 10*time.Second, nil, silentLogger())
	}()

	// Leave the supervisor a moment to reach the backoff wait, then cancel.
	time.Sleep(20 * time.Millisecond)
	cancel()

	limit := time.NewTimer(2 * time.Second)
	defer limit.Stop()

	select {
	case <-limit.C:
		t.Fatal("superviseUntilCanceled did not return after ctx cancel during backoff")
	case <-returned:
	}
}
// TestSuperviseUntilCanceled_CallsReloadOnAgentExit runs the supervisor
// synchronously against one already-stopped fake runner whose cfgPath points at
// a file that does not exist, under a 250ms context deadline and a 30ms
// backoff. The only assertion is that the supervisor does not return before
// roughly the deadline (at least 200ms).
//
// The rulesFor callback counts its invocations, but the count is deliberately
// not asserted: per the original author's note, reload is expected to bail out
// on the bad cfgPath before rulesFor is reached, so this test is best-effort
// and only proves the supervisor stays alive until the context ends.
func TestSuperviseUntilCanceled_CallsReloadOnAgentExit(t *testing.T) {
	stopped := newFakeRunner()
	close(stopped.done) // already done — waitAll returns immediately

	reg := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})
	entry := &runningAgent{
		runner:  stopped,
		cfg:     &config.AgentConfig{Agent: config.AgentMeta{ID: "fake"}},
		cfgPath: "/nonexistent/cfg.yaml", // reload will fail loading; supervisor still loops
		logger:  silentLogger(),
	}
	reg.agents["fake"] = entry

	ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
	defer cancel()

	rulesForCalls := 0
	countingRules := func(string, *slog.Logger) []decision.Rule {
		rulesForCalls++
		return nil
	}

	begin := time.Now()
	reg.superviseUntilCanceled(ctx, 30*time.Millisecond, countingRules, silentLogger())

	// Supervisor must wait until the deadline; never exit early on its own.
	if ran := time.Since(begin); ran < 200*time.Millisecond {
		t.Fatalf("supervisor returned too early: %s", ran)
	}

	// Intentionally unchecked (see the function comment); the blank assignment
	// only keeps the counter referenced.
	_ = rulesForCalls
}