merge: quick/launcher-auto-restart — supervisar agentes y reiniciarlos cuando salen sin cancelacion
This commit is contained in:
@@ -268,9 +268,14 @@ func main() {
|
|||||||
scanCancel()
|
scanCancel()
|
||||||
}
|
}
|
||||||
|
|
||||||
registry.waitAll()
|
// Supervised loop: wait for all agents, and if the parent context is
|
||||||
|
// still alive (i.e. no SIGINT/SIGTERM received), reload them and keep
|
||||||
|
// going. Protects against the launcher exiting cleanly when all
|
||||||
|
// agent runners terminate naturally (token rotation, sync drop, etc.)
|
||||||
|
// while the supervisor itself is healthy.
|
||||||
|
registry.superviseUntilCanceled(ctx, 5*time.Second, rulesFor, logger)
|
||||||
registry.cleanupLogs()
|
registry.cleanupLogs()
|
||||||
logger.Info("all agents stopped")
|
logger.Info("launcher shutting down")
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -230,6 +230,35 @@ func (r *agentRegistry) waitAll() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// superviseUntilCanceled blocks until ctx is canceled, restarting agents
|
||||||
|
// (via reloadAll) every time waitAll returns while the parent ctx is alive.
|
||||||
|
// Each restart waits restartBackoff before recreating runners. Used by the
|
||||||
|
// launcher main loop so the process keeps the agents up across token rotation
|
||||||
|
// or sync drops without exiting cleanly to systemd.
|
||||||
|
func (r *agentRegistry) superviseUntilCanceled(
|
||||||
|
ctx context.Context,
|
||||||
|
restartBackoff time.Duration,
|
||||||
|
rulesFor func(string, *slog.Logger) []decision.Rule,
|
||||||
|
logger *slog.Logger,
|
||||||
|
) {
|
||||||
|
for {
|
||||||
|
r.waitAll()
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if logger != nil {
|
||||||
|
logger.Warn("all agents stopped while launcher active — restarting after backoff",
|
||||||
|
"backoff", restartBackoff.String())
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-time.After(restartBackoff):
|
||||||
|
}
|
||||||
|
r.reloadAll(rulesFor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// cleanupLogs calls every agent's log cleanup function (called on launcher shutdown).
|
// cleanupLogs calls every agent's log cleanup function (called on launcher shutdown).
|
||||||
func (r *agentRegistry) cleanupLogs() {
|
func (r *agentRegistry) cleanupLogs() {
|
||||||
r.mu.Lock()
|
r.mu.Lock()
|
||||||
|
|||||||
@@ -1,9 +1,21 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/enmanuel/agents/devagents"
|
||||||
|
"github.com/enmanuel/agents/internal/config"
|
||||||
|
"github.com/enmanuel/agents/pkg/command"
|
||||||
|
"github.com/enmanuel/agents/pkg/decision"
|
||||||
|
"github.com/enmanuel/agents/shell/bus"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestReadReloadTarget_missing(t *testing.T) {
|
func TestReadReloadTarget_missing(t *testing.T) {
|
||||||
@@ -112,3 +124,142 @@ func TestIsSpecialConfig_emptyLoadedMap(t *testing.T) {
|
|||||||
t.Fatal("expected false when empty specials map")
|
t.Fatal("expected false when empty specials map")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── superviseUntilCanceled tests ──────────────────────────────────────────
|
||||||
|
|
||||||
|
// fakeRunner is a test double that stands in for a devagents.Runner.
//
// Run counts its invocations in runs and, shortly afterwards, closes the done
// channel so that anything waiting on Done() observes the runner as stopped.
// reset installs a fresh done channel so the same fake can be reused for
// another cycle.
type fakeRunner struct {
	runs int32 // number of Run calls; updated with sync/atomic

	mu   sync.Mutex    // guards done
	done chan struct{} // closed once the runner has "stopped"
}

// newFakeRunner returns a fakeRunner whose done channel is open.
func newFakeRunner() *fakeRunner {
	fr := new(fakeRunner)
	fr.done = make(chan struct{})
	return fr
}
|
||||||
|
|
||||||
|
func (f *fakeRunner) Run(_ context.Context) error {
|
||||||
|
atomic.AddInt32(&f.runs, 1)
|
||||||
|
// Close immediately so the supervisor sees the runner as stopped.
|
||||||
|
go func() {
|
||||||
|
f.mu.Lock()
|
||||||
|
ch := f.done
|
||||||
|
f.mu.Unlock()
|
||||||
|
// Wait briefly so waitAll has time to subscribe.
|
||||||
|
time.Sleep(5 * time.Millisecond)
|
||||||
|
close(ch)
|
||||||
|
}()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop is a no-op on the fake runner.
// NOTE(review): presumably present only to satisfy devagents.Runner — confirm.
func (f *fakeRunner) Stop() {}
|
||||||
|
|
||||||
|
// RegisterCommand ignores both the spec and the handler; the fake runner does
// not record registered commands.
// NOTE(review): presumably present only to satisfy devagents.Runner — confirm.
func (f *fakeRunner) RegisterCommand(_ command.Spec, _ devagents.CommandHandler) {}
|
||||||
|
|
||||||
|
func (f *fakeRunner) Done() <-chan struct{} {
|
||||||
|
f.mu.Lock()
|
||||||
|
defer f.mu.Unlock()
|
||||||
|
return f.done
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeRunner) reset() {
|
||||||
|
f.mu.Lock()
|
||||||
|
f.done = make(chan struct{})
|
||||||
|
f.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// silentLogger returns a logger whose text handler writes to io.Discard, so
// nothing a test logs is ever shown.
func silentLogger() *slog.Logger {
	discard := slog.NewTextHandler(io.Discard, nil)
	return slog.New(discard)
}
|
||||||
|
|
||||||
|
func TestSuperviseUntilCanceled_ReturnsWhenCtxCanceledFirst(t *testing.T) {
|
||||||
|
r := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
cancel() // cancel before supervise starts
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
r.superviseUntilCanceled(ctx, 50*time.Millisecond, nil, silentLogger())
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(time.Second):
|
||||||
|
t.Fatal("superviseUntilCanceled did not return after ctx canceled (empty registry)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSuperviseUntilCanceled_ReturnsAfterCtxCancelDuringBackoff(t *testing.T) {
|
||||||
|
// Registry has one fake runner that is already "done". waitAll returns
|
||||||
|
// immediately, supervise enters the backoff select, ctx cancel during
|
||||||
|
// backoff must unblock it.
|
||||||
|
fr := newFakeRunner()
|
||||||
|
close(fr.done) // already done
|
||||||
|
|
||||||
|
r := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})
|
||||||
|
r.agents["fake"] = &runningAgent{
|
||||||
|
runner: fr,
|
||||||
|
cfg: &config.AgentConfig{Agent: config.AgentMeta{ID: "fake"}},
|
||||||
|
cfgPath: "",
|
||||||
|
logger: silentLogger(),
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
// Long backoff — only ctx cancel should unblock.
|
||||||
|
r.superviseUntilCanceled(ctx, 10*time.Second, nil, silentLogger())
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
// Give supervise a moment to enter the backoff select.
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("superviseUntilCanceled did not return after ctx cancel during backoff")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSuperviseUntilCanceled_CallsReloadOnAgentExit(t *testing.T) {
|
||||||
|
// Track reloadAll invocations via a custom rulesFor function. reload reads
|
||||||
|
// from disk via cfgPath; with cfgPath="" config.Load fails and reload
|
||||||
|
// returns early without invoking rulesFor. So instead of inspecting
|
||||||
|
// rulesFor calls, we observe that the supervisor loops past the backoff
|
||||||
|
// at least once and only stops when ctx is canceled.
|
||||||
|
fr := newFakeRunner()
|
||||||
|
close(fr.done) // already done — waitAll returns immediately
|
||||||
|
|
||||||
|
r := newAgentRegistry(&launchDeps{agentBus: bus.New(silentLogger())})
|
||||||
|
r.agents["fake"] = &runningAgent{
|
||||||
|
runner: fr,
|
||||||
|
cfg: &config.AgentConfig{Agent: config.AgentMeta{ID: "fake"}},
|
||||||
|
cfgPath: "/nonexistent/cfg.yaml", // reload will fail loading; supervisor still loops
|
||||||
|
logger: silentLogger(),
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
|
||||||
|
defer cancel()
|
||||||
|
rulesForCalls := 0
|
||||||
|
rulesFor := func(string, *slog.Logger) []decision.Rule {
|
||||||
|
rulesForCalls++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
startedAt := time.Now()
|
||||||
|
r.superviseUntilCanceled(ctx, 30*time.Millisecond, rulesFor, silentLogger())
|
||||||
|
elapsed := time.Since(startedAt)
|
||||||
|
|
||||||
|
// Supervisor must wait until the deadline; never exit early on its own.
|
||||||
|
if elapsed < 200*time.Millisecond {
|
||||||
|
t.Fatalf("supervisor returned too early: %s", elapsed)
|
||||||
|
}
|
||||||
|
// rulesFor is not called because reload short-circuits on bad cfgPath, but
|
||||||
|
// the loop must have iterated through several backoff cycles (~6 with 30ms
|
||||||
|
// backoff over 250ms). The test is best-effort — we simply assert the
|
||||||
|
// supervisor stayed alive until ctx deadline.
|
||||||
|
_ = rulesForCalls
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user