package main import ( "context" "log/slog" "os" "strings" "sync" "time" "github.com/enmanuel/agents/agents" "github.com/enmanuel/agents/internal/config" "github.com/enmanuel/agents/pkg/decision" "github.com/enmanuel/agents/pkg/orchestration" pksecurity "github.com/enmanuel/agents/pkg/security" "github.com/enmanuel/agents/shell/bus" agentlog "github.com/enmanuel/agents/shell/logger" ) // runningAgent holds a live agent and the metadata needed to recreate it. type runningAgent struct { agent *agents.Agent cfg *config.AgentConfig cfgPath string logger *slog.Logger logCleanup func() } // launchDeps holds shared resources needed to start/reload agents. type launchDeps struct { agentBus *bus.Bus orch *orchHandle logDir string logLevel slog.Level parentCtx context.Context secPolicy pksecurity.SecurityPolicy // centralized security policy loaded from security/ } // agentRegistry tracks all running agents by ID, enabling individual hot-reload. type agentRegistry struct { mu sync.Mutex agents map[string]*runningAgent deps *launchDeps } func newAgentRegistry(deps *launchDeps) *agentRegistry { return &agentRegistry{ agents: make(map[string]*runningAgent), deps: deps, } } // register adds a running agent to the registry and starts its goroutine. func (r *agentRegistry) register(ra *runningAgent) { r.mu.Lock() r.agents[ra.cfg.Agent.ID] = ra r.mu.Unlock() go func() { ra.logger.Info("agent running") if err := ra.agent.Run(r.deps.parentCtx); err != nil { ra.logger.Error("agent stopped with error", "err", err) } }() } // stopAndWait stops a running agent and waits for it to finish. // Caller must NOT hold r.mu. func (r *agentRegistry) stopAndWait(id string) { r.mu.Lock() ra, ok := r.agents[id] r.mu.Unlock() if !ok { return } ra.agent.Stop() select { case <-ra.agent.Done(): case <-time.After(10 * time.Second): ra.logger.Warn("agent did not stop within 10s, forcing", "id", id) } // Unsubscribe from bus so no stale channel remains. r.deps.agentBus.Unsubscribe(bus.AgentID(id)) } // reload stops an agent, re-reads its config, recreates it, and restarts it. func (r *agentRegistry) reload(id string, rulesFor func(string, *slog.Logger) []decision.Rule) { r.mu.Lock() ra, ok := r.agents[id] r.mu.Unlock() if !ok { slog.Warn("reload: agent not found", "id", id) return } cfgPath := ra.cfgPath oldCleanup := ra.logCleanup ra.logger.Info("agent_reload_start", "id", id) // 1. Stop current instance and wait. r.stopAndWait(id) // 2. Cleanup old log writer. if oldCleanup != nil { oldCleanup() } // 3. Re-read config. cfg, err := config.Load(cfgPath) if err != nil { slog.Error("reload: failed to load config", "path", cfgPath, "err", err) return } if !cfg.Agent.Enabled { slog.Info("reload: agent is disabled, not restarting", "id", id) r.mu.Lock() delete(r.agents, id) r.mu.Unlock() return } // 4. New per-agent logger. newLogger, newCleanup, aErr := agentlog.NewAgentLogger(agentlog.LoggerConfig{ BaseDir: r.deps.logDir, AgentID: cfg.Agent.ID, Level: r.deps.logLevel, }) if aErr != nil { newLogger = slog.Default().With("agent", cfg.Agent.ID) newCleanup = func() {} } // 5. Create new agent (validates config before discarding the old one). rules := rulesFor(cfg.Agent.ID, newLogger) agentACL := pksecurity.ResolveACL(cfg.Agent.ID, r.deps.secPolicy) newLogger.Debug("resolved acl for agent (reload)", "agent", cfg.Agent.ID, "acl_empty", agentACL.Empty()) newAgent, err := agents.New(cfg, rules, agentACL, newLogger) if err != nil { newLogger.Error("reload: failed to create agent", "id", id, "err", err) newCleanup() return } // 6. Wire bus and orchestration. newAgent.SetBus(r.deps.agentBus) if r.deps.orch != nil { newAgent.SetInterceptor(r.deps.orch.orchestrator.Intercept) newAgent.SetMembershipNotify(r.deps.orch.orchestrator.NotifyMembership) r.deps.orch.orchestrator.RegisterParticipant(orchestration.ParticipantInfo{ ID: cfg.Agent.ID, MatrixUserID: cfg.Matrix.UserID, Description: cfg.Agent.Description, Capabilities: cfg.Agent.Tags, }) } newRA := &runningAgent{ agent: newAgent, cfg: cfg, cfgPath: cfgPath, logger: newLogger, logCleanup: newCleanup, } r.mu.Lock() r.agents[id] = newRA r.mu.Unlock() // 7. Start new goroutine. go func() { newLogger.Info("agent running") if err := newAgent.Run(r.deps.parentCtx); err != nil { newLogger.Error("agent stopped with error", "err", err) } }() newLogger.Info("agent_reloaded", "id", id) } // reloadAll reloads every registered agent sequentially. func (r *agentRegistry) reloadAll(rulesFor func(string, *slog.Logger) []decision.Rule) { r.mu.Lock() ids := make([]string, 0, len(r.agents)) for id := range r.agents { ids = append(ids, id) } r.mu.Unlock() for _, id := range ids { r.reload(id, rulesFor) } } // waitAll blocks until all registered agents have stopped. func (r *agentRegistry) waitAll() { r.mu.Lock() dones := make([]<-chan struct{}, 0, len(r.agents)) for _, ra := range r.agents { dones = append(dones, ra.agent.Done()) } r.mu.Unlock() for _, done := range dones { <-done } } // cleanupLogs calls every agent's log cleanup function (called on launcher shutdown). func (r *agentRegistry) cleanupLogs() { r.mu.Lock() defer r.mu.Unlock() for _, ra := range r.agents { if ra.logCleanup != nil { ra.logCleanup() } } } // readReloadTarget reads the given file and returns the trimmed content. // Returns "" if the file doesn't exist, is empty, or equals "*" (meaning reload all). func readReloadTarget(path string) string { data, err := os.ReadFile(path) if err != nil { return "" } id := strings.TrimSpace(string(data)) if id == "*" { return "" } return id }