feat: implement server-wide management actions and enhance TUI dashboard

2026-03-04 20:51:02 +00:00
parent 150f9d2990
commit ddec55871b
13 changed files with 621 additions and 52 deletions
@@ -4,6 +4,43 @@ Plataforma en Go para gestionar bots Matrix autónomos. Cada bot combina un **co

 ---

+## Inicio rápido
+
+```bash
+# 1. Compilar todo
+./build.sh
+
+# 2. Cargar variables de entorno
+source .env
+
+# 3. Lanzar la TUI interactiva (dashboard)
+./bin/dashboard
+```
+
+### Dashboard TUI
+
+El dashboard es una interfaz de terminal interactiva (bubbletea) para gestionar los bots del servidor:
+
+```
+./bin/dashboard
+```
+
+Desde la TUI puedes:
+
+- **Agents** — ver estado de cada agente, iniciar/detener/reiniciar/kill individual, ver logs
+- **Server** — operaciones masivas: start all, stop all, restart all, kill all con resumen de estado
+
+### Otros binarios
+
+| Binario | Uso |
+|---------|-----|
+| `./bin/launcher` | Inicia uno o varios agentes como procesos |
+| `./bin/agentctl` | CLI: `list`, `start`, `stop`, `remove` |
+| `./bin/register` | Registra bots en Synapse vía la admin API |
+| `./bin/dashboard` | TUI interactiva para gestión de bots |
+
+---
+
 ## Principio de diseño

 El proyecto usa el patrón **pure core / impure shell**:
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export PATH="/usr/local/go/bin:$PATH"
+
+BIN="bin"
+TAGS="-tags goolm"
+LDFLAGS="-ldflags=-s -w"
+
+mkdir -p "$BIN"
+
+echo "==> Compilando todos los binarios en $BIN/ ..."
+
+targets=(
+    "launcher:./cmd/launcher"
+    "agentctl:./cmd/agentctl"
+    "register:./cmd/register"
+    "dashboard:./cmd/dashboard"
+)
+
+for entry in "${targets[@]}"; do
+    name="${entry%%:*}"
+    pkg="${entry##*:}"
+    echo "    $name"
+    go build $TAGS "$LDFLAGS" -o "$BIN/$name" "$pkg"
+done
+
+echo ""
+echo "==> Listo. Binarios disponibles:"
+ls -lh "$BIN"/
@@ -50,9 +50,64 @@ read_pid() {
  [[ -f "$f" ]] && cat "$f" || echo 0
 }

+# Map agent ID to its config path by scanning agent directories.
+config_path_for() {
+  local target_id="$1"
+  for cfg in agents/*/config.yaml; do
+    [[ -f "$cfg" ]] || continue
+    local id
+    id=$(grep -m1 '^  id:' "$cfg" | awk '{print $2}')
+    if [[ "$id" == "$target_id" ]]; then
+      echo "$cfg"
+      return
+    fi
+  done
+}
+
+# Find all PIDs of launcher processes for a given agent ID.
+# Searches for the actual config path in the process command line.
+# Returns newline-separated PIDs (may be empty).
+find_agent_pids() {
+  local id="$1"
+  local cfg; cfg="$(config_path_for "$id")"
+  if [[ -z "$cfg" ]]; then
+    return
+  fi
+  pgrep -f "launcher.*-c.*${cfg}" 2>/dev/null || true
+}
+
 is_running() {
-  local pid; pid="$(read_pid "$1")"
-  [[ "$pid" -gt 0 ]] && kill -0 "$pid" 2>/dev/null
+  local id="$1"  # agent ID; the function returns 0 if a process for it is alive, 1 otherwise
+
+  # Fast path: the PID recorded in the PID file is still alive
+  local pid; pid="$(read_pid "$id")"
+  if [[ "$pid" -gt 0 ]] && kill -0 "$pid" 2>/dev/null; then
+    return 0
+  fi
+
+  # PID file is stale or missing: look for live launcher processes instead
+  local pids; pids="$(find_agent_pids "$id")"
+  if [[ -n "$pids" ]]; then
+    # Side effect: rewrite the PID file with the first PID found
+    local first_pid; first_pid="$(echo "$pids" | head -1)"
+    echo "$first_pid" > "$(pid_file "$id")"
+    return 0
+  fi
+
+  # Not running: remove the PID file if it recorded a (now dead) PID
+  [[ "$pid" -gt 0 ]] && rm -f "$(pid_file "$id")"
+  return 1
+}
+
+# Print the number of running launcher instances for the given agent ID.
+count_instances() {
+  local agent_id="$1" found
+  found="$(find_agent_pids "$agent_id")"
+  if [[ -n "$found" ]]; then
+    wc -l <<<"$found"
+  else
+    echo 0
+  fi
 }

 agent_status() {
@@ -26,6 +26,7 @@ while IFS='|' read -r id _version _enabled _desc _cfg; do
  fi

  pid="$(read_pid "$id")"
+  instance_count="$(count_instances "$id")"
  ((found++)) || true

  # Uptime: calcular desde el inicio del proceso
@@ -78,6 +79,12 @@ while IFS='|' read -r id _version _enabled _desc _cfg; do
  printf "%-22s  ${GRN}%-8s${RST}  %-12s  %-10s  %-8s  %s\n" \
    "$id" "$pid" "$uptime" "$mem" "${cpu_pct}%" "$log_size"

+  # Warn about duplicate instances
+  if [[ "$instance_count" -gt 1 ]]; then
+    printf "  ${RED}⚠ WARNING: %d instances running!${RST} PIDs: %s\n" \
+      "$instance_count" "$(find_agent_pids "$id" | tr '\n' ' ')"
+  fi
+
 done < <(list_agents_raw)

 if [[ "$found" -eq 0 ]]; then
@@ -55,11 +55,14 @@ case "$CMD" in

    killed=0
    for id in "${agents[@]}"; do
-      pid="$(read_pid "$id")"
-      if [[ "$pid" -gt 0 ]] && kill -0 "$pid" 2>/dev/null; then
-        kill -9 "$pid" 2>/dev/null || true
+      all_pids="$(find_agent_pids "$id")"
+      if [[ -n "$all_pids" ]]; then
+        cnt="$(echo "$all_pids" | wc -l)"
+        for p in $all_pids; do
+          kill -9 "$p" 2>/dev/null || true
+        done
        rm -f "$(pid_file "$id")"
-        ok "$id  killed  (PID $pid)"
+        ok "$id  killed  ($cnt instance(s), PIDs: $(echo $all_pids | tr '\n' ' '))"
        ((killed++)) || true
      else
        dim "  $id  (no estaba corriendo)"
@@ -14,11 +14,30 @@ start_agent() {
  local id="$1" cfg="$2"
  local log; log="$(log_file "$id")"
  local pid_f; pid_f="$(pid_file "$id")"
+  local bin="$REPO_ROOT/bin/launcher"
+
+  # Check for duplicate instances already running
+  local existing; existing="$(count_instances "$id")"
+  if [[ "$existing" -gt 0 ]]; then
+    warn "$id  already has $existing instance(s) running (orphan processes?)"
+    warn "  Run ./dev-scripts/stop.sh $id first to clean up"
+    return 1
+  fi

  info "Iniciando $id..."

-  # Lanza el launcher en background, desacoplado del terminal
-  nohup "$GO" run -tags goolm ./cmd/launcher -c "$cfg" --log-level "${LOG_LEVEL:-info}" \
+  # Build the binary first to avoid go run wrapper PID issues
+  if [[ ! -x "$bin" ]] || [[ "$(find ./cmd/launcher -newer "$bin" 2>/dev/null | head -1)" ]]; then
+    info "Compilando launcher..."
+    mkdir -p "$(dirname "$bin")"
+    "$GO" build -tags goolm -o "$bin" ./cmd/launcher || {
+      fail "$id  error de compilación — revisa el código"
+      return 1
+    }
+  fi
+
+  # Launch the compiled binary directly (no go run wrapper)
+  nohup "$bin" -c "$cfg" --log-level "${LOG_LEVEL:-info}" \
    >> "$log" 2>&1 &

  local pid=$!
@@ -18,23 +18,37 @@ while IFS='|' read -r id _version _enabled _desc _cfg; do
    continue
  fi

-  local_pid="$(read_pid "$id")"
-  kill -TERM "$local_pid" 2>/dev/null || true
+  # Kill ALL instances, not just the one in the PID file
+  all_pids="$(find_agent_pids "$id")"
+  instance_count="$(grep -c . <<<"$all_pids" || true)"  # grep -c prints 0 itself on no match (exit 1); only mask the status

-  # Espera hasta 5s a que muera limpiamente
+  if [[ "$instance_count" -gt 1 ]]; then
+    warn "$id  has $instance_count instances running — stopping all"
+  fi
+
+  # Send SIGTERM to all instances
+  for p in $all_pids; do
+    kill -TERM "$p" 2>/dev/null || true
+  done
+
+  # Wait up to 5s for graceful shutdown
  for _ in {1..10}; do
-    kill -0 "$local_pid" 2>/dev/null || break
+    remaining="$(find_agent_pids "$id")"
+    [[ -z "$remaining" ]] && break
    sleep 0.5
  done

-  # SIGKILL si todavía sigue vivo
-  if kill -0 "$local_pid" 2>/dev/null; then
+  # SIGKILL any survivors
+  survivors="$(find_agent_pids "$id")"
+  if [[ -n "$survivors" ]]; then
    warn "$id  no respondió a SIGTERM, enviando SIGKILL..."
-    kill -9 "$local_pid" 2>/dev/null || true
+    for p in $survivors; do
+      kill -9 "$p" 2>/dev/null || true
+    done
  fi

  rm -f "$(pid_file "$id")"
-  ok "$id  detenido  (PID $local_pid)"
+  ok "$id  detenido  ($instance_count instance(s) stopped)"
  ((stopped++)) || true

 done < <(list_agents_raw)
@@ -16,5 +16,13 @@ type MsgActionDone struct {
 // MsgLogsLoaded carries log lines for the selected agent.
 type MsgLogsLoaded struct{ Lines []string }

+// MsgServerActionDone carries the outcome of a server-wide bulk action.
+type MsgServerActionDone struct {
+	Action string   // label of the bulk action, e.g. "Start All"
+	Total  int      // number of agents the action was attempted on
+	Failed int      // number of attempts that returned an error
+	Errors []string // one message per failure
+}
+
 // MsgTick triggers a periodic refresh.
 type MsgTick struct{}
@@ -10,6 +10,7 @@ const (
 	ScreenAgentList           // list all agents with status
 	ScreenAgentActions        // actions for a selected agent
 	ScreenLogs                // tail log output
+	ScreenServer              // server-wide process management
 )

 // Model is the complete TUI state — pure data.
@@ -34,10 +35,11 @@ type AgentView struct {
 	Enabled bool
 	Running bool
 	PID     int
-	Uptime  string // formatted: "2h 15m"
-	Memory  string // formatted: "42 MB"
-	CPU     string // formatted: "1.2%"
-	LogSize string // formatted: "350 KB"
+	Instances int    // number of running instances (>1 means duplicates)
+	Uptime    string // formatted: "2h 15m"
+	Memory    string // formatted: "42 MB"
+	CPU       string // formatted: "1.2%"
+	LogSize   string // formatted: "350 KB"
 }

 // MenuOption represents a selectable menu item.
@@ -50,10 +52,21 @@ type MenuOption struct {
 func MainMenuOptions() []MenuOption {
 	return []MenuOption{
 		{Label: "Agents", Desc: "Gestionar agentes"},  // opens the agent list screen
+		{Label: "Server", Desc: "Gestionar servidor"}, // opens the server management screen
 		{Label: "Quit", Desc: "Salir"},                // exits the TUI
 	}
 }

+// ServerMenuOptions returns the bulk actions listed on the server screen; executeServerAction dispatches on these labels.
+func ServerMenuOptions() []MenuOption {
+	return []MenuOption{
+		{Label: "Start All", Desc: "Iniciar todos los agentes habilitados"},
+		{Label: "Stop All", Desc: "Detener todos los agentes"},
+		{Label: "Restart All", Desc: "Reiniciar todos los agentes"},
+		{Label: "Kill All", Desc: "SIGKILL forzado a todos"},
+	}
+}
+
 // AgentActionOptions returns the available actions based on agent state.
 func AgentActionOptions(running bool) []MenuOption {
 	if running {
@@ -14,6 +14,12 @@ const (
 	IntentLoadLogs     IntentKind = "load_logs"
 	IntentTick         IntentKind = "tick"
 	IntentQuit         IntentKind = "quit"
+
+	// Server-wide bulk operations
+	IntentStartAll   IntentKind = "start_all"
+	IntentStopAll    IntentKind = "stop_all"
+	IntentRestartAll IntentKind = "restart_all"
+	IntentKillAll    IntentKind = "kill_all"
 )

 // Intent is pure data describing a side effect to execute.
@@ -45,9 +51,11 @@ func Update(model Model, msg interface{}) (Model, []Intent) {

 	case MsgAgentsLoaded:
 		model.Agents = m.Agents
-		// Clamp cursor
-		if model.Cursor >= len(model.Agents) && len(model.Agents) > 0 {
-			model.Cursor = len(model.Agents) - 1
+		// Clamp cursor only on screens that use the agent list
+		if model.Screen == ScreenAgentList {
+			if model.Cursor >= len(model.Agents) && len(model.Agents) > 0 {
+				model.Cursor = len(model.Agents) - 1
+			}
 		}
 		return model, []Intent{{Kind: IntentTick}}

@@ -59,6 +67,14 @@ func Update(model Model, msg interface{}) (Model, []Intent) {
 		}
 		return model, []Intent{{Kind: IntentLoadAgents}}

+	case MsgServerActionDone:
+		if m.Failed == 0 {
+			model.StatusMsg = fmt.Sprintf("%s: %d agents OK", m.Action, m.Total)
+		} else {
+			model.StatusMsg = fmt.Sprintf("%s: %d/%d failed", m.Action, m.Failed, m.Total)
+		}
+		return model, []Intent{{Kind: IntentLoadAgents}}
+
 	case MsgLogsLoaded:
 		model.LogLines = m.Lines
 		model.LogScroll = max(0, len(m.Lines)-visibleLogLines(model))
@@ -92,6 +108,8 @@ func updateKey(model Model, key KeyMsg) (Model, []Intent) {
 		return updateAgentActions(model, key)
 	case ScreenLogs:
 		return updateLogs(model, key)
+	case ScreenServer:
+		return updateServerScreen(model, key)
 	}
 	return model, nil
 }
@@ -109,6 +127,11 @@ func updateMainScreen(model Model, key KeyMsg) (Model, []Intent) {
 			model.Screen = ScreenAgentList
 			model.Cursor = 0
 			return model, []Intent{{Kind: IntentLoadAgents}}
+		case "Server":
+			model.Screen = ScreenServer
+			model.Cursor = 0
+			model.StatusMsg = ""
+			return model, []Intent{{Kind: IntentLoadAgents}}
 		case "Quit":
 			return model, []Intent{{Kind: IntentQuit}}
 		}
@@ -210,6 +233,44 @@ func updateLogs(model Model, key KeyMsg) (Model, []Intent) {
 	return model, nil
 }

+// updateServerScreen handles a key press on the server screen: "0" goes back to
+// the main menu, up/k and down/j move the cursor, enter runs the selected action.
+func updateServerScreen(model Model, key KeyMsg) (Model, []Intent) {
+	actions := ServerMenuOptions()
+	last := len(actions) - 1
+	switch key.Str {
+	case "enter":
+		if model.Cursor <= last {
+			return executeServerAction(model, actions[model.Cursor].Label)
+		}
+	case "up", "k":
+		model.Cursor = clamp(model.Cursor-1, 0, last)
+	case "down", "j":
+		model.Cursor = clamp(model.Cursor+1, 0, last)
+	case "0":
+		model.Screen, model.Cursor, model.StatusMsg = ScreenMain, 0, ""
+	}
+	return model, nil
+}
+
+func executeServerAction(model Model, action string) (Model, []Intent) {
+	switch action {
+	case "Start All":
+		model.StatusMsg = "Starting all agents..."
+		return model, []Intent{{Kind: IntentStartAll}}
+	case "Stop All":
+		model.StatusMsg = "Stopping all agents..."
+		return model, []Intent{{Kind: IntentStopAll}}
+	case "Restart All":
+		model.StatusMsg = "Restarting all agents..."
+		return model, []Intent{{Kind: IntentRestartAll}}
+	case "Kill All":
+		model.StatusMsg = "Killing all agents..."
+		return model, []Intent{{Kind: IntentKillAll}}
+	}
+	return model, nil
+}
+
 // ── pure helpers ─────────────────────────────────────────────────────────

 func visibleLogLines(m Model) int {
@@ -16,6 +16,8 @@ func View(model Model) string {
 		return viewAgentActions(model)
 	case ScreenLogs:
 		return viewLogs(model)
+	case ScreenServer:
+		return viewServer(model)
 	default:
 		return ""
 	}
@@ -78,6 +80,10 @@ func viewAgentList(m Model) string {

 		b.WriteString(fmt.Sprintf("  %s%s %-20s %-8s %s\n",
 			cursor, icon, a.ID, a.Version, status))
+
+		if a.Instances > 1 {
+			b.WriteString(fmt.Sprintf("     ⚠ WARNING: %d instances running!\n", a.Instances))
+		}
 	}

 	if m.StatusMsg != "" {
@@ -177,6 +183,54 @@ func viewLogs(m Model) string {
 	return b.String()
 }

+// viewServer renders the server screen: summary, agent list, action menu, status message and key hints.
+func viewServer(m Model) string {
+	var out strings.Builder
+
+	out.WriteString("\n  Server Management\n")
+	out.WriteString("  " + strings.Repeat("─", 44) + "\n")
+
+	agentCount := len(m.Agents)
+	if agentCount == 0 {
+		out.WriteString("  Loading...\n")
+	} else {
+		running, stopped, disabled := countStatuses(m.Agents)
+		fmt.Fprintf(&out, "  %d agents: %d running, %d stopped, %d disabled\n", agentCount, running, stopped, disabled)
+		// Compact per-agent status list.
+		out.WriteString("\n")
+		for _, agent := range m.Agents {
+			var marker string
+			switch {
+			case !agent.Enabled:
+				marker = " "
+			case agent.Running:
+				marker = "●"
+			default:
+				marker = "○"
+			}
+			fmt.Fprintf(&out, "  %s %s\n", marker, agent.ID)
+		}
+	}
+
+	out.WriteString("\n")
+
+	// Action menu; the row under the cursor gets a "> " marker.
+	for idx, option := range ServerMenuOptions() {
+		prefix := "  "
+		if idx == m.Cursor {
+			prefix = "> "
+		}
+		fmt.Fprintf(&out, "  %s%-16s %s\n", prefix, option.Label, option.Desc)
+	}
+
+	if m.StatusMsg != "" {
+		out.WriteString("\n  " + m.StatusMsg + "\n")
+	}
+
+	out.WriteString("\n  ↑↓ navegar  enter ejecutar  0 volver\n")
+	return out.String()
+}
+
 func countStatuses(agents []AgentView) (running, stopped, disabled int) {
 	for _, a := range agents {
 		switch {
@@ -47,11 +47,12 @@ type Manager struct {
 	runDir     string
 	agentsGlob string
 	binPath    string
+	envFile    string // path to .env file for child processes
 }

 // NewManager creates a Manager. binPath can be empty for auto-detection.
 func NewManager(runDir, agentsGlob, binPath string) *Manager {
-	return &Manager{runDir: runDir, agentsGlob: agentsGlob, binPath: binPath}
+	return &Manager{runDir: runDir, agentsGlob: agentsGlob, binPath: binPath, envFile: ".env"}
 }

 // Scan discovers all agents from config files.
@@ -81,8 +82,8 @@ func (m *Manager) Scan() ([]AgentInfo, error) {

 // Status returns the runtime status for a single agent.
 func (m *Manager) Status(info AgentInfo) AgentStatus {
-	pid := m.readPID(info.ID)
-	running := pid > 0 && m.isAlive(pid)
+	pid := m.resolveRunningPID(info.ID)
+	running := pid > 0
 	return AgentStatus{AgentInfo: info, Running: running, PID: pid}
 }

@@ -101,6 +102,12 @@ func (m *Manager) StatusAll() ([]AgentStatus, error) {

 // Start launches an agent process in the background.
 func (m *Manager) Start(info AgentInfo) error {
+	// Check for orphan instances
+	if existing := m.findProcessPIDs(info.ID); len(existing) > 0 {
+		return fmt.Errorf("agent %q already has %d running instance(s) (PIDs: %v) — stop them first",
+			info.ID, len(existing), existing)
+	}
+
 	if err := os.MkdirAll(m.runDir, 0o755); err != nil {
 		return fmt.Errorf("create run dir: %w", err)
 	}
@@ -113,11 +120,12 @@ func (m *Manager) Start(info AgentInfo) error {
 	bin := m.resolvedBin()
 	var cmd *exec.Cmd
 	if strings.HasPrefix(bin, "go run") {
-		cmd = exec.Command("go", "run", "./cmd/launcher", "-c", info.ConfigPath)
+		cmd = exec.Command("go", "run", "-tags", "goolm", "./cmd/launcher", "-c", info.ConfigPath)
 	} else {
 		cmd = exec.Command(bin, "-c", info.ConfigPath)
 	}

+	cmd.Env = m.buildEnv()
 	cmd.Stdout = logFile
 	cmd.Stderr = logFile
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
@@ -135,49 +143,94 @@ func (m *Manager) Start(info AgentInfo) error {
 	return nil
 }

-// Stop sends SIGTERM, waits up to 5s, then SIGKILL if needed.
+// Stop sends SIGTERM to all instances, waits up to 5s, then SIGKILL if needed.
 func (m *Manager) Stop(id string) error {
-	pid := m.readPID(id)
-	if pid == 0 || !m.isAlive(pid) {
+	pids := m.findProcessPIDs(id)
+	// Also include PID file PID if alive and not already in the list
+	filePID := m.readPID(id)
+	if filePID > 0 && m.isAlive(filePID) {
+		found := false
+		for _, p := range pids {
+			if p == filePID {
+				found = true
+				break
+			}
+		}
+		if !found {
+			pids = append(pids, filePID)
+		}
+	}
+
+	if len(pids) == 0 {
 		return fmt.Errorf("agent %q is not running", id)
 	}

-	if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
-		return fmt.Errorf("SIGTERM: %w", err)
+	// SIGTERM all instances
+	for _, pid := range pids {
+		_ = syscall.Kill(pid, syscall.SIGTERM)
 	}

 	// Wait up to 5 seconds for graceful shutdown.
 	for i := 0; i < 10; i++ {
-		if !m.isAlive(pid) {
+		allDead := true
+		for _, pid := range pids {
+			if m.isAlive(pid) {
+				allDead = false
+				break
+			}
+		}
+		if allDead {
 			m.removePID(id)
 			return nil
 		}
 		time.Sleep(500 * time.Millisecond)
 	}

-	// Force kill.
-	if m.isAlive(pid) {
-		_ = syscall.Kill(pid, syscall.SIGKILL)
+	// Force kill survivors.
+	for _, pid := range pids {
+		if m.isAlive(pid) {
+			_ = syscall.Kill(pid, syscall.SIGKILL)
+		}
 	}
 	m.removePID(id)
 	return nil
 }

-// Kill sends SIGKILL immediately.
+// Kill sends SIGKILL to all instances immediately.
 func (m *Manager) Kill(id string) error {
-	pid := m.readPID(id)
-	if pid == 0 || !m.isAlive(pid) {
+	pids := m.findProcessPIDs(id)
+	filePID := m.readPID(id)
+	if filePID > 0 && m.isAlive(filePID) {
+		found := false
+		for _, p := range pids {
+			if p == filePID {
+				found = true
+				break
+			}
+		}
+		if !found {
+			pids = append(pids, filePID)
+		}
+	}
+
+	if len(pids) == 0 {
 		return fmt.Errorf("agent %q is not running", id)
 	}
-	err := syscall.Kill(pid, syscall.SIGKILL)
+
+	var lastErr error
+	for _, pid := range pids {
+		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
+			lastErr = err
+		}
+	}
 	m.removePID(id)
-	return err
+	return lastErr
 }

 // Stats gathers resource usage for a running agent from /proc.
 func (m *Manager) Stats(id string) (ProcessStats, error) {
-	pid := m.readPID(id)
-	if pid == 0 || !m.isAlive(pid) {
+	pid := m.resolveRunningPID(id)
+	if pid == 0 {
 		return ProcessStats{}, fmt.Errorf("agent %q is not running", id)
 	}

@@ -256,8 +309,12 @@ func (m *Manager) LogTail(id string, lines int) ([]string, error) {

 // IsRunning checks if an agent process is alive.
 func (m *Manager) IsRunning(id string) bool {
-	pid := m.readPID(id)
-	return pid > 0 && m.isAlive(pid)
+	return m.resolveRunningPID(id) > 0
+}
+
+// InstanceCount returns the number of launcher processes findProcessPIDs reports for the agent; a value above 1 indicates duplicates.
+func (m *Manager) InstanceCount(id string) int {
+	return len(m.findProcessPIDs(id))
 }

 // ReadPID returns the PID from the PID file, or 0.
@@ -285,6 +342,70 @@ func (m *Manager) readPID(id string) int {
 	return pid
 }

+// findProcessPIDs returns the PIDs of all launcher processes whose command
+// line references the agent's config path, as reported by `pgrep -f`.
+func (m *Manager) findProcessPIDs(id string) []int {
+	cfgPath := m.configPathFor(id)
+	if cfgPath == "" {
+		return nil
+	}
+	raw, err := exec.Command("pgrep", "-f", "launcher.*-c.*"+cfgPath).Output()
+	if err != nil { // includes pgrep's non-zero exit when nothing matches
+		return nil
+	}
+	var found []int
+	for _, field := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
+		n, convErr := strconv.Atoi(strings.TrimSpace(field))
+		if convErr != nil || n <= 0 {
+			continue
+		}
+		found = append(found, n)
+	}
+	return found
+}
+
+// configPathFor returns the path of the first config file matched by
+// m.agentsGlob whose agent ID equals id. It returns "" when the glob
+// pattern is invalid or when no config matches.
+func (m *Manager) configPathFor(id string) string {
+	candidates, globErr := filepath.Glob(m.agentsGlob)
+	if globErr != nil {
+		return ""
+	}
+	for _, candidate := range candidates {
+		meta, loadErr := config.LoadMeta(candidate)
+		// Configs that fail to load are skipped rather than reported.
+		if loadErr == nil && meta.Agent.ID == id {
+			return candidate
+		}
+	}
+	return ""
+}
+
+// resolveRunningPID returns the PID of a live process for the agent, or 0.
+// The PID file is trusted while its process is alive; otherwise the process
+// table is searched via findProcessPIDs and the PID file is rewritten with
+// the first PID found, or removed when it named a dead process.
+func (m *Manager) resolveRunningPID(id string) int {
+	// Fast path: the recorded PID is still alive.
+	recorded := m.readPID(id)
+	if recorded > 0 && m.isAlive(recorded) {
+		return recorded
+	}
+
+	live := m.findProcessPIDs(id)
+	switch {
+	case len(live) > 0:
+		// Best-effort repair of the PID file; a write error is ignored.
+		_ = os.WriteFile(m.pidPath(id), []byte(strconv.Itoa(live[0])), 0o644)
+		return live[0]
+	case recorded > 0:
+		// Nothing is running: drop the PID file that pointed at a dead process.
+		m.removePID(id)
+	}
+	return 0
+}
+
 func (m *Manager) isAlive(pid int) bool {
 	return syscall.Kill(pid, 0) == nil
 }
@@ -293,6 +414,33 @@ func (m *Manager) removePID(id string) {
 	_ = os.Remove(m.pidPath(id))
 }

+// buildEnv returns os.Environ() plus the KEY=VALUE lines of m.envFile; os/exec
+// uses the last duplicate, so file values override inherited ones. A leading
+// "export " and matching quotes around a value are stripped (`source`-style files).
+func (m *Manager) buildEnv() []string {
+	env := os.Environ()
+	if m.envFile == "" {
+		return env
+	}
+	data, err := os.ReadFile(m.envFile)
+	if err != nil {
+		return env // unreadable env file: fall back to the inherited environment
+	}
+	for _, line := range strings.Split(string(data), "\n") {
+		line = strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(line), "export "))
+		idx := strings.Index(line, "=")
+		if strings.HasPrefix(line, "#") || idx <= 0 {
+			continue // blank line, comment, or no KEY= part
+		}
+		key, val := line[:idx], line[idx+1:]
+		if n := len(val); n >= 2 && (val[0] == '"' || val[0] == '\'') && val[n-1] == val[0] {
+			val = val[1 : n-1]
+		}
+		env = append(env, key+"="+val)
+	}
+	return env
+}
+
 func (m *Manager) resolvedBin() string {
 	if m.binPath != "" {
 		return m.binPath
@@ -44,6 +44,18 @@ func (a *Adapter) RunIntent(intent puretui.Intent) tea.Cmd {
 	case puretui.IntentLoadLogs:
 		return a.loadLogs(intent.AgentID)

+	case puretui.IntentStartAll:
+		return a.startAll()
+
+	case puretui.IntentStopAll:
+		return a.stopAll()
+
+	case puretui.IntentRestartAll:
+		return a.restartAll()
+
+	case puretui.IntentKillAll:
+		return a.killAll()
+
 	case puretui.IntentTick:
 		return a.tick()

@@ -65,13 +77,14 @@ func (a *Adapter) loadAgents() tea.Cmd {
 		views := make([]puretui.AgentView, len(statuses))
 		for i, s := range statuses {
 			v := puretui.AgentView{
-				ID:      s.ID,
-				Name:    s.Name,
-				Version: s.Version,
-				Desc:    s.Desc,
-				Enabled: s.Enabled,
-				Running: s.Running,
-				PID:     s.PID,
+				ID:        s.ID,
+				Name:      s.Name,
+				Version:   s.Version,
+				Desc:      s.Desc,
+				Enabled:   s.Enabled,
+				Running:   s.Running,
+				PID:       s.PID,
+				Instances: a.mgr.InstanceCount(s.ID),
 			}

 			if s.Running {
@@ -147,6 +160,113 @@ func (a *Adapter) restartAgent(id string) tea.Cmd {
 	}
 }

+// startAll returns a command that starts every enabled agent that is not
+// already running and reports the outcome as a MsgServerActionDone.
+func (a *Adapter) startAll() tea.Cmd {
+	const action = "Start All"
+	return func() tea.Msg {
+		infos, scanErr := a.mgr.Scan()
+		if scanErr != nil {
+			return puretui.MsgServerActionDone{Action: action, Errors: []string{scanErr.Error()}, Failed: 1}
+		}
+		result := puretui.MsgServerActionDone{Action: action}
+		for _, info := range infos {
+			if !info.Enabled || a.mgr.IsRunning(info.ID) {
+				continue
+			}
+			result.Total++
+			if startErr := a.mgr.Start(info); startErr != nil {
+				result.Failed++
+				result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", info.ID, startErr))
+			}
+		}
+		// Brief pause after spawning, presumably so the follow-up refresh sees the new processes.
+		if result.Total > 0 {
+			time.Sleep(500 * time.Millisecond)
+		}
+		return result
+	}
+}
+
+// stopAll returns a command that stops every agent StatusAll reports as running and summarises the outcome.
+func (a *Adapter) stopAll() tea.Cmd {
+	const action = "Stop All"
+	return func() tea.Msg {
+		all, statusErr := a.mgr.StatusAll()
+		if statusErr != nil {
+			return puretui.MsgServerActionDone{Action: action, Errors: []string{statusErr.Error()}, Failed: 1}
+		}
+		result := puretui.MsgServerActionDone{Action: action}
+		for _, st := range all {
+			if st.Running {
+				result.Total++
+				if stopErr := a.mgr.Stop(st.ID); stopErr != nil {
+					result.Failed++
+					result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", st.ID, stopErr))
+				}
+			}
+		}
+		return result
+	}
+}
+
+// restartAll returns a command that stops every enabled running agent, then
+// starts every enabled agent; only the start phase is counted in the result.
+func (a *Adapter) restartAll() tea.Cmd {
+	const action = "Restart All"
+	return func() tea.Msg {
+		infos, scanErr := a.mgr.Scan()
+		if scanErr != nil {
+			return puretui.MsgServerActionDone{Action: action, Errors: []string{scanErr.Error()}, Failed: 1}
+		}
+		// Phase 1: stop whatever is running; stop errors are ignored here.
+		for _, info := range infos {
+			if !info.Enabled || !a.mgr.IsRunning(info.ID) {
+				continue
+			}
+			_ = a.mgr.Stop(info.ID)
+		}
+		time.Sleep(300 * time.Millisecond)
+		// Phase 2: start every enabled agent and tally the failures.
+		result := puretui.MsgServerActionDone{Action: action}
+		for _, info := range infos {
+			if info.Enabled {
+				result.Total++
+				if startErr := a.mgr.Start(info); startErr != nil {
+					result.Failed++
+					result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", info.ID, startErr))
+				}
+			}
+		}
+		if result.Total > 0 {
+			time.Sleep(500 * time.Millisecond)
+		}
+		return result
+	}
+}
+
+// killAll returns a command that calls Kill on every agent StatusAll reports as running and summarises the outcome.
+func (a *Adapter) killAll() tea.Cmd {
+	const action = "Kill All"
+	return func() tea.Msg {
+		all, statusErr := a.mgr.StatusAll()
+		if statusErr != nil {
+			return puretui.MsgServerActionDone{Action: action, Errors: []string{statusErr.Error()}, Failed: 1}
+		}
+		result := puretui.MsgServerActionDone{Action: action}
+		for _, st := range all {
+			if st.Running {
+				result.Total++
+				if killErr := a.mgr.Kill(st.ID); killErr != nil {
+					result.Failed++
+					result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", st.ID, killErr))
+				}
+			}
+		}
+		return result
+	}
+}
+
 func (a *Adapter) loadLogs(id string) tea.Cmd {
 	return func() tea.Msg {
 		lines, err := a.mgr.LogTail(id, 100)