feat: implement multi-bot orchestration system with LLM routing

Implementa el sistema de orquestación para salas Matrix con múltiples bots. El orquestador es un "special agent" sin identidad Matrix que coordina qué bot responde y cuándo, usando un LLM (Claude) para el routing y la evaluación de calidad.

Cambios principales:
- pkg/orchestration/task.go: tipos puros (TaskEvent, BotResponse, QualityScore, RoutingDecision)
- shell/orchestration/: runtime del orquestador (orchestrator.go, router.go, evaluator.go)
- agents/specials/orchestrator/: config + prompts (routing, quality, refinement)
- internal/config/: SpecialConfig, OrchestrationCfg, LoadSpecial()
- shell/bus/bus.go: protocolo request-reply (SendAndWait, Reply) para delegación
- shell/matrix/listener.go: InterceptFunc para interceptar eventos en salas orquestadas
- agents/runtime.go: SetBus, listenBus, handleTaskEvent para recibir tareas del orquestador
- cmd/launcher/main.go: creación de bus compartido, arranque del orquestador antes de los bots

Incluye deduplicación para evitar que múltiples listeners en la misma sala disparen el orquestador más de una vez por mensaje.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 09:05:42 +00:00
parent 6bef4283c6
commit 2667af52cc
14 changed files with 1001 additions and 7 deletions
@@ -0,0 +1,48 @@
+package orchestration
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	coretypes "github.com/enmanuel/agents/pkg/llm"
+	"github.com/enmanuel/agents/pkg/orchestration"
+)
+
+// evaluate scores one bot response by asking the LLM to judge it against the
+// original question, with the quality prompt as the system prompt.
+//
+// It never reports an error to the caller. If the LLM call fails, or its
+// output cannot be decoded into a QualityScore, the problem is logged and a
+// score of 1.0 with Continue=false is returned instead.
+func (o *Orchestrator) evaluate(ctx context.Context, question string, response orchestration.BotResponse) orchestration.QualityScore {
+	// acceptAsIs builds the score returned when no real evaluation is available.
+	acceptAsIs := func(reason string) orchestration.QualityScore {
+		return orchestration.QualityScore{
+			Score:    1.0,
+			Continue: false,
+			Reason:   reason,
+		}
+	}
+
+	request := coretypes.CompletionRequest{
+		Model:        o.cfg.LLM.Primary.Model,
+		MaxTokens:    o.cfg.LLM.Primary.MaxTokens,
+		Temperature:  o.cfg.LLM.Primary.Temperature,
+		SystemPrompt: o.qualityPrompt,
+		Messages: []coretypes.Message{{
+			Role:    coretypes.RoleUser,
+			Content: fmt.Sprintf("Question: %s\n\nResponse from %s:\n%s", question, response.BotID, response.Text),
+		}},
+	}
+
+	completion, err := o.llm(ctx, request)
+	if err != nil {
+		o.logger.Error("quality evaluation LLM call failed", "err", err)
+		return acceptAsIs(fmt.Sprintf("evaluation failed: %s, assuming good quality", err))
+	}
+
+	// The reply is decoded as bare JSON after trimming surrounding whitespace.
+	var score orchestration.QualityScore
+	payload := []byte(strings.TrimSpace(completion.Content))
+	if err := json.Unmarshal(payload, &score); err != nil {
+		o.logger.Warn("failed to parse quality score", "content", completion.Content, "err", err)
+		return acceptAsIs(fmt.Sprintf("parse failed: %s", err))
+	}
+
+	return score
+}
@@ -0,0 +1,332 @@
+// Package orchestration implements the multi-bot orchestrator runtime.
+// The orchestrator intercepts Matrix events in managed rooms and coordinates
+// which bot responds via the in-process bus.
+package orchestration
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"sync/atomic"
+
+	"github.com/enmanuel/agents/internal/config"
+	"github.com/enmanuel/agents/pkg/decision"
+	coretypes "github.com/enmanuel/agents/pkg/llm"
+	"github.com/enmanuel/agents/pkg/orchestration"
+	"github.com/enmanuel/agents/shell/bus"
+	shelllm "github.com/enmanuel/agents/shell/llm"
+)
+
+// Orchestrator is the coordinator for rooms shared by several bots. It has no
+// Matrix identity of its own: events are handed to it before they reach the
+// bots, and it delegates the work to a bot through the bus.
+type Orchestrator struct {
+	cfg    *config.SpecialConfig
+	llm    coretypes.CompleteFunc
+	bus    *bus.Bus
+	logger *slog.Logger
+
+	// managedRooms maps a room ID to the IDs of the bots taking part in it.
+	managedRooms map[string][]string
+	// participants maps a bot ID to the metadata registered for that bot.
+	participants map[string]orchestration.ParticipantInfo
+
+	// Prompt texts read from files by loadPrompts.
+	routingPrompt, qualityPrompt, refinementPrompt string
+
+	// Deduplication state. Every bot listening in a room calls Intercept for
+	// the same message, so seen holds the "room:sender:content" keys of the
+	// messages currently being routed and lets only one call start routing.
+	// seenMu guards seen.
+	seenMu sync.Mutex
+	seen   map[string]bool
+}
+
+// New builds an Orchestrator from cfg, wiring it to agentBus and logger.
+//
+// It creates the LLM completion function from the primary LLM settings,
+// indexes the configured rooms by room ID and loads the prompt files. An error
+// is returned if the LLM cannot be set up or a prompt file cannot be read.
+func New(cfg *config.SpecialConfig, agentBus *bus.Bus, logger *slog.Logger) (*Orchestrator, error) {
+	complete, err := shelllm.FromConfig(cfg.LLM.Primary)
+	if err != nil {
+		return nil, fmt.Errorf("orchestrator LLM: %w", err)
+	}
+
+	orch := &Orchestrator{
+		cfg:          cfg,
+		llm:          complete,
+		bus:          agentBus,
+		logger:       logger,
+		managedRooms: make(map[string][]string),
+		participants: make(map[string]orchestration.ParticipantInfo),
+		seen:         make(map[string]bool),
+	}
+
+	// The participant slices are stored as-is, not copied.
+	for _, r := range cfg.Orchestration.Rooms {
+		orch.managedRooms[r.RoomID] = r.Participants
+	}
+
+	if err = orch.loadPrompts(); err != nil {
+		return nil, fmt.Errorf("load prompts: %w", err)
+	}
+	return orch, nil
+}
+
+// RegisterParticipant stores the metadata for a bot, keyed by its ID, so it can
+// be listed in the LLM routing prompts. Registering the same ID again replaces
+// the earlier entry. The participants map is written without taking any lock.
+func (o *Orchestrator) RegisterParticipant(info orchestration.ParticipantInfo) {
+	botID := info.ID
+	o.participants[botID] = info
+	o.logger.Debug("registered participant", "bot", botID, "desc", info.Description)
+}
+
+// ShouldIntercept reports whether roomID is one of the rooms this orchestrator
+// manages.
+func (o *Orchestrator) ShouldIntercept(roomID string) bool {
+	if _, managed := o.managedRooms[roomID]; managed {
+		return true
+	}
+	return false
+}
+
+// Intercept is the InterceptFunc handed to bot listeners. For a message in an
+// unmanaged room it returns false and does nothing. For a managed room it
+// always returns true, so the calling bot leaves the message alone, and it
+// starts the routing pipeline in a goroutine unless an identical message
+// (same room, sender and content) is already being routed.
+func (o *Orchestrator) Intercept(ctx context.Context, msgCtx decision.MessageContext) bool {
+	if !o.ShouldIntercept(msgCtx.RoomID) {
+		return false
+	}
+
+	// Every bot in the room receives the same event, so claim the message
+	// under the lock and let only the first caller go on to route it.
+	dedupKey := strings.Join([]string{msgCtx.RoomID, msgCtx.SenderID, msgCtx.Content}, ":")
+	o.seenMu.Lock()
+	inFlight := o.seen[dedupKey]
+	if !inFlight {
+		o.seen[dedupKey] = true
+	}
+	o.seenMu.Unlock()
+
+	if inFlight {
+		// Still intercepted: the bot must not handle it, but routing is not repeated.
+		return true
+	}
+
+	// release forgets the key once routing has finished.
+	release := func() {
+		o.seenMu.Lock()
+		defer o.seenMu.Unlock()
+		delete(o.seen, dedupKey)
+	}
+
+	// Routing runs in the background so the listener is not blocked.
+	go func() {
+		defer release()
+		err := o.Route(ctx, msgCtx)
+		if err == nil {
+			return
+		}
+		o.logger.Error("orchestration failed", "room", msgCtx.RoomID, "err", err)
+	}()
+
+	return true
+}
+
+// Route is the main entry point, called when a human posts in a managed room.
+// It decides which bot(s) should respond and dispatches tasks via the bus.
+//
+// With a single participant the message is dispatched directly, without any
+// LLM call. Otherwise it runs up to MaxIterations rounds (3 when the setting
+// is not positive): pick a bot, dispatch the task, evaluate the answer, and
+// stop once the score reaches QualityThreshold or the evaluator says not to
+// continue.
+//
+// It returns an error when the room is not managed, when the room has no
+// participants, or when the single-participant dispatch fails. Failures inside
+// the multi-bot loop are logged and end the pipeline, and nil is returned.
+func (o *Orchestrator) Route(ctx context.Context, msgCtx decision.MessageContext) error {
+	participants, ok := o.managedRooms[msgCtx.RoomID]
+	if !ok {
+		return fmt.Errorf("room %s is not managed", msgCtx.RoomID)
+	}
+	// A room configured with an empty participant list would otherwise reach
+	// the participants[0] fallbacks below and panic.
+	if len(participants) == 0 {
+		return fmt.Errorf("room %s has no participants", msgCtx.RoomID)
+	}
+
+	o.logger.Info("orchestrating message",
+		"room", msgCtx.RoomID,
+		"sender", msgCtx.SenderID,
+		"participants", participants,
+		"content_preview", truncate(msgCtx.Content, 80),
+	)
+
+	// Optimization: single bot → dispatch directly without LLM
+	if len(participants) == 1 {
+		o.logger.Debug("single participant, dispatching directly", "bot", participants[0])
+		_, err := o.dispatchAndWait(ctx, participants[0], msgCtx, 0, nil)
+		return err
+	}
+
+	var responses []orchestration.BotResponse
+	var lastBot string
+	maxIter := o.cfg.Orchestration.MaxIterations
+	if maxIter <= 0 {
+		maxIter = 3
+	}
+
+	for i := 0; i < maxIter; i++ {
+		// Route: decide which bot responds
+		var target string
+
+		if i == 0 {
+			rd, routeErr := o.routeInitial(ctx, msgCtx.Content, participants)
+			if routeErr != nil {
+				o.logger.Error("routing failed, falling back to first participant", "err", routeErr)
+				target = participants[0]
+			} else {
+				target = rd.TargetBotID
+				o.logger.Info("routed to bot",
+					"bot", target,
+					"confidence", rd.Confidence,
+					"reason", rd.Reason,
+					"iteration", i,
+				)
+			}
+		} else {
+			// Later rounds ask for a different bot than the one that just answered.
+			rd, routeErr := o.routeRefinement(ctx, msgCtx.Content, responses, participants, lastBot)
+			if routeErr != nil {
+				o.logger.Warn("refinement routing failed, stopping pipeline", "err", routeErr)
+				break
+			}
+			target = rd.TargetBotID
+			o.logger.Info("refinement routed to bot",
+				"bot", target,
+				"reason", rd.Reason,
+				"iteration", i,
+			)
+		}
+
+		// Dispatch: send TaskEvent to bot via bus and wait for response
+		response, err := o.dispatchAndWait(ctx, target, msgCtx, i, responses)
+		if err != nil {
+			o.logger.Error("dispatch failed", "bot", target, "err", err)
+			break
+		}
+
+		responses = append(responses, response)
+		lastBot = target
+
+		o.logger.Info("bot responded",
+			"bot", target,
+			"response_len", len(response.Text),
+			"iteration", i,
+		)
+
+		// Evaluate quality (phase 3)
+		score := o.evaluate(ctx, msgCtx.Content, response)
+		o.logger.Info("quality evaluated",
+			"score", score.Score,
+			"continue", score.Continue,
+			"reason", score.Reason,
+			"iteration", i,
+		)
+
+		if score.Score >= o.cfg.Orchestration.QualityThreshold || !score.Continue {
+			o.logger.Info("pipeline complete",
+				"iterations", i+1,
+				"final_score", score.Score,
+			)
+			break
+		}
+	}
+
+	return nil
+}
+
+// dispatchAndWait sends a TaskEvent to a bot and waits for its response.
+func (o *Orchestrator) dispatchAndWait(
+	ctx context.Context,
+	botID string,
+	msgCtx decision.MessageContext,
+	iteration int,
+	previousResponses []orchestration.BotResponse,
+) (orchestration.BotResponse, error) {
+	taskID := fmt.Sprintf("orch-%s-%s-%d", msgCtx.RoomID, botID, iteration)
+
+	task := orchestration.TaskEvent{
+		TaskID:            taskID,
+		TargetBotID:       botID,
+		TargetRoomID:      msgCtx.RoomID,
+		OriginalSender:    msgCtx.SenderID,
+		OriginalQuestion:  msgCtx.Content,
+		Iteration:         iteration,
+		PreviousResponses: previousResponses,
+	}
+
+	taskJSON, err := orchestration.MarshalTaskEvent(task)
+	if err != nil {
+		return orchestration.BotResponse{}, fmt.Errorf("marshal task: %w", err)
+	}
+
+	msg := bus.AgentMessage{
+		From:    bus.AgentID(o.cfg.Special.ID),
+		To:      bus.AgentID(botID),
+		Kind:    bus.KindTask,
+		Payload: map[string]string{"task_json": taskJSON},
+	}
+
+	timeout := o.cfg.Orchestration.DelegationTimeout
+	if timeout <= 0 {
+		timeout = 30_000_000_000 // 30s default
+	}
+
+	reply, err := o.bus.SendAndWait(ctx, msg, taskID, timeout)
+	if err != nil {
+		return orchestration.BotResponse{}, err
+	}
+
+	resultJSON, ok := reply.Payload["result_json"]
+	if !ok {
+		return orchestration.BotResponse{}, fmt.Errorf("reply missing result_json")
+	}
+
+	result, err := orchestration.UnmarshalTaskResult(resultJSON)
+	if err != nil {
+		return orchestration.BotResponse{}, fmt.Errorf("unmarshal result: %w", err)
+	}
+
+	if result.Error != "" {
+		return orchestration.BotResponse{}, fmt.Errorf("bot %s error: %s", botID, result.Error)
+	}
+
+	return orchestration.BotResponse{
+		BotID: botID,
+		Text:  result.Text,
+	}, nil
+}
+
+// loadPrompts reads routing.md, quality.md and refinement.md from the
+// relative directory agents/specials/orchestrator/prompts and stores their
+// contents on the orchestrator. It stops at the first file that cannot be
+// read and returns that error, labelled with the prompt it belongs to.
+func (o *Orchestrator) loadPrompts() error {
+	dir := filepath.Join("agents", "specials", "orchestrator", "prompts")
+
+	// read returns the contents of one prompt file as a string.
+	read := func(name string) (string, error) {
+		data, err := os.ReadFile(filepath.Join(dir, name))
+		if err != nil {
+			return "", err
+		}
+		return string(data), nil
+	}
+
+	routingText, err := read("routing.md")
+	if err != nil {
+		return fmt.Errorf("routing prompt: %w", err)
+	}
+	o.routingPrompt = routingText
+
+	qualityText, err := read("quality.md")
+	if err != nil {
+		return fmt.Errorf("quality prompt: %w", err)
+	}
+	o.qualityPrompt = qualityText
+
+	refinementText, err := read("refinement.md")
+	if err != nil {
+		return fmt.Errorf("refinement prompt: %w", err)
+	}
+	o.refinementPrompt = refinementText
+
+	return nil
+}
+
+// buildParticipantsList renders the given bots as a "- id: description" list,
+// one per line, for insertion into an LLM prompt. The bot whose ID equals
+// exclude is left out. Bots with no registered metadata get a placeholder
+// description; registered bots with capabilities get them appended in
+// parentheses.
+func (o *Orchestrator) buildParticipantsList(botIDs []string, exclude string) string {
+	var out strings.Builder
+	for _, botID := range botIDs {
+		if botID == exclude {
+			continue
+		}
+
+		meta, registered := o.participants[botID]
+		if !registered {
+			fmt.Fprintf(&out, "- %s: (no description available)\n", botID)
+			continue
+		}
+
+		var capsNote string
+		if len(meta.Capabilities) > 0 {
+			capsNote = fmt.Sprintf(" (capabilities: %s)", strings.Join(meta.Capabilities, ", "))
+		}
+		fmt.Fprintf(&out, "- %s: %s%s\n", meta.ID, meta.Description, capsNote)
+	}
+	return out.String()
+}
+
+// truncate shortens s to at most n runes. When s is longer than that, the
+// first n runes are returned followed by "..."; otherwise s is returned
+// unchanged. A negative n is treated as 0.
+//
+// Only the leading part of s is scanned, and the prefix is cut on a rune
+// boundary so multi-byte characters are never split.
+func truncate(s string, n int) string {
+	if n < 0 {
+		n = 0
+	}
+	seen := 0
+	// Ranging over a string yields the byte offset at which each rune starts.
+	for offset := range s {
+		if seen == n {
+			return s[:offset] + "..."
+		}
+		seen++
+	}
+	return s
+}
@@ -0,0 +1,107 @@
+package orchestration
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	coretypes "github.com/enmanuel/agents/pkg/llm"
+	"github.com/enmanuel/agents/pkg/orchestration"
+)
+
+// routeInitial asks the LLM which participant should answer the question
+// first. The routing prompt, with {{PARTICIPANTS}} replaced by the participant
+// list, is the system prompt and the question is the user message.
+//
+// An error is returned when the LLM call fails or its reply is not valid JSON
+// for a RoutingDecision. If the LLM names a bot that is not in participants,
+// the first participant is used instead with confidence 0.5; participants must
+// therefore not be empty.
+func (o *Orchestrator) routeInitial(ctx context.Context, question string, participants []string) (orchestration.RoutingDecision, error) {
+	var none orchestration.RoutingDecision
+
+	roster := o.buildParticipantsList(participants, "")
+	request := coretypes.CompletionRequest{
+		Model:        o.cfg.LLM.Primary.Model,
+		MaxTokens:    o.cfg.LLM.Primary.MaxTokens,
+		Temperature:  o.cfg.LLM.Primary.Temperature,
+		SystemPrompt: strings.ReplaceAll(o.routingPrompt, "{{PARTICIPANTS}}", roster),
+		Messages: []coretypes.Message{{
+			Role:    coretypes.RoleUser,
+			Content: question,
+		}},
+	}
+
+	completion, err := o.llm(ctx, request)
+	if err != nil {
+		return none, fmt.Errorf("LLM routing call: %w", err)
+	}
+
+	var choice orchestration.RoutingDecision
+	payload := []byte(strings.TrimSpace(completion.Content))
+	if err := json.Unmarshal(payload, &choice); err != nil {
+		o.logger.Warn("failed to parse routing response, raw", "content", completion.Content, "err", err)
+		return none, fmt.Errorf("parse routing decision: %w", err)
+	}
+
+	// Accept the LLM's pick only when it names a real participant.
+	if contains(participants, choice.TargetBotID) {
+		return choice, nil
+	}
+
+	o.logger.Warn("LLM chose unknown bot, falling back to first", "chosen", choice.TargetBotID)
+	choice.TargetBotID = participants[0]
+	choice.Confidence = 0.5
+	choice.Reason = "fallback: LLM chose unknown bot"
+	return choice, nil
+}
+
+// routeRefinement asks the LLM which bot should improve the latest response,
+// excluding the bot that produced it (excludeBot).
+//
+// The refinement prompt, with {{PARTICIPANTS}} and {{LAST_RESPONSE}} filled
+// in, is the system prompt. If the LLM names the excluded bot or a bot that is
+// not in participants, the first participant other than excludeBot is used
+// instead.
+//
+// An error is returned when no participant other than excludeBot exists, when
+// the LLM call fails, or when its reply is not valid JSON for a
+// RoutingDecision.
+func (o *Orchestrator) routeRefinement(
+	ctx context.Context,
+	question string,
+	responses []orchestration.BotResponse,
+	participants []string,
+	excludeBot string,
+) (orchestration.RoutingDecision, error) {
+	// Find the fallback candidate up front. Without one, no valid target can
+	// exist, so fail before spending an LLM call rather than returning an
+	// excluded or unknown bot.
+	fallback, haveFallback := "", false
+	for _, p := range participants {
+		if p != excludeBot {
+			fallback, haveFallback = p, true
+			break
+		}
+	}
+	if !haveFallback {
+		return orchestration.RoutingDecision{}, fmt.Errorf("no participant other than %q available for refinement", excludeBot)
+	}
+
+	lastResponse := ""
+	if len(responses) > 0 {
+		lastResponse = responses[len(responses)-1].Text
+	}
+
+	systemPrompt := strings.ReplaceAll(o.refinementPrompt, "{{PARTICIPANTS}}", o.buildParticipantsList(participants, excludeBot))
+	systemPrompt = strings.ReplaceAll(systemPrompt, "{{LAST_RESPONSE}}", lastResponse)
+
+	userContent := fmt.Sprintf("Original question: %s\n\nCurrent response that needs improvement:\n%s", question, lastResponse)
+
+	resp, err := o.llm(ctx, coretypes.CompletionRequest{
+		Model:        o.cfg.LLM.Primary.Model,
+		MaxTokens:    o.cfg.LLM.Primary.MaxTokens,
+		Temperature:  o.cfg.LLM.Primary.Temperature,
+		SystemPrompt: systemPrompt,
+		Messages: []coretypes.Message{
+			{Role: coretypes.RoleUser, Content: userContent},
+		},
+	})
+	if err != nil {
+		return orchestration.RoutingDecision{}, fmt.Errorf("LLM refinement call: %w", err)
+	}
+
+	var rd orchestration.RoutingDecision
+	if err := json.Unmarshal([]byte(strings.TrimSpace(resp.Content)), &rd); err != nil {
+		o.logger.Warn("failed to parse refinement response", "content", resp.Content, "err", err)
+		return orchestration.RoutingDecision{}, fmt.Errorf("parse refinement decision: %w", err)
+	}
+
+	// Validate: must be a participant and not the excluded bot
+	if rd.TargetBotID == excludeBot || !contains(participants, rd.TargetBotID) {
+		rd.TargetBotID = fallback
+		rd.Reason = "fallback: LLM chose excluded or unknown bot"
+	}
+
+	return rd, nil
+}
+
+// contains reports whether s is one of the elements of ss.
+func contains(ss []string, s string) bool {
+	for i := range ss {
+		if ss[i] == s {
+			return true
+		}
+	}
+	return false
+}