feat: implement multi-bot orchestration system with LLM routing

Implementa el sistema de orquestación para salas Matrix con múltiples bots.
El orquestador es un "special agent" sin identidad Matrix que coordina qué bot
responde y cuándo, usando LLM (Claude) para routing y evaluación de calidad.

Cambios principales:
- pkg/orchestration/task.go: tipos puros (TaskEvent, BotResponse, QualityScore, RoutingDecision)
- shell/orchestration/: runtime del orquestador (orchestrator.go, router.go, evaluator.go)
- agents/specials/orchestrator/: config + prompts (routing, quality, refinement)
- internal/config/: SpecialConfig, OrchestrationCfg, LoadSpecial()
- shell/bus/bus.go: protocolo request-reply (SendAndWait, Reply) para delegación
- shell/matrix/listener.go: InterceptFunc para interceptar eventos en salas orquestadas
- agents/runtime.go: SetBus, listenBus, handleTaskEvent para recibir tareas del orquestador
- cmd/launcher/main.go: creación de bus compartido, arranque del orquestador antes de bots

Incluye deduplicación para evitar que múltiples listeners en la misma sala
disparen el orquestador más de una vez por mensaje.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 09:05:42 +00:00
parent 6bef4283c6
commit 2667af52cc
14 changed files with 1001 additions and 7 deletions
+48
View File
@@ -0,0 +1,48 @@
package orchestration
import (
"context"
"encoding/json"
"fmt"
"strings"
coretypes "github.com/enmanuel/agents/pkg/llm"
"github.com/enmanuel/agents/pkg/orchestration"
)
// evaluate asks the LLM to score the quality of a bot's response.
func (o *Orchestrator) evaluate(ctx context.Context, question string, response orchestration.BotResponse) orchestration.QualityScore {
userContent := fmt.Sprintf("Question: %s\n\nResponse from %s:\n%s", question, response.BotID, response.Text)
resp, err := o.llm(ctx, coretypes.CompletionRequest{
Model: o.cfg.LLM.Primary.Model,
MaxTokens: o.cfg.LLM.Primary.MaxTokens,
Temperature: o.cfg.LLM.Primary.Temperature,
SystemPrompt: o.qualityPrompt,
Messages: []coretypes.Message{
{Role: coretypes.RoleUser, Content: userContent},
},
})
if err != nil {
o.logger.Error("quality evaluation LLM call failed", "err", err)
// On LLM failure, assume quality is good enough to stop the pipeline
return orchestration.QualityScore{
Score: 1.0,
Continue: false,
Reason: fmt.Sprintf("evaluation failed: %s, assuming good quality", err),
}
}
var qs orchestration.QualityScore
if err := json.Unmarshal([]byte(strings.TrimSpace(resp.Content)), &qs); err != nil {
o.logger.Warn("failed to parse quality score", "content", resp.Content, "err", err)
// On parse failure, assume good quality
return orchestration.QualityScore{
Score: 1.0,
Continue: false,
Reason: fmt.Sprintf("parse failed: %s", err),
}
}
return qs
}
+332
View File
@@ -0,0 +1,332 @@
// Package orchestration implements the multi-bot orchestrator runtime.
// The orchestrator intercepts Matrix events in managed rooms and coordinates
// which bot responds via the in-process bus.
package orchestration
import (
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"github.com/enmanuel/agents/internal/config"
"github.com/enmanuel/agents/pkg/decision"
coretypes "github.com/enmanuel/agents/pkg/llm"
"github.com/enmanuel/agents/pkg/orchestration"
"github.com/enmanuel/agents/shell/bus"
shelllm "github.com/enmanuel/agents/shell/llm"
)
// Orchestrator coordinates multi-bot rooms. It has no Matrix identity —
// it intercepts events before they reach bots and delegates via the bus.
type Orchestrator struct {
cfg *config.SpecialConfig
llm coretypes.CompleteFunc
bus *bus.Bus
managedRooms map[string][]string // roomID → []botID
participants map[string]orchestration.ParticipantInfo // botID → info
logger *slog.Logger
// Prompts loaded from files
routingPrompt string
qualityPrompt string
refinementPrompt string
// Dedup: multiple bots in the same room will each trigger Intercept().
// We use a set of "room:sender:content" keys to ensure only one fires.
seenMu sync.Mutex
seen map[string]bool
}
// New creates an Orchestrator from its config.
func New(cfg *config.SpecialConfig, agentBus *bus.Bus, logger *slog.Logger) (*Orchestrator, error) {
llmFunc, err := shelllm.FromConfig(cfg.LLM.Primary)
if err != nil {
return nil, fmt.Errorf("orchestrator LLM: %w", err)
}
managed := make(map[string][]string)
for _, room := range cfg.Orchestration.Rooms {
managed[room.RoomID] = room.Participants
}
o := &Orchestrator{
cfg: cfg,
llm: llmFunc,
bus: agentBus,
managedRooms: managed,
participants: make(map[string]orchestration.ParticipantInfo),
logger: logger,
seen: make(map[string]bool),
}
if err := o.loadPrompts(); err != nil {
return nil, fmt.Errorf("load prompts: %w", err)
}
return o, nil
}
// RegisterParticipant adds bot metadata used for LLM routing decisions.
func (o *Orchestrator) RegisterParticipant(info orchestration.ParticipantInfo) {
o.participants[info.ID] = info
o.logger.Debug("registered participant", "bot", info.ID, "desc", info.Description)
}
// ShouldIntercept returns true if the room is managed by this orchestrator.
func (o *Orchestrator) ShouldIntercept(roomID string) bool {
_, ok := o.managedRooms[roomID]
return ok
}
// Intercept is the InterceptFunc used by bot listeners. It checks if the
// room is managed and, if so, starts the orchestration pipeline asynchronously.
// Returns true if the event was intercepted (all bots in the room should return true,
// but only the first one triggers actual routing — the rest are deduped).
func (o *Orchestrator) Intercept(ctx context.Context, msgCtx decision.MessageContext) bool {
if !o.ShouldIntercept(msgCtx.RoomID) {
return false
}
// Dedup: multiple bots receive the same event. Only route once.
key := msgCtx.RoomID + ":" + msgCtx.SenderID + ":" + msgCtx.Content
o.seenMu.Lock()
if o.seen[key] {
o.seenMu.Unlock()
return true // still intercept (don't let the bot handle it) but don't route again
}
o.seen[key] = true
o.seenMu.Unlock()
// Route asynchronously so the listener isn't blocked.
// Clean up the dedup key after routing completes.
go func() {
defer func() {
o.seenMu.Lock()
delete(o.seen, key)
o.seenMu.Unlock()
}()
if err := o.Route(ctx, msgCtx); err != nil {
o.logger.Error("orchestration failed", "room", msgCtx.RoomID, "err", err)
}
}()
return true
}
// Route is the main entry point. Called when a human posts in a managed room.
// It decides which bot(s) should respond and dispatches tasks via the bus.
func (o *Orchestrator) Route(ctx context.Context, msgCtx decision.MessageContext) error {
participants, ok := o.managedRooms[msgCtx.RoomID]
if !ok {
return fmt.Errorf("room %s is not managed", msgCtx.RoomID)
}
o.logger.Info("orchestrating message",
"room", msgCtx.RoomID,
"sender", msgCtx.SenderID,
"participants", participants,
"content_preview", truncate(msgCtx.Content, 80),
)
// Optimization: single bot → dispatch directly without LLM
if len(participants) == 1 {
o.logger.Debug("single participant, dispatching directly", "bot", participants[0])
_, err := o.dispatchAndWait(ctx, participants[0], msgCtx, 0, nil)
return err
}
var responses []orchestration.BotResponse
var lastBot string
maxIter := o.cfg.Orchestration.MaxIterations
if maxIter <= 0 {
maxIter = 3
}
for i := 0; i < maxIter; i++ {
// Route: decide which bot responds
var target string
var err error
if i == 0 {
rd, routeErr := o.routeInitial(ctx, msgCtx.Content, participants)
if routeErr != nil {
o.logger.Error("routing failed, falling back to first participant", "err", routeErr)
target = participants[0]
} else {
target = rd.TargetBotID
o.logger.Info("routed to bot",
"bot", target,
"confidence", rd.Confidence,
"reason", rd.Reason,
"iteration", i,
)
}
} else {
rd, routeErr := o.routeRefinement(ctx, msgCtx.Content, responses, participants, lastBot)
if routeErr != nil {
o.logger.Warn("refinement routing failed, stopping pipeline", "err", routeErr)
break
}
target = rd.TargetBotID
o.logger.Info("refinement routed to bot",
"bot", target,
"reason", rd.Reason,
"iteration", i,
)
}
// Dispatch: send TaskEvent to bot via bus and wait for response
response, err := o.dispatchAndWait(ctx, target, msgCtx, i, responses)
if err != nil {
o.logger.Error("dispatch failed", "bot", target, "err", err)
break
}
responses = append(responses, response)
lastBot = target
o.logger.Info("bot responded",
"bot", target,
"response_len", len(response.Text),
"iteration", i,
)
// Evaluate quality (Fase 3)
score := o.evaluate(ctx, msgCtx.Content, response)
o.logger.Info("quality evaluated",
"score", score.Score,
"continue", score.Continue,
"reason", score.Reason,
"iteration", i,
)
if score.Score >= o.cfg.Orchestration.QualityThreshold || !score.Continue {
o.logger.Info("pipeline complete",
"iterations", i+1,
"final_score", score.Score,
)
break
}
}
return nil
}
// dispatchAndWait sends a TaskEvent to a bot and waits for its response.
func (o *Orchestrator) dispatchAndWait(
ctx context.Context,
botID string,
msgCtx decision.MessageContext,
iteration int,
previousResponses []orchestration.BotResponse,
) (orchestration.BotResponse, error) {
taskID := fmt.Sprintf("orch-%s-%s-%d", msgCtx.RoomID, botID, iteration)
task := orchestration.TaskEvent{
TaskID: taskID,
TargetBotID: botID,
TargetRoomID: msgCtx.RoomID,
OriginalSender: msgCtx.SenderID,
OriginalQuestion: msgCtx.Content,
Iteration: iteration,
PreviousResponses: previousResponses,
}
taskJSON, err := orchestration.MarshalTaskEvent(task)
if err != nil {
return orchestration.BotResponse{}, fmt.Errorf("marshal task: %w", err)
}
msg := bus.AgentMessage{
From: bus.AgentID(o.cfg.Special.ID),
To: bus.AgentID(botID),
Kind: bus.KindTask,
Payload: map[string]string{"task_json": taskJSON},
}
timeout := o.cfg.Orchestration.DelegationTimeout
if timeout <= 0 {
timeout = 30_000_000_000 // 30s default
}
reply, err := o.bus.SendAndWait(ctx, msg, taskID, timeout)
if err != nil {
return orchestration.BotResponse{}, err
}
resultJSON, ok := reply.Payload["result_json"]
if !ok {
return orchestration.BotResponse{}, fmt.Errorf("reply missing result_json")
}
result, err := orchestration.UnmarshalTaskResult(resultJSON)
if err != nil {
return orchestration.BotResponse{}, fmt.Errorf("unmarshal result: %w", err)
}
if result.Error != "" {
return orchestration.BotResponse{}, fmt.Errorf("bot %s error: %s", botID, result.Error)
}
return orchestration.BotResponse{
BotID: botID,
Text: result.Text,
}, nil
}
// loadPrompts reads the orchestrator's prompt files.
func (o *Orchestrator) loadPrompts() error {
base := filepath.Join("agents", "specials", "orchestrator", "prompts")
routing, err := os.ReadFile(filepath.Join(base, "routing.md"))
if err != nil {
return fmt.Errorf("routing prompt: %w", err)
}
o.routingPrompt = string(routing)
quality, err := os.ReadFile(filepath.Join(base, "quality.md"))
if err != nil {
return fmt.Errorf("quality prompt: %w", err)
}
o.qualityPrompt = string(quality)
refinement, err := os.ReadFile(filepath.Join(base, "refinement.md"))
if err != nil {
return fmt.Errorf("refinement prompt: %w", err)
}
o.refinementPrompt = string(refinement)
return nil
}
// buildParticipantsList formats participant info for LLM prompts.
func (o *Orchestrator) buildParticipantsList(botIDs []string, exclude string) string {
var sb strings.Builder
for _, id := range botIDs {
if id == exclude {
continue
}
info, ok := o.participants[id]
if !ok {
sb.WriteString(fmt.Sprintf("- %s: (no description available)\n", id))
continue
}
caps := ""
if len(info.Capabilities) > 0 {
caps = fmt.Sprintf(" (capabilities: %s)", strings.Join(info.Capabilities, ", "))
}
sb.WriteString(fmt.Sprintf("- %s: %s%s\n", info.ID, info.Description, caps))
}
return sb.String()
}
func truncate(s string, n int) string {
runes := []rune(s)
if len(runes) <= n {
return s
}
return string(runes[:n]) + "..."
}
+107
View File
@@ -0,0 +1,107 @@
package orchestration
import (
"context"
"encoding/json"
"fmt"
"strings"
coretypes "github.com/enmanuel/agents/pkg/llm"
"github.com/enmanuel/agents/pkg/orchestration"
)
// routeInitial asks the LLM which bot should handle the question first.
func (o *Orchestrator) routeInitial(ctx context.Context, question string, participants []string) (orchestration.RoutingDecision, error) {
systemPrompt := strings.ReplaceAll(o.routingPrompt, "{{PARTICIPANTS}}", o.buildParticipantsList(participants, ""))
resp, err := o.llm(ctx, coretypes.CompletionRequest{
Model: o.cfg.LLM.Primary.Model,
MaxTokens: o.cfg.LLM.Primary.MaxTokens,
Temperature: o.cfg.LLM.Primary.Temperature,
SystemPrompt: systemPrompt,
Messages: []coretypes.Message{
{Role: coretypes.RoleUser, Content: question},
},
})
if err != nil {
return orchestration.RoutingDecision{}, fmt.Errorf("LLM routing call: %w", err)
}
var rd orchestration.RoutingDecision
if err := json.Unmarshal([]byte(strings.TrimSpace(resp.Content)), &rd); err != nil {
o.logger.Warn("failed to parse routing response, raw", "content", resp.Content, "err", err)
return orchestration.RoutingDecision{}, fmt.Errorf("parse routing decision: %w", err)
}
// Validate the chosen bot is actually a participant
if !contains(participants, rd.TargetBotID) {
o.logger.Warn("LLM chose unknown bot, falling back to first", "chosen", rd.TargetBotID)
rd.TargetBotID = participants[0]
rd.Confidence = 0.5
rd.Reason = "fallback: LLM chose unknown bot"
}
return rd, nil
}
// routeRefinement asks the LLM which bot should improve the response,
// excluding the last respondent.
func (o *Orchestrator) routeRefinement(
ctx context.Context,
question string,
responses []orchestration.BotResponse,
participants []string,
excludeBot string,
) (orchestration.RoutingDecision, error) {
lastResponse := ""
if len(responses) > 0 {
lastResponse = responses[len(responses)-1].Text
}
systemPrompt := strings.ReplaceAll(o.refinementPrompt, "{{PARTICIPANTS}}", o.buildParticipantsList(participants, excludeBot))
systemPrompt = strings.ReplaceAll(systemPrompt, "{{LAST_RESPONSE}}", lastResponse)
userContent := fmt.Sprintf("Original question: %s\n\nCurrent response that needs improvement:\n%s", question, lastResponse)
resp, err := o.llm(ctx, coretypes.CompletionRequest{
Model: o.cfg.LLM.Primary.Model,
MaxTokens: o.cfg.LLM.Primary.MaxTokens,
Temperature: o.cfg.LLM.Primary.Temperature,
SystemPrompt: systemPrompt,
Messages: []coretypes.Message{
{Role: coretypes.RoleUser, Content: userContent},
},
})
if err != nil {
return orchestration.RoutingDecision{}, fmt.Errorf("LLM refinement call: %w", err)
}
var rd orchestration.RoutingDecision
if err := json.Unmarshal([]byte(strings.TrimSpace(resp.Content)), &rd); err != nil {
o.logger.Warn("failed to parse refinement response", "content", resp.Content, "err", err)
return orchestration.RoutingDecision{}, fmt.Errorf("parse refinement decision: %w", err)
}
// Validate: must be a participant and not the excluded bot
if rd.TargetBotID == excludeBot || !contains(participants, rd.TargetBotID) {
// Pick first available that isn't excluded
for _, p := range participants {
if p != excludeBot {
rd.TargetBotID = p
rd.Reason = "fallback: LLM chose excluded or unknown bot"
break
}
}
}
return rd, nil
}
func contains(ss []string, s string) bool {
for _, v := range ss {
if v == s {
return true
}
}
return false
}