merge: issue/0019c-rate-limiting — rate limiting de tools por room

Rate limiting de tool calls por room usando sliding window en el registry. Config via security.tool_rate_limit en YAML. Loguea tool_rate_limited al exceder el limite. Tests unitarios y de integracion incluidos. Sub-issue 0019c del hardening contra prompt injection.
2026-03-07 19:46:34 +00:00
parent 01a734cd9b 8f1d86333a
commit b4db953211
7 changed files with 295 additions and 14 deletions
@@ -107,7 +107,7 @@ func (a *Agent) cmdTool(ctx context.Context, msgCtx decision.MessageContext) str
 		"args", argsJSON,
 	)

-	result := a.toolReg.Execute(ctx, toolName, argsJSON)
+	result := a.toolReg.ExecuteForRoom(ctx, toolName, argsJSON, msgCtx.RoomID)
 	if result.Err != nil {
 		return fmt.Sprintf("Error ejecutando %s: %s", toolName, result.Err)
 	}
@@ -230,6 +230,29 @@ func New(cfg *config.AgentConfig, rules []decision.Rule, logger *slog.Logger) (*
 	// Tool registry — register tools enabled in config
 	toolReg := buildToolRegistry(cfg, sshExec, matrixClient, memStore, kStore, roomCtx, logger)

+	// Rate limiting for tools
+	if cfg.Security.ToolRateLimit.Enabled {
+		maxCalls := cfg.Security.ToolRateLimit.MaxCallsPerMin
+		if maxCalls <= 0 {
+			maxCalls = 10
+		}
+		rl := tools.NewRateLimiter(maxCalls, time.Minute)
+		toolReg.SetRateLimiter(rl)
+
+		cleanupInterval := cfg.Security.ToolRateLimit.CleanupIntervalS
+		if cleanupInterval <= 0 {
+			cleanupInterval = 60
+		}
+		go func() {
+			ticker := time.NewTicker(time.Duration(cleanupInterval) * time.Second)
+			defer ticker.Stop()
+			for range ticker.C {
+				rl.Cleanup()
+			}
+		}()
+		logger.Info("tool rate limiting enabled", "max_calls_per_min", maxCalls)
+	}
+
 	a := &Agent{
 		cfg:         cfg,
 		acl:         agentACL,
@@ -753,7 +776,7 @@ func (a *Agent) runLLM(ctx context.Context, msgCtx decision.MessageContext) (str
 				a.logger.Warn("failed to send tool call notice", "tool", tc.Name, "err", err)
 			}

-			result := a.toolReg.Execute(ctx, tc.Name, tc.Arguments)
+			result := a.toolReg.ExecuteForRoom(ctx, tc.Name, tc.Arguments, msgCtx.RoomID)

 			output := result.Output
 			if result.Err != nil {
@@ -109,7 +109,7 @@ Este issue es demasiado grande para una sola rama. Se desglosa en sub-issues con
 |-----------|------|---------|-------|--------|
 | **0019a** | `issue/0019a-tool-hardening` | Deny-by-default en tools, path traversal, SSRF, SSH allowlist + syntax, Matrix room auth | 1 (parcial), 5, 6 (parcial) | **completado** |
 | **0019b** | `issue/0019b-input-sanitization` | `pkg/sanitize/` + integracion en runtime.go + config schema | 2, 6 (parcial) | **completado** |
-| **0019c** | `issue/0019c-rate-limiting` | Rate limiting de tools por agente+room en registry | 4, 6 (parcial) | pendiente |
+| **0019c** | `issue/0019c-rate-limiting` | Rate limiting de tools por agente+room en registry | 4, 6 (parcial) | **completado** |
 | **0019d** | `issue/0019d-prompt-hardening-docs` | Hardening de system prompts + docs + activar flag | 1 (restante: base_path), 3, 7 | pendiente |

 ### Progreso por tarea
@@ -132,10 +132,10 @@ Este issue es demasiado grande para una sola rama. Se desglosa en sub-issues con
 - [ ] **3.3** Aplicar a asistente-2
 - [ ] **3.4** Documentar en regla create_agent.md

-#### Fase 4 — pendiente (0019c)
- [ ] **4.1** Rate limiter por agente+room en registry
- [ ] **4.2** Config via `security.tool_rate_limit`
- [ ] **4.3** Loguear al alcanzar limite
+#### Fase 4 — completado (0019c)
+- [x] **4.1** Rate limiter por agente+room en registry
+- [x] **4.2** Config via `security.tool_rate_limit`
+- [x] **4.3** Loguear al alcanzar limite

 #### Fase 5 — completado (0019a)
 - [x] **5.1** SSH: validacion de pipes, subshells, redirects, chains
@@ -147,7 +147,7 @@ Este issue es demasiado grande para una sola rama. Se desglosa en sub-issues con
 - [x] **6.3** Tests SSH allowlist/blocklist (0019a)
 - [x] **6.4** Tests SSRF en http.go (0019a)
 - [x] **6.1** Tests para `pkg/sanitize/` (0019b)
- [ ] **6.5** Tests para rate limiting (0019c)
+- [x] **6.5** Tests para rate limiting (0019c)

 #### Fase 7 — pendiente (0019d)
 - [ ] Actualizar CLAUDE.md
@@ -280,10 +280,18 @@ type SSHTargetCfg struct {
 // ── Security ──────────────────────────────────────────────────────────────

 type SecurityCfg struct {
-	Roles    map[string]RoleCfg `yaml:"roles"`
-	Audit    AuditCfg           `yaml:"audit"`
-	Secrets  SecretsCfg         `yaml:"secrets"`
-	Sanitize SanitizeCfg        `yaml:"sanitize"`
+	Roles         map[string]RoleCfg `yaml:"roles"`
+	Audit         AuditCfg           `yaml:"audit"`
+	Secrets       SecretsCfg         `yaml:"secrets"`
+	Sanitize      SanitizeCfg        `yaml:"sanitize"`
+	ToolRateLimit ToolRateLimitCfg   `yaml:"tool_rate_limit"`
+}
+
+// ToolRateLimitCfg controls per-room rate limiting of tool executions.
+type ToolRateLimitCfg struct {
+	Enabled          bool `yaml:"enabled"`            // enable tool rate limiting (default false)
+	MaxCallsPerMin   int  `yaml:"max_calls_per_min"`  // max tool calls per room per minute (default 10)
+	CleanupIntervalS int  `yaml:"cleanup_interval_s"` // seconds between stale entry cleanup (default 60)
 }

 // SanitizeCfg controls prompt injection detection on incoming messages.
@@ -0,0 +1,70 @@
+package tools
+
+import (
+	"sync"
+	"time"
+)
+
+// RateLimiter tracks tool call counts per key (typically roomID) using a
+// sliding window. It is safe for concurrent use.
+type RateLimiter struct {
+	maxCalls int
+	window   time.Duration
+	mu       sync.Mutex
+	buckets  map[string][]time.Time
+}
+
+// NewRateLimiter creates a rate limiter that allows maxCalls per window per key.
+func NewRateLimiter(maxCalls int, window time.Duration) *RateLimiter {
+	return &RateLimiter{
+		maxCalls: maxCalls,
+		window:   window,
+		buckets:  make(map[string][]time.Time),
+	}
+}
+
+// Allow checks whether a call for the given key is within the rate limit.
+// If allowed, it records the call and returns true. Otherwise returns false.
+func (rl *RateLimiter) Allow(key string) bool {
+	rl.mu.Lock()
+	defer rl.mu.Unlock()
+
+	now := time.Now()
+	cutoff := now.Add(-rl.window)
+
+	// Trim expired entries
+	calls := rl.buckets[key]
+	start := 0
+	for start < len(calls) && calls[start].Before(cutoff) {
+		start++
+	}
+	calls = calls[start:]
+
+	if len(calls) >= rl.maxCalls {
+		rl.buckets[key] = calls
+		return false
+	}
+
+	rl.buckets[key] = append(calls, now)
+	return true
+}
+
+// Cleanup removes stale entries for keys that have no recent calls.
+// Should be called periodically to prevent memory growth.
+func (rl *RateLimiter) Cleanup() {
+	rl.mu.Lock()
+	defer rl.mu.Unlock()
+
+	cutoff := time.Now().Add(-rl.window)
+	for key, calls := range rl.buckets {
+		start := 0
+		for start < len(calls) && calls[start].Before(cutoff) {
+			start++
+		}
+		if start >= len(calls) {
+			delete(rl.buckets, key)
+		} else {
+			rl.buckets[key] = calls[start:]
+		}
+	}
+}
@@ -0,0 +1,161 @@
+package tools
+
+import (
+	"context"
+	"log/slog"
+	"testing"
+	"time"
+)
+
+func TestRateLimiter_AllowWithinLimit(t *testing.T) {
+	rl := NewRateLimiter(3, time.Minute)
+
+	for i := 0; i < 3; i++ {
+		if !rl.Allow("room1") {
+			t.Fatalf("call %d should be allowed", i+1)
+		}
+	}
+}
+
+func TestRateLimiter_DenyOverLimit(t *testing.T) {
+	rl := NewRateLimiter(3, time.Minute)
+
+	for i := 0; i < 3; i++ {
+		rl.Allow("room1")
+	}
+
+	if rl.Allow("room1") {
+		t.Fatal("4th call should be denied")
+	}
+}
+
+func TestRateLimiter_DifferentKeysIndependent(t *testing.T) {
+	rl := NewRateLimiter(2, time.Minute)
+
+	rl.Allow("room1")
+	rl.Allow("room1")
+
+	// room1 is full, but room2 should still be allowed
+	if rl.Allow("room1") {
+		t.Fatal("room1 3rd call should be denied")
+	}
+	if !rl.Allow("room2") {
+		t.Fatal("room2 should be allowed independently")
+	}
+}
+
+func TestRateLimiter_WindowExpiry(t *testing.T) {
+	// Use a very short window for testing
+	rl := NewRateLimiter(2, 50*time.Millisecond)
+
+	rl.Allow("room1")
+	rl.Allow("room1")
+
+	if rl.Allow("room1") {
+		t.Fatal("should be denied before window expires")
+	}
+
+	// Wait for window to expire
+	time.Sleep(60 * time.Millisecond)
+
+	if !rl.Allow("room1") {
+		t.Fatal("should be allowed after window expires")
+	}
+}
+
+func TestRateLimiter_Cleanup(t *testing.T) {
+	rl := NewRateLimiter(5, 50*time.Millisecond)
+
+	rl.Allow("room1")
+	rl.Allow("room2")
+
+	// Wait for entries to expire
+	time.Sleep(60 * time.Millisecond)
+
+	rl.Cleanup()
+
+	rl.mu.Lock()
+	count := len(rl.buckets)
+	rl.mu.Unlock()
+
+	if count != 0 {
+		t.Fatalf("expected 0 buckets after cleanup, got %d", count)
+	}
+}
+
+func TestRateLimiter_CleanupKeepsActive(t *testing.T) {
+	rl := NewRateLimiter(5, time.Minute)
+
+	rl.Allow("room1")
+
+	rl.Cleanup()
+
+	rl.mu.Lock()
+	count := len(rl.buckets)
+	rl.mu.Unlock()
+
+	if count != 1 {
+		t.Fatalf("expected 1 bucket after cleanup of active entries, got %d", count)
+	}
+}
+
+func TestRegistry_ExecuteForRoom_RateLimited(t *testing.T) {
+	logger := slog.Default()
+	reg := NewRegistry(logger)
+
+	// Register a simple echo tool
+	reg.Register(Tool{
+		Def: Def{Name: "echo", Description: "echo tool"},
+		Exec: func(_ context.Context, args map[string]any) Result {
+			return Result{Output: "ok"}
+		},
+	})
+
+	rl := NewRateLimiter(2, time.Minute)
+	reg.SetRateLimiter(rl)
+
+	ctx := context.Background()
+
+	// First two calls succeed
+	r1 := reg.ExecuteForRoom(ctx, "echo", "", "!room:test")
+	if r1.Err != nil {
+		t.Fatalf("call 1 should succeed: %v", r1.Err)
+	}
+	r2 := reg.ExecuteForRoom(ctx, "echo", "", "!room:test")
+	if r2.Err != nil {
+		t.Fatalf("call 2 should succeed: %v", r2.Err)
+	}
+
+	// Third call is rate limited
+	r3 := reg.ExecuteForRoom(ctx, "echo", "", "!room:test")
+	if r3.Err == nil {
+		t.Fatal("call 3 should be rate limited")
+	}
+
+	// Different room still works
+	r4 := reg.ExecuteForRoom(ctx, "echo", "", "!room:other")
+	if r4.Err != nil {
+		t.Fatalf("different room should succeed: %v", r4.Err)
+	}
+}
+
+func TestRegistry_ExecuteForRoom_NoLimiter(t *testing.T) {
+	logger := slog.Default()
+	reg := NewRegistry(logger)
+
+	reg.Register(Tool{
+		Def: Def{Name: "echo", Description: "echo tool"},
+		Exec: func(_ context.Context, args map[string]any) Result {
+			return Result{Output: "ok"}
+		},
+	})
+
+	// No rate limiter set — all calls should succeed
+	ctx := context.Background()
+	for i := 0; i < 20; i++ {
+		r := reg.ExecuteForRoom(ctx, "echo", "", "!room:test")
+		if r.Err != nil {
+			t.Fatalf("call %d should succeed without limiter: %v", i+1, r.Err)
+		}
+	}
+}
@@ -14,8 +14,9 @@ import (

 // Registry holds available tools keyed by name.
 type Registry struct {
-	tools  map[string]Tool
-	logger *slog.Logger
+	tools       map[string]Tool
+	logger      *slog.Logger
+	rateLimiter *RateLimiter // nil when rate limiting is disabled
 }

 // NewRegistry creates an empty registry.
@@ -53,6 +54,24 @@ func (r *Registry) Len() int {
 	return len(r.tools)
 }

+// SetRateLimiter attaches a rate limiter to the registry.
+// When set, ExecuteForRoom checks the limit before running the tool.
+func (r *Registry) SetRateLimiter(rl *RateLimiter) {
+	r.rateLimiter = rl
+}
+
+// ExecuteForRoom is like Execute but checks the per-room rate limit first.
+// If the rate limit is exceeded, it returns an error result without executing.
+func (r *Registry) ExecuteForRoom(ctx context.Context, name, argsJSON, roomID string) Result {
+	if r.rateLimiter != nil && roomID != "" {
+		if !r.rateLimiter.Allow(roomID) {
+			r.logger.Warn("tool_rate_limited", "tool", name, "room", roomID)
+			return Result{Err: fmt.Errorf("rate limit exceeded for room %s: too many tool calls per minute", roomID)}
+		}
+	}
+	return r.Execute(ctx, name, argsJSON)
+}
+
 // Execute looks up a tool by name and runs it. Returns an error result if not found.
 func (r *Registry) Execute(ctx context.Context, name string, argsJSON string) Result {
 	t, ok := r.tools[name]