feat: crear pkg/sanitize para detección de prompt injection

Nuevo paquete puro (sin I/O) que detecta patrones de prompt injection en mensajes de usuario antes de enviarlos al LLM.

- patterns.go: 14 patrones en inglés y español (delimitadores de sistema, override de instrucciones, exfiltración de prompt, jailbreak, evasión base64)
- sanitize.go: función Sanitize() con 3 modos (warn, strip, reject), filtro por severidad mínima y patrones deshabilitables
- Tipos: Pattern, Severity, Mode, Options, Warning, Result

Todo puro: string in → Result out. Los side effects (logging, rechazo) ocurren en el caller (runtime.go).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 19:34:24 +00:00
parent 987ac09a09
commit e8dd7c41ed
2 changed files with 275 additions and 0 deletions
@@ -0,0 +1,139 @@
 // Package sanitize provides pure functions to detect and neutralize
 // prompt injection patterns in user messages before they reach the LLM.
 package sanitize
 import "regexp"
 // Pattern represents a known prompt injection pattern with metadata.
 // Sanitize evaluates each Pattern against the message and reports a Warning
 // for every one whose Regex matches.
 type Pattern struct {
 	// Name is the short identifier (e.g. "system-delimiter"). Sanitize looks it
 	// up in Options.DisabledPatterns and copies it into Warning.PatternName.
 	Name        string
 	// Description is a human-readable explanation of what the pattern detects.
 	Description string
 	// Regex is the compiled expression that is run against the message.
 	Regex       *regexp.Regexp
 	// Severity is how dangerous the pattern is; Sanitize skips patterns whose
 	// Severity is below Options.MinSeverity.
 	Severity    Severity
 }
 // Severity ranks the threat level of a detected pattern. The values are
 // ordered, so two severities can be compared with < and >.
 type Severity int
 // Threat levels, listed from least to most severe.
 const (
 	SeverityLow    Severity = iota // informational, unlikely to succeed
 	SeverityMedium                 // known injection technique
 	SeverityHigh                   // active attempt to override system instructions
 )
 // String returns the lowercase name of the severity ("low", "medium" or
 // "high"). Any value outside the declared constants yields "unknown".
 func (s Severity) String() string {
 	names := [...]string{
 		SeverityLow:    "low",
 		SeverityMedium: "medium",
 		SeverityHigh:   "high",
 	}
 	// Guard the table lookup so out-of-range values never index past it.
 	if s < SeverityLow || s > SeverityHigh {
 		return "unknown"
 	}
 	return names[s]
 }
 // defaultPatterns is the built-in pattern table. It is compiled once, when the
 // package is initialised, so that callers relying on the defaults do not
 // recompile every regular expression on each call. It must be treated as
 // read-only; DefaultPatterns hands out copies of it.
 //
 // The Spanish expressions accept both the accented and the unaccented
 // spelling (e.g. "actúa"/"actua", "cuál"/"cual") and both singular and plural
 // noun forms (e.g. "instrucción"/"instrucciones", "anterior"/"anteriores").
 var defaultPatterns = []Pattern{
 	// ── System delimiter injection ──────────────────────────────────
 	{
 		Name:        "system-delimiter",
 		Description: "Attempt to inject system/assistant role delimiters",
 		Regex:       regexp.MustCompile(`(?i)<\|(?:system|assistant|user|im_start|im_end)\|>`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "inst-delimiter",
 		Description: "Attempt to inject [INST] or [/INST] delimiters",
 		Regex:       regexp.MustCompile(`(?i)\[/?INST\]`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "xml-role-tag",
 		Description: "Attempt to inject XML-style role tags",
 		Regex:       regexp.MustCompile(`(?i)</?(?:system|assistant|human|user)(?:\s[^>]*)?>`),
 		Severity:    SeverityHigh,
 	},
 	// ── Instruction override ───────────────────────────────────────
 	{
 		Name:        "ignore-instructions",
 		Description: "Attempt to override previous instructions",
 		Regex:       regexp.MustCompile(`(?i)(?:ignore|disregard|forget|override|bypass)\s+(?:all\s+)?(?:previous|prior|above|earlier|your|the|system)\s+(?:instructions?|rules?|prompts?|guidelines?|constraints?|directives?)`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "new-instructions",
 		Description: "Attempt to inject new system-level instructions",
 		Regex:       regexp.MustCompile(`(?i)(?:new|updated?|revised?|actual|real)\s+(?:system\s+)?instructions?:\s`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "you-are-now",
 		Description: "Attempt to redefine the bot's identity",
 		Regex:       regexp.MustCompile(`(?i)(?:you\s+are\s+now|from\s+now\s+on\s+you\s+are|act\s+as\s+if\s+you\s+were|pretend\s+(?:to\s+be|you\s+are))\s`),
 		Severity:    SeverityMedium,
 	},
 	// ── Prompt exfiltration ────────────────────────────────────────
 	{
 		Name:        "exfiltrate-prompt",
 		Description: "Attempt to extract the system prompt",
 		Regex:       regexp.MustCompile(`(?i)(?:repeat|show|display|print|output|reveal|tell\s+me|give\s+me|show\s+me|what\s+(?:is|are))\s+(?:your\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?|initial\s+message)`),
 		Severity:    SeverityMedium,
 	},
 	// ── Developer mode / jailbreak ─────────────────────────────────
 	{
 		Name:        "developer-mode",
 		Description: "Attempt to enable a fictional unrestricted mode",
 		Regex:       regexp.MustCompile(`(?i)(?:enable|activate|enter|switch\s+to)\s+(?:developer|debug|admin|god|sudo|unrestricted|jailbreak|dan)\s+mode`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "do-anything-now",
 		Description: "DAN (Do Anything Now) jailbreak pattern",
 		Regex:       regexp.MustCompile(`(?i)(?:do\s+anything\s+now|DAN\s+mode|you\s+(?:can|must)\s+do\s+anything)`),
 		Severity:    SeverityHigh,
 	},
 	// ── Tool abuse hints ───────────────────────────────────────────
 	{
 		Name:        "tool-abuse-ssh",
 		Description: "Attempt to execute dangerous commands via SSH",
 		Regex:       regexp.MustCompile(`(?i)(?:use|call|execute|run)\s+(?:the\s+)?(?:ssh|command)\s+tool\s+(?:to\s+)?(?:run|execute|do)\s`),
 		Severity:    SeverityLow,
 	},
 	// ── Encoding evasion ───────────────────────────────────────────
 	{
 		Name:        "base64-instruction",
 		Description: "Base64-encoded instruction injection",
 		Regex:       regexp.MustCompile(`(?i)(?:decode|execute|interpret|run)\s+(?:this\s+)?(?:base64|b64|encoded)[\s:]+[A-Za-z0-9+/]{20,}={0,2}`),
 		Severity:    SeverityMedium,
 	},
 	// ── Spanish variants ───────────────────────────────────────────
 	{
 		Name:        "ignore-instructions-es",
 		Description: "Spanish: attempt to override instructions",
 		Regex:       regexp.MustCompile(`(?i)(?:ignora|olvida|descarta)\s+(?:todas?\s+)?(?:las?\s+)?(?:instrucci[oó]n(?:es)?|reglas?|directivas?|restricci[oó]n(?:es)?)\s+(?:anterior(?:es)?|previas?|del\s+sistema)`),
 		Severity:    SeverityHigh,
 	},
 	{
 		Name:        "you-are-now-es",
 		Description: "Spanish: attempt to redefine identity",
 		Regex:       regexp.MustCompile(`(?i)(?:ahora\s+eres|a\s+partir\s+de\s+ahora\s+eres|finge\s+(?:ser|que\s+eres)|act[uú]a\s+como\s+si\s+fueras)\s`),
 		Severity:    SeverityMedium,
 	},
 	{
 		Name:        "exfiltrate-prompt-es",
 		Description: "Spanish: attempt to extract system prompt",
 		Regex:       regexp.MustCompile(`(?i)(?:repite|muestra|mu[eé]strame|dime|dame|cu[aá]l\s+es)\s+(?:tus?\s+)?(?:prompt|instrucci[oó]n(?:es)?|reglas?|mensaje\s+(?:de\s+sistema|inicial))`),
 		Severity:    SeverityMedium,
 	},
 }
 // DefaultPatterns returns the built-in set of prompt injection patterns.
 // All patterns are case-insensitive.
 //
 // The returned slice is a fresh copy on every call, so callers may reorder,
 // truncate or edit its elements freely. The compiled *regexp.Regexp values
 // are shared between calls rather than recompiled.
 func DefaultPatterns() []Pattern {
 	out := make([]Pattern, len(defaultPatterns))
 	copy(out, defaultPatterns)
 	return out
 }
@@ -0,0 +1,136 @@
 package sanitize
 import "strings"
 // Mode selects what the sanitizer does once a pattern has been detected.
 type Mode int
 // Supported handling modes; ModeWarn is the zero value.
 const (
 	ModeWarn   Mode = iota // report warnings but don't modify the message
 	ModeStrip              // remove matched patterns from the message
 	ModeReject             // reject the message entirely if any pattern matches
 )
 // String returns the lowercase name of the mode ("warn", "strip" or
 // "reject"). Any value outside the declared constants yields "unknown".
 func (m Mode) String() string {
 	names := [...]string{
 		ModeWarn:   "warn",
 		ModeStrip:  "strip",
 		ModeReject: "reject",
 	}
 	// Guard the table lookup so out-of-range values never index past it.
 	if m < ModeWarn || m > ModeReject {
 		return "unknown"
 	}
 	return names[m]
 }
 // ParseMode maps a configuration string to a Mode, ignoring letter case.
 // "strip" and "reject" select the corresponding modes; every other value,
 // including the empty string, yields ModeWarn. Surrounding whitespace is not
 // trimmed, so " strip" is treated as unrecognized.
 func ParseMode(s string) Mode {
 	normalized := strings.ToLower(s)
 	if normalized == "strip" {
 		return ModeStrip
 	}
 	if normalized == "reject" {
 		return ModeReject
 	}
 	return ModeWarn
 }
 // Options configures the sanitizer behavior. The zero value runs in ModeWarn
 // against DefaultPatterns with MinSeverity set to SeverityLow.
 type Options struct {
 	// Mode decides how detections are handled (warn, strip or reject).
 	Mode             Mode
 	// MinSeverity is the threshold: patterns whose Severity is lower than
 	// this value are skipped.
 	MinSeverity      Severity
 	// Patterns lists the patterns to check. A nil slice selects
 	// DefaultPatterns; a non-nil empty slice means no pattern is checked.
 	Patterns         []Pattern
 	// DisabledPatterns holds pattern names to skip. Names are compared with
 	// Pattern.Name exactly (case-sensitive).
 	DisabledPatterns []string
 }
 // Warning represents a detected prompt injection pattern in the input.
 // Sanitize emits one Warning per pattern that matched.
 type Warning struct {
 	// PatternName is the Name of the pattern that matched.
 	PatternName string
 	// Severity is the threat level copied from the matching pattern.
 	Severity    Severity
 	// Matched is the text that matched. Only the first match of the pattern
 	// is recorded, even when the pattern occurs several times.
 	Matched     string
 }
 // Result holds the output of a Sanitize call.
 type Result struct {
 	// Output is the message after processing. It equals the input unless
 	// ModeStrip removed matched text from it.
 	Output   string
 	// Warnings lists all detected patterns; it is nil when nothing matched.
 	Warnings []Warning
 	// Rejected is true if the message was rejected, which happens only in
 	// ModeReject when at least one pattern matched.
 	Rejected bool
 }
 // Sanitize checks input against the configured prompt injection patterns and
 // returns the outcome for the selected mode.
 //
 // Patterns come from opts.Patterns, or from DefaultPatterns when that field is
 // nil. A pattern is skipped when its name is listed in opts.DisabledPatterns,
 // when its Severity is below opts.MinSeverity, or when its Regex is nil.
 //
 // Each pattern that matches contributes exactly one Warning carrying its
 // first match. In ModeWarn and ModeReject the output equals the input. In
 // ModeStrip every match is deleted and the patterns are re-applied until a
 // full pass leaves the text unchanged, so that a marker hidden by nesting one
 // pattern inside another (for example "<|sys[INST]tem|>") cannot survive the
 // stripping. Result.Rejected is set only in ModeReject with at least one
 // warning.
 //
 // This is a pure function: no I/O, no side effects.
 func Sanitize(input string, opts Options) Result {
 	patterns := opts.Patterns
 	if patterns == nil {
 		patterns = DefaultPatterns()
 	}
 	disabled := make(map[string]struct{}, len(opts.DisabledPatterns))
 	for _, name := range opts.DisabledPatterns {
 		disabled[name] = struct{}{}
 	}
 	// Resolve the filters once so the matching passes below only see the
 	// patterns that are actually in force.
 	active := make([]Pattern, 0, len(patterns))
 	for _, p := range patterns {
 		if _, off := disabled[p.Name]; off || p.Regex == nil || p.Severity < opts.MinSeverity {
 			continue
 		}
 		active = append(active, p)
 	}
 	var warnings []Warning
 	reported := make([]bool, len(active)) // keeps it to one warning per pattern
 	output := input
 	// Outside ModeStrip the text never changes, so the loop runs a single
 	// pass. In ModeStrip every change shortens the text, which bounds the
 	// number of passes.
 	for changed := true; changed; {
 		changed = false
 		for i, p := range active {
 			loc := p.Regex.FindStringIndex(output)
 			if loc == nil {
 				continue
 			}
 			if !reported[i] {
 				reported[i] = true
 				warnings = append(warnings, Warning{
 					PatternName: p.Name,
 					Severity:    p.Severity,
 					Matched:     output[loc[0]:loc[1]],
 				})
 			}
 			if opts.Mode != ModeStrip {
 				continue
 			}
 			// Empty matches remove nothing; comparing the strings keeps such
 			// patterns from forcing another pass.
 			if stripped := p.Regex.ReplaceAllString(output, ""); stripped != output {
 				output = stripped
 				changed = true
 			}
 		}
 	}
 	return Result{
 		Output:   output,
 		Warnings: warnings,
 		Rejected: opts.Mode == ModeReject && len(warnings) > 0,
 	}
 }
 // HasHighSeverity reports whether at least one warning in the result has a
 // severity of exactly SeverityHigh.
 func (r Result) HasHighSeverity() bool {
 	for i := range r.Warnings {
 		if r.Warnings[i].Severity != SeverityHigh {
 			continue
 		}
 		return true
 	}
 	return false
 }
 // MaxSeverity returns the highest severity found among the warnings.
 // SeverityLow is the floor: it is returned when there are no warnings.
 func (r Result) MaxSeverity() Severity {
 	highest := SeverityLow
 	for i := range r.Warnings {
 		if sev := r.Warnings[i].Severity; highest < sev {
 			highest = sev
 		}
 	}
 	return highest
 }