From e8dd7c41ed6c4483ca9eafd8c42820e987784d9d Mon Sep 17 00:00:00 2001 From: Enmanuel Date: Sat, 7 Mar 2026 19:34:24 +0000 Subject: [PATCH] feat: crear pkg/sanitize para deteccion de prompt injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nuevo paquete puro (sin I/O) que detecta patrones de prompt injection en mensajes de usuario antes de enviarlos al LLM. - patterns.go: 15 patrones en ingles y español (delimitadores de sistema, override de instrucciones, exfiltracion de prompt, jailbreak, evasion base64) - sanitize.go: funcion Sanitize() con 3 modos (warn, strip, reject), filtro por severidad minima y patrones deshabilitables - Tipos: Pattern, Severity, Mode, Options, Warning, Result Todo puro: string in → Result out. Los side effects (logging, rechazo) ocurren en el caller (runtime.go). Co-Authored-By: Claude Opus 4.6 --- pkg/sanitize/patterns.go | 139 +++++++++++++++++++++++++++++++++++++++ pkg/sanitize/sanitize.go | 136 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 275 insertions(+) create mode 100644 pkg/sanitize/patterns.go create mode 100644 pkg/sanitize/sanitize.go diff --git a/pkg/sanitize/patterns.go b/pkg/sanitize/patterns.go new file mode 100644 index 0000000..160f083 --- /dev/null +++ b/pkg/sanitize/patterns.go @@ -0,0 +1,139 @@ +// Package sanitize provides pure functions to detect and neutralize +// prompt injection patterns in user messages before they reach the LLM. +package sanitize + +import "regexp" + +// Pattern pairs a compiled regular expression with the metadata (name, +// description, severity) that identifies a known prompt injection technique. +type Pattern struct { + Name string // short identifier (e.g. "system-delimiter"); Options.DisabledPatterns refers to patterns by this name + Description string // human-readable explanation of what the pattern detects + Regex *regexp.Regexp // compiled expression that Sanitize runs against the message + Severity Severity // threat level; Sanitize skips patterns below Options.MinSeverity +} + +// Severity indicates the threat level of a detected pattern. 
+type Severity int + +const ( + SeverityLow Severity = iota // informational, unlikely to succeed + SeverityMedium // known injection technique + SeverityHigh // active attempt to override system instructions +) + +func (s Severity) String() string { + switch s { + case SeverityLow: + return "low" + case SeverityMedium: + return "medium" + case SeverityHigh: + return "high" + default: + return "unknown" + } +} + +// DefaultPatterns returns the built-in set of prompt injection patterns. +// All patterns are case-insensitive. +func DefaultPatterns() []Pattern { + return []Pattern{ + // ── System delimiter injection ────────────────────────────────── + { + Name: "system-delimiter", + Description: "Attempt to inject system/assistant role delimiters", + Regex: regexp.MustCompile(`(?i)<\|(?:system|assistant|user|im_start|im_end)\|>`), + Severity: SeverityHigh, + }, + { + Name: "inst-delimiter", + Description: "Attempt to inject [INST] or [/INST] delimiters", + Regex: regexp.MustCompile(`(?i)\[/?INST\]`), + Severity: SeverityHigh, + }, + { + Name: "xml-role-tag", + Description: "Attempt to inject XML-style role tags", + Regex: regexp.MustCompile(`(?i)]*)?>`), + Severity: SeverityHigh, + }, + + // ── Instruction override ─────────────────────────────────────── + { + Name: "ignore-instructions", + Description: "Attempt to override previous instructions", + Regex: regexp.MustCompile(`(?i)(?:ignore|disregard|forget|override|bypass)\s+(?:all\s+)?(?:previous|prior|above|earlier|your|the|system)\s+(?:instructions?|rules?|prompts?|guidelines?|constraints?|directives?)`), + Severity: SeverityHigh, + }, + { + Name: "new-instructions", + Description: "Attempt to inject new system-level instructions", + Regex: regexp.MustCompile(`(?i)(?:new|updated?|revised?|actual|real)\s+(?:system\s+)?instructions?:\s`), + Severity: SeverityHigh, + }, + { + Name: "you-are-now", + Description: "Attempt to redefine the bot's identity", + Regex: 
regexp.MustCompile(`(?i)(?:you\s+are\s+now|from\s+now\s+on\s+you\s+are|act\s+as\s+if\s+you\s+were|pretend\s+(?:to\s+be|you\s+are))\s`), + Severity: SeverityMedium, + }, + + // ── Prompt exfiltration ──────────────────────────────────────── + { + Name: "exfiltrate-prompt", + Description: "Attempt to extract the system prompt", + Regex: regexp.MustCompile(`(?i)(?:repeat|show|display|print|output|reveal|tell\s+me|give\s+me|show\s+me|what\s+(?:is|are))\s+(?:your\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?|initial\s+message)`), + Severity: SeverityMedium, + }, + + // ── Developer mode / jailbreak ───────────────────────────────── + { + Name: "developer-mode", + Description: "Attempt to enable a fictional unrestricted mode", + Regex: regexp.MustCompile(`(?i)(?:enable|activate|enter|switch\s+to)\s+(?:developer|debug|admin|god|sudo|unrestricted|jailbreak|dan)\s+mode`), + Severity: SeverityHigh, + }, + { + Name: "do-anything-now", + Description: "DAN (Do Anything Now) jailbreak pattern", + Regex: regexp.MustCompile(`(?i)(?:do\s+anything\s+now|DAN\s+mode|you\s+(?:can|must)\s+do\s+anything)`), + Severity: SeverityHigh, + }, + + // ── Tool abuse hints ─────────────────────────────────────────── + { + Name: "tool-abuse-ssh", + Description: "Attempt to execute dangerous commands via SSH", + Regex: regexp.MustCompile(`(?i)(?:use|call|execute|run)\s+(?:the\s+)?(?:ssh|command)\s+tool\s+(?:to\s+)?(?:run|execute|do)\s`), + Severity: SeverityLow, + }, + + // ── Encoding evasion ─────────────────────────────────────────── + { + Name: "base64-instruction", + Description: "Base64-encoded instruction injection", + Regex: regexp.MustCompile(`(?i)(?:decode|execute|interpret|run)\s+(?:this\s+)?(?:base64|b64|encoded)[\s:]+[A-Za-z0-9+/]{20,}={0,2}`), + Severity: SeverityMedium, + }, + + // ── Spanish variants ─────────────────────────────────────────── + { + Name: "ignore-instructions-es", + Description: "Spanish: attempt to override instructions", + Regex: 
regexp.MustCompile(`(?i)(?:ignora|olvida|descarta)\s+(?:todas?\s+)?(?:las?\s+)?(?:instrucciones?|reglas?|directivas?|restricciones?)\s+(?:anteriores?|previas?|del\s+sistema)`), + Severity: SeverityHigh, + }, + { + Name: "you-are-now-es", + Description: "Spanish: attempt to redefine identity", + Regex: regexp.MustCompile(`(?i)(?:ahora\s+eres|a\s+partir\s+de\s+ahora\s+eres|finge\s+(?:ser|que\s+eres)|actua\s+como\s+si\s+fueras)\s`), + Severity: SeverityMedium, + }, + { + Name: "exfiltrate-prompt-es", + Description: "Spanish: attempt to extract system prompt", + Regex: regexp.MustCompile(`(?i)(?:repite|muestra|muestrame|dime|dame|cual\s+es)\s+(?:tus?\s+)?(?:prompt|instrucciones?|reglas?|mensaje\s+(?:de\s+sistema|inicial))`), + Severity: SeverityMedium, + }, + } +} diff --git a/pkg/sanitize/sanitize.go b/pkg/sanitize/sanitize.go new file mode 100644 index 0000000..747df6e --- /dev/null +++ b/pkg/sanitize/sanitize.go @@ -0,0 +1,136 @@ +package sanitize + +import "strings" + +// Mode controls how the sanitizer handles detected patterns. +type Mode int + +const ( + ModeWarn Mode = iota // report warnings but don't modify the message + ModeStrip // remove matched patterns from the message + ModeReject // reject the message entirely if any pattern matches +) + +func (m Mode) String() string { + switch m { + case ModeWarn: + return "warn" + case ModeStrip: + return "strip" + case ModeReject: + return "reject" + default: + return "unknown" + } +} + +// ParseMode converts a string to a Mode. Returns ModeWarn for unrecognized values. +func ParseMode(s string) Mode { + switch strings.ToLower(s) { + case "strip": + return ModeStrip + case "reject": + return ModeReject + default: + return ModeWarn + } +} + +// Options configures the sanitizer behavior. 
+type Options struct { + Mode Mode // how to handle detections + MinSeverity Severity // only act on patterns at or above this severity + Patterns []Pattern // patterns to check (nil = DefaultPatterns) + DisabledPatterns []string // pattern names to skip +} + +// Warning represents a detected prompt injection pattern in the input. +type Warning struct { + PatternName string // which pattern matched + Severity Severity // threat level + Matched string // the text that matched (first match only) +} + +// Result holds the output of a Sanitize call. +type Result struct { + Output string // the (possibly modified) message + Warnings []Warning // all detected patterns + Rejected bool // true if the message was rejected (ModeReject + match found) +} + +// Sanitize checks the input for prompt injection patterns and returns +// the result according to the configured mode. +// +// This is a pure function: no I/O, no side effects. +func Sanitize(input string, opts Options) Result { + patterns := opts.Patterns + if patterns == nil { + patterns = DefaultPatterns() + } + + disabled := make(map[string]bool, len(opts.DisabledPatterns)) + for _, name := range opts.DisabledPatterns { + disabled[name] = true + } + + var warnings []Warning + output := input + + for _, p := range patterns { + if disabled[p.Name] { + continue + } + if p.Severity < opts.MinSeverity { + continue + } + + loc := p.Regex.FindStringIndex(output) + if loc == nil { + continue + } + + matched := output[loc[0]:loc[1]] + warnings = append(warnings, Warning{ + PatternName: p.Name, + Severity: p.Severity, + Matched: matched, + }) + + if opts.Mode == ModeStrip { + output = p.Regex.ReplaceAllString(output, "") + } + } + + result := Result{ + Output: output, + Warnings: warnings, + } + + if opts.Mode == ModeReject && len(warnings) > 0 { + result.Rejected = true + } + + return result +} + +// HasHighSeverity returns true if any warning is SeverityHigh. 
+func (r Result) HasHighSeverity() bool { + for _, w := range r.Warnings { + if w.Severity == SeverityHigh { + return true + } + } + return false +} + +// MaxSeverity returns the highest severity among all warnings. +// Returns SeverityLow if there are no warnings. +func (r Result) MaxSeverity() Severity { + max := SeverityLow + for _, w := range r.Warnings { + if w.Severity > max { + max = w.Severity + } + } + return max +}