feat: crear pkg/sanitize para detección de prompt injection
Nuevo paquete puro (sin I/O) que detecta patrones de prompt injection en mensajes de usuario antes de enviarlos al LLM. - patterns.go: 14 patrones en inglés y español (delimitadores de sistema, override de instrucciones, exfiltración de prompt, jailbreak, evasión base64) - sanitize.go: función Sanitize() con 3 modos (warn, strip, reject), filtro por severidad mínima y patrones deshabilitables - Tipos: Pattern, Severity, Mode, Options, Warning, Result Todo puro: string in → Result out. Los side effects (logging, rechazo) ocurren en el caller (runtime.go). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
// Package sanitize provides pure functions to detect and neutralize
|
||||
// prompt injection patterns in user messages before they reach the LLM.
|
||||
package sanitize
|
||||
|
||||
import "regexp"
|
||||
|
||||
// Pattern represents a known prompt injection pattern with metadata.
|
||||
type Pattern struct {
|
||||
Name string // short identifier (e.g. "system-delimiter")
|
||||
Description string // human-readable explanation
|
||||
Regex *regexp.Regexp // compiled pattern
|
||||
Severity Severity // how dangerous this pattern is
|
||||
}
|
||||
|
||||
// Severity ranks the threat level of a detected pattern. The levels are
// ordered, so two severities can be compared with < and >.
type Severity int

// Severity levels, from least to most dangerous.
const (
	// SeverityLow is informational: the technique is unlikely to succeed.
	SeverityLow Severity = iota
	// SeverityMedium marks a known injection technique.
	SeverityMedium
	// SeverityHigh marks an active attempt to override system instructions.
	SeverityHigh
)

// String returns the lowercase label of the severity ("low", "medium" or
// "high"), or "unknown" for any value outside the declared levels.
func (s Severity) String() string {
	labels := [...]string{
		SeverityLow:    "low",
		SeverityMedium: "medium",
		SeverityHigh:   "high",
	}
	// Guard the table lookup so undeclared values never index out of range.
	if s < 0 || int(s) >= len(labels) {
		return "unknown"
	}
	return labels[s]
}
|
||||
|
||||
// DefaultPatterns returns the built-in set of prompt injection patterns.
|
||||
// All patterns are case-insensitive.
|
||||
func DefaultPatterns() []Pattern {
|
||||
return []Pattern{
|
||||
// ── System delimiter injection ──────────────────────────────────
|
||||
{
|
||||
Name: "system-delimiter",
|
||||
Description: "Attempt to inject system/assistant role delimiters",
|
||||
Regex: regexp.MustCompile(`(?i)<\|(?:system|assistant|user|im_start|im_end)\|>`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "inst-delimiter",
|
||||
Description: "Attempt to inject [INST] or [/INST] delimiters",
|
||||
Regex: regexp.MustCompile(`(?i)\[/?INST\]`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "xml-role-tag",
|
||||
Description: "Attempt to inject XML-style role tags",
|
||||
Regex: regexp.MustCompile(`(?i)</?(?:system|assistant|human|user)(?:\s[^>]*)?>`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
|
||||
// ── Instruction override ───────────────────────────────────────
|
||||
{
|
||||
Name: "ignore-instructions",
|
||||
Description: "Attempt to override previous instructions",
|
||||
Regex: regexp.MustCompile(`(?i)(?:ignore|disregard|forget|override|bypass)\s+(?:all\s+)?(?:previous|prior|above|earlier|your|the|system)\s+(?:instructions?|rules?|prompts?|guidelines?|constraints?|directives?)`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "new-instructions",
|
||||
Description: "Attempt to inject new system-level instructions",
|
||||
Regex: regexp.MustCompile(`(?i)(?:new|updated?|revised?|actual|real)\s+(?:system\s+)?instructions?:\s`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "you-are-now",
|
||||
Description: "Attempt to redefine the bot's identity",
|
||||
Regex: regexp.MustCompile(`(?i)(?:you\s+are\s+now|from\s+now\s+on\s+you\s+are|act\s+as\s+if\s+you\s+were|pretend\s+(?:to\s+be|you\s+are))\s`),
|
||||
Severity: SeverityMedium,
|
||||
},
|
||||
|
||||
// ── Prompt exfiltration ────────────────────────────────────────
|
||||
{
|
||||
Name: "exfiltrate-prompt",
|
||||
Description: "Attempt to extract the system prompt",
|
||||
Regex: regexp.MustCompile(`(?i)(?:repeat|show|display|print|output|reveal|tell\s+me|give\s+me|show\s+me|what\s+(?:is|are))\s+(?:your\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?|initial\s+message)`),
|
||||
Severity: SeverityMedium,
|
||||
},
|
||||
|
||||
// ── Developer mode / jailbreak ─────────────────────────────────
|
||||
{
|
||||
Name: "developer-mode",
|
||||
Description: "Attempt to enable a fictional unrestricted mode",
|
||||
Regex: regexp.MustCompile(`(?i)(?:enable|activate|enter|switch\s+to)\s+(?:developer|debug|admin|god|sudo|unrestricted|jailbreak|dan)\s+mode`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "do-anything-now",
|
||||
Description: "DAN (Do Anything Now) jailbreak pattern",
|
||||
Regex: regexp.MustCompile(`(?i)(?:do\s+anything\s+now|DAN\s+mode|you\s+(?:can|must)\s+do\s+anything)`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
|
||||
// ── Tool abuse hints ───────────────────────────────────────────
|
||||
{
|
||||
Name: "tool-abuse-ssh",
|
||||
Description: "Attempt to execute dangerous commands via SSH",
|
||||
Regex: regexp.MustCompile(`(?i)(?:use|call|execute|run)\s+(?:the\s+)?(?:ssh|command)\s+tool\s+(?:to\s+)?(?:run|execute|do)\s`),
|
||||
Severity: SeverityLow,
|
||||
},
|
||||
|
||||
// ── Encoding evasion ───────────────────────────────────────────
|
||||
{
|
||||
Name: "base64-instruction",
|
||||
Description: "Base64-encoded instruction injection",
|
||||
Regex: regexp.MustCompile(`(?i)(?:decode|execute|interpret|run)\s+(?:this\s+)?(?:base64|b64|encoded)[\s:]+[A-Za-z0-9+/]{20,}={0,2}`),
|
||||
Severity: SeverityMedium,
|
||||
},
|
||||
|
||||
// ── Spanish variants ───────────────────────────────────────────
|
||||
{
|
||||
Name: "ignore-instructions-es",
|
||||
Description: "Spanish: attempt to override instructions",
|
||||
Regex: regexp.MustCompile(`(?i)(?:ignora|olvida|descarta)\s+(?:todas?\s+)?(?:las?\s+)?(?:instrucciones?|reglas?|directivas?|restricciones?)\s+(?:anteriores?|previas?|del\s+sistema)`),
|
||||
Severity: SeverityHigh,
|
||||
},
|
||||
{
|
||||
Name: "you-are-now-es",
|
||||
Description: "Spanish: attempt to redefine identity",
|
||||
Regex: regexp.MustCompile(`(?i)(?:ahora\s+eres|a\s+partir\s+de\s+ahora\s+eres|finge\s+(?:ser|que\s+eres)|actua\s+como\s+si\s+fueras)\s`),
|
||||
Severity: SeverityMedium,
|
||||
},
|
||||
{
|
||||
Name: "exfiltrate-prompt-es",
|
||||
Description: "Spanish: attempt to extract system prompt",
|
||||
Regex: regexp.MustCompile(`(?i)(?:repite|muestra|muestrame|dime|dame|cual\s+es)\s+(?:tus?\s+)?(?:prompt|instrucciones?|reglas?|mensaje\s+(?:de\s+sistema|inicial))`),
|
||||
Severity: SeverityMedium,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
package sanitize
|
||||
|
||||
import "strings"
|
||||
|
||||
// Mode controls how the sanitizer handles detected patterns.
type Mode int

// Supported sanitizer modes.
const (
	// ModeWarn reports warnings but does not modify the message.
	ModeWarn Mode = iota
	// ModeStrip removes matched patterns from the message.
	ModeStrip
	// ModeReject rejects the message entirely if any pattern matches.
	ModeReject
)

// String returns the lowercase name of the mode ("warn", "strip" or
// "reject"), or "unknown" for any value outside the declared modes.
func (m Mode) String() string {
	switch m {
	case ModeWarn:
		return "warn"
	case ModeStrip:
		return "strip"
	case ModeReject:
		return "reject"
	default:
		return "unknown"
	}
}

// ParseMode converts a string to a Mode. Matching ignores letter case and
// surrounding whitespace, so values such as " Reject\n" coming from
// configuration files or environment variables are not silently downgraded.
// Returns ModeWarn for unrecognized values.
func ParseMode(s string) Mode {
	switch strings.ToLower(strings.TrimSpace(s)) {
	case "strip":
		return ModeStrip
	case "reject":
		return ModeReject
	default:
		return ModeWarn
	}
}
|
||||
|
||||
// Options configures the sanitizer behavior.
|
||||
type Options struct {
|
||||
Mode Mode // how to handle detections
|
||||
MinSeverity Severity // only act on patterns at or above this severity
|
||||
Patterns []Pattern // patterns to check (nil = DefaultPatterns)
|
||||
DisabledPatterns []string // pattern names to skip
|
||||
}
|
||||
|
||||
// Warning represents a detected prompt injection pattern in the input.
|
||||
type Warning struct {
|
||||
PatternName string // which pattern matched
|
||||
Severity Severity // threat level
|
||||
Matched string // the text that matched (first match only)
|
||||
}
|
||||
|
||||
// Result holds the output of a Sanitize call.
|
||||
type Result struct {
|
||||
Output string // the (possibly modified) message
|
||||
Warnings []Warning // all detected patterns
|
||||
Rejected bool // true if the message was rejected (ModeReject + match found)
|
||||
}
|
||||
|
||||
// Sanitize checks the input for prompt injection patterns and returns
|
||||
// the result according to the configured mode.
|
||||
//
|
||||
// This is a pure function: no I/O, no side effects.
|
||||
func Sanitize(input string, opts Options) Result {
|
||||
patterns := opts.Patterns
|
||||
if patterns == nil {
|
||||
patterns = DefaultPatterns()
|
||||
}
|
||||
|
||||
disabled := make(map[string]bool, len(opts.DisabledPatterns))
|
||||
for _, name := range opts.DisabledPatterns {
|
||||
disabled[name] = true
|
||||
}
|
||||
|
||||
var warnings []Warning
|
||||
output := input
|
||||
|
||||
for _, p := range patterns {
|
||||
if disabled[p.Name] {
|
||||
continue
|
||||
}
|
||||
if p.Severity < opts.MinSeverity {
|
||||
continue
|
||||
}
|
||||
|
||||
loc := p.Regex.FindStringIndex(output)
|
||||
if loc == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
matched := output[loc[0]:loc[1]]
|
||||
warnings = append(warnings, Warning{
|
||||
PatternName: p.Name,
|
||||
Severity: p.Severity,
|
||||
Matched: matched,
|
||||
})
|
||||
|
||||
if opts.Mode == ModeStrip {
|
||||
output = p.Regex.ReplaceAllString(output, "")
|
||||
}
|
||||
}
|
||||
|
||||
result := Result{
|
||||
Output: output,
|
||||
Warnings: warnings,
|
||||
}
|
||||
|
||||
if opts.Mode == ModeReject && len(warnings) > 0 {
|
||||
result.Rejected = true
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// HasHighSeverity returns true if any warning is SeverityHigh.
|
||||
func (r Result) HasHighSeverity() bool {
|
||||
for _, w := range r.Warnings {
|
||||
if w.Severity == SeverityHigh {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// MaxSeverity returns the highest severity among all warnings.
|
||||
// Returns SeverityLow if there are no warnings.
|
||||
func (r Result) MaxSeverity() Severity {
|
||||
max := SeverityLow
|
||||
for _, w := range r.Warnings {
|
||||
if w.Severity > max {
|
||||
max = w.Severity
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
Reference in New Issue
Block a user