e8dd7c41ed
Nuevo paquete puro (sin I/O) que detecta patrones de prompt injection en mensajes de usuario antes de enviarlos al LLM. - patterns.go: 15 patrones en inglés y español (delimitadores de sistema, override de instrucciones, exfiltración de prompt, jailbreak, evasión base64) - sanitize.go: función Sanitize() con 3 modos (warn, strip, reject), filtro por severidad mínima y patrones deshabilitables - Tipos: Pattern, Severity, Mode, Options, Warning, Result. Todo puro: string in → Result out. Los side effects (logging, rechazo) ocurren en el caller (runtime.go). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
137 lines
3.1 KiB
Go
137 lines
3.1 KiB
Go
package sanitize
|
|
|
|
import "strings"
|
|
|
|
// Mode selects what the sanitizer does once a pattern has been detected.
type Mode int

// Supported modes. The zero value of Mode is ModeWarn.
const (
	// ModeWarn reports warnings but leaves the message unmodified.
	ModeWarn Mode = iota
	// ModeStrip removes the matched patterns from the message.
	ModeStrip
	// ModeReject rejects the message entirely if any pattern matches.
	ModeReject
)
|
|
|
|
func (m Mode) String() string {
|
|
switch m {
|
|
case ModeWarn:
|
|
return "warn"
|
|
case ModeStrip:
|
|
return "strip"
|
|
case ModeReject:
|
|
return "reject"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
// ParseMode converts a string to a Mode. Returns ModeWarn for unrecognized values.
|
|
func ParseMode(s string) Mode {
|
|
switch strings.ToLower(s) {
|
|
case "strip":
|
|
return ModeStrip
|
|
case "reject":
|
|
return ModeReject
|
|
default:
|
|
return ModeWarn
|
|
}
|
|
}
|
|
|
|
// Options configures the sanitizer behavior.
|
|
type Options struct {
|
|
Mode Mode // how to handle detections
|
|
MinSeverity Severity // only act on patterns at or above this severity
|
|
Patterns []Pattern // patterns to check (nil = DefaultPatterns)
|
|
DisabledPatterns []string // pattern names to skip
|
|
}
|
|
|
|
// Warning represents a detected prompt injection pattern in the input.
|
|
type Warning struct {
|
|
PatternName string // which pattern matched
|
|
Severity Severity // threat level
|
|
Matched string // the text that matched (first match only)
|
|
}
|
|
|
|
// Result holds the output of a Sanitize call.
|
|
type Result struct {
|
|
Output string // the (possibly modified) message
|
|
Warnings []Warning // all detected patterns
|
|
Rejected bool // true if the message was rejected (ModeReject + match found)
|
|
}
|
|
|
|
// Sanitize checks the input for prompt injection patterns and returns
|
|
// the result according to the configured mode.
|
|
//
|
|
// This is a pure function: no I/O, no side effects.
|
|
func Sanitize(input string, opts Options) Result {
|
|
patterns := opts.Patterns
|
|
if patterns == nil {
|
|
patterns = DefaultPatterns()
|
|
}
|
|
|
|
disabled := make(map[string]bool, len(opts.DisabledPatterns))
|
|
for _, name := range opts.DisabledPatterns {
|
|
disabled[name] = true
|
|
}
|
|
|
|
var warnings []Warning
|
|
output := input
|
|
|
|
for _, p := range patterns {
|
|
if disabled[p.Name] {
|
|
continue
|
|
}
|
|
if p.Severity < opts.MinSeverity {
|
|
continue
|
|
}
|
|
|
|
loc := p.Regex.FindStringIndex(output)
|
|
if loc == nil {
|
|
continue
|
|
}
|
|
|
|
matched := output[loc[0]:loc[1]]
|
|
warnings = append(warnings, Warning{
|
|
PatternName: p.Name,
|
|
Severity: p.Severity,
|
|
Matched: matched,
|
|
})
|
|
|
|
if opts.Mode == ModeStrip {
|
|
output = p.Regex.ReplaceAllString(output, "")
|
|
}
|
|
}
|
|
|
|
result := Result{
|
|
Output: output,
|
|
Warnings: warnings,
|
|
}
|
|
|
|
if opts.Mode == ModeReject && len(warnings) > 0 {
|
|
result.Rejected = true
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// HasHighSeverity returns true if any warning is SeverityHigh.
|
|
func (r Result) HasHighSeverity() bool {
|
|
for _, w := range r.Warnings {
|
|
if w.Severity == SeverityHigh {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// MaxSeverity returns the highest severity among all warnings.
|
|
// Returns SeverityLow if there are no warnings.
|
|
func (r Result) MaxSeverity() Severity {
|
|
max := SeverityLow
|
|
for _, w := range r.Warnings {
|
|
if w.Severity > max {
|
|
max = w.Severity
|
|
}
|
|
}
|
|
return max
|
|
}
|