unibots/pkg/sanitize/sanitize.go

package sanitize

import "strings"

// Mode selects what the sanitizer does with input that matches a pattern.
// The zero value is ModeWarn.
type Mode int

const (
	// ModeWarn reports warnings but does not modify the message.
	ModeWarn Mode = iota
	// ModeStrip removes matched patterns from the message.
	ModeStrip
	// ModeReject rejects the message entirely if any pattern matches.
	ModeReject
)

// String returns the lowercase name of the mode: "warn", "strip" or
// "reject". Any value outside the declared constants yields "unknown".
func (m Mode) String() string {
	names := [...]string{
		ModeWarn:   "warn",
		ModeStrip:  "strip",
		ModeReject: "reject",
	}
	// Guard both ends: Mode is a signed int, so negative values are possible.
	if m < 0 || int(m) >= len(names) {
		return "unknown"
	}
	return names[m]
}

// ParseMode maps a mode name onto its Mode constant, ignoring letter case.
// "strip" and "reject" select ModeStrip and ModeReject; every other string,
// including "warn" and the empty string, yields ModeWarn. The input is not
// trimmed, so surrounding whitespace makes a name unrecognized.
func ParseMode(s string) Mode {
	name := strings.ToLower(s)
	if name == "strip" {
		return ModeStrip
	}
	if name == "reject" {
		return ModeReject
	}
	return ModeWarn
}

// Options configures a single Sanitize call.
type Options struct {
	// Mode selects how detections are handled.
	Mode Mode
	// MinSeverity is the threshold: only patterns at or above this severity
	// are acted on.
	MinSeverity Severity
	// Patterns lists the patterns to check. Nil selects DefaultPatterns.
	Patterns []Pattern
	// DisabledPatterns lists the names of patterns to skip.
	DisabledPatterns []string
}

// Warning describes one prompt injection pattern that was detected in the
// input.
type Warning struct {
	// PatternName is the name of the pattern that matched.
	PatternName string
	// Severity is the threat level of that pattern.
	Severity Severity
	// Matched is the text that matched; only the first match is kept.
	Matched string
}

// Result is what a Sanitize call returns.
type Result struct {
	// Output is the message, possibly modified.
	Output string
	// Warnings lists all detected patterns.
	Warnings []Warning
	// Rejected is true if the message was rejected, i.e. the mode was
	// ModeReject and a match was found.
	Rejected bool
}

// Sanitize checks input against the configured prompt injection patterns and
// returns a Result shaped by opts.Mode.
//
// Patterns come from opts.Patterns, or from DefaultPatterns when that is nil.
// A pattern is skipped when its name is listed in opts.DisabledPatterns or
// when its severity is below opts.MinSeverity. Every remaining pattern that
// matches contributes exactly one Warning carrying its first match.
//
//   - ModeWarn: Output is the unmodified input.
//   - ModeStrip: all matches are deleted from Output. Deleting a match can
//     splice the surrounding text into a new match (for example "ig" + X +
//     "nore" collapsing to "ignore"), so strip passes repeat until a full
//     pass removes nothing. Patterns that only start matching after an
//     earlier deletion are reported too, after the first-pass warnings.
//   - ModeReject: Rejected is set when at least one pattern matched. Output
//     still holds the unmodified input, so callers must check Rejected
//     before using it.
//
// Sanitize performs no I/O itself and does not modify input or opts.
func Sanitize(input string, opts Options) Result {
	patterns := opts.Patterns
	if patterns == nil {
		patterns = DefaultPatterns()
	}

	disabled := make(map[string]bool, len(opts.DisabledPatterns))
	for _, name := range opts.DisabledPatterns {
		disabled[name] = true
	}

	var warnings []Warning
	output := input

	// reported is tracked per pattern index, not per name, so two patterns
	// that share a name still produce one warning each.
	reported := make([]bool, len(patterns))

	// Only ModeStrip ever sets changed, so the other modes make one pass.
	// Every repeated pass follows a strict decrease in len(output), which
	// bounds the number of passes by len(input) and guarantees termination.
	for changed := true; changed; {
		changed = false

		for i, p := range patterns {
			if disabled[p.Name] || p.Severity < opts.MinSeverity {
				continue
			}

			loc := p.Regex.FindStringIndex(output)
			if loc == nil {
				continue
			}

			if !reported[i] {
				reported[i] = true
				warnings = append(warnings, Warning{
					PatternName: p.Name,
					Severity:    p.Severity,
					Matched:     output[loc[0]:loc[1]],
				})
			}

			if opts.Mode != ModeStrip {
				continue
			}

			stripped := p.Regex.ReplaceAllString(output, "")
			if len(stripped) < len(output) {
				// Text was removed; earlier patterns may match the
				// collapsed result, so schedule another pass.
				changed = true
			}
			output = stripped
		}
	}

	return Result{
		Output:   output,
		Warnings: warnings,
		Rejected: opts.Mode == ModeReject && len(warnings) > 0,
	}
}

// HasHighSeverity reports whether at least one recorded warning has a
// severity equal to SeverityHigh. It returns false when there are no
// warnings.
func (r Result) HasHighSeverity() bool {
	found := false
	// Stop scanning as soon as a high-severity warning is seen.
	for i := 0; i < len(r.Warnings) && !found; i++ {
		found = r.Warnings[i].Severity == SeverityHigh
	}
	return found
}

// MaxSeverity returns the highest severity found among the warnings.
// The scan starts from SeverityLow, so a result without warnings also
// yields SeverityLow.
func (r Result) MaxSeverity() Severity {
	highest := SeverityLow
	for i := range r.Warnings {
		if sev := r.Warnings[i].Severity; highest < sev {
			highest = sev
		}
	}
	return highest
}