Files
2026-05-09 13:29:32 +02:00

130 lines
3.3 KiB
Go

package main
import (
"strings"
"unicode"
)
// sanitizeFTS5 turns free-form text (typically produced by an LLM) into a
// string that can be handed to SQLite FTS5 MATCH. The rules, per CLAUDE.md:
//
//   - The value following `column:` must be one ASCII alnum/underscore token;
//     characters such as `-`, `.`, `:` or a space break the parser.
//   - Values made of several words must be enclosed in double quotes.
//
// Input that looks like it already uses FTS5 syntax (see hasOperator) is
// passed through quoteColumnClauses, which only repairs `column:value`
// clauses. Everything else is treated as plain text: it is split on
// whitespace, each word is quoted when necessary, and the words are joined
// with single spaces (an implicit AND in FTS5).
//
// Blank or whitespace-only input yields "".
func sanitizeFTS5(q string) string {
	trimmed := strings.TrimSpace(q)
	switch {
	case trimmed == "":
		return ""
	case hasOperator(trimmed):
		// Heuristic: trust hand-written queries apart from fixing up the
		// values of column filters.
		return quoteColumnClauses(trimmed)
	}

	// Plain text path: quote word by word.
	words := strings.Fields(trimmed)
	quoted := make([]string, 0, len(words))
	for _, w := range words {
		quoted = append(quoted, ftsQuote(w))
	}
	return strings.Join(quoted, " ")
}
// hasOperator reports whether q looks like a hand-written FTS5 query rather
// than plain free text. It returns true when q contains
//
//   - one of the characters `:`, `*`, `(` or `"`, or
//   - one of the keywords AND, OR, NOT with a space on each side.
//
// Keywords are matched case-sensitively. FTS5 only treats the upper-case
// spellings as operators; lower-case "and"/"or"/"not" are ordinary barewords.
// Matching them case-insensitively would route plain sentences such as
// "foo-bar and baz" down the operator path, where free-text tokens are not
// quoted and the resulting query fails to parse.
//
// NEAR needs no separate check: it is always written as `NEAR(`, which the
// `(` test already catches.
func hasOperator(q string) bool {
	// Punctuation that only shows up in explicit FTS5 syntax.
	if strings.ContainsAny(q, ":*(\"") {
		return true
	}
	for _, kw := range [...]string{" AND ", " OR ", " NOT "} {
		if strings.Contains(q, kw) {
			return true
		}
	}
	return false
}
// quoteColumnClauses splits q into tokens (see tokenize), repairs every
// `column:value` token whose value is not a plain `[A-Za-z0-9_]` word by
// wrapping the value in double quotes, and joins the tokens back together
// with single spaces. Tokens that are not column clauses are copied through
// unchanged.
func quoteColumnClauses(q string) string {
	var b strings.Builder
	for i, tok := range tokenize(q) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(quoteColumnClause(tok))
	}
	return b.String()
}

// quoteColumnClause returns tok with the value of a `column:value` clause
// quoted when needed; any other token is returned as is.
func quoteColumnClause(tok string) string {
	colon := strings.IndexByte(tok, ':')
	if colon == -1 || colon == len(tok)-1 {
		// No colon, or nothing after it: not a column clause.
		return tok
	}
	// A double quote ahead of the colon means the colon belongs to a quoted
	// phrase (e.g. `"a:b"`), not to a column filter. Splitting there would
	// tear the phrase apart, so leave the token alone.
	if quote := strings.IndexByte(tok, '"'); quote != -1 && quote < colon {
		return tok
	}

	head, val := tok[:colon+1], tok[colon+1:]
	// Value is already quoted or is a parenthesised group: leave alone.
	if strings.HasPrefix(val, "\"") || strings.HasPrefix(val, "(") {
		return tok
	}

	// A trailing `*` marks a prefix query; judge the value without it.
	body := strings.TrimSuffix(val, "*")
	if isFTSSafeToken(body) {
		return tok
	}
	if isPrefix := len(body) != len(val); isPrefix && body != "" {
		// Keep the `*` outside the quotes. Inside them it would be part of
		// the phrase text and no longer request a prefix match.
		return head + ftsQuote(body) + "*"
	}
	return head + ftsQuote(val)
}
// tokenize breaks q into whitespace-separated tokens. A double quote toggles
// "inside quotes" mode, during which whitespace does not end the token, so a
// quoted phrase (quotes included) stays in one piece together with any text
// attached to it. An unterminated quote extends to the end of q. The result
// is nil when q holds no tokens.
func tokenize(q string) []string {
	var (
		tokens   []string
		pending  strings.Builder
		inQuotes bool
	)

	// flush emits the token collected so far, if there is one.
	flush := func() {
		if pending.Len() == 0 {
			return
		}
		tokens = append(tokens, pending.String())
		pending.Reset()
	}

	for _, r := range q {
		if r == '"' {
			inQuotes = !inQuotes
		}
		if !inQuotes && unicode.IsSpace(r) {
			// Separator outside quotes: close the current token.
			flush()
			continue
		}
		pending.WriteRune(r)
	}
	flush()
	return tokens
}
// isFTSSafeToken reports whether s is non-empty and made up solely of ASCII
// letters, ASCII digits and underscores.
func isFTSSafeToken(s string) bool {
	if len(s) == 0 {
		return false
	}
	// Every accepted character is ASCII, so a byte-wise scan is sufficient:
	// any byte of a multi-byte sequence is >= 0x80 and is rejected.
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '_':
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		default:
			return false
		}
	}
	return true
}
// ftsQuote returns s unchanged when it is already a safe bare token (see
// isFTSSafeToken). Otherwise it returns s enclosed in double quotes, with
// each embedded double quote doubled.
func ftsQuote(s string) string {
	if !isFTSSafeToken(s) {
		escaped := strings.ReplaceAll(s, "\"", "\"\"")
		return "\"" + escaped + "\""
	}
	return s
}