130 lines
3.3 KiB
Go
130 lines
3.3 KiB
Go
package main
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// sanitizeFTS5 takes free-form input from an LLM and produces a query the
|
|
// SQLite FTS5 parser will accept. Rules from CLAUDE.md:
|
|
//
|
|
// - After `column:` the value must be a single ASCII alnum/underscore token.
|
|
// Any other char (`-`, `.`, `:`, space) breaks the parser.
|
|
// - Multi-word values must be wrapped in double quotes.
|
|
//
|
|
// Strategy: if the caller already wrote `column:value`, quote `value` if it
|
|
// contains anything but `[A-Za-z0-9_]`. Otherwise treat the whole input as a
|
|
// free-text phrase and split on whitespace, quoting tokens that need it.
|
|
//
|
|
// Returns a query suitable to pass to FTS5 MATCH. Empty input returns "".
|
|
func sanitizeFTS5(q string) string {
|
|
q = strings.TrimSpace(q)
|
|
if q == "" {
|
|
return ""
|
|
}
|
|
// If the query contains FTS5 operators we leave it alone except for token
|
|
// quoting per `column:` clauses. This is a heuristic — power users can
|
|
// craft their own queries.
|
|
if hasOperator(q) {
|
|
return quoteColumnClauses(q)
|
|
}
|
|
// Free text: split, quote each token if needed, join with implicit AND.
|
|
parts := strings.Fields(q)
|
|
for i, p := range parts {
|
|
parts[i] = ftsQuote(p)
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
// hasOperator reports whether q looks like it already uses FTS5 query syntax,
// in which case the caller leaves it mostly untouched instead of quoting it
// word by word.
//
// It returns true when q contains any of:
//   - a colon (possible `column:value` filter);
//   - one of the keywords AND, OR, NOT surrounded by spaces, or ` NEAR(`;
//   - a `*`, `(` or `"` character.
//
// Keyword matching is case-sensitive on purpose: FTS5 only recognizes the
// upper-case spellings as operators, so natural-language input such as
// "rock-n-roll and jazz" must not be mistaken for an operator query (it would
// then skip token quoting and reach the parser with a bare `-`).
func hasOperator(q string) bool {
	if strings.Contains(q, ":") {
		return true
	}
	for _, kw := range []string{" OR ", " AND ", " NEAR(", " NOT "} {
		if strings.Contains(q, kw) {
			return true
		}
	}
	return strings.ContainsAny(q, "*(\"")
}
|
|
|
|
// quoteColumnClauses scans a query and ensures any `column:value` clause has
|
|
// a quoted value when value contains non-alnum chars.
|
|
func quoteColumnClauses(q string) string {
|
|
var b strings.Builder
|
|
tokens := tokenize(q)
|
|
for i, t := range tokens {
|
|
if i > 0 {
|
|
b.WriteByte(' ')
|
|
}
|
|
colon := strings.IndexByte(t, ':')
|
|
if colon == -1 || colon == len(t)-1 {
|
|
b.WriteString(t)
|
|
continue
|
|
}
|
|
head := t[:colon+1]
|
|
val := t[colon+1:]
|
|
// Already quoted or starts with paren/star — leave alone.
|
|
if strings.HasPrefix(val, "\"") || strings.HasPrefix(val, "(") {
|
|
b.WriteString(t)
|
|
continue
|
|
}
|
|
// Strip trailing star for prefix queries to assess the body.
|
|
body := strings.TrimSuffix(val, "*")
|
|
if isFTSSafeToken(body) {
|
|
b.WriteString(t)
|
|
continue
|
|
}
|
|
b.WriteString(head)
|
|
b.WriteString(ftsQuote(val))
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
// tokenize breaks q into whitespace-separated tokens, except that whitespace
// between a pair of double quotes does not split: a quoted stretch stays
// inside the token it belongs to, quotes included. Every `"` toggles the
// in-quote state, so an unterminated quote swallows the rest of the input
// into the final token. Empty input yields a nil slice.
func tokenize(q string) []string {
	var (
		tokens  []string
		pending strings.Builder
		quoted  bool
	)

	// flush emits the token collected so far, if there is one.
	flush := func() {
		if pending.Len() == 0 {
			return
		}
		tokens = append(tokens, pending.String())
		pending.Reset()
	}

	for _, ch := range q {
		if ch == '"' {
			quoted = !quoted
			pending.WriteRune(ch)
			continue
		}
		if !quoted && unicode.IsSpace(ch) {
			flush()
			continue
		}
		pending.WriteRune(ch)
	}
	flush()

	return tokens
}
|
|
|
|
// isFTSSafeToken reports whether s can be written into an FTS5 query as a
// bare (unquoted) token.
//
// s is safe when it is non-empty, consists only of ASCII letters, digits and
// underscores, and is not one of the exact upper-case spellings AND, OR, NOT.
// Those three are lexed by FTS5 as operators rather than as terms, so a bare
// occurrence used as a search word (e.g. a leading "NOT" in free text, or
// `col:OR`) would cause a syntax error; reporting them as unsafe makes callers
// quote them. Other casings ("and", "Not") are ordinary terms and stay safe.
func isFTSSafeToken(s string) bool {
	switch s {
	case "", "AND", "OR", "NOT":
		return false
	}
	for _, r := range s {
		isAlnum := (r >= '0' && r <= '9') || (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		if r != '_' && !isAlnum {
			return false
		}
	}
	return true
}
|
|
|
|
// ftsQuote wraps a token in double quotes for FTS5, escaping inner quotes.
|
|
func ftsQuote(s string) string {
|
|
if isFTSSafeToken(s) {
|
|
return s
|
|
}
|
|
return "\"" + strings.ReplaceAll(s, "\"", "\"\"") + "\""
|
|
}
|