ca1bf5a59b
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
554 lines
16 KiB
Go
554 lines
16 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// matchResult describes one ranked candidate function.
//
// ID, Score, RawScore, Signature and Snippet are serialized to JSON; the
// remaining fields carry the `json:"-"` tag and therefore never appear in
// the encoded output.
type matchResult struct {
	// ID identifies the matched function.
	ID string `json:"id"`

	// Score is the normalized score: the best hit is 1.0 and the others
	// are expressed relative to it.
	Score float64 `json:"score"`

	// RawScore is the absolute score before normalization. Use it for
	// confidence gates, since Score is always 1.0 for the top hit.
	RawScore float64 `json:"raw_score"`

	// Signature is the function signature shown to the user.
	Signature string `json:"signature"`

	// Snippet is a shortened form of the function description.
	Snippet string `json:"snippet"`

	// Lang is the language recorded for the function (not serialized).
	Lang string `json:"-"`

	// Name is the function name (not serialized).
	Name string `json:"-"`

	// Tags holds the raw tags column value (not serialized).
	Tags string `json:"-"`

	// HighConfidence is meant to be filled after ranking (not serialized).
	// NOTE(review): nothing in the visible code assigns it — confirm it is
	// still needed.
	HighConfidence bool `json:"-"`
}
|
|
|
|
// matchOutput is the JSON envelope returned by fn match.
|
|
type matchOutput struct {
|
|
Query string `json:"query"`
|
|
Top []matchResult `json:"top"`
|
|
HighConfidence bool `json:"high_confidence"`
|
|
}
|
|
|
|
// fts5Row is one candidate row handed to the scorer: the function's
// metadata columns plus the bm25() rank obtained from the FTS query.
type fts5Row struct {
	id          string  // function identifier
	name        string  // function name
	lang        string  // language label of the function
	signature   string  // function signature
	description string  // free-text description
	tags        string  // raw tags column value
	rank        float64 // bm25() value for this row
}
|
|
|
|
// --- tokenizer ---------------------------------------------------------
|
|
|
|
// Patterns used by the tokenizer, compiled once at package initialization.
var (
	// reNonAlnum matches a run of characters outside [a-zA-Z0-9]; words are
	// split on it.
	reNonAlnum = regexp.MustCompile(`[^a-zA-Z0-9]+`)

	// reFlag matches a word that starts with one or two dashes followed by
	// an ASCII letter, e.g. -v or --port.
	reFlag = regexp.MustCompile(`^-{1,2}[a-zA-Z]`)

	// reAbsPath matches the start of an absolute path: a leading slash, a
	// drive letter followed by `:\`, or a leading double backslash.
	reAbsPath = regexp.MustCompile(`^(/|[A-Za-z]:\\|\\\\)`)

	// rePureNumber matches a string made only of decimal digits.
	rePureNumber = regexp.MustCompile(`^\d+$`)
)
|
|
|
|
// domainStopwords lists tokens the tokenizer throws away because they are
// too generic in this codebase to discriminate between functions: they match
// hundreds of entries equally and only add noise.
//
// The grouping below is purely for readability; membership is all that
// matters at runtime.
var domainStopwords = map[string]bool{
	// Registry vocabulary and environment-specific names.
	"registry": true, "function": true, "functions": true, "app": true, "apps": true,
	"desktop": true, "lucas": true, "windows": true, "linux": true,

	// Generic verbs.
	"get": true, "set": true, "run": true, "list": true, "add": true, "use": true,
	"help": true, "read": true, "open": true, "close": true, "stop": true,
	"start": true, "end": true, "begin": true, "done": true, "make": true,
	"build": true, "check": true, "scan": true, "load": true, "save": true,
	"send": true, "recv": true, "show": true, "print": true, "write": true,
	"create": true, "update": true, "delete": true, "remove": true,

	// Filler words.
	"new": true, "all": true, "the": true, "and": true, "for": true,

	// Keywords and ubiquitous identifiers.
	"fmt": true, "log": true, "err": true, "nil": true, "true": true, "false": true,
	"var": true, "val": true, "str": true, "map": true, "key": true, "obj": true,
	"ctx": true, "pkg": true, "mod": true, "cmd": true, "cli": true,
	"name": true, "type": true, "data": true, "info": true, "init": true,
	"main": true, "test": true, "util": true, "base": true, "core": true,

	// Filesystem locations and path fragments.
	"file": true, "files": true, "tmp": true, "out": true, "src": true, "dst": true, "opt": true,
	"usr": true, "etc": true, "bin": true, "lib": true, "mnt": true,
	"home": true, "root": true, "host": true, "user": true, "path": true, "dir": true,

	// Protocols and file formats.
	"api": true, "url": true, "uri": true, "http": true, "html": true,
	"json": true, "yaml": true, "toml": true, "conf": true, "config": true,
}
|
|
|
|
// tokenize splits a shell command into significant lowercase tokens.
|
|
// It discards flags, absolute paths (keeping basenames), pure numbers,
|
|
// and short tokens (< 3 chars).
|
|
func tokenize(cmd string) []string {
|
|
// Replace common shell operators with spaces so they act as separators
|
|
cmd = strings.NewReplacer("|", " ", ";", " ", "&&", " ", "||", " ",
|
|
"(", " ", ")", " ", "{", " ", "}", " ").Replace(cmd)
|
|
|
|
parts := strings.Fields(cmd)
|
|
seen := map[string]bool{}
|
|
var tokens []string
|
|
|
|
for _, p := range parts {
|
|
// Skip flags like -v, --port, /F, /IM
|
|
if reFlag.MatchString(p) || (len(p) > 1 && p[0] == '/') {
|
|
continue
|
|
}
|
|
// Handle paths: keep only basename without extension
|
|
if reAbsPath.MatchString(p) || strings.ContainsAny(p, "/\\") {
|
|
p = filepath.Base(p)
|
|
ext := filepath.Ext(p)
|
|
if ext != "" {
|
|
p = strings.TrimSuffix(p, ext)
|
|
// also add ext without dot
|
|
extTok := strings.ToLower(strings.TrimPrefix(ext, "."))
|
|
if len(extTok) >= 3 && !seen[extTok] {
|
|
seen[extTok] = true
|
|
tokens = append(tokens, extTok)
|
|
}
|
|
}
|
|
}
|
|
// Split remaining by non-alphanumeric chars
|
|
subparts := reNonAlnum.Split(p, -1)
|
|
for _, sp := range subparts {
|
|
tok := strings.ToLower(sp)
|
|
if len(tok) < 3 {
|
|
continue
|
|
}
|
|
if rePureNumber.MatchString(tok) {
|
|
continue
|
|
}
|
|
if seen[tok] {
|
|
continue
|
|
}
|
|
if domainStopwords[tok] {
|
|
continue
|
|
}
|
|
seen[tok] = true
|
|
tokens = append(tokens, tok)
|
|
}
|
|
}
|
|
return tokens
|
|
}
|
|
|
|
// buildFTSQuery joins tokens into an FTS5 MATCH expression of the form
// `a OR b OR c`.
//
// A token is emitted verbatim only when it is safe as an FTS5 bareword: it
// consists solely of ASCII letters, ASCII digits, underscores and non-ASCII
// characters, and it is not one of the case-sensitive words AND, OR, NOT or
// NEAR. Any other token is wrapped in double quotes, with embedded double
// quotes doubled, so that FTS5 parses it as a string instead of as query
// syntax. Empty tokens are skipped.
//
// It returns "" when no usable token remains.
func buildFTSQuery(tokens []string) string {
	// notBareword reports whether r may not appear in an unquoted token.
	notBareword := func(r rune) bool {
		switch {
		case r >= 0x80, r == '_':
			return false
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return false
		}
		return true
	}

	parts := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if tok == "" {
			// An empty operand would produce a dangling OR.
			continue
		}

		quote := strings.IndexFunc(tok, notBareword) >= 0
		switch tok {
		case "AND", "OR", "NOT", "NEAR":
			// Unquoted, these would be read as operators rather than terms.
			quote = true
		}

		if quote {
			tok = `"` + strings.ReplaceAll(tok, `"`, `""`) + `"`
		}
		parts = append(parts, tok)
	}
	return strings.Join(parts, " OR ")
}
|
|
|
|
// --- language penalty heuristics ---------------------------------------
|
|
|
|
// pythonMarkers is the set of query tokens taken as a strong hint that the
// query text is Python code.
var pythonMarkers = map[string]bool{
	// Definitions and module structure.
	"def": true, "class": true, "import": true, "lambda": true, "self": true,
	// Control flow and coroutines.
	"elif": true, "with": true, "yield": true, "async": true, "await": true,
}
|
|
|
|
// bashMarkers is the set of query tokens taken as a strong hint that the
// query text is a shell command line.
var bashMarkers = map[string]bool{
	// File and text utilities.
	"chmod": true, "chown": true, "grep": true, "awk": true, "sed": true,
	// Network and remote tools.
	"curl": true, "wget": true, "ssh": true, "rsync": true,
	// Service and package management.
	"systemctl": true, "apt": true, "yum": true,
	// Windows shells, tools and executable extensions.
	"taskkill": true, "cmd": true, "powershell": true, "exe": true, "bat": true,
}
|
|
|
|
func hasPythonMarkers(tokens []string) bool {
|
|
for _, t := range tokens {
|
|
if pythonMarkers[t] {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func hasBashMarkers(tokens []string) bool {
|
|
for _, t := range tokens {
|
|
if bashMarkers[t] {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// --- scoring -----------------------------------------------------------
|
|
|
|
// scoreHit computes a composite score for a single FTS5 hit.
|
|
// bm25 from SQLite is negative (more negative = better match).
|
|
// Scoring uses an additive boost model: each token that matches a field
|
|
// contributes a flat bonus (name=3.0, tags=2.0, signature=1.5). The total
|
|
// bonus is added to the base BM25 score, not multiplied per-token. This
|
|
// prevents runaway clamping when many tokens all match different functions
|
|
// equally (dashboard + registry + exe → all score 1.0 with the old model).
|
|
func scoreHit(row fts5Row, tokens []string, hasPython, hasBash bool) float64 {
|
|
// Base score from BM25 rank (negative -> positive, bounded [0,1])
|
|
base := 1.0 / (1.0 + math.Abs(row.rank))
|
|
|
|
nameLower := strings.ToLower(row.name)
|
|
tagsLower := strings.ToLower(row.tags)
|
|
sigLower := strings.ToLower(row.signature)
|
|
descLower := strings.ToLower(row.description)
|
|
|
|
var boost float64
|
|
for _, tok := range tokens {
|
|
// Use best-field bonus per token (additive across tokens, not multiplicative)
|
|
tokBoost := 0.0
|
|
if strings.Contains(nameLower, tok) && tokBoost < 3.0 {
|
|
tokBoost = 3.0
|
|
}
|
|
if strings.Contains(tagsLower, tok) && tokBoost < 2.0 {
|
|
tokBoost = 2.0
|
|
}
|
|
if strings.Contains(sigLower, tok) && tokBoost < 1.5 {
|
|
tokBoost = 1.5
|
|
}
|
|
if strings.Contains(descLower, tok) && tokBoost < 1.0 {
|
|
tokBoost = 1.0
|
|
}
|
|
boost += tokBoost
|
|
}
|
|
|
|
// Language penalties (applied to total, not per-token)
|
|
penalty := 1.0
|
|
langLower := strings.ToLower(row.lang)
|
|
if hasPython && langLower == "bash" {
|
|
penalty = 0.5
|
|
}
|
|
if hasBash && langLower == "py" {
|
|
penalty = 0.5
|
|
}
|
|
|
|
// No clamping — scores differentiate via normalisation in the caller
|
|
return (base + boost) * penalty
|
|
}
|
|
|
|
// snippet returns a short, single-line preview of description that is at
// most maxLen bytes long, not counting the "..." appended when the text
// had to be shortened.
//
// Control characters other than tab are replaced by spaces and surrounding
// whitespace is trimmed. If the cleaned text fits in maxLen bytes it is
// returned unchanged. Otherwise it is cut at the last UTF-8 character
// boundary at or before maxLen, so a multi-byte character is never split,
// and then, if a space occurs past the midpoint (maxLen/2), at that space so
// the preview ends on a whole word. A negative maxLen is treated as 0.
func snippet(description string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}

	description = strings.Map(func(r rune) rune {
		if unicode.IsControl(r) && r != '\t' {
			return ' '
		}
		return r
	}, description)
	description = strings.TrimSpace(description)
	if len(description) <= maxLen {
		return description
	}

	// Find the largest character start offset that does not exceed maxLen.
	// Ranging over a string yields the byte offset of every character, so
	// slicing at end can never land inside a multi-byte sequence.
	end := 0
	for i := range description {
		if i > maxLen {
			break
		}
		end = i
	}
	cut := description[:end]

	// Prefer ending on a word boundary, unless that would discard more than
	// half of the allowed length.
	if idx := strings.LastIndex(cut, " "); idx > maxLen/2 {
		cut = cut[:idx]
	}
	return cut + "..."
}
|
|
|
|
// --- FTS5 query --------------------------------------------------------
|
|
|
|
// ftsOnlyQuery selects the id and bm25() rank of the rows of functions_fts
// that match the bound MATCH expression, ordered by rank and limited to 50.
//
// It deliberately reads the FTS virtual table alone: per the original
// author's note, bm25() only works in direct FTS queries, so metadata is
// fetched with a separate statement instead of a JOIN.
const ftsOnlyQuery = `
SELECT id, bm25(functions_fts) AS rank
FROM functions_fts
WHERE functions_fts MATCH ?
ORDER BY rank
LIMIT 50
`
|
|
|
|
// fnDetailQuery is a format string for the statement that loads the
// metadata columns of a set of functions. The single %s verb must be
// replaced with a comma-separated list of "?" placeholders, one per id. A
// NULL tags column is returned as the literal '[]'.
const fnDetailQuery = `
SELECT id, name, lang, signature, description, COALESCE(tags, '[]')
FROM functions
WHERE id IN (%s)
`
|
|
|
|
func runMatch(dbPath string, query string, topN int, minScore float64) ([]matchResult, error) {
|
|
tokens := tokenize(query)
|
|
if len(tokens) == 0 {
|
|
return nil, fmt.Errorf("no significant tokens extracted from: %q", query)
|
|
}
|
|
|
|
ftsQ := buildFTSQuery(tokens)
|
|
if ftsQ == "" {
|
|
return nil, fmt.Errorf("could not build FTS query")
|
|
}
|
|
|
|
// Open normally (not strict read-only) so WAL frames are visible.
|
|
// bm25() with mode=ro fails with "missing row from content table" when
|
|
// the WAL has not been checkpointed — the FTS index references rows that
|
|
// aren't in the main db file yet. We never write anything here.
|
|
conn, err := sql.Open("sqlite3", dbPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("opening db: %w", err)
|
|
}
|
|
defer conn.Close()
|
|
|
|
// Step 1: FTS-only query to get ids + bm25 ranks (no JOIN)
|
|
ftsRows, err := conn.Query(ftsOnlyQuery, ftsQ)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("fts query: %w", err)
|
|
}
|
|
type idRank struct {
|
|
id string
|
|
rank float64
|
|
}
|
|
var ranked []idRank
|
|
for ftsRows.Next() {
|
|
var r idRank
|
|
if err := ftsRows.Scan(&r.id, &r.rank); err != nil {
|
|
continue
|
|
}
|
|
ranked = append(ranked, r)
|
|
}
|
|
ftsRows.Close()
|
|
|
|
if len(ranked) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
// Step 2: fetch metadata for those IDs with a regular SELECT
|
|
rankMap := make(map[string]float64, len(ranked))
|
|
ids := make([]string, 0, len(ranked))
|
|
placeholders := make([]string, 0, len(ranked))
|
|
args := make([]any, 0, len(ranked))
|
|
for _, r := range ranked {
|
|
rankMap[r.id] = r.rank
|
|
ids = append(ids, r.id)
|
|
placeholders = append(placeholders, "?")
|
|
args = append(args, r.id)
|
|
}
|
|
|
|
detailSQL := fmt.Sprintf(fnDetailQuery, strings.Join(placeholders, ","))
|
|
detailRows, err := conn.Query(detailSQL, args...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("detail query: %w", err)
|
|
}
|
|
defer detailRows.Close()
|
|
|
|
hasPython := hasPythonMarkers(tokens)
|
|
hasBash := hasBashMarkers(tokens)
|
|
|
|
var results []matchResult
|
|
for detailRows.Next() {
|
|
var r fts5Row
|
|
if err := detailRows.Scan(&r.id, &r.name, &r.lang, &r.signature, &r.description, &r.tags); err != nil {
|
|
continue
|
|
}
|
|
r.rank = rankMap[r.id]
|
|
score := scoreHit(r, tokens, hasPython, hasBash)
|
|
results = append(results, matchResult{
|
|
ID: r.id,
|
|
Score: score, // rounded after normalisation below
|
|
Signature: r.signature,
|
|
Snippet: snippet(r.description, 120),
|
|
Lang: r.lang,
|
|
Name: r.name,
|
|
Tags: r.tags,
|
|
})
|
|
}
|
|
|
|
// Sort by score descending
|
|
sort.Slice(results, func(i, j int) bool {
|
|
return results[i].Score > results[j].Score
|
|
})
|
|
|
|
// Preserva raw_score (absoluto) ANTES de normalizar — sirve para gates
|
|
// de confidence absoluto. La normalizacion estetica enmascara queries
|
|
// debiles donde el top hit es solo el "menos malo" pero realmente no
|
|
// matchea — sin raw, high_confidence sobre normalized siempre dispara.
|
|
for i := range results {
|
|
results[i].RawScore = results[i].Score
|
|
}
|
|
// Normalise scores so the top result is 1.0 and the rest are relative.
|
|
// This makes the output stable and meaningful regardless of token count.
|
|
if len(results) > 0 && results[0].Score > 0 {
|
|
maxScore := results[0].Score
|
|
for i := range results {
|
|
results[i].Score = math.Round((results[i].Score/maxScore)*1000) / 1000
|
|
}
|
|
}
|
|
|
|
// Filter by min score
|
|
var filtered []matchResult
|
|
for _, r := range results {
|
|
if r.Score >= minScore {
|
|
filtered = append(filtered, r)
|
|
}
|
|
}
|
|
|
|
// Limit to topN
|
|
if len(filtered) > topN {
|
|
filtered = filtered[:topN]
|
|
}
|
|
|
|
return filtered, nil
|
|
}
|
|
|
|
// --- command -----------------------------------------------------------
|
|
|
|
func cmdMatch(args []string) {
|
|
topN := 3
|
|
format := "json"
|
|
minScore := 0.3
|
|
var queryArg string
|
|
|
|
for i := 0; i < len(args); i++ {
|
|
switch args[i] {
|
|
case "--top", "-n":
|
|
i++
|
|
if i < len(args) {
|
|
if n, err := strconv.Atoi(args[i]); err == nil && n > 0 {
|
|
topN = n
|
|
}
|
|
}
|
|
case "--format", "-f":
|
|
i++
|
|
if i < len(args) {
|
|
format = args[i]
|
|
}
|
|
case "--min-score":
|
|
i++
|
|
if i < len(args) {
|
|
if f, err := strconv.ParseFloat(args[i], 64); err == nil {
|
|
minScore = f
|
|
}
|
|
}
|
|
case "--help", "-h":
|
|
fmt.Println(`fn match — fuzzy matcher between a shell command and registry functions
|
|
|
|
Usage:
|
|
fn match [--top N] [--format json|text] [--min-score F] "<command>"
|
|
echo "<command>" | fn match [--top N] [--format json|text] [--min-score F]
|
|
|
|
Flags:
|
|
--top N Return top N results (default: 3)
|
|
--format Output format: json (default) or text
|
|
--min-score F Minimum score threshold 0..1 (default: 0.3)
|
|
|
|
Example:
|
|
fn match "taskkill.exe /IM registry_dashboard.exe /F"
|
|
fn match --top 5 --format text "curl -sf https://api.example.com/health"
|
|
echo "rsync -avz --exclude .git src/ user@host:/opt/app" | fn match`)
|
|
return
|
|
default:
|
|
if !strings.HasPrefix(args[i], "-") {
|
|
queryArg = args[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try stdin if no positional arg
|
|
if queryArg == "" {
|
|
stat, err := os.Stdin.Stat()
|
|
if err == nil && (stat.Mode()&os.ModeCharDevice) == 0 {
|
|
var sb strings.Builder
|
|
buf := make([]byte, 4096)
|
|
for {
|
|
n, err := os.Stdin.Read(buf)
|
|
if n > 0 {
|
|
sb.Write(buf[:n])
|
|
}
|
|
if err != nil {
|
|
break
|
|
}
|
|
}
|
|
queryArg = strings.TrimSpace(sb.String())
|
|
}
|
|
}
|
|
|
|
if queryArg == "" {
|
|
fmt.Fprintln(os.Stderr, "fn match: no command provided. Use --help for usage.")
|
|
os.Exit(1)
|
|
}
|
|
|
|
dbPath := filepath.Join(root(), dbName)
|
|
hits, err := runMatch(dbPath, queryArg, topN, minScore)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "fn match: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
// Compute high_confidence flag. Doble gate:
|
|
// 1. RAW score >= 3.0 — al menos un token con match fuerte de campo
|
|
// (name=3.0 / tags=2.0 / signature=1.5 / description=1.0). Sin esto,
|
|
// la normalizacion devolveria 1.0 incluso para queries que no
|
|
// matchean nada bien (ej. "kelly criterion" -> graph_renderer score
|
|
// raw < 1.0 pero normalized = 1.0).
|
|
// 2. Gap top1/top2 > 1.5 (en raw, no normalized) — el top destaca
|
|
// sobre el siguiente, no es un cluster de matches mediocres.
|
|
const minRawForHighConf = 4.0
|
|
highConf := false
|
|
if len(hits) >= 1 && hits[0].RawScore >= minRawForHighConf {
|
|
if len(hits) >= 2 && hits[1].RawScore > 0 {
|
|
highConf = hits[0].RawScore/hits[1].RawScore > 1.5
|
|
} else {
|
|
highConf = true // solo un hit con raw alta
|
|
}
|
|
}
|
|
|
|
switch format {
|
|
case "text":
|
|
printMatchText(queryArg, hits, highConf)
|
|
default:
|
|
printMatchJSON(queryArg, hits, highConf)
|
|
}
|
|
}
|
|
|
|
func printMatchJSON(query string, hits []matchResult, highConf bool) {
|
|
out := matchOutput{
|
|
Query: query,
|
|
Top: hits,
|
|
HighConfidence: highConf,
|
|
}
|
|
if out.Top == nil {
|
|
out.Top = []matchResult{}
|
|
}
|
|
enc := json.NewEncoder(os.Stdout)
|
|
enc.SetIndent("", " ")
|
|
enc.Encode(out)
|
|
}
|
|
|
|
func printMatchText(query string, hits []matchResult, highConf bool) {
|
|
conf := ""
|
|
if highConf {
|
|
conf = " [HIGH CONFIDENCE]"
|
|
}
|
|
fmt.Printf("TOP MATCHES for: %s%s\n", query, conf)
|
|
if len(hits) == 0 {
|
|
fmt.Println(" (no matches above threshold)")
|
|
return
|
|
}
|
|
for _, h := range hits {
|
|
fmt.Printf(" [%.3f] %s\n", h.Score, h.ID)
|
|
fmt.Printf(" %s\n", h.Signature)
|
|
fmt.Printf(" %s\n", h.Snippet)
|
|
}
|
|
}
|