package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"unicode"

	// Blank import: registers the "sqlite3" driver name passed to sql.Open.
	_ "github.com/mattn/go-sqlite3"
)

// matchResult holds one candidate function match.
//
// Only ID, Score, RawScore, Signature and Snippet are serialised; the
// remaining fields carry the `json:"-"` tag and stay internal.
type matchResult struct {
	ID             string  `json:"id"`
	Score          float64 `json:"score"`     // normalized (top=1.0)
	RawScore       float64 `json:"raw_score"` // absolute, pre-normalization. Use for confidence gates.
	Signature      string  `json:"signature"`
	Snippet        string  `json:"snippet"`
	Lang           string  `json:"-"`
	Name           string  `json:"-"`
	Tags           string  `json:"-"`
	HighConfidence bool    `json:"-"` // filled after ranking
}

// matchOutput is the JSON envelope returned by fn match: the query as given,
// the selected candidates, and the overall high-confidence flag.
type matchOutput struct {
	Query          string        `json:"query"`
	Top            []matchResult `json:"top"`
	HighConfidence bool          `json:"high_confidence"`
}

// fts5Row is a raw row from the FTS query: the metadata columns of one
// candidate function together with its bm25 rank.
type fts5Row struct {
	id          string
	name        string
	lang        string
	signature   string
	description string
	tags        string
	rank        float64
}

// --- tokenizer ---------------------------------------------------------

var (
	// reNonAlnum matches runs of characters other than ASCII letters and digits.
	reNonAlnum = regexp.MustCompile(`[^a-zA-Z0-9]+`)
	// reFlag matches a leading "-" or "--" followed by an ASCII letter.
	reFlag = regexp.MustCompile(`^-{1,2}[a-zA-Z]`)
	// reAbsPath matches a leading "/", a drive prefix such as `C:\`, or `\\`.
	reAbsPath = regexp.MustCompile(`^(/|[A-Za-z]:\\|\\\\)`)
	// rePureNumber matches strings made of digits only.
	rePureNumber = regexp.MustCompile(`^\d+$`)
)

// domainStopwords are tokens so generic in this codebase that they add noise
// rather than signal to the matcher (they match hundreds of functions equally).
// The set is consulted by tokenize after each candidate token has been
// lower-cased, so every entry here is lower-case.
var domainStopwords = func() map[string]bool {
	words := []string{
		// project / domain nouns
		"registry", "function", "functions", "app", "apps", "file", "files",
		// short generic verbs and English filler
		"get", "set", "run", "list", "add", "new", "all", "the", "and", "for", "use",
		// identifiers that show up in almost any source file
		"fmt", "log", "err", "nil", "true", "false", "var", "val", "str",
		"tmp", "out", "src", "dst", "opt",
		// common directory names
		"usr", "etc", "bin", "lib", "mnt", "home", "root",
		// generic nouns
		"host", "user", "name", "path", "type", "data", "info", "init", "main",
		"test", "util", "base", "core",
		// protocols and formats
		"api", "url", "uri", "http", "html", "json", "yaml", "toml", "conf", "config",
		// more generic identifiers
		"dir", "map", "key", "obj", "ctx", "pkg", "mod", "cmd", "cli", "help",
		// generic actions
		"read", "open", "close", "stop", "start", "end", "begin", "done",
		"make", "build", "check", "scan", "load", "save", "send", "recv",
		"show", "print", "write", "create", "update", "delete", "remove",
		// environment-specific words (presumably machine / user names — confirm)
		"desktop", "lucas", "windows", "linux",
	}
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()

// tokenize splits a shell command into significant lowercase tokens.
// Flags, pure numbers and tokens shorter than 3 characters are discarded,
// and path-like arguments are reduced to their final component.
func tokenize(cmd string) []string { // Replace common shell operators with spaces so they act as separators cmd = strings.NewReplacer("|", " ", ";", " ", "&&", " ", "||", " ", "(", " ", ")", " ", "{", " ", "}", " ").Replace(cmd) parts := strings.Fields(cmd) seen := map[string]bool{} var tokens []string for _, p := range parts { // Skip flags like -v, --port, /F, /IM if reFlag.MatchString(p) || (len(p) > 1 && p[0] == '/') { continue } // Handle paths: keep only basename without extension if reAbsPath.MatchString(p) || strings.ContainsAny(p, "/\\") { p = filepath.Base(p) ext := filepath.Ext(p) if ext != "" { p = strings.TrimSuffix(p, ext) // also add ext without dot extTok := strings.ToLower(strings.TrimPrefix(ext, ".")) if len(extTok) >= 3 && !seen[extTok] { seen[extTok] = true tokens = append(tokens, extTok) } } } // Split remaining by non-alphanumeric chars subparts := reNonAlnum.Split(p, -1) for _, sp := range subparts { tok := strings.ToLower(sp) if len(tok) < 3 { continue } if rePureNumber.MatchString(tok) { continue } if seen[tok] { continue } if domainStopwords[tok] { continue } seen[tok] = true tokens = append(tokens, tok) } } return tokens } // buildFTSQuery constructs a safe FTS5 OR query from tokens. // Tokens with special FTS5 characters are wrapped in double quotes. func buildFTSQuery(tokens []string) string { if len(tokens) == 0 { return "" } var parts []string specialChars := `"'()^*:-.` for _, tok := range tokens { needsQuoting := false for _, c := range tok { if strings.ContainsRune(specialChars, c) { needsQuoting = true break } } if needsQuoting { // escape inner double quotes escaped := strings.ReplaceAll(tok, `"`, `""`) parts = append(parts, `"`+escaped+`"`) } else { parts = append(parts, tok) } } return strings.Join(parts, " OR ") } // --- language penalty heuristics --------------------------------------- // pythonMarkers are tokens that strongly suggest Python code. 
// Lookups are exact and case-sensitive; the tokens produced by tokenize are
// already lower-case.
var pythonMarkers = map[string]bool{
	// definitions and imports
	"def":    true,
	"class":  true,
	"import": true,
	"lambda": true,
	// control flow and context
	"elif":  true,
	"with":  true,
	"yield": true,
	"self":  true,
	// coroutines
	"async": true,
	"await": true,
}

// bashMarkers are tokens that strongly suggest a shell command line. The set
// also contains Windows command-line names (taskkill, cmd, powershell, exe, bat).
var bashMarkers = map[string]bool{
	// permissions and text tools
	"chmod": true,
	"chown": true,
	"grep":  true,
	"awk":   true,
	"sed":   true,
	// network and transfer
	"curl":  true,
	"wget":  true,
	"ssh":   true,
	"rsync": true,
	// services and package managers
	"systemctl": true,
	"apt":       true,
	"yum":       true,
	// Windows command line
	"taskkill":   true,
	"cmd":        true,
	"powershell": true,
	"exe":        true,
	"bat":        true,
}

// anyMarked reports whether at least one token is present in markers.
func anyMarked(tokens []string, markers map[string]bool) bool {
	for i := range tokens {
		if markers[tokens[i]] {
			return true
		}
	}
	return false
}

// hasPythonMarkers reports whether any token is a Python marker.
func hasPythonMarkers(tokens []string) bool {
	return anyMarked(tokens, pythonMarkers)
}

// hasBashMarkers reports whether any token is a Bash marker.
func hasBashMarkers(tokens []string) bool {
	return anyMarked(tokens, bashMarkers)
}

// --- scoring -----------------------------------------------------------

// scoreHit computes a composite score for a single FTS5 hit.
// SQLite's bm25 value is negative (the more negative, the better the match).
// The model is additive: every token that matches a field contributes one
// flat bonus (name=3.0, tags=2.0, signature=1.5), and the summed bonus is
// added to the BM25-derived base instead of being multiplied per token.
// This avoids the runaway clamping of the old model, where many tokens that
// matched different functions equally (dashboard + registry + exe) all ended
// up with a score of 1.0.
func scoreHit(row fts5Row, tokens []string, hasPython, hasBash bool) float64 { // Base score from BM25 rank (negative -> positive, bounded [0,1]) base := 1.0 / (1.0 + math.Abs(row.rank)) nameLower := strings.ToLower(row.name) tagsLower := strings.ToLower(row.tags) sigLower := strings.ToLower(row.signature) descLower := strings.ToLower(row.description) var boost float64 for _, tok := range tokens { // Use best-field bonus per token (additive across tokens, not multiplicative) tokBoost := 0.0 if strings.Contains(nameLower, tok) && tokBoost < 3.0 { tokBoost = 3.0 } if strings.Contains(tagsLower, tok) && tokBoost < 2.0 { tokBoost = 2.0 } if strings.Contains(sigLower, tok) && tokBoost < 1.5 { tokBoost = 1.5 } if strings.Contains(descLower, tok) && tokBoost < 1.0 { tokBoost = 1.0 } boost += tokBoost } // Language penalties (applied to total, not per-token) penalty := 1.0 langLower := strings.ToLower(row.lang) if hasPython && langLower == "bash" { penalty = 0.5 } if hasBash && langLower == "py" { penalty = 0.5 } // No clamping — scores differentiate via normalisation in the caller return (base + boost) * penalty } // snippet returns the first ~120 chars of description, trimmed cleanly. func snippet(description string, maxLen int) string { description = strings.Map(func(r rune) rune { if unicode.IsControl(r) && r != '\t' { return ' ' } return r }, description) description = strings.TrimSpace(description) if len(description) <= maxLen { return description } // Cut at last space before maxLen cut := description[:maxLen] if idx := strings.LastIndex(cut, " "); idx > maxLen/2 { cut = cut[:idx] } return cut + "..." } // --- FTS5 query -------------------------------------------------------- // ftsOnlyQuery returns id + rank from the FTS virtual table only. // bm25() must be used without JOIN — it only works in direct FTS queries. const ftsOnlyQuery = ` SELECT id, bm25(functions_fts) AS rank FROM functions_fts WHERE functions_fts MATCH ? 
ORDER BY rank LIMIT 50 ` // fnDetailQuery fetches metadata for a list of IDs. const fnDetailQuery = ` SELECT id, name, lang, signature, description, COALESCE(tags, '[]') FROM functions WHERE id IN (%s) ` func runMatch(dbPath string, query string, topN int, minScore float64) ([]matchResult, error) { tokens := tokenize(query) if len(tokens) == 0 { return nil, fmt.Errorf("no significant tokens extracted from: %q", query) } ftsQ := buildFTSQuery(tokens) if ftsQ == "" { return nil, fmt.Errorf("could not build FTS query") } // Open normally (not strict read-only) so WAL frames are visible. // bm25() with mode=ro fails with "missing row from content table" when // the WAL has not been checkpointed — the FTS index references rows that // aren't in the main db file yet. We never write anything here. conn, err := sql.Open("sqlite3", dbPath) if err != nil { return nil, fmt.Errorf("opening db: %w", err) } defer conn.Close() // Step 1: FTS-only query to get ids + bm25 ranks (no JOIN) ftsRows, err := conn.Query(ftsOnlyQuery, ftsQ) if err != nil { return nil, fmt.Errorf("fts query: %w", err) } type idRank struct { id string rank float64 } var ranked []idRank for ftsRows.Next() { var r idRank if err := ftsRows.Scan(&r.id, &r.rank); err != nil { continue } ranked = append(ranked, r) } ftsRows.Close() if len(ranked) == 0 { return nil, nil } // Step 2: fetch metadata for those IDs with a regular SELECT rankMap := make(map[string]float64, len(ranked)) ids := make([]string, 0, len(ranked)) placeholders := make([]string, 0, len(ranked)) args := make([]any, 0, len(ranked)) for _, r := range ranked { rankMap[r.id] = r.rank ids = append(ids, r.id) placeholders = append(placeholders, "?") args = append(args, r.id) } detailSQL := fmt.Sprintf(fnDetailQuery, strings.Join(placeholders, ",")) detailRows, err := conn.Query(detailSQL, args...) 
if err != nil { return nil, fmt.Errorf("detail query: %w", err) } defer detailRows.Close() hasPython := hasPythonMarkers(tokens) hasBash := hasBashMarkers(tokens) var results []matchResult for detailRows.Next() { var r fts5Row if err := detailRows.Scan(&r.id, &r.name, &r.lang, &r.signature, &r.description, &r.tags); err != nil { continue } r.rank = rankMap[r.id] score := scoreHit(r, tokens, hasPython, hasBash) results = append(results, matchResult{ ID: r.id, Score: score, // rounded after normalisation below Signature: r.signature, Snippet: snippet(r.description, 120), Lang: r.lang, Name: r.name, Tags: r.tags, }) } // Sort by score descending sort.Slice(results, func(i, j int) bool { return results[i].Score > results[j].Score }) // Preserva raw_score (absoluto) ANTES de normalizar — sirve para gates // de confidence absoluto. La normalizacion estetica enmascara queries // debiles donde el top hit es solo el "menos malo" pero realmente no // matchea — sin raw, high_confidence sobre normalized siempre dispara. for i := range results { results[i].RawScore = results[i].Score } // Normalise scores so the top result is 1.0 and the rest are relative. // This makes the output stable and meaningful regardless of token count. 
if len(results) > 0 && results[0].Score > 0 { maxScore := results[0].Score for i := range results { results[i].Score = math.Round((results[i].Score/maxScore)*1000) / 1000 } } // Filter by min score var filtered []matchResult for _, r := range results { if r.Score >= minScore { filtered = append(filtered, r) } } // Limit to topN if len(filtered) > topN { filtered = filtered[:topN] } return filtered, nil } // --- command ----------------------------------------------------------- func cmdMatch(args []string) { topN := 3 format := "json" minScore := 0.3 var queryArg string for i := 0; i < len(args); i++ { switch args[i] { case "--top", "-n": i++ if i < len(args) { if n, err := strconv.Atoi(args[i]); err == nil && n > 0 { topN = n } } case "--format", "-f": i++ if i < len(args) { format = args[i] } case "--min-score": i++ if i < len(args) { if f, err := strconv.ParseFloat(args[i], 64); err == nil { minScore = f } } case "--help", "-h": fmt.Println(`fn match — fuzzy matcher between a shell command and registry functions Usage: fn match [--top N] [--format json|text] [--min-score F] "" echo "" | fn match [--top N] [--format json|text] [--min-score F] Flags: --top N Return top N results (default: 3) --format Output format: json (default) or text --min-score F Minimum score threshold 0..1 (default: 0.3) Example: fn match "taskkill.exe /IM registry_dashboard.exe /F" fn match --top 5 --format text "curl -sf https://api.example.com/health" echo "rsync -avz --exclude .git src/ user@host:/opt/app" | fn match`) return default: if !strings.HasPrefix(args[i], "-") { queryArg = args[i] } } } // Try stdin if no positional arg if queryArg == "" { stat, err := os.Stdin.Stat() if err == nil && (stat.Mode()&os.ModeCharDevice) == 0 { var sb strings.Builder buf := make([]byte, 4096) for { n, err := os.Stdin.Read(buf) if n > 0 { sb.Write(buf[:n]) } if err != nil { break } } queryArg = strings.TrimSpace(sb.String()) } } if queryArg == "" { fmt.Fprintln(os.Stderr, "fn match: no command 
provided. Use --help for usage.") os.Exit(1) } dbPath := filepath.Join(root(), dbName) hits, err := runMatch(dbPath, queryArg, topN, minScore) if err != nil { fmt.Fprintf(os.Stderr, "fn match: %v\n", err) os.Exit(1) } // Compute high_confidence flag. Doble gate: // 1. RAW score >= 3.0 — al menos un token con match fuerte de campo // (name=3.0 / tags=2.0 / signature=1.5 / description=1.0). Sin esto, // la normalizacion devolveria 1.0 incluso para queries que no // matchean nada bien (ej. "kelly criterion" -> graph_renderer score // raw < 1.0 pero normalized = 1.0). // 2. Gap top1/top2 > 1.5 (en raw, no normalized) — el top destaca // sobre el siguiente, no es un cluster de matches mediocres. const minRawForHighConf = 4.0 highConf := false if len(hits) >= 1 && hits[0].RawScore >= minRawForHighConf { if len(hits) >= 2 && hits[1].RawScore > 0 { highConf = hits[0].RawScore/hits[1].RawScore > 1.5 } else { highConf = true // solo un hit con raw alta } } switch format { case "text": printMatchText(queryArg, hits, highConf) default: printMatchJSON(queryArg, hits, highConf) } } func printMatchJSON(query string, hits []matchResult, highConf bool) { out := matchOutput{ Query: query, Top: hits, HighConfidence: highConf, } if out.Top == nil { out.Top = []matchResult{} } enc := json.NewEncoder(os.Stdout) enc.SetIndent("", " ") enc.Encode(out) } func printMatchText(query string, hits []matchResult, highConf bool) { conf := "" if highConf { conf = " [HIGH CONFIDENCE]" } fmt.Printf("TOP MATCHES for: %s%s\n", query, conf) if len(hits) == 0 { fmt.Println(" (no matches above threshold)") return } for _, h := range hits { fmt.Printf(" [%.3f] %s\n", h.Score, h.ID) fmt.Printf(" %s\n", h.Signature) fmt.Printf(" %s\n", h.Snippet) } }