fn_registry/cmd/fn/match.go

package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"unicode"

	_ "github.com/mattn/go-sqlite3"
)

// matchResult holds one candidate function match.
//
// Only ID, Score, RawScore, Signature and Snippet are serialized to JSON; the
// remaining fields carry a `json:"-"` tag and are kept for in-process use.
// Score is the value relative to the best hit (the best hit is 1.0), while
// RawScore is the untouched value returned by scoreHit.
//
// NOTE(review): nothing in this file assigns HighConfidence on a matchResult;
// the high-confidence flag that is actually emitted lives on matchOutput.
type matchResult struct {
	ID             string  `json:"id"`
	Score          float64 `json:"score"`     // normalized (top=1.0)
	RawScore       float64 `json:"raw_score"` // absolute, pre-normalization. Use for confidence gates.
	Signature      string  `json:"signature"`
	Snippet        string  `json:"snippet"`
	Lang           string  `json:"-"`
	Name           string  `json:"-"`
	Tags           string  `json:"-"`
	HighConfidence bool    `json:"-"` // filled after ranking
}

// matchOutput is the JSON envelope returned by fn match.
//
// Query echoes the command that was matched, Top lists the surviving hits in
// ranking order, and HighConfidence is the flag computed by cmdMatch from the
// raw scores of the best hits.
type matchOutput struct {
	Query          string        `json:"query"`
	Top            []matchResult `json:"top"`
	HighConfidence bool          `json:"high_confidence"`
}

// fts5Row is the scoring input for one candidate function.
//
// runMatch fills the metadata fields (id, name, lang, signature, description,
// tags) from the functions table and copies rank from the bm25() value that
// the FTS query returned for the same id.
type fts5Row struct {
	id          string
	name        string
	lang        string
	signature   string
	description string
	tags        string
	rank        float64
}

// --- tokenizer ---------------------------------------------------------

// Regular expressions used by tokenize, compiled once at package scope:
//
//   - reNonAlnum matches runs of characters that are not ASCII letters or
//     digits; words are split on it.
//   - reFlag matches words that start with one or two dashes followed by a
//     letter (-v, --port).
//   - reAbsPath matches words that start with "/", a drive letter followed by
//     `:\`, or a double backslash (UNC prefix).
//   - rePureNumber matches words made only of digits.
var (
	reNonAlnum   = regexp.MustCompile(`[^a-zA-Z0-9]+`)
	reFlag       = regexp.MustCompile(`^-{1,2}[a-zA-Z]`)
	reAbsPath    = regexp.MustCompile(`^(/|[A-Za-z]:\\|\\\\)`)
	rePureNumber = regexp.MustCompile(`^\d+$`)
)

// domainStopwords are tokens so generic in this codebase that they add noise
// rather than signal to the matcher (they match hundreds of functions equally).
//
// tokenize drops any word fragment found in this set. Keys are lowercase
// because tokenize lowercases fragments before the lookup. File-extension
// tokens are appended by tokenize without consulting this set, so an entry
// such as "json" can still appear in the token list when it comes from a
// ".json" suffix.
var domainStopwords = map[string]bool{
	"registry": true, "function": true, "functions": true,
	"app": true, "apps": true, "file": true, "files": true,
	"get": true, "set": true, "run": true, "list": true, "add": true,
	"new": true, "all": true, "the": true, "and": true, "for": true,
	"use": true, "fmt": true, "log": true, "err": true, "nil": true,
	"true": true, "false": true, "var": true, "val": true, "str": true,
	"tmp": true, "out": true, "src": true, "dst": true, "opt": true,
	"usr": true, "etc": true, "bin": true, "lib": true, "mnt": true,
	"home": true, "root": true, "host": true, "user": true, "name": true,
	"path": true, "type": true, "data": true, "info": true, "init": true,
	"main": true, "test": true, "util": true, "base": true, "core": true,
	"api":  true, "url":  true, "uri":  true, "http": true, "html": true,
	"json": true, "yaml": true, "toml": true, "conf": true, "config": true,
	"dir":  true, "map":  true, "key":  true, "obj":  true,
	"ctx":  true, "pkg":  true, "mod":  true, "cmd":  true, "cli":  true,
	"help": true, "read": true, "open": true, "close": true, "stop": true,
	"start": true, "end": true, "begin": true, "done": true, "make": true,
	"build": true, "check": true, "scan": true, "load": true, "save": true,
	"send": true, "recv": true, "show": true, "print": true, "write": true,
	"create": true, "update": true, "delete": true, "remove": true,
	"desktop": true, "lucas": true, "windows": true, "linux": true,
}

// tokenize splits a shell command into significant lowercase tokens.
//
// Shell operators (| ; && || ( ) { }) are treated as separators. Each
// whitespace-separated word is then processed as follows:
//
//   - Flags are discarded: words matching reFlag (-v, --port) and
//     Windows-style switches, i.e. a leading "/" with no further path
//     separator (/F, /IM).
//   - Paths (anything containing "/" or "\") are reduced to their basename.
//     Both separators are honoured on every OS so the result does not depend
//     on the build platform. A file extension of three or more characters is
//     emitted as its own token; it is only deduplicated, not checked against
//     domainStopwords.
//   - The remaining text is split on non-alphanumeric characters; fragments
//     shorter than three characters, purely numeric fragments, duplicates and
//     domainStopwords are dropped.
//
// Tokens are returned in order of first appearance. The result is nil when
// nothing significant remains.
func tokenize(cmd string) []string {
	// Replace common shell operators with spaces so they act as separators.
	cmd = strings.NewReplacer("|", " ", ";", " ", "&&", " ", "||", " ",
		"(", " ", ")", " ", "{", " ", "}", " ").Replace(cmd)

	const pathSeps = "/\\"

	seen := map[string]bool{}
	var tokens []string

	for _, p := range strings.Fields(cmd) {
		// Skip flags like -v, --port.
		if reFlag.MatchString(p) {
			continue
		}
		// Skip Windows switches like /F, /IM. A leading slash followed by
		// another separator is an absolute path (/usr/bin/python3), not a
		// switch, and must reach the path handling below.
		if len(p) > 1 && p[0] == '/' && !strings.ContainsAny(p[1:], pathSeps) {
			continue
		}

		// Handle paths: keep only the basename without extension.
		if reAbsPath.MatchString(p) || strings.ContainsAny(p, pathSeps) {
			// filepath.Base only understands the host OS separator, so the
			// basename is taken by hand to treat "/" and "\" alike.
			p = strings.TrimRight(p, pathSeps)
			if i := strings.LastIndexAny(p, pathSeps); i >= 0 {
				p = p[i+1:]
			}
			if ext := filepath.Ext(p); ext != "" {
				p = strings.TrimSuffix(p, ext)
				// Also add the extension, without its dot, as a token.
				extTok := strings.ToLower(strings.TrimPrefix(ext, "."))
				if len(extTok) >= 3 && !seen[extTok] {
					seen[extTok] = true
					tokens = append(tokens, extTok)
				}
			}
		}

		// Split what is left on non-alphanumeric characters.
		for _, sp := range reNonAlnum.Split(p, -1) {
			tok := strings.ToLower(sp)
			if len(tok) < 3 || rePureNumber.MatchString(tok) {
				continue
			}
			if seen[tok] || domainStopwords[tok] {
				continue
			}
			seen[tok] = true
			tokens = append(tokens, tok)
		}
	}
	return tokens
}

// buildFTSQuery constructs a safe FTS5 OR query from tokens.
//
// A token is emitted verbatim only when it is a valid FTS5 bareword: made
// exclusively of ASCII letters, ASCII digits, underscores or non-ASCII
// characters, and not one of the reserved words AND, OR, NOT, NEAR. Every
// other token (for example "tar-gz", "c++" or "a@b") is wrapped in double
// quotes with embedded double quotes doubled, so it is parsed as a string
// instead of raising an FTS5 syntax error. Empty tokens are skipped.
//
// It returns "" when there is nothing to search for.
func buildFTSQuery(tokens []string) string {
	// isBareword reports whether tok may appear unquoted in an FTS5 query.
	isBareword := func(tok string) bool {
		switch tok {
		case "AND", "OR", "NOT", "NEAR":
			// Operators are case sensitive; only the upper-case spelling is reserved.
			return false
		}
		for _, r := range tok {
			switch {
			case r == '_', r > 127,
				r >= '0' && r <= '9',
				r >= 'a' && r <= 'z',
				r >= 'A' && r <= 'Z':
			default:
				return false
			}
		}
		return true
	}

	parts := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if tok == "" {
			continue
		}
		if isBareword(tok) {
			parts = append(parts, tok)
			continue
		}
		// Escape inner double quotes by doubling them.
		parts = append(parts, `"`+strings.ReplaceAll(tok, `"`, `""`)+`"`)
	}
	return strings.Join(parts, " OR ")
}

// --- language penalty heuristics ---------------------------------------

// pythonMarkers are tokens that strongly suggest Python code.
// hasPythonMarkers looks tokens up in this set; when one is present, scoreHit
// halves the score of candidates whose lang is "bash".
var pythonMarkers = map[string]bool{
	"def": true, "import": true, "class": true, "elif": true,
	"self": true, "lambda": true, "yield": true, "async": true,
	"await": true, "with": true,
}

// bashMarkers are tokens that strongly suggest a shell command rather than
// Python code. Besides Unix tools the set includes Windows tools (taskkill,
// powershell) and script/binary extensions (exe, bat, cmd).
// hasBashMarkers looks tokens up in this set; when one is present, scoreHit
// halves the score of candidates whose lang is "py".
//
// "cmd" is also listed in domainStopwords, so tokenize can only emit it as a
// file-extension token (extensions are not checked against the stopwords).
var bashMarkers = map[string]bool{
	"chmod": true, "chown": true, "grep": true, "awk": true,
	"sed": true, "curl": true, "wget": true, "ssh": true,
	"rsync": true, "systemctl": true, "apt": true, "yum": true,
	"taskkill": true, "cmd": true, "powershell": true,
	"exe": true, "bat": true,
}

// hasPythonMarkers reports whether at least one of tokens is flagged in
// pythonMarkers. An empty or nil slice yields false.
func hasPythonMarkers(tokens []string) bool {
	found := false
	for i := 0; i < len(tokens) && !found; i++ {
		found = pythonMarkers[tokens[i]]
	}
	return found
}

// hasBashMarkers reports whether at least one of tokens is flagged in
// bashMarkers. An empty or nil slice yields false.
func hasBashMarkers(tokens []string) bool {
	found := false
	for i := 0; i < len(tokens) && !found; i++ {
		found = bashMarkers[tokens[i]]
	}
	return found
}

// --- scoring -----------------------------------------------------------

// scoreHit computes the composite (raw) score of one candidate row.
//
// The score has three ingredients:
//
//   - A base derived from the bm25 rank: 1 / (1 + |rank|). SQLite reports
//     bm25 as a negative number where more negative means a better match.
//   - An additive boost: every token contributes the weight of the best field
//     that contains it as a substring (name=3.0, tags=2.0, signature=1.5,
//     description=1.0), or nothing when no field contains it. Weights are
//     summed across tokens, never multiplied.
//   - A language penalty applied once to the total: the sum is halved when
//     the query carries Python markers and the row's lang is "bash", or when
//     it carries Bash markers and the row's lang is "py".
//
// The result is not clamped; the caller normalizes scores relative to the
// best hit.
func scoreHit(row fts5Row, tokens []string, hasPython, hasBash bool) float64 {
	// Fields listed from heaviest to lightest, so the first field containing
	// a token is also the best-paying one for that token.
	fields := [...]struct {
		text   string
		weight float64
	}{
		{strings.ToLower(row.name), 3.0},
		{strings.ToLower(row.tags), 2.0},
		{strings.ToLower(row.signature), 1.5},
		{strings.ToLower(row.description), 1.0},
	}

	var bonus float64
	for _, tok := range tokens {
		for _, f := range fields {
			if strings.Contains(f.text, tok) {
				bonus += f.weight
				break
			}
		}
	}

	// The bm25 part is added to the accumulated bonus in a single step.
	total := 1.0/(1.0+math.Abs(row.rank)) + bonus

	switch lang := strings.ToLower(row.lang); {
	case hasPython && lang == "bash", hasBash && lang == "py":
		total *= 0.5
	}
	return total
}

// snippet returns a short, single-line excerpt of description.
//
// Control characters other than tab are replaced by spaces and surrounding
// whitespace is trimmed. If the cleaned text fits in maxLen bytes it is
// returned unchanged. Otherwise it is cut at maxLen bytes, moved back to the
// previous rune boundary so a multi-byte UTF-8 character is never split, then
// shortened to the last space when that space lies in the second half of the
// window, and "..." is appended.
//
// A negative maxLen is treated as 0.
func snippet(description string, maxLen int) string {
	description = strings.Map(func(r rune) rune {
		if unicode.IsControl(r) && r != '\t' {
			return ' '
		}
		return r
	}, description)
	description = strings.TrimSpace(description)

	if maxLen < 0 {
		maxLen = 0
	}
	if len(description) <= maxLen {
		return description
	}

	// Step back while the byte at the cut position is a UTF-8 continuation
	// byte (10xxxxxx), so the cut never lands inside a rune.
	end := maxLen
	for end > 0 && description[end]&0xC0 == 0x80 {
		end--
	}
	cut := description[:end]

	// Prefer cutting at the last space, unless that would discard more than
	// half of the window.
	if idx := strings.LastIndex(cut, " "); idx > maxLen/2 {
		cut = cut[:idx]
	}
	return cut + "..."
}

// --- FTS5 query --------------------------------------------------------

// ftsOnlyQuery returns id + rank from the FTS virtual table only.
// bm25() must be used without JOIN — it only works in direct FTS queries.
//
// The single parameter is the FTS5 MATCH expression. Rows are ordered by
// ascending rank and capped at 50.
const ftsOnlyQuery = `
SELECT id, bm25(functions_fts) AS rank
FROM functions_fts
WHERE functions_fts MATCH ?
ORDER BY rank
LIMIT 50
`

// fnDetailQuery fetches metadata for a list of IDs.
//
// It is a fmt template: the %s verb is replaced by runMatch with a
// comma-separated list of "?" placeholders, one per id, and the ids are then
// bound as query arguments. A NULL tags column is read as '[]'.
const fnDetailQuery = `
SELECT id, name, lang, signature, description, COALESCE(tags, '[]')
FROM functions
WHERE id IN (%s)
`

// runMatch finds the registry functions that best match a shell command.
//
// The query is tokenized and turned into an FTS5 OR expression, which is run
// against functions_fts to obtain candidate ids with their bm25 rank.
// Metadata for those ids is read from the functions table, each row is scored
// with scoreHit, and the results are sorted best first. Scores are then
// normalized so the best hit is 1.0 (RawScore keeps the untouched value),
// filtered by minScore and truncated to topN.
//
// Parameters:
//   - dbPath: path of the SQLite database file.
//   - query: the shell command to match.
//   - topN: maximum number of results; negative values are treated as 0.
//   - minScore: minimum normalized score a result needs to be kept.
//
// It returns an error when no significant token can be extracted from query,
// or when the database cannot be opened, queried or iterated. It returns
// (nil, nil) when the FTS query yields no rows.
func runMatch(dbPath string, query string, topN int, minScore float64) ([]matchResult, error) {
	tokens := tokenize(query)
	if len(tokens) == 0 {
		return nil, fmt.Errorf("no significant tokens extracted from: %q", query)
	}

	ftsQ := buildFTSQuery(tokens)
	if ftsQ == "" {
		return nil, fmt.Errorf("could not build FTS query")
	}

	// Open normally (not strict read-only) so WAL frames are visible.
	// bm25() with mode=ro fails with "missing row from content table" when
	// the WAL has not been checkpointed — the FTS index references rows that
	// aren't in the main db file yet. We never write anything here.
	conn, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return nil, fmt.Errorf("opening db: %w", err)
	}
	defer conn.Close()

	// Step 1: FTS-only query to get ids + bm25 ranks (no JOIN).
	ftsRows, err := conn.Query(ftsOnlyQuery, ftsQ)
	if err != nil {
		return nil, fmt.Errorf("fts query: %w", err)
	}
	rankMap := make(map[string]float64)
	var (
		placeholders []string
		args         []any
	)
	for ftsRows.Next() {
		var (
			id   string
			rank float64
		)
		if err := ftsRows.Scan(&id, &rank); err != nil {
			// Best effort: a row that cannot be decoded is skipped.
			continue
		}
		rankMap[id] = rank
		placeholders = append(placeholders, "?")
		args = append(args, id)
	}
	// Next() returning false may hide an iteration failure; surface it
	// instead of silently working with a partial candidate list.
	iterErr := ftsRows.Err()
	ftsRows.Close()
	if iterErr != nil {
		return nil, fmt.Errorf("fts query: %w", iterErr)
	}

	if len(args) == 0 {
		return nil, nil
	}

	// Step 2: fetch metadata for those IDs with a regular SELECT.
	detailSQL := fmt.Sprintf(fnDetailQuery, strings.Join(placeholders, ","))
	detailRows, err := conn.Query(detailSQL, args...)
	if err != nil {
		return nil, fmt.Errorf("detail query: %w", err)
	}
	defer detailRows.Close()

	hasPython := hasPythonMarkers(tokens)
	hasBash := hasBashMarkers(tokens)

	results := make([]matchResult, 0, len(args))
	for detailRows.Next() {
		var r fts5Row
		if err := detailRows.Scan(&r.id, &r.name, &r.lang, &r.signature, &r.description, &r.tags); err != nil {
			// Best effort: a row that cannot be decoded is skipped.
			continue
		}
		r.rank = rankMap[r.id]
		score := scoreHit(r, tokens, hasPython, hasBash)
		results = append(results, matchResult{
			ID: r.id,
			// Score is normalized below. RawScore keeps the absolute value:
			// normalization always maps the best hit to 1.0, even for weak
			// queries where the best hit is merely the "least bad" one, so
			// absolute confidence gates must look at RawScore.
			Score:     score,
			RawScore:  score,
			Signature: r.signature,
			Snippet:   snippet(r.description, 120),
			Lang:      r.lang,
			Name:      r.name,
			Tags:      r.tags,
		})
	}
	if err := detailRows.Err(); err != nil {
		return nil, fmt.Errorf("detail query: %w", err)
	}

	// Sort by score descending. The detail query returns rows in no
	// particular order, so ties are broken by bm25 rank (lower is better) and
	// then by id to keep the output deterministic.
	sort.SliceStable(results, func(i, j int) bool {
		a, b := results[i], results[j]
		if a.Score != b.Score {
			return a.Score > b.Score
		}
		if ra, rb := rankMap[a.ID], rankMap[b.ID]; ra != rb {
			return ra < rb
		}
		return a.ID < b.ID
	})

	// Normalise scores so the top result is 1.0 and the rest are relative.
	// This makes the output stable and meaningful regardless of token count.
	if len(results) > 0 && results[0].Score > 0 {
		maxScore := results[0].Score
		for i := range results {
			results[i].Score = math.Round((results[i].Score/maxScore)*1000) / 1000
		}
	}

	// Filter by min score.
	var filtered []matchResult
	for _, r := range results {
		if r.Score >= minScore {
			filtered = append(filtered, r)
		}
	}

	// Limit to topN; a negative limit would make the slice expression panic.
	if topN < 0 {
		topN = 0
	}
	if len(filtered) > topN {
		filtered = filtered[:topN]
	}

	return filtered, nil
}

// --- command -----------------------------------------------------------

// cmdMatch implements the `fn match` subcommand.
//
// It parses --top/-n, --format/-f, --min-score and --help/-h from args, takes
// the last argument that does not start with "-" as the command to match, and
// falls back to reading the command from stdin when stdin is not a terminal.
// The matches are printed as JSON (default) or text.
//
// Invalid --top or --min-score values are reported on stderr and the default
// is kept. The process exits with status 1 when no command is provided or
// when matching fails.
func cmdMatch(args []string) {
	topN := 3
	format := "json"
	minScore := 0.3
	var queryArg string

	for i := 0; i < len(args); i++ {
		switch args[i] {
		case "--top", "-n":
			i++
			if i < len(args) {
				if n, err := strconv.Atoi(args[i]); err == nil && n > 0 {
					topN = n
				} else {
					fmt.Fprintf(os.Stderr, "fn match: ignoring invalid --top value %q (using %d)\n", args[i], topN)
				}
			}
		case "--format", "-f":
			i++
			if i < len(args) {
				format = args[i]
			}
		case "--min-score":
			i++
			if i < len(args) {
				if f, err := strconv.ParseFloat(args[i], 64); err == nil {
					minScore = f
				} else {
					fmt.Fprintf(os.Stderr, "fn match: ignoring invalid --min-score value %q (using %g)\n", args[i], minScore)
				}
			}
		case "--help", "-h":
			fmt.Println(`fn match — fuzzy matcher between a shell command and registry functions

Usage:
  fn match [--top N] [--format json|text] [--min-score F] "<command>"
  echo "<command>" | fn match [--top N] [--format json|text] [--min-score F]

Flags:
  --top N        Return top N results (default: 3)
  --format       Output format: json (default) or text
  --min-score F  Minimum score threshold 0..1 (default: 0.3)

Example:
  fn match "taskkill.exe /IM registry_dashboard.exe /F"
  fn match --top 5 --format text "curl -sf https://api.example.com/health"
  echo "rsync -avz --exclude .git src/ user@host:/opt/app" | fn match`)
			return
		default:
			// Unknown flags are ignored; the last positional argument wins.
			if !strings.HasPrefix(args[i], "-") {
				queryArg = args[i]
			}
		}
	}

	// Try stdin if no positional arg was given and stdin is piped/redirected.
	if queryArg == "" {
		stat, err := os.Stdin.Stat()
		if err == nil && (stat.Mode()&os.ModeCharDevice) == 0 {
			var sb strings.Builder
			buf := make([]byte, 4096)
			for {
				n, err := os.Stdin.Read(buf)
				if n > 0 {
					sb.Write(buf[:n])
				}
				if err != nil {
					// io.EOF or a read failure: use whatever was read so far.
					break
				}
			}
			queryArg = strings.TrimSpace(sb.String())
		}
	}

	if queryArg == "" {
		fmt.Fprintln(os.Stderr, "fn match: no command provided. Use --help for usage.")
		os.Exit(1)
	}

	// Always ask for at least two hits: the confidence gate below compares
	// the best hit with the runner-up, and with --top 1 the runner-up would
	// otherwise never be seen.
	fetchN := topN
	if fetchN < 2 {
		fetchN = 2
	}

	dbPath := filepath.Join(root(), dbName)
	hits, err := runMatch(dbPath, queryArg, fetchN, minScore)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fn match: %v\n", err)
		os.Exit(1)
	}

	// Compute the high_confidence flag with a double gate:
	//   1. RAW score of the best hit >= minRawForHighConf. scoreHit adds a
	//      per-token field bonus (name=3.0 / tags=2.0 / signature=1.5 /
	//      description=1.0) to a bm25 base that never exceeds 1.0, so a
	//      single name hit alone does not reach the threshold. The raw score
	//      is used because normalization reports 1.0 for the best hit even
	//      when the query matches nothing well.
	//   2. Ratio top1/top2 > 1.5 (on raw scores, not normalized): the best
	//      hit must stand out from the next one instead of being part of a
	//      cluster of mediocre matches. With a single hit only gate 1
	//      applies.
	// NOTE: a runner-up that falls below --min-score has already been
	// filtered out by runMatch and is therefore not considered here.
	const minRawForHighConf = 4.0
	highConf := false
	if len(hits) >= 1 && hits[0].RawScore >= minRawForHighConf {
		if len(hits) >= 2 && hits[1].RawScore > 0 {
			highConf = hits[0].RawScore/hits[1].RawScore > 1.5
		} else {
			highConf = true // only one hit, and its raw score is high
		}
	}

	// Drop the extra hit that was fetched only for the confidence gate.
	if len(hits) > topN {
		hits = hits[:topN]
	}

	switch format {
	case "text":
		printMatchText(queryArg, hits, highConf)
	default:
		printMatchJSON(queryArg, hits, highConf)
	}
}

// printMatchJSON writes the match results to stdout as an indented JSON
// matchOutput document. A nil hits slice is emitted as an empty array rather
// than null. An encoding or write failure is reported on stderr.
func printMatchJSON(query string, hits []matchResult, highConf bool) {
	out := matchOutput{
		Query:          query,
		Top:            hits,
		HighConfidence: highConf,
	}
	if out.Top == nil {
		// encoding/json renders a nil slice as null; consumers get [].
		out.Top = []matchResult{}
	}
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	if err := enc.Encode(out); err != nil {
		fmt.Fprintf(os.Stderr, "fn match: writing JSON output: %v\n", err)
	}
}

// printMatchText writes the match results to stdout in a human-readable
// layout: a header line with the query (tagged when highConf is set),
// followed by three lines per hit (score and id, signature, snippet), or a
// placeholder line when there are no hits.
func printMatchText(query string, hits []matchResult, highConf bool) {
	var conf string
	if highConf {
		conf = " [HIGH CONFIDENCE]"
	}
	fmt.Printf("TOP MATCHES for: %s%s\n", query, conf)

	switch len(hits) {
	case 0:
		fmt.Println("  (no matches above threshold)")
	default:
		for i := range hits {
			hit := &hits[i]
			fmt.Printf("  [%.3f] %s\n", hit.Score, hit.ID)
			fmt.Printf("         %s\n", hit.Signature)
			fmt.Printf("         %s\n", hit.Snippet)
		}
	}
}