// fn_registry/functions/infra/vault_search.go

package infra

import (
	"database/sql"
	"errors"
	"fmt"
	"path/filepath"
	"strings"
	"unicode"
)

// VaultSearchHit is a single result returned by VaultSearch.
//
// The file-level fields (RelPath through SubBucket) are scanned from the
// columns of the same name in the index's files table; VaultPath and VaultName
// are stamped onto every hit by the search helpers.
type VaultSearchHit struct {
	// VaultPath is the vaultPath argument the search was invoked with.
	VaultPath string `json:"vault_path"`
	// VaultName is the basename of VaultPath (after resolving symlinks).
	VaultName string `json:"vault_name"`
	// RelPath is files.rel_path; merged results are deduplicated on it.
	RelPath   string `json:"rel_path"`
	// Size is files.size.
	Size      int64  `json:"size"`
	// Mtime is files.mtime; path matches are ordered by it, descending.
	Mtime     int64  `json:"mtime"`
	// Mime is files.mime.
	Mime      string `json:"mime"`
	// Bucket is files.bucket.
	Bucket    string `json:"bucket"`
	// SubBucket is files.sub_bucket.
	SubBucket string `json:"sub_bucket"`
	// Snippet is reserved for an FTS5 snippet. Both search strategies in this
	// file currently set it to the empty string.
	Snippet   string `json:"snippet"`
}

// VaultSearch looks up files in the index of the vault at vaultPath that
// match query and returns at most limit hits.
//
// Steps:
//  1. The index is opened with VaultIndexOpen and closed again before returning.
//  2. A limit of zero or less is replaced by the default of 50.
//  3. The vault name stamped on every hit is the basename of vaultPath after
//     symlink resolution (see resolveVaultName).
//  4. The actual search is delegated to vaultSearchCombined, which merges
//     full-text matches with rel_path substring matches, deduplicated by
//     rel_path.
//
// Errors from opening the index or from the search are wrapped with a
// "vault_search:" prefix; the hit slice is nil whenever an error is returned.
func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
	const defaultLimit = 50

	maxHits := limit
	if maxHits <= 0 {
		maxHits = defaultLimit
	}

	index, openErr := VaultIndexOpen(vaultPath)
	if openErr != nil {
		return nil, fmt.Errorf("vault_search: open index: %w", openErr)
	}
	defer index.Close()

	name := resolveVaultName(vaultPath)
	results, searchErr := vaultSearchCombined(index, vaultPath, name, query, maxHits)
	if searchErr != nil {
		return nil, fmt.Errorf("vault_search: %w", searchErr)
	}
	return results, nil
}

// vaultSearchCombined merges the results of two search strategies into one
// list of at most limit hits, deduplicated by rel_path:
//
//  1. A full-text MATCH via vaultSearchFTSContent, using the query as
//     rewritten by safeFTSQuery. Its hits come first.
//  2. A rel_path substring search via vaultSearchLike, using the token
//     extracted by simplifyForLike. It only runs when that token is non-empty
//     and the limit has not been reached yet.
//
// A failure of the full-text strategy alone is tolerated (bad user queries are
// expected to produce FTS5 syntax errors). An error is returned only when the
// LIKE strategy ran and both strategies failed. On success the returned slice
// is never nil.
func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	merged := []VaultSearchHit{}
	byPath := make(map[string]struct{})

	// absorb appends every not-yet-seen candidate, stopping at limit.
	absorb := func(candidates []VaultSearchHit) {
		for _, c := range candidates {
			if len(merged) >= limit {
				return
			}
			if _, dup := byPath[c.RelPath]; dup {
				continue
			}
			byPath[c.RelPath] = struct{}{}
			merged = append(merged, c)
		}
	}

	// Strategy 1: full-text match. Its error is remembered, not returned, so
	// the path search below still gets a chance.
	contentHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, safeFTSQuery(query), limit)
	if ftsErr == nil {
		absorb(contentHits)
	}

	// Strategy 2: substring match on rel_path, skipped when there is nothing
	// to search for or no room left.
	pathToken := simplifyForLike(query)
	if pathToken == "" || len(merged) >= limit {
		return merged, nil
	}

	// Over-fetch by the number of paths already taken so that duplicates of
	// full-text hits cannot starve the remaining slots.
	fetch := (limit - len(merged)) + len(byPath)
	pathHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, pathToken, fetch)
	if ftsErr != nil && likeErr != nil {
		// Both failed — return a combined error.
		return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
	}
	absorb(pathHits)
	return merged, nil
}

// vaultSearchFTSContent runs safeQuery as an FTS5 MATCH against files_fts and
// resolves every matching rowid to its row in the files table.
//
// The match is ordered by FTS5 rank and capped at limit rowids. Each rowid is
// then looked up in files (files uses a TEXT primary key, so its rowid is the
// implicit one). Per the index design notes, files_fts is contentless
// (content=''), so column values and snippets cannot be read from the FTS
// table itself; Snippet is therefore always empty.
//
// A rowid with no corresponding files row (the design notes mention this can
// happen after update cycles) is skipped, which means fewer than limit hits
// may come back even when more matches exist. Any other lookup failure is a
// real database error and is returned, so that a broken index is not silently
// reported as "no match".
//
// Returns (nil, nil) when nothing matches and (nil, err) on any error.
func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
	// Get matching rowids from FTS5.
	const qRowids = `
		SELECT rowid
		FROM files_fts
		WHERE files_fts MATCH ?
		ORDER BY rank
		LIMIT ?`

	const qFile = `
		SELECT rel_path, size, mtime, mime, bucket, sub_bucket
		FROM files WHERE rowid = ?`

	rows, err := db.Query(qRowids, safeQuery, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var rowids []int64
	for rows.Next() {
		var rid int64
		if err := rows.Scan(&rid); err != nil {
			return nil, err
		}
		rowids = append(rowids, rid)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	if len(rowids) == 0 {
		return nil, nil
	}

	var hits []VaultSearchHit
	for _, rid := range rowids {
		var h VaultSearchHit
		err := db.QueryRow(qFile, rid).
			Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket)
		if errors.Is(err, sql.ErrNoRows) {
			// Stale FTS rowid with no files row — skip gracefully.
			continue
		}
		if err != nil {
			return nil, fmt.Errorf("look up file for fts rowid %d: %w", rid, err)
		}
		h.VaultPath = vaultPath
		h.VaultName = vaultName
		h.Snippet = ""
		hits = append(hits, h)
	}
	return hits, nil
}

// vaultSearchLike returns up to limit rows of the files table whose rel_path
// contains query as a literal substring, newest mtime first.
//
// The LIKE metacharacters '%' and '_' (and the escape character '\' itself)
// are escaped before the query is bound, so a search for "my_file" matches
// only paths containing exactly "my_file" and not, say, "my-file". The value
// is always passed as a bound parameter, never spliced into the SQL text.
//
// Every hit is stamped with vaultPath and vaultName; Snippet is left empty
// because a path match has no text excerpt. If iteration stops with an error,
// the hits collected so far are returned together with that error.
func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	const qLike = `
		SELECT rel_path, size, mtime, mime, bucket, sub_bucket
		FROM files
		WHERE rel_path LIKE '%' || ? || '%' ESCAPE '\'
		ORDER BY mtime DESC
		LIMIT ?`

	// Escape the backslash first so the escapes added for % and _ stay intact.
	pattern := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(query)

	rows, err := db.Query(qLike, pattern, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var hits []VaultSearchHit
	for rows.Next() {
		var h VaultSearchHit
		if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
			return nil, err
		}
		h.VaultPath = vaultPath
		h.VaultName = vaultName
		h.Snippet = ""
		hits = append(hits, h)
	}
	return hits, rows.Err()
}

// resolveVaultName derives a vault's display name: the last path element of
// vaultPath once symlinks have been resolved. When resolution fails (for
// example because the path does not exist) the last element of vaultPath
// itself is used instead.
func resolveVaultName(vaultPath string) string {
	if target, err := filepath.EvalSymlinks(vaultPath); err == nil {
		return filepath.Base(target)
	}
	return filepath.Base(vaultPath)
}

// safeFTSQuery prepares user input for use as an FTS5 MATCH expression.
//
// The query is trimmed; an empty result is returned as is. Input that looks
// like deliberate FTS5 syntax is passed through untouched:
//   - anything containing ':' (column-filter syntax such as "title:foo"), and
//   - anything containing a space-delimited AND, OR or NOT operator.
//
// FTS5 only recognises these operators in upper case, so the check is case
// sensitive: ordinary text such as "rock-n-roll and jazz" is NOT treated as an
// expression. Everything else is wrapped in double quotes (embedded quotes are
// doubled), turning it into a single phrase so that characters like '-' in
// "hello-world" cannot cause an FTS5 syntax error.
//
// Pass-through input is not validated: a malformed expression such as
// "foo:bar:" still reaches FTS5 and may be rejected there.
func safeFTSQuery(query string) string {
	q := strings.TrimSpace(query)
	if q == "" {
		return q
	}

	// Column-filter syntax: leave it to FTS5.
	if strings.Contains(q, ":") {
		return q
	}
	// Explicit boolean operators (upper case only, as FTS5 requires).
	for _, op := range []string{" AND ", " OR ", " NOT "} {
		if strings.Contains(q, op) {
			return q
		}
	}

	// Plain text: quote as one phrase, doubling any embedded double quotes.
	return `"` + strings.ReplaceAll(q, `"`, `""`) + `"`
}

// isFTSSyntaxError reports whether err looks like a complaint from the FTS5
// query parser. The decision is a case-insensitive substring test on the
// error message; a nil error is never a syntax error.
func isFTSSyntaxError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, marker := range []string{
		"syntax error",
		"no such column",
		"fts5: syntax error",
	} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}

// simplifyForLike extracts the first word-like token from query for use as a
// rel_path substring search term.
//
// A token is a maximal run of letters, digits, combining marks, '_' and '-'.
// Letters and digits are recognised in any script, so queries such as
// "résumé.pdf" or "日本語 メモ" yield "résumé" and "日本語" rather than being
// truncated or dropped. Characters before the first token are skipped and the
// scan stops at the first character after it, which keeps the result useful
// for input that carries FTS5 syntax such as "foo:bar:" (yielding "foo").
//
// Returns the empty string when query contains no token character at all.
func simplifyForLike(query string) string {
	var token strings.Builder
	for _, r := range strings.TrimSpace(query) {
		switch {
		case r == '_' || r == '-' ||
			unicode.IsLetter(r) || unicode.IsDigit(r) ||
			// Combining marks keep decomposed characters (e + U+0301) whole.
			unicode.IsMark(r):
			token.WriteRune(r)
		case token.Len() > 0:
			// First separator after the token: done.
			return token.String()
		}
	}
	return token.String()
}