fix(infra): audit_uses_functions detecta imports Python anidados y multilinea (0056)

El parser Python de audit_uses_functions solo reconocia "from <pkg> import X"
con un unico componente de paquete (regex \w+), por lo que:

- "from <pkg>.<subpkg> import X" (import anidado) no matcheaba y la funcion se
  reportaba como falso unused_in_app_md.
- Las listas multilinea con parentesis "from <pkg> import (\n a,\n b,\n)" no se
  parseaban (escaneo linea a linea).

Cambios:
- Regex acepta puntos en el paquete y bloques parentizados multilinea.
- Resolucion validada contra el directorio de paquete del registry derivado de
  file_path (no del campo domain: las funciones metabase viven en
  python/functions/metabase/ pero tienen domain=infra). Imports de librerias
  externas se ignoran -> sin falsos missing.
- parsePyImportedSymbols descarta comentarios "# noqa", maneja "as alias" y
  star imports (tratados como vacio, no soportados por diseno).
- auditFnMeta carga file_path; query SELECT anade file_path.

Tests (functions/infra/audit_uses_functions_test.go):
- TestAuditUsesFunctions_DetectsNestedImport (golden)
- TestAuditUsesFunctions_NoFalsePositiveOnNested (edge: nested + multilinea)
- TestAuditUsesFunctions_StarImport (error/edge: star import no cuenta)

Verificado con fn doctor uses-functions sobre apps reales: drift baja de 11/42 a
9/42. mail_manager (9 falsos por "from infra.X import Y") y demand_radar (3 por
lista multilinea) quedan en 0 drift. El residual de osint_db/osint_web es carga
dinamica via importlib, documentado como fuera de alcance.

audit_uses_functions v1.0.0 -> v1.1.0. CHANGELOG actualizado.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-30 13:10:31 +02:00
parent 5501507588
commit 9a7a874a76
4 changed files with 309 additions and 31 deletions
+112 -26
View File
@@ -30,6 +30,7 @@ type auditFnMeta struct {
domain string
lang string
signature string
filePath string // file_path as stored in registry.db (used to derive the Python package dir)
}
// skipDirs are directory names ignored when walking source for audits.
@@ -62,9 +63,11 @@ func auditShouldSkipDir(name string) bool { return auditSkipDirs[name] }
// searches the source for the exported symbol derived from each function name
// (snake_case → PascalCase) to achieve per-function granularity within a package.
//
// For Python apps it scans for "from <pkg> import X" patterns where <pkg> matches
// a known registry domain, then resolves X to a function ID by matching the name
// field in registry.db.
// For Python apps it scans for "from <pkg> import X" patterns where the root of
// <pkg> matches a registry Python package directory (derived from file_path),
// then resolves each imported symbol to a function ID by name within that package.
// Both flat ("from metabase import X") and nested ("from metabase.cards import X")
// imports are handled, as are parenthesised multi-line lists.
//
// Returns an error only if registry.db cannot be opened. Apps where dir_path
// does not exist on disk are reported with Missing/Unused = nil (cannot inspect).
@@ -80,15 +83,15 @@ func AuditUsesFunctions(registryRoot string) ([]UsesFunctionsAudit, error) {
return nil, fmt.Errorf("audit_uses_functions: ping db: %w", err)
}
// Load all Go/Python/TS functions from registry: id → name, domain, lang, signature.
rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, '') FROM functions WHERE lang IN ('go','py','ts')`)
// Load all Go/Python/TS functions from registry: id → name, domain, lang, signature, file_path.
rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, ''), COALESCE(file_path, '') FROM functions WHERE lang IN ('go','py','ts')`)
if err != nil {
return nil, fmt.Errorf("audit_uses_functions: query functions: %w", err)
}
allFunctions := make(map[string]auditFnMeta) // id → meta
for rows.Next() {
var m auditFnMeta
if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature); err != nil {
if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature, &m.filePath); err != nil {
continue
}
allFunctions[m.id] = m
@@ -341,16 +344,46 @@ func isIdentRune(r rune) bool {
}
// auditPyApp returns function IDs detected in the Python source of appDir.
// Looks for: "from <pkg> import X, Y" patterns and resolves X, Y to function IDs.
var pyFromImportRe = regexp.MustCompile(`from\s+(\w+)\s+import\s+(.+)`)
//
// It recognises "from <pkg> import X, Y" statements where <pkg> is the root of a
// registry package, resolving the imported symbols to function IDs. Both the flat
// form ("from metabase import metabase_get_card") and the nested form
// ("from metabase.cards import metabase_get_card") are handled: the root package
// (the component before the first dot) is validated against the registry's Python
// package directories and each symbol is resolved against the whole package, not
// just the named sub-module. Parenthesised multi-line import lists and trailing
// "# noqa" comments are supported.
//
// Resolution is scoped to the matched package: symbols imported from a package
// that is NOT a registry package directory (e.g. "from numpy import array") are
// ignored, so the audit never produces false "missing" hits for third-party libs.
//
// Star imports ("from <pkg> import *") are NOT supported and yield no symbols —
// star imports are discouraged in the registry; see the .md notes.
//
// The pattern accepts either a parenthesised block (which may span newlines) or
// the rest of a single line as the import list.
var pyFromImportRe = regexp.MustCompile(`from\s+([\w.]+)\s+import\s+(\([\s\S]*?\)|[^\n]+)`)
func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
// Build nameid map for py functions.
nameToID := make(map[string]string) // "metabase_auth" → "metabase_auth_py_infra"
// Build package-dir → (name → id) map for py functions. The package directory
// is the first path component under python/functions/, which is NOT always the
// function's registry domain (e.g. metabase functions live in
// python/functions/metabase/ but have domain=infra), so it is derived from
// file_path rather than the domain field.
pkgFuncs := make(map[string]map[string]string) // "infra" → {"imap_connect": "imap_connect_py_infra"}
for _, m := range all {
if m.lang == "py" {
nameToID[m.name] = m.id
if m.lang != "py" {
continue
}
pkg := pyPackageDir(m.filePath)
if pkg == "" {
continue
}
if pkgFuncs[pkg] == nil {
pkgFuncs[pkg] = make(map[string]string)
}
pkgFuncs[pkg][m.name] = m.id
}
usedSet := make(map[string]bool)
@@ -368,23 +401,25 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
if !strings.HasSuffix(path, ".py") {
return nil
}
f, err := os.Open(path)
data, err := os.ReadFile(path)
if err != nil {
return nil
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
if m := pyFromImportRe.FindStringSubmatch(line); m != nil {
// m[2] = "X, Y, Z" or "X"
names := strings.Split(m[2], ",")
for _, nm := range names {
nm = strings.TrimSpace(nm)
nm = strings.Fields(nm)[0] // strip "as alias"
if id, ok := nameToID[nm]; ok {
usedSet[id] = true
}
for _, m := range pyFromImportRe.FindAllStringSubmatch(string(data), -1) {
// Root package = component before the first dot. Handles both the flat
// ("metabase") and nested ("metabase.cards") import forms, plus relative
// imports ("from .config import X" → root is "" → skipped).
rootPkg := m[1]
if i := strings.IndexByte(rootPkg, '.'); i >= 0 {
rootPkg = rootPkg[:i]
}
funcs, ok := pkgFuncs[rootPkg]
if !ok {
continue
}
for _, sym := range parsePyImportedSymbols(m[2]) {
if id, ok := funcs[sym]; ok {
usedSet[id] = true
}
}
}
@@ -398,6 +433,57 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
return used
}
// pyPackageDir extracts the top-level package directory from the file_path of
// a registry Python function, i.e. the first path segment that follows
// "python/functions/".
//
// Example: "python/functions/metabase/cards.py" → "metabase".
//
// The path is normalised with filepath.ToSlash before matching. The empty
// string is returned when the path does not start with "python/functions/" or
// when nothing but a bare file name follows that prefix (no package segment).
func pyPackageDir(filePath string) string {
	const registryRoot = "python/functions/"

	normalized := filepath.ToSlash(filePath)
	if !strings.HasPrefix(normalized, registryRoot) {
		return ""
	}

	// Split off the first segment only; a single-element result means there
	// was no '/' after the prefix, hence no package directory.
	segments := strings.SplitN(normalized[len(registryRoot):], "/", 2)
	if len(segments) < 2 {
		return ""
	}
	return segments[0]
}
// parsePyImportedSymbols extracts the imported symbol names from the right-hand
// side of a Python "from X import <rhs>" statement.
//
// rhs is either the remainder of a single line ("a, b as c  # noqa") or a
// parenthesised list that may span several lines ("(\n a,\n b,\n)"). For each
// comma-separated entry only the original name is returned; an "as alias"
// rename is discarded. A bare "*" (star import) yields no symbols.
//
// On every line, everything from the first '#' (line comment) or ';' (start of
// another statement on the same line, as in "import a; a()") is dropped, and a
// trailing "\" line continuation is removed, so none of them leak into the
// result as bogus symbol names.
//
// The returned slice is nil when no symbols are found.
func parsePyImportedSymbols(rhs string) []string {
	var b strings.Builder
	for _, ln := range strings.Split(rhs, "\n") {
		// Neither '#' nor ';' can occur inside an import list, so the first
		// occurrence of either one ends the part of the line that matters.
		if i := strings.IndexAny(ln, "#;"); i >= 0 {
			ln = ln[:i]
		}
		// Drop an explicit line-continuation marker ("a, \").
		ln = strings.TrimSuffix(strings.TrimRight(ln, " \t\r"), `\`)
		b.WriteString(ln)
		b.WriteByte('\n')
	}

	// Unwrap the optional surrounding parentheses.
	s := strings.TrimSpace(b.String())
	s = strings.TrimPrefix(s, "(")
	s = strings.TrimSuffix(s, ")")

	var out []string
	for _, part := range strings.Split(s, ",") {
		fields := strings.Fields(part) // splits "foo as bar" → ["foo","as","bar"]
		if len(fields) == 0 {
			continue // empty entry, e.g. after a trailing comma
		}
		sym := strings.TrimSuffix(fields[0], ")") // safety for "a, b)" tails
		if sym == "" || sym == "*" {
			continue
		}
		out = append(out, sym)
	}
	return out
}
// snakeToPascal converts snake_case to PascalCase (Go exported name).
// E.g. "sqlite_open" → "SQLiteOpen", "http_json_response" → "HTTPJSONResponse".
// Common abbreviations are uppercased in full.