fix(infra): audit_uses_functions detecta imports Python anidados y multilinea (0056)

El parser Python de audit_uses_functions solo reconocia "from <pkg> import X"
con un unico componente de paquete (regex \w+), por lo que:

- "from <pkg>.<subpkg> import X" (import anidado) no matcheaba y la funcion se
  reportaba como falso unused_in_app_md.
- Las listas multilinea con parentesis "from <pkg> import (\n a,\n b,\n)" no se
  parseaban (escaneo linea a linea).

Cambios:
- Regex acepta puntos en el paquete y bloques parentizados multilinea.
- Resolucion validada contra el directorio de paquete del registry derivado de
  file_path (no del campo domain: las funciones metabase viven en
  python/functions/metabase/ pero tienen domain=infra). Imports de librerias
  externas se ignoran -> sin falsos missing.
- parsePyImportedSymbols descarta comentarios "# noqa", maneja "as alias" y
  star imports (tratados como vacio, no soportados por diseno).
- auditFnMeta carga file_path; query SELECT anade file_path.

Tests (functions/infra/audit_uses_functions_test.go):
- TestAuditUsesFunctions_DetectsNestedImport (golden)
- TestAuditUsesFunctions_NoFalsePositiveOnNested (edge: nested + multilinea)
- TestAuditUsesFunctions_StarImport (error/edge: star import no cuenta)

Verificado con fn doctor uses-functions sobre apps reales: drift baja de 11/42 a
9/42. mail_manager (9 falsos por "from infra.X import Y") y demand_radar (3 por
lista multilinea) quedan en 0 drift. El residual de osint_db/osint_web es carga
dinamica via importlib, documentado como fuera de alcance.

audit_uses_functions v1.0.0 -> v1.1.0. CHANGELOG actualizado.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-30 13:10:31 +02:00
parent 5501507588
commit 9a7a874a76
4 changed files with 309 additions and 31 deletions
+112 -26
View File
@@ -30,6 +30,7 @@ type auditFnMeta struct {
domain string
lang string
signature string
filePath string // file_path as stored in registry.db (used to derive the Python package dir)
}
// skipDirs are directory names ignored when walking source for audits.
@@ -62,9 +63,11 @@ func auditShouldSkipDir(name string) bool { return auditSkipDirs[name] }
// searches the source for the exported symbol derived from each function name
// (snake_case → PascalCase) to achieve per-function granularity within a package.
//
// For Python apps it scans for "from <pkg> import X" patterns where <pkg> matches
// a known registry domain, then resolves X to a function ID by matching the name
// field in registry.db.
// For Python apps it scans for "from <pkg> import X" patterns where the root of
// <pkg> matches a registry Python package directory (derived from file_path),
// then resolves each imported symbol to a function ID by name within that package.
// Both flat ("from metabase import X") and nested ("from metabase.cards import X")
// imports are handled, as are parenthesised multi-line lists.
//
// Returns an error only if registry.db cannot be opened. Apps where dir_path
// does not exist on disk are reported with Missing/Unused = nil (cannot inspect).
@@ -80,15 +83,15 @@ func AuditUsesFunctions(registryRoot string) ([]UsesFunctionsAudit, error) {
return nil, fmt.Errorf("audit_uses_functions: ping db: %w", err)
}
// Load all Go/Python/TS functions from registry: id → name, domain, lang, signature.
rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, '') FROM functions WHERE lang IN ('go','py','ts')`)
// Load all Go/Python/TS functions from registry: id → name, domain, lang, signature, file_path.
rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, ''), COALESCE(file_path, '') FROM functions WHERE lang IN ('go','py','ts')`)
if err != nil {
return nil, fmt.Errorf("audit_uses_functions: query functions: %w", err)
}
allFunctions := make(map[string]auditFnMeta) // id → meta
for rows.Next() {
var m auditFnMeta
if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature); err != nil {
if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature, &m.filePath); err != nil {
continue
}
allFunctions[m.id] = m
@@ -341,16 +344,46 @@ func isIdentRune(r rune) bool {
}
// auditPyApp returns function IDs detected in the Python source of appDir.
// Looks for: "from <pkg> import X, Y" patterns and resolves X, Y to function IDs.
var pyFromImportRe = regexp.MustCompile(`from\s+(\w+)\s+import\s+(.+)`)
//
// It recognises "from <pkg> import X, Y" statements where <pkg> is the root of a
// registry package, resolving the imported symbols to function IDs. Both the flat
// form ("from metabase import metabase_get_card") and the nested form
// ("from metabase.cards import metabase_get_card") are handled: the root package
// (the component before the first dot) is validated against the registry's Python
// package directories and each symbol is resolved against the whole package, not
// just the named sub-module. Parenthesised multi-line import lists and trailing
// "# noqa" comments are supported.
//
// Resolution is scoped to the matched package: symbols imported from a package
// that is NOT a registry package directory (e.g. "from numpy import array") are
// ignored, so the audit never produces false "missing" hits for third-party libs.
//
// Star imports ("from <pkg> import *") are NOT supported and yield no symbols —
// star imports are discouraged in the registry; see the .md notes.
//
// The pattern accepts either a parenthesised block (which may span newlines) or
// the rest of a single line as the import list.
var pyFromImportRe = regexp.MustCompile(`from\s+([\w.]+)\s+import\s+(\([\s\S]*?\)|[^\n]+)`)
func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
// Build nameid map for py functions.
nameToID := make(map[string]string) // "metabase_auth" → "metabase_auth_py_infra"
// Build package-dir → (nameid) map for py functions. The package directory
// is the first path component under python/functions/, which is NOT always the
// function's registry domain (e.g. metabase functions live in
// python/functions/metabase/ but have domain=infra), so it is derived from
// file_path rather than the domain field.
pkgFuncs := make(map[string]map[string]string) // "infra" → {"imap_connect": "imap_connect_py_infra"}
for _, m := range all {
if m.lang == "py" {
nameToID[m.name] = m.id
if m.lang != "py" {
continue
}
pkg := pyPackageDir(m.filePath)
if pkg == "" {
continue
}
if pkgFuncs[pkg] == nil {
pkgFuncs[pkg] = make(map[string]string)
}
pkgFuncs[pkg][m.name] = m.id
}
usedSet := make(map[string]bool)
@@ -368,23 +401,25 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
if !strings.HasSuffix(path, ".py") {
return nil
}
f, err := os.Open(path)
data, err := os.ReadFile(path)
if err != nil {
return nil
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := strings.TrimSpace(sc.Text())
if m := pyFromImportRe.FindStringSubmatch(line); m != nil {
// m[2] = "X, Y, Z" or "X"
names := strings.Split(m[2], ",")
for _, nm := range names {
nm = strings.TrimSpace(nm)
nm = strings.Fields(nm)[0] // strip "as alias"
if id, ok := nameToID[nm]; ok {
usedSet[id] = true
}
for _, m := range pyFromImportRe.FindAllStringSubmatch(string(data), -1) {
// Root package = component before the first dot. Handles both the flat
// ("metabase") and nested ("metabase.cards") import forms, plus relative
// imports ("from .config import X" → root is "" → skipped).
rootPkg := m[1]
if i := strings.IndexByte(rootPkg, '.'); i >= 0 {
rootPkg = rootPkg[:i]
}
funcs, ok := pkgFuncs[rootPkg]
if !ok {
continue
}
for _, sym := range parsePyImportedSymbols(m[2]) {
if id, ok := funcs[sym]; ok {
usedSet[id] = true
}
}
}
@@ -398,6 +433,57 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string {
return used
}
// pyPackageDir returns the top-level package directory of a registry Python
// function from its file_path. For "python/functions/metabase/cards.py" it
// returns "metabase". Returns "" when the path is not under python/functions/
// or has no package component.
func pyPackageDir(filePath string) string {
const prefix = "python/functions/"
fp := filepath.ToSlash(filePath)
if !strings.HasPrefix(fp, prefix) {
return ""
}
rest := fp[len(prefix):]
if i := strings.IndexByte(rest, '/'); i >= 0 {
return rest[:i]
}
return ""
}
// parsePyImportedSymbols extracts the imported symbol names from the right-hand
// side of a Python "from X import <rhs>" statement. It handles single-line lists,
// parenthesised multi-line lists, "# ..." line comments and "as alias" renames.
// A bare "*" (star import) yields no symbols.
func parsePyImportedSymbols(rhs string) []string {
// Drop trailing line comments so "import foo # noqa" and
// "import ( # noqa\n a,\n)" don't pollute symbol parsing.
var b strings.Builder
for _, ln := range strings.Split(rhs, "\n") {
if i := strings.IndexByte(ln, '#'); i >= 0 {
ln = ln[:i]
}
b.WriteString(ln)
b.WriteByte('\n')
}
s := strings.TrimSpace(b.String())
s = strings.TrimPrefix(s, "(")
s = strings.TrimSuffix(s, ")")
var out []string
for _, part := range strings.Split(s, ",") {
fields := strings.Fields(part) // splits "foo as bar" → ["foo","as","bar"]
if len(fields) == 0 {
continue
}
sym := strings.TrimSuffix(fields[0], ")") // safety for "a, b)" tails
if sym == "" || sym == "*" {
continue
}
out = append(out, sym)
}
return out
}
// snakeToPascal converts snake_case to PascalCase (Go exported name).
// E.g. "sqlite_open" → "SQLiteOpen", "http_json_response" → "HTTPJSONResponse".
// Common abbreviations are uppercased in full.
+22 -5
View File
@@ -3,7 +3,7 @@ name: audit_uses_functions
kind: function
lang: go
domain: infra
version: "1.0.0"
version: "1.1.0"
purity: impure
signature: "func AuditUsesFunctions(registryRoot string) ([]UsesFunctionsAudit, error)"
description: "Audita el campo uses_functions de cada app Go y Python registrada en registry.db comparandolo contra los imports reales del codigo fuente. Reporta funciones del registry importadas pero no declaradas (missing_in_app_md) y funciones declaradas pero no detectadas en el codigo (unused_in_app_md). Read-only: no modifica archivos ni la BD."
@@ -23,6 +23,9 @@ tests:
- "missing function detected for Go app"
- "unused function detected for Go app"
- "missing dir returns entry with nil slices"
- "TestAuditUsesFunctions_DetectsNestedImport"
- "TestAuditUsesFunctions_NoFalsePositiveOnNested"
- "TestAuditUsesFunctions_StarImport"
test_file_path: "functions/infra/audit_uses_functions_test.go"
file_path: "functions/infra/audit_uses_functions.go"
---
@@ -55,12 +58,26 @@ Si el nombre exportado real difiere de la convencion (ej. alias de paquete, re-e
## Heuristica Python
Busca `from <pkg> import X, Y` en `.py` de la app. Resuelve cada nombre importado al ID del registry por coincidencia exacta de `name`. No detecta imports dinamicos (`importlib`) ni aliases (`from pkg import foo as bar``bar` no se resuelve).
Busca sentencias `from <pkg> import X, Y` en los `.py` de la app y resuelve cada simbolo importado a su ID del registry:
1. **Paquete raiz**: toma el componente anterior al primer punto de `<pkg>` (`metabase.cards``metabase`). Solo procesa el import si ese paquete raiz es un directorio de paquete Python del registry (derivado de `file_path`, primer componente bajo `python/functions/`). Imports de librerias externas (`from numpy import array`) se ignoran, evitando falsos `missing`.
2. **Imports anidados**: `from metabase.cards import metabase_get_card` resuelve igual que `from metabase import metabase_get_card`. El simbolo se busca en TODO el paquete (`metabase`), no solo en el submodulo nombrado.
3. **Listas multilinea con parentesis**: `from datascience import (\n foo,\n bar,\n)` se parsea entero.
4. **Aliases y comentarios**: `from pkg import foo as bar # noqa` resuelve la funcion importada (`foo`); el alias local y el comentario se descartan.
El directorio de paquete se deriva de `file_path`, NO del campo `domain`: p.ej. las funciones `metabase` viven en `python/functions/metabase/` pero tienen `domain=infra`.
**No soportado** (fuera de alcance, issue 0056):
- `from <pkg> import *` (star import): se trata como vacio (no cuenta como uso). El registry desaconseja star imports.
- Carga dinamica con `importlib.util.spec_from_file_location(...)` o `import pkg` + `pkg.func()`: no son sentencias `from ... import` estaticas y no se detectan (causa el drift residual en apps como `osint_db`/`osint_web` que cargan funciones via wrapper dinamico).
## Notas
- Read-only: no toca la BD ni archivos.
- Apps cuyo `dir_path` no existe en disco se incluyen con `Missing = nil, Unused = nil` (no se puede inspeccionar el codigo).
- Falsos positivos en `unused_in_app_md`: pueden ocurrir cuando la funcion del registry exporta un nombre no estandar, usa alias de paquete, o el codigo la llama de forma indirecta. Confirmar a mano antes de eliminar de `uses_functions`.
- Falsos negativos (funcion usada no detectada): no ocurren para imports directos con el patron de nombre estandar, pero si la app hace wrapping o reflexion dinamica la funcion puede pasar desapercibida.
- Python: solo detecta `from pkg import X`. Los `import pkg` seguidos de `pkg.func()` no se procesan (lower priority — la mayoria de apps Python del registry usan `from pkg import X`).
- Falsos positivos en `unused_in_app_md`: pueden ocurrir cuando la funcion del registry exporta un nombre no estandar (Go), o cuando una app Python la carga de forma dinamica (`importlib`). Confirmar a mano antes de eliminar de `uses_functions`.
- Falsos negativos (funcion usada no detectada): no ocurren para imports estaticos Python (`from pkg[.sub] import X`, incluido multilinea) ni para imports Go con el patron de nombre estandar, pero si la app hace wrapping o reflexion dinamica la funcion puede pasar desapercibida.
## Capability growth log
- v1.1.0 (2026-06-30) — el parser Python detecta imports anidados (`from pkg.subpkg import X`) y listas multilinea con parentesis; resolucion validada contra el directorio de paquete del registry (derivado de `file_path`), eliminando falsos `unused` en apps que usaban esos patrones (issue 0056).
@@ -57,6 +57,177 @@ INSERT INTO functions (id, name, domain, lang, file_path)
}
}
// insertTestFunctions appends extra functions to the test registry.db created by
// createTestRegistryDB. Used by the Python import tests, which need py functions
// whose file_path maps to a real package directory under python/functions/.
func insertTestFunctions(t *testing.T, root string, fns []struct {
id, name, domain, lang, filePath string
}) {
t.Helper()
db, err := sql.Open("sqlite3", filepath.Join(root, "registry.db"))
if err != nil {
t.Fatal(err)
}
defer db.Close()
for _, f := range fns {
if _, err := db.Exec(
`INSERT INTO functions (id, name, domain, lang, file_path) VALUES (?,?,?,?,?)`,
f.id, f.name, f.domain, f.lang, f.filePath,
); err != nil {
t.Fatalf("insert fn %s: %v", f.id, err)
}
}
}
// writePyApp creates a Python app directory with a single source file.
func writePyApp(t *testing.T, root, dirPath, src string) {
t.Helper()
appDir := filepath.Join(root, dirPath)
if err := os.MkdirAll(appDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(appDir, "main.py"), []byte(src), 0644); err != nil {
t.Fatal(err)
}
}
// containsID reports whether ids contains target.
func containsID(ids []string, target string) bool {
for _, id := range ids {
if id == target {
return true
}
}
return false
}
// TestAuditUsesFunctions_DetectsNestedImport verifies that a nested import
// ("from metabase.cards import metabase_get_card") resolves to its function ID.
// The app declares no uses_functions, so the detected import surfaces as Missing.
func TestAuditUsesFunctions_DetectsNestedImport(t *testing.T) {
root := t.TempDir()
createTestRegistryDB(t, root, []struct {
id, lang, dirPath, usesFunctions string
}{
{"nestedapp_py_tools", "py", "apps/nestedapp", `[]`},
})
insertTestFunctions(t, root, []struct {
id, name, domain, lang, filePath string
}{
{"metabase_get_card_py_infra", "metabase_get_card", "infra", "py", "python/functions/metabase/cards.py"},
})
writePyApp(t, root, "apps/nestedapp", `import sys
from metabase.cards import metabase_get_card # noqa: E402
def run():
return metabase_get_card(1)
`)
results, err := AuditUsesFunctions(root)
if err != nil {
t.Fatalf("AuditUsesFunctions: %v", err)
}
if len(results) != 1 {
t.Fatalf("expected 1 result, got %d", len(results))
}
got := results[0]
if !containsID(got.Missing, "metabase_get_card_py_infra") {
t.Errorf("nested import not detected: Missing = %v, want to contain metabase_get_card_py_infra", got.Missing)
}
if len(got.Unused) != 0 {
t.Errorf("Unused = %v, want []", got.Unused)
}
}
// TestAuditUsesFunctions_NoFalsePositiveOnNested verifies that when an app
// imports nested + multi-line parenthesised lists and declares them all in
// uses_functions, no function is reported as unused (the core regression fixed
// by this issue: false "unused" hits for nested/multi-line imports).
func TestAuditUsesFunctions_NoFalsePositiveOnNested(t *testing.T) {
root := t.TempDir()
createTestRegistryDB(t, root, []struct {
id, lang, dirPath, usesFunctions string
}{
{"nofp_py_tools", "py", "apps/nofp",
`["imap_connect_py_infra","smtp_send_py_infra","fetch_reddit_search_py_datascience","score_demand_signal_py_datascience"]`},
})
insertTestFunctions(t, root, []struct {
id, name, domain, lang, filePath string
}{
{"imap_connect_py_infra", "imap_connect", "infra", "py", "python/functions/infra/imap_connect.py"},
{"smtp_send_py_infra", "smtp_send", "infra", "py", "python/functions/infra/smtp_send.py"},
{"fetch_reddit_search_py_datascience", "fetch_reddit_search", "datascience", "py", "python/functions/datascience/fetch_reddit_search.py"},
{"score_demand_signal_py_datascience", "score_demand_signal", "datascience", "py", "python/functions/datascience/score_demand_signal.py"},
})
// Nested imports + a parenthesised multi-line list — both previously missed.
writePyApp(t, root, "apps/nofp", `import sys
from infra.imap_connect import imap_connect # noqa: E402
from infra.smtp_send import smtp_send, SMTPConfigPy # noqa: E402
from datascience import ( # noqa: E402
fetch_reddit_search,
score_demand_signal,
)
def run():
return imap_connect, smtp_send, fetch_reddit_search, score_demand_signal
`)
results, err := AuditUsesFunctions(root)
if err != nil {
t.Fatalf("AuditUsesFunctions: %v", err)
}
if len(results) != 1 {
t.Fatalf("expected 1 result, got %d", len(results))
}
got := results[0]
if len(got.Unused) != 0 {
t.Errorf("false positive unused detected: Unused = %v, want []", got.Unused)
}
if len(got.Missing) != 0 {
t.Errorf("Missing = %v, want []", got.Missing)
}
}
// TestAuditUsesFunctions_StarImport documents that star imports
// ("from <pkg> import *") are NOT treated as using any function: a declared
// function not otherwise referenced is reported as unused.
func TestAuditUsesFunctions_StarImport(t *testing.T) {
root := t.TempDir()
createTestRegistryDB(t, root, []struct {
id, lang, dirPath, usesFunctions string
}{
{"starapp_py_tools", "py", "apps/starapp", `["filter_list_py_core"]`},
})
insertTestFunctions(t, root, []struct {
id, name, domain, lang, filePath string
}{
{"filter_list_py_core", "filter_list", "core", "py", "python/functions/core/core.py"},
})
writePyApp(t, root, "apps/starapp", `from core import *
def run():
return None
`)
results, err := AuditUsesFunctions(root)
if err != nil {
t.Fatalf("AuditUsesFunctions: %v", err)
}
if len(results) != 1 {
t.Fatalf("expected 1 result, got %d", len(results))
}
got := results[0]
if !containsID(got.Unused, "filter_list_py_core") {
t.Errorf("star import should not count as usage: Unused = %v, want to contain filter_list_py_core", got.Unused)
}
if len(got.Missing) != 0 {
t.Errorf("Missing = %v, want []", got.Missing)
}
}
// TestAuditUsesFunctions_DetectsMissing verifies that a Go app that calls
// RandomHexID in its source but declares empty uses_functions gets
// random_hex_id_go_core reported as missing.