From 9a7a874a7682263d8053e1c6b1b01bf4a300780b Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 13:10:31 +0200 Subject: [PATCH] fix(infra): audit_uses_functions detecta imports Python anidados y multilinea (0056) El parser Python de audit_uses_functions solo reconocia "from import X" con un unico componente de paquete (regex \w+), por lo que: - "from . import X" (import anidado) no matcheaba y la funcion se reportaba como falso unused_in_app_md. - Las listas multilinea con parentesis "from import (\n a,\n b,\n)" no se parseaban (escaneo linea a linea). Cambios: - Regex acepta puntos en el paquete y bloques parentizados multilinea. - Resolucion validada contra el directorio de paquete del registry derivado de file_path (no del campo domain: las funciones metabase viven en python/functions/metabase/ pero tienen domain=infra). Imports de librerias externas se ignoran -> sin falsos missing. - parsePyImportedSymbols descarta comentarios "# noqa", maneja "as alias" y star imports (tratados como vacio, no soportados por diseno). - auditFnMeta carga file_path; query SELECT anade file_path. Tests (functions/infra/audit_uses_functions_test.go): - TestAuditUsesFunctions_DetectsNestedImport (golden) - TestAuditUsesFunctions_NoFalsePositiveOnNested (edge: nested + multilinea) - TestAuditUsesFunctions_StarImport (error/edge: star import no cuenta) Verificado con fn doctor uses-functions sobre apps reales: drift baja de 11/42 a 9/42. mail_manager (9 falsos por "from infra.X import Y") y demand_radar (3 por lista multilinea) quedan en 0 drift. El residual de osint_db/osint_web es carga dinamica via importlib, documentado como fuera de alcance. audit_uses_functions v1.0.0 -> v1.1.0. CHANGELOG actualizado. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 4 + functions/infra/audit_uses_functions.go | 138 ++++++++++++--- functions/infra/audit_uses_functions.md | 27 ++- functions/infra/audit_uses_functions_test.go | 171 +++++++++++++++++++ 4 files changed, 309 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15379d07..aa9ac087 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ Para contexto detallado del trabajo diario ver `docs/diary/`. Para decisiones ar ## [Unreleased] +### Fixed + +- **`audit_uses_functions` detecta imports Python anidados y multilinea** (issue 0056) — el parser Python ahora reconoce `from . import X` (antes la regex `\w+` rompia ante el punto y la funcion se reportaba como falso `unused_in_app_md`) y listas multilinea con parentesis `from import (\n a,\n b,\n)`. La resolucion se valida contra el directorio de paquete del registry derivado de `file_path` (no del campo `domain`: las funciones `metabase` viven en `python/functions/metabase/` pero tienen `domain=infra`), e ignora imports de librerias externas. Aliases (`as`) y comentarios (`# noqa`) se descartan. Star imports (`from pkg import *`) y carga dinamica (`importlib`) quedan documentados como no soportados. Verificado: `fn doctor uses-functions` baja de 11/42 a 9/42 apps con drift — `mail_manager` (9 falsos positivos por `from infra.X import Y`) y `demand_radar` (3 por lista multilinea `from datascience import (...)`) quedan en 0 drift; el residual de `osint_db`/`osint_web` es carga dinamica via wrapper, fuera de alcance. `audit_uses_functions` v1.0.0 → v1.1.0. + ## 2026-05-17 ### Added diff --git a/functions/infra/audit_uses_functions.go b/functions/infra/audit_uses_functions.go index 79715702..c57deccd 100644 --- a/functions/infra/audit_uses_functions.go +++ b/functions/infra/audit_uses_functions.go @@ -30,6 +30,7 @@ type auditFnMeta struct { domain string lang string signature string + filePath string // file_path as stored in registry.db (used to derive the Python package dir) } // skipDirs are directory names ignored when walking source for audits. @@ -62,9 +63,11 @@ func auditShouldSkipDir(name string) bool { return auditSkipDirs[name] } // searches the source for the exported symbol derived from each function name // (snake_case → PascalCase) to achieve per-function granularity within a package. // -// For Python apps it scans for "from import X" patterns where matches -// a known registry domain, then resolves X to a function ID by matching the name -// field in registry.db. +// For Python apps it scans for "from import X" patterns where the root of +// matches a registry Python package directory (derived from file_path), +// then resolves each imported symbol to a function ID by name within that package. +// Both flat ("from metabase import X") and nested ("from metabase.cards import X") +// imports are handled, as are parenthesised multi-line lists. // // Returns an error only if registry.db cannot be opened. Apps where dir_path // does not exist on disk are reported with Missing/Unused = nil (cannot inspect). @@ -80,15 +83,15 @@ func AuditUsesFunctions(registryRoot string) ([]UsesFunctionsAudit, error) { return nil, fmt.Errorf("audit_uses_functions: ping db: %w", err) } - // Load all Go/Python/TS functions from registry: id → name, domain, lang, signature. - rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, '') FROM functions WHERE lang IN ('go','py','ts')`) + // Load all Go/Python/TS functions from registry: id → name, domain, lang, signature, file_path. + rows, err := db.Query(`SELECT id, name, domain, lang, COALESCE(signature, ''), COALESCE(file_path, '') FROM functions WHERE lang IN ('go','py','ts')`) if err != nil { return nil, fmt.Errorf("audit_uses_functions: query functions: %w", err) } allFunctions := make(map[string]auditFnMeta) // id → meta for rows.Next() { var m auditFnMeta - if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature); err != nil { + if err := rows.Scan(&m.id, &m.name, &m.domain, &m.lang, &m.signature, &m.filePath); err != nil { continue } allFunctions[m.id] = m @@ -341,16 +344,46 @@ func isIdentRune(r rune) bool { } // auditPyApp returns function IDs detected in the Python source of appDir. -// Looks for: "from import X, Y" patterns and resolves X, Y to function IDs. -var pyFromImportRe = regexp.MustCompile(`from\s+(\w+)\s+import\s+(.+)`) +// +// It recognises "from import X, Y" statements where is the root of a +// registry package, resolving the imported symbols to function IDs. Both the flat +// form ("from metabase import metabase_get_card") and the nested form +// ("from metabase.cards import metabase_get_card") are handled: the root package +// (the component before the first dot) is validated against the registry's Python +// package directories and each symbol is resolved against the whole package, not +// just the named sub-module. Parenthesised multi-line import lists and trailing +// "# noqa" comments are supported. +// +// Resolution is scoped to the matched package: symbols imported from a package +// that is NOT a registry package directory (e.g. "from numpy import array") are +// ignored, so the audit never produces false "missing" hits for third-party libs. +// +// Star imports ("from import *") are NOT supported and yield no symbols — +// star imports are discouraged in the registry; see the .md notes. +// +// The pattern accepts either a parenthesised block (which may span newlines) or +// the rest of a single line as the import list. +var pyFromImportRe = regexp.MustCompile(`from\s+([\w.]+)\s+import\s+(\([\s\S]*?\)|[^\n]+)`) func auditPyApp(appDir string, all map[string]auditFnMeta) []string { - // Build name→id map for py functions. - nameToID := make(map[string]string) // "metabase_auth" → "metabase_auth_py_infra" + // Build package-dir → (name → id) map for py functions. The package directory + // is the first path component under python/functions/, which is NOT always the + // function's registry domain (e.g. metabase functions live in + // python/functions/metabase/ but have domain=infra), so it is derived from + // file_path rather than the domain field. + pkgFuncs := make(map[string]map[string]string) // "infra" → {"imap_connect": "imap_connect_py_infra"} for _, m := range all { - if m.lang == "py" { - nameToID[m.name] = m.id + if m.lang != "py" { + continue } + pkg := pyPackageDir(m.filePath) + if pkg == "" { + continue + } + if pkgFuncs[pkg] == nil { + pkgFuncs[pkg] = make(map[string]string) + } + pkgFuncs[pkg][m.name] = m.id } usedSet := make(map[string]bool) @@ -368,23 +401,25 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string { if !strings.HasSuffix(path, ".py") { return nil } - f, err := os.Open(path) + data, err := os.ReadFile(path) if err != nil { return nil } - defer f.Close() - sc := bufio.NewScanner(f) - for sc.Scan() { - line := strings.TrimSpace(sc.Text()) - if m := pyFromImportRe.FindStringSubmatch(line); m != nil { - // m[2] = "X, Y, Z" or "X" - names := strings.Split(m[2], ",") - for _, nm := range names { - nm = strings.TrimSpace(nm) - nm = strings.Fields(nm)[0] // strip "as alias" - if id, ok := nameToID[nm]; ok { - usedSet[id] = true - } + for _, m := range pyFromImportRe.FindAllStringSubmatch(string(data), -1) { + // Root package = component before the first dot. Handles both the flat + // ("metabase") and nested ("metabase.cards") import forms, plus relative + // imports ("from .config import X" → root is "" → skipped). + rootPkg := m[1] + if i := strings.IndexByte(rootPkg, '.'); i >= 0 { + rootPkg = rootPkg[:i] + } + funcs, ok := pkgFuncs[rootPkg] + if !ok { + continue + } + for _, sym := range parsePyImportedSymbols(m[2]) { + if id, ok := funcs[sym]; ok { + usedSet[id] = true } } } @@ -398,6 +433,57 @@ func auditPyApp(appDir string, all map[string]auditFnMeta) []string { return used } +// pyPackageDir returns the top-level package directory of a registry Python +// function from its file_path. For "python/functions/metabase/cards.py" it +// returns "metabase". Returns "" when the path is not under python/functions/ +// or has no package component. +func pyPackageDir(filePath string) string { + const prefix = "python/functions/" + fp := filepath.ToSlash(filePath) + if !strings.HasPrefix(fp, prefix) { + return "" + } + rest := fp[len(prefix):] + if i := strings.IndexByte(rest, '/'); i >= 0 { + return rest[:i] + } + return "" +} + +// parsePyImportedSymbols extracts the imported symbol names from the right-hand +// side of a Python "from X import " statement. It handles single-line lists, +// parenthesised multi-line lists, "# ..." line comments and "as alias" renames. +// A bare "*" (star import) yields no symbols. +func parsePyImportedSymbols(rhs string) []string { + // Drop trailing line comments so "import foo # noqa" and + // "import ( # noqa\n a,\n)" don't pollute symbol parsing. + var b strings.Builder + for _, ln := range strings.Split(rhs, "\n") { + if i := strings.IndexByte(ln, '#'); i >= 0 { + ln = ln[:i] + } + b.WriteString(ln) + b.WriteByte('\n') + } + s := strings.TrimSpace(b.String()) + s = strings.TrimPrefix(s, "(") + s = strings.TrimSuffix(s, ")") + + var out []string + for _, part := range strings.Split(s, ",") { + fields := strings.Fields(part) // splits "foo as bar" → ["foo","as","bar"] + if len(fields) == 0 { + continue + } + sym := strings.TrimSuffix(fields[0], ")") // safety for "a, b)" tails + if sym == "" || sym == "*" { + continue + } + out = append(out, sym) + } + return out +} + // snakeToPascal converts snake_case to PascalCase (Go exported name). // E.g. "sqlite_open" → "SQLiteOpen", "http_json_response" → "HTTPJSONResponse". // Common abbreviations are uppercased in full. diff --git a/functions/infra/audit_uses_functions.md b/functions/infra/audit_uses_functions.md index a08b8a02..408de899 100644 --- a/functions/infra/audit_uses_functions.md +++ b/functions/infra/audit_uses_functions.md @@ -3,7 +3,7 @@ name: audit_uses_functions kind: function lang: go domain: infra -version: "1.0.0" +version: "1.1.0" purity: impure signature: "func AuditUsesFunctions(registryRoot string) ([]UsesFunctionsAudit, error)" description: "Audita el campo uses_functions de cada app Go y Python registrada en registry.db comparandolo contra los imports reales del codigo fuente. Reporta funciones del registry importadas pero no declaradas (missing_in_app_md) y funciones declaradas pero no detectadas en el codigo (unused_in_app_md). Read-only: no modifica archivos ni la BD." @@ -23,6 +23,9 @@ tests: - "missing function detected for Go app" - "unused function detected for Go app" - "missing dir returns entry with nil slices" + - "TestAuditUsesFunctions_DetectsNestedImport" + - "TestAuditUsesFunctions_NoFalsePositiveOnNested" + - "TestAuditUsesFunctions_StarImport" test_file_path: "functions/infra/audit_uses_functions_test.go" file_path: "functions/infra/audit_uses_functions.go" --- @@ -55,12 +58,26 @@ Si el nombre exportado real difiere de la convencion (ej. alias de paquete, re-e ## Heuristica Python -Busca `from import X, Y` en `.py` de la app. Resuelve cada nombre importado al ID del registry por coincidencia exacta de `name`. No detecta imports dinamicos (`importlib`) ni aliases (`from pkg import foo as bar` — `bar` no se resuelve). +Busca sentencias `from import X, Y` en los `.py` de la app y resuelve cada simbolo importado a su ID del registry: + +1. **Paquete raiz**: toma el componente anterior al primer punto de `` (`metabase.cards` → `metabase`). Solo procesa el import si ese paquete raiz es un directorio de paquete Python del registry (derivado de `file_path`, primer componente bajo `python/functions/`). Imports de librerias externas (`from numpy import array`) se ignoran, evitando falsos `missing`. +2. **Imports anidados**: `from metabase.cards import metabase_get_card` resuelve igual que `from metabase import metabase_get_card`. El simbolo se busca en TODO el paquete (`metabase`), no solo en el submodulo nombrado. +3. **Listas multilinea con parentesis**: `from datascience import (\n foo,\n bar,\n)` se parsea entero. +4. **Aliases y comentarios**: `from pkg import foo as bar # noqa` resuelve la funcion importada (`foo`); el alias local y el comentario se descartan. + +El directorio de paquete se deriva de `file_path`, NO del campo `domain`: p.ej. las funciones `metabase` viven en `python/functions/metabase/` pero tienen `domain=infra`. + +**No soportado** (fuera de alcance, issue 0056): +- `from import *` (star import): se trata como vacio (no cuenta como uso). El registry desaconseja star imports. +- Carga dinamica con `importlib.util.spec_from_file_location(...)` o `import pkg` + `pkg.func()`: no son sentencias `from ... import` estaticas y no se detectan (causa el drift residual en apps como `osint_db`/`osint_web` que cargan funciones via wrapper dinamico). ## Notas - Read-only: no toca la BD ni archivos. - Apps cuyo `dir_path` no existe en disco se incluyen con `Missing = nil, Unused = nil` (no se puede inspeccionar el codigo). -- Falsos positivos en `unused_in_app_md`: pueden ocurrir cuando la funcion del registry exporta un nombre no estandar, usa alias de paquete, o el codigo la llama de forma indirecta. Confirmar a mano antes de eliminar de `uses_functions`. -- Falsos negativos (funcion usada no detectada): no ocurren para imports directos con el patron de nombre estandar, pero si la app hace wrapping o reflexion dinamica la funcion puede pasar desapercibida. -- Python: solo detecta `from pkg import X`. Los `import pkg` seguidos de `pkg.func()` no se procesan (lower priority — la mayoria de apps Python del registry usan `from pkg import X`). +- Falsos positivos en `unused_in_app_md`: pueden ocurrir cuando la funcion del registry exporta un nombre no estandar (Go), o cuando una app Python la carga de forma dinamica (`importlib`). Confirmar a mano antes de eliminar de `uses_functions`. +- Falsos negativos (funcion usada no detectada): no ocurren para imports estaticos Python (`from pkg[.sub] import X`, incluido multilinea) ni para imports Go con el patron de nombre estandar, pero si la app hace wrapping o reflexion dinamica la funcion puede pasar desapercibida. + +## Capability growth log + +- v1.1.0 (2026-06-30) — el parser Python detecta imports anidados (`from pkg.subpkg import X`) y listas multilinea con parentesis; resolucion validada contra el directorio de paquete del registry (derivado de `file_path`), eliminando falsos `unused` en apps que usaban esos patrones (issue 0056). diff --git a/functions/infra/audit_uses_functions_test.go b/functions/infra/audit_uses_functions_test.go index 4a7132a4..dd711393 100644 --- a/functions/infra/audit_uses_functions_test.go +++ b/functions/infra/audit_uses_functions_test.go @@ -57,6 +57,177 @@ INSERT INTO functions (id, name, domain, lang, file_path) } } +// insertTestFunctions appends extra functions to the test registry.db created by +// createTestRegistryDB. Used by the Python import tests, which need py functions +// whose file_path maps to a real package directory under python/functions/. +func insertTestFunctions(t *testing.T, root string, fns []struct { + id, name, domain, lang, filePath string +}) { + t.Helper() + db, err := sql.Open("sqlite3", filepath.Join(root, "registry.db")) + if err != nil { + t.Fatal(err) + } + defer db.Close() + for _, f := range fns { + if _, err := db.Exec( + `INSERT INTO functions (id, name, domain, lang, file_path) VALUES (?,?,?,?,?)`, + f.id, f.name, f.domain, f.lang, f.filePath, + ); err != nil { + t.Fatalf("insert fn %s: %v", f.id, err) + } + } +} + +// writePyApp creates a Python app directory with a single source file. +func writePyApp(t *testing.T, root, dirPath, src string) { + t.Helper() + appDir := filepath.Join(root, dirPath) + if err := os.MkdirAll(appDir, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(appDir, "main.py"), []byte(src), 0644); err != nil { + t.Fatal(err) + } +} + +// containsID reports whether ids contains target. +func containsID(ids []string, target string) bool { + for _, id := range ids { + if id == target { + return true + } + } + return false +} + +// TestAuditUsesFunctions_DetectsNestedImport verifies that a nested import +// ("from metabase.cards import metabase_get_card") resolves to its function ID. +// The app declares no uses_functions, so the detected import surfaces as Missing. +func TestAuditUsesFunctions_DetectsNestedImport(t *testing.T) { + root := t.TempDir() + createTestRegistryDB(t, root, []struct { + id, lang, dirPath, usesFunctions string + }{ + {"nestedapp_py_tools", "py", "apps/nestedapp", `[]`}, + }) + insertTestFunctions(t, root, []struct { + id, name, domain, lang, filePath string + }{ + {"metabase_get_card_py_infra", "metabase_get_card", "infra", "py", "python/functions/metabase/cards.py"}, + }) + + writePyApp(t, root, "apps/nestedapp", `import sys +from metabase.cards import metabase_get_card # noqa: E402 + +def run(): + return metabase_get_card(1) +`) + + results, err := AuditUsesFunctions(root) + if err != nil { + t.Fatalf("AuditUsesFunctions: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + got := results[0] + if !containsID(got.Missing, "metabase_get_card_py_infra") { + t.Errorf("nested import not detected: Missing = %v, want to contain metabase_get_card_py_infra", got.Missing) + } + if len(got.Unused) != 0 { + t.Errorf("Unused = %v, want []", got.Unused) + } +} + +// TestAuditUsesFunctions_NoFalsePositiveOnNested verifies that when an app +// imports nested + multi-line parenthesised lists and declares them all in +// uses_functions, no function is reported as unused (the core regression fixed +// by this issue: false "unused" hits for nested/multi-line imports). +func TestAuditUsesFunctions_NoFalsePositiveOnNested(t *testing.T) { + root := t.TempDir() + createTestRegistryDB(t, root, []struct { + id, lang, dirPath, usesFunctions string + }{ + {"nofp_py_tools", "py", "apps/nofp", + `["imap_connect_py_infra","smtp_send_py_infra","fetch_reddit_search_py_datascience","score_demand_signal_py_datascience"]`}, + }) + insertTestFunctions(t, root, []struct { + id, name, domain, lang, filePath string + }{ + {"imap_connect_py_infra", "imap_connect", "infra", "py", "python/functions/infra/imap_connect.py"}, + {"smtp_send_py_infra", "smtp_send", "infra", "py", "python/functions/infra/smtp_send.py"}, + {"fetch_reddit_search_py_datascience", "fetch_reddit_search", "datascience", "py", "python/functions/datascience/fetch_reddit_search.py"}, + {"score_demand_signal_py_datascience", "score_demand_signal", "datascience", "py", "python/functions/datascience/score_demand_signal.py"}, + }) + + // Nested imports + a parenthesised multi-line list — both previously missed. + writePyApp(t, root, "apps/nofp", `import sys +from infra.imap_connect import imap_connect # noqa: E402 +from infra.smtp_send import smtp_send, SMTPConfigPy # noqa: E402 +from datascience import ( # noqa: E402 + fetch_reddit_search, + score_demand_signal, +) + +def run(): + return imap_connect, smtp_send, fetch_reddit_search, score_demand_signal +`) + + results, err := AuditUsesFunctions(root) + if err != nil { + t.Fatalf("AuditUsesFunctions: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + got := results[0] + if len(got.Unused) != 0 { + t.Errorf("false positive unused detected: Unused = %v, want []", got.Unused) + } + if len(got.Missing) != 0 { + t.Errorf("Missing = %v, want []", got.Missing) + } +} + +// TestAuditUsesFunctions_StarImport documents that star imports +// ("from import *") are NOT treated as using any function: a declared +// function not otherwise referenced is reported as unused. +func TestAuditUsesFunctions_StarImport(t *testing.T) { + root := t.TempDir() + createTestRegistryDB(t, root, []struct { + id, lang, dirPath, usesFunctions string + }{ + {"starapp_py_tools", "py", "apps/starapp", `["filter_list_py_core"]`}, + }) + insertTestFunctions(t, root, []struct { + id, name, domain, lang, filePath string + }{ + {"filter_list_py_core", "filter_list", "core", "py", "python/functions/core/core.py"}, + }) + + writePyApp(t, root, "apps/starapp", `from core import * + +def run(): + return None +`) + + results, err := AuditUsesFunctions(root) + if err != nil { + t.Fatalf("AuditUsesFunctions: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + got := results[0] + if !containsID(got.Unused, "filter_list_py_core") { + t.Errorf("star import should not count as usage: Unused = %v, want to contain filter_list_py_core", got.Unused) + } + if len(got.Missing) != 0 { + t.Errorf("Missing = %v, want []", got.Missing) + } +} + // TestAuditUsesFunctions_DetectsMissing verifies that a Go app that calls // RandomHexID in its source but declares empty uses_functions gets // random_hex_id_go_core reported as missing.