Files
fn_registry/python/functions/datascience/automatic_eda/chapters_registry.py
T
egutierrez 68f4ddabce feat(eda): capítulo RELACIONES para AutomaticEDA
Añade el capítulo `relaciones` al motor AutomaticEDA: analiza las
relaciones de clave de la tabla/base y se coloca tras `correlacion`,
antes de `modelos`, en CHAPTER_ORDER.

Capas que renderiza (solo las que aplican; None si no hay nada que decir):
- Claves declaradas: PK/FK/UNIQUE reales del esquema DuckDB, vía la nueva
  función `detect_declared_keys_duckdb` (lee `duckdb_constraints()`).
- Candidatos a clave primaria: los `key_candidates` del TableProfile.
- FK candidatas inter-tabla: reusa `infer_fk_containment_duckdb`
  (containment + señal de nombre) y `build_join_graph` (roles de nodos +
  diagrama Mermaid pegable). Solo si la fuente DuckDB tiene varias tablas.
- FK candidatas intra-tabla: heurística nombre + cardinalidad, vía la nueva
  función pura `suggest_intratable_fk_candidates`, marcada como sugerencia.

Engancha al glosario clicable los términos PK, FK, containment/inclusión y
cardinalidad (contrato §11.1) y usa Group (keep-together) para el grafo.

Funciones nuevas del registry (grupo `eda`):
- detect_declared_keys_duckdb (impure, datascience) + test.
- suggest_intratable_fk_candidates (pure, datascience) + test.

Tests: relaciones_test.py (golden intra + inter, edges, no-cut render) +
los tests de ambas funciones. Suite automatic_eda + render_automatic_eda
verde (89 passed). Golden end-to-end con el pipeline render_automatic_eda
verificado sobre titanic (intra) y una BD customers/orders (inter).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:15:15 +02:00

155 lines
6.8 KiB
Python

"""Chapter registry — the canonical order of an AutomaticEDA document.
``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the
order they appear in the document. Each id maps by convention to a module
``automatic_eda/chapters/<id>.py`` exposing ``build_<id>(profile, ctx) ->
Chapter | None`` and a ``CHAPTER_VERSION`` constant.
This pre-declared order is what lets many agents add chapters in parallel
without contention: an agent only creates its own ``chapters/<id>.py`` module —
it never edits this file. ``build_document`` imports each chapter lazily; a
chapter whose module does not exist yet (not implemented) is simply skipped, so
the document is always renderable with whatever chapters are present today.
``build_document`` never raises: a chapter that errors out is dropped with a
note, and a chapter that returns ``None`` (does not apply to this dataset, e.g.
time series on a dataset with no date column) is omitted.
"""
from __future__ import annotations

import importlib
import logging

from . import model
# Canonical document order: every chapter id the engine may place, listed in the
# order the chapters appear in the document. Implemented today: portada,
# overview. The rest are placeholders other agents will fill by creating
# chapters/<id>.py — they will appear in this exact position automatically once
# their module exists.
# NOTE(review): the "implemented today" list above looks stale (more chapter
# modules, e.g. `relaciones`, appear to have been added since) — confirm against
# the modules actually present under chapters/.
CHAPTER_ORDER = [
    "portada", # cover — BUILT LAST, PLACED FIRST (see build_document).
    "overview", # df.head + columns/types/nulls/examples + describe
    "analisis_llm", # LLM interpretation — sits next to overview (user request)
    "num_distr", # numeric distributions
    "cat_distr", # categorical distributions
    "calidad", # data quality
    "correlacion", # correlations / associations
    "relaciones", # key relations: declared/candidate PK + FK (inter/intra-table)
    "modelos", # cheap models (PCA/KMeans/outliers)
    "timeseries", # time-series analysis
    "geospatial", # geospatial
    "agregacion", # aggregations / pivots
    "glosario", # glossary — ALWAYS LAST; clickable term destinations.
]
# Chapters whose position is special-cased by build_document: portada is built
# last (so it can summarize the rest) but placed first; glosario is built and
# placed last (it reads the terms every other chapter registered).
# build_document skips both ids in its main loop and builds them explicitly.
_PORTADA = "portada"
_GLOSARIO = "glosario"
def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    """Build a single chapter by id, or None if absent/not-applicable/error.

    Imports ``<package>.chapters.<chapter_id>`` and calls its
    ``build_<chapter_id>(profile, ctx)`` (a falsy ``profile``/``ctx`` is
    replaced by ``{}``), then normalizes the result with ``model.as_chapter``.

    Args:
        chapter_id: chapter id; it names both the module and the builder.
        profile: TableProfile dict handed to the builder.
        ctx: context dict handed to the builder.

    Returns:
        The value ``model.as_chapter`` produces for the builder's result, or
        None when the module is missing, it exposes no builder, or anything
        raises (import, builder or normalization). Never raises.

    A chapter module that simply does not exist yet is skipped silently; every
    other failure is logged as a warning so a broken chapter can be told apart
    from an unimplemented one.
    """
    log = logging.getLogger(__name__)
    mod_name = f"{__package__}.chapters.{chapter_id}"
    try:
        mod = importlib.import_module(mod_name)
    except ModuleNotFoundError as exc:
        # "Not implemented yet" means the chapter module itself (or one of its
        # parent packages) is the thing that could not be found. Any other
        # missing name is a broken import *inside* an existing chapter module.
        missing = exc.name or ""
        not_implemented = bool(missing) and (mod_name + ".").startswith(missing + ".")
        if not not_implemented:
            log.warning(
                "AutomaticEDA chapter %r skipped: its module failed to import",
                chapter_id, exc_info=True,
            )
        return None
    except Exception:  # noqa: BLE001 — a broken chapter module never aborts the doc.
        log.warning(
            "AutomaticEDA chapter %r skipped: its module failed to import",
            chapter_id, exc_info=True,
        )
        return None

    builder = getattr(mod, f"build_{chapter_id}", None)
    if builder is None:
        log.warning(
            "AutomaticEDA chapter %r skipped: %s has no build_%s()",
            chapter_id, mod_name, chapter_id,
        )
        return None

    try:
        # Normalization stays inside the try so a malformed builder result is
        # dropped like any other chapter error instead of propagating.
        return model.as_chapter(builder(profile or {}, ctx or {}))
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.warning(
            "AutomaticEDA chapter %r dropped: builder or normalization failed",
            chapter_id, exc_info=True,
        )
        return None
def build_document(profile: dict, ctx: dict = None) -> list:
    """Assemble the ordered chapter list of a document for a TableProfile.

    Args:
        profile: the ``eda`` group TableProfile dict; anything that is not a
            dict is treated as an empty profile.
        ctx: optional context dict with presentation metadata that the profile
            does not carry (dataset_name, source_origin, storage, generated_at,
            description, granularity, quality_criteria, head_rows, ...).

    Returns:
        list[Chapter] in canonical order, holding only the chapters that were
        built and have blocks. Never raises.
    """
    profile = profile if isinstance(profile, dict) else {}
    # Work on a private copy: the collector and summary stored below must not
    # show up in the caller's dict.
    ctx = dict(ctx) if isinstance(ctx, dict) else {}

    # One glossary collector is shared by all chapters through ctx['glossary'].
    # Chapters register terms with ctx['glossary'].add(key, label, definition)
    # and tag in-text occurrences as [[term:key]]…[[/term]]; the glosario
    # chapter renders the registered terms and the renderers wire the links.
    collector = ctx.get("glossary")
    if not isinstance(collector, model.GlossaryCollector):
        collector = model.GlossaryCollector()
    ctx["glossary"] = collector

    # 1) Body: all chapters but the two special-cased ones, in canonical order.
    #    Building them is also what fills the glossary collector.
    special = (_PORTADA, _GLOSARIO)
    built = (
        build_chapter(cid, profile, ctx)
        for cid in CHAPTER_ORDER
        if cid not in special
    )
    body = [chapter for chapter in built if chapter is not None and chapter.blocks]

    # 2) Summary of the body, stored for the cover (the cover is built after
    #    the body so it can reflect what the analysis found).
    ctx["document_summary"] = _summarize_document(profile, body)

    # 3) The cover is built now but goes first in the document.
    cover = build_chapter(_PORTADA, profile, ctx)
    # 4) The glossary is built last (it reads the registered terms), goes last.
    glossary_chapter = build_chapter(_GLOSARIO, profile, ctx)

    document = [cover] if cover is not None and cover.blocks else []
    document.extend(body)
    if glossary_chapter is not None and glossary_chapter.blocks:
        document.append(glossary_chapter)
    return document
def _summarize_document(profile: dict, body: list) -> dict:
    """Condense the built body into a small findings dict for the cover.

    The result carries the dataset shape, quality figures, numeric/categorical
    column counts and the titles of the chapters in ``body``, all read straight
    from ``profile`` and ``body``. Never raises: on any error it falls back to a
    dict holding only ``n_chapters``.
    """
    try:
        numeric_count = 0
        categorical_count = 0
        for col in profile.get("columns") or []:
            if not isinstance(col, dict):
                continue
            if col.get("inferred_type") == "numeric":
                # Numeric columns are never counted as categorical.
                numeric_count += 1
                continue
            cat_info = col.get("categorical")
            if isinstance(cat_info, dict) and cat_info.get("top"):
                categorical_count += 1

        summary = {"n_chapters": len(body)}
        summary["chapter_titles"] = [getattr(ch, "title", "") for ch in body]
        for key in ("n_rows", "n_cols", "quality_score"):
            summary[key] = profile.get(key)
        summary["n_numeric"] = numeric_count
        summary["n_categorical"] = categorical_count
        for key in ("duplicate_pct", "null_cell_pct"):
            summary[key] = profile.get(key)
        return summary
    except Exception:  # noqa: BLE001 — the summary is best-effort.
        chapter_count = len(body) if isinstance(body, list) else 0
        return {"n_chapters": chapter_count}