54a9ab70c7
Permite renderizar un SUBCONJUNTO de capítulos del informe AutomaticEDA (only_chapters=[...]) para iterar/testear un capítulo concreto sin generar el documento entero, garantizando que el capítulo pedido SIEMPRE llegue poblado. - Nuevo módulo automatic_eda/chapter_deps.py: mapa central CHAPTER_DEPS (fuente de verdad) que declara, por capítulo de CHAPTER_ORDER, qué flags de cómputo (run_models/run_series/run_llm) y qué piezas de ctx (raw_numeric, timeseries_raw, geo_points, head_rows, db_path/table) necesita para no salir degradado. Helpers puros: resolve_requirements, resolve_profile_flags, needs_render_ctx, resolve_ctx_data_keys, validate_chapter_ids. - build_document(profile, ctx, only=None): parámetro only opcional que restringe el cuerpo a esos capítulos (portada primera + glosario última siempre). Lee la clave reservada ctx['_only_chapters'] cuando only es None, para propagar la selección a través de los renderers sin modificarlos. Retrocompatible. - render_automatic_eda(..., only_chapters=None): valida los ids (error claro dict-no-throw), resuelve las dependencias activando el cómputo necesario aunque el caller no lo pidiera (un flag explícito siempre prima) y construyendo solo las piezas de ctx que los capítulos pedidos leen (salta build_eda_render_ctx entero si ninguno necesita datos crudos). only_chapters=None produce el documento completo idéntico al de hoy. - Tests: chapter_deps_test.py (resolución pura), build_document_only_test.py (filtro), render_automatic_eda_only_test.py (golden con DuckDB: outliers suelto con IsolationForest poblado por resolución; timeseries activa run_series; eficiencia geospatial sin modelos; edge cases). - .md del pipeline: documenta only_chapters + emit_md; version 1.1.0 -> 1.2.0. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
189 lines
8.9 KiB
Python
189 lines
8.9 KiB
Python
"""Chapter registry — the canonical order of an AutomaticEDA document.
|
|
|
|
``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the
|
|
order they appear in the document. Each id maps by convention to a module
|
|
``automatic_eda/chapters/<id>.py`` exposing ``build_<id>(profile, ctx) ->
|
|
Chapter | None`` and a ``CHAPTER_VERSION`` constant.
|
|
|
|
This pre-declared order is what lets many agents add chapters in parallel
|
|
without contention: an agent only creates its own ``chapters/<id>.py`` module —
|
|
it never edits this file. ``build_document`` imports each chapter lazily; a
|
|
chapter whose module does not exist yet (not implemented) is simply skipped, so
|
|
the document is always renderable with whatever chapters are present today.
|
|
|
|
``build_document`` never raises: a chapter that errors out is dropped with a
|
|
note, and a chapter that returns ``None`` (does not apply to this dataset, e.g.
|
|
time series on a dataset with no date column) is omitted.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
|
|
from . import model
|
|
|
|
# Canonical order of the chapters inside a document. An id listed here whose
# ``chapters/<id>.py`` module has not been written yet is only a placeholder:
# it is skipped at build time and shows up at this exact position as soon as
# the module exists. (The original note listed portada and overview as the
# chapters implemented at the time it was written.)
CHAPTER_ORDER = [
    # Cover: built last, placed first (see build_document).
    "portada",
    # df.head + columns/types/nulls/examples + describe.
    "overview",
    # LLM interpretation; kept next to the overview (user request).
    "analisis_llm",
    # Numeric distributions.
    "num_distr",
    # Categorical distributions.
    "cat_distr",
    # Free-text / NLP distributions (non-tabular content).
    "text_distr",
    # Data quality.
    "calidad",
    # Missing-data patterns (co-occurrence of absences; MCAR/MAR).
    "missingness",
    # Atypical values: univariate (Tukey/z) + multivariate (IsolationForest).
    "outliers",
    # Correlations / associations.
    "correlacion",
    # Key relations: declared/candidate PK + FK (inter/intra-table).
    "relaciones",
    # Cheap models (PCA/KMeans/outliers).
    "modelos",
    # Time-series analysis.
    "timeseries",
    # Geospatial.
    "geospatial",
    # Aggregations / pivots.
    "agregacion",
    # Glossary: always last; destination of the clickable terms.
    "glosario",
]
|
|
|
|
# Ids of the two chapters build_document positions by hand instead of through
# the body loop: the cover is built after the body (so it can summarize it) yet
# placed first, and the glossary is built and placed last (it renders the terms
# the other chapters registered).
_PORTADA, _GLOSARIO = "portada", "glosario"
|
|
|
|
|
|
def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    """Build a single chapter by id, or None if absent/not-applicable/error.

    Imports ``<package>.chapters.<chapter_id>`` and calls its
    ``build_<chapter_id>(profile, ctx)``; a falsy ``profile`` or ``ctx`` is
    replaced by ``{}`` before the call. The builder's result is passed through
    ``model.as_chapter``.

    Args:
        chapter_id: id used to derive both the module name and the builder
            function name.
        profile: profile dict forwarded to the builder.
        ctx: context dict forwarded to the builder.

    Returns:
        The value of ``model.as_chapter(<builder result>)``, or None when the
        chapter module does not exist, it exposes no ``build_<chapter_id>``,
        or the import, the builder or the normalization raises.

    A chapter module that simply does not exist yet is skipped silently; every
    other failure is logged as a warning (with traceback) before the chapter
    is dropped, so a broken chapter is distinguishable from a missing one.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)
    mod_name = f"{__package__}.chapters.{chapter_id}"

    try:
        mod = importlib.import_module(mod_name)
    except ModuleNotFoundError as exc:
        missing = exc.name or ""
        not_implemented = bool(missing) and (
            mod_name == missing or mod_name.startswith(missing + ".")
        )
        if not not_implemented:
            # The chapter module exists but one of ITS imports is missing:
            # that is a broken chapter, not an unimplemented one.
            log.warning(
                "chapter %r could not be imported; skipping it",
                chapter_id,
                exc_info=True,
            )
        return None
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.warning(
            "chapter %r could not be imported; skipping it",
            chapter_id,
            exc_info=True,
        )
        return None

    builder = getattr(mod, f"build_{chapter_id}", None)
    if builder is None:
        return None

    try:
        # Normalization is guarded too: a builder returning a malformed value
        # must drop the chapter, not raise out of build_chapter.
        return model.as_chapter(builder(profile or {}, ctx or {}))
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.warning(
            "chapter %r failed to build; dropping it",
            chapter_id,
            exc_info=True,
        )
        return None
|
|
|
|
|
|
def build_document(profile: dict, ctx: dict = None, only: list = None) -> list:
    """Build the ordered list of chapters for a TableProfile.

    Args:
        profile: the ``eda`` group TableProfile dict. Anything that is not a
            dict (including None) is treated as an empty profile.
        ctx: optional context dict carrying presentation metadata not present
            in the profile (dataset_name, source_origin, storage,
            generated_at, description, granularity, quality_criteria,
            head_rows, ...). It is shallow-copied, so the keys added here
            (``glossary``, ``document_summary``) do not leak to the caller.
        only: optional collection (list/tuple/set/frozenset) of chapter ids to
            render. ``None`` (default) keeps the historical behaviour: every
            implemented and applicable chapter in canonical order. A
            collection restricts the BODY to those ids (still in canonical
            order); the cover (``portada``) and glossary (``glosario``) are
            always attempted, so ``only=["x"]`` yields portada + x + glosario
            and ``only=[]`` yields the minimal document. Unknown ids, and
            entries that are not hashable, are simply skipped (strict
            validation is the caller's job). When ``only`` is None the
            selection is read from the reserved ``ctx['_only_chapters']`` key,
            which lets a pipeline forward it through renderers that call
            ``build_document(profile, ctx)``; an explicit ``only`` always
            wins. The key is removed from the local ctx copy either way, so
            chapters never see it.

    Returns:
        list of chapters in canonical order, containing only the chapters that
        were selected, built successfully and have non-empty ``blocks``.
    """
    if not isinstance(profile, dict):
        profile = {}
    # Work on a copy so the collector / summary added below stay local.
    ctx = dict(ctx) if isinstance(ctx, dict) else {}

    # Always strip the reserved key; use its value only without explicit `only`.
    carried = ctx.pop("_only_chapters", None)
    only_set = _selected_ids(carried if only is None else only)

    # One glossary collector is shared by every chapter through ctx['glossary'];
    # a caller-supplied collector is reused, anything else is replaced.
    glossary = ctx.get("glossary")
    if not isinstance(glossary, model.GlossaryCollector):
        glossary = model.GlossaryCollector()
    ctx["glossary"] = glossary

    # 1) Body: every chapter except portada and glosario, in canonical order,
    #    filtered by the selection when there is one.
    body = []
    for cid in CHAPTER_ORDER:
        if cid in (_PORTADA, _GLOSARIO):
            continue
        if only_set is not None and cid not in only_set:
            continue
        ch = build_chapter(cid, profile, ctx)
        if ch is not None and ch.blocks:
            body.append(ch)

    # 2) Summary of the body, exposed to the cover through ctx.
    ctx["document_summary"] = _summarize_document(profile, body)

    # 3) The cover is built after the body but placed first.
    portada = build_chapter(_PORTADA, profile, ctx)
    # 4) The glossary is built and placed last.
    glosario = build_chapter(_GLOSARIO, profile, ctx)

    chapters = []
    if portada is not None and portada.blocks:
        chapters.append(portada)
    chapters.extend(body)
    if glosario is not None and glosario.blocks:
        chapters.append(glosario)
    return chapters


def _selected_ids(selection):
    """Return the hashable members of *selection* as a set, or None when
    *selection* is not a list/tuple/set/frozenset (meaning: no filtering)."""
    if not isinstance(selection, (list, tuple, set, frozenset)):
        return None
    ids = set()
    for item in selection:
        try:
            ids.add(item)
        except TypeError:
            # An unhashable entry can never equal a chapter id: skip it
            # instead of letting set() raise.
            continue
    return ids
|
|
|
|
|
|
def _summarize_document(profile: dict, body: list) -> dict:
    """Aggregate a tiny findings summary of the body for the cover. Never raises.

    Args:
        profile: TableProfile dict; the scalar fields are copied verbatim with
            ``profile.get`` (missing keys become None).
        body: the body chapters already built; each one contributes its
            ``title`` attribute ("" when it has none).

    Returns:
        A dict with ``n_chapters``, ``chapter_titles``, ``n_rows``,
        ``n_cols``, ``quality_score``, ``n_numeric``, ``n_categorical``,
        ``duplicate_pct`` and ``null_cell_pct``. A column counts as numeric
        when its ``inferred_type`` is ``"numeric"``; otherwise it counts as
        categorical when its ``categorical`` entry is a dict with a truthy
        ``top``. A ``columns`` value that cannot be iterated yields zero
        counts without discarding the other fields. If anything else fails
        (e.g. ``profile`` is not a dict) only ``{"n_chapters": ...}`` is
        returned.
    """
    try:
        columns = profile.get("columns")
        try:
            # Materialize once: tolerates one-shot iterables and lets a
            # non-iterable value degrade to "no columns".
            entries = list(columns) if columns is not None else []
        except TypeError:
            entries = []

        n_num = 0
        n_cat = 0
        for col in entries:
            if not isinstance(col, dict):
                continue
            if col.get("inferred_type") == "numeric":
                n_num += 1
                continue
            categorical = col.get("categorical")
            if isinstance(categorical, dict) and categorical.get("top"):
                n_cat += 1

        return {
            "n_chapters": len(body),
            "chapter_titles": [getattr(c, "title", "") for c in body],
            "n_rows": profile.get("n_rows"),
            "n_cols": profile.get("n_cols"),
            "quality_score": profile.get("quality_score"),
            "n_numeric": n_num,
            "n_categorical": n_cat,
            "duplicate_pct": profile.get("duplicate_pct"),
            "null_cell_pct": profile.get("null_cell_pct"),
        }
    except Exception:  # noqa: BLE001 — the summary is best-effort.
        return {"n_chapters": len(body) if isinstance(body, list) else 0}
|