fn_registry/python/functions/datascience/automatic_eda/chapters_registry.py

"""Chapter registry — the canonical order of an AutomaticEDA document.

``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the
order they appear in the document. Each id maps by convention to a module
``automatic_eda/chapters/<id>.py`` exposing ``build_<id>(profile, ctx) ->
Chapter | None`` and a ``CHAPTER_VERSION`` constant.

This pre-declared order is what lets many agents add chapters in parallel
without contention: an agent only creates its own ``chapters/<id>.py`` module —
it never edits this file. ``build_document`` imports each chapter lazily; a
chapter whose module does not exist yet (not implemented) is simply skipped, so
the document is always renderable with whatever chapters are present today.

``build_document`` never raises: a chapter that errors out is dropped with a
note, and a chapter that returns ``None`` (does not apply to this dataset, e.g.
time series on a dataset with no date column) is omitted.
"""

from __future__ import annotations

import importlib
import logging

from . import model

# Canonical document order. Implemented today: portada, overview. The rest are
# placeholders other agents will fill by creating chapters/<id>.py — they will
# appear in this exact position automatically once their module exists.
#
# Each id is used verbatim to derive both the module name
# (``<package>.chapters.<id>``) and the builder name (``build_<id>``), so the
# id, the module file name and the builder function must stay in sync.
CHAPTER_ORDER = [
    "portada",       # cover — BUILT LAST, PLACED FIRST (see build_document).
    "overview",      # df.head + columns/types/nulls/examples + describe
    "analisis_llm",  # LLM interpretation — sits next to overview (user request)
    "num_distr",     # numeric distributions
    "cat_distr",     # categorical distributions
    "text_distr",    # free-text / NLP distributions (non-tabular content)
    "calidad",       # data quality
    "missingness",   # missing-data patterns (co-occurrence of absences; MCAR/MAR)
    "outliers",      # atypical values: univariate (Tukey/z) + multivariate (IsolationForest)
    "correlacion",   # correlations / associations
    "relaciones",    # key relations: declared/candidate PK + FK (inter/intra-table)
    "modelos",       # cheap models (PCA/KMeans/outliers)
    "timeseries",    # time-series analysis
    "geospatial",    # geospatial
    "agregacion",    # aggregations / pivots
    "glosario",      # glossary — ALWAYS LAST; clickable term destinations.
]

# Chapters whose position is special-cased by build_document: portada is built
# last (so it can summarize the rest) but placed first; glosario is built and
# placed last (it reads the terms every other chapter registered).
# build_document skips both ids in its body loop and builds them separately,
# regardless of any ``only`` selection.
_PORTADA = "portada"
_GLOSARIO = "glosario"


def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    """Build a single chapter by id, or None if absent/not-applicable/error.

    Looks up ``automatic_eda.chapters.<chapter_id>`` and calls its
    ``build_<chapter_id>(profile, ctx)``, then normalizes the result through
    ``model.as_chapter``.

    Args:
        chapter_id: chapter id; used verbatim for the module and builder names.
        profile: TableProfile dict; a falsy value is replaced by ``{}``.
        ctx: context dict handed to the builder; a falsy value is replaced by
            ``{}``.

    Returns:
        Whatever ``model.as_chapter`` returns for the builder's result, or None
        when the module cannot be imported, it exposes no ``build_<id>``
        attribute, or the builder / the normalization raises.

    Never raises (for ``Exception`` subclasses). Failures are reported at DEBUG
    level on this module's logger so a silently missing chapter can still be
    diagnosed.
    """
    log = logging.getLogger(__name__)
    mod_name = f"{__package__}.chapters.{chapter_id}"
    try:
        mod = importlib.import_module(mod_name)
    except Exception:  # noqa: BLE001 — chapter not implemented yet → skip.
        log.debug("chapter %r skipped: cannot import %s",
                  chapter_id, mod_name, exc_info=True)
        return None
    builder = getattr(mod, f"build_{chapter_id}", None)
    if builder is None:
        return None
    try:
        # Normalization stays inside the guard: a malformed builder result must
        # drop this chapter, not abort the whole document.
        return model.as_chapter(builder(profile or {}, ctx or {}))
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.debug("chapter %r dropped: builder or normalization failed",
                  chapter_id, exc_info=True)
        return None


def build_document(profile: dict, ctx: dict = None, only: list = None) -> list:
    """Build the ordered list of chapters for a TableProfile.

    Args:
        profile: the ``eda`` group TableProfile dict (may be None/empty).
        ctx: optional context dict carrying presentation metadata not present in
            the profile (dataset_name, source_origin, storage, generated_at,
            description, granularity, quality_criteria, head_rows, ...). It is
            shallow-copied, so keys added here do not leak to the caller.
        only: optional list of chapter ids to render. ``None`` (default) keeps
            the historical behaviour — every implemented & applicable chapter in
            canonical order. A list restricts the BODY to just those ids (in
            canonical order), but the cover (``portada``) and glossary
            (``glosario``) are ALWAYS included so the document stays valid and
            the clickable terms keep a destination — so passing ``only=["x"]``
            yields portada + x + glosario. Unknown ids are simply skipped (the
            caller is responsible for strict validation); entries that are not
            strings (including unhashable ones such as nested lists) are
            ignored rather than raising. ``only=[]`` yields the minimal
            document (portada + glosario only). This argument is additive and
            backward-compatible: the signature is unchanged for existing
            callers (default ``None``).

    Returns:
        list[Chapter] in canonical order, containing only the chapters that are
        implemented, applicable and selected. Never raises.
    """
    if not isinstance(profile, dict):
        profile = {}
    # Copy ctx so the shared collector / summary we add do not leak to the caller.
    ctx = dict(ctx) if isinstance(ctx, dict) else {}

    # only=None -> all body chapters (historical). only=list -> restrict body to
    # that selection (portada/glosario are added unconditionally below). The
    # renderers call build_document(profile, meta['ctx']) without an `only`
    # argument, so the pipeline forwards the selection through a reserved ctx key
    # (``_only_chapters``); an explicit `only` argument always wins. The key is
    # always popped from the local ctx copy so it never reaches the chapters.
    carried = ctx.pop("_only_chapters", None)
    if only is None and isinstance(carried, (list, tuple, set)):
        only = list(carried)

    # A set makes the membership test cheap; the iteration order stays
    # CHAPTER_ORDER. only=[] is a valid (empty) selection -> minimal document.
    # Chapter ids are strings, so any other entry can never match; dropping it
    # here also keeps an unhashable entry from raising TypeError.
    only_set = None
    if isinstance(only, (list, tuple, set)):
        only_set = {cid for cid in only if isinstance(cid, str)}

    # A single glossary collector is shared by every chapter via ctx['glossary'].
    # Chapters call ctx['glossary'].add(key, label, definition) and mark in-text
    # appearances with [[term:key]]…[[/term]]; the glosario chapter renders the
    # registered terms and the renderers wire the clickable links.
    glossary = ctx.get("glossary")
    if not isinstance(glossary, model.GlossaryCollector):
        glossary = model.GlossaryCollector()
        ctx["glossary"] = glossary

    # 1) Body: every chapter except portada (built last) and glosario (placed
    # last), in canonical order. This also fills the glossary collector.
    body = []
    for cid in CHAPTER_ORDER:
        if cid in (_PORTADA, _GLOSARIO):
            continue
        # When a selection is given, skip body chapters outside it. portada and
        # glosario are never filtered (handled out of this loop).
        if only_set is not None and cid not in only_set:
            continue
        ch = build_chapter(cid, profile, ctx)
        # Chapters that produced no blocks are omitted from the document.
        if ch is not None and ch.blocks:
            body.append(ch)

    # 2) Aggregated summary of the rest, for the cover (user decision: the cover
    # is BUILT after the body so it can reflect what the analysis found).
    ctx["document_summary"] = _summarize_document(profile, body)

    # 3) Build the cover last, place it FIRST.
    portada = build_chapter(_PORTADA, profile, ctx)
    # 4) Build the glossary last (reads the terms the body registered), place LAST.
    glosario = build_chapter(_GLOSARIO, profile, ctx)

    chapters = []
    if portada is not None and portada.blocks:
        chapters.append(portada)
    chapters.extend(body)
    if glosario is not None and glosario.blocks:
        chapters.append(glosario)
    return chapters


def _summarize_document(profile: dict, body: list) -> dict:
    """Condense the built body into a small findings dict for the cover.

    The result carries the dataset shape and quality figures read straight from
    ``profile``, the numeric / categorical column counts derived from
    ``profile["columns"]``, and the number and titles of the chapters in
    ``body``. Best-effort: on any error only ``n_chapters`` is returned, so this
    function never raises.
    """

    def _is_numeric(col) -> bool:
        # A column counts as numeric purely by its inferred type.
        return isinstance(col, dict) and col.get("inferred_type") == "numeric"

    def _is_categorical(col) -> bool:
        # Categorical = has a non-empty "top" listing and is not numeric.
        if not isinstance(col, dict):
            return False
        cat_info = col.get("categorical")
        if not isinstance(cat_info, dict) or not cat_info.get("top"):
            return False
        return col.get("inferred_type") != "numeric"

    try:
        columns = profile.get("columns") or []
        numeric_count = len([col for col in columns if _is_numeric(col)])
        categorical_count = len(
            [col for col in columns if _is_categorical(col)])

        summary = {
            "n_chapters": len(body),
            "chapter_titles": [getattr(ch, "title", "") for ch in body],
        }
        for key in ("n_rows", "n_cols", "quality_score"):
            summary[key] = profile.get(key)
        summary["n_numeric"] = numeric_count
        summary["n_categorical"] = categorical_count
        for key in ("duplicate_pct", "null_cell_pct"):
            summary[key] = profile.get(key)
        return summary
    except Exception:  # noqa: BLE001 — the summary is best-effort.
        fallback_count = len(body) if isinstance(body, list) else 0
        return {"n_chapters": fallback_count}