"""Chapter registry — the canonical order of an AutomaticEDA document. ``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the order they appear in the document. Each id maps by convention to a module ``automatic_eda/chapters/.py`` exposing ``build_(profile, ctx) -> Chapter | None`` and a ``CHAPTER_VERSION`` constant. This pre-declared order is what lets many agents add chapters in parallel without contention: an agent only creates its own ``chapters/.py`` module — it never edits this file. ``build_document`` imports each chapter lazily; a chapter whose module does not exist yet (not implemented) is simply skipped, so the document is always renderable with whatever chapters are present today. ``build_document`` never raises: a chapter that errors out is dropped with a note, and a chapter that returns ``None`` (does not apply to this dataset, e.g. time series on a dataset with no date column) is omitted. """ from __future__ import annotations import importlib from . import model # Canonical document order. Implemented today: portada, overview. The rest are # placeholders other agents will fill by creating chapters/.py — they will # appear in this exact position automatically once their module exists. CHAPTER_ORDER = [ "portada", # cover — BUILT LAST, PLACED FIRST (see build_document). "overview", # df.head + columns/types/nulls/examples + describe "analisis_llm", # LLM interpretation — sits next to overview (user request) "num_distr", # numeric distributions "cat_distr", # categorical distributions "calidad", # data quality "correlacion", # correlations / associations "relaciones", # key relations: declared/candidate PK + FK (inter/intra-table) "modelos", # cheap models (PCA/KMeans/outliers) "timeseries", # time-series analysis "geospatial", # geospatial "agregacion", # aggregations / pivots "glosario", # glossary — ALWAYS LAST; clickable term destinations. ] # Chapters whose position is special-cased by build_document: portada is built # last (so it can summarize the rest) but placed first; glosario is built and # placed last (it reads the terms every other chapter registered). _PORTADA = "portada" _GLOSARIO = "glosario" def build_chapter(chapter_id: str, profile: dict, ctx: dict): """Build a single chapter by id, or None if absent/not-applicable/error. Looks up ``automatic_eda.chapters.`` and calls its ``build_(profile, ctx)``. Returns a normalized Chapter, or None when the module is missing, the builder returns None, or anything raises. """ mod_name = f"{__package__}.chapters.{chapter_id}" try: mod = importlib.import_module(mod_name) except Exception: # noqa: BLE001 — chapter not implemented yet → skip. return None builder = getattr(mod, f"build_{chapter_id}", None) if builder is None: return None try: result = builder(profile or {}, ctx or {}) except Exception: # noqa: BLE001 — a broken chapter never aborts the doc. return None return model.as_chapter(result) def build_document(profile: dict, ctx: dict = None) -> list: """Build the full ordered list of chapters for a TableProfile. Args: profile: the ``eda`` group TableProfile dict (may be None/empty). ctx: optional context dict carrying presentation metadata not present in the profile (dataset_name, source_origin, storage, generated_at, description, granularity, quality_criteria, head_rows, ...). Returns: list[Chapter] in canonical order, containing only the chapters that are implemented and applicable. Never raises. """ if not isinstance(profile, dict): profile = {} # Copy ctx so the shared collector / summary we add do not leak to the caller. ctx = dict(ctx) if isinstance(ctx, dict) else {} # A single glossary collector is shared by every chapter via ctx['glossary']. # Chapters call ctx['glossary'].add(key, label, definition) and mark in-text # appearances with [[term:key]]…[[/term]]; the glosario chapter renders the # registered terms and the renderers wire the clickable links. glossary = ctx.get("glossary") if not isinstance(glossary, model.GlossaryCollector): glossary = model.GlossaryCollector() ctx["glossary"] = glossary # 1) Body: every chapter except portada (built last) and glosario (placed # last), in canonical order. This also fills the glossary collector. body = [] for cid in CHAPTER_ORDER: if cid in (_PORTADA, _GLOSARIO): continue ch = build_chapter(cid, profile, ctx) if ch is not None and ch.blocks: body.append(ch) # 2) Aggregated summary of the rest, for the cover (user decision: the cover # is BUILT after the body so it can reflect what the analysis found). ctx["document_summary"] = _summarize_document(profile, body) # 3) Build the cover last, place it FIRST. portada = build_chapter(_PORTADA, profile, ctx) # 4) Build the glossary last (reads the terms the body registered), place LAST. glosario = build_chapter(_GLOSARIO, profile, ctx) chapters = [] if portada is not None and portada.blocks: chapters.append(portada) chapters.extend(body) if glosario is not None and glosario.blocks: chapters.append(glosario) return chapters def _summarize_document(profile: dict, body: list) -> dict: """Aggregate a tiny findings summary of the body for the cover. Never raises. Returns a dict with dataset shape, quality, column-type counts and the list of chapters actually included — enough for the cover to show a mini-summary of the analysis without re-deriving anything.""" try: cols = profile.get("columns") or [] n_num = sum(1 for c in cols if isinstance(c, dict) and c.get("inferred_type") == "numeric") n_cat = sum(1 for c in cols if isinstance(c, dict) and isinstance(c.get("categorical"), dict) and c.get("categorical", {}).get("top") and c.get("inferred_type") != "numeric") return { "n_chapters": len(body), "chapter_titles": [getattr(c, "title", "") for c in body], "n_rows": profile.get("n_rows"), "n_cols": profile.get("n_cols"), "quality_score": profile.get("quality_score"), "n_numeric": n_num, "n_categorical": n_cat, "duplicate_pct": profile.get("duplicate_pct"), "null_cell_pct": profile.get("null_cell_pct"), } except Exception: # noqa: BLE001 — the summary is best-effort. return {"n_chapters": len(body) if isinstance(body, list) else 0}