merge: capitulo AutomaticEDA agregacion (verificado met) + funciones delegadas eda

2026-06-30 15:45:37 +02:00
parent 05fe76bce0 fd59530751
commit 5eaf3f662e
15 changed files with 2994 additions and 0 deletions
@@ -0,0 +1,592 @@
+"""Aggregation chapter (AGREGACION) — group analysis / OLAP of the EDA.
+
+This chapter is the group-by / pivot ("OLAP") section of an AutomaticEDA report
+and is meant to be present **whenever the dataset has at least one low-cardinality
+categorical column to group by**. For the most interesting categoricals (chosen
+by their cardinality/relevance, optionally with an LLM) it renders, as blocks the
+core paginator never cuts:
+
+1. **Per-group statistics** (split-apply-combine) — for each interesting
+   categorical key, the count of rows per group and, for each numeric measure,
+   its mean/median/std/min/max. One compact summary table (mean of every measure
+   per group) plus a per-measure detail table.
+2. **Bar charts** — a vertical bar chart of a measure's mean per group, bars from
+   zero (Tufte Lie-Factor = 1).
+3. **Pivot tables** — categorical A x categorical B -> aggregate of a measure,
+   limited to the top rows/cols so it fits a mobile page/slide, with a grouped
+   bar chart of the same pivot.
+
+The raw data needed to aggregate is **not** in the TableProfile, so — exactly
+like ``modelos`` reads its cluster projection from ``ctx`` — this chapter gets
+the aggregation results in one of two ways and degrades honestly when neither is
+available:
+
+ctx keys this chapter consumes (all optional):
+    aggregations : dict — pre-computed results, used directly (offline / tests /
+        forward-compatible with a calculation phase). Shape::
+
+            {"groupby": [{"group_by": str, "measures": [str], "why": str,
+                          "result": <groupby_stats_duckdb-shaped dict>}],
+             "pivots":  [{"index": str, "columns": str, "value": str, "agg": str,
+                          "why": str, "result": <pivot_table_duckdb-shaped dict>}]}
+
+    db_path, table : str — when ``aggregations`` is absent, the chapter selects
+        the interesting keys (``select_groupby_keys``), optionally asks an LLM
+        which to show (``suggest_aggregations_llm`` when ``run_agg_llm`` is True)
+        and computes the group-by/pivot results live via the push-down registry
+        functions ``groupby_stats_duckdb`` / ``pivot_table_duckdb``.
+    run_agg_llm : bool — when True (and ``db_path``/``table`` present), let the
+        LLM pick the interesting aggregations; otherwise the deterministic
+        quantitative selection is used.
+    agg_llm_model : str — model id for the optional LLM selection.
+    agg_max_keys, agg_max_card, agg_max_measures, agg_top_n : int — limits.
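+    agg_insights : list — optional pre-computed micro-analysis entries
+        (``[{"title": str, "text": str}]``) rendered as an interpretation section.
+    agg_candidates : dict — optional pre-selected candidates
+        (``{"group_keys": [...], "measures": [...], "pivots": [...]}``); used as-is,
+        skipping the key selection, when its ``group_keys`` is not ``None``.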
+
+Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+Reads everything defensively (``.get``) and is meant never to raise: anything
+missing degrades to a note instead of aborting the chapter. The chapter returns
+``None`` when the profile is not a dict, when a supplied ``aggregations`` yields
+no renderable section, or when no categorical column to group by is found.
+"""
+
+from __future__ import annotations
+
+from .. import model
+
+# Pure/impure registry functions (group ``eda``) this chapter composes. Imported
+# defensively so the chapter still builds (degrading the affected part to a note)
+# if a function is somehow unavailable / not indexed yet.
+try:
+    from datascience.select_groupby_keys import select_groupby_keys
+except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
+    select_groupby_keys = None  # type: ignore[assignment]
+try:
+    from datascience.groupby_stats_duckdb import groupby_stats_duckdb
+except Exception:  # noqa: BLE001
+    groupby_stats_duckdb = None  # type: ignore[assignment]
+try:
+    from datascience.pivot_table_duckdb import pivot_table_duckdb
+except Exception:  # noqa: BLE001
+    pivot_table_duckdb = None  # type: ignore[assignment]
+try:
+    from datascience.suggest_aggregations_llm import suggest_aggregations_llm
+except Exception:  # noqa: BLE001
+    suggest_aggregations_llm = None  # type: ignore[assignment]
+
+# Identity of this chapter, passed to ``model.Chapter`` by the entry point.
+CHAPTER_VERSION = "1.0.0"
+CHAPTER_ID = "agregacion"
+# Also used as the text of the level-1 heading that opens the chapter.
+CHAPTER_TITLE = "Agregación por grupos"
+
+# Tableau-10 palette — stable colours for the pivot's grouped-bar series.
+# Series are coloured by position, wrapping around when there are more than 10.
+_SERIES_COLORS = [
+    "#4e79a7", "#f28e2b", "#e15759", "#76b7b2", "#59a14f",
+    "#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
+]
+
+# Defaults for the live selection/aggregation (overridable via ctx).
+# ctx['agg_max_keys']: how many categorical keys (and LLM aggregations) to keep.
+_DEF_MAX_KEYS = 3
+# ctx['agg_max_card']: highest distinct count a column may have to be a group key.
+_DEF_MAX_CARD = 20
+# ctx['agg_max_measures']: how many numeric measures to summarise per group.
+_DEF_MAX_MEASURES = 4
+# ctx['agg_top_n']: forwarded as ``top_n`` to the live group-by computation.
+_DEF_TOP_N = 12
+
+
+# --------------------------------------------------------------------------- #
+# Formatting helpers (mirror the other chapters' defensive style).
+# --------------------------------------------------------------------------- #
+def _fmt_num(value, decimals: int = 3) -> str:
+    if value is None:
+        return "—"
+    if isinstance(value, bool):
+        return "sí" if value else "no"
+    if isinstance(value, int):
+        return f"{value:,}".replace(",", ".")
+    if isinstance(value, float):
+        if value != value:  # NaN
+            return "NaN"
+        if value in (float("inf"), float("-inf")):
+            return str(value)
+        text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
+        return text if text else "0"
+    return model._safe_str(value)
+
+
+def _is_dict(v) -> bool:
+    return isinstance(v, dict)
+
+
+def _measure_mean(group: dict, measure: str):
+    """Pull the mean of one measure out of a groupby-result group entry."""
+    stats = group.get("stats") if _is_dict(group.get("stats")) else {}
+    ms = stats.get(measure) if _is_dict(stats.get(measure)) else {}
+    return ms.get("mean")
+
+
+# --------------------------------------------------------------------------- #
+# Plan + data resolution. Either a pre-computed ctx['aggregations'] is used
+# verbatim, or the plan is selected and the results are computed live.
+# --------------------------------------------------------------------------- #
+def _resolve_candidates(profile: dict, ctx: dict) -> dict:
+    """Return {group_keys, measures, pivots, note} of interesting columns."""
+    pre = ctx.get("agg_candidates")
+    if _is_dict(pre) and pre.get("group_keys") is not None:
+        return pre
+    if select_groupby_keys is not None:
+        try:
+            out = select_groupby_keys(
+                profile,
+                max_keys=int(ctx.get("agg_max_keys", _DEF_MAX_KEYS)),
+                max_card=int(ctx.get("agg_max_card", _DEF_MAX_CARD)),
+                max_measures=int(ctx.get("agg_max_measures", _DEF_MAX_MEASURES)),
+            )
+            if _is_dict(out):
+                return out
+        except Exception:  # noqa: BLE001 — fall through to the inline fallback.
+            pass
+    return _inline_candidates(profile, ctx)
+
+
+def _inline_candidates(profile: dict, ctx: dict) -> dict:
+    """Minimal defensive selection when select_groupby_keys is unavailable."""
+    max_card = int(ctx.get("agg_max_card", _DEF_MAX_CARD))
+    max_keys = int(ctx.get("agg_max_keys", _DEF_MAX_KEYS))
+    max_measures = int(ctx.get("agg_max_measures", _DEF_MAX_MEASURES))
+    keys = profile.get("key_candidates") or []
+    group_keys, measures = [], []
+    for col in profile.get("columns") or []:
+        if not _is_dict(col):
+            continue
+        name = col.get("name")
+        it = col.get("inferred_type")
+        flags = col.get("flags") or []
+        dc = col.get("distinct_count")
+        if it in ("categorical", "boolean") and name not in keys:
+            if ("possible_id" not in flags and "high_cardinality" not in flags
+                    and "constant" not in flags
+                    and isinstance(dc, int) and 2 <= dc <= max_card):
+                group_keys.append({"col": name, "cardinality": dc, "score": 0.0})
+        elif it == "numeric":
+            num = col.get("numeric") or {}
+            if num.get("std") not in (None, 0) and not (
+                    "possible_id" in flags and (col.get("unique_pct") or 0) >= 0.99):
+                measures.append(name)
+    group_keys = group_keys[:max_keys]
+    measures = measures[:max_measures]
+    pivots = []
+    if len(group_keys) >= 2:
+        pivots.append({"index": group_keys[0]["col"],
+                       "columns": group_keys[1]["col"],
+                       "value": measures[0] if measures else None})
+    return {"group_keys": group_keys, "measures": measures, "pivots": pivots,
+            "note": "selección cuantitativa básica"}
+
+
+def _resolve_plan(profile: dict, ctx: dict, candidates: dict) -> dict:
+    """Return {aggregations:[{group_by,measures,why}], pivots:[...], source}."""
+    group_keys = candidates.get("group_keys") or []
+    measures = candidates.get("measures") or []
+
+    if ctx.get("run_agg_llm") and suggest_aggregations_llm is not None:
+        try:
+            plan = suggest_aggregations_llm(
+                profile, candidates,
+                max_aggs=int(ctx.get("agg_max_keys", _DEF_MAX_KEYS)),
+                model=ctx.get("agg_llm_model", "claude-haiku-4-5-20251001"))
+            if _is_dict(plan) and plan.get("aggregations"):
+                return {"aggregations": plan.get("aggregations") or [],
+                        "pivots": plan.get("pivots") or [],
+                        "source": plan.get("source", "llm")}
+        except Exception:  # noqa: BLE001 — fall back to the quantitative plan.
+            pass
+
+    aggregations = [{
+        "group_by": gk.get("col"),
+        "measures": measures,
+        "why": f"categórica de {_fmt_num(gk.get('cardinality'))} niveles",
+    } for gk in group_keys if _is_dict(gk) and gk.get("col")]
+    pivots = []
+    for pv in candidates.get("pivots") or []:
+        if _is_dict(pv) and pv.get("index") and pv.get("columns"):
+            pivots.append({"index": pv.get("index"), "columns": pv.get("columns"),
+                           "value": pv.get("value") or (measures[0] if measures else None),
+                           "agg": "mean", "why": "cruce de dos categóricas"})
+    return {"aggregations": aggregations, "pivots": pivots, "source": "quantitative"}
+
+
+def _live_groupby(ctx: dict, group_by: str, measures: list, top_n: int):
+    """Compute one group-by result live via the push-down registry function."""
+    db_path = ctx.get("db_path")
+    table = ctx.get("table")
+    if not db_path or not table or groupby_stats_duckdb is None:
+        return None
+    try:
+        out = groupby_stats_duckdb(db_path, table, group_by, list(measures or []),
+                                   top_n=top_n)
+        if _is_dict(out) and out.get("status") == "ok":
+            return out
+    except Exception:  # noqa: BLE001
+        return None
+    return None
+
+
+def _live_pivot(ctx: dict, index: str, columns: str, value, agg: str):
+    """Compute one pivot live via the push-down registry function."""
+    db_path = ctx.get("db_path")
+    table = ctx.get("table")
+    if not db_path or not table or pivot_table_duckdb is None or not value:
+        return None
+    try:
+        out = pivot_table_duckdb(db_path, table, index, columns, value,
+                                 agg=agg or "mean")
+        if _is_dict(out) and out.get("status") == "ok":
+            return out
+    except Exception:  # noqa: BLE001
+        return None
+    return None
+
+
+# --------------------------------------------------------------------------- #
+# Figure builders (lazy: matplotlib only imported when the renderer draws them).
+# --------------------------------------------------------------------------- #
+def _make_group_bars(group_by: str, measure: str, groups: list):
+    """Vertical bars: mean of ``measure`` per group, bars from zero."""
+    labels, values = [], []
+    for g in groups:
+        if not _is_dict(g):
+            continue
+        mean = _measure_mean(g, measure)
+        if mean is None:
+            continue
+        labels.append(model._safe_str(g.get("key")))
+        values.append(float(mean))
+    if not labels:
+        return None
+
+    def _draw():
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+
+        fig, ax = plt.subplots(figsize=(6.6, 3.6))
+        xs = list(range(len(labels)))
+        ax.bar(xs, values, color="#4e79a7", alpha=0.9, edgecolor="#2f4d6e",
+               linewidth=0.4)
+        ax.set_xticks(xs)
+        short = [(s[:18] + "…") if len(s) > 19 else s for s in labels]
+        rot = 30 if max((len(s) for s in short), default=0) > 6 else 0
+        ax.set_xticklabels(short, rotation=rot, ha="right" if rot else "center",
+                           fontsize=7)
+        ax.set_ylabel(f"media de {measure}", fontsize=8)
+        ax.set_xlabel(group_by, fontsize=8)
+        ax.set_title(f"Media de «{measure}» por «{group_by}»", fontsize=10)
+        ax.grid(axis="y", color="#dddddd", linewidth=0.6)
+        for spine in ("top", "right"):
+            ax.spines[spine].set_visible(False)
+        # Value labels above each bar.
+        vmax = max(values) if values else 0
+        for x, v in zip(xs, values):
+            ax.text(x, v + (abs(vmax) * 0.01 if vmax else 0.01),
+                    _fmt_num(v, 2), ha="center", va="bottom", fontsize=6.5)
+        fig.tight_layout()
+        return fig
+
+    return _draw
+
+
+def _make_pivot_bars(pivot: dict):
+    """Build a lazy drawer for the grouped-bar chart of a pivot result.
+
+    The x axis carries one slot per ``row_labels`` entry and every
+    ``col_labels`` entry becomes one coloured series inside each slot.
+
+    Args:
+        pivot: dict read with ``.get`` for ``row_labels``, ``col_labels`` and
+            ``matrix`` (indexed as ``matrix[row][col]``), plus ``index``,
+            ``columns``, ``value`` and ``agg``, which only feed the axis
+            labels, the title and the legend title.
+
+    Returns:
+        A zero-argument callable that imports matplotlib when called and
+        returns the figure, or ``None`` when the labels or the matrix are empty.
+    """
+    row_labels = pivot.get("row_labels") or []
+    col_labels = pivot.get("col_labels") or []
+    matrix = pivot.get("matrix") or []
+    if not row_labels or not col_labels or not matrix:
+        return None
+
+    def _draw():
+        # matplotlib is imported here, not at module level, so building the
+        # chapter does not require it; the Agg backend is selected before
+        # pyplot is imported.
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+
+        n_rows = len(row_labels)
+        n_cols = len(col_labels)
+        fig, ax = plt.subplots(figsize=(6.8, 3.8))
+        # Each row slot is 0.8 wide, centred on its integer tick and split
+        # evenly among the column series.
+        total_w = 0.8
+        bar_w = total_w / max(n_cols, 1)
+        base = list(range(n_rows))
+        for j, clabel in enumerate(col_labels):
+            offs = [b - total_w / 2 + bar_w * (j + 0.5) for b in base]
+            vals = []
+            for i in range(n_rows):
+                # Cells outside a short/ragged matrix and non-numeric cells are
+                # drawn as zero-height bars.
+                cell = matrix[i][j] if (i < len(matrix) and j < len(matrix[i])) else None
+                vals.append(float(cell) if isinstance(cell, (int, float)) else 0.0)
+            # Colours are assigned by series position and wrap around.
+            color = _SERIES_COLORS[j % len(_SERIES_COLORS)]
+            ax.bar(offs, vals, width=bar_w, color=color, alpha=0.9,
+                   label=model._safe_str(clabel))
+        ax.set_xticks(base)
+        # Long row labels are cut to 16 characters plus an ellipsis.
+        short = [(s[:16] + "…") if len(s) > 17 else s
+                 for s in (model._safe_str(r) for r in row_labels)]
+        # Tilt the tick labels only when at least one is longer than 6 chars.
+        rot = 30 if max((len(s) for s in short), default=0) > 6 else 0
+        ax.set_xticklabels(short, rotation=rot, ha="right" if rot else "center",
+                           fontsize=7)
+        ax.set_xlabel(model._safe_str(pivot.get("index")), fontsize=8)
+        ax.set_ylabel(f"{pivot.get('agg','mean')} de {pivot.get('value')}",
+                      fontsize=8)
+        ax.set_title(f"{pivot.get('index')} × {pivot.get('columns')}", fontsize=10)
+        ax.grid(axis="y", color="#dddddd", linewidth=0.6)
+        ax.legend(title=model._safe_str(pivot.get("columns")), fontsize=6.5,
+                  title_fontsize=7, frameon=True, framealpha=0.9, loc="best")
+        for spine in ("top", "right"):
+            ax.spines[spine].set_visible(False)
+        fig.tight_layout()
+        return fig
+
+    return _draw
+
+
+def _group_bars_maker(group_by: str, measure: str, groups: list):
+    """Bind per-aggregation args so the lazy closure is loop-safe."""
+    def _make():
+        return _make_group_bars(group_by, measure, groups)()
+    return _make
+
+
+def _pivot_bars_maker(pivot: dict):
+    def _make():
+        return _make_pivot_bars(pivot)()
+    return _make
+
+
+# --------------------------------------------------------------------------- #
+# Section builders. Each returns a list of blocks (possibly empty).
+# --------------------------------------------------------------------------- #
+def _groupby_section(group_by: str, measures: list, result: dict, why: str) -> list:
+    """Build the blocks for one group-by aggregation, or [] if unusable."""
+    if not _is_dict(result) or not result.get("groups"):
+        return []
+    groups = [g for g in result.get("groups") or [] if _is_dict(g)]
+    if not groups:
+        return []
+    eff_measures = result.get("measures") or measures or []
+
+    blocks = [model.Heading(text=f"Agrupado por «{group_by}»", level=2)]
+    intro = f"**{why}.** " if why else ""
+    intro += (f"{_fmt_num(result.get('n_groups') or len(groups))} grupos"
+              f"{' (top por tamaño)' if result.get('truncated') else ''}.")
+    blocks.append(model.Markdown(text=intro))
+
+    # Summary table: one row per group, count + mean of every measure.
+    header = ["Grupo", "n"] + [f"{m} (media)" for m in eff_measures]
+    rows = []
+    for g in groups:
+        row = [model._safe_str(g.get("key")), _fmt_num(g.get("n"))]
+        for m in eff_measures:
+            row.append(_fmt_num(_measure_mean(g, m), 2))
+        rows.append(row)
+    blocks.append(model.DataTable(
+        header=header, rows=rows, title=f"Resumen por «{group_by}»",
+        note="Conteo de filas y media de cada medida por grupo."))
+
+    if not eff_measures:
+        return blocks
+
+    # Primary measure: a bar chart + a detail table (mean/median/std/min/max).
+    primary = eff_measures[0]
+    bars = _make_group_bars(group_by, primary, groups)
+    if bars is not None:
+        blocks.append(model.Figure(
+            make=_group_bars_maker(group_by, primary, groups),
+            caption=f"Media de «{primary}» por «{group_by}» (barras desde cero)."))
+
+    det_header = ["Grupo", "n", "media", "mediana", "σ", "mín", "máx"]
+    det_rows = []
+    for g in groups:
+        stats = g.get("stats") if _is_dict(g.get("stats")) else {}
+        ms = stats.get(primary) if _is_dict(stats.get(primary)) else {}
+        det_rows.append([
+            model._safe_str(g.get("key")), _fmt_num(g.get("n")),
+            _fmt_num(ms.get("mean"), 2), _fmt_num(ms.get("median"), 2),
+            _fmt_num(ms.get("std"), 2), _fmt_num(ms.get("min"), 2),
+            _fmt_num(ms.get("max"), 2),
+        ])
+    blocks.append(model.DataTable(
+        header=det_header, rows=det_rows,
+        title=f"Detalle de «{primary}» por «{group_by}»"))
+    return blocks
+
+
+def _pivot_section(pivot_spec: dict, result: dict) -> list:
+    """Build the blocks for one pivot table, or [] if unusable."""
+    if not _is_dict(result) or not result.get("row_labels"):
+        return []
+    row_labels = result.get("row_labels") or []
+    col_labels = result.get("col_labels") or []
+    matrix = result.get("matrix") or []
+    if not row_labels or not col_labels or not matrix:
+        return []
+
+    index = result.get("index") or pivot_spec.get("index")
+    columns = result.get("columns") or pivot_spec.get("columns")
+    value = result.get("value") or pivot_spec.get("value")
+    agg = result.get("agg") or pivot_spec.get("agg") or "mean"
+    why = pivot_spec.get("why") or ""
+
+    blocks = [model.Heading(text=f"Pivot: «{index}» × «{columns}»", level=2)]
+    intro = f"**{why}.** " if why else ""
+    intro += (f"{agg} de «{value}» cruzando «{index}» (filas) y «{columns}» "
+              f"(columnas).")
+    if result.get("truncated_rows") or result.get("truncated_cols"):
+        intro += " Limitado a las filas/columnas más frecuentes."
+    blocks.append(model.Markdown(text=intro))
+
+    header = [model._safe_str(index)] + [model._safe_str(c) for c in col_labels]
+    rows = []
+    for i, rlabel in enumerate(row_labels):
+        row = [model._safe_str(rlabel)]
+        cells = matrix[i] if i < len(matrix) else []
+        for j in range(len(col_labels)):
+            cell = cells[j] if j < len(cells) else None
+            row.append(_fmt_num(cell, 2))
+        rows.append(row)
+    blocks.append(model.DataTable(
+        header=header, rows=rows,
+        title=f"{agg} de «{value}»",
+        note=f"Cada celda es {agg} de «{value}» para esa combinación."))
+
+    fig_pivot = {"row_labels": row_labels, "col_labels": col_labels,
+                 "matrix": matrix, "index": index, "columns": columns,
+                 "value": value, "agg": agg}
+    if _make_pivot_bars(fig_pivot) is not None:
+        blocks.append(model.Figure(
+            make=_pivot_bars_maker(fig_pivot),
+            caption=f"{agg} de «{value}» por «{index}» y «{columns}» "
+                    f"(barras agrupadas)."))
+    return blocks
+
+
+def _insights_section(ctx: dict) -> list:
+    """Optional pre-computed micro-analysis of the aggregations (SHOULD-11.4)."""
+    entries = ctx.get("agg_insights")
+    if not isinstance(entries, list) or not entries:
+        return []
+    blocks = [model.Heading(text="Interpretación de los grupos", level=2)]
+    for e in entries:
+        if not _is_dict(e):
+            continue
+        title = model._safe_str(e.get("title"))
+        text = model._safe_str(e.get("text"))
+        line = (f"**{title}.** " if title else "") + text
+        if line.strip():
+            blocks.append(model.Markdown(text=line))
+    return blocks if len(blocks) > 1 else []
+
+
+# --------------------------------------------------------------------------- #
+# Pre-computed path: ctx['aggregations'] already carries the results.
+# --------------------------------------------------------------------------- #
+def _sections_from_precomputed(agg: dict) -> list:
+    sections = []
+    for entry in agg.get("groupby") or []:
+        if not _is_dict(entry):
+            continue
+        sections += _groupby_section(
+            entry.get("group_by"), entry.get("measures") or [],
+            entry.get("result") or {}, entry.get("why") or "")
+    for entry in agg.get("pivots") or []:
+        if not _is_dict(entry):
+            continue
+        sections += _pivot_section(entry, entry.get("result") or {})
+    return sections
+
+
+# --------------------------------------------------------------------------- #
+# Live path: select keys, pick a plan, compute results via push-down functions.
+# --------------------------------------------------------------------------- #
+def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
+    top_n = int(ctx.get("agg_top_n", _DEF_TOP_N))
+    plan = _resolve_plan(profile, ctx, candidates)
+    sections = []
+    for agg in plan.get("aggregations") or []:
+        if not _is_dict(agg) or not agg.get("group_by"):
+            continue
+        result = _live_groupby(ctx, agg.get("group_by"),
+                               agg.get("measures") or [], top_n)
+        if result is not None:
+            sections += _groupby_section(agg.get("group_by"),
+                                         agg.get("measures") or [], result,
+                                         agg.get("why") or "")
+    for pv in plan.get("pivots") or []:
+        if not _is_dict(pv) or not pv.get("index") or not pv.get("columns"):
+            continue
+        result = _live_pivot(ctx, pv.get("index"), pv.get("columns"),
+                             pv.get("value"), pv.get("agg") or "mean")
+        if result is not None:
+            sections += _pivot_section(pv, result)
+    return sections
+
+
+# --------------------------------------------------------------------------- #
+# Entry point.
+# --------------------------------------------------------------------------- #
+def _intro_blocks() -> list:
+    text = (
+        "Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
+        "elige las columnas categóricas más informativas — por su cardinalidad "
+        "y relevancia, no todas contra todas, para no inflar comparaciones "
+        "espurias — y resume las variables numéricas dentro de cada grupo "
+        "(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
+        "cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
+        "(siempre desde cero) comparan los grupos de un vistazo."
+    )
+    return [model.Heading(text=CHAPTER_TITLE, level=1),
+            model.Markdown(text=text)]
+
+
+def build_agregacion(profile: dict, ctx: dict):
+    """Build the AGREGACION Chapter, or None if the dataset can't be grouped.
+
+    Args:
+        profile: the ``eda`` group TableProfile dict.
+        ctx: presentation context (see module docstring for the keys consumed).
+
+    Returns:
+        A ``model.Chapter`` with per-group stats, pivots and bar charts; or
+        ``None`` when the dataset has no low-cardinality categorical column to
+        group by (the chapter does not apply).
+    """
+    profile = profile or {}
+    ctx = ctx or {}
+    if not isinstance(profile, dict):
+        return None
+
+    # Pre-computed results take precedence (offline / tests / forward-compat).
+    pre = ctx.get("aggregations")
+    if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
+        sections = _sections_from_precomputed(pre)
+        if not sections:
+            return None
+        blocks = _intro_blocks() + sections + _insights_section(ctx)
+        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                             version=CHAPTER_VERSION, blocks=blocks)
+
+    # Live path: needs at least one categorical key to group by.
+    candidates = _resolve_candidates(profile, ctx)
+    if not _is_dict(candidates) or not (candidates.get("group_keys")):
+        return None  # chapter does not apply: nothing to group by.
+
+    sections = _sections_live(profile, ctx, candidates)
+    if not sections:
+        # Applies (there are categorical keys) but no aggregation data is
+        # reachable: emit an honest note instead of fabricating numbers.
+        keys = ", ".join(model._safe_str((k or {}).get("col"))
+                         for k in candidates.get("group_keys") or []
+                         if _is_dict(k))
+        note = model.Note(
+            "No se pudo calcular la agregación: el capítulo necesita los datos "
+            "crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
+            "push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
+            f"Columnas categóricas candidatas: {keys or '—'}.")
+        blocks = _intro_blocks() + [note] + _insights_section(ctx)
+        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                             version=CHAPTER_VERSION, blocks=blocks)
+
+    blocks = _intro_blocks() + sections + _insights_section(ctx)
+    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,256 @@
+"""Tests for the AGREGACION chapter — DoD: golden + edges + error/no-cut path.
+
+Self-contained and deterministic: no DuckDB and no LLM. The aggregation results
+are passed pre-computed via ``ctx['aggregations']`` (the same shape the push-down
+registry functions ``groupby_stats_duckdb`` / ``pivot_table_duckdb`` produce), so
+the chapter's rendering logic is exercised without touching disk or the network.
+Live push-down + LLM selection are covered separately by the golden script.
+
+Verifies:
+- Golden: a profile with categoricals + numerics builds a Chapter with per-group
+  stats tables, a pivot table and bar-chart figures, and it renders to PDF AND
+  PPTX showing the group keys, values and pivot — nothing cut.
+- Edges: a dataset with no low-cardinality categorical returns None; an empty
+  profile returns None; a profile that *could* be grouped but has no reachable
+  data degrades to an honest note instead of raising.
+- No-cut: many groups (30) + a long interpretation paragraph survive intact in
+  the rendered PDF (table split by rows, text wrapped whole).
+"""
+
+import os
+import re
+import tempfile
+
+from pptx import Presentation
+from pypdf import PdfReader
+
+from datascience.automatic_eda.chapters.agregacion import build_agregacion
+from datascience.automatic_eda.model import Chapter
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+# --------------------------------------------------------------------------- #
+# Synthetic fixtures.
+# --------------------------------------------------------------------------- #
+def _profile() -> dict:
+    """Build the synthetic titanic-style profile shared by the tests.
+
+    Columns: one id-like numeric, two categoricals (`sex`, `pclass`) and two
+    numeric measures (`fare`, `age`).
+    """
+    passenger_id = {"name": "passenger_id", "inferred_type": "numeric",
+                    "unique_pct": 1.0, "flags": ["possible_id"],
+                    "numeric": {"mean": 446.0, "std": 257.0}}
+    sex = {"name": "sex", "inferred_type": "categorical", "distinct_count": 2,
+           "flags": [],
+           "categorical": {"n_distinct": 2, "imbalance": 0.1,
+                           "top": [{"value": "male", "count": 577}]}}
+    pclass = {"name": "pclass", "inferred_type": "categorical",
+              "distinct_count": 3, "flags": [],
+              "categorical": {"n_distinct": 3, "imbalance": 0.2}}
+    fare = {"name": "fare", "inferred_type": "numeric", "flags": [],
+            "numeric": {"mean": 32.2, "std": 49.7, "cv": 1.54}}
+    age = {"name": "age", "inferred_type": "numeric", "flags": [],
+           "numeric": {"mean": 29.7, "std": 14.5, "cv": 0.49}}
+    return {"table": "titanic", "source": "/data/titanic.csv", "n_rows": 891,
+            "n_cols": 5, "key_candidates": ["passenger_id"],
+            "columns": [passenger_id, sex, pclass, fare, age]}
+
+
+def _groupby_result(group_by: str, keys_n: list) -> dict:
+    """Fake a groupby_stats_duckdb-style payload over `fare` and `age`.
+
+    `keys_n` holds (group key, row count) pairs; several statistics are
+    offset by the pair's position so groups differ deterministically.
+    """
+    groups = [
+        {"key": label, "n": count,
+         "stats": {"fare": {"mean": 20.0 + pos * 15, "median": 10.0 + pos * 8,
+                            "std": 40.0 + pos, "min": 0.0, "max": 512.3},
+                   "age": {"mean": 28.0 + pos, "median": 27.0 + pos,
+                           "std": 14.0, "min": 0.42, "max": 80.0}}}
+        for pos, (label, count) in enumerate(keys_n)]
+    return {"status": "ok", "group_by": group_by, "measures": ["fare", "age"],
+            "aggs": ["count", "mean", "median", "std", "min", "max"],
+            "n_groups": len(groups), "truncated": False, "groups": groups}
+
+
+def _pivot_result() -> dict:
+    """Fake a pivot_table_duckdb-style payload: mean fare, sex x pclass."""
+    # One matrix row per index label, in row-label order.
+    fares = {"male": [62.0, 19.0, 12.0], "female": [110.0, 22.0, 15.0]}
+    return {"status": "ok", "index": "sex", "columns": "pclass",
+            "value": "fare", "agg": "mean", "row_labels": list(fares),
+            "col_labels": ["1", "2", "3"], "matrix": list(fares.values()),
+            "truncated_rows": False, "truncated_cols": False}
+
+
+def _ctx_precomputed() -> dict:
+    """Build a ctx carrying pre-computed aggregations plus one insight.
+
+    Holds two group-by entries (`sex`, `pclass`) and one `sex` x `pclass`
+    pivot of mean `fare`, all produced by the fixture helpers above.
+    """
+    by_sex = {"group_by": "sex", "measures": ["fare", "age"],
+              "why": "sexo del pasajero",
+              "result": _groupby_result("sex", [("male", 577), ("female", 314)])}
+    by_pclass = {"group_by": "pclass", "measures": ["fare", "age"],
+                 "why": "clase del billete",
+                 "result": _groupby_result(
+                     "pclass", [("3", 491), ("1", 216), ("2", 184)])}
+    pivot = {"index": "sex", "columns": "pclass", "value": "fare",
+             "agg": "mean", "why": "tarifa por sexo y clase",
+             "result": _pivot_result()}
+    insight = {
+        "title": "Tarifa por sexo",
+        "text": "Las mujeres pagaron de media casi el doble que los hombres.",
+    }
+    return {
+        "aggregations": {"groupby": [by_sex, by_pclass], "pivots": [pivot]},
+        "agg_insights": [insight],
+    }
+
+
+def _pdf_text(path: str) -> str:
+    """Extract the text of all PDF pages, collapsing whitespace runs."""
+    pages = PdfReader(path).pages
+    # Pages whose extraction yields nothing contribute an empty string.
+    raw = "".join(page.extract_text() or "" for page in pages)
+    return re.sub(r"\s+", " ", raw)
+
+
+def _pptx_text(path: str) -> str:
+    """Gather text-frame and table-cell text from every slide of a PPTX.
+
+    The pieces are joined with spaces and whitespace runs are collapsed.
+    """
+    def _shape_texts(shape):
+        # Text frame first, then table cells row by row, left to right.
+        if shape.has_text_frame:
+            yield shape.text_frame.text
+        if shape.has_table:
+            table = shape.table
+            n_rows, n_cols = len(table.rows), len(table.columns)
+            for r in range(n_rows):
+                for c in range(n_cols):
+                    yield table.cell(r, c).text
+
+    deck = Presentation(path)
+    chunks = [text
+              for slide in deck.slides
+              for shape in slide.shapes
+              for text in _shape_texts(shape)]
+    return re.sub(r"\s+", " ", " ".join(chunks))
+
+
+# --------------------------------------------------------------------------- #
+# Golden: builds a Chapter and renders to both formats.
+# --------------------------------------------------------------------------- #
+def test_golden_chapter_blocks_present():
+    """Golden: the pre-computed ctx yields a Chapter with the expected blocks."""
+    chapter = build_agregacion(_profile(), _ctx_precomputed())
+    assert isinstance(chapter, Chapter)
+    assert chapter.id == "agregacion"
+    block_kinds = [blk.kind for blk in chapter.blocks]
+    assert "heading" in block_kinds
+    # 2 group summaries + pivot (+ per-measure details).
+    n_tables = sum(1 for kind in block_kinds if kind == "data_table")
+    assert n_tables >= 3
+    assert "figure" in block_kinds  # at least one bar chart.
+    # The headings must mention both group keys and the pivot.
+    headings = " ".join(
+        blk.text for blk in chapter.blocks if blk.kind == "heading")
+    for needle in ("sex", "pclass", "Pivot"):
+        assert needle in headings
+
+
+def test_golden_render_pdf():
+    """Golden: the chapter renders to a PDF that keeps labels and insight."""
+    chapter = build_agregacion(_profile(), _ctx_precomputed())
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pdf_path = os.path.join(tmp_dir, "agg.pdf")
+        result = render_automatic_eda_pdf(
+            [chapter], pdf_path, {"write_manifest": False})
+        assert result["path"] == pdf_path
+        assert os.path.exists(pdf_path)
+        assert result["n_pages"] >= 1
+        text = _pdf_text(pdf_path)
+        expected = (
+            "Agregación por grupos",
+            "male", "female",   # group + pivot labels.
+            "Pivot",
+            "mediana",          # per-measure detail.
+            "casi el doble",    # interpretation kept.
+        )
+        for snippet in expected:
+            assert snippet in text
+
+
+def test_golden_render_pptx():
+    """Golden: the chapter renders to a PPTX showing the expected labels."""
+    chapter = build_agregacion(_profile(), _ctx_precomputed())
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pptx_path = os.path.join(tmp_dir, "agg.pptx")
+        result = render_automatic_eda_pptx(
+            [chapter], pptx_path, {"write_manifest": False})
+        assert result["path"] == pptx_path
+        assert os.path.exists(pptx_path)
+        assert result["n_slides"] >= 1
+        text = _pptx_text(pptx_path)
+        assert "male" in text
+        assert "pclass" in text
+        # Either the pivot heading or the `sex` key must be visible.
+        assert any(word in text for word in ("Pivot", "sex"))
+
+
+# --------------------------------------------------------------------------- #
+# Edges.
+# --------------------------------------------------------------------------- #
+def test_edge_no_categorical_returns_none():
+    """Edge: only numerics + an id -> nothing to group by, so no chapter."""
+    id_col = {"name": "id", "inferred_type": "numeric", "unique_pct": 1.0,
+              "flags": ["possible_id"], "numeric": {"std": 10.0}}
+    x_col = {"name": "x", "inferred_type": "numeric", "flags": [],
+             "numeric": {"mean": 1.0, "std": 2.0}}
+    prof = {"table": "t", "n_rows": 100, "key_candidates": ["id"],
+            "columns": [id_col, x_col]}
+    chapter = build_agregacion(prof, {})
+    assert chapter is None
+
+
+def test_edge_empty_profile_returns_none():
+    """Edge: an empty or missing profile/ctx pair yields no chapter."""
+    for profile, ctx in (({}, {}), (None, None)):
+        assert build_agregacion(profile, ctx) is None
+
+
+def test_edge_high_cardinality_only_returns_none():
+    """Edge: the only categorical is id-like (high cardinality) -> None."""
+    uuid_col = {"name": "uuid", "inferred_type": "categorical",
+                "distinct_count": 100,
+                "flags": ["high_cardinality", "possible_id"]}
+    x_col = {"name": "x", "inferred_type": "numeric", "flags": [],
+             "numeric": {"mean": 1.0, "std": 2.0}}
+    prof = {"table": "t", "n_rows": 100, "key_candidates": ["uuid"],
+            "columns": [uuid_col, x_col]}
+    chapter = build_agregacion(prof, {})
+    assert chapter is None
+
+
+def test_live_without_data_degrades_to_note():
+    """Live path without data: no exception, an honest note is emitted.
+
+    The profile has a categorical to group by, but ctx carries neither a
+    db_path nor pre-computed results, so the chapter still applies.
+    """
+    grp_col = {"name": "grp", "inferred_type": "categorical",
+               "distinct_count": 3, "flags": [],
+               "categorical": {"n_distinct": 3}}
+    v_col = {"name": "v", "inferred_type": "numeric", "flags": [],
+             "numeric": {"mean": 1.0, "std": 2.0}}
+    prof = {"table": "t", "n_rows": 100, "key_candidates": [],
+            "columns": [grp_col, v_col]}
+    chapter = build_agregacion(prof, {})
+    assert isinstance(chapter, Chapter)
+    note_texts = [blk.text for blk in chapter.blocks if blk.kind == "note"]
+    assert any("datos crudos" in text for text in note_texts)
+
+
+# --------------------------------------------------------------------------- #
+# No-cut: many groups + long text survive intact in the PDF.
+# --------------------------------------------------------------------------- #
+def test_anti_corte_muchos_grupos_y_texto_largo():
+    """No-cut: 30 groups and a 120-word paragraph survive whole in the PDF."""
+    group_sizes = [(f"grupo_{i:02d}", 30 - (i % 5)) for i in range(30)]
+    paragraph = " ".join(f"palabra{i}" for i in range(120))
+    groupby_entry = {"group_by": "cat", "measures": ["fare"],
+                     "why": "muchos niveles",
+                     "result": _groupby_result("cat", group_sizes)}
+    ctx = {"aggregations": {"groupby": [groupby_entry], "pivots": []},
+           "agg_insights": [{"title": "Nota larga", "text": paragraph}]}
+    chapter = build_agregacion(_profile(), ctx)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pdf_path = os.path.join(tmp_dir, "big.pdf")
+        result = render_automatic_eda_pdf(
+            [chapter], pdf_path, {"write_manifest": False})
+        assert result["path"] == pdf_path
+        # The 30-row table + figure are expected to spill across pages.
+        assert result["n_pages"] > 1
+        text = _pdf_text(pdf_path)
+        # First and last group labels both survive (table not truncated).
+        assert "grupo_00" in text
+        assert "grupo_29" in text
+        # First, middle and last words of the long paragraph all present.
+        for word_idx in (0, 60, 119):
+            assert f"palabra{word_idx}" in text