feat(eda): capítulo AutomaticEDA CAT DISTR + funciones cardinalidad/pie

Capítulo cat_distr del motor AutomaticEDA: distribuciones categóricas con explicación de entropía de Shannon, métricas de cardinalidad por columna (valores distintos, % distintos, total de filas, valores únicos, entropía y su máximo log2(k) + normalizada), tabla top-k y un donut de las categorías más comunes (top-k + «Otros»). Marca columnas id-like y dominadas. Delegadas a fn-constructor (grupo eda): - categorical_cardinality_block: deriva métricas de cardinalidad/entropía. - categorical_top_pie_figure: figura donut top-k + «Otros», leyenda lateral. Defensivo (dict-no-throw): None si no hay columnas categóricas; normaliza mode_pct a escala 0-100 (summarize_categorical lo emite como fracción). Tablas vía DataTable y figura perezosa: el paginador del núcleo garantiza no-corte en PDF y PPTX. Tests: golden + edge (sin categóricas) + anti-corte (label largo / muchas columnas) en ambos renderers. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 15:04:10 +02:00
parent cb7a7fc1fd
commit 649de07d6b
8 changed files with 1493 additions and 0 deletions
@@ -0,0 +1,402 @@
+"""Categorical distributions chapter (CAT DISTR).
+
+Third reference chapter for AutomaticEDA. For every categorical column it shows,
+fulfilling the user's request:
+
+1. A short opening explanation of **Shannon entropy** (what it measures, its 0
+   and log2(k) bounds, the normalized 0–1 version) and the dataset row total used
+   as a comparison baseline.
+2. Per column, a cardinality key/value table: distinct values, ``% distinct``
+   (distinct / total rows), total dataset rows, singleton values (frequency 1),
+   entropy with its theoretical maximum and the normalized ratio, mode, imbalance
+   and string-length stats.
+3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
+   single dominating category).
+4. A ``top-k`` table (value / count / %).
+5. A **donut pie chart** of the most common categories (top-k + an "Otros"
+   bucket), drawn lazily so the renderers scale it to fit entirely.
+
+Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
+output of ``summarize_categorical`` (``top[{value,count,pct}]``, ``mode``,
+``n_distinct``, ``entropy``, ``imbalance``, ``len_min/mean/max``). The derived
+cardinality metrics and the pie figure are delegated to two registry functions
+(``categorical_cardinality_block`` and ``categorical_top_pie_figure``); both are
+imported lazily and degrade to a minimal inline fallback so this chapter never
+raises even if they are unavailable.
+
+Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+"""
+
+from __future__ import annotations
+
+import math
+
+from .. import model
+
+# Version string stamped on the emitted Chapter (see build_cat_distr).
+CHAPTER_VERSION = "1.0.0"
+# Identifier and display title passed to model.Chapter.
+CHAPTER_ID = "cat_distr"
+CHAPTER_TITLE = "Distribuciones categóricas"
+
+# Cap the number of categorical columns rendered to keep the document bounded;
+# the rest are summarized in a closing note (no silent truncation).
+MAX_COLS = 40
+# Rows shown in each top-k table and explicit slices in the pie.
+TOP_TABLE_ROWS = 15
+PIE_TOP_K = 6
+# Default character limit of _truncate(). Note that _topk_table does not
+# truncate its cells; the pie labels, pie title and figure caption pass their
+# own explicit limits to _truncate().
+LABEL_MAX = 48
+
+
+def _fmt_int(value) -> str:
+    """Format ``value`` as an integer using ``.`` as thousands separator.
+
+    ``None`` yields the ``—`` placeholder. Anything ``int()`` cannot convert
+    (non-numeric strings, NaN, ±infinity) is returned as ``str(value)``
+    instead of raising, so a malformed profile never breaks the chapter.
+    """
+    if value is None:
+        return "—"
+    try:
+        return f"{int(value):,}".replace(",", ".")
+    except (TypeError, ValueError, OverflowError):
+        # int(float("nan")) raises ValueError; int(float("inf")) raises
+        # OverflowError, which must be handled as well.
+        return str(value)
+
+
+def _fmt_num(value, decimals: int = 3) -> str:
+    """Format a scalar for display in the cardinality table.
+
+    * ``None`` -> ``—`` placeholder.
+    * ``bool`` -> ``str(value)`` (checked before ``int``, since ``bool`` is a
+      subclass of ``int``).
+    * ``int`` -> digits grouped with ``.`` as thousands separator.
+    * ``float`` -> ``"NaN"`` for NaN, ``str(value)`` for ±infinity, otherwise
+      fixed-point with ``decimals`` digits and trailing zeros removed.
+    * anything else -> ``str(value)``.
+    """
+    if value is None:
+        return "—"
+    if isinstance(value, bool):
+        return str(value)
+    if isinstance(value, int):
+        return f"{value:,}".replace(",", ".")
+    if isinstance(value, float):
+        if math.isnan(value):
+            return "NaN"
+        if math.isinf(value):
+            return str(value)
+        text = f"{value:.{decimals}f}"
+        # Only strip zeros that belong to the fractional part: with
+        # decimals=0 there is no "." and stripping would turn "100" into "1".
+        if "." in text:
+            text = text.rstrip("0").rstrip(".")
+        # Values that round to zero from below would otherwise show as "-0".
+        if text in ("", "-0"):
+            return "0"
+        return text
+    return str(value)
+
+
+def _fmt_pct_value(value, decimals: int = 1) -> str:
+    """Render a value that is already a percentage (0–100) as ``"12.3%"``.
+
+    ``None`` gives the ``—`` placeholder; a value that cannot be converted to
+    ``float`` is returned as ``str(value)``.
+    """
+    if value is None:
+        return "—"
+    try:
+        rendered = format(float(value), f".{decimals}f")
+    except (TypeError, ValueError):
+        return str(value)
+    return rendered + "%"
+
+
+def _pct_from_maybe_fraction(value, decimals: int = 1) -> str:
+    """Format a share that may be a 0–1 fraction or a 0–100 percentage.
+
+    Values less than or equal to 1.0 are treated as fractions and multiplied
+    by 100; larger values are taken as percentages already. ``None`` gives the
+    ``—`` placeholder and non-numeric input is returned as ``str(value)``.
+    """
+    if value is None:
+        return "—"
+    try:
+        number = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    percent = number * 100.0 if number <= 1.0 else number
+    return f"{percent:.{decimals}f}%"
+
+
+def _truncate(text: str, limit: int = LABEL_MAX) -> str:
+    """Shorten ``text`` to at most ``limit`` characters, ending in ``…``.
+
+    The input is first converted with ``model._safe_str``. Strings that
+    already fit are returned unchanged; longer ones keep their first
+    ``limit - 1`` characters (at least one), lose trailing whitespace and get
+    an ellipsis appended.
+    """
+    label = model._safe_str(text)
+    if len(label) > limit:
+        keep = max(1, limit - 1)
+        return label[:keep].rstrip() + "…"
+    return label
+
+
+def _is_categorical(col: dict) -> bool:
+    """Tell whether a column entry should be rendered by this chapter.
+
+    The entry must be a dict whose ``categorical`` value is a dict with a
+    non-empty ``top``; columns whose ``inferred_type`` is ``"numeric"`` are
+    excluded even when they expose a top list.
+    """
+    if not isinstance(col, dict):
+        return False
+    summary = col.get("categorical")
+    has_top = isinstance(summary, dict) and bool(summary.get("top"))
+    return has_top and col.get("inferred_type") != "numeric"
+
+
+def _cardinality(cat: dict, n_rows) -> dict:
+    """Return the cardinality metrics dict for one column.
+
+    The registry function ``categorical_cardinality_block`` is imported
+    lazily and used when it returns a dict. If the import or the call fails,
+    or the result is not a dict, the inline ``_fallback_cardinality`` is used
+    instead.
+    """
+    derived = None
+    try:
+        from datascience.categorical_cardinality_block import (
+            categorical_cardinality_block as registry_fn,
+        )
+
+        derived = registry_fn(cat=cat, n_rows=n_rows)
+    except Exception:  # noqa: BLE001 — fall back to the inline derivation.
+        derived = None
+    if isinstance(derived, dict):
+        return derived
+    return _fallback_cardinality(cat, n_rows)
+
+
+def _fallback_cardinality(cat: dict, n_rows) -> dict:
+    """Inline derivation of the cardinality metrics for one column.
+
+    Used when the registry function is unavailable. Reads ``top``,
+    ``n_distinct``, ``entropy``, ``mode``, ``mode_pct``, ``imbalance`` and
+    ``len_min/mean/max`` from ``cat`` and derives:
+
+    * ``pct_distinct``: ``n_distinct / n_rows * 100`` (``None`` without rows).
+    * ``entropy_max``: ``log2(n_distinct)``, or 0.0 for at most one value.
+    * ``entropy_norm``: ``entropy / entropy_max`` clamped to 0–1.
+    * ``n_singletons``: entries of ``top`` with ``count == 1``; flagged as
+      partial when ``n_distinct`` exceeds the length of ``top``.
+    * ``id_like`` / ``dominated`` flags (>= 99% distinct, mode >= 90%).
+
+    ``mode_pct`` is returned on the scale it arrived in (taken from
+    ``cat['mode_pct']`` or, failing that, ``top[0]['pct']``). The caller pipes
+    this dict through ``_normalize_card``, which performs the fraction ->
+    percent scaling; scaling here as well would multiply by 100 twice for
+    modes below 1% of the rows (0.005 -> 0.5 -> 50%).
+    """
+    cat = cat if isinstance(cat, dict) else {}
+    top = cat.get("top")
+    if not isinstance(top, (list, tuple)):
+        top = []
+    n_distinct = cat.get("n_distinct")
+    entropy = cat.get("entropy")
+
+    def _is_number(v) -> bool:
+        # bool is a subclass of int but is never a meaningful metric here.
+        return isinstance(v, (int, float)) and not isinstance(v, bool)
+
+    try:
+        nr = int(n_rows) if n_rows is not None else None
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError covers an infinite n_rows.
+        nr = None
+
+    pct_distinct = None
+    if _is_number(n_distinct) and nr:
+        pct_distinct = float(n_distinct) / nr * 100.0
+    entropy_max = None
+    if _is_number(n_distinct):
+        entropy_max = math.log2(n_distinct) if n_distinct > 1 else 0.0
+    entropy_norm = None
+    if _is_number(entropy) and entropy_max:
+        entropy_norm = max(0.0, min(1.0, float(entropy) / entropy_max))
+
+    mode_pct = cat.get("mode_pct")
+    if mode_pct is None and top and isinstance(top[0], dict):
+        mode_pct = top[0].get("pct")
+    if _is_number(mode_pct):
+        mode_pct = float(mode_pct)
+        # Percent-scale copy used only for the local ``dominated`` flag.
+        mode_pct_100 = mode_pct * 100.0 if mode_pct <= 1.0 else mode_pct
+    else:
+        mode_pct = None
+        mode_pct_100 = None
+
+    n_singletons = None
+    if top:
+        n_singletons = sum(
+            1 for t in top if isinstance(t, dict) and t.get("count") == 1)
+    return {
+        "n_distinct": n_distinct,
+        "n_rows": nr,
+        "pct_distinct": pct_distinct,
+        "entropy": entropy,
+        "entropy_max": entropy_max,
+        "entropy_norm": entropy_norm,
+        "mode": cat.get("mode"),
+        "mode_pct": mode_pct,
+        "imbalance": cat.get("imbalance"),
+        "n_singletons": n_singletons,
+        "n_singletons_partial": (
+            _is_number(n_distinct) and n_distinct > len(top)),
+        "len_min": cat.get("len_min"),
+        "len_mean": cat.get("len_mean"),
+        "len_max": cat.get("len_max"),
+        "id_like": pct_distinct is not None and pct_distinct >= 99.0,
+        "dominated": mode_pct_100 is not None and mode_pct_100 >= 90.0,
+    }
+
+
+def _pie_make(top, n_distinct, title, n_rows):
+    """Build the lazy figure factory for one column's donut chart.
+
+    Returns a zero-argument callable. When invoked it imports the registry
+    function ``categorical_top_pie_figure`` and calls it with ``PIE_TOP_K``
+    slices; any exception raised by the import or the call makes it return
+    the local ``_fallback_pie`` figure instead.
+    """
+
+    def make():
+        try:
+            from datascience.categorical_top_pie_figure import (
+                categorical_top_pie_figure as build_pie,
+            )
+
+            figure = build_pie(
+                top=top,
+                n_distinct=n_distinct or 0,
+                title=title,
+                top_k=PIE_TOP_K,
+                n_rows=n_rows,
+            )
+        except Exception:  # noqa: BLE001 — minimal local fallback figure.
+            figure = _fallback_pie(top, title)
+        return figure
+
+    return make
+
+
+def _fallback_pie(top, title):
+    """Draw a minimal donut chart without the registry function.
+
+    Entries of ``top`` that are dicts with a numeric ``count`` are ranked by
+    count; the first ``PIE_TOP_K`` become individual slices and the remainder
+    is merged into an ``Otros (n)`` slice. When there is nothing positive to
+    plot, a placeholder text is drawn instead. Switches Matplotlib to the
+    ``Agg`` backend and returns a ``matplotlib.figure.Figure``.
+    """
+    import matplotlib
+
+    matplotlib.use("Agg")
+    from matplotlib.figure import Figure
+
+    def _count(entry):
+        return entry.get("count") or 0
+
+    def _slice_label(pct):
+        # Hide the percentage on slices too thin to hold the text.
+        return f"{pct:.0f}%" if pct >= 4 else ""
+
+    fig = Figure(figsize=(5.0, 3.2))
+    ax = fig.add_subplot(111)
+
+    usable = (
+        entry for entry in (top or [])
+        if isinstance(entry, dict)
+        and isinstance(entry.get("count"), (int, float))
+    )
+    ranked = sorted(usable, key=_count, reverse=True)
+    shown, remainder = ranked[:PIE_TOP_K], ranked[PIE_TOP_K:]
+
+    labels = [_truncate(entry.get("value"), 20) for entry in shown]
+    sizes = [float(_count(entry)) for entry in shown]
+    if remainder:
+        labels.append(f"Otros ({len(remainder)})")
+        sizes.append(sum(float(_count(entry)) for entry in remainder))
+
+    if not sizes or sum(sizes) <= 0:
+        ax.text(0.5, 0.5, "sin datos categóricos", ha="center", va="center")
+        ax.axis("off")
+        return fig
+
+    ax.pie(sizes, labels=None, wedgeprops={"width": 0.42},
+           autopct=_slice_label)
+    # Legend sits to the right of the donut so long labels do not overlap it.
+    ax.legend(labels, loc="center left", bbox_to_anchor=(1.0, 0.5),
+              fontsize=7, frameon=False)
+    ax.set_title(_truncate(title, 40))
+    fig.tight_layout()
+    return fig
+
+
+def _normalize_card(card: dict) -> dict:
+    """Return a copy of ``card`` with scale-dependent fields recomputed.
+
+    ``mode_pct`` values less than or equal to 1.0 are treated as fractions
+    and multiplied by 100; non-numeric (or bool) values become ``None``. The
+    ``dominated`` flag (mode >= 90%) and the ``id_like`` flag
+    (``pct_distinct`` >= 99%) are then overwritten from the normalized
+    values. The input dict is not modified.
+    """
+    result = dict(card or {})
+
+    raw_mode = result.get("mode_pct")
+    if isinstance(raw_mode, bool) or not isinstance(raw_mode, (int, float)):
+        mode_pct = None
+    elif raw_mode <= 1.0:
+        mode_pct = float(raw_mode) * 100.0
+    else:
+        mode_pct = float(raw_mode)
+    result["mode_pct"] = mode_pct
+    result["dominated"] = mode_pct is not None and mode_pct >= 90.0
+
+    distinct_share = result.get("pct_distinct")
+    result["id_like"] = (
+        isinstance(distinct_share, (int, float)) and distinct_share >= 99.0)
+    return result
+
+
+def _cardinality_block(card: dict):
+    """Build the "Cardinalidad" key/value table for one column.
+
+    Always emits the distinct / % distinct / total rows / singletons /
+    entropy / normalized entropy / mode rows; the imbalance and string-length
+    rows are added only when the corresponding values are present.
+    """
+    # Singletons: mark the figure as a lower bound when only part of the
+    # distinct values was inspected.
+    singleton_count = card.get("n_singletons")
+    if singleton_count is None:
+        singletons = "—"
+    elif card.get("n_singletons_partial"):
+        singletons = f"≥{_fmt_int(singleton_count)} (en top mostrado)"
+    else:
+        singletons = _fmt_int(singleton_count)
+
+    # Entropy, annotated with its theoretical maximum when known.
+    entropy_text = _fmt_num(card.get("entropy"))
+    entropy_cap = card.get("entropy_max")
+    if entropy_cap is not None:
+        entropy_text = f"{entropy_text} (máx {_fmt_num(entropy_cap)})"
+
+    # Mode, annotated with its share when known.
+    mode = card.get("mode")
+    mode_pct = card.get("mode_pct")
+    if mode is None:
+        mode_text = "—"
+    else:
+        mode_text = model._safe_str(mode)
+        if mode_pct is not None:
+            mode_text = f"{mode_text} ({_fmt_pct_value(mode_pct)})"
+
+    rows = [
+        ("Valores distintos", _fmt_int(card.get("n_distinct"))),
+        ("% distintos", _fmt_pct_value(card.get("pct_distinct"))),
+        ("Total filas (dataset)", _fmt_int(card.get("n_rows"))),
+        ("Valores únicos (frecuencia 1)", singletons),
+        ("Entropía (bits)", entropy_text),
+        ("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))),
+        ("Moda", mode_text),
+    ]
+
+    imbalance = card.get("imbalance")
+    if imbalance is not None:
+        rows.append(("Desbalance", _fmt_num(imbalance)))
+
+    lengths = [card.get(key) for key in ("len_min", "len_mean", "len_max")]
+    if any(length is not None for length in lengths):
+        rows.append((
+            "Longitud (mín/media/máx)",
+            " / ".join(_fmt_num(length) for length in lengths)))
+
+    return model.KVTable(rows=rows, title="Cardinalidad")
+
+
+def _flag_note(card: dict):
+    """Return a ``model.Note`` warning about problematic cardinality.
+
+    The id-like warning takes precedence over the dominated-category one;
+    ``None`` is returned when neither flag is set.
+    """
+    if card.get("id_like"):
+        return model.Note(
+            "Casi todos los valores son distintos (≈100% distintos): la columna "
+            "se comporta como un identificador y aporta poco para agrupar o "
+            "comparar categorías.")
+    if not card.get("dominated"):
+        return None
+    share = card.get("mode_pct")
+    mp_str = "muy alta" if share is None else _fmt_pct_value(share)
+    return model.Note(
+        f"Una sola categoría domina la columna (moda {mp_str}): la "
+        "distribución está muy desbalanceada.")
+
+
+def _topk_table(cat: dict):
+    """Build the "Top categorías" table (value / count / %) for one column.
+
+    Only the first ``TOP_TABLE_ROWS`` entries of ``cat['top']`` are
+    considered and non-dict entries are skipped. Returns ``None`` when no row
+    remains. The table note states how many categories are shown and, when
+    ``n_distinct`` is larger, out of how many.
+    """
+    entries = (cat.get("top") or [])[:TOP_TABLE_ROWS]
+    rows = [
+        [
+            model._safe_str(entry.get("value")),
+            _fmt_int(entry.get("count")),
+            _pct_from_maybe_fraction(entry.get("pct")),
+        ]
+        for entry in entries
+        if isinstance(entry, dict)
+    ]
+    if not rows:
+        return None
+
+    shown = len(rows)
+    n_distinct = cat.get("n_distinct")
+    has_more = isinstance(n_distinct, (int, float)) and n_distinct > shown
+    if has_more:
+        note = f"top {shown} de {_fmt_int(n_distinct)} categorías distintas"
+    else:
+        note = f"{shown} categorías"
+    return model.DataTable(header=["Valor", "Conteo", "%"], rows=rows,
+                           title="Top categorías", note=note)
+
+
+def _intro_blocks(n_rows):
+    """Return the opening blocks of the chapter: a level-2 heading and a
+    Markdown paragraph explaining Shannon entropy and the per-column content.
+
+    When ``n_rows`` is not ``None`` a closing sentence with the dataset row
+    total is appended to the paragraph.
+    """
+    explanation = (
+        "La **entropía de Shannon** mide cómo de repartidos están los valores de "
+        "una columna categórica, en bits. Vale 0 cuando una sola categoría "
+        "concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
+        "log2(k) para k categorías distintas, cuando todas aparecen por igual "
+        "(máxima diversidad). La **entropía normalizada** (entropía dividida por "
+        "su máximo) la lleva al rango 0–1 para comparar columnas con distinto "
+        "número de categorías. Para cada columna se muestran los valores "
+        "distintos, el porcentaje que representan sobre el total de filas, los "
+        "valores únicos (que aparecen una sola vez), la tabla de las categorías "
+        "más frecuentes y un gráfico de tarta (donut) de las más comunes."
+    )
+    row_total = _fmt_int(n_rows)
+    if n_rows is not None:
+        explanation = (
+            explanation
+            + f" El dataset tiene {row_total} filas en total como referencia.")
+    heading = model.Heading(text="Entropía y cardinalidad", level=2)
+    body = model.Markdown(text=explanation)
+    return [heading, body]
+
+
+def build_cat_distr(profile: dict, ctx: dict):
+    """Assemble the categorical-distributions chapter.
+
+    Returns ``None`` when the profile has no categorical column. Otherwise
+    the chapter holds the entropy introduction followed, for each of the
+    first ``MAX_COLS`` categorical columns, by a heading, the cardinality
+    table, an optional warning note, the top-k table (when it has rows) and a
+    lazily built donut figure. A closing note reports how many columns were
+    left out when the cap is exceeded. ``ctx`` is accepted for the chapter
+    contract and is not read here.
+    """
+    profile = profile or {}
+    ctx = ctx or {}
+    categorical = [
+        column for column in (profile.get("columns") or [])
+        if _is_categorical(column)
+    ]
+    if not categorical:
+        return None
+
+    n_rows = profile.get("n_rows")
+    blocks = list(_intro_blocks(n_rows))
+
+    shown = categorical[:MAX_COLS]
+    for column in shown:
+        name = column.get("name") or "(columna)"
+        summary = column.get("categorical") or {}
+        card = _normalize_card(_cardinality(summary, n_rows))
+
+        blocks.append(model.Heading(text=str(name), level=2))
+        blocks.append(_cardinality_block(card))
+
+        # Optional pieces are only added when they exist.
+        warning = _flag_note(card)
+        if warning is not None:
+            blocks.append(warning)
+        table = _topk_table(summary)
+        if table is not None:
+            blocks.append(table)
+
+        make_figure = _pie_make(
+            summary.get("top") or [], card.get("n_distinct"), str(name),
+            n_rows)
+        caption = (f"Categorías más comunes de «{_truncate(name, 32)}» "
+                   "(donut: top-k + «Otros»)")
+        blocks.append(model.Figure(make=make_figure, caption=caption))
+
+    omitted = len(categorical) - len(shown)
+    if omitted > 0:
+        blocks.append(model.Note(
+            f"Se muestran las primeras {len(shown)} columnas categóricas; "
+            f"quedan {omitted} sin mostrar para mantener acotado el informe."))
+
+    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,186 @@
+"""Tests for the CAT DISTR chapter — DoD: golden + edges + anti-cut.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
+asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table
+and a donut figure), that the chapter renders inside the full document to both
+PDF and PPTX showing that content, that a profile with no categorical columns
+yields ``None`` without raising, and that long labels / many columns are never
+cut in either output.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import (
+    DataTable, Figure, Heading, KVTable, Note,
+)
+from datascience.automatic_eda.chapters.cat_distr import (
+    CHAPTER_ID, CHAPTER_VERSION, build_cat_distr,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _profile() -> dict:
+    """Synthetic table profile shared by the golden tests.
+
+    Holds one numeric column (``precio``), one ordinary categorical column
+    (``categoria``) and one column where every listed value appears once and
+    ``n_distinct`` equals the row total (``uuid``).
+    """
+    precio = {
+        "name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
+        "null_count": 0,
+        "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
+                    "max": 100.0, "std": 12.3},
+    }
+    categoria = {
+        "name": "categoria", "inferred_type": "categorical",
+        "null_pct": 0.0, "null_count": 0, "distinct_count": 8,
+        "categorical": {
+            "top": [
+                {"value": "neumaticos", "count": 500, "pct": 0.5},
+                {"value": "aceite", "count": 300, "pct": 0.3},
+                {"value": "filtros", "count": 120, "pct": 0.12},
+                {"value": "frenos", "count": 80, "pct": 0.08},
+            ],
+            "mode": "neumaticos", "n_distinct": 8, "entropy": 1.6,
+            "imbalance": 6.25, "len_min": 6, "len_mean": 7.5,
+            "len_max": 10,
+        },
+    }
+    uuid_top = [{"value": f"id-{i}", "count": 1} for i in range(5)]
+    uuid = {
+        "name": "uuid", "inferred_type": "categorical",
+        "null_pct": 0.0, "null_count": 0, "distinct_count": 1000,
+        "categorical": {
+            "top": uuid_top,
+            "mode": "id-0", "n_distinct": 1000, "entropy": 9.97,
+            "imbalance": 1.0,
+        },
+    }
+    return {
+        "table": "productos",
+        "source": "/data/productos.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 1000,
+        "n_cols": 3,
+        "quality_score": 90.0,
+        "columns": [precio, categoria, uuid],
+    }
+
+
+def _pdf_text(path: str) -> str:
+    """Return the text extracted from every page of the PDF at ``path``,
+    concatenated, with each whitespace run collapsed to a single space."""
+    pages = PdfReader(path).pages
+    raw = "".join(page.extract_text() or "" for page in pages)
+    return re.sub(r"\s+", " ", raw)
+
+
+def _pptx_text(path: str) -> str:
+    """Return the text of the presentation at ``path``: the text frame of
+    every shape plus every table cell, joined with spaces and with each
+    whitespace run collapsed to a single space."""
+    chunks = []
+    for slide in Presentation(path).slides:
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                chunks.append(shape.text_frame.text)
+            if shape.has_table:
+                table = shape.table
+                n_rows, n_cols = len(table.rows), len(table.columns)
+                chunks.extend(
+                    table.cell(r, c).text
+                    for r in range(n_rows)
+                    for c in range(n_cols))
+    return re.sub(r"\s+", " ", " ".join(chunks))
+
+
+def _kinds(chapter):
+    """List the ``kind`` of every block of ``chapter``, in order."""
+    kinds = []
+    for block in chapter.blocks:
+        kinds.append(block.kind)
+    return kinds
+
+
+def test_golden_build_cat_distr_emite_bloques_pedidos():
+    """Golden: the chapter built from the synthetic profile carries the
+    entropy intro, the cardinality metrics, a top-k table, a figure and the
+    id-like warning note."""
+    ch = build_cat_distr(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+    kinds = _kinds(ch)
+    # Entropy intro present.
+    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
+    assert any("Entrop" in h for h in headings)
+    # Check presence first so a missing block fails as an assertion rather
+    # than as a StopIteration raised by next().
+    assert "markdown" in kinds
+    md = next(b for b in ch.blocks if b.kind == "markdown")
+    assert "entropía" in md.text.lower() and "log2" in md.text
+    # Cardinality metrics: distinct, total rows, %-distinct, unique values.
+    kv = next(b for b in ch.blocks if isinstance(b, KVTable))
+    labels = [r[0] for r in kv.rows]
+    assert "Valores distintos" in labels
+    assert "% distintos" in labels
+    assert "Total filas (dataset)" in labels
+    assert "Valores únicos (frecuencia 1)" in labels
+    assert any("Entropía" in lbl for lbl in labels)
+    # Top-k table + pie figure.
+    dt = next(b for b in ch.blocks if isinstance(b, DataTable))
+    assert dt.header == ["Valor", "Conteo", "%"]
+    assert any("neumaticos" in str(cell) for row in dt.rows for cell in row)
+    assert any(isinstance(b, Figure) for b in ch.blocks)
+    # id-like column flagged with a Note.
+    assert any(isinstance(b, Note) and "identificador" in b.text
+               for b in ch.blocks)
+
+
+def test_golden_render_pdf_muestra_categoricas():
+    """Golden: the PDF rendered from the synthetic profile lists this chapter
+    and its extracted text shows the categorical content."""
+    expected_tokens = (
+        "Entrop",
+        "distintos",
+        "categoria",
+        "neumaticos",
+        "donut",          # figure caption rendered as text.
+        "identificador",  # id-like note rendered.
+    )
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "eda.pdf")
+        result = render_automatic_eda_pdf(_profile(), target, {"title": "EDA"})
+        assert result["path"] == target
+        assert os.path.exists(target)
+        chapter_ids = [c["id"] for c in result["chapters"]]
+        assert CHAPTER_ID in chapter_ids
+        text = _pdf_text(target)
+        for token in expected_tokens:
+            assert token in text
+
+
+def test_golden_render_pptx_muestra_categoricas():
+    """Golden: the PPTX rendered from the synthetic profile lists this
+    chapter and its slide text shows the categorical content."""
+    expected_tokens = ("Entrop", "categoria", "neumaticos", "distintos")
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "eda.pptx")
+        result = render_automatic_eda_pptx(
+            _profile(), target, {"title": "EDA"})
+        assert result["path"] == target
+        assert os.path.exists(target)
+        chapter_ids = [c["id"] for c in result["chapters"]]
+        assert CHAPTER_ID in chapter_ids
+        text = _pptx_text(target)
+        for token in expected_tokens:
+            assert token in text
+
+
+def test_edge_sin_categoricas_devuelve_none():
+    """Edge: profiles without categorical columns, and degenerate inputs,
+    yield ``None`` without raising."""
+    only_numeric = {
+        "n_rows": 10,
+        "columns": [
+            {"name": "x", "inferred_type": "numeric",
+             "numeric": {"mean": 1.0}},
+        ],
+    }
+    degenerate_inputs = [
+        (only_numeric, {}),
+        (None, None),
+        ({}, {}),
+        ({"columns": []}, {}),
+    ]
+    for profile, ctx in degenerate_inputs:
+        assert build_cat_distr(profile, ctx) is None
+
+
+def test_anti_corte_label_largo_y_muchas_columnas():
+    """Anti-cut: 30 categorical columns sharing a very long label render to
+    a multi-page PDF in which every probed word of the label survives, and
+    the PPTX renderer produces its file without raising."""
+    long_label = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed "
+                  "do eiusmod tempor incididunt ut labore reprehenderit voluptate")
+
+    def _column(index):
+        return {
+            "name": f"cat_{index}", "inferred_type": "categorical",
+            "distinct_count": 3,
+            "categorical": {
+                "top": [{"value": long_label, "count": 60},
+                        {"value": "b", "count": 30},
+                        {"value": "c", "count": 10}],
+                "mode": long_label, "n_distinct": 3, "entropy": 1.2}}
+
+    cols = [_column(i) for i in range(30)]
+    profile = {"table": "t", "source": "t.csv", "n_rows": 100,
+               "n_cols": len(cols), "columns": cols}
+
+    assert build_cat_distr(profile, {}) is not None
+
+    options = {"write_manifest": False}
+    with tempfile.TemporaryDirectory() as tmp:
+        pdf_path = os.path.join(tmp, "anti.pdf")
+        pdf_result = render_automatic_eda_pdf(profile, pdf_path, options)
+        assert pdf_result["path"] == pdf_path
+        # Many columns spill across pages, which is expected.
+        assert pdf_result["n_pages"] > 1
+        text = _pdf_text(pdf_path)
+        # Long label wrapped (not truncated): every word survives.
+        for word in ("Lorem", "incididunt", "reprehenderit", "voluptate"):
+            assert word in text
+
+        # PPTX path must not raise either.
+        pptx_path = os.path.join(tmp, "anti.pptx")
+        pptx_result = render_automatic_eda_pptx(
+            profile, pptx_path, {"write_manifest": False})
+        assert pptx_result["path"] == pptx_path
+        assert os.path.exists(pptx_path)