From 833597c831fac49c2a6e2f1e0ea61268a8947a5f Mon Sep 17 00:00:00 2001
From: Egutierrez <egutierrez@dead.dd>
Date: Tue, 30 Jun 2026 19:45:09 +0200
Subject: [PATCH] =?UTF-8?q?fix(eda):=20cat=5Fdistr=20PPTX=20=E2=80=94=20co?=
 =?UTF-8?q?lumnas=20de=20alta=20cardinalidad=20caben=20en=20UN=20slide=20c?=
 =?UTF-8?q?on=20su=20gr=C3=A1fico?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

La verificación adversarial detectó que, en PPTX (slide 16:9, corto), las columnas
categóricas de ALTA cardinalidad NO id-like (Ticket, Cabin) ocupaban 3 slides cada
una con el donut SEPARADO de su tabla: el top-k de 8 filas largas no cabía junto al
donut y el keep-together partía la columna. (El PDF, en A5, ya estaba 1:1 correcto.)

Arreglo SOLO en render_pptx_impl.py:

- `_fit_group_blocks` (nuevo): para un Group con figura + DataTable que no cabe en el
  slide, reserva un alto mínimo para el donut (`_GROUP_MIN_FIG_H`) y recorta las filas
  de la DataTable a lo que queda, de modo que el gráfico se queda en el MISMO slide,
  junto a su tabla. No-op cuando ya cabe o no hay par figura+tabla (p.ej. columnas
  id-like, que ya omiten la top-k).
- `_trim_data_table_to_budget` (nuevo): devuelve una COPIA de la DataTable con las
  filas que caben (al menos una) + nota honesta "top N de M categorías mostradas
  (recortado para caber en el slide; el PDF muestra más)". NUNCA muta el bloque
  original, que es compartido con el renderer PDF (el PDF sigue mostrando la tabla
  completa en A5).
- `_place_group`: aplica `_fit_group_blocks` antes de `_shrink_group_figures`.

Refuerzo de cat_distr_test.py:

- `test_golden_pptx_una_slide_por_columna_con_su_grafico`: perfil con una columna
  categórica de alta cardinalidad no-id-like (40 valores largos sobre 5000 filas,
  0.8% de distintos) que reproduce el caso Ticket/Cabin. Verifica que CADA columna
  categórica aparece en EXACTAMENTE UN slide del capítulo y que ese mismo slide lleva
  su tabla (Cardinalidad/distintos) Y su donut (caption + shape Picture) — el gráfico
  nunca se separa de su tabla. Sustituye al laxo `n_slides >= 2`.

Verificado con titanic_train.csv (render_automatic_eda run_models=True): 5 columnas
categóricas (Name, Sex, Ticket, Cabin, Embarked); PDF 6 páginas y PPTX 6 slides del
capítulo (intro + 1 por columna), cada columna con su donut junto a su tabla en una
sola página/slide. Ticket y Cabin pasaron de 3 slides a 1. Suite verde (122 passed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../automatic_eda/chapters/cat_distr_test.py  | 77 +++++++++++++++++++
 .../automatic_eda/render_pptx_impl.py         | 74 ++++++++++++++++++
 2 files changed, 151 insertions(+)

diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py
index f26984a9..919b86fa 100644
--- a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py
+++ b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py
@@ -222,6 +222,83 @@ def test_golden_render_pptx_muestra_categoricas():
         assert "distintos" in txt
 
 
+def _profile_high_card() -> dict:
+    """Build a profile dict with one numeric and two categorical columns; the
+    high-cardinality, NON-id-like one ("alta_card_col": 40 long values, counts
+    60 down to 21 over 5000 rows) is the Ticket / Cabin case whose top-k table
+    would split from its donut on a short 16:9 slide unless it is trimmed."""
+    long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra"
+                 for i in range(40)]
+    top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0}
+           for i, v in enumerate(long_vals)]
+    return {
+        "table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3,
+        "quality_score": 80.0,
+        "columns": [
+            {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
+             "numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
+                         "std": 0.5}},
+            # 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps
+            # its (long) top-k table; the tall table must not push the donut off.
+            {"name": "alta_card_col", "inferred_type": "categorical",
+             "null_pct": 0.0, "distinct_count": 40,
+             "categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40,
+                             "entropy": 5.2, "imbalance": 1.2, "len_min": 40,
+                             "len_mean": 45, "len_max": 50}},
+            {"name": "baja_card_col", "inferred_type": "categorical",
+             "null_pct": 0.0, "distinct_count": 4,
+             "categorical": {
+                 "top": [{"value": "norte", "count": 2000, "pct": 0.4},
+                         {"value": "sur", "count": 1500, "pct": 0.3},
+                         {"value": "este", "count": 1000, "pct": 0.2},
+                         {"value": "oeste", "count": 500, "pct": 0.1}],
+                 "mode": "norte", "n_distinct": 4, "entropy": 1.8}},
+        ],
+    }
+
+
+def test_golden_pptx_una_slide_por_columna_con_su_grafico():
+    """Each categorical column must appear on EXACTLY ONE cat_distr slide, and
+    that slide must carry BOTH its table text ("Cardinalidad"/"distintos") and
+    its donut (the "donut" caption text plus a PICTURE shape) — never split."""
+    from pptx.enum.shapes import MSO_SHAPE_TYPE
+
+    prof = _profile_high_card()
+    cat_names = ["alta_card_col", "baja_card_col"]
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        prs = Presentation(out)
+
+        # Per column: indices of the chapter slides whose text mentions it, and
+        # whether such a slide also has table text, donut caption and a picture.
+        slides_with_col = {n: [] for n in cat_names}
+        owner_has_chart = {n: False for n in cat_names}
+        for i, sl in enumerate(prs.slides):
+            texts, has_pic = [], False
+            for sh in sl.shapes:
+                if sh.has_text_frame:
+                    texts.append(sh.text_frame.text)
+                if sh.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    has_pic = True
+            txt = re.sub(r"\s+", " ", " ".join(texts))  # collapse whitespace runs.
+            if "Distribuciones categ" not in txt:   # not this chapter (footer stamp).
+                continue
+            for n in cat_names:
+                if n in txt:
+                    slides_with_col[n].append(i)
+                    has_table = "Cardinalidad" in txt or "distintos" in txt
+                    if has_pic and "donut" in txt and has_table:
+                        owner_has_chart[n] = True
+
+        for n in cat_names:
+            # Exactly one slide carries the column (not split across slides).
+            assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
+            # That single slide also holds its table AND its donut picture.
+            assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
+
+
 def test_edge_sin_categoricas_devuelve_none():
     only_numeric = {
         "n_rows": 10, "columns": [
diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py
index cc0171f4..7a813945 100644
--- a/python/functions/datascience/automatic_eda/render_pptx_impl.py
+++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py
@@ -729,6 +729,77 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
                         if isinstance(cur, (int, float)) and cur > 0 else per)
 
 
+# Minimum height (inches) reserved for a figure inside a keep-together group on
+# the short 16:9 slide. `_fit_group_blocks` subtracts it from the table budget:
+# when a column's table(s) would otherwise leave no room, the data table is
+# trimmed (with an honest note) so the chart stays on the SAME slide.
+_GROUP_MIN_FIG_H = 1.3
+
+
+def _trim_data_table_to_budget(block, budget: float):
+    """Return a DataTable whose leading rows fit within ``budget`` inches.
+
+    When every row fits, the ORIGINAL block is returned as-is; otherwise a NEW
+    DataTable is built from header, title and the kept rows (at least one) whose
+    only note is the "top N de M" trim note. The input block is never mutated
+    (per the original author, the PDF renderer shares it and shows it in full)."""
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    title = getattr(block, "title", None)
+    fs = _FS_CELL
+    widths = _col_widths(header, rows)
+    fixed = 0.0
+    if title:
+        fixed += _measure_heading_text(title, 2)
+    if header:
+        fixed += _row_height_in(header, widths, fs)
+    note_h = tl.line_height_in(_FS_NOTE) + 0.05  # room kept for the trim note.
+    avail_rows = budget - fixed - note_h - _GAP
+    kept = []
+    used = 0.0
+    for r in rows:
+        rh = _row_height_in(r, widths, fs)
+        if used + rh > avail_rows and kept:  # `and kept`: always keep >= 1 row.
+            break
+        kept.append(r)
+        used += rh
+    if len(kept) >= len(rows):
+        return block  # nothing trimmed (or no rows): hand back the original.
+    note = (f"top {len(kept)} de {len(rows)} categorías mostradas "
+            "(recortado para caber en el slide; el PDF muestra más)")
+    return model.DataTable(header=header, rows=kept, title=title, note=note)
+
+
+def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list:
+    """Return the group's blocks with data tables trimmed to fit one slide.
+
+    Reserves ``_GROUP_MIN_FIG_H`` for the figure, subtracts the measured height
+    of every non-figure, non-table block from ``avail_full`` and trims the data
+    table(s), in order, to what is left (each gets at least 0.8 in). Returns the
+    input list itself when there is no figure+table pair or the tables already
+    fit next to a minimum-height figure; otherwise returns a new list."""
+    has_fig = any(getattr(b, "kind", "") in ("figure", "image") for b in blocks)
+    tbls = [b for b in blocks if getattr(b, "kind", "") == "data_table"]
+    if not (has_fig and tbls):
+        return blocks
+    fixed_h = sum(_measure_block(st, b) for b in blocks
+                  if getattr(b, "kind", "") not in ("figure", "image",
+                                                    "data_table"))
+    tables_h = sum(_measure_block(st, b) for b in tbls)
+    budget_tables = avail_full - fixed_h - _GROUP_MIN_FIG_H
+    if tables_h <= budget_tables:
+        return blocks  # already fits next to a min-height figure; leave intact.
+    out = []
+    for b in blocks:
+        if getattr(b, "kind", "") != "data_table":
+            out.append(b)
+            continue
+        trimmed = _trim_data_table_to_budget(b, max(budget_tables, 0.8))
+        out.append(trimmed)
+        budget_tables -= _measure_data_table(trimmed)  # later tables get the rest.
+    return out
+
+
 def _place_group(st: _PptxState, block) -> None:
     """Render a keep-together Group: move it whole to the next slide if needed."""
     blocks = getattr(block, "blocks", []) or []
@@ -739,6 +810,9 @@ def _place_group(st: _PptxState, block) -> None:
     if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
         _new_slide(st, cont=True)
     avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
+    # Trim oversized tables first (keeps the chart on the same slide), then shrink
+    # the figure to share the remaining room.
+    blocks = _fit_group_blocks(st, blocks, avail_full)
     _shrink_group_figures(st, blocks, avail_full)
     total = sum(_measure_block(st, b) for b in blocks)
     if total <= avail_full: