From 833597c831fac49c2a6e2f1e0ea61268a8947a5f Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 19:45:09 +0200 Subject: [PATCH] =?UTF-8?q?fix(eda):=20cat=5Fdistr=20PPTX=20=E2=80=94=20co?= =?UTF-8?q?lumnas=20de=20alta=20cardinalidad=20caben=20en=20UN=20slide=20c?= =?UTF-8?q?on=20su=20gr=C3=A1fico?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit La verificación adversarial detectó que, en PPTX (slide 16:9, corto), las columnas categóricas de ALTA cardinalidad NO id-like (Ticket, Cabin) ocupaban 3 slides cada una con el donut SEPARADO de su tabla: el top-k de 8 filas largas no cabía junto al donut y el keep-together partía la columna. (El PDF, en A5, ya estaba 1:1 correcto.) Arreglo SOLO en render_pptx_impl.py: - `_fit_group_blocks` (nuevo): para un Group con figura + DataTable que no cabe en el slide, reserva un alto mínimo para el donut (`_GROUP_MIN_FIG_H`) y recorta las filas de la DataTable a lo que queda, de modo que el gráfico se queda en el MISMO slide, junto a su tabla. No-op cuando ya cabe o no hay par figura+tabla (p.ej. columnas id-like, que ya omiten la top-k). - `_trim_data_table_to_budget` (nuevo): devuelve una COPIA de la DataTable con las filas que caben (al menos una) + nota honesta "top N de M categorías mostradas (recortado para caber en el slide; el PDF muestra más)". NUNCA muta el bloque original, que es compartido con el renderer PDF (el PDF sigue mostrando la tabla completa en A5). - `_place_group`: aplica `_fit_group_blocks` antes de `_shrink_group_figures`. Refuerzo de cat_distr_test.py: - `test_golden_pptx_una_slide_por_columna_con_su_grafico`: perfil con una columna categórica de alta cardinalidad no-id-like (40 valores largos sobre 5000 filas, 0.8% distinto) que reproduce el caso Ticket/Cabin. Asierta que CADA columna categórica aparece en EXACTAMENTE UN slide del capítulo y que ese mismo slide lleva su tabla (Cardinalidad/distintos) Y su donut (caption + shape Picture) — el gráfico nunca se separa de su tabla. Sustituye al laxo `n_slides >= 2`. Verificado con titanic_train.csv (render_automatic_eda run_models=True): 5 columnas categóricas (Name, Sex, Ticket, Cabin, Embarked); PDF 6 páginas y PPTX 6 slides del capítulo (intro + 1 por columna), cada columna con su donut junto a su tabla en una sola página/slide. Ticket y Cabin pasaron de 3 slides a 1. Suite verde (122 passed). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../automatic_eda/chapters/cat_distr_test.py | 77 +++++++++++++++++++ .../automatic_eda/render_pptx_impl.py | 74 ++++++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py index f26984a9..919b86fa 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py @@ -222,6 +222,83 @@ def test_golden_render_pptx_muestra_categoricas(): assert "distintos" in txt +def _profile_high_card() -> dict: + """Profile with a high-cardinality NON-id-like categorical column whose top-k + of long values would split from its donut on a short 16:9 slide unless the + renderer trims the table — the exact case the adversarial check flagged + (Ticket / Cabin).""" + long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra" + for i in range(40)] + top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0} + for i, v in enumerate(long_vals)] + return { + "table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3, + "quality_score": 80.0, + "columns": [ + {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0, + "numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0, + "std": 0.5}}, + # 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps + # its (long) top-k table; the tall table must not push the donut off. + {"name": "alta_card_col", "inferred_type": "categorical", + "null_pct": 0.0, "distinct_count": 40, + "categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40, + "entropy": 5.2, "imbalance": 1.2, "len_min": 40, + "len_mean": 45, "len_max": 50}}, + {"name": "baja_card_col", "inferred_type": "categorical", + "null_pct": 0.0, "distinct_count": 4, + "categorical": { + "top": [{"value": "norte", "count": 2000, "pct": 0.4}, + {"value": "sur", "count": 1500, "pct": 0.3}, + {"value": "este", "count": 1000, "pct": 0.2}, + {"value": "oeste", "count": 500, "pct": 0.1}], + "mode": "norte", "n_distinct": 4, "entropy": 1.8}}, + ], + } + + +def test_golden_pptx_una_slide_por_columna_con_su_grafico(): + """Each categorical column occupies EXACTLY ONE cat_distr slide that carries + BOTH its cardinality table and its donut figure (picture) — i.e. the chart is + never separated from its table, even for a high-cardinality column.""" + from pptx.enum.shapes import MSO_SHAPE_TYPE + + prof = _profile_high_card() + cat_names = ["alta_card_col", "baja_card_col"] + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pptx") + res = render_automatic_eda_pptx(prof, out, {"title": "EDA"}) + assert res["path"] == out and os.path.exists(out) + prs = Presentation(out) + + # Per column: the cat_distr slides whose text mentions it, and whether the + # owning slide also has the donut caption + an actual picture shape. + slides_with_col = {n: [] for n in cat_names} + owner_has_chart = {n: False for n in cat_names} + for i, sl in enumerate(prs.slides): + texts, has_pic = [], False + for sh in sl.shapes: + if sh.has_text_frame: + texts.append(sh.text_frame.text) + if sh.shape_type == MSO_SHAPE_TYPE.PICTURE: + has_pic = True + txt = re.sub(r"\s+", " ", " ".join(texts)) + if "Distribuciones categ" not in txt: # footer stamp of the chapter. + continue + for n in cat_names: + if n in txt: + slides_with_col[n].append(i) + has_table = "Cardinalidad" in txt or "distintos" in txt + if has_pic and "donut" in txt and has_table: + owner_has_chart[n] = True + + for n in cat_names: + # Exactly one slide carries the column (not split across slides). + assert len(slides_with_col[n]) == 1, (n, slides_with_col[n]) + # That single slide also holds its table AND its donut picture. + assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide") + + def test_edge_sin_categoricas_devuelve_none(): only_numeric = { "n_rows": 10, "columns": [ diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py index cc0171f4..7a813945 100644 --- a/python/functions/datascience/automatic_eda/render_pptx_impl.py +++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py @@ -729,6 +729,77 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No if isinstance(cur, (int, float)) and cur > 0 else per) +# Minimum height (inches) reserved for a figure inside a keep-together group on +# the short 16:9 slide. When a high-cardinality column's table(s) would otherwise +# leave no room, the data table is trimmed (with an honest note) so the chart +# stays on the SAME slide next to its table instead of spilling to the next one. +_GROUP_MIN_FIG_H = 1.3 + + +def _trim_data_table_to_budget(block, budget: float): + """Return a copy of a DataTable whose rows fit within ``budget`` inches. + + Keeps the title, header, as many leading rows as fit (at least one) and an + honest note reporting how many of the original rows are shown. NEVER mutates + the original block — the same Chapter blocks are rendered by the PDF renderer, + which keeps the full table (an A5 page fits it).""" + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + title = getattr(block, "title", None) + fs = _FS_CELL + widths = _col_widths(header, rows) + fixed = 0.0 + if title: + fixed += _measure_heading_text(title, 2) + if header: + fixed += _row_height_in(header, widths, fs) + note_h = tl.line_height_in(_FS_NOTE) + 0.05 + avail_rows = budget - fixed - note_h - _GAP + kept = [] + used = 0.0 + for r in rows: + rh = _row_height_in(r, widths, fs) + if used + rh > avail_rows and kept: + break + kept.append(r) + used += rh + if len(kept) >= len(rows): + return block # already fits; keep the original (with its own note). + note = (f"top {len(kept)} de {len(rows)} categorías mostradas " + "(recortado para caber en el slide; el PDF muestra más)") + return model.DataTable(header=header, rows=kept, title=title, note=note) + + +def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list: + """Return a slide-fitting copy of a keep-together group's blocks. + + On the short 16:9 slide a high-cardinality column's top-k table plus its + chart can overflow. Reserve ``_GROUP_MIN_FIG_H`` for the (later shrunk) figure + and trim the data table(s) to what is left, so every column keeps its chart + next to its table on ONE slide. No-op when the group has no figure+table pair + (e.g. id-like columns already drop the top-k upstream, or it already fits).""" + has_fig = any(getattr(b, "kind", "") in ("figure", "image") for b in blocks) + tbls = [b for b in blocks if getattr(b, "kind", "") == "data_table"] + if not (has_fig and tbls): + return blocks + fixed_h = sum(_measure_block(st, b) for b in blocks + if getattr(b, "kind", "") not in ("figure", "image", + "data_table")) + tables_h = sum(_measure_block(st, b) for b in tbls) + budget_tables = avail_full - fixed_h - _GROUP_MIN_FIG_H + if tables_h <= budget_tables: + return blocks # already fits next to a min-height figure; leave intact. + out = [] + for b in blocks: + if getattr(b, "kind", "") != "data_table": + out.append(b) + continue + trimmed = _trim_data_table_to_budget(b, max(budget_tables, 0.8)) + out.append(trimmed) + budget_tables -= _measure_data_table(trimmed) + return out + + def _place_group(st: _PptxState, block) -> None: """Render a keep-together Group: move it whole to the next slide if needed.""" blocks = getattr(block, "blocks", []) or [] @@ -739,6 +810,9 @@ def _place_group(st: _PptxState, block) -> None: if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6: _new_slide(st, cont=True) avail_full = _CONTENT_BOTTOM - _CONTENT_TOP + # Trim oversized tables first (keeps the chart on the same slide), then shrink + # the figure to share the remaining room. + blocks = _fit_group_blocks(st, blocks, avail_full) _shrink_group_figures(st, blocks, avail_full) total = sum(_measure_block(st, b) for b in blocks) if total <= avail_full: