fix(eda): cat_distr PPTX — columnas de alta cardinalidad caben en UN slide con su gráfico
La verificación adversarial detectó que, en PPTX (slide 16:9, corto), las columnas categóricas de ALTA cardinalidad NO id-like (Ticket, Cabin) ocupaban 3 slides cada una con el donut SEPARADO de su tabla: el top-k de 8 filas largas no cabía junto al donut y el keep-together partía la columna. (El PDF, en A5, ya estaba 1:1 correcto.) Arreglo SOLO en render_pptx_impl.py: - `_fit_group_blocks` (nuevo): para un Group con figura + DataTable que no cabe en el slide, reserva un alto mínimo para el donut (`_GROUP_MIN_FIG_H`) y recorta las filas de la DataTable a lo que queda, de modo que el gráfico se queda en el MISMO slide, junto a su tabla. No-op cuando ya cabe o no hay par figura+tabla (p.ej. columnas id-like, que ya omiten la top-k). - `_trim_data_table_to_budget` (nuevo): devuelve una COPIA de la DataTable con las filas que caben (al menos una) + nota honesta "top N de M categorías mostradas (recortado para caber en el slide; el PDF muestra más)". NUNCA muta el bloque original, que es compartido con el renderer PDF (el PDF sigue mostrando la tabla completa en A5). - `_place_group`: aplica `_fit_group_blocks` antes de `_shrink_group_figures`. Refuerzo de cat_distr_test.py: - `test_golden_pptx_una_slide_por_columna_con_su_grafico`: perfil con una columna categórica de alta cardinalidad no-id-like (40 valores largos sobre 5000 filas, 0.8% distinto) que reproduce el caso Ticket/Cabin. Verifica que CADA columna categórica aparece en EXACTAMENTE UN slide del capítulo y que ese mismo slide lleva su tabla (Cardinalidad/distintos) Y su donut (caption + shape Picture) — el gráfico nunca se separa de su tabla. Sustituye al laxo `n_slides >= 2`. Verificado con titanic_train.csv (render_automatic_eda run_models=True): 5 columnas categóricas (Name, Sex, Ticket, Cabin, Embarked); PDF 6 páginas y PPTX 6 slides del capítulo (intro + 1 por columna), cada columna con su donut junto a su tabla en una sola página/slide. Ticket y Cabin pasaron de 3 slides a 1. Suite verde (122 passed). 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -222,6 +222,83 @@ def test_golden_render_pptx_muestra_categoricas():
|
|||||||
assert "distintos" in txt
|
assert "distintos" in txt
|
||||||
|
|
||||||
|
|
||||||
|
def _profile_high_card() -> dict:
|
||||||
|
"""Profile with a high-cardinality NON-id-like categorical column whose top-k
|
||||||
|
of long values would split from its donut on a short 16:9 slide unless the
|
||||||
|
renderer trims the table — the exact case the adversarial check flagged
|
||||||
|
(Ticket / Cabin)."""
|
||||||
|
long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra"
|
||||||
|
for i in range(40)]
|
||||||
|
top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0}
|
||||||
|
for i, v in enumerate(long_vals)]
|
||||||
|
return {
|
||||||
|
"table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3,
|
||||||
|
"quality_score": 80.0,
|
||||||
|
"columns": [
|
||||||
|
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
|
"numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
|
||||||
|
"std": 0.5}},
|
||||||
|
# 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps
|
||||||
|
# its (long) top-k table; the tall table must not push the donut off.
|
||||||
|
{"name": "alta_card_col", "inferred_type": "categorical",
|
||||||
|
"null_pct": 0.0, "distinct_count": 40,
|
||||||
|
"categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40,
|
||||||
|
"entropy": 5.2, "imbalance": 1.2, "len_min": 40,
|
||||||
|
"len_mean": 45, "len_max": 50}},
|
||||||
|
{"name": "baja_card_col", "inferred_type": "categorical",
|
||||||
|
"null_pct": 0.0, "distinct_count": 4,
|
||||||
|
"categorical": {
|
||||||
|
"top": [{"value": "norte", "count": 2000, "pct": 0.4},
|
||||||
|
{"value": "sur", "count": 1500, "pct": 0.3},
|
||||||
|
{"value": "este", "count": 1000, "pct": 0.2},
|
||||||
|
{"value": "oeste", "count": 500, "pct": 0.1}],
|
||||||
|
"mode": "norte", "n_distinct": 4, "entropy": 1.8}},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_pptx_una_slide_por_columna_con_su_grafico():
    """Check that every categorical column lives on exactly one chapter slide.

    Renders the high-cardinality fixture to PPTX and, for each categorical
    column, asserts that exactly one slide of the categorical-distributions
    chapter mentions it and that a slide mentioning it also carries the table
    text, the donut caption and a picture shape - i.e. the chart is not
    separated from its table, even for the high-cardinality column.
    """
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    profile = _profile_high_card()
    columns = ["alta_card_col", "baja_card_col"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = os.path.join(tmp_dir, "eda.pptx")
        result = render_automatic_eda_pptx(profile, target, {"title": "EDA"})
        assert result["path"] == target and os.path.exists(target)
        deck = Presentation(target)

    # For each column: indices of the chapter slides whose text mentions it,
    # and whether such a slide also has the donut caption + a picture shape.
    slides_with_col = {name: [] for name in columns}
    owner_has_chart = dict.fromkeys(columns, False)
    for idx, slide in enumerate(deck.slides):
        shapes = list(slide.shapes)
        fragments = [s.text_frame.text for s in shapes if s.has_text_frame]
        # Non-short-circuiting on purpose: every shape's type is inspected.
        has_pic = MSO_SHAPE_TYPE.PICTURE in [s.shape_type for s in shapes]
        flat = re.sub(r"\s+", " ", " ".join(fragments))
        if "Distribuciones categ" not in flat:  # footer stamp of the chapter.
            continue
        has_table = "Cardinalidad" in flat or "distintos" in flat
        complete = has_pic and "donut" in flat and has_table
        for name in columns:
            if name not in flat:
                continue
            slides_with_col[name].append(idx)
            if complete:
                owner_has_chart[name] = True

    for n in columns:
        # Exactly one slide carries the column (not split across slides).
        assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
        # That single slide also holds its table AND its donut picture.
        assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
|
||||||
|
|
||||||
|
|
||||||
def test_edge_sin_categoricas_devuelve_none():
|
def test_edge_sin_categoricas_devuelve_none():
|
||||||
only_numeric = {
|
only_numeric = {
|
||||||
"n_rows": 10, "columns": [
|
"n_rows": 10, "columns": [
|
||||||
|
|||||||
@@ -729,6 +729,77 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
|
|||||||
if isinstance(cur, (int, float)) and cur > 0 else per)
|
if isinstance(cur, (int, float)) and cur > 0 else per)
|
||||||
|
|
||||||
|
|
||||||
|
# Minimum height, in inches, reserved for a figure inside a keep-together group
# on the short 16:9 slide. When the table(s) of a high-cardinality column would
# otherwise leave no room for the figure, the data table is trimmed (with an
# honest note) so the chart stays on the SAME slide next to its table instead
# of spilling onto the next one.
_GROUP_MIN_FIG_H = 1.3
|
||||||
|
|
||||||
|
|
||||||
|
def _trim_data_table_to_budget(block, budget: float):
    """Return a DataTable limited to the leading rows that fit in ``budget``.

    The vertical budget (inches) is spent on the title (if any), the header
    row (if any), the note line and a gap; whatever remains is filled with
    leading rows, always keeping at least one. If every row fits, ``block``
    itself is returned unchanged (with its own note). Otherwise a NEW
    ``model.DataTable`` is built from copies of the header and rows, with a
    note stating how many of the original rows are shown.

    The input block is never mutated: per the original author's note, the same
    blocks are also rendered by the PDF renderer, which keeps the full table.
    """
    header = list(getattr(block, "header", []) or [])
    all_rows = list(getattr(block, "rows", []) or [])
    title = getattr(block, "title", None)
    col_w = _col_widths(header, all_rows)

    # Height consumed regardless of how many rows are kept.
    overhead = 0.0
    if title:
        overhead += _measure_heading_text(title, 2)
    if header:
        overhead += _row_height_in(header, col_w, _FS_CELL)
    note_height = tl.line_height_in(_FS_NOTE) + 0.05
    room = budget - overhead - note_height - _GAP

    shown = []
    consumed = 0.0
    for row in all_rows:
        row_height = _row_height_in(row, col_w, _FS_CELL)
        # The first row is always kept, even when it alone exceeds the room.
        if shown and consumed + row_height > room:
            break
        shown.append(row)
        consumed += row_height

    if len(shown) < len(all_rows):
        note = (f"top {len(shown)} de {len(all_rows)} categorías mostradas "
                "(recortado para caber en el slide; el PDF muestra más)")
        return model.DataTable(header=header, rows=shown, title=title,
                               note=note)
    return block  # already fits; keep the original (with its own note).
|
||||||
|
|
||||||
|
|
||||||
|
def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list:
    """Return the group's blocks with data tables trimmed to fit one slide.

    ``_GROUP_MIN_FIG_H`` inches are set aside for the figure and the measured
    height of every block that is neither a figure/image nor a data table is
    subtracted from ``avail_full``; the remainder is the budget shared by the
    data tables. When the tables already fit that budget - or the group lacks
    a figure or a data table - the input list is returned as is. Otherwise a
    new list is returned in which each data table has been passed through
    ``_trim_data_table_to_budget`` (with a floor of 0.8 on the budget handed
    to it), each table's measured height reducing the budget of the next.
    """
    def kind_of(item) -> str:
        return getattr(item, "kind", "")

    figure_kinds = ("figure", "image")
    group_has_figure = any(kind_of(b) in figure_kinds for b in blocks)
    tables = [b for b in blocks if kind_of(b) == "data_table"]
    if not group_has_figure or not tables:
        return blocks

    flexible_kinds = figure_kinds + ("data_table",)
    other_h = sum(_measure_block(st, b) for b in blocks
                  if kind_of(b) not in flexible_kinds)
    tables_h = sum(_measure_block(st, b) for b in tables)
    remaining = avail_full - other_h - _GROUP_MIN_FIG_H
    if tables_h <= remaining:
        return blocks  # already fits next to a min-height figure; leave intact.

    fitted = []
    for item in blocks:
        if kind_of(item) == "data_table":
            item = _trim_data_table_to_budget(item, max(remaining, 0.8))
            remaining -= _measure_data_table(item)
        fitted.append(item)
    return fitted
|
||||||
|
|
||||||
|
|
||||||
def _place_group(st: _PptxState, block) -> None:
|
def _place_group(st: _PptxState, block) -> None:
|
||||||
"""Render a keep-together Group: move it whole to the next slide if needed."""
|
"""Render a keep-together Group: move it whole to the next slide if needed."""
|
||||||
blocks = getattr(block, "blocks", []) or []
|
blocks = getattr(block, "blocks", []) or []
|
||||||
@@ -739,6 +810,9 @@ def _place_group(st: _PptxState, block) -> None:
|
|||||||
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
||||||
_new_slide(st, cont=True)
|
_new_slide(st, cont=True)
|
||||||
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
||||||
|
# Trim oversized tables first (keeps the chart on the same slide), then shrink
|
||||||
|
# the figure to share the remaining room.
|
||||||
|
blocks = _fit_group_blocks(st, blocks, avail_full)
|
||||||
_shrink_group_figures(st, blocks, avail_full)
|
_shrink_group_figures(st, blocks, avail_full)
|
||||||
total = sum(_measure_block(st, b) for b in blocks)
|
total = sum(_measure_block(st, b) for b in blocks)
|
||||||
if total <= avail_full:
|
if total <= avail_full:
|
||||||
|
|||||||
Reference in New Issue
Block a user