From 7158be8142cb95dcfc21eb8506bd14f4fd5e20e3 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 19:26:33 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20cat=5Fdistr=20una=20hoja=20por=20c?= =?UTF-8?q?olumna=20(gr=C3=A1fico=20incluido)=20+=20sin=20descripci=C3=B3n?= =?UTF-8?q?=20redundante=20con=20glosario?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cada columna categórica del capítulo CAT DISTR ocupa ahora su propia página (PDF) / slide (PPTX) con su gráfico junto a su tabla, y se elimina la explicación larga de la entropía que duplicaba el capítulo GLOSARIO. Cambios: - model.Group: nuevo campo aditivo `page_break_before` (default False). Cuando es True el renderer fuerza al grupo a empezar en página/slide nueva (salvo que la actual esté vacía). Comportamiento de todos los capítulos existentes intacto. Soportado también en el normalizador dict-defensivo `as_block`. - render_pdf_impl / render_pptx_impl `_place_group`: respetan `page_break_before`. - render_pdf_impl / render_pptx_impl `_measure_block`: medición fiel de KVTable y DataTable (replica `_place_*`: título-heading, wrap del valor/celdas por columna, nota). La estimación previa asumía una línea por fila e ignoraba el título, así que el keep-together infra-presupuestaba la figura y el gráfico se desbordaba a la página siguiente. Helpers `_measure_kv_table`/`_measure_data_table`. - render_pptx_impl `_shrink_group_figures`: umbrales más bajos (budget>0.6, per>0.35) para que en el slide corto 16:9 la figura se encoja y conviva con la tabla en lugar de partir la columna (misma filosofía keep-together del PDF). - cat_distr.py: - build envuelve cada columna en un `Group(page_break_before=idx>0)`: una columna por página/slide, con su tabla de cardinalidad, su top-k y su donut juntos. La primera comparte página con la intro para no dejar una casi vacía. - intro recortada: se elimina el párrafo que explicaba qué es la entropía (vive en el capítulo GLOSARIO, donde el término `[[term:entropia]]` enlaza); se conserva el término clicable y el total de filas de referencia. - `_cardinality_block`: métricas relacionadas agrupadas por fila (distintos·%· únicos; entropía bits·máx·norm; desbalance·longitud) sin perder ningún dato, para que tabla + gráfico quepan en el slide 16:9. - columnas id-like (≈100% distintas): se omite la top-k (sería una lista de valores únicos; la nota lo explica) y el donut ocupa ese hueco. - CHAPTER_VERSION 1.1.0 -> 1.2.0. Verificado con titanic (render_automatic_eda run_models=True): PDF 5 páginas y PPTX 5 slides del capítulo (intro + 1 por columna: Name, Sex, Ticket, Embarked), cada columna con su gráfico junto a su tabla, sin cortes. Suite verde (121 passed): pytest automatic_eda/ + render_automatic_eda_test.py. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../automatic_eda/chapters/cat_distr.py | 146 +++++++++++------- .../automatic_eda/chapters/cat_distr_test.py | 138 +++++++++++++---- .../datascience/automatic_eda/model.py | 11 +- .../automatic_eda/render_pdf_impl.py | 67 +++++++- .../automatic_eda/render_pptx_impl.py | 68 +++++++- 5 files changed, 335 insertions(+), 95 deletions(-) diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr.py b/python/functions/datascience/automatic_eda/chapters/cat_distr.py index 6421a574..b722c68a 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr.py @@ -1,19 +1,25 @@ """Categorical distributions chapter (CAT DISTR). -Third reference chapter for AutomaticEDA. For every categorical column it shows, -fulfilling the user's request: +Third reference chapter for AutomaticEDA. Each categorical column gets **its own +page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together +``model.Group`` with ``page_break_before=True`` (except the first, which may share +the intro's page), so its chart sits next to its tables and no column is split. -1. A short opening explanation of **Shannon entropy** (what it measures, its 0 - and log2(k) bounds, the normalized 0–1 version) and the dataset row total used - as a comparison baseline. -2. Per column, a cardinality key/value table: distinct values, ``% distinct`` - (distinct / total rows), total dataset rows, singleton values (frequency 1), - entropy with its theoretical maximum and the normalized ratio, mode, imbalance - and string-length stats. -3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a +A short intro names the clickable **[[term:entropia]]entropía[[/term]]** term — +the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline +here (one click jumps to the glossary entry). The intro also carries the dataset +row total used as a comparison baseline. + +Per column the Group contains, in order: + +1. A cardinality key/value table: distinct values, ``% distinct`` (distinct / + total rows), total dataset rows, singleton values (frequency 1), entropy with + its theoretical maximum and the normalized ratio, mode, imbalance and + string-length stats. +2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a single dominating category). -4. A ``top-k`` table (value / count / %). -5. A **donut pie chart** of the most common categories (top-k + an "Otros" +3. A ``top-k`` table (value / count / %). +4. A **donut pie chart** of the most common categories (top-k + an "Otros" bucket), drawn lazily so the renderers scale it to fit entirely. Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the @@ -33,7 +39,7 @@ import math from .. import model -CHAPTER_VERSION = "1.1.0" +CHAPTER_VERSION = "1.2.0" CHAPTER_ID = "cat_distr" CHAPTER_TITLE = "Distribuciones categóricas" @@ -53,11 +59,17 @@ _TERM_ENTROPIA_DEF = ( # Cap the number of categorical columns rendered to keep the document bounded; # the rest are summarized in a closing note (no silent truncation). MAX_COLS = 40 -# Rows shown in each top-k table and explicit slices in the pie. -TOP_TABLE_ROWS = 15 +# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so +# the whole column — cardinality table + top-k table + donut — fits on ONE +# page/slide with the chart next to its tables; the table note still reports +# "top N of M" so nothing is silently hidden. For id-like columns (≈100% +# distinct) the top-k table is dropped entirely (it would be a list of unique +# values — pure noise), which also frees the room the donut needs (see build). +TOP_TABLE_ROWS = 8 PIE_TOP_K = 6 -# Truncate very long category labels in tables (the renderer also wraps). -LABEL_MAX = 48 +# Truncate very long category labels in tables (the renderer also wraps). Kept +# tight so a column with long id-like values (names, tickets) still fits its page. +LABEL_MAX = 28 def _fmt_int(value) -> str: @@ -267,45 +279,55 @@ def _normalize_card(card: dict) -> dict: def _cardinality_block(card: dict): - """KVTable with the cardinality / entropy metrics for one column.""" + """KVTable with the cardinality / entropy metrics for one column. + + Related metrics are grouped onto a single row each (distinct/%/unique; + entropy bits/max/normalized; length min/mean/max) so the whole column — + table + chart — fits one page/slide without dropping any datum; the short + 16:9 PPTX slide does not fit one metric per row plus a chart otherwise.""" n_singletons = card.get("n_singletons") if n_singletons is not None and card.get("n_singletons_partial"): - singletons = f"≥{_fmt_int(n_singletons)} (en top mostrado)" + singletons = f"≥{_fmt_int(n_singletons)}" elif n_singletons is not None: singletons = _fmt_int(n_singletons) else: singletons = "—" - entropy_ref = _fmt_num(card.get("entropy")) - emax = card.get("entropy_max") - if emax is not None: - entropy_ref = f"{entropy_ref} (máx {_fmt_num(emax)})" + # Distinct count · % distinct · unique (frequency 1) on one row. + distinct_combo = (f"{_fmt_int(card.get('n_distinct'))} · " + f"{_fmt_pct_value(card.get('pct_distinct'))} · " + f"{singletons} únicos") + + # Entropy bits · theoretical max · normalized 0–1 on one row. + entropy_combo = (f"{_fmt_num(card.get('entropy'))} bits · " + f"máx {_fmt_num(card.get('entropy_max'))} · " + f"norm {_fmt_num(card.get('entropy_norm'))}") mode = card.get("mode") mode_pct = card.get("mode_pct") - mode_str = "—" if mode is None else model._safe_str(mode) + mode_str = "—" if mode is None else _truncate(mode, 32) if mode is not None and mode_pct is not None: mode_str = f"{mode_str} ({_fmt_pct_value(mode_pct)})" rows = [ - ("Valores distintos", _fmt_int(card.get("n_distinct"))), - ("% distintos", _fmt_pct_value(card.get("pct_distinct"))), + ("Distintos · % · únicos", distinct_combo), ("Total filas (dataset)", _fmt_int(card.get("n_rows"))), - ("Valores únicos (frecuencia 1)", singletons), - ("Entropía (bits)", entropy_ref), - ("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))), + ("Entropía (bits · máx · norm)", entropy_combo), ("Moda", mode_str), ] imbalance = card.get("imbalance") - if imbalance is not None: - rows.append(("Desbalance", _fmt_num(imbalance))) lm = card.get("len_min") lmean = card.get("len_mean") lmax = card.get("len_max") + # Imbalance and string length (both secondary) share one closing row. + extras = [] + if imbalance is not None: + extras.append(f"desbalance {_fmt_num(imbalance)}") if any(v is not None for v in (lm, lmean, lmax)): - rows.append(( - "Longitud (mín/media/máx)", - f"{_fmt_num(lm)} / {_fmt_num(lmean)} / {_fmt_num(lmax)}")) + extras.append( + f"long. {_fmt_num(lm)}/{_fmt_num(lmean)}/{_fmt_num(lmax)}") + if extras: + rows.append(("Desbalance · longitud", " · ".join(extras))) return model.KVTable(rows=rows, title="Cardinalidad") @@ -315,7 +337,8 @@ def _flag_note(card: dict): return model.Note( "Casi todos los valores son distintos (≈100% distintos): la columna " "se comporta como un identificador y aporta poco para agrupar o " - "comparar categorías.") + "comparar categorías. No se lista el top de categorías (serían " + "valores casi todos únicos).") if card.get("dominated"): mp = card.get("mode_pct") mp_str = _fmt_pct_value(mp) if mp is not None else "muy alta" @@ -335,7 +358,7 @@ def _topk_table(cat: dict): if not isinstance(t, dict): continue rows.append([ - model._safe_str(t.get("value")), + _truncate(t.get("value")), _fmt_int(t.get("count")), _pct_from_maybe_fraction(t.get("pct")), ]) @@ -353,20 +376,16 @@ def _topk_table(cat: dict): def _intro_blocks(n_rows, mark_term: bool = False): total = _fmt_int(n_rows) # Mark the first appearance of the term as a clickable glossary jump when the - # term was registered (mark_term). The visible text is identical either way. - entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term - else "**entropía de Shannon**") + # term was registered (mark_term). The full definition of entropy lives in the + # GLOSARIO chapter, so the intro only names the clickable term here instead of + # repeating the long explanation (avoids the redundancy with the glossary). + entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term + else "entropía") text = ( - f"La {entropia} mide cómo de repartidos están los valores de " - "una columna categórica, en bits. Vale 0 cuando una sola categoría " - "concentra todas las filas (máxima previsibilidad) y alcanza su máximo, " - "log2(k) para k categorías distintas, cuando todas aparecen por igual " - "(máxima diversidad). La **entropía normalizada** (entropía dividida por " - "su máximo) la lleva al rango 0–1 para comparar columnas con distinto " - "número de categorías. Para cada columna se muestran los valores " - "distintos, el porcentaje que representan sobre el total de filas, los " - "valores únicos (que aparecen una sola vez), la tabla de las categorías " - "más frecuentes y un gráfico de tarta (donut) de las más comunes." + f"Cada columna categórica ocupa su propia página: sus métricas de " + f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad " + "problemática, la tabla de las categorías más frecuentes y un gráfico de " + "tarta (donut) de las más comunes, todo junto." ) if n_rows is not None: text += f" El dataset tiene {total} filas en total como referencia." @@ -398,24 +417,37 @@ def build_cat_distr(profile: dict, ctx: dict): blocks = list(_intro_blocks(n_rows, mark_term=mark_term)) rendered = cat_cols[:MAX_COLS] - for col in rendered: + for idx, col in enumerate(rendered): name = col.get("name") or "(columna)" cat = col.get("categorical") or {} card = _normalize_card(_cardinality(cat, n_rows)) - blocks.append(model.Heading(text=str(name), level=2)) - blocks.append(_cardinality_block(card)) + # One Group per categorical column: heading + cardinality table + flag + # note + top-k table + donut figure are kept together and the renderer + # starts each on a fresh page/slide (page_break_before) so every column + # gets its own page with its chart next to its tables. The first column + # may share the intro's page (no forced break) to avoid a near-empty page. + col_blocks = [ + model.Heading(text=str(name), level=2), + _cardinality_block(card), + ] note = _flag_note(card) if note is not None: - blocks.append(note) - topk = _topk_table(cat) - if topk is not None: - blocks.append(topk) - blocks.append(model.Figure( + col_blocks.append(note) + # For id-like columns (≈100% distinct) the top-k is a list of unique + # values — pure noise; skip it (the flag note already explains why) and + # let the donut take that room so the whole column fits one page/slide. + if not card.get("id_like"): + topk = _topk_table(cat) + if topk is not None: + col_blocks.append(topk) + col_blocks.append(model.Figure( make=_pie_make(cat.get("top") or [], card.get("n_distinct"), str(name), n_rows), caption=(f"Categorías más comunes de «{_truncate(name, 32)}» " "(donut: top-k + «Otros»)"))) + blocks.append(model.Group(blocks=col_blocks, + page_break_before=(idx > 0))) if len(cat_cols) > len(rendered): omitted = len(cat_cols) - len(rendered) diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py index a061c67d..f26984a9 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py @@ -2,11 +2,14 @@ Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user -asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table -and a donut figure), that the chapter renders inside the full document to both -PDF and PPTX showing that content, that a profile with no categorical columns -yields ``None`` without raising, and that long labels / many columns are never -cut in either output. +asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut +figure), that EACH categorical column is wrapped in its own keep-together +``Group`` that starts on a fresh page/slide (one column per page, chart next to +its tables), that the long entropy explanation is NOT repeated inline (it lives +in the glossary — only the clickable term is kept), that the chapter renders +inside the full document to both PDF and PPTX showing that content, that a +profile with no categorical columns yields ``None`` without raising, and that +long labels / many columns are never cut in either output. """ import os @@ -17,7 +20,8 @@ from pypdf import PdfReader from pptx import Presentation from datascience.automatic_eda.model import ( - DataTable, Figure, Heading, KVTable, Note, + DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown, + Note, ) from datascience.automatic_eda.chapters.cat_distr import ( CHAPTER_ID, CHAPTER_VERSION, build_cat_distr, @@ -81,8 +85,20 @@ def _pptx_text(path: str) -> str: return re.sub(r"\s+", " ", " ".join(parts)) -def _kinds(chapter): - return [b.kind for b in chapter.blocks] +def _flatten(blocks): + """Expand keep-together Groups so the per-column heading/table/figure are + inspectable as a flat block list (the chapter wraps each column in a Group).""" + out = [] + for b in blocks: + if getattr(b, "kind", "") == "group": + out.extend(_flatten(getattr(b, "blocks", []) or [])) + else: + out.append(b) + return out + + +def _column_groups(chapter): + return [b for b in chapter.blocks if isinstance(b, Group)] def test_golden_build_cat_distr_emite_bloques_pedidos(): @@ -90,36 +106,101 @@ def test_golden_build_cat_distr_emite_bloques_pedidos(): assert ch is not None assert ch.id == CHAPTER_ID assert ch.version == CHAPTER_VERSION - kinds = _kinds(ch) - # Entropy intro present. + + # Entropy intro present, but the long explanation is gone (it lives in the + # glossary now): only the term is named, no log2/normalizada walkthrough. headings = [b.text for b in ch.blocks if isinstance(b, Heading)] assert any("Entrop" in h for h in headings) - md = next(b for b in ch.blocks if b.kind == "markdown") - assert "entropía" in md.text.lower() and "log2" in md.text - # Cardinality metrics: distinct, total rows, %-distinct, unique values. - kv = next(b for b in ch.blocks if isinstance(b, KVTable)) + md = next(b for b in ch.blocks if isinstance(b, Markdown)) + assert "entropía" in md.text.lower() + assert "log2" not in md.text # redundant explanation removed. + assert "máxima diversidad" not in md.text + + # Per-column blocks are wrapped in keep-together Groups: flatten to inspect. + flat = _flatten(ch.blocks) + kv = next(b for b in flat if isinstance(b, KVTable)) labels = [r[0] for r in kv.rows] - assert "Valores distintos" in labels - assert "% distintos" in labels + values = " ".join(str(r[1]) for r in kv.rows) + # Cardinality metrics: distinct count, %-distinct, unique values and total + # rows are present (grouped onto compact rows so the chart fits the page). + assert "Distintos · % · únicos" in labels assert "Total filas (dataset)" in labels - assert "Valores únicos (frecuencia 1)" in labels assert any("Entropía" in lbl for lbl in labels) + assert "únicos" in values and "%" in values + assert "bits" in values and "norm" in values # entropy + max + normalized. # Top-k table + pie figure. - dt = next(b for b in ch.blocks if isinstance(b, DataTable)) + dt = next(b for b in flat if isinstance(b, DataTable)) assert dt.header == ["Valor", "Conteo", "%"] assert any("neumaticos" in str(cell) for row in dt.rows for cell in row) - assert any(isinstance(b, Figure) for b in ch.blocks) - # id-like column flagged with a Note. - assert any(isinstance(b, Note) and "identificador" in b.text - for b in ch.blocks) + assert any(isinstance(b, Figure) for b in flat) + # id-like column flagged with a Note that also explains the top-k is dropped. + idnote = next((b for b in flat + if isinstance(b, Note) and "identificador" in b.text), None) + assert idnote is not None + assert "No se lista el top" in idnote.text -def test_golden_render_pdf_muestra_categoricas(): +def test_golden_idlike_omite_topk_y_conserva_donut(): + # The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable + # (it would be a list of unique values), but must still keep its donut Figure + # and its cardinality table so it stays a full per-column page. + ch = build_cat_distr(_profile(), {}) + groups = _column_groups(ch) + uuid_group = next(g for g in groups + if any(getattr(b, "text", "") == "uuid" for b in g.blocks)) + kinds = [b.kind for b in uuid_group.blocks] + assert "data_table" not in kinds # top-k of unique values dropped. + assert "kv_table" in kinds # cardinality kept. + assert "figure" in kinds # donut kept (chart per column). + # A non-id-like column keeps its top-k table. + cat_group = next(g for g in groups + if any(getattr(b, "text", "") == "categoria" + for b in g.blocks)) + assert "data_table" in [b.kind for b in cat_group.blocks] + + +def test_golden_una_pagina_por_columna_groups(): + ch = build_cat_distr(_profile(), {}) + groups = _column_groups(ch) + # Two categorical columns -> two column Groups (numeric column excluded). + assert len(groups) == 2 + # Each Group carries one column: a heading + its cardinality table + figure. + for g in groups: + kinds = [b.kind for b in g.blocks] + assert kinds[0] == "heading" + assert "kv_table" in kinds + assert "figure" in kinds + # The first column may share the intro page (no forced break); every later + # column starts on a fresh page/slide so each column gets its own page. + assert groups[0].page_break_before is False + assert all(g.page_break_before is True for g in groups[1:]) + + +def test_golden_entropia_clicable_y_definicion_en_glosario(): + # With a glossary collector the intro marks the clickable term and the FULL + # definition (the long explanation removed from the intro) lands in the + # glossary, not inline — no data lost, just relocated. + gc = GlossaryCollector() + ch = build_cat_distr(_profile(), {"glossary": gc}) + md = next(b for b in ch.blocks if isinstance(b, Markdown)) + assert "[[term:entropia]]entropía[[/term]]" in md.text + assert gc.has("entropia") + entry = gc.get("entropia") + assert entry is not None + # The definition kept in the glossary still carries the detail removed inline. + assert "log2" in entry["definition"] + assert "normalizada" in entry["definition"].lower() + + +def test_golden_render_pdf_una_pagina_por_columna(): with tempfile.TemporaryDirectory() as d: out = os.path.join(d, "eda.pdf") res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"}) assert res["path"] == out and os.path.exists(out) - assert CHAPTER_ID in [c["id"] for c in res["chapters"]] + cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID) + # Two categorical columns, each on its own page -> >= 2 pages for the + # chapter (intro shares the first column's page). + assert cat_meta["n_pages"] >= 2 txt = _pdf_text(out) assert "Entrop" in txt assert "distintos" in txt @@ -133,7 +214,8 @@ def test_golden_render_pptx_muestra_categoricas(): out = os.path.join(d, "eda.pptx") res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"}) assert res["path"] == out and os.path.exists(out) - assert CHAPTER_ID in [c["id"] for c in res["chapters"]] + cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID) + assert cat_meta["n_slides"] >= 2 # one slide per categorical column. txt = _pptx_text(out) assert "Entrop" in txt assert "categoria" in txt and "neumaticos" in txt @@ -170,11 +252,15 @@ def test_anti_corte_label_largo_y_muchas_columnas(): ch = build_cat_distr(profile, {}) assert ch is not None + # One Group per column, each forcing its own page (except the first). + groups = _column_groups(ch) + assert len(groups) == 30 + assert sum(1 for g in groups if g.page_break_before) == 29 with tempfile.TemporaryDirectory() as d: pdf = os.path.join(d, "anti.pdf") res = render_automatic_eda_pdf(profile, pdf, {"write_manifest": False}) assert res["path"] == pdf - assert res["n_pages"] > 1 # many columns spilled across pages, OK. + assert res["n_pages"] > 1 # one page per column, OK. txt = _pdf_text(pdf) # Long label wrapped (not truncated): every word survives. for word in ("Lorem", "incididunt", "reprehenderit", "voluptate"): diff --git a/python/functions/datascience/automatic_eda/model.py b/python/functions/datascience/automatic_eda/model.py index 53c41377..7237df0b 100644 --- a/python/functions/datascience/automatic_eda/model.py +++ b/python/functions/datascience/automatic_eda/model.py @@ -139,10 +139,17 @@ class Group: it starts on a fresh page and flows (honest degradation, never cut). Use it to bind ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the DISTR NUM / AGREGACION chapters). + + When ``page_break_before`` is True the renderer additionally forces the group + to *start* on a fresh page/slide (unless the current one is already empty), so + a chapter can give each unit its own page — e.g. one categorical column per + page (see CAT DISTR). It is purely additive: the default False keeps the plain + keep-together behaviour for every existing chapter. """ blocks: list = field(default_factory=list) title: Optional[str] = None + page_break_before: bool = False kind: str = field(default="group", init=False) @@ -228,7 +235,9 @@ def as_block(obj: Any): return Note(text=_safe_str(obj.get("text"))) if cls is Group: return Group(blocks=as_blocks(obj.get("blocks")), - title=obj.get("title")) + title=obj.get("title"), + page_break_before=bool( + obj.get("page_break_before", False))) if cls is GlossaryEntry: return GlossaryEntry(key=_safe_str(obj.get("key")), label=_safe_str(obj.get("label")), diff --git a/python/functions/datascience/automatic_eda/render_pdf_impl.py b/python/functions/datascience/automatic_eda/render_pdf_impl.py index ffe9a349..06adea4b 100644 --- a/python/functions/datascience/automatic_eda/render_pdf_impl.py +++ b/python/functions/datascience/automatic_eda/render_pdf_impl.py @@ -675,6 +675,61 @@ def _measure_figure_like(block) -> float: return target_h + 0.04 + cap_h + _GAP +def _measure_kv_table(block) -> float: + """Faithful height of a KVTable — matches ``_place_kv_table``. + + Counts the optional title heading and, per row, the wrapped VALUE column + (the label column never wraps in the placer). The previous estimate assumed + one line per row and ignored the title, so a column's keep-together Group + under-budgeted the figure and the chart spilled to the next page. Keep this in + sync with ``_place_kv_table``.""" + h = 0.0 + title = getattr(block, "title", None) + if title: + h += _measure_heading_text(title, 2) + rows = getattr(block, "rows", []) or [] + key_w = 1.9 + val_chars = tl.chars_per_line(_USABLE_W - key_w - 0.1, _FS_BODY) + lh = tl.line_height_in(_FS_BODY) + for row in rows: + try: + value = row[1] + except Exception: # noqa: BLE001 + value = "" + v_lines = tl.wrap(model._safe_str(value), val_chars) + h += lh * len(v_lines) + _ROW_VPAD + return h + _GAP + + +def _measure_data_table(block) -> float: + """Faithful height of a DataTable — matches ``_place_data_table``. + + Counts the optional title heading, the wrapped header row, every wrapped data + row (per-column wrap via the same ``_col_widths``/``_wrap_row`` the placer + uses) and the optional note. Keep this in sync with ``_place_data_table``.""" + h = 0.0 + title = getattr(block, "title", None) + if title: + h += _measure_heading_text(title, 2) + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + fs = _FS_CELL + widths = _col_widths(header, rows, fs) + lh = tl.line_height_in(fs) + if header: + header_lines = _wrap_row(header, widths, fs) + h += lh * max((len(c) for c in header_lines), default=1) + _ROW_VPAD * 2 + for r in rows: + cells_lines = _wrap_row(r, widths, fs) + h += lh * max((len(c) for c in cells_lines), default=1) + _ROW_VPAD * 2 + note = getattr(block, "note", None) + if note: + nlines = tl.wrap(model._safe_str(note), + tl.chars_per_line(_USABLE_W, _FS_NOTE)) + h += tl.line_height_in(_FS_NOTE) * len(nlines) + return h + _GAP + + def _measure_block(st: _PdfState, block) -> float: kind = getattr(block, "kind", "") try: @@ -690,13 +745,9 @@ def _measure_block(st: _PdfState, block) -> float: tl.chars_per_line(_USABLE_W, _FS_NOTE)) return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP if kind == "kv_table": - rows = getattr(block, "rows", []) or [] - return (tl.line_height_in(_FS_BODY) + _ROW_VPAD) * (len(rows) + 1) \ - + _GAP + return _measure_kv_table(block) if kind == "data_table": - rows = getattr(block, "rows", []) or [] - return (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) \ - * (len(rows) + 1) + _GAP + return _measure_data_table(block) if kind == "group": return sum(_measure_block(st, b) for b in (getattr(block, "blocks", []) or [])) @@ -735,6 +786,10 @@ def _place_group(st: _PdfState, block) -> None: blocks = getattr(block, "blocks", []) or [] if not blocks: return + # Opt-in page break: start this group on a fresh page unless the current one + # is still empty (so a chapter can give each unit its own page). + if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6: + _new_page(st) avail_full = _CONTENT_BOTTOM - _CONTENT_TOP _shrink_group_figures(st, blocks, avail_full) total = sum(_measure_block(st, b) for b in blocks) diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py index 5e3ba331..cc0171f4 100644 --- a/python/functions/datascience/automatic_eda/render_pptx_impl.py +++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py @@ -625,6 +625,55 @@ def _measure_figure_like(block) -> float: return target_h + 0.05 + cap_h + _GAP +def _measure_kv_table(block) -> float: + """Faithful KVTable height — matches ``_place_kv_table`` (rendered as a + Campo/Valor data table with wrapped cells). The previous estimate assumed one + line per row and ignored the title, so a keep-together Group under-budgeted + the figure and the chart spilled to the next slide. Keep in sync.""" + h = 0.0 + title = getattr(block, "title", None) + if title: + h += _measure_heading_text(title, 2) + rows = getattr(block, "rows", []) or [] + data_rows = [] + for row in rows: + try: + label, value = row[0], row[1] + except Exception: # noqa: BLE001 + label, value = str(row), "" + data_rows.append([model._safe_str(label), model._safe_str(value)]) + header = ["Campo", "Valor"] + widths = _col_widths(header, data_rows) + fs = _FS_CELL + h += _row_height_in(header, widths, fs) + for r in data_rows: + h += _row_height_in(r, widths, fs) + return h + _GAP + + +def _measure_data_table(block) -> float: + """Faithful DataTable height — matches ``_place_data_table`` (title heading + + wrapped header + every wrapped row + optional note). Keep in sync.""" + h = 0.0 + title = getattr(block, "title", None) + if title: + h += _measure_heading_text(title, 2) + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + fs = _FS_CELL + widths = _col_widths(header, rows) + if header: + h += _row_height_in(header, widths, fs) + for r in rows: + h += _row_height_in(r, widths, fs) + note = getattr(block, "note", None) + if note: + nlines = tl.wrap(model._safe_str(note), + tl.chars_per_line(_USABLE_W, _FS_NOTE)) + h += tl.line_height_in(_FS_NOTE) * len(nlines) + 0.05 + return h + _GAP + + def _measure_block(st: _PptxState, block) -> float: kind = getattr(block, "kind", "") try: @@ -639,9 +688,10 @@ def _measure_block(st: _PptxState, block) -> float: lines = tl.wrap(getattr(block, "text", ""), tl.chars_per_line(_USABLE_W, _FS_NOTE)) return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP - if kind in ("kv_table", "data_table"): - rows = getattr(block, "rows", []) or [] - return (tl.line_height_in(_FS_CELL) + 0.10) * (len(rows) + 1) + _GAP + if kind == "kv_table": + return _measure_kv_table(block) + if kind == "data_table": + return _measure_data_table(block) if kind == "group": return sum(_measure_block(st, b) for b in (getattr(block, "blocks", []) or [])) @@ -664,10 +714,14 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No if getattr(b, "kind", "") not in ("figure", "image")) fig_overhead = tl.line_height_in(_FS_NOTE) + 0.05 + 0.05 + _GAP budget = avail_full - nonfig_h - 0.10 * len(fig_blocks) - if budget <= 1.0: + # Low thresholds: a 16:9 slide is short, so a content-heavy column (cardinality + # table + top-k + chart) only fits if the chart is allowed to shrink small. + # Prefer a small-but-present chart on the SAME slide over splitting the column + # across slides (matches the PDF renderer's keep-together philosophy). + if budget <= 0.6: return # not enough room to keep together; let it flow (degrade). per = budget / len(fig_blocks) - fig_overhead - if per <= 0.8: + if per <= 0.35: return for fb in fig_blocks: cur = getattr(fb, "height_in", None) @@ -680,6 +734,10 @@ def _place_group(st: _PptxState, block) -> None: blocks = getattr(block, "blocks", []) or [] if not blocks: return + # Opt-in slide break: start this group on a fresh slide unless the current one + # is still empty (so a chapter can give each unit its own slide). + if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6: + _new_slide(st, cont=True) avail_full = _CONTENT_BOTTOM - _CONTENT_TOP _shrink_group_figures(st, blocks, avail_full) total = sum(_measure_block(st, b) for b in blocks)