From a74a5a047f1006ff97c0122a627993bed1c94445 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Wed, 1 Jul 2026 01:34:21 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20render=20quality=20global=20?= =?UTF-8?q?=E2=80=94=20DPI=20220,=20tablas=20anchas=20como=20imagen,=20lay?= =?UTF-8?q?out=20side=5Fby=5Fside,=20=C3=ADndice=20clicable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mejoras transversales del motor AutomaticEDA (PDF + PPTX) sobre el modelo de bloques: 1. DPI alto global: toda figura/imagen embebida se rasteriza a 220 dpi (antes 150, y en PDF la página se guardaba a ~100 dpi re-rasterizando los imshow). En PDF se aplica savefig.dpi=220 a la página; el texto sigue vectorial y seleccionable. Permite ampliar en el móvil sin pixelar. Imagen embebida medida: ~1081px (antes ~492px). 2. Tabla ancha → imagen de alta resolución: cuando un DataTable tiene demasiadas columnas para ser legible como texto (criterio _table_fits_as_text), se dibuja entera como una imagen nítida (nueva función render_table_as_figure_py_datascience: cabecera sombreada + zebra) escalada para caber completa, de modo que el lector hace zoom y la lee sin perder datos. Las tablas que sí caben siguen como texto seleccionable / tabla nativa. Aplica en PDF y PPTX. El df.head de 19 columnas del dataset sintético ya no se corta: sale como imagen. 3. Group.layout: nuevo hint retrocompatible (default "stack"). "side_by_side" coloca la tabla a la izquierda (~55%) y la figura a la derecha (~45%) en la misma slide PPTX (cae a apilado si no hay par tabla+figura o no caben); en PDF se trata como "stack" (el ancho A5 móvil no admite dos columnas). Pensado para que el capítulo cat_distr ponga el gráfico al lado de la tabla en PPT. 4. Portada con índice clicable: la lista de capítulos pasa de "Este informe incluye..." (markdown) a un Heading "Índice" + un TocEntry por capítulo. 
El renderer registra el inicio de cada capítulo y cablea cada entrada como salto real (PDF: link GOTO PyMuPDF; PPTX: salto a slide nativo), reutilizando el mecanismo del glosario clicable. Modelo: Group gana `layout`; nuevo bloque TocEntry; normalizers y __init__ actualizados. Contrato: documentado en docs/automatic_eda_contract.md §11.4 (incluye el contrato exacto del campo layout para el agente de cat_distr). Tests: nuevo render_quality_test.py (13 golden: DPI alto real, tabla ancha→imagen PDF/PPTX, narrow→texto, side_by_side PPTX dos columnas / PDF apilado, índice clicable PDF+PPTX, retrocompatibilidad layout por defecto). render_features_test actualizado al índice nuevo. Suite: 188 passed (módulo) + 38 passed/1 skipped (acceptance + pipeline). Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/automatic_eda_contract.md | 64 +++- .../datascience/automatic_eda/__init__.py | 2 + .../automatic_eda/chapters/portada.py | 16 +- .../datascience/automatic_eda/model.py | 35 ++- .../automatic_eda/render_features_test.py | 15 +- .../automatic_eda/render_pdf_impl.py | 155 +++++++++- .../automatic_eda/render_pptx_impl.py | 264 ++++++++++++++-- .../automatic_eda/render_quality_test.py | 283 ++++++++++++++++++ .../datascience/render_table_as_figure.md | 121 ++++++++ .../datascience/render_table_as_figure.py | 241 +++++++++++++++ .../render_table_as_figure_test.py | 119 ++++++++ 11 files changed, 1272 insertions(+), 43 deletions(-) create mode 100644 python/functions/datascience/automatic_eda/render_quality_test.py create mode 100644 python/functions/datascience/render_table_as_figure.md create mode 100644 python/functions/datascience/render_table_as_figure.py create mode 100644 python/functions/datascience/render_table_as_figure_test.py diff --git a/docs/automatic_eda_contract.md b/docs/automatic_eda_contract.md index efd96fa9..e26e7e3a 100644 --- a/docs/automatic_eda_contract.md +++ b/docs/automatic_eda_contract.md @@ -41,12 +41,13 @@ reconocido se degrada a `Note`, nunca 
lanza). | `Heading(text, level=1)` | título de sección, `level` 1 (grande) … 3 (chico) | una o varias líneas en negrita; nivel 1 lleva subrayado de acento | | `Markdown(text)` | texto markdown ligero | ver subset abajo; **nunca corta a media línea** | | `KVTable(rows, title=None)` | `rows = [(clave, valor), ...]` | tabla de 2 columnas etiqueta/valor; el valor se envuelve | -| `DataTable(header, rows, title=None, note=None)` | `header=[...]`, `rows=[[...],...]` | tabla con cabecera; **se parte por filas repitiendo cabecera**; las celdas largas se envuelven dentro de su columna | +| `DataTable(header, rows, title=None, note=None)` | `header=[...]`, `rows=[[...],...]` | tabla con cabecera; **si cabe** como texto se parte por filas repitiendo cabecera; **si NO cabe** (demasiadas columnas) se rasteriza entera como imagen de alta resolución para hacer zoom. Ver §11.4 | | `Figure(fig=None, make=None, caption=None, height_in=None)` | una `matplotlib.figure.Figure` ya construida (`fig`) o un callable `make()->Figure` (perezoso) | se rasteriza y escala para caber entera (nunca recortada) | | `Image(path, caption=None, height_in=None)` | ruta a PNG/JPG | se escala para caber entera | | `Caption(text)` / `Note(text)` | texto auxiliar pequeño | pie/nota en gris; `Note` es además el fallback de lo desconocido | -| `Group(blocks, title=None)` | unidad **keep-together**: sus bloques se mantienen juntos | el renderer mide el grupo entero y lo mueve completo a la página/slide siguiente si no cabe; encoge la figura para dejar sitio al título+texto. Ver §11 | +| `Group(blocks, title=None, page_break_before=False, layout="stack")` | unidad **keep-together**: sus bloques se mantienen juntos | el renderer mide el grupo entero y lo mueve completo a la página/slide siguiente si no cabe; encoge la figura para dejar sitio al título+texto. `layout="side_by_side"` coloca tabla+figura en dos columnas (solo PPTX). 
Ver §11 y §11.4 | | `GlossaryEntry(key, label, definition)` | una entrada del glosario (destino clicable) | la genera el capítulo `glosario`; registra su posición como destino de los términos marcados. Ver §11 | +| `TocEntry(label, target_id)` | una entrada de **índice clicable** en la portada | la genera el capítulo `portada`; el renderer la cablea como salto al inicio del capítulo cuyo `id` o `title` coincide con `target_id`. Ver §11.4 | `Figure`/`Image` aceptan `height_in` (hint): el renderer **clampa** la figura a esa altura máxima (lo usa `Group` para encoger la figura). Toda figura escala dejando sitio a su caption en la misma página/slide; en PPTX el caption es **siempre** visible (si no se da `caption`, cae al último heading o a "Figura"). @@ -397,6 +398,65 @@ cabecera con su fondo propio. Es automático en PDF y PPTX; el patrón se mantie cuando una tabla larga se parte y repite cabecera (el índice de fila es lógico, no por página). No hay nada que hacer en los capítulos. +### 11.4 Calidad de render global: DPI alto, tabla ancha → imagen, figura al lado, índice clicable + +Cuatro capacidades transversales del motor, **todas automáticas salvo `layout`** (que un +capítulo activa explícitamente). Aplican a PDF y PPTX salvo donde se indique. + +**(a) DPI alto (automático).** Toda figura/imagen embebida se rasteriza a **220 dpi** +(constante `_RASTER_DPI` en ambos renderers; en PDF se aplica también al `savefig` de la +página, porque matplotlib re-rasteriza cada `imshow` al escribir la página). Objetivo: +ampliar en el móvil y leer detalle (ejes, celdas) sin pixelar. El texto sigue siendo +vectorial y seleccionable. No hay nada que hacer en los capítulos. 
+ +**(b) Tabla ancha → imagen de alta resolución (automático).** Cuando un `DataTable` tiene +**demasiadas columnas para ser legible como texto** en el ancho útil (criterio +`_table_fits_as_text`: ancho mínimo legible por columna × nº de columnas > ancho útil; en +la práctica salta sobre tablas tipo `df.head` con muchas columnas), en vez de comprimir las +columnas hasta hacerlas ilegibles, la tabla se dibuja **entera como una imagen de alta +resolución** (función `render_table_as_figure_py_datascience`: cabecera sombreada + zebra) +escalada para caber completa, de modo que el lector hace **zoom** y la lee sin perder datos. +Si la tabla **sí cabe**, se mantiene como texto seleccionable (PDF) / tabla nativa (PPTX). +Las `KVTable` (2 columnas) caben siempre y se quedan como texto. No hay nada que hacer en +los capítulos. + +**(c) Figura al lado de la tabla — `Group(layout="side_by_side")`.** Hint de layout que un +capítulo activa para que su **tabla quede a la izquierda y su figura a la derecha** en la +misma diapositiva, en lugar de apiladas: + +```python +model.Group( + layout="side_by_side", + blocks=[ + model.Heading(text=str(name), level=2), # va a ancho completo arriba + model.DataTable(header=..., rows=...), # columna IZQUIERDA (~55%) + model.Figure(make=_grafico_perezoso(...)), # columna DERECHA (~45%) + model.Markdown(text="explicación…"), # va a ancho completo abajo + ]) +``` + +Contrato exacto del campo: + +| Campo | Valor | Efecto | +|---|---|---| +| `layout` | `"stack"` (por defecto) | comportamiento histórico: apilado vertical (keep-together). | +| `layout` | `"side_by_side"` | **PPTX**: la tabla (rasterizada a imagen) ocupa la columna izquierda (~55% del ancho útil) y la figura la derecha (~45%); cualquier otro bloque (heading, markdown) va a ancho completo arriba/abajo. Si no hay un par tabla+figura, o no caben lado a lado en una slide, **cae automáticamente a apilado**. 
**PDF**: se trata **igual que `stack`** (el ancho A5 móvil no admite dos columnas legibles). Valores desconocidos degradan a `"stack"`. | + +Es **retrocompatible**: un `Group` sin `layout` (o `layout="stack"`) se comporta exactamente +como antes. El capítulo `cat_distr` es el consumidor previsto (gráfico a la derecha de la +tabla de categorías en PPT); este motor solo provee el soporte. + +**(d) Índice clicable en la portada — `TocEntry`.** La portada emite un `Heading("Índice")` +seguido de un `TocEntry(label, target_id)` por capítulo. El renderer registra la +página/slide de inicio de **cada** capítulo (indexado por `id` **y** por `title`) y cablea +cada `TocEntry` como un salto real a ese inicio: en **PDF** vía +`add_pdf_internal_links_py_datascience` (link GOTO de PyMuPDF), en **PPTX** vía +`pptx_link_run_to_slide_py_datascience` (salto a slide nativo). Como la portada solo conoce +los **títulos** de los capítulos, el `target_id` se hace coincidir contra el `title` (o el +`id`) de destino. Si un destino no resuelve, la entrada se muestra igualmente como texto +(en color de enlace), nunca se corta. Es el mismo mecanismo que los términos clicables del +glosario (§11.1), reutilizado en sentido portada → capítulo. + --- ## 10. 
Integración futura con `profile_table` (siguiente fase) diff --git a/python/functions/datascience/automatic_eda/__init__.py b/python/functions/datascience/automatic_eda/__init__.py index 01085313..658570c8 100644 --- a/python/functions/datascience/automatic_eda/__init__.py +++ b/python/functions/datascience/automatic_eda/__init__.py @@ -29,6 +29,7 @@ from .model import ( # noqa: F401 KVTable, Markdown, Note, + TocEntry, as_blocks, as_chapters, merge_manifest, @@ -52,6 +53,7 @@ __all__ = [ "Group", "GlossaryEntry", "GlossaryCollector", + "TocEntry", "Chapter", "as_blocks", "as_chapters", diff --git a/python/functions/datascience/automatic_eda/chapters/portada.py b/python/functions/datascience/automatic_eda/chapters/portada.py index b1a28366..eef1c84c 100644 --- a/python/functions/datascience/automatic_eda/chapters/portada.py +++ b/python/functions/datascience/automatic_eda/chapters/portada.py @@ -26,7 +26,7 @@ from datetime import datetime, timezone from .. import model -CHAPTER_VERSION = "1.3.0" +CHAPTER_VERSION = "1.4.0" CHAPTER_ID = "portada" CHAPTER_TITLE = "Portada" @@ -120,9 +120,17 @@ def _summary_blocks(summary) -> list: # Values pinned to the right margin (numbers flush right, label left). blocks.append(model.KVTable(rows=rows, value_align="right")) if titles: - bullets = "\n".join(f"- {model._safe_str(t)}" for t in titles) - blocks.append(model.Markdown( - text="Este informe incluye los siguientes capítulos:\n" + bullets)) + # Clickable index ("Índice"): one TocEntry per chapter title. Each entry + # becomes a real jump to that chapter's first page/slide once the document + # is laid out (the renderers register every chapter start and wire the + # links; ``target_id`` is matched against the chapter title). The cover only + # knows chapter titles, so the title doubles as the link target. 
+ blocks.append(model.Heading(text="Índice", level=2)) + for t in titles: + label = model._safe_str(t) + if not label: + continue + blocks.append(model.TocEntry(label=label, target_id=label)) return blocks diff --git a/python/functions/datascience/automatic_eda/model.py b/python/functions/datascience/automatic_eda/model.py index 9171652e..bc15c8bc 100644 --- a/python/functions/datascience/automatic_eda/model.py +++ b/python/functions/datascience/automatic_eda/model.py @@ -160,11 +160,21 @@ class Group: a chapter can give each unit its own page — e.g. one categorical column per page (see CAT DISTR). It is purely additive: the default False keeps the plain keep-together behaviour for every existing chapter. + + ``layout`` is a hint for how the group's children are arranged: + ``"stack"`` (default) keeps the historical top-to-bottom flow; ``"side_by_side"`` + asks the PPTX renderer to place the group's table to the LEFT and its figure to + the RIGHT of the same slide (table ~55% width, figure ~45%), measuring so both + fit and falling back to stacking when they do not. The PDF renderer treats + ``"side_by_side"`` exactly like ``"stack"`` (the A5 mobile page is too narrow for + two readable columns). Unknown values degrade to ``"stack"``. Purely additive: + the default keeps every existing chapter unchanged. """ blocks: list = field(default_factory=list) title: Optional[str] = None page_break_before: bool = False + layout: str = "stack" kind: str = field(default="group", init=False) @@ -183,6 +193,22 @@ class GlossaryEntry: kind: str = field(default="glossary_entry", init=False) +@dataclass +class TocEntry: + """One clickable index (table-of-contents) entry shown on the cover. + + Rendered as a single line — the chapter ``label`` in the accent link colour — + that, once the document is laid out, becomes a real click jumping to the first + page/slide of the target chapter (PDF link annotation via PyMuPDF; PPTX native + slide jump). 
``target_id`` is matched against each chapter's ``id`` *and* its + ``title`` (the cover only knows chapter titles), so either resolves. If the + target cannot be resolved the entry still renders as plain text (never cut).""" + + label: str = "" + target_id: str = "" + kind: str = field(default="toc_entry", init=False) + + @dataclass class Chapter: """An ordered set of blocks with an id, a title and a generation version.""" @@ -207,13 +233,14 @@ _BLOCK_BY_KIND = { "note": Note, "group": Group, "glossary_entry": GlossaryEntry, + "toc_entry": TocEntry, } def as_block(obj: Any): """Coerce a value into a block dataclass. Unknown values become a Note.""" if isinstance(obj, (Heading, Markdown, KVTable, DataTable, Figure, Image, - Caption, Note, Group, GlossaryEntry)): + Caption, Note, Group, GlossaryEntry, TocEntry)): if isinstance(obj, Group): obj.blocks = as_blocks(obj.blocks) return obj @@ -259,11 +286,15 @@ def as_block(obj: Any): return Group(blocks=as_blocks(obj.get("blocks")), title=obj.get("title"), page_break_before=bool( - obj.get("page_break_before", False))) + obj.get("page_break_before", False)), + layout=_safe_str(obj.get("layout")) or "stack") if cls is GlossaryEntry: return GlossaryEntry(key=_safe_str(obj.get("key")), label=_safe_str(obj.get("label")), definition=_safe_str(obj.get("definition"))) + if cls is TocEntry: + return TocEntry(label=_safe_str(obj.get("label")), + target_id=_safe_str(obj.get("target_id"))) except Exception: # noqa: BLE001 — never raise on a malformed block. 
return Note(text=_safe_str(obj)) return Note(text=_safe_str(obj)) diff --git a/python/functions/datascience/automatic_eda/render_features_test.py b/python/functions/datascience/automatic_eda/render_features_test.py index 40d247ba..5bb5a2a0 100644 --- a/python/functions/datascience/automatic_eda/render_features_test.py +++ b/python/functions/datascience/automatic_eda/render_features_test.py @@ -298,11 +298,16 @@ def test_cover_first_glossary_last_with_summary(): headings = [b.text for b in cover.blocks if b.kind == "heading"] assert any("Resumen" in h for h in headings), \ "la portada no incluye el resumen agregado" - # The summary reflects the body chapters (e.g. the numeric/categorical ones). - cover_text = " ".join( - b.text for b in cover.blocks if getattr(b, "kind", "") == "markdown") - assert "Distribuciones" in cover_text, \ - "el resumen de portada no menciona los capítulos del cuerpo" + # The index ("Índice") is now a clickable list of TocEntry blocks (one per + # body chapter), not a markdown bullet list. Verify both the heading and that + # the entries name the body chapters. + assert any("Índice" in h for h in headings), \ + "la portada no incluye la sección Índice" + toc_labels = " ".join( + getattr(b, "label", "") for b in cover.blocks + if getattr(b, "kind", "") == "toc_entry") + assert "Distribuciones" in toc_labels, \ + "el índice de portada no menciona los capítulos del cuerpo" # --------------------------------------------------------------------------- # diff --git a/python/functions/datascience/automatic_eda/render_pdf_impl.py b/python/functions/datascience/automatic_eda/render_pdf_impl.py index 30115de0..909a482a 100644 --- a/python/functions/datascience/automatic_eda/render_pdf_impl.py +++ b/python/functions/datascience/automatic_eda/render_pdf_impl.py @@ -46,11 +46,23 @@ _MUTED = "#8a8a8a" _RULE = "#cccccc" _HEAD_BG = "#eef3f6" +# Rasterization DPI for every embedded raster (figure/table image) AND for the +# page save itself. 
Raised from the old 150/default-100 to 220 so a reader can +# pinch-zoom on a phone and still see crisp detail (axis labels, table cells) +# without pixelation. Text stays vectorial (pdf.fonttype=42) so it remains +# selectable regardless of DPI — only the embedded images gain resolution. 220 is +# a deliberate balance: noticeably sharper than 150 while keeping the file size +# reasonable. ``savefig.dpi`` matters because matplotlib re-rasterizes each +# ``imshow`` when PdfPages writes the page; without it the final image would land +# at ~100 dpi no matter how sharp the intermediate PNG was. +_RASTER_DPI = 220 + _RC = { "font.size": 10, "font.family": "sans-serif", "figure.facecolor": "white", "savefig.facecolor": "white", + "savefig.dpi": _RASTER_DPI, "pdf.fonttype": 42, # embed TrueType — text stays selectable on mobile. } @@ -80,6 +92,10 @@ class _PdfState: # points (1/72") with a top-left origin — same convention as PyMuPDF. self.term_sources = [] # [{key, page, rect:[x0,y0,x1,y1]}] self.term_dests = {} # key -> {page, point:[x,y]} + # Clickable index (cover → chapter). Sources are the cover's TocEntry + # rects; chapter_starts maps a chapter id AND its title to its first page. + self.toc_sources = [] # [{target_id, page, rect:[x0,y0,x1,y1]}] + self.chapter_starts = {} # id|title -> {page, point:[x,y]} # --------------------------------------------------------------------------- # @@ -385,6 +401,57 @@ def _col_widths(header: list, rows: list, fs: float) -> list: return widths +# Minimal legible characters reserved per column when deciding whether a table +# can be shown as selectable text. Below this width per column the cells become +# unreadable, so the table is rasterized to a zoomable high-res image instead. +_MIN_LEGIBLE_CHARS = 8 + + +def _table_fits_as_text(header: list, rows: list) -> bool: + """True when the table fits the usable width as readable text. 
+ + A table whose columns cannot each get a minimal legible width within the A5 + usable width (typically many columns, e.g. a 19-column ``df.head``) is flagged + so it is rendered as a single high-resolution image — the reader zooms in on + the phone and reads every cell, nothing cut — instead of being squeezed until + unreadable. Narrow tables (few columns) keep the selectable-text rendering.""" + header = header or [] + rows = rows or [] + ncol = len(header) if header else (len(rows[0]) if rows else 1) + ncol = max(1, ncol) + cw = tl.avg_char_width_in(_FS_CELL) + min_needed = ncol * (_MIN_LEGIBLE_CHARS * cw + _CELL_PAD * 2) + return min_needed <= _USABLE_W + + +def _table_figure_block(block): + """Wrap a too-wide table as a lazily-rasterized Figure (cached on the block). + + The table is drawn once via ``render_table_as_figure`` (header shading + zebra) + and embedded as one high-res image scaled to fit entirely. The same Figure is + reused for measuring and placing so keep-together stays consistent. The table + title/note are drawn inside the image (self-describing when zoomed/shared), so + the block-level caption is left empty to avoid a duplicate title.""" + cached = getattr(block, "_aeda_tablefig", None) + if cached is not None: + return cached + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + title = getattr(block, "title", None) + note = getattr(block, "note", None) + + def _make(): + from datascience.render_table_as_figure import render_table_as_figure + return render_table_as_figure(header, rows, title=title, note=note) + + fig = model.Figure(make=_make, caption=None) + try: + block._aeda_tablefig = fig + except Exception: # noqa: BLE001 — block may reject attributes; degrade. 
+ pass + return fig + + def _wrap_row(cells: list, widths: list, fs: float) -> list: """Wrap each cell to its column width → list of line-lists per cell.""" out = [] @@ -424,11 +491,16 @@ def _draw_table_row(st: _PdfState, cells_lines: list, widths: list, fs: float, def _place_data_table(st: _PdfState, block) -> None: + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + # Too many columns to be legible as text → render the whole table as one + # high-res image, scaled to fit entirely (the reader zooms to read it). + if not _table_fits_as_text(header, rows): + _place_figure(st, _table_figure_block(block)) + return title = getattr(block, "title", None) if title: _place_heading(st, model.Heading(title, level=2)) - header = list(getattr(block, "header", []) or []) - rows = list(getattr(block, "rows", []) or []) fs = _FS_CELL widths = _col_widths(header, rows, fs) header_lines = _wrap_row(header, widths, fs) if header else None @@ -486,8 +558,11 @@ def _resolve_figure(block): def _png_from_figure(fig) -> bytes: + # ``bbox_inches='tight'`` is kept so the real aspect ratio is what we measure + # and place. The page save (savefig.dpi in _RC) re-rasterizes this at the same + # high DPI, so the embedded image stays crisp for phone zoom. buf = io.BytesIO() - fig.savefig(buf, format="png", dpi=150, bbox_inches="tight") + fig.savefig(buf, format="png", dpi=_RASTER_DPI, bbox_inches="tight") buf.seek(0) return buf.read() @@ -729,12 +804,16 @@ def _measure_data_table(block) -> float: Counts the optional title heading, the wrapped header row, every wrapped data row (per-column wrap via the same ``_col_widths``/``_wrap_row`` the placer uses) and the optional note. 
Keep this in sync with ``_place_data_table``.""" + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + # Mirror the placer: a too-wide table is drawn as a single image, so its + # keep-together height is the image's, not the (squeezed) text layout's. + if not _table_fits_as_text(header, rows): + return _measure_figure_like(_table_figure_block(block)) h = 0.0 title = getattr(block, "title", None) if title: h += _measure_heading_text(title, 2) - header = list(getattr(block, "header", []) or []) - rows = list(getattr(block, "rows", []) or []) fs = _FS_CELL widths = _col_widths(header, rows, fs) lh = tl.line_height_in(fs) @@ -766,6 +845,10 @@ def _measure_block(st: _PdfState, block) -> float: lines = tl.wrap(getattr(block, "text", ""), tl.chars_per_line(_USABLE_W, _FS_NOTE)) return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP + if kind == "toc_entry": + lines = tl.wrap(tl.strip_inline_md(getattr(block, "label", "")), + tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY)) or [""] + return tl.line_height_in(_FS_BODY) * len(lines) + _GAP * 0.4 if kind == "kv_table": return _measure_kv_table(block) if kind == "data_table": @@ -850,6 +933,38 @@ def _place_glossary_entry(st: _PdfState, block) -> None: st.y += _GAP * 0.5 +def _place_toc_entry(st: _PdfState, block) -> None: + """Render one clickable index line and record it as a link source. + + Drawn as a bulleted line in the accent link colour; its rectangle is recorded + in ``st.toc_sources`` so the post-processor turns it into a real jump to the + target chapter's first page. 
If the target is never resolved the line still + shows as plain (accent) text — never cut, never broken.""" + label = tl.strip_inline_md(getattr(block, "label", "")) or "" + target_id = getattr(block, "target_id", "") or "" + fs = _FS_BODY + lh = tl.line_height_in(fs) + bullet = "• " + indent = 0.22 + max_chars = tl.chars_per_line(_USABLE_W - indent, fs) + lines = tl.wrap(label, max_chars) or [""] + for idx, ln in enumerate(lines): + _ensure_space(st, lh) + x = _ML + st.fig.text(_xf(x), _yf(st.y), bullet if idx == 0 else " ", + fontsize=fs, color=_LINK, ha="left", va="top") + x += indent + w = _text_width_in(st, ln, fs, False) + st.fig.text(_xf(x), _yf(st.y), ln, fontsize=fs, color=_LINK, + ha="left", va="top") + if target_id and idx == 0: + st.toc_sources.append({ + "target_id": target_id, "page": st.page - 1, + "rect": _pt_rect(_ML, st.y, x + w, st.y + lh)}) + st.y += lh + st.y += _GAP * 0.4 + + _PLACERS = { "heading": _place_heading, "markdown": _place_markdown, @@ -861,6 +976,7 @@ _PLACERS = { "note": _place_note, "group": _place_group, "glossary_entry": _place_glossary_entry, + "toc_entry": _place_toc_entry, } @@ -892,6 +1008,15 @@ def render_pdf(chapters: list, out_path: str, meta: dict = None) -> dict: st.chapter = ch st.chapter_pages = 0 _new_page(st) # each chapter starts on a fresh page. + # Record this chapter's first page as a link target for the + # cover index (keyed by id AND title, since the cover only + # knows titles). Point is the top of the content area. 
+ _start = {"page": st.page - 1, + "point": [_ML * 72.0, _CONTENT_TOP * 72.0]} + if ch.id: + st.chapter_starts[ch.id] = _start + if getattr(ch, "title", ""): + st.chapter_starts.setdefault(ch.title, _start) for block in ch.blocks: placer = _PLACERS.get(getattr(block, "kind", ""), _place_note) @@ -924,7 +1049,7 @@ def render_pdf(chapters: list, out_path: str, meta: dict = None) -> dict: note = f"{n_pages} páginas" if n_links: - note += f" · {n_links} enlaces de glosario" + note += f" · {n_links} enlaces internos" if notes: note += " · " + "; ".join(notes) return {"path": out_path, "n_pages": n_pages, "chapters": chapters_meta, @@ -932,9 +1057,11 @@ def render_pdf(chapters: list, out_path: str, meta: dict = None) -> dict: def _wire_glossary_links(st: _PdfState, out_path: str, notes: list) -> int: - """Build {source rect → glossary dest} links and apply them via PyMuPDF. + """Apply internal PDF links via PyMuPDF: glossary terms + the cover index. - Returns the number of links applied (0 if there is nothing to wire or the + Builds two sets of GOTO links — every in-text glossary term → its entry, and + every cover ``TocEntry`` → its chapter's first page — and applies them in one + pass. Returns the number of links applied (0 if there is nothing to wire or the post-processor is unavailable). Never raises.""" try: links = [] @@ -945,6 +1072,14 @@ def _wire_glossary_links(st: _PdfState, out_path: str, notes: list) -> int: links.append({ "src_page": src["page"], "src_rect": src["rect"], "dst_page": dest["page"], "dst_point": dest["point"]}) + # Cover index → chapter first page (clickable, navigable table of contents). 
+ for src in st.toc_sources: + dest = st.chapter_starts.get(src.get("target_id")) + if not dest: + continue + links.append({ + "src_page": src["page"], "src_rect": src["rect"], + "dst_page": dest["page"], "dst_point": dest["point"]}) if not links: return 0 from datascience.add_pdf_internal_links import add_pdf_internal_links @@ -952,7 +1087,7 @@ def _wire_glossary_links(st: _PdfState, out_path: str, notes: list) -> int: if isinstance(res, dict) and res.get("status") == "ok": return int(res.get("n_links") or 0) if isinstance(res, dict) and res.get("error"): - notes.append(f"glosario sin enlaces: {res.get('error')}") + notes.append(f"enlaces internos no aplicados: {res.get('error')}") except Exception as e: # noqa: BLE001 — links are best-effort. - notes.append(f"glosario sin enlaces: {e}") + notes.append(f"enlaces internos no aplicados: {e}") return 0 diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py index 21b9e0ce..8a8039eb 100644 --- a/python/functions/datascience/automatic_eda/render_pptx_impl.py +++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py @@ -51,6 +51,12 @@ _FS_H1, _FS_H2, _FS_H3 = 20, 16, 13 _FS_BODY, _FS_CELL, _FS_NOTE = 14, 11, 11 _GAP = 0.12 +# Rasterization DPI for every embedded figure/table image. Raised from 150 to 220 +# so a viewer can zoom into a slide (or a shared picture) and read crisp detail — +# axis labels, table cells — without pixelation. Kept moderate so the deck size +# stays reasonable. Same value as the PDF renderer. +_RASTER_DPI = 220 + class _PptxState: def __init__(self, prs, title: str): @@ -65,6 +71,10 @@ class _PptxState: # Glossary wiring (mejora 6): runs to link and per-term target slide. self.term_runs = [] # [(key, run)] self.term_anchor_slide = {} # key -> Slide (glossary entry) + # Clickable index (cover → chapter). 
toc_runs are the cover's index runs; + # chapter_starts maps a chapter id AND its title to its first slide. + self.toc_runs = [] # [(target_id, run, src_slide)] + self.chapter_starts = {} # id|title -> Slide (chapter first slide) def _rgb(c): @@ -309,6 +319,58 @@ def _col_widths(header, rows): return [_USABLE_W * w / total for w in clamped] +# Minimal legible characters reserved per column when deciding whether a table +# can be shown as a native (selectable) PowerPoint table. Below this width per +# column the cells become unreadable, so the table is rasterized to a zoomable +# high-res image instead. The 16:9 slide is wide, so more columns fit than on A5. +_MIN_LEGIBLE_CHARS = 8 +_CELL_PAD = 0.05 + + +def _table_fits_as_text(header: list, rows: list) -> bool: + """True when the table fits the usable slide width as a readable table. + + A table whose columns cannot each get a minimal legible width within the slide + usable width (typically many columns, e.g. a 19-column ``df.head``) is flagged + so it is rendered as one high-resolution image — the viewer zooms in and reads + every cell — instead of being squeezed unreadable. Narrow tables keep the + native selectable table.""" + header = header or [] + rows = rows or [] + ncol = len(header) if header else (len(rows[0]) if rows else 1) + ncol = max(1, ncol) + cw = tl.avg_char_width_in(_FS_CELL) + min_needed = ncol * (_MIN_LEGIBLE_CHARS * cw + _CELL_PAD * 2) + return min_needed <= _USABLE_W + + +def _table_figure_block(block): + """Wrap a too-wide table as a lazily-rasterized Figure (cached on the block). + + Drawn once via ``render_table_as_figure`` (header shading + zebra) and embedded + as one high-res image scaled to fit entirely. The title/note are drawn inside + the image (self-describing when zoomed/shared), so no separate caption is + emitted. 
Reused for measuring and placing so keep-together stays consistent.""" + cached = getattr(block, "_aeda_tablefig", None) + if cached is not None: + return cached + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + title = getattr(block, "title", None) + note = getattr(block, "note", None) + + def _make(): + from datascience.render_table_as_figure import render_table_as_figure + return render_table_as_figure(header, rows, title=title, note=note) + + fig = model.Figure(make=_make, caption=None) + try: + block._aeda_tablefig = fig + except Exception: # noqa: BLE001 — block may reject attributes; degrade. + pass + return fig + + def _row_height_in(cells, widths, fs) -> float: lh = tl.line_height_in(fs) maxlines = 1 @@ -372,11 +434,27 @@ def _style_cell(cell, fs, color, bold, fill) -> None: def _place_data_table(st: _PptxState, block, shaded_header=True, key_value=False) -> None: + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + # Too many columns to be legible as a native table → render the whole table as + # one high-res picture, scaled to fit entirely (the viewer zooms to read it). + # KVTables (rendered here as a 2-column Campo/Valor table) are excluded: they + # always fit in width and stay as a selectable table. 
+ if not key_value and not _table_fits_as_text(header, rows): + figblock = _table_figure_block(block) + data, _asp = _figure_bytes_cached(figblock) + if data is None: + _add_text(st, ["(tabla no disponible)"], _FS_NOTE, _MUTED, + italic=True) + st.y += _GAP + return + _place_picture_bytes(st, data, None, + max_h_in=getattr(figblock, "height_in", None), + force_caption=False) + return title = getattr(block, "title", None) if title: _place_heading(st, model.Heading(title, level=2)) - header = list(getattr(block, "header", []) or []) - rows = list(getattr(block, "rows", []) or []) fs = _FS_CELL widths = _col_widths(header, rows) header_h = _row_height_in(header, widths, fs) if header else 0.0 @@ -436,7 +514,7 @@ def _resolve_png(block): try: import matplotlib.pyplot as plt buf = io.BytesIO() - f.savefig(buf, format="png", dpi=150, bbox_inches="tight") + f.savefig(buf, format="png", dpi=_RASTER_DPI, bbox_inches="tight") buf.seek(0) return buf.read() except Exception: # noqa: BLE001 @@ -483,12 +561,15 @@ def _figure_bytes_cached(block): def _place_picture_bytes(st: _PptxState, data: bytes, caption, - max_h_in=None) -> None: + max_h_in=None, force_caption=True) -> None: # Mejora 4 — every figure on a slide carries a visible caption/title. If the # block has no caption, fall back to the current section heading, then to a - # generic label, so no image is ever shown untitled. - caption = (model._safe_str(caption).strip() - or model._safe_str(st.last_heading).strip() or "Figura") + # generic label, so no image is ever shown untitled. ``force_caption=False`` + # suppresses that fallback (used for table images, whose title is inside the + # picture) so no redundant caption is drawn. 
+ caption = model._safe_str(caption).strip() + if not caption and force_caption: + caption = model._safe_str(st.last_heading).strip() or "Figura" w_px, h_px = _img_size_px(data) aspect = (h_px / w_px) if w_px else 0.66 # Reserve the caption's REAL (possibly multi-line) height FIRST, then scale @@ -496,9 +577,11 @@ def _place_picture_bytes(st: _PptxState, data: bytes, caption, # so its caption always fits on the SAME slide and no image is untitled. # cap_real = what _add_text consumes; cap_reserve adds the post-image gap and # a small cushion so the caption never spills to the next slide. - cap_lines = tl.wrap(caption, tl.chars_per_line(_USABLE_W, _FS_NOTE)) - cap_real = tl.line_height_in(_FS_NOTE) * len(cap_lines) + 0.05 - cap_reserve = cap_real + 0.05 + 0.10 + cap_lines = tl.wrap(caption, tl.chars_per_line(_USABLE_W, _FS_NOTE)) \ + if caption else [] + cap_real = (tl.line_height_in(_FS_NOTE) * len(cap_lines) + 0.05) \ + if cap_lines else 0.0 + cap_reserve = (cap_real + 0.05 + 0.10) if cap_lines else 0.05 max_h = _CONTENT_BOTTOM - _CONTENT_TOP # height_in hint (model.Figure/Image): cap the target height so a figure in a # keep-together Group shrinks to leave room for its heading and text. @@ -517,7 +600,8 @@ def _place_picture_bytes(st: _PptxState, data: bytes, caption, st.slide.shapes.add_picture(io.BytesIO(data), Inches(left), Inches(st.y), width=Inches(target_w), height=Inches(target_h)) st.y += target_h + 0.05 - _add_text(st, cap_lines, _FS_NOTE, _MUTED, italic=True) + if cap_lines: + _add_text(st, cap_lines, _FS_NOTE, _MUTED, italic=True) st.y += _GAP @@ -663,12 +747,16 @@ def _measure_kv_table(block) -> float: def _measure_data_table(block) -> float: """Faithful DataTable height — matches ``_place_data_table`` (title heading + wrapped header + every wrapped row + optional note). 
Keep in sync.""" + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + # Mirror the placer: a too-wide table is drawn as one image, so its + # keep-together height is the image's, not the (squeezed) table layout's. + if not _table_fits_as_text(header, rows): + return _measure_figure_like(_table_figure_block(block)) h = 0.0 title = getattr(block, "title", None) if title: h += _measure_heading_text(title, 2) - header = list(getattr(block, "header", []) or []) - rows = list(getattr(block, "rows", []) or []) fs = _FS_CELL widths = _col_widths(header, rows) if header: @@ -698,6 +786,10 @@ def _measure_block(st: _PptxState, block) -> float: lines = tl.wrap(getattr(block, "text", ""), tl.chars_per_line(_USABLE_W, _FS_NOTE)) return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP + if kind == "toc_entry": + lines = tl.wrap(tl.strip_inline_md(getattr(block, "label", "")), + tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY)) or [""] + return tl.line_height_in(_FS_BODY) * len(lines) + 0.05 if kind == "kv_table": return _measure_kv_table(block) if kind == "data_table": @@ -810,6 +902,73 @@ def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list: return out +def _fit_img(width_col: float, aspect: float, max_h: float): + """Scale an image to ``width_col`` then clamp to ``max_h`` keeping aspect.""" + w = width_col + h = w * aspect + if h > max_h: + h = max_h + w = (h / aspect) if aspect else width_col + return w, h + + +def _place_group_side_by_side(st: _PptxState, block, avail_full: float) -> bool: + """Place a Group's table (left ~55%) next to its figure (right ~45%). + + Both the table and the figure are rasterized to high-res images and placed in + two columns of the SAME slide; any other blocks (e.g. a heading) render full + width above the pair, the rest below. 
Returns True on success; returns False + (so the caller falls back to stacking) when the group has no table+figure pair + or the pair cannot fit side by side on one slide. Never raises by itself.""" + blocks = getattr(block, "blocks", []) or [] + tbl = next((b for b in blocks + if getattr(b, "kind", "") in ("data_table", "kv_table")), None) + fig = next((b for b in blocks + if getattr(b, "kind", "") in ("figure", "image")), None) + if tbl is None or fig is None: + return False + gap_col = 0.3 + left_w = _USABLE_W * 0.55 - gap_col / 2.0 + right_w = _USABLE_W * 0.45 - gap_col / 2.0 + if left_w <= 1.0 or right_w <= 1.0: + return False + tdata, tasp = _figure_bytes_cached(_table_figure_block(tbl)) + fdata, fasp = _figure_bytes_cached(fig) + if not tdata or not fdata: + return False + ti, fi = blocks.index(tbl), blocks.index(fig) + lo = min(ti, fi) + lead = list(blocks[:lo]) + rest = [b for b in blocks[lo + 1:] if b is not tbl and b is not fig] + lead_h = sum(_measure_block(st, b) for b in lead) + rest_h = sum(_measure_block(st, b) for b in rest) + col_max_h = avail_full - lead_h - rest_h - _GAP * 2 + if col_max_h < 1.2: + return False # not enough vertical room to put the pair side by side. + tw, th = _fit_img(left_w, tasp, col_max_h) + fw, fh = _fit_img(right_w, fasp, col_max_h) + band = max(th, fh) + needed = lead_h + band + rest_h + _GAP * 2 + if needed > avail_full: + return False # taller than a whole slide even side by side → stack. 
+ if needed > _remaining(st): + _new_slide(st, cont=True) + for b in lead: + _PLACERS.get(getattr(b, "kind", ""), _place_note)(st, b) + top = st.y + f_left = _ML + left_w + gap_col + st.slide.shapes.add_picture( + io.BytesIO(tdata), Inches(_ML + (left_w - tw) / 2.0), + Inches(top + (band - th) / 2.0), width=Inches(tw), height=Inches(th)) + st.slide.shapes.add_picture( + io.BytesIO(fdata), Inches(f_left + (right_w - fw) / 2.0), + Inches(top + (band - fh) / 2.0), width=Inches(fw), height=Inches(fh)) + st.y = top + band + _GAP + for b in rest: + _PLACERS.get(getattr(b, "kind", ""), _place_note)(st, b) + return True + + def _place_group(st: _PptxState, block) -> None: """Render a keep-together Group: move it whole to the next slide if needed.""" blocks = getattr(block, "blocks", []) or [] @@ -820,6 +979,14 @@ def _place_group(st: _PptxState, block) -> None: if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6: _new_slide(st, cont=True) avail_full = _CONTENT_BOTTOM - _CONTENT_TOP + # layout="side_by_side": try table-left / figure-right on one slide; on any + # reason it can't, fall through to the normal stacked keep-together below. + if str(getattr(block, "layout", "stack")).lower() == "side_by_side": + try: + if _place_group_side_by_side(st, block, avail_full): + return + except Exception: # noqa: BLE001 — degrade to stacking, never abort. + pass # Trim oversized tables first (keeps the chart on the same slide), then shrink # the figure to share the remaining room. blocks = _fit_group_blocks(st, blocks, avail_full) @@ -853,6 +1020,44 @@ def _place_glossary_entry(st: _PptxState, block) -> None: st.y += _GAP +def _place_toc_entry(st: _PptxState, block) -> None: + """Render one clickable index line and record its run as a link source. + + Drawn as a bulleted line in the accent link colour; the run is recorded in + ``st.toc_runs`` so it later becomes a native slide-jump to the target chapter's + first slide. 
If the target is never resolved the line still shows as plain + (accent) text — never cut.""" + label = tl.strip_inline_md(getattr(block, "label", "")) or "" + target_id = getattr(block, "target_id", "") or "" + fs = _FS_BODY + lines = tl.wrap(label, tl.chars_per_line(_USABLE_W - 0.3, fs)) or [""] + lh = tl.line_height_in(fs) + height = lh * len(lines) + 0.05 + _ensure(st, height) + box = st.slide.shapes.add_textbox( + Inches(_ML), Inches(st.y), Inches(_USABLE_W), Inches(height)) + tf = box.text_frame + tf.word_wrap = True + first = True + link_run = None + for idx, ln in enumerate(lines): + p = tf.paragraphs[0] if first else tf.add_paragraph() + first = False + r0 = p.add_run() + r0.text = "• " if idx == 0 else " " + r0.font.size = Pt(fs) + r0.font.color.rgb = _rgb(_LINK) + run = p.add_run() + run.text = ln + run.font.size = Pt(fs) + run.font.color.rgb = _rgb(_LINK) + if idx == 0: + link_run = run + if target_id and link_run is not None: + st.toc_runs.append((target_id, link_run, st.slide)) + st.y += height + + _PLACERS = { "heading": _place_heading, "markdown": _place_markdown, @@ -864,6 +1069,7 @@ _PLACERS = { "note": _place_note, "group": _place_group, "glossary_entry": _place_glossary_entry, + "toc_entry": _place_toc_entry, } @@ -899,6 +1105,12 @@ def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict: st.chapter = ch st.chapter_slides = 0 _new_slide(st, cont=False) + # Record this chapter's first slide as a link target for the cover + # index (keyed by id AND title, since the cover only knows titles). 
+ if ch.id: + st.chapter_starts[ch.id] = st.slide + if getattr(ch, "title", ""): + st.chapter_starts.setdefault(ch.title, st.slide) for block in ch.blocks: placer = _PLACERS.get(getattr(block, "kind", ""), _place_note) try: @@ -926,7 +1138,7 @@ def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict: note = f"{n_slides} slides" if n_links: - note += f" · {n_links} enlaces de glosario" + note += f" · {n_links} enlaces internos" if notes: note += " · " + "; ".join(notes) return {"path": out_path, "n_slides": n_slides, "chapters": chapters_meta, @@ -934,19 +1146,21 @@ def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict: def _wire_glossary_links(st: _PptxState, notes: list) -> int: - """Turn each recorded term run into a native jump to its glossary slide. + """Apply native slide-jumps: glossary terms + the cover index. - Returns the number of links applied. A term whose only appearance is inside - its own glossary entry (source slide == target slide) is skipped. Never + Each in-text glossary term run jumps to its glossary entry slide, and each + cover ``TocEntry`` run jumps to its chapter's first slide. Returns the total + number of links applied. A run whose target is its own slide is skipped. Never raises.""" - if not st.term_runs or not st.term_anchor_slide: + if not (st.term_runs and st.term_anchor_slide) and not ( + st.toc_runs and st.chapter_starts): return 0 - linked = 0 try: from datascience.pptx_link_run_to_slide import pptx_link_run_to_slide except Exception as e: # noqa: BLE001 - notes.append(f"glosario sin enlaces: {e}") + notes.append(f"enlaces internos no aplicados: {e}") return 0 + linked = 0 for key, run, src_slide in st.term_runs: tgt = st.term_anchor_slide.get(key) if tgt is None or tgt is src_slide: @@ -956,4 +1170,14 @@ def _wire_glossary_links(st: _PptxState, notes: list) -> int: linked += 1 except Exception: # noqa: BLE001 — links are best-effort. 
pass + # Cover index → chapter first slide (clickable, navigable table of contents). + for target_id, run, src_slide in st.toc_runs: + tgt = st.chapter_starts.get(target_id) + if tgt is None or tgt is src_slide: + continue + try: + if pptx_link_run_to_slide(run, src_slide, tgt): + linked += 1 + except Exception: # noqa: BLE001 — links are best-effort. + pass return linked diff --git a/python/functions/datascience/automatic_eda/render_quality_test.py b/python/functions/datascience/automatic_eda/render_quality_test.py new file mode 100644 index 00000000..a2567251 --- /dev/null +++ b/python/functions/datascience/automatic_eda/render_quality_test.py @@ -0,0 +1,283 @@ +"""Golden tests for the global render-quality features (issue: eda-render-quality). + +Covers, with executable evidence: + * High DPI: every embedded figure is rasterized at 220 dpi, so a phone reader + can zoom in and still see crisp detail. + * Wide table → image: a table too wide to be legible as text (e.g. a 19-column + df.head) is rendered as one high-res image that scales to fit entirely, while + a narrow table keeps its selectable-text/native-table rendering. + * ``Group(layout="side_by_side")``: in PPTX the table and figure are placed in + two columns of the same slide; in PDF the same group stacks vertically. + * Backward compatibility: a Group without ``layout`` defaults to ``"stack"`` and + a fitting table renders exactly as before. + +Renderers are invoked for real; PDFs are inspected with PyMuPDF and PPTX decks +with python-pptx. 
+""" + +from __future__ import annotations + +import os +import tempfile + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt # noqa: E402 + +import pytest # noqa: E402 + +from datascience.automatic_eda import model # noqa: E402 +from datascience.automatic_eda.render_pdf_impl import ( # noqa: E402 + render_pdf, _RASTER_DPI as _PDF_DPI, _table_fits_as_text as _pdf_fits) +from datascience.automatic_eda.render_pptx_impl import ( # noqa: E402 + render_pptx, _RASTER_DPI as _PPTX_DPI, _table_fits_as_text as _pptx_fits) + + +# --------------------------------------------------------------------------- # +# Helpers. +# --------------------------------------------------------------------------- # +def _simple_fig(): + """A small, real matplotlib figure for the figure blocks.""" + fig, ax = plt.subplots(figsize=(4, 3)) + ax.plot([0, 1, 2, 3], [1, 3, 2, 4]) + ax.set_title("demo") + return fig + + +def _wide_table(n_cols=19, n_rows=5): + header = [f"columna_{i}" for i in range(n_cols)] + rows = [[f"v{r}_{c}" for c in range(n_cols)] for r in range(n_rows)] + return model.DataTable(header=header, rows=rows, title="Primeras filas") + + +def _narrow_table(): + return model.DataTable(header=["a", "b", "c"], + rows=[["1", "2", "3"], ["4", "5", "6"]], + title="Tabla estrecha") + + +def _chapter(blocks, cid="cap", title="Capítulo"): + return [model.Chapter(id=cid, title=title, version="1.0.0", blocks=blocks)] + + +# --------------------------------------------------------------------------- # +# 1) High DPI — the unit constant and a real embedded image. 
+# --------------------------------------------------------------------------- # +def test_raster_dpi_is_high_both_renderers(): + assert _PDF_DPI >= 200, "el DPI del PDF debe ser alto (>=200)" + assert _PPTX_DPI >= 200, "el DPI del PPTX debe ser alto (>=200)" + + +def test_pdf_embedded_figure_is_high_resolution(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "fig.pdf") + res = render_pdf(_chapter([model.Figure(make=_simple_fig, caption="demo")]), + out, {"title": "T"}) + assert res["path"] == out + doc = fitz.open(out) + try: + widths = [] + for page in doc: + for img in page.get_images(full=True): + xref = img[0] + info = doc.extract_image(xref) + widths.append(info.get("width", 0)) + assert widths, "no se incrustó ninguna imagen en el PDF" + # A ~4" figure rasterized at 220 dpi is ~ >850 px wide. At the old 150 dpi + # it would be ~600 px. The high-res threshold proves the DPI bump. + assert max(widths) >= 800, \ + f"la figura embebida no es de alta resolución: {max(widths)} px" + finally: + doc.close() + + +# --------------------------------------------------------------------------- # +# 2) Wide table → image (PDF and PPTX); narrow table stays text. 
+# --------------------------------------------------------------------------- # +def test_fit_criterion_flags_wide_and_keeps_narrow(): + wide = _wide_table() + narrow = _narrow_table() + assert not _pdf_fits(wide.header, wide.rows), \ + "una tabla de 19 columnas debería NO caber como texto en A5" + assert not _pptx_fits(wide.header, wide.rows), \ + "una tabla de 19 columnas debería NO caber como tabla nativa en 16:9" + assert _pdf_fits(narrow.header, narrow.rows), \ + "una tabla de 3 columnas debería caber como texto en A5" + assert _pptx_fits(narrow.header, narrow.rows), \ + "una tabla de 3 columnas debería caber como tabla nativa en 16:9" + + +def test_wide_table_rendered_as_image_pdf(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "wide.pdf") + res = render_pdf(_chapter([_wide_table()]), out, {"title": "T"}) + assert res["path"] == out + doc = fitz.open(out) + try: + n_images = sum(len(page.get_images(full=True)) for page in doc) + text = "".join(page.get_text() for page in doc) + finally: + doc.close() + assert n_images >= 1, "la tabla ancha no se rasterizó como imagen en el PDF" + # The cells are now inside the image, not selectable text. A unique cell value + # must therefore NOT appear as extractable text (it lives in the picture). + assert "v4_18" not in text, \ + "la tabla ancha sigue como texto seleccionable (no se hizo imagen)" + + +def test_narrow_table_stays_selectable_text_pdf(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "narrow.pdf") + render_pdf(_chapter([_narrow_table()]), out, {"title": "T"}) + doc = fitz.open(out) + try: + text = "".join(page.get_text() for page in doc) + finally: + doc.close() + # Narrow table is selectable text: its header/cells are extractable. 
+ for v in ("a", "b", "c", "1", "6"): + assert v in text, f"la celda '{v}' debería ser texto seleccionable" + + +def test_wide_table_rendered_as_picture_pptx(tmp_path): + pptx = pytest.importorskip("pptx") + from pptx.enum.shapes import MSO_SHAPE_TYPE + out = str(tmp_path / "wide.pptx") + res = render_pptx(_chapter([_wide_table()]), out, {"title": "T"}) + assert res["path"] == out + prs = pptx.Presentation(out) + pics = sum(1 for s in prs.slides for sh in s.shapes + if sh.shape_type == MSO_SHAPE_TYPE.PICTURE) + assert pics >= 1, "la tabla ancha no se colocó como imagen en el PPTX" + + +# --------------------------------------------------------------------------- # +# 3) Group(layout="side_by_side"): two columns in PPTX, stacked in PDF. +# --------------------------------------------------------------------------- # +def _side_by_side_group(): + return model.Group( + blocks=[model.Heading(text="Columna X", level=2), + _narrow_table(), + model.Figure(make=_simple_fig, caption="grafico")], + layout="side_by_side") + + +def test_side_by_side_places_two_columns_pptx(tmp_path): + pptx = pytest.importorskip("pptx") + from pptx.enum.shapes import MSO_SHAPE_TYPE + from pptx.util import Inches + out = str(tmp_path / "sbs.pptx") + render_pptx(_chapter([_side_by_side_group()]), out, {"title": "T"}) + prs = pptx.Presentation(out) + # Find the slide that holds the pair (table image + figure image). + centre_emu = int(Inches(13.333 / 2.0)) + placed = False + for s in prs.slides: + lefts = [sh.left for sh in s.shapes + if sh.shape_type == MSO_SHAPE_TYPE.PICTURE + and sh.left is not None] + if len(lefts) >= 2: + # one picture starts in the left half, another in the right half. 
+ if min(lefts) < centre_emu and max(lefts) > centre_emu: + placed = True + break + assert placed, \ + "side_by_side no colocó tabla y figura en dos columnas de la misma slide" + + +def test_side_by_side_stacks_in_pdf(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "sbs.pdf") + res = render_pdf(_chapter([_side_by_side_group()]), out, {"title": "T"}) + assert res["path"] == out and res["n_pages"] >= 1 + doc = fitz.open(out) + try: + n_images = sum(len(page.get_images(full=True)) for page in doc) + text = "".join(page.get_text() for page in doc) + finally: + doc.close() + # PDF stacks: the narrow table stays selectable text (1 of its cells is + # extractable) and the figure is the single embedded image — not a 2-column + # pair of pictures like PPTX. + assert n_images == 1, "el PDF no debería usar el layout de dos imágenes" + assert "Columna X" in text and "1" in text, \ + "la tabla del grupo debería seguir como texto apilado en el PDF" + + +# --------------------------------------------------------------------------- # +# 4) Backward compatibility — default layout stacks, fitting table unchanged. +# --------------------------------------------------------------------------- # +def test_group_default_layout_is_stack(): + g = model.Group(blocks=[_narrow_table()]) + assert g.layout == "stack", "el layout por defecto debe ser 'stack'" + + +# --------------------------------------------------------------------------- # +# 5) Clickable cover index ("Índice") → chapter first page/slide. 
+# --------------------------------------------------------------------------- # +def _doc_with_index(): + portada = model.Chapter(id="portada", title="Portada", version="1.0.0", + blocks=[model.Heading(text="Índice", level=2), + model.TocEntry(label="Distribuciones", + target_id="Distribuciones")]) + cap = model.Chapter(id="num", title="Distribuciones", version="1.0.0", + blocks=[model.Markdown(text="contenido del capítulo")]) + return [portada, cap] + + +def test_cover_index_is_clickable_pdf(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "idx.pdf") + res = render_pdf(_doc_with_index(), out, {"title": "T"}) + assert res["path"] == out + doc = fitz.open(out) + try: + # The cover (page 0) must carry a GOTO link jumping to a later page. + goto = [lk for lk in doc[0].get_links() + if lk.get("kind") == fitz.LINK_GOTO and lk.get("page", 0) > 0] + finally: + doc.close() + assert goto, "el índice de la portada no produjo enlaces clicables en el PDF" + + +def test_cover_index_shows_heading_pdf(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "idxh.pdf") + render_pdf(_doc_with_index(), out, {"title": "T"}) + doc = fitz.open(out) + try: + text = "".join(page.get_text() for page in doc) + finally: + doc.close() + assert "Índice" in text, "la portada no muestra el encabezado 'Índice'" + assert "Este informe incluye" not in text, \ + "la portada aún muestra el texto antiguo 'Este informe incluye'" + + +def test_cover_index_is_clickable_pptx(tmp_path): + pptx = pytest.importorskip("pptx") + out = str(tmp_path / "idx.pptx") + render_pptx(_doc_with_index(), out, {"title": "T"}) + prs = pptx.Presentation(out) + cover_xml = prs.slides[0]._element.xml + assert "hlinksldjump" in cover_xml, \ + "el índice de la portada no produjo un salto de slide nativo en el PPTX" + + +def test_default_group_renders_like_before_pptx(tmp_path): + pptx = pytest.importorskip("pptx") + from pptx.enum.shapes import MSO_SHAPE_TYPE + out = str(tmp_path / 
"stack.pptx") + grp = model.Group(blocks=[model.Heading(text="Y", level=2), + _narrow_table(), + model.Figure(make=_simple_fig, caption="g")]) + render_pptx(_chapter([grp]), out, {"title": "T"}) + prs = pptx.Presentation(out) + # Stacked group: the narrow table is a NATIVE table (selectable), and there is + # exactly one picture (the figure) — not the two-image side-by-side layout. + n_tables = sum(1 for s in prs.slides for sh in s.shapes if sh.has_table) + n_pics = sum(1 for s in prs.slides for sh in s.shapes + if sh.shape_type == MSO_SHAPE_TYPE.PICTURE) + assert n_tables >= 1, "el grupo apilado debería usar una tabla nativa" + assert n_pics == 1, "el grupo apilado no debería duplicar imágenes" diff --git a/python/functions/datascience/render_table_as_figure.md b/python/functions/datascience/render_table_as_figure.md new file mode 100644 index 00000000..11d22d91 --- /dev/null +++ b/python/functions/datascience/render_table_as_figure.md @@ -0,0 +1,121 @@ +--- +id: render_table_as_figure_py_datascience +name: render_table_as_figure +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def render_table_as_figure(header, rows, title=None, note=None, fontsize=9.0, max_cell_chars=40) -> \"matplotlib.figure.Figure\"" +description: "Dibuja un bloque tabular (cabecera + filas) como una matplotlib.figure.Figure nítida, lista para rasterizar a DPI alto. Pensada para tablas que NO caben como texto en una página/slide del informe EDA: se rasteriza a alta resolución (el caller usa dpi=220, bbox_inches='tight') y el usuario hace zoom en el móvil para leerla entera sin perder datos. Cabecera sombreada (#eef3f6) y en negrita, filas pares (1-based) con zebra suave (#f6f8fa), tinta oscura (#1b1b1b) sobre blanco, rejilla gris muy fina (#cccccc). Trunca cada celda a max_cell_chars con elipsis y str()-ea cada valor (None -> \"\"). 
figsize proporcional al contenido (ancho por nº y longitud de columnas, alto por nº de filas) para que sea legible con zoom. Backend Agg sin pyplot global. Defensiva: header/rows vacíos o None, filas irregulares o cualquier error interno devuelven una Figure placeholder con texto centrado \"(tabla no disponible)\". NUNCA lanza." +tags: [eda, table, figure, matplotlib, visualization, rasterize, zoom, render, datascience, impure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [matplotlib] +example: | + from datascience.render_table_as_figure import render_table_as_figure + header = ["columna", "n_nulos", "%_nulos", "distintos", "tipo", "ejemplo"] + rows = [ + ["ingresos", 12, "1.2%", 980, "float64", "2345.67"], + ["edad", 0, "0.0%", 88, "int64", "37"], + ["ciudad", 5, "0.5%", 412, "object", "Madrid"], + ] + fig = render_table_as_figure(header, rows, title="Resumen de columnas", + note="rasteriza a dpi=220 y haz zoom") + fig.savefig("/tmp/tabla.png", dpi=220, bbox_inches="tight") +tested: true +tests: + - "test_returns_figure_with_table" + - "test_rows_none_does_not_raise" + - "test_header_none_does_not_raise" + - "test_empty_lists_return_placeholder_figure" + - "test_both_none_return_placeholder_figure" + - "test_long_cell_is_truncated" + - "test_none_cells_become_empty_strings" + - "test_can_rasterize_to_png_high_dpi" + - "test_placeholder_can_rasterize" + - "test_ragged_rows_are_padded" +test_file_path: "python/functions/datascience/render_table_as_figure_test.py" +file_path: "python/functions/datascience/render_table_as_figure.py" +params: + - name: header + desc: "Lista de nombres de columna (puede ser [] o None). Cada nombre se str()-ea, se trunca a max_cell_chars y se pinta en la fila cabecera sombreada en negrita. Si está vacío/None no se dibuja fila de cabecera (solo cuerpo)." 
+ - name: rows + desc: "Lista de filas; cada fila es una lista de celdas con valores cualesquiera (se str()-ean; None -> \"\"). Admite None (se trata como []), filas escalares (se envuelven en una celda) y filas de distinta longitud (la rejilla se rectangulariza al ancho máximo, rellenando con celdas vacías). Saltos de línea/tabs en una celda se colapsan a espacios para que no desborde a otras filas." + - name: title + desc: "Título opcional dibujado encima de la tabla, en negrita tinta #1b1b1b, alineado a la izquierda. None o \"\" => sin título. Default None." + - name: note + desc: "Nota opcional al pie de la figura, en gris #8a8a8a e itálica. None o \"\" => sin nota. Default None." + - name: fontsize + desc: "Tamaño de fuente base (pt) de las celdas del cuerpo. La cabecera usa fontsize+3 y la nota max(7, fontsize-1). Un valor no numérico o <= 0 cae a 9.0. Default 9.0." + - name: max_cell_chars + desc: "Trunca el texto de cada celda a este nº de chars (con … final cuando se recorta) para que el ancho no explote. Un valor no entero cae a 40; <= 0 deja las celdas vacías. Default 40." +output: "Un matplotlib.figure.Figure (figsize proporcional al contenido: ancho ≈ 0.9-1.6\" por columna según su texto, total acotado a 3-26\"; alto ≈ 0.32\" por fila + cabecera + espacio para título/nota, acotado) con un Axes sin ejes que contiene un ax.table(...) NO cerrado. Cabecera fondo #eef3f6 texto #1b1b1b bold; filas pares (1-based) zebra #f6f8fa, impares blanco; tinta #1b1b1b; bordes/rejilla #cccccc lw 0.4; texto alineado a la izquierda. Título encima (bold) y nota debajo (gris itálica) si se pasan. Si header/rows son vacíos o None, o ante cualquier error interno, devuelve una Figure placeholder pequeña con el texto centrado \"(tabla no disponible)\". NUNCA lanza. El caller la rasteriza (dpi=220, bbox_inches='tight') y la cierra; la función no la muestra ni la guarda." 
+--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience.render_table_as_figure import render_table_as_figure + +# Tabla que no cabe como texto en la slide -> se rasteriza y se lee con zoom. +header = ["columna", "n_nulos", "%_nulos", "distintos", "tipo", "ejemplo"] +rows = [ + ["ingresos", 12, "1.2%", 980, "float64", "2345.67"], + ["edad", 0, "0.0%", 88, "int64", "37"], + ["ciudad", 5, "0.5%", 412, "object", "Madrid"], + ["categoria_producto", 0, "0.0%", 1840, "object", + "un_valor_categorico_muy_largo_que_se_trunca"], +] + +fig = render_table_as_figure( + header, + rows, + title="Resumen de columnas", + note="rasteriza a dpi=220 y haz zoom en el móvil", + fontsize=9.0, + max_cell_chars=40, +) + +# El renderer del informe lo rasteriza a alta resolución; aquí lo persistimos. +fig.savefig("/tmp/tabla.png", dpi=220, bbox_inches="tight") +``` + +## Cuando usarla + +Úsala en un informe EDA cuando una tabla **no cabe como texto** en una página o +slide y prefieres una imagen nítida que el lector pueda ampliar en el móvil para +leerla entera (perfiles de columnas, matrices de conteo, tablas de frecuencias +con muchas filas o columnas anchas). Pásale la cabecera y las filas tal cual (los +valores se `str()`-ean por ti) más un `title`/`note` opcionales; el llamante la +rasteriza a `dpi=220` con `bbox_inches='tight'`. Es la pareja "tabla-como-imagen" +de los gráficos `build_boxplots_figure` / `categorical_top_pie_figure`: misma +paleta y mismo contrato (Agg, sin `pyplot`, el caller cierra la figura). + +## Gotchas + +- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg` + y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí, + para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO + es thread-safe; esta función construye el `Figure` directamente, así que es + segura de llamar en bucle desde el renderer. 
+- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo + guarda. Al crearse con `Figure()` (fuera de pyplot), `pyplot.close(fig)` es un + no-op inocuo: tras rasterizarla, suelta la referencia para liberar la memoria. +- **Pensada para rasterizar a DPI alto.** El `figsize` es proporcional al + contenido pero la legibilidad real viene del DPI: rasteriza con `dpi=220` y + `bbox_inches='tight'`. Una tabla con muchísimas filas crece en alto (capado a + ~60") — para miles de filas, parte la tabla o resume antes de pasarla. +- **Truncación de celda visible.** Cada celda se recorta a `max_cell_chars` + (default 40) con `…` final y los saltos de línea/tabs se colapsan a espacios, + para que ninguna celda desborde a otras filas. Sube `max_cell_chars` si + necesitas ver el valor completo (a costa de ancho). +- **Defensiva, nunca lanza.** `header`/`rows` vacíos o `None`, filas escalares, + filas de distinta longitud o cualquier error interno se manejan sin propagar: + en el peor caso devuelve una `Figure` placeholder con "(tabla no disponible)". + No envuelvas la llamada en try/except por miedo a un raise — no lo hay. diff --git a/python/functions/datascience/render_table_as_figure.py b/python/functions/datascience/render_table_as_figure.py new file mode 100644 index 00000000..3994c41d --- /dev/null +++ b/python/functions/datascience/render_table_as_figure.py @@ -0,0 +1,241 @@ +"""Impure EDA helper: a crisp table rendered as a matplotlib Figure (`eda` group). + +Draws a tabular block (header + rows) as a sharp ``matplotlib.figure.Figure`` +ready to be rasterized at high DPI, so a table that does NOT fit as text on a +page/slide can still be read in full by zooming into the rasterized image on a +phone. The header is shaded and bold, even rows carry a soft zebra stripe, the +ink is dark on white and the grid is very thin. + +Impure because it touches matplotlib's rendering machinery. 
def _placeholder_figure(message: str = "(tabla no disponible)") -> "Figure":
    """Build the small fallback figure used when no table can be drawn.

    Args:
        message: Text shown centered in the otherwise empty figure.

    Returns:
        A 6.0 x 1.6 inch ``Figure`` (dpi=150) with a single axes whose frame
        is switched off and which carries ``message`` in muted italic text.
        The figure is returned as-is; nothing is saved or shown here.
    """
    placeholder = Figure(figsize=(6.0, 1.6), dpi=150)
    axes = placeholder.add_subplot(111)
    axes.axis("off")

    # Centered in axes coordinates so the message stays in the middle
    # whatever the final canvas size is.
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 11,
        "color": _NOTE_TEXT,
        "style": "italic",
        "wrap": True,
        "transform": axes.transAxes,
    }
    axes.text(0.5, 0.5, message, **text_style)

    placeholder.tight_layout()
    return placeholder
# Maps line breaks and tabs to a plain space in a single pass.
_CELL_WHITESPACE = str.maketrans({"\n": " ", "\r": " ", "\t": " "})


def _cell_text(value, max_cell_chars: int) -> str:
    """Turn one cell value into single-line display text, truncated to a limit.

    Args:
        value: Any cell value. ``None`` becomes ``""``; everything else goes
            through ``str()``.
        max_cell_chars: Maximum number of characters kept. Values that cannot
            be converted to an integer (wrong type, non-numeric string, NaN,
            +/-infinity) fall back to 40.

    Returns:
        The flattened text. When it is longer than the limit it is cut to
        ``limit - 1`` characters plus a trailing ``…`` (so the result is
        exactly ``limit`` characters long). A limit <= 0 yields ``""``.
    """
    text = "" if value is None else str(value)
    # Collapse newlines/tabs so a single cell never spills across table rows.
    text = text.translate(_CELL_WHITESPACE)

    try:
        limit = int(max_cell_chars)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int() raises for float("inf") / float("-inf");
        # treat it like any other unusable limit instead of letting it escape.
        limit = 40

    if limit <= 0:
        return ""
    if len(text) <= limit:
        return text
    # For limit == 1 the slice is empty, leaving just the ellipsis.
    return text[: limit - 1] + "…"
def render_table_as_figure(
    header,
    rows,
    title=None,
    note=None,
    fontsize=9.0,
    max_cell_chars=40,
):
    """Draw a table as a ``matplotlib.figure.Figure`` meant for high-DPI rasterizing.

    Intended for tables that do not fit as text on a page/slide: the caller
    rasterizes the figure at high resolution so it can be read by zooming.
    The header row is shaded and bold, 1-based even body rows get a soft zebra
    stripe, text is dark on white and the grid lines are thin.

    Args:
        header: List/tuple of column names (may be empty). Any other type is
            treated as "no header".
        rows: List/tuple of rows. Each row is a list/tuple of cells (any
            values, converted with ``str()``); a ``None`` row becomes an empty
            row and any other scalar becomes a single-cell row. Rows shorter
            than the widest row/header are padded with empty cells.
        title: Optional title drawn above the table (``None``/``""`` = none).
        note: Optional muted italic note at the bottom-left of the figure
            (``None``/``""`` = none).
        fontsize: Base cell font size in points. Unusable values (wrong type,
            non-positive, NaN, infinite) fall back to 9.0.
        max_cell_chars: Per-cell character limit passed to ``_cell_text``.

    Returns:
        The built ``Figure``. It is not saved, shown or closed here; the caller
        rasterizes and releases it. The function never raises: with nothing to
        draw, or on any internal error, it returns ``_placeholder_figure()``.
    """
    try:
        # --- Defensive normalization of header/rows into a rectangular grid.
        header_list = list(header) if isinstance(header, (list, tuple)) else []
        raw_rows = list(rows) if isinstance(rows, (list, tuple)) else []

        clean_rows = []
        for row in raw_rows:
            if isinstance(row, (list, tuple)):
                clean_rows.append(list(row))
            elif row is None:
                clean_rows.append([])
            else:
                # A scalar row becomes a single-cell row instead of being dropped.
                clean_rows.append([row])

        # Nothing to draw at all -> placeholder.
        if not header_list and not clean_rows:
            return _placeholder_figure()

        # Number of columns = widest of header / any row.
        n_cols = max([len(header_list), *map(len, clean_rows)])
        if n_cols <= 0:
            return _placeholder_figure()

        # Base font size: tolerate a bad value. The chained comparison is also
        # False for NaN, so NaN/inf/non-positive sizes all fall back to the
        # default instead of reaching matplotlib and failing at render time.
        try:
            base_fs = float(fontsize)
        except (TypeError, ValueError, OverflowError):
            base_fs = 9.0
        if not (0.0 < base_fs < float("inf")):
            base_fs = 9.0

        # --- Build the truncated, padded text matrix.
        header_cells = [
            _cell_text(header_list[c] if c < len(header_list) else "", max_cell_chars)
            for c in range(n_cols)
        ]
        body_cells = [
            [
                _cell_text(row[c] if c < len(row) else "", max_cell_chars)
                for c in range(n_cols)
            ]
            for row in clean_rows
        ]

        # A header made only of empty strings is not drawn at all.
        has_header = any(header_cells)
        n_body = len(body_cells)
        # Total drawn table rows (header counts as one when present).
        n_table_rows = n_body + (1 if has_header else 0)
        if n_table_rows <= 0:
            return _placeholder_figure()

        # --- figsize proportional to content so it reads under zoom.
        # Width: each column scales with its longest text (0.085" per char
        # beyond the sixth), clamped to [0.9, 1.6] inches per column.
        per_col_widths = []
        for c in range(n_cols):
            col_texts = [header_cells[c]] if has_header else []
            col_texts += [body_row[c] for body_row in body_cells]
            longest = max((len(t) for t in col_texts), default=0)
            width = 0.9 + 0.085 * max(longest - 6, 0)
            per_col_widths.append(max(0.9, min(1.6, width)))
        # Total width clamped to [3, 26] inches.
        fig_w = max(3.0, min(26.0, sum(per_col_widths)))

        # Title/note presence is needed for the height, the bands and the
        # drawing itself, so resolve it once.
        has_title = title is not None and str(title) != ""
        has_note = note is not None and str(note) != ""

        # Height: 0.32" per row + room for title / note, clamped to [1, 60].
        fig_h = 0.32 * n_table_rows + 0.30
        if has_title:
            fig_h += 0.45
        if has_note:
            fig_h += 0.30
        fig_h = max(1.0, min(60.0, fig_h))

        fig = Figure(figsize=(fig_w, fig_h), dpi=150)
        ax = fig.add_subplot(111)
        ax.axis("off")

        # Reserve vertical bands for the optional title (top) and note (bottom)
        # so the table itself never overlaps them.
        title_band = 0.10 if has_title else 0.0
        note_band = 0.07 if has_note else 0.0
        table_bbox = [0.0, note_band, 1.0, max(0.05, 1.0 - title_band - note_band)]

        cell_text = ([header_cells] if has_header else []) + body_cells

        # Column widths expressed as fractions of the figure width.
        col_widths = [w / fig_w for w in per_col_widths]

        table = ax.table(
            cellText=cell_text,
            colWidths=col_widths,
            cellLoc="left",
            loc="center",
            bbox=table_bbox,
        )
        # Keep the requested size instead of matplotlib's automatic font sizing.
        table.auto_set_font_size(False)
        table.set_fontsize(base_fs)

        # --- Style every cell: zebra body, shaded bold header, thin gray grid.
        for (r, _c), cell in table.get_celld().items():
            cell.set_edgecolor(_GRID)
            cell.set_linewidth(0.4)
            # Small horizontal padding so text does not touch the border.
            cell.PAD = 0.04
            if has_header and r == 0:
                cell.set_facecolor(_HEADER_BG)
                cell.set_text_props(color=_HEADER_TEXT, fontweight="bold", ha="left")
            else:
                body_index = r - 1 if has_header else r  # 0-based body row.
                # 1-based even rows get the zebra stripe.
                is_even = ((body_index + 1) % 2) == 0
                cell.set_facecolor(_ZEBRA_BG if is_even else _BODY_BG)
                cell.set_text_props(color=_INK, ha="left")

        if has_title:
            ax.set_title(
                str(title),
                fontsize=base_fs + 3.0,
                fontweight="bold",
                color=_INK,
                loc="left",
                pad=8,
            )

        if has_note:
            # Figure coordinates: anchored near the bottom-left corner.
            fig.text(
                0.01,
                0.01,
                str(note),
                ha="left",
                va="bottom",
                fontsize=max(7.0, base_fs - 1.0),
                color=_NOTE_TEXT,
                style="italic",
            )

        return fig
    except Exception:  # noqa: BLE001 — never raise from a figure builder.
        return _placeholder_figure()
def _grid(n_cols, n_rows):
    """Build a header of ``n_cols`` names plus ``n_rows`` rows of labelled cells."""
    columns = range(n_cols)
    header = [f"col_{c}" for c in columns]
    rows = [[f"r{r}c{c}" for c in columns] for r in range(n_rows)]
    return header, rows


def _table_cells(fig):
    """Return the cell dict of the first table on the figure's first axes."""
    return fig.axes[0].tables[0].get_celld()


def _table_texts(fig):
    """Return the text of every cell of the first table, as a list of strings."""
    return [cell.get_text().get_text() for cell in _table_cells(fig).values()]


def _png_size(fig):
    """Rasterize to an in-memory PNG (dpi=220, tight bbox); return its byte count."""
    buf = BytesIO()
    fig.savefig(buf, format="png", dpi=220, bbox_inches="tight")
    return buf.getbuffer().nbytes


def test_returns_figure_with_table():
    header, rows = _grid(6, 5)
    fig = render_table_as_figure(header, rows, title="Tabla", note="nota al pie")
    assert isinstance(fig, Figure)
    # At least one axes, and that axes holds a table with cells.
    assert len(fig.axes) >= 1
    assert len(fig.axes[0].tables) >= 1
    # 6 columns x (1 header + 5 rows) = 36 cells.
    assert len(_table_cells(fig)) == 6 * (5 + 1)
    plt.close(fig)


def test_rows_none_does_not_raise():
    fig = render_table_as_figure(["a", "b"], None)
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    plt.close(fig)


def test_header_none_does_not_raise():
    body = [["x", "y"], ["z", "w"]]
    fig = render_table_as_figure(None, body)
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    plt.close(fig)


def test_empty_lists_return_placeholder_figure():
    fig = render_table_as_figure([], [])
    assert isinstance(fig, Figure)
    # Placeholder: an axes carrying text, with no table on it.
    assert len(fig.axes) >= 1
    assert len(fig.axes[0].tables) == 0
    plt.close(fig)


def test_both_none_return_placeholder_figure():
    fig = render_table_as_figure(None, None)
    assert isinstance(fig, Figure)
    assert len(fig.axes[0].tables) == 0
    plt.close(fig)


def test_long_cell_is_truncated():
    long_value = "x" * 200
    header, _ = _grid(2, 0)
    fig = render_table_as_figure(header, [[long_value, "ok"]], max_cell_chars=20)
    assert isinstance(fig, Figure)
    texts = _table_texts(fig)
    # The long cell shows up truncated with an ellipsis, never in full.
    assert any(t.endswith("…") and len(t) <= 20 for t in texts)
    assert long_value not in texts
    plt.close(fig)


def test_none_cells_become_empty_strings():
    fig = render_table_as_figure(["a", "b"], [[None, "v"], ["w", None]])
    assert isinstance(fig, Figure)
    texts = _table_texts(fig)
    # Both empty cells (the None values) and valued cells are present.
    assert "" in texts
    assert "v" in texts
    plt.close(fig)


def test_can_rasterize_to_png_high_dpi():
    header, rows = _grid(6, 8)
    fig = render_table_as_figure(header, rows, title="Render", note="zoom me")
    # Rasterizing at high DPI with a tight bbox must not raise.
    assert _png_size(fig) > 0
    plt.close(fig)


def test_placeholder_can_rasterize():
    fig = render_table_as_figure([], [])
    assert _png_size(fig) > 0
    plt.close(fig)


def test_ragged_rows_are_padded():
    # Rows of different lengths: the grid is squared up to the widest row.
    fig = render_table_as_figure(["a", "b", "c"], [["1"], ["1", "2", "3", "4"]])
    assert isinstance(fig, Figure)
    # 4 columns (the widest row) x (1 header + 2 rows) = 12 cells.
    assert len(_table_cells(fig)) == 4 * (2 + 1)
    plt.close(fig)