From 9cdde4a34127b0ac2a8bed4f4e939da8999a33b1 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 14:30:31 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20n=C3=BAcleo=20AutomaticEDA=20?= =?UTF-8?q?=E2=80=94=20documento=20por=20cap=C3=ADtulos=20+=20renderers=20?= =?UTF-8?q?PDF/PPTX=20anti-corte?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce la capa intermedia entre el contenido de un EDA y su formato de salida. Un documento es una lista de capítulos versionados; cada capítulo es un conjunto ordenado de bloques (heading, markdown, kv_table, data_table, figure, image, caption, note) independientes del formato. Núcleo (paquete de soporte python/functions/datascience/automatic_eda/): - model.py: dataclasses de bloques + Chapter, normalizadores defensivos (aceptan dataclass o dict, nunca lanzan), ENGINE_VERSION y el manifiesto por capítulo (automatic_eda_manifest.json). - text_layout.py: medición/wrapping por rejilla de caracteres compartida. - chapters_registry.py: CHAPTER_ORDER pre-declarado + build_document con auto-discovery de capítulos por convención (permite añadir capítulos en paralelo sin editar el registro). - render_pdf_impl.py: paginador A5 retrato móvil que MIDE cada bloque y nunca corta: texto a líneas completas, tablas largas partidas por filas repitiendo cabecera, figuras/imágenes escaladas para caber enteras. Pie versionado por capítulo. - render_pptx_impl.py: mismo principio sobre slides 16:9 (continúa en slide "(cont.)"; tablas repiten cabecera; figuras exportadas a PNG escaladas). - chapters/portada.py y chapters/overview.py: capítulos de referencia. Portada con nombre, rótulo Automatic-EDA, fuente, almacenamiento (inferido de source), fecha europea, filas×cols, descripción, granularidad y calidad con criterios. Overview con df.head (placeholder honesto si falta head_rows), diccionario de columnas (tipo/nulos/ejemplos) y describe numérico. 
Funciones públicas del registry (grupo eda, dict-no-throw): - render_automatic_eda_pdf / render_automatic_eda_pptx: aceptan capítulos o un TableProfile (construyen los capítulos con build_document) y escriben el manifiesto. Aditivas — no reemplazan render_eda_pdf. Tests self-contained (sin DuckDB) para ambos renderers: golden (portada + overview), partición de tablas largas repitiendo cabecera, no-corte de celdas y markdown largos, profile None/{} válido de 1 página/slide, y error path en directorio no escribible. 23 tests verdes (incluye los previos de render_eda_pdf, intactos). Dependencia nueva python-pptx>=1.0.2 declarada en python/pyproject.toml. Co-Authored-By: Claude Opus 4.8 (1M context) --- python/functions/datascience/__init__.py | 4 + .../datascience/automatic_eda/__init__.py | 57 ++ .../automatic_eda/chapters/__init__.py | 7 + .../automatic_eda/chapters/overview.py | 176 ++++++ .../automatic_eda/chapters/portada.py | 156 +++++ .../automatic_eda/chapters_registry.py | 89 +++ .../datascience/automatic_eda/model.py | 310 ++++++++++ .../automatic_eda/render_pdf_impl.py | 532 ++++++++++++++++++ .../automatic_eda/render_pptx_impl.py | 518 +++++++++++++++++ .../datascience/automatic_eda/text_layout.py | 107 ++++ .../datascience/render_automatic_eda_pdf.md | 107 ++++ .../datascience/render_automatic_eda_pdf.py | 83 +++ .../render_automatic_eda_pdf_test.py | 140 +++++ .../datascience/render_automatic_eda_pptx.md | 86 +++ .../datascience/render_automatic_eda_pptx.py | 76 +++ .../render_automatic_eda_pptx_test.py | 114 ++++ python/pyproject.toml | 1 + 17 files changed, 2563 insertions(+) create mode 100644 python/functions/datascience/automatic_eda/__init__.py create mode 100644 python/functions/datascience/automatic_eda/chapters/__init__.py create mode 100644 python/functions/datascience/automatic_eda/chapters/overview.py create mode 100644 python/functions/datascience/automatic_eda/chapters/portada.py create mode 100644 
python/functions/datascience/automatic_eda/chapters_registry.py create mode 100644 python/functions/datascience/automatic_eda/model.py create mode 100644 python/functions/datascience/automatic_eda/render_pdf_impl.py create mode 100644 python/functions/datascience/automatic_eda/render_pptx_impl.py create mode 100644 python/functions/datascience/automatic_eda/text_layout.py create mode 100644 python/functions/datascience/render_automatic_eda_pdf.md create mode 100644 python/functions/datascience/render_automatic_eda_pdf.py create mode 100644 python/functions/datascience/render_automatic_eda_pdf_test.py create mode 100644 python/functions/datascience/render_automatic_eda_pptx.md create mode 100644 python/functions/datascience/render_automatic_eda_pptx.py create mode 100644 python/functions/datascience/render_automatic_eda_pptx_test.py diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index 65cefda7..afa5ac45 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -53,8 +53,12 @@ from .fdr_correction import fdr_correction from .suggest_reexpression import suggest_reexpression from .exploratory_caveats import exploratory_caveats from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational +from .render_automatic_eda_pdf import render_automatic_eda_pdf +from .render_automatic_eda_pptx import render_automatic_eda_pptx __all__ = [ + "render_automatic_eda_pdf", + "render_automatic_eda_pptx", "decode_qr_image", "adf_kpss_stationarity", "acf_pacf", diff --git a/python/functions/datascience/automatic_eda/__init__.py b/python/functions/datascience/automatic_eda/__init__.py new file mode 100644 index 00000000..95d6f374 --- /dev/null +++ b/python/functions/datascience/automatic_eda/__init__.py @@ -0,0 +1,57 @@ +"""AutomaticEDA — chapter-based, versioned EDA document with PDF + PPTX output. 
+ +Public surface (support package for the registry functions +``render_automatic_eda_pdf`` and ``render_automatic_eda_pptx``): + +- Document model: ``Heading``, ``Markdown``, ``KVTable``, ``DataTable``, + ``Figure``, ``Image``, ``Caption``, ``Note``, ``Chapter``; normalizers + ``as_blocks`` / ``as_chapters``; ``ENGINE_VERSION`` / ``ENGINE_NAME``. +- ``build_document(profile, ctx)`` — assemble the ordered chapters of a profile. +- ``render_pdf(chapters, out_path, meta)`` / ``render_pptx(...)`` — the two + renderers (used by the public registry functions). +- ``merge_manifest(...)`` — write/update the per-chapter version manifest. +""" + +from __future__ import annotations + +from .model import ( # noqa: F401 + ENGINE_NAME, + ENGINE_VERSION, + Caption, + Chapter, + DataTable, + Figure, + Heading, + Image, + KVTable, + Markdown, + Note, + as_blocks, + as_chapters, + merge_manifest, +) +from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401 +from .render_pdf_impl import render_pdf # noqa: F401 +from .render_pptx_impl import render_pptx # noqa: F401 + +__all__ = [ + "ENGINE_NAME", + "ENGINE_VERSION", + "Heading", + "Markdown", + "KVTable", + "DataTable", + "Figure", + "Image", + "Caption", + "Note", + "Chapter", + "as_blocks", + "as_chapters", + "merge_manifest", + "CHAPTER_ORDER", + "build_chapter", + "build_document", + "render_pdf", + "render_pptx", +] diff --git a/python/functions/datascience/automatic_eda/chapters/__init__.py b/python/functions/datascience/automatic_eda/chapters/__init__.py new file mode 100644 index 00000000..e8c1fd18 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/__init__.py @@ -0,0 +1,7 @@ +"""AutomaticEDA chapters. + +Each chapter is a module ``.py`` exposing ``build_(profile, ctx) -> +Chapter | None`` and a ``CHAPTER_VERSION`` constant. The canonical document +order lives in :mod:`automatic_eda.chapters_registry`. 
Implemented today: +``portada`` and ``overview`` (the reference chapters other agents copy). +""" diff --git a/python/functions/datascience/automatic_eda/chapters/overview.py b/python/functions/datascience/automatic_eda/chapters/overview.py new file mode 100644 index 00000000..93b25b52 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/overview.py @@ -0,0 +1,176 @@ +"""Overview chapter — df.head, column dictionary and describe (reference). + +Second reference chapter for AutomaticEDA. Renders (across as many pages/slides +as needed, the renderers paginate): + +1. ``df.head`` — the first rows of the table. The current ``TableProfile`` does + NOT carry the raw head, so this is read from ``ctx['head_rows']`` / + ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows + an honest placeholder documenting the missing key instead of inventing data. +2. Column dictionary — name / type / nulls / non-null examples. Examples come + from ``columns[i]['examples']`` when present; otherwise they are derived from + real non-null profile values (categorical top values, numeric min/median/max) + so the cell is never empty nor fabricated. +3. ``df.describe`` — mean / median / min / max / std for every numeric column. + +Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "overview" +CHAPTER_TITLE = "Overview" + +# Profile/ctx keys the calculation phase must add for a full head + examples. 
def _fmt_num(value, decimals: int = 3) -> str:
    """Format a scalar for display in a table cell.

    ``None`` becomes an em dash, booleans keep their ``True``/``False`` text,
    integers get ``.`` as thousands separator, and floats are rounded to
    ``decimals`` places with trailing fractional zeros removed. NaN is shown
    as ``"NaN"`` and infinities as ``"inf"``/``"-inf"``. Any other type is
    passed through ``str``.

    Args:
        value: the value to format.
        decimals: number of fractional digits kept for floats.

    Returns:
        The display string.
    """
    if value is None:
        return "—"
    # bool is a subclass of int: test it first so True is not shown as "1".
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, int):
        return f"{value:,}".replace(",", ".")
    if not isinstance(value, float):
        return str(value)
    if value != value:  # NaN is the only float that differs from itself.
        return "NaN"
    if value in (float("inf"), float("-inf")):
        return str(value)
    text = f"{value:.{decimals}f}"
    # Trim zeros only when there is a fractional part. With decimals=0 the
    # text has no "." and stripping would eat integer digits (100 -> "1").
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    # A tiny negative rounds to "-0"; show a plain zero instead.
    if text in ("", "-0"):
        return "0"
    return text
+ cols = [c.get("name") for c in (profile.get("columns") or []) + if c.get("name")] + if not cols: + cols = list(head[0].keys()) + rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]] + return model.DataTable(header=cols, rows=rows, + note=f"primeras {len(rows)} filas") + return model.Note( + "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase " + "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o " + "pasarlo en ctx['head_rows'] para mostrar las primeras filas.") + + +def _columns_block(profile: dict): + cols = profile.get("columns") or [] + header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"] + rows = [] + for c in cols: + if not isinstance(c, dict): + continue + name = c.get("name") or "(col)" + ctype = c.get("inferred_type") or c.get("physical_type") or "—" + sem = c.get("semantic_type") + if sem: + ctype = f"{ctype} ({sem})" + null_pct = c.get("null_pct") + null_count = c.get("null_count") + if null_pct is not None: + nulls = _fmt_pct(null_pct) + if null_count is not None: + nulls += f" ({null_count})" + elif null_count is not None: + nulls = str(null_count) + else: + nulls = "—" + rows.append([name, ctype, nulls, _examples_for(c)]) + if not rows: + return None + return model.DataTable(header=header, rows=rows, title="Columnas") + + +def _describe_block(profile: dict): + cols = profile.get("columns") or [] + header = ["Columna", "mean", "median", "min", "max", "std"] + rows = [] + for c in cols: + if not isinstance(c, dict) or c.get("inferred_type") != "numeric": + continue + num = c.get("numeric") or {} + if not num: + continue + rows.append([ + c.get("name") or "(col)", + _fmt_num(num.get("mean")), + _fmt_num(num.get("median")), + _fmt_num(num.get("min")), + _fmt_num(num.get("max")), + _fmt_num(num.get("std")), + ]) + if not rows: + return None + return model.DataTable(header=header, rows=rows, title="Estadística (describe)") + + +def build_overview(profile: dict, ctx: dict): + """Build the 
Overview Chapter, or None if the profile has no columns.""" + profile = profile or {} + ctx = ctx or {} + cols = profile.get("columns") or [] + if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)): + return None + + blocks = [ + model.Heading(text="Primeras filas (df.head)", level=2), + _head_block(profile, ctx), + ] + cols_block = _columns_block(profile) + if cols_block is not None: + blocks.append(model.Heading( + text="Diccionario de columnas", level=2)) + blocks.append(cols_block) + desc_block = _describe_block(profile) + if desc_block is not None: + blocks.append(model.Heading( + text="Resumen estadístico numérico", level=2)) + blocks.append(desc_block) + + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/portada.py b/python/functions/datascience/automatic_eda/chapters/portada.py new file mode 100644 index 00000000..3582d981 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/portada.py @@ -0,0 +1,156 @@ +"""Cover chapter (PORTADA) — the reference chapter for AutomaticEDA. + +Builds the document cover from a TableProfile plus an optional ``ctx`` of +presentation metadata. Reads everything defensively (``.get``) and degrades +honestly: a field that is neither in the profile nor in ``ctx`` is shown as a +placeholder rather than invented, leaving a hook for the LLM layer to fill it. + +Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``): + build_(profile: dict, ctx: dict) -> Chapter | None + CHAPTER_VERSION = "x.y.z" +""" + +from __future__ import annotations + +import os +from datetime import datetime, timezone + +from .. import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "portada" +CHAPTER_TITLE = "Portada" + +# Default human description of what the table quality score measures. Chapters +# can override it via ctx["quality_criteria"]. 
def _storage_from_source(source: str) -> str:
    """Infer the storage technology the dataset currently lives in.

    Heuristic on the profile ``source`` string (a path, DSN or backend name),
    compared case-insensitively. Rules are tried in this order: file
    extensions (CSV, Parquet, JSON, Excel), DuckDB, PostgreSQL, BigQuery,
    SQLite.

    Args:
        source: the raw source string; ``None`` or blank is accepted.

    Returns:
        A human label, ``"—"`` for an empty source, or the raw ``source``
        unchanged when no rule matches (so nothing is hidden).
    """
    import re  # function-scope import: only this helper needs it.

    s = (source or "").strip().lower()
    if not s:
        return "—"
    for suffixes, label in (
        ((".csv", ".tsv"), "CSV"),
        ((".parquet", ".pq"), "Parquet"),
        ((".json", ".ndjson"), "JSON"),
        ((".xlsx", ".xls"), "Excel"),
    ):
        if s.endswith(suffixes):
            return label
    mentions_sqlite = "sqlite" in s
    # ".db" is used by both DuckDB and SQLite files. Only claim DuckDB when
    # nothing in the string says SQLite (e.g. "sqlite:///var/app.db").
    if (s.endswith((".duckdb", ".ddb")) or s == "duckdb"
            or (s.endswith(".db") and not mentions_sqlite)):
        return "DuckDB"
    if "postgres" in s:  # also covers the postgres:// and postgresql:// DSNs
        return "PostgreSQL"
    # A bare "project.dataset.table" id: exactly three non-empty dot-separated
    # parts with no whitespace, slashes or colons. Counting dots alone would
    # also match relative paths ("./a/b.txt") and URLs.
    if "bigquery" in s or re.fullmatch(r"[^\s./\\:]+(?:\.[^\s./\\:]+){2}", s):
        return "BigQuery"
    if mentions_sqlite:
        return "SQLite"
    # Unknown: show the raw source so nothing is hidden.
    return source
+ """ + if value is None: + return "—" + if isinstance(value, datetime): + return value.strftime("%d/%m/%Y %H:%M") + s = str(value).strip() + if not s: + return "—" + try: + dt = datetime.fromisoformat(s.replace("Z", "+00:00")) + return dt.strftime("%d/%m/%Y %H:%M") + except (TypeError, ValueError): + # Try a couple of common forms before giving up. + for fmt in ("%Y-%m-%d %H:%M:%S UTC", "%Y-%m-%d %H:%M UTC", + "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"): + try: + return datetime.strptime(s, fmt).strftime("%d/%m/%Y %H:%M") + except ValueError: + continue + return s + + +def build_portada(profile: dict, ctx: dict): + """Build the cover Chapter, or None if there is truly nothing to show.""" + profile = profile or {} + ctx = ctx or {} + + dataset_name = (ctx.get("dataset_name") or profile.get("table") + or "(dataset sin nombre)") + source = profile.get("source") or "" + # Where the dataset comes from (origin), distinct from where it is stored. + source_origin = ctx.get("source_origin") or source or "—" + storage = ctx.get("storage") or _storage_from_source(source) + + when = _fmt_date_eu( + ctx.get("generated_at") or profile.get("profiled_at") + or datetime.now(timezone.utc)) + + n_rows = profile.get("n_rows") + n_cols = profile.get("n_cols") + shape = f"{_fmt_int(n_rows)} filas × {_fmt_int(n_cols)} columnas" + + score = profile.get("quality_score") + quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA + quality_value = "—" if score is None else f"{score} / 100" + + # Granularity: ctx wins; else derive from key candidates; else be honest. 
+ granularity = ctx.get("granularity") + if not granularity: + keys = profile.get("key_candidates") or [] + if keys: + granularity = ("Cada fila parece identificada por " + + ", ".join(str(k) for k in keys[:3]) + ".") + else: + granularity = ("Cada fila es… (granularidad no determinada — " + "pendiente de la capa de cálculo/LLM).") + + description = ctx.get("description") + if not description: + description = ("Descripción no provista — pendiente de la capa LLM " + "(`run_llm`) o de `ctx['description']`.") + + blocks = [ + model.Heading(text=str(dataset_name), level=1), + model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"), + model.KVTable(rows=[ + ("Fuente", source_origin), + ("Almacenamiento", storage), + ("Generado", when), + ("Tamaño", shape), + ("Calidad", quality_value), + ("Criterios de calidad", quality_criteria), + ]), + model.Heading(text="Descripción", level=2), + model.Markdown(text=str(description)), + model.Heading(text="Granularidad", level=2), + model.Markdown(text=str(granularity)), + ] + + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py new file mode 100644 index 00000000..1d6743f4 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters_registry.py @@ -0,0 +1,89 @@ +"""Chapter registry — the canonical order of an AutomaticEDA document. + +``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the +order they appear in the document. Each id maps by convention to a module +``automatic_eda/chapters/.py`` exposing ``build_(profile, ctx) -> +Chapter | None`` and a ``CHAPTER_VERSION`` constant. + +This pre-declared order is what lets many agents add chapters in parallel +without contention: an agent only creates its own ``chapters/.py`` module — +it never edits this file. 
def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    """Build a single chapter by id, or None if absent/not-applicable/error.

    Imports ``<this package>.chapters.<chapter_id>`` and calls its
    ``build_<chapter_id>(profile, ctx)``. The result is normalized with
    ``model.as_chapter``.

    Args:
        chapter_id: chapter identifier; must be a plain Python identifier,
            because it names both the module and the builder function.
        profile: profile dict handed to the builder (``None`` becomes ``{}``).
        ctx: context dict handed to the builder (``None`` becomes ``{}``).

    Returns:
        A normalized Chapter, or None when the id is invalid, the module or
        the builder is missing, the builder returns None, or anything raises.
        Never raises; swallowed failures are reported through ``logging``.
    """
    import logging  # function-scope import: only this function logs.

    log = logging.getLogger(__name__)

    # The id is interpolated into a module path. Refusing anything that is not
    # a bare identifier keeps dots, slashes, etc. from reaching the import.
    if not isinstance(chapter_id, str) or not chapter_id.isidentifier():
        log.warning("AutomaticEDA: invalid chapter id %r skipped", chapter_id)
        return None

    mod_name = f"{__package__}.chapters.{chapter_id}"
    try:
        mod = importlib.import_module(mod_name)
    except ModuleNotFoundError as exc:
        # The chapter module itself being absent is the normal "not
        # implemented yet" case. A missing dependency inside it is not.
        level = logging.DEBUG if exc.name == mod_name else logging.WARNING
        log.log(level, "AutomaticEDA: chapter %r not importable: %s",
                chapter_id, exc)
        return None
    except Exception:  # noqa: BLE001 — a broken module never aborts the doc.
        log.warning("AutomaticEDA: chapter %r failed to import", chapter_id,
                    exc_info=True)
        return None

    builder = getattr(mod, f"build_{chapter_id}", None)
    if builder is None:
        return None
    try:
        # Normalization stays inside the try so that a malformed result is
        # dropped like any other chapter failure.
        return model.as_chapter(builder(profile or {}, ctx or {}))
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.warning("AutomaticEDA: chapter %r failed to build", chapter_id,
                    exc_info=True)
        return None
Each chapter declares its own ``version`` so every page/slide can +be stamped `` · v`` and tracked in a manifest for continuous, +per-chapter improvement. + +Reading is defensive throughout (the ``eda`` group "dict-no-throw" style): the +normalizers accept dataclass blocks *or* plain dicts, coerce anything unknown +into a readable :class:`Note` instead of raising, and the renderers degrade a +malformed block to text rather than crashing the whole document. +""" + +from __future__ import annotations + +import json +import os +from dataclasses import dataclass, field +from typing import Any, Callable, Optional + +# Global engine version. Bump when the document model or a renderer changes in a +# way that affects output. Individual chapters carry their own CHAPTER_VERSION. +ENGINE_VERSION = "1.0.0" +ENGINE_NAME = "AutomaticEDA" + + +# --------------------------------------------------------------------------- # +# Block primitives. Each carries a stable ``kind`` string so renderers can +# dispatch by kind (works for dataclass instances and for plain dicts alike). +# --------------------------------------------------------------------------- # +@dataclass +class Heading: + """A section heading. ``level`` 1 (largest) .. 3 (smallest).""" + + text: str = "" + level: int = 1 + kind: str = field(default="heading", init=False) + + +@dataclass +class Markdown: + """A block of light markdown text. + + Supported subset (everything else is rendered verbatim, never dropped): + ``#``/``##``/``###`` headings, ``-``/``*`` bullet lists, ``| a | b |`` + tables (consecutive pipe lines become a data table), blank lines as + paragraph breaks, and ``**bold**`` inline markers (markers are stripped, the + text is kept). Text is wrapped to whole lines so it is never cut mid-line. + """ + + text: str = "" + kind: str = field(default="markdown", init=False) + + +@dataclass +class KVTable: + """A two-column key/value table. 
``rows`` is a list of ``(label, value)``.""" + + rows: list = field(default_factory=list) + title: Optional[str] = None + kind: str = field(default="kv_table", init=False) + + +@dataclass +class DataTable: + """A tabular block with a header row. + + If it does not fit in the remaining page/slide space it is split by rows, + **repeating the header** on each continuation. Long cell text wraps inside + its column (the row grows taller) so no cell content is ever lost. + """ + + header: list = field(default_factory=list) + rows: list = field(default_factory=list) # list[list[Any]] + title: Optional[str] = None + note: Optional[str] = None + kind: str = field(default="data_table", init=False) + + +@dataclass +class Figure: + """A matplotlib figure, scaled to fit entirely (never cropped). + + Provide either an already-built ``fig`` (a ``matplotlib.figure.Figure``) or + a zero-arg ``make`` callable that returns one (lazy: only built when the + renderer needs it). ``height_in`` is an optional hint for the target height + on the page; renderers clamp it to the available space preserving aspect. + """ + + fig: Any = None + make: Optional[Callable[[], Any]] = None + caption: Optional[str] = None + height_in: Optional[float] = None + kind: str = field(default="figure", init=False) + + +@dataclass +class Image: + """A raster image (PNG/JPG) by path, scaled to fit entirely.""" + + path: str = "" + caption: Optional[str] = None + height_in: Optional[float] = None + kind: str = field(default="image", init=False) + + +@dataclass +class Caption: + """Small auxiliary text rendered under a figure/table.""" + + text: str = "" + kind: str = field(default="caption", init=False) + + +@dataclass +class Note: + """Small auxiliary note (italic). 
def as_block(obj: Any):
    """Coerce a value into a block dataclass; unknown values become a Note.

    A block dataclass instance is returned unchanged. A dict is dispatched on
    its ``"kind"`` key and rebuilt using only the fields that block accepts
    (extra keys are ignored). Anything else, including a dict with a missing
    or unknown kind or one whose fields fail to convert, becomes a ``Note``
    carrying the string form of the value.

    Args:
        obj: a block dataclass, a dict describing a block, or any value.

    Returns:
        A block dataclass instance. Never raises.
    """
    if isinstance(obj, (Heading, Markdown, KVTable, DataTable, Figure, Image,
                        Caption, Note)):
        return obj
    if isinstance(obj, dict):
        # Every access to the dict sits inside the try so that no malformed
        # input can escape as an exception.
        try:
            kind = obj.get("kind")
            # Registered kinds are all strings. Checking the type first avoids
            # hashing an unhashable value (list, dict) in the lookup.
            cls = _BLOCK_BY_KIND.get(kind) if isinstance(kind, str) else None
            # Build only with fields the dataclass accepts (ignore extras).
            if cls is Heading:
                return Heading(text=_safe_str(obj.get("text")),
                               level=int(obj.get("level", 1) or 1))
            if cls is Markdown:
                return Markdown(text=_safe_str(obj.get("text")))
            if cls is KVTable:
                return KVTable(rows=list(obj.get("rows") or []),
                               title=obj.get("title"))
            if cls is DataTable:
                return DataTable(header=list(obj.get("header") or []),
                                 rows=list(obj.get("rows") or []),
                                 title=obj.get("title"), note=obj.get("note"))
            if cls is Figure:
                return Figure(fig=obj.get("fig"), make=obj.get("make"),
                              caption=obj.get("caption"),
                              height_in=obj.get("height_in"))
            if cls is Image:
                return Image(path=_safe_str(obj.get("path")),
                             caption=obj.get("caption"),
                             height_in=obj.get("height_in"))
            if cls is Caption:
                return Caption(text=_safe_str(obj.get("text")))
            if cls is Note:
                return Note(text=_safe_str(obj.get("text")))
        except Exception:  # noqa: BLE001 — never raise on a malformed block.
            return Note(text=_safe_str(obj))
    return Note(text=_safe_str(obj))
def merge_manifest(manifest_path: str, renderer: str, chapters_meta: list,
                   generated_at: str,
                   engine_version: str = ENGINE_VERSION) -> dict:
    """Read-modify-write the AutomaticEDA manifest, merging one renderer's run.

    The manifest records, per chapter, its version plus the page count (PDF)
    and slide count (PPTX). Existing entries and the other renderer's count
    are preserved. Never raises: on any error the in-memory manifest is
    returned and the file on disk is left as it was.

    Args:
        manifest_path: path to the JSON manifest to create or update.
        renderer: "pptx" writes ``n_slides``; any other value writes
            ``n_pages``.
        chapters_meta: iterable of ``{"id", "version", "n_pages"|"n_slides"}``
            dicts; entries that are not dicts or have no id are ignored.
        generated_at: timestamp for this run, stored as given.
        engine_version: AutomaticEDA engine version.

    Returns:
        The merged manifest dict (also written to disk on success).
    """
    data: dict = {}
    try:
        if manifest_path and os.path.exists(manifest_path):
            with open(manifest_path, "r", encoding="utf-8") as fh:
                loaded = json.load(fh)
            if isinstance(loaded, dict):
                data = loaded
    except Exception:  # noqa: BLE001 — a corrupt manifest is overwritten.
        data = {}

    data["engine"] = ENGINE_NAME
    data["engine_version"] = engine_version
    data["generated_at"] = generated_at
    chapters = data.get("chapters")
    if not isinstance(chapters, dict):
        chapters = {}
    count_key = "n_slides" if renderer == "pptx" else "n_pages"

    # A non-iterable chapters_meta must not break the "never raises" promise.
    try:
        entries = list(chapters_meta or [])
    except TypeError:
        entries = []
    for cm in entries:
        if not isinstance(cm, dict):
            continue
        try:
            cid = cm.get("id")
            if not cid:
                continue
            # JSON object keys are strings. Normalizing here keeps a reloaded
            # manifest and a fresh entry on the same key, and avoids hashing
            # an unhashable id.
            cid = str(cid)
            entry = chapters.get(cid)
            if not isinstance(entry, dict):
                entry = {}
            entry["version"] = (cm.get("version") or entry.get("version")
                                or "1.0.0")
            entry[count_key] = cm.get(
                count_key, cm.get("n_pages", cm.get("n_slides")))
            chapters[cid] = entry
        except Exception:  # noqa: BLE001 — skip one bad entry, keep the rest.
            continue
    data["chapters"] = chapters

    if not manifest_path:
        return data
    try:
        # Serialize before opening the file: opening with "w" truncates, so a
        # serialization error afterwards would destroy the existing manifest.
        payload = json.dumps(data, ensure_ascii=False, indent=2, default=str)
        parent = os.path.dirname(os.path.abspath(manifest_path))
        os.makedirs(parent, exist_ok=True)
        with open(manifest_path, "w", encoding="utf-8") as fh:
            fh.write(payload)
    except Exception:  # noqa: BLE001 — never raise from the manifest writer.
        pass
    return data
When a +unit does not fit in the remaining space it moves whole to the next page — +text by whole lines (never mid-line, never mid-word), data tables by rows +**repeating the header**, figures/images scaled to fit entirely (never cropped). + +Each chapter starts on a fresh page and every page is stamped in the footer with +`` · v`` plus the engine version and a running page number, so +output is versioned per chapter for continuous improvement. + +dict-no-throw: a failure inside one block is caught and noted; the PDF is always +produced and at least one page is guaranteed. Engine: matplotlib ``PdfPages``. +""" + +from __future__ import annotations + +import io +import os + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.image as mpimg # noqa: E402 +import matplotlib.pyplot as plt # noqa: E402 +from matplotlib.backends.backend_pdf import PdfPages # noqa: E402 +from matplotlib.patches import Rectangle # noqa: E402 + +from . import model # noqa: E402 +from . import text_layout as tl # noqa: E402 + +# A5 portrait, inches. +_W, _H = 5.83, 8.27 +_ML, _MR, _MT, _MB = 0.5, 0.42, 0.55, 0.5 +_FOOTER_H = 0.34 +_USABLE_W = _W - _ML - _MR +_CONTENT_TOP = _MT +_CONTENT_BOTTOM = _H - _MB - _FOOTER_H + +# Palette / type (inherits the Tufte-ish mobile look of render_eda_pdf). +_INK = "#1b1b1b" +_ACCENT = "#2a6f97" +_MUTED = "#8a8a8a" +_RULE = "#cccccc" +_HEAD_BG = "#eef3f6" + +_RC = { + "font.size": 10, + "font.family": "sans-serif", + "figure.facecolor": "white", + "savefig.facecolor": "white", + "pdf.fonttype": 42, # embed TrueType — text stays selectable on mobile. +} + +# Font sizes (pt) and derived line heights (in). +_FS_H1, _FS_H2, _FS_H3 = 17, 13, 11 +_FS_BODY, _FS_CELL, _FS_NOTE = 10.5, 9.0, 9.0 +_GAP = 0.12 # vertical gap after a block, inches. +_CELL_PAD = 0.06 # horizontal padding inside a table cell, inches. +_ROW_VPAD = 0.05 # vertical padding inside a table row, inches. 
+ + +class _PdfState: + """Mutable layout cursor for the running PDF document.""" + + def __init__(self, pdf, title: str): + self.pdf = pdf + self.title = title + self.fig = None + self.y = _CONTENT_TOP # inches from the top of the page. + self.page = 0 # global page counter. + self.chapter = None # current Chapter (for the footer). + self.chapter_pages = 0 # pages produced for the current chapter. + + +# --------------------------------------------------------------------------- # +# Coordinate helpers (inches-from-top → matplotlib figure fraction). +# --------------------------------------------------------------------------- # +def _yf(y_in: float) -> float: + return 1.0 - (y_in / _H) + + +def _xf(x_in: float) -> float: + return x_in / _W + + +def _new_page(st: _PdfState) -> None: + """Close the current page (if any) and open a fresh one with a footer.""" + _flush_page(st) + st.fig = plt.figure(figsize=(_W, _H)) + st.y = _CONTENT_TOP + st.page += 1 + st.chapter_pages += 1 + _draw_footer(st) + + +def _flush_page(st: _PdfState) -> None: + if st.fig is not None: + st.pdf.savefig(st.fig) + plt.close(st.fig) + st.fig = None + + +def _draw_footer(st: _PdfState) -> None: + ch = st.chapter + left = "" + if ch is not None: + left = f"{ch.title} · v{ch.version}" + right = f"{model.ENGINE_NAME} v{model.ENGINE_VERSION} · p.{st.page}" + yb = (_MB * 0.45) / _H + st.fig.text(_xf(_ML), yb, left, fontsize=7.5, color=_MUTED, + ha="left", va="center") + st.fig.text(_xf(_W - _MR), yb, right, fontsize=7.5, color=_MUTED, + ha="right", va="center") + # A thin rule above the footer. 
+ st.fig.add_artist(Rectangle( + (_xf(_ML), (_MB + _FOOTER_H * 0.5) / _H), + _xf(_W - _MR) - _xf(_ML), 0.0008, + transform=st.fig.transFigure, color=_RULE, lw=0.6)) + + +def _remaining(st: _PdfState) -> float: + return _CONTENT_BOTTOM - st.y + + +def _ensure_space(st: _PdfState, height: float) -> None: + """Open a new page if ``height`` does not fit in the remaining space.""" + if _remaining(st) < height: + _new_page(st) + + +# --------------------------------------------------------------------------- # +# Block placers. Each advances st.y and paginates as needed. +# --------------------------------------------------------------------------- # +def _place_heading(st: _PdfState, block) -> None: + level = max(1, min(3, int(getattr(block, "level", 1) or 1))) + fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] + text = tl.strip_inline_md(getattr(block, "text", "")) + max_chars = tl.chars_per_line(_USABLE_W, fs) + lines = tl.wrap(text, max_chars) + lh = tl.line_height_in(fs, leading=1.2) + block_h = lh * len(lines) + 0.06 + # Keep at least the heading + a couple of body lines together when possible. + _ensure_space(st, min(block_h + tl.line_height_in(_FS_BODY) * 2, + _CONTENT_BOTTOM - _CONTENT_TOP)) + for ln in lines: + _ensure_space(st, lh) + st.fig.text(_xf(_ML), _yf(st.y), ln, fontsize=fs, fontweight="bold", + color=_INK, ha="left", va="top") + st.y += lh + if level == 1: + # Accent underline under a top-level heading. 
+ st.fig.add_artist(Rectangle( + (_xf(_ML), _yf(st.y + 0.02)), _xf(_ML + 1.4) - _xf(_ML), 0.0016, + transform=st.fig.transFigure, color=_ACCENT, lw=0)) + st.y += 0.10 + st.y += _GAP + + +def _place_text_lines(st: _PdfState, lines: list, fs: float, color: str, + style: str = "normal", indent: float = 0.0) -> None: + lh = tl.line_height_in(fs) + for ln in lines: + _ensure_space(st, lh) + st.fig.text(_xf(_ML + indent), _yf(st.y), ln, fontsize=fs, color=color, + ha="left", va="top", style=style) + st.y += lh + + +def _place_markdown(st: _PdfState, block) -> None: + raw = getattr(block, "text", "") or "" + md_lines = str(raw).split("\n") + i = 0 + n = len(md_lines) + while i < n: + line = md_lines[i] + stripped = line.strip() + # Consecutive pipe-table lines → a DataTable. + if stripped.startswith("|") and stripped.endswith("|"): + j = i + tbl_lines = [] + while j < n and md_lines[j].strip().startswith("|") \ + and md_lines[j].strip().endswith("|"): + tbl_lines.append(md_lines[j]) + j += 1 + parsed = tl.parse_md_table(tbl_lines) + if parsed: + header, rows = parsed + _place_data_table(st, model.DataTable(header=header, rows=rows)) + i = j + continue + if stripped == "": + st.y += tl.line_height_in(_FS_BODY) * 0.5 + i += 1 + continue + if stripped.startswith("### "): + _place_heading(st, model.Heading(stripped[4:], level=3)) + i += 1 + continue + if stripped.startswith("## "): + _place_heading(st, model.Heading(stripped[3:], level=2)) + i += 1 + continue + if stripped.startswith("# "): + _place_heading(st, model.Heading(stripped[2:], level=1)) + i += 1 + continue + if stripped.startswith("- ") or stripped.startswith("* "): + content = tl.strip_inline_md(stripped[2:]) + bullet_chars = tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY) + wrapped = tl.wrap(content, bullet_chars) + first = True + for w in wrapped: + prefix = "• " if first else " " + _place_text_lines(st, [prefix + w], _FS_BODY, _INK, + indent=0.0) + first = False + i += 1 + continue + # Plain paragraph (gather 
following plain lines into one paragraph). + para = [tl.strip_inline_md(stripped)] + j = i + 1 + while j < n: + nxt = md_lines[j].strip() + if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")): + break + para.append(tl.strip_inline_md(nxt)) + j += 1 + text = " ".join(para) + max_chars = tl.chars_per_line(_USABLE_W, _FS_BODY) + _place_text_lines(st, tl.wrap(text, max_chars), _FS_BODY, _INK) + i = j + st.y += _GAP + + +def _place_kv_table(st: _PdfState, block) -> None: + title = getattr(block, "title", None) + if title: + _place_heading(st, model.Heading(title, level=2)) + rows = getattr(block, "rows", []) or [] + key_w = 1.9 # inches reserved for the label column. + val_chars = tl.chars_per_line(_USABLE_W - key_w - 0.1, _FS_BODY) + lh = tl.line_height_in(_FS_BODY) + for row in rows: + try: + label, value = row[0], row[1] + except Exception: # noqa: BLE001 + label, value = str(row), "" + v_lines = tl.wrap(model._safe_str(value), val_chars) + row_h = lh * len(v_lines) + _ROW_VPAD + _ensure_space(st, row_h) + y0 = st.y + st.fig.text(_xf(_ML), _yf(y0), tl.strip_inline_md(model._safe_str(label)), + fontsize=_FS_BODY, color=_MUTED, ha="left", va="top") + for k, vl in enumerate(v_lines): + st.fig.text(_xf(_ML + key_w), _yf(y0 + k * lh), vl, + fontsize=_FS_BODY, color=_INK, ha="left", va="top") + st.y = y0 + row_h + st.y += _GAP + + +def _col_widths(header: list, rows: list, fs: float) -> list: + """Distribute usable width across columns proportional to content length.""" + ncol = len(header) if header else (len(rows[0]) if rows else 1) + ncol = max(1, ncol) + natural = [3] * ncol + for c in range(ncol): + if header and c < len(header): + natural[c] = max(natural[c], len(model._safe_str(header[c]))) + for r in rows: + if c < len(r): + natural[c] = max(natural[c], len(model._safe_str(r[c]))) + # Clamp so one very long column does not starve the others. 
+ clamped = [min(max(w, 4), 40) for w in natural] + total = float(sum(clamped)) or 1.0 + widths = [_USABLE_W * w / total for w in clamped] + # Enforce a minimum readable column width. + min_w = 0.45 + widths = [max(w, min_w) for w in widths] + # Renormalize if the minimums pushed us over the usable width. + s = sum(widths) + if s > _USABLE_W: + widths = [w * _USABLE_W / s for w in widths] + return widths + + +def _wrap_row(cells: list, widths: list, fs: float) -> list: + """Wrap each cell to its column width → list of line-lists per cell.""" + out = [] + for c, w in enumerate(widths): + text = model._safe_str(cells[c]) if c < len(cells) else "" + max_chars = tl.chars_per_line(w - _CELL_PAD * 2, fs) + out.append(tl.wrap(text, max_chars)) + return out + + +def _draw_table_row(st: _PdfState, cells_lines: list, widths: list, fs: float, + y0: float, header: bool) -> float: + lh = tl.line_height_in(fs) + nlines = max((len(c) for c in cells_lines), default=1) + row_h = lh * nlines + _ROW_VPAD * 2 + if header: + st.fig.add_artist(Rectangle( + (_xf(_ML), _yf(y0 + row_h)), _xf(_ML + _USABLE_W) - _xf(_ML), + _yf(y0) - _yf(y0 + row_h), transform=st.fig.transFigure, + color=_HEAD_BG, lw=0, zorder=0)) + x = _ML + for c, lines in enumerate(cells_lines): + for k, ln in enumerate(lines): + st.fig.text(_xf(x + _CELL_PAD), _yf(y0 + _ROW_VPAD + k * lh), ln, + fontsize=fs, color=_INK, + fontweight="bold" if header else "normal", + ha="left", va="top", zorder=2) + x += widths[c] + # Bottom rule of the row. 
+ st.fig.add_artist(Rectangle( + (_xf(_ML), _yf(y0 + row_h)), _xf(_ML + _USABLE_W) - _xf(_ML), 0.0006, + transform=st.fig.transFigure, color=_RULE, lw=0, zorder=1)) + return row_h + + +def _place_data_table(st: _PdfState, block) -> None: + title = getattr(block, "title", None) + if title: + _place_heading(st, model.Heading(title, level=2)) + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + fs = _FS_CELL + widths = _col_widths(header, rows, fs) + header_lines = _wrap_row(header, widths, fs) if header else None + lh = tl.line_height_in(fs) + + def header_h() -> float: + if not header_lines: + return 0.0 + return lh * max((len(c) for c in header_lines), default=1) + _ROW_VPAD * 2 + + def draw_header() -> None: + if header_lines: + st.y += _draw_table_row(st, header_lines, widths, fs, st.y, + header=True) + + # Ensure header + first row fit, else start on a new page. + first_row_h = 0.0 + if rows: + first_lines = _wrap_row(rows[0], widths, fs) + first_row_h = lh * max((len(c) for c in first_lines), default=1) \ + + _ROW_VPAD * 2 + _ensure_space(st, header_h() + max(first_row_h, lh)) + draw_header() + for r in rows: + cells_lines = _wrap_row(r, widths, fs) + row_h = lh * max((len(c) for c in cells_lines), default=1) \ + + _ROW_VPAD * 2 + if _remaining(st) < row_h: + _new_page(st) + draw_header() # repeat header on the continuation page. 
+ st.y += _draw_table_row(st, cells_lines, widths, fs, st.y, header=False) + note = getattr(block, "note", None) + if note: + _place_text_lines(st, tl.wrap(model._safe_str(note), + tl.chars_per_line(_USABLE_W, _FS_NOTE)), + _FS_NOTE, _MUTED, style="italic") + st.y += _GAP + + +def _resolve_figure(block): + fig = getattr(block, "fig", None) + if fig is not None: + return fig, False + make = getattr(block, "make", None) + if callable(make): + try: + return make(), True + except Exception: # noqa: BLE001 + return None, False + return None, False + + +def _png_from_figure(fig) -> bytes: + buf = io.BytesIO() + fig.savefig(buf, format="png", dpi=150, bbox_inches="tight") + buf.seek(0) + return buf.read() + + +def _place_image_array(st: _PdfState, arr, caption) -> None: + h_px, w_px = arr.shape[0], arr.shape[1] + aspect = (h_px / w_px) if w_px else 1.0 + max_h = _CONTENT_BOTTOM - _CONTENT_TOP + target_w = _USABLE_W + target_h = target_w * aspect + if target_h > max_h: + target_h = max_h + target_w = target_h / aspect if aspect else _USABLE_W + cap_h = tl.line_height_in(_FS_NOTE) + 0.04 if caption else 0.0 + # Move whole image to next page if it does not fit in remaining space. + if _remaining(st) < target_h + cap_h: + if (max_h) >= target_h + cap_h: + _new_page(st) + else: + # Taller than a full page even at min — already clamped to max_h. 
+ _new_page(st) + left_frac = _xf(_ML + (_USABLE_W - target_w) / 2.0) + bottom_frac = _yf(st.y + target_h) + ax = st.fig.add_axes([left_frac, bottom_frac, target_w / _W, target_h / _H]) + ax.imshow(arr) + ax.axis("off") + st.y += target_h + 0.04 + if caption: + _place_text_lines(st, tl.wrap(model._safe_str(caption), + tl.chars_per_line(_USABLE_W, _FS_NOTE)), + _FS_NOTE, _MUTED, style="italic") + st.y += _GAP + + +def _place_figure(st: _PdfState, block) -> None: + fig, owned = _resolve_figure(block) + if fig is None: + _place_text_lines(st, ["(figura no disponible)"], _FS_NOTE, _MUTED, + style="italic") + st.y += _GAP + return + try: + png = _png_from_figure(fig) + finally: + if owned: + try: + plt.close(fig) + except Exception: # noqa: BLE001 + pass + arr = mpimg.imread(io.BytesIO(png)) + _place_image_array(st, arr, getattr(block, "caption", None)) + + +def _place_image(st: _PdfState, block) -> None: + path = getattr(block, "path", "") + if not path or not os.path.exists(path): + _place_text_lines(st, [f"(imagen no encontrada: {path})"], _FS_NOTE, + _MUTED, style="italic") + st.y += _GAP + return + arr = mpimg.imread(path) + _place_image_array(st, arr, getattr(block, "caption", None)) + + +def _place_caption(st: _PdfState, block) -> None: + _place_text_lines(st, tl.wrap(getattr(block, "text", ""), + tl.chars_per_line(_USABLE_W, _FS_NOTE)), + _FS_NOTE, _MUTED, style="italic") + st.y += _GAP + + +def _place_note(st: _PdfState, block) -> None: + _place_text_lines(st, tl.wrap(getattr(block, "text", ""), + tl.chars_per_line(_USABLE_W, _FS_NOTE)), + _FS_NOTE, _MUTED, style="italic") + st.y += _GAP + + +_PLACERS = { + "heading": _place_heading, + "markdown": _place_markdown, + "kv_table": _place_kv_table, + "data_table": _place_data_table, + "figure": _place_figure, + "image": _place_image, + "caption": _place_caption, + "note": _place_note, +} + + +def render_pdf(chapters: list, out_path: str, meta: dict = None) -> dict: + """Render a list of Chapters into an 
A5-portrait, mobile-readable PDF. + + Never raises. Returns ``{path, n_pages, chapters, note}`` where ``chapters`` + is a list of ``{id, version, n_pages}`` for the manifest. On a fatal write + error ``path`` is None and ``note`` explains why. + """ + meta = meta or {} + chapters = model.as_chapters(chapters) + notes = [] + + try: + parent = os.path.dirname(os.path.abspath(out_path)) + os.makedirs(parent, exist_ok=True) + except OSError as e: + return {"path": None, "n_pages": 0, "chapters": [], + "note": f"no se pudo crear el directorio destino: {e}"} + + title = meta.get("title") or model.ENGINE_NAME + chapters_meta = [] + try: + with plt.rc_context(_RC): + with PdfPages(out_path) as pdf: + st = _PdfState(pdf, title) + for ch in chapters: + st.chapter = ch + st.chapter_pages = 0 + _new_page(st) # each chapter starts on a fresh page. + for block in ch.blocks: + placer = _PLACERS.get(getattr(block, "kind", ""), + _place_note) + try: + placer(st, block) + except Exception as e: # noqa: BLE001 + notes.append( + f"bloque '{getattr(block, 'kind', '?')}' del " + f"capítulo '{ch.id}' omitido: {e}") + chapters_meta.append({"id": ch.id, "version": ch.version, + "n_pages": st.chapter_pages}) + _flush_page(st) + if st.page == 0: + # No chapters at all → guarantee one valid page. 
+ st.chapter = model.Chapter(id="vacio", title=title, + version=model.ENGINE_VERSION) + _new_page(st) + _place_note(st, model.Note( + "(documento vacío — sin capítulos aplicables)")) + _flush_page(st) + n_pages = st.page + except Exception as e: # noqa: BLE001 + return {"path": None, "n_pages": 0, "chapters": [], + "note": f"fallo al escribir el PDF: {e}"} + + note = f"{n_pages} páginas" + if notes: + note += " · " + "; ".join(notes) + return {"path": out_path, "n_pages": n_pages, "chapters": chapters_meta, + "note": note} diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py new file mode 100644 index 00000000..5494d604 --- /dev/null +++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py @@ -0,0 +1,518 @@ +"""AutomaticEDA PPTX renderer — 16:9 slides, never cuts content. + +Same flow principle as the PDF renderer but onto PowerPoint slides: measure each +block and place it top-to-bottom; when it does not fit in the remaining slide +space, continue on a new slide titled `` (cont.)``. Data tables split by +rows **repeating the header**; figures/images are scaled to fit entirely. Every +slide carries a footer `` · v`` plus the engine version. + +dict-no-throw: a failure inside one block is caught and noted; the deck is always +produced with at least one slide. Engine: ``python-pptx`` (added dependency). +""" + +from __future__ import annotations + +import io +import os + +from . import model +from . import text_layout as tl + +try: + from pptx import Presentation + from pptx.util import Inches, Pt, Emu + from pptx.dml.color import RGBColor + from pptx.enum.text import PP_ALIGN + _PPTX_OK = True + _PPTX_ERR = "" +except Exception as _e: # noqa: BLE001 — surfaced as a dict-no-throw note. + _PPTX_OK = False + _PPTX_ERR = str(_e) + +# 16:9 widescreen, inches. 
+_W, _H = 13.333, 7.5 +_ML, _MR = 0.7, 0.7 +_TITLE_TOP, _TITLE_H = 0.28, 0.7 +_CONTENT_TOP = 1.12 +_FOOTER_H = 0.4 +_CONTENT_BOTTOM = _H - _FOOTER_H - 0.15 +_USABLE_W = _W - _ML - _MR + +_INK = (0x1B, 0x1B, 0x1B) +_ACCENT = (0x2A, 0x6F, 0x97) +_MUTED = (0x8A, 0x8A, 0x8A) +_HEAD_BG = (0xEE, 0xF3, 0xF6) +_WHITE = (0xFF, 0xFF, 0xFF) + +_FS_TITLE = 26 +_FS_H1, _FS_H2, _FS_H3 = 20, 16, 13 +_FS_BODY, _FS_CELL, _FS_NOTE = 14, 11, 11 +_GAP = 0.12 + + +class _PptxState: + def __init__(self, prs, title: str): + self.prs = prs + self.title = title + self.slide = None + self.y = _CONTENT_TOP + self.chapter = None + self.slide_no = 0 + self.chapter_slides = 0 + + +def _rgb(c): + return RGBColor(*c) + + +def _new_slide(st: _PptxState, cont: bool = False) -> None: + blank = st.prs.slide_layouts[6] + st.slide = st.prs.slides.add_slide(blank) + st.y = _CONTENT_TOP + st.slide_no += 1 + st.chapter_slides += 1 + _draw_title(st, cont) + _draw_footer(st) + + +def _draw_title(st: _PptxState, cont: bool) -> None: + ch = st.chapter + title = ch.title if ch is not None else st.title + if cont: + title = f"{title} (cont.)" + box = st.slide.shapes.add_textbox( + Inches(_ML), Inches(_TITLE_TOP), Inches(_USABLE_W), Inches(_TITLE_H)) + tf = box.text_frame + tf.word_wrap = True + p = tf.paragraphs[0] + run = p.add_run() + run.text = title + run.font.size = Pt(_FS_TITLE) + run.font.bold = True + run.font.color.rgb = _rgb(_INK) + + +def _draw_footer(st: _PptxState) -> None: + ch = st.chapter + left = f"{ch.title} · v{ch.version}" if ch is not None else "" + right = f"{model.ENGINE_NAME} v{model.ENGINE_VERSION} · {st.slide_no}" + box = st.slide.shapes.add_textbox( + Inches(_ML), Inches(_H - _FOOTER_H), Inches(_USABLE_W), + Inches(_FOOTER_H * 0.7)) + tf = box.text_frame + tf.word_wrap = False + p = tf.paragraphs[0] + r = p.add_run() + r.text = left + r.font.size = Pt(9) + r.font.color.rgb = _rgb(_MUTED) + # Right-aligned engine stamp on a second textbox. 
+ box2 = st.slide.shapes.add_textbox( + Inches(_ML), Inches(_H - _FOOTER_H), Inches(_USABLE_W), + Inches(_FOOTER_H * 0.7)) + tf2 = box2.text_frame + p2 = tf2.paragraphs[0] + p2.alignment = PP_ALIGN.RIGHT + r2 = p2.add_run() + r2.text = right + r2.font.size = Pt(9) + r2.font.color.rgb = _rgb(_MUTED) + + +def _remaining(st: _PptxState) -> float: + return _CONTENT_BOTTOM - st.y + + +def _ensure(st: _PptxState, height: float) -> None: + if _remaining(st) < height: + _new_slide(st, cont=True) + + +def _add_text(st: _PptxState, lines: list, fs: float, color, bold=False, + italic=False, indent=0.0, bullet=False) -> None: + lh = tl.line_height_in(fs) + height = lh * len(lines) + 0.05 + _ensure(st, height) + box = st.slide.shapes.add_textbox( + Inches(_ML + indent), Inches(st.y), Inches(_USABLE_W - indent), + Inches(height)) + tf = box.text_frame + tf.word_wrap = True + first = True + for ln in lines: + p = tf.paragraphs[0] if first else tf.add_paragraph() + first = False + run = p.add_run() + run.text = ("• " + ln) if bullet else ln + run.font.size = Pt(fs) + run.font.bold = bold + run.font.italic = italic + run.font.color.rgb = _rgb(color) + st.y += height + + +def _place_heading(st: _PptxState, block) -> None: + level = max(1, min(3, int(getattr(block, "level", 1) or 1))) + fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] + text = tl.strip_inline_md(getattr(block, "text", "")) + lines = tl.wrap(text, tl.chars_per_line(_USABLE_W, fs)) + _add_text(st, lines, fs, _INK, bold=True) + st.y += 0.04 + + +def _place_markdown(st: _PptxState, block) -> None: + raw = str(getattr(block, "text", "") or "") + md_lines = raw.split("\n") + i, n = 0, len(md_lines) + while i < n: + stripped = md_lines[i].strip() + if stripped.startswith("|") and stripped.endswith("|"): + j = i + tbl = [] + while j < n and md_lines[j].strip().startswith("|") \ + and md_lines[j].strip().endswith("|"): + tbl.append(md_lines[j]) + j += 1 + parsed = tl.parse_md_table(tbl) + if parsed: + header, rows = parsed + 
_place_data_table(st, model.DataTable(header=header, rows=rows)) + i = j + continue + if stripped == "": + st.y += tl.line_height_in(_FS_BODY) * 0.4 + i += 1 + continue + if stripped.startswith("### "): + _place_heading(st, model.Heading(stripped[4:], level=3)) + i += 1 + continue + if stripped.startswith("## "): + _place_heading(st, model.Heading(stripped[3:], level=2)) + i += 1 + continue + if stripped.startswith("# "): + _place_heading(st, model.Heading(stripped[2:], level=1)) + i += 1 + continue + if stripped.startswith("- ") or stripped.startswith("* "): + content = tl.strip_inline_md(stripped[2:]) + lines = tl.wrap(content, tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY)) + _add_text(st, lines, _FS_BODY, _INK, bullet=True) + i += 1 + continue + para = [tl.strip_inline_md(stripped)] + j = i + 1 + while j < n: + nxt = md_lines[j].strip() + if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")): + break + para.append(tl.strip_inline_md(nxt)) + j += 1 + text = " ".join(para) + _add_text(st, tl.wrap(text, tl.chars_per_line(_USABLE_W, _FS_BODY)), + _FS_BODY, _INK) + i = j + st.y += _GAP + + +def _place_kv_table(st: _PptxState, block) -> None: + title = getattr(block, "title", None) + if title: + _place_heading(st, model.Heading(title, level=2)) + rows = getattr(block, "rows", []) or [] + data_rows = [] + for row in rows: + try: + label, value = row[0], row[1] + except Exception: # noqa: BLE001 + label, value = str(row), "" + data_rows.append([model._safe_str(label), model._safe_str(value)]) + _place_data_table(st, model.DataTable(header=["Campo", "Valor"], + rows=data_rows), shaded_header=True, + key_value=True) + + +def _col_widths(header, rows): + ncol = len(header) if header else (len(rows[0]) if rows else 1) + ncol = max(1, ncol) + natural = [3] * ncol + for c in range(ncol): + if header and c < len(header): + natural[c] = max(natural[c], len(model._safe_str(header[c]))) + for r in rows: + if c < len(r): + natural[c] = max(natural[c], len(model._safe_str(r[c]))) + 
clamped = [min(max(w, 4), 44) for w in natural] + total = float(sum(clamped)) or 1.0 + return [_USABLE_W * w / total for w in clamped] + + +def _row_height_in(cells, widths, fs) -> float: + lh = tl.line_height_in(fs) + maxlines = 1 + for c, w in enumerate(widths): + text = model._safe_str(cells[c]) if c < len(cells) else "" + lines = tl.wrap(text, tl.chars_per_line(w - 0.12, fs)) + maxlines = max(maxlines, len(lines)) + return lh * maxlines + 0.10 + + +def _emit_table(st: _PptxState, header, chunk, widths, fs) -> None: + nrows = len(chunk) + (1 if header else 0) + ncol = len(widths) + # Pre-measure total height to size the shape (pptx still auto-grows rows). + heights = [] + if header: + heights.append(_row_height_in(header, widths, fs)) + for r in chunk: + heights.append(_row_height_in(r, widths, fs)) + total_h = sum(heights) + gtable = st.slide.shapes.add_table( + nrows, ncol, Inches(_ML), Inches(st.y), Inches(_USABLE_W), + Inches(total_h)).table + gtable.first_row = bool(header) + gtable.horz_banding = False + for c in range(ncol): + gtable.columns[c].width = Emu(int(Inches(widths[c]))) + ridx = 0 + if header: + for c in range(ncol): + cell = gtable.cell(0, c) + cell.text = model._safe_str(header[c]) if c < len(header) else "" + _style_cell(cell, fs, _INK, bold=True, fill=_HEAD_BG) + ridx = 1 + for r in chunk: + for c in range(ncol): + cell = gtable.cell(ridx, c) + cell.text = model._safe_str(r[c]) if c < len(r) else "" + _style_cell(cell, fs, _INK, bold=False, fill=_WHITE) + ridx += 1 + st.y += total_h + _GAP + + +def _style_cell(cell, fs, color, bold, fill) -> None: + cell.fill.solid() + cell.fill.fore_color.rgb = _rgb(fill) + cell.margin_left = Inches(0.05) + cell.margin_right = Inches(0.05) + cell.margin_top = Inches(0.02) + cell.margin_bottom = Inches(0.02) + for p in cell.text_frame.paragraphs: + for run in p.runs: + run.font.size = Pt(fs) + run.font.bold = bold + run.font.color.rgb = _rgb(color) + + +def _place_data_table(st: _PptxState, block, 
def _img_size_px(data: bytes):
    """Return the pixel size ``(width, height)`` of an encoded image.

    Pillow is imported lazily. If it is unavailable or the bytes cannot be
    decoded, the fallback ``(1200, 800)`` is returned so callers always get a
    usable size.
    """
    try:
        from PIL import Image
        with Image.open(io.BytesIO(data)) as im:
            return im.size  # (w, h)
    except Exception:  # noqa: BLE001
        return (1200, 800)


def _resolve_png(block):
    """Rasterise a figure block to PNG bytes; return None when not possible.

    ``block.fig`` is used when present, otherwise ``block.make()`` is called
    lazily. Only a figure created here through ``make`` is closed afterwards;
    a figure handed in through ``fig`` is left open.
    """
    fig = getattr(block, "fig", None)
    owned = False
    if fig is None:
        make = getattr(block, "make", None)
        if callable(make):
            try:
                fig = make()
                owned = True
            except Exception:  # noqa: BLE001
                fig = None
    if fig is None:
        return None
    try:
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
        return buf.getvalue()
    except Exception:  # noqa: BLE001
        return None
    finally:
        if owned:
            # Best effort: release the figure we created ourselves.
            try:
                import matplotlib.pyplot as plt
                plt.close(fig)
            except Exception:  # noqa: BLE001
                pass


def _place_picture_bytes(st: _PptxState, data: bytes, caption) -> None:
    """Insert an encoded image, scaled to fit, followed by its caption.

    The picture keeps its aspect ratio and is centred horizontally. The space
    needed by the gap below the picture and by *every* wrapped caption line is
    reserved before scaling, so picture and caption fit on one slide together.

    NOTE(review): assumes a slide opened by ``_new_slide`` offers
    ``_CONTENT_BOTTOM - _CONTENT_TOP`` of vertical room — confirm.
    """
    w_px, h_px = _img_size_px(data)
    # A zero dimension would otherwise produce a zero-height picture.
    aspect = (h_px / w_px) if (w_px and h_px) else 0.66

    cap_lines = []
    if caption:
        cap_lines = tl.wrap(model._safe_str(caption),
                            tl.chars_per_line(_USABLE_W, _FS_NOTE))
    pic_gap = 0.05
    reserved = pic_gap + len(cap_lines) * tl.line_height_in(_FS_NOTE)

    slide_h = _CONTENT_BOTTOM - _CONTENT_TOP
    max_h = slide_h - reserved
    if max_h <= 0:
        # Degenerate case (caption alone taller than a slide): fall back to
        # the full content height rather than a non-positive picture height.
        max_h = slide_h

    target_w = _USABLE_W
    target_h = target_w * aspect
    if target_h > max_h:
        target_h = max_h
        target_w = target_h / aspect
    if _remaining(st) < target_h + reserved:
        _new_slide(st, cont=True)

    left = _ML + (_USABLE_W - target_w) / 2.0
    st.slide.shapes.add_picture(io.BytesIO(data), Inches(left), Inches(st.y),
                                width=Inches(target_w),
                                height=Inches(target_h))
    st.y += target_h + pic_gap
    if cap_lines:
        _add_text(st, cap_lines, _FS_NOTE, _MUTED, italic=True)
    st.y += _GAP


def _place_figure(st: _PptxState, block) -> None:
    """Place a ``figure`` block, or a muted placeholder if it cannot render."""
    png = _resolve_png(block)
    if png is None:
        _add_text(st, ["(figura no disponible)"], _FS_NOTE, _MUTED, italic=True)
        st.y += _GAP
        return
    _place_picture_bytes(st, png, getattr(block, "caption", None))


def _place_image(st: _PptxState, block) -> None:
    """Place an ``image`` block read from ``block.path``.

    A missing or unreadable file is reported with a muted placeholder line
    instead of raising.
    """
    path = getattr(block, "path", "")
    if not path or not os.path.exists(path):
        _add_text(st, [f"(imagen no encontrada: {path})"], _FS_NOTE, _MUTED,
                  italic=True)
        st.y += _GAP
        return
    try:
        with open(path, "rb") as fh:
            data = fh.read()
    except Exception as e:  # noqa: BLE001
        _add_text(st, [f"(no se pudo leer la imagen: {e})"], _FS_NOTE, _MUTED,
                  italic=True)
        st.y += _GAP
        return
    _place_picture_bytes(st, data, getattr(block, "caption", None))


def _place_caption(st: _PptxState, block) -> None:
    """Place ``block.text`` as muted italic note-sized text."""
    _add_text(st, tl.wrap(getattr(block, "text", ""),
                          tl.chars_per_line(_USABLE_W, _FS_NOTE)),
              _FS_NOTE, _MUTED, italic=True)
    st.y += _GAP


def _place_note(st: _PptxState, block) -> None:
    """Notes are rendered exactly like captions."""
    _place_caption(st, block)


# Dispatch table: block ``kind`` -> placer function.
_PLACERS = {
    "heading": _place_heading,
    "markdown": _place_markdown,
    "kv_table": _place_kv_table,
    "data_table": _place_data_table,
    "figure": _place_figure,
    "image": _place_image,
    "caption": _place_caption,
    "note": _place_note,
}


def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict:
    """Render a list of Chapters into a 16:9 PPTX deck. Never raises.

    Returns ``{path, n_slides, chapters, note}`` where ``chapters`` is a list of
    ``{id, version, n_slides}`` for the manifest. On a fatal error ``path`` is
    None and ``note`` explains why (e.g. python-pptx not installed).

    Args:
        chapters: chapters (normalised with ``model.as_chapters``).
        out_path: destination file; parent directories are created.
        meta: optional dict; only ``title`` is read here. Anything that is not
            a dict is treated as empty.
    """
    meta = meta if isinstance(meta, dict) else {}
    if not _PPTX_OK:
        return {"path": None, "n_slides": 0, "chapters": [],
                "note": f"python-pptx no disponible: {_PPTX_ERR}"}

    chapters = model.as_chapters(chapters)
    notes = []
    try:
        parent = os.path.dirname(os.path.abspath(out_path))
        os.makedirs(parent, exist_ok=True)
    except (OSError, TypeError, ValueError) as e:
        # TypeError: out_path is None / not path-like; ValueError: embedded
        # NUL byte. Both must degrade to a result dict, like OSError does.
        return {"path": None, "n_slides": 0, "chapters": [],
                "note": f"no se pudo crear el directorio destino: {e}"}

    title = meta.get("title") or model.ENGINE_NAME
    chapters_meta = []
    try:
        prs = Presentation()
        prs.slide_width = Inches(_W)
        prs.slide_height = Inches(_H)
        st = _PptxState(prs, title)
        for ch in chapters:
            st.chapter = ch
            st.chapter_slides = 0
            _new_slide(st, cont=False)
            for block in ch.blocks:
                try:
                    # Lookup is inside the guard: an unhashable ``kind`` must
                    # skip this block, not abort the whole deck.
                    placer = _PLACERS.get(getattr(block, "kind", ""),
                                          _place_note)
                    placer(st, block)
                except Exception as e:  # noqa: BLE001
                    notes.append(
                        f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
                        f"'{ch.id}' omitido: {e}")
            chapters_meta.append({"id": ch.id, "version": ch.version,
                                  "n_slides": st.chapter_slides})
        if st.slide_no == 0:
            # No chapter produced a slide: emit a single placeholder slide so
            # the deck is still a valid file.
            st.chapter = model.Chapter(id="vacio", title=title,
                                       version=model.ENGINE_VERSION)
            _new_slide(st, cont=False)
            _place_note(st, model.Note(
                "(documento vacío — sin capítulos aplicables)"))
        prs.save(out_path)
        n_slides = st.slide_no
    except Exception as e:  # noqa: BLE001
        return {"path": None, "n_slides": 0, "chapters": [],
                "note": f"fallo al escribir el PPTX: {e}"}

    note = f"{n_slides} slides"
    if notes:
        note += " · " + "; ".join(notes)
    return {"path": out_path, "n_slides": n_slides, "chapters": chapters_meta,
            "note": note}
def line_height_in(fontsize_pt: float, leading: float = 1.32) -> float:
    """Line height in inches for a given font size and leading."""
    return leading * fontsize_pt / 72.0


def chars_per_line(width_in: float, fontsize_pt: float) -> int:
    """How many average glyphs fit in ``width_in`` at ``fontsize_pt``.

    Always returns at least 1; falls back to 80 when the estimated glyph width
    is not positive.
    """
    cw = avg_char_width_in(fontsize_pt)
    if cw <= 0:
        return 80
    return max(1, int(width_in / cw))


def wrap(text: str, max_chars: int) -> list:
    """Word-wrap ``text`` to lines of at most ``max_chars`` characters.

    Long tokens (no spaces) are hard-split so they cannot overflow. Existing
    newlines are honored as hard breaks. Empty or None input yields a single
    empty line so callers can still reserve a row. Whitespace is normalised by
    ``textwrap`` (tabs become spaces, whitespace at line edges is dropped);
    no non-whitespace character is lost.
    """
    width = max(1, max_chars)
    s = "" if text is None else str(text)
    out: list = []
    for raw_line in s.split("\n"):
        # break_long_words so no single token overflows the column.
        wrapped = textwrap.wrap(
            raw_line, width=width, break_long_words=True,
            break_on_hyphens=False, replace_whitespace=True,
            drop_whitespace=True,
        ) if raw_line else []
        # A blank (or whitespace-only) source line still occupies one row.
        out.extend(wrapped or [""])
    return out or [""]


def strip_inline_md(text: str) -> str:
    """Strip a tiny subset of inline markdown markers, keeping the text.

    Removes every occurrence of ``**``, ``__`` and the backtick. Single
    asterisks (``*em*``) are NOT touched. The removal is purely textual, so a
    literal double underscore (e.g. in ``__init__``) is removed as well.
    """
    if not text:
        return ""
    s = str(text)
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return s


def parse_md_table(lines: list):
    """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.

    Accepts an optional separator row (``|---|---|``) right after the header,
    which is ignored. Returns None if any line is not pipe-delimited on both
    ends, or if ``lines`` is empty. Empty cells are preserved, including the
    first and last cell of a row.
    """
    cells_rows = []
    for ln in lines:
        s = ln.strip()
        if not (s.startswith("|") and s.endswith("|")):
            return None
        # Drop exactly ONE delimiter pipe per side. ``str.strip("|")`` would
        # eat every leading/trailing pipe and lose empty edge cells.
        cells_rows.append([c.strip() for c in s[1:-1].split("|")])
    if not cells_rows:
        return None
    header = cells_rows[0]
    body = cells_rows[1:]
    # Drop a markdown separator row (all cells are dashes/colons).
    if body and all(set(c) <= set("-: ") and "-" in c for c in body[0]):
        body = body[1:]
    return header, body
+tags: [eda, pdf, render, report, mobile, automatic-eda, chapters, versioned, no-cut, pagination, matplotlib, datascience, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [os, matplotlib, "datascience.automatic_eda"] +params: + - name: chapters_or_profile + desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Un capítulo es {id,title,version,blocks}; un bloque es uno de: heading, markdown, kv_table, data_table, figure, image, caption, note. Lectura defensiva: cualquier cosa no reconocida se degrada a Note, nunca lanza." + - name: out_path + desc: "ruta del archivo PDF de salida. Los directorios padre se crean si faltan. Si está en un directorio no escribible (p.ej. /proc/...) devuelve {path:None, note:} sin lanzar." + - name: meta + desc: "dict opcional. Claves: title (título de portada/pie), ctx (contexto de presentación pasado a los builders de capítulo cuando se da un profile: dataset_name, source_origin, storage, generated_at, description, granularity, quality_criteria, head_rows...), manifest_path (override; por defecto automatic_eda_manifest.json junto a out_path), write_manifest (False para no escribirlo), generated_at." +output: "dict (nunca lanza): {path: str|None, n_pages: int, chapters: list[{id,version,n_pages}], manifest_path: str|None, note: str}. En éxito path es la ruta escrita, n_pages el total de páginas, chapters el desglose por capítulo para el manifiesto. En error fatal path es None y note explica la causa." 
+tested: true +tests: ["test_golden_profile_genera_pdf_portada_y_overview", "test_edge_tabla_larga_parte_repitiendo_cabecera", "test_edge_celda_larga_no_se_corta", "test_no_corta_texto_markdown", "test_edge_profile_none_y_vacio_un_pagina", "test_error_path_directorio_no_escribible_no_revienta"] +test_file_path: "python/functions/datascience/render_automatic_eda_pdf_test.py" +file_path: "python/functions/datascience/render_automatic_eda_pdf.py" +--- + +## Ejemplo + +```python +from datascience import render_automatic_eda_pdf + +# Caso 1: directamente desde un TableProfile del grupo eda. +# profile = profile_table(db, "ventas", backend="duckdb")["profile"] +profile = { + "table": "ventas", "source": "/data/ventas.csv", + "n_rows": 1000, "n_cols": 2, "quality_score": 92.5, + "columns": [ + {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01, + "null_count": 10, + "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0, + "std": 12.3}}, + {"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0, + "categorical": {"top": [{"value": "neumaticos", "count": 500}, + {"value": "aceite", "count": 300}]}}, + ], +} +res = render_automatic_eda_pdf( + profile, "reports/ventas_aeda.pdf", + {"title": "EDA — ventas", + "ctx": {"dataset_name": "Ventas", "source_origin": "ERP export", + "description": "Líneas de venta del ERP.", + "granularity": "Cada fila es una línea de venta."}}) +print(res["n_pages"], res["chapters"], res["manifest_path"]) +# -> 3 [{'id':'portada','version':'1.0.0','n_pages':1}, +# {'id':'overview','version':'1.0.0','n_pages':2}] reports/automatic_eda_manifest.json + +# Caso 2: desde capítulos construidos a mano (modelo de bloques). 
+from datascience.automatic_eda.model import Chapter, Heading, DataTable +ch = Chapter(id="resumen", title="Resumen", version="1.0.0", blocks=[ + Heading("Tabla", 1), + DataTable(header=["col", "valor"], rows=[["a", "1"], ["b", "2"]]), +]) +render_automatic_eda_pdf([ch], "reports/manual.pdf") +``` + +## Cuando usarla + +Cuando quieras el **PDF móvil del nuevo motor AutomaticEDA por capítulos** (portada ++ overview + los capítulos que existan): después de `profile_table(...)`, pásale el +`profile` y obtienes un PDF A5 retrato versionado por capítulo, con manifiesto. Úsala +como capa de presentación PDF del grupo `eda` cuando necesites **garantía de no-corte** +(texto, tablas e imágenes nunca recortados) y **versionado por capítulo** para mejora +continua. Es el sucesor evolutivo de `render_eda_pdf` (aditivo: hoy no lo reemplaza): comparte estética Tufte/móvil +pero separa contenido (capítulos/bloques) de formato (renderer), de modo que el mismo +documento se emite también como PPTX (`render_automatic_eda_pptx`). Para añadir un +capítulo nuevo, ver `docs/capabilities/automatic_eda.md`. + +## Gotchas + +- **Impura**: escribe el PDF en `out_path` (crea los directorios padre) y, salvo + `meta['write_manifest']=False`, un `automatic_eda_manifest.json` junto a la salida. + Backend headless `Agg` de matplotlib (corre en agentes/CI sin display). +- **Nunca lanza** (dict-no-throw): un bloque o capítulo que falle se omite y se anota + en `note`; el PDF se genera igual. Un profile `None`/`{}` produce un PDF de 1 página + válido. `out_path` no escribible → `{path: None, note: 'causa del fallo'}`. +- **No corta nada**: el paginador mide cada bloque con una rejilla de caracteres + (sobre-estima ligeramente, nunca afirma que algo cabe cuando se desbordaría). 
El + texto se envuelve a líneas completas (sin cortar a media palabra), las tablas largas + se parten por filas **repitiendo la cabecera**, las celdas con texto largo se + envuelven dentro de su columna (la fila crece), y figuras/imágenes se escalan para + caber enteras (nunca se recortan). +- **Tablas muy anchas**: con muchas columnas (>10) cada columna se estrecha y su texto + se envuelve en varias líneas (sigue sin perderse). El reparto por columnas-en-grupos + para tablas muy anchas es una mejora pendiente (ver capability page). +- **head_rows / examples**: el capítulo Overview muestra `df.head` desde + `ctx['head_rows']`/`profile['head_rows']` y ejemplos no-nulos desde + `columns[i]['examples']`; si el profile no los trae (hoy no los trae), degrada con un + placeholder honesto y deriva los ejemplos de los valores reales del perfil (top + categóricos, min/median/max numéricos). Documentado en el contrato. +- **Registro en el package**: el `## Ejemplo` usa `from datascience import + render_automatic_eda_pdf` (añadido al `__init__.py`); el test importa el módulo + directo para no depender de ese registro. +- **Fechas en UI europeas**: la portada formatea la fecha como `DD/MM/AAAA HH:mm`. diff --git a/python/functions/datascience/render_automatic_eda_pdf.py b/python/functions/datascience/render_automatic_eda_pdf.py new file mode 100644 index 00000000..67ab10e0 --- /dev/null +++ b/python/functions/datascience/render_automatic_eda_pdf.py @@ -0,0 +1,83 @@ +"""render_automatic_eda_pdf — chapter-based EDA report as an A5-portrait PDF. + +Public ``eda``-group entry point of the AutomaticEDA engine. 
def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
    """Accept chapters OR an eda profile and return a list of Chapter.

    * list/tuple -> normalised with ``as_chapters``;
    * dict with ``blocks`` and without ``columns`` -> a single chapter;
    * any other dict (including ``{}``) -> treated as an eda TableProfile and
      expanded with ``build_document(profile, meta['ctx'])``;
    * anything else (e.g. None) -> no chapters.
    """
    arg = chapters_or_profile
    if isinstance(arg, (list, tuple)):
        return as_chapters(list(arg))
    if isinstance(arg, dict):
        # A single chapter dict has 'blocks'; a profile has columns/table/rows.
        if "blocks" in arg and "columns" not in arg:
            ch = as_chapter(arg)
            return [ch] if ch is not None else []
        # Treat as an eda TableProfile.
        return build_document(arg, (meta or {}).get("ctx"))
    return []


def render_automatic_eda_pdf(chapters_or_profile, out_path: str,
                             meta: dict = None) -> dict:
    """Render an AutomaticEDA document into a mobile-readable PDF.

    Args:
        chapters_or_profile: either a list of chapters (``Chapter`` dataclasses
            or dicts following the document model) or an ``eda`` TableProfile
            dict — in the latter case the canonical chapters are built via
            ``build_document(profile, meta['ctx'])``.
        out_path: filesystem path for the PDF (parent dirs are created).
        meta: optional dict. Recognised keys: ``title`` (cover/footer title),
            ``ctx`` (presentation context passed to chapter builders when a
            profile is given), ``manifest_path`` (override; defaults to
            ``automatic_eda_manifest.json`` beside ``out_path``),
            ``write_manifest`` (set False to skip), ``generated_at``.
            A non-dict value is treated as empty.

    Returns:
        dict: ``{path, n_pages, chapters, manifest_path, note}``.
        ``manifest_path`` is None when the manifest was skipped, the render
        failed, or the manifest could not be written (the reason is appended
        to ``note``).
    """
    meta = dict(meta) if isinstance(meta, dict) else {}
    chapters = _coerce_chapters(chapters_or_profile, meta)
    result = render_pdf(chapters, out_path, meta)

    manifest_path = None
    if meta.get("write_manifest", True) and result.get("path"):
        manifest_path = meta.get("manifest_path") or os.path.join(
            os.path.dirname(os.path.abspath(out_path)),
            "automatic_eda_manifest.json")
        generated_at = meta.get("generated_at") or _now_iso()
        try:
            merge_manifest(manifest_path, "pdf", result.get("chapters") or [],
                           generated_at)
        except Exception as e:  # noqa: BLE001
            # The PDF itself was written; a manifest failure (e.g. an
            # unwritable ``manifest_path`` override) must not break the
            # dict-no-throw contract. NOTE(review): merge_manifest may already
            # be no-throw — confirm in automatic_eda.model.
            manifest_path = None
            previous = result.get("note") or ""
            result["note"] = (f"{previous} · " if previous else "") + (
                f"manifiesto no escrito: {e}")
    result["manifest_path"] = manifest_path
    return result


def _now_iso() -> str:
    """Current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``.

    Despite the name, this is a human-readable stamp, not strict ISO 8601.
    """
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
def _profile() -> dict:
    """Synthetic three-column TableProfile (two numeric, one categorical)."""
    id_col = {
        "name": "id", "inferred_type": "numeric", "null_pct": 0.0,
        "null_count": 0,
        "numeric": {"mean": 500.0, "median": 500.0, "min": 1.0,
                    "max": 1000.0, "std": 288.7},
    }
    price_col = {
        "name": "precio", "inferred_type": "numeric", "null_pct": 0.01,
        "null_count": 10,
        "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
                    "max": 100.0, "std": 12.3},
    }
    category_col = {
        "name": "categoria", "inferred_type": "categorical",
        "null_pct": 0.0, "null_count": 0,
        "categorical": {"top": [{"value": "neumaticos", "count": 500},
                                {"value": "aceite", "count": 300}]},
    }
    return {
        "table": "ventas",
        "source": "/data/ventas.csv",
        "profiled_at": "2026-06-30T10:00:00+00:00",
        "n_rows": 1000,
        "n_cols": 3,
        "quality_score": 92.5,
        "key_candidates": ["id"],
        "type_breakdown": {"numeric": 2, "categorical": 1},
        "columns": [id_col, price_col, category_col],
    }


def _pdf_text(path: str) -> str:
    """Extracted text of every page, joined, with whitespace runs collapsed."""
    pieces = [page.extract_text() or "" for page in PdfReader(path).pages]
    return re.sub(r"\s+", " ", "".join(pieces))


def test_golden_profile_genera_pdf_portada_y_overview():
    """Golden path: a profile yields cover + overview pages and a manifest."""
    with tempfile.TemporaryDirectory() as tmp:
        target = os.path.join(tmp, "eda.pdf")
        result = render_automatic_eda_pdf(_profile(), target,
                                          {"title": "EDA — ventas"})
        assert result["path"] == target
        assert os.path.exists(target)
        assert result["n_pages"] >= 2  # portada + overview (1+ each).
        chapter_ids = [entry["id"] for entry in result["chapters"]]
        assert "portada" in chapter_ids and "overview" in chapter_ids
        # Manifest written next to the output with both chapters versioned.
        assert result["manifest_path"] and os.path.exists(result["manifest_path"])
        text = _pdf_text(target)
        # Cover fields ("CSV" is the storage inferred from the .csv source).
        for token in ("Automatic-EDA", "CSV", "Calidad", "92.5", "Fuente"):
            assert token in text
        # Overview content: column dictionary + describe.
        for token in ("precio", "categoria", "median"):
            assert token in text


def test_edge_tabla_larga_parte_repitiendo_cabecera():
    """A 60-row table spills over several pages, header repeated on each."""
    # Headers are wide enough not to wrap.
    header = ["ALPHA", "BETA", "GAMMA", "DELTA", "EPSILON", "ZETA"]
    rows = []
    for r in range(60):
        rows.append([f"r{r}c{c}" for c in range(6)])
    chapter = Chapter(id="edge", title="Edge", version="1.0.0",
                      blocks=[Heading("Tabla", 1),
                              DataTable(header=header, rows=rows)])
    with tempfile.TemporaryDirectory() as tmp:
        target = os.path.join(tmp, "edge.pdf")
        result = render_automatic_eda_pdf([chapter], target,
                                          {"write_manifest": False})
        assert result["path"] == target
        pages = PdfReader(target).pages
        total = len(pages)
        assert total > 1  # table spilled to several pages.
        with_header = [pg for pg in pages
                       if "ALPHA" in (pg.extract_text() or "")]
        assert len(with_header) == total  # header repeated on every page.


def test_edge_celda_larga_no_se_corta():
    """A ~150-char cell wraps inside its column; none of its words is lost."""
    long_cell = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do "
                 "eiusmod tempor incididunt ut labore et dolore magna aliqua "
                 "reprehenderit voluptate")
    table = DataTable(header=["clave", "descripcion"],
                      rows=[["k1", long_cell], ["k2", "corto"]])
    chapter = Chapter(id="edge2", title="Edge2", version="1.0.0",
                      blocks=[table])
    with tempfile.TemporaryDirectory() as tmp:
        target = os.path.join(tmp, "edge2.pdf")
        render_automatic_eda_pdf([chapter], target, {"write_manifest": False})
        text = _pdf_text(target)
        # Every sampled word of the long cell present (wrapped, not truncated).
        expected = ("Lorem", "incididunt", "reprehenderit", "voluptate")
        assert all(word in text for word in expected)


def test_no_corta_texto_markdown():
    """A 120-word markdown paragraph keeps its first, middle and last word."""
    words = [f"palabra{i}" for i in range(120)]
    chapter = Chapter(id="md", title="MD", version="1.0.0",
                      blocks=[Markdown(text=" ".join(words))])
    with tempfile.TemporaryDirectory() as tmp:
        target = os.path.join(tmp, "md.pdf")
        render_automatic_eda_pdf([chapter], target, {"write_manifest": False})
        text = _pdf_text(target)
        for index in (0, 60, 119):  # first, middle, last words all present.
            assert words[index] in text


def test_edge_profile_none_y_vacio_un_pagina():
    """``None`` and ``{}`` both render a valid one-page PDF."""
    cases = ((None, "none"), ({}, "empty"))
    with tempfile.TemporaryDirectory() as tmp:
        for document, stem in cases:
            target = os.path.join(tmp, f"{stem}.pdf")
            result = render_automatic_eda_pdf(document, target,
                                              {"write_manifest": False})
            assert result["path"] == target
            assert os.path.exists(target)
            assert result["n_pages"] == 1


def test_error_path_directorio_no_escribible_no_revienta():
    """An unwritable destination returns ``path: None`` instead of raising."""
    result = render_automatic_eda_pdf(_profile(), "/proc/nope/x.pdf",
                                      {"write_manifest": False})
    assert result["path"] is None
    assert result["n_pages"] == 0
    assert result["note"]
+tags: [eda, pptx, render, report, share, automatic-eda, chapters, versioned, no-cut, slides, python-pptx, datascience, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [os, "python-pptx", "datascience.automatic_eda"] +params: + - name: chapters_or_profile + desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza." + - name: out_path + desc: "ruta del archivo PPTX de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path:None, note:} sin lanzar." + - name: meta + desc: "dict opcional. Claves: title (título), ctx (contexto de presentación para los builders de capítulo cuando se da un profile), manifest_path (override; por defecto automatic_eda_manifest.json junto a out_path), write_manifest (False para no escribirlo), generated_at." +output: "dict (nunca lanza): {path: str|None, n_slides: int, chapters: list[{id,version,n_slides}], manifest_path: str|None, note: str}. En error fatal (incluida python-pptx no instalada) path es None y note explica la causa." +tested: true +tests: ["test_golden_profile_genera_pptx_portada_y_overview", "test_edge_tabla_larga_parte_repitiendo_cabecera_sin_cortar", "test_edge_profile_none_y_vacio_un_slide", "test_error_path_directorio_no_escribible_no_revienta"] +test_file_path: "python/functions/datascience/render_automatic_eda_pptx_test.py" +file_path: "python/functions/datascience/render_automatic_eda_pptx.py" +--- + +## Ejemplo + +```python +from datascience import render_automatic_eda_pptx + +# Desde un TableProfile del grupo eda (mismo modelo que el renderer PDF). 
+profile = { + "table": "ventas", "source": "/data/ventas.csv", + "n_rows": 1000, "n_cols": 2, "quality_score": 92.5, + "columns": [ + {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01, + "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0, + "std": 12.3}}, + {"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0, + "categorical": {"top": [{"value": "neumaticos", "count": 500}]}}, + ], +} +res = render_automatic_eda_pptx( + profile, "reports/ventas_aeda.pptx", + {"title": "EDA — ventas", + "ctx": {"dataset_name": "Ventas", "source_origin": "ERP export"}}) +print(res["n_slides"], res["chapters"], res["manifest_path"]) +# -> 3 [{'id':'portada','version':'1.0.0','n_slides':1}, +# {'id':'overview','version':'1.0.0','n_slides':2}] reports/automatic_eda_manifest.json +``` + +## Cuando usarla + +Cuando quieras **compartir el EDA como una presentación** (no para móvil sino para +enseñar a alguien): mismo documento por capítulos que el PDF, emitido como PPTX 16:9. +Úsala junto a `render_automatic_eda_pdf` para que cada EDA tenga sus dos salidas (PDF +móvil + PPTX para compartir) desde el mismo modelo de capítulos. Garantiza no-corte: +ningún texto, tabla ni imagen se recorta — lo que no cabe en una slide continúa en otra +`(cont.)` con la cabecera repetida en las tablas. Para añadir capítulos nuevos al +documento, ver `docs/capabilities/automatic_eda.md`. + +## Gotchas + +- **Impura**: escribe el PPTX en `out_path` y, salvo `meta['write_manifest']=False`, el + manifiesto `automatic_eda_manifest.json` junto a la salida. +- **Dependencia python-pptx**: declarada en `python/pyproject.toml` + (`python-pptx>=1.0.2`). Si no está instalada, devuelve `{path: None, note: + 'python-pptx no disponible: ...'}` sin lanzar. Instalar: + `uv pip install --python python/.venv/bin/python3 python-pptx`. +- **Nunca lanza** (dict-no-throw): un bloque que falle se omite y se anota en `note`; el + deck se genera igual. 
Un profile `None`/`{}` produce un deck de 1 slide válido. +- **No corta nada**: cada bloque se mide; si no cabe en la slide actual, abre una slide + `(cont.)`. Las tablas largas se parten por filas **repitiendo la cabecera** (las filas + restantes pasan a la siguiente slide). Las figuras matplotlib se exportan a PNG en + memoria y se insertan escaladas para caber enteras (nunca recortadas). +- **Figuras**: un bloque `figure` puede traer una `matplotlib.figure.Figure` ya + construida o un callable `make` (se construye perezosamente). Solo la figura construida vía `make` se cierra tras + rasterizar; una `Figure` ya construida no se cierra (es del llamador). Las imágenes (`image`) por ruta se escalan manteniendo el aspecto. +- **Tablas anchas**: con muchas columnas el ancho por columna se reduce y el texto se + envuelve dentro de la celda (sigue sin perderse). El reparto por grupos de columnas + para tablas muy anchas es mejora pendiente. diff --git a/python/functions/datascience/render_automatic_eda_pptx.py b/python/functions/datascience/render_automatic_eda_pptx.py new file mode 100644 index 00000000..9ad36d4f --- /dev/null +++ b/python/functions/datascience/render_automatic_eda_pptx.py @@ -0,0 +1,76 @@ +"""render_automatic_eda_pptx — chapter-based EDA report as a 16:9 PPTX deck. + +Public ``eda``-group entry point that renders an AutomaticEDA document (a list +of chapters, or an ``eda`` TableProfile from which the canonical chapters are +built) into a PowerPoint deck for sharing. Same anti-cut principle as the PDF +renderer: every block is measured and, when it does not fit, continues on a new +slide titled ``{title} (cont.)``; data tables split by rows repeating the +header; matplotlib figures are exported to PNG and inserted scaled to fit +entirely. Each slide is stamped ``{chapter} · v{version}`` and a per-chapter +manifest (``automatic_eda_manifest.json``) is written next to the output. + +dict-no-throw: never raises. Returns ``{path, n_slides, chapters, +manifest_path, note}``; on a fatal error ``path`` is None and ``note`` explains +why (e.g. python-pptx not installed). 
def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
    """Accept chapters OR an eda profile and return a list of Chapter.

    * list/tuple -> normalised with ``as_chapters``;
    * dict with ``blocks`` and without ``columns`` -> a single chapter;
    * any other dict (including ``{}``) -> treated as an eda TableProfile and
      expanded with ``build_document(profile, meta['ctx'])``;
    * anything else (e.g. None) -> no chapters.
    """
    arg = chapters_or_profile
    if isinstance(arg, (list, tuple)):
        return as_chapters(list(arg))
    if isinstance(arg, dict):
        if "blocks" in arg and "columns" not in arg:
            ch = as_chapter(arg)
            return [ch] if ch is not None else []
        return build_document(arg, (meta or {}).get("ctx"))
    return []


def render_automatic_eda_pptx(chapters_or_profile, out_path: str,
                              meta: dict = None) -> dict:
    """Render an AutomaticEDA document into a shareable PPTX deck.

    Args:
        chapters_or_profile: a list of chapters (``Chapter`` dataclasses or
            dicts) or an ``eda`` TableProfile dict (chapters built via
            ``build_document(profile, meta['ctx'])``).
        out_path: filesystem path for the PPTX (parent dirs are created).
        meta: optional dict. Recognised keys: ``title``, ``ctx``,
            ``manifest_path`` (defaults to ``automatic_eda_manifest.json`` beside
            ``out_path``), ``write_manifest`` (False to skip), ``generated_at``.
            A non-dict value is treated as empty.

    Returns:
        dict: ``{path, n_slides, chapters, manifest_path, note}``.
        ``manifest_path`` is None when the manifest was skipped, the render
        failed, or the manifest could not be written (the reason is appended
        to ``note``).
    """
    meta = dict(meta) if isinstance(meta, dict) else {}
    chapters = _coerce_chapters(chapters_or_profile, meta)
    result = render_pptx(chapters, out_path, meta)

    manifest_path = None
    if meta.get("write_manifest", True) and result.get("path"):
        manifest_path = meta.get("manifest_path") or os.path.join(
            os.path.dirname(os.path.abspath(out_path)),
            "automatic_eda_manifest.json")
        generated_at = meta.get("generated_at") or _now_iso()
        try:
            merge_manifest(manifest_path, "pptx", result.get("chapters") or [],
                           generated_at)
        except Exception as e:  # noqa: BLE001
            # The deck itself was written; a manifest failure (e.g. an
            # unwritable ``manifest_path`` override) must not break the
            # dict-no-throw contract. NOTE(review): merge_manifest may already
            # be no-throw — confirm in automatic_eda.model.
            manifest_path = None
            previous = result.get("note") or ""
            result["note"] = (f"{previous} · " if previous else "") + (
                f"manifiesto no escrito: {e}")
    result["manifest_path"] = manifest_path
    return result


def _now_iso() -> str:
    """Current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``.

    Despite the name, this is a human-readable stamp, not strict ISO 8601.
    """
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+""" + +import os +import tempfile + +from pptx import Presentation + +from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx +from datascience.automatic_eda.model import Chapter, DataTable, Heading + + +def _profile() -> dict: + return { + "table": "ventas", + "source": "/data/ventas.csv", + "profiled_at": "2026-06-30T10:00:00+00:00", + "n_rows": 1000, + "n_cols": 2, + "quality_score": 92.5, + "columns": [ + {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01, + "null_count": 10, + "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, + "max": 100.0, "std": 12.3}}, + {"name": "categoria", "inferred_type": "categorical", + "null_pct": 0.0, "null_count": 0, + "categorical": {"top": [{"value": "neumaticos", "count": 500}, + {"value": "aceite", "count": 300}]}}, + ], + } + + +def _slide_texts(path: str) -> list: + prs = Presentation(path) + out = [] + for sl in prs.slides: + parts = [] + for sh in sl.shapes: + if sh.has_text_frame: + parts.append(sh.text_frame.text) + if sh.has_table: + tb = sh.table + for r in range(len(tb.rows)): + for c in range(len(tb.columns)): + parts.append(tb.cell(r, c).text) + out.append(" ".join(parts)) + return out + + +def test_golden_profile_genera_pptx_portada_y_overview(): + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pptx") + res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA — ventas"}) + assert res["path"] == out + assert os.path.exists(out) + assert res["n_slides"] >= 2 + ids = [c["id"] for c in res["chapters"]] + assert "portada" in ids and "overview" in ids + assert res["manifest_path"] and os.path.exists(res["manifest_path"]) + joined = " ".join(_slide_texts(out)) + assert "Automatic-EDA" in joined + assert "CSV" in joined + assert "92.5" in joined + assert "precio" in joined and "categoria" in joined + assert "median" in joined + + +def test_edge_tabla_larga_parte_repitiendo_cabecera_sin_cortar(): + long_cell = ("Lorem ipsum dolor sit amet consectetur 
adipiscing elit sed do " + "eiusmod tempor incididunt reprehenderit voluptate") + header = ["ALPHA", "BETA", "GAMMA", "DELTA"] + rows = [[f"r{r}c{c}" for c in range(4)] for r in range(50)] + rows[0][1] = long_cell + ch = Chapter(id="edge", title="Edge", version="1.0.0", + blocks=[Heading("Tabla", 1), + DataTable(header=header, rows=rows)]) + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "edge.pptx") + res = render_automatic_eda_pptx([ch], out, {"write_manifest": False}) + assert res["path"] == out + texts = _slide_texts(out) + assert res["n_slides"] > 1 # table spilled to several slides. + # Header repeated: every slide that carries table rows shows "ALPHA". + slides_with_header = sum(1 for t in texts if "ALPHA" in t) + assert slides_with_header >= 2 + joined = " ".join(texts) + assert "Lorem ipsum dolor" in joined and "reprehenderit voluptate" in joined + # No row lost: every data cell r0..r49 col0 present. + for r in (0, 25, 49): + assert f"r{r}c0" in joined + + +def test_edge_profile_none_y_vacio_un_slide(): + with tempfile.TemporaryDirectory() as d: + for arg, name in ((None, "none"), ({}, "empty")): + out = os.path.join(d, f"{name}.pptx") + res = render_automatic_eda_pptx(arg, out, {"write_manifest": False}) + assert res["path"] == out + assert os.path.exists(out) + assert res["n_slides"] == 1 + + +def test_error_path_directorio_no_escribible_no_revienta(): + res = render_automatic_eda_pptx(_profile(), "/proc/nope/x.pptx", + {"write_manifest": False}) + assert res["path"] is None + assert res["n_slides"] == 0 + assert res["note"] diff --git a/python/pyproject.toml b/python/pyproject.toml index f75f2afa..9553fbe8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pypdf>=6.10.0", "pyproj>=3.7.2", "python-docx>=1.2.0", + "python-pptx>=1.0.2", "pyyaml>=6.0.3", "qrcode[pil]>=8.2", "rapidfuzz>=3.14.5",