From d1a3d58a6bfc609c02c019e7a884ec09028072ad Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 17:35:19 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20motor=20AutomaticEDA=20fase=204a?= =?UTF-8?q?=20=E2=80=94=20render=20fixes=20+=20keep-together=20+=20glosari?= =?UTF-8?q?o=20clicable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mejoras transversales del motor de render (no del contenido de capítulos): 1. Fix negrita pisa texto (PDF): _place_rich_lines mide el ancho REAL de cada span con las métricas de fuente del renderer (peso correcto) en vez del grid de ancho medio; negrita y normal en la misma línea ya no se solapan. 2. Zebra striping: filas pares sombreadas (#f6f8fa) en DataTable (PDF + PPTX), coherente al partir tablas largas (índice de fila lógico, no por página). 3. Keep-together: bloque Group nuevo; el renderer mide el grupo entero y lo mueve completo a la página/slide siguiente si no cabe, y encoge la figura (height_in) para dejar sitio a su título y texto. num_distr lo usa. 4. Caption siempre visible en toda figura PPTX (fallback al heading); la figura reserva el alto de su caption para que ambos quepan en el mismo slide. 5. Portada construida al final (con resumen agregado del análisis vía ctx['document_summary']) pero colocada primera por build_document. 6. Glosario: capítulo nuevo (último) + GlossaryCollector en ctx; los capítulos registran términos y marcan apariciones con [[term:key]]...[[/term]]. Links clicables reales: PDF (PyMuPDF, link GOTO) y PPTX (slide-jump nativo). Enganchado "entropía" en cat_distr como ejemplo end-to-end. Funciones reutilizables delegadas a fn-constructor (tag eda): - add_pdf_internal_links_py_datascience (PyMuPDF) - pptx_link_run_to_slide_py_datascience (slide-jump) Contrato docs/automatic_eda_contract.md actualizado (§1/§3/§5 + §11 nueva) con la API de glosario, keep-together y zebra para la siguiente fase. PyMuPDF declarado en pyproject. 
Suite verde (90 tests); golden titanic verificado. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/automatic_eda_contract.md | 126 +++++- python/functions/datascience/__init__.py | 2 + .../datascience/add_pdf_internal_links.md | 85 ++++ .../datascience/add_pdf_internal_links.py | 132 ++++++ .../add_pdf_internal_links_test.py | 77 ++++ .../datascience/automatic_eda/__init__.py | 6 + .../automatic_eda/chapters/cat_distr.py | 33 +- .../automatic_eda/chapters/glosario.py | 47 ++ .../automatic_eda/chapters/num_distr.py | 19 +- .../automatic_eda/chapters/num_distr_test.py | 24 +- .../automatic_eda/chapters/portada.py | 52 ++- .../automatic_eda/chapters_registry.py | 78 +++- .../datascience/automatic_eda/model.py | 107 ++++- .../automatic_eda/render_features_test.py | 354 +++++++++++++++ .../automatic_eda/render_pdf_impl.py | 409 ++++++++++++++++-- .../automatic_eda/render_pptx_impl.py | 334 ++++++++++++-- .../datascience/automatic_eda/text_layout.py | 129 ++++++ .../datascience/pptx_link_run_to_slide.md | 85 ++++ .../datascience/pptx_link_run_to_slide.py | 50 +++ .../pptx_link_run_to_slide_test.py | 73 ++++ python/pyproject.toml | 1 + 21 files changed, 2116 insertions(+), 107 deletions(-) create mode 100644 python/functions/datascience/add_pdf_internal_links.md create mode 100644 python/functions/datascience/add_pdf_internal_links.py create mode 100644 python/functions/datascience/add_pdf_internal_links_test.py create mode 100644 python/functions/datascience/automatic_eda/chapters/glosario.py create mode 100644 python/functions/datascience/automatic_eda/render_features_test.py create mode 100644 python/functions/datascience/pptx_link_run_to_slide.md create mode 100644 python/functions/datascience/pptx_link_run_to_slide.py create mode 100644 python/functions/datascience/pptx_link_run_to_slide_test.py diff --git a/docs/automatic_eda_contract.md b/docs/automatic_eda_contract.md index 63e55213..efd96fa9 100644 --- a/docs/automatic_eda_contract.md +++ 
b/docs/automatic_eda_contract.md @@ -25,7 +25,8 @@ cabecera, y figuras/imágenes se escalan para caber enteras. ``` Document = list[Chapter] Chapter = { id: str, title: str, version: str, blocks: list[Block] } -Block = Heading | Markdown | KVTable | DataTable | Figure | Image | Caption | Note +Block = Heading | Markdown | KVTable | DataTable | Figure | Image | Caption + | Note | Group | GlossaryEntry ``` Importa el modelo desde `datascience.automatic_eda.model` (o @@ -44,6 +45,10 @@ reconocido se degrada a `Note`, nunca lanza). | `Figure(fig=None, make=None, caption=None, height_in=None)` | una `matplotlib.figure.Figure` ya construida (`fig`) o un callable `make()->Figure` (perezoso) | se rasteriza y escala para caber entera (nunca recortada) | | `Image(path, caption=None, height_in=None)` | ruta a PNG/JPG | se escala para caber entera | | `Caption(text)` / `Note(text)` | texto auxiliar pequeño | pie/nota en gris; `Note` es además el fallback de lo desconocido | +| `Group(blocks, title=None)` | unidad **keep-together**: sus bloques se mantienen juntos | el renderer mide el grupo entero y lo mueve completo a la página/slide siguiente si no cabe; encoge la figura para dejar sitio al título+texto. Ver §11 | +| `GlossaryEntry(key, label, definition)` | una entrada del glosario (destino clicable) | la genera el capítulo `glosario`; registra su posición como destino de los términos marcados. Ver §11 | + +`Figure`/`Image` aceptan `height_in` (hint): el renderer **clampa** la figura a esa altura máxima (lo usa `Group` para encoger la figura). Toda figura escala dejando sitio a su caption en la misma página/slide; en PPTX el caption es **siempre** visible (si no se da `caption`, cae al último heading o a "Figura"). 
### Subset de markdown soportado (`Markdown`) @@ -84,8 +89,9 @@ El orden canónico está **pre-declarado** en ```python CHAPTER_ORDER = [ - "portada", "overview", "num_distr", "cat_distr", "calidad", "correlacion", - "modelos", "analisis_llm", "timeseries", "geospatial", "agregacion", + "portada", "overview", "analisis_llm", "num_distr", "cat_distr", "calidad", + "correlacion", "modelos", "timeseries", "geospatial", "agregacion", + "glosario", ] ``` @@ -95,6 +101,15 @@ CHAPTER_ORDER = [ `CHAPTER_ORDER`) y aparecerá automáticamente en su posición. Esto permite que muchos agentes trabajen **en paralelo** sin contención: cada uno toca solo su archivo. +**Dos capítulos tienen posición especial** (los gestiona `build_document`, no toques esto): + +- `portada`: se **construye el último** (después del cuerpo) para poder resumir el + análisis, pero se **coloca el primero**. Recibe `ctx['document_summary']` (ver §5) con + un resumen agregado del resto. Decisión del usuario: la portada refleja hallazgos. +- `glosario`: se construye y se **coloca el último**. Lee los términos que los demás + capítulos registraron en `ctx['glossary']` (ver §11). Si no se registró ninguno, el + capítulo devuelve `None` y desaparece. + Si tu capítulo usa un `<id>` que aún no está en `CHAPTER_ORDER`, añádelo en la posición correcta (única edición compartida; coordínala con el orquestador). @@ -143,6 +158,8 @@ defensivo). Esto habilita el **seguimiento y la mejora continua por capítulo**. | `granularity` | "Cada fila es…" (portada). Default: derivado de `key_candidates` | | `quality_criteria` | criterios del score de calidad (portada) | | `head_rows` | `list[dict]` con `df.head` (overview). Ver §7 | +| `glossary` | `GlossaryCollector` compartido — los capítulos registran términos en él. Lo crea `build_document`; ver §11 | +| `document_summary` | dict con el resumen agregado del cuerpo (n_rows, n_cols, quality_score, n_numeric, n_categorical, chapter_titles, …). 
Lo calcula `build_document` y lo consume la portada | Un capítulo puede definir y consumir sus propias claves `ctx` — documenta cuáles en su docstring. @@ -279,6 +296,109 @@ sus bloques presentes y el no-corte (texto largo intacto en la salida). Patrón: --- +## 11. Glosario, keep-together y zebra (motor, fase 4a) + +Tres capacidades transversales del motor que **todos** los capítulos pueden usar. La 11.1 +(glosario) requiere que el capítulo coopere (registrar + marcar términos); la 11.2 +(keep-together) es opt-in por capítulo (envolver bloques en `Group`); la 11.3 (zebra) es +automática (no hay nada que hacer). + +### 11.1 Glosario con términos clicables + +El glosario es un capítulo nuevo (`chapters/glosario.py`) que se renderiza **siempre el +último** y lista cada término técnico que algún capítulo haya registrado. Cada aparición +del término en el texto se vuelve un **clic real** que salta a su entrada: en PDF como +*link annotation* interno (post-proceso con PyMuPDF, porque `PdfPages` no soporta +hyperlinks internos), en PPTX como *slide-jump* nativo (`ppaction://hlinksldjump`). + +**API exacta para un capítulo (dos pasos):** + +1. **Registrar el término** en el colector compartido `ctx['glossary']` (un + `model.GlossaryCollector`, creado por `build_document` y pasado a todos los capítulos): + + ```python + glossary = ctx.get("glossary") + if isinstance(glossary, model.GlossaryCollector): + glossary.add("entropia", "Entropía (de Shannon)", "Medida, en bits, de …") + ``` + + `add(key, label, definition)` es idempotente (la primera definición de cada `key` gana). + `key` debe ser `[A-Za-z0-9_]+`. Si no hay colector en `ctx` (renderizado suelto), el + capítulo simplemente no marca términos — degrada sin romper. + +2. **Marcar cada aparición** en el texto de un bloque `Markdown` con el span inline + `[[term:KEY]]texto visible[[/term]]`. El texto visible puede llevar `**negrita**`. 
El + marcador no altera el texto visible (se elimina como cualquier marcador inline); solo + añade el destino clicable. + + ```python + # En cat_distr (ejemplo real ya implementado): + "La [[term:entropia]]**entropía de Shannon**[[/term]] mide cómo de repartidos…" + ``` + +Eso es todo: el capítulo `glosario` recoge los términos (orden alfabético por `label`), +emite un `GlossaryEntry` por término, y los renderers cablean los enlaces automáticamente. +Si ningún capítulo registró términos, el glosario no aparece. + +**Helpers de `text_layout` (no reimplementar):** `parse_inline_rich(text)` → +`[(texto, is_bold, term_key), …]`; `wrap_rich_terms(text, max_chars)` → líneas de esos +spans sin corte. `strip_inline_md` ya elimina los marcadores `[[term:…]]`/`[[/term]]`. +(Las funciones previas `parse_inline_bold` / `wrap_rich` siguen existiendo, sin términos.) + +**Funciones del registry que cablean los enlaces** (grupo `eda`, ya invocadas por los +renderers; degradan en silencio si faltan): `add_pdf_internal_links_py_datascience` +(PyMuPDF, link GOTO) y `pptx_link_run_to_slide_py_datascience` (salto a slide nativo). +Dependencia: `pymupdf` (declarada en `python/pyproject.toml`). + +**Trabajo de la siguiente fase — enganchar más términos.** El mecanismo está hecho y +probado de extremo a extremo con `entropia` (en `cat_distr`). Cada capítulo debe registrar +y marcar SUS términos con el mismo patrón de dos pasos. 
Candidatos por capítulo: + +| Capítulo | Términos a enganchar (key sugerida) | +|---|---| +| `cat_distr` | `entropia` ✅ (hecho) | +| `calidad` | `completitud`, `validez`, `consistencia` | +| `correlacion` | `cramers_v`, `fdr` (comparaciones múltiples), método de correlación usado | +| `modelos` | `pca`, `silhouette`, `isolation_forest` | +| `timeseries` | `estacionariedad`, `acf_pacf`, `stl` | +| `num_distr` | `iqr`, `curtosis`, `outlier` (vallas de Tukey) | + +Define la definición de cada término en su capítulo (constante local, como +`_TERM_ENTROPIA_DEF` en `cat_distr`) y márcalo en su primera aparición. + +### 11.2 Keep-together: gráfico junto a su título y texto (`Group`) + +Para que un encabezado no quede en una página/slide y su figura en la siguiente, envuelve +los bloques de una misma idea en un `model.Group`: + +```python +blocks.append(model.Group(blocks=[ + model.Heading(text=str(name), level=2), + model.Figure(make=_figura_perezosa(...), caption="…"), + model.Markdown(text="explicación…"), +])) +``` + +El renderer **mide el grupo entero** antes de dibujar nada: si no cabe en lo que queda de +página/slide pero cabe en una entera, lo mueve **completo** a la siguiente; y **encoge la +figura** (vía `height_in`) lo justo para que el título + texto + figura quepan juntos. Si +el grupo es más alto que una página entera, empieza en una nueva y fluye (degradación +honesta, nunca corta). Ejemplo real implementado: `num_distr` envuelve cada columna +(heading + figura histograma/boxplot + nota) en un `Group`. + +Recomendado para `agregacion` y cualquier capítulo donde una figura deba ir pegada a su +título/explicación. Coste: si un capítulo inspecciona `chapter.blocks` en sus tests, ahora +encontrará `Group`s — aplana con un helper recursivo (ver `num_distr_test.py::_flatten`). + +### 11.3 Zebra striping en tablas (automático) + +Todo `DataTable` se renderiza con **filas pares sombreadas** (gris muy suave `#f6f8fa`) y +cabecera con su fondo propio. 
Es automático en PDF y PPTX; el patrón se mantiene coherente +cuando una tabla larga se parte y repite cabecera (el índice de fila es lógico, no por +página). No hay nada que hacer en los capítulos. + +--- + ## 10. Integración futura con `profile_table` (siguiente fase) `profile_table(emit_pdf=True)` usa hoy `render_eda_pdf` (intacto). En la siguiente fase diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index a1e6331f..6302642f 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -68,11 +68,13 @@ from .extract_timeseries_raw import extract_timeseries_raw from .build_eda_render_ctx import build_eda_render_ctx from .profile_datetime import profile_datetime from .resample_timeseries import resample_timeseries +from .add_pdf_internal_links import add_pdf_internal_links __all__ = [ "detect_time_column", "extract_timeseries_raw", "build_eda_render_ctx", + "add_pdf_internal_links", "profile_datetime", "resample_timeseries", "render_automatic_eda_pdf", diff --git a/python/functions/datascience/add_pdf_internal_links.md b/python/functions/datascience/add_pdf_internal_links.md new file mode 100644 index 00000000..c1a3873f --- /dev/null +++ b/python/functions/datascience/add_pdf_internal_links.md @@ -0,0 +1,85 @@ +--- +name: add_pdf_internal_links +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def add_pdf_internal_links(pdf_path: str, links: list) -> dict" +description: "Postprocesa un PDF YA escrito insertando link annotations internos de tipo GOTO ('ir a') con PyMuPDF (import fitz). Pensado para PDFs generados por matplotlib PdfPages, que NO soporta hyperlinks internos: tras escribir el PDF se reabre y, por cada entrada de `links`, se añade una anotacion clicable desde un rectangulo de una pagina origen (src_page + src_rect en puntos top-left) hasta un punto de una pagina destino (dst_page + dst_point). 
Caso de uso tipico del grupo eda: hacer clicables los terminos de un AutomaticEDA que apuntan a su entrada en el glosario al final del documento. Estilo dict-no-throw: NUNCA lanza; valida cada link y SALTA (n_skipped++) los malformados o fuera de rango en vez de fallar. Guarda de forma segura escribiendo a un temporal en el mismo directorio y haciendo os.replace atomico (evita corromper el original). Devuelve {status:ok,n_links,n_skipped} o {status:error,error}; si pymupdf no esta disponible o el archivo no existe devuelve status error." +tags: [eda, datascience, pdf, links, glossary, pymupdf, fitz, postprocess, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +params: + - name: pdf_path + desc: "ruta al PDF existente (str no vacio). Se reescribe IN SITU (in-place) tras añadir los links: se guarda a un temporal `..tmp_links` en el mismo directorio y se reemplaza atomicamente con os.replace. Si no es str o no existe el archivo -> {status:error}." + - name: links + desc: "lista de dicts, uno por link a insertar. Cada dict: src_page (int 0-based de la pagina origen), src_rect ([x0,y0,x1,y1] del rectangulo clicable en PUNTOS PDF 1/72\" con origen ARRIBA-IZQUIERDA), dst_page (int 0-based de la pagina destino), dst_point ([x,y] punto destino, mismos puntos top-left). Las entradas que no son dict, con page fuera de rango [0,page_count), src_rect que no tenga 4 numeros o dst_point que no tenga 2 numeros se SALTAN (n_skipped++), no lanzan. None se trata como lista vacia." +output: "dict (NUNCA lanza): en exito {\"status\":\"ok\",\"n_links\":int,\"n_skipped\":int} con n_links = anotaciones GOTO insertadas y n_skipped = entradas invalidas saltadas. En fallo {\"status\":\"error\",\"error\":str}: pymupdf no disponible, pdf_path no es str / no existe, links no es lista, o cualquier excepcion global (el PDF original queda intacto porque el replace solo ocurre tras un save correcto)." 
+tested: true +tests: ["test_add_goto_link_basico", "test_links_invalidos_se_saltan", "test_archivo_inexistente_devuelve_error"] +test_file_path: "python/functions/datascience/add_pdf_internal_links_test.py" +file_path: "python/functions/datascience/add_pdf_internal_links.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience import add_pdf_internal_links + +# Tienes un PDF ya escrito por matplotlib PdfPages (sin hyperlinks internos). +# Quieres que el texto "Margen bruto" de la pagina 0 (rectangulo en puntos +# top-left) salte a su entrada del glosario en la ultima pagina (indice 7). +res = add_pdf_internal_links( + "reports/eda.pdf", + [ + {"src_page": 0, "src_rect": [72, 120, 180, 134], "dst_page": 7, "dst_point": [72, 200]}, + {"src_page": 0, "src_rect": [72, 140, 180, 154], "dst_page": 7, "dst_point": [72, 260]}, + ], +) +# res == {"status": "ok", "n_links": 2, "n_skipped": 0} +``` + +## Cuando usarla + +Justo DESPUES de escribir un PDF con matplotlib `PdfPages` (o cualquier motor +que no genere hyperlinks internos) cuando necesitas que ciertos terminos o +referencias sean clicables y salten a otra pagina del mismo documento — el caso +canonico es enlazar los terminos de un AutomaticEDA con su entrada de glosario +al final. Es un paso de postproceso: primero generas el PDF y calculas en que +rectangulo quedo cada termino (en puntos PDF), luego pasas esa lista a esta +funcion para inyectar las anotaciones GOTO. + +## Gotchas + +- **Impura — reescribe el archivo IN SITU.** El PDF en `pdf_path` se reemplaza + por la version con los links. El guardado es seguro: escribe a un temporal + `.<basename>.tmp_links` en el MISMO directorio y hace `os.replace` atomico tras + cerrar el documento, asi un fallo a mitad no corrompe el original. Aun asi, + conserva una copia si el PDF es valioso. 
+- **Sistema de coordenadas: puntos top-left, igual que matplotlib.** PyMuPDF y + matplotlib (PdfPages) usan ambos PUNTOS PDF (1/72") con el origen ARRIBA- + IZQUIERDA, asi que los rectangulos/puntos COINCIDEN: el `src_rect` que calcules + con la geometria de la figura matplotlib se pasa tal cual, sin invertir el eje + Y. (Ojo: el espacio de datos de matplotlib SI tiene el origen abajo; lo que + coincide es el espacio de la PAGINA en puntos.) +- **Indices de pagina 0-based.** `src_page` / `dst_page` son indices base 0 + (la primera pagina es 0). Fuera del rango `[0, page_count)` el link se SALTA + (cuenta en `n_skipped`), no lanza. +- **dict-no-throw, validacion por-link.** Las entradas malformadas (no dict, + page fuera de rango, `src_rect` sin 4 numeros, `dst_point` sin 2 numeros) se + saltan individualmente e incrementan `n_skipped`; el resto de links validos se + insertan igual. La funcion solo devuelve `{status:error}` ante fallos globales + (pymupdf ausente, archivo inexistente, `links` no es lista). +- **`error_type: error_go_core` es metadata del registry, no comportamiento.** + Toda funcion impura debe declararlo y el indexer lo exige, pero el codigo NUNCA + lanza esa excepcion: degrada al dict de estado. +- **Requiere PyMuPDF (`import fitz`).** Si no esta instalado devuelve + `{"status":"error","error":"pymupdf no disponible: ..."}`. En el registry el + venv `python/.venv` ya lo trae. diff --git a/python/functions/datascience/add_pdf_internal_links.py b/python/functions/datascience/add_pdf_internal_links.py new file mode 100644 index 00000000..7084e778 --- /dev/null +++ b/python/functions/datascience/add_pdf_internal_links.py @@ -0,0 +1,132 @@ +"""Postprocesa un PDF existente insertando link annotations internos (GOTO). + +Motor: PyMuPDF (``import fitz``). 
Pensado para PDFs generados por matplotlib +``PdfPages``, que no soporta hyperlinks internos: tras escribir el PDF, esta +funcion lo reabre y le añade anotaciones "ir a" (GOTO) desde un rectangulo de +una pagina origen hasta un punto de una pagina destino. Util para hacer +clicables terminos que apuntan a su entrada en un glosario al final del +documento. + +Estilo dict-no-throw del grupo `eda`: NUNCA lanza; devuelve un dict de estado. +""" + +import os + + +def add_pdf_internal_links(pdf_path: str, links: list) -> dict: + """Añade link annotations internos (GOTO) a un PDF ya escrito. + + Postprocesa un PDF (p.ej. generado por matplotlib PdfPages, que NO soporta + hyperlinks internos) insertando, por cada entrada de ``links``, una + anotacion de tipo "ir a" desde un rectangulo de una pagina origen hasta un + punto de una pagina destino. Sirve para hacer clicables terminos que apuntan + a su entrada en un glosario al final del documento. + + Args: + pdf_path: ruta al PDF existente (se reescribe in situ). + links: lista de dicts, cada uno: + { + "src_page": int, # indice 0-based de la pagina origen + "src_rect": [x0,y0,x1,y1], # rectangulo clicable, en PUNTOS PDF + # (1/72") con origen ARRIBA-IZQUIERDA + "dst_page": int, # indice 0-based de la pagina destino + "dst_point": [x, y], # punto destino, mismos puntos top-left + } + + Returns: + dict (NUNCA lanza): {"status":"ok","n_links":int,"n_skipped":int} + o {"status":"error","error":str}. Si pymupdf no esta disponible o el + archivo no existe -> {"status":"error", ...}. 
+ """ + try: + try: + import fitz # PyMuPDF + except Exception as exc: # ImportError u otro fallo de carga + return {"status": "error", "error": f"pymupdf no disponible: {exc}"} + + if not isinstance(pdf_path, str) or not pdf_path: + return {"status": "error", "error": "pdf_path debe ser una ruta no vacia"} + if not os.path.isfile(pdf_path): + return {"status": "error", "error": f"el archivo no existe: {pdf_path}"} + + if links is None: + links = [] + if not isinstance(links, (list, tuple)): + return {"status": "error", "error": "links debe ser una lista de dicts"} + + doc = fitz.open(pdf_path) + try: + n_pages = doc.page_count + n_ok = 0 + n_skipped = 0 + + for link in links: + if not isinstance(link, dict): + n_skipped += 1 + continue + + src_page = link.get("src_page") + dst_page = link.get("dst_page") + src_rect = link.get("src_rect") + dst_point = link.get("dst_point") + + # src_page / dst_page: enteros 0-based en rango. + if not _is_int(src_page) or not _is_int(dst_page): + n_skipped += 1 + continue + if not (0 <= src_page < n_pages) or not (0 <= dst_page < n_pages): + n_skipped += 1 + continue + + # src_rect: 4 numeros. + if not _is_num_seq(src_rect, 4): + n_skipped += 1 + continue + # dst_point: 2 numeros. + if not _is_num_seq(dst_point, 2): + n_skipped += 1 + continue + + try: + doc[int(src_page)].insert_link( + { + "kind": fitz.LINK_GOTO, + "from": fitz.Rect(*[float(v) for v in src_rect]), + "page": int(dst_page), + "to": fitz.Point(*[float(v) for v in dst_point]), + } + ) + n_ok += 1 + except Exception: + n_skipped += 1 + continue + + # Guardado seguro: escribir a temporal en el mismo directorio y + # reemplazar atomicamente (evita corromper el PDF original). + directory = os.path.dirname(os.path.abspath(pdf_path)) or "." 
+ base = os.path.basename(pdf_path) + tmp_path = os.path.join(directory, f".{base}.tmp_links") + doc.save(tmp_path) + finally: + doc.close() + + os.replace(tmp_path, pdf_path) + + return {"status": "ok", "n_links": n_ok, "n_skipped": n_skipped} + except Exception as exc: # degrada cualquier fallo a dict de error + return {"status": "error", "error": str(exc)} + + +def _is_int(value) -> bool: + """True si value es un entero (no bool).""" + return isinstance(value, int) and not isinstance(value, bool) + + +def _is_num_seq(value, length: int) -> bool: + """True si value es una secuencia de `length` numeros (int/float, no bool).""" + if not isinstance(value, (list, tuple)) or len(value) != length: + return False + for v in value: + if isinstance(v, bool) or not isinstance(v, (int, float)): + return False + return True diff --git a/python/functions/datascience/add_pdf_internal_links_test.py b/python/functions/datascience/add_pdf_internal_links_test.py new file mode 100644 index 00000000..79baabb2 --- /dev/null +++ b/python/functions/datascience/add_pdf_internal_links_test.py @@ -0,0 +1,77 @@ +"""Tests para add_pdf_internal_links.""" + +import os +import sys + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) + +from add_pdf_internal_links import add_pdf_internal_links + + +def test_add_goto_link_basico(tmp_path): + """Golden: un PDF de 2 paginas recibe un link GOTO de la pag 0 a la pag 1.""" + fitz = pytest.importorskip("fitz") + + # 1) PDF temporal de 2 paginas A5 (~419x595 puntos). + pdf = str(tmp_path / "doc.pdf") + doc = fitz.open() + doc.new_page(width=419, height=595) + doc.new_page(width=419, height=595) + doc.save(pdf) + doc.close() + + # 2) Insertar un link interno desde la pag 0 hacia la pag 1. 
+ res = add_pdf_internal_links( + pdf, + [{"src_page": 0, "src_rect": [50, 50, 200, 70], "dst_page": 1, "dst_point": [40, 40]}], + ) + assert res["status"] == "ok" + assert res["n_links"] == 1 + assert res["n_skipped"] == 0 + + # 3) Reabrir y verificar que la pag 0 tiene un link GOTO a la pag 1. + doc = fitz.open(pdf) + try: + links = doc[0].get_links() + goto = [l for l in links if l.get("kind") == fitz.LINK_GOTO and l.get("page") == 1] + assert len(goto) >= 1 + finally: + doc.close() + + +def test_links_invalidos_se_saltan(tmp_path): + """Edge: entradas malformadas o fuera de rango incrementan n_skipped, no lanzan.""" + fitz = pytest.importorskip("fitz") + + pdf = str(tmp_path / "doc.pdf") + doc = fitz.open() + doc.new_page(width=419, height=595) + doc.new_page(width=419, height=595) + doc.save(pdf) + doc.close() + + res = add_pdf_internal_links( + pdf, + [ + # valido + {"src_page": 0, "src_rect": [10, 10, 90, 30], "dst_page": 1, "dst_point": [20, 20]}, + # dst_page fuera de rango + {"src_page": 0, "src_rect": [10, 40, 90, 60], "dst_page": 9, "dst_point": [20, 20]}, + # src_rect con 3 numeros + {"src_page": 0, "src_rect": [10, 70, 90], "dst_page": 1, "dst_point": [20, 20]}, + # no es dict + "no-soy-un-dict", + ], + ) + assert res["status"] == "ok" + assert res["n_links"] == 1 + assert res["n_skipped"] == 3 + + +def test_archivo_inexistente_devuelve_error(): + """Error path: pdf_path inexistente -> status error sin lanzar.""" + res = add_pdf_internal_links("/ruta/que/no/existe_xyz.pdf", []) + assert res["status"] == "error" + assert "error" in res diff --git a/python/functions/datascience/automatic_eda/__init__.py b/python/functions/datascience/automatic_eda/__init__.py index 95d6f374..f9a6f2e3 100644 --- a/python/functions/datascience/automatic_eda/__init__.py +++ b/python/functions/datascience/automatic_eda/__init__.py @@ -21,6 +21,9 @@ from .model import ( # noqa: F401 Chapter, DataTable, Figure, + GlossaryCollector, + GlossaryEntry, + Group, Heading, Image, 
KVTable, @@ -45,6 +48,9 @@ __all__ = [ "Image", "Caption", "Note", + "Group", + "GlossaryEntry", + "GlossaryCollector", "Chapter", "as_blocks", "as_chapters", diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr.py b/python/functions/datascience/automatic_eda/chapters/cat_distr.py index c593a6b7..6421a574 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr.py @@ -33,10 +33,23 @@ import math from .. import model -CHAPTER_VERSION = "1.0.0" +CHAPTER_VERSION = "1.1.0" CHAPTER_ID = "cat_distr" CHAPTER_TITLE = "Distribuciones categóricas" +# Glossary term this chapter explains. Registered in the shared collector and +# marked clickable on its first appearance (end-to-end glossary example — +# mejora 6). Other chapters hook their own terms the same way (see the contract). +_TERM_ENTROPIA_KEY = "entropia" +_TERM_ENTROPIA_LABEL = "Entropía (de Shannon)" +_TERM_ENTROPIA_DEF = ( + "Medida, en bits, de cómo de repartidos están los valores de una columna " + "categórica. Vale 0 cuando una sola categoría concentra todas las filas " + "(máxima previsibilidad) y alcanza su máximo, log2(k) para k categorías " + "distintas, cuando todas aparecen por igual (máxima diversidad). La entropía " + "normalizada (entropía dividida por su máximo) la lleva al rango 0–1 para " + "comparar columnas con distinto número de categorías.") + # Cap the number of categorical columns rendered to keep the document bounded; # the rest are summarized in a closing note (no silent truncation). MAX_COLS = 40 @@ -337,10 +350,14 @@ def _topk_table(cat: dict): note=note) -def _intro_blocks(n_rows): +def _intro_blocks(n_rows, mark_term: bool = False): total = _fmt_int(n_rows) + # Mark the first appearance of the term as a clickable glossary jump when the + # term was registered (mark_term). The visible text is identical either way. 
+ entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term + else "**entropía de Shannon**") text = ( - "La **entropía de Shannon** mide cómo de repartidos están los valores de " + f"La {entropia} mide cómo de repartidos están los valores de " "una columna categórica, en bits. Vale 0 cuando una sola categoría " "concentra todas las filas (máxima previsibilidad) y alcanza su máximo, " "log2(k) para k categorías distintas, cuando todas aparecen por igual " @@ -370,7 +387,15 @@ def build_cat_distr(profile: dict, ctx: dict): return None n_rows = profile.get("n_rows") - blocks = list(_intro_blocks(n_rows)) + # Register "entropía" in the shared glossary collector (if present) and mark + # its first appearance clickable. End-to-end glossary example (mejora 6). + glossary = ctx.get("glossary") + mark_term = False + if isinstance(glossary, model.GlossaryCollector): + glossary.add(_TERM_ENTROPIA_KEY, _TERM_ENTROPIA_LABEL, + _TERM_ENTROPIA_DEF) + mark_term = True + blocks = list(_intro_blocks(n_rows, mark_term=mark_term)) rendered = cat_cols[:MAX_COLS] for col in rendered: diff --git a/python/functions/datascience/automatic_eda/chapters/glosario.py b/python/functions/datascience/automatic_eda/chapters/glosario.py new file mode 100644 index 00000000..fe7098fc --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/glosario.py @@ -0,0 +1,47 @@ +"""Glossary chapter (GLOSARIO) — always the last chapter, clickable terms. + +Renders one entry per glossary term that the other chapters registered during +the document build through ``ctx['glossary'].add(key, label, definition)`` (see +``GlossaryCollector`` in ``model.py``). Each entry is a clickable destination: +every in-text appearance a chapter marked with ``[[term:key]]texto[[/term]]`` +becomes a real jump to its entry here — PDF link annotations (PyMuPDF) and PPTX +native slide jumps, both wired by the renderers. 
+ +Returns ``None`` when no term was registered (there is nothing to show), so the +chapter simply disappears from documents that did not mark any term. + +Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "glosario" +CHAPTER_TITLE = "Glosario" + + +def build_glosario(profile: dict, ctx: dict): + """Build the glossary Chapter from the shared collector, or None if empty.""" + ctx = ctx or {} + glossary = ctx.get("glossary") + if not isinstance(glossary, model.GlossaryCollector) or not glossary: + return None + + blocks = [ + model.Heading(text="Glosario de términos", level=1), + model.Markdown(text=( + "Definición de los términos técnicos que aparecen en el informe. " + "Cada término va resaltado en el texto y, al pulsarlo, salta a su " + "definición en esta sección.")), + ] + # One clickable destination per term, alphabetically by visible label. + for term in glossary.terms(by="label"): + blocks.append(model.GlossaryEntry( + key=model._safe_str(term.get("key")), + label=model._safe_str(term.get("label")), + definition=model._safe_str(term.get("definition")))) + + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr.py b/python/functions/datascience/automatic_eda/chapters/num_distr.py index 6c105dc6..67a47779 100644 --- a/python/functions/datascience/automatic_eda/chapters/num_distr.py +++ b/python/functions/datascience/automatic_eda/chapters/num_distr.py @@ -34,7 +34,7 @@ try: except Exception: # noqa: BLE001 — keep the chapter importable no matter what. 
build_boxplot_stats = None # type: ignore[assignment] -CHAPTER_VERSION = "1.0.0" +CHAPTER_VERSION = "1.1.0" CHAPTER_ID = "num_distr" CHAPTER_TITLE = "Distribuciones numéricas" @@ -278,12 +278,17 @@ def build_num_distr(profile: dict, ctx: dict): box = build_boxplot_stats(numeric) or {} except Exception: # noqa: BLE001 — degrade, never raise. box = {} - blocks.append(model.Heading(text=str(name), level=2)) - blocks.append(model.Figure( - make=_figure_maker(name, numeric, box), - caption=f"Distribución de «{name}» — histograma (media/mediana/±σ) " - f"y boxplot.")) - blocks.append(model.Markdown(text=_stats_note(name, numeric, box))) + # Keep the column heading, its figure and its stats note together on the + # same page/slide (mejora 3 — keep-together): the renderers measure the + # whole Group and move it whole when it would not fit. + blocks.append(model.Group(blocks=[ + model.Heading(text=str(name), level=2), + model.Figure( + make=_figure_maker(name, numeric, box), + caption=f"Distribución de «{name}» — histograma " + f"(media/mediana/±σ) y boxplot."), + model.Markdown(text=_stats_note(name, numeric, box)), + ])) return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py index a9b459ed..71793ad1 100644 --- a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py +++ b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py @@ -65,19 +65,33 @@ def _pdf_text(path: str) -> str: return re.sub(r"\s+", " ", txt) +def _flatten(blocks): + """Expand keep-together Groups so the per-column heading/figure/markdown are + inspectable as a flat block list (the chapter wraps each column in a Group).""" + out = [] + for b in blocks: + if getattr(b, "kind", "") == "group": + out.extend(_flatten(getattr(b, "blocks", []) or [])) + else: + out.append(b) + return out 
+ + def test_golden_chapter_estructura_y_bloques(): ch = build_num_distr(_profile(n_numeric=2), {}) assert ch is not None assert ch.id == "num_distr" assert ch.version == CHAPTER_VERSION - kinds = [b.kind for b in ch.blocks] + # Per-column blocks are wrapped in keep-together Groups: flatten to inspect. + flat = _flatten(ch.blocks) + kinds = [b.kind for b in flat] # Heading + intro Markdown, then per column: Heading + Figure + Markdown. assert kinds[0] == "heading" assert kinds[1] == "markdown" assert kinds.count("figure") == 2 # one figure per numeric column. assert kinds.count("heading") == 1 + 2 # chapter title + one per column. # Each figure has a lazy maker that produces a real matplotlib figure. - figs = [b for b in ch.blocks if b.kind == "figure"] + figs = [b for b in flat if b.kind == "figure"] fig = figs[0].make() assert fig is not None # Two stacked axes: histogram + boxplot share the figure. @@ -90,7 +104,8 @@ def test_golden_media_mediana_sigma_y_boxplot_presentes(): # The intro documents the three reference lines and the Tukey boxplot; the # per-column note carries the actual mean/median/σ numbers and the shape. ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {}) - md_texts = " ".join(b.text for b in ch.blocks if b.kind == "markdown") + md_texts = " ".join(b.text for b in _flatten(ch.blocks) + if b.kind == "markdown") assert "media" in md_texts and "mediana" in md_texts assert "±1σ" in md_texts or "σ" in md_texts assert "boxplot" in md_texts.lower() @@ -126,7 +141,8 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx(): # 8 numeric columns + long note text: nothing may be cut. Every column # heading must survive in both the PDF text and the PPTX deck. 
ch = build_num_distr(_profile(n_numeric=8), {}) - names = [b.text for b in ch.blocks if b.kind == "heading" and b.level == 2] + names = [b.text for b in _flatten(ch.blocks) + if b.kind == "heading" and b.level == 2] assert len(names) == 8 with tempfile.TemporaryDirectory() as d: pdf = os.path.join(d, "num.pdf") diff --git a/python/functions/datascience/automatic_eda/chapters/portada.py b/python/functions/datascience/automatic_eda/chapters/portada.py index 3582d981..c1bb43ab 100644 --- a/python/functions/datascience/automatic_eda/chapters/portada.py +++ b/python/functions/datascience/automatic_eda/chapters/portada.py @@ -17,7 +17,7 @@ from datetime import datetime, timezone from .. import model -CHAPTER_VERSION = "1.0.0" +CHAPTER_VERSION = "1.1.0" CHAPTER_ID = "portada" CHAPTER_TITLE = "Portada" @@ -67,6 +67,53 @@ def _fmt_int(v) -> str: return str(v) +def _fmt_pct(value) -> str: + """Format a percentage that may arrive as a 0–1 fraction or a 0–100 number.""" + if value is None: + return "—" + try: + v = float(value) + except (TypeError, ValueError): + return str(value) + if 0 < v <= 1.0: + v *= 100.0 + return f"{v:.1f}%" + + +def _summary_blocks(summary) -> list: + """Mini-summary of the rest of the analysis, shown on the cover (mejora 5). + + The cover is built AFTER the body (``build_document`` passes the aggregated + ``ctx['document_summary']``), so it can reflect what the analysis found: + shape, column types, quality flags and which chapters were included. 
Returns + an empty list when there is no summary (the cover degrades to its metadata + table only).""" + if not isinstance(summary, dict) or not summary: + return [] + rows = [] + n_num = summary.get("n_numeric") + n_cat = summary.get("n_categorical") + if n_num is not None or n_cat is not None: + rows.append(("Columnas numéricas / categóricas", + f"{_fmt_int(n_num)} / {_fmt_int(n_cat)}")) + if summary.get("duplicate_pct") is not None: + rows.append(("Filas duplicadas", _fmt_pct(summary.get("duplicate_pct")))) + if summary.get("null_cell_pct") is not None: + rows.append(("Celdas nulas", _fmt_pct(summary.get("null_cell_pct")))) + titles = summary.get("chapter_titles") or [] + if titles: + rows.append(("Capítulos del informe", _fmt_int(len(titles)))) + + blocks = [model.Heading(text="Resumen del análisis", level=2)] + if rows: + blocks.append(model.KVTable(rows=rows)) + if titles: + bullets = "\n".join(f"- {model._safe_str(t)}" for t in titles) + blocks.append(model.Markdown( + text="Este informe incluye los siguientes capítulos:\n" + bullets)) + return blocks + + def _fmt_date_eu(value) -> str: """Format a date/ISO string as European DD/MM/AAAA HH:mm (UI convention). @@ -152,5 +199,8 @@ def build_portada(profile: dict, ctx: dict): model.Markdown(text=str(granularity)), ] + # Mini-summary of the rest of the analysis (built last, shown on the cover). + blocks.extend(_summary_blocks(ctx.get("document_summary"))) + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py index 6dd73237..d4dc329d 100644 --- a/python/functions/datascience/automatic_eda/chapters_registry.py +++ b/python/functions/datascience/automatic_eda/chapters_registry.py @@ -26,7 +26,7 @@ from . 
import model # placeholders other agents will fill by creating chapters/.py — they will # appear in this exact position automatically once their module exists. CHAPTER_ORDER = [ - "portada", # cover + "portada", # cover — BUILT LAST, PLACED FIRST (see build_document). "overview", # df.head + columns/types/nulls/examples + describe "analisis_llm", # LLM interpretation — sits next to overview (user request) "num_distr", # numeric distributions @@ -37,8 +37,15 @@ CHAPTER_ORDER = [ "timeseries", # time-series analysis "geospatial", # geospatial "agregacion", # aggregations / pivots + "glosario", # glossary — ALWAYS LAST; clickable term destinations. ] +# Chapters whose position is special-cased by build_document: portada is built +# last (so it can summarize the rest) but placed first; glosario is built and +# placed last (it reads the terms every other chapter registered). +_PORTADA = "portada" +_GLOSARIO = "glosario" + def build_chapter(chapter_id: str, profile: dict, ctx: dict): """Build a single chapter by id, or None if absent/not-applicable/error. @@ -75,15 +82,72 @@ def build_document(profile: dict, ctx: dict = None) -> list: list[Chapter] in canonical order, containing only the chapters that are implemented and applicable. Never raises. """ - if profile is None: - profile = {} if not isinstance(profile, dict): profile = {} - if ctx is None: - ctx = {} - chapters = [] + # Copy ctx so the shared collector / summary we add do not leak to the caller. + ctx = dict(ctx) if isinstance(ctx, dict) else {} + + # A single glossary collector is shared by every chapter via ctx['glossary']. + # Chapters call ctx['glossary'].add(key, label, definition) and mark in-text + # appearances with [[term:key]]…[[/term]]; the glosario chapter renders the + # registered terms and the renderers wire the clickable links. 
+ glossary = ctx.get("glossary") + if not isinstance(glossary, model.GlossaryCollector): + glossary = model.GlossaryCollector() + ctx["glossary"] = glossary + + # 1) Body: every chapter except portada (built last) and glosario (placed + # last), in canonical order. This also fills the glossary collector. + body = [] for cid in CHAPTER_ORDER: + if cid in (_PORTADA, _GLOSARIO): + continue ch = build_chapter(cid, profile, ctx) if ch is not None and ch.blocks: - chapters.append(ch) + body.append(ch) + + # 2) Aggregated summary of the rest, for the cover (user decision: the cover + # is BUILT after the body so it can reflect what the analysis found). + ctx["document_summary"] = _summarize_document(profile, body) + + # 3) Build the cover last, place it FIRST. + portada = build_chapter(_PORTADA, profile, ctx) + # 4) Build the glossary last (reads the terms the body registered), place LAST. + glosario = build_chapter(_GLOSARIO, profile, ctx) + + chapters = [] + if portada is not None and portada.blocks: + chapters.append(portada) + chapters.extend(body) + if glosario is not None and glosario.blocks: + chapters.append(glosario) return chapters + + +def _summarize_document(profile: dict, body: list) -> dict: + """Aggregate a tiny findings summary of the body for the cover. Never raises. 
+ + Returns a dict with dataset shape, quality, column-type counts and the list + of chapters actually included — enough for the cover to show a mini-summary + of the analysis without re-deriving anything.""" + try: + cols = profile.get("columns") or [] + n_num = sum(1 for c in cols if isinstance(c, dict) + and c.get("inferred_type") == "numeric") + n_cat = sum(1 for c in cols if isinstance(c, dict) + and isinstance(c.get("categorical"), dict) + and c.get("categorical", {}).get("top") + and c.get("inferred_type") != "numeric") + return { + "n_chapters": len(body), + "chapter_titles": [getattr(c, "title", "") for c in body], + "n_rows": profile.get("n_rows"), + "n_cols": profile.get("n_cols"), + "quality_score": profile.get("quality_score"), + "n_numeric": n_num, + "n_categorical": n_cat, + "duplicate_pct": profile.get("duplicate_pct"), + "null_cell_pct": profile.get("null_cell_pct"), + } + except Exception: # noqa: BLE001 — the summary is best-effort. + return {"n_chapters": len(body) if isinstance(body, list) else 0} diff --git a/python/functions/datascience/automatic_eda/model.py b/python/functions/datascience/automatic_eda/model.py index 8a5c488d..53c41377 100644 --- a/python/functions/datascience/automatic_eda/model.py +++ b/python/functions/datascience/automatic_eda/model.py @@ -128,6 +128,39 @@ class Note: kind: str = field(default="note", init=False) +@dataclass +class Group: + """A keep-together unit: its blocks render on the SAME page/slide. + + Renderers measure the whole group first; if it does not fit in the remaining + space they move it *whole* to the next page (PDF) or slide (PPTX) before + drawing anything — so a heading never gets stranded apart from the figure and + text it introduces. If the group is taller than a full page even on its own, + it starts on a fresh page and flows (honest degradation, never cut). Use it to + bind ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the + DISTR NUM / AGREGACION chapters). 
+ """ + + blocks: list = field(default_factory=list) + title: Optional[str] = None + kind: str = field(default="group", init=False) + + +@dataclass +class GlossaryEntry: + """One glossary term: a clickable destination at the end of the document. + + Rendered as the term ``label`` (heading) plus its ``definition`` (markdown). + The renderers register its page/slide position as the link target so every + in-text appearance of the same ``key`` becomes a real clickable jump (PDF link + annotation via PyMuPDF; PPTX internal slide jump).""" + + key: str = "" + label: str = "" + definition: str = "" + kind: str = field(default="glossary_entry", init=False) + + @dataclass class Chapter: """An ordered set of blocks with an id, a title and a generation version.""" @@ -150,13 +183,17 @@ _BLOCK_BY_KIND = { "image": Image, "caption": Caption, "note": Note, + "group": Group, + "glossary_entry": GlossaryEntry, } def as_block(obj: Any): """Coerce a value into a block dataclass. Unknown values become a Note.""" if isinstance(obj, (Heading, Markdown, KVTable, DataTable, Figure, Image, - Caption, Note)): + Caption, Note, Group, GlossaryEntry)): + if isinstance(obj, Group): + obj.blocks = as_blocks(obj.blocks) return obj if isinstance(obj, dict): kind = obj.get("kind") @@ -189,6 +226,13 @@ def as_block(obj: Any): return Caption(text=_safe_str(obj.get("text"))) if cls is Note: return Note(text=_safe_str(obj.get("text"))) + if cls is Group: + return Group(blocks=as_blocks(obj.get("blocks")), + title=obj.get("title")) + if cls is GlossaryEntry: + return GlossaryEntry(key=_safe_str(obj.get("key")), + label=_safe_str(obj.get("label")), + definition=_safe_str(obj.get("definition"))) except Exception: # noqa: BLE001 — never raise on a malformed block. 
return Note(text=_safe_str(obj)) return Note(text=_safe_str(obj)) @@ -246,6 +290,67 @@ def _safe_str(v: Any) -> str: return "" +# --------------------------------------------------------------------------- # +# Glossary collector — chapters register the terms they use; the glosario +# chapter renders them at the end and the renderers wire the clickable links. +# --------------------------------------------------------------------------- # +class GlossaryCollector: + """Accumulates glossary terms registered by chapters during document build. + + A single instance is created by :func:`build_document` and passed to every + chapter via ``ctx['glossary']``. A chapter calls ``add(key, label, + definition)`` to declare a term it explains (e.g. ``"entropia"`` → + "Entropía"), and marks each in-text appearance with the inline span + ``[[term:key]]texto visible[[/term]]`` (see ``text_layout.parse_inline_rich``). + The ``glosario`` chapter reads ``terms()`` to emit one :class:`GlossaryEntry` + per term; the renderers turn every marked appearance into a real click that + jumps to that entry. First registration of a key wins (idempotent); never + raises.""" + + def __init__(self): + self._terms: dict = {} + self._order: list = [] + + def add(self, key: Any, label: Any = None, definition: Any = "") -> str: + """Register a term and return its normalized key (''. if invalid).""" + try: + k = _safe_str(key).strip() + if not k: + return "" + if k not in self._terms: + self._terms[k] = { + "key": k, + "label": _safe_str(label).strip() or k, + "definition": _safe_str(definition), + } + self._order.append(k) + return k + except Exception: # noqa: BLE001 — collecting a term never breaks a build. + return "" + + def has(self, key: Any) -> bool: + return _safe_str(key).strip() in self._terms + + def get(self, key: Any) -> Optional[dict]: + return self._terms.get(_safe_str(key).strip()) + + def terms(self, by: str = "label") -> list: + """Return the registered terms as dicts. 
+ + ``by='label'`` (default) sorts alphabetically by visible label; + ``by='order'`` keeps first-appearance order.""" + if by == "order": + return [self._terms[k] for k in self._order] + return sorted(self._terms.values(), + key=lambda t: _safe_str(t.get("label")).lower()) + + def __len__(self) -> int: + return len(self._terms) + + def __bool__(self) -> bool: + return bool(self._terms) + + # --------------------------------------------------------------------------- # # Manifest — per-chapter versions and page/slide counts for tracking. # --------------------------------------------------------------------------- # diff --git a/python/functions/datascience/automatic_eda/render_features_test.py b/python/functions/datascience/automatic_eda/render_features_test.py new file mode 100644 index 00000000..40d247ba --- /dev/null +++ b/python/functions/datascience/automatic_eda/render_features_test.py @@ -0,0 +1,354 @@ +"""Tests for the AutomaticEDA engine features added in phase 4a. + +Covers, with executable evidence, the six render-engine improvements: + +1. Bold no longer overlaps the following text in the PDF (real width measured). +2. Zebra striping on data tables (PDF Rectangle fills + PPTX cell fills). +3. Keep-together: a Group moves whole to the next page/slide (heading never gets + stranded from its figure). +4. Every PPTX figure carries a visible caption/title (fallback to the heading). +5. Cover is built last but placed first and reflects an aggregated summary. +6. Glossary is the last chapter; the term "entropía" is a real clickable link in + the PDF (PyMuPDF GOTO annotation) and in the PPTX (native slide-jump run). + +Self-contained: synthetic profiles, no DuckDB. Heavy renderer checks (fitz/pptx) +skip cleanly when the optional engine is missing. 
+""" + +import os +import sys + +import pytest + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions +if _FUNCTIONS not in sys.path: + sys.path.insert(0, _FUNCTIONS) + +import matplotlib # noqa: E402 + +matplotlib.use("Agg") +import matplotlib.colors as mcolors # noqa: E402 +import matplotlib.pyplot as plt # noqa: E402 +from matplotlib.patches import Rectangle # noqa: E402 + +from datascience.automatic_eda import model # noqa: E402 +from datascience.automatic_eda import render_pdf_impl as RP # noqa: E402 +from datascience.automatic_eda import render_pptx_impl as RX # noqa: E402 +from datascience.automatic_eda import build_document # noqa: E402 +from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf # noqa: E402 +from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx # noqa: E402 + + +class _FakePdf: + """Stand-in for PdfPages so the placers can call _new_page in unit tests.""" + + def savefig(self, fig): # noqa: D401 + pass + + +def _small_fig(): + fig = plt.figure(figsize=(4.0, 1.5)) + ax = fig.add_subplot(111) + ax.plot([0, 1, 2], [1, 3, 2]) + return fig + + +def _profile_with_cat_and_num(): + """A tiny profile that triggers cat_distr (→ entropía term) and num_distr.""" + return { + "table": "ventas", "n_rows": 120, "n_cols": 2, "quality_score": 91, + "duplicate_pct": 1.5, "null_cell_pct": 0.8, + "columns": [ + {"name": "region", "inferred_type": "categorical", + "categorical": { + "top": [{"value": "norte", "count": 50, "pct": 0.42}, + {"value": "sur", "count": 40, "pct": 0.33}, + {"value": "este", "count": 30, "pct": 0.25}], + "mode": "norte", "n_distinct": 3, "entropy": 1.55, + "imbalance": 0.1}}, + {"name": "importe", "inferred_type": "numeric", + "numeric": {"mean": 50.0, "median": 48.0, "std": 10.0, + "min": 10, "max": 99, "iqr": 15, + "histogram": [{"lo": 0, "hi": 50, "count": 40}, + {"lo": 50, "hi": 100, "count": 80}]}}, + ], + } 
+ + +# --------------------------------------------------------------------------- # +# 1) Bold does not overlap the following text (PDF). +# --------------------------------------------------------------------------- # +def test_pdf_bold_span_does_not_overlap_following_text(): + fig = plt.figure(figsize=(RP._W, RP._H)) + st = RP._PdfState(_FakePdf(), "t") + st.fig = fig + st.page = 1 + # A wide bold token immediately followed by normal text on the SAME line. + rich = [[("PALABRAMUYANCHAENNEGRITA", True, None), + (" texto normal justo después", False, None)]] + RP._place_rich_lines(st, rich, RP._FS_BODY, RP._INK) + + renderer = fig.canvas.get_renderer() + boxes = sorted((t.get_window_extent(renderer) for t in fig.texts), + key=lambda b: b.x0) + assert len(boxes) == 2, "se esperaban dos spans dibujados" + # The bold span ends before the normal span starts (no overlap). 1px slack. + assert boxes[0].x1 <= boxes[1].x0 + 1.0, \ + "la negrita se solapa con el texto siguiente" + plt.close(fig) + + +# --------------------------------------------------------------------------- # +# 2) Zebra striping. +# --------------------------------------------------------------------------- # +def _facecolor_eq(artist, hexcolor) -> bool: + want = mcolors.to_rgba(hexcolor) + got = artist.get_facecolor() + return all(abs(a - b) < 0.02 for a, b in zip(got[:3], want[:3])) + + +def test_pdf_table_has_zebra_striping(): + fig = plt.figure(figsize=(RP._W, RP._H)) + st = RP._PdfState(_FakePdf(), "t") + st.fig = fig + st.page = 1 + st.chapter = model.Chapter(id="c", title="C", version="1.0.0") + dt = model.DataTable(header=["A", "B"], + rows=[["1", "x"], ["2", "y"], ["3", "z"], ["4", "w"]]) + RP._place_data_table(st, dt) + zebra = [a for a in fig.findobj(Rectangle) if _facecolor_eq(a, RP._ZEBRA)] + # 4 data rows → even rows (1-based 2 and 4) shaded = 2 zebra rectangles. 
+ assert len(zebra) == 2, f"esperadas 2 filas zebra, hay {len(zebra)}" + plt.close(fig) + + +def test_pptx_table_has_zebra_striping(tmp_path): + pptx = pytest.importorskip("pptx") + from pptx import Presentation + from pptx.dml.color import RGBColor + + doc = [model.Chapter(id="c", title="Tabla", version="1.0.0", blocks=[ + model.DataTable(header=["A", "B"], + rows=[["1", "x"], ["2", "y"], ["3", "z"], ["4", "w"]])])] + out = str(tmp_path / "zebra.pptx") + assert render_automatic_eda_pptx(doc, out, {"write_manifest": False})["path"] + + prs = Presentation(out) + table = None + for slide in prs.slides: + for sh in slide.shapes: + if sh.has_table: + table = sh.table + break + assert table is not None, "no se encontró la tabla en el deck" + zebra = RGBColor(0xF6, 0xF8, 0xFA) + white = RGBColor(0xFF, 0xFF, 0xFF) + # Row 0 = header; data rows follow. Even data rows (table rows 2, 4) shaded. + assert table.cell(1, 0).fill.fore_color.rgb == white + assert table.cell(2, 0).fill.fore_color.rgb == zebra + assert table.cell(4, 0).fill.fore_color.rgb == zebra + + +# --------------------------------------------------------------------------- # +# 3) Keep-together (Group): heading + figure never split. +# --------------------------------------------------------------------------- # +def test_pdf_group_moves_whole_to_next_page_when_it_does_not_fit(): + fig = plt.figure(figsize=(RP._W, RP._H)) + st = RP._PdfState(_FakePdf(), "t") + st.fig = fig + st.page = 1 + st.chapter = model.Chapter(id="c", title="C", version="1.0.0") + grp = model.Group(blocks=[ + model.Heading(text="Sección con figura", level=2), + model.Figure(make=_small_fig, caption="cap"), + model.Markdown(text="Descripción breve de la figura."), + ]) + # Only ~0.4in left: the group does not fit here but fits on a fresh page. 
+ st.y = RP._CONTENT_BOTTOM - 0.4 + page_before = st.page + RP._place_group(st, grp) + # Exactly one page break: the whole group (heading+figure+text) stays + # together on the new page — no second break inside it. + assert st.page == page_before + 1 + plt.close(st.fig) + + +def test_pdf_group_does_not_break_when_it_fits(): + fig = plt.figure(figsize=(RP._W, RP._H)) + st = RP._PdfState(_FakePdf(), "t") + st.fig = fig + st.page = 1 + st.chapter = model.Chapter(id="c", title="C", version="1.0.0") + grp = model.Group(blocks=[ + model.Heading(text="Cabe entera", level=2), + model.Figure(make=_small_fig, caption="cap"), + ]) + st.y = RP._CONTENT_TOP # empty page → fits, must not break. + page_before = st.page + RP._place_group(st, grp) + assert st.page == page_before + plt.close(st.fig) + + +def test_pptx_group_moves_whole_to_next_slide(tmp_path): + pytest.importorskip("pptx") + from pptx import Presentation + from pptx.util import Inches + + prs = Presentation() + prs.slide_width = Inches(RX._W) + prs.slide_height = Inches(RX._H) + st = RX._PptxState(prs, "t") + st.chapter = model.Chapter(id="c", title="C", version="1.0.0") + RX._new_slide(st, cont=False) + grp = model.Group(blocks=[ + model.Heading(text="Sección con figura", level=2), + model.Figure(make=_small_fig, caption="cap"), + model.Markdown(text="Descripción breve."), + ]) + st.y = RX._CONTENT_BOTTOM - 0.4 # does not fit here. + slide_before = st.slide_no + RX._place_group(st, grp) + assert st.slide_no == slide_before + 1 # one jump; group kept together. + + +# --------------------------------------------------------------------------- # +# 4) Every PPTX figure carries a visible caption/title. 
+# --------------------------------------------------------------------------- # +def test_pptx_figure_without_caption_gets_heading_title(tmp_path): + pytest.importorskip("pptx") + from pptx import Presentation + from pptx.enum.shapes import MSO_SHAPE_TYPE + + doc = [model.Chapter(id="c", title="Cap", version="1.0.0", blocks=[ + model.Heading(text="Mi sección gráfica", level=2), + model.Figure(make=_small_fig), # NO caption provided. + ])] + out = str(tmp_path / "cap.pptx") + assert render_automatic_eda_pptx(doc, out, {"write_manifest": False})["path"] + + prs = Presentation(out) + for slide in prs.slides: + has_pic = any(sh.shape_type == MSO_SHAPE_TYPE.PICTURE + for sh in slide.shapes) + if not has_pic: + continue + italic = [r.text for sh in slide.shapes if sh.has_text_frame + for p in sh.text_frame.paragraphs for r in p.runs + if r.font.italic and r.text.strip()] + assert italic, "la figura no lleva caption visible en su slide" + assert any("Mi sección gráfica" in t for t in italic), \ + "el caption no cayó al título de la sección" + return + pytest.fail("no se encontró ningún slide con imagen") + + +def test_pptx_no_figure_slide_is_ever_untitled(tmp_path): + """Invariant: across many figures (incl. tall ones), NO slide with an image + lacks a visible caption — the caption never spills to the next slide.""" + pytest.importorskip("pptx") + from pptx import Presentation + from pptx.enum.shapes import MSO_SHAPE_TYPE + + def _tall_fig(): + fig = plt.figure(figsize=(5.0, 4.6)) # nearly square → fills the slide. 
+ fig.add_subplot(111).bar([1, 2, 3], [4, 5, 6]) + return fig + + blocks = [] + for i in range(6): + blocks.append(model.Heading(text=f"Gráfico {i}", level=2)) + blocks.append(model.Figure( + make=_tall_fig, + caption=("Una descripción de la figura deliberadamente larga para " + "que el caption ocupe más de una línea al envolverse en el " + f"ancho del slide — figura número {i} del bloque."))) + doc = [model.Chapter(id="c", title="Muchas figuras", version="1.0.0", + blocks=blocks)] + out = str(tmp_path / "many.pptx") + assert render_automatic_eda_pptx(doc, out, {"write_manifest": False})["path"] + + prs = Presentation(out) + missing = [] + pics = 0 + for i, slide in enumerate(prs.slides): + if not any(sh.shape_type == MSO_SHAPE_TYPE.PICTURE + for sh in slide.shapes): + continue + pics += 1 + italic = [r.text for sh in slide.shapes if sh.has_text_frame + for p in sh.text_frame.paragraphs for r in p.runs + if r.font.italic and r.text.strip()] + if not italic: + missing.append(i) + assert pics >= 6, f"esperadas >=6 figuras, hay {pics}" + assert not missing, f"slides con imagen sin caption: {missing}" + + +# --------------------------------------------------------------------------- # +# 5) Cover built last, placed first, with an aggregated summary. +# --------------------------------------------------------------------------- # +def test_cover_first_glossary_last_with_summary(): + chs = build_document(_profile_with_cat_and_num(), ctx={"dataset_name": "v"}) + ids = [c.id for c in chs] + assert ids[0] == "portada", f"la portada no es la primera: {ids}" + assert ids[-1] == "glosario", f"el glosario no es el último: {ids}" + cover = chs[0] + headings = [b.text for b in cover.blocks if b.kind == "heading"] + assert any("Resumen" in h for h in headings), \ + "la portada no incluye el resumen agregado" + # The summary reflects the body chapters (e.g. the numeric/categorical ones). 
+ cover_text = " ".join( + b.text for b in cover.blocks if getattr(b, "kind", "") == "markdown") + assert "Distribuciones" in cover_text, \ + "el resumen de portada no menciona los capítulos del cuerpo" + + +# --------------------------------------------------------------------------- # +# 6) Glossary clickable in PDF (PyMuPDF GOTO) and PPTX (native slide jump). +# --------------------------------------------------------------------------- # +def test_pdf_glossary_term_is_clickable(tmp_path): + fitz = pytest.importorskip("fitz") + out = str(tmp_path / "glos.pdf") + res = render_automatic_eda_pdf(_profile_with_cat_and_num(), out, + {"ctx": {"dataset_name": "v"}, + "write_manifest": False}) + assert res["path"] == out and os.path.exists(out) + + doc = fitz.open(out) + goto = [(pno, l) for pno in range(doc.page_count) + for l in doc[pno].get_links() if l.get("kind") == fitz.LINK_GOTO] + doc.close() + assert goto, "no hay ningún enlace interno (entropía → glosario) en el PDF" + # Destination must be a real page in the document (the glossary page). 
+ assert all(0 <= l.get("page", -1) for _p, l in goto) + + +def test_pptx_glossary_term_is_clickable(tmp_path): + pytest.importorskip("pptx") + from pptx import Presentation + from pptx.oxml.ns import qn + + out = str(tmp_path / "glos.pptx") + res = render_automatic_eda_pptx(_profile_with_cat_and_num(), out, + {"ctx": {"dataset_name": "v"}, + "write_manifest": False}) + assert res["path"] == out and os.path.exists(out) + + prs = Presentation(out) + found = False + for slide in prs.slides: + for sh in slide.shapes: + if not sh.has_text_frame: + continue + for p in sh.text_frame.paragraphs: + for r in p.runs: + rpr = r._r.find(qn("a:rPr")) + if rpr is None: + continue + hl = rpr.find(qn("a:hlinkClick")) + if hl is not None and \ + hl.get("action") == "ppaction://hlinksldjump": + found = True + assert found, "ningún término tiene hyperlink de salto a slide en el PPTX" diff --git a/python/functions/datascience/automatic_eda/render_pdf_impl.py b/python/functions/datascience/automatic_eda/render_pdf_impl.py index fe8702ce..ffe9a349 100644 --- a/python/functions/datascience/automatic_eda/render_pdf_impl.py +++ b/python/functions/datascience/automatic_eda/render_pdf_impl.py @@ -60,6 +60,8 @@ _FS_BODY, _FS_CELL, _FS_NOTE = 10.5, 9.0, 9.0 _GAP = 0.12 # vertical gap after a block, inches. _CELL_PAD = 0.06 # horizontal padding inside a table cell, inches. _ROW_VPAD = 0.05 # vertical padding inside a table row, inches. +_ZEBRA = "#f6f8fa" # very light grey for zebra-striped (even) table rows. +_LINK = "#2a6f97" # accent colour for clickable glossary terms. class _PdfState: @@ -73,6 +75,11 @@ class _PdfState: self.page = 0 # global page counter. self.chapter = None # current Chapter (for the footer). self.chapter_pages = 0 # pages produced for the current chapter. + self.last_heading = "" # text of the most recent heading. + # Glossary wiring (mejora 6). Pages are 0-based; rects/points are in PDF + # points (1/72") with a top-left origin — same convention as PyMuPDF. 
+ self.term_sources = [] # [{key, page, rect:[x0,y0,x1,y1]}] + self.term_dests = {} # key -> {page, point:[x,y]} # --------------------------------------------------------------------------- # @@ -121,6 +128,35 @@ def _draw_footer(st: _PdfState) -> None: transform=st.fig.transFigure, color=_RULE, lw=0.6)) +def _text_width_in(st: _PdfState, s: str, fs: float, bold: bool) -> float: + """Real rendered width (inches) of ``s`` at ``fs`` with the given weight. + + Measured with the Agg renderer's own font metrics (the same TrueType the PDF + backend embeds), so a **bold** span advances the cursor by its ACTUAL width — + fixing the bug where bold text overlapped the following normal text because + the cursor advanced by the normal-weight average-glyph estimate. Falls back to + the deterministic character grid if the renderer is unavailable, so it never + raises. + """ + if not s: + return 0.0 + try: + from matplotlib.font_manager import FontProperties + renderer = st.fig.canvas.get_renderer() + prop = FontProperties(family="sans-serif", size=fs, + weight="bold" if bold else "normal") + w_px, _h, _d = renderer.get_text_width_height_descent(s, prop, False) + return w_px / float(st.fig.dpi) + except Exception: # noqa: BLE001 — fall back to the conservative grid metric. 
+ return tl.avg_char_width_in(fs) * len(s) + + +def _pt_rect(x0_in: float, y_top_in: float, x1_in: float, + y_bottom_in: float) -> list: + """An inches box (top-left origin) → a PDF-points rect for PyMuPDF links.""" + return [x0_in * 72.0, y_top_in * 72.0, x1_in * 72.0, y_bottom_in * 72.0] + + def _remaining(st: _PdfState) -> float: return _CONTENT_BOTTOM - st.y @@ -138,6 +174,7 @@ def _place_heading(st: _PdfState, block) -> None: level = max(1, min(3, int(getattr(block, "level", 1) or 1))) fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] text = tl.strip_inline_md(getattr(block, "text", "")) + st.last_heading = text or st.last_heading max_chars = tl.chars_per_line(_USABLE_W, fs) lines = tl.wrap(text, max_chars) lh = tl.line_height_in(fs, leading=1.2) @@ -171,17 +208,19 @@ def _place_text_lines(st: _PdfState, lines: list, fs: float, color: str, def _place_rich_lines(st: _PdfState, rich_lines: list, fs: float, color: str, indent: float = 0.0, prefixes=None) -> None: - """Draw pre-wrapped lines of styled segments (bold spans rendered bold). + """Draw pre-wrapped lines of styled segments (bold + clickable term spans). - Each line is ``[(text, is_bold), ...]``. Segments are placed left-to-right, - advancing x by the deterministic character grid (same metric the wrapper - used), so a bold span is rendered with ``fontweight='bold'`` without - changing the line's measured width — the no-cut guarantee is preserved. + Each line is a list of ``(text, is_bold)`` or ``(text, is_bold, term_key)`` + segments. Segments are placed left-to-right, advancing x by the segment's + REAL rendered width (measured with the renderer's font metrics for the actual + weight) — this is what stops a bold span from overlapping the following text: + the cursor no longer advances by the normal-weight estimate. A segment with a + ``term_key`` is drawn in the accent colour and its rectangle is recorded in + ``st.term_sources`` so it becomes a clickable jump to the glossary entry. 
``prefixes`` is an optional ``(first_line, other_lines)`` pair (e.g. a bullet) drawn before the segments. """ lh = tl.line_height_in(fs) - cw = tl.avg_char_width_in(fs) for idx, segs in enumerate(rich_lines): _ensure_space(st, lh) x = _ML + indent @@ -190,14 +229,23 @@ def _place_rich_lines(st: _PdfState, rich_lines: list, fs: float, color: str, if prefix: st.fig.text(_xf(x), _yf(st.y), prefix, fontsize=fs, color=color, ha="left", va="top") - x += cw * len(prefix) - for seg_text, is_bold in segs: + x += _text_width_in(st, prefix, fs, False) + for seg in segs: + if len(seg) == 3: + seg_text, is_bold, term = seg + else: + seg_text, is_bold, term = seg[0], seg[1], None if seg_text == "": continue - st.fig.text(_xf(x), _yf(st.y), seg_text, fontsize=fs, color=color, - ha="left", va="top", + w = _text_width_in(st, seg_text, fs, bool(is_bold)) + st.fig.text(_xf(x), _yf(st.y), seg_text, fontsize=fs, + color=(_LINK if term else color), ha="left", va="top", fontweight="bold" if is_bold else "normal") - x += cw * len(seg_text) + if term: + st.term_sources.append({ + "key": term, "page": st.page - 1, + "rect": _pt_rect(x, st.y, x + w, st.y + lh)}) + x += w st.y += lh @@ -242,7 +290,7 @@ def _place_markdown(st: _PdfState, block) -> None: if stripped.startswith("- ") or stripped.startswith("* "): content = stripped[2:] # keep inline markers for bold rendering. 
bullet_chars = tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY) - rich = tl.wrap_rich(content, bullet_chars) + rich = tl.wrap_rich_terms(content, bullet_chars) _place_rich_lines(st, rich, _FS_BODY, _INK, prefixes=("• ", " ")) i += 1 @@ -258,7 +306,8 @@ def _place_markdown(st: _PdfState, block) -> None: j += 1 text = " ".join(para) max_chars = tl.chars_per_line(_USABLE_W, _FS_BODY) - _place_rich_lines(st, tl.wrap_rich(text, max_chars), _FS_BODY, _INK) + _place_rich_lines(st, tl.wrap_rich_terms(text, max_chars), _FS_BODY, + _INK) i = j st.y += _GAP @@ -325,15 +374,18 @@ def _wrap_row(cells: list, widths: list, fs: float) -> list: def _draw_table_row(st: _PdfState, cells_lines: list, widths: list, fs: float, - y0: float, header: bool) -> float: + y0: float, header: bool, zebra: bool = False) -> float: lh = tl.line_height_in(fs) nlines = max((len(c) for c in cells_lines), default=1) row_h = lh * nlines + _ROW_VPAD * 2 - if header: + # Background: header band, or a faint zebra fill for even data rows. Drawn + # below the text/rule (zorder 0) so striping never hides cell content. + bg = _HEAD_BG if header else (_ZEBRA if zebra else None) + if bg is not None: st.fig.add_artist(Rectangle( (_xf(_ML), _yf(y0 + row_h)), _xf(_ML + _USABLE_W) - _xf(_ML), _yf(y0) - _yf(y0 + row_h), transform=st.fig.transFigure, - color=_HEAD_BG, lw=0, zorder=0)) + color=bg, lw=0, zorder=0)) x = _ML for c, lines in enumerate(cells_lines): for k, ln in enumerate(lines): @@ -378,14 +430,18 @@ def _place_data_table(st: _PdfState, block) -> None: + _ROW_VPAD * 2 _ensure_space(st, header_h() + max(first_row_h, lh)) draw_header() - for r in rows: + # ``data_idx`` is the LOGICAL row index (not reset across page breaks) so the + # zebra pattern stays coherent when a long table splits and repeats the + # header: even rows (1-based) are shaded → 0-based odd indices. 
+ for data_idx, r in enumerate(rows): cells_lines = _wrap_row(r, widths, fs) row_h = lh * max((len(c) for c in cells_lines), default=1) \ + _ROW_VPAD * 2 if _remaining(st) < row_h: _new_page(st) draw_header() # repeat header on the continuation page. - st.y += _draw_table_row(st, cells_lines, widths, fs, st.y, header=False) + st.y += _draw_table_row(st, cells_lines, widths, fs, st.y, + header=False, zebra=(data_idx % 2 == 1)) note = getattr(block, "note", None) if note: _place_text_lines(st, tl.wrap(model._safe_str(note), @@ -414,53 +470,98 @@ def _png_from_figure(fig) -> bytes: return buf.read() -def _place_image_array(st: _PdfState, arr, caption) -> None: +def _figure_png_cached(block): + """Rasterize a Figure to PNG bytes ONCE and cache (bytes, aspect). + + Measuring (keep-together) and drawing must agree on the REAL aspect ratio: + ``bbox_inches='tight'`` changes it vs ``figsize``, so we rasterize once and + reuse the bytes for both. Cached on the block; never raises.""" + cached = getattr(block, "_aeda_png", None) + if cached is not None: + return cached + fig, owned = _resolve_figure(block) + data = None + if fig is not None: + try: + data = _png_from_figure(fig) + finally: + if owned: + try: + plt.close(fig) + except Exception: # noqa: BLE001 + pass + aspect = 0.66 + if data is not None: + try: + arr = mpimg.imread(io.BytesIO(data)) + aspect = (arr.shape[0] / arr.shape[1]) if arr.shape[1] else 0.66 + except Exception: # noqa: BLE001 + aspect = 0.66 + try: + block._aeda_png = (data, aspect) + return block._aeda_png + except Exception: # noqa: BLE001 — block may reject attributes; degrade. 
+ return (data, aspect) + + +def _image_aspect(block) -> float: + """Real aspect (h/w) of an Image block by path, for measurement.""" + path = getattr(block, "path", "") + if path and os.path.exists(path): + try: + arr = mpimg.imread(path) + return (arr.shape[0] / arr.shape[1]) if arr.shape[1] else 0.66 + except Exception: # noqa: BLE001 + pass + return 0.66 + + +def _place_image_array(st: _PdfState, arr, caption, max_h_in=None) -> None: h_px, w_px = arr.shape[0], arr.shape[1] aspect = (h_px / w_px) if w_px else 1.0 + # Reserve the caption's REAL (possibly multi-line) height FIRST, then scale + # the image to (max_h - cap_reserve) so figure + caption always fit the same + # page. cap_reserve adds a cushion so the caption never spills to next page. + cap_lines = (tl.wrap(model._safe_str(caption), + tl.chars_per_line(_USABLE_W, _FS_NOTE)) + if caption else []) + cap_real = tl.line_height_in(_FS_NOTE) * len(cap_lines) if caption else 0.0 + cap_reserve = (cap_real + 0.04 + 0.08) if caption else 0.0 max_h = _CONTENT_BOTTOM - _CONTENT_TOP + # height_in hint (model.Figure/Image): cap the height so a figure in a + # keep-together Group shrinks to leave room for its heading and text. + if isinstance(max_h_in, (int, float)) and max_h_in > 0: + max_h = min(max_h, float(max_h_in)) + max_img_h = max(max_h - cap_reserve, 0.6) target_w = _USABLE_W target_h = target_w * aspect - if target_h > max_h: - target_h = max_h + if target_h > max_img_h: + target_h = max_img_h target_w = target_h / aspect if aspect else _USABLE_W - cap_h = tl.line_height_in(_FS_NOTE) + 0.04 if caption else 0.0 # Move whole image to next page if it does not fit in remaining space. - if _remaining(st) < target_h + cap_h: - if (max_h) >= target_h + cap_h: - _new_page(st) - else: - # Taller than a full page even at min — already clamped to max_h. 
- _new_page(st) + if _remaining(st) < target_h + cap_reserve: + _new_page(st) left_frac = _xf(_ML + (_USABLE_W - target_w) / 2.0) bottom_frac = _yf(st.y + target_h) ax = st.fig.add_axes([left_frac, bottom_frac, target_w / _W, target_h / _H]) ax.imshow(arr) ax.axis("off") st.y += target_h + 0.04 - if caption: - _place_text_lines(st, tl.wrap(model._safe_str(caption), - tl.chars_per_line(_USABLE_W, _FS_NOTE)), - _FS_NOTE, _MUTED, style="italic") + if cap_lines: + _place_text_lines(st, cap_lines, _FS_NOTE, _MUTED, style="italic") st.y += _GAP def _place_figure(st: _PdfState, block) -> None: - fig, owned = _resolve_figure(block) - if fig is None: + png, _aspect = _figure_png_cached(block) + if png is None: _place_text_lines(st, ["(figura no disponible)"], _FS_NOTE, _MUTED, style="italic") st.y += _GAP return - try: - png = _png_from_figure(fig) - finally: - if owned: - try: - plt.close(fig) - except Exception: # noqa: BLE001 - pass arr = mpimg.imread(io.BytesIO(png)) - _place_image_array(st, arr, getattr(block, "caption", None)) + _place_image_array(st, arr, getattr(block, "caption", None), + max_h_in=getattr(block, "height_in", None)) def _place_image(st: _PdfState, block) -> None: @@ -471,7 +572,8 @@ def _place_image(st: _PdfState, block) -> None: st.y += _GAP return arr = mpimg.imread(path) - _place_image_array(st, arr, getattr(block, "caption", None)) + _place_image_array(st, arr, getattr(block, "caption", None), + max_h_in=getattr(block, "height_in", None)) def _place_caption(st: _PdfState, block) -> None: @@ -488,6 +590,189 @@ def _place_note(st: _PdfState, block) -> None: st.y += _GAP +# --------------------------------------------------------------------------- # +# Block measurement (mejora 3 — keep-together). These estimate a block's height +# WITHOUT drawing it, so a Group can decide to move whole to the next page before +# anything is drawn. 
Over-estimating is safe: it only triggers an earlier page +# break, never a content cut (the placers keep their own no-cut pagination). +# --------------------------------------------------------------------------- # +def _measure_heading_text(text: str, level: int) -> float: + level = max(1, min(3, int(level or 1))) + fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] + lines = tl.wrap(tl.strip_inline_md(text), tl.chars_per_line(_USABLE_W, fs)) + h = tl.line_height_in(fs, leading=1.2) * len(lines) + 0.06 + if level == 1: + h += 0.10 + return h + _GAP + + +def _measure_markdown(block) -> float: + raw = str(getattr(block, "text", "") or "") + md_lines = raw.split("\n") + h = 0.0 + i, n = 0, len(md_lines) + while i < n: + stripped = md_lines[i].strip() + if stripped.startswith("|") and stripped.endswith("|"): + j = i + while j < n and md_lines[j].strip().startswith("|") \ + and md_lines[j].strip().endswith("|"): + j += 1 + h += (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) * (j - i) + _GAP + i = j + continue + if stripped == "": + h += tl.line_height_in(_FS_BODY) * 0.5 + i += 1 + continue + if stripped.startswith("### "): + h += _measure_heading_text(stripped[4:], 3) + i += 1 + continue + if stripped.startswith("## "): + h += _measure_heading_text(stripped[3:], 2) + i += 1 + continue + if stripped.startswith("# "): + h += _measure_heading_text(stripped[2:], 1) + i += 1 + continue + if stripped.startswith("- ") or stripped.startswith("* "): + lines = tl.wrap_rich_terms( + stripped[2:], tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY)) + h += tl.line_height_in(_FS_BODY) * len(lines) + i += 1 + continue + para = [stripped] + j = i + 1 + while j < n: + nxt = md_lines[j].strip() + if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")): + break + para.append(nxt) + j += 1 + lines = tl.wrap_rich_terms(" ".join(para), + tl.chars_per_line(_USABLE_W, _FS_BODY)) + h += tl.line_height_in(_FS_BODY) * len(lines) + i = j + return h + _GAP + + +def _measure_figure_like(block) -> float: + 
max_h = _CONTENT_BOTTOM - _CONTENT_TOP + hint = getattr(block, "height_in", None) + if isinstance(hint, (int, float)) and hint > 0: + target_h = min(float(hint), max_h) + else: + # Real rasterized aspect (cached) so measuring matches drawing. + if getattr(block, "kind", "") == "image": + aspect = _image_aspect(block) + else: + _data, aspect = _figure_png_cached(block) + target_h = min(_USABLE_W * aspect, max_h) + cap = getattr(block, "caption", None) + cap_h = tl.line_height_in(_FS_NOTE) + 0.04 if cap else 0.0 + return target_h + 0.04 + cap_h + _GAP + + +def _measure_block(st: _PdfState, block) -> float: + kind = getattr(block, "kind", "") + try: + if kind == "heading": + return _measure_heading_text(getattr(block, "text", ""), + getattr(block, "level", 1)) + if kind == "markdown": + return _measure_markdown(block) + if kind in ("figure", "image"): + return _measure_figure_like(block) + if kind in ("caption", "note"): + lines = tl.wrap(getattr(block, "text", ""), + tl.chars_per_line(_USABLE_W, _FS_NOTE)) + return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP + if kind == "kv_table": + rows = getattr(block, "rows", []) or [] + return (tl.line_height_in(_FS_BODY) + _ROW_VPAD) * (len(rows) + 1) \ + + _GAP + if kind == "data_table": + rows = getattr(block, "rows", []) or [] + return (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) \ + * (len(rows) + 1) + _GAP + if kind == "group": + return sum(_measure_block(st, b) + for b in (getattr(block, "blocks", []) or [])) + except Exception: # noqa: BLE001 — a measurement never aborts rendering. + pass + return tl.line_height_in(_FS_BODY) + + +def _shrink_group_figures(st: _PdfState, blocks: list, avail_full: float) -> None: + """Cap each figure's height (via height_in) so the whole group fits a page. 
+ + The figure shrinks just enough to leave room for its heading, text and + caption — keep-together puts the chart on the SAME page as its title and + description instead of pushing it to the next page.""" + fig_blocks = [b for b in blocks + if getattr(b, "kind", "") in ("figure", "image")] + if not fig_blocks: + return + nonfig_h = sum(_measure_block(st, b) for b in blocks + if getattr(b, "kind", "") not in ("figure", "image")) + fig_overhead = tl.line_height_in(_FS_NOTE) + 0.04 + 0.04 + _GAP + budget = avail_full - nonfig_h - 0.08 * len(fig_blocks) + if budget <= 0.8: + return + per = budget / len(fig_blocks) - fig_overhead + if per <= 0.6: + return + for fb in fig_blocks: + cur = getattr(fb, "height_in", None) + fb.height_in = (min(float(cur), per) + if isinstance(cur, (int, float)) and cur > 0 else per) + + +def _place_group(st: _PdfState, block) -> None: + """Render a keep-together Group: move it whole to the next page if needed.""" + blocks = getattr(block, "blocks", []) or [] + if not blocks: + return + avail_full = _CONTENT_BOTTOM - _CONTENT_TOP + _shrink_group_figures(st, blocks, avail_full) + total = sum(_measure_block(st, b) for b in blocks) + if total <= avail_full: + # Fits on one page: keep it together by moving whole when it won't fit. + if total > _remaining(st): + _new_page(st) + elif st.y > _CONTENT_TOP + 1e-6: + # Taller than a full page: at least start it on a fresh page, then flow. + _new_page(st) + for b in blocks: + placer = _PLACERS.get(getattr(b, "kind", ""), _place_note) + try: + placer(st, b) + except Exception: # noqa: BLE001 — a bad block never aborts the group. 
+ pass + + +def _place_glossary_entry(st: _PdfState, block) -> None: + """Render one glossary term and register it as a clickable link target.""" + key = getattr(block, "key", "") + label = getattr(block, "label", "") or key + definition = getattr(block, "definition", "") + # Reserve the term + its first definition line together, then anchor the + # destination at the resolved page/position before drawing. + _ensure_space(st, tl.line_height_in(_FS_H3, leading=1.2) + + tl.line_height_in(_FS_BODY) * 2) + if key: + st.term_dests[key] = {"page": st.page - 1, + "point": [_ML * 72.0, st.y * 72.0]} + _place_heading(st, model.Heading(text=str(label), level=3)) + if definition: + _place_text_lines(st, tl.wrap(model._safe_str(definition), + tl.chars_per_line(_USABLE_W, _FS_BODY)), + _FS_BODY, _INK) + st.y += _GAP * 0.5 + + _PLACERS = { "heading": _place_heading, "markdown": _place_markdown, @@ -497,6 +782,8 @@ _PLACERS = { "image": _place_image, "caption": _place_caption, "note": _place_note, + "group": _place_group, + "glossary_entry": _place_glossary_entry, } @@ -553,8 +840,42 @@ def render_pdf(chapters: list, out_path: str, meta: dict = None) -> dict: return {"path": None, "n_pages": 0, "chapters": [], "note": f"fallo al escribir el PDF: {e}"} + # Mejora 6 — wire clickable glossary links now the PDF is closed on disk. + # PdfPages cannot emit internal hyperlinks, so we post-process with PyMuPDF + # (delegated registry function). Degrades silently if it is unavailable. + n_links = _wire_glossary_links(st, out_path, notes) + note = f"{n_pages} páginas" + if n_links: + note += f" · {n_links} enlaces de glosario" if notes: note += " · " + "; ".join(notes) return {"path": out_path, "n_pages": n_pages, "chapters": chapters_meta, "note": note} + + +def _wire_glossary_links(st: _PdfState, out_path: str, notes: list) -> int: + """Build {source rect → glossary dest} links and apply them via PyMuPDF. 
+ + Returns the number of links applied (0 if there is nothing to wire or the + post-processor is unavailable). Never raises.""" + try: + links = [] + for src in st.term_sources: + dest = st.term_dests.get(src.get("key")) + if not dest: + continue + links.append({ + "src_page": src["page"], "src_rect": src["rect"], + "dst_page": dest["page"], "dst_point": dest["point"]}) + if not links: + return 0 + from datascience.add_pdf_internal_links import add_pdf_internal_links + res = add_pdf_internal_links(out_path, links) + if isinstance(res, dict) and res.get("status") == "ok": + return int(res.get("n_links") or 0) + if isinstance(res, dict) and res.get("error"): + notes.append(f"glosario sin enlaces: {res.get('error')}") + except Exception as e: # noqa: BLE001 — links are best-effort. + notes.append(f"glosario sin enlaces: {e}") + return 0 diff --git a/python/functions/datascience/automatic_eda/render_pptx_impl.py b/python/functions/datascience/automatic_eda/render_pptx_impl.py index db7d201a..5e3ba331 100644 --- a/python/functions/datascience/automatic_eda/render_pptx_impl.py +++ b/python/functions/datascience/automatic_eda/render_pptx_impl.py @@ -43,6 +43,8 @@ _ACCENT = (0x2A, 0x6F, 0x97) _MUTED = (0x8A, 0x8A, 0x8A) _HEAD_BG = (0xEE, 0xF3, 0xF6) _WHITE = (0xFF, 0xFF, 0xFF) +_ZEBRA = (0xF6, 0xF8, 0xFA) # faint grey for even (zebra) data rows. +_LINK = (0x2A, 0x6F, 0x97) # accent colour for clickable glossary terms. _FS_TITLE = 26 _FS_H1, _FS_H2, _FS_H3 = 20, 16, 13 @@ -59,6 +61,10 @@ class _PptxState: self.chapter = None self.slide_no = 0 self.chapter_slides = 0 + self.last_heading = "" # text of the most recent heading. + # Glossary wiring (mejora 6): runs to link and per-term target slide. 
+ self.term_runs = [] # [(key, run)] + self.term_anchor_slide = {} # key -> Slide (glossary entry) def _rgb(c): @@ -155,9 +161,13 @@ def _add_rich_text(st: _PptxState, rich_lines: list, fs: float, color, indent=0.0, bullet=False) -> None: """Add pre-wrapped lines of styled segments as one paragraph per line. - Each line is ``[(text, is_bold), ...]``; every segment becomes its own run - so ``**bold**`` spans render with native PowerPoint bold (``run.font.bold``) - without affecting the measured height (one paragraph per pre-wrapped line). + Each line is a list of ``(text, is_bold)`` or ``(text, is_bold, term_key)`` + segments; every segment becomes its own run so ``**bold**`` spans render with + native PowerPoint bold (``run.font.bold``) without affecting the measured + height (one paragraph per pre-wrapped line). A segment carrying a + ``term_key`` is drawn in the accent colour and its run is recorded in + ``st.term_runs`` so it later becomes a native hyperlink jumping to the + glossary slide of that term. 
""" lh = tl.line_height_in(fs) height = lh * len(rich_lines) + 0.05 @@ -176,14 +186,20 @@ def _add_rich_text(st: _PptxState, rich_lines: list, fs: float, color, r0.text = "• " r0.font.size = Pt(fs) r0.font.color.rgb = _rgb(color) - for seg_text, is_bold in segs: + for seg in segs: + if len(seg) == 3: + seg_text, is_bold, term = seg + else: + seg_text, is_bold, term = seg[0], seg[1], None if seg_text == "": continue run = p.add_run() run.text = seg_text run.font.size = Pt(fs) run.font.bold = bool(is_bold) - run.font.color.rgb = _rgb(color) + run.font.color.rgb = _rgb(_LINK if term else color) + if term: + st.term_runs.append((term, run, st.slide)) st.y += height @@ -191,6 +207,7 @@ def _place_heading(st: _PptxState, block) -> None: level = max(1, min(3, int(getattr(block, "level", 1) or 1))) fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] text = tl.strip_inline_md(getattr(block, "text", "")) + st.last_heading = text or st.last_heading lines = tl.wrap(text, tl.chars_per_line(_USABLE_W, fs)) _add_text(st, lines, fs, _INK, bold=True) st.y += 0.04 @@ -233,12 +250,12 @@ def _place_markdown(st: _PptxState, block) -> None: continue if stripped.startswith("- ") or stripped.startswith("* "): content = stripped[2:] # keep inline markers for bold rendering. - rich = tl.wrap_rich(content, - tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY)) + rich = tl.wrap_rich_terms(content, + tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY)) _add_rich_text(st, rich, _FS_BODY, _INK, bullet=True) i += 1 continue - para = [stripped] # keep inline markers; wrap_rich renders **bold**. + para = [stripped] # keep inline markers; wrap_rich_terms renders **bold**. 
j = i + 1 while j < n: nxt = md_lines[j].strip() @@ -247,8 +264,8 @@ def _place_markdown(st: _PptxState, block) -> None: para.append(nxt) j += 1 text = " ".join(para) - _add_rich_text(st, tl.wrap_rich(text, tl.chars_per_line(_USABLE_W, _FS_BODY)), - _FS_BODY, _INK) + _add_rich_text(st, tl.wrap_rich_terms( + text, tl.chars_per_line(_USABLE_W, _FS_BODY)), _FS_BODY, _INK) i = j st.y += _GAP @@ -295,7 +312,8 @@ def _row_height_in(cells, widths, fs) -> float: return lh * maxlines + 0.10 -def _emit_table(st: _PptxState, header, chunk, widths, fs) -> None: +def _emit_table(st: _PptxState, header, chunk, widths, fs, + start_index: int = 0) -> None: nrows = len(chunk) + (1 if header else 0) ncol = len(widths) # Pre-measure total height to size the shape (pptx still auto-grows rows). @@ -319,11 +337,14 @@ def _emit_table(st: _PptxState, header, chunk, widths, fs) -> None: cell.text = model._safe_str(header[c]) if c < len(header) else "" _style_cell(cell, fs, _INK, bold=True, fill=_HEAD_BG) ridx = 1 - for r in chunk: + # Zebra striping: shade even data rows (1-based) using the GLOBAL row index + # (start_index offset) so the pattern stays coherent across split chunks. + for k, r in enumerate(chunk): + fill = _ZEBRA if (start_index + k) % 2 == 1 else _WHITE for c in range(ncol): cell = gtable.cell(ridx, c) cell.text = model._safe_str(r[c]) if c < len(r) else "" - _style_cell(cell, fs, _INK, bold=False, fill=_WHITE) + _style_cell(cell, fs, _INK, bold=False, fill=fill) ridx += 1 st.y += total_h + _GAP @@ -367,6 +388,7 @@ def _place_data_table(st: _PptxState, block, shaded_header=True, avail = _remaining(st) - header_h chunk = [] used = 0.0 + chunk_start = idx # global index of the first row in this chunk (zebra). 
while idx < n: rh = _row_height_in(rows[idx], widths, fs) if used + rh > avail and chunk: @@ -374,7 +396,7 @@ def _place_data_table(st: _PptxState, block, shaded_header=True, chunk.append(rows[idx]) used += rh idx += 1 - _emit_table(st, header, chunk, widths, fs) + _emit_table(st, header, chunk, widths, fs, start_index=chunk_start) note = getattr(block, "note", None) if note: _add_text(st, tl.wrap(model._safe_str(note), @@ -421,54 +443,97 @@ def _resolve_png(block): pass -def _place_picture_bytes(st: _PptxState, data: bytes, caption) -> None: +def _figure_bytes_cached(block): + """Rasterize a figure/image to PNG bytes ONCE and cache (bytes, aspect). + + Measuring (keep-together) and drawing must agree on the real aspect ratio — + ``bbox_inches='tight'`` changes it vs ``figsize``, so we rasterize once and + reuse the bytes for both. Cached on the block; never raises.""" + cached = getattr(block, "_aeda_png", None) + if cached is not None: + return cached + kind = getattr(block, "kind", "") + data = None + if kind == "image": + path = getattr(block, "path", "") + if path and os.path.exists(path): + try: + with open(path, "rb") as fh: + data = fh.read() + except Exception: # noqa: BLE001 + data = None + else: + data = _resolve_png(block) + aspect = 0.66 + if data is not None: + w_px, h_px = _img_size_px(data) + aspect = (h_px / w_px) if w_px else 0.66 + try: + block._aeda_png = (data, aspect) + return block._aeda_png + except Exception: # noqa: BLE001 — block may reject attributes; degrade. + return (data, aspect) + + +def _place_picture_bytes(st: _PptxState, data: bytes, caption, + max_h_in=None) -> None: + # Mejora 4 — every figure on a slide carries a visible caption/title. If the + # block has no caption, fall back to the current section heading, then to a + # generic label, so no image is ever shown untitled. 
+ caption = (model._safe_str(caption).strip() + or model._safe_str(st.last_heading).strip() or "Figura") w_px, h_px = _img_size_px(data) aspect = (h_px / w_px) if w_px else 0.66 + # Reserve the caption's REAL (possibly multi-line) height FIRST, then scale + # the image to (max_h - cap_reserve): a figure never fills the whole slide, + # so its caption always fits on the SAME slide and no image is untitled. + # cap_real = what _add_text consumes; cap_reserve adds the post-image gap and + # a small cushion so the caption never spills to the next slide. + cap_lines = tl.wrap(caption, tl.chars_per_line(_USABLE_W, _FS_NOTE)) + cap_real = tl.line_height_in(_FS_NOTE) * len(cap_lines) + 0.05 + cap_reserve = cap_real + 0.05 + 0.10 max_h = _CONTENT_BOTTOM - _CONTENT_TOP + # height_in hint (model.Figure/Image): cap the target height so a figure in a + # keep-together Group shrinks to leave room for its heading and text. + if isinstance(max_h_in, (int, float)) and max_h_in > 0: + max_h = min(max_h, float(max_h_in)) + max_img_h = max(max_h - cap_reserve, 0.6) target_w = _USABLE_W target_h = target_w * aspect - if target_h > max_h: - target_h = max_h + if target_h > max_img_h: + target_h = max_img_h target_w = target_h / aspect if aspect else _USABLE_W - cap_h = tl.line_height_in(_FS_NOTE) + 0.05 if caption else 0.0 - if _remaining(st) < target_h + cap_h: + # Keep the image and its caption together on the same slide. 
+ if _remaining(st) < target_h + cap_reserve: _new_slide(st, cont=True) left = _ML + (_USABLE_W - target_w) / 2.0 st.slide.shapes.add_picture(io.BytesIO(data), Inches(left), Inches(st.y), width=Inches(target_w), height=Inches(target_h)) st.y += target_h + 0.05 - if caption: - _add_text(st, tl.wrap(model._safe_str(caption), - tl.chars_per_line(_USABLE_W, _FS_NOTE)), _FS_NOTE, _MUTED, - italic=True) + _add_text(st, cap_lines, _FS_NOTE, _MUTED, italic=True) st.y += _GAP def _place_figure(st: _PptxState, block) -> None: - png = _resolve_png(block) + png, _aspect = _figure_bytes_cached(block) if png is None: _add_text(st, ["(figura no disponible)"], _FS_NOTE, _MUTED, italic=True) st.y += _GAP return - _place_picture_bytes(st, png, getattr(block, "caption", None)) + _place_picture_bytes(st, png, getattr(block, "caption", None), + max_h_in=getattr(block, "height_in", None)) def _place_image(st: _PptxState, block) -> None: - path = getattr(block, "path", "") - if not path or not os.path.exists(path): + data, _aspect = _figure_bytes_cached(block) + if data is None: + path = getattr(block, "path", "") _add_text(st, [f"(imagen no encontrada: {path})"], _FS_NOTE, _MUTED, italic=True) st.y += _GAP return - try: - with open(path, "rb") as fh: - data = fh.read() - except Exception as e: # noqa: BLE001 - _add_text(st, [f"(no se pudo leer la imagen: {e})"], _FS_NOTE, _MUTED, - italic=True) - st.y += _GAP - return - _place_picture_bytes(st, data, getattr(block, "caption", None)) + _place_picture_bytes(st, data, getattr(block, "caption", None), + max_h_in=getattr(block, "height_in", None)) def _place_caption(st: _PptxState, block) -> None: @@ -482,6 +547,170 @@ def _place_note(st: _PptxState, block) -> None: _place_caption(st, block) +# --------------------------------------------------------------------------- # +# Block measurement (mejora 3 — keep-together). Estimate a block's slide height +# WITHOUT drawing it so a Group can move whole to the next slide before drawing. 
+# Over-estimating only triggers an earlier slide break, never a content cut. +# --------------------------------------------------------------------------- # +def _measure_heading_text(text: str, level: int) -> float: + level = max(1, min(3, int(level or 1))) + fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level] + lines = tl.wrap(tl.strip_inline_md(text), tl.chars_per_line(_USABLE_W, fs)) + return tl.line_height_in(fs) * len(lines) + 0.05 + 0.04 + + +def _measure_markdown(block) -> float: + raw = str(getattr(block, "text", "") or "") + md_lines = raw.split("\n") + h = 0.0 + i, n = 0, len(md_lines) + while i < n: + stripped = md_lines[i].strip() + if stripped.startswith("|") and stripped.endswith("|"): + j = i + while j < n and md_lines[j].strip().startswith("|") \ + and md_lines[j].strip().endswith("|"): + j += 1 + h += (tl.line_height_in(_FS_CELL) + 0.10) * (j - i) + _GAP + i = j + continue + if stripped == "": + h += tl.line_height_in(_FS_BODY) * 0.4 + i += 1 + continue + if stripped.startswith("### "): + h += _measure_heading_text(stripped[4:], 3) + i += 1 + continue + if stripped.startswith("## "): + h += _measure_heading_text(stripped[3:], 2) + i += 1 + continue + if stripped.startswith("# "): + h += _measure_heading_text(stripped[2:], 1) + i += 1 + continue + if stripped.startswith("- ") or stripped.startswith("* "): + lines = tl.wrap_rich_terms( + stripped[2:], tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY)) + h += tl.line_height_in(_FS_BODY) * len(lines) + 0.05 + i += 1 + continue + para = [stripped] + j = i + 1 + while j < n: + nxt = md_lines[j].strip() + if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")): + break + para.append(nxt) + j += 1 + lines = tl.wrap_rich_terms(" ".join(para), + tl.chars_per_line(_USABLE_W, _FS_BODY)) + h += tl.line_height_in(_FS_BODY) * len(lines) + 0.05 + i = j + return h + _GAP + + +def _measure_figure_like(block) -> float: + max_h = _CONTENT_BOTTOM - _CONTENT_TOP + hint = getattr(block, "height_in", None) + if isinstance(hint, 
(int, float)) and hint > 0: + max_h = min(max_h, float(hint)) + # Use the REAL rasterized aspect (cached) so measuring matches drawing — this + # is what keeps a figure together with its heading instead of splitting. + _data, aspect = _figure_bytes_cached(block) + target_h = min(_USABLE_W * aspect, max_h) + # Caption is always emitted now (mejora 4), so always reserve its line. + cap_h = tl.line_height_in(_FS_NOTE) + 0.05 + return target_h + 0.05 + cap_h + _GAP + + +def _measure_block(st: _PptxState, block) -> float: + kind = getattr(block, "kind", "") + try: + if kind == "heading": + return _measure_heading_text(getattr(block, "text", ""), + getattr(block, "level", 1)) + if kind == "markdown": + return _measure_markdown(block) + if kind in ("figure", "image"): + return _measure_figure_like(block) + if kind in ("caption", "note"): + lines = tl.wrap(getattr(block, "text", ""), + tl.chars_per_line(_USABLE_W, _FS_NOTE)) + return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP + if kind in ("kv_table", "data_table"): + rows = getattr(block, "rows", []) or [] + return (tl.line_height_in(_FS_CELL) + 0.10) * (len(rows) + 1) + _GAP + if kind == "group": + return sum(_measure_block(st, b) + for b in (getattr(block, "blocks", []) or [])) + except Exception: # noqa: BLE001 — a measurement never aborts rendering. + pass + return tl.line_height_in(_FS_BODY) + + +def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> None: + """Cap each figure's height (via height_in) so the whole group fits a slide. 
+ + The figure shrinks just enough to leave room for its heading, text and + caption — that is how keep-together puts a chart on the SAME slide as its + title and description instead of pushing it to the next slide.""" + fig_blocks = [b for b in blocks + if getattr(b, "kind", "") in ("figure", "image")] + if not fig_blocks: + return + nonfig_h = sum(_measure_block(st, b) for b in blocks + if getattr(b, "kind", "") not in ("figure", "image")) + fig_overhead = tl.line_height_in(_FS_NOTE) + 0.05 + 0.05 + _GAP + budget = avail_full - nonfig_h - 0.10 * len(fig_blocks) + if budget <= 1.0: + return # not enough room to keep together; let it flow (degrade). + per = budget / len(fig_blocks) - fig_overhead + if per <= 0.8: + return + for fb in fig_blocks: + cur = getattr(fb, "height_in", None) + fb.height_in = (min(float(cur), per) + if isinstance(cur, (int, float)) and cur > 0 else per) + + +def _place_group(st: _PptxState, block) -> None: + """Render a keep-together Group: move it whole to the next slide if needed.""" + blocks = getattr(block, "blocks", []) or [] + if not blocks: + return + avail_full = _CONTENT_BOTTOM - _CONTENT_TOP + _shrink_group_figures(st, blocks, avail_full) + total = sum(_measure_block(st, b) for b in blocks) + if total <= avail_full: + if total > _remaining(st): + _new_slide(st, cont=True) + elif st.y > _CONTENT_TOP + 1e-6: + _new_slide(st, cont=True) + for b in blocks: + placer = _PLACERS.get(getattr(b, "kind", ""), _place_note) + try: + placer(st, b) + except Exception: # noqa: BLE001 — a bad block never aborts the group. 
+ pass + + +def _place_glossary_entry(st: _PptxState, block) -> None: + """Render one glossary term and register its slide as the link target.""" + key = getattr(block, "key", "") + label = getattr(block, "label", "") or key + definition = getattr(block, "definition", "") + _ensure(st, tl.line_height_in(_FS_H3) + tl.line_height_in(_FS_BODY) * 2) + if key: + st.term_anchor_slide[key] = st.slide + _place_heading(st, model.Heading(text=str(label), level=3)) + if definition: + _add_text(st, tl.wrap(model._safe_str(definition), + tl.chars_per_line(_USABLE_W, _FS_BODY)), _FS_BODY, _INK) + st.y += _GAP + + _PLACERS = { "heading": _place_heading, "markdown": _place_markdown, @@ -491,6 +720,8 @@ _PLACERS = { "image": _place_image, "caption": _place_caption, "note": _place_note, + "group": _place_group, + "glossary_entry": _place_glossary_entry, } @@ -542,6 +773,9 @@ def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict: _new_slide(st, cont=False) _place_note(st, model.Note( "(documento vacío — sin capítulos aplicables)")) + # Mejora 6 — wire clickable glossary terms to their entry slide (native + # PowerPoint slide-jump). Delegated registry function; degrades silently. + n_links = _wire_glossary_links(st, notes) prs.save(out_path) n_slides = st.slide_no except Exception as e: # noqa: BLE001 @@ -549,7 +783,35 @@ def render_pptx(chapters: list, out_path: str, meta: dict = None) -> dict: "note": f"fallo al escribir el PPTX: {e}"} note = f"{n_slides} slides" + if n_links: + note += f" · {n_links} enlaces de glosario" if notes: note += " · " + "; ".join(notes) return {"path": out_path, "n_slides": n_slides, "chapters": chapters_meta, "note": note} + + +def _wire_glossary_links(st: _PptxState, notes: list) -> int: + """Turn each recorded term run into a native jump to its glossary slide. + + Returns the number of links applied. A term whose only appearance is inside + its own glossary entry (source slide == target slide) is skipped. 
Never + raises.""" + if not st.term_runs or not st.term_anchor_slide: + return 0 + linked = 0 + try: + from datascience.pptx_link_run_to_slide import pptx_link_run_to_slide + except Exception as e: # noqa: BLE001 + notes.append(f"glosario sin enlaces: {e}") + return 0 + for key, run, src_slide in st.term_runs: + tgt = st.term_anchor_slide.get(key) + if tgt is None or tgt is src_slide: + continue + try: + if pptx_link_run_to_slide(run, src_slide, tgt): + linked += 1 + except Exception: # noqa: BLE001 — links are best-effort. + pass + return linked diff --git a/python/functions/datascience/automatic_eda/text_layout.py b/python/functions/datascience/automatic_eda/text_layout.py index 0d07d140..059e12a2 100644 --- a/python/functions/datascience/automatic_eda/text_layout.py +++ b/python/functions/datascience/automatic_eda/text_layout.py @@ -24,6 +24,13 @@ import textwrap # the visible text matches ``strip_inline_md`` exactly. _INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)") +# Glossary term span: ``[[term:key]]texto visible[[/term]]``. The visible text +# (which may itself contain ``**bold**``) is kept and tagged with ``key`` so the +# renderers can turn each appearance into a clickable jump to the glossary entry. +_TERM_SPAN_RE = re.compile(r"\[\[term:([A-Za-z0-9_]+)\]\](.*?)\[\[/term\]\]", + re.S) +_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]") + def avg_char_width_in(fontsize_pt: float) -> float: """Approximate average glyph width in inches for a sans-serif font. @@ -86,11 +93,21 @@ def strip_inline_md(text: str) -> str: if not text: return "" s = str(text) + # Drop glossary term markers, keeping the visible inner text. + s = _TERM_SPAN_RE.sub(lambda m: m.group(2), s) + s = _TERM_OPEN_RE.sub("", s) # leftover unbalanced open marker. + s = s.replace("[[/term]]", "") # leftover unbalanced close marker. 
for marker in ("**", "__", "`"): s = s.replace(marker, "") return s +def _strip_term_markers(s: str) -> str: + """Remove any (balanced or leftover) glossary term markers, keeping text.""" + s = _TERM_OPEN_RE.sub("", s) + return s.replace("[[/term]]", "") + + def _strip_leftover_markers(s: str) -> str: """Drop any unbalanced inline markers from a plain (non-span) fragment. @@ -222,6 +239,118 @@ def wrap_rich(text: str, max_chars: int): return lines or [[("", False)]] +def parse_inline_rich(text: str): + """Split ``text`` into ``[(fragment, is_bold, term_key), ...]``. + + Extends :func:`parse_inline_bold` with glossary term spans + ``[[term:key]]visible[[/term]]``: the inner ``visible`` text is parsed for + ``**bold**`` as usual and every resulting fragment carries ``term_key`` so the + renderers can make it clickable. Text outside a term span gets ``term_key = + None``. Unbalanced term markers are stripped (kept identical to + :func:`strip_inline_md`). The concatenation of all fragment texts equals + ``strip_inline_md(text)`` — visible characters and wrapping are unchanged; only + the bold flag and the term key are added. Adjacent fragments with the same + (bold, term) are merged. + """ + s = "" if text is None else str(text) + if not s: + return [] + out = [] + + def _emit(fragment: str, bold: bool, term) -> None: + if fragment == "": + return + if out and out[-1][1] == bold and out[-1][2] == term: + out[-1] = (out[-1][0] + fragment, bold, term) + else: + out.append((fragment, bold, term)) + + def _emit_bolded(segment: str, term) -> None: + # Reuse the bold parser on a term-marker-free segment. 
+ for frag, bold in parse_inline_bold(_strip_term_markers(segment)): + _emit(frag, bold, term) + + pos = 0 + for m in _TERM_SPAN_RE.finditer(s): + if m.start() > pos: + _emit_bolded(s[pos:m.start()], None) + _emit_bolded(m.group(2), m.group(1)) + pos = m.end() + if pos < len(s): + _emit_bolded(s[pos:], None) + return out + + +def wrap_rich_terms(text: str, max_chars: int): + """Like :func:`wrap_rich` but preserving glossary term keys per fragment. + + Returns ``list[list[(fragment, is_bold, term_key)]]`` — one inner list per + output line. Wrapping is word-aware and hard-splits over-long tokens so no + line exceeds ``max_chars`` (the renderers measure these very lines). Term and + bold flags never widen a line: the visible width matches :func:`wrap`. + """ + if max_chars < 1: + max_chars = 1 + spans = parse_inline_rich(text) + if not spans: + return [[("", False, None)]] + + tokens = [] # each: (word, bold, term) or ("\n", None, None) + for frag, bold, term in spans: + parts = frag.split("\n") + for pi, part in enumerate(parts): + if pi > 0: + tokens.append(("\n", None, None)) + for word in part.split(" "): + if word == "": + continue + tokens.append((word, bold, term)) + + lines = [] + cur = [] + cur_len = 0 + + def _flush(): + nonlocal cur, cur_len + merged = [] + for k, (word, bold, term) in enumerate(cur): + piece = word if k == 0 else " " + word + if merged and merged[-1][1] == bold and merged[-1][2] == term: + merged[-1] = (merged[-1][0] + piece, bold, term) + else: + merged.append((piece, bold, term)) + lines.append(merged or [("", False, None)]) + cur = [] + cur_len = 0 + + for word, bold, term in tokens: + if bold is None: # forced newline + _flush() + continue + if len(word) > max_chars: + if cur: + _flush() + chunks = _hard_split(word, max_chars) + for ci, chunk in enumerate(chunks): + if ci < len(chunks) - 1: + lines.append([(chunk, bold, term)]) + else: + cur = [(chunk, bold, term)] + cur_len = len(chunk) + continue + add = len(word) if cur_len == 0 
else cur_len + 1 + len(word) + if cur_len != 0 and add > max_chars: + _flush() + cur = [(word, bold, term)] + cur_len = len(word) + else: + cur.append((word, bold, term)) + cur_len = add + if cur: + _flush() + return lines or [[("", False, None)]] + + def parse_md_table(lines: list): """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None. diff --git a/python/functions/datascience/pptx_link_run_to_slide.md b/python/functions/datascience/pptx_link_run_to_slide.md new file mode 100644 index 00000000..33298afd --- /dev/null +++ b/python/functions/datascience/pptx_link_run_to_slide.md @@ -0,0 +1,85 @@ +--- +name: pptx_link_run_to_slide +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def pptx_link_run_to_slide(run, source_slide, target_slide) -> bool" +description: "Convierte un run de texto de python-pptx en un hyperlink INTERNO 'ir a la diapositiva'. python-pptx soporta run.hyperlink.address para URLs externas pero NO para saltar a otra slide del mismo deck; esta función crea ese salto manipulando el XML: añade una relación slide->slide (RT.SLIDE) y un con action='ppaction://hlinksldjump' y el r:id de la relación, insertado como primer hijo del del run (orden del schema CT_TextCharacterProperties). Idempotente (elimina un hlinkClick previo antes de insertar). Al pulsar el texto en PowerPoint o visores compatibles se navega a target_slide. Motor python-pptx. No lanza nunca: cualquier excepción -> return False." +tags: [eda, pptx, hyperlink, slide-jump, navigation, glossary, automatic-eda, python-pptx, xml, datascience, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: ["python-pptx"] +params: + - name: run + desc: "el pptx.text.text._Run cuyo texto se vuelve clicable. Debe pertenecer a un run real (expone ._r, el elemento ). Un objeto sin ._r hace que la función devuelva False sin lanzar." 
+ - name: source_slide + desc: "la Slide que contiene el run. Su part recibe la relación slide->slide (relate_to con RELATIONSHIP_TYPE.SLIDE); el r:id resultante se referencia en el hlinkClick." + - name: target_slide + desc: "la Slide de destino del salto. Debe pertenecer al MISMO Presentation que source_slide para que la relación interna sea válida." +output: "bool. True si se aplicó el hyperlink interno (relación creada + <a:hlinkClick> insertado en el rPr del run); False si algo lo impidió (run inválido, slides de presentaciones distintas, etc.). Nunca lanza." +tested: true +tests: ["test_golden_run_se_vuelve_salto_a_otra_slide", "test_idempotente_reaplica_sin_duplicar_hlinkclick", "test_error_path_run_invalido_devuelve_false_sin_lanzar"] +test_file_path: "python/functions/datascience/pptx_link_run_to_slide_test.py" +file_path: "python/functions/datascience/pptx_link_run_to_slide.py" +--- + +## Ejemplo + +```python +from pptx import Presentation +from pptx.util import Inches +from pptx.oxml.ns import qn + +from datascience.pptx_link_run_to_slide import pptx_link_run_to_slide + +prs = Presentation() +blank = prs.slide_layouts[6] # layout en blanco +slide0 = prs.slides.add_slide(blank) +slide1 = prs.slides.add_slide(blank) # destino del salto (p.ej.
el glosario) + +box = slide0.shapes.add_textbox(Inches(1), Inches(1), Inches(4), Inches(1)) +run = box.text_frame.paragraphs[0].add_run() +run.text = "ir al glosario" + +ok = pptx_link_run_to_slide(run, slide0, slide1) +print(ok) # -> True + +# El run quedó con un <a:hlinkClick> en su <a:rPr> +hlink = run._r.get_or_add_rPr().find(qn("a:hlinkClick")) +print(hlink.get("action")) # -> ppaction://hlinksldjump +prs.save("deck_con_salto.pptx") +``` + +## Cuando usarla + +Cuando construyas un deck PPTX con **navegación interna** y quieras que un texto salte a +otra diapositiva al pulsarlo: un **glosario clicable** (cada término enlaza a su slide de +definición), un **índice/tabla de contenidos navegable**, botones "volver a la portada", o +referencias cruzadas entre capítulos. Es la pieza que `python-pptx` no cubre de fábrica — +úsala sobre los runs ya creados por renderers como `render_automatic_eda_pptx` del grupo +`eda` para enriquecer el deck con saltos sin reescribir el XML a mano cada vez. + +## Gotchas + +- **Impura**: muta el XML del run y crea una relación nueva en el part de `source_slide`. +- **Solo navega en visores que respetan `ppaction://hlinksldjump`**: PowerPoint y la + mayoría de visores compatibles lo siguen; algunos visores web/ligeros lo ignoran (el + texto se ve igual pero no salta). +- **Mismo Presentation**: `source_slide` y `target_slide` deben pertenecer al mismo deck. + Si son de presentaciones distintas, la relación interna no es válida y el salto no + funcionará (la función puede devolver True por crear la relación, pero el resultado en + el visor no será el esperado). +- **El `<a:hlinkClick>` vive en el `<a:rPr>` del run**, no como hijo directo del `<a:r>`. + Para localizarlo: `run._r.get_or_add_rPr().find(qn("a:hlinkClick"))` (un `find` sobre + `run._r` devuelve `None` porque solo mira hijos directos del `<a:r>`). +- **Idempotente**: si el run ya tenía un `hlinkClick` (p.ej. una URL externa o un salto + previo), se elimina antes de insertar el nuevo — un run tiene como mucho un click-link.
+- **Nunca lanza**: cualquier excepción (run sin `._r`, slides incompatibles, etc.) se + traga y devuelve `False`. Comprobar el booleano si el salto es crítico. +- **Dependencia python-pptx**: declarada en `python/pyproject.toml`. Tests con + `~/fn_registry/python/.venv/bin/python3` (tiene `python-pptx` instalado). diff --git a/python/functions/datascience/pptx_link_run_to_slide.py b/python/functions/datascience/pptx_link_run_to_slide.py new file mode 100644 index 00000000..bc3cfd1c --- /dev/null +++ b/python/functions/datascience/pptx_link_run_to_slide.py @@ -0,0 +1,50 @@ +"""Convierte un run de texto de python-pptx en un hyperlink interno "ir a la diapositiva". + +python-pptx expone ``run.hyperlink.address`` para URLs externas, pero NO ofrece una +API pública para saltar a otra diapositiva del mismo deck. Esta función crea ese salto +interno manipulando el XML: añade una relación ``slide -> slide`` y un +```` con la acción ``ppaction://hlinksldjump`` en el run, de modo que al +pulsar el texto en PowerPoint (o en visores que respetan esa acción) se navega a la +diapositiva de destino. +""" + +from pptx.opc.constants import RELATIONSHIP_TYPE as RT +from pptx.oxml.ns import qn + + +def pptx_link_run_to_slide(run, source_slide, target_slide) -> bool: + """Convierte un run de texto en un hyperlink interno "ir a la diapositiva". + + Añade una relación ``slide -> slide`` desde la slide origen al part de la slide + destino y crea un ```` con ``action="ppaction://hlinksldjump"`` como + primer hijo del ```` del run (orden válido del schema + ``CT_TextCharacterProperties``). La operación es idempotente: un ``hlinkClick`` + previo en el mismo run se elimina antes de insertar el nuevo. + + Args: + run: el ``pptx.text.text._Run`` cuyo texto se vuelve clicable. + source_slide: la ``Slide`` que contiene el run. + target_slide: la ``Slide`` de destino del salto. + + Returns: + True si se aplicó el hyperlink; False si algo impidió aplicarlo (no lanza). 
+ """ + try: + rId = source_slide.part.relate_to(target_slide.part, RT.SLIDE) + rPr = run._r.get_or_add_rPr() + # Elimina un hlinkClick previo si lo hubiera (idempotente). + for existing in rPr.findall(qn("a:hlinkClick")): + rPr.remove(existing) + hlink = rPr.makeelement( + qn("a:hlinkClick"), + { + qn("r:id"): rId, + "action": "ppaction://hlinksldjump", + }, + ) + # a:hlinkClick debe ir como primer hijo de rPr + # (orden del schema CT_TextCharacterProperties). + rPr.insert(0, hlink) + return True + except Exception: + return False diff --git a/python/functions/datascience/pptx_link_run_to_slide_test.py b/python/functions/datascience/pptx_link_run_to_slide_test.py new file mode 100644 index 00000000..ccfc65dd --- /dev/null +++ b/python/functions/datascience/pptx_link_run_to_slide_test.py @@ -0,0 +1,73 @@ +"""Tests for pptx_link_run_to_slide — salto interno run -> diapositiva. + +Self-contained: construye una Presentation en memoria con dos slides en blanco, +un textbox con un run en la slide 0, y verifica que la función inyecta un +```` con ``action="ppaction://hlinksldjump"`` y un ``r:id`` que +resuelve al part de la slide 1. 
+""" + +import pytest + +pytest.importorskip("pptx") + +from pptx import Presentation # noqa: E402 +from pptx.oxml.ns import qn # noqa: E402 +from pptx.util import Inches # noqa: E402 + +from datascience.pptx_link_run_to_slide import pptx_link_run_to_slide # noqa: E402 + + +def _two_slide_deck_with_run(): + prs = Presentation() + blank = prs.slide_layouts[6] # layout en blanco + slide0 = prs.slides.add_slide(blank) + slide1 = prs.slides.add_slide(blank) + + box = slide0.shapes.add_textbox(Inches(1), Inches(1), Inches(4), Inches(1)) + tf = box.text_frame + para = tf.paragraphs[0] + run = para.add_run() + run.text = "ir al glosario" + return prs, slide0, slide1, run + + +def test_golden_run_se_vuelve_salto_a_otra_slide(): + prs, slide0, slide1, run = _two_slide_deck_with_run() + + ok = pptx_link_run_to_slide(run, slide0, slide1) + assert ok is True + + # El hlinkClick es hijo del rPr del run (orden del schema + # CT_TextCharacterProperties), no hijo directo del . + rPr = run._r.get_or_add_rPr() + hlink = rPr.find(qn("a:hlinkClick")) + assert hlink is not None + assert hlink.get("action") == "ppaction://hlinksldjump" + + rId = hlink.get(qn("r:id")) + assert rId, "el hlinkClick debe llevar un r:id no vacío" + + # El rId debe existir en las relaciones de la slide origen y apuntar + # al part de la slide destino. + rels = slide0.part.rels + assert rId in rels + assert rels[rId].target_part is slide1.part + + +def test_idempotente_reaplica_sin_duplicar_hlinkclick(): + prs, slide0, slide1, run = _two_slide_deck_with_run() + + assert pptx_link_run_to_slide(run, slide0, slide1) is True + assert pptx_link_run_to_slide(run, slide0, slide1) is True + + rPr = run._r.get_or_add_rPr() + hlinks = rPr.findall(qn("a:hlinkClick")) + assert len(hlinks) == 1 + + +def test_error_path_run_invalido_devuelve_false_sin_lanzar(): + prs, slide0, slide1, _run = _two_slide_deck_with_run() + + # Un objeto sin ._r ni soporte de relación -> la función no lanza, devuelve False. 
+ ok = pptx_link_run_to_slide(object(), slide0, slide1) + assert ok is False diff --git a/python/pyproject.toml b/python/pyproject.toml index 9553fbe8..052f7280 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "polars>=1.40.1", "pymeshlab>=2025.7.post1", "pymssql>=2.3.13", + "pymupdf>=1.28.0", "pypdf>=6.10.0", "pyproj>=3.7.2", "python-docx>=1.2.0",