feat(eda): motor AutomaticEDA fase 4a — render fixes + keep-together + glosario clicable
Mejoras transversales del motor de render (no del contenido de capítulos): 1. Fix negrita pisa texto (PDF): _place_rich_lines mide el ancho REAL de cada span con las métricas de fuente del renderer (peso correcto) en vez del grid de ancho medio; negrita y normal en la misma línea ya no se solapan. 2. Zebra striping: filas pares sombreadas (#f6f8fa) en DataTable (PDF + PPTX), coherente al partir tablas largas (índice de fila lógico, no por página). 3. Keep-together: bloque Group nuevo; el renderer mide el grupo entero y lo mueve completo a la página/slide siguiente si no cabe, y encoge la figura (height_in) para dejar sitio a su título y texto. num_distr lo usa. 4. Caption siempre visible en toda figura PPTX (fallback al heading); la figura reserva el alto de su caption para que ambos quepan en el mismo slide. 5. Portada construida al final (con resumen agregado del análisis vía ctx['document_summary']) pero colocada primera por build_document. 6. Glosario: capítulo nuevo (último) + GlossaryCollector en ctx; los capítulos registran términos y marcan apariciones con [[term:key]]...[[/term]]. Links clicables reales: PDF (PyMuPDF, link GOTO) y PPTX (slide-jump nativo). Enganchado "entropía" en cat_distr como ejemplo end-to-end. Funciones reutilizables delegadas a fn-constructor (tag eda): - add_pdf_internal_links_py_datascience (PyMuPDF) - pptx_link_run_to_slide_py_datascience (slide-jump) Contrato docs/automatic_eda_contract.md actualizado (§1/§3/§5 + §11 nueva) con la API de glosario, keep-together y zebra para la siguiente fase. PyMuPDF declarado en pyproject. Suite verde (90 tests); golden titanic verificado. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,10 +33,23 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "cat_distr"
|
||||
CHAPTER_TITLE = "Distribuciones categóricas"
|
||||
|
||||
# Glossary term this chapter explains. Registered in the shared collector and
|
||||
# marked clickable on its first appearance (end-to-end glossary example —
|
||||
# mejora 6). Other chapters hook their own terms the same way (see the contract).
|
||||
_TERM_ENTROPIA_KEY = "entropia"
|
||||
_TERM_ENTROPIA_LABEL = "Entropía (de Shannon)"
|
||||
_TERM_ENTROPIA_DEF = (
|
||||
"Medida, en bits, de cómo de repartidos están los valores de una columna "
|
||||
"categórica. Vale 0 cuando una sola categoría concentra todas las filas "
|
||||
"(máxima previsibilidad) y alcanza su máximo, log2(k) para k categorías "
|
||||
"distintas, cuando todas aparecen por igual (máxima diversidad). La entropía "
|
||||
"normalizada (entropía dividida por su máximo) la lleva al rango 0–1 para "
|
||||
"comparar columnas con distinto número de categorías.")
|
||||
|
||||
# Cap the number of categorical columns rendered to keep the document bounded;
|
||||
# the rest are summarized in a closing note (no silent truncation).
|
||||
MAX_COLS = 40
|
||||
@@ -337,10 +350,14 @@ def _topk_table(cat: dict):
|
||||
note=note)
|
||||
|
||||
|
||||
def _intro_blocks(n_rows):
|
||||
def _intro_blocks(n_rows, mark_term: bool = False):
|
||||
total = _fmt_int(n_rows)
|
||||
# Mark the first appearance of the term as a clickable glossary jump when the
|
||||
# term was registered (mark_term). The visible text is identical either way.
|
||||
entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term
|
||||
else "**entropía de Shannon**")
|
||||
text = (
|
||||
"La **entropía de Shannon** mide cómo de repartidos están los valores de "
|
||||
f"La {entropia} mide cómo de repartidos están los valores de "
|
||||
"una columna categórica, en bits. Vale 0 cuando una sola categoría "
|
||||
"concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
|
||||
"log2(k) para k categorías distintas, cuando todas aparecen por igual "
|
||||
@@ -370,7 +387,15 @@ def build_cat_distr(profile: dict, ctx: dict):
|
||||
return None
|
||||
|
||||
n_rows = profile.get("n_rows")
|
||||
blocks = list(_intro_blocks(n_rows))
|
||||
# Register "entropía" in the shared glossary collector (if present) and mark
|
||||
# its first appearance clickable. End-to-end glossary example (mejora 6).
|
||||
glossary = ctx.get("glossary")
|
||||
mark_term = False
|
||||
if isinstance(glossary, model.GlossaryCollector):
|
||||
glossary.add(_TERM_ENTROPIA_KEY, _TERM_ENTROPIA_LABEL,
|
||||
_TERM_ENTROPIA_DEF)
|
||||
mark_term = True
|
||||
blocks = list(_intro_blocks(n_rows, mark_term=mark_term))
|
||||
|
||||
rendered = cat_cols[:MAX_COLS]
|
||||
for col in rendered:
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
"""Glossary chapter (GLOSARIO) — always the last chapter, clickable terms.
|
||||
|
||||
Renders one entry per glossary term that the other chapters registered during
|
||||
the document build through ``ctx['glossary'].add(key, label, definition)`` (see
|
||||
``GlossaryCollector`` in ``model.py``). Each entry is a clickable destination:
|
||||
every in-text appearance a chapter marked with ``[[term:key]]texto[[/term]]``
|
||||
becomes a real jump to its entry here — PDF link annotations (PyMuPDF) and PPTX
|
||||
native slide jumps, both wired by the renderers.
|
||||
|
||||
Returns ``None`` when no term was registered (there is nothing to show), so the
|
||||
chapter simply disappears from documents that did not mark any term.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "glosario"
|
||||
CHAPTER_TITLE = "Glosario"
|
||||
|
||||
|
||||
def build_glosario(profile: dict, ctx: dict):
|
||||
"""Build the glossary Chapter from the shared collector, or None if empty."""
|
||||
ctx = ctx or {}
|
||||
glossary = ctx.get("glossary")
|
||||
if not isinstance(glossary, model.GlossaryCollector) or not glossary:
|
||||
return None
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Glosario de términos", level=1),
|
||||
model.Markdown(text=(
|
||||
"Definición de los términos técnicos que aparecen en el informe. "
|
||||
"Cada término va resaltado en el texto y, al pulsarlo, salta a su "
|
||||
"definición en esta sección.")),
|
||||
]
|
||||
# One clickable destination per term, alphabetically by visible label.
|
||||
for term in glossary.terms(by="label"):
|
||||
blocks.append(model.GlossaryEntry(
|
||||
key=model._safe_str(term.get("key")),
|
||||
label=model._safe_str(term.get("label")),
|
||||
definition=model._safe_str(term.get("definition"))))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -34,7 +34,7 @@ try:
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
build_boxplot_stats = None # type: ignore[assignment]
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "num_distr"
|
||||
CHAPTER_TITLE = "Distribuciones numéricas"
|
||||
|
||||
@@ -278,12 +278,17 @@ def build_num_distr(profile: dict, ctx: dict):
|
||||
box = build_boxplot_stats(numeric) or {}
|
||||
except Exception: # noqa: BLE001 — degrade, never raise.
|
||||
box = {}
|
||||
blocks.append(model.Heading(text=str(name), level=2))
|
||||
blocks.append(model.Figure(
|
||||
make=_figure_maker(name, numeric, box),
|
||||
caption=f"Distribución de «{name}» — histograma (media/mediana/±σ) "
|
||||
f"y boxplot."))
|
||||
blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
|
||||
# Keep the column heading, its figure and its stats note together on the
|
||||
# same page/slide (mejora 3 — keep-together): the renderers measure the
|
||||
# whole Group and move it whole when it would not fit.
|
||||
blocks.append(model.Group(blocks=[
|
||||
model.Heading(text=str(name), level=2),
|
||||
model.Figure(
|
||||
make=_figure_maker(name, numeric, box),
|
||||
caption=f"Distribución de «{name}» — histograma "
|
||||
f"(media/mediana/±σ) y boxplot."),
|
||||
model.Markdown(text=_stats_note(name, numeric, box)),
|
||||
]))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -65,19 +65,33 @@ def _pdf_text(path: str) -> str:
|
||||
return re.sub(r"\s+", " ", txt)
|
||||
|
||||
|
||||
def _flatten(blocks):
|
||||
"""Expand keep-together Groups so the per-column heading/figure/markdown are
|
||||
inspectable as a flat block list (the chapter wraps each column in a Group)."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if getattr(b, "kind", "") == "group":
|
||||
out.extend(_flatten(getattr(b, "blocks", []) or []))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
def test_golden_chapter_estructura_y_bloques():
|
||||
ch = build_num_distr(_profile(n_numeric=2), {})
|
||||
assert ch is not None
|
||||
assert ch.id == "num_distr"
|
||||
assert ch.version == CHAPTER_VERSION
|
||||
kinds = [b.kind for b in ch.blocks]
|
||||
# Per-column blocks are wrapped in keep-together Groups: flatten to inspect.
|
||||
flat = _flatten(ch.blocks)
|
||||
kinds = [b.kind for b in flat]
|
||||
# Heading + intro Markdown, then per column: Heading + Figure + Markdown.
|
||||
assert kinds[0] == "heading"
|
||||
assert kinds[1] == "markdown"
|
||||
assert kinds.count("figure") == 2 # one figure per numeric column.
|
||||
assert kinds.count("heading") == 1 + 2 # chapter title + one per column.
|
||||
# Each figure has a lazy maker that produces a real matplotlib figure.
|
||||
figs = [b for b in ch.blocks if b.kind == "figure"]
|
||||
figs = [b for b in flat if b.kind == "figure"]
|
||||
fig = figs[0].make()
|
||||
assert fig is not None
|
||||
# Two stacked axes: histogram + boxplot share the figure.
|
||||
@@ -90,7 +104,8 @@ def test_golden_media_mediana_sigma_y_boxplot_presentes():
|
||||
# The intro documents the three reference lines and the Tukey boxplot; the
|
||||
# per-column note carries the actual mean/median/σ numbers and the shape.
|
||||
ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
|
||||
md_texts = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
|
||||
md_texts = " ".join(b.text for b in _flatten(ch.blocks)
|
||||
if b.kind == "markdown")
|
||||
assert "media" in md_texts and "mediana" in md_texts
|
||||
assert "±1σ" in md_texts or "σ" in md_texts
|
||||
assert "boxplot" in md_texts.lower()
|
||||
@@ -126,7 +141,8 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx():
|
||||
# 8 numeric columns + long note text: nothing may be cut. Every column
|
||||
# heading must survive in both the PDF text and the PPTX deck.
|
||||
ch = build_num_distr(_profile(n_numeric=8), {})
|
||||
names = [b.text for b in ch.blocks if b.kind == "heading" and b.level == 2]
|
||||
names = [b.text for b in _flatten(ch.blocks)
|
||||
if b.kind == "heading" and b.level == 2]
|
||||
assert len(names) == 8
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "num.pdf")
|
||||
|
||||
@@ -17,7 +17,7 @@ from datetime import datetime, timezone
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "portada"
|
||||
CHAPTER_TITLE = "Portada"
|
||||
|
||||
@@ -67,6 +67,53 @@ def _fmt_int(v) -> str:
|
||||
return str(v)
|
||||
|
||||
|
||||
def _fmt_pct(value) -> str:
|
||||
"""Format a percentage that may arrive as a 0–1 fraction or a 0–100 number."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
v = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if 0 < v <= 1.0:
|
||||
v *= 100.0
|
||||
return f"{v:.1f}%"
|
||||
|
||||
|
||||
def _summary_blocks(summary) -> list:
|
||||
"""Mini-summary of the rest of the analysis, shown on the cover (mejora 5).
|
||||
|
||||
The cover is built AFTER the body (``build_document`` passes the aggregated
|
||||
``ctx['document_summary']``), so it can reflect what the analysis found:
|
||||
shape, column types, quality flags and which chapters were included. Returns
|
||||
an empty list when there is no summary (the cover degrades to its metadata
|
||||
table only)."""
|
||||
if not isinstance(summary, dict) or not summary:
|
||||
return []
|
||||
rows = []
|
||||
n_num = summary.get("n_numeric")
|
||||
n_cat = summary.get("n_categorical")
|
||||
if n_num is not None or n_cat is not None:
|
||||
rows.append(("Columnas numéricas / categóricas",
|
||||
f"{_fmt_int(n_num)} / {_fmt_int(n_cat)}"))
|
||||
if summary.get("duplicate_pct") is not None:
|
||||
rows.append(("Filas duplicadas", _fmt_pct(summary.get("duplicate_pct"))))
|
||||
if summary.get("null_cell_pct") is not None:
|
||||
rows.append(("Celdas nulas", _fmt_pct(summary.get("null_cell_pct"))))
|
||||
titles = summary.get("chapter_titles") or []
|
||||
if titles:
|
||||
rows.append(("Capítulos del informe", _fmt_int(len(titles))))
|
||||
|
||||
blocks = [model.Heading(text="Resumen del análisis", level=2)]
|
||||
if rows:
|
||||
blocks.append(model.KVTable(rows=rows))
|
||||
if titles:
|
||||
bullets = "\n".join(f"- {model._safe_str(t)}" for t in titles)
|
||||
blocks.append(model.Markdown(
|
||||
text="Este informe incluye los siguientes capítulos:\n" + bullets))
|
||||
return blocks
|
||||
|
||||
|
||||
def _fmt_date_eu(value) -> str:
|
||||
"""Format a date/ISO string as European DD/MM/AAAA HH:mm (UI convention).
|
||||
|
||||
@@ -152,5 +199,8 @@ def build_portada(profile: dict, ctx: dict):
|
||||
model.Markdown(text=str(granularity)),
|
||||
]
|
||||
|
||||
# Mini-summary of the rest of the analysis (built last, shown on the cover).
|
||||
blocks.extend(_summary_blocks(ctx.get("document_summary")))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
Reference in New Issue
Block a user