feat(eda): salida Markdown del AutomaticEDA para pegar a un LLM

Añade un tercer formato de salida al AutomaticEDA, junto al PDF y el PPTX:
un Markdown autocontenido del MISMO documento por capítulos
(chapters_registry.build_document), optimizado para incorporar a un LLM
(texto plano + tablas markdown reales, sin binarios incrustados).

- render_md_impl.render_md(chapters, out_path, meta): serializa los bloques
  del modelo (Heading/Markdown/KVTable/DataTable/Figure/Image/Caption/Note/
  Group/GlossaryEntry) a Markdown. Cabecera con metadatos + índice navegable
  con anclas GitHub; tablas volcadas enteras (el MD no pagina); marcadores de
  glosario eliminados conservando la negrita; glosario al final.
- Figuras: un LLM no ve la imagen, así que se prioriza texto + datos. Se emite
  el caption y, cuando la figura tiene barras (histograma), se extrae la tabla
  de bins (Desde/Hasta/Frecuencia) de los artistas matplotlib. La banda ±1σ
  (axvspan) se descarta por ancho para que no aparezca como un falso bin.
  PNG opcional vía meta['embed_figures'] (off por defecto → sin binarios).
- render_automatic_eda_markdown: función pública del registry (tag eda),
  espejo de render_automatic_eda_pdf/pptx, acepta lista de capítulos o un
  TableProfile (build_document). dict-no-throw.
- render_automatic_eda (pipeline): emite también el .md (emit_md=True por
  defecto, clave de retorno aeda_md_path). Cambio aditivo: PDF/PPTX/manifest
  siguen saliendo igual.

Tests: golden de todos los kinds + regresión del filtro de la banda ±1σ +
edge documento vacío + profile path. Suite del paquete y del pipeline verde
(122 passed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-30 18:52:08 +02:00
parent ab21e5d90b
commit 48de3ce3da
7 changed files with 805 additions and 8 deletions
@@ -36,6 +36,7 @@ from .model import ( # noqa: F401
from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401
from .render_pdf_impl import render_pdf # noqa: F401
from .render_pptx_impl import render_pptx # noqa: F401
from .render_md_impl import render_md # noqa: F401
__all__ = [
"ENGINE_NAME",
@@ -60,4 +61,5 @@ __all__ = [
"build_document",
"render_pdf",
"render_pptx",
"render_md",
]
@@ -0,0 +1,458 @@
"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM.
Same document model as the PDF/PPTX renderers (an ordered list of
:class:`Chapter`, each a list of format-independent blocks) but emitted as plain
**Markdown** instead of a binary. The goal is different from the other two
renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises
TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no
pagination — nothing is cut because there are no pages); a ``Figure`` becomes its
caption plus, when possible, the underlying bar/histogram data as a Markdown
table (an LLM cannot see the image); glossary term markers are stripped while
``**bold**`` is kept (it is valid Markdown).
dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a
fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a
malformed block degrades to a readable note rather than crashing the document.
"""
from __future__ import annotations
import os
import re
from . import model
# Glossary span markers (kept text, dropped markers). We intentionally do NOT use
# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes
# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM.
_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
_MAX_BAR_ROWS = 100
# --------------------------------------------------------------------------- #
# Small helpers.
# --------------------------------------------------------------------------- #
def _clean_terms(s) -> str:
"""Drop glossary term markers, keeping the visible text (and any **bold**)."""
s = model._safe_str(s)
s = _TERM_OPEN_RE.sub("", s)
return s.replace("[[/term]]", "")
def _cell(v) -> str:
"""Render a value as a safe Markdown table cell.
Escapes pipes (``|`` -> ``\\|``) so they do not break the column layout and
folds newlines to ``<br>`` so a multi-line value stays inside one cell. None
becomes an empty string.
"""
s = model._safe_str(v)
s = s.replace("|", "\\|")
s = s.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")
return s
def _slug(text: str) -> str:
"""GitHub-style heading anchor: lowercase, spaces->'-', drop other symbols."""
s = model._safe_str(text).strip().lower()
out = []
for ch in s:
if ch.isalnum():
out.append(ch)
elif ch in " -":
out.append("-")
# any other symbol is dropped.
slug = "".join(out)
while "--" in slug:
slug = slug.replace("--", "-")
return slug.strip("-")
def _fmt_num(v) -> str:
"""Compact number for the figure data tables (ints as ints, else 4 sig figs)."""
try:
f = float(v)
except Exception: # noqa: BLE001
return model._safe_str(v)
if f != f: # NaN
return "NaN"
if f == int(f) and abs(f) < 1e15:
return str(int(f))
return f"{f:.4g}"
def _fmt_int(v) -> str:
try:
return str(int(v))
except Exception: # noqa: BLE001
return model._safe_str(v)
def _now_iso() -> str:
from datetime import datetime, timezone
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
# --------------------------------------------------------------------------- #
# Document header (title + metadata blockquote + numbered index).
# --------------------------------------------------------------------------- #
def _meta_block(meta: dict) -> list:
"""Build the metadata lines for the header blockquote (omitting absentees)."""
ctx = meta.get("ctx") if isinstance(meta.get("ctx"), dict) else {}
lines: list = []
def add(label, value) -> None:
if value is None:
return
s = model._safe_str(value).strip()
if s and s.lower() != "none":
lines.append(f"**{label}:** {s}")
add("Dataset", ctx.get("dataset_name") or meta.get("dataset_name"))
add("Fuente", ctx.get("source_origin") or meta.get("source_origin"))
add("Almacenamiento", ctx.get("storage") or meta.get("storage"))
n_rows = ctx.get("n_rows", meta.get("n_rows"))
n_cols = ctx.get("n_cols", meta.get("n_cols"))
if n_rows is not None and n_cols is not None:
lines.append(
f"**Dimensiones:** {_fmt_int(n_rows)} filas × {_fmt_int(n_cols)} columnas")
add("Generado", meta.get("generated_at") or _now_iso())
lines.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}")
return lines
# --------------------------------------------------------------------------- #
# Per-block serializers. Each returns a Markdown string (no surrounding blanks;
# the caller separates blocks with a blank line).
# --------------------------------------------------------------------------- #
def _md_heading(block) -> str:
level = int(getattr(block, "level", 1) or 1)
hashes = "#" * min(level + 2, 6) # level1 -> ###; '#'/'##' reserved for doc/chapter.
text = _clean_terms(getattr(block, "text", "")).strip()
return f"{hashes} {text}"
def _md_markdown(block) -> str:
# Keep the text verbatim, dropping only glossary markers (keep **bold**).
return _clean_terms(getattr(block, "text", "")).rstrip("\n")
def _md_kv_table(block) -> str:
lines: list = []
title = getattr(block, "title", None)
if title:
lines.append(f"**{_clean_terms(title).strip()}**")
lines.append("")
lines.append("| Campo | Valor |")
lines.append("| --- | --- |")
for row in (getattr(block, "rows", []) or []):
try:
label, value = row[0], row[1]
except Exception: # noqa: BLE001
label, value = row, ""
lines.append(f"| {_cell(label)} | {_cell(value)} |")
return "\n".join(lines)
def _md_data_table(block) -> str:
lines: list = []
title = getattr(block, "title", None)
if title:
lines.append(f"**{_clean_terms(title).strip()}**")
lines.append("")
header = list(getattr(block, "header", []) or [])
rows = list(getattr(block, "rows", []) or [])
if not header:
ncol = max((len(r) for r in rows), default=1)
header = [f"col{i + 1}" for i in range(ncol)]
ncol = len(header)
lines.append("| " + " | ".join(_cell(h) for h in header) + " |")
lines.append("| " + " | ".join(["---"] * ncol) + " |")
for r in rows: # dump every row — no pagination, nothing cut.
cells = [_cell(r[c]) if c < len(r) else "" for c in range(ncol)]
lines.append("| " + " | ".join(cells) + " |")
note = getattr(block, "note", None)
if note:
lines.append("")
lines.append(f"*{_clean_terms(note).strip()}*")
return "\n".join(lines)
def _bars_table(bars: list) -> str:
"""Render extracted bar/histogram data as a Markdown table (Desde/Hasta/Frec)."""
lines = ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |"]
shown = bars[:_MAX_BAR_ROWS]
for x0, x1, h in shown:
lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |")
out = "\n".join(lines)
extra = len(bars) - len(shown)
if extra > 0:
out += f"\n\n*… ({extra} filas más)*"
return out
def _extract_bars(fig) -> list:
"""Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.
Histogram / bar-chart bars are ``matplotlib.patches.Rectangle`` with positive
width and height; spines, legends and zero-area artists are skipped. Never
raises — returns ``[]`` on any problem.
"""
bars: list = []
try:
for ax in fig.get_axes():
# Collect this axes' positive-area rectangles, then keep only the ones
# that look like actual histogram/bar bins. Reference shapes that
# matplotlib also stores in ``ax.patches`` — most notably the ``±1σ``
# band drawn by ``axvspan`` (a single rectangle far wider than a bin)
# and a lone Tukey boxplot box — would otherwise show up as fake
# "bins". A histogram axes has several near-equal-width bars, so we
# drop any rectangle whose width is more than twice the median width
# of that axes' rectangles (the σ-band spans many bins; uniform bins
# all sit at the median width and stay).
ax_bars: list = []
for patch in list(getattr(ax, "patches", []) or []):
try:
w = patch.get_width()
h = patch.get_height()
x = patch.get_x()
except Exception: # noqa: BLE001 — not a Rectangle-like patch.
continue
if w and w > 0 and h and h > 0:
ax_bars.append((x, x + w, h))
if len(ax_bars) >= 3:
widths = sorted(b[1] - b[0] for b in ax_bars)
median_w = widths[len(widths) // 2]
if median_w > 0:
ax_bars = [b for b in ax_bars
if (b[1] - b[0]) <= 2.0 * median_w]
bars.extend(ax_bars)
except Exception: # noqa: BLE001
return []
return bars
def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
"""Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image).
Emits the caption, then — if the matplotlib figure has bars — a Markdown table
of the underlying (Desde, Hasta, Frecuencia) values. Optionally (when
``meta['embed_figures']`` is True) also exports a PNG beside the .md and adds
an image link; off by default so the Markdown stays self-contained.
"""
caption = model._safe_str(getattr(block, "caption", "")).strip()
parts = [f"*Figura: {caption}*" if caption else "*Figura*"]
fig = None
try:
import matplotlib
matplotlib.use("Agg") # defensive: headless rasterization backend.
fig = getattr(block, "fig", None)
make = getattr(block, "make", None)
if fig is None and callable(make):
fig = make()
if fig is not None:
bars = _extract_bars(fig)
if bars:
parts.append(_bars_table(bars))
if meta.get("embed_figures"):
png = _embed_png(fig, out_path, counter)
if png:
parts.append(f"![{caption}]({png})")
except Exception: # noqa: BLE001 — a bad figure degrades to just its caption.
pass
finally:
if fig is not None:
try:
import matplotlib.pyplot as plt
plt.close(fig)
except Exception: # noqa: BLE001
pass
return "\n\n".join(parts)
def _embed_png(fig, out_path: str, counter: list) -> str:
"""Export the figure to ``<basename>_figN.png`` beside the .md; return its name."""
try:
counter[0] += 1
base = os.path.splitext(os.path.basename(out_path))[0] or "figura"
name = f"{base}_fig{counter[0]}.png"
path = os.path.join(os.path.dirname(os.path.abspath(out_path)), name)
fig.savefig(path, format="png", dpi=120, bbox_inches="tight")
return name
except Exception: # noqa: BLE001
return ""
def _md_image(block) -> str:
path = model._safe_str(getattr(block, "path", ""))
caption = model._safe_str(getattr(block, "caption", "")).strip()
out = f"![{caption}]({path})"
if caption:
out += f"\n\n*{caption}*"
return out
def _md_caption(block) -> str:
return f"*{_clean_terms(getattr(block, 'text', '')).strip()}*"
def _md_note(block) -> str:
text = _clean_terms(getattr(block, "text", "")).strip()
lines = text.split("\n")
return "\n".join((f"> {ln}" if ln.strip() else ">") for ln in lines)
def _md_group(block, meta: dict, out_path: str, counter: list) -> str:
parts: list = []
title = getattr(block, "title", None)
if title:
parts.append(f"### {_clean_terms(title).strip()}")
for b in (getattr(block, "blocks", []) or []):
try:
seg = _serialize_block(b, meta, out_path, counter)
except Exception: # noqa: BLE001
seg = ""
if seg:
parts.append(seg)
return "\n\n".join(parts)
def _md_glossary_entry(block) -> str:
label = (model._safe_str(getattr(block, "label", "")).strip()
or model._safe_str(getattr(block, "key", "")).strip())
definition = _clean_terms(getattr(block, "definition", "")).strip()
out = f"### {label}"
if definition:
out += f"\n\n{definition}"
return out
def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
"""Dispatch a single block to its Markdown serializer. Unknown -> note."""
kind = getattr(block, "kind", "")
if kind == "heading":
return _md_heading(block)
if kind == "markdown":
return _md_markdown(block)
if kind == "kv_table":
return _md_kv_table(block)
if kind == "data_table":
return _md_data_table(block)
if kind == "figure":
return _md_figure(block, meta, out_path, counter)
if kind == "image":
return _md_image(block)
if kind == "caption":
return _md_caption(block)
if kind == "note":
return _md_note(block)
if kind == "group":
return _md_group(block, meta, out_path, counter)
if kind == "glossary_entry":
return _md_glossary_entry(block)
# Unknown content -> readable note (mirrors the model's defensive coercion).
return _md_note(model.Note(text=model._safe_str(block)))
# --------------------------------------------------------------------------- #
# Entry point.
# --------------------------------------------------------------------------- #
def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
"""Serialize a list of Chapters into a single self-contained Markdown file.
The output leads with ``# <title>``, a metadata blockquote and a numbered
``## Índice`` linking each chapter, then one ``## N. <title>`` section per
chapter with its blocks. Tables become Markdown tables (every row dumped),
figures become caption + underlying data table, glossary markers are stripped
while ``**bold**`` is kept. Designed to be pasted into an LLM.
Args:
chapters: a list of ``Chapter`` (dataclasses or dicts); normalized
defensively with ``model.as_chapters``.
out_path: filesystem path for the ``.md`` (parent dirs are created).
meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
``generated_at``, ``embed_figures`` (export PNGs beside the .md,
default False).
Returns:
dict (never raises): ``{path: str|None, n_chars: int,
chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
None and ``note`` explains why.
"""
meta = meta or {}
chapters = model.as_chapters(chapters)
title = model._safe_str(meta.get("title")) or model.ENGINE_NAME
# Edge: nothing to render -> a minimal but valid Markdown document.
if not chapters:
content = (f"# {title}\n\n"
"*(documento vacío — sin capítulos aplicables)*\n")
return _write(out_path, content, [], "documento vacío")
counter = [0] # document-wide figure counter for unique PNG names.
notes: list = []
segments: list = [f"# {title}"]
meta_lines = _meta_block(meta)
if meta_lines:
segments.append("\n".join(f"> {ln}" for ln in meta_lines))
# Numbered index. The anchor matches the chapter heading emitted below
# (``## N. <title>``) in GitHub slug style.
chap_heads = []
idx_lines = ["## Índice"]
for i, ch in enumerate(chapters, 1):
head_text = f"{i}. {model._safe_str(ch.title)}"
anchor = _slug(head_text)
chap_heads.append((head_text, anchor))
idx_lines.append(f"{i}. [{model._safe_str(ch.title)}](#{anchor})")
segments.append("\n".join(idx_lines))
chapters_meta = []
for i, ch in enumerate(chapters, 1):
segments.append("---")
head_text, _anchor = chap_heads[i - 1]
segments.append(f"## {head_text}")
blocks = list(ch.blocks or [])
# Omit a leading level-1 Heading that just repeats the chapter title.
if blocks:
b0 = blocks[0]
if (getattr(b0, "kind", "") == "heading"
and int(getattr(b0, "level", 1) or 1) == 1
and _clean_terms(getattr(b0, "text", "")).strip()
== model._safe_str(ch.title).strip()):
blocks = blocks[1:]
for block in blocks:
try:
seg = _serialize_block(block, meta, out_path, counter)
except Exception as e: # noqa: BLE001
seg = _md_note(model.Note(text=model._safe_str(block)))
notes.append(
f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
f"'{ch.id}' degradado: {e}")
if seg:
segments.append(seg)
chapters_meta.append({"id": ch.id, "version": ch.version})
content = "\n\n".join(segments) + "\n"
note = f"{len(content)} caracteres"
if notes:
note += " · " + "; ".join(notes)
return _write(out_path, content, chapters_meta, note)
def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict:
"""Write the Markdown to disk (creating parents). dict-no-throw."""
try:
parent = os.path.dirname(os.path.abspath(out_path))
os.makedirs(parent, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as fh:
fh.write(content)
except Exception as e: # noqa: BLE001 — never raise from the writer.
return {"path": None, "n_chars": 0, "chapters": [],
"note": f"no se pudo escribir el Markdown: {e}"}
return {"path": out_path, "n_chars": len(content),
"chapters": chapters_meta, "note": note}