diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py
index f1505d22..cdefab14 100644
--- a/python/functions/datascience/__init__.py
+++ b/python/functions/datascience/__init__.py
@@ -64,6 +64,7 @@ from .exploratory_caveats import exploratory_caveats
from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational
from .render_automatic_eda_pdf import render_automatic_eda_pdf
from .render_automatic_eda_pptx import render_automatic_eda_pptx
+from .render_automatic_eda_markdown import render_automatic_eda_markdown
from .detect_time_column import detect_time_column
from .extract_timeseries_raw import extract_timeseries_raw
from .build_eda_render_ctx import build_eda_render_ctx
@@ -82,6 +83,7 @@ __all__ = [
"resample_timeseries",
"render_automatic_eda_pdf",
"render_automatic_eda_pptx",
+ "render_automatic_eda_markdown",
"decode_qr_image",
"adf_kpss_stationarity",
"acf_pacf",
diff --git a/python/functions/datascience/automatic_eda/__init__.py b/python/functions/datascience/automatic_eda/__init__.py
index f9a6f2e3..01085313 100644
--- a/python/functions/datascience/automatic_eda/__init__.py
+++ b/python/functions/datascience/automatic_eda/__init__.py
@@ -36,6 +36,7 @@ from .model import ( # noqa: F401
from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401
from .render_pdf_impl import render_pdf # noqa: F401
from .render_pptx_impl import render_pptx # noqa: F401
+from .render_md_impl import render_md # noqa: F401
__all__ = [
"ENGINE_NAME",
@@ -60,4 +61,5 @@ __all__ = [
"build_document",
"render_pdf",
"render_pptx",
+ "render_md",
]
diff --git a/python/functions/datascience/automatic_eda/render_md_impl.py b/python/functions/datascience/automatic_eda/render_md_impl.py
new file mode 100644
index 00000000..fba8ba6f
--- /dev/null
+++ b/python/functions/datascience/automatic_eda/render_md_impl.py
@@ -0,0 +1,458 @@
+"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM.
+
+Same document model as the PDF/PPTX renderers (an ordered list of
+:class:`Chapter`, each a list of format-independent blocks) but emitted as plain
+**Markdown** instead of a binary. The goal is different from the other two
+renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises
+TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no
+pagination — nothing is cut because there are no pages); a ``Figure`` becomes its
+caption plus, when possible, the underlying bar/histogram data as a Markdown
+table (an LLM cannot see the image); glossary term markers are stripped while
+``**bold**`` is kept (it is valid Markdown).
+
+dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a
+fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a
+malformed block degrades to a readable note rather than crashing the document.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+
+from . import model
+
+# Glossary span markers (kept text, dropped markers). We intentionally do NOT use
+# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes
+# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM.
+_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
+_MAX_BAR_ROWS = 100
+
+
+# --------------------------------------------------------------------------- #
+# Small helpers.
+# --------------------------------------------------------------------------- #
+def _clean_terms(s) -> str:
+    """Return ``s`` as text with the glossary span markers removed.
+
+    The opening ``[[term:key]]`` tags (matched by ``_TERM_OPEN_RE``) and the
+    closing ``[[/term]]`` tags are deleted; the text between them, and any
+    ``**bold**``, is left untouched.
+    """
+    without_open = _TERM_OPEN_RE.sub("", model._safe_str(s))
+    # Splitting on the closing tag and re-joining removes every occurrence.
+    return "".join(without_open.split("[[/term]]"))
+
+
+def _cell(v) -> str:
+    """Render a value as a safe, single-line Markdown table cell.
+
+    The value is stringified with ``model._safe_str``. Pipes are escaped
+    (``|`` -> ``\\|``) so they do not break the column layout, and every
+    newline flavour (CRLF, lone CR, LF) is folded to ``<br>`` so a multi-line
+    value stays inside one cell.
+    """
+    s = model._safe_str(v)
+    s = s.replace("|", "\\|")
+    # Normalise CRLF / lone CR to LF first so each line break yields one <br>.
+    s = s.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")
+    return s
+
+
+def _slug(text: str) -> str:
+    """Build the GitHub-style anchor for a heading text.
+
+    Mirrors the rule GitHub applies to rendered headings: trim the text,
+    lowercase it, turn every space into ``-``, keep letters, digits, ``-`` and
+    ``_``, and drop any other character. Runs of hyphens are NOT collapsed and
+    edge hyphens are NOT stripped, because GitHub does neither: the heading
+    ``A - B`` is anchored as ``a---b``, so a collapsed ``a-b`` link would not
+    resolve.
+
+    Args:
+        text: the heading text (stringified with ``model._safe_str``).
+
+    Returns:
+        The anchor without the leading ``#``.
+    """
+    s = model._safe_str(text).strip().lower()
+    out = []
+    for ch in s:
+        if ch.isalnum() or ch in "-_":
+            out.append(ch)
+        elif ch == " ":
+            out.append("-")
+        # any other symbol (punctuation, tabs, ...) is dropped.
+    return "".join(out)
+
+
+def _fmt_num(v) -> str:
+    """Format a number compactly for the figure data tables.
+
+    Integral values below 1e15 in magnitude print as plain integers; anything
+    else prints with 4 significant digits. NaN prints as ``NaN`` and the
+    infinities as ``inf`` / ``-inf``. A value that cannot be converted to
+    ``float`` is returned through ``model._safe_str`` unchanged.
+    """
+    import math
+
+    try:
+        f = float(v)
+    except Exception:  # noqa: BLE001
+        return model._safe_str(v)
+    if math.isnan(f):
+        return "NaN"
+    if math.isinf(f):
+        # int(inf) raises OverflowError; answer before the integrality test so
+        # one unbounded bar edge cannot abort the whole data table.
+        return "inf" if f > 0 else "-inf"
+    if f == int(f) and abs(f) < 1e15:
+        return str(int(f))
+    return f"{f:.4g}"
+
+
+def _fmt_int(v) -> str:
+    """Format ``v`` as an integer string; fall back to its safe text form.
+
+    Anything ``int()`` rejects is returned through ``model._safe_str``.
+    """
+    try:
+        as_int = int(v)
+    except Exception:  # noqa: BLE001
+        return model._safe_str(v)
+    return str(as_int)
+
+
+def _now_iso() -> str:
+    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``."""
+    from datetime import datetime, timezone
+
+    utc_now = datetime.now(tz=timezone.utc)
+    return utc_now.strftime("%Y-%m-%d %H:%M:%S UTC")
+
+
+# --------------------------------------------------------------------------- #
+# Document header (title + metadata blockquote + numbered index).
+# --------------------------------------------------------------------------- #
+def _meta_block(meta: dict) -> list:
+    """Build the metadata lines for the header blockquote (omitting absentees).
+
+    Values are read from ``meta['ctx']`` first (when it is a dict) and fall back
+    to the same key at the top level of ``meta``. The ``Dimensiones`` line is
+    emitted only when both row and column counts are known; ``Generado`` falls
+    back to the current UTC time and ``Motor`` is always present.
+
+    Args:
+        meta: the renderer's meta dict.
+
+    Returns:
+        A list of Markdown lines (without the ``> `` blockquote prefix).
+    """
+    ctx = meta.get("ctx") if isinstance(meta.get("ctx"), dict) else {}
+    lines: list = []
+
+    def add(label, value) -> None:
+        if value is None:
+            return
+        s = model._safe_str(value).strip()
+        if s and s.lower() != "none":
+            lines.append(f"**{label}:** {s}")
+
+    def count(key):
+        # ctx wins, but an explicit None there must not hide the top-level
+        # value. A None test (not `or`) keeps a legitimate count of 0.
+        value = ctx.get(key)
+        return meta.get(key) if value is None else value
+
+    add("Dataset", ctx.get("dataset_name") or meta.get("dataset_name"))
+    add("Fuente", ctx.get("source_origin") or meta.get("source_origin"))
+    add("Almacenamiento", ctx.get("storage") or meta.get("storage"))
+    n_rows = count("n_rows")
+    n_cols = count("n_cols")
+    if n_rows is not None and n_cols is not None:
+        lines.append(
+            f"**Dimensiones:** {_fmt_int(n_rows)} filas × {_fmt_int(n_cols)} columnas")
+    add("Generado", meta.get("generated_at") or _now_iso())
+    lines.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}")
+    return lines
+
+
+# --------------------------------------------------------------------------- #
+# Per-block serializers. Each returns a Markdown string (no surrounding blanks;
+# the caller separates blocks with a blank line).
+# --------------------------------------------------------------------------- #
+def _md_heading(block) -> str:
+    """Serialize a heading block as an ATX heading of depth 3 to 6.
+
+    ``#`` and ``##`` are reserved for the document title and the chapter
+    headings, so block level 1 maps to ``###``. The depth is clamped on both
+    sides: levels above 4 stay at ``######``, and a negative level can no
+    longer produce ``#``/``##`` or an empty marker.
+    """
+    level = int(getattr(block, "level", 1) or 1)
+    depth = max(3, min(level + 2, 6))
+    text = _clean_terms(getattr(block, "text", "")).strip()
+    return f"{'#' * depth} {text}"
+
+
+def _md_markdown(block) -> str:
+    """Emit a Markdown block verbatim, minus glossary markers.
+
+    ``**bold**`` is kept; only trailing newlines are trimmed.
+    """
+    raw = getattr(block, "text", "")
+    cleaned = _clean_terms(raw)
+    return cleaned.rstrip("\n")
+
+
+def _md_kv_table(block) -> str:
+    """Serialize a key/value block as a two-column ``Campo | Valor`` table.
+
+    An optional bold title line precedes the table. Each row is read as
+    ``(label, value)``; a row that cannot be indexed that way is written whole
+    into the first column with an empty value.
+    """
+    out: list = []
+    heading = getattr(block, "title", None)
+    if heading:
+        out += [f"**{_clean_terms(heading).strip()}**", ""]
+    out += ["| Campo | Valor |", "| --- | --- |"]
+    for entry in (getattr(block, "rows", []) or []):
+        try:
+            key, val = entry[0], entry[1]
+        except Exception:  # noqa: BLE001
+            key, val = entry, ""
+        out.append(f"| {_cell(key)} | {_cell(val)} |")
+    return "\n".join(out)
+
+
+def _md_data_table(block) -> str:
+    """Serialize a data-table block as a Markdown table with every row.
+
+    Layout: optional bold title, header row, separator row, one line per data
+    row (no pagination), then an optional italic note. When the block has no
+    header, ``col1..colN`` is synthesized from the widest row. Short rows are
+    padded with empty cells; cells beyond the header width are not emitted.
+    """
+    lines: list = []
+    title = getattr(block, "title", None)
+    if title:
+        lines.append(f"**{_clean_terms(title).strip()}**")
+        lines.append("")
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    if not header:
+        # Never fewer than one column: rows that are all empty would otherwise
+        # give ncol == 0 and an invalid "|  |" header/separator pair.
+        ncol = max(1, max((len(r) for r in rows), default=1))
+        header = [f"col{i + 1}" for i in range(ncol)]
+    ncol = len(header)
+    lines.append("| " + " | ".join(_cell(h) for h in header) + " |")
+    lines.append("| " + " | ".join(["---"] * ncol) + " |")
+    for r in rows:  # dump every row — no pagination, nothing cut.
+        cells = [_cell(r[c]) if c < len(r) else "" for c in range(ncol)]
+        lines.append("| " + " | ".join(cells) + " |")
+    note = getattr(block, "note", None)
+    if note:
+        lines.append("")
+        lines.append(f"*{_clean_terms(note).strip()}*")
+    return "\n".join(lines)
+
+
+def _bars_table(bars: list) -> str:
+    """Render ``(from, to, height)`` triples as a Desde/Hasta/Frecuencia table.
+
+    At most ``_MAX_BAR_ROWS`` rows are written; when more exist, an italic
+    trailer states how many were left out.
+    """
+    visible = bars[:_MAX_BAR_ROWS]
+    body = [
+        f"| {_fmt_num(lo)} | {_fmt_num(hi)} | {_fmt_num(freq)} |"
+        for lo, hi, freq in visible
+    ]
+    table = "\n".join(
+        ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |", *body])
+    hidden = len(bars) - len(visible)
+    if hidden > 0:
+        table += f"\n\n*… ({hidden} filas más)*"
+    return table
+
+
+def _extract_bars(fig) -> list:
+    """Collect ``(x_from, x_to, height)`` for the bar-like patches of a figure.
+
+    For each axes of ``fig``, every entry of ``ax.patches`` that answers
+    ``get_width``/``get_height``/``get_x`` and has strictly positive width and
+    height is a candidate. When an axes has three or more candidates, those
+    wider than twice the (upper) median width are discarded: that removes wide
+    reference rectangles such as an ``axvspan`` band drawn over a histogram,
+    while uniform bins all sit at the median and stay. Axes with fewer than
+    three candidates are kept unfiltered.
+
+    Never raises — returns ``[]`` on any problem.
+    """
+
+    def as_bar(patch):
+        # Patches without the Rectangle accessors are simply not bars.
+        try:
+            width = patch.get_width()
+            height = patch.get_height()
+            left = patch.get_x()
+        except Exception:  # noqa: BLE001 — not a Rectangle-like patch.
+            return None
+        if width and width > 0 and height and height > 0:
+            return (left, left + width, height)
+        return None
+
+    collected: list = []
+    try:
+        for ax in fig.get_axes():
+            candidates = [
+                bar
+                for bar in map(as_bar, list(getattr(ax, "patches", []) or []))
+                if bar is not None
+            ]
+            if len(candidates) >= 3:
+                spans = sorted(hi - lo for lo, hi, _h in candidates)
+                median_span = spans[len(spans) // 2]
+                if median_span > 0:
+                    candidates = [
+                        bar for bar in candidates
+                        if (bar[1] - bar[0]) <= 2.0 * median_span
+                    ]
+            collected.extend(candidates)
+    except Exception:  # noqa: BLE001
+        return []
+    return collected
+
+
+def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
+    """Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image).
+
+    Emits the caption (glossary markers stripped, like every other text block),
+    then — if the matplotlib figure has bars — a Markdown table of the
+    underlying (Desde, Hasta, Frecuencia) values. When ``meta['embed_figures']``
+    is truthy it also exports a PNG beside the .md and appends an image link to
+    it; off by default so the Markdown stays self-contained.
+
+    The figure comes from ``block.fig`` or, failing that, from calling
+    ``block.make()``. Any failure degrades to whatever parts were already
+    built (at least the caption), and the figure is always closed afterwards.
+
+    Args:
+        block: the figure block (``caption``, ``fig`` and/or ``make``).
+        meta: renderer meta; only ``embed_figures`` is read here.
+        out_path: path of the .md being written (locates exported PNGs).
+        counter: one-element list holding the document-wide figure counter.
+
+    Returns:
+        The Markdown for the figure, parts separated by a blank line.
+    """
+    caption = _clean_terms(getattr(block, "caption", "")).strip()
+    parts = [f"*Figura: {caption}*" if caption else "*Figura*"]
+    fig = None
+    try:
+        import matplotlib
+        matplotlib.use("Agg")  # defensive: headless rasterization backend.
+        fig = getattr(block, "fig", None)
+        make = getattr(block, "make", None)
+        if fig is None and callable(make):
+            fig = make()
+        if fig is not None:
+            bars = _extract_bars(fig)
+            if bars:
+                parts.append(_bars_table(bars))
+            if meta.get("embed_figures"):
+                png = _embed_png(fig, out_path, counter)
+                if png:
+                    # Relative link: the PNG sits in the same directory as the .md.
+                    parts.append(f"![{caption}]({png})")
+    except Exception:  # noqa: BLE001 — a bad figure degrades to just its caption.
+        pass
+    finally:
+        if fig is not None:
+            try:
+                import matplotlib.pyplot as plt
+                plt.close(fig)
+            except Exception:  # noqa: BLE001
+                pass
+    return "\n\n".join(parts)
+
+
+def _embed_png(fig, out_path: str, counter: list) -> str:
+    """Save ``fig`` as ``<base>_figN.png`` next to the .md and return that name.
+
+    ``<base>`` is the .md file name without extension (``figura`` when empty)
+    and ``N`` is ``counter[0]`` after being incremented here. Returns ``""``
+    if anything fails; the counter bump is kept even when saving fails.
+    """
+    try:
+        counter[0] += 1
+        stem, _ext = os.path.splitext(os.path.basename(out_path))
+        name = f"{stem or 'figura'}_fig{counter[0]}.png"
+        target_dir = os.path.dirname(os.path.abspath(out_path))
+        fig.savefig(os.path.join(target_dir, name),
+                    format="png", dpi=120, bbox_inches="tight")
+    except Exception:  # noqa: BLE001
+        return ""
+    return name
+
+
+def _md_image(block) -> str:
+    """Serialize an image block as a Markdown image link plus optional caption.
+
+    The link uses the caption as alt text and ``block.path`` as target; when a
+    caption exists it is repeated below the image in italics.
+    """
+    path = model._safe_str(getattr(block, "path", ""))
+    caption = model._safe_str(getattr(block, "caption", "")).strip()
+    out = f"![{caption}]({path})"
+    if caption:
+        out += f"\n\n*{caption}*"
+    return out
+
+
+def _md_caption(block) -> str:
+    """Serialize a caption block as an italic line.
+
+    Returns ``""`` for an empty caption so the caller skips it: wrapping an
+    empty string in asterisks would print a literal ``**``.
+    """
+    text = _clean_terms(getattr(block, "text", "")).strip()
+    return f"*{text}*" if text else ""
+
+
+def _md_note(block) -> str:
+    """Serialize a note block as a Markdown blockquote.
+
+    Every line is prefixed with ``> ``; blank lines become a bare ``>`` so the
+    quote is not interrupted.
+    """
+    body = _clean_terms(getattr(block, "text", "")).strip()
+    quoted = []
+    for line in body.split("\n"):
+        quoted.append(f"> {line}" if line.strip() else ">")
+    return "\n".join(quoted)
+
+
+def _md_group(block, meta: dict, out_path: str, counter: list) -> str:
+    """Serialize a group: optional ``###`` title followed by its child blocks.
+
+    Children are serialized in order and joined with blank lines. A child whose
+    serializer raises is degraded to a readable note (the same fallback
+    ``render_md`` uses for top-level blocks) instead of vanishing silently.
+    """
+    parts: list = []
+    title = getattr(block, "title", None)
+    if title:
+        parts.append(f"### {_clean_terms(title).strip()}")
+    for b in (getattr(block, "blocks", []) or []):
+        try:
+            seg = _serialize_block(b, meta, out_path, counter)
+        except Exception:  # noqa: BLE001 — keep the failing child visible.
+            seg = _md_note(model.Note(text=model._safe_str(b)))
+        if seg:
+            parts.append(seg)
+    return "\n\n".join(parts)
+
+
+def _md_glossary_entry(block) -> str:
+    """Serialize a glossary entry as a ``###`` heading plus its definition.
+
+    The heading uses the entry's ``label``, or its ``key`` when the label is
+    empty. The definition paragraph is omitted when empty.
+    """
+    heading = model._safe_str(getattr(block, "label", "")).strip()
+    if not heading:
+        heading = model._safe_str(getattr(block, "key", "")).strip()
+    body = _clean_terms(getattr(block, "definition", "")).strip()
+    pieces = [f"### {heading}"]
+    if body:
+        pieces.append(body)
+    return "\n\n".join(pieces)
+
+
+def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
+    """Route one block to the serializer for its ``kind``.
+
+    ``figure`` and ``group`` need the render context (meta, output path, figure
+    counter); every other known kind only needs the block. An unknown kind is
+    rendered as a note containing the block's safe string form.
+    """
+    kind = getattr(block, "kind", "")
+    if kind == "figure":
+        return _md_figure(block, meta, out_path, counter)
+    if kind == "group":
+        return _md_group(block, meta, out_path, counter)
+    # Compared with == one by one (not a dict lookup) so an unhashable `kind`
+    # still falls through to the note below.
+    context_free = (
+        ("heading", _md_heading),
+        ("markdown", _md_markdown),
+        ("kv_table", _md_kv_table),
+        ("data_table", _md_data_table),
+        ("image", _md_image),
+        ("caption", _md_caption),
+        ("note", _md_note),
+        ("glossary_entry", _md_glossary_entry),
+    )
+    for known, serializer in context_free:
+        if kind == known:
+            return serializer(block)
+    # Unknown content -> readable note (mirrors the model's defensive coercion).
+    return _md_note(model.Note(text=model._safe_str(block)))
+
+
+# --------------------------------------------------------------------------- #
+# Entry point.
+# --------------------------------------------------------------------------- #
+def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
+    """Serialize a list of Chapters into a single self-contained Markdown file.
+
+    The output leads with ``# <title>``, a metadata blockquote and a numbered
+    ``## Índice`` linking each chapter, then one ``## N. <title>`` section per
+    chapter with its blocks. Tables become Markdown tables (every row dumped),
+    figures become caption + underlying data table, glossary markers are stripped
+    while ``**bold**`` is kept. Designed to be pasted into an LLM.
+
+    Args:
+        chapters: a list of ``Chapter`` (dataclasses or dicts); normalized
+            with ``model.as_chapters``.
+        out_path: filesystem path for the ``.md`` (parent dirs are created).
+        meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
+            ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
+            ``generated_at``, ``embed_figures`` (export PNGs beside the .md,
+            default False).
+
+    Returns:
+        dict (never raises): ``{path: str|None, n_chars: int,
+        chapters: list[{id, version}], note: str}``. On a fatal error — while
+        composing the document or while writing it — ``path`` is None and
+        ``note`` explains why.
+    """
+    try:
+        content, chapters_meta, note = _compose_md(chapters, out_path, meta)
+    except Exception as e:  # noqa: BLE001 — enforce the dict-no-throw contract.
+        return {"path": None, "n_chars": 0, "chapters": [],
+                "note": f"no se pudo generar el Markdown: {e}"}
+    return _write(out_path, content, chapters_meta, note)
+
+
+def _compose_md(chapters: list, out_path: str, meta: dict) -> tuple:
+    """Build ``(content, chapters_meta, note)`` for ``render_md``; may raise."""
+    meta = meta or {}
+    chapters = model.as_chapters(chapters)
+    title = model._safe_str(meta.get("title")) or model.ENGINE_NAME
+
+    # Edge: nothing to render -> a minimal but valid Markdown document.
+    if not chapters:
+        content = (f"# {title}\n\n"
+                   "*(documento vacío — sin capítulos aplicables)*\n")
+        return content, [], "documento vacío"
+
+    counter = [0]  # document-wide figure counter for unique PNG names.
+    notes: list = []
+    segments: list = [f"# {title}"]
+
+    meta_lines = _meta_block(meta)
+    if meta_lines:
+        segments.append("\n".join(f"> {ln}" for ln in meta_lines))
+
+    # Numbered index. The anchor is the slug of the chapter heading emitted
+    # below (``## N. <title>``), so it includes the chapter number.
+    chap_heads = []
+    idx_lines = ["## Índice"]
+    for i, ch in enumerate(chapters, 1):
+        head_text = f"{i}. {model._safe_str(ch.title)}"
+        anchor = _slug(head_text)
+        chap_heads.append((head_text, anchor))
+        idx_lines.append(f"{i}. [{model._safe_str(ch.title)}](#{anchor})")
+    segments.append("\n".join(idx_lines))
+
+    chapters_meta = []
+    for i, ch in enumerate(chapters, 1):
+        segments.append("---")
+        head_text, _anchor = chap_heads[i - 1]
+        segments.append(f"## {head_text}")
+
+        blocks = list(ch.blocks or [])
+        # Omit a leading level-1 Heading that just repeats the chapter title.
+        if blocks:
+            b0 = blocks[0]
+            if (getattr(b0, "kind", "") == "heading"
+                    and int(getattr(b0, "level", 1) or 1) == 1
+                    and _clean_terms(getattr(b0, "text", "")).strip()
+                    == model._safe_str(ch.title).strip()):
+                blocks = blocks[1:]
+
+        for block in blocks:
+            try:
+                seg = _serialize_block(block, meta, out_path, counter)
+            except Exception as e:  # noqa: BLE001
+                # Degrade to a readable note and record why in the result note.
+                seg = _md_note(model.Note(text=model._safe_str(block)))
+                notes.append(
+                    f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
+                    f"'{ch.id}' degradado: {e}")
+            if seg:
+                segments.append(seg)
+        chapters_meta.append({"id": ch.id, "version": ch.version})
+
+    content = "\n\n".join(segments) + "\n"
+    note = f"{len(content)} caracteres"
+    if notes:
+        note += " · " + "; ".join(notes)
+    return content, chapters_meta, note
+
+
+def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict:
+    """Persist ``content`` at ``out_path`` as UTF-8 and build the result dict.
+
+    Missing parent directories are created. Never raises: on any failure the
+    result has ``path`` None, ``n_chars`` 0, no chapters and a ``note`` carrying
+    the error text; on success it echoes ``out_path``, the character count,
+    ``chapters_meta`` and ``note``.
+    """
+    try:
+        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
+        with open(out_path, "w", encoding="utf-8") as sink:
+            sink.write(content)
+    except Exception as err:  # noqa: BLE001 — never raise from the writer.
+        failure = {
+            "path": None,
+            "n_chars": 0,
+            "chapters": [],
+            "note": f"no se pudo escribir el Markdown: {err}",
+        }
+        return failure
+    return {
+        "path": out_path,
+        "n_chars": len(content),
+        "chapters": chapters_meta,
+        "note": note,
+    }
diff --git a/python/functions/datascience/render_automatic_eda_markdown.md b/python/functions/datascience/render_automatic_eda_markdown.md
new file mode 100644
index 00000000..6615baf9
--- /dev/null
+++ b/python/functions/datascience/render_automatic_eda_markdown.md
@@ -0,0 +1,89 @@
+---
+name: render_automatic_eda_markdown
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def render_automatic_eda_markdown(chapters_or_profile, out_path: str, meta: dict = None) -> dict"
+description: "Renderiza un documento AutomaticEDA por CAPÍTULOS (modelo de bloques independiente del formato) en un único MARKDOWN autocontenido pensado para PEGAR A UN LLM. Acepta una lista de capítulos del modelo o directamente un TableProfile del grupo eda (construye los capítulos canónicos con build_document). Prioriza TEXTO + DATOS sobre lo visual: las tablas se vuelcan como tablas markdown con TODAS las filas (sin paginar — no hay páginas que cortar), una figura matplotlib se reduce a su caption más la tabla de datos subyacente (Desde/Hasta/Frecuencia de las barras del histograma) porque un LLM no ve la imagen, y los marcadores de glosario se eliminan conservando el **negrita**. Lleva cabecera (# título), bloque de metadatos en blockquote e índice numerado con anclas GitHub. Espejo de render_automatic_eda_pdf/render_automatic_eda_pptx pero SIN manifest (KISS, el markdown es un único artefacto de texto). dict-no-throw: nunca lanza, devuelve {path, n_chars, chapters, note}; en error fatal path es None y note explica la causa. Flag opcional meta['embed_figures'] exporta PNGs junto al .md (off por defecto)."
+tags: [eda, markdown, render, report, llm, automatic-eda, chapters, versioned, no-cut, text, datascience, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [os, re, matplotlib, "datascience.automatic_eda"]
+params:
+ - name: chapters_or_profile
+ desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note, group, glossary_entry. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza."
+ - name: out_path
+ desc: "ruta del archivo .md de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path: None, note: <causa>} sin lanzar."
+ - name: meta
+ desc: "dict opcional. Claves: title (título del documento), ctx (dict con dataset_name→Dataset, source_origin→Fuente, storage→Almacenamiento, n_rows/n_cols→Dimensiones; también lo consumen los builders de capítulo cuando se da un profile), generated_at (timestamp; si falta se genera la fecha-hora actual en UTC), embed_figures (True para exportar PNGs <base>_figN.png junto al .md; por defecto False y el markdown queda autocontenido)."
+output: "dict (nunca lanza): {path: str|None, n_chars: int, chapters: list[{id,version}], note: str}. En error fatal (p.ej. directorio no escribible) path es None y note explica la causa. Un documento sin capítulos aplicables produce un markdown mínimo válido con 'documento vacío' y chapters=[]."
+tested: true
+tests: ["test_golden_bloques_sinteticos_serializa_todo_a_markdown", "test_figura_descarta_axvspan_de_la_tabla_de_bins", "test_edge_documento_vacio_no_revienta", "test_profile_path_construye_capitulos_y_escribe"]
+test_file_path: "python/functions/datascience/render_automatic_eda_markdown_test.py"
+file_path: "python/functions/datascience/render_automatic_eda_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from datascience import render_automatic_eda_markdown
+
+# Desde un TableProfile del grupo eda (mismo modelo que los renderers PDF/PPTX).
+profile = {
+ "table": "ventas", "source": "/data/ventas.csv",
+ "n_rows": 1000, "n_cols": 2, "quality_score": 92.5,
+ "columns": [
+ {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01,
+ "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0,
+ "std": 12.3}},
+ {"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0,
+ "categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
+ ],
+}
+res = render_automatic_eda_markdown(
+ profile, "reports/ventas_aeda.md",
+ {"title": "EDA — ventas",
+ "ctx": {"dataset_name": "Ventas", "source_origin": "ERP export",
+ "n_rows": 1000, "n_cols": 2}})
+print(res["path"], res["n_chars"], res["chapters"])
+# -> reports/ventas_aeda.md 4123 [{'id':'portada','version':'1.0.0'}, ...]
+```
+
+## Cuando usarla
+
+Cuando quieras **pegar el EDA a un LLM** (ChatGPT, Claude, ...) o tenerlo en texto
+plano versionable: mismo documento por capítulos que el PDF/PPTX, pero serializado a
+Markdown sin binarios. Úsala como tercera salida junto a `render_automatic_eda_pdf`
+(móvil) y `render_automatic_eda_pptx` (compartir) desde el MISMO modelo de capítulos.
+A diferencia de esas dos, no hay páginas ni slides: todas las filas de cada tabla se
+vuelcan (nada se corta) y cada figura se reduce a su caption + la tabla de datos
+subyacente, que es lo que un LLM puede leer. Para añadir capítulos al documento, ver
+`docs/capabilities/automatic_eda.md`.
+
+## Gotchas
+
+- **Impura**: escribe el `.md` en `out_path` (crea los directorios padre). Con
+ `meta['embed_figures']=True` además exporta un PNG `<base>_figN.png` por figura
+ junto al `.md`; por defecto NO exporta nada y el markdown queda autocontenido.
+- **Nunca lanza** (dict-no-throw): un bloque que falle se degrada a una nota y se anota
+ en `note`; el documento se escribe igual. Un profile/lista vacíos producen un markdown
+ mínimo válido con `*(documento vacío …)*` y `chapters=[]`.
+- **Figuras = datos, no imagen**: un bloque `figure` se serializa como `*Figura: caption*`
+ más, si la figura matplotlib trae barras (histograma / barras), una tabla
+ `| Desde | Hasta | Frecuencia |` extraída de los `Rectangle` patches (máx 100 filas;
+ el resto se trunca con `*… (N filas más)*`). Si no hay barras o algo falla, solo sale
+ el caption. La figura se cierra (`plt.close`) tras leerla.
+- **Glosario vs negrita**: se eliminan SOLO los marcadores de glosario
+ `[[term:key]]visible[[/term]]` (queda `visible`); el `**negrita**` markdown SE
+ CONSERVA (es válido). No se usa `strip_inline_md` aquí porque ese también quita el bold.
+- **Anclas del índice**: el `## Índice` enlaza cada capítulo con un ancla estilo GitHub
+ del encabezado `## N. Título` (minúsculas, espacios→`-`, sin signos). Como el ancla
+ incluye el número `N` del capítulo, dos capítulos con el mismo título exacto NO
+ colisionan (los capítulos canónicos tienen, además, títulos únicos).
+- **Tablas**: las celdas escapan `|` (→ `\|`) y pliegan saltos de línea a `<br>` para no
+ romper la columna. No hay reparto por ancho — un LLM no lo necesita.
diff --git a/python/functions/datascience/render_automatic_eda_markdown.py b/python/functions/datascience/render_automatic_eda_markdown.py
new file mode 100644
index 00000000..649b2cd1
--- /dev/null
+++ b/python/functions/datascience/render_automatic_eda_markdown.py
@@ -0,0 +1,55 @@
+"""render_automatic_eda_markdown — chapter-based EDA report as one Markdown file.
+
+Public ``eda``-group entry point that serializes an AutomaticEDA document (a list
+of chapters, or an ``eda`` TableProfile from which the canonical chapters are
+built) into a single self-contained Markdown file optimised to be **pasted into
+an LLM**: plain text, Markdown tables (every row dumped — there are no pages to
+cut), figures reduced to caption + underlying data, no binaries. It mirrors
+``render_automatic_eda_pdf`` / ``render_automatic_eda_pptx`` but for text output;
+unlike those it writes no manifest (KISS — Markdown is a single text artefact).
+
+dict-no-throw: never raises. Returns ``{path, n_chars, chapters, note}``; on a
+fatal error ``path`` is None and ``note`` explains why.
+"""
+
+from __future__ import annotations
+
+from datascience.automatic_eda import build_document, render_md
+from datascience.automatic_eda.model import as_chapter, as_chapters
+
+
+def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
+    """Turn the public argument into a list of chapters.
+
+    * list / tuple -> normalized with ``as_chapters``;
+    * dict with ``blocks`` and without ``columns`` -> treated as one chapter
+      (``as_chapter``); an empty list when that yields None;
+    * any other dict -> treated as an eda profile and expanded with
+      ``build_document(profile, meta['ctx'])``;
+    * anything else -> empty list.
+    """
+    src = chapters_or_profile
+    if isinstance(src, (list, tuple)):
+        return as_chapters(list(src))
+    if not isinstance(src, dict):
+        return []
+    looks_like_chapter = "blocks" in src and "columns" not in src
+    if not looks_like_chapter:
+        return build_document(src, (meta or {}).get("ctx"))
+    chapter = as_chapter(src)
+    return [] if chapter is None else [chapter]
+
+
+def render_automatic_eda_markdown(chapters_or_profile, out_path: str,
+                                  meta: dict = None) -> dict:
+    """Render an AutomaticEDA document into a single self-contained Markdown file.
+
+    Args:
+        chapters_or_profile: a list of chapters (``Chapter`` dataclasses or
+            dicts) or an ``eda`` TableProfile dict (chapters built via
+            ``build_document(profile, meta['ctx'])``).
+        out_path: filesystem path for the ``.md`` (parent dirs are created).
+        meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
+            ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
+            ``generated_at``, ``embed_figures`` (export PNGs beside the .md,
+            default False — off keeps the Markdown self-contained).
+
+    Returns:
+        dict (never raises): ``{path: str|None, n_chars: int,
+        chapters: list[{id, version}], note: str}``. On a fatal error —
+        including a failure while building the chapters from a profile —
+        ``path`` is None and ``note`` explains the cause.
+    """
+    try:
+        meta = dict(meta or {})
+        chapters = _coerce_chapters(chapters_or_profile, meta)
+    except Exception as e:  # noqa: BLE001 — keep the dict-no-throw contract.
+        # A non-mapping ``meta`` or a failing chapter builder lands here; the
+        # result has the same shape as a failed write.
+        return {"path": None, "n_chars": 0, "chapters": [],
+                "note": f"no se pudo construir el documento: {e}"}
+    return render_md(chapters, out_path, meta)
diff --git a/python/functions/datascience/render_automatic_eda_markdown_test.py b/python/functions/datascience/render_automatic_eda_markdown_test.py
new file mode 100644
index 00000000..5d77ee10
--- /dev/null
+++ b/python/functions/datascience/render_automatic_eda_markdown_test.py
@@ -0,0 +1,168 @@
+"""Tests for render_automatic_eda_markdown — DoD: golden + edge + profile path.
+
+Self-contained synthetic blocks (no DuckDB). Verifies every block kind serializes
+to Markdown (heading, markdown with glossary+bold, kv/data tables, a figure whose
+histogram bars become a data table, caption, note, group, glossary entry), that a
+leading level-1 heading equal to the chapter title is omitted, that an empty
+document degrades to a valid minimal Markdown without raising, and that passing a
+minimal TableProfile builds chapters and writes the file.
+"""
+
+import os
+import tempfile
+
+from datascience.render_automatic_eda_markdown import render_automatic_eda_markdown
+from datascience.automatic_eda.model import (
+ Caption, Chapter, DataTable, Figure, GlossaryEntry, Group, Heading, KVTable,
+ Markdown, Note,
+)
+
+
+def _hist_fig():
+    """Build a 5-bin histogram figure of a small fixed sample (Agg backend)."""
+    import matplotlib
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+    sample = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
+    figure, axes = plt.subplots()
+    axes.hist(sample, bins=5)
+    return figure
+
+
+def _chapters() -> list:
+    """Return one synthetic chapter holding one block of every supported kind."""
+    resumen = KVTable(rows=[("Filas", 1000), ("Columnas", 5)], title="Resumen")
+    datos = DataTable(
+        header=["col", "valor"],
+        rows=[["alpha", "111"], ["beta", "222"], ["gamma", "333"]],
+        title="Datos", note="nota inferior")
+    grupo = Group(title="Grupo X", blocks=[Markdown("dentro del grupo")])
+    glosario = GlossaryEntry(
+        key="ent", label="Entropia", definition="Medida de incertidumbre.")
+    contenido = [
+        Heading("Demo", 1),  # same as the chapter title -> omitted on output.
+        Heading("Seccion dos", 2),  # level 2 -> ####
+        Markdown("Texto con [[term:ent]]entropia[[/term]] y **bold** aqui."),
+        resumen,
+        datos,
+        Figure(make=_hist_fig, caption="Histograma demo"),
+        Caption("pie de figura"),
+        Note("una nota aparte"),
+        grupo,
+        glosario,
+    ]
+    demo = Chapter(id="demo", title="Demo", version="1.0.0", blocks=contenido)
+    return [demo]
+
+
+def _read(path: str) -> str:
+    """Return the full UTF-8 text of the file at ``path``."""
+    with open(path, encoding="utf-8") as src:
+        text = src.read()
+    return text
+
+
+def test_golden_bloques_sinteticos_serializa_todo_a_markdown():
+    """Golden path: every synthetic block kind is serialized to Markdown."""
+    doc_meta = {"title": "EDA Demo",
+                "ctx": {"dataset_name": "Demo", "n_rows": 12, "n_cols": 2}}
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "demo.md")
+        result = render_automatic_eda_markdown(_chapters(), target, doc_meta)
+        assert result["path"] == target
+        assert os.path.exists(target)
+        assert result["n_chars"] > 0
+        assert result["chapters"] == [{"id": "demo", "version": "1.0.0"}]
+
+        text = _read(target)
+        # Document structure.
+        assert text.startswith("# ")
+        assert "## Índice" in text
+        # A Markdown table is present (header + separator row).
+        assert "| " in text and "| --- " in text
+        # DataTable values are all dumped.
+        for value in ("alpha", "111", "beta", "222", "gamma", "333"):
+            assert value in text
+        # Glossary markers stripped, bold kept, visible glossary text preserved.
+        assert "[[term" not in text
+        assert "[[/term]]" not in text
+        assert "**bold**" in text
+        assert "entropia" in text
+        # Figure histogram bars became a data table.
+        assert "| Desde | Hasta | Frecuencia |" in text
+        # Glossary entry rendered as a level-3 heading.
+        assert "### Entropia" in text
+        # Level-2 heading -> ####.
+        assert "#### Seccion dos" in text
+        # Leading level-1 heading equal to the title was omitted.
+        assert "### Demo" not in text
+        # Group title rendered.
+        assert "### Grupo X" in text
+
+
+def _hist_fig_with_span():
+    """Build a 5-bin histogram with a wide ``axvspan`` band drawn over it.
+
+    The band spans x=2..4 and is stored by matplotlib as a lone Rectangle in
+    ``ax.patches`` next to the bin bars; the renderer must not report it as a
+    bin (it is far wider than one).
+    """
+    import matplotlib
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+    sample = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
+    figure, axes = plt.subplots()
+    axes.hist(sample, bins=5)
+    axes.axvspan(2.0, 4.0, alpha=0.2)  # wide stray rectangle over the bins.
+    return figure
+
+
+def test_figura_descarta_axvspan_de_la_tabla_de_bins():
+    """The wide band rectangle must not show up as a row of the bins table."""
+    figure_block = Figure(make=_hist_fig_with_span, caption="Hist con banda")
+    chapters = [
+        Chapter(id="f", title="Fig", version="1.0.0", blocks=[figure_block])]
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "fig.md")
+        render_automatic_eda_markdown(chapters, target, {"title": "T"})
+        text = _read(target)
+        assert "| Desde | Hasta | Frecuencia |" in text
+        # Table rows = the "|" lines after the header and separator, up to the
+        # first line that is not part of the table.
+        all_lines = text.splitlines()
+        header_at = next(
+            pos for pos, line in enumerate(all_lines)
+            if line.startswith("| Desde | Hasta | Frecuencia |"))
+        table_rows = []
+        for line in all_lines[header_at + 2:]:
+            if not line.startswith("|"):
+                break
+            table_rows.append(line)
+        # 5 histogram bins, no extra wide span row.
+        assert len(table_rows) == 5, table_rows
+        # No row is ~2.0 wide (the axvspan from x=2 to x=4).
+        for line in table_rows:
+            cells = [c.strip() for c in line.strip("|").split("|")]
+            lo, hi_v = float(cells[0]), float(cells[1])
+            assert (hi_v - lo) < 1.5, f"wide span leaked: {line}"
+
+
+def test_edge_documento_vacio_no_revienta():
+    """An empty chapter list still writes a minimal, valid Markdown file."""
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "empty.md")
+        result = render_automatic_eda_markdown([], target, {})
+        assert result["path"] == target
+        assert os.path.exists(target)
+        assert result["chapters"] == []
+        text = _read(target)
+        assert text.startswith("# ")
+        assert "documento vacío" in text
+
+
+def test_profile_path_construye_capitulos_y_escribe():
+    """A minimal TableProfile is expanded into chapters and written to disk."""
+    numeric_stats = {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
+                     "std": 0.5}
+    column_x = {"name": "x", "inferred_type": "numeric", "null_pct": 0.0,
+                "null_count": 0, "numeric": numeric_stats}
+    profile = {
+        "table": "mini",
+        "source": "/data/mini.csv",
+        "n_rows": 10,
+        "n_cols": 1,
+        "quality_score": 88.0,
+        "columns": [column_x],
+    }
+    doc_meta = {"title": "Mini", "ctx": {"dataset_name": "Mini"}}
+    with tempfile.TemporaryDirectory() as tmp:
+        target = os.path.join(tmp, "mini.md")
+        result = render_automatic_eda_markdown(profile, target, doc_meta)
+        assert result["path"] == target  # not None — no exception, file written.
+        assert os.path.exists(target)
+        assert result["n_chars"] > 0
diff --git a/python/functions/pipelines/render_automatic_eda.py b/python/functions/pipelines/render_automatic_eda.py
index 8090bc1f..5361b927 100644
--- a/python/functions/pipelines/render_automatic_eda.py
+++ b/python/functions/pipelines/render_automatic_eda.py
@@ -1,9 +1,10 @@
-"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX.
+"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX + MD.
Pipeline impuro del grupo de capacidad `eda`. Dada UNA tabla DuckDB (o
-PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus dos formatos a la
-vez (PDF móvil A5 + PPTX 16:9) con los 11 capítulos POBLADOS, en una sola
-llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
+PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus tres formatos a la
+vez (PDF móvil A5 + PPTX 16:9 + Markdown autocontenido para pegar a un LLM) con
+los capítulos POBLADOS, en una sola llamada. Compone, sin reimplementar su
+lógica, varias funciones del registry:
- profile_table : perfila la tabla end-to-end (TableProfile agregado),
opcionalmente con modelos baratos y análisis de serie.
@@ -12,8 +13,11 @@ llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
modelos/geo, timeseries_raw para series, geo_points
para el mapa, db_path/table para la agregación
push-down). Sin él, esos capítulos degradan.
- - render_automatic_eda_pdf : renderiza el documento por capítulos a PDF.
- - render_automatic_eda_pptx : renderiza el mismo documento a PPTX.
+ - render_automatic_eda_pdf : renderiza el documento por capítulos a PDF.
+ - render_automatic_eda_pptx : renderiza el mismo documento a PPTX.
+ - render_automatic_eda_markdown : serializa el mismo documento a Markdown
+ autocontenido (texto + tablas markdown, sin
+ binarios) para incorporar a un LLM.
El TableProfile agregado basta para portada/overview/distribuciones/calidad/
correlación, pero los capítulos `modelos`, `timeseries`, `geospatial` y
@@ -32,6 +36,7 @@ from datetime import datetime, timezone
from datascience import (
build_eda_render_ctx,
+ render_automatic_eda_markdown,
render_automatic_eda_pdf,
render_automatic_eda_pptx,
run_eda_models,
@@ -93,6 +98,7 @@ def render_automatic_eda(
out_dir: str = "reports",
basename: str = None,
ctx_extra: dict = None,
+ emit_md: bool = True,
) -> dict:
- """Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX).
+ """Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX + MD).
@@ -140,13 +146,19 @@ def render_automatic_eda(
ctx_extra: dict opcional con claves de presentación/contexto extra que se
mezclan en el ctx (p.ej. dataset_name, description, source_origin).
No pisan las claves de datos calculadas por build_eda_render_ctx.
+ emit_md: además del PDF y el PPTX, emite un Markdown autocontenido del
+ MISMO documento por capítulos (texto plano + tablas markdown, sin
+ binarios), pensado para pegar a un LLM. Default True. La ruta sale en
+ la clave de retorno ``aeda_md_path``. No altera las demás salidas.
Returns:
dict (nunca lanza). En éxito::
{"status": "ok", "pdf_path": str, "pptx_path": str,
- "manifest_path": str|None, "n_pages": int, "n_slides": int,
- "pdf_note": str, "pptx_note": str, "profile": }
+ "aeda_md_path": str|None, "manifest_path": str|None,
+ "n_pages": int, "n_slides": int, "md_chars": int|None,
+ "pdf_note": str, "pptx_note": str, "md_note": str|None,
+ "profile": dict}
En error: {"status": "error", "error": str}.
"""
@@ -243,15 +255,26 @@ def render_automatic_eda(
rpdf = render_automatic_eda_pdf(prof, pdf_path, meta) or {}
rpptx = render_automatic_eda_pptx(prof, pptx_path, meta) or {}
+ # Salida Markdown autocontenida (mismo documento por capítulos) para
+ # pegar a un LLM. Aditiva: no afecta a PDF/PPTX/manifest. dict-no-throw.
+ rmd = {}
+ md_path = None
+ if emit_md:
+ md_path = os.path.join(out_dir, base + ".md")
+ rmd = render_automatic_eda_markdown(prof, md_path, meta) or {}
+
return {
"status": "ok",
"pdf_path": rpdf.get("path"),
"pptx_path": rpptx.get("path"),
+ "aeda_md_path": rmd.get("path"),
"manifest_path": rpdf.get("manifest_path"),
"n_pages": rpdf.get("n_pages"),
"n_slides": rpptx.get("n_slides"),
+ "md_chars": rmd.get("n_chars"),
"pdf_note": rpdf.get("note"),
"pptx_note": rpptx.get("note"),
+ "md_note": rmd.get("note"),
"profile": prof,
}
except Exception as e: # noqa: BLE001 — dict-no-throw: degradar, nunca lanzar.