diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index f1505d22..cdefab14 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -64,6 +64,7 @@ from .exploratory_caveats import exploratory_caveats from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational from .render_automatic_eda_pdf import render_automatic_eda_pdf from .render_automatic_eda_pptx import render_automatic_eda_pptx +from .render_automatic_eda_markdown import render_automatic_eda_markdown from .detect_time_column import detect_time_column from .extract_timeseries_raw import extract_timeseries_raw from .build_eda_render_ctx import build_eda_render_ctx @@ -82,6 +83,7 @@ __all__ = [ "resample_timeseries", "render_automatic_eda_pdf", "render_automatic_eda_pptx", + "render_automatic_eda_markdown", "decode_qr_image", "adf_kpss_stationarity", "acf_pacf", diff --git a/python/functions/datascience/automatic_eda/__init__.py b/python/functions/datascience/automatic_eda/__init__.py index f9a6f2e3..01085313 100644 --- a/python/functions/datascience/automatic_eda/__init__.py +++ b/python/functions/datascience/automatic_eda/__init__.py @@ -36,6 +36,7 @@ from .model import ( # noqa: F401 from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401 from .render_pdf_impl import render_pdf # noqa: F401 from .render_pptx_impl import render_pptx # noqa: F401 +from .render_md_impl import render_md # noqa: F401 __all__ = [ "ENGINE_NAME", @@ -60,4 +61,5 @@ __all__ = [ "build_document", "render_pdf", "render_pptx", + "render_md", ] diff --git a/python/functions/datascience/automatic_eda/render_md_impl.py b/python/functions/datascience/automatic_eda/render_md_impl.py new file mode 100644 index 00000000..fba8ba6f --- /dev/null +++ b/python/functions/datascience/automatic_eda/render_md_impl.py @@ -0,0 +1,458 @@ +"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM. 
+ +Same document model as the PDF/PPTX renderers (an ordered list of +:class:`Chapter`, each a list of format-independent blocks) but emitted as plain +**Markdown** instead of a binary. The goal is different from the other two +renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises +TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no +pagination — nothing is cut because there are no pages); a ``Figure`` becomes its +caption plus, when possible, the underlying bar/histogram data as a Markdown +table (an LLM cannot see the image); glossary term markers are stripped while +``**bold**`` is kept (it is valid Markdown). + +dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a +fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a +malformed block degrades to a readable note rather than crashing the document. +""" + +from __future__ import annotations + +import os +import re + +from . import model + +# Glossary span markers (kept text, dropped markers). We intentionally do NOT use +# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes +# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM. +_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]") +_MAX_BAR_ROWS = 100 + + +# --------------------------------------------------------------------------- # +# Small helpers. +# --------------------------------------------------------------------------- # +def _clean_terms(s) -> str: + """Drop glossary term markers, keeping the visible text (and any **bold**).""" + s = model._safe_str(s) + s = _TERM_OPEN_RE.sub("", s) + return s.replace("[[/term]]", "") + + +def _cell(v) -> str: + """Render a value as a safe Markdown table cell. + + Escapes pipes (``|`` -> ``\\|``) so they do not break the column layout and + folds newlines to ``
<br>`` so a multi-line value stays inside one cell. None + becomes an empty string. + """ + s = model._safe_str(v) + s = s.replace("|", "\\|") + s = s.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>
") + return s + + +def _slug(text: str) -> str: + """GitHub-style heading anchor: lowercase, spaces->'-', drop other symbols.""" + s = model._safe_str(text).strip().lower() + out = [] + for ch in s: + if ch.isalnum(): + out.append(ch) + elif ch in " -": + out.append("-") + # any other symbol is dropped. + slug = "".join(out) + while "--" in slug: + slug = slug.replace("--", "-") + return slug.strip("-") + + +def _fmt_num(v) -> str: + """Compact number for the figure data tables (ints as ints, else 4 sig figs).""" + try: + f = float(v) + except Exception: # noqa: BLE001 + return model._safe_str(v) + if f != f: # NaN + return "NaN" + if f == int(f) and abs(f) < 1e15: + return str(int(f)) + return f"{f:.4g}" + + +def _fmt_int(v) -> str: + try: + return str(int(v)) + except Exception: # noqa: BLE001 + return model._safe_str(v) + + +def _now_iso() -> str: + from datetime import datetime, timezone + return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + + +# --------------------------------------------------------------------------- # +# Document header (title + metadata blockquote + numbered index). 
+# --------------------------------------------------------------------------- # +def _meta_block(meta: dict) -> list: + """Build the metadata lines for the header blockquote (omitting absentees).""" + ctx = meta.get("ctx") if isinstance(meta.get("ctx"), dict) else {} + lines: list = [] + + def add(label, value) -> None: + if value is None: + return + s = model._safe_str(value).strip() + if s and s.lower() != "none": + lines.append(f"**{label}:** {s}") + + add("Dataset", ctx.get("dataset_name") or meta.get("dataset_name")) + add("Fuente", ctx.get("source_origin") or meta.get("source_origin")) + add("Almacenamiento", ctx.get("storage") or meta.get("storage")) + n_rows = ctx.get("n_rows", meta.get("n_rows")) + n_cols = ctx.get("n_cols", meta.get("n_cols")) + if n_rows is not None and n_cols is not None: + lines.append( + f"**Dimensiones:** {_fmt_int(n_rows)} filas × {_fmt_int(n_cols)} columnas") + add("Generado", meta.get("generated_at") or _now_iso()) + lines.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}") + return lines + + +# --------------------------------------------------------------------------- # +# Per-block serializers. Each returns a Markdown string (no surrounding blanks; +# the caller separates blocks with a blank line). +# --------------------------------------------------------------------------- # +def _md_heading(block) -> str: + level = int(getattr(block, "level", 1) or 1) + hashes = "#" * min(level + 2, 6) # level1 -> ###; '#'/'##' reserved for doc/chapter. + text = _clean_terms(getattr(block, "text", "")).strip() + return f"{hashes} {text}" + + +def _md_markdown(block) -> str: + # Keep the text verbatim, dropping only glossary markers (keep **bold**). 
+ return _clean_terms(getattr(block, "text", "")).rstrip("\n") + + +def _md_kv_table(block) -> str: + lines: list = [] + title = getattr(block, "title", None) + if title: + lines.append(f"**{_clean_terms(title).strip()}**") + lines.append("") + lines.append("| Campo | Valor |") + lines.append("| --- | --- |") + for row in (getattr(block, "rows", []) or []): + try: + label, value = row[0], row[1] + except Exception: # noqa: BLE001 + label, value = row, "" + lines.append(f"| {_cell(label)} | {_cell(value)} |") + return "\n".join(lines) + + +def _md_data_table(block) -> str: + lines: list = [] + title = getattr(block, "title", None) + if title: + lines.append(f"**{_clean_terms(title).strip()}**") + lines.append("") + header = list(getattr(block, "header", []) or []) + rows = list(getattr(block, "rows", []) or []) + if not header: + ncol = max((len(r) for r in rows), default=1) + header = [f"col{i + 1}" for i in range(ncol)] + ncol = len(header) + lines.append("| " + " | ".join(_cell(h) for h in header) + " |") + lines.append("| " + " | ".join(["---"] * ncol) + " |") + for r in rows: # dump every row — no pagination, nothing cut. + cells = [_cell(r[c]) if c < len(r) else "" for c in range(ncol)] + lines.append("| " + " | ".join(cells) + " |") + note = getattr(block, "note", None) + if note: + lines.append("") + lines.append(f"*{_clean_terms(note).strip()}*") + return "\n".join(lines) + + +def _bars_table(bars: list) -> str: + """Render extracted bar/histogram data as a Markdown table (Desde/Hasta/Frec).""" + lines = ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |"] + shown = bars[:_MAX_BAR_ROWS] + for x0, x1, h in shown: + lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |") + out = "\n".join(lines) + extra = len(bars) - len(shown) + if extra > 0: + out += f"\n\n*… ({extra} filas más)*" + return out + + +def _extract_bars(fig) -> list: + """Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig. 
+ + Histogram / bar-chart bars are ``matplotlib.patches.Rectangle`` with positive + width and height; spines, legends and zero-area artists are skipped. Never + raises — returns ``[]`` on any problem. + """ + bars: list = [] + try: + for ax in fig.get_axes(): + # Collect this axes' positive-area rectangles, then keep only the ones + # that look like actual histogram/bar bins. Reference shapes that + # matplotlib also stores in ``ax.patches`` — most notably the ``±1σ`` + # band drawn by ``axvspan`` (a single rectangle far wider than a bin) + # and a lone Tukey boxplot box — would otherwise show up as fake + # "bins". A histogram axes has several near-equal-width bars, so we + # drop any rectangle whose width is more than twice the median width + # of that axes' rectangles (the σ-band spans many bins; uniform bins + # all sit at the median width and stay). + ax_bars: list = [] + for patch in list(getattr(ax, "patches", []) or []): + try: + w = patch.get_width() + h = patch.get_height() + x = patch.get_x() + except Exception: # noqa: BLE001 — not a Rectangle-like patch. + continue + if w and w > 0 and h and h > 0: + ax_bars.append((x, x + w, h)) + if len(ax_bars) >= 3: + widths = sorted(b[1] - b[0] for b in ax_bars) + median_w = widths[len(widths) // 2] + if median_w > 0: + ax_bars = [b for b in ax_bars + if (b[1] - b[0]) <= 2.0 * median_w] + bars.extend(ax_bars) + except Exception: # noqa: BLE001 + return [] + return bars + + +def _md_figure(block, meta: dict, out_path: str, counter: list) -> str: + """Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image). + + Emits the caption, then — if the matplotlib figure has bars — a Markdown table + of the underlying (Desde, Hasta, Frecuencia) values. Optionally (when + ``meta['embed_figures']`` is True) also exports a PNG beside the .md and adds + an image link; off by default so the Markdown stays self-contained. 
+ """ + caption = model._safe_str(getattr(block, "caption", "")).strip() + parts = [f"*Figura: {caption}*" if caption else "*Figura*"] + fig = None + try: + import matplotlib + matplotlib.use("Agg") # defensive: headless rasterization backend. + fig = getattr(block, "fig", None) + make = getattr(block, "make", None) + if fig is None and callable(make): + fig = make() + if fig is not None: + bars = _extract_bars(fig) + if bars: + parts.append(_bars_table(bars)) + if meta.get("embed_figures"): + png = _embed_png(fig, out_path, counter) + if png: + parts.append(f"![{caption}]({png})") + except Exception: # noqa: BLE001 — a bad figure degrades to just its caption. + pass + finally: + if fig is not None: + try: + import matplotlib.pyplot as plt + plt.close(fig) + except Exception: # noqa: BLE001 + pass + return "\n\n".join(parts) + + +def _embed_png(fig, out_path: str, counter: list) -> str: + """Export the figure to ``_figN.png`` beside the .md; return its name.""" + try: + counter[0] += 1 + base = os.path.splitext(os.path.basename(out_path))[0] or "figura" + name = f"{base}_fig{counter[0]}.png" + path = os.path.join(os.path.dirname(os.path.abspath(out_path)), name) + fig.savefig(path, format="png", dpi=120, bbox_inches="tight") + return name + except Exception: # noqa: BLE001 + return "" + + +def _md_image(block) -> str: + path = model._safe_str(getattr(block, "path", "")) + caption = model._safe_str(getattr(block, "caption", "")).strip() + out = f"![{caption}]({path})" + if caption: + out += f"\n\n*{caption}*" + return out + + +def _md_caption(block) -> str: + return f"*{_clean_terms(getattr(block, 'text', '')).strip()}*" + + +def _md_note(block) -> str: + text = _clean_terms(getattr(block, "text", "")).strip() + lines = text.split("\n") + return "\n".join((f"> {ln}" if ln.strip() else ">") for ln in lines) + + +def _md_group(block, meta: dict, out_path: str, counter: list) -> str: + parts: list = [] + title = getattr(block, "title", None) + if title: + 
+ parts.append(f"### {_clean_terms(title).strip()}") + for b in (getattr(block, "blocks", []) or []): + try: + seg = _serialize_block(b, meta, out_path, counter) + except Exception: # noqa: BLE001 + seg = "" + if seg: + parts.append(seg) + return "\n\n".join(parts) + + +def _md_glossary_entry(block) -> str: + label = (model._safe_str(getattr(block, "label", "")).strip() + or model._safe_str(getattr(block, "key", "")).strip()) + definition = _clean_terms(getattr(block, "definition", "")).strip() + out = f"### {label}" + if definition: + out += f"\n\n{definition}" + return out + + +def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str: + """Dispatch a single block to its Markdown serializer. Unknown -> note.""" + kind = getattr(block, "kind", "") + if kind == "heading": + return _md_heading(block) + if kind == "markdown": + return _md_markdown(block) + if kind == "kv_table": + return _md_kv_table(block) + if kind == "data_table": + return _md_data_table(block) + if kind == "figure": + return _md_figure(block, meta, out_path, counter) + if kind == "image": + return _md_image(block) + if kind == "caption": + return _md_caption(block) + if kind == "note": + return _md_note(block) + if kind == "group": + return _md_group(block, meta, out_path, counter) + if kind == "glossary_entry": + return _md_glossary_entry(block) + # Unknown content -> readable note (mirrors the model's defensive coercion). + return _md_note(model.Note(text=model._safe_str(block))) + + +# --------------------------------------------------------------------------- # +# Entry point. +# --------------------------------------------------------------------------- # +def render_md(chapters: list, out_path: str, meta: dict = None) -> dict: + """Serialize a list of Chapters into a single self-contained Markdown file. + + The output leads with ``# <title>``, a metadata blockquote and a numbered + ``## Índice`` linking each chapter, then one ``## N. 
<title>`` section per + chapter with its blocks. Tables become Markdown tables (every row dumped), + figures become caption + underlying data table, glossary markers are stripped + while ``**bold**`` is kept. Designed to be pasted into an LLM. + + Args: + chapters: a list of ``Chapter`` (dataclasses or dicts); normalized + defensively with ``model.as_chapters``. + out_path: filesystem path for the ``.md`` (parent dirs are created). + meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with + ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``), + ``generated_at``, ``embed_figures`` (export PNGs beside the .md, + default False). + + Returns: + dict (never raises): ``{path: str|None, n_chars: int, + chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is + None and ``note`` explains why. + """ + meta = meta or {} + chapters = model.as_chapters(chapters) + title = model._safe_str(meta.get("title")) or model.ENGINE_NAME + + # Edge: nothing to render -> a minimal but valid Markdown document. + if not chapters: + content = (f"# {title}\n\n" + "*(documento vacío — sin capítulos aplicables)*\n") + return _write(out_path, content, [], "documento vacío") + + counter = [0] # document-wide figure counter for unique PNG names. + notes: list = [] + segments: list = [f"# {title}"] + + meta_lines = _meta_block(meta) + if meta_lines: + segments.append("\n".join(f"> {ln}" for ln in meta_lines)) + + # Numbered index. The anchor matches the chapter heading emitted below + # (``## N. <title>``) in GitHub slug style. + chap_heads = [] + idx_lines = ["## Índice"] + for i, ch in enumerate(chapters, 1): + head_text = f"{i}. {model._safe_str(ch.title)}" + anchor = _slug(head_text) + chap_heads.append((head_text, anchor)) + idx_lines.append(f"{i}. 
[{model._safe_str(ch.title)}](#{anchor})") + segments.append("\n".join(idx_lines)) + + chapters_meta = [] + for i, ch in enumerate(chapters, 1): + segments.append("---") + head_text, _anchor = chap_heads[i - 1] + segments.append(f"## {head_text}") + + blocks = list(ch.blocks or []) + # Omit a leading level-1 Heading that just repeats the chapter title. + if blocks: + b0 = blocks[0] + if (getattr(b0, "kind", "") == "heading" + and int(getattr(b0, "level", 1) or 1) == 1 + and _clean_terms(getattr(b0, "text", "")).strip() + == model._safe_str(ch.title).strip()): + blocks = blocks[1:] + + for block in blocks: + try: + seg = _serialize_block(block, meta, out_path, counter) + except Exception as e: # noqa: BLE001 + seg = _md_note(model.Note(text=model._safe_str(block))) + notes.append( + f"bloque '{getattr(block, 'kind', '?')}' del capítulo " + f"'{ch.id}' degradado: {e}") + if seg: + segments.append(seg) + chapters_meta.append({"id": ch.id, "version": ch.version}) + + content = "\n\n".join(segments) + "\n" + note = f"{len(content)} caracteres" + if notes: + note += " · " + "; ".join(notes) + return _write(out_path, content, chapters_meta, note) + + +def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict: + """Write the Markdown to disk (creating parents). dict-no-throw.""" + try: + parent = os.path.dirname(os.path.abspath(out_path)) + os.makedirs(parent, exist_ok=True) + with open(out_path, "w", encoding="utf-8") as fh: + fh.write(content) + except Exception as e: # noqa: BLE001 — never raise from the writer. 
+ return {"path": None, "n_chars": 0, "chapters": [], + "note": f"no se pudo escribir el Markdown: {e}"} + return {"path": out_path, "n_chars": len(content), + "chapters": chapters_meta, "note": note} diff --git a/python/functions/datascience/render_automatic_eda_markdown.md b/python/functions/datascience/render_automatic_eda_markdown.md new file mode 100644 index 00000000..6615baf9 --- /dev/null +++ b/python/functions/datascience/render_automatic_eda_markdown.md @@ -0,0 +1,89 @@ +--- +name: render_automatic_eda_markdown +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def render_automatic_eda_markdown(chapters_or_profile, out_path: str, meta: dict = None) -> dict" +description: "Renderiza un documento AutomaticEDA por CAPÍTULOS (modelo de bloques independiente del formato) en un único MARKDOWN autocontenido pensado para PEGAR A UN LLM. Acepta una lista de capítulos del modelo o directamente un TableProfile del grupo eda (construye los capítulos canónicos con build_document). Prioriza TEXTO + DATOS sobre lo visual: las tablas se vuelcan como tablas markdown con TODAS las filas (sin paginar — no hay páginas que cortar), una figura matplotlib se reduce a su caption más la tabla de datos subyacente (Desde/Hasta/Frecuencia de las barras del histograma) porque un LLM no ve la imagen, y los marcadores de glosario se eliminan conservando el **negrita**. Lleva cabecera (# título), bloque de metadatos en blockquote e índice numerado con anclas GitHub. Espejo de render_automatic_eda_pdf/render_automatic_eda_pptx pero SIN manifest (KISS, el markdown es un único artefacto de texto). dict-no-throw: nunca lanza, devuelve {path, n_chars, chapters, note}; en error fatal path es None y note explica la causa. Flag opcional meta['embed_figures'] exporta PNGs junto al .md (off por defecto)." 
+tags: [eda, markdown, render, report, llm, automatic-eda, chapters, versioned, no-cut, text, datascience, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [os, re, matplotlib, "datascience.automatic_eda"] +params: + - name: chapters_or_profile + desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note, group, glossary_entry. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza." + - name: out_path + desc: "ruta del archivo .md de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path:None, note:<causa>} sin lanzar." + - name: meta + desc: "dict opcional. Claves: title (título del documento), ctx (dict con dataset_name→Dataset, source_origin→Fuente, storage→Almacenamiento, n_rows/n_cols→Dimensiones; también lo consumen los builders de capítulo cuando se da un profile), generated_at (timestamp; si falta se genera ISO UTC), embed_figures (True para exportar PNGs <basename>_figN.png junto al .md; por defecto False y el markdown queda autocontenido)." +output: "dict (nunca lanza): {path: str|None, n_chars: int, chapters: list[{id,version}], note: str}. En error fatal (p.ej. directorio no escribible) path es None y note explica la causa. Un documento sin capítulos aplicables produce un markdown mínimo válido con 'documento vacío' y chapters=[]." 
+tested: true +tests: ["test_golden_bloques_sinteticos_serializa_todo_a_markdown", "test_edge_documento_vacio_no_revienta", "test_profile_path_construye_capitulos_y_escribe"] +test_file_path: "python/functions/datascience/render_automatic_eda_markdown_test.py" +file_path: "python/functions/datascience/render_automatic_eda_markdown.py" +--- + +## Ejemplo + +```python +from datascience import render_automatic_eda_markdown + +# Desde un TableProfile del grupo eda (mismo modelo que los renderers PDF/PPTX). +profile = { + "table": "ventas", "source": "/data/ventas.csv", + "n_rows": 1000, "n_cols": 2, "quality_score": 92.5, + "columns": [ + {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01, + "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0, + "std": 12.3}}, + {"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0, + "categorical": {"top": [{"value": "neumaticos", "count": 500}]}}, + ], +} +res = render_automatic_eda_markdown( + profile, "reports/ventas_aeda.md", + {"title": "EDA — ventas", + "ctx": {"dataset_name": "Ventas", "source_origin": "ERP export", + "n_rows": 1000, "n_cols": 2}}) +print(res["path"], res["n_chars"], res["chapters"]) +# -> reports/ventas_aeda.md 4123 [{'id':'portada','version':'1.0.0'}, ...] +``` + +## Cuando usarla + +Cuando quieras **pegar el EDA a un LLM** (ChatGPT, Claude, ...) o tenerlo en texto +plano versionable: mismo documento por capítulos que el PDF/PPTX, pero serializado a +Markdown sin binarios. Úsala como tercera salida junto a `render_automatic_eda_pdf` +(móvil) y `render_automatic_eda_pptx` (compartir) desde el MISMO modelo de capítulos. +A diferencia de esas dos, no hay páginas ni slides: todas las filas de cada tabla se +vuelcan (nada se corta) y cada figura se reduce a su caption + la tabla de datos +subyacente, que es lo que un LLM puede leer. Para añadir capítulos al documento, ver +`docs/capabilities/automatic_eda.md`. 
+ +## Gotchas + +- **Impura**: escribe el `.md` en `out_path` (crea los directorios padre). Con + `meta['embed_figures']=True` además exporta un PNG `<basename>_figN.png` por figura + junto al `.md`; por defecto NO exporta nada y el markdown queda autocontenido. +- **Nunca lanza** (dict-no-throw): un bloque que falle se degrada a una nota y se anota + en `note`; el documento se escribe igual. Un profile/lista vacíos producen un markdown + mínimo válido con `*(documento vacío …)*` y `chapters=[]`. +- **Figuras = datos, no imagen**: un bloque `figure` se serializa como `*Figura: caption*` + más, si la figura matplotlib trae barras (histograma / barras), una tabla + `| Desde | Hasta | Frecuencia |` extraída de los `Rectangle` patches (máx 100 filas; + el resto se trunca con `*… (N filas más)*`). Si no hay barras o algo falla, solo sale + el caption. La figura se cierra (`plt.close`) tras leerla. +- **Glosario vs negrita**: se eliminan SOLO los marcadores de glosario + `[[term:key]]visible[[/term]]` (queda `visible`); el `**negrita**` markdown SE + CONSERVA (es válido). No se usa `strip_inline_md` aquí porque ese también quita el bold. +- **Anclas del índice**: el `## Índice` enlaza cada capítulo con un ancla estilo GitHub + del encabezado `## N. Título` (minúsculas, espacios→`-`, sin signos). Si dos capítulos + comparten título exacto sus anclas colisionan (caso raro; los capítulos canónicos tienen + títulos únicos). +- **Tablas**: las celdas escapan `|` (→ `\|`) y pliegan saltos de línea a `<br>` para no + romper la columna. No hay reparto por ancho — un LLM no lo necesita. diff --git a/python/functions/datascience/render_automatic_eda_markdown.py b/python/functions/datascience/render_automatic_eda_markdown.py new file mode 100644 index 00000000..649b2cd1 --- /dev/null +++ b/python/functions/datascience/render_automatic_eda_markdown.py @@ -0,0 +1,55 @@ +"""render_automatic_eda_markdown — chapter-based EDA report as one Markdown file. 
+ +Public ``eda``-group entry point that serializes an AutomaticEDA document (a list +of chapters, or an ``eda`` TableProfile from which the canonical chapters are +built) into a single self-contained Markdown file optimised to be **pasted into +an LLM**: plain text, Markdown tables (every row dumped — there are no pages to +cut), figures reduced to caption + underlying data, no binaries. It mirrors +``render_automatic_eda_pdf`` / ``render_automatic_eda_pptx`` but for text output; +unlike those it writes no manifest (KISS — Markdown is a single text artefact). + +dict-no-throw: never raises. Returns ``{path, n_chars, chapters, note}``; on a +fatal error ``path`` is None and ``note`` explains why. +""" + +from __future__ import annotations + +from datascience.automatic_eda import build_document, render_md +from datascience.automatic_eda.model import as_chapter, as_chapters + + +def _coerce_chapters(chapters_or_profile, meta: dict) -> list: + """Accept chapters OR an eda profile and return a list of Chapter.""" + arg = chapters_or_profile + if isinstance(arg, (list, tuple)): + return as_chapters(list(arg)) + if isinstance(arg, dict): + if "blocks" in arg and "columns" not in arg: + ch = as_chapter(arg) + return [ch] if ch is not None else [] + return build_document(arg, (meta or {}).get("ctx")) + return [] + + +def render_automatic_eda_markdown(chapters_or_profile, out_path: str, + meta: dict = None) -> dict: + """Render an AutomaticEDA document into a single self-contained Markdown file. + + Args: + chapters_or_profile: a list of chapters (``Chapter`` dataclasses or + dicts) or an ``eda`` TableProfile dict (chapters built via + ``build_document(profile, meta['ctx'])``). + out_path: filesystem path for the ``.md`` (parent dirs are created). + meta: optional dict. 
Recognised keys: ``title``, ``ctx`` (dict with + ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``), + ``generated_at``, ``embed_figures`` (export PNGs beside the .md, + default False — off keeps the Markdown self-contained). + + Returns: + dict (never raises): ``{path: str|None, n_chars: int, + chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is + None and ``note`` explains the cause. + """ + meta = dict(meta or {}) + chapters = _coerce_chapters(chapters_or_profile, meta) + return render_md(chapters, out_path, meta) diff --git a/python/functions/datascience/render_automatic_eda_markdown_test.py b/python/functions/datascience/render_automatic_eda_markdown_test.py new file mode 100644 index 00000000..5d77ee10 --- /dev/null +++ b/python/functions/datascience/render_automatic_eda_markdown_test.py @@ -0,0 +1,168 @@ +"""Tests for render_automatic_eda_markdown — DoD: golden + edge + profile path. + +Self-contained synthetic blocks (no DuckDB). Verifies every block kind serializes +to Markdown (heading, markdown with glossary+bold, kv/data tables, a figure whose +histogram bars become a data table, caption, note, group, glossary entry), that a +leading level-1 heading equal to the chapter title is omitted, that an empty +document degrades to a valid minimal Markdown without raising, and that passing a +minimal TableProfile builds chapters and writes the file. +""" + +import os +import tempfile + +from datascience.render_automatic_eda_markdown import render_automatic_eda_markdown +from datascience.automatic_eda.model import ( + Caption, Chapter, DataTable, Figure, GlossaryEntry, Group, Heading, KVTable, + Markdown, Note, +) + + +def _hist_fig(): + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + ax.hist([1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5], bins=5) + return fig + + +def _chapters() -> list: + blocks = [ + Heading("Demo", 1), # == chapter title -> omitted. 
def _read(path: str) -> str:
    """Return the whole content of the UTF-8 text file at ``path``."""
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return text


def test_golden_bloques_sinteticos_serializa_todo_a_markdown():
    """Golden path: the synthetic chapter is serialized to a Markdown file.

    Checks the returned summary dict and then the written text: document
    skeleton, table dump, glossary-marker stripping and heading levels.
    """
    meta = {
        "title": "EDA Demo",
        "ctx": {"dataset_name": "Demo", "n_rows": 12, "n_cols": 2},
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        md_path = os.path.join(tmp_dir, "demo.md")
        result = render_automatic_eda_markdown(_chapters(), md_path, meta)

        # The summary must describe the file that was actually written.
        assert result["path"] == md_path
        assert os.path.exists(md_path)
        assert result["n_chars"] > 0
        assert result["chapters"] == [{"id": "demo", "version": "1.0.0"}]

        text = _read(md_path)

        # The document opens with a top-level heading.
        assert text.startswith("# ")

        must_contain = (
            # Document structure.
            "## Índice",
            # A Markdown table is present (header + separator row).
            "| ",
            "| --- ",
            # DataTable values are all dumped.
            "alpha", "111", "beta", "222", "gamma", "333",
            # Bold survives; the visible glossary text is preserved.
            "**bold**",
            "entropia",
            # Figure histogram bars became a data table.
            "| Desde | Hasta | Frecuencia |",
            # Glossary entry rendered as a level-3 heading.
            "### Entropia",
            # Level-2 heading -> ####.
            "#### Seccion dos",
            # Group title rendered.
            "### Grupo X",
        )
        for fragment in must_contain:
            assert fragment in text, fragment

        must_not_contain = (
            # Glossary markers stripped.
            "[[term",
            "[[/term]]",
            # Leading level-1 heading equal to the title was omitted.
            "### Demo",
        )
        for fragment in must_not_contain:
            assert fragment not in text, fragment


def _hist_fig_with_span():
    """Build a 5-bin histogram overlaid with a wide ``axvspan`` band.

    Mirrors the shape of the num_distr figure: the span is an extra rectangle
    drawn over the bin bars (from x=2 to x=4, far wider than one bin), and the
    Markdown renderer must not report it as a histogram bin.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    samples = [1] * 2 + [2] * 3 + [3] + [4] * 2 + [5] * 4
    figure, axes = plt.subplots()
    axes.hist(samples, bins=5)
    # mean±σ band — a wide stray rectangle.
    axes.axvspan(2.0, 4.0, alpha=0.2)
    return figure


def test_figura_descarta_axvspan_de_la_tabla_de_bins():
    """The ±1σ band rectangle must not appear as a row in the bins table."""
    header = "| Desde | Hasta | Frecuencia |"
    figure_block = Figure(make=_hist_fig_with_span, caption="Hist con banda")
    chapters = [
        Chapter(id="f", title="Fig", version="1.0.0", blocks=[figure_block]),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        md_path = os.path.join(tmp_dir, "fig.md")
        render_automatic_eda_markdown(chapters, md_path, {"title": "T"})
        text = _read(md_path)

    assert header in text

    # Locate the table header, skip it and the separator row, then take every
    # consecutive table line that follows.
    all_lines = text.splitlines()
    header_idx = next(
        idx for idx, line in enumerate(all_lines) if line.startswith(header))
    first_row = header_idx + 2
    past_last_row = first_row
    while (past_last_row < len(all_lines)
           and all_lines[past_last_row].startswith("|")):
        past_last_row += 1
    table_rows = all_lines[first_row:past_last_row]

    # 5 histogram bins, no extra wide span row.
    assert len(table_rows) == 5, table_rows

    # No row spans a width of ~2.0 (the axvspan from x=2 to x=4).
    for row in table_rows:
        cells = [cell.strip() for cell in row.strip("|").split("|")]
        low = float(cells[0])
        high = float(cells[1])
        assert (high - low) < 1.5, f"wide span leaked: {row}"


def test_edge_documento_vacio_no_revienta():
    """Edge case: an empty chapter list is rendered without failing."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        md_path = os.path.join(tmp_dir, "empty.md")
        result = render_automatic_eda_markdown([], md_path, {})

        assert result["path"] == md_path
        assert os.path.exists(md_path)
        assert result["chapters"] == []

        text = _read(md_path)
        assert "documento vacío" in text
        assert text.startswith("# ")


def test_profile_path_construye_capitulos_y_escribe():
    """Passing a profile dict instead of chapters still writes the file."""
    numeric_stats = {
        "mean": 1.0,
        "median": 1.0,
        "min": 0.0,
        "max": 2.0,
        "std": 0.5,
    }
    column_x = {
        "name": "x",
        "inferred_type": "numeric",
        "null_pct": 0.0,
        "null_count": 0,
        "numeric": numeric_stats,
    }
    profile = {
        "table": "mini",
        "source": "/data/mini.csv",
        "n_rows": 10,
        "n_cols": 1,
        "quality_score": 88.0,
        "columns": [column_x],
    }
    meta = {"title": "Mini", "ctx": {"dataset_name": "Mini"}}
    with tempfile.TemporaryDirectory() as tmp_dir:
        md_path = os.path.join(tmp_dir, "mini.md")
        result = render_automatic_eda_markdown(profile, md_path, meta)

        # not None — no exception, file written.
        assert result["path"] == md_path
        assert os.path.exists(md_path)
        assert result["n_chars"] > 0
Compone, sin reimplementar su lógica, cuatro funciones del registry: +PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus tres formatos a la +vez (PDF móvil A5 + PPTX 16:9 + Markdown autocontenido para pegar a un LLM) con +los capítulos POBLADOS, en una sola llamada. Compone, sin reimplementar su +lógica, varias funciones del registry: - profile_table : perfila la tabla end-to-end (TableProfile agregado), opcionalmente con modelos baratos y análisis de serie. @@ -12,8 +13,11 @@ llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry: modelos/geo, timeseries_raw para series, geo_points para el mapa, db_path/table para la agregación push-down). Sin él, esos capítulos degradan. - - render_automatic_eda_pdf : renderiza el documento por capítulos a PDF. - - render_automatic_eda_pptx : renderiza el mismo documento a PPTX. + - render_automatic_eda_pdf : renderiza el documento por capítulos a PDF. + - render_automatic_eda_pptx : renderiza el mismo documento a PPTX. + - render_automatic_eda_markdown : serializa el mismo documento a Markdown + autocontenido (texto + tablas markdown, sin + binarios) para incorporar a un LLM. El TableProfile agregado basta para portada/overview/distribuciones/calidad/ correlación, pero los capítulos `modelos`, `timeseries`, `geospatial` y @@ -32,6 +36,7 @@ from datetime import datetime, timezone from datascience import ( build_eda_render_ctx, + render_automatic_eda_markdown, render_automatic_eda_pdf, render_automatic_eda_pptx, run_eda_models, @@ -93,6 +98,7 @@ def render_automatic_eda( out_dir: str = "reports", basename: str = None, ctx_extra: dict = None, + emit_md: bool = True, ) -> dict: """Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX). @@ -140,13 +146,19 @@ def render_automatic_eda( ctx_extra: dict opcional con claves de presentación/contexto extra que se mezclan en el ctx (p.ej. dataset_name, description, source_origin). 
No pisan las claves de datos calculadas por build_eda_render_ctx. + emit_md: además del PDF y el PPTX, emite un Markdown autocontenido del + MISMO documento por capítulos (texto plano + tablas markdown, sin + binarios), pensado para pegar a un LLM. Default True. La ruta sale en + la clave de retorno ``aeda_md_path``. No altera las demás salidas. Returns: dict (nunca lanza). En éxito:: {"status": "ok", "pdf_path": str, "pptx_path": str, - "manifest_path": str|None, "n_pages": int, "n_slides": int, - "pdf_note": str, "pptx_note": str, "profile": <TableProfile>} + "aeda_md_path": str|None, "manifest_path": str|None, + "n_pages": int, "n_slides": int, "md_chars": int|None, + "pdf_note": str, "pptx_note": str, "md_note": str|None, + "profile": <TableProfile>} En error: {"status": "error", "error": str}. """ @@ -243,15 +255,26 @@ def render_automatic_eda( rpdf = render_automatic_eda_pdf(prof, pdf_path, meta) or {} rpptx = render_automatic_eda_pptx(prof, pptx_path, meta) or {} + # Salida Markdown autocontenida (mismo documento por capítulos) para + # pegar a un LLM. Aditiva: no afecta a PDF/PPTX/manifest. dict-no-throw. + rmd = {} + md_path = None + if emit_md: + md_path = os.path.join(out_dir, base + ".md") + rmd = render_automatic_eda_markdown(prof, md_path, meta) or {} + return { "status": "ok", "pdf_path": rpdf.get("path"), "pptx_path": rpptx.get("path"), + "aeda_md_path": rmd.get("path"), "manifest_path": rpdf.get("manifest_path"), "n_pages": rpdf.get("n_pages"), "n_slides": rpptx.get("n_slides"), + "md_chars": rmd.get("n_chars"), "pdf_note": rpdf.get("note"), "pptx_note": rpptx.get("note"), + "md_note": rmd.get("note"), "profile": prof, } except Exception as e: # noqa: BLE001 — dict-no-throw: degradar, nunca lanzar.