From fc5bc334c8907934a6726dde345ba56cbeb5e92e Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 15:01:26 +0200 Subject: [PATCH 1/2] =?UTF-8?q?feat(eda):=20cap=C3=ADtulo=20AN=C3=81LISIS?= =?UTF-8?q?=20LLM=20para=20AutomaticEDA,=20junto=20al=20overview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nuevo capítulo `analisis_llm` del motor AutomaticEDA. Consume el bloque `llm` que `eda_llm_insights` (grupo eda) ya deja en el TableProfile —no llama al LLM ni recalcula— y lo convierte en bloques del modelo de documento para que se renderice sin cortarse en PDF ni PPTX: - Resumen de la tabla y significado de una fila -> bloques Markdown (el renderer los envuelve a líneas completas, nunca pierde texto). - Diccionario de datos y PII -> DataTable (el paginador parte por filas repitiendo cabecera y envuelve celdas largas dentro de su columna). - Análisis sugeridos y limpieza sugerida -> listas de viñetas Markdown; cada entrada es una línea completa que el renderer envuelve, nunca trunca. Lectura defensiva (.get) en todo; devuelve None si el profile no trae bloque `llm` (p.ej. profile_table sin run_llm) para omitir el capítulo. MUST-3.2 (report 2043): se mueve `analisis_llm` en CHAPTER_ORDER a la posición inmediatamente posterior a `overview`, como pidió el usuario ("va junto al overview"). 
Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../automatic_eda/chapters/analisis_llm.py    | 239 ++++++++++++++++++
 .../automatic_eda/chapters_registry.py        |   2 +-
 2 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 python/functions/datascience/automatic_eda/chapters/analisis_llm.py

diff --git a/python/functions/datascience/automatic_eda/chapters/analisis_llm.py b/python/functions/datascience/automatic_eda/chapters/analisis_llm.py
new file mode 100644
index 00000000..e182e6a0
--- /dev/null
+++ b/python/functions/datascience/automatic_eda/chapters/analisis_llm.py
@@ -0,0 +1,239 @@
+"""LLM analysis chapter (ANÁLISIS LLM) — the interpretive layer, next to overview.
+
+Third reference chapter for AutomaticEDA. Renders the ``llm`` block that the
+``eda`` group function ``eda_llm_insights`` already produced and stored in the
+``TableProfile`` — it does NOT call the LLM nor recompute anything. The block is
+turned into clean, markdown-style document blocks so it reads as a real chapter
+(table summary, row meaning, data dictionary, suggested analyses, cleaning
+suggestions, PII findings) and, crucially, **nothing is ever cut** in PDF or
+PPTX:
+
+* Prose (summary, row meaning) → ``Markdown`` blocks the renderers wrap to whole
+  lines, so no word is lost no matter how long the text is.
+* The data dictionary and PII findings → ``DataTable`` blocks the paginator
+  splits by rows (repeating the header) and whose long cells wrap inside their
+  column — wide, multi-row tables never overflow a page/slide.
+* Cleaning suggestions and suggested analyses → ``Markdown`` bullet lists; each
+  item is a whole line the renderer wraps, never truncated mid-entry.
+
+Position: this chapter is declared in ``chapters_registry.CHAPTER_ORDER`` right
+after ``overview`` so the interpretation sits next to the table preview, as the
+user asked ("va junto al overview").
+
+Data source: the ``llm`` dict produced by ``eda_llm_insights`` (group ``eda``),
+read from ``profile['llm']`` (or ``ctx['llm']`` as a fallback). Shape::
+
+    {
+      "summary": str,            # what the table is, 2-3 sentences
+      "row_meaning": str,        # what one row represents / granularity
+      "dictionary": [ {"column","description","business_meaning","unit"} ],
+      "pii": [ {"column","kind","severity"} ],
+      "cleaning": [str],         # cleaning / transformation suggestions
+      "analyses": [str],         # suggested questions / analyses / hypotheses
+    }
+
+Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+Reads everything defensively (``.get``) and NEVER raises; returns ``None`` when
+the profile carries no LLM block (e.g. ``profile_table`` ran without
+``run_llm``), so the chapter is simply omitted from the document.
+"""
+
+from __future__ import annotations
+
+from .. import model
+
+CHAPTER_VERSION = "1.0.0"
+CHAPTER_ID = "analisis_llm"
+CHAPTER_TITLE = "Análisis LLM"
+
+# Key under which eda_llm_insights stores its interpretive block in the profile.
+LLM_KEY = "llm"
+
+# Cell shown when an entry has content but does not name its column.
+_NO_COLUMN = "—"
+
+# Titles shared by a section heading and its table, kept in one place so the
+# two cannot drift apart.
+_DICTIONARY_TITLE = "Diccionario de datos"
+_PII_TITLE = "Datos personales (PII / RGPD)"
+
+
+def _as_dict(value) -> dict:
+    """Return ``value`` if it is a dict, else an empty dict.
+
+    Lets callers chain ``.get`` on untrusted input (``None``, a list, a string)
+    without raising ``AttributeError``.
+    """
+    return value if isinstance(value, dict) else {}
+
+
+def _clean_text(value) -> str:
+    """Coerce a value to a single trimmed line (collapse inner newlines).
+
+    Used for bullet items and table cells so each one stays a single unit the
+    renderer wraps; never drops content, only normalizes whitespace.
+    """
+    # ``split()`` with no separator also discards leading/trailing whitespace,
+    # so a blank value collapses to "".
+    return " ".join(model._safe_str(value).split())
+
+
+def _para(value) -> str:
+    """Coerce a value to trimmed prose, preserving paragraph breaks.
+
+    Runs of spaces/tabs inside a line collapse to one space and consecutive
+    blank lines collapse to a single one; single line breaks are kept.
+    """
+    out: list[str] = []
+    for raw in model._safe_str(value).strip().splitlines():
+        line = " ".join(raw.split())
+        # A blank line is kept only as a separator after real content.
+        if line or (out and out[-1] != ""):
+            out.append(line)
+    return "\n".join(out).strip()
+
+
+def _bullets(items) -> str:
+    """Build a markdown bullet list from a sequence of strings.
+
+    Each item becomes one ``- ...`` line (a whole, wrappable unit). A bare
+    string counts as a single item, empty items are skipped, and any other
+    non-list input yields "". Returns "" when there is nothing to list.
+    """
+    if isinstance(items, str):
+        items = [items]
+    if not isinstance(items, (list, tuple)):
+        return ""
+    cleaned = (_clean_text(item) for item in items)
+    return "\n".join(f"- {text}" for text in cleaned if text)
+
+
+def _summary_blocks(llm: dict) -> list:
+    """Heading + prose for the table summary, or [] if absent."""
+    text = _para(llm.get("summary"))
+    if not text:
+        return []
+    return [model.Heading(text="Resumen de la tabla", level=2),
+            model.Markdown(text=text)]
+
+
+def _row_meaning_blocks(llm: dict) -> list:
+    """Heading + prose for what one row represents, or [] if absent."""
+    text = _para(llm.get("row_meaning"))
+    if not text:
+        return []
+    return [model.Heading(text="Significado de una fila", level=2),
+            model.Markdown(text=text)]
+
+
+def _table_rows(entries, keys: tuple, bare_key: str) -> list:
+    """Turn LLM entries into table rows, one cell per key in ``keys``.
+
+    ``keys[0]`` is the column-name field; when an entry omits it the cell shows
+    ``_NO_COLUMN``. A non-dict entry (e.g. a bare string) is placed in the
+    ``bare_key`` cell so it is still reported. Entries with no usable text at
+    all are skipped, so an empty ``{}`` never renders as a row of placeholders.
+    Returns [] for a non-list input.
+    """
+    if not isinstance(entries, (list, tuple)):
+        return []
+    rows = []
+    for entry in entries:
+        if not isinstance(entry, dict):
+            entry = {bare_key: entry}
+        cells = [_clean_text(entry.get(key)) for key in keys]
+        if not any(cells):
+            continue
+        cells[0] = cells[0] or _NO_COLUMN
+        rows.append(cells)
+    return rows
+
+
+def _dictionary_block(llm: dict):
+    """DataTable for the data dictionary, or None if absent/empty.
+
+    Columns: Columna / Descripción / Significado de negocio / Unidad. The
+    paginator splits this by rows repeating the header and wraps long cells, so a
+    long dictionary (many columns) never gets cut. A bare string entry still
+    shows up as a description row.
+    """
+    rows = _table_rows(
+        llm.get("dictionary"),
+        ("column", "description", "business_meaning", "unit"),
+        bare_key="description")
+    if not rows:
+        return None
+    header = ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+    return model.DataTable(header=header, rows=rows, title=_DICTIONARY_TITLE)
+
+
+def _analyses_blocks(llm: dict) -> list:
+    """Heading + bullet list of suggested analyses, or [] if absent."""
+    bullets = _bullets(llm.get("analyses"))
+    if not bullets:
+        return []
+    return [model.Heading(text="Análisis sugeridos", level=2),
+            model.Markdown(text=bullets)]
+
+
+def _cleaning_blocks(llm: dict) -> list:
+    """Heading + bullet list of cleaning suggestions, or [] if absent."""
+    bullets = _bullets(llm.get("cleaning"))
+    if not bullets:
+        return []
+    return [model.Heading(text="Limpieza sugerida", level=2),
+            model.Markdown(text=bullets)]
+
+
+def _pii_block(llm: dict):
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    A bare string entry is taken as the affected column name, so the finding is
+    reported rather than silently dropped.
+    """
+    rows = _table_rows(
+        llm.get("pii"), ("column", "kind", "severity"), bare_key="column")
+    if not rows:
+        return None
+    return model.DataTable(
+        header=["Columna", "Tipo", "Severidad"], rows=rows, title=_PII_TITLE,
+        note="detección automática orientativa — revisar antes de tratar los datos")
+
+
+def build_analisis_llm(profile: dict | None, ctx: dict | None):
+    """Build the LLM analysis Chapter, or None if there is no LLM block.
+
+    Consumes ``profile['llm']`` (the block produced by ``eda_llm_insights``,
+    group ``eda``) and falls back to ``ctx['llm']`` when the profile carries no
+    non-empty dict there. A ``profile``/``ctx`` that is not a dict is treated
+    as empty instead of raising. Returns ``None`` when no LLM block is present
+    or it carries no usable content, so the chapter is omitted rather than
+    rendering an empty section.
+    """
+    llm = (_as_dict(_as_dict(profile).get(LLM_KEY))
+           or _as_dict(_as_dict(ctx).get(LLM_KEY)))
+    if not llm:
+        return None
+
+    blocks: list = []
+    blocks += _summary_blocks(llm)
+    blocks += _row_meaning_blocks(llm)
+
+    dict_block = _dictionary_block(llm)
+    if dict_block is not None:
+        blocks.append(model.Heading(text=_DICTIONARY_TITLE, level=2))
+        blocks.append(dict_block)
+
+    blocks += _analyses_blocks(llm)
+    blocks += _cleaning_blocks(llm)
+
+    pii_block = _pii_block(llm)
+    if pii_block is not None:
+        blocks.append(model.Heading(text=_PII_TITLE, level=2))
+        blocks.append(pii_block)
+
+    if not blocks:
+        return None  # LLM block present but every field empty → omit chapter.
+
+    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                         version=CHAPTER_VERSION, blocks=blocks)
diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py
index 1d6743f4..6dd73237 100644
--- a/python/functions/datascience/automatic_eda/chapters_registry.py
+++ b/python/functions/datascience/automatic_eda/chapters_registry.py
@@ -28,12 +28,12 @@ from .
import model CHAPTER_ORDER = [ "portada", # cover "overview", # df.head + columns/types/nulls/examples + describe + "analisis_llm", # LLM interpretation — sits next to overview (user request) "num_distr", # numeric distributions "cat_distr", # categorical distributions "calidad", # data quality "correlacion", # correlations / associations "modelos", # cheap models (PCA/KMeans/outliers) - "analisis_llm", # LLM interpretation "timeseries", # time-series analysis "geospatial", # geospatial "agregacion", # aggregations / pivots From af1dd9bcc2649a9f4f7b7f5606d3dbce19696c11 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 15:01:26 +0200 Subject: [PATCH 2/2] =?UTF-8?q?test(eda):=20tests=20del=20cap=C3=ADtulo=20?= =?UTF-8?q?AN=C3=81LISIS=20LLM=20(golden=20+=20edges=20+=20anti-cortes)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suite self-contained (perfil sintético + un golden, sin DuckDB): - golden: build_analisis_llm devuelve el Chapter y el documento entero renderiza a PDF y PPTX con resumen, análisis sugeridos, limpieza y una columna del diccionario presentes. - orden: el capítulo queda inmediatamente después de `overview`. - edges: profile sin bloque `llm` (o None/{}/malformado/llm vacío) -> None sin lanzar; fallback a ctx['llm']. - anti-cortes: diccionario de 40 filas + sugerencia de limpieza de ~150 chars se reparten en varias páginas/slides sin perder ninguna fila ni palabra. 
Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../chapters/analisis_llm_test.py             | 202 ++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py

diff --git a/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py b/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py
new file mode 100644
index 00000000..2b32470a
--- /dev/null
+++ b/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py
@@ -0,0 +1,202 @@
+"""Tests for the ANÁLISIS LLM chapter — DoD: golden + edges + anti-cut.
+
+Self-contained: builds a synthetic TableProfile carrying an ``llm`` block (the
+shape ``eda_llm_insights`` produces) so the suite is fast and deterministic — no
+DuckDB and no LLM call. Verifies:
+
+* golden — ``build_analisis_llm`` yields the chapter and the full document
+  renders to PDF *and* PPTX with the summary, a suggested analysis, a cleaning
+  suggestion and a dictionary column all present;
+* order — the chapter sits immediately after ``overview`` (user requirement);
+* edges — a profile with no ``llm`` block (or None/empty/malformed) returns
+  ``None`` and never raises;
+* anti-cut — a long dictionary (40 rows) and a ~170-char cleaning suggestion
+  are rendered to PDF and PPTX without losing a single row or word.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.chapters.analisis_llm import (
+    build_analisis_llm, CHAPTER_VERSION)
+from datascience.automatic_eda.chapters_registry import build_document
+from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _profile() -> dict:
+    """Synthetic TableProfile with a fully populated ``llm`` block.
+
+    Each ``*TOKEN`` word marks one LLM field so the tests can find it in the
+    rendered output.
+    """
+    return {
+        "table": "ventas",
+        "source": "/data/ventas.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 1000,
+        "n_cols": 2,
+        "quality_score": 92.5,
+        "columns": [
+            {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
+             "null_count": 0,
+             "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
+                         "max": 100.0, "std": 12.3}},
+            {"name": "categoria", "inferred_type": "categorical",
+             "null_pct": 0.0, "null_count": 0,
+             "categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
+        ],
+        "llm": {
+            "summary": "Tabla de ventas por producto. Token SUMMARYTOKEN.",
+            "row_meaning": "Cada fila es una venta. Token ROWTOKEN.",
+            "dictionary": [
+                {"column": "precio", "description": "Precio unitario DESCTOKEN",
+                 "business_meaning": "Ingreso por unidad", "unit": "EUR"},
+                {"column": "categoria", "description": "Familia de producto",
+                 "business_meaning": "Segmento comercial", "unit": ""},
+            ],
+            "pii": [{"column": "categoria", "kind": "ninguno", "severity": "low"}],
+            "cleaning": ["Quitar nulos de precio CLEANTOKEN",
+                         "Normalizar mayusculas en categoria"],
+            "analyses": ["Estudiar relacion precio-categoria ANALYSISTOKEN",
+                         "Detectar outliers de precio"],
+        },
+    }
+
+
+def _pdf_text(path: str) -> str:
+    """Return the text of every PDF page, whitespace-collapsed."""
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    """Return all text-frame and table-cell text, whitespace-collapsed."""
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                parts.extend(cell.text for row in sh.table.rows
+                             for cell in row.cells)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _missing_tokens(tokens, text: str) -> list:
+    """Return the tokens that do not occur in ``text``.
+
+    A match may not be followed by a digit, so ``fila1`` is not satisfied by
+    ``fila10``.
+    """
+    return [tok for tok in tokens
+            if not re.search(re.escape(tok) + r"(?!\d)", text)]
+
+
+def test_golden_build_y_render_pdf_pptx():
+    prof = _profile()
+    ch = build_analisis_llm(prof, {})
+    assert isinstance(ch, Chapter)
+    assert ch.id == "analisis_llm"
+    assert ch.version == CHAPTER_VERSION
+    assert ch.blocks  # non-empty.
+
+    # One marker per LLM field that must survive rendering: summary, row
+    # meaning, suggested analyses, cleaning and a data-dictionary cell.
+    tokens = ("SUMMARYTOKEN", "ROWTOKEN", "ANALYSISTOKEN", "CLEANTOKEN",
+              "DESCTOKEN")
+
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(prof, out_pdf, {"title": "EDA — ventas"})
+        assert res["path"] == out_pdf and os.path.exists(out_pdf)
+        ids = [c["id"] for c in res["chapters"]]
+        assert "analisis_llm" in ids
+        assert _missing_tokens(tokens, _pdf_text(out_pdf)) == []
+
+        out_pptx = os.path.join(d, "eda.pptx")
+        res2 = render_automatic_eda_pptx(prof, out_pptx, {"title": "EDA — ventas"})
+        assert res2["path"] == out_pptx and os.path.exists(out_pptx)
+        ids2 = [c["id"] for c in res2["chapters"]]
+        assert "analisis_llm" in ids2
+        assert _missing_tokens(tokens, _pptx_text(out_pptx)) == []
+
+
+def test_orden_capitulo_junto_a_overview():
+    chapters = build_document(_profile(), {})
+    ids = [c.id for c in chapters]
+    assert "overview" in ids and "analisis_llm" in ids
+    # User requirement: the LLM chapter sits right after overview.
+    assert ids.index("analisis_llm") == ids.index("overview") + 1
+
+
+def test_edge_sin_llm_devuelve_none():
+    # No llm block at all.
+    prof = {k: v for k, v in _profile().items() if k != "llm"}
+    assert build_analisis_llm(prof, {}) is None
+    # None / empty / malformed never raise and yield None.
+    assert build_analisis_llm(None, None) is None
+    assert build_analisis_llm({}, {}) is None
+    assert build_analisis_llm({"llm": {}}, {}) is None
+    assert build_analisis_llm({"llm": "not-a-dict"}, {}) is None
+    # All-empty fields → omitted (no blocks).
+    empty = {"llm": {"summary": "", "dictionary": [], "cleaning": [],
+                     "analyses": [], "pii": [], "row_meaning": ""}}
+    assert build_analisis_llm(empty, {}) is None
+
+
+def test_edge_llm_via_ctx_fallback():
+    # The block may arrive in ctx instead of the profile.
+    prof = {k: v for k, v in _profile().items() if k != "llm"}
+    ctx = {"llm": {"summary": "Resumen via ctx CTXTOKEN."}}
+    ch = build_analisis_llm(prof, ctx)
+    assert ch is not None and ch.id == "analisis_llm"
+    # The chapter must carry the ctx content, not merely exist.
+    assert any("CTXTOKEN" in str(getattr(b, "text", "")) for b in ch.blocks)
+
+
+def test_anti_cortes_diccionario_largo_y_limpieza_larga():
+    long_clean = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do "
+                  "eiusmod tempor incididunt ut labore et dolore magna aliqua "
+                  "reprehenderit voluptate velit esse cillum dolore")
+    dictionary = [
+        {"column": f"col_{i}",
+         "description": f"Descripcion larga numero {i} con bastante texto para "
+                        f"forzar el wrap dentro de la celda fila{i}",
+         "business_meaning": f"Significado de negocio {i}", "unit": "u"}
+        for i in range(40)
+    ]
+    prof = {
+        "table": "t", "n_rows": 1, "n_cols": 1, "columns": [],
+        "llm": {"summary": "S", "dictionary": dictionary,
+                "cleaning": [long_clean], "analyses": ["A"]},
+    }
+    ch = build_analisis_llm(prof, {})
+    assert ch is not None
+    # Structure: the dictionary DataTable keeps ALL 40 rows — none dropped on
+    # construction (the renderers then split it by rows, repeating the header).
+    dts = [b for b in ch.blocks if isinstance(b, DataTable)]
+    assert any(len(dt.rows) == 40 for dt in dts)
+
+    # What "no cut" means in the output: the last word of every dictionary row
+    # and every word of the long cleaning suggestion are still readable.
+    expected = [f"fila{i}" for i in range(40)] + long_clean.split()
+
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "x.pdf")
+        render_automatic_eda_pdf([ch], out_pdf, {"write_manifest": False})
+        # 40 wide rows + a long cleaning line cannot fit one page → it spills,
+        # which is exactly the no-cut behaviour (paginate, never truncate).
+        assert len(PdfReader(out_pdf).pages) > 1
+        assert _missing_tokens(expected, _pdf_text(out_pdf)) == []
+
+        out_pptx = os.path.join(d, "x.pptx")
+        res2 = render_automatic_eda_pptx([ch], out_pptx, {"write_manifest": False})
+        assert res2["n_slides"] > 1  # table + long text spill across slides.
+        assert _missing_tokens(expected, _pptx_text(out_pptx)) == []
+