diff --git a/python/functions/datascience/automatic_eda/chapters/portada.py b/python/functions/datascience/automatic_eda/chapters/portada.py index c1bb43ab..409322f7 100644 --- a/python/functions/datascience/automatic_eda/chapters/portada.py +++ b/python/functions/datascience/automatic_eda/chapters/portada.py @@ -2,8 +2,17 @@ Builds the document cover from a TableProfile plus an optional ``ctx`` of presentation metadata. Reads everything defensively (``.get``) and degrades -honestly: a field that is neither in the profile nor in ``ctx`` is shown as a -placeholder rather than invented, leaving a hook for the LLM layer to fill it. +honestly. + +The dataset size (N rows x M columns) is always shown big, as a heading right +under the dataset name (kept together in a ``Group``), not buried in the +metadata table. The Description and Granularity are resolved through a cascade +so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block +(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` / +``row_meaning``; otherwise a short summary is derived from the profile itself +(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the +key-candidate columns or the table shape. Nothing is invented: the derived +fallbacks state that they come from the profile. Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``): build_(profile: dict, ctx: dict) -> Chapter | None @@ -17,10 +26,15 @@ from datetime import datetime, timezone from .. import model -CHAPTER_VERSION = "1.1.0" +CHAPTER_VERSION = "1.2.0" CHAPTER_ID = "portada" CHAPTER_TITLE = "Portada" +# Key under which eda_llm_insights stores its interpretive block in the profile. +# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one +# row represents) from it when the LLM layer ran (``run_llm``). +_LLM_KEY = "llm" + # Default human description of what the table quality score measures. Chapters # can override it via ctx["quality_criteria"]. _DEFAULT_QUALITY_CRITERIA = ( @@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str: return s +def _llm_block(profile: dict, ctx: dict) -> dict: + """Return the interpretive LLM block (``eda_llm_insights`` output), or {}. + + It is stored under ``profile['llm']`` by ``profile_table(run_llm=True)`` and + may also be forwarded in ``ctx['llm']``. Read defensively: anything that is + not a dict degrades to an empty dict so the cover never raises. + """ + block = profile.get(_LLM_KEY) + if not isinstance(block, dict): + block = ctx.get(_LLM_KEY) + return block if isinstance(block, dict) else {} + + +def _count_column_types(profile: dict, ctx: dict): + """Best-effort (n_numeric, n_categorical) for the dataset. + + Prefers the aggregated ``ctx['document_summary']`` (computed by the engine + over the whole body); falls back to counting the profile columns directly so + the cover still has the numbers when no summary was passed. + """ + summary = ctx.get("document_summary") + if isinstance(summary, dict): + n_num = summary.get("n_numeric") + n_cat = summary.get("n_categorical") + if n_num is not None or n_cat is not None: + return n_num, n_cat + cols = profile.get("columns") or [] + n_num = sum(1 for c in cols if isinstance(c, dict) + and c.get("inferred_type") == "numeric") + n_cat = sum(1 for c in cols if isinstance(c, dict) + and isinstance(c.get("categorical"), dict) + and c.get("categorical", {}).get("top") + and c.get("inferred_type") != "numeric") + return n_num, n_cat + + +def _derive_description(profile: dict, ctx: dict) -> str: + """A short, honest description of the dataset from the profile. + + Used only when no explicit ``ctx['description']`` and no LLM ``summary`` are + available. Summarizes shape, column-type mix and quality score; never empty, + never invents business meaning (it states the description was derived).""" + n_rows = profile.get("n_rows") + n_cols = profile.get("n_cols") + n_num, n_cat = _count_column_types(profile, ctx) + head = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas" + type_bits = [] + if n_num: + type_bits.append(f"{_fmt_int(n_num)} numéricas") + if n_cat: + type_bits.append(f"{_fmt_int(n_cat)} categóricas") + if type_bits: + head += " (" + ", ".join(type_bits) + ")" + parts = [head + "."] + score = profile.get("quality_score") + if score is not None: + parts.append(f"Calidad media estimada: {score}/100.") + parts.append( + "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) " + "para una descripción de negocio más rica.") + return " ".join(parts) + + +def _derive_granularity(profile: dict, dataset_name: str) -> str: + """A ``Cada fila es…`` granularity sentence from the profile. + + Prefers the key-candidate columns (a row is identified by them); when no key + is detected, falls back to the table shape so the line is always meaningful + and starts with ``Cada fila es`` as the user requested.""" + keys = profile.get("key_candidates") or [] + if keys: + shown = ", ".join(str(k) for k in keys[:3]) + more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)" + return (f"Cada fila es un registro identificado por {shown}{more}, " + "candidata(s) a clave por ser únicas y sin nulos.") + n_rows = profile.get("n_rows") + tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else "" + return (f"Cada fila es un registro de «{dataset_name}». No se detectó una " + "columna identificadora única, así que la granularidad se infiere " + "de la forma de la tabla." + tail) + + def build_portada(profile: dict, ctx: dict): """Build the cover Chapter, or None if there is truly nothing to show.""" profile = profile or {} @@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict): quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA quality_value = "—" if score is None else f"{score} / 100" - # Granularity: ctx wins; else derive from key candidates; else be honest. + llm = _llm_block(profile, ctx) + + # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key + # candidates; finally a shape-based fallback. Always a real "Cada fila es…". granularity = ctx.get("granularity") if not granularity: - keys = profile.get("key_candidates") or [] - if keys: - granularity = ("Cada fila parece identificada por " - + ", ".join(str(k) for k in keys[:3]) + ".") - else: - granularity = ("Cada fila es… (granularidad no determinada — " - "pendiente de la capa de cálculo/LLM).") + granularity = (llm.get("row_meaning") or "").strip() or None + if not granularity: + granularity = _derive_granularity(profile, str(dataset_name)) + # Description: explicit ctx wins; then the LLM "summary"; finally a short + # profile-derived summary. Never the old empty placeholder. description = ctx.get("description") if not description: - description = ("Descripción no provista — pendiente de la capa LLM " - "(`run_llm`) o de `ctx['description']`.") + description = (llm.get("summary") or "").strip() or None + if not description: + description = _derive_description(profile, ctx) - blocks = [ + # Title + dataset size shown together and BIG (Heading) at the top, kept on + # the same page (Group). The size is no longer buried in the metadata table. + cover = [ model.Heading(text=str(dataset_name), level=1), model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"), + model.Heading(text=shape, level=2), + ] + + blocks = [ + model.Group(blocks=cover), model.KVTable(rows=[ ("Fuente", source_origin), ("Almacenamiento", storage), ("Generado", when), - ("Tamaño", shape), ("Calidad", quality_value), ("Criterios de calidad", quality_criteria), ]), diff --git a/python/functions/datascience/automatic_eda/chapters/portada_test.py b/python/functions/datascience/automatic_eda/chapters/portada_test.py new file mode 100644 index 00000000..ae9df818 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/portada_test.py @@ -0,0 +1,197 @@ +"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render. + +Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast +and deterministic. Verifies the Fase 4b improvements: + +1. The dataset size (N rows x M columns) is always shown BIG — as a level-2 + heading kept together with the dataset name in a ``Group`` — and is no longer + a row of the metadata table. +2. Description and Granularity are resolved through a real cascade and are never + the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM + block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a + short summary is derived from the profile and a "Cada fila es…" sentence from + the key-candidate columns or the table shape. +3. The chapter degrades without raising on empty/None input. +4. It renders inside the full document to both PDF and PPTX showing that content. +""" + +import os +import re +import tempfile + +from pypdf import PdfReader +from pptx import Presentation + +from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown +from datascience.automatic_eda.chapters.portada import ( + CHAPTER_ID, CHAPTER_VERSION, build_portada, +) +from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf +from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx + + +def _profile(with_llm: bool = True, with_keys: bool = True) -> dict: + prof = { + "table": "titanic", + "source": "/data/titanic.csv", + "profiled_at": "2026-06-30T10:00:00+00:00", + "n_rows": 891, + "n_cols": 12, + "quality_score": 78.0, + "columns": [ + {"name": "PassengerId", "inferred_type": "numeric", + "null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0, + "max": 891.0, "std": 257.0}}, + {"name": "Survived", "inferred_type": "numeric", + "null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0, + "max": 1.0, "std": 0.49}}, + {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0, + "categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65}, + {"value": "female", "count": 314, + "pct": 0.35}], + "mode": "male", "n_distinct": 2, "entropy": 0.93}}, + ], + } + if with_keys: + prof["key_candidates"] = ["PassengerId"] + if with_llm: + prof["llm"] = { + "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.", + "row_meaning": "Cada fila es un pasajero del Titanic.", + "dictionary": [], "pii": [], "cleaning": [], "analyses": [], + } + return prof + + +def _pdf_text(path: str) -> str: + txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages) + return re.sub(r"\s+", " ", txt) + + +def _pptx_text(path: str) -> str: + prs = Presentation(path) + parts = [] + for sl in prs.slides: + for sh in sl.shapes: + if sh.has_text_frame: + parts.append(sh.text_frame.text) + if sh.has_table: + tb = sh.table + for r in range(len(tb.rows)): + for c in range(len(tb.columns)): + parts.append(tb.cell(r, c).text) + return re.sub(r"\s+", " ", " ".join(parts)) + + +def _markdown_after(blocks, heading_text): + """Return the Markdown block that follows a Heading whose text matches.""" + for i, b in enumerate(blocks): + if isinstance(b, Heading) and heading_text.lower() in b.text.lower(): + for nb in blocks[i + 1:]: + if isinstance(nb, Markdown): + return nb + return None + + +def test_golden_tamano_grande_y_textos_llm(): + ch = build_portada(_profile(), {}) + assert ch is not None + assert ch.id == CHAPTER_ID + assert ch.version == CHAPTER_VERSION + + # 1) Title + size kept together in a Group; size is a BIG level-2 heading. + group = next(b for b in ch.blocks if isinstance(b, Group)) + inner = group.blocks + assert isinstance(inner[0], Heading) and inner[0].level == 1 + assert inner[0].text == "titanic" + size_h = next(b for b in inner if isinstance(b, Heading) and b.level == 2) + assert "891" in size_h.text and "12" in size_h.text + assert "filas" in size_h.text and "columnas" in size_h.text + + # 2) Size is no longer a row of the metadata table. + kv = next(b for b in ch.blocks if isinstance(b, KVTable)) + labels = [r[0] for r in kv.rows] + assert "Tamaño" not in labels + assert "Fuente" in labels and "Calidad" in labels + + # 3) Description and Granularity come from the LLM block. + desc = _markdown_after(ch.blocks, "Descripción") + gran = _markdown_after(ch.blocks, "Granularidad") + assert desc is not None and "Titanic" in desc.text + assert gran is not None and gran.text.startswith("Cada fila es") + assert "pasajero" in gran.text.lower() + + +def test_fallback_sin_llm_usa_keys_y_perfil(): + # No LLM block: description derived from the profile, granularity from keys. + ch = build_portada(_profile(with_llm=False, with_keys=True), {}) + desc = _markdown_after(ch.blocks, "Descripción") + gran = _markdown_after(ch.blocks, "Granularidad") + # Description is the derived summary, never the old "pendiente" placeholder. + assert "pendiente" not in desc.text.lower() + assert "891" in desc.text and "columnas" in desc.text + assert "numéricas" in desc.text or "categóricas" in desc.text + # Granularity mentions the key candidate and starts with "Cada fila es". + assert gran.text.startswith("Cada fila es") + assert "PassengerId" in gran.text + assert "…" not in gran.text # the old ellipsis placeholder is gone. + + +def test_fallback_sin_llm_sin_keys_usa_forma(): + ch = build_portada(_profile(with_llm=False, with_keys=False), {}) + gran = _markdown_after(ch.blocks, "Granularidad") + assert gran.text.startswith("Cada fila es") + assert "titanic" in gran.text.lower() + assert "pendiente" not in gran.text.lower() + + +def test_ctx_explicito_gana_sobre_llm(): + ctx = {"description": "Descripción manual.", + "granularity": "Cada fila es una unidad manual."} + ch = build_portada(_profile(), ctx) + desc = _markdown_after(ch.blocks, "Descripción") + gran = _markdown_after(ch.blocks, "Granularidad") + assert desc.text == "Descripción manual." + assert gran.text == "Cada fila es una unidad manual." + + +def test_edge_perfil_vacio_no_lanza(): + # Empty / None never raise; the cover still shows a size and real texts. + for prof, ctx in (({}, {}), (None, None)): + ch = build_portada(prof, ctx) + assert ch is not None + group = next(b for b in ch.blocks if isinstance(b, Group)) + size_h = next(b for b in group.blocks + if isinstance(b, Heading) and b.level == 2) + assert "filas" in size_h.text and "columnas" in size_h.text + desc = _markdown_after(ch.blocks, "Descripción") + gran = _markdown_after(ch.blocks, "Granularidad") + assert desc.text and "pendiente" not in desc.text.lower() + assert gran.text.startswith("Cada fila es") + + +def test_golden_render_pdf_muestra_portada(): + prof = _profile() + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pdf") + res = render_automatic_eda_pdf(prof, out, {"title": "EDA"}) + assert res["path"] == out and os.path.exists(out) + assert CHAPTER_ID in [c["id"] for c in res["chapters"]] + txt = _pdf_text(out) + assert "titanic" in txt.lower() + assert "891" in txt and "filas" in txt and "columnas" in txt + assert "Titanic" in txt # LLM summary in the Description. + assert "Cada fila es" in txt # granularity sentence. + + +def test_golden_render_pptx_muestra_portada(): + prof = _profile() + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pptx") + res = render_automatic_eda_pptx(prof, out, {"title": "EDA"}) + assert res["path"] == out and os.path.exists(out) + assert CHAPTER_ID in [c["id"] for c in res["chapters"]] + txt = _pptx_text(out) + assert "titanic" in txt.lower() + assert "891" in txt and "columnas" in txt + assert "Cada fila es" in txt