merge: 4b portada — tamaño grande junto al nombre + descripción y granularidad funcionando (verificado met)

2026-06-30 18:12:22 +02:00
parent d001d90306 048781df3f
commit 32054ad781
2 changed files with 316 additions and 15 deletions
@@ -2,8 +2,17 @@

 Builds the document cover from a TableProfile plus an optional ``ctx`` of
 presentation metadata. Reads everything defensively (``.get``) and degrades
-honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
-placeholder rather than invented, leaving a hook for the LLM layer to fill it.
+honestly.
+
+The dataset size (N rows x M columns) is always shown big, as a heading right
+under the dataset name (kept together in a ``Group``), not buried in the
+metadata table. The Description and Granularity are resolved through a cascade
+so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
+(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
+``row_meaning``; otherwise a short summary is derived from the profile itself
+(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
+key-candidate columns or the table shape. Nothing is invented: each derived
+fallback says what it was inferred from (profile, key candidates or shape).

 Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
    build_<id>(profile: dict, ctx: dict) -> Chapter | None
@@ -17,10 +26,15 @@ from datetime import datetime, timezone

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "portada"
 CHAPTER_TITLE = "Portada"

+# Key under which eda_llm_insights stores its interpretive block in the profile.
+# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
+# row represents) from it when the LLM layer ran (``run_llm``).
+_LLM_KEY = "llm"
+
 # Default human description of what the table quality score measures. Chapters
 # can override it via ctx["quality_criteria"].
 _DEFAULT_QUALITY_CRITERIA = (
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
        return s


+def _llm_block(profile: dict, ctx: dict) -> dict:
+    """Return the interpretive LLM block (``eda_llm_insights`` output), or {}.
+
+    It is stored under ``profile['llm']`` by ``profile_table(run_llm=True)`` and
+    may also be forwarded in ``ctx['llm']``. Read defensively: anything that is
+    not a dict degrades to an empty dict so the cover never raises.
+    """
+    block = profile.get(_LLM_KEY)
+    if not isinstance(block, dict):
+        block = ctx.get(_LLM_KEY)
+    return block if isinstance(block, dict) else {}
+
+
+def _count_column_types(profile: dict, ctx: dict):
+    """Best-effort (n_numeric, n_categorical) for the dataset.
+
+    Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
+    over the whole body); falls back to counting the profile columns directly so
+    the cover still has the numbers when no summary was passed.
+    """
+    summary = ctx.get("document_summary")
+    if isinstance(summary, dict):
+        n_num = summary.get("n_numeric")
+        n_cat = summary.get("n_categorical")
+        if n_num is not None or n_cat is not None:
+            return n_num, n_cat
+    cols = profile.get("columns") or []
+    n_num = sum(1 for c in cols if isinstance(c, dict)
+                and c.get("inferred_type") == "numeric")
+    n_cat = sum(1 for c in cols if isinstance(c, dict)
+                and isinstance(c.get("categorical"), dict)
+                and c.get("categorical", {}).get("top")
+                and c.get("inferred_type") != "numeric")
+    return n_num, n_cat
+
+
+def _derive_description(profile: dict, ctx: dict) -> str:
+    """A short, honest description of the dataset from the profile.
+
+    Used only when no explicit ``ctx['description']`` and no LLM ``summary`` are
+    available. Summarizes shape, column-type mix and quality score; never empty,
+    never invents business meaning (it states the description was derived)."""
+    n_rows = profile.get("n_rows")
+    n_cols = profile.get("n_cols")
+    n_num, n_cat = _count_column_types(profile, ctx)
+    head = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas"
+    type_bits = []
+    if n_num:
+        type_bits.append(f"{_fmt_int(n_num)} numéricas")
+    if n_cat:
+        type_bits.append(f"{_fmt_int(n_cat)} categóricas")
+    if type_bits:
+        head += " (" + ", ".join(type_bits) + ")"
+    parts = [head + "."]
+    score = profile.get("quality_score")
+    if score is not None:
+        parts.append(f"Calidad media estimada: {score}/100.")
+    parts.append(
+        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
+        "para una descripción de negocio más rica.")
+    return " ".join(parts)
+
+
+def _derive_granularity(profile: dict, dataset_name: str) -> str:
+    """A ``Cada fila es…`` granularity sentence from the profile.
+
+    Prefers the key-candidate columns (a row is identified by them); when no key
+    is detected, falls back to the table shape so the line is always meaningful
+    and starts with ``Cada fila es`` as the user requested."""
+    keys = profile.get("key_candidates") or []
+    if keys:
+        shown = ", ".join(str(k) for k in keys[:3])
+        more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
+        return (f"Cada fila es un registro identificado por {shown}{more}, "
+                "candidata(s) a clave por ser únicas y sin nulos.")
+    n_rows = profile.get("n_rows")
+    tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
+    return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
+            "columna identificadora única, así que la granularidad se infiere "
+            "de la forma de la tabla." + tail)
+
+
 def build_portada(profile: dict, ctx: dict):
    """Build the cover Chapter, or None if there is truly nothing to show."""
    profile = profile or {}
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
    quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
    quality_value = "—" if score is None else f"{score} / 100"

-    # Granularity: ctx wins; else derive from key candidates; else be honest.
+    llm = _llm_block(profile, ctx)
+
+    # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
+    # candidates; finally a shape-based fallback. Always a real "Cada fila es…".
    granularity = ctx.get("granularity")
    if not granularity:
-        keys = profile.get("key_candidates") or []
-        if keys:
-            granularity = ("Cada fila parece identificada por "
-                           + ", ".join(str(k) for k in keys[:3]) + ".")
-        else:
-            granularity = ("Cada fila es… (granularidad no determinada — "
-                           "pendiente de la capa de cálculo/LLM).")
+        granularity = (llm.get("row_meaning") or "").strip() or None
+    if not granularity:
+        granularity = _derive_granularity(profile, str(dataset_name))

+    # Description: explicit ctx wins; then the LLM "summary"; finally a short
+    # profile-derived summary. Never the old empty placeholder.
    description = ctx.get("description")
    if not description:
-        description = ("Descripción no provista — pendiente de la capa LLM "
-                       "(`run_llm`) o de `ctx['description']`.")
+        description = (llm.get("summary") or "").strip() or None
+    if not description:
+        description = _derive_description(profile, ctx)

-    blocks = [
+    # Title + dataset size shown together and BIG (Heading) at the top, kept on
+    # the same page (Group). The size is no longer buried in the metadata table.
+    cover = [
        model.Heading(text=str(dataset_name), level=1),
        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
+        model.Heading(text=shape, level=2),
+    ]
+
+    blocks = [
+        model.Group(blocks=cover),
        model.KVTable(rows=[
            ("Fuente", source_origin),
            ("Almacenamiento", storage),
            ("Generado", when),
-            ("Tamaño", shape),
            ("Calidad", quality_value),
            ("Criterios de calidad", quality_criteria),
        ]),
@@ -0,0 +1,197 @@
+"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies the Fase 4b improvements:
+
+1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
+   heading kept together with the dataset name in a ``Group`` — and is no longer
+   a row of the metadata table.
+2. Description and Granularity are resolved through a real cascade and are never
+   the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
+   block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
+   short summary is derived from the profile and a "Cada fila es…" sentence from
+   the key-candidate columns or the table shape.
+3. The chapter degrades without raising on empty/None input.
+4. It renders inside the full document to both PDF and PPTX showing that content.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
+from datascience.automatic_eda.chapters.portada import (
+    CHAPTER_ID, CHAPTER_VERSION, build_portada,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
+    prof = {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891,
+        "n_cols": 12,
+        "quality_score": 78.0,
+        "columns": [
+            {"name": "PassengerId", "inferred_type": "numeric",
+             "null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
+                                          "max": 891.0, "std": 257.0}},
+            {"name": "Survived", "inferred_type": "numeric",
+             "null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
+                                          "max": 1.0, "std": 0.49}},
+            {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+             "categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
+                                     {"value": "female", "count": 314,
+                                      "pct": 0.35}],
+                             "mode": "male", "n_distinct": 2, "entropy": 0.93}},
+        ],
+    }
+    if with_keys:
+        prof["key_candidates"] = ["PassengerId"]
+    if with_llm:
+        prof["llm"] = {
+            "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
+            "row_meaning": "Cada fila es un pasajero del Titanic.",
+            "dictionary": [], "pii": [], "cleaning": [], "analyses": [],
+        }
+    return prof
+
+
+def _pdf_text(path: str) -> str:
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                tb = sh.table
+                for r in range(len(tb.rows)):
+                    for c in range(len(tb.columns)):
+                        parts.append(tb.cell(r, c).text)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _markdown_after(blocks, heading_text):
+    """Return the Markdown block that follows a Heading whose text matches."""
+    for i, b in enumerate(blocks):
+        if isinstance(b, Heading) and heading_text.lower() in b.text.lower():
+            for nb in blocks[i + 1:]:
+                if isinstance(nb, Markdown):
+                    return nb
+    return None
+
+
+def test_golden_tamano_grande_y_textos_llm():
+    ch = build_portada(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+
+    # 1) Title + size kept together in a Group; size is a BIG level-2 heading.
+    group = next(b for b in ch.blocks if isinstance(b, Group))
+    inner = group.blocks
+    assert isinstance(inner[0], Heading) and inner[0].level == 1
+    assert inner[0].text == "titanic"
+    size_h = next(b for b in inner if isinstance(b, Heading) and b.level == 2)
+    assert "891" in size_h.text and "12" in size_h.text
+    assert "filas" in size_h.text and "columnas" in size_h.text
+
+    # 2) Size is no longer a row of the metadata table.
+    kv = next(b for b in ch.blocks if isinstance(b, KVTable))
+    labels = [r[0] for r in kv.rows]
+    assert "Tamaño" not in labels
+    assert "Fuente" in labels and "Calidad" in labels
+
+    # 3) Description and Granularity come from the LLM block.
+    desc = _markdown_after(ch.blocks, "Descripción")
+    gran = _markdown_after(ch.blocks, "Granularidad")
+    assert desc is not None and "Titanic" in desc.text
+    assert gran is not None and gran.text.startswith("Cada fila es")
+    assert "pasajero" in gran.text.lower()
+
+
+def test_fallback_sin_llm_usa_keys_y_perfil():
+    # No LLM block: description derived from the profile, granularity from keys.
+    ch = build_portada(_profile(with_llm=False, with_keys=True), {})
+    desc = _markdown_after(ch.blocks, "Descripción")
+    gran = _markdown_after(ch.blocks, "Granularidad")
+    # Description is the derived summary, never the old "pendiente" placeholder.
+    assert "pendiente" not in desc.text.lower()
+    assert "891" in desc.text and "columnas" in desc.text
+    assert "numéricas" in desc.text or "categóricas" in desc.text
+    # Granularity mentions the key candidate and starts with "Cada fila es".
+    assert gran.text.startswith("Cada fila es")
+    assert "PassengerId" in gran.text
+    assert "…" not in gran.text  # the old ellipsis placeholder is gone.
+
+
+def test_fallback_sin_llm_sin_keys_usa_forma():
+    ch = build_portada(_profile(with_llm=False, with_keys=False), {})
+    gran = _markdown_after(ch.blocks, "Granularidad")
+    assert gran.text.startswith("Cada fila es")
+    assert "titanic" in gran.text.lower()
+    assert "pendiente" not in gran.text.lower()
+
+
+def test_ctx_explicito_gana_sobre_llm():
+    ctx = {"description": "Descripción manual.",
+           "granularity": "Cada fila es una unidad manual."}
+    ch = build_portada(_profile(), ctx)
+    desc = _markdown_after(ch.blocks, "Descripción")
+    gran = _markdown_after(ch.blocks, "Granularidad")
+    assert desc.text == "Descripción manual."
+    assert gran.text == "Cada fila es una unidad manual."
+
+
+def test_edge_perfil_vacio_no_lanza():
+    # Empty / None never raise; the cover still shows a size and real texts.
+    for prof, ctx in (({}, {}), (None, None)):
+        ch = build_portada(prof, ctx)
+        assert ch is not None
+        group = next(b for b in ch.blocks if isinstance(b, Group))
+        size_h = next(b for b in group.blocks
+                      if isinstance(b, Heading) and b.level == 2)
+        assert "filas" in size_h.text and "columnas" in size_h.text
+        desc = _markdown_after(ch.blocks, "Descripción")
+        gran = _markdown_after(ch.blocks, "Granularidad")
+        assert desc.text and "pendiente" not in desc.text.lower()
+        assert gran.text.startswith("Cada fila es")
+
+
+def test_golden_render_pdf_muestra_portada():
+    prof = _profile()
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(prof, out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pdf_text(out)
+        assert "titanic" in txt.lower()
+        assert "891" in txt and "filas" in txt and "columnas" in txt
+        assert "Titanic" in txt          # LLM summary in the Description.
+        assert "Cada fila es" in txt     # granularity sentence.
+
+
+def test_golden_render_pptx_muestra_portada():
+    prof = _profile()
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pptx_text(out)
+        assert "titanic" in txt.lower()
+        assert "891" in txt and "columnas" in txt
+        assert "Cada fila es" in txt