merge: 4b portada — tamano grande junto al nombre + descripcion y granularidad funcionando (verificado met)

2026-06-30 18:12:22 +02:00
parent d001d90306 048781df3f
commit 32054ad781
2 changed files with 316 additions and 15 deletions
@@ -2,8 +2,17 @@
 Builds the document cover from a TableProfile plus an optional ``ctx`` of
 presentation metadata. Reads everything defensively (``.get``) and degrades
-honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
+honestly.
-placeholder rather than invented, leaving a hook for the LLM layer to fill it.
+
 The dataset size (N rows x M columns) is always shown big, as a heading right
 under the dataset name (kept together in a ``Group``), not buried in the
 metadata table. The Description and Granularity are resolved through a cascade
 so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
 (``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
 ``row_meaning``; otherwise a short summary is derived from the profile itself
 (shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
 key-candidate columns or the table shape. Nothing is invented: the derived
 fallbacks state that they come from the profile.
 Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
    build_<id>(profile: dict, ctx: dict) -> Chapter | None
@@ -17,10 +26,15 @@ from datetime import datetime, timezone
 from .. import model
-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "portada"
 CHAPTER_TITLE = "Portada"
 # Key under which eda_llm_insights stores its interpretive block in the profile.
 # The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
 # row represents) from it when the LLM layer ran (``run_llm``).
 _LLM_KEY = "llm"
 # Default human description of what the table quality score measures. Chapters
 # can override it via ctx["quality_criteria"].
 _DEFAULT_QUALITY_CRITERIA = (
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
        return s
 def _llm_block(profile: dict, ctx: dict) -> dict:
    """Fetch the interpretive LLM block (``eda_llm_insights`` output), or {}.
    ``profile_table(run_llm=True)`` stores it under ``profile['llm']``; it may
    also be forwarded as ``ctx['llm']``. The profile is consulted first and the
    context only when the profile holds no dict there. Any non-dict value is
    skipped, so the caller always receives a dict and the cover never raises.
    """
    for holder in (profile, ctx):
        candidate = holder.get(_LLM_KEY)
        if isinstance(candidate, dict):
            return candidate
    return {}
 def _count_column_types(profile: dict, ctx: dict):
    """Best-effort (n_numeric, n_categorical) for the dataset.
    Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
    over the whole body); falls back to counting the profile columns directly so
    the cover still has the numbers when no summary was passed.
    """
    summary = ctx.get("document_summary")
    if isinstance(summary, dict):
        n_num = summary.get("n_numeric")
        n_cat = summary.get("n_categorical")
        if n_num is not None or n_cat is not None:
            return n_num, n_cat
    cols = profile.get("columns") or []
    n_num = sum(1 for c in cols if isinstance(c, dict)
                and c.get("inferred_type") == "numeric")
    n_cat = sum(1 for c in cols if isinstance(c, dict)
                and isinstance(c.get("categorical"), dict)
                and c.get("categorical", {}).get("top")
                and c.get("inferred_type") != "numeric")
    return n_num, n_cat
 def _derive_description(profile: dict, ctx: dict) -> str:
    """Compose a short, honest dataset description out of the profile.
    Only meant as the last step of the description cascade, when neither an
    explicit ``ctx['description']`` nor an LLM ``summary`` exists. The text
    states the shape, the numeric/categorical column mix (when known and
    non-zero) and the quality score (when present), and ends by saying that it
    was derived from the profile, so no business meaning is invented.
    """
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    numeric_n, categorical_n = _count_column_types(profile, ctx)
    sentence = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas"
    # Only counts that are known and non-zero are worth mentioning.
    mix = [f"{_fmt_int(count)} {label}"
           for count, label in ((numeric_n, "numéricas"),
                                (categorical_n, "categóricas"))
           if count]
    if mix:
        sentence = sentence + " (" + ", ".join(mix) + ")"
    sentences = [sentence + "."]
    quality = profile.get("quality_score")
    if quality is not None:
        sentences.append(f"Calidad media estimada: {quality}/100.")
    sentences.append(
        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
        "para una descripción de negocio más rica.")
    return " ".join(sentences)
 def _derive_granularity(profile: dict, dataset_name: str) -> str:
    """A ``Cada fila es…`` granularity sentence from the profile.
    Prefers the key-candidate columns (a row is identified by them); when no key
    is detected, falls back to the table shape so the line is always meaningful
    and starts with ``Cada fila es`` as the user requested."""
    keys = profile.get("key_candidates") or []
    if keys:
        shown = ", ".join(str(k) for k in keys[:3])
        more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
        return (f"Cada fila es un registro identificado por {shown}{more}, "
                "candidata(s) a clave por ser únicas y sin nulos.")
    n_rows = profile.get("n_rows")
    tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
    return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
            "columna identificadora única, así que la granularidad se infiere "
            "de la forma de la tabla." + tail)
 def build_portada(profile: dict, ctx: dict):
    """Build the cover Chapter, or None if there is truly nothing to show."""
    profile = profile or {}
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
    quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
    quality_value = "—" if score is None else f"{score} / 100"
-    # Granularity: ctx wins; else derive from key candidates; else be honest.
+    llm = _llm_block(profile, ctx)
    # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
    # candidates; finally a shape-based fallback. Always a real "Cada fila es…".
    granularity = ctx.get("granularity")
    if not granularity:
-        keys = profile.get("key_candidates") or []
+        granularity = (llm.get("row_meaning") or "").strip() or None
-        if keys:
+    if not granularity:
-            granularity = ("Cada fila parece identificada por "
+        granularity = _derive_granularity(profile, str(dataset_name))
                           + ", ".join(str(k) for k in keys[:3]) + ".")
        else:
            granularity = ("Cada fila es… (granularidad no determinada — "
                           "pendiente de la capa de cálculo/LLM).")
    # Description: explicit ctx wins; then the LLM "summary"; finally a short
    # profile-derived summary. Never the old empty placeholder.
    description = ctx.get("description")
    if not description:
-        description = ("Descripción no provista — pendiente de la capa LLM "
+        description = (llm.get("summary") or "").strip() or None
-                       "(`run_llm`) o de `ctx['description']`.")
+    if not description:
        description = _derive_description(profile, ctx)
-    blocks = [
+    # Title + dataset size shown together and BIG (Heading) at the top, kept on
    # the same page (Group). The size is no longer buried in the metadata table.
    cover = [
        model.Heading(text=str(dataset_name), level=1),
        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
        model.Heading(text=shape, level=2),
    ]
    blocks = [
        model.Group(blocks=cover),
        model.KVTable(rows=[
            ("Fuente", source_origin),
            ("Almacenamiento", storage),
            ("Generado", when),
            ("Tamaño", shape),
            ("Calidad", quality_value),
            ("Criterios de calidad", quality_criteria),
        ]),
@@ -0,0 +1,197 @@
 """Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
 Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
 and deterministic. Verifies the Fase 4b improvements:
 1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
   heading kept together with the dataset name in a ``Group`` — and is no longer
   a row of the metadata table.
 2. Description and Granularity are resolved through a real cascade and are never
   the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
   block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
   short summary is derived from the profile and a "Cada fila es…" sentence from
   the key-candidate columns or the table shape.
 3. The chapter degrades without raising on empty/None input.
 4. It renders inside the full document to both PDF and PPTX showing that content.
 """
 import os
 import re
 import tempfile
 from pypdf import PdfReader
 from pptx import Presentation
 from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
 from datascience.automatic_eda.chapters.portada import (
    CHAPTER_ID, CHAPTER_VERSION, build_portada,
 )
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
 def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
    prof = {
        "table": "titanic",
        "source": "/data/titanic.csv",
        "profiled_at": "2026-06-30T10:00:00+00:00",
        "n_rows": 891,
        "n_cols": 12,
        "quality_score": 78.0,
        "columns": [
            {"name": "PassengerId", "inferred_type": "numeric",
             "null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
                                          "max": 891.0, "std": 257.0}},
            {"name": "Survived", "inferred_type": "numeric",
             "null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
                                          "max": 1.0, "std": 0.49}},
            {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
             "categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
                                     {"value": "female", "count": 314,
                                      "pct": 0.35}],
                             "mode": "male", "n_distinct": 2, "entropy": 0.93}},
        ],
    }
    if with_keys:
        prof["key_candidates"] = ["PassengerId"]
    if with_llm:
        prof["llm"] = {
            "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
            "row_meaning": "Cada fila es un pasajero del Titanic.",
            "dictionary": [], "pii": [], "cleaning": [], "analyses": [],
        }
    return prof
 def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path* as a single string,
    with each run of whitespace collapsed to one space."""
    page_texts = []
    for page in PdfReader(path).pages:
        # extract_text() may yield nothing for a page; treat that as "".
        page_texts.append(page.extract_text() or "")
    return re.sub(r"\s+", " ", "".join(page_texts))
 def _pptx_text(path: str) -> str:
    """Return all visible text of the PPTX at *path* as a single string.
    Collects, slide by slide and shape by shape, the text of every text frame
    and of every table cell (row-major), joins the pieces with spaces and
    collapses each run of whitespace to one space.
    """
    chunks = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                chunks.append(shape.text_frame.text)
            if not shape.has_table:
                continue
            table = shape.table
            chunks.extend(
                table.cell(row_idx, col_idx).text
                for row_idx in range(len(table.rows))
                for col_idx in range(len(table.columns))
            )
    return re.sub(r"\s+", " ", " ".join(chunks))
 def _markdown_after(blocks, heading_text):
    """Find the first Markdown block located after a matching Heading.
    A Heading matches when *heading_text* occurs in its text, ignoring case.
    Returns ``None`` when no matching Heading is followed by a Markdown block.
    """
    for position, block in enumerate(blocks, start=1):
        if not isinstance(block, Heading):
            continue
        if heading_text.lower() not in block.text.lower():
            continue
        # Look only at what comes after this heading.
        follower = next(
            (later for later in blocks[position:] if isinstance(later, Markdown)),
            None)
        if follower is not None:
            return follower
    return None
 def test_golden_tamano_grande_y_textos_llm():
    """Golden path: LLM-enriched profile with an empty ctx.
    Checks the chapter identity, the title + big size heading grouped together,
    the metadata table labels, and that both texts come from the LLM block.
    """
    chapter = build_portada(_profile(), {})
    assert chapter is not None
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION
    # Title and size live together in the Group; the size is the level-2 heading.
    cover = next(b for b in chapter.blocks if isinstance(b, Group)).blocks
    title = cover[0]
    assert isinstance(title, Heading) and title.level == 1
    assert title.text == "titanic"
    size_heading = next(
        b for b in cover if isinstance(b, Heading) and b.level == 2)
    for token in ("891", "12", "filas", "columnas"):
        assert token in size_heading.text
    # The metadata table keeps its other rows but does not list the size.
    table = next(b for b in chapter.blocks if isinstance(b, KVTable))
    labels = [row[0] for row in table.rows]
    assert "Tamaño" not in labels
    assert "Fuente" in labels
    assert "Calidad" in labels
    # Description and granularity are taken from the LLM block.
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description is not None and "Titanic" in description.text
    assert granularity is not None
    assert granularity.text.startswith("Cada fila es")
    assert "pasajero" in granularity.text.lower()
 def test_fallback_sin_llm_usa_keys_y_perfil():
    """Without an LLM block the description is derived from the profile and
    the granularity from the key candidates."""
    chapter = build_portada(_profile(with_llm=False, with_keys=True), {})
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    # The derived summary replaces the old "pendiente" placeholder.
    description_text = description.text
    assert "pendiente" not in description_text.lower()
    assert "891" in description_text
    assert "columnas" in description_text
    assert any(word in description_text for word in ("numéricas", "categóricas"))
    # The sentence names the key candidate and keeps the "Cada fila es" opening.
    granularity_text = granularity.text
    assert granularity_text.startswith("Cada fila es")
    assert "PassengerId" in granularity_text
    # The old ellipsis placeholder must be gone.
    assert "…" not in granularity_text
 def test_fallback_sin_llm_sin_keys_usa_forma():
    """With neither LLM block nor key candidates, the granularity sentence is
    the shape-based fallback that quotes the dataset name."""
    bare_profile = _profile(with_llm=False, with_keys=False)
    chapter = build_portada(bare_profile, {})
    sentence = _markdown_after(chapter.blocks, "Granularidad").text
    assert sentence.startswith("Cada fila es")
    lowered = sentence.lower()
    assert "titanic" in lowered
    assert "pendiente" not in lowered
 def test_ctx_explicito_gana_sobre_llm():
    """Explicit ctx texts take precedence over the LLM block of the profile."""
    manual_ctx = {
        "description": "Descripción manual.",
        "granularity": "Cada fila es una unidad manual.",
    }
    chapter = build_portada(_profile(), manual_ctx)
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    shown = (description.text, granularity.text)
    assert shown[0] == "Descripción manual."
    assert shown[1] == "Cada fila es una unidad manual."
 def test_edge_perfil_vacio_no_lanza():
    """Empty or ``None`` inputs must not raise: the cover still shows a size
    heading, a non-empty description and a real granularity sentence."""
    degenerate_inputs = (({}, {}), (None, None))
    for empty_profile, empty_ctx in degenerate_inputs:
        chapter = build_portada(empty_profile, empty_ctx)
        assert chapter is not None
        cover = next(b for b in chapter.blocks if isinstance(b, Group))
        size_heading = next(
            b for b in cover.blocks if isinstance(b, Heading) and b.level == 2)
        assert "filas" in size_heading.text
        assert "columnas" in size_heading.text
        description = _markdown_after(chapter.blocks, "Descripción")
        granularity = _markdown_after(chapter.blocks, "Granularidad")
        assert description.text and "pendiente" not in description.text.lower()
        assert granularity.text.startswith("Cada fila es")
 def test_golden_render_pdf_muestra_portada():
    """Render the full document to PDF and check the cover content is in it."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "eda.pdf")
        result = render_automatic_eda_pdf(profile, target, {"title": "EDA"})
        assert result["path"] == target and os.path.exists(target)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids
        text = _pdf_text(target)
        assert "titanic" in text.lower()
        for token in ("891", "filas", "columnas"):
            assert token in text
        # The LLM summary is used as the Description.
        assert "Titanic" in text
        # The granularity sentence is present.
        assert "Cada fila es" in text
 def test_golden_render_pptx_muestra_portada():
    """Render the full document to PPTX and check the cover content is in it."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "eda.pptx")
        result = render_automatic_eda_pptx(profile, target, {"title": "EDA"})
        assert result["path"] == target and os.path.exists(target)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids
        text = _pptx_text(target)
        assert "titanic" in text.lower()
        for token in ("891", "columnas"):
            assert token in text
        assert "Cada fila es" in text