feat(eda): overview enriquece diccionario y describe con descripcion+unidad del LLM

La tabla DICCIONARIO de columnas del capitulo overview gana columnas "Descripcion" y "Unidad", y la tabla DESCRIBE gana "Unidad", consumiendo profile['llm']['dictionary'] (entradas column/description/business_meaning/unit producidas por eda_llm_insights) emparejadas por nombre de columna. Lectura defensiva: sin bloque LLM (run_llm no corrio) las celdas degradan a "—" y las tablas siguen renderizando. No recalcula nada ni llama al LLM. CHAPTER_VERSION 1.1.0 -> 1.2.0. Tests: golden (descripcion+unidad pobladas para income), edge (sin LLM -> "—"), fallback ctx['llm'], y render PDF con las columnas nuevas visibles. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
merge(eda): portada v2 (sin Criterios, descripcion LLM, resumen a la derecha) + zebra global PDF + nombre PPTX grande/subrayado
2026-07-01 01:13:02 +02:00 · 2026-06-30 22:53:46 +02:00
2 changed files with 162 additions and 12 deletions
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
   NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
   ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
   an honest placeholder documenting the missing key instead of inventing data.
-2. Column dictionary — name / type / nulls / non-null examples. Examples come
+2. Column dictionary — name / type / nulls / non-null examples plus, when the
+   LLM layer ran, the business **description** and **unit** of each column so the
+   reader knows at a glance what every column is and in which unit. Examples come
   from ``columns[i]['examples']`` when present; otherwise they are derived from
   real non-null profile values (categorical top values, numeric min/median/max)
   so the cell is never empty nor fabricated.
-3. ``df.describe`` — mean / median / min / max / std for every numeric column.
+3. ``df.describe`` — mean / median / min / max / std for every numeric column,
+   plus its **unit** (same LLM source) so the stats read in context.
+
+The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
+``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
+of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
+only **consumes** it, matching by column name; it never calls the LLM nor
+recomputes anything. When the block is absent (``run_llm`` did not run) those
+cells degrade to ``"—"`` and the tables still render.

 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
@@ -20,13 +30,59 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"

 # Profile/ctx keys the calculation phase must add for a full head + examples.
 HEAD_KEY = "head_rows"          # list[dict] — df.head(n)
 EXAMPLES_KEY = "examples"       # per column: list of non-null sample values
+LLM_KEY = "llm"                 # interpretive block from eda_llm_insights
+
+
+def _llm_dict_index(profile: dict, ctx: dict) -> dict:
+    """Map column name -> its LLM dictionary entry (description/unit/...).
+
+    Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the
+    profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty
+    dict when no LLM block ran, so the caller degrades to "—" cells. Fully
+    defensive: never raises on malformed input.
+    """
+    llm = profile.get(LLM_KEY)
+    if not isinstance(llm, dict):
+        llm = ctx.get(LLM_KEY)
+    if not isinstance(llm, dict):
+        return {}
+    entries = llm.get("dictionary")
+    if not isinstance(entries, (list, tuple)):
+        return {}
+    index: dict = {}
+    for e in entries:
+        if not isinstance(e, dict):
+            continue
+        col = e.get("column")
+        if col is None:
+            continue
+        index[model._safe_str(col)] = e
+    return index
+
+
+def _llm_desc(entry) -> str:
+    """Business description of a column from its LLM entry, or "—"."""
+    if not isinstance(entry, dict):
+        return "—"
+    raw = entry.get("description") or entry.get("business_meaning")
+    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
+    return text or "—"
+
+
+def _llm_unit(entry) -> str:
+    """Unit of a column from its LLM entry, or "—"."""
+    if not isinstance(entry, dict):
+        return "—"
+    raw = entry.get("unit")
+    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
+    return text or "—"


 def _fmt_num(value, decimals: int = 3) -> str:
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")


-def _columns_block(profile: dict):
+def _columns_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"]
+    # Descripción / Unidad come from the LLM dictionary (matched by column name);
+    # they read "—" when run_llm did not run, so the table always renders.
+    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
+              "Descripción", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict):
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
            nulls = str(null_count)
        else:
            nulls = "—"
-        rows.append([name, ctype, nulls, _examples_for(c)])
+        entry = llm_index.get(model._safe_str(name))
+        rows.append([name, ctype, nulls, _examples_for(c),
+                     _llm_desc(entry), _llm_unit(entry)])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows, title="Columnas")


-def _describe_block(profile: dict):
+def _describe_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "mean", "median", "min", "max", "std"]
+    # "Unidad" (LLM source) lets the reader know in which unit each stat is.
+    header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
        num = c.get("numeric") or {}
        if not num:
            continue
+        name = c.get("name") or "(col)"
+        entry = llm_index.get(model._safe_str(name))
        rows.append([
-            c.get("name") or "(col)",
+            name,
            _fmt_num(num.get("mean")),
            _fmt_num(num.get("median")),
            _fmt_num(num.get("min")),
            _fmt_num(num.get("max")),
            _fmt_num(num.get("std")),
+            _llm_unit(entry),
        ])
    if not rows:
        return None
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
    if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
        return None

+    llm_index = _llm_dict_index(profile, ctx)
+
    blocks = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]
-    cols_block = _columns_block(profile)
+    cols_block = _columns_block(profile, llm_index)
    if cols_block is not None:
        blocks.append(model.Heading(
            text="Diccionario de columnas", level=2))
        blocks.append(cols_block)
-    desc_block = _describe_block(profile)
+    desc_block = _describe_block(profile, llm_index)
    if desc_block is not None:
        blocks.append(model.Heading(
            text="Resumen estadístico numérico", level=2))
@@ -56,7 +56,21 @@ def _head_rows() -> list:
    ]


-def _profile(with_head: bool = True) -> dict:
+def _llm() -> dict:
+    """Interpretive block as eda_llm_insights stores it under profile['llm']."""
+    return {
+        "summary": "Pasajeros del Titanic.",
+        "dictionary": [
+            {"column": "PassengerId", "description": "Identificador del pasajero",
+             "business_meaning": "Clave única de cada pasajero", "unit": "id"},
+            {"column": "Pclass", "description": "Clase del billete",
+             "business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"},
+            # No entry for Survived/Name/Sex on purpose -> they degrade to "—".
+        ],
+    }
+
+
+def _profile(with_head: bool = True, with_llm: bool = False) -> dict:
    prof = {
        "table": "titanic",
        "source": "/data/titanic.csv",
@@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict:
    }
    if with_head:
        prof["head_rows"] = _head_rows()
+    if with_llm:
+        prof["llm"] = _llm()
    return prof


@@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen():
    assert ch is not None
    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
    assert tables and len(tables[0].rows) == 3
+
+
+def _table_by_header(blocks, marker: str):
+    """Return the first DataTable whose header contains ``marker``."""
+    for b in _flatten(blocks):
+        if isinstance(b, DataTable) and marker in b.header:
+            return b
+    return None
+
+
+def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm():
+    # With run_llm: the column dictionary gains "Descripción" and "Unidad"
+    # columns populated from profile['llm']['dictionary'], matched by name.
+    ch = build_overview(_profile(with_llm=True), {})
+    assert ch is not None
+    dic = _table_by_header(ch.blocks, "Descripción")
+    assert dic is not None
+    assert dic.header == ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
+                          "Descripción", "Unidad"]
+    by_name = {row[0]: row for row in dic.rows}
+    # PassengerId has an LLM entry -> description + unit populated.
+    assert by_name["PassengerId"][4] == "Identificador del pasajero"
+    assert by_name["PassengerId"][5] == "id"
+    assert by_name["Pclass"][5] == "clase (1-3)"
+    # Columns with no LLM entry degrade to "—" without breaking the row.
+    assert by_name["Survived"][4] == "—" and by_name["Survived"][5] == "—"
+
+
+def test_golden_describe_lleva_unidad_del_llm():
+    ch = build_overview(_profile(with_llm=True), {})
+    desc = _table_by_header(ch.blocks, "std")
+    assert desc is not None
+    assert desc.header[-1] == "Unidad"
+    by_name = {row[0]: row for row in desc.rows}
+    assert by_name["PassengerId"][-1] == "id"
+    assert by_name["Pclass"][-1] == "clase (1-3)"
+    # Numeric column with no LLM unit still renders, unit "—".
+    assert by_name["Survived"][-1] == "—"
+
+
+def test_edge_sin_llm_descripcion_unidad_son_guion():
+    # No profile['llm'] at all: the new cells degrade to "—" and nothing breaks.
+    ch = build_overview(_profile(), {})
+    assert ch is not None
+    dic = _table_by_header(ch.blocks, "Unidad")
+    assert dic is not None
+    for row in dic.rows:
+        assert row[4] == "—" and row[5] == "—"
+    desc = _table_by_header(ch.blocks, "std")
+    assert all(row[-1] == "—" for row in desc.rows)
+
+
+def test_golden_llm_via_ctx_tambien_funciona():
+    # LLM block arriving through ctx['llm'] (fallback path) is consumed too.
+    ch = build_overview(_profile(with_llm=False), {"llm": _llm()})
+    dic = _table_by_header(ch.blocks, "Descripción")
+    by_name = {row[0]: row for row in dic.rows}
+    assert by_name["PassengerId"][5] == "id"
+
+
+def test_golden_render_pdf_muestra_descripcion_y_unidad():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        render_automatic_eda_pdf(_profile(with_llm=True), out, {"title": "EDA"})
+        txt = _pdf_text(out)
+        assert "Descripción" in txt and "Unidad" in txt
+        assert "Identificador del pasajero" in txt