feat(eda): overview enriquece diccionario y describe con descripción+unidad del LLM

La tabla DICCIONARIO de columnas del capítulo overview gana las columnas "Descripción" y "Unidad", y la tabla DESCRIBE gana "Unidad", consumiendo profile['llm']['dictionary'] (entradas column/description/business_meaning/unit producidas por eda_llm_insights) emparejadas por nombre de columna. Lectura defensiva: sin bloque LLM (run_llm no corrió) las celdas degradan a "—" y las tablas siguen renderizando. No recalcula nada ni llama al LLM. CHAPTER_VERSION 1.1.0 -> 1.2.0. Tests: golden (descripción+unidad pobladas para income), edge (sin LLM -> "—"), fallback ctx['llm'], y render PDF con las columnas nuevas visibles.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-01 01:13:02 +02:00
parent f2eb782a5f
commit 64306f3b1c
2 changed files with 162 additions and 12 deletions
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
   NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
   ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
   an honest placeholder documenting the missing key instead of inventing data.
-2. Column dictionary — name / type / nulls / non-null examples. Examples come
+2. Column dictionary — name / type / nulls / non-null examples plus, when the
   LLM layer ran, the business **description** and **unit** of each column so the
   reader knows at a glance what every column is and in which unit. Examples come
   from ``columns[i]['examples']`` when present; otherwise they are derived from
   real non-null profile values (categorical top values, numeric min/median/max)
   so the cell is never empty nor fabricated.
-3. ``df.describe`` — mean / median / min / max / std for every numeric column.
+3. ``df.describe`` — mean / median / min / max / std for every numeric column,
   plus its **unit** (same LLM source) so the stats read in context.
 The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
 ``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
 of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
 only **consumes** it, matching by column name; it never calls the LLM nor
 recomputes anything. When the block is absent (``run_llm`` did not run) those
 cells degrade to ``"—"`` and the tables still render.
 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
@@ -20,13 +30,59 @@ from __future__ import annotations
 from .. import model
-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"
 # Profile/ctx keys the calculation phase must add for a full head + examples.
 HEAD_KEY = "head_rows"          # list[dict] — df.head(n)
 EXAMPLES_KEY = "examples"       # per column: list of non-null sample values
 LLM_KEY = "llm"                 # interpretive block from eda_llm_insights
 def _llm_dict_index(profile: dict, ctx: dict) -> dict:
    """Map column name -> its LLM dictionary entry (description/unit/...).
    Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the
    profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty
    dict when no LLM block ran, so the caller degrades to "—" cells. Fully
    defensive: never raises on malformed input.
    """
    llm = profile.get(LLM_KEY)
    if not isinstance(llm, dict):
        llm = ctx.get(LLM_KEY)
    if not isinstance(llm, dict):
        return {}
    entries = llm.get("dictionary")
    if not isinstance(entries, (list, tuple)):
        return {}
    index: dict = {}
    for e in entries:
        if not isinstance(e, dict):
            continue
        col = e.get("column")
        if col is None:
            continue
        index[model._safe_str(col)] = e
    return index
 def _llm_desc(entry) -> str:
    """Business description of a column from its LLM entry, or "—"."""
    if not isinstance(entry, dict):
        return "—"
    raw = entry.get("description") or entry.get("business_meaning")
    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
    return text or "—"
 def _llm_unit(entry) -> str:
    """Unit of a column from its LLM entry, or "—"."""
    if not isinstance(entry, dict):
        return "—"
    raw = entry.get("unit")
    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
    return text or "—"
 def _fmt_num(value, decimals: int = 3) -> str:
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")
-def _columns_block(profile: dict):
+def _columns_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"]
+    # Descripción / Unidad come from the LLM dictionary (matched by column name);
    # they read "—" when run_llm did not run, so the table always renders.
    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
              "Descripción", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict):
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
            nulls = str(null_count)
        else:
            nulls = "—"
-        rows.append([name, ctype, nulls, _examples_for(c)])
+        entry = llm_index.get(model._safe_str(name))
        rows.append([name, ctype, nulls, _examples_for(c),
                     _llm_desc(entry), _llm_unit(entry)])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows, title="Columnas")
-def _describe_block(profile: dict):
+def _describe_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "mean", "median", "min", "max", "std"]
+    # "Unidad" (LLM source) lets the reader know in which unit each stat is.
    header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
        num = c.get("numeric") or {}
        if not num:
            continue
        name = c.get("name") or "(col)"
        entry = llm_index.get(model._safe_str(name))
        rows.append([
-            c.get("name") or "(col)",
+            name,
            _fmt_num(num.get("mean")),
            _fmt_num(num.get("median")),
            _fmt_num(num.get("min")),
            _fmt_num(num.get("max")),
            _fmt_num(num.get("std")),
            _llm_unit(entry),
        ])
    if not rows:
        return None
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
    if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
        return None
    llm_index = _llm_dict_index(profile, ctx)
    blocks = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]
-    cols_block = _columns_block(profile)
+    cols_block = _columns_block(profile, llm_index)
    if cols_block is not None:
        blocks.append(model.Heading(
            text="Diccionario de columnas", level=2))
        blocks.append(cols_block)
-    desc_block = _describe_block(profile)
+    desc_block = _describe_block(profile, llm_index)
    if desc_block is not None:
        blocks.append(model.Heading(
            text="Resumen estadístico numérico", level=2))
@@ -56,7 +56,21 @@ def _head_rows() -> list:
    ]
-def _profile(with_head: bool = True) -> dict:
+def _llm() -> dict:
    """Interpretive block as eda_llm_insights stores it under profile['llm']."""
    return {
        "summary": "Pasajeros del Titanic.",
        "dictionary": [
            {"column": "PassengerId", "description": "Identificador del pasajero",
             "business_meaning": "Clave única de cada pasajero", "unit": "id"},
            {"column": "Pclass", "description": "Clase del billete",
             "business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"},
            # No entry for Survived/Name/Sex on purpose -> they degrade to "—".
        ],
    }
 def _profile(with_head: bool = True, with_llm: bool = False) -> dict:
    prof = {
        "table": "titanic",
        "source": "/data/titanic.csv",
@@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict:
    }
    if with_head:
        prof["head_rows"] = _head_rows()
    if with_llm:
        prof["llm"] = _llm()
    return prof
@@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen():
    assert ch is not None
    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
    assert tables and len(tables[0].rows) == 3
 def _table_by_header(blocks, marker: str):
    """Find the first DataTable among the flattened blocks whose header
    contains ``marker``; return None when no table matches."""
    matching = (
        blk for blk in _flatten(blocks)
        if isinstance(blk, DataTable) and marker in blk.header
    )
    return next(matching, None)
 def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm():
    """With an LLM block in the profile, the column dictionary carries the
    "Descripción" and "Unidad" cells matched by column name."""
    chapter = build_overview(_profile(with_llm=True), {})
    assert chapter is not None
    table = _table_by_header(chapter.blocks, "Descripción")
    assert table is not None
    expected_header = [
        "Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
        "Descripción", "Unidad",
    ]
    assert table.header == expected_header
    rows = {r[0]: r for r in table.rows}
    # Columns that have an LLM entry expose its description and unit.
    passenger = rows["PassengerId"]
    assert passenger[4] == "Identificador del pasajero"
    assert passenger[5] == "id"
    assert rows["Pclass"][5] == "clase (1-3)"
    # A column without an LLM entry keeps its row and shows the placeholder.
    survived = rows["Survived"]
    assert survived[4] == "—"
    assert survived[5] == "—"
 def test_golden_describe_lleva_unidad_del_llm():
    """The describe table ends with a "Unidad" column fed by the LLM block."""
    ch = build_overview(_profile(with_llm=True), {})
    # Fail with a clear assertion (not an AttributeError on ``ch.blocks``)
    # if the chapter is unexpectedly not built.
    assert ch is not None
    desc = _table_by_header(ch.blocks, "std")
    assert desc is not None
    assert desc.header[-1] == "Unidad"
    by_name = {row[0]: row for row in desc.rows}
    assert by_name["PassengerId"][-1] == "id"
    assert by_name["Pclass"][-1] == "clase (1-3)"
    # Numeric column with no LLM unit still renders, unit "—".
    assert by_name["Survived"][-1] == "—"
 def test_edge_sin_llm_descripcion_unidad_son_guion():
    """Without any LLM block the new cells read "—" and both tables render."""
    # No profile['llm'] at all: the new cells degrade to "—" and nothing breaks.
    ch = build_overview(_profile(), {})
    assert ch is not None
    dic = _table_by_header(ch.blocks, "Unidad")
    assert dic is not None
    for row in dic.rows:
        assert row[4] == "—" and row[5] == "—"
    desc = _table_by_header(ch.blocks, "std")
    # The describe table must exist and have rows; otherwise the all() below
    # would either crash on None or pass vacuously on an empty table.
    assert desc is not None
    assert desc.rows
    assert all(row[-1] == "—" for row in desc.rows)
 def test_golden_llm_via_ctx_tambien_funciona():
    """The LLM block is also consumed when it arrives through ctx['llm']."""
    # LLM block arriving through ctx['llm'] (fallback path) is consumed too.
    ch = build_overview(_profile(with_llm=False), {"llm": _llm()})
    # Explicit guards so a missing chapter/table fails as an assertion, not
    # as an AttributeError on None.
    assert ch is not None
    dic = _table_by_header(ch.blocks, "Descripción")
    assert dic is not None
    by_name = {row[0]: row for row in dic.rows}
    assert by_name["PassengerId"][5] == "id"
 def test_golden_render_pdf_muestra_descripcion_y_unidad():
    """The rendered PDF text shows the new column titles and an LLM
    description taken from the profile."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "eda.pdf")
        render_automatic_eda_pdf(
            _profile(with_llm=True), pdf_path, {"title": "EDA"})
        extracted = _pdf_text(pdf_path)
        # Checked in the same order as the original chained assertions.
        for expected in ("Descripción", "Unidad",
                         "Identificador del pasajero"):
            assert expected in extracted