feat(eda): overview enriquece diccionario y describe con descripción+unidad del LLM

La tabla DICCIONARIO de columnas del capítulo overview gana las columnas "Descripción" y "Unidad", y la tabla DESCRIBE gana "Unidad", consumiendo profile['llm']['dictionary'] (entradas column/description/business_meaning/unit producidas por eda_llm_insights) emparejadas por nombre de columna. Lectura defensiva: sin bloque LLM (run_llm no corrió) las celdas degradan a "—" y las tablas siguen renderizando. No recalcula nada ni llama al LLM. CHAPTER_VERSION 1.1.0 -> 1.2.0. Tests: golden (descripción+unidad pobladas para income), edge (sin LLM -> "—"), fallback ctx['llm'], y render PDF con las columnas nuevas visibles.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-01 01:13:02 +02:00
parent f2eb782a5f
commit 64306f3b1c
2 changed files with 162 additions and 12 deletions
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
   NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
   ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
   an honest placeholder documenting the missing key instead of inventing data.
-2. Column dictionary — name / type / nulls / non-null examples. Examples come
+2. Column dictionary — name / type / nulls / non-null examples plus, when the
+   LLM layer ran, the business **description** and **unit** of each column so the
+   reader knows at a glance what every column is and in which unit. Examples come
   from ``columns[i]['examples']`` when present; otherwise they are derived from
   real non-null profile values (categorical top values, numeric min/median/max)
   so the cell is never empty nor fabricated.
-3. ``df.describe`` — mean / median / min / max / std for every numeric column.
+3. ``df.describe`` — mean / median / min / max / std for every numeric column,
+   plus its **unit** (same LLM source) so the stats read in context.
+
+The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
+``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
+of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
+only **consumes** it, matching by column name; it never calls the LLM nor
+recomputes anything. When the block is absent (``run_llm`` did not run) those
+cells degrade to ``"—"`` and the tables still render.

 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
@@ -20,13 +30,59 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"

 # Profile/ctx keys the calculation phase must add for a full head + examples.
 HEAD_KEY = "head_rows"          # list[dict] — df.head(n)
 EXAMPLES_KEY = "examples"       # per column: list of non-null sample values
+LLM_KEY = "llm"                 # interpretive block from eda_llm_insights
+
+
+def _llm_dict_index(profile: dict, ctx: dict) -> dict:
+    """Map column name -> its LLM dictionary entry (description/unit/...).
+
+    Reads ``profile['llm']['dictionary']`` (``ctx['llm']`` is used only when
+    ``profile['llm']`` is not a dict). Non-dict entries and entries without a
+    ``column`` are skipped; a repeated column keeps its last entry. Returns
+    ``{}`` when no usable block exists. ``profile``/``ctx`` must be dicts.
+    """
+    llm = profile.get(LLM_KEY)
+    if not isinstance(llm, dict):  # no usable block in the profile
+        llm = ctx.get(LLM_KEY)
+    if not isinstance(llm, dict):  # neither source has an LLM block
+        return {}
+    entries = llm.get("dictionary")
+    if not isinstance(entries, (list, tuple)):  # missing or malformed list
+        return {}
+    index: dict = {}
+    for e in entries:
+        if not isinstance(e, dict):
+            continue
+        col = e.get("column")
+        if col is None:
+            continue
+        index[model._safe_str(col)] = e  # later duplicates overwrite earlier
+    return index
+
+
+def _llm_desc(entry) -> str:
+    """Description (else business_meaning) of an LLM entry, or "—" if none."""
+    if not isinstance(entry, dict):  # column absent from the LLM index
+        return "—"
+    raw = entry.get("description") or entry.get("business_meaning")
+    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
+    return text or "—"
+
+
+def _llm_unit(entry) -> str:
+    """Unit of a column from its LLM entry, whitespace-collapsed, or "—"."""
+    if not isinstance(entry, dict):  # column absent from the LLM index
+        return "—"
+    raw = entry.get("unit")
+    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
+    return text or "—"  # an empty/blank unit also reads as "—"


 def _fmt_num(value, decimals: int = 3) -> str:
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")


-def _columns_block(profile: dict):
+def _columns_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"]
+    # Descripción / Unidad come from the LLM dictionary (matched by column name);
+    # they read "—" when run_llm did not run, so the table always renders.
+    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
+              "Descripción", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict):
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
            nulls = str(null_count)
        else:
            nulls = "—"
-        rows.append([name, ctype, nulls, _examples_for(c)])
+        entry = llm_index.get(model._safe_str(name))
+        rows.append([name, ctype, nulls, _examples_for(c),
+                     _llm_desc(entry), _llm_unit(entry)])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows, title="Columnas")


-def _describe_block(profile: dict):
+def _describe_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
-    header = ["Columna", "mean", "median", "min", "max", "std"]
+    # "Unidad" (LLM source) lets the reader know in which unit each stat is.
+    header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
        num = c.get("numeric") or {}
        if not num:
            continue
+        name = c.get("name") or "(col)"
+        entry = llm_index.get(model._safe_str(name))
        rows.append([
-            c.get("name") or "(col)",
+            name,
            _fmt_num(num.get("mean")),
            _fmt_num(num.get("median")),
            _fmt_num(num.get("min")),
            _fmt_num(num.get("max")),
            _fmt_num(num.get("std")),
+            _llm_unit(entry),
        ])
    if not rows:
        return None
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
    if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
        return None

+    llm_index = _llm_dict_index(profile, ctx)
+
    blocks = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]
-    cols_block = _columns_block(profile)
+    cols_block = _columns_block(profile, llm_index)
    if cols_block is not None:
        blocks.append(model.Heading(
            text="Diccionario de columnas", level=2))
        blocks.append(cols_block)
-    desc_block = _describe_block(profile)
+    desc_block = _describe_block(profile, llm_index)
    if desc_block is not None:
        blocks.append(model.Heading(
            text="Resumen estadístico numérico", level=2))