feat(eda): overview enriquece diccionario y describe con descripcion+unidad del LLM

La tabla DICCIONARIO de columnas del capitulo overview gana columnas
"Descripcion" y "Unidad", y la tabla DESCRIBE gana "Unidad", consumiendo
profile['llm']['dictionary'] (entradas column/description/business_meaning/unit
producidas por eda_llm_insights) emparejadas por nombre de columna. Lectura
defensiva: sin bloque LLM (run_llm no corrio) las celdas degradan a "—" y las
tablas siguen renderizando. No recalcula nada ni llama al LLM.

CHAPTER_VERSION 1.1.0 -> 1.2.0. Tests: golden (descripcion+unidad pobladas para
income), edge (sin LLM -> "—"), fallback ctx['llm'], y render PDF con las
columnas nuevas visibles.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-01 01:13:02 +02:00
parent f2eb782a5f
commit 64306f3b1c
2 changed files with 162 additions and 12 deletions
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
NOT carry the raw head, so this is read from ``ctx['head_rows']`` / NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
``profile['head_rows']`` (a list of row dicts). When absent the chapter shows ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
an honest placeholder documenting the missing key instead of inventing data. an honest placeholder documenting the missing key instead of inventing data.
2. Column dictionary — name / type / nulls / non-null examples. Examples come 2. Column dictionary — name / type / nulls / non-null examples plus, when the
LLM layer ran, the business **description** and **unit** of each column so the
reader knows at a glance what every column is and in which unit. Examples come
from ``columns[i]['examples']`` when present; otherwise they are derived from from ``columns[i]['examples']`` when present; otherwise they are derived from
real non-null profile values (categorical top values, numeric min/median/max) real non-null profile values (categorical top values, numeric min/median/max)
so the cell is never empty nor fabricated. so the cell is never empty nor fabricated.
3. ``df.describe`` — mean / median / min / max / std for every numeric column. 3. ``df.describe`` — mean / median / min / max / std for every numeric column,
plus its **unit** (same LLM source) so the stats read in context.
The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
only **consumes** it, matching by column name; it never calls the LLM nor
recomputes anything. When the block is absent (``run_llm`` did not run) those
cells degrade to ``""`` and the tables still render.
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
""" """
@@ -20,13 +30,59 @@ from __future__ import annotations
from .. import model from .. import model
CHAPTER_VERSION = "1.1.0" CHAPTER_VERSION = "1.2.0"
CHAPTER_ID = "overview" CHAPTER_ID = "overview"
CHAPTER_TITLE = "Overview" CHAPTER_TITLE = "Overview"
# Profile/ctx keys the calculation phase must add for a full head + examples. # Profile/ctx keys the calculation phase must add for a full head + examples.
HEAD_KEY = "head_rows" # list[dict] — df.head(n) HEAD_KEY = "head_rows" # list[dict] — df.head(n)
EXAMPLES_KEY = "examples" # per column: list of non-null sample values EXAMPLES_KEY = "examples" # per column: list of non-null sample values
LLM_KEY = "llm" # interpretive block from eda_llm_insights
def _llm_dict_index(profile: dict, ctx: dict) -> dict:
"""Map column name -> its LLM dictionary entry (description/unit/...).
Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the
profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty
dict when no LLM block ran, so the caller degrades to "" cells. Fully
defensive: never raises on malformed input.
"""
llm = profile.get(LLM_KEY)
if not isinstance(llm, dict):
llm = ctx.get(LLM_KEY)
if not isinstance(llm, dict):
return {}
entries = llm.get("dictionary")
if not isinstance(entries, (list, tuple)):
return {}
index: dict = {}
for e in entries:
if not isinstance(e, dict):
continue
col = e.get("column")
if col is None:
continue
index[model._safe_str(col)] = e
return index
def _llm_desc(entry) -> str:
"""Business description of a column from its LLM entry, or ""."""
if not isinstance(entry, dict):
return ""
raw = entry.get("description") or entry.get("business_meaning")
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
return text or ""
def _llm_unit(entry) -> str:
"""Unit of a column from its LLM entry, or ""."""
if not isinstance(entry, dict):
return ""
raw = entry.get("unit")
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
return text or ""
def _fmt_num(value, decimals: int = 3) -> str: def _fmt_num(value, decimals: int = 3) -> str:
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
"pasarlo en ctx['head_rows'] para mostrar las primeras filas.") "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")
def _columns_block(profile: dict): def _columns_block(profile: dict, llm_index: dict):
cols = profile.get("columns") or [] cols = profile.get("columns") or []
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"] # Descripción / Unidad come from the LLM dictionary (matched by column name);
# they read "—" when run_llm did not run, so the table always renders.
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
"Descripción", "Unidad"]
rows = [] rows = []
for c in cols: for c in cols:
if not isinstance(c, dict): if not isinstance(c, dict):
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
nulls = str(null_count) nulls = str(null_count)
else: else:
nulls = "" nulls = ""
rows.append([name, ctype, nulls, _examples_for(c)]) entry = llm_index.get(model._safe_str(name))
rows.append([name, ctype, nulls, _examples_for(c),
_llm_desc(entry), _llm_unit(entry)])
if not rows: if not rows:
return None return None
return model.DataTable(header=header, rows=rows, title="Columnas") return model.DataTable(header=header, rows=rows, title="Columnas")
def _describe_block(profile: dict): def _describe_block(profile: dict, llm_index: dict):
cols = profile.get("columns") or [] cols = profile.get("columns") or []
header = ["Columna", "mean", "median", "min", "max", "std"] # "Unidad" (LLM source) lets the reader know in which unit each stat is.
header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
rows = [] rows = []
for c in cols: for c in cols:
if not isinstance(c, dict) or c.get("inferred_type") != "numeric": if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
num = c.get("numeric") or {} num = c.get("numeric") or {}
if not num: if not num:
continue continue
name = c.get("name") or "(col)"
entry = llm_index.get(model._safe_str(name))
rows.append([ rows.append([
c.get("name") or "(col)", name,
_fmt_num(num.get("mean")), _fmt_num(num.get("mean")),
_fmt_num(num.get("median")), _fmt_num(num.get("median")),
_fmt_num(num.get("min")), _fmt_num(num.get("min")),
_fmt_num(num.get("max")), _fmt_num(num.get("max")),
_fmt_num(num.get("std")), _fmt_num(num.get("std")),
_llm_unit(entry),
]) ])
if not rows: if not rows:
return None return None
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)): if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
return None return None
llm_index = _llm_dict_index(profile, ctx)
blocks = [ blocks = [
model.Heading(text="Primeras filas (df.head)", level=2), model.Heading(text="Primeras filas (df.head)", level=2),
_head_block(profile, ctx), _head_block(profile, ctx),
] ]
cols_block = _columns_block(profile) cols_block = _columns_block(profile, llm_index)
if cols_block is not None: if cols_block is not None:
blocks.append(model.Heading( blocks.append(model.Heading(
text="Diccionario de columnas", level=2)) text="Diccionario de columnas", level=2))
blocks.append(cols_block) blocks.append(cols_block)
desc_block = _describe_block(profile) desc_block = _describe_block(profile, llm_index)
if desc_block is not None: if desc_block is not None:
blocks.append(model.Heading( blocks.append(model.Heading(
text="Resumen estadístico numérico", level=2)) text="Resumen estadístico numérico", level=2))
@@ -56,7 +56,21 @@ def _head_rows() -> list:
] ]
def _profile(with_head: bool = True) -> dict: def _llm() -> dict:
"""Interpretive block as eda_llm_insights stores it under profile['llm']."""
return {
"summary": "Pasajeros del Titanic.",
"dictionary": [
{"column": "PassengerId", "description": "Identificador del pasajero",
"business_meaning": "Clave única de cada pasajero", "unit": "id"},
{"column": "Pclass", "description": "Clase del billete",
"business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"},
# No entry for Survived/Name/Sex on purpose -> they degrade to "—".
],
}
def _profile(with_head: bool = True, with_llm: bool = False) -> dict:
prof = { prof = {
"table": "titanic", "table": "titanic",
"source": "/data/titanic.csv", "source": "/data/titanic.csv",
@@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict:
} }
if with_head: if with_head:
prof["head_rows"] = _head_rows() prof["head_rows"] = _head_rows()
if with_llm:
prof["llm"] = _llm()
return prof return prof
@@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen():
assert ch is not None assert ch is not None
tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)] tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
assert tables and len(tables[0].rows) == 3 assert tables and len(tables[0].rows) == 3
def _table_by_header(blocks, marker: str):
    """Find the first flattened DataTable whose header list includes ``marker``.

    Returns ``None`` when no table matches.
    """
    matches = (
        blk for blk in _flatten(blocks)
        if isinstance(blk, DataTable) and marker in blk.header
    )
    return next(matches, None)
def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm():
    """Column dictionary carries description + unit taken from profile['llm']."""
    chapter = build_overview(_profile(with_llm=True), {})
    assert chapter is not None

    table = _table_by_header(chapter.blocks, "Descripción")
    assert table is not None
    expected_header = [
        "Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
        "Descripción", "Unidad",
    ]
    assert table.header == expected_header

    rows = {r[0]: r for r in table.rows}

    # Columns present in the LLM dictionary: cells are filled, matched by name.
    passenger = rows["PassengerId"]
    assert passenger[4] == "Identificador del pasajero"
    assert passenger[5] == "id"
    assert rows["Pclass"][5] == "clase (1-3)"

    # Column absent from the LLM dictionary: both cells fall back to the
    # placeholder value asserted here and the row is still emitted.
    survived = rows["Survived"]
    assert survived[4] == ""
    assert survived[5] == ""
def test_golden_describe_lleva_unidad_del_llm():
    """The describe table ends with a "Unidad" column fed by the LLM block."""
    chapter = build_overview(_profile(with_llm=True), {})
    table = _table_by_header(chapter.blocks, "std")
    assert table is not None
    assert table.header[-1] == "Unidad"

    # Last cell of every row is the unit; index it by column name.
    unit_of = {r[0]: r[-1] for r in table.rows}
    assert unit_of["PassengerId"] == "id"
    assert unit_of["Pclass"] == "clase (1-3)"
    # A numeric column without an LLM unit is still listed, with the
    # placeholder value asserted here.
    assert unit_of["Survived"] == ""
def test_edge_sin_llm_descripcion_unidad_son_guion():
    """Without any profile['llm'] the new cells hold the placeholder value."""
    chapter = build_overview(_profile(), {})
    assert chapter is not None

    # First table exposing "Unidad" is looked up; its description (index 4)
    # and unit (index 5) cells must all be the placeholder.
    dictionary = _table_by_header(chapter.blocks, "Unidad")
    assert dictionary is not None
    assert all(r[4] == "" and r[5] == "" for r in dictionary.rows)

    # Same expectation for the trailing unit cell of the describe table.
    describe = _table_by_header(chapter.blocks, "std")
    for r in describe.rows:
        assert r[-1] == ""
def test_golden_llm_via_ctx_tambien_funciona():
    """An LLM block supplied only through ctx['llm'] is consumed as well."""
    context = {"llm": _llm()}
    chapter = build_overview(_profile(with_llm=False), context)
    table = _table_by_header(chapter.blocks, "Descripción")
    units = {r[0]: r[5] for r in table.rows}
    assert units["PassengerId"] == "id"
def test_golden_render_pdf_muestra_descripcion_y_unidad():
    """Rendered PDF text shows the new column titles and an LLM description."""
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        render_automatic_eda_pdf(
            _profile(with_llm=True), pdf_path, {"title": "EDA"})
        # Extract the text while the temporary directory still exists.
        text = _pdf_text(pdf_path)
    assert "Descripción" in text
    assert "Unidad" in text
    assert "Identificador del pasajero" in text