Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 64306f3b1c | |||
| f2eb782a5f |
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
|
||||
NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
|
||||
``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
|
||||
an honest placeholder documenting the missing key instead of inventing data.
|
||||
2. Column dictionary — name / type / nulls / non-null examples. Examples come
|
||||
2. Column dictionary — name / type / nulls / non-null examples plus, when the
|
||||
LLM layer ran, the business **description** and **unit** of each column so the
|
||||
reader knows at a glance what every column is and in which unit. Examples come
|
||||
from ``columns[i]['examples']`` when present; otherwise they are derived from
|
||||
real non-null profile values (categorical top values, numeric min/median/max)
|
||||
so the cell is never empty nor fabricated.
|
||||
3. ``df.describe`` — mean / median / min / max / std for every numeric column.
|
||||
3. ``df.describe`` — mean / median / min / max / std for every numeric column,
|
||||
plus its **unit** (same LLM source) so the stats read in context.
|
||||
|
||||
The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
|
||||
``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
|
||||
of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
|
||||
only **consumes** it, matching by column name; it never calls the LLM nor
|
||||
recomputes anything. When the block is absent (``run_llm`` did not run) those
|
||||
cells degrade to ``"—"`` and the tables still render.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
@@ -20,13 +30,59 @@ from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "overview"
|
||||
CHAPTER_TITLE = "Overview"
|
||||
|
||||
# Profile/ctx keys the calculation phase must add for a full head + examples.
|
||||
HEAD_KEY = "head_rows" # list[dict] — df.head(n)
|
||||
EXAMPLES_KEY = "examples" # per column: list of non-null sample values
|
||||
LLM_KEY = "llm" # interpretive block from eda_llm_insights
|
||||
|
||||
|
||||
def _llm_dict_index(profile: dict, ctx: dict) -> dict:
|
||||
"""Map column name -> its LLM dictionary entry (description/unit/...).
|
||||
|
||||
Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the
|
||||
profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty
|
||||
dict when no LLM block ran, so the caller degrades to "—" cells. Fully
|
||||
defensive: never raises on malformed input.
|
||||
"""
|
||||
llm = profile.get(LLM_KEY)
|
||||
if not isinstance(llm, dict):
|
||||
llm = ctx.get(LLM_KEY)
|
||||
if not isinstance(llm, dict):
|
||||
return {}
|
||||
entries = llm.get("dictionary")
|
||||
if not isinstance(entries, (list, tuple)):
|
||||
return {}
|
||||
index: dict = {}
|
||||
for e in entries:
|
||||
if not isinstance(e, dict):
|
||||
continue
|
||||
col = e.get("column")
|
||||
if col is None:
|
||||
continue
|
||||
index[model._safe_str(col)] = e
|
||||
return index
|
||||
|
||||
|
||||
def _llm_desc(entry) -> str:
|
||||
"""Business description of a column from its LLM entry, or "—"."""
|
||||
if not isinstance(entry, dict):
|
||||
return "—"
|
||||
raw = entry.get("description") or entry.get("business_meaning")
|
||||
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
|
||||
return text or "—"
|
||||
|
||||
|
||||
def _llm_unit(entry) -> str:
|
||||
"""Unit of a column from its LLM entry, or "—"."""
|
||||
if not isinstance(entry, dict):
|
||||
return "—"
|
||||
raw = entry.get("unit")
|
||||
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
|
||||
return text or "—"
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
|
||||
"pasarlo en ctx['head_rows'] para mostrar las primeras filas.")
|
||||
|
||||
|
||||
def _columns_block(profile: dict):
|
||||
def _columns_block(profile: dict, llm_index: dict):
|
||||
cols = profile.get("columns") or []
|
||||
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"]
|
||||
# Descripción / Unidad come from the LLM dictionary (matched by column name);
|
||||
# they read "—" when run_llm did not run, so the table always renders.
|
||||
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
|
||||
"Descripción", "Unidad"]
|
||||
rows = []
|
||||
for c in cols:
|
||||
if not isinstance(c, dict):
|
||||
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
|
||||
nulls = str(null_count)
|
||||
else:
|
||||
nulls = "—"
|
||||
rows.append([name, ctype, nulls, _examples_for(c)])
|
||||
entry = llm_index.get(model._safe_str(name))
|
||||
rows.append([name, ctype, nulls, _examples_for(c),
|
||||
_llm_desc(entry), _llm_unit(entry)])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows, title="Columnas")
|
||||
|
||||
|
||||
def _describe_block(profile: dict):
|
||||
def _describe_block(profile: dict, llm_index: dict):
|
||||
cols = profile.get("columns") or []
|
||||
header = ["Columna", "mean", "median", "min", "max", "std"]
|
||||
# "Unidad" (LLM source) lets the reader know in which unit each stat is.
|
||||
header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
|
||||
rows = []
|
||||
for c in cols:
|
||||
if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
|
||||
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
|
||||
num = c.get("numeric") or {}
|
||||
if not num:
|
||||
continue
|
||||
name = c.get("name") or "(col)"
|
||||
entry = llm_index.get(model._safe_str(name))
|
||||
rows.append([
|
||||
c.get("name") or "(col)",
|
||||
name,
|
||||
_fmt_num(num.get("mean")),
|
||||
_fmt_num(num.get("median")),
|
||||
_fmt_num(num.get("min")),
|
||||
_fmt_num(num.get("max")),
|
||||
_fmt_num(num.get("std")),
|
||||
_llm_unit(entry),
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
|
||||
if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
|
||||
return None
|
||||
|
||||
llm_index = _llm_dict_index(profile, ctx)
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Primeras filas (df.head)", level=2),
|
||||
_head_block(profile, ctx),
|
||||
]
|
||||
cols_block = _columns_block(profile)
|
||||
cols_block = _columns_block(profile, llm_index)
|
||||
if cols_block is not None:
|
||||
blocks.append(model.Heading(
|
||||
text="Diccionario de columnas", level=2))
|
||||
blocks.append(cols_block)
|
||||
desc_block = _describe_block(profile)
|
||||
desc_block = _describe_block(profile, llm_index)
|
||||
if desc_block is not None:
|
||||
blocks.append(model.Heading(
|
||||
text="Resumen estadístico numérico", level=2))
|
||||
|
||||
@@ -56,7 +56,21 @@ def _head_rows() -> list:
|
||||
]
|
||||
|
||||
|
||||
def _profile(with_head: bool = True) -> dict:
|
||||
def _llm() -> dict:
|
||||
"""Interpretive block as eda_llm_insights stores it under profile['llm']."""
|
||||
return {
|
||||
"summary": "Pasajeros del Titanic.",
|
||||
"dictionary": [
|
||||
{"column": "PassengerId", "description": "Identificador del pasajero",
|
||||
"business_meaning": "Clave única de cada pasajero", "unit": "id"},
|
||||
{"column": "Pclass", "description": "Clase del billete",
|
||||
"business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"},
|
||||
# No entry for Survived/Name/Sex on purpose -> they degrade to "—".
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _profile(with_head: bool = True, with_llm: bool = False) -> dict:
|
||||
prof = {
|
||||
"table": "titanic",
|
||||
"source": "/data/titanic.csv",
|
||||
@@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict:
|
||||
}
|
||||
if with_head:
|
||||
prof["head_rows"] = _head_rows()
|
||||
if with_llm:
|
||||
prof["llm"] = _llm()
|
||||
return prof
|
||||
|
||||
|
||||
@@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen():
|
||||
assert ch is not None
|
||||
tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
|
||||
assert tables and len(tables[0].rows) == 3
|
||||
|
||||
|
||||
def _table_by_header(blocks, marker: str):
    """Find the first flattened ``DataTable`` whose header lists ``marker``.

    Returns ``None`` when no table matches.
    """
    candidates = (
        blk for blk in _flatten(blocks)
        if isinstance(blk, DataTable) and marker in blk.header
    )
    return next(candidates, None)
|
||||
|
||||
|
||||
def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm():
    """With an LLM block, the column dictionary shows description and unit."""
    expected_header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
                       "Descripción", "Unidad"]

    chapter = build_overview(_profile(with_llm=True), {})
    assert chapter is not None

    table = _table_by_header(chapter.blocks, "Descripción")
    assert table is not None
    assert table.header == expected_header

    rows = {r[0]: r for r in table.rows}

    # Columns present in profile['llm']['dictionary'] are matched by name.
    passenger = rows["PassengerId"]
    assert passenger[4] == "Identificador del pasajero"
    assert passenger[5] == "id"
    assert rows["Pclass"][5] == "clase (1-3)"

    # A column without an LLM entry keeps its row and shows placeholders.
    survived = rows["Survived"]
    assert survived[4] == "—"
    assert survived[5] == "—"
|
||||
|
||||
|
||||
def test_golden_describe_lleva_unidad_del_llm():
    """The numeric summary table ends with the LLM-sourced "Unidad" column."""
    ch = build_overview(_profile(with_llm=True), {})
    # Guard before dereferencing so a missing chapter fails as an assertion,
    # not as an AttributeError on ``None``.
    assert ch is not None
    desc = _table_by_header(ch.blocks, "std")
    assert desc is not None
    assert desc.header[-1] == "Unidad"
    by_name = {row[0]: row for row in desc.rows}
    assert by_name["PassengerId"][-1] == "id"
    assert by_name["Pclass"][-1] == "clase (1-3)"
    # Numeric column with no LLM unit still renders, unit "—".
    assert by_name["Survived"][-1] == "—"
|
||||
|
||||
|
||||
def test_edge_sin_llm_descripcion_unidad_son_guion():
    """Without any ``profile['llm']`` the new cells read "—" and nothing breaks."""
    ch = build_overview(_profile(), {})
    assert ch is not None
    dic = _table_by_header(ch.blocks, "Unidad")
    assert dic is not None
    for row in dic.rows:
        assert row[4] == "—" and row[5] == "—"
    desc = _table_by_header(ch.blocks, "std")
    # Guard before reading ``desc.rows`` so a missing numeric table is
    # reported as an assertion failure rather than an AttributeError.
    assert desc is not None
    assert all(row[-1] == "—" for row in desc.rows)
|
||||
|
||||
|
||||
def test_golden_llm_via_ctx_tambien_funciona():
    """An LLM block supplied through ``ctx['llm']`` is consumed as well."""
    ch = build_overview(_profile(with_llm=False), {"llm": _llm()})
    # Guard both lookups so a regression fails as an assertion instead of an
    # AttributeError on ``None``.
    assert ch is not None
    dic = _table_by_header(ch.blocks, "Descripción")
    assert dic is not None
    by_name = {row[0]: row for row in dic.rows}
    assert by_name["PassengerId"][5] == "id"
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_descripcion_y_unidad():
    """The rendered PDF text carries the new headers and an LLM description."""
    profile = _profile(with_llm=True)
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        render_automatic_eda_pdf(profile, pdf_path, {"title": "EDA"})
        # Extract the text while the temporary directory still exists.
        extracted = _pdf_text(pdf_path)
    assert "Descripción" in extracted
    assert "Unidad" in extracted
    assert "Identificador del pasajero" in extracted
|
||||
|
||||
Reference in New Issue
Block a user