64306f3b1c
La tabla DICCIONARIO de columnas del capitulo overview gana columnas "Descripcion" y "Unidad", y la tabla DESCRIBE gana "Unidad", consumiendo profile['llm']['dictionary'] (entradas column/description/business_meaning/unit producidas por eda_llm_insights) emparejadas por nombre de columna. Lectura defensiva: sin bloque LLM (run_llm no corrio) las celdas degradan a "—" y las tablas siguen renderizando. No recalcula nada ni llama al LLM. CHAPTER_VERSION 1.1.0 -> 1.2.0. Tests: golden (descripcion+unidad pobladas para income), edge (sin LLM -> "—"), fallback ctx['llm'], y render PDF con las columnas nuevas visibles. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
250 lines
9.5 KiB
Python
250 lines
9.5 KiB
Python
"""Overview chapter — df.head, column dictionary and describe (reference).
|
|
|
|
Second reference chapter for AutomaticEDA. Renders (across as many pages/slides
|
|
as needed, the renderers paginate):
|
|
|
|
1. ``df.head`` — the first rows of the table. The current ``TableProfile`` does
|
|
NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
|
|
``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
|
|
an honest placeholder documenting the missing key instead of inventing data.
|
|
2. Column dictionary — name / type / nulls / non-null examples plus, when the
|
|
LLM layer ran, the business **description** and **unit** of each column so the
|
|
reader knows at a glance what every column is and in which unit. Examples come
|
|
from ``columns[i]['examples']`` when present; otherwise they are derived from
|
|
real non-null profile values (categorical top values, numeric min/median/max)
|
|
so the cell is never empty nor fabricated.
|
|
3. ``df.describe`` — mean / median / min / max / std for every numeric column,
|
|
plus its **unit** (same LLM source) so the stats read in context.
|
|
|
|
The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
|
|
``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
|
|
of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
|
|
only **consumes** it, matching by column name; it never calls the LLM nor
|
|
recomputes anything. When the block is absent (``run_llm`` did not run) those
|
|
cells degrade to ``"—"`` and the tables still render.
|
|
|
|
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from .. import model
|
|
|
|
# Version string of this chapter; the module contract requires the "x.y.z" form.
CHAPTER_VERSION = "1.2.0"
# Identifier and title handed to model.Chapter by build_overview.
CHAPTER_ID = "overview"
CHAPTER_TITLE = "Overview"

# Profile/ctx keys the calculation phase must add for a full head + examples.
HEAD_KEY = "head_rows"  # list[dict] — df.head(n)
EXAMPLES_KEY = "examples"  # per column: list of non-null sample values
LLM_KEY = "llm"  # interpretive block from eda_llm_insights
|
|
|
|
|
|
def _llm_dict_index(profile: dict, ctx: dict) -> dict:
    """Map column name -> its LLM dictionary entry (description/unit/...).

    Looks for an ``llm`` block in ``profile`` first and in ``ctx`` second, and
    indexes the first one whose ``"dictionary"`` value is a list or tuple. A
    source that is not a dict, has no ``llm`` dict, or whose ``llm`` block has
    no usable ``dictionary`` is skipped, so a partial block in the profile does
    not hide a complete one passed through ``ctx``.

    Args:
        profile: Table profile; may be ``None`` or malformed.
        ctx: Render context; may be ``None`` or malformed.

    Returns:
        ``{column_name: entry}`` built from the dict entries that carry a
        non-None ``"column"`` key (later duplicates overwrite earlier ones),
        or an empty dict when no usable LLM dictionary is found, so callers
        degrade to "—" cells.
    """
    for source in (profile, ctx):
        # Guard every level: ``None`` / non-dict inputs must not raise here.
        if not isinstance(source, dict):
            continue
        llm = source.get(LLM_KEY)
        if not isinstance(llm, dict):
            continue
        entries = llm.get("dictionary")
        if not isinstance(entries, (list, tuple)):
            continue
        index: dict = {}
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            col = entry.get("column")
            if col is None:
                continue
            # Same normalisation the table builders apply before the lookup.
            index[model._safe_str(col)] = entry
        return index
    return {}
|
|
|
|
|
|
def _llm_desc(entry) -> str:
|
|
"""Business description of a column from its LLM entry, or "—"."""
|
|
if not isinstance(entry, dict):
|
|
return "—"
|
|
raw = entry.get("description") or entry.get("business_meaning")
|
|
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
|
|
return text or "—"
|
|
|
|
|
|
def _llm_unit(entry) -> str:
|
|
"""Unit of a column from its LLM entry, or "—"."""
|
|
if not isinstance(entry, dict):
|
|
return "—"
|
|
raw = entry.get("unit")
|
|
text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
|
|
return text or "—"
|
|
|
|
|
|
def _fmt_num(value, decimals: int = 3) -> str:
|
|
if value is None:
|
|
return "—"
|
|
if isinstance(value, bool):
|
|
return str(value)
|
|
if isinstance(value, int):
|
|
return f"{value:,}".replace(",", ".")
|
|
if isinstance(value, float):
|
|
if value != value: # NaN
|
|
return "NaN"
|
|
if value in (float("inf"), float("-inf")):
|
|
return str(value)
|
|
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
|
return text if text else "0"
|
|
return str(value)
|
|
|
|
|
|
def _fmt_pct(value, decimals: int = 1) -> str:
|
|
if value is None:
|
|
return "—"
|
|
try:
|
|
return f"{float(value) * 100:.{decimals}f}%"
|
|
except (TypeError, ValueError):
|
|
return str(value)
|
|
|
|
|
|
def _examples_for(col: dict) -> str:
    """Return a short, comma-separated sample of values for one column.

    Sources are tried in order and the first that yields text wins:
    the column's explicit example list (first four values), the ``"value"`` of
    the first four categorical ``top`` entries, then the numeric
    min / median / max. "—" is returned when none of them yields anything.
    """
    samples = col.get(EXAMPLES_KEY)
    if isinstance(samples, (list, tuple)) and samples:
        return ", ".join(map(model._safe_str, samples[:4]))

    top_values = (col.get("categorical") or {}).get("top") or []
    rendered = []
    for item in top_values[:4]:
        if not isinstance(item, dict):
            continue
        text = model._safe_str(item.get("value"))
        if text:  # drop entries that render as an empty string
            rendered.append(text)
    if rendered:
        return ", ".join(rendered)

    stats = col.get("numeric") or {}
    candidates = (stats.get(key) for key in ("min", "median", "max"))
    parts = [_fmt_num(v) for v in candidates if v is not None]
    return ", ".join(parts) if parts else "—"
|
|
|
|
|
|
def _head_block(profile: dict, ctx: dict):
    """Return a DataTable for df.head, or a Note documenting the missing key.

    The head rows are read from ``ctx`` first and from ``profile`` second. At
    most ten rows are shown. Columns follow the order of the profile's
    ``columns`` list, followed by any keys that appear in the shown rows but
    not in the profile, in first-seen order.

    Args:
        profile: Table profile (``columns``, ``n_rows`` and optionally the
            head rows).
        ctx: Render context that may carry the head rows.

    Returns:
        A ``model.DataTable`` when the head is a non-empty list whose first
        item is a dict; otherwise a ``model.Note`` explaining which key is
        missing.
    """
    head = ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)
    if isinstance(head, list) and head and isinstance(head[0], dict):
        # Only the first row was validated above; skip later malformed rows
        # rather than crash on ``.get``.
        shown = [r for r in head[:10] if isinstance(r, dict)]
        # Column order from the profile, then any extra keys present in rows.
        cols = [c.get("name") for c in (profile.get("columns") or [])
                if isinstance(c, dict) and c.get("name")]
        seen = set(cols)
        for row in shown:
            for key in row:
                if key not in seen:
                    seen.add(key)
                    cols.append(key)
        rows = [[model._safe_str(r.get(c)) for c in cols] for r in shown]
        # Honest note: how many rows are shown and, when known, out of how many
        # rows the dataset has (so "primeras 10 filas de 891" gives context).
        note = f"primeras {len(rows)} filas"
        n_rows = profile.get("n_rows")
        # bool is an int subclass; a True/False n_rows is not a row count.
        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
                and n_rows > len(rows):
            note += f" de {n_rows:,}".replace(",", ".")
        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")
|
|
|
|
|
|
def _columns_block(profile: dict, llm_index: dict):
    """Build the column-dictionary table, or return None when it has no rows.

    One row per dict entry of ``profile['columns']``: name, type (with the
    semantic type in parentheses when present), nulls, examples, and the
    description / unit looked up in ``llm_index`` by column name. The last two
    read "—" when the index has no entry for the column.
    """
    table_rows = []
    for column in profile.get("columns") or []:
        if not isinstance(column, dict):
            continue
        label = column.get("name") or "(col)"

        kind = column.get("inferred_type") or column.get("physical_type") or "—"
        semantic = column.get("semantic_type")
        if semantic:
            kind = f"{kind} ({semantic})"

        # Nulls cell: percentage, optionally followed by the raw count; the
        # count alone when no percentage exists; a dash when neither does.
        pct = column.get("null_pct")
        count = column.get("null_count")
        if pct is None:
            null_cell = "—" if count is None else str(count)
        else:
            null_cell = _fmt_pct(pct)
            if count is not None:
                null_cell += f" ({count})"

        llm_entry = llm_index.get(model._safe_str(label))
        table_rows.append([
            label,
            kind,
            null_cell,
            _examples_for(column),
            _llm_desc(llm_entry),
            _llm_unit(llm_entry),
        ])

    if table_rows:
        return model.DataTable(
            header=["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
                    "Descripción", "Unidad"],
            rows=table_rows,
            title="Columnas",
        )
    return None
|
|
|
|
|
|
def _describe_block(profile: dict, llm_index: dict):
    """Build the describe-style statistics table, or None when it has no rows.

    Only dict columns whose ``inferred_type`` is ``"numeric"`` and that carry a
    non-empty ``numeric`` mapping produce a row: name, mean, median, min, max,
    std and the unit looked up in ``llm_index`` by column name.
    """
    stat_keys = ("mean", "median", "min", "max", "std")
    table_rows = []
    for column in profile.get("columns") or []:
        if not isinstance(column, dict):
            continue
        if column.get("inferred_type") != "numeric":
            continue
        stats = column.get("numeric") or {}
        if not stats:
            continue
        label = column.get("name") or "(col)"
        llm_entry = llm_index.get(model._safe_str(label))
        cells = [label]
        cells.extend(_fmt_num(stats.get(key)) for key in stat_keys)
        # The unit column tells the reader in which unit each stat is expressed.
        cells.append(_llm_unit(llm_entry))
        table_rows.append(cells)

    if table_rows:
        return model.DataTable(
            header=["Columna", *stat_keys, "Unidad"],
            rows=table_rows,
            title="Estadística (describe)",
        )
    return None
|
|
|
|
|
|
def build_overview(profile: dict, ctx: dict):
    """Assemble the Overview chapter.

    Returns None when the profile has no columns and neither ``ctx`` nor the
    profile carries head rows. Otherwise the chapter always starts with the
    df.head section, followed by the column dictionary and the numeric summary
    whenever their builders return a table.
    """
    profile = profile or {}
    ctx = ctx or {}

    has_content = (profile.get("columns")
                   or ctx.get(HEAD_KEY)
                   or profile.get(HEAD_KEY))
    if not has_content:
        return None

    llm_index = _llm_dict_index(profile, ctx)

    blocks = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]

    # Optional sections: each one is emitted only when its builder has rows.
    optional_sections = (
        ("Diccionario de columnas", _columns_block),
        ("Resumen estadístico numérico", _describe_block),
    )
    for heading_text, builder in optional_sections:
        table = builder(profile, llm_index)
        if table is None:
            continue
        blocks.append(model.Heading(text=heading_text, level=2))
        blocks.append(table)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|