diff --git a/python/functions/datascience/automatic_eda/chapters/overview.py b/python/functions/datascience/automatic_eda/chapters/overview.py index f3dc8b53..67b709b4 100644 --- a/python/functions/datascience/automatic_eda/chapters/overview.py +++ b/python/functions/datascience/automatic_eda/chapters/overview.py @@ -7,11 +7,21 @@ as needed, the renderers paginate): NOT carry the raw head, so this is read from ``ctx['head_rows']`` / ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows an honest placeholder documenting the missing key instead of inventing data. -2. Column dictionary — name / type / nulls / non-null examples. Examples come +2. Column dictionary — name / type / nulls / non-null examples plus, when the + LLM layer ran, the business **description** and **unit** of each column so the + reader knows at a glance what every column is and in which unit. Examples come from ``columns[i]['examples']`` when present; otherwise they are derived from real non-null profile values (categorical top values, numeric min/median/max) so the cell is never empty nor fabricated. -3. ``df.describe`` — mean / median / min / max / std for every numeric column. +3. ``df.describe`` — mean / median / min / max / std for every numeric column, + plus its **unit** (same LLM source) so the stats read in context. + +The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group +``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list +of ``{"column","description","business_meaning","unit"}`` entries) — this chapter +only **consumes** it, matching by column name; it never calls the LLM nor +recomputes anything. When the block is absent (``run_llm`` did not run) those +cells degrade to ``"—"`` and the tables still render. Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". """ @@ -20,13 +30,59 @@ from __future__ import annotations from .. import model -CHAPTER_VERSION = "1.1.0" +CHAPTER_VERSION = "1.2.0" CHAPTER_ID = "overview" CHAPTER_TITLE = "Overview" # Profile/ctx keys the calculation phase must add for a full head + examples. HEAD_KEY = "head_rows" # list[dict] — df.head(n) EXAMPLES_KEY = "examples" # per column: list of non-null sample values +LLM_KEY = "llm" # interpretive block from eda_llm_insights + + +def _llm_dict_index(profile: dict, ctx: dict) -> dict: + """Map column name -> its LLM dictionary entry (description/unit/...). + + Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the + profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty + dict when no LLM block ran, so the caller degrades to "—" cells. Fully + defensive: never raises on malformed input. + """ + llm = profile.get(LLM_KEY) + if not isinstance(llm, dict): + llm = ctx.get(LLM_KEY) + if not isinstance(llm, dict): + return {} + entries = llm.get("dictionary") + if not isinstance(entries, (list, tuple)): + return {} + index: dict = {} + for e in entries: + if not isinstance(e, dict): + continue + col = e.get("column") + if col is None: + continue + index[model._safe_str(col)] = e + return index + + +def _llm_desc(entry) -> str: + """Business description of a column from its LLM entry, or "—".""" + if not isinstance(entry, dict): + return "—" + raw = entry.get("description") or entry.get("business_meaning") + text = " ".join(model._safe_str(raw).split()) if raw is not None else "" + return text or "—" + + +def _llm_unit(entry) -> str: + """Unit of a column from its LLM entry, or "—".""" + if not isinstance(entry, dict): + return "—" + raw = entry.get("unit") + text = " ".join(model._safe_str(raw).split()) if raw is not None else "" + return text or "—" def _fmt_num(value, decimals: int = 3) -> str: @@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict): "pasarlo en ctx['head_rows'] para mostrar las primeras filas.") -def _columns_block(profile: dict): +def _columns_block(profile: dict, llm_index: dict): cols = profile.get("columns") or [] - header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"] + # Descripción / Unidad come from the LLM dictionary (matched by column name); + # they read "—" when run_llm did not run, so the table always renders. + header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)", + "Descripción", "Unidad"] rows = [] for c in cols: if not isinstance(c, dict): @@ -126,15 +185,18 @@ def _columns_block(profile: dict): nulls = str(null_count) else: nulls = "—" - rows.append([name, ctype, nulls, _examples_for(c)]) + entry = llm_index.get(model._safe_str(name)) + rows.append([name, ctype, nulls, _examples_for(c), + _llm_desc(entry), _llm_unit(entry)]) if not rows: return None return model.DataTable(header=header, rows=rows, title="Columnas") -def _describe_block(profile: dict): +def _describe_block(profile: dict, llm_index: dict): cols = profile.get("columns") or [] - header = ["Columna", "mean", "median", "min", "max", "std"] + # "Unidad" (LLM source) lets the reader know in which unit each stat is. + header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"] rows = [] for c in cols: if not isinstance(c, dict) or c.get("inferred_type") != "numeric": @@ -142,13 +204,16 @@ def _describe_block(profile: dict): num = c.get("numeric") or {} if not num: continue + name = c.get("name") or "(col)" + entry = llm_index.get(model._safe_str(name)) rows.append([ - c.get("name") or "(col)", + name, _fmt_num(num.get("mean")), _fmt_num(num.get("median")), _fmt_num(num.get("min")), _fmt_num(num.get("max")), _fmt_num(num.get("std")), + _llm_unit(entry), ]) if not rows: return None @@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict): if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)): return None + llm_index = _llm_dict_index(profile, ctx) + blocks = [ model.Heading(text="Primeras filas (df.head)", level=2), _head_block(profile, ctx), ] - cols_block = _columns_block(profile) + cols_block = _columns_block(profile, llm_index) if cols_block is not None: blocks.append(model.Heading( text="Diccionario de columnas", level=2)) blocks.append(cols_block) - desc_block = _describe_block(profile) + desc_block = _describe_block(profile, llm_index) if desc_block is not None: blocks.append(model.Heading( text="Resumen estadístico numérico", level=2)) diff --git a/python/functions/datascience/automatic_eda/chapters/overview_test.py b/python/functions/datascience/automatic_eda/chapters/overview_test.py index b66263a1..0f9d985a 100644 --- a/python/functions/datascience/automatic_eda/chapters/overview_test.py +++ b/python/functions/datascience/automatic_eda/chapters/overview_test.py @@ -56,7 +56,21 @@ def _head_rows() -> list: ] -def _profile(with_head: bool = True) -> dict: +def _llm() -> dict: + """Interpretive block as eda_llm_insights stores it under profile['llm'].""" + return { + "summary": "Pasajeros del Titanic.", + "dictionary": [ + {"column": "PassengerId", "description": "Identificador del pasajero", + "business_meaning": "Clave única de cada pasajero", "unit": "id"}, + {"column": "Pclass", "description": "Clase del billete", + "business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"}, + # No entry for Survived/Name/Sex on purpose -> they degrade to "—". + ], + } + + +def _profile(with_head: bool = True, with_llm: bool = False) -> dict: prof = { "table": "titanic", "source": "/data/titanic.csv", @@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict: } if with_head: prof["head_rows"] = _head_rows() + if with_llm: + prof["llm"] = _llm() return prof @@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen(): assert ch is not None tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)] assert tables and len(tables[0].rows) == 3 + + +def _table_by_header(blocks, marker: str): + """Return the first DataTable whose header contains ``marker``.""" + for b in _flatten(blocks): + if isinstance(b, DataTable) and marker in b.header: + return b + return None + + +def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm(): + # With run_llm: the column dictionary gains "Descripción" and "Unidad" + # columns populated from profile['llm']['dictionary'], matched by name. + ch = build_overview(_profile(with_llm=True), {}) + assert ch is not None + dic = _table_by_header(ch.blocks, "Descripción") + assert dic is not None + assert dic.header == ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)", + "Descripción", "Unidad"] + by_name = {row[0]: row for row in dic.rows} + # PassengerId has an LLM entry -> description + unit populated. + assert by_name["PassengerId"][4] == "Identificador del pasajero" + assert by_name["PassengerId"][5] == "id" + assert by_name["Pclass"][5] == "clase (1-3)" + # Columns with no LLM entry degrade to "—" without breaking the row. + assert by_name["Survived"][4] == "—" and by_name["Survived"][5] == "—" + + +def test_golden_describe_lleva_unidad_del_llm(): + ch = build_overview(_profile(with_llm=True), {}) + desc = _table_by_header(ch.blocks, "std") + assert desc is not None + assert desc.header[-1] == "Unidad" + by_name = {row[0]: row for row in desc.rows} + assert by_name["PassengerId"][-1] == "id" + assert by_name["Pclass"][-1] == "clase (1-3)" + # Numeric column with no LLM unit still renders, unit "—". + assert by_name["Survived"][-1] == "—" + + +def test_edge_sin_llm_descripcion_unidad_son_guion(): + # No profile['llm'] at all: the new cells degrade to "—" and nothing breaks. + ch = build_overview(_profile(), {}) + assert ch is not None + dic = _table_by_header(ch.blocks, "Unidad") + assert dic is not None + for row in dic.rows: + assert row[4] == "—" and row[5] == "—" + desc = _table_by_header(ch.blocks, "std") + assert all(row[-1] == "—" for row in desc.rows) + + +def test_golden_llm_via_ctx_tambien_funciona(): + # LLM block arriving through ctx['llm'] (fallback path) is consumed too. + ch = build_overview(_profile(with_llm=False), {"llm": _llm()}) + dic = _table_by_header(ch.blocks, "Descripción") + by_name = {row[0]: row for row in dic.rows} + assert by_name["PassengerId"][5] == "id" + + +def test_golden_render_pdf_muestra_descripcion_y_unidad(): + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pdf") + render_automatic_eda_pdf(_profile(with_llm=True), out, {"title": "EDA"}) + txt = _pdf_text(out) + assert "Descripción" in txt and "Unidad" in txt + assert "Identificador del pasajero" in txt