diff --git a/python/functions/datascience/automatic_eda/chapters/overview.py b/python/functions/datascience/automatic_eda/chapters/overview.py
index 93b25b52..f3dc8b53 100644
--- a/python/functions/datascience/automatic_eda/chapters/overview.py
+++ b/python/functions/datascience/automatic_eda/chapters/overview.py
@@ -20,7 +20,7 @@ from __future__ import annotations
 
 from .. import model
 
-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"
 
@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
         if not cols:
             cols = list(head[0].keys())
         rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
-        return model.DataTable(header=cols, rows=rows,
-                               note=f"primeras {len(rows)} filas")
+        # Honest note: how many rows are shown and, when known, out of how many
+        # rows the dataset has (so "primeras 10 filas de 891" gives context).
+        note = f"primeras {len(rows)} filas"
+        n_rows = profile.get("n_rows")
+        if (isinstance(n_rows, int) and not isinstance(n_rows, bool)
+                and n_rows > len(rows)):
+            note += f" de {n_rows:,}".replace(",", ".")
+        return model.DataTable(header=cols, rows=rows, note=note)
     return model.Note(
         "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
         "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
diff --git a/python/functions/datascience/automatic_eda/chapters/overview_test.py b/python/functions/datascience/automatic_eda/chapters/overview_test.py
new file mode 100644
index 00000000..b66263a1
--- /dev/null
+++ b/python/functions/datascience/automatic_eda/chapters/overview_test.py
@@ -0,0 +1,187 @@
+"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies that ``build_overview`` renders the raw first rows
+(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
+via ``profile['head_rows']`` (populated by ``profile_table``) and via
+``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
+also renders the column dictionary and the numeric describe, that the full
+document renders to PDF and PPTX showing the head values, and that a profile with
+NO head data degrades to an honest note instead of raising or inventing rows.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import DataTable, Note
+from datascience.automatic_eda.chapters.overview import (
+    CHAPTER_ID, CHAPTER_VERSION, build_overview,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _columns() -> list:
+    return [
+        {"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.0}},
+        {"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
+                                      "max": 1.0, "std": 0.58}},
+        {"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.15}},
+        {"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 3},
+        {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 2,
+         "categorical": {"top": [{"value": "male", "count": 2},
+                                 {"value": "female", "count": 1}]}},
+    ]
+
+
+def _head_rows() -> list:
+    return [
+        {"PassengerId": 1, "Survived": 0, "Pclass": 3,
+         "Name": "Braund Owen", "Sex": "male"},
+        {"PassengerId": 2, "Survived": 1, "Pclass": 1,
+         "Name": "Cumings Florence", "Sex": "female"},
+        {"PassengerId": 3, "Survived": 1, "Pclass": 3,
+         "Name": "Heikkinen Laina", "Sex": "female"},
+    ]
+
+
+def _profile(with_head: bool = True) -> dict:
+    prof = {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891,
+        "n_cols": 5,
+        "quality_score": 88.0,
+        "columns": _columns(),
+    }
+    if with_head:
+        prof["head_rows"] = _head_rows()
+    return prof
+
+
+def _pdf_text(path: str) -> str:
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                tb = sh.table
+                for row in tb.rows:
+                    for cell in row.cells:
+                        parts.append(cell.text)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _flatten(blocks):
+    """Recursively flatten Group blocks into a flat list (none here today)."""
+    out = []
+    for b in blocks:
+        inner = getattr(b, "blocks", None)
+        if inner is not None and getattr(b, "kind", None) == "group":
+            out.extend(_flatten(inner))
+        else:
+            out.append(b)
+    return out
+
+
+def test_golden_build_overview_muestra_head_desde_profile():
+    ch = build_overview(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+    blocks = _flatten(ch.blocks)
+    # The first DataTable is df.head: its header is the column names and the
+    # real first rows are present (not a placeholder note).
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert tables, "overview must emit at least the df.head DataTable"
+    head_tbl = tables[0]
+    assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
+                               "Name", "Sex"]
+    assert len(head_tbl.rows) == 3
+    flat = [str(c) for row in head_tbl.rows for c in row]
+    assert "Braund Owen" in flat and "Cumings Florence" in flat
+    # Honest note carries how many rows shown out of the dataset total.
+    assert head_tbl.note is not None
+    assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note
+    # No "df.head no disponible" placeholder when head_rows is present.
+    assert not any(isinstance(b, Note) and "no disponible" in b.text
+                   for b in blocks)
+
+
+def test_golden_head_desde_ctx_tambien_funciona():
+    # head_rows absent in profile but present in ctx (build_eda_render_ctx path).
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    flat = [str(c) for row in tables[0].rows for c in row]
+    assert "Braund Owen" in flat
+
+
+def test_golden_render_pdf_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pdf_text(out)
+        assert "Braund" in txt and "male" in txt
+        assert "primeras" in txt  # head note rendered.
+        assert "df.head" in txt  # chapter heading rendered.
+        assert "no disponible" not in txt  # placeholder NOT shown.
+
+
+def test_golden_render_pptx_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pptx_text(out)
+        assert "Braund" in txt and "Cumings" in txt
+
+
+def test_edge_sin_head_rows_degrada_a_nota_honesta():
+    # No head data anywhere: chapter still builds (columns exist), shows the
+    # honest placeholder note, and never invents rows nor raises.
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {})
+    assert ch is not None
+    blocks = _flatten(ch.blocks)
+    assert any(isinstance(b, Note) and "no disponible" in b.text
+               for b in blocks)
+    # The first DataTable now is the column dictionary, not df.head rows.
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert all("Braund" not in str(c)
+               for tbl in tables for row in tbl.rows for c in row)
+
+
+def test_edge_none_y_vacio_no_rompen():
+    # Nothing to render at all -> None, no raise.
+    assert build_overview(None, None) is None
+    assert build_overview({}, {}) is None
+    assert build_overview({"columns": []}, {}) is None
+    # Only head_rows (no columns) still yields a chapter with the head table.
+    ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    assert tables and len(tables[0].rows) == 3
diff --git a/python/functions/datascience/build_eda_render_ctx.py b/python/functions/datascience/build_eda_render_ctx.py
index efcda2cb..68959abf 100644
--- a/python/functions/datascience/build_eda_render_ctx.py
+++ b/python/functions/datascience/build_eda_render_ctx.py
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
 menos ``{**base_ctx, "db_path": db_path, "table": table}``.
 
 Claves de DATOS que produce (las consumen los capitulos):
+- ``head_rows``      : [ {col: value, ...}, ... ] first RAW rows of the table
+                       (``SELECT * LIMIT head_n``), one entry per row. Read by
+                       the OVERVIEW chapter to show the real df.head instead
+                       of the "df.head no disponible" placeholder.
 - ``raw_numeric``    : {col: [float|None, ...]} muestra cruda de las columnas
                        numericas, ALINEADA POR FILA (una entrada por fila aunque sea None).
                        La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
         return None
 
 
-def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
+def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
     """Construye el ctx de datos crudos para los renderers de AutomaticEDA.
 
     Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
         base_ctx: dict opcional con claves de presentacion ya preparadas
             (dataset_name, source_origin, ...). Se parte de una copia y NO se
             pisan sus claves; solo se añaden las de datos. Default None -> {}.
+        head_n: raw rows to sample for ``ctx["head_rows"]`` (OVERVIEW df.head).
+            Default 10. <=0 or a non-numeric value omits the key.
 
     Returns:
         El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
         cual como ``meta={"ctx": }`` a render_automatic_eda_pdf/pptx.
-        Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
-        geo_points (omitidas si no aplican o fallan), y siempre db_path + table
-        para backends validos.
+        Never raises. Keys it may contain: head_rows, raw_numeric,
+        timeseries_raw, geo_points (omitted when not applicable or on failure),
+        and always db_path + table for valid backends.
     """
     # Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
     # presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,30 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
     ctx["db_path"] = db_path
     ctx["table"] = table
 
+    # 1.5) head_rows: first RAW rows of the table (SELECT * LIMIT n) so the
+    #      OVERVIEW chapter shows the real df.head instead of its placeholder.
+    #      Single query, dict-no-throw: on any failure (including a head_n
+    #      that is not numeric) the key is omitted and the chapter degrades
+    #      to its honest note. A head_rows key already present in base_ctx
+    #      (presentation) is never overwritten.
+    if head_n and "head_rows" not in ctx:
+        try:
+            limit = int(head_n)  # parsed inside the try: must never raise
+            if limit > 0:
+                # Double embedded quotes so the table name stays a single
+                # quoted identifier.
+                ident = str(table).replace('"', '""')
+                hq = query_fn(f'SELECT * FROM "{ident}" LIMIT {limit}')
+                if isinstance(hq, dict) and hq.get("status") == "ok":
+                    hrows = [
+                        dict(r) for r in (hq.get("rows") or [])
+                        if isinstance(r, dict)
+                    ]
+                    if hrows:
+                        ctx["head_rows"] = hrows
+        except Exception:  # noqa: BLE001 - dict-no-throw: skip the key
+            pass
+
     # 2) Columnas del perfil agregado (lectura defensiva).
     cols = profile.get("columns") if isinstance(profile, dict) else None
     cols = cols or []
diff --git a/python/functions/pipelines/profile_table.py b/python/functions/pipelines/profile_table.py
index 8a0077af..84912d11 100644
--- a/python/functions/pipelines/profile_table.py
+++ b/python/functions/pipelines/profile_table.py
@@ -536,6 +536,21 @@ def profile_table(
             type_breakdown[it] += 1
     prof["type_breakdown"] = type_breakdown
 
+    # 8.1) First raw rows (df.head) for the OVERVIEW chapter of the
+    # AutomaticEDA engine: a row-aligned SELECT col1,col2,... LIMIT 10 sample.
+    # Reuses _sample_rows (same read-only reader). dict-no-throw style: on
+    # failure head_rows stays None and the chapter degrades to its honest note.
+    # The chapter picks it up via profile["head_rows"]; build_eda_render_ctx
+    # additionally fills ctx["head_rows"] when the render context is built.
+    try:
+        head_names = [c.get("name") for c in cols if c.get("name")]
+        head_rows = _sample_rows(_q, table, head_names, 10)
+        prof["head_rows"] = [
+            dict(r) for r in head_rows if isinstance(r, dict)
+        ] or None
+    except Exception:  # noqa: BLE001
+        prof["head_rows"] = None
+
     # 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
     # alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
     # Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.