feat(eda): poblar head_rows real en el capitulo OVERVIEW (df.head)

El capítulo OVERVIEW del motor AutomaticEDA mostraba "df.head no disponible" porque ninguna fase de cálculo poblaba las primeras filas crudas de la tabla.

- build_eda_render_ctx: nuevo bloque que muestrea SELECT * LIMIT head_n (parámetro nuevo head_n=10) y lo expone en ctx["head_rows"] como lista de dicts fila. Estilo dict-no-throw: si la query falla, se omite la clave.
- profile_table: puebla prof["head_rows"] reusando _sample_rows (SELECT de las columnas LIMIT 10) tras recalcular el type_breakdown. Así el report JSON sidecar también lo lleva y el capítulo lo recoge vía profile aunque no se construya el ctx.
- overview.py: la nota del DataTable de df.head ahora indica el total de filas del dataset cuando se conoce ("primeras 10 filas de 891"). Bump CHAPTER_VERSION 1.0.0 -> 1.1.0.
- overview_test.py (nuevo): golden (head vía profile y vía ctx, render PDF + PPTX muestran las filas reales, placeholder ausente), edge (sin head_rows degrada a nota honesta sin romper, None/vacío devuelven None).

Verificado end-to-end con titanic: render_automatic_eda emite PDF + PPTX con df.head visible (Braund/Cumings/Heikkinen + columnas) y sin el placeholder.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 17:56:24 +02:00
parent c6d9bc26da
commit b1d205203a
4 changed files with 239 additions and 7 deletions
@@ -20,7 +20,7 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"

@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
        if not cols:
            cols = list(head[0].keys())
        rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
-        return model.DataTable(header=cols, rows=rows,
-                               note=f"primeras {len(rows)} filas")
+        # Honest note: how many rows are shown and, when known, out of how many
+        # rows the dataset has (so "primeras 10 filas de 891" gives context).
+        note = f"primeras {len(rows)} filas"
+        n_rows = profile.get("n_rows")
+        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
+                and n_rows > len(rows):
+            note += f" de {n_rows:,}".replace(",", ".")
+        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
@@ -0,0 +1,187 @@
+"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies that ``build_overview`` renders the raw first rows
+(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
+via ``profile['head_rows']`` (populated by ``profile_table``) and via
+``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
+also renders the column dictionary and the numeric describe, that the full
+document renders to PDF and PPTX showing the head values, and that a profile with
+NO head data degrades to an honest note instead of raising or inventing rows.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import DataTable, Note
+from datascience.automatic_eda.chapters.overview import (
+    CHAPTER_ID, CHAPTER_VERSION, build_overview,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _columns() -> list:
+    return [
+        {"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.0}},
+        {"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
+                                      "max": 1.0, "std": 0.58}},
+        {"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.15}},
+        {"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 3},
+        {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 2,
+         "categorical": {"top": [{"value": "male", "count": 2},
+                                 {"value": "female", "count": 1}]}},
+    ]
+
+
+def _head_rows() -> list:
+    return [
+        {"PassengerId": 1, "Survived": 0, "Pclass": 3,
+         "Name": "Braund Owen", "Sex": "male"},
+        {"PassengerId": 2, "Survived": 1, "Pclass": 1,
+         "Name": "Cumings Florence", "Sex": "female"},
+        {"PassengerId": 3, "Survived": 1, "Pclass": 3,
+         "Name": "Heikkinen Laina", "Sex": "female"},
+    ]
+
+
+def _profile(with_head: bool = True) -> dict:
+    prof = {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891,
+        "n_cols": 5,
+        "quality_score": 88.0,
+        "columns": _columns(),
+    }
+    if with_head:
+        prof["head_rows"] = _head_rows()
+    return prof
+
+
+def _pdf_text(path: str) -> str:
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                tb = sh.table
+                for r in range(len(tb.rows)):
+                    for c in range(len(tb.columns)):
+                        parts.append(tb.cell(r, c).text)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _flatten(blocks):
+    """Recursively flatten Group blocks into a flat list (none here today)."""
+    out = []
+    for b in blocks:
+        inner = getattr(b, "blocks", None)
+        if inner is not None and getattr(b, "kind", None) == "group":
+            out.extend(_flatten(inner))
+        else:
+            out.append(b)
+    return out
+
+
+def test_golden_build_overview_muestra_head_desde_profile():
+    ch = build_overview(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+    blocks = _flatten(ch.blocks)
+    # The first DataTable is df.head: its header is the column names and the
+    # real first rows are present (not a placeholder note).
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert tables, "overview must emit at least the df.head DataTable"
+    head_tbl = tables[0]
+    assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
+                               "Name", "Sex"]
+    assert len(head_tbl.rows) == 3
+    flat = [str(c) for row in head_tbl.rows for c in row]
+    assert "Braund Owen" in flat and "Cumings Florence" in flat
+    # Honest note carries how many rows shown out of the dataset total.
+    assert head_tbl.note is not None
+    assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note
+    # No "df.head no disponible" placeholder when head_rows is present.
+    assert not any(isinstance(b, Note) and "no disponible" in b.text
+                   for b in blocks)
+
+
+def test_golden_head_desde_ctx_tambien_funciona():
+    # head_rows absent in profile but present in ctx (build_eda_render_ctx path).
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    flat = [str(c) for row in tables[0].rows for c in row]
+    assert "Braund Owen" in flat
+
+
+def test_golden_render_pdf_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pdf_text(out)
+        assert "Braund" in txt and "male" in txt
+        assert "primeras" in txt          # head note rendered.
+        assert "df.head" in txt           # chapter heading rendered.
+        assert "no disponible" not in txt  # placeholder NOT shown.
+
+
+def test_golden_render_pptx_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pptx_text(out)
+        assert "Braund" in txt and "Cumings" in txt
+
+
+def test_edge_sin_head_rows_degrada_a_nota_honesta():
+    # No head data anywhere: chapter still builds (columns exist), shows the
+    # honest placeholder note, and never invents rows nor raises.
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {})
+    assert ch is not None
+    blocks = _flatten(ch.blocks)
+    assert any(isinstance(b, Note) and "no disponible" in b.text
+               for b in blocks)
+    # The first DataTable now is the column dictionary, not df.head rows.
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert all("Braund" not in str(c)
+               for tbl in tables for row in tbl.rows for c in row)
+
+
+def test_edge_none_y_vacio_no_rompen():
+    # Nothing to render at all -> None, no raise.
+    assert build_overview(None, None) is None
+    assert build_overview({}, {}) is None
+    assert build_overview({"columns": []}, {}) is None
+    # Only head_rows (no columns) still yields a chapter with the head table.
+    ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    assert tables and len(tables[0].rows) == 3
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
 menos ``{**base_ctx, "db_path": db_path, "table": table}``.

 Claves de DATOS que produce (las consumen los capitulos):
+  - ``head_rows``      : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
+                         tabla (``SELECT * LIMIT head_n``), una entrada por fila.
+                         La lee el capitulo OVERVIEW para mostrar df.head real en
+                         lugar del placeholder "df.head no disponible".
  - ``raw_numeric``    : {col: [float|None, ...]} muestra cruda de las columnas
                         numericas, ALINEADA POR FILA (una entrada por fila aunque
                         sea None). La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
        return None


-def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
+def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
    """Construye el ctx de datos crudos para los renderers de AutomaticEDA.

    Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        base_ctx: dict opcional con claves de presentacion ya preparadas
            (dataset_name, source_origin, ...). Se parte de una copia y NO se
            pisan sus claves; solo se añaden las de datos. Default None -> {}.
+        head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
+            (df.head del capitulo OVERVIEW). Default 10. None o <=0 omite la clave.

    Returns:
        El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
        cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
-        Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
-        geo_points (omitidas si no aplican o fallan), y siempre db_path + table
-        para backends validos.
+        Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
+        timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
+        db_path + table para backends validos.
    """
    # Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
    # presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        ctx["db_path"] = db_path
        ctx["table"] = table

+        # 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
+        # para que el capitulo OVERVIEW muestre df.head real en vez del
+        # placeholder. Una sola query, dict-no-throw: si falla, se omite la
+        # clave (el capitulo degrada a su nota honesta). No se pisa una clave
+        # head_rows que ya viniera en base_ctx (presentacion).
+        if head_n and int(head_n) > 0 and "head_rows" not in ctx:
+            try:
+                hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
+                if isinstance(hq, dict) and hq.get("status") == "ok":
+                    hrows = [
+                        dict(r) for r in (hq.get("rows") or [])
+                        if isinstance(r, dict)
+                    ]
+                    if hrows:
+                        ctx["head_rows"] = hrows
+            except Exception:  # noqa: BLE001 - dict-no-throw: omitir la clave
+                pass
+
        # 2) Columnas del perfil agregado (lectura defensiva).
        cols = profile.get("columns") if isinstance(profile, dict) else None
        cols = cols or []