merge: 4b head_rows — overview muestra df.head (build_eda_render_ctx pobla head_rows, verificado met)

2026-06-30 18:04:51 +02:00
parent 7fb00defdf b1d205203a
commit f2ac734ef7
4 changed files with 239 additions and 7 deletions
@@ -20,7 +20,7 @@ from __future__ import annotations
 from .. import model
-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"
@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
        if not cols:
            cols = list(head[0].keys())
        rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
-        return model.DataTable(header=cols, rows=rows,
+        # Honest note: how many rows are shown and, when known, out of how many
-                               note=f"primeras {len(rows)} filas")
+        # rows the dataset has (so "primeras 10 filas de 891" gives context).
        note = f"primeras {len(rows)} filas"
        n_rows = profile.get("n_rows")
        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
                and n_rows > len(rows):
            note += f" de {n_rows:,}".replace(",", ".")
        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
@@ -0,0 +1,187 @@
 """Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
 Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
 and deterministic. Verifies that ``build_overview`` renders the raw first rows
 (``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
 via ``profile['head_rows']`` (populated by ``profile_table``) and via
 ``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
 also renders the column dictionary and the numeric describe, that the full
 document renders to PDF and PPTX showing the head values, and that a profile with
 NO head data degrades to an honest note instead of raising or inventing rows.
 """
 import os
 import re
 import tempfile
 from pypdf import PdfReader
 from pptx import Presentation
 from datascience.automatic_eda.model import DataTable, Note
 from datascience.automatic_eda.chapters.overview import (
    CHAPTER_ID, CHAPTER_VERSION, build_overview,
 )
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
 def _columns() -> list:
    """Return the synthetic per-column profiles shared by the OVERVIEW tests.

    Three numeric columns (each with a ``numeric`` stats dict) come first,
    followed by two categorical columns; only ``Sex`` carries a
    ``categorical.top`` frequency list.
    """
    def numeric(name, mean, median, low, high, std):
        # Common shape of a null-free numeric column entry.
        return {
            "name": name, "inferred_type": "numeric", "null_pct": 0.0,
            "null_count": 0,
            "numeric": {"mean": mean, "median": median, "min": low,
                        "max": high, "std": std},
        }
    def categorical(name, distinct, **extra):
        # Common shape of a null-free categorical entry plus optional keys.
        return {"name": name, "inferred_type": "categorical", "null_pct": 0.0,
                "null_count": 0, "distinct_count": distinct, **extra}
    sex_top = [{"value": "male", "count": 2}, {"value": "female", "count": 1}]
    return [
        numeric("PassengerId", 2.0, 2.0, 1.0, 3.0, 1.0),
        numeric("Survived", 0.33, 0.0, 0.0, 1.0, 0.58),
        numeric("Pclass", 2.33, 3.0, 1.0, 3.0, 1.15),
        categorical("Name", 3),
        categorical("Sex", 2, categorical={"top": sex_top}),
    ]
 def _head_rows() -> list:
    """Return three synthetic raw rows (one dict per row) standing in for df.head."""
    names = ("PassengerId", "Survived", "Pclass", "Name", "Sex")
    records = (
        (1, 0, 3, "Braund Owen", "male"),
        (2, 1, 1, "Cumings Florence", "female"),
        (3, 1, 3, "Heikkinen Laina", "female"),
    )
    # Pair every record with the column names, keeping the column order.
    return [dict(zip(names, record)) for record in records]
 def _profile(with_head: bool = True) -> dict:
    """Build the synthetic table profile used as input by the OVERVIEW tests.

    Args:
        with_head: when true the profile also carries ``head_rows``; when false
            the key is left out entirely.

    Returns:
        A fresh dict with the table metadata, the column profiles and,
        optionally, the raw head rows.
    """
    base = dict(
        table="titanic",
        source="/data/titanic.csv",
        profiled_at="2026-06-30T10:00:00+00:00",
        n_rows=891,
        n_cols=5,
        quality_score=88.0,
        columns=_columns(),
    )
    extra = {"head_rows": _head_rows()} if with_head else {}
    return {**base, **extra}
 def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, whitespace-collapsed.

    Pages whose extraction yields a falsy value contribute an empty string;
    every run of whitespace in the joined text becomes a single space.
    """
    pages = PdfReader(path).pages
    chunks = [page.extract_text() or "" for page in pages]
    return re.sub(r"\s+", " ", "".join(chunks))
 def _pptx_text(path: str) -> str:
    """Return the visible text of the PPTX at ``path``, whitespace-collapsed.

    For each shape on each slide, the text-frame text is collected first (when
    the shape has one) and then every table cell in row-major order (when the
    shape has a table). Fragments are joined with spaces and every run of
    whitespace becomes a single space.
    """
    fragments = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                fragments.append(shape.text_frame.text)
            if not shape.has_table:
                continue
            table = shape.table
            n_rows, n_cols = len(table.rows), len(table.columns)
            fragments.extend(
                table.cell(r, c).text
                for r in range(n_rows) for c in range(n_cols)
            )
    return re.sub(r"\s+", " ", " ".join(fragments))
 def _flatten(blocks):
    """Return ``blocks`` as a flat list, expanding group blocks recursively.

    An item is expanded only when it exposes a non-None ``blocks`` attribute
    and its ``kind`` attribute equals ``"group"``; any other item is kept as
    is, in its original position.
    """
    flat = []
    for item in blocks:
        children = getattr(item, "blocks", None)
        is_group = (children is not None
                    and getattr(item, "kind", None) == "group")
        if not is_group:
            flat.append(item)
            continue
        flat.extend(_flatten(children))
    return flat
 def test_golden_build_overview_muestra_head_desde_profile():
    """Golden: ``profile['head_rows']`` is rendered as the df.head DataTable."""
    chapter = build_overview(_profile(), {})
    assert chapter is not None
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION
    flat_blocks = _flatten(chapter.blocks)
    # df.head must be the first DataTable: its header is the column names and
    # its rows are the real first rows, not a placeholder note.
    data_tables = [blk for blk in flat_blocks if isinstance(blk, DataTable)]
    assert data_tables, "overview must emit at least the df.head DataTable"
    head_table = data_tables[0]
    expected_header = ["PassengerId", "Survived", "Pclass", "Name", "Sex"]
    assert head_table.header == expected_header
    assert len(head_table.rows) == 3
    cells = [str(cell) for row in head_table.rows for cell in row]
    assert "Braund Owen" in cells and "Cumings Florence" in cells
    # The note states how many rows are shown and the dataset total.
    assert head_table.note is not None
    assert "primeras 3 filas" in head_table.note and "891" in head_table.note
    # With head_rows present, no "df.head no disponible" placeholder remains.
    assert all(not (isinstance(blk, Note) and "no disponible" in blk.text)
               for blk in flat_blocks)
 def test_golden_head_desde_ctx_tambien_funciona():
    """Golden: head rows supplied only through ``ctx['head_rows']`` still render.

    The profile is built without ``head_rows`` so the only source of the raw
    rows is the render context passed as second argument.
    """
    prof = _profile(with_head=False)
    ch = build_overview(prof, {"head_rows": _head_rows()})
    assert ch is not None
    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
    # Guard before indexing: a missing table must fail as a readable assertion
    # rather than as an IndexError on ``tables[0]``.
    assert tables, "overview must emit the df.head DataTable from ctx['head_rows']"
    flat = [str(c) for row in tables[0].rows for c in row]
    assert "Braund Owen" in flat
 def test_golden_render_pdf_muestra_head():
    """Golden: the rendered PDF contains the head values, note and heading."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "eda.pdf")
        result = render_automatic_eda_pdf(_profile(), pdf_path,
                                          {"title": "EDA"})
        assert result["path"] == pdf_path and os.path.exists(pdf_path)
        chapter_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in chapter_ids
        text = _pdf_text(pdf_path)
        # Head cell values, then the head note, then the chapter heading.
        for expected in ("Braund", "male", "primeras", "df.head"):
            assert expected in text
        # The "not available" placeholder must NOT be shown.
        assert "no disponible" not in text
 def test_golden_render_pptx_muestra_head():
    """Golden: the rendered PPTX contains values taken from the head rows."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        pptx_path = os.path.join(tmp_dir, "eda.pptx")
        result = render_automatic_eda_pptx(_profile(), pptx_path,
                                           {"title": "EDA"})
        assert result["path"] == pptx_path and os.path.exists(pptx_path)
        chapter_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in chapter_ids
        text = _pptx_text(pptx_path)
        for expected in ("Braund", "Cumings"):
            assert expected in text
 def test_edge_sin_head_rows_degrada_a_nota_honesta():
    """Edge: with no head data anywhere the chapter shows the placeholder note.

    The chapter must still build (columns exist), must not raise and must not
    invent rows.
    """
    chapter = build_overview(_profile(with_head=False), {})
    assert chapter is not None
    flat_blocks = _flatten(chapter.blocks)
    placeholder_found = any(
        isinstance(blk, Note) and "no disponible" in blk.text
        for blk in flat_blocks
    )
    assert placeholder_found
    # Whatever DataTables remain, none of them may carry head-row values.
    data_tables = [blk for blk in flat_blocks if isinstance(blk, DataTable)]
    assert not any("Braund" in str(cell)
                   for table in data_tables
                   for row in table.rows
                   for cell in row)
 def test_edge_none_y_vacio_no_rompen():
    """Edge: empty inputs yield ``None``; head rows alone still yield a chapter."""
    # Nothing to render at all -> None, no raise.
    empty_inputs = ((None, None), ({}, {}), ({"columns": []}, {}))
    for profile, ctx in empty_inputs:
        assert build_overview(profile, ctx) is None
    # Only head_rows (no columns) still produces a chapter with the head table.
    chapter = build_overview({"columns": []}, {"head_rows": _head_rows()})
    assert chapter is not None
    data_tables = [blk for blk in _flatten(chapter.blocks)
                   if isinstance(blk, DataTable)]
    assert data_tables and len(data_tables[0].rows) == 3
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
 menos ``{**base_ctx, "db_path": db_path, "table": table}``.
 Claves de DATOS que produce (las consumen los capitulos):
  - ``head_rows``      : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
                         tabla (``SELECT * LIMIT head_n``), una entrada por fila.
                         La lee el capitulo OVERVIEW para mostrar df.head real en
                         lugar del placeholder "df.head no disponible".
  - ``raw_numeric``    : {col: [float|None, ...]} muestra cruda de las columnas
                         numericas, ALINEADA POR FILA (una entrada por fila aunque
                         sea None). La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
        return None
-def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
+def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
    """Construye el ctx de datos crudos para los renderers de AutomaticEDA.
    Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        base_ctx: dict opcional con claves de presentacion ya preparadas
            (dataset_name, source_origin, ...). Se parte de una copia y NO se
            pisan sus claves; solo se añaden las de datos. Default None -> {}.
        head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
            (df.head del capitulo OVERVIEW). Default 10. <=0 omite la clave.
    Returns:
        El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
        cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
-        Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
+        Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
-        geo_points (omitidas si no aplican o fallan), y siempre db_path + table
+        timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
-        para backends validos.
+        db_path + table para backends validos.
    """
    # Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
    # presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        ctx["db_path"] = db_path
        ctx["table"] = table
        # 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
        # para que el capitulo OVERVIEW muestre df.head real en vez del
        # placeholder. Una sola query, dict-no-throw: si falla, se omite la
        # clave (el capitulo degrada a su nota honesta). No se pisa una clave
        # head_rows que ya viniera en base_ctx (presentacion).
        if head_n and int(head_n) > 0 and "head_rows" not in ctx:
            try:
                hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
                if isinstance(hq, dict) and hq.get("status") == "ok":
                    hrows = [
                        dict(r) for r in (hq.get("rows") or [])
                        if isinstance(r, dict)
                    ]
                    if hrows:
                        ctx["head_rows"] = hrows
            except Exception:  # noqa: BLE001 - dict-no-throw: omitir la clave
                pass
        # 2) Columnas del perfil agregado (lectura defensiva).
        cols = profile.get("columns") if isinstance(profile, dict) else None
        cols = cols or []
@@ -536,6 +536,21 @@ def profile_table(
                type_breakdown[it] += 1
        prof["type_breakdown"] = type_breakdown
        # 8.1) Primeras filas crudas (df.head) para el capitulo OVERVIEW del motor
        # AutomaticEDA: una muestra SELECT col1,col2,... LIMIT 10 alineada por fila.
        # Se reusa _sample_rows (mismo lector read-only). Estilo dict-no-throw: si
        # falla, head_rows queda None y el capitulo degrada a su nota honesta. El
        # capitulo lo recoge via profile["head_rows"]; build_eda_render_ctx ademas
        # lo replica en ctx["head_rows"] cuando se construye el contexto de render.
        try:
            head_names = [c.get("name") for c in cols if c.get("name")]
            head_rows = _sample_rows(_q, table, head_names, 10)
            prof["head_rows"] = [
                dict(r) for r in head_rows if isinstance(r, dict)
            ] or None
        except Exception:  # noqa: BLE001
            prof["head_rows"] = None
        # 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
        # alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
        # Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.