Compare commits

..

2 Commits

Author SHA1 Message Date
egutierrez b1d205203a feat(eda): poblar head_rows real en el capitulo OVERVIEW (df.head)
El capitulo OVERVIEW del motor AutomaticEDA mostraba "df.head no disponible"
porque ninguna fase de calculo poblaba las primeras filas crudas de la tabla.

- build_eda_render_ctx: nuevo bloque que muestrea SELECT * LIMIT head_n
  (param nuevo head_n=10) y lo expone en ctx["head_rows"] como lista de
  dicts fila. Estilo dict-no-throw: si la query falla, se omite la clave.
- profile_table: puebla prof["head_rows"] reusando _sample_rows (SELECT de
  las columnas LIMIT 10) tras recalcular el type_breakdown. Asi el report
  JSON sidecar tambien lo lleva y el capitulo lo recoge via profile aunque
  no se construya el ctx.
- overview.py: la nota del DataTable de df.head ahora indica el total de
  filas del dataset cuando se conoce ("primeras 10 filas de 891"). Bump
  CHAPTER_VERSION 1.0.0 -> 1.1.0.
- overview_test.py (nuevo): golden (head via profile y via ctx, render PDF
  + PPTX muestran las filas reales, placeholder ausente), edge (sin
  head_rows degrada a nota honesta sin romper, None/vacio devuelven None).

Verificado end-to-end con titanic: render_automatic_eda emite PDF + PPTX con
df.head visible (Braund/Cumings/Heikkinen + columnas) y sin el placeholder.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 17:56:24 +02:00
egutierrez c6d9bc26da merge: Fase 4a AutomaticEDA motor+glosario (verificado met)
- fix negrita-pisa PDF, zebra striping (PDF+PPT), keep-together (Group: heading+figura+texto misma pagina/slide), imagenes con caption en PPT
- portada construida-al-final mostrada en posicion 1 (con resumen agregado del cuerpo)
- capitulo glosario al final + terminos clicables REALES: PDF link annotation (add_pdf_internal_links, PyMuPDF) + PPT hyperlink nativo (pptx_link_run_to_slide); entropia enganchado en cat_distr como ejemplo E2E
- contrato docs/automatic_eda_contract.md §11 (glosario + keep-together + zebra)
- pymupdf>=1.28.0
2026-06-30 17:45:30 +02:00
4 changed files with 239 additions and 7 deletions
@@ -20,7 +20,7 @@ from __future__ import annotations
from .. import model
CHAPTER_VERSION = "1.0.0"
CHAPTER_VERSION = "1.1.0"
CHAPTER_ID = "overview"
CHAPTER_TITLE = "Overview"
@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
if not cols:
cols = list(head[0].keys())
rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
return model.DataTable(header=cols, rows=rows,
note=f"primeras {len(rows)} filas")
# Honest note: how many rows are shown and, when known, out of how many
# rows the dataset has (so "primeras 10 filas de 891" gives context).
note = f"primeras {len(rows)} filas"
n_rows = profile.get("n_rows")
if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
and n_rows > len(rows):
note += f" de {n_rows:,}".replace(",", ".")
return model.DataTable(header=cols, rows=rows, note=note)
return model.Note(
"df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
"de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
@@ -0,0 +1,187 @@
"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
and deterministic. Verifies that ``build_overview`` renders the raw first rows
(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
via ``profile['head_rows']`` (populated by ``profile_table``) and via
``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
also renders the column dictionary and the numeric describe, that the full
document renders to PDF and PPTX showing the head values, and that a profile with
NO head data degrades to an honest note instead of raising or inventing rows.
"""
import os
import re
import tempfile
from pypdf import PdfReader
from pptx import Presentation
from datascience.automatic_eda.model import DataTable, Note
from datascience.automatic_eda.chapters.overview import (
CHAPTER_ID, CHAPTER_VERSION, build_overview,
)
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
def _columns() -> list:
return [
{"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
"null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
"max": 3.0, "std": 1.0}},
{"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
"null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
"max": 1.0, "std": 0.58}},
{"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
"null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
"max": 3.0, "std": 1.15}},
{"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
"null_count": 0, "distinct_count": 3},
{"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
"null_count": 0, "distinct_count": 2,
"categorical": {"top": [{"value": "male", "count": 2},
{"value": "female", "count": 1}]}},
]
def _head_rows() -> list:
return [
{"PassengerId": 1, "Survived": 0, "Pclass": 3,
"Name": "Braund Owen", "Sex": "male"},
{"PassengerId": 2, "Survived": 1, "Pclass": 1,
"Name": "Cumings Florence", "Sex": "female"},
{"PassengerId": 3, "Survived": 1, "Pclass": 3,
"Name": "Heikkinen Laina", "Sex": "female"},
]
def _profile(with_head: bool = True) -> dict:
prof = {
"table": "titanic",
"source": "/data/titanic.csv",
"profiled_at": "2026-06-30T10:00:00+00:00",
"n_rows": 891,
"n_cols": 5,
"quality_score": 88.0,
"columns": _columns(),
}
if with_head:
prof["head_rows"] = _head_rows()
return prof
def _pdf_text(path: str) -> str:
txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
return re.sub(r"\s+", " ", txt)
def _pptx_text(path: str) -> str:
prs = Presentation(path)
parts = []
for sl in prs.slides:
for sh in sl.shapes:
if sh.has_text_frame:
parts.append(sh.text_frame.text)
if sh.has_table:
tb = sh.table
for r in range(len(tb.rows)):
for c in range(len(tb.columns)):
parts.append(tb.cell(r, c).text)
return re.sub(r"\s+", " ", " ".join(parts))
def _flatten(blocks):
"""Recursively flatten Group blocks into a flat list (none here today)."""
out = []
for b in blocks:
inner = getattr(b, "blocks", None)
if inner is not None and getattr(b, "kind", None) == "group":
out.extend(_flatten(inner))
else:
out.append(b)
return out
def test_golden_build_overview_muestra_head_desde_profile():
ch = build_overview(_profile(), {})
assert ch is not None
assert ch.id == CHAPTER_ID
assert ch.version == CHAPTER_VERSION
blocks = _flatten(ch.blocks)
# The first DataTable is df.head: its header is the column names and the
# real first rows are present (not a placeholder note).
tables = [b for b in blocks if isinstance(b, DataTable)]
assert tables, "overview must emit at least the df.head DataTable"
head_tbl = tables[0]
assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
"Name", "Sex"]
assert len(head_tbl.rows) == 3
flat = [str(c) for row in head_tbl.rows for c in row]
assert "Braund Owen" in flat and "Cumings Florence" in flat
# Honest note carries how many rows shown out of the dataset total.
assert head_tbl.note is not None
assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note
# No "df.head no disponible" placeholder when head_rows is present.
assert not any(isinstance(b, Note) and "no disponible" in b.text
for b in blocks)
def test_golden_head_desde_ctx_tambien_funciona():
# head_rows absent in profile but present in ctx (build_eda_render_ctx path).
prof = _profile(with_head=False)
ch = build_overview(prof, {"head_rows": _head_rows()})
assert ch is not None
tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
flat = [str(c) for row in tables[0].rows for c in row]
assert "Braund Owen" in flat
def test_golden_render_pdf_muestra_head():
with tempfile.TemporaryDirectory() as d:
out = os.path.join(d, "eda.pdf")
res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
assert res["path"] == out and os.path.exists(out)
assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
txt = _pdf_text(out)
assert "Braund" in txt and "male" in txt
assert "primeras" in txt # head note rendered.
assert "df.head" in txt # chapter heading rendered.
assert "no disponible" not in txt # placeholder NOT shown.
def test_golden_render_pptx_muestra_head():
with tempfile.TemporaryDirectory() as d:
out = os.path.join(d, "eda.pptx")
res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
assert res["path"] == out and os.path.exists(out)
assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
txt = _pptx_text(out)
assert "Braund" in txt and "Cumings" in txt
def test_edge_sin_head_rows_degrada_a_nota_honesta():
# No head data anywhere: chapter still builds (columns exist), shows the
# honest placeholder note, and never invents rows nor raises.
prof = _profile(with_head=False)
ch = build_overview(prof, {})
assert ch is not None
blocks = _flatten(ch.blocks)
assert any(isinstance(b, Note) and "no disponible" in b.text
for b in blocks)
# The first DataTable now is the column dictionary, not df.head rows.
tables = [b for b in blocks if isinstance(b, DataTable)]
assert all("Braund" not in str(c)
for tbl in tables for row in tbl.rows for c in row)
def test_edge_none_y_vacio_no_rompen():
# Nothing to render at all -> None, no raise.
assert build_overview(None, None) is None
assert build_overview({}, {}) is None
assert build_overview({"columns": []}, {}) is None
# Only head_rows (no columns) still yields a chapter with the head table.
ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
assert ch is not None
tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
assert tables and len(tables[0].rows) == 3
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
menos ``{**base_ctx, "db_path": db_path, "table": table}``.
Claves de DATOS que produce (las consumen los capitulos):
- ``head_rows`` : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
tabla (``SELECT * LIMIT head_n``), una entrada por fila.
La lee el capitulo OVERVIEW para mostrar df.head real en
lugar del placeholder "df.head no disponible".
- ``raw_numeric`` : {col: [float|None, ...]} muestra cruda de las columnas
numericas, ALINEADA POR FILA (una entrada por fila aunque
sea None). La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
return None
def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
"""Construye el ctx de datos crudos para los renderers de AutomaticEDA.
Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
base_ctx: dict opcional con claves de presentacion ya preparadas
(dataset_name, source_origin, ...). Se parte de una copia y NO se
pisan sus claves; solo se añaden las de datos. Default None -> {}.
head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
(df.head del capitulo OVERVIEW). Default 10. <=0 omite la clave.
Returns:
El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
geo_points (omitidas si no aplican o fallan), y siempre db_path + table
para backends validos.
Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
db_path + table para backends validos.
"""
# Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
# presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
ctx["db_path"] = db_path
ctx["table"] = table
# 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
# para que el capitulo OVERVIEW muestre df.head real en vez del
# placeholder. Una sola query, dict-no-throw: si falla, se omite la
# clave (el capitulo degrada a su nota honesta). No se pisa una clave
# head_rows que ya viniera en base_ctx (presentacion).
if head_n and int(head_n) > 0 and "head_rows" not in ctx:
try:
hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
if isinstance(hq, dict) and hq.get("status") == "ok":
hrows = [
dict(r) for r in (hq.get("rows") or [])
if isinstance(r, dict)
]
if hrows:
ctx["head_rows"] = hrows
except Exception: # noqa: BLE001 - dict-no-throw: omitir la clave
pass
# 2) Columnas del perfil agregado (lectura defensiva).
cols = profile.get("columns") if isinstance(profile, dict) else None
cols = cols or []
@@ -536,6 +536,21 @@ def profile_table(
type_breakdown[it] += 1
prof["type_breakdown"] = type_breakdown
# 8.1) Primeras filas crudas (df.head) para el capitulo OVERVIEW del motor
# AutomaticEDA: una muestra SELECT col1,col2,... LIMIT 10 alineada por fila.
# Se reusa _sample_rows (mismo lector read-only). Estilo dict-no-throw: si
# falla, head_rows queda None y el capitulo degrada a su nota honesta. El
# capitulo lo recoge via profile["head_rows"]; build_eda_render_ctx ademas
# lo replica en ctx["head_rows"] cuando se construye el contexto de render.
try:
head_names = [c.get("name") for c in cols if c.get("name")]
head_rows = _sample_rows(_q, table, head_names, 10)
prof["head_rows"] = [
dict(r) for r in head_rows if isinstance(r, dict)
] or None
except Exception: # noqa: BLE001
prof["head_rows"] = None
# 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
# alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
# Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.