merge: 4b head_rows — overview muestra df.head (build_eda_render_ctx pobla head_rows, verificado met)
This commit is contained in:
@@ -20,7 +20,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from .. import model
|
from .. import model
|
||||||
|
|
||||||
CHAPTER_VERSION = "1.0.0"
|
CHAPTER_VERSION = "1.1.0"
|
||||||
CHAPTER_ID = "overview"
|
CHAPTER_ID = "overview"
|
||||||
CHAPTER_TITLE = "Overview"
|
CHAPTER_TITLE = "Overview"
|
||||||
|
|
||||||
@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
|
|||||||
if not cols:
|
if not cols:
|
||||||
cols = list(head[0].keys())
|
cols = list(head[0].keys())
|
||||||
rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
|
rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
|
||||||
return model.DataTable(header=cols, rows=rows,
|
# Honest note: how many rows are shown and, when known, out of how many
|
||||||
note=f"primeras {len(rows)} filas")
|
# rows the dataset has (so "primeras 10 filas de 891" gives context).
|
||||||
|
note = f"primeras {len(rows)} filas"
|
||||||
|
n_rows = profile.get("n_rows")
|
||||||
|
if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
|
||||||
|
and n_rows > len(rows):
|
||||||
|
note += f" de {n_rows:,}".replace(",", ".")
|
||||||
|
return model.DataTable(header=cols, rows=rows, note=note)
|
||||||
return model.Note(
|
return model.Note(
|
||||||
"df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
|
"df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
|
||||||
"de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
|
"de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
|
||||||
|
|||||||
@@ -0,0 +1,187 @@
|
|||||||
|
"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
|
||||||
|
|
||||||
|
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||||
|
and deterministic. Verifies that ``build_overview`` renders the raw first rows
|
||||||
|
(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
|
||||||
|
via ``profile['head_rows']`` (populated by ``profile_table``) and via
|
||||||
|
``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
|
||||||
|
also renders the column dictionary and the numeric describe, that the full
|
||||||
|
document renders to PDF and PPTX showing the head values, and that a profile with
|
||||||
|
NO head data degrades to an honest note instead of raising or inventing rows.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
from datascience.automatic_eda.model import DataTable, Note
|
||||||
|
from datascience.automatic_eda.chapters.overview import (
|
||||||
|
CHAPTER_ID, CHAPTER_VERSION, build_overview,
|
||||||
|
)
|
||||||
|
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||||
|
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||||
|
|
||||||
|
|
||||||
|
def _columns() -> list:
|
||||||
|
return [
|
||||||
|
{"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
|
"null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
|
||||||
|
"max": 3.0, "std": 1.0}},
|
||||||
|
{"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
|
"null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
|
||||||
|
"max": 1.0, "std": 0.58}},
|
||||||
|
{"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
|
"null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
|
||||||
|
"max": 3.0, "std": 1.15}},
|
||||||
|
{"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
|
||||||
|
"null_count": 0, "distinct_count": 3},
|
||||||
|
{"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
|
||||||
|
"null_count": 0, "distinct_count": 2,
|
||||||
|
"categorical": {"top": [{"value": "male", "count": 2},
|
||||||
|
{"value": "female", "count": 1}]}},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _head_rows() -> list:
|
||||||
|
return [
|
||||||
|
{"PassengerId": 1, "Survived": 0, "Pclass": 3,
|
||||||
|
"Name": "Braund Owen", "Sex": "male"},
|
||||||
|
{"PassengerId": 2, "Survived": 1, "Pclass": 1,
|
||||||
|
"Name": "Cumings Florence", "Sex": "female"},
|
||||||
|
{"PassengerId": 3, "Survived": 1, "Pclass": 3,
|
||||||
|
"Name": "Heikkinen Laina", "Sex": "female"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _profile(with_head: bool = True) -> dict:
    """Build a synthetic table profile for the overview tests.

    Args:
        with_head: when true, the profile also carries ``head_rows`` (the
            raw first rows); when false the key is left out entirely so the
            "no head data" path can be exercised.

    Returns:
        A new dict with table metadata, the column profiles from
        ``_columns()`` and, optionally, the rows from ``_head_rows()``.
    """
    prof = {
        "table": "titanic",
        "source": "/data/titanic.csv",
        "profiled_at": "2026-06-30T10:00:00+00:00",
        "n_rows": 891,
        "n_cols": 5,
        "quality_score": 88.0,
        "columns": _columns(),
    }
    if with_head:
        prof["head_rows"] = _head_rows()
    return prof
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path* as one string.

    Pages whose ``extract_text()`` result is falsy contribute an empty
    string. Every run of whitespace is collapsed to a single space so that
    substring assertions do not depend on how the PDF wraps its lines.
    """
    reader = PdfReader(path)
    raw = "".join(page.extract_text() or "" for page in reader.pages)
    return re.sub(r"\s+", " ", raw)
|
||||||
|
|
||||||
|
|
||||||
|
def _pptx_text(path: str) -> str:
    """Return the text found in the presentation at *path* as one string.

    For every shape on every slide this collects the text-frame text (when
    the shape has a text frame) and the text of each table cell, row by row
    (when the shape has a table). The pieces are joined with spaces and
    every run of whitespace is collapsed to a single space.
    """
    prs = Presentation(path)
    parts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                parts.append(shape.text_frame.text)
            if shape.has_table:
                # Iterate rows/cells directly instead of indexing with
                # range(len(...)); the visiting order is unchanged.
                for row in shape.table.rows:
                    parts.extend(cell.text for cell in row.cells)
    return re.sub(r"\s+", " ", " ".join(parts))
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten(blocks):
|
||||||
|
"""Recursively flatten Group blocks into a flat list (none here today)."""
|
||||||
|
out = []
|
||||||
|
for b in blocks:
|
||||||
|
inner = getattr(b, "blocks", None)
|
||||||
|
if inner is not None and getattr(b, "kind", None) == "group":
|
||||||
|
out.extend(_flatten(inner))
|
||||||
|
else:
|
||||||
|
out.append(b)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_build_overview_muestra_head_desde_profile():
    """Golden path: ``profile['head_rows']`` is rendered as the head table."""
    ch = build_overview(_profile(), {})
    assert ch is not None
    assert ch.id == CHAPTER_ID
    assert ch.version == CHAPTER_VERSION
    blocks = _flatten(ch.blocks)

    # The first DataTable is df.head: its header is the column names and the
    # real first rows are present (not a placeholder note).
    tables = [b for b in blocks if isinstance(b, DataTable)]
    assert tables, "overview must emit at least the df.head DataTable"
    head_tbl = tables[0]
    assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
                               "Name", "Sex"]
    assert len(head_tbl.rows) == 3
    flat = [str(c) for row in head_tbl.rows for c in row]
    assert "Braund Owen" in flat and "Cumings Florence" in flat

    # Honest note carries how many rows are shown out of the dataset total.
    assert head_tbl.note is not None
    assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note

    # No "df.head no disponible" placeholder when head_rows is present.
    assert not any(isinstance(b, Note) and "no disponible" in b.text
                   for b in blocks)
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_head_desde_ctx_tambien_funciona():
    """Golden path: head rows supplied only through ``ctx['head_rows']``."""
    # head_rows absent in profile but present in ctx (build_eda_render_ctx path).
    prof = _profile(with_head=False)
    ch = build_overview(prof, {"head_rows": _head_rows()})
    assert ch is not None
    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
    # Guard before indexing so a missing table fails with a clear assertion
    # message instead of an IndexError.
    assert tables, "overview must emit the df.head DataTable from ctx"
    flat = [str(c) for row in tables[0].rows for c in row]
    assert "Braund Owen" in flat
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_render_pdf_muestra_head():
    """Golden path: the rendered PDF contains the head values and its note."""
    with tempfile.TemporaryDirectory() as d:
        out = os.path.join(d, "eda.pdf")
        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
        # Read the file while the temporary directory still exists.
        txt = _pdf_text(out)
        assert "Braund" in txt and "male" in txt
        assert "primeras" in txt  # head note rendered.
        assert "df.head" in txt  # chapter heading rendered.
        assert "no disponible" not in txt  # placeholder NOT shown.
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_render_pptx_muestra_head():
    """Golden path: the rendered PPTX contains the head values."""
    with tempfile.TemporaryDirectory() as d:
        out = os.path.join(d, "eda.pptx")
        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
        # Read the file while the temporary directory still exists.
        txt = _pptx_text(out)
        assert "Braund" in txt and "Cumings" in txt
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_sin_head_rows_degrada_a_nota_honesta():
    """Edge case: without head data the chapter shows the placeholder note."""
    # No head data anywhere: chapter still builds (columns exist), shows the
    # honest placeholder note, and never invents rows nor raises.
    prof = _profile(with_head=False)
    ch = build_overview(prof, {})
    assert ch is not None
    blocks = _flatten(ch.blocks)
    assert any(isinstance(b, Note) and "no disponible" in b.text
               for b in blocks)

    # The first DataTable now is the column dictionary, not df.head rows.
    tables = [b for b in blocks if isinstance(b, DataTable)]
    assert all("Braund" not in str(c)
               for tbl in tables for row in tbl.rows for c in row)
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_none_y_vacio_no_rompen():
    """Edge case: empty or missing inputs yield ``None`` instead of raising."""
    # Nothing to render at all -> None, no raise.
    assert build_overview(None, None) is None
    assert build_overview({}, {}) is None
    assert build_overview({"columns": []}, {}) is None

    # Only head_rows (no columns) still yields a chapter with the head table.
    ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
    assert ch is not None
    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
    assert tables and len(tables[0].rows) == 3
|
||||||
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
|
|||||||
menos ``{**base_ctx, "db_path": db_path, "table": table}``.
|
menos ``{**base_ctx, "db_path": db_path, "table": table}``.
|
||||||
|
|
||||||
Claves de DATOS que produce (las consumen los capitulos):
|
Claves de DATOS que produce (las consumen los capitulos):
|
||||||
|
- ``head_rows`` : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
|
||||||
|
tabla (``SELECT * LIMIT head_n``), una entrada por fila.
|
||||||
|
La lee el capitulo OVERVIEW para mostrar df.head real en
|
||||||
|
lugar del placeholder "df.head no disponible".
|
||||||
- ``raw_numeric`` : {col: [float|None, ...]} muestra cruda de las columnas
|
- ``raw_numeric`` : {col: [float|None, ...]} muestra cruda de las columnas
|
||||||
numericas, ALINEADA POR FILA (una entrada por fila aunque
|
numericas, ALINEADA POR FILA (una entrada por fila aunque
|
||||||
sea None). La leen modelos (clustering 2D en vivo) y
|
sea None). La leen modelos (clustering 2D en vivo) y
|
||||||
@@ -56,7 +60,7 @@ def _to_float(value):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
|
def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
|
||||||
"""Construye el ctx de datos crudos para los renderers de AutomaticEDA.
|
"""Construye el ctx de datos crudos para los renderers de AutomaticEDA.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
|
|||||||
base_ctx: dict opcional con claves de presentacion ya preparadas
|
base_ctx: dict opcional con claves de presentacion ya preparadas
|
||||||
(dataset_name, source_origin, ...). Se parte de una copia y NO se
|
(dataset_name, source_origin, ...). Se parte de una copia y NO se
|
||||||
pisan sus claves; solo se añaden las de datos. Default None -> {}.
|
pisan sus claves; solo se añaden las de datos. Default None -> {}.
|
||||||
|
head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
|
||||||
|
(df.head del capitulo OVERVIEW). Default 10. <=0 omite la clave.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
|
El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
|
||||||
cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
|
cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
|
||||||
Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
|
Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
|
||||||
geo_points (omitidas si no aplican o fallan), y siempre db_path + table
|
timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
|
||||||
para backends validos.
|
db_path + table para backends validos.
|
||||||
"""
|
"""
|
||||||
# Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
|
# Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
|
||||||
# presentacion que ya traiga se conservan; las de datos se añaden encima.
|
# presentacion que ya traiga se conservan; las de datos se añaden encima.
|
||||||
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
|
|||||||
ctx["db_path"] = db_path
|
ctx["db_path"] = db_path
|
||||||
ctx["table"] = table
|
ctx["table"] = table
|
||||||
|
|
||||||
|
# 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
|
||||||
|
# para que el capitulo OVERVIEW muestre df.head real en vez del
|
||||||
|
# placeholder. Una sola query, dict-no-throw: si falla, se omite la
|
||||||
|
# clave (el capitulo degrada a su nota honesta). No se pisa una clave
|
||||||
|
# head_rows que ya viniera en base_ctx (presentacion).
|
||||||
|
if head_n and int(head_n) > 0 and "head_rows" not in ctx:
|
||||||
|
try:
|
||||||
|
hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
|
||||||
|
if isinstance(hq, dict) and hq.get("status") == "ok":
|
||||||
|
hrows = [
|
||||||
|
dict(r) for r in (hq.get("rows") or [])
|
||||||
|
if isinstance(r, dict)
|
||||||
|
]
|
||||||
|
if hrows:
|
||||||
|
ctx["head_rows"] = hrows
|
||||||
|
except Exception: # noqa: BLE001 - dict-no-throw: omitir la clave
|
||||||
|
pass
|
||||||
|
|
||||||
# 2) Columnas del perfil agregado (lectura defensiva).
|
# 2) Columnas del perfil agregado (lectura defensiva).
|
||||||
cols = profile.get("columns") if isinstance(profile, dict) else None
|
cols = profile.get("columns") if isinstance(profile, dict) else None
|
||||||
cols = cols or []
|
cols = cols or []
|
||||||
|
|||||||
@@ -536,6 +536,21 @@ def profile_table(
|
|||||||
type_breakdown[it] += 1
|
type_breakdown[it] += 1
|
||||||
prof["type_breakdown"] = type_breakdown
|
prof["type_breakdown"] = type_breakdown
|
||||||
|
|
||||||
|
# 8.1) Primeras filas crudas (df.head) para el capitulo OVERVIEW del motor
|
||||||
|
# AutomaticEDA: una muestra SELECT col1,col2,... LIMIT 10 alineada por fila.
|
||||||
|
# Se reusa _sample_rows (mismo lector read-only). Estilo dict-no-throw: si
|
||||||
|
# falla, head_rows queda None y el capitulo degrada a su nota honesta. El
|
||||||
|
# capitulo lo recoge via profile["head_rows"]; build_eda_render_ctx ademas
|
||||||
|
# lo replica en ctx["head_rows"] cuando se construye el contexto de render.
|
||||||
|
try:
|
||||||
|
head_names = [c.get("name") for c in cols if c.get("name")]
|
||||||
|
head_rows = _sample_rows(_q, table, head_names, 10)
|
||||||
|
prof["head_rows"] = [
|
||||||
|
dict(r) for r in head_rows if isinstance(r, dict)
|
||||||
|
] or None
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
prof["head_rows"] = None
|
||||||
|
|
||||||
# 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
|
# 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
|
||||||
# alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
|
# alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
|
||||||
# Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.
|
# Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.
|
||||||
|
|||||||
Reference in New Issue
Block a user