c4cff5ed5b
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
330 lines
11 KiB
Python
330 lines
11 KiB
Python
"""Tests para render_eda_pdf.

Importa el módulo directo (sys.path), igual que el resto de tests del grupo eda,
para no depender del registro en __init__.py (lo añade el orquestador al integrar).
"""
|
|
|
|
import os
import sys

# Put this file's directory on sys.path so the module under test can be
# imported directly, without relying on package registration.
sys.path.insert(0, os.path.dirname(__file__))

from render_eda_pdf import (
    render_eda_pdf,
    render_eda_pdf_relational,
    _models_pages,
    _series_pages,
    _caveats_pages,
)
|
|
|
|
|
|
class _StubPdf:
|
|
"""Captura pdf.savefig sin escribir nada — para testear builders aislados."""
|
|
|
|
def __init__(self):
|
|
self.figs = 0
|
|
|
|
def savefig(self, fig):
|
|
self.figs += 1
|
|
|
|
|
|
def _synthetic_profile() -> dict:
|
|
"""TableProfile sintético mínimo: 2 numéricas + 1 categórica + overview."""
|
|
return {
|
|
"table": "ventas",
|
|
"source": "data/ventas.csv",
|
|
"profiled_at": "2026-06-28 10:00 UTC",
|
|
"n_rows": 1000,
|
|
"n_cols": 3,
|
|
"null_cell_pct": 0.02,
|
|
"duplicate_rows": 5,
|
|
"duplicate_pct": 0.005,
|
|
"quality_score": 92.5,
|
|
"type_breakdown": {"numeric": 2, "categorical": 1},
|
|
"key_candidates": ["id"],
|
|
"columns": [
|
|
{
|
|
"name": "precio",
|
|
"inferred_type": "numeric",
|
|
"semantic_type": "currency",
|
|
"null_pct": 0.0,
|
|
"distinct_count": 850,
|
|
"unique_pct": 0.85,
|
|
"quality_score": 95.0,
|
|
"flags": [],
|
|
"numeric": {
|
|
"min": 1.0, "max": 100.0, "median": 40.0, "mean": 42.5,
|
|
"std": 12.3, "p25": 30.0, "p75": 55.0, "outlier_pct": 1.2,
|
|
"distribution_type": "right-skewed",
|
|
"histogram": [
|
|
{"lo": 0.0, "hi": 25.0, "count": 100},
|
|
{"lo": 25.0, "hi": 50.0, "count": 500},
|
|
{"lo": 50.0, "hi": 75.0, "count": 300},
|
|
{"lo": 75.0, "hi": 100.0, "count": 50},
|
|
],
|
|
},
|
|
},
|
|
{
|
|
"name": "unidades",
|
|
"inferred_type": "numeric",
|
|
"semantic_type": "integer",
|
|
"null_pct": 0.01,
|
|
"distinct_count": 40,
|
|
"unique_pct": 0.04,
|
|
"quality_score": 88.0,
|
|
"flags": ["has_nulls"],
|
|
"numeric": {
|
|
"min": 1.0, "max": 12.0, "median": 4.0, "mean": 4.8,
|
|
"std": 2.1, "outlier_pct": 0.0,
|
|
"distribution_type": "normal",
|
|
"histogram": [
|
|
{"lo": 1.0, "hi": 4.0, "count": 400},
|
|
{"lo": 4.0, "hi": 8.0, "count": 450},
|
|
{"lo": 8.0, "hi": 12.0, "count": 150},
|
|
],
|
|
},
|
|
},
|
|
{
|
|
"name": "categoria",
|
|
"inferred_type": "categorical",
|
|
"semantic_type": "",
|
|
"null_pct": 0.0,
|
|
"distinct_count": 3,
|
|
"unique_pct": 0.003,
|
|
"quality_score": 99.0,
|
|
"flags": [],
|
|
"categorical": {
|
|
"entropy": 1.05,
|
|
"top": [
|
|
{"value": "neumaticos", "count": 500, "pct": 0.5},
|
|
{"value": "aceite", "count": 300, "pct": 0.3},
|
|
{"value": "filtros", "count": 200, "pct": 0.2},
|
|
],
|
|
},
|
|
},
|
|
],
|
|
"correlations": {
|
|
"pairs": [
|
|
{"a": "precio", "b": "unidades", "value": -0.42, "method": "pearson"},
|
|
],
|
|
},
|
|
}
|
|
|
|
|
|
def test_golden_genera_pdf_multipagina(tmp_path):
    """Golden path: a full profile yields an existing, non-empty, multi-page PDF."""
    out = str(tmp_path / "eda_ventas.pdf")
    res = render_eda_pdf(_synthetic_profile(), out, title="EDA — ventas")

    # The result is a dict with exactly these three keys.
    assert isinstance(res, dict)
    assert set(res.keys()) == {"pdf_path", "n_pages", "note"}
    assert res["pdf_path"] == out
    assert os.path.exists(out)
    assert os.path.getsize(out) > 0
    # Cover + overview + numeric + categorical + quality + correlations >= 5.
    assert res["n_pages"] >= 5
    # The file must start with the PDF magic bytes.
    with open(out, "rb") as fh:
        assert fh.read(4) == b"%PDF"
|
|
|
|
|
|
def test_edge_profile_vacio_no_revienta(tmp_path):
    """Edge case: an empty dict still produces at least one page, no exception."""
    out = str(tmp_path / "vacio.pdf")
    res = render_eda_pdf({}, out)
    assert os.path.exists(out)
    assert os.path.getsize(out) > 0
    assert res["n_pages"] >= 1
    assert res["pdf_path"] == out
|
|
|
|
|
|
def test_edge_profile_none_no_revienta(tmp_path):
    """Edge case: ``None`` is handled like an empty profile, no exception."""
    out = str(tmp_path / "none.pdf")
    res = render_eda_pdf(None, out)
    assert os.path.exists(out)
    assert res["n_pages"] >= 1
|
|
|
|
|
|
def test_edge_solo_numericas(tmp_path):
    """Edge case: a profile with only numeric columns (no categorical, no correlations)."""
    prof = {
        "table": "t",
        "n_rows": 10,
        "n_cols": 1,
        "columns": [
            {
                "name": "x",
                "inferred_type": "numeric",
                "quality_score": 80.0,
                "numeric": {
                    "median": 2.0, "mean": 2.0,
                    "histogram": [{"lo": 0.0, "hi": 4.0, "count": 10}],
                },
            },
        ],
    }
    out = str(tmp_path / "num.pdf")
    res = render_eda_pdf(prof, out)
    assert os.path.exists(out)
    assert res["n_pages"] >= 2  # cover + numeric pages at least.
|
|
|
|
|
|
def test_forward_compat_seccion_desconocida(tmp_path):
    """Forward compatibility: extra profile blocks are rendered and do not break the PDF."""
    prof = {
        "table": "t",
        "n_rows": 5,
        "columns": [],
        # Blocks this renderer does not know about (other agents add them).
        # NOTE(review): this module also imports dedicated builders for models
        # and caveats, so "unknown" may be stale here — confirm. The shapes
        # used below (a bare list for caveats, a kmeans-only models dict) are
        # kept exactly as written.
        "models": {"kmeans": {"k": 3, "silhouette": 0.55}},
        "caveats": ["muestra pequeña", "fechas como texto"],
    }
    out = str(tmp_path / "fwd.pdf")
    res = render_eda_pdf(prof, out)
    assert os.path.exists(out)
    assert res["n_pages"] >= 1
    # No section was dropped because of an error.
    assert "omitida" not in res["note"]
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
# H4: builders dedicados para models / series / caveats (antes caían al volcado
# genérico como str(dict) truncado). Se testean aislados con un stub de pdf.
# --------------------------------------------------------------------------- #
|
|
def _sample_models() -> dict:
|
|
return {
|
|
"n_numeric_cols": 3,
|
|
"pca": {
|
|
"n_components": 2, "n_rows_used": 1000, "n_features": 3,
|
|
"explained_variance_ratio": [0.62, 0.21],
|
|
"cumulative": [0.62, 0.83],
|
|
"top_loadings": [
|
|
{"component": 0, "feature": "precio", "loading": 0.71},
|
|
{"component": 1, "feature": "unidades", "loading": -0.55},
|
|
],
|
|
},
|
|
"kmeans": {
|
|
"best_k": 3, "silhouette": 0.48, "cluster_sizes": [500, 300, 200],
|
|
"scores_by_k": [{"k": 3, "silhouette": 0.48, "inertia": 900.0}],
|
|
},
|
|
"outliers": {"n_outliers": 35, "outlier_pct": 3.5, "threshold": -0.51},
|
|
"normality": {"precio": {"jarque_bera": {"p": 0.0001}, "is_normal": False}},
|
|
"note": "",
|
|
}
|
|
|
|
|
|
def _sample_series() -> dict:
|
|
return {
|
|
"precio": {
|
|
"stationarity": {"verdict": "non_stationary"},
|
|
"acf_pacf": {"is_autocorrelated": True},
|
|
"stl": {"trend_strength": 0.95, "seasonal_strength": 0.10, "period": 7},
|
|
"levels_suggested": True, "levels_kind": "returns",
|
|
},
|
|
}
|
|
|
|
|
|
def _sample_caveats() -> dict:
|
|
return {
|
|
"n": 1,
|
|
"caveats": [
|
|
{"id": "exploratory_nature", "topic": "naturaleza exploratoria",
|
|
"message": "El EDA genera hipótesis, no conclusiones."},
|
|
],
|
|
}
|
|
|
|
|
|
def test_models_builder_produces_pages():
    """The models builder reports at least one page and saves at least one figure."""
    pdf = _StubPdf()
    assert _models_pages(pdf, _sample_models()) >= 1
    assert pdf.figs >= 1
|
|
|
|
|
|
def test_series_builder_produces_pages():
    """The series builder reports at least one page and saves at least one figure."""
    pdf = _StubPdf()
    assert _series_pages(pdf, _sample_series()) >= 1
    assert pdf.figs >= 1
|
|
|
|
|
|
def test_caveats_builder_produces_pages():
    """The caveats builder reports at least one page and saves at least one figure."""
    pdf = _StubPdf()
    assert _caveats_pages(pdf, _sample_caveats()) >= 1
    assert pdf.figs >= 1
|
|
|
|
|
|
def test_builders_tolerate_none_and_empty():
    """Builders given ``None`` or an empty dict return 0 pages and save nothing."""
    pdf = _StubPdf()
    # None / empty -> 0 pages, no exception.
    assert _models_pages(pdf, None) == 0
    assert _series_pages(pdf, {}) == 0
    assert _caveats_pages(pdf, None) == 0
    assert pdf.figs == 0
|
|
|
|
|
|
def test_models_series_caveats_no_caen_al_generico(tmp_path):
    """A full profile with models, series and caveats renders without error.

    With dedicated builders, these three blocks are not meant to be dumped
    into the generic "Otras secciones" fallback.
    """
    prof = _synthetic_profile()
    prof["models"] = _sample_models()
    prof["series"] = _sample_series()
    prof["caveats"] = _sample_caveats()
    out = str(tmp_path / "full.pdf")
    res = render_eda_pdf(prof, out)
    assert os.path.exists(out)
    assert os.path.getsize(out) > 0
    assert "omitida" not in res["note"]
    # Cover + overview + numeric + categorical + quality + correlations
    # + models + series + caveats.
    assert res["n_pages"] >= 8
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
# H9: render_eda_pdf_relational — PDF DB-level (resumen de tablas + join graph).
# --------------------------------------------------------------------------- #
|
|
def _synthetic_db_profile() -> dict:
|
|
return {
|
|
"db_path": "data/shop.duckdb",
|
|
"profiled_at": "2026-06-29 01:00 UTC",
|
|
"n_tables": 2,
|
|
"tables": [
|
|
{"table": "customers", "n_rows": 4, "n_cols": 3, "quality_score": 98.0,
|
|
"key_candidates": ["id"]},
|
|
{"table": "orders", "n_rows": 6, "n_cols": 3, "quality_score": 95.0,
|
|
"key_candidates": ["order_id"]},
|
|
],
|
|
"fk_candidates": [
|
|
{"from_table": "orders", "from_col": "customer_id",
|
|
"to_table": "customers", "to_col": "id",
|
|
"inclusion": 1.0, "cardinality": "N:1"},
|
|
],
|
|
"join_graph": {"mermaid": "graph LR\n orders --> customers"},
|
|
}
|
|
|
|
|
|
def test_relational_golden_genera_pdf(tmp_path):
    """Golden path: a database profile yields an existing, non-empty, multi-page PDF."""
    out = str(tmp_path / "eda_db.pdf")
    res = render_eda_pdf_relational(_synthetic_db_profile(), out, title="EDA base")
    # The result is a dict with exactly these three keys.
    assert isinstance(res, dict)
    assert set(res.keys()) == {"pdf_path", "n_pages", "note"}
    assert res["pdf_path"] == out
    assert os.path.exists(out)
    assert os.path.getsize(out) > 0
    # cover + tables + relations >= 3.
    assert res["n_pages"] >= 3
    # The file must start with the PDF magic bytes.
    with open(out, "rb") as fh:
        assert fh.read(4) == b"%PDF"
|
|
|
|
|
|
def test_relational_edge_vacio_no_revienta(tmp_path):
    """Edge case: an empty database profile still produces at least one page."""
    out = str(tmp_path / "db_vacio.pdf")
    res = render_eda_pdf_relational({}, out)
    assert os.path.exists(out)
    assert res["n_pages"] >= 1
|
|
|
|
|
|
def test_relational_edge_none_no_revienta(tmp_path):
    """Edge case: a ``None`` database profile still produces at least one page."""
    out = str(tmp_path / "db_none.pdf")
    res = render_eda_pdf_relational(None, out)
    assert os.path.exists(out)
    assert res["n_pages"] >= 1
|