feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)

- H4: render_eda_markdown añade sección Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo)
- H9: profile_database gana flag emit_pdf -> PDF móvil DB-level (resumen tablas + join graph) vía render_eda_pdf_relational; clave report_pdf_path
- aditivos y retrocompatibles (flags default False). 38 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 04:05:38 +02:00
parent caf8c25d99
commit c4cff5ed5b
7 changed files with 706 additions and 15 deletions
@@ -9,7 +9,23 @@ import sys

 sys.path.insert(0, os.path.dirname(__file__))

-from render_eda_pdf import render_eda_pdf
+from render_eda_pdf import (
+    render_eda_pdf,
+    render_eda_pdf_relational,
+    _models_pages,
+    _series_pages,
+    _caveats_pages,
+)
+
+
+class _StubPdf:
+    """Stand-in for a PDF object that counts savefig calls without writing anything.
+
+    Used to test the page builders in isolation.
+    """
+
+    def __init__(self):
+        # Number of savefig calls received so far.
+        self.figs = 0
+
+    def savefig(self, fig):
+        """Record one call; the figure itself is ignored."""
+        self.figs += 1


 def _synthetic_profile() -> dict:
@@ -170,3 +186,144 @@ def test_forward_compat_seccion_desconocida(tmp_path):
    assert res["n_pages"] >= 1
    # No se perdió ninguna sección por error.
    assert "omitida" not in res["note"]
+
+
+# --------------------------------------------------------------------------- #
+# H4: builders dedicados para models / series / caveats (antes caían al volcado
+# genérico como str(dict) truncado). Se testean aislados con un stub de pdf.
+# --------------------------------------------------------------------------- #
+def _sample_models() -> dict:
+    """Return a sample "models" section (PCA, KMeans, outliers, normality).
+
+    Used as input for the models page builder and for the full-profile test.
+    """
+    return {
+        "n_numeric_cols": 3,
+        "pca": {
+            "n_components": 2, "n_rows_used": 1000, "n_features": 3,
+            "explained_variance_ratio": [0.62, 0.21],
+            "cumulative": [0.62, 0.83],
+            "top_loadings": [
+                {"component": 0, "feature": "precio", "loading": 0.71},
+                {"component": 1, "feature": "unidades", "loading": -0.55},
+            ],
+        },
+        "kmeans": {
+            "best_k": 3, "silhouette": 0.48, "cluster_sizes": [500, 300, 200],
+            "scores_by_k": [{"k": 3, "silhouette": 0.48, "inertia": 900.0}],
+        },
+        "outliers": {"n_outliers": 35, "outlier_pct": 3.5, "threshold": -0.51},
+        "normality": {"precio": {"jarque_bera": {"p": 0.0001}, "is_normal": False}},
+        "note": "",
+    }
+
+
+def _sample_series() -> dict:
+    """Return a sample "series" section with a single entry keyed by "precio"."""
+    return {
+        "precio": {
+            "stationarity": {"verdict": "non_stationary"},
+            "acf_pacf": {"is_autocorrelated": True},
+            "stl": {"trend_strength": 0.95, "seasonal_strength": 0.10, "period": 7},
+            "levels_suggested": True, "levels_kind": "returns",
+        },
+    }
+
+
+def _sample_caveats() -> dict:
+    """Return a sample "caveats" section containing exactly one caveat."""
+    return {
+        "n": 1,
+        "caveats": [
+            {"id": "exploratory_nature", "topic": "naturaleza exploratoria",
+             "message": "El EDA genera hipótesis, no conclusiones."},
+        ],
+    }
+
+
+def test_models_builder_produces_pages():
+    """_models_pages reports >= 1 page and calls savefig at least once."""
+    pdf = _StubPdf()
+    assert _models_pages(pdf, _sample_models()) >= 1
+    assert pdf.figs >= 1
+
+
+def test_series_builder_produces_pages():
+    """_series_pages reports >= 1 page and calls savefig at least once."""
+    pdf = _StubPdf()
+    assert _series_pages(pdf, _sample_series()) >= 1
+    assert pdf.figs >= 1
+
+
+def test_caveats_builder_produces_pages():
+    """_caveats_pages reports >= 1 page and calls savefig at least once."""
+    pdf = _StubPdf()
+    assert _caveats_pages(pdf, _sample_caveats()) >= 1
+    assert pdf.figs >= 1
+
+
+def test_builders_tolerate_none_and_empty():
+    """Builders return 0 for None or empty input and never call savefig."""
+    pdf = _StubPdf()
+    # None / empty -> 0 pages, no exception.
+    assert _models_pages(pdf, None) == 0
+    assert _series_pages(pdf, {}) == 0
+    assert _caveats_pages(pdf, None) == 0
+    assert pdf.figs == 0
+
+
+def test_models_series_caveats_no_caen_al_generico(tmp_path):
+    """Render a full profile including models, series and caveats to a real PDF."""
+    # With a dedicated builder, models/series/caveats are NOT dumped into
+    # "Otras secciones" (generic). The full profile renders without error.
+    prof = _synthetic_profile()
+    prof["models"] = _sample_models()
+    prof["series"] = _sample_series()
+    prof["caveats"] = _sample_caveats()
+    out = str(tmp_path / "full.pdf")
+    res = render_eda_pdf(prof, out)
+    assert os.path.exists(out)
+    assert os.path.getsize(out) > 0
+    assert "omitida" not in res["note"]
+    # Cover+overview+num+cat+quality+corr + models + series + caveats.
+    assert res["n_pages"] >= 8
+
+
+# --------------------------------------------------------------------------- #
+# H9: render_eda_pdf_relational — PDF DB-level (resumen de tablas + join graph).
+# --------------------------------------------------------------------------- #
+def _synthetic_db_profile() -> dict:
+    """Return a synthetic database-level profile.
+
+    It describes two tables (customers, orders), one foreign-key candidate
+    from orders.customer_id to customers.id, and a Mermaid join graph.
+    """
+    return {
+        "db_path": "data/shop.duckdb",
+        "profiled_at": "2026-06-29 01:00 UTC",
+        "n_tables": 2,
+        "tables": [
+            {"table": "customers", "n_rows": 4, "n_cols": 3, "quality_score": 98.0,
+             "key_candidates": ["id"]},
+            {"table": "orders", "n_rows": 6, "n_cols": 3, "quality_score": 95.0,
+             "key_candidates": ["order_id"]},
+        ],
+        "fk_candidates": [
+            {"from_table": "orders", "from_col": "customer_id",
+             "to_table": "customers", "to_col": "id",
+             "inclusion": 1.0, "cardinality": "N:1"},
+        ],
+        "join_graph": {"mermaid": "graph LR\n  orders --> customers"},
+    }
+
+
+def test_relational_golden_genera_pdf(tmp_path):
+    """Happy path: render_eda_pdf_relational writes a non-empty PDF file.
+
+    Also checks the result dict has exactly the keys pdf_path, n_pages and
+    note, and that the file starts with the %PDF magic bytes.
+    """
+    out = str(tmp_path / "eda_db.pdf")
+    res = render_eda_pdf_relational(_synthetic_db_profile(), out, title="EDA base")
+    assert isinstance(res, dict)
+    assert set(res.keys()) == {"pdf_path", "n_pages", "note"}
+    assert res["pdf_path"] == out
+    assert os.path.exists(out)
+    assert os.path.getsize(out) > 0
+    # cover + tables + relationships >= 3.
+    assert res["n_pages"] >= 3
+    with open(out, "rb") as fh:
+        assert fh.read(4) == b"%PDF"
+
+
+def test_relational_edge_vacio_no_revienta(tmp_path):
+    """An empty profile dict still produces a PDF file with at least one page."""
+    out = str(tmp_path / "db_vacio.pdf")
+    res = render_eda_pdf_relational({}, out)
+    assert os.path.exists(out)
+    assert res["n_pages"] >= 1
+
+
+def test_relational_edge_none_no_revienta(tmp_path):
+    """A None profile still produces a PDF file with at least one page."""
+    out = str(tmp_path / "db_none.pdf")
+    res = render_eda_pdf_relational(None, out)
+    assert os.path.exists(out)
+    assert res["n_pages"] >= 1