feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)

- H4: render_eda_markdown añade sección Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo).
- H9: profile_database gana flag emit_pdf -> PDF móvil DB-level (resumen tablas + join graph) vía render_eda_pdf_relational; clave report_pdf_path.
- Aditivos y retrocompatibles (flags default False). 38 tests verdes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 04:05:38 +02:00
parent caf8c25d99
commit c4cff5ed5b
7 changed files with 706 additions and 15 deletions
@@ -52,6 +52,8 @@ _KNOWN_TOP_KEYS = {
    "duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols",
    "all_null_cols", "quality_score", "type_breakdown", "key_candidates",
    "columns", "correlations", "llm",
+    # Bloques con builder dedicado (no caen al volcado genérico str(dict)).
+    "models", "series", "caveats",
 }

 # Restrained, high-contrast palette: a single accent reads cleanly on a phone.
@@ -59,6 +61,17 @@ _INK = "#1b1b1b"
 _ACCENT = "#2a6f97"
 _MUTED = "#8a8a8a"

+# Tufte-ish render defaults shared by both public entry points.
+# render_eda_pdf binds this dict to its local ``rc`` and
+# render_eda_pdf_relational passes it to ``plt.rc_context``. The dict is
+# shared by reference, so treat it as read-only.
+_RC = {
+    "font.size": 10,
+    "font.family": "sans-serif",
+    "axes.titlesize": 11,
+    "axes.edgecolor": _MUTED,
+    "figure.facecolor": "white",
+    "savefig.facecolor": "white",
+    "pdf.fonttype": 42,  # embed TrueType so text stays selectable on mobile.
+}
+

 # --------------------------------------------------------------------------- #
 # Small formatting + Tufte helpers
@@ -535,6 +548,246 @@ def _paginate_text(pdf, title: str, lines: list, subtitle: str = None,
    return pages


+# --------------------------------------------------------------------------- #
+# Dedicated builders for forward-compat blocks (models / series / caveats).
+# Before these existed, ``models``/``series``/``caveats`` fell to the generic
+# dump and were rendered as truncated ``str(dict)``. Each builder is fully
+# defensive, reads with ``.get`` and returns the number of pages it produced.
+# --------------------------------------------------------------------------- #
+def _models_seq(value) -> list:
+    """Return ``value`` as a list when it is a list or tuple, else ``[]``.
+
+    Keeps a malformed field (scalar, dict, string) from raising while the
+    section text is being built.
+    """
+    return list(value) if isinstance(value, (list, tuple)) else []
+
+
+def _models_pca_lines(pca: dict) -> list:
+    """Build the text lines of the PCA sub-block (summary, variance, loadings)."""
+    lines = ["## PCA"]
+    n_used = pca.get("n_rows_used")
+    n_feat = pca.get("n_features")
+    if n_used is not None or n_feat is not None:
+        n_comp = pca.get("n_components")
+        # Leave the component count out when it is missing rather than
+        # printing a literal "None comp".
+        comp_part = f"{n_comp} comp · " if n_comp is not None else ""
+        lines.append(
+            f"  {comp_part}{_fmt_num(n_used)} filas · {_fmt_num(n_feat)} features"
+        )
+    evr = _models_seq(pca.get("explained_variance_ratio"))
+    cum = _models_seq(pca.get("cumulative"))
+    for i, var in enumerate(evr):
+        # The cumulative list may be shorter than the variance list.
+        acc = cum[i] if i < len(cum) else None
+        lines.append(f"  PC{i + 1}: var {_fmt_pct(var)}  acum {_fmt_pct(acc)}")
+    loadings = _models_seq(pca.get("top_loadings"))
+    if loadings:
+        lines.append("  cargas principales:")
+        for ld in loadings[:8]:  # cap the list so it stays readable on a phone
+            if not isinstance(ld, dict):
+                continue
+            comp = ld.get("component")
+            # bool is an int subclass; only genuine ints are shifted to a
+            # 1-based "PCn" label, anything else is shown verbatim.
+            is_index = isinstance(comp, int) and not isinstance(comp, bool)
+            comp_label = f"PC{comp + 1}" if is_index else str(comp)
+            lines.append(
+                f"    {comp_label} {_truncate(ld.get('feature'), 18)}: "
+                f"{_fmt_num(ld.get('loading'), 3)}"
+            )
+    lines.append("")
+    return lines
+
+
+def _models_kmeans_lines(km: dict) -> list:
+    """Build the text lines of the KMeans sub-block (best k, sizes, scores)."""
+    lines = ["## KMeans"]
+    head = f"  mejor k = {_fmt_num(km.get('best_k'))}"
+    silhouette = km.get("silhouette")
+    if silhouette is not None:
+        head += f"  silhouette {_fmt_num(silhouette, 3)}"
+    lines.append(head)
+    sizes = _models_seq(km.get("cluster_sizes"))
+    if sizes:
+        lines.append("  tamaños cluster: " + ", ".join(
+            _fmt_num(s) for s in sizes))
+    for sc in _models_seq(km.get("scores_by_k")):
+        if not isinstance(sc, dict):
+            continue
+        lines.append(
+            f"    k={sc.get('k')}: silhouette {_fmt_num(sc.get('silhouette'), 3)}"
+            f"  inertia {_fmt_num(sc.get('inertia'), 1)}"
+        )
+    lines.append("")
+    return lines
+
+
+def _models_outlier_lines(out: dict) -> list:
+    """Build the text lines of the multivariate-outlier sub-block."""
+    # outlier_pct from the model already comes on a 0-100 scale.
+    line = f"  {_fmt_num(out.get('n_outliers'))} outliers"
+    pct = out.get("outlier_pct")
+    if pct is not None:
+        line += f" ({_fmt_num(pct, 2)}%)"
+    threshold = out.get("threshold")
+    if threshold is not None:
+        line += f"  umbral {_fmt_num(threshold, 3)}"
+    return ["## Outliers multivariante (Isolation Forest)", line, ""]
+
+
+def _models_normality_lines(normality: dict) -> list:
+    """Build the text lines of the per-column normality sub-block."""
+    lines = ["## Normalidad (Jarque-Bera)"]
+    for col_name, res in normality.items():
+        if not isinstance(res, dict):
+            continue
+        jb = res.get("jarque_bera")
+        if not isinstance(jb, dict):
+            # A scalar or otherwise malformed entry must not abort the section.
+            jb = {}
+        lines.append(
+            f"  {_truncate(col_name, 18):<18} normal={res.get('is_normal')}"
+            f"  JB p={_fmt_num(jb.get('p'), 4)}"
+        )
+    lines.append("")
+    return lines
+
+
+def _models_pages(pdf, models) -> int:
+    """Render the cheap-models block (PCA / KMeans / outliers / normality).
+
+    Args:
+        pdf: open PDF sink, forwarded untouched to ``_paginate_text``.
+        models: the profile's ``models`` value. Anything that is not a dict is
+            ignored; sub-blocks that are not dicts are skipped, and list-like
+            fields that are not a list/tuple are treated as empty.
+
+    Returns:
+        Whatever ``_paginate_text`` returns for the "Modelos" section (the page
+        count), or 0 when there is nothing to print.
+    """
+    if not isinstance(models, dict):
+        return 0
+
+    # Fixed order: PCA, KMeans, outliers, normality.
+    sections = (
+        ("pca", _models_pca_lines),
+        ("kmeans", _models_kmeans_lines),
+        ("outliers", _models_outlier_lines),
+        ("normality", _models_normality_lines),
+    )
+    lines = []
+    for key, build in sections:
+        block = models.get(key)
+        if isinstance(block, dict):
+            lines.extend(build(block))
+
+    note = models.get("note")
+    if note:
+        lines.append(f"nota: {note}")
+
+    if not any(ln.strip() for ln in lines):
+        return 0
+    return _paginate_text(pdf, "Modelos", lines)
+
+
+def _series_pages(pdf, series) -> int:
+    """Render the time-series block: one compact summary per series column.
+
+    Args:
+        pdf: open PDF sink, forwarded untouched to ``_paginate_text``.
+        series: the profile's ``series`` value, a dict keyed by column name.
+            Non-dict input, an empty dict and non-dict entries are ignored. A
+            sub-block (``stationarity`` / ``acf_pacf`` / ``stl``) that is not a
+            dict is treated as absent instead of raising.
+
+    Returns:
+        Whatever ``_paginate_text`` returns for the "Series temporales"
+        section (the page count), or 0 when there is nothing to print.
+    """
+    if not isinstance(series, dict) or not series:
+        return 0
+
+    def _sub(block: dict, key: str) -> dict:
+        """Return ``block[key]`` when it is a dict, else an empty dict."""
+        value = block.get(key)
+        return value if isinstance(value, dict) else {}
+
+    lines = []
+    for col, s in series.items():
+        if not isinstance(s, dict):
+            continue
+        lines.append(f"## {col}")
+
+        stat = _sub(s, "stationarity")
+        if stat.get("verdict") is not None:
+            lines.append(f"  estacionariedad (ADF+KPSS): {stat.get('verdict')}")
+
+        acf = _sub(s, "acf_pacf")
+        if acf.get("is_autocorrelated") is not None:
+            lines.append(
+                "  autocorrelada (Ljung-Box): "
+                + ("sí" if acf.get("is_autocorrelated") else "no")
+            )
+
+        stl = _sub(s, "stl")
+        if stl.get("trend_strength") is not None:
+            lines.append(
+                f"  fuerza tendencia (STL): {_fmt_num(stl.get('trend_strength'), 3)}")
+        if stl.get("seasonal_strength") is not None:
+            extra = (f"  (periodo {stl.get('period')})"
+                     if stl.get("period") is not None else "")
+            lines.append(
+                f"  fuerza estacional (STL): "
+                f"{_fmt_num(stl.get('seasonal_strength'), 3)}{extra}")
+        elif stl.get("note"):
+            # The STL note is only shown when no seasonal strength is present.
+            lines.append(f"  STL: {_truncate(stl.get('note'), 60)}")
+
+        if s.get("levels_suggested"):
+            kind = s.get("levels_kind")
+            if kind == "returns":
+                lines.append("  sugerencia: convertir a retornos (serie financiera)")
+            elif kind == "differences":
+                lines.append("  sugerencia: trabajar sobre diferencias (serie física)")
+            else:
+                lines.append("  sugerencia: retornos o diferencias (serie de niveles)")
+        lines.append("")
+
+    if not any(ln.strip() for ln in lines):
+        return 0
+    return _paginate_text(pdf, "Series temporales", lines)
+
+
+def _caveats_pages(pdf, caveats) -> int:
+    """Render the exploratory caveats as headed, word-wrapped paragraphs.
+
+    ``caveats`` may be a dict carrying the list under its ``"caveats"`` key or
+    the list itself; any other type yields no pages. Only dict entries are
+    rendered: the heading comes from ``topic`` (falling back to ``id``) and the
+    body from ``message`` wrapped at 78 characters.
+
+    Returns:
+        Whatever ``_paginate_text`` returns for the section (the page count),
+        or 0 when there is nothing to print.
+    """
+    if isinstance(caveats, dict):
+        entries = caveats.get("caveats") or []
+    elif isinstance(caveats, list):
+        entries = caveats
+    else:
+        entries = []
+
+    body = []
+    for entry in entries:
+        if isinstance(entry, dict):
+            heading = entry.get("topic") or entry.get("id") or ""
+            text = str(entry.get("message") or "")
+            wrapped = textwrap.wrap(text, width=78)
+            body.append(f"## {heading}")
+            # An empty message still gets one blank body line under its heading.
+            body.extend(wrapped if wrapped else [""])
+            body.append("")
+
+    if not any(row.strip() for row in body):
+        return 0
+    return _paginate_text(
+        pdf,
+        "Avisos exploratorios",
+        body,
+        subtitle="el EDA genera hipótesis, no conclusiones",
+    )
+
+
+# --------------------------------------------------------------------------- #
+# DB-level (relational) page builders — used by render_eda_pdf_relational.
+# --------------------------------------------------------------------------- #
+def _db_cover_page(pdf, db_profile: dict, title: str) -> int:
+    """Draw the cover of a DatabaseProfile: name, date, table count, FK count.
+
+    Args:
+        pdf: open PDF sink; the finished figure is written with ``pdf.savefig``.
+        db_profile: DatabaseProfile dict. Reads ``db_path``, ``profiled_at``,
+            ``n_tables`` and ``fk_candidates``, all optional.
+        title: cover heading; when falsy it defaults to
+            ``"EDA base — <basename of db_path>"``.
+
+    Returns:
+        1, the number of pages written.
+    """
+    fig = plt.figure(figsize=_A5_PORTRAIT)
+    # Close the figure even when drawing or saving fails; otherwise pyplot
+    # keeps it registered and each failed render leaks one figure.
+    try:
+        db_path = db_profile.get("db_path") or "(base sin nombre)"
+        heading = title or f"EDA base — {os.path.basename(str(db_path))}"
+        fig.text(0.08, 0.82, heading, fontsize=20, fontweight="bold", color=_INK,
+                 wrap=True)
+
+        sub = [f"fuente: {_truncate(db_path, 44)}"]
+        # Fall back to the render time when the profile has no timestamp.
+        when = db_profile.get("profiled_at") or datetime.now(timezone.utc).strftime(
+            "%Y-%m-%d %H:%M UTC")
+        sub.append(f"generado: {when}")
+        fig.text(0.08, 0.74, "\n".join(sub), fontsize=10, color=_MUTED, va="top")
+
+        n_tables = db_profile.get("n_tables")
+        fig.text(0.08, 0.58, f"{_fmt_num(n_tables)} tablas", fontsize=16,
+                 color=_ACCENT, fontweight="bold")
+        # Count only a real list, matching what the relations page tabulates;
+        # a malformed value counts as zero instead of raising in len().
+        fks = db_profile.get("fk_candidates")
+        n_fk = len(fks) if isinstance(fks, list) else 0
+        fig.text(0.08, 0.51, f"{_fmt_num(n_fk)} relaciones FK candidatas",
+                 fontsize=12, color=_INK)
+
+        fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil",
+                 fontsize=8, color=_MUTED, style="italic")
+        pdf.savefig(fig)
+    finally:
+        plt.close(fig)
+    return 1
+
+
+def _db_tables_page(pdf, db_profile: dict) -> int:
+    """Summarise every table (rows / cols / quality) as fixed-width text.
+
+    Reads ``db_profile["tables"]``; a missing, empty or non-list value produces
+    no page, and non-dict entries are skipped.
+
+    Returns:
+        Whatever ``_paginate_text`` returns for the "Tablas" section (the page
+        count), or 0 when there are no tables.
+    """
+    entries = db_profile.get("tables") or []
+    if not (isinstance(entries, list) and entries):
+        return 0
+
+    header = f"{'tabla':<24}{'filas':>9}{'cols':>6}{'cal':>6}"
+    rows = [header, "-" * 45]
+    # One fixed-width row per table, aligned with the header columns above.
+    rows.extend(
+        f"{_truncate(entry.get('table'), 24):<24}"
+        f"{_fmt_num(entry.get('n_rows')):>9}"
+        f"{_fmt_num(entry.get('n_cols')):>6}"
+        f"{_fmt_num(entry.get('quality_score'), 1):>6}"
+        for entry in entries
+        if isinstance(entry, dict)
+    )
+    return _paginate_text(pdf, "Tablas", rows, subtitle="resumen por tabla")
+
+
+def _db_fk_page(pdf, db_profile: dict) -> int:
+    """Render the FK-candidates table followed by the join-graph mermaid text.
+
+    Args:
+        pdf: open PDF sink, forwarded untouched to ``_paginate_text``.
+        db_profile: DatabaseProfile dict. Reads ``fk_candidates`` (list of
+            dicts with ``from_table`` / ``from_col`` / ``to_table`` /
+            ``to_col`` / ``inclusion``) and ``join_graph["mermaid"]``.
+
+    Returns:
+        Whatever ``_paginate_text`` returns for the section (the page count).
+        A placeholder line is printed when there are no FK candidates.
+    """
+    fks = db_profile.get("fk_candidates") or []
+    lines = []
+    if isinstance(fks, list) and fks:
+        lines.append(f"{'from':<26}{'to':<26}{'incl':>7}")
+        lines.append("-" * 59)
+        for fk in fks:
+            if not isinstance(fk, dict):
+                continue
+            frm = f"{fk.get('from_table')}.{fk.get('from_col')}"
+            to = f"{fk.get('to_table')}.{fk.get('to_col')}"
+            inc = fk.get("inclusion")
+            # Format real numbers only; bools and other values print verbatim.
+            inc_s = (_fmt_num(inc, 3) if isinstance(inc, (int, float))
+                     and not isinstance(inc, bool) else str(inc))
+            lines.append(
+                f"{_truncate(frm, 25):<26}{_truncate(to, 25):<26}{inc_s:>7}")
+    else:
+        lines.append("(sin relaciones FK candidatas detectadas)")
+
+    # A malformed (non-dict) join_graph must not discard the FK table built
+    # above, so it is simply treated as "no graph".
+    join_graph = db_profile.get("join_graph")
+    mermaid = join_graph.get("mermaid") if isinstance(join_graph, dict) else None
+    if mermaid:
+        lines.append("")
+        lines.append("## join graph (mermaid)")
+        for raw in str(mermaid).splitlines():
+            lines.append(_truncate(raw, 72))
+    return _paginate_text(pdf, "Relaciones inter-tabla", lines,
+                          subtitle="FK candidatas + join graph")
+
+
 # --------------------------------------------------------------------------- #
 # Public entry point
 # --------------------------------------------------------------------------- #
@@ -580,16 +833,8 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
        return {"pdf_path": None, "n_pages": 0,
                "note": f"no se pudo crear el directorio destino: {e}"}

-    # Tufte-ish defaults scoped to this render only.
-    rc = {
-        "font.size": 10,
-        "font.family": "sans-serif",
-        "axes.titlesize": 11,
-        "axes.edgecolor": _MUTED,
-        "figure.facecolor": "white",
-        "savefig.facecolor": "white",
-        "pdf.fonttype": 42,  # embed TrueType so text stays selectable on mobile.
-    }
+    # Tufte-ish defaults shared with the relational renderer (module-level _RC).
+    rc = _RC

    # Each section is isolated: a failure in one never aborts the whole PDF.
    builders = [
@@ -599,7 +844,10 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
        ("categorical", lambda p: _categorical_pages(p, columns)),
        ("quality", lambda p: _quality_page(p, columns)),
        ("correlations", lambda p: _correlations_page(p, profile.get("correlations"))),
+        ("models", lambda p: _models_pages(p, profile.get("models"))),
+        ("series", lambda p: _series_pages(p, profile.get("series"))),
        ("llm", lambda p: _llm_pages(p, profile.get("llm"))),
+        ("caveats", lambda p: _caveats_pages(p, profile.get("caveats"))),
        ("generic", lambda p: _generic_pages(p, profile)),
    ]

@@ -624,3 +872,71 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
    if notes:
        note += " · " + "; ".join(notes)
    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
+
+
+def render_eda_pdf_relational(db_profile: dict, out_path: str,
+                             title: str = None) -> dict:
+    """Render a DatabaseProfile dict into a portable, mobile-readable PDF.
+
+    DB-level sibling of :func:`render_eda_pdf`: instead of a single table it
+    summarises a whole database (the dict ``profile_database`` returns under
+    ``db_profile``). The PDF has three sections: a cover (table + FK counts), a
+    per-table summary (rows / cols / quality) and the inter-table relations
+    (FK candidates plus the join-graph mermaid text). A section may span
+    several pages or none. Every key is read defensively and any section that
+    fails is noted, never aborting the whole render.
+
+    Args:
+        db_profile: DatabaseProfile dict from ``profile_database`` (the value
+            under ``db_profile``). May have keys absent or None; ``None`` is
+            treated as an empty profile and still yields a non-empty PDF.
+        out_path: filesystem path where the PDF is written. Parent directories
+            are created if missing.
+        title: optional cover title. Defaults to ``"EDA base — <db filename>"``.
+
+    Returns:
+        dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}.
+        On a fatal error (bad ``db_profile`` type, unusable ``out_path``,
+        write failure) ``pdf_path`` is None and ``note`` explains why.
+    """
+    if db_profile is None:
+        db_profile = {}
+    if not isinstance(db_profile, dict):
+        return {"pdf_path": None, "n_pages": 0,
+                "note": f"db_profile no es dict: {type(db_profile).__name__}"}
+
+    try:
+        parent = os.path.dirname(os.path.abspath(out_path))
+        os.makedirs(parent, exist_ok=True)
+    except (OSError, TypeError, ValueError) as e:
+        # TypeError: out_path is not path-like (e.g. None).
+        # ValueError: embedded NUL byte in the path.
+        # Both must be reported, not raised, to honour the never-raises contract.
+        return {"pdf_path": None, "n_pages": 0,
+                "note": f"no se pudo crear el directorio destino: {e}"}
+
+    notes = []
+    n_pages = 0
+
+    # Each section is isolated: a failure in one never aborts the whole PDF.
+    builders = [
+        ("cover", lambda p: _db_cover_page(p, db_profile, title)),
+        ("tables", lambda p: _db_tables_page(p, db_profile)),
+        ("relations", lambda p: _db_fk_page(p, db_profile)),
+    ]
+
+    try:
+        # The rc overrides are scoped to this render by the context manager.
+        with plt.rc_context(_RC):
+            with PdfPages(out_path) as pdf:
+                for name, build in builders:
+                    try:
+                        n_pages += build(pdf) or 0
+                    except Exception as e:  # noqa: BLE001 — one bad section never aborts.
+                        notes.append(f"sección '{name}' omitida: {e}")
+                if n_pages == 0:
+                    # Every section failed or was empty: emit a placeholder page.
+                    n_pages += _text_page(
+                        pdf, title or "EDA base", ["(base vacía — sin secciones)"]
+                    )
+    except Exception as e:  # noqa: BLE001
+        return {"pdf_path": None, "n_pages": 0,
+                "note": f"fallo al escribir el PDF: {e}"}
+
+    note = f"{n_pages} páginas"
+    if notes:
+        note += " · " + "; ".join(notes)
+    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}