From c4cff5ed5bc51303511d316272785b4db25efeb1 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Mon, 29 Jun 2026 04:05:38 +0200 Subject: [PATCH] feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9) - H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) --- python/functions/datascience/__init__.py | 3 +- .../datascience/render_eda_markdown.py | 104 ++++++ .../datascience/render_eda_markdown_test.py | 59 +++ .../functions/datascience/render_eda_pdf.py | 336 +++++++++++++++++- .../datascience/render_eda_pdf_test.py | 159 ++++++++- .../functions/pipelines/profile_database.py | 27 +- .../pipelines/profile_database_test.py | 33 ++ 7 files changed, 706 insertions(+), 15 deletions(-) diff --git a/python/functions/datascience/__init__.py b/python/functions/datascience/__init__.py index b8f0388f..65cefda7 100644 --- a/python/functions/datascience/__init__.py +++ b/python/functions/datascience/__init__.py @@ -52,7 +52,7 @@ from .to_returns import to_returns from .fdr_correction import fdr_correction from .suggest_reexpression import suggest_reexpression from .exploratory_caveats import exploratory_caveats -from .render_eda_pdf import render_eda_pdf +from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational __all__ = [ "decode_qr_image", @@ -64,6 +64,7 @@ __all__ = [ "suggest_reexpression", "exploratory_caveats", "render_eda_pdf", + "render_eda_pdf_relational", "summarize_table_duckdb", "summarize_table_pg", "spearman_corr", diff --git a/python/functions/datascience/render_eda_markdown.py b/python/functions/datascience/render_eda_markdown.py index b0b06863..05b77009 100644 --- a/python/functions/datascience/render_eda_markdown.py +++ b/python/functions/datascience/render_eda_markdown.py @@ -405,6 +405,110 @@ def render_eda_markdown(profile: dict) -> str: parts.append("## Series temporales") parts.extend(series_blocks) + # 7d. Modelos baratos (PCA, KMeans, outliers multivariantes, normalidad). El + # pipeline corre `run_eda_models` cuando se pide con run_models; el bloque está + # completo en el JSON pero antes no tenía formatter en markdown y se omitía. Se + # lee todo defensivo con .get y cada submodelo se renderiza solo si está presente. + models = profile.get("models") + if isinstance(models, dict): + model_parts: list[str] = [] + + pca = models.get("pca") + if isinstance(pca, dict): + evr = pca.get("explained_variance_ratio") or [] + cum = pca.get("cumulative") or [] + pca_rows = [] + for i, var in enumerate(evr): + acc = cum[i] if i < len(cum) else None + pca_rows.append([f"PC{i + 1}", _fmt_pct(var), _fmt_pct(acc)]) + sub = ["### PCA"] + n_feat = pca.get("n_features") + n_used = pca.get("n_rows_used") + if n_feat is not None or n_used is not None: + sub.append( + f"{pca.get('n_components')} componentes sobre " + f"{n_used if n_used is not None else '?'} filas, " + f"{n_feat if n_feat is not None else '?'} features." + ) + if pca_rows: + sub.append(_md_table( + ["componente", "var. explicada", "acumulada"], pca_rows)) + loadings = pca.get("top_loadings") or [] + load_rows = [] + for ld in loadings[:12]: + if not isinstance(ld, dict): + continue + comp = ld.get("component") + comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp) + load_rows.append([comp_label, ld.get("feature"), + _fmt_num(ld.get("loading"), 3)]) + if load_rows: + sub.append("Cargas principales:") + sub.append(_md_table(["componente", "feature", "carga"], load_rows)) + model_parts.append("\n\n".join(sub)) + + km = models.get("kmeans") + if isinstance(km, dict): + sub = ["### KMeans"] + best_k = km.get("best_k") + sil = km.get("silhouette") + sizes = km.get("cluster_sizes") or [] + head = f"mejor k = {_fmt_num(best_k)}" + if sil is not None: + head += f" (silhouette {_fmt_num(sil, 3)})" + if sizes: + head += ". Tamaños de cluster: " + ", ".join( + _fmt_num(s) for s in sizes) + sub.append(head + ".") + score_rows = [] + for sc in km.get("scores_by_k") or []: + if not isinstance(sc, dict): + continue + score_rows.append([sc.get("k"), _fmt_num(sc.get("silhouette"), 3), + _fmt_num(sc.get("inertia"), 2)]) + if score_rows: + sub.append(_md_table(["k", "silhouette", "inertia"], score_rows)) + model_parts.append("\n\n".join(sub)) + + out = models.get("outliers") + if isinstance(out, dict): + # outlier_pct del modelo multivariante ya viene en escala 0-100. + n_out = out.get("n_outliers") + pct = out.get("outlier_pct") + thr = out.get("threshold") + line = f"{_fmt_num(n_out)} filas marcadas como outlier" + if pct is not None: + line += f" ({_fmt_num(pct, 2)}%)" + if thr is not None: + line += f"; umbral de score {_fmt_num(thr, 3)}" + model_parts.append("### Outliers multivariante (Isolation Forest)\n\n" + + line + ".") + + normality = models.get("normality") + if isinstance(normality, dict): + norm_rows = [] + for col_name, res in normality.items(): + if not isinstance(res, dict): + continue + jb = res.get("jarque_bera") or {} + norm_rows.append([ + col_name, + "sí" if res.get("is_normal") else "no", + _fmt_num(jb.get("p")) if jb.get("p") is not None else "", + ]) + if norm_rows: + model_parts.append( + "### Normalidad\n\n" + + _md_table(["columna", "normal", "Jarque-Bera p"], norm_rows)) + + note = models.get("note") + if note: + model_parts.append(f"> {note}") + + if model_parts: + parts.append("## Modelos") + parts.extend(model_parts) + # 8. LLM analysis (tolerate None for now). llm = profile.get("llm") if llm: diff --git a/python/functions/datascience/render_eda_markdown_test.py b/python/functions/datascience/render_eda_markdown_test.py index 7ecd3785..d5399f79 100644 --- a/python/functions/datascience/render_eda_markdown_test.py +++ b/python/functions/datascience/render_eda_markdown_test.py @@ -173,3 +173,62 @@ def test_tolerates_empty_profile(): def test_tolerates_none_profile(): md = render_eda_markdown(None) assert "# EDA — (unnamed)" in md + + +def _sample_models(): + """Bloque `models` como el que produce run_eda_models (PCA/KMeans/...).""" + return { + "n_numeric_cols": 3, + "pca": { + "n_components": 2, + "n_rows_used": 1000, + "n_features": 3, + "explained_variance_ratio": [0.62, 0.21], + "cumulative": [0.62, 0.83], + "top_loadings": [ + {"component": 0, "feature": "price", "loading": 0.71}, + {"component": 1, "feature": "qty", "loading": -0.55}, + ], + }, + "kmeans": { + "best_k": 3, + "silhouette": 0.48, + "cluster_sizes": [500, 300, 200], + "scores_by_k": [ + {"k": 2, "silhouette": 0.41, "inertia": 1200.0}, + {"k": 3, "silhouette": 0.48, "inertia": 900.0}, + ], + }, + "outliers": { + "n_outliers": 35, + "outlier_pct": 3.5, + "threshold": -0.51, + }, + "normality": { + "price": {"jarque_bera": {"p": 0.0001}, "is_normal": False}, + }, + "note": "", + } + + +def test_models_section_rendered(): + # H4: el bloque models antes se omitía en markdown; ahora tiene formatter. + profile = _sample_profile() + profile["models"] = _sample_models() + md = render_eda_markdown(profile) + assert "## Modelos" in md + assert "### PCA" in md + assert "### KMeans" in md + assert "### Outliers multivariante (Isolation Forest)" in md + assert "### Normalidad" in md + # Datos reales del PCA renderizados (varianza explicada ×100) y KMeans. + assert "62.0" in md # explained_variance_ratio 0.62 -> 62.00% + assert "mejor k = 3" in md + # outlier_pct del modelo ya viene en escala 0-100: 3.5 -> "3.5%", no "350". + assert "3.5%" in md + + +def test_models_absent_when_none(): + # Edge: profile sin models (None) no produce sección Modelos ni rompe. + md = render_eda_markdown(_sample_profile()) # models=None en el sample + assert "## Modelos" not in md diff --git a/python/functions/datascience/render_eda_pdf.py b/python/functions/datascience/render_eda_pdf.py index b2a1bf50..a0a39f4b 100644 --- a/python/functions/datascience/render_eda_pdf.py +++ b/python/functions/datascience/render_eda_pdf.py @@ -52,6 +52,8 @@ _KNOWN_TOP_KEYS = { "duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols", "all_null_cols", "quality_score", "type_breakdown", "key_candidates", "columns", "correlations", "llm", + # Bloques con builder dedicado (no caen al volcado genérico str(dict)). + "models", "series", "caveats", } # Restrained, high-contrast palette: a single accent reads cleanly on a phone. @@ -59,6 +61,17 @@ _INK = "#1b1b1b" _ACCENT = "#2a6f97" _MUTED = "#8a8a8a" +# Tufte-ish render defaults shared by both public entry points. +_RC = { + "font.size": 10, + "font.family": "sans-serif", + "axes.titlesize": 11, + "axes.edgecolor": _MUTED, + "figure.facecolor": "white", + "savefig.facecolor": "white", + "pdf.fonttype": 42, # embed TrueType so text stays selectable on mobile. +} + # --------------------------------------------------------------------------- # # Small formatting + Tufte helpers @@ -535,6 +548,246 @@ def _paginate_text(pdf, title: str, lines: list, subtitle: str = None, return pages +# --------------------------------------------------------------------------- # +# Dedicated builders for forward-compat blocks (models / series / caveats). +# Before these existed, ``models``/``series``/``caveats`` fell to the generic +# dump and were rendered as truncated ``str(dict)``. Each builder is fully +# defensive, reads with ``.get`` and returns the number of pages it produced. +# --------------------------------------------------------------------------- # +def _models_pages(pdf, models) -> int: + """Render the cheap-models block (PCA / KMeans / outliers / normality).""" + if not isinstance(models, dict): + return 0 + lines = [] + + pca = models.get("pca") + if isinstance(pca, dict): + lines.append("## PCA") + n_used = pca.get("n_rows_used") + n_feat = pca.get("n_features") + if n_used is not None or n_feat is not None: + lines.append( + f" {pca.get('n_components')} comp · " + f"{_fmt_num(n_used)} filas · {_fmt_num(n_feat)} features" + ) + evr = pca.get("explained_variance_ratio") or [] + cum = pca.get("cumulative") or [] + for i, var in enumerate(evr): + acc = cum[i] if i < len(cum) else None + lines.append(f" PC{i + 1}: var {_fmt_pct(var)} acum {_fmt_pct(acc)}") + loadings = pca.get("top_loadings") or [] + if loadings: + lines.append(" cargas principales:") + for ld in loadings[:8]: + if not isinstance(ld, dict): + continue + comp = ld.get("component") + comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp) + lines.append( + f" {comp_label} {_truncate(ld.get('feature'), 18)}: " + f"{_fmt_num(ld.get('loading'), 3)}" + ) + lines.append("") + + km = models.get("kmeans") + if isinstance(km, dict): + lines.append("## KMeans") + head = f" mejor k = {_fmt_num(km.get('best_k'))}" + if km.get("silhouette") is not None: + head += f" silhouette {_fmt_num(km.get('silhouette'), 3)}" + lines.append(head) + sizes = km.get("cluster_sizes") or [] + if sizes: + lines.append(" tamaños cluster: " + ", ".join( + _fmt_num(s) for s in sizes)) + for sc in km.get("scores_by_k") or []: + if not isinstance(sc, dict): + continue + lines.append( + f" k={sc.get('k')}: silhouette {_fmt_num(sc.get('silhouette'), 3)}" + f" inertia {_fmt_num(sc.get('inertia'), 1)}" + ) + lines.append("") + + out = models.get("outliers") + if isinstance(out, dict): + lines.append("## Outliers multivariante (Isolation Forest)") + # outlier_pct del modelo ya viene en escala 0-100. + line = f" {_fmt_num(out.get('n_outliers'))} outliers" + if out.get("outlier_pct") is not None: + line += f" ({_fmt_num(out.get('outlier_pct'), 2)}%)" + if out.get("threshold") is not None: + line += f" umbral {_fmt_num(out.get('threshold'), 3)}" + lines.append(line) + lines.append("") + + normality = models.get("normality") + if isinstance(normality, dict): + lines.append("## Normalidad (Jarque-Bera)") + for col_name, res in normality.items(): + if not isinstance(res, dict): + continue + jb = res.get("jarque_bera") or {} + lines.append( + f" {_truncate(col_name, 18):<18} normal={res.get('is_normal')}" + f" JB p={_fmt_num(jb.get('p'), 4)}" + ) + lines.append("") + + note = models.get("note") + if note: + lines.append(f"nota: {note}") + + if not [ln for ln in lines if ln.strip()]: + return 0 + return _paginate_text(pdf, "Modelos", lines) + + +def _series_pages(pdf, series) -> int: + """Render the time-series block: one compact summary per series column.""" + if not isinstance(series, dict) or not series: + return 0 + lines = [] + for col, s in series.items(): + if not isinstance(s, dict): + continue + lines.append(f"## {col}") + stat = s.get("stationarity") or {} + if stat.get("verdict") is not None: + lines.append(f" estacionariedad (ADF+KPSS): {stat.get('verdict')}") + acf = s.get("acf_pacf") or {} + if acf.get("is_autocorrelated") is not None: + lines.append( + " autocorrelada (Ljung-Box): " + + ("sí" if acf.get("is_autocorrelated") else "no") + ) + stl = s.get("stl") or {} + if stl.get("trend_strength") is not None: + lines.append( + f" fuerza tendencia (STL): {_fmt_num(stl.get('trend_strength'), 3)}") + if stl.get("seasonal_strength") is not None: + extra = (f" (periodo {stl.get('period')})" + if stl.get("period") is not None else "") + lines.append( + f" fuerza estacional (STL): " + f"{_fmt_num(stl.get('seasonal_strength'), 3)}{extra}") + elif stl.get("note"): + lines.append(f" STL: {_truncate(stl.get('note'), 60)}") + if s.get("levels_suggested"): + kind = s.get("levels_kind") + if kind == "returns": + lines.append(" sugerencia: convertir a retornos (serie financiera)") + elif kind == "differences": + lines.append(" sugerencia: trabajar sobre diferencias (serie física)") + else: + lines.append(" sugerencia: retornos o diferencias (serie de niveles)") + lines.append("") + if not [ln for ln in lines if ln.strip()]: + return 0 + return _paginate_text(pdf, "Series temporales", lines) + + +def _caveats_pages(pdf, caveats) -> int: + """Render the exploratory caveats block as a wrapped, readable list.""" + cav_list = [] + if isinstance(caveats, dict): + cav_list = caveats.get("caveats") or [] + elif isinstance(caveats, list): + cav_list = caveats + lines = [] + for cav in cav_list: + if not isinstance(cav, dict): + continue + topic = cav.get("topic") or cav.get("id") or "" + msg = cav.get("message") or "" + lines.append(f"## {topic}") + lines.extend(textwrap.wrap(str(msg), width=78) or [""]) + lines.append("") + if not [ln for ln in lines if ln.strip()]: + return 0 + return _paginate_text(pdf, "Avisos exploratorios", lines, + subtitle="el EDA genera hipótesis, no conclusiones") + + +# --------------------------------------------------------------------------- # +# DB-level (relational) page builders — used by render_eda_pdf_relational. +# --------------------------------------------------------------------------- # +def _db_cover_page(pdf, db_profile: dict, title: str) -> int: + """Cover for a DatabaseProfile: name, date, table count, FK count.""" + fig = plt.figure(figsize=_A5_PORTRAIT) + db_path = db_profile.get("db_path") or "(base sin nombre)" + heading = title or f"EDA base — {os.path.basename(str(db_path))}" + fig.text(0.08, 0.82, heading, fontsize=20, fontweight="bold", color=_INK, + wrap=True) + + sub = [f"fuente: {_truncate(db_path, 44)}"] + when = db_profile.get("profiled_at") or datetime.now(timezone.utc).strftime( + "%Y-%m-%d %H:%M UTC") + sub.append(f"generado: {when}") + fig.text(0.08, 0.74, "\n".join(sub), fontsize=10, color=_MUTED, va="top") + + n_tables = db_profile.get("n_tables") + fig.text(0.08, 0.58, f"{_fmt_num(n_tables)} tablas", fontsize=16, + color=_ACCENT, fontweight="bold") + n_fk = len(db_profile.get("fk_candidates") or []) + fig.text(0.08, 0.51, f"{_fmt_num(n_fk)} relaciones FK candidatas", + fontsize=12, color=_INK) + + fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil", + fontsize=8, color=_MUTED, style="italic") + pdf.savefig(fig) + plt.close(fig) + return 1 + + +def _db_tables_page(pdf, db_profile: dict) -> int: + """One text page summarising every table (rows / cols / quality).""" + tables = db_profile.get("tables") or [] + if not isinstance(tables, list) or not tables: + return 0 + lines = [f"{'tabla':<24}{'filas':>9}{'cols':>6}{'cal':>6}", "-" * 45] + for t in tables: + if not isinstance(t, dict): + continue + lines.append( + f"{_truncate(t.get('table'), 24):<24}" + f"{_fmt_num(t.get('n_rows')):>9}" + f"{_fmt_num(t.get('n_cols')):>6}" + f"{_fmt_num(t.get('quality_score'), 1):>6}" + ) + return _paginate_text(pdf, "Tablas", lines, subtitle="resumen por tabla") + + +def _db_fk_page(pdf, db_profile: dict) -> int: + """FK candidates table + the join-graph mermaid text.""" + fks = db_profile.get("fk_candidates") or [] + lines = [] + if isinstance(fks, list) and fks: + lines.append(f"{'from':<26}{'to':<26}{'incl':>7}") + lines.append("-" * 59) + for fk in fks: + if not isinstance(fk, dict): + continue + frm = f"{fk.get('from_table')}.{fk.get('from_col')}" + to = f"{fk.get('to_table')}.{fk.get('to_col')}" + inc = fk.get("inclusion") + inc_s = (_fmt_num(inc, 3) if isinstance(inc, (int, float)) + and not isinstance(inc, bool) else str(inc)) + lines.append( + f"{_truncate(frm, 25):<26}{_truncate(to, 25):<26}{inc_s:>7}") + else: + lines.append("(sin relaciones FK candidatas detectadas)") + + mermaid = (db_profile.get("join_graph") or {}).get("mermaid") + if mermaid: + lines.append("") + lines.append("## join graph (mermaid)") + for raw in str(mermaid).splitlines(): + lines.append(_truncate(raw, 72)) + return _paginate_text(pdf, "Relaciones inter-tabla", lines, + subtitle="FK candidatas + join graph") + + # --------------------------------------------------------------------------- # # Public entry point # --------------------------------------------------------------------------- # @@ -580,16 +833,8 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict: return {"pdf_path": None, "n_pages": 0, "note": f"no se pudo crear el directorio destino: {e}"} - # Tufte-ish defaults scoped to this render only. - rc = { - "font.size": 10, - "font.family": "sans-serif", - "axes.titlesize": 11, - "axes.edgecolor": _MUTED, - "figure.facecolor": "white", - "savefig.facecolor": "white", - "pdf.fonttype": 42, # embed TrueType so text stays selectable on mobile. - } + # Tufte-ish defaults shared with the relational renderer (module-level _RC). + rc = _RC # Each section is isolated: a failure in one never aborts the whole PDF. builders = [ @@ -599,7 +844,10 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict: ("categorical", lambda p: _categorical_pages(p, columns)), ("quality", lambda p: _quality_page(p, columns)), ("correlations", lambda p: _correlations_page(p, profile.get("correlations"))), + ("models", lambda p: _models_pages(p, profile.get("models"))), + ("series", lambda p: _series_pages(p, profile.get("series"))), ("llm", lambda p: _llm_pages(p, profile.get("llm"))), + ("caveats", lambda p: _caveats_pages(p, profile.get("caveats"))), ("generic", lambda p: _generic_pages(p, profile)), ] @@ -624,3 +872,71 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict: if notes: note += " · " + "; ".join(notes) return {"pdf_path": out_path, "n_pages": n_pages, "note": note} + + +def render_eda_pdf_relational(db_profile: dict, out_path: str, + title: str = None) -> dict: + """Render a DatabaseProfile dict into a portable, mobile-readable PDF. + + DB-level sibling of :func:`render_eda_pdf`: instead of a single table it + summarises a whole database (the dict ``profile_database`` returns under + ``db_profile``). Pages are A5 portrait, single column, large type — built to + be read on a phone. Three pages: a cover (table + FK counts), a per-table + summary (rows / cols / quality) and the inter-table relations (FK candidates + plus the join-graph mermaid text). Every key is read defensively and any + section that fails is noted, never aborting the whole render. + + Args: + db_profile: DatabaseProfile dict from ``profile_database`` (the value + under ``db_profile``). May have keys absent or None; a None/empty + profile still yields a 1-page PDF. + out_path: filesystem path where the PDF is written. Parent directories + are created if missing. + title: optional cover title. Defaults to ``"EDA base — "``. + + Returns: + dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}. + On a fatal write error, ``pdf_path`` is None and ``note`` explains why. + """ + if db_profile is None: + db_profile = {} + if not isinstance(db_profile, dict): + return {"pdf_path": None, "n_pages": 0, + "note": f"db_profile no es dict: {type(db_profile).__name__}"} + + try: + parent = os.path.dirname(os.path.abspath(out_path)) + os.makedirs(parent, exist_ok=True) + except OSError as e: + return {"pdf_path": None, "n_pages": 0, + "note": f"no se pudo crear el directorio destino: {e}"} + + notes = [] + n_pages = 0 + + builders = [ + ("cover", lambda p: _db_cover_page(p, db_profile, title)), + ("tables", lambda p: _db_tables_page(p, db_profile)), + ("relations", lambda p: _db_fk_page(p, db_profile)), + ] + + try: + with plt.rc_context(_RC): + with PdfPages(out_path) as pdf: + for name, build in builders: + try: + n_pages += build(pdf) or 0 + except Exception as e: # noqa: BLE001 — one bad section never aborts. + notes.append(f"sección '{name}' omitida: {e}") + if n_pages == 0: + n_pages += _text_page( + pdf, title or "EDA base", ["(base vacía — sin secciones)"] + ) + except Exception as e: # noqa: BLE001 + return {"pdf_path": None, "n_pages": 0, + "note": f"fallo al escribir el PDF: {e}"} + + note = f"{n_pages} páginas" + if notes: + note += " · " + "; ".join(notes) + return {"pdf_path": out_path, "n_pages": n_pages, "note": note} diff --git a/python/functions/datascience/render_eda_pdf_test.py b/python/functions/datascience/render_eda_pdf_test.py index 0b04734b..50f145c8 100644 --- a/python/functions/datascience/render_eda_pdf_test.py +++ b/python/functions/datascience/render_eda_pdf_test.py @@ -9,7 +9,23 @@ import sys sys.path.insert(0, os.path.dirname(__file__)) -from render_eda_pdf import render_eda_pdf +from render_eda_pdf import ( + render_eda_pdf, + render_eda_pdf_relational, + _models_pages, + _series_pages, + _caveats_pages, +) + + +class _StubPdf: + """Captura pdf.savefig sin escribir nada — para testear builders aislados.""" + + def __init__(self): + self.figs = 0 + + def savefig(self, fig): + self.figs += 1 def _synthetic_profile() -> dict: @@ -170,3 +186,144 @@ def test_forward_compat_seccion_desconocida(tmp_path): assert res["n_pages"] >= 1 # No se perdió ninguna sección por error. assert "omitida" not in res["note"] + + +# --------------------------------------------------------------------------- # +# H4: builders dedicados para models / series / caveats (antes caían al volcado +# genérico como str(dict) truncado). Se testean aislados con un stub de pdf. +# --------------------------------------------------------------------------- # +def _sample_models() -> dict: + return { + "n_numeric_cols": 3, + "pca": { + "n_components": 2, "n_rows_used": 1000, "n_features": 3, + "explained_variance_ratio": [0.62, 0.21], + "cumulative": [0.62, 0.83], + "top_loadings": [ + {"component": 0, "feature": "precio", "loading": 0.71}, + {"component": 1, "feature": "unidades", "loading": -0.55}, + ], + }, + "kmeans": { + "best_k": 3, "silhouette": 0.48, "cluster_sizes": [500, 300, 200], + "scores_by_k": [{"k": 3, "silhouette": 0.48, "inertia": 900.0}], + }, + "outliers": {"n_outliers": 35, "outlier_pct": 3.5, "threshold": -0.51}, + "normality": {"precio": {"jarque_bera": {"p": 0.0001}, "is_normal": False}}, + "note": "", + } + + +def _sample_series() -> dict: + return { + "precio": { + "stationarity": {"verdict": "non_stationary"}, + "acf_pacf": {"is_autocorrelated": True}, + "stl": {"trend_strength": 0.95, "seasonal_strength": 0.10, "period": 7}, + "levels_suggested": True, "levels_kind": "returns", + }, + } + + +def _sample_caveats() -> dict: + return { + "n": 1, + "caveats": [ + {"id": "exploratory_nature", "topic": "naturaleza exploratoria", + "message": "El EDA genera hipótesis, no conclusiones."}, + ], + } + + +def test_models_builder_produces_pages(): + pdf = _StubPdf() + assert _models_pages(pdf, _sample_models()) >= 1 + assert pdf.figs >= 1 + + +def test_series_builder_produces_pages(): + pdf = _StubPdf() + assert _series_pages(pdf, _sample_series()) >= 1 + assert pdf.figs >= 1 + + +def test_caveats_builder_produces_pages(): + pdf = _StubPdf() + assert _caveats_pages(pdf, _sample_caveats()) >= 1 + assert pdf.figs >= 1 + + +def test_builders_tolerate_none_and_empty(): + pdf = _StubPdf() + # None / vacío -> 0 páginas, sin excepción. + assert _models_pages(pdf, None) == 0 + assert _series_pages(pdf, {}) == 0 + assert _caveats_pages(pdf, None) == 0 + assert pdf.figs == 0 + + +def test_models_series_caveats_no_caen_al_generico(tmp_path): + # Con builder dedicado, models/series/caveats NO se vuelcan en "Otras + # secciones" (genérico). El profile completo se renderiza sin error. + prof = _synthetic_profile() + prof["models"] = _sample_models() + prof["series"] = _sample_series() + prof["caveats"] = _sample_caveats() + out = str(tmp_path / "full.pdf") + res = render_eda_pdf(prof, out) + assert os.path.exists(out) + assert os.path.getsize(out) > 0 + assert "omitida" not in res["note"] + # Cover+overview+num+cat+calidad+corr + models + series + caveats. + assert res["n_pages"] >= 8 + + +# --------------------------------------------------------------------------- # +# H9: render_eda_pdf_relational — PDF DB-level (resumen de tablas + join graph). +# --------------------------------------------------------------------------- # +def _synthetic_db_profile() -> dict: + return { + "db_path": "data/shop.duckdb", + "profiled_at": "2026-06-29 01:00 UTC", + "n_tables": 2, + "tables": [ + {"table": "customers", "n_rows": 4, "n_cols": 3, "quality_score": 98.0, + "key_candidates": ["id"]}, + {"table": "orders", "n_rows": 6, "n_cols": 3, "quality_score": 95.0, + "key_candidates": ["order_id"]}, + ], + "fk_candidates": [ + {"from_table": "orders", "from_col": "customer_id", + "to_table": "customers", "to_col": "id", + "inclusion": 1.0, "cardinality": "N:1"}, + ], + "join_graph": {"mermaid": "graph LR\n orders --> customers"}, + } + + +def test_relational_golden_genera_pdf(tmp_path): + out = str(tmp_path / "eda_db.pdf") + res = render_eda_pdf_relational(_synthetic_db_profile(), out, title="EDA base") + assert isinstance(res, dict) + assert set(res.keys()) == {"pdf_path", "n_pages", "note"} + assert res["pdf_path"] == out + assert os.path.exists(out) + assert os.path.getsize(out) > 0 + # cover + tablas + relaciones >= 3. + assert res["n_pages"] >= 3 + with open(out, "rb") as fh: + assert fh.read(4) == b"%PDF" + + +def test_relational_edge_vacio_no_revienta(tmp_path): + out = str(tmp_path / "db_vacio.pdf") + res = render_eda_pdf_relational({}, out) + assert os.path.exists(out) + assert res["n_pages"] >= 1 + + +def test_relational_edge_none_no_revienta(tmp_path): + out = str(tmp_path / "db_none.pdf") + res = render_eda_pdf_relational(None, out) + assert os.path.exists(out) + assert res["n_pages"] >= 1 diff --git a/python/functions/pipelines/profile_database.py b/python/functions/pipelines/profile_database.py index 84c10fa5..7f63c893 100644 --- a/python/functions/pipelines/profile_database.py +++ b/python/functions/pipelines/profile_database.py @@ -12,6 +12,7 @@ Funciones del registry compuestas (NO se reimplementa su logica): - build_join_graph : grafo de relaciones inter-tabla + diagrama Mermaid. - duckdb_list_tables : introspeccion "que tablas hay" (read-only). - render_eda_markdown : report legible de un TableProfile. + - render_eda_pdf_relational : PDF movil DB-level (resumen de tablas + join graph). Aporta una capa propia de AGREGACION A NIVEL DE BASE: ensambla un DatabaseProfile con el resumen de cada tabla, los TableProfiles completos, las FK candidatas y el @@ -31,6 +32,7 @@ from datascience import ( build_join_graph, infer_fk_containment_duckdb, render_eda_markdown, + render_eda_pdf_relational, ) from infra import duckdb_list_tables from pipelines.profile_table import profile_table @@ -118,6 +120,7 @@ def profile_database( report_dir: str = "reports", write_report: bool = True, min_inclusion: float = 0.9, + emit_pdf: bool = False, ) -> dict: """Perfila una base DuckDB entera + sus relaciones inter-tabla. @@ -134,11 +137,16 @@ def profile_database( paths del retorno son None. min_inclusion: umbral minimo de inclusion (0-1) para emitir una FK candidata (se pasa a infer_fk_containment_duckdb). Default 0.9. + emit_pdf: si True (default False) renderiza un PDF movil DB-level con + render_eda_pdf_relational (resumen de tablas + relaciones FK + join + graph) junto a los reports y devuelve su ruta en report_pdf_path. Con + False no se toca el PDF (retrocompatible) y report_pdf_path es None. Returns: dict dict-no-throw. En exito: {status:'ok', db_profile:, - report_md_path:str|None, report_json_path:str|None}. + report_md_path:str|None, report_json_path:str|None, + report_pdf_path:str|None}. En error (sin lanzar): {status:'error', error:str}. DatabaseProfile = { @@ -204,12 +212,13 @@ def profile_database( "errors": errors, } - # 6) Reports opcionales. + # 6) Reports opcionales (markdown + JSON sidecar + PDF movil DB-level). report_md_path = None report_json_path = None + report_pdf_path = None + ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") if write_report: os.makedirs(report_dir, exist_ok=True) - ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") report_json_path = os.path.join(report_dir, f"eda_db_{ts}.json") report_md_path = os.path.join(report_dir, f"eda_db_{ts}.md") with open(report_json_path, "w", encoding="utf-8") as fh: @@ -219,11 +228,23 @@ def profile_database( with open(report_md_path, "w", encoding="utf-8") as fh: fh.write(_render_db_markdown(db_profile)) + # PDF DB-level (legible en movil): resumen de tablas + join graph. Se + # genera bajo demanda (emit_pdf) reusando el renderer relational del grupo. + if emit_pdf: + try: + os.makedirs(report_dir, exist_ok=True) + pdf_target = os.path.join(report_dir, f"eda_db_{ts}.pdf") + pres = render_eda_pdf_relational(db_profile, pdf_target) + report_pdf_path = pres.get("pdf_path") + except Exception: # noqa: BLE001 + report_pdf_path = None + return { "status": "ok", "db_profile": db_profile, "report_md_path": report_md_path, "report_json_path": report_json_path, + "report_pdf_path": report_pdf_path, } except Exception as e: # noqa: BLE001 return {"status": "error", "error": str(e)} diff --git a/python/functions/pipelines/profile_database_test.py b/python/functions/pipelines/profile_database_test.py index 88fa67d4..11c03e67 100644 --- a/python/functions/pipelines/profile_database_test.py +++ b/python/functions/pipelines/profile_database_test.py @@ -165,3 +165,36 @@ def test_profile_database_writes_report(tmp_path): assert "# EDA base —" in md assert "## Relaciones inter-tabla" in md assert "```mermaid" in md + + +def test_profile_database_emit_pdf(tmp_path): + # H9: con emit_pdf=True, profile_database genera un PDF DB-level (>0 bytes, + # cabecera %PDF) además del markdown + JSON. + db_path = os.path.join(str(tmp_path), "shop3.duckdb") + _build_related_db(db_path) + report_dir = os.path.join(str(tmp_path), "reports") + + res = profile_database( + db_path, report_dir=report_dir, write_report=True, emit_pdf=True + ) + + assert res["status"] == "ok", res + pdf = res.get("report_pdf_path") + assert pdf is not None + assert os.path.exists(pdf) + assert os.path.getsize(pdf) > 0 + with open(pdf, "rb") as fh: + assert fh.read(4) == b"%PDF" + + +def test_profile_database_emit_pdf_false_retrocompat(tmp_path): + # Edge: emit_pdf=False (default) se comporta como antes — no genera PDF y + # report_pdf_path es None. + db_path = os.path.join(str(tmp_path), "shop4.duckdb") + _build_related_db(db_path) + report_dir = os.path.join(str(tmp_path), "reports") + + res = profile_database(db_path, report_dir=report_dir, write_report=True) + + assert res["status"] == "ok", res + assert res.get("report_pdf_path") is None