feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)

- H4: render_eda_markdown añade sección Modelos (PCA/KMeans/normalidad/outliers);
  render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo)
- H9: profile_database gana flag emit_pdf -> PDF móvil DB-level (resumen tablas +
  join graph) via render_eda_pdf_relational; clave report_pdf_path
- aditivos y retrocompatibles (flags default False). 38 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-29 04:05:38 +02:00
parent caf8c25d99
commit c4cff5ed5b
7 changed files with 706 additions and 15 deletions
@@ -405,6 +405,110 @@ def render_eda_markdown(profile: dict) -> str:
parts.append("## Series temporales")
parts.extend(series_blocks)
# 7d. Modelos baratos (PCA, KMeans, outliers multivariantes, normalidad). El
# pipeline corre `run_eda_models` cuando se pide con run_models; el bloque está
# completo en el JSON pero antes no tenía formatter en markdown y se omitía. Se
# lee todo defensivo con .get y cada submodelo se renderiza solo si está presente.
models = profile.get("models")
if isinstance(models, dict):
model_parts: list[str] = []
pca = models.get("pca")
if isinstance(pca, dict):
evr = pca.get("explained_variance_ratio") or []
cum = pca.get("cumulative") or []
pca_rows = []
for i, var in enumerate(evr):
acc = cum[i] if i < len(cum) else None
pca_rows.append([f"PC{i + 1}", _fmt_pct(var), _fmt_pct(acc)])
sub = ["### PCA"]
n_feat = pca.get("n_features")
n_used = pca.get("n_rows_used")
if n_feat is not None or n_used is not None:
sub.append(
f"{pca.get('n_components')} componentes sobre "
f"{n_used if n_used is not None else '?'} filas, "
f"{n_feat if n_feat is not None else '?'} features."
)
if pca_rows:
sub.append(_md_table(
["componente", "var. explicada", "acumulada"], pca_rows))
loadings = pca.get("top_loadings") or []
load_rows = []
for ld in loadings[:12]:
if not isinstance(ld, dict):
continue
comp = ld.get("component")
comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
load_rows.append([comp_label, ld.get("feature"),
_fmt_num(ld.get("loading"), 3)])
if load_rows:
sub.append("Cargas principales:")
sub.append(_md_table(["componente", "feature", "carga"], load_rows))
model_parts.append("\n\n".join(sub))
km = models.get("kmeans")
if isinstance(km, dict):
sub = ["### KMeans"]
best_k = km.get("best_k")
sil = km.get("silhouette")
sizes = km.get("cluster_sizes") or []
head = f"mejor k = {_fmt_num(best_k)}"
if sil is not None:
head += f" (silhouette {_fmt_num(sil, 3)})"
if sizes:
head += ". Tamaños de cluster: " + ", ".join(
_fmt_num(s) for s in sizes)
sub.append(head + ".")
score_rows = []
for sc in km.get("scores_by_k") or []:
if not isinstance(sc, dict):
continue
score_rows.append([sc.get("k"), _fmt_num(sc.get("silhouette"), 3),
_fmt_num(sc.get("inertia"), 2)])
if score_rows:
sub.append(_md_table(["k", "silhouette", "inertia"], score_rows))
model_parts.append("\n\n".join(sub))
out = models.get("outliers")
if isinstance(out, dict):
# outlier_pct del modelo multivariante ya viene en escala 0-100.
n_out = out.get("n_outliers")
pct = out.get("outlier_pct")
thr = out.get("threshold")
line = f"{_fmt_num(n_out)} filas marcadas como outlier"
if pct is not None:
line += f" ({_fmt_num(pct, 2)}%)"
if thr is not None:
line += f"; umbral de score {_fmt_num(thr, 3)}"
model_parts.append("### Outliers multivariante (Isolation Forest)\n\n"
+ line + ".")
normality = models.get("normality")
if isinstance(normality, dict):
norm_rows = []
for col_name, res in normality.items():
if not isinstance(res, dict):
continue
jb = res.get("jarque_bera") or {}
norm_rows.append([
col_name,
"" if res.get("is_normal") else "no",
_fmt_num(jb.get("p")) if jb.get("p") is not None else "",
])
if norm_rows:
model_parts.append(
"### Normalidad\n\n"
+ _md_table(["columna", "normal", "Jarque-Bera p"], norm_rows))
note = models.get("note")
if note:
model_parts.append(f"> {note}")
if model_parts:
parts.append("## Modelos")
parts.extend(model_parts)
# 8. LLM analysis (tolerate None for now).
llm = profile.get("llm")
if llm: