feat(eda): series temporales + rigor anti-data-mining + PDF móvil + /eda + benchmark issues

Bloque del grupo eda (sesión ausente EDA-benchmark): - 8 funciones nuevas: adf_kpss_stationarity, acf_pacf, stl_decompose, to_returns, fdr_correction, suggest_reexpression, exploratory_caveats, render_eda_pdf - integración: profile_table (run_series, emit_pdf), association_matrix (FDR Benjamini-Hochberg), render_eda_markdown (secciones series/reexpresión/caveats) - slash commands /eda y /capitulos - issues 0173-0177: mejoras del /eda derivadas del benchmark sobre 12 datasets reales (outlier_pct x100, periodo estacional, FK inference, render models, tipos id-like) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 03:34:01 +02:00
parent 02301aaed3
commit 7ac69ab4fb
33 changed files with 3995 additions and 51 deletions
@@ -264,24 +264,129 @@ def render_eda_markdown(profile: dict) -> str:
        parts.append("## Calidad")
        parts.append(_md_table(["column", "quality_score", "issues"], rows))

-    # 7. Correlations (tolerate None for now).
+    # 7. Correlaciones / asociación. `association_matrix` ya corrige los p-valores
+    # por comparaciones múltiples (FDR Benjamini-Hochberg / Bonferroni); aquí solo
+    # se renderizan los campos que produjo (value, p_value_adjusted, significant),
+    # sin recalcular nada. Se prefieren los pares `strong` (magnitud alta Y
+    # significativos tras la corrección); si no hay, se muestran todos.
    correlations = profile.get("correlations")
    if correlations:
-        pairs = correlations
+        strong = []
+        all_pairs = []
+        multiple_testing = None
        if isinstance(correlations, dict):
-            pairs = correlations.get("pairs") or correlations.get("strongest") or []
+            strong = correlations.get("strong") or correlations.get("strongest") or []
+            all_pairs = correlations.get("pairs") or []
+            multiple_testing = correlations.get("multiple_testing")
+        else:
+            all_pairs = correlations
+        shown = strong or all_pairs
        corr_rows = []
-        for pair in pairs or []:
-            if isinstance(pair, dict):
-                corr_rows.append([
-                    pair.get("a") or pair.get("col_a"),
-                    pair.get("b") or pair.get("col_b"),
-                    _fmt_num(pair.get("value") if pair.get("value") is not None
-                             else pair.get("corr")),
-                ])
+        for pair in shown or []:
+            if not isinstance(pair, dict):
+                continue
+            padj = pair.get("p_value_adjusted")
+            sig = pair.get("significant")
+            corr_rows.append([
+                pair.get("a") or pair.get("col_a"),
+                pair.get("b") or pair.get("col_b"),
+                pair.get("method", ""),
+                _fmt_num(pair.get("value") if pair.get("value") is not None
+                         else pair.get("corr")),
+                _fmt_num(padj) if padj is not None else "",
+                "sí" if sig else ("no" if sig is not None else ""),
+            ])
        if corr_rows:
            parts.append("## Correlaciones")
-            parts.append(_md_table(["a", "b", "corr"], corr_rows))
+            if isinstance(multiple_testing, dict):
+                parts.append(
+                    "Corrección de comparaciones múltiples: "
+                    f"{multiple_testing.get('method')} "
+                    f"(α={multiple_testing.get('alpha')}); "
+                    f"{multiple_testing.get('n_rejected')} de "
+                    f"{multiple_testing.get('n_tests')} pares significativos tras la "
+                    "corrección. Mostrando "
+                    f"{'solo pares fuertes' if strong else 'todos los pares evaluados'}."
+                )
+            parts.append(_md_table(
+                ["a", "b", "method", "value", "p_adj (FDR)", "sig"], corr_rows))
+
+    # 7b. Re-expresión sugerida (escalera de potencias de Tukey) por columna
+    # numérica. `suggest_reexpression` decide la transformación que más simetriza;
+    # aquí solo se renderiza su recomendación y su razón.
+    reexp_rows = []
+    for col in columns:
+        if not isinstance(col, dict):
+            continue
+        rx = col.get("reexpression")
+        if not isinstance(rx, dict) or rx.get("recommended") is None:
+            continue
+        ladder = rx.get("ladder_power")
+        reexp_rows.append([
+            col.get("name"),
+            _fmt_num(rx.get("skew")),
+            rx.get("recommended"),
+            _fmt_num(ladder) if ladder is not None else "",
+            rx.get("reason", ""),
+        ])
+    if reexp_rows:
+        parts.append("## Re-expresión sugerida")
+        parts.append(_md_table(
+            ["column", "skew", "transform", "ladder_power", "reason"], reexp_rows))
+
+    # 7c. Series temporales. Bloque por columna numérica cuando el pipeline corrió
+    # con run_series: estacionariedad (ADF+KPSS), autocorrelación (ACF/PACF +
+    # Ljung-Box), descomposición STL y, si es una serie de niveles, sugerencia de
+    # retornos.
+    series_blocks = []
+    for col in columns:
+        if not isinstance(col, dict):
+            continue
+        s = col.get("series")
+        if not isinstance(s, dict):
+            continue
+        name = col.get("name") or "(col)"
+        block = [f"### {name}"]
+        rows = []
+        stat = s.get("stationarity") or {}
+        if stat.get("verdict") is not None:
+            rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])
+        acf = s.get("acf_pacf") or {}
+        if acf.get("is_autocorrelated") is not None:
+            rows.append([
+                "autocorrelada (Ljung-Box)",
+                "sí" if acf.get("is_autocorrelated") else "no",
+            ])
+        sig_lags = acf.get("significant_acf_lags")
+        if sig_lags:
+            rows.append([
+                "lags ACF significativos",
+                ", ".join(str(lag) for lag in sig_lags[:12]),
+            ])
+        stl = s.get("stl") or {}
+        if stl.get("trend_strength") is not None:
+            rows.append(["fuerza de tendencia (STL)", _fmt_num(stl.get("trend_strength"))])
+        if stl.get("seasonal_strength") is not None:
+            rows.append(["fuerza estacional (STL)", _fmt_num(stl.get("seasonal_strength"))])
+            if stl.get("period") is not None:
+                rows.append(["periodo estacional", stl.get("period")])
+        elif stl.get("note"):
+            rows.append(["STL", stl.get("note")])
+        if s.get("levels_suggested"):
+            rows.append(["sugerencia", "convertir a retornos (serie de niveles)"])
+            tr = s.get("to_returns") or {}
+            if tr.get("mean") is not None:
+                rows.append(["retorno medio (log)", _fmt_num(tr.get("mean"))])
+            if tr.get("std") is not None:
+                rows.append(["volatilidad retornos (σ)", _fmt_num(tr.get("std"))])
+        if rows:
+            block.append(_md_table(["aspecto", "valor"], rows))
+        if stat.get("warning"):
+            block.append(f"> {stat.get('warning')}")
+        series_blocks.append("\n\n".join(block))
+    if series_blocks:
+        parts.append("## Series temporales")
+        parts.extend(series_blocks)

    # 8. LLM analysis (tolerate None for now).
    llm = profile.get("llm")
@@ -299,4 +404,24 @@ def render_eda_markdown(profile: dict) -> str:
        else:
            parts.append(str(llm))

+    # 9. Avisos exploratorios. `exploratory_caveats` recuerda que el EDA genera
+    # hipótesis, no conclusiones; se renderiza la lista de advertencias que aplican
+    # a lo que realmente se calculó.
+    caveats = profile.get("caveats")
+    cav_list = []
+    if isinstance(caveats, dict):
+        cav_list = caveats.get("caveats") or []
+    elif isinstance(caveats, list):
+        cav_list = caveats
+    cav_lines = []
+    for cav in cav_list:
+        if not isinstance(cav, dict):
+            continue
+        topic = cav.get("topic") or cav.get("id") or ""
+        msg = cav.get("message") or ""
+        cav_lines.append(f"- **{topic}**: {msg}")
+    if cav_lines:
+        parts.append("## Avisos exploratorios")
+        parts.append("\n".join(cav_lines))
+
    return "\n\n".join(parts) + "\n"