feat(eda): series temporales + rigor anti-data-mining + PDF móvil + /eda + benchmark issues

Bloque del grupo eda (sesión ausente EDA-benchmark):
- 8 funciones nuevas: adf_kpss_stationarity, acf_pacf, stl_decompose, to_returns,
  fdr_correction, suggest_reexpression, exploratory_caveats, render_eda_pdf
- integración: profile_table (run_series, emit_pdf), association_matrix (FDR Benjamini-Hochberg),
  render_eda_markdown (secciones series/reexpresion/caveats)
- slash commands /eda y /capitulos
- issues 0173-0177: mejoras del /eda derivadas del benchmark sobre 12 datasets reales
  (outlier_pct x100, periodo estacional, FK inference, render models, tipos id-like)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-29 03:34:01 +02:00
parent 02301aaed3
commit 7ac69ab4fb
33 changed files with 3995 additions and 51 deletions
@@ -264,24 +264,129 @@ def render_eda_markdown(profile: dict) -> str:
parts.append("## Calidad")
parts.append(_md_table(["column", "quality_score", "issues"], rows))
# 7. Correlations (tolerate None for now).
# 7. Correlaciones / asociación. `association_matrix` ya corrige los p-valores
# por comparaciones múltiples (FDR Benjamini-Hochberg / Bonferroni); aquí solo
# se renderizan los campos que produjo (value, p_value_adjusted, significant),
# sin recalcular nada. Se prefieren los pares `strong` (magnitud alta Y
# significativos tras la corrección); si no hay, se muestran todos.
correlations = profile.get("correlations")
if correlations:
pairs = correlations
strong = []
all_pairs = []
multiple_testing = None
if isinstance(correlations, dict):
pairs = correlations.get("pairs") or correlations.get("strongest") or []
strong = correlations.get("strong") or correlations.get("strongest") or []
all_pairs = correlations.get("pairs") or []
multiple_testing = correlations.get("multiple_testing")
else:
all_pairs = correlations
shown = strong or all_pairs
corr_rows = []
for pair in pairs or []:
if isinstance(pair, dict):
corr_rows.append([
pair.get("a") or pair.get("col_a"),
pair.get("b") or pair.get("col_b"),
_fmt_num(pair.get("value") if pair.get("value") is not None
else pair.get("corr")),
])
for pair in shown or []:
if not isinstance(pair, dict):
continue
padj = pair.get("p_value_adjusted")
sig = pair.get("significant")
corr_rows.append([
pair.get("a") or pair.get("col_a"),
pair.get("b") or pair.get("col_b"),
pair.get("method", ""),
_fmt_num(pair.get("value") if pair.get("value") is not None
else pair.get("corr")),
_fmt_num(padj) if padj is not None else "",
"" if sig else ("no" if sig is not None else ""),
])
if corr_rows:
parts.append("## Correlaciones")
parts.append(_md_table(["a", "b", "corr"], corr_rows))
if isinstance(multiple_testing, dict):
parts.append(
"Corrección de comparaciones múltiples: "
f"{multiple_testing.get('method')} "
f"(α={multiple_testing.get('alpha')}); "
f"{multiple_testing.get('n_rejected')} de "
f"{multiple_testing.get('n_tests')} pares significativos tras la "
"corrección. Mostrando "
f"{'solo pares fuertes' if strong else 'todos los pares evaluados'}."
)
parts.append(_md_table(
["a", "b", "method", "value", "p_adj (FDR)", "sig"], corr_rows))
# 7b. Re-expresión sugerida (escalera de potencias de Tukey) por columna
# numérica. `suggest_reexpression` decide la transformación que más simetriza;
# aquí solo se renderiza su recomendación y razón.
reexp_rows = []
for col in columns:
if not isinstance(col, dict):
continue
rx = col.get("reexpression")
if not isinstance(rx, dict) or rx.get("recommended") is None:
continue
ladder = rx.get("ladder_power")
reexp_rows.append([
col.get("name"),
_fmt_num(rx.get("skew")),
rx.get("recommended"),
_fmt_num(ladder) if ladder is not None else "",
rx.get("reason", ""),
])
if reexp_rows:
parts.append("## Re-expresión sugerida")
parts.append(_md_table(
["column", "skew", "transform", "ladder_power", "reason"], reexp_rows))
# 7c. Series temporales. Bloque por columna numérica cuando el pipeline corrió
# con run_series: estacionariedad (ADF+KPSS), autocorrelación (ACF/PACF +
# Ljung-Box), descomposición STL y, si es una serie de niveles, sugerencia de
# retornos.
series_blocks = []
for col in columns:
if not isinstance(col, dict):
continue
s = col.get("series")
if not isinstance(s, dict):
continue
name = col.get("name") or "(col)"
block = [f"### {name}"]
rows = []
stat = s.get("stationarity") or {}
if stat.get("verdict") is not None:
rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])
acf = s.get("acf_pacf") or {}
if acf.get("is_autocorrelated") is not None:
rows.append([
"autocorrelada (Ljung-Box)",
"" if acf.get("is_autocorrelated") else "no",
])
sig_lags = acf.get("significant_acf_lags")
if sig_lags:
rows.append([
"lags ACF significativos",
", ".join(str(lag) for lag in sig_lags[:12]),
])
stl = s.get("stl") or {}
if stl.get("trend_strength") is not None:
rows.append(["fuerza de tendencia (STL)", _fmt_num(stl.get("trend_strength"))])
if stl.get("seasonal_strength") is not None:
rows.append(["fuerza estacional (STL)", _fmt_num(stl.get("seasonal_strength"))])
if stl.get("period") is not None:
rows.append(["periodo estacional", stl.get("period")])
elif stl.get("note"):
rows.append(["STL", stl.get("note")])
if s.get("levels_suggested"):
rows.append(["sugerencia", "convertir a retornos (serie de niveles)"])
tr = s.get("to_returns") or {}
if tr.get("mean") is not None:
rows.append(["retorno medio (log)", _fmt_num(tr.get("mean"))])
if tr.get("std") is not None:
rows.append(["volatilidad retornos (σ)", _fmt_num(tr.get("std"))])
if rows:
block.append(_md_table(["aspecto", "valor"], rows))
if stat.get("warning"):
block.append(f"> {stat.get('warning')}")
series_blocks.append("\n\n".join(block))
if series_blocks:
parts.append("## Series temporales")
parts.extend(series_blocks)
# 8. LLM analysis (tolerate None for now).
llm = profile.get("llm")
@@ -299,4 +404,24 @@ def render_eda_markdown(profile: dict) -> str:
else:
parts.append(str(llm))
# 9. Avisos exploratorios. `exploratory_caveats` recuerda que el EDA genera
# hipótesis, no conclusiones; se renderiza la lista de advertencias que aplican
# a lo que realmente se calculó.
caveats = profile.get("caveats")
cav_list = []
if isinstance(caveats, dict):
cav_list = caveats.get("caveats") or []
elif isinstance(caveats, list):
cav_list = caveats
cav_lines = []
for cav in cav_list:
if not isinstance(cav, dict):
continue
topic = cav.get("topic") or cav.get("id") or ""
msg = cav.get("message") or ""
cav_lines.append(f"- **{topic}**: {msg}")
if cav_lines:
parts.append("## Avisos exploratorios")
parts.append("\n".join(cav_lines))
return "\n\n".join(parts) + "\n"