feat(eda): series temporales + rigor anti-data-mining + PDF móvil + /eda + benchmark issues
Bloque del grupo eda (sesión ausente EDA-benchmark):
- 8 funciones nuevas: adf_kpss_stationarity, acf_pacf, stl_decompose, to_returns, fdr_correction, suggest_reexpression, exploratory_caveats, render_eda_pdf
- integración: profile_table (run_series, emit_pdf), association_matrix (FDR Benjamini-Hochberg), render_eda_markdown (secciones series/reexpresión/caveats)
- slash commands /eda y /capitulos
- issues 0173-0177: mejoras del /eda derivadas del benchmark sobre 12 datasets reales (outlier_pct x100, periodo estacional, FK inference, render models, tipos id-like)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -264,24 +264,129 @@ def render_eda_markdown(profile: dict) -> str:
|
||||
parts.append("## Calidad")
|
||||
parts.append(_md_table(["column", "quality_score", "issues"], rows))
|
||||
|
||||
# 7. Correlations (tolerate None for now).
|
||||
# 7. Correlaciones / asociación. `association_matrix` ya corrige los p-valores
|
||||
# por comparaciones múltiples (FDR Benjamini-Hochberg / Bonferroni); aquí solo
|
||||
# se renderizan los campos que produjo (value, p_value_adjusted, significant),
|
||||
# sin recalcular nada. Se prefieren los pares `strong` (magnitud alta Y
|
||||
# significativos tras la corrección); si no hay, se muestran todos.
|
||||
correlations = profile.get("correlations")
|
||||
if correlations:
|
||||
pairs = correlations
|
||||
strong = []
|
||||
all_pairs = []
|
||||
multiple_testing = None
|
||||
if isinstance(correlations, dict):
|
||||
pairs = correlations.get("pairs") or correlations.get("strongest") or []
|
||||
strong = correlations.get("strong") or correlations.get("strongest") or []
|
||||
all_pairs = correlations.get("pairs") or []
|
||||
multiple_testing = correlations.get("multiple_testing")
|
||||
else:
|
||||
all_pairs = correlations
|
||||
shown = strong or all_pairs
|
||||
corr_rows = []
|
||||
for pair in pairs or []:
|
||||
if isinstance(pair, dict):
|
||||
corr_rows.append([
|
||||
pair.get("a") or pair.get("col_a"),
|
||||
pair.get("b") or pair.get("col_b"),
|
||||
_fmt_num(pair.get("value") if pair.get("value") is not None
|
||||
else pair.get("corr")),
|
||||
])
|
||||
for pair in shown or []:
|
||||
if not isinstance(pair, dict):
|
||||
continue
|
||||
padj = pair.get("p_value_adjusted")
|
||||
sig = pair.get("significant")
|
||||
corr_rows.append([
|
||||
pair.get("a") or pair.get("col_a"),
|
||||
pair.get("b") or pair.get("col_b"),
|
||||
pair.get("method", ""),
|
||||
_fmt_num(pair.get("value") if pair.get("value") is not None
|
||||
else pair.get("corr")),
|
||||
_fmt_num(padj) if padj is not None else "",
|
||||
"sí" if sig else ("no" if sig is not None else ""),
|
||||
])
|
||||
if corr_rows:
|
||||
parts.append("## Correlaciones")
|
||||
parts.append(_md_table(["a", "b", "corr"], corr_rows))
|
||||
if isinstance(multiple_testing, dict):
|
||||
parts.append(
|
||||
"Corrección de comparaciones múltiples: "
|
||||
f"{multiple_testing.get('method')} "
|
||||
f"(α={multiple_testing.get('alpha')}); "
|
||||
f"{multiple_testing.get('n_rejected')} de "
|
||||
f"{multiple_testing.get('n_tests')} pares significativos tras la "
|
||||
"corrección. Mostrando "
|
||||
f"{'solo pares fuertes' if strong else 'todos los pares evaluados'}."
|
||||
)
|
||||
parts.append(_md_table(
|
||||
["a", "b", "method", "value", "p_adj (FDR)", "sig"], corr_rows))
|
||||
|
||||
# 7b. Re-expresión sugerida (escalera de potencias de Tukey) por columna
|
||||
# numérica. `suggest_reexpression` decide la transformación que más simetriza;
|
||||
# aquí solo se rinde su recomendación y razón.
|
||||
reexp_rows = []
|
||||
for col in columns:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
rx = col.get("reexpression")
|
||||
if not isinstance(rx, dict) or rx.get("recommended") is None:
|
||||
continue
|
||||
ladder = rx.get("ladder_power")
|
||||
reexp_rows.append([
|
||||
col.get("name"),
|
||||
_fmt_num(rx.get("skew")),
|
||||
rx.get("recommended"),
|
||||
_fmt_num(ladder) if ladder is not None else "",
|
||||
rx.get("reason", ""),
|
||||
])
|
||||
if reexp_rows:
|
||||
parts.append("## Re-expresión sugerida")
|
||||
parts.append(_md_table(
|
||||
["column", "skew", "transform", "ladder_power", "reason"], reexp_rows))
|
||||
|
||||
# 7c. Series temporales. Bloque por columna numérica cuando el pipeline corrió
|
||||
# con run_series: estacionariedad (ADF+KPSS), autocorrelación (ACF/PACF +
|
||||
# Ljung-Box), descomposición STL y, si es una serie de niveles, sugerencia de
|
||||
# retornos.
|
||||
series_blocks = []
|
||||
for col in columns:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
s = col.get("series")
|
||||
if not isinstance(s, dict):
|
||||
continue
|
||||
name = col.get("name") or "(col)"
|
||||
block = [f"### {name}"]
|
||||
rows = []
|
||||
stat = s.get("stationarity") or {}
|
||||
if stat.get("verdict") is not None:
|
||||
rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])
|
||||
acf = s.get("acf_pacf") or {}
|
||||
if acf.get("is_autocorrelated") is not None:
|
||||
rows.append([
|
||||
"autocorrelada (Ljung-Box)",
|
||||
"sí" if acf.get("is_autocorrelated") else "no",
|
||||
])
|
||||
sig_lags = acf.get("significant_acf_lags")
|
||||
if sig_lags:
|
||||
rows.append([
|
||||
"lags ACF significativos",
|
||||
", ".join(str(lag) for lag in sig_lags[:12]),
|
||||
])
|
||||
stl = s.get("stl") or {}
|
||||
if stl.get("trend_strength") is not None:
|
||||
rows.append(["fuerza de tendencia (STL)", _fmt_num(stl.get("trend_strength"))])
|
||||
if stl.get("seasonal_strength") is not None:
|
||||
rows.append(["fuerza estacional (STL)", _fmt_num(stl.get("seasonal_strength"))])
|
||||
if stl.get("period") is not None:
|
||||
rows.append(["periodo estacional", stl.get("period")])
|
||||
elif stl.get("note"):
|
||||
rows.append(["STL", stl.get("note")])
|
||||
if s.get("levels_suggested"):
|
||||
rows.append(["sugerencia", "convertir a retornos (serie de niveles)"])
|
||||
tr = s.get("to_returns") or {}
|
||||
if tr.get("mean") is not None:
|
||||
rows.append(["retorno medio (log)", _fmt_num(tr.get("mean"))])
|
||||
if tr.get("std") is not None:
|
||||
rows.append(["volatilidad retornos (σ)", _fmt_num(tr.get("std"))])
|
||||
if rows:
|
||||
block.append(_md_table(["aspecto", "valor"], rows))
|
||||
if stat.get("warning"):
|
||||
block.append(f"> {stat.get('warning')}")
|
||||
series_blocks.append("\n\n".join(block))
|
||||
if series_blocks:
|
||||
parts.append("## Series temporales")
|
||||
parts.extend(series_blocks)
|
||||
|
||||
# 8. LLM analysis (tolerate None for now).
|
||||
llm = profile.get("llm")
|
||||
@@ -299,4 +404,24 @@ def render_eda_markdown(profile: dict) -> str:
|
||||
else:
|
||||
parts.append(str(llm))
|
||||
|
||||
# 9. Avisos exploratorios. `exploratory_caveats` recuerda que el EDA genera
|
||||
# hipótesis, no conclusiones; se renderiza la lista de advertencias que aplican
|
||||
# a lo que realmente se calculó.
|
||||
caveats = profile.get("caveats")
|
||||
cav_list = []
|
||||
if isinstance(caveats, dict):
|
||||
cav_list = caveats.get("caveats") or []
|
||||
elif isinstance(caveats, list):
|
||||
cav_list = caveats
|
||||
cav_lines = []
|
||||
for cav in cav_list:
|
||||
if not isinstance(cav, dict):
|
||||
continue
|
||||
topic = cav.get("topic") or cav.get("id") or ""
|
||||
msg = cav.get("message") or ""
|
||||
cav_lines.append(f"- **{topic}**: {msg}")
|
||||
if cav_lines:
|
||||
parts.append("## Avisos exploratorios")
|
||||
parts.append("\n".join(cav_lines))
|
||||
|
||||
return "\n\n".join(parts) + "\n"
|
||||
|
||||
Reference in New Issue
Block a user