c4cff5ed5b
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
549 lines
22 KiB
Python
549 lines
22 KiB
Python
"""Render a TableProfile dict (eda capability group) into a readable markdown report.
|
||
|
||
Pure render function: dict in, markdown string out. No I/O, stdlib only.
|
||
Reads every key defensively with .get(...) because most profile phases may be
|
||
absent (None / missing) depending on how complete the profiling was.
|
||
"""
|
||
|
||
# Unicode block-element characters (U+2581..U+2588) used to draw histogram
# sparklines, ordered from lowest to highest bar.
_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"
|
||
|
||
|
||
def _fmt_num(value, decimals: int = 4) -> str:
|
||
"""Format a number compactly, falling back to str for non-numerics."""
|
||
if value is None:
|
||
return ""
|
||
if isinstance(value, bool):
|
||
return str(value)
|
||
if isinstance(value, int):
|
||
return str(value)
|
||
if isinstance(value, float):
|
||
if value != value: # NaN
|
||
return "NaN"
|
||
if value in (float("inf"), float("-inf")):
|
||
return str(value)
|
||
# Trim trailing zeros for readability.
|
||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||
return text if text else "0"
|
||
return str(value)
|
||
|
||
|
||
def _fmt_pct(value, decimals: int = 2) -> str:
|
||
"""Format a fraction (0-1) as a percentage 'NN.NN%'. Returns '' for None.
|
||
|
||
Every ``*_pct`` field in a TableProfile/ColumnProfile is a fraction in the
|
||
[0, 1] range (e.g. ``unique_pct=0.857`` means 85.7%). This helper multiplies
|
||
by 100 so the rendered markdown shows the human-facing percentage.
|
||
"""
|
||
if value is None:
|
||
return ""
|
||
try:
|
||
num = float(value)
|
||
except (TypeError, ValueError):
|
||
return str(value)
|
||
return f"{num * 100:.{decimals}f}%"
|
||
|
||
|
||
def _sparkline(histogram) -> str:
    """Draw a histogram as a one-line sparkline of Unicode block characters.

    Every bin must be a dict; its ``'count'`` entry (missing/falsy counts are
    treated as 0) is mapped linearly from the smallest to the largest count
    onto the ``_SPARK_BLOCKS`` ramp.

    Args:
        histogram: Sequence of bin dicts, or ``None``.

    Returns:
        One character per bin, or ``''`` when the histogram is empty/``None``
        or contains an entry that is not a dict.
    """
    if not histogram:
        return ""

    heights = []
    for entry in histogram:
        if not isinstance(entry, dict):
            # A single malformed bin invalidates the whole sparkline.
            return ""
        heights.append(entry.get("count") or 0)
    if not heights:
        return ""

    floor, ceiling = min(heights), max(heights)
    spread = ceiling - floor
    top = len(_SPARK_BLOCKS) - 1

    # Flat histogram: nothing to scale, every bin gets the lowest block.
    if spread <= 0:
        return _SPARK_BLOCKS[0] * len(heights)

    return "".join(
        _SPARK_BLOCKS[max(0, min(top, int(round((h - floor) / spread * top))))]
        for h in heights
    )
|
||
|
||
|
||
def _md_table(headers, rows) -> str:
|
||
"""Render a markdown table from headers and a list of row lists."""
|
||
head = "| " + " | ".join(str(h) for h in headers) + " |"
|
||
sep = "| " + " | ".join("---" for _ in headers) + " |"
|
||
body = []
|
||
for row in rows:
|
||
cells = [str(c) if c is not None else "" for c in row]
|
||
body.append("| " + " | ".join(cells) + " |")
|
||
return "\n".join([head, sep] + body)
|
||
|
||
|
||
def _as_dict(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _join_items(items) -> str:
    """Join a collection as ``'a, b, c'``, stringifying every element."""
    if isinstance(items, (list, tuple, set, frozenset)):
        # str() each item: names/flags are not guaranteed to be strings
        # (e.g. integer column names), and str.join rejects non-strings.
        return ", ".join(str(item) for item in items)
    return str(items)


def _yes_no(flag) -> str:
    """Render a tri-state flag as 'sí' / 'no', or '' when it is None."""
    if flag is None:
        return ""
    return "sí" if flag else "no"


def _section_header(profile: dict) -> list:
    """Build the report title and the one-line identity summary."""
    out = [f"# EDA — {profile.get('table') or '(unnamed)'}"]
    bits = []
    source = profile.get("source")
    if source:
        bits.append(f"source: `{source}`")
    profiled_at = profile.get("profiled_at")
    if profiled_at:
        bits.append(f"profiled_at: {profiled_at}")
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    if n_rows is not None or n_cols is not None:
        shown_rows = n_rows if n_rows is not None else "?"
        shown_cols = n_cols if n_cols is not None else "?"
        bits.append(f"{shown_rows} rows × {shown_cols} cols")
    if bits:
        out.append(" · ".join(bits))
    return out


def _section_overview(profile: dict) -> list:
    """Build the table-level 'Overview' metric/value table."""
    rows = []
    for label, key in (
        ("Rows", "n_rows"),
        ("Columns", "n_cols"),
        ("Size (bytes)", "size_bytes"),
    ):
        if profile.get(key) is not None:
            rows.append([label, profile.get(key)])

    dup_rows = profile.get("duplicate_rows")
    if dup_rows is not None:
        dup = f"{dup_rows}"
        dup_pct = profile.get("duplicate_pct")
        if dup_pct is not None:
            dup += f" ({_fmt_pct(dup_pct)})"
        rows.append(["Duplicate rows", dup])

    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(["Null cells", _fmt_pct(null_cell_pct)])

    constant_cols = profile.get("constant_cols") or []
    if constant_cols:
        rows.append(["Constant columns", _join_items(constant_cols)])

    all_null_cols = profile.get("all_null_cols") or []
    if all_null_cols:
        rows.append(["All-null columns", _join_items(all_null_cols)])

    quality_score = profile.get("quality_score")
    if quality_score is not None:
        rows.append(["Quality score", _fmt_num(quality_score)])

    type_breakdown = _as_dict(profile.get("type_breakdown"))
    if type_breakdown:
        tb = ", ".join(
            f"{k}: {v}" for k, v in type_breakdown.items() if v is not None
        )
        if tb:
            rows.append(["Type breakdown", tb])

    key_candidates = profile.get("key_candidates") or []
    if key_candidates:
        rows.append(["Key candidates", _join_items(key_candidates)])

    if not rows:
        return []
    return ["## Overview", _md_table(["Metric", "Value"], rows)]


def _section_columns(columns: list) -> list:
    """Build the per-column summary table."""
    rows = [
        [
            col.get("name"),
            col.get("inferred_type"),
            col.get("semantic_type"),
            _fmt_pct(col.get("null_pct")),
            col.get("distinct_count"),
            _fmt_pct(col.get("unique_pct")),
            _fmt_num(col.get("quality_score")),
            _join_items(col.get("flags") or []),
        ]
        for col in columns
    ]
    if not rows:
        return []
    return [
        "## Columnas",
        _md_table(
            ["name", "inferred_type", "semantic_type", "null_pct",
             "distinct", "unique_pct", "quality_score", "flags"],
            rows,
        ),
    ]


def _section_numeric(columns: list) -> list:
    """Build one stats block (table + histogram sparkline) per numeric column."""
    stat_keys = (
        "min", "median", "mean", "std", "p25", "p75", "p95", "p99", "skew",
        "outlier_pct", "distribution_type",
    )
    blocks = []
    for col in columns:
        num = col.get("numeric")
        if not num or not isinstance(num, dict):
            continue
        stat_rows = []
        for key in stat_keys:
            val = num.get(key)
            if val is None:
                continue
            if key == "outlier_pct":
                # Per the upstream note, outlier_pct already arrives on a
                # 0-100 scale (100 * n_outliers / n). Do NOT use _fmt_pct:
                # it would multiply by 100 again (7% -> 700%).
                text = _fmt_num(val, 2) + "%"
            elif key == "distribution_type":
                text = str(val)
            else:
                text = _fmt_num(val)
            stat_rows.append([key, text])
        block = [f"### {col.get('name') or '(col)'}"]
        if stat_rows:
            block.append(_md_table(["stat", "value"], stat_rows))
        spark = _sparkline(num.get("histogram"))
        if spark:
            block.append(f"histogram: `{spark}`")
        blocks.append("\n\n".join(block))
    if not blocks:
        return []
    return ["## Numéricas"] + blocks


def _section_categorical(columns: list) -> list:
    """Build one top-values block per categorical column."""
    blocks = []
    for col in columns:
        cat = col.get("categorical")
        if not cat or not isinstance(cat, dict):
            continue
        block = [f"### {col.get('name') or '(col)'}"]
        top_rows = [
            [item.get("value"), item.get("count"), _fmt_pct(item.get("pct"))]
            for item in (cat.get("top") or [])
            if isinstance(item, dict)
        ]
        if top_rows:
            block.append(_md_table(["value", "count", "pct"], top_rows))
        entropy = cat.get("entropy")
        if entropy is not None:
            block.append(f"entropy: {_fmt_num(entropy)}")
        blocks.append("\n\n".join(block))
    if not blocks:
        return []
    return ["## Categóricas"] + blocks


def _section_quality(columns: list) -> list:
    """Build the quality ranking, worst ``quality_score`` first."""
    scored = [col for col in columns if col.get("quality_score") is not None]
    if not scored:
        return []
    scored.sort(key=lambda c: c.get("quality_score"))
    rows = [
        [
            col.get("name"),
            _fmt_num(col.get("quality_score")),
            _join_items(col.get("issues") or col.get("flags") or []),
        ]
        for col in scored
    ]
    return ["## Calidad", _md_table(["column", "quality_score", "issues"], rows)]


def _section_correlations(correlations) -> list:
    """Build the correlation/association table.

    Only fields already present in the input are rendered (value,
    p_value_adjusted, significant); nothing is recomputed here. Pairs listed
    under ``strong``/``strongest`` are preferred; when there are none, every
    pair under ``pairs`` is shown.
    """
    if not correlations:
        return []
    strong = []
    multiple_testing = None
    if isinstance(correlations, dict):
        strong = correlations.get("strong") or correlations.get("strongest") or []
        all_pairs = correlations.get("pairs") or []
        multiple_testing = correlations.get("multiple_testing")
    else:
        # A bare sequence is treated as the full list of pairs.
        all_pairs = correlations

    rows = []
    for pair in (strong or all_pairs) or []:
        if not isinstance(pair, dict):
            continue
        value = pair.get("value")
        if value is None:
            value = pair.get("corr")
        padj = pair.get("p_value_adjusted")
        rows.append([
            pair.get("a") or pair.get("col_a"),
            pair.get("b") or pair.get("col_b"),
            pair.get("method", ""),
            _fmt_num(value),
            _fmt_num(padj) if padj is not None else "",
            _yes_no(pair.get("significant")),
        ])
    if not rows:
        return []

    out = ["## Correlaciones"]
    if isinstance(multiple_testing, dict):
        scope = "solo pares fuertes" if strong else "todos los pares evaluados"
        out.append(
            "Corrección de comparaciones múltiples: "
            f"{multiple_testing.get('method')} "
            f"(α={multiple_testing.get('alpha')}); "
            f"{multiple_testing.get('n_rejected')} de "
            f"{multiple_testing.get('n_tests')} pares significativos tras la "
            "corrección. Mostrando "
            f"{scope}."
        )
    out.append(_md_table(
        ["a", "b", "method", "value", "p_adj (FDR)", "sig"], rows))
    return out


def _section_reexpression(columns: list) -> list:
    """Build the suggested re-expression (transform) table per column.

    Only the recommendation and reason already stored under each column's
    ``reexpression`` entry are rendered.
    """
    rows = []
    for col in columns:
        rx = col.get("reexpression")
        if not isinstance(rx, dict) or rx.get("recommended") is None:
            continue
        ladder = rx.get("ladder_power")
        rows.append([
            col.get("name"),
            _fmt_num(rx.get("skew")),
            rx.get("recommended"),
            _fmt_num(ladder) if ladder is not None else "",
            rx.get("reason", ""),
        ])
    if not rows:
        return []
    return [
        "## Re-expresión sugerida",
        _md_table(["column", "skew", "transform", "ladder_power", "reason"], rows),
    ]


def _section_series(columns: list) -> list:
    """Build one time-series block per column that carries a ``series`` dict.

    Renders stationarity verdict, autocorrelation flag and significant lags,
    STL strengths/period and, for level series, the suggested transformation.
    """
    blocks = []
    for col in columns:
        s = col.get("series")
        if not isinstance(s, dict):
            continue
        rows = []

        stat = _as_dict(s.get("stationarity"))
        if stat.get("verdict") is not None:
            rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])

        acf = _as_dict(s.get("acf_pacf"))
        if acf.get("is_autocorrelated") is not None:
            rows.append([
                "autocorrelada (Ljung-Box)",
                "sí" if acf.get("is_autocorrelated") else "no",
            ])
        sig_lags = acf.get("significant_acf_lags")
        if sig_lags:
            # Cap the list so a long lag set does not blow up the table cell.
            rows.append([
                "lags ACF significativos",
                ", ".join(str(lag) for lag in sig_lags[:12]),
            ])

        stl = _as_dict(s.get("stl"))
        if stl.get("trend_strength") is not None:
            rows.append(["fuerza de tendencia (STL)", _fmt_num(stl.get("trend_strength"))])
        if stl.get("seasonal_strength") is not None:
            rows.append(["fuerza estacional (STL)", _fmt_num(stl.get("seasonal_strength"))])
        if stl.get("period") is not None:
            rows.append(["periodo estacional", stl.get("period")])
        elif stl.get("note"):
            rows.append(["STL", stl.get("note")])

        if s.get("levels_suggested"):
            # The recommended transformation depends on semantics: returns
            # for financial series (price/volume), differences for physical
            # magnitudes (temperature, flow), where "returns" are meaningless.
            kind = s.get("levels_kind")
            if kind == "returns":
                label = "convertir a retornos (serie de niveles financiera)"
            elif kind == "differences":
                label = "trabajar sobre diferencias (serie de niveles no financiera)"
            else:
                label = "convertir a retornos o diferencias (serie de niveles)"
            rows.append(["sugerencia", label])
            # Return metrics (mean/volatility) do not apply when the
            # recommendation is to work on differences.
            if kind != "differences":
                tr = _as_dict(s.get("to_returns"))
                if tr.get("mean") is not None:
                    rows.append(["retorno medio (log)", _fmt_num(tr.get("mean"))])
                if tr.get("std") is not None:
                    rows.append(["volatilidad retornos (σ)", _fmt_num(tr.get("std"))])

        block = [f"### {col.get('name') or '(col)'}"]
        if rows:
            block.append(_md_table(["aspecto", "valor"], rows))
        if stat.get("warning"):
            block.append(f"> {stat.get('warning')}")
        blocks.append("\n\n".join(block))
    if not blocks:
        return []
    return ["## Series temporales"] + blocks


def _model_pca(pca: dict) -> str:
    """Render the PCA sub-block: variance table and top loadings."""
    evr = pca.get("explained_variance_ratio") or []
    cum = pca.get("cumulative") or []
    var_rows = [
        [f"PC{i + 1}", _fmt_pct(var), _fmt_pct(cum[i] if i < len(cum) else None)]
        for i, var in enumerate(evr)
    ]
    sub = ["### PCA"]
    n_feat = pca.get("n_features")
    n_used = pca.get("n_rows_used")
    if n_feat is not None or n_used is not None:
        n_comp, rows_used, feats = (
            "?" if v is None else v
            for v in (pca.get("n_components"), n_used, n_feat)
        )
        sub.append(
            f"{n_comp} componentes sobre {rows_used} filas, {feats} features."
        )
    if var_rows:
        sub.append(_md_table(
            ["componente", "var. explicada", "acumulada"], var_rows))

    load_rows = []
    for ld in (pca.get("top_loadings") or [])[:12]:
        if not isinstance(ld, dict):
            continue
        comp = ld.get("component")
        # Integer components are shown 1-based as "PCn" (comp + 1).
        comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
        load_rows.append(
            [comp_label, ld.get("feature"), _fmt_num(ld.get("loading"), 3)])
    if load_rows:
        sub.append("Cargas principales:")
        sub.append(_md_table(["componente", "feature", "carga"], load_rows))
    return "\n\n".join(sub)


def _model_kmeans(km: dict) -> str:
    """Render the KMeans sub-block: best k summary and per-k scores."""
    sil = km.get("silhouette")
    sizes = km.get("cluster_sizes") or []
    head = f"mejor k = {_fmt_num(km.get('best_k'))}"
    if sil is not None:
        head += f" (silhouette {_fmt_num(sil, 3)})"
    if sizes:
        head += ". Tamaños de cluster: " + ", ".join(
            _fmt_num(size) for size in sizes)
    sub = ["### KMeans", head + "."]
    score_rows = [
        [sc.get("k"), _fmt_num(sc.get("silhouette"), 3),
         _fmt_num(sc.get("inertia"), 2)]
        for sc in (km.get("scores_by_k") or [])
        if isinstance(sc, dict)
    ]
    if score_rows:
        sub.append(_md_table(["k", "silhouette", "inertia"], score_rows))
    return "\n\n".join(sub)


def _model_outliers(out: dict) -> str:
    """Render the multivariate outlier sub-block."""
    pct = out.get("outlier_pct")
    thr = out.get("threshold")
    line = f"{_fmt_num(out.get('n_outliers'))} filas marcadas como outlier"
    if pct is not None:
        # Rendered as-is: the upstream note states this outlier_pct is
        # already on a 0-100 scale.
        line += f" ({_fmt_num(pct, 2)}%)"
    if thr is not None:
        line += f"; umbral de score {_fmt_num(thr, 3)}"
    return "### Outliers multivariante (Isolation Forest)\n\n" + line + "."


def _model_normality(normality: dict):
    """Render the normality table, or return None when it has no rows."""
    rows = []
    for col_name, res in normality.items():
        if not isinstance(res, dict):
            continue
        jb_p = _as_dict(res.get("jarque_bera")).get("p")
        rows.append([
            col_name,
            # Blank (not "no") when the verdict is missing.
            _yes_no(res.get("is_normal")),
            _fmt_num(jb_p) if jb_p is not None else "",
        ])
    if not rows:
        return None
    return "### Normalidad\n\n" + _md_table(
        ["columna", "normal", "Jarque-Bera p"], rows)


def _section_models(models) -> list:
    """Build the 'Modelos' section (PCA, KMeans, outliers, normality, note).

    Each sub-model is rendered only when its entry is present as a dict.
    """
    if not isinstance(models, dict):
        return []
    sub_parts = []
    for key, renderer in (
        ("pca", _model_pca),
        ("kmeans", _model_kmeans),
        ("outliers", _model_outliers),
        ("normality", _model_normality),
    ):
        payload = models.get(key)
        if isinstance(payload, dict):
            rendered = renderer(payload)
            if rendered is not None:
                sub_parts.append(rendered)
    note = models.get("note")
    if note:
        sub_parts.append(f"> {note}")
    if not sub_parts:
        return []
    return ["## Modelos"] + sub_parts


def _section_llm(llm) -> list:
    """Build the LLM analysis section; skipped when it has no content."""
    if not llm:
        return []
    body = []
    if isinstance(llm, dict):
        for key, value in llm.items():
            if value is None:
                continue
            body.append(f"### {key}")
            if isinstance(value, (list, tuple)):
                body.append("\n".join(f"- {v}" for v in value))
            else:
                body.append(str(value))
    else:
        body.append(str(llm))
    # Do not emit a bare heading when every entry was None.
    if not body:
        return []
    return ["## Análisis LLM"] + body


def _section_caveats(caveats) -> list:
    """Build the exploratory caveats bullet list.

    Accepts either a dict wrapping the list under ``'caveats'`` or the list
    itself; entries that are not dicts are ignored.
    """
    if isinstance(caveats, dict):
        cav_list = caveats.get("caveats") or []
    elif isinstance(caveats, list):
        cav_list = caveats
    else:
        cav_list = []
    lines = []
    for cav in cav_list:
        if not isinstance(cav, dict):
            continue
        topic = cav.get("topic") or cav.get("id") or ""
        msg = cav.get("message") or ""
        lines.append(f"- **{topic}**: {msg}")
    if not lines:
        return []
    return ["## Avisos exploratorios", "\n".join(lines)]


def render_eda_markdown(profile: dict) -> str:
    """Convert a TableProfile dict into a readable, self-contained markdown report.

    The report is assembled section by section (title, overview, column
    summary, numeric, categorical, quality ranking, correlations,
    re-expression, time series, models, LLM analysis, caveats); each section
    is produced by a private helper and omitted when it has no data.

    Args:
        profile: TableProfile dict from the eda capability group. May have many
            keys set to None or missing; everything is read defensively.
            ``None`` is treated as an empty profile.

    Returns:
        A markdown string ending in a single newline, with sections separated
        by blank lines.
    """
    if profile is None:
        profile = {}

    # Non-dict column entries are ignored by every section, so drop them once.
    columns = [
        col for col in (profile.get("columns") or []) if isinstance(col, dict)
    ]

    parts = []
    parts.extend(_section_header(profile))
    parts.extend(_section_overview(profile))
    parts.extend(_section_columns(columns))
    parts.extend(_section_numeric(columns))
    parts.extend(_section_categorical(columns))
    parts.extend(_section_quality(columns))
    parts.extend(_section_correlations(profile.get("correlations")))
    parts.extend(_section_reexpression(columns))
    parts.extend(_section_series(columns))
    parts.extend(_section_models(profile.get("models")))
    parts.extend(_section_llm(profile.get("llm")))
    parts.extend(_section_caveats(profile.get("caveats")))

    return "\n\n".join(parts) + "\n"
|