Files
fn_registry/python/functions/datascience/render_eda_markdown.py
T
Egutierrez c4cff5ed5b feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers);
  render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo)
- H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas +
  join graph) via render_eda_pdf_relational; clave report_pdf_path
- aditivos y retrocompatibles (flags default False). 38 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 04:05:38 +02:00

549 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Render a TableProfile dict (eda capability group) into a readable markdown report.
Pure render function: dict in, markdown string out. No I/O, stdlib only.
Reads every key defensively with .get(...) because most profile phases may be
absent (None / missing) depending on how complete the profiling was.
"""
# Unicode block-element characters (U+2581..U+2588) used to draw histogram
# sparklines, ordered from the lowest bar to the full block (low -> high).
_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"
def _fmt_num(value, decimals: int = 4) -> str:
"""Format a number compactly, falling back to str for non-numerics."""
if value is None:
return ""
if isinstance(value, bool):
return str(value)
if isinstance(value, int):
return str(value)
if isinstance(value, float):
if value != value: # NaN
return "NaN"
if value in (float("inf"), float("-inf")):
return str(value)
# Trim trailing zeros for readability.
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
return text if text else "0"
return str(value)
def _fmt_pct(value, decimals: int = 2) -> str:
"""Format a fraction (0-1) as a percentage 'NN.NN%'. Returns '' for None.
Every ``*_pct`` field in a TableProfile/ColumnProfile is a fraction in the
[0, 1] range (e.g. ``unique_pct=0.857`` means 85.7%). This helper multiplies
by 100 so the rendered markdown shows the human-facing percentage.
"""
if value is None:
return ""
try:
num = float(value)
except (TypeError, ValueError):
return str(value)
return f"{num * 100:.{decimals}f}%"
def _sparkline(histogram) -> str:
    """Draw a one-line sparkline of Unicode block characters for a histogram.

    Args:
        histogram: Iterable of bins, each a dict carrying a ``'count'`` key.
            A missing or falsy count is treated as 0.

    Returns:
        One glyph from ``_SPARK_BLOCKS`` per bin, scaled linearly between the
        smallest and largest count. Returns ``''`` when the histogram is
        empty/None or when any bin is not a dict. When every count is equal
        the lowest glyph is used for all bins.
    """
    if not histogram:
        return ""

    heights = []
    for entry in histogram:
        if not isinstance(entry, dict):
            # A malformed bin invalidates the whole sparkline.
            return ""
        heights.append(entry.get("count") or 0)
    if not heights:
        return ""

    floor = min(heights)
    spread = max(heights) - floor
    if spread <= 0:
        # Flat histogram: nothing to scale, draw the baseline glyph.
        return _SPARK_BLOCKS[0] * len(heights)

    top = len(_SPARK_BLOCKS) - 1
    return "".join(
        _SPARK_BLOCKS[max(0, min(top, int(round((h - floor) / spread * top))))]
        for h in heights
    )
def _md_table(headers, rows) -> str:
"""Render a markdown table from headers and a list of row lists."""
head = "| " + " | ".join(str(h) for h in headers) + " |"
sep = "| " + " | ".join("---" for _ in headers) + " |"
body = []
for row in rows:
cells = [str(c) if c is not None else "" for c in row]
body.append("| " + " | ".join(cells) + " |")
return "\n".join([head, sep] + body)
def _join_labels(items) -> str:
    """Join labels with ', ', coercing every element to ``str`` first."""
    return ", ".join(str(item) for item in items)


def _section_header(profile: dict) -> list:
    """Build the report title and the one-line identity summary."""
    table = profile.get("table") or "(unnamed)"
    out = [f"# EDA — {table}"]

    bits = []
    source = profile.get("source")
    if source:
        bits.append(f"source: `{source}`")
    profiled_at = profile.get("profiled_at")
    if profiled_at:
        bits.append(f"profiled_at: {profiled_at}")
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    if n_rows is not None or n_cols is not None:
        bits.append(
            f"{n_rows if n_rows is not None else '?'} rows × "
            f"{n_cols if n_cols is not None else '?'} cols"
        )
    if bits:
        out.append(" · ".join(bits))
    return out


def _section_overview(profile: dict) -> list:
    """Build the table-level 'Overview' metric table."""
    rows = []
    for label, key in (
        ("Rows", "n_rows"),
        ("Columns", "n_cols"),
        ("Size (bytes)", "size_bytes"),
    ):
        if profile.get(key) is not None:
            rows.append([label, profile.get(key)])

    duplicate_rows = profile.get("duplicate_rows")
    if duplicate_rows is not None:
        dup = f"{duplicate_rows}"
        if profile.get("duplicate_pct") is not None:
            dup += f" ({_fmt_pct(profile.get('duplicate_pct'))})"
        rows.append(["Duplicate rows", dup])

    if profile.get("null_cell_pct") is not None:
        rows.append(["Null cells", _fmt_pct(profile.get("null_cell_pct"))])

    constant_cols = profile.get("constant_cols") or []
    if constant_cols:
        rows.append(["Constant columns", _join_labels(constant_cols)])
    all_null_cols = profile.get("all_null_cols") or []
    if all_null_cols:
        rows.append(["All-null columns", _join_labels(all_null_cols)])

    if profile.get("quality_score") is not None:
        rows.append(["Quality score", _fmt_num(profile.get("quality_score"))])

    type_breakdown = profile.get("type_breakdown") or {}
    if type_breakdown:
        tb = ", ".join(
            f"{k}: {v}" for k, v in type_breakdown.items() if v is not None
        )
        if tb:
            rows.append(["Type breakdown", tb])

    key_candidates = profile.get("key_candidates") or []
    if key_candidates:
        rows.append(["Key candidates", _join_labels(key_candidates)])

    if not rows:
        return []
    return ["## Overview", _md_table(["Metric", "Value"], rows)]


def _section_columns(columns) -> list:
    """Build the per-column summary table."""
    rows = [
        [
            col.get("name"),
            col.get("inferred_type"),
            col.get("semantic_type"),
            _fmt_pct(col.get("null_pct")),
            col.get("distinct_count"),
            _fmt_pct(col.get("unique_pct")),
            _fmt_num(col.get("quality_score")),
            _join_labels(col.get("flags") or []),
        ]
        for col in columns
    ]
    if not rows:
        return []
    return [
        "## Columnas",
        _md_table(
            ["name", "inferred_type", "semantic_type", "null_pct",
             "distinct", "unique_pct", "quality_score", "flags"],
            rows,
        ),
    ]


def _section_numeric(columns) -> list:
    """Build one stats block (plus sparkline) per column with numeric data."""
    stat_keys = (
        "min", "median", "mean", "std", "p25", "p75",
        "p95", "p99", "skew", "outlier_pct", "distribution_type",
    )
    blocks = []
    for col in columns:
        num = col.get("numeric")
        # Skip absent payloads and anything that is not a dict, so a
        # malformed profile cannot crash the render.
        if not num or not isinstance(num, dict):
            continue
        name = col.get("name") or "(col)"

        stat_rows = []
        for key in stat_keys:
            val = num.get(key)
            if val is None:
                continue
            if key == "outlier_pct":
                # Per the original note, outlier_pct already arrives on a
                # 0-100 scale, so it must NOT go through _fmt_pct (which
                # would multiply by 100 again, e.g. 7% -> 700%).
                stat_rows.append([key, _fmt_num(val, 2) + "%"])
            elif key == "distribution_type":
                stat_rows.append([key, str(val)])
            else:
                stat_rows.append([key, _fmt_num(val)])

        block = [f"### {name}"]
        if stat_rows:
            block.append(_md_table(["stat", "value"], stat_rows))
        spark = _sparkline(num.get("histogram"))
        if spark:
            block.append(f"histogram: `{spark}`")
        blocks.append("\n\n".join(block))

    if not blocks:
        return []
    return ["## Numéricas"] + blocks


def _section_categorical(columns) -> list:
    """Build one top-values block per column with categorical data."""
    blocks = []
    for col in columns:
        cat = col.get("categorical")
        if not cat or not isinstance(cat, dict):
            continue
        name = col.get("name") or "(col)"
        block = [f"### {name}"]

        top_rows = [
            [item.get("value"), item.get("count"), _fmt_pct(item.get("pct"))]
            for item in (cat.get("top") or [])
            if isinstance(item, dict)
        ]
        if top_rows:
            block.append(_md_table(["value", "count", "pct"], top_rows))
        if cat.get("entropy") is not None:
            block.append(f"entropy: {_fmt_num(cat.get('entropy'))}")
        blocks.append("\n\n".join(block))

    if not blocks:
        return []
    return ["## Categóricas"] + blocks


def _section_quality(columns) -> list:
    """Build the quality ranking, worst ``quality_score`` first."""
    scored = sorted(
        (col for col in columns if col.get("quality_score") is not None),
        key=lambda c: c.get("quality_score"),
    )
    if not scored:
        return []
    rows = []
    for col in scored:
        issues = col.get("issues") or col.get("flags") or []
        rows.append([
            col.get("name"),
            _fmt_num(col.get("quality_score")),
            _join_labels(issues) if isinstance(issues, list) else str(issues),
        ])
    return ["## Calidad", _md_table(["column", "quality_score", "issues"], rows)]


def _section_correlations(correlations) -> list:
    """Build the correlation / association table.

    Only fields already present on each pair (value, p_value_adjusted,
    significant) are rendered; nothing is recomputed. ``strong`` pairs are
    preferred; when there are none, all pairs are shown.
    """
    if not correlations:
        return []

    strong = []
    multiple_testing = None
    if isinstance(correlations, dict):
        strong = correlations.get("strong") or correlations.get("strongest") or []
        all_pairs = correlations.get("pairs") or []
        multiple_testing = correlations.get("multiple_testing")
    else:
        all_pairs = correlations
    shown = strong or all_pairs

    rows = []
    for pair in shown or []:
        if not isinstance(pair, dict):
            continue
        padj = pair.get("p_value_adjusted")
        sig = pair.get("significant")
        if sig is None:
            sig_text = ""  # unknown significance stays blank
        else:
            sig_text = "sí" if sig else "no"
        value = pair.get("value")
        if value is None:
            value = pair.get("corr")
        rows.append([
            pair.get("a") or pair.get("col_a"),
            pair.get("b") or pair.get("col_b"),
            pair.get("method", ""),
            _fmt_num(value),
            _fmt_num(padj) if padj is not None else "",
            sig_text,
        ])
    if not rows:
        return []

    out = ["## Correlaciones"]
    if isinstance(multiple_testing, dict):
        out.append(
            "Corrección de comparaciones múltiples: "
            f"{multiple_testing.get('method')} "
            f"(α={multiple_testing.get('alpha')}); "
            f"{multiple_testing.get('n_rejected')} de "
            f"{multiple_testing.get('n_tests')} pares significativos tras la "
            "corrección. Mostrando "
            f"{'solo pares fuertes' if strong else 'todos los pares evaluados'}."
        )
    out.append(_md_table(
        ["a", "b", "method", "value", "p_adj (FDR)", "sig"], rows))
    return out


def _section_reexpression(columns) -> list:
    """Build the suggested re-expression (power ladder) table per column."""
    rows = []
    for col in columns:
        rx = col.get("reexpression")
        if not isinstance(rx, dict) or rx.get("recommended") is None:
            continue
        ladder = rx.get("ladder_power")
        rows.append([
            col.get("name"),
            _fmt_num(rx.get("skew")),
            rx.get("recommended"),
            _fmt_num(ladder) if ladder is not None else "",
            rx.get("reason", ""),
        ])
    if not rows:
        return []
    return [
        "## Re-expresión sugerida",
        _md_table(
            ["column", "skew", "transform", "ladder_power", "reason"], rows),
    ]


def _section_series(columns) -> list:
    """Build one time-series block per column that carries a ``series`` dict.

    Renders stationarity verdict, autocorrelation, STL strengths and, for
    level series, the suggested transformation.
    """
    blocks = []
    for col in columns:
        series = col.get("series")
        if not isinstance(series, dict):
            continue
        name = col.get("name") or "(col)"
        block = [f"### {name}"]
        rows = []

        stat = series.get("stationarity") or {}
        if stat.get("verdict") is not None:
            rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])

        acf = series.get("acf_pacf") or {}
        autocorrelated = acf.get("is_autocorrelated")
        if autocorrelated is not None:
            rows.append([
                "autocorrelada (Ljung-Box)",
                "sí" if autocorrelated else "no",
            ])
        # NOTE(review): the source had no indentation; the significant-lags
        # row is treated as independent of the Ljung-Box row — confirm.
        sig_lags = acf.get("significant_acf_lags")
        if sig_lags:
            rows.append([
                "lags ACF significativos",
                ", ".join(str(lag) for lag in sig_lags[:12]),
            ])

        stl = series.get("stl") or {}
        if stl.get("trend_strength") is not None:
            rows.append(["fuerza de tendencia (STL)",
                         _fmt_num(stl.get("trend_strength"))])
        if stl.get("seasonal_strength") is not None:
            rows.append(["fuerza estacional (STL)",
                         _fmt_num(stl.get("seasonal_strength"))])
        if stl.get("period") is not None:
            rows.append(["periodo estacional", stl.get("period")])
        elif stl.get("note"):
            rows.append(["STL", stl.get("note")])

        if series.get("levels_suggested"):
            # The recommended transformation depends on semantics: returns
            # for financial series, differences for physical magnitudes
            # (returns on e.g. temperature have no physical meaning).
            kind = series.get("levels_kind")
            if kind == "returns":
                label = "convertir a retornos (serie de niveles financiera)"
            elif kind == "differences":
                label = "trabajar sobre diferencias (serie de niveles no financiera)"
            else:
                label = "convertir a retornos o diferencias (serie de niveles)"
            rows.append(["sugerencia", label])
            # Return metrics (mean/volatility) do not apply to differences.
            # NOTE(review): nested here because `kind` is only bound inside
            # this branch — confirm against the original layout.
            if kind != "differences":
                tr = series.get("to_returns") or {}
                if tr.get("mean") is not None:
                    rows.append(["retorno medio (log)", _fmt_num(tr.get("mean"))])
                if tr.get("std") is not None:
                    rows.append(["volatilidad retornos (σ)",
                                 _fmt_num(tr.get("std"))])

        if rows:
            block.append(_md_table(["aspecto", "valor"], rows))
        if stat.get("warning"):
            block.append(f"> {stat.get('warning')}")
        # NOTE(review): appended unconditionally, mirroring the numeric and
        # categorical sections — confirm against the original layout.
        blocks.append("\n\n".join(block))

    if not blocks:
        return []
    return ["## Series temporales"] + blocks


def _model_pca(pca: dict) -> str:
    """Render the PCA sub-block: summary line, variance table, top loadings."""
    evr = pca.get("explained_variance_ratio") or []
    cum = pca.get("cumulative") or []
    pca_rows = [
        [f"PC{i + 1}", _fmt_pct(var), _fmt_pct(cum[i] if i < len(cum) else None)]
        for i, var in enumerate(evr)
    ]

    sub = ["### PCA"]
    n_feat = pca.get("n_features")
    n_used = pca.get("n_rows_used")
    if n_feat is not None or n_used is not None:
        sub.append(
            f"{pca.get('n_components')} componentes sobre "
            f"{n_used if n_used is not None else '?'} filas, "
            f"{n_feat if n_feat is not None else '?'} features."
        )
    if pca_rows:
        sub.append(_md_table(
            ["componente", "var. explicada", "acumulada"], pca_rows))

    load_rows = []
    for ld in (pca.get("top_loadings") or [])[:12]:
        if not isinstance(ld, dict):
            continue
        comp = ld.get("component")
        # Integer component indices are shown 1-based as "PCn".
        comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
        load_rows.append([comp_label, ld.get("feature"),
                          _fmt_num(ld.get("loading"), 3)])
    if load_rows:
        sub.append("Cargas principales:")
        sub.append(_md_table(["componente", "feature", "carga"], load_rows))
    return "\n\n".join(sub)


def _model_kmeans(km: dict) -> str:
    """Render the KMeans sub-block: best-k summary and per-k score table."""
    sub = ["### KMeans"]
    sil = km.get("silhouette")
    sizes = km.get("cluster_sizes") or []
    head = f"mejor k = {_fmt_num(km.get('best_k'))}"
    if sil is not None:
        head += f" (silhouette {_fmt_num(sil, 3)})"
    if sizes:
        head += ". Tamaños de cluster: " + ", ".join(
            _fmt_num(size) for size in sizes)
    sub.append(head + ".")

    score_rows = [
        [sc.get("k"), _fmt_num(sc.get("silhouette"), 3),
         _fmt_num(sc.get("inertia"), 2)]
        for sc in (km.get("scores_by_k") or [])
        if isinstance(sc, dict)
    ]
    if score_rows:
        sub.append(_md_table(["k", "silhouette", "inertia"], score_rows))
    return "\n\n".join(sub)


def _model_outliers(out: dict) -> str:
    """Render the multivariate-outlier sub-block as a single sentence."""
    pct = out.get("outlier_pct")
    thr = out.get("threshold")
    line = f"{_fmt_num(out.get('n_outliers'))} filas marcadas como outlier"
    if pct is not None:
        # Per the original note, this outlier_pct is already on a 0-100 scale.
        line += f" ({_fmt_num(pct, 2)}%)"
    if thr is not None:
        line += f"; umbral de score {_fmt_num(thr, 3)}"
    return "### Outliers multivariante (Isolation Forest)\n\n" + line + "."


def _model_normality(normality: dict) -> str:
    """Render the normality sub-block; returns '' when there are no rows."""
    rows = []
    for col_name, res in normality.items():
        if not isinstance(res, dict):
            continue
        jb = res.get("jarque_bera") or {}
        rows.append([
            col_name,
            "sí" if res.get("is_normal") else "no",
            _fmt_num(jb.get("p")) if jb.get("p") is not None else "",
        ])
    if not rows:
        return ""
    return "### Normalidad\n\n" + _md_table(
        ["columna", "normal", "Jarque-Bera p"], rows)


def _section_models(models) -> list:
    """Build the 'Modelos' section; each sub-model renders only if present."""
    if not isinstance(models, dict):
        return []
    model_parts = []

    pca = models.get("pca")
    if isinstance(pca, dict):
        model_parts.append(_model_pca(pca))
    km = models.get("kmeans")
    if isinstance(km, dict):
        model_parts.append(_model_kmeans(km))
    out = models.get("outliers")
    if isinstance(out, dict):
        model_parts.append(_model_outliers(out))
    normality = models.get("normality")
    if isinstance(normality, dict):
        rendered = _model_normality(normality)
        if rendered:
            model_parts.append(rendered)
    note = models.get("note")
    if note:
        model_parts.append(f"> {note}")

    if not model_parts:
        return []
    return ["## Modelos"] + model_parts


def _section_llm(llm) -> list:
    """Build the LLM analysis section from a dict of topics or a plain value."""
    if not llm:
        return []
    out = ["## Análisis LLM"]
    if isinstance(llm, dict):
        for key, value in llm.items():
            if value is None:
                continue
            out.append(f"### {key}")
            if isinstance(value, (list, tuple)):
                out.append("\n".join(f"- {v}" for v in value))
            else:
                out.append(str(value))
    else:
        out.append(str(llm))
    return out


def _section_caveats(caveats) -> list:
    """Build the exploratory-caveats bullet list."""
    if isinstance(caveats, dict):
        cav_list = caveats.get("caveats") or []
    elif isinstance(caveats, list):
        cav_list = caveats
    else:
        cav_list = []

    lines = []
    for cav in cav_list:
        if not isinstance(cav, dict):
            continue
        topic = cav.get("topic") or cav.get("id") or ""
        msg = cav.get("message") or ""
        lines.append(f"- **{topic}**: {msg}")
    if not lines:
        return []
    return ["## Avisos exploratorios", "\n".join(lines)]


def render_eda_markdown(profile: dict) -> str:
    """Convert a TableProfile dict into a readable, self-contained markdown report.

    Sections are emitted in a fixed order: title/identity, overview, column
    summary, numeric columns, categorical columns, quality ranking,
    correlations, suggested re-expression, time series, models, LLM analysis
    and exploratory caveats.

    Args:
        profile: TableProfile dict. Keys may be missing or set to ``None``;
            every key is read with ``.get`` and sections without data are
            omitted. ``None`` is treated as an empty profile.

    Returns:
        The markdown report, with sections separated by blank lines and a
        single trailing newline.
    """
    if profile is None:
        profile = {}
    # Non-dict column entries are ignored by every section, so drop them once.
    columns = [
        col for col in (profile.get("columns") or []) if isinstance(col, dict)
    ]

    parts = []
    parts.extend(_section_header(profile))
    parts.extend(_section_overview(profile))
    parts.extend(_section_columns(columns))
    parts.extend(_section_numeric(columns))
    parts.extend(_section_categorical(columns))
    parts.extend(_section_quality(columns))
    parts.extend(_section_correlations(profile.get("correlations")))
    parts.extend(_section_reexpression(columns))
    parts.extend(_section_series(columns))
    parts.extend(_section_models(profile.get("models")))
    parts.extend(_section_llm(profile.get("llm")))
    parts.extend(_section_caveats(profile.get("caveats")))
    return "\n\n".join(parts) + "\n"