Files
fn_registry/python/functions/datascience/render_eda_markdown.py
T
egutierrez 763e06c127 feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00

303 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Render a TableProfile dict (eda capability group) into a readable markdown report.
Pure render function: dict in, markdown string out. No I/O, stdlib only.
Reads every key defensively with .get(...) because most profile phases may be
absent (None / missing) depending on how complete the profiling was.
"""
# Unicode block-element characters (U+2581..U+2588, not ASCII) used to draw
# histogram sparklines, ordered from lowest bar to highest bar.
_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"
def _fmt_num(value, decimals: int = 4) -> str:
"""Format a number compactly, falling back to str for non-numerics."""
if value is None:
return ""
if isinstance(value, bool):
return str(value)
if isinstance(value, int):
return str(value)
if isinstance(value, float):
if value != value: # NaN
return "NaN"
if value in (float("inf"), float("-inf")):
return str(value)
# Trim trailing zeros for readability.
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
return text if text else "0"
return str(value)
def _fmt_pct(value, decimals: int = 2) -> str:
"""Format a fraction (0-1) as a percentage 'NN.NN%'. Returns '' for None.
Every ``*_pct`` field in a TableProfile/ColumnProfile is a fraction in the
[0, 1] range (e.g. ``unique_pct=0.857`` means 85.7%). This helper multiplies
by 100 so the rendered markdown shows the human-facing percentage.
"""
if value is None:
return ""
try:
num = float(value)
except (TypeError, ValueError):
return str(value)
return f"{num * 100:.{decimals}f}%"
def _sparkline(histogram) -> str:
    """Turn a histogram (list of bin dicts) into a block-character sparkline.

    Each bin contributes one character, chosen from ``_SPARK_BLOCKS`` by
    scaling its ``'count'`` linearly between the smallest and largest count.
    A missing or falsy count is treated as 0.

    Returns:
        One character per bin, or ``''`` when the histogram is empty/None or
        any bin is not a dict. When every count is equal, all bins use the
        lowest block.
    """
    if not histogram:
        return ""

    heights = []
    for entry in histogram:
        if not isinstance(entry, dict):
            # A malformed bin invalidates the whole sparkline.
            return ""
        heights.append(entry.get("count") or 0)
    if not heights:
        return ""

    floor, ceiling = min(heights), max(heights)
    spread = ceiling - floor
    top = len(_SPARK_BLOCKS) - 1

    if spread <= 0:
        # Flat histogram: nothing to scale, so every bin gets the lowest block.
        return _SPARK_BLOCKS[0] * len(heights)

    def _level(height):
        # Linear position on the ramp, clamped to a valid index.
        scaled = int(round((height - floor) / spread * top))
        return max(0, min(top, scaled))

    return "".join(_SPARK_BLOCKS[_level(height)] for height in heights)
def _md_table(headers, rows) -> str:
"""Render a markdown table from headers and a list of row lists."""
head = "| " + " | ".join(str(h) for h in headers) + " |"
sep = "| " + " | ".join("---" for _ in headers) + " |"
body = []
for row in rows:
cells = [str(c) if c is not None else "" for c in row]
body.append("| " + " | ".join(cells) + " |")
return "\n".join([head, sep] + body)
# Keys read from a column's "numeric" dict, in the order they are rendered.
_EDA_NUMERIC_STATS = (
    "min", "median", "mean", "std", "p25", "p75",
    "p95", "p99", "skew", "outlier_pct", "distribution_type",
)


def _eda_join(items) -> str:
    """Comma-join items, coercing each to ``str`` so non-string entries cannot raise."""
    return ", ".join(str(item) for item in items)


def _eda_header(profile: dict) -> list:
    """Build the H1 title plus the one-line identity summary (source, time, shape)."""
    table = profile.get("table") or "(unnamed)"
    parts = [f"# EDA — {table}"]

    identity_bits = []
    source = profile.get("source")
    if source:
        identity_bits.append(f"source: `{source}`")
    profiled_at = profile.get("profiled_at")
    if profiled_at:
        identity_bits.append(f"profiled_at: {profiled_at}")
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    if n_rows is not None or n_cols is not None:
        # Show '?' for whichever dimension is unknown.
        rows_text = n_rows if n_rows is not None else "?"
        cols_text = n_cols if n_cols is not None else "?"
        identity_bits.append(f"{rows_text} rows × {cols_text} cols")

    if identity_bits:
        parts.append(" · ".join(identity_bits))
    return parts


def _eda_overview(profile: dict) -> list:
    """Build the table-level metrics section; empty when no metric is present."""
    rows = []

    # Plain scalar metrics are shown as-is when present.
    for label, key in (("Rows", "n_rows"), ("Columns", "n_cols"),
                       ("Size (bytes)", "size_bytes")):
        value = profile.get(key)
        if value is not None:
            rows.append([label, value])

    duplicate_rows = profile.get("duplicate_rows")
    if duplicate_rows is not None:
        dup = f"{duplicate_rows}"
        duplicate_pct = profile.get("duplicate_pct")
        if duplicate_pct is not None:
            dup += f" ({_fmt_pct(duplicate_pct)})"
        rows.append(["Duplicate rows", dup])

    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(["Null cells", _fmt_pct(null_cell_pct)])

    constant_cols = profile.get("constant_cols") or []
    if constant_cols:
        rows.append(["Constant columns", _eda_join(constant_cols)])

    all_null_cols = profile.get("all_null_cols") or []
    if all_null_cols:
        rows.append(["All-null columns", _eda_join(all_null_cols)])

    quality_score = profile.get("quality_score")
    if quality_score is not None:
        rows.append(["Quality score", _fmt_num(quality_score)])

    type_breakdown = profile.get("type_breakdown") or {}
    if type_breakdown:
        breakdown = ", ".join(
            f"{k}: {v}" for k, v in type_breakdown.items() if v is not None
        )
        if breakdown:
            rows.append(["Type breakdown", breakdown])

    key_candidates = profile.get("key_candidates") or []
    if key_candidates:
        rows.append(["Key candidates", _eda_join(key_candidates)])

    if not rows:
        return []
    return ["## Overview", _md_table(["Metric", "Value"], rows)]


def _eda_columns_summary(columns: list) -> list:
    """Build the one-row-per-column summary table."""
    rows = [
        [
            col.get("name"),
            col.get("inferred_type"),
            col.get("semantic_type"),
            _fmt_pct(col.get("null_pct")),
            col.get("distinct_count"),
            _fmt_pct(col.get("unique_pct")),
            _fmt_num(col.get("quality_score")),
            _eda_join(col.get("flags") or []),
        ]
        for col in columns
    ]
    if not rows:
        return []
    headers = ["name", "inferred_type", "semantic_type", "null_pct",
               "distinct", "unique_pct", "quality_score", "flags"]
    return ["## Columnas", _md_table(headers, rows)]


def _eda_numeric(columns: list) -> list:
    """Build one stats block (table plus sparkline) per column with numeric data."""
    blocks = []
    for col in columns:
        num = col.get("numeric")
        # Skip absent/empty payloads and anything that is not a dict.
        if not num or not isinstance(num, dict):
            continue
        stat_rows = []
        for key in _EDA_NUMERIC_STATS:
            val = num.get(key)
            if val is None:
                continue
            if key == "outlier_pct":
                text = _fmt_pct(val)
            elif key == "distribution_type":
                text = str(val)
            else:
                text = _fmt_num(val)
            stat_rows.append([key, text])

        name = col.get("name") or "(col)"
        block = [f"### {name}"]
        if stat_rows:
            block.append(_md_table(["stat", "value"], stat_rows))
        spark = _sparkline(num.get("histogram"))
        if spark:
            block.append(f"histogram: `{spark}`")
        blocks.append("\n\n".join(block))

    if not blocks:
        return []
    return ["## Numéricas", *blocks]


def _eda_categorical(columns: list) -> list:
    """Build one block (top values plus entropy) per column with categorical data."""
    blocks = []
    for col in columns:
        cat = col.get("categorical")
        # Skip absent/empty payloads and anything that is not a dict.
        if not cat or not isinstance(cat, dict):
            continue
        name = col.get("name") or "(col)"
        block = [f"### {name}"]

        top_rows = [
            [item.get("value"), item.get("count"), _fmt_pct(item.get("pct"))]
            for item in (cat.get("top") or [])
            if isinstance(item, dict)
        ]
        if top_rows:
            block.append(_md_table(["value", "count", "pct"], top_rows))

        entropy = cat.get("entropy")
        if entropy is not None:
            block.append(f"entropy: {_fmt_num(entropy)}")
        blocks.append("\n\n".join(block))

    if not blocks:
        return []
    return ["## Categóricas", *blocks]


def _eda_quality(columns: list) -> list:
    """Build the quality ranking table, lowest ``quality_score`` first."""
    scored = sorted(
        (col for col in columns if col.get("quality_score") is not None),
        key=lambda col: col.get("quality_score"),
    )
    if not scored:
        return []

    rows = []
    for col in scored:
        # Prefer explicit issues; fall back to flags when there are none.
        issues = col.get("issues") or col.get("flags") or []
        issues_text = _eda_join(issues) if isinstance(issues, list) else str(issues)
        rows.append([col.get("name"), _fmt_num(col.get("quality_score")), issues_text])
    return ["## Calidad", _md_table(["column", "quality_score", "issues"], rows)]


def _eda_correlations(correlations) -> list:
    """Build the correlation pairs table from a list of pairs or a wrapping dict."""
    if not correlations:
        return []
    pairs = correlations
    if isinstance(correlations, dict):
        # A dict wrapper may carry the pair list under either key.
        pairs = correlations.get("pairs") or correlations.get("strongest") or []

    rows = []
    for pair in pairs or []:
        if not isinstance(pair, dict):
            continue
        # 'value' wins when present (even if 0); otherwise fall back to 'corr'.
        value = pair.get("value")
        if value is None:
            value = pair.get("corr")
        rows.append([
            pair.get("a") or pair.get("col_a"),
            pair.get("b") or pair.get("col_b"),
            _fmt_num(value),
        ])
    if not rows:
        return []
    return ["## Correlaciones", _md_table(["a", "b", "corr"], rows)]


def _eda_llm(llm) -> list:
    """Build the LLM analysis section from a dict of sub-sections or a plain value."""
    if not llm:
        return []
    parts = ["## Análisis LLM"]
    if not isinstance(llm, dict):
        parts.append(str(llm))
        return parts
    for key, value in llm.items():
        if value is None:
            continue
        parts.append(f"### {key}")
        if isinstance(value, (list, tuple)):
            # Sequences become a bullet list, one item per line.
            parts.append("\n".join(f"- {v}" for v in value))
        else:
            parts.append(str(value))
    return parts


def render_eda_markdown(profile: dict) -> str:
    """Convert a TableProfile dict into a readable, self-contained markdown report.

    The report is assembled from independent sections, in this order: title
    and identity line, overview metrics, per-column summary, numeric column
    stats, categorical column stats, quality ranking, correlations, and LLM
    analysis. Each section is produced by a private helper and is omitted
    when it has no data.

    Args:
        profile: TableProfile dict from the eda capability group. May have many
            keys set to None or missing; everything is read defensively and
            empty sections are omitted cleanly. ``None`` is treated as an
            empty profile.

    Returns:
        A markdown string ending with a single newline. Sections with no data
        are skipped; the title line is always present.
    """
    if profile is None:
        profile = {}

    # Materialise the usable column entries once: several sections iterate
    # them, and non-dict entries are ignored everywhere.
    columns = [
        col for col in (profile.get("columns") or []) if isinstance(col, dict)
    ]

    parts = []
    parts.extend(_eda_header(profile))
    parts.extend(_eda_overview(profile))
    parts.extend(_eda_columns_summary(columns))
    parts.extend(_eda_numeric(columns))
    parts.extend(_eda_categorical(columns))
    parts.extend(_eda_quality(columns))
    parts.extend(_eda_correlations(profile.get("correlations")))
    parts.extend(_eda_llm(profile.get("llm")))
    return "\n\n".join(parts) + "\n"