feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,302 @@
|
||||
"""Render a TableProfile dict (eda capability group) into a readable markdown report.
|
||||
|
||||
Pure render function: dict in, markdown string out. No I/O, stdlib only.
|
||||
Reads every key defensively with .get(...) because most profile phases may be
|
||||
absent (None / missing) depending on how complete the profiling was.
|
||||
"""
|
||||
|
||||
# Unicode block-element characters used to draw histogram sparklines, low -> high.
|
||||
_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 4) -> str:
    """Format a number compactly, falling back to ``str`` for non-numerics.

    Args:
        value: Value to render. ``None`` becomes an empty string; bools, ints
            and any other non-float value are rendered with ``str``; floats
            are rendered in fixed-point notation with trailing fractional
            zeros removed.
        decimals: Maximum number of fractional digits kept for floats.

    Returns:
        The formatted text. NaN renders as ``"NaN"`` and infinities as
        ``"inf"`` / ``"-inf"``.
    """
    if value is None:
        return ""
    # bool is a subclass of int; both (and every other non-float) use str().
    if not isinstance(value, float):
        return str(value)
    if value != value:  # NaN is the only value that is not equal to itself.
        return "NaN"
    if value in (float("inf"), float("-inf")):
        return str(value)
    text = f"{value:.{decimals}f}"
    # Only strip zeros that belong to the fractional part: with decimals=0
    # there is no "." and a blind rstrip would turn "100" into "1".
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    # A tiny negative value rounds to zero; do not keep the stray sign.
    if text == "-0":
        return "0"
    return text
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 2) -> str:
    """Render a fractional value as a percentage string such as '85.70%'.

    The input is treated as a fraction and multiplied by 100 before being
    formatted, so ``0.857`` becomes ``'85.70%'``. Profile ``*_pct`` fields are
    expected to be fractions in the [0, 1] range.

    Args:
        value: Fraction to render. ``None`` yields an empty string; anything
            that cannot be converted with ``float()`` is returned via ``str``.
        decimals: Number of fractional digits in the rendered percentage.

    Returns:
        The percentage text, including the trailing ``%`` sign.
    """
    if value is None:
        return ""
    try:
        fraction = float(value)
    except (TypeError, ValueError):
        # Not convertible to a number: show the raw value verbatim.
        return str(value)
    else:
        scaled = fraction * 100
        return format(scaled, f".{decimals}f") + "%"
|
||||
|
||||
|
||||
def _sparkline(histogram) -> str:
    """Turn a histogram (list of bin dicts) into a block-character sparkline.

    Every bin must be a dict; its ``'count'`` entry (missing/falsy counts are
    treated as 0) is mapped linearly from the smallest to the largest count
    onto the characters of ``_SPARK_BLOCKS``. When all counts are equal the
    lowest block is used for every bin.

    Args:
        histogram: Iterable of bin dicts, or None.

    Returns:
        One character per bin, or ``''`` when the histogram is empty/None or
        contains a bin that is not a dict.
    """
    if not histogram:
        return ""

    weights = []
    for entry in histogram:
        if not isinstance(entry, dict):
            # A single malformed bin invalidates the whole sparkline.
            return ""
        weights.append(entry.get("count") or 0)
    if not weights:
        return ""

    floor = min(weights)
    spread = max(weights) - floor
    if spread <= 0:
        # Flat histogram: every bin gets the lowest block.
        return _SPARK_BLOCKS[0] * len(weights)

    top = len(_SPARK_BLOCKS) - 1
    return "".join(
        _SPARK_BLOCKS[max(0, min(top, int(round((w - floor) / spread * top))))]
        for w in weights
    )
|
||||
|
||||
|
||||
def _md_escape(text: str) -> str:
    """Make text safe for a markdown table cell (escape pipes, flatten newlines)."""
    # A raw "|" would open a new column and a raw line break would end the
    # table row, so both have to be neutralised before joining cells.
    flattened = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return flattened.replace("|", "\\|")


def _md_table(headers, rows) -> str:
    """Render a markdown pipe table from headers and a list of row lists.

    Args:
        headers: Iterable of column titles; each is converted with ``str``.
        rows: Iterable of rows, each an iterable of cell values. ``None``
            cells render as empty; every other value is converted with
            ``str``.

    Returns:
        The header line, the ``---`` separator line and one line per row,
        joined with newlines (no trailing newline). Pipe characters inside
        headers and cells are escaped with a backslash and embedded line
        breaks are replaced by a space so the table structure stays intact.
    """
    # Materialise once: headers is needed for both the title and separator
    # lines, which would silently break for a one-shot iterator.
    titles = [_md_escape(str(h)) for h in headers]
    lines = [
        "| " + " | ".join(titles) + " |",
        "| " + " | ".join("---" for _ in titles) + " |",
    ]
    for row in rows:
        cells = ["" if c is None else _md_escape(str(c)) for c in row]
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _eda_join(items) -> str:
    """Join a collection of labels with ', ', stringifying every item."""
    if not items:
        return ""
    if isinstance(items, (list, tuple, set, frozenset)):
        # str() each item so non-string labels (e.g. integer column names)
        # do not make str.join raise TypeError.
        return ", ".join(str(item) for item in items)
    return str(items)


def _eda_identity(profile: dict) -> str:
    """Build the one-line identity summary (source, timestamp, shape)."""
    bits = []
    source = profile.get("source")
    if source:
        bits.append(f"source: `{source}`")
    profiled_at = profile.get("profiled_at")
    if profiled_at:
        bits.append(f"profiled_at: {profiled_at}")
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    if n_rows is not None or n_cols is not None:
        rows_txt = "?" if n_rows is None else n_rows
        cols_txt = "?" if n_cols is None else n_cols
        bits.append(f"{rows_txt} rows × {cols_txt} cols")
    return " · ".join(bits)


def _eda_overview(profile: dict) -> list[str]:
    """Build the '## Overview' heading and metric table, or [] when empty."""
    rows = []
    for label, key in (
        ("Rows", "n_rows"),
        ("Columns", "n_cols"),
        ("Size (bytes)", "size_bytes"),
    ):
        value = profile.get(key)
        if value is not None:
            rows.append([label, value])

    duplicate_rows = profile.get("duplicate_rows")
    if duplicate_rows is not None:
        dup = f"{duplicate_rows}"
        duplicate_pct = profile.get("duplicate_pct")
        if duplicate_pct is not None:
            dup += f" ({_fmt_pct(duplicate_pct)})"
        rows.append(["Duplicate rows", dup])

    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(["Null cells", _fmt_pct(null_cell_pct)])

    for label, key in (
        ("Constant columns", "constant_cols"),
        ("All-null columns", "all_null_cols"),
    ):
        joined = _eda_join(profile.get(key))
        if joined:
            rows.append([label, joined])

    quality_score = profile.get("quality_score")
    if quality_score is not None:
        rows.append(["Quality score", _fmt_num(quality_score)])

    type_breakdown = profile.get("type_breakdown")
    if isinstance(type_breakdown, dict):
        breakdown = ", ".join(
            f"{k}: {v}" for k, v in type_breakdown.items() if v is not None
        )
        if breakdown:
            rows.append(["Type breakdown", breakdown])

    key_candidates = _eda_join(profile.get("key_candidates"))
    if key_candidates:
        rows.append(["Key candidates", key_candidates])

    if not rows:
        return []
    return ["## Overview", _md_table(["Metric", "Value"], rows)]


def _eda_columns_table(columns: list) -> list[str]:
    """Build the '## Columnas' per-column summary table, or [] when empty."""
    rows = [
        [
            col.get("name"),
            col.get("inferred_type"),
            col.get("semantic_type"),
            _fmt_pct(col.get("null_pct")),
            col.get("distinct_count"),
            _fmt_pct(col.get("unique_pct")),
            _fmt_num(col.get("quality_score")),
            _eda_join(col.get("flags")),
        ]
        for col in columns
    ]
    if not rows:
        return []
    headers = [
        "name", "inferred_type", "semantic_type", "null_pct",
        "distinct", "unique_pct", "quality_score", "flags",
    ]
    return ["## Columnas", _md_table(headers, rows)]


def _eda_numeric(columns: list) -> list[str]:
    """Build the '## Numéricas' section: one stats block per numeric column."""
    stat_keys = (
        "min", "median", "mean", "std", "p25", "p75",
        "p95", "p99", "skew", "outlier_pct", "distribution_type",
    )
    blocks: list[str] = []
    for col in columns:
        num = col.get("numeric")
        if not num or not isinstance(num, dict):
            continue
        stat_rows = []
        for key in stat_keys:
            val = num.get(key)
            if val is None:
                continue
            if key == "outlier_pct":
                text = _fmt_pct(val)
            elif key == "distribution_type":
                text = str(val)
            else:
                text = _fmt_num(val)
            stat_rows.append([key, text])
        body = []
        if stat_rows:
            body.append(_md_table(["stat", "value"], stat_rows))
        spark = _sparkline(num.get("histogram"))
        if spark:
            body.append(f"histogram: `{spark}`")
        # Skip the column entirely rather than emit a bare heading.
        if body:
            blocks.append(f"### {col.get('name') or '(col)'}")
            blocks.extend(body)
    return ["## Numéricas", *blocks] if blocks else []


def _eda_categorical(columns: list) -> list[str]:
    """Build the '## Categóricas' section: top values and entropy per column."""
    blocks: list[str] = []
    for col in columns:
        cat = col.get("categorical")
        if not cat or not isinstance(cat, dict):
            continue
        top_rows = [
            [item.get("value"), item.get("count"), _fmt_pct(item.get("pct"))]
            for item in (cat.get("top") or [])
            if isinstance(item, dict)
        ]
        body = []
        if top_rows:
            body.append(_md_table(["value", "count", "pct"], top_rows))
        entropy = cat.get("entropy")
        if entropy is not None:
            body.append(f"entropy: {_fmt_num(entropy)}")
        # Skip the column entirely rather than emit a bare heading.
        if body:
            blocks.append(f"### {col.get('name') or '(col)'}")
            blocks.extend(body)
    return ["## Categóricas", *blocks] if blocks else []


def _eda_quality(columns: list) -> list[str]:
    """Build the '## Calidad' ranking, lowest quality_score first."""
    scored = sorted(
        (col for col in columns if col.get("quality_score") is not None),
        key=lambda col: col.get("quality_score"),
    )
    if not scored:
        return []
    rows = [
        [
            col.get("name"),
            _fmt_num(col.get("quality_score")),
            # Fall back to the column flags when no explicit issues exist.
            _eda_join(col.get("issues") or col.get("flags")),
        ]
        for col in scored
    ]
    return ["## Calidad", _md_table(["column", "quality_score", "issues"], rows)]


def _eda_correlations(profile: dict) -> list[str]:
    """Build the '## Correlaciones' table from a list or dict of pairs."""
    correlations = profile.get("correlations")
    if not correlations:
        return []
    pairs = correlations
    if isinstance(correlations, dict):
        pairs = correlations.get("pairs") or correlations.get("strongest") or []
    rows = []
    for pair in pairs or []:
        if not isinstance(pair, dict):
            continue
        value = pair.get("value")
        if value is None:
            value = pair.get("corr")
        rows.append([
            pair.get("a") or pair.get("col_a"),
            pair.get("b") or pair.get("col_b"),
            _fmt_num(value),
        ])
    if not rows:
        return []
    return ["## Correlaciones", _md_table(["a", "b", "corr"], rows)]


def _eda_llm(profile: dict) -> list[str]:
    """Build the '## Análisis LLM' section from a dict of entries or free text."""
    llm = profile.get("llm")
    if not llm:
        return []
    body: list[str] = []
    if isinstance(llm, dict):
        for key, value in llm.items():
            if value is None:
                continue
            if isinstance(value, (list, tuple)):
                text = "\n".join(f"- {v}" for v in value)
            else:
                text = str(value)
            # Entries that render to nothing would leave a dangling heading.
            if text:
                body.extend([f"### {key}", text])
    else:
        body.append(str(llm))
    # Emit the section heading only when something sits underneath it.
    return ["## Análisis LLM", *body] if body else []


def render_eda_markdown(profile: dict) -> str:
    """Convert a TableProfile dict into a readable, self-contained markdown report.

    The report is assembled from independent sections (title and identity
    line, overview, column summary, numeric columns, categorical columns,
    quality ranking, correlations, LLM analysis). Each section is built by a
    private helper and contributes nothing when it has no data, so no heading
    is ever emitted without content below it.

    Args:
        profile: TableProfile dict from the eda capability group. May have
            many keys set to None or missing; everything is read defensively.
            ``None`` is treated as an empty profile. Entries of ``columns``
            that are not dicts are ignored.

    Returns:
        A markdown string ending with a single newline. Parts are separated
        by blank lines; the title line is always present.
    """
    if profile is None:
        profile = {}

    # Non-dict column entries are ignored by every section, so filter once.
    columns = [c for c in (profile.get("columns") or []) if isinstance(c, dict)]

    parts: list[str] = [f"# EDA — {profile.get('table') or '(unnamed)'}"]
    identity = _eda_identity(profile)
    if identity:
        parts.append(identity)

    parts.extend(_eda_overview(profile))
    parts.extend(_eda_columns_table(columns))
    parts.extend(_eda_numeric(columns))
    parts.extend(_eda_categorical(columns))
    parts.extend(_eda_quality(columns))
    parts.extend(_eda_correlations(profile))
    parts.extend(_eda_llm(profile))

    return "\n\n".join(parts) + "\n"
|
||||
Reference in New Issue
Block a user