feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,302 @@
+"""Render a TableProfile dict (eda capability group) into a readable markdown report.
+
+Pure render function: dict in, markdown string out. No I/O, stdlib only.
+Reads every key defensively with .get(...) because most profile phases may be
+absent (None / missing) depending on how complete the profiling was.
+"""
+
+# Unicode block-element characters (U+2581..U+2588, not ASCII) used to draw
+# histogram sparklines, ordered from the lowest bar to the full block.
+_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"
+
+
+def _fmt_num(value, decimals: int = 4) -> str:
+    """Format a number compactly, falling back to str for non-numerics."""
+    if value is None:
+        return ""
+    if isinstance(value, bool):
+        return str(value)
+    if isinstance(value, int):
+        return str(value)
+    if isinstance(value, float):
+        if value != value:  # NaN
+            return "NaN"
+        if value in (float("inf"), float("-inf")):
+            return str(value)
+        # Trim trailing zeros for readability.
+        text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
+        return text if text else "0"
+    return str(value)
+
+
+def _fmt_pct(value, decimals: int = 2) -> str:
+    """Format a fraction (0-1) as a percentage 'NN.NN%'. Returns '' for None.
+
+    Every ``*_pct`` field in a TableProfile/ColumnProfile is a fraction in the
+    [0, 1] range (e.g. ``unique_pct=0.857`` means 85.7%). This helper multiplies
+    by 100 so the rendered markdown shows the human-facing percentage.
+    """
+    if value is None:
+        return ""
+    try:
+        num = float(value)
+    except (TypeError, ValueError):
+        return str(value)
+    return f"{num * 100:.{decimals}f}%"
+
+
+def _sparkline(histogram) -> str:
+    """Build an ASCII block sparkline from a histogram list of bins.
+
+    Each bin is a dict with a 'count' key. Counts are scaled linearly across the
+    block character ramp. Returns '' when the histogram is empty/None.
+    """
+    if not histogram:
+        return ""
+    counts = []
+    for bin_ in histogram:
+        if not isinstance(bin_, dict):
+            return ""
+        counts.append(bin_.get("count") or 0)
+    if not counts:
+        return ""
+    lo = min(counts)
+    hi = max(counts)
+    span = hi - lo
+    chars = []
+    last_idx = len(_SPARK_BLOCKS) - 1
+    for c in counts:
+        if span <= 0:
+            idx = 0
+        else:
+            idx = int(round((c - lo) / span * last_idx))
+            idx = max(0, min(last_idx, idx))
+        chars.append(_SPARK_BLOCKS[idx])
+    return "".join(chars)
+
+
+def _md_table(headers, rows) -> str:
+    """Render a markdown table from headers and a list of row lists."""
+    head = "| " + " | ".join(str(h) for h in headers) + " |"
+    sep = "| " + " | ".join("---" for _ in headers) + " |"
+    body = []
+    for row in rows:
+        cells = [str(c) if c is not None else "" for c in row]
+        body.append("| " + " | ".join(cells) + " |")
+    return "\n".join([head, sep] + body)
+
+
+def render_eda_markdown(profile: dict) -> str:
+    """Convert a TableProfile dict into a readable, self-contained markdown report.
+
+    Args:
+        profile: TableProfile dict from the eda capability group. May have many
+            keys set to None or missing; everything is read defensively and
+            empty sections are omitted cleanly.
+
+    Returns:
+        A markdown string. Sections with no data are skipped.
+    """
+    if profile is None:
+        profile = {}
+
+    parts: list[str] = []
+    columns = profile.get("columns") or []
+
+    # 1. Title + identity line.
+    table = profile.get("table") or "(unnamed)"
+    parts.append(f"# EDA — {table}")
+
+    identity_bits = []
+    source = profile.get("source")
+    if source:
+        identity_bits.append(f"source: `{source}`")
+    profiled_at = profile.get("profiled_at")
+    if profiled_at:
+        identity_bits.append(f"profiled_at: {profiled_at}")
+    n_rows = profile.get("n_rows")
+    n_cols = profile.get("n_cols")
+    if n_rows is not None or n_cols is not None:
+        identity_bits.append(f"{n_rows if n_rows is not None else '?'} rows × "
+                             f"{n_cols if n_cols is not None else '?'} cols")
+    if identity_bits:
+        parts.append(" · ".join(identity_bits))
+
+    # 2. Overview.
+    overview_rows = []
+    if profile.get("n_rows") is not None:
+        overview_rows.append(["Rows", profile.get("n_rows")])
+    if profile.get("n_cols") is not None:
+        overview_rows.append(["Columns", profile.get("n_cols")])
+    if profile.get("size_bytes") is not None:
+        overview_rows.append(["Size (bytes)", profile.get("size_bytes")])
+    if profile.get("duplicate_rows") is not None:
+        dup = f"{profile.get('duplicate_rows')}"
+        if profile.get("duplicate_pct") is not None:
+            dup += f" ({_fmt_pct(profile.get('duplicate_pct'))})"
+        overview_rows.append(["Duplicate rows", dup])
+    if profile.get("null_cell_pct") is not None:
+        overview_rows.append(["Null cells", _fmt_pct(profile.get("null_cell_pct"))])
+    constant_cols = profile.get("constant_cols") or []
+    if constant_cols:
+        overview_rows.append(["Constant columns", ", ".join(constant_cols)])
+    all_null_cols = profile.get("all_null_cols") or []
+    if all_null_cols:
+        overview_rows.append(["All-null columns", ", ".join(all_null_cols)])
+    if profile.get("quality_score") is not None:
+        overview_rows.append(["Quality score", _fmt_num(profile.get("quality_score"))])
+    type_breakdown = profile.get("type_breakdown") or {}
+    if type_breakdown:
+        tb = ", ".join(f"{k}: {v}" for k, v in type_breakdown.items() if v is not None)
+        if tb:
+            overview_rows.append(["Type breakdown", tb])
+    key_candidates = profile.get("key_candidates") or []
+    if key_candidates:
+        overview_rows.append(["Key candidates", ", ".join(key_candidates)])
+    if overview_rows:
+        parts.append("## Overview")
+        parts.append(_md_table(["Metric", "Value"], overview_rows))
+
+    # 3. Columns summary table.
+    if columns:
+        rows = []
+        for col in columns:
+            if not isinstance(col, dict):
+                continue
+            rows.append([
+                col.get("name"),
+                col.get("inferred_type"),
+                col.get("semantic_type"),
+                _fmt_pct(col.get("null_pct")),
+                col.get("distinct_count"),
+                _fmt_pct(col.get("unique_pct")),
+                _fmt_num(col.get("quality_score")),
+                ", ".join(col.get("flags") or []),
+            ])
+        if rows:
+            parts.append("## Columnas")
+            parts.append(_md_table(
+                ["name", "inferred_type", "semantic_type", "null_pct",
+                 "distinct", "unique_pct", "quality_score", "flags"],
+                rows,
+            ))
+
+    # 4. Numeric columns.
+    numeric_blocks = []
+    for col in columns:
+        if not isinstance(col, dict):
+            continue
+        num = col.get("numeric")
+        if not num:
+            continue
+        name = col.get("name") or "(col)"
+        stat_rows = []
+        for label, key in [
+            ("min", "min"), ("median", "median"), ("mean", "mean"),
+            ("std", "std"), ("p25", "p25"), ("p75", "p75"),
+            ("p95", "p95"), ("p99", "p99"), ("skew", "skew"),
+            ("outlier_pct", "outlier_pct"),
+            ("distribution_type", "distribution_type"),
+        ]:
+            val = num.get(key)
+            if val is None:
+                continue
+            if key == "outlier_pct":
+                stat_rows.append([label, _fmt_pct(val)])
+            elif key == "distribution_type":
+                stat_rows.append([label, str(val)])
+            else:
+                stat_rows.append([label, _fmt_num(val)])
+        block = [f"### {name}"]
+        if stat_rows:
+            block.append(_md_table(["stat", "value"], stat_rows))
+        spark = _sparkline(num.get("histogram"))
+        if spark:
+            block.append(f"histogram: `{spark}`")
+        numeric_blocks.append("\n\n".join(block))
+    if numeric_blocks:
+        parts.append("## Numéricas")
+        parts.extend(numeric_blocks)
+
+    # 5. Categorical columns.
+    categorical_blocks = []
+    for col in columns:
+        if not isinstance(col, dict):
+            continue
+        cat = col.get("categorical")
+        if not cat:
+            continue
+        name = col.get("name") or "(col)"
+        block = [f"### {name}"]
+        top = cat.get("top") or []
+        top_rows = []
+        for item in top:
+            if not isinstance(item, dict):
+                continue
+            top_rows.append([
+                item.get("value"),
+                item.get("count"),
+                _fmt_pct(item.get("pct")),
+            ])
+        if top_rows:
+            block.append(_md_table(["value", "count", "pct"], top_rows))
+        if cat.get("entropy") is not None:
+            block.append(f"entropy: {_fmt_num(cat.get('entropy'))}")
+        categorical_blocks.append("\n\n".join(block))
+    if categorical_blocks:
+        parts.append("## Categóricas")
+        parts.extend(categorical_blocks)
+
+    # 6. Quality ranking (worst quality_score first).
+    scored = [
+        col for col in columns
+        if isinstance(col, dict) and col.get("quality_score") is not None
+    ]
+    if scored:
+        scored.sort(key=lambda c: c.get("quality_score"))
+        rows = []
+        for col in scored:
+            issues = col.get("issues") or col.get("flags") or []
+            rows.append([
+                col.get("name"),
+                _fmt_num(col.get("quality_score")),
+                ", ".join(issues) if isinstance(issues, list) else str(issues),
+            ])
+        parts.append("## Calidad")
+        parts.append(_md_table(["column", "quality_score", "issues"], rows))
+
+    # 7. Correlations (tolerate None for now).
+    correlations = profile.get("correlations")
+    if correlations:
+        pairs = correlations
+        if isinstance(correlations, dict):
+            pairs = correlations.get("pairs") or correlations.get("strongest") or []
+        corr_rows = []
+        for pair in pairs or []:
+            if isinstance(pair, dict):
+                corr_rows.append([
+                    pair.get("a") or pair.get("col_a"),
+                    pair.get("b") or pair.get("col_b"),
+                    _fmt_num(pair.get("value") if pair.get("value") is not None
+                             else pair.get("corr")),
+                ])
+        if corr_rows:
+            parts.append("## Correlaciones")
+            parts.append(_md_table(["a", "b", "corr"], corr_rows))
+
+    # 8. LLM analysis (tolerate None for now).
+    llm = profile.get("llm")
+    if llm:
+        parts.append("## Análisis LLM")
+        if isinstance(llm, dict):
+            for key, value in llm.items():
+                if value is None:
+                    continue
+                parts.append(f"### {key}")
+                if isinstance(value, (list, tuple)):
+                    parts.append("\n".join(f"- {v}" for v in value))
+                else:
+                    parts.append(str(value))
+        else:
+            parts.append(str(llm))
+
+    return "\n\n".join(parts) + "\n"