"""Render a TableProfile dict (eda capability group) into a readable markdown report.

Pure render function: dict in, markdown string out. No I/O, stdlib only.

Reads every key defensively with .get(...) because most profile phases may be
absent (None / missing) depending on how complete the profiling was.
"""

# Unicode block-element characters used to draw histogram sparklines, low -> high.
_SPARK_BLOCKS = "▁▂▃▄▅▆▇█"

# Keys read from a column's "numeric" dict, in display order. The key doubles
# as the label shown in the "stat" column of the rendered table.
_NUMERIC_STATS = (
    "min",
    "median",
    "mean",
    "std",
    "p25",
    "p75",
    "p95",
    "p99",
    "skew",
    "outlier_pct",
    "distribution_type",
)


def _fmt_num(value, decimals: int = 4) -> str:
    """Format a number compactly, falling back to str for non-numerics.

    Args:
        value: The value to format. ``None`` renders as an empty string.
        decimals: Maximum number of digits kept after the decimal point.

    Returns:
        ``''`` for None; ``str(value)`` for bools, ints and non-numeric values;
        ``'NaN'`` / ``'inf'`` / ``'-inf'`` for non-finite floats; otherwise a
        fixed-point rendering with trailing zeros trimmed. A non-zero float
        that would round to zero at ``decimals`` places is rendered in compact
        scientific notation instead, so it is not shown as a misleading ``0``.
    """
    if value is None:
        return ""
    # bool is checked before int because bool is a subclass of int.
    if isinstance(value, (bool, int)):
        return str(value)
    if not isinstance(value, float):
        return str(value)
    if value != value:  # NaN is the only float not equal to itself.
        return "NaN"
    if value in (float("inf"), float("-inf")):
        return str(value)
    if value == 0:
        # Covers -0.0 as well, which would otherwise render as "-0".
        return "0"
    text = f"{value:.{decimals}f}"
    # Only trim zeros that follow a decimal point; with decimals=0 there is no
    # point and stripping would eat significant digits ("100" -> "1").
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    if text in ("0", "-0", ""):
        # Non-zero value lost to rounding: keep its magnitude visible.
        return f"{value:.{max(decimals, 1)}g}"
    return text


def _fmt_pct(value, decimals: int = 2) -> str:
    """Format a fraction (0-1) as a percentage 'NN.NN%'. Returns '' for None.

    Every ``*_pct`` field in a TableProfile/ColumnProfile is a fraction in the
    [0, 1] range (e.g. ``unique_pct=0.857`` means 85.7%). This helper multiplies
    by 100 so the rendered markdown shows the human-facing percentage. Values
    that cannot be converted with ``float()`` are returned as ``str(value)``.
    """
    if value is None:
        return ""
    try:
        fraction = float(value)
    except (TypeError, ValueError):
        return str(value)
    return f"{fraction * 100:.{decimals}f}%"


def _sparkline(histogram) -> str:
    """Build a block-character sparkline from a histogram list of bins.

    Each bin is a dict with a 'count' key (a missing or falsy count is treated
    as 0). Counts are scaled linearly between the smallest and largest count
    across the block character ramp. Returns '' when the histogram is
    empty/None or when any bin is not a dict.
    """
    if not histogram:
        return ""
    counts = []
    for bin_ in histogram:
        if not isinstance(bin_, dict):
            return ""
        counts.append(bin_.get("count") or 0)
    if not counts:
        return ""
    lowest = min(counts)
    span = max(counts) - lowest
    top_idx = len(_SPARK_BLOCKS) - 1
    glyphs = []
    for count in counts:
        if span <= 0:
            # Flat histogram: every bin gets the lowest block.
            idx = 0
        else:
            idx = int(round((count - lowest) / span * top_idx))
            idx = max(0, min(top_idx, idx))
        glyphs.append(_SPARK_BLOCKS[idx])
    return "".join(glyphs)


def _md_cell(value) -> str:
    """Render one table cell: None -> '', pipes escaped, line breaks flattened.

    A raw '|' would be read as a column separator and a raw newline would end
    the table row, so both are neutralised here.
    """
    if value is None:
        return ""
    text = str(value)
    text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return text.replace("|", "\\|")


def _join_items(items) -> str:
    """Join a collection into 'a, b, c', tolerating non-string items.

    A bare string is returned unchanged (rather than being split into
    characters), list/tuple/set items are passed through ``str()`` before
    joining, None renders as '', and anything else falls back to ``str()``.
    """
    if items is None:
        return ""
    if isinstance(items, str):
        return items
    if isinstance(items, (list, tuple, set, frozenset)):
        return ", ".join(str(item) for item in items)
    return str(items)


def _md_table(headers, rows) -> str:
    """Render a markdown table from headers and a list of row lists.

    Cells are converted with ``_md_cell`` so None becomes an empty cell and
    pipe characters / line breaks inside values cannot break the table layout.
    """
    lines = [
        "| " + " | ".join(_md_cell(h) for h in headers) + " |",
        "| " + " | ".join("---" for _ in headers) + " |",
    ]
    for row in rows:
        lines.append("| " + " | ".join(_md_cell(cell) for cell in row) + " |")
    return "\n".join(lines)


def _title_section(profile) -> list:
    """Return the report title plus the one-line identity summary (if any)."""
    table = profile.get("table") or "(unnamed)"
    parts = [f"# EDA — {table}"]
    identity_bits = []
    source = profile.get("source")
    if source:
        identity_bits.append(f"source: `{source}`")
    profiled_at = profile.get("profiled_at")
    if profiled_at:
        identity_bits.append(f"profiled_at: {profiled_at}")
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    if n_rows is not None or n_cols is not None:
        identity_bits.append(
            f"{n_rows if n_rows is not None else '?'} rows × "
            f"{n_cols if n_cols is not None else '?'} cols"
        )
    if identity_bits:
        parts.append(" · ".join(identity_bits))
    return parts


def _overview_section(profile) -> list:
    """Return the '## Overview' metric table, or [] when no metric is set."""
    rows = []
    for label, key in (
        ("Rows", "n_rows"),
        ("Columns", "n_cols"),
        ("Size (bytes)", "size_bytes"),
    ):
        if profile.get(key) is not None:
            rows.append([label, profile.get(key)])
    if profile.get("duplicate_rows") is not None:
        dup = f"{profile.get('duplicate_rows')}"
        if profile.get("duplicate_pct") is not None:
            dup += f" ({_fmt_pct(profile.get('duplicate_pct'))})"
        rows.append(["Duplicate rows", dup])
    if profile.get("null_cell_pct") is not None:
        rows.append(["Null cells", _fmt_pct(profile.get("null_cell_pct"))])
    constant_cols = profile.get("constant_cols") or []
    if constant_cols:
        rows.append(["Constant columns", _join_items(constant_cols)])
    all_null_cols = profile.get("all_null_cols") or []
    if all_null_cols:
        rows.append(["All-null columns", _join_items(all_null_cols)])
    if profile.get("quality_score") is not None:
        rows.append(["Quality score", _fmt_num(profile.get("quality_score"))])
    type_breakdown = profile.get("type_breakdown")
    if isinstance(type_breakdown, dict) and type_breakdown:
        breakdown = ", ".join(
            f"{k}: {v}" for k, v in type_breakdown.items() if v is not None
        )
        if breakdown:
            rows.append(["Type breakdown", breakdown])
    key_candidates = profile.get("key_candidates") or []
    if key_candidates:
        rows.append(["Key candidates", _join_items(key_candidates)])
    if not rows:
        return []
    return ["## Overview", _md_table(["Metric", "Value"], rows)]


def _columns_section(columns) -> list:
    """Return the per-column summary table, or [] when there are no columns."""
    rows = [
        [
            col.get("name"),
            col.get("inferred_type"),
            col.get("semantic_type"),
            _fmt_pct(col.get("null_pct")),
            col.get("distinct_count"),
            _fmt_pct(col.get("unique_pct")),
            _fmt_num(col.get("quality_score")),
            _join_items(col.get("flags") or []),
        ]
        for col in columns
    ]
    if not rows:
        return []
    headers = [
        "name", "inferred_type", "semantic_type", "null_pct",
        "distinct", "unique_pct", "quality_score", "flags",
    ]
    return ["## Columnas", _md_table(headers, rows)]


def _numeric_section(columns) -> list:
    """Return one stats block per column that carries a 'numeric' dict."""
    blocks = []
    for col in columns:
        num = col.get("numeric")
        if not isinstance(num, dict) or not num:
            continue
        stat_rows = []
        for key in _NUMERIC_STATS:
            val = num.get(key)
            if val is None:
                continue
            if key == "outlier_pct":
                # Per the original author's note, outlier_pct already arrives
                # on a 0-100 scale from describe_numeric (100 * n_outliers / n),
                # so _fmt_pct must NOT be used here: it would multiply by 100
                # again and yield impossible percentages (e.g. 7% -> 700%).
                stat_rows.append([key, _fmt_num(val, 2) + "%"])
            elif key == "distribution_type":
                stat_rows.append([key, str(val)])
            else:
                stat_rows.append([key, _fmt_num(val)])
        block = [f"### {col.get('name') or '(col)'}"]
        if stat_rows:
            block.append(_md_table(["stat", "value"], stat_rows))
        spark = _sparkline(num.get("histogram"))
        if spark:
            block.append(f"histogram: `{spark}`")
        # A heading with nothing under it carries no data; skip it.
        if len(block) > 1:
            blocks.append("\n\n".join(block))
    if not blocks:
        return []
    return ["## Numéricas"] + blocks


def _categorical_section(columns) -> list:
    """Return one top-values block per column that carries a 'categorical' dict."""
    blocks = []
    for col in columns:
        cat = col.get("categorical")
        if not isinstance(cat, dict) or not cat:
            continue
        block = [f"### {col.get('name') or '(col)'}"]
        top_rows = [
            [item.get("value"), item.get("count"), _fmt_pct(item.get("pct"))]
            for item in (cat.get("top") or [])
            if isinstance(item, dict)
        ]
        if top_rows:
            block.append(_md_table(["value", "count", "pct"], top_rows))
        if cat.get("entropy") is not None:
            block.append(f"entropy: {_fmt_num(cat.get('entropy'))}")
        # A heading with nothing under it carries no data; skip it.
        if len(block) > 1:
            blocks.append("\n\n".join(block))
    if not blocks:
        return []
    return ["## Categóricas"] + blocks


def _quality_section(columns) -> list:
    """Return the quality ranking table, worst (lowest) quality_score first."""
    scored = [col for col in columns if col.get("quality_score") is not None]
    if not scored:
        return []
    rows = [
        [
            col.get("name"),
            _fmt_num(col.get("quality_score")),
            _join_items(col.get("issues") or col.get("flags") or []),
        ]
        for col in sorted(scored, key=lambda c: c["quality_score"])
    ]
    return ["## Calidad", _md_table(["column", "quality_score", "issues"], rows)]


def _correlations_section(profile) -> list:
    """Return the correlation / association table.

    Per the original author's note, `association_matrix` has already corrected
    the p-values for multiple comparisons (FDR Benjamini-Hochberg / Bonferroni);
    this function only renders the fields it produced (value, p_value_adjusted,
    significant) and recomputes nothing. The `strong` pairs are preferred when
    present; otherwise every evaluated pair is shown.
    """
    correlations = profile.get("correlations")
    if not correlations:
        return []
    strong = []
    multiple_testing = None
    if isinstance(correlations, dict):
        strong = correlations.get("strong") or correlations.get("strongest") or []
        all_pairs = correlations.get("pairs") or []
        multiple_testing = correlations.get("multiple_testing")
    else:
        # Not a dict: treat the value itself as the collection of pairs.
        all_pairs = correlations
    rows = []
    for pair in strong or all_pairs:
        if not isinstance(pair, dict):
            continue
        value = pair.get("value")
        if value is None:
            value = pair.get("corr")
        padj = pair.get("p_value_adjusted")
        sig = pair.get("significant")
        rows.append([
            pair.get("a") or pair.get("col_a"),
            pair.get("b") or pair.get("col_b"),
            pair.get("method", ""),
            _fmt_num(value),
            _fmt_num(padj) if padj is not None else "",
            "sí" if sig else ("no" if sig is not None else ""),
        ])
    if not rows:
        return []
    parts = ["## Correlaciones"]
    if isinstance(multiple_testing, dict):
        parts.append(
            "Corrección de comparaciones múltiples: "
            f"{multiple_testing.get('method')} "
            f"(α={multiple_testing.get('alpha')}); "
            f"{multiple_testing.get('n_rejected')} de "
            f"{multiple_testing.get('n_tests')} pares significativos tras la "
            "corrección. Mostrando "
            f"{'solo pares fuertes' if strong else 'todos los pares evaluados'}."
        )
    parts.append(_md_table(
        ["a", "b", "method", "value", "p_adj (FDR)", "sig"], rows))
    return parts


def _reexpression_section(columns) -> list:
    """Return the suggested re-expression table (Tukey's ladder of powers).

    Per the original author's note, `suggest_reexpression` picks the transform;
    this function only renders its recommendation and reason.
    """
    rows = []
    for col in columns:
        rx = col.get("reexpression")
        if not isinstance(rx, dict) or rx.get("recommended") is None:
            continue
        ladder = rx.get("ladder_power")
        rows.append([
            col.get("name"),
            _fmt_num(rx.get("skew")),
            rx.get("recommended"),
            _fmt_num(ladder) if ladder is not None else "",
            rx.get("reason", ""),
        ])
    if not rows:
        return []
    return [
        "## Re-expresión sugerida",
        _md_table(["column", "skew", "transform", "ladder_power", "reason"], rows),
    ]


def _series_block(col):
    """Return the time-series block for one column, or None if it has no data."""
    series = col.get("series")
    if not isinstance(series, dict):
        return None
    rows = []
    stat = series.get("stationarity") or {}
    if stat.get("verdict") is not None:
        rows.append(["estacionariedad (ADF+KPSS)", stat.get("verdict")])
    acf = series.get("acf_pacf") or {}
    if acf.get("is_autocorrelated") is not None:
        rows.append([
            "autocorrelada (Ljung-Box)",
            "sí" if acf.get("is_autocorrelated") else "no",
        ])
    sig_lags = acf.get("significant_acf_lags")
    if sig_lags:
        # Only the first 12 significant lags are listed.
        rows.append([
            "lags ACF significativos",
            ", ".join(str(lag) for lag in sig_lags[:12]),
        ])
    stl = series.get("stl") or {}
    if stl.get("trend_strength") is not None:
        rows.append(["fuerza de tendencia (STL)", _fmt_num(stl.get("trend_strength"))])
    if stl.get("seasonal_strength") is not None:
        rows.append(["fuerza estacional (STL)", _fmt_num(stl.get("seasonal_strength"))])
    if stl.get("period") is not None:
        rows.append(["periodo estacional", stl.get("period")])
    elif stl.get("note"):
        rows.append(["STL", stl.get("note")])
    if series.get("levels_suggested"):
        # The recommended transform depends on the semantics: returns for
        # financial series (price/volume), differences for physical magnitudes
        # (temperature, flow). "Returns" on temperature has no physical
        # meaning; differences do.
        kind = series.get("levels_kind")
        if kind == "returns":
            label = "convertir a retornos (serie de niveles financiera)"
        elif kind == "differences":
            label = "trabajar sobre diferencias (serie de niveles no financiera)"
        else:
            label = "convertir a retornos o diferencias (serie de niveles)"
        rows.append(["sugerencia", label])
        # Return metrics (mean / volatility) are shown unless the recommended
        # transform is differences, where they do not apply.
        if kind != "differences":
            returns = series.get("to_returns") or {}
            if returns.get("mean") is not None:
                rows.append(["retorno medio (log)", _fmt_num(returns.get("mean"))])
            if returns.get("std") is not None:
                rows.append(["volatilidad retornos (σ)", _fmt_num(returns.get("std"))])
    block = [f"### {col.get('name') or '(col)'}"]
    if rows:
        block.append(_md_table(["aspecto", "valor"], rows))
    if stat.get("warning"):
        block.append(f"> {stat.get('warning')}")
    # A heading with nothing under it carries no data; skip it.
    if len(block) == 1:
        return None
    return "\n\n".join(block)


def _series_section(columns) -> list:
    """Return the time-series section: one block per column with a 'series' dict.

    Each block may show stationarity (ADF+KPSS), autocorrelation (Ljung-Box and
    significant ACF lags), STL decomposition strengths and, for level series,
    a returns/differences suggestion.
    """
    blocks = []
    for col in columns:
        block = _series_block(col)
        if block is not None:
            blocks.append(block)
    if not blocks:
        return []
    return ["## Series temporales"] + blocks


def _llm_section(profile) -> list:
    """Return the LLM analysis section, or [] when there is nothing to show."""
    llm = profile.get("llm")
    if not llm:
        return []
    if not isinstance(llm, dict):
        return ["## Análisis LLM", str(llm)]
    body = []
    for key, value in llm.items():
        if value is None:
            continue
        body.append(f"### {key}")
        if isinstance(value, (list, tuple)):
            body.append("\n".join(f"- {v}" for v in value))
        else:
            body.append(str(value))
    # Every entry was None: do not emit a bare heading.
    if not body:
        return []
    return ["## Análisis LLM"] + body


def _caveats_section(profile) -> list:
    """Return the exploratory caveats bullet list.

    Accepts either a list of caveat dicts or a dict holding that list under
    its own 'caveats' key. Per the original author's note, these remind the
    reader that EDA produces hypotheses, not conclusions.
    """
    caveats = profile.get("caveats")
    if isinstance(caveats, dict):
        cav_list = caveats.get("caveats") or []
    elif isinstance(caveats, list):
        cav_list = caveats
    else:
        cav_list = []
    lines = []
    for cav in cav_list:
        if not isinstance(cav, dict):
            continue
        topic = cav.get("topic") or cav.get("id") or ""
        msg = cav.get("message") or ""
        lines.append(f"- **{topic}**: {msg}")
    if not lines:
        return []
    return ["## Avisos exploratorios", "\n".join(lines)]


def render_eda_markdown(profile: dict) -> str:
    """Convert a TableProfile dict into a readable, self-contained markdown report.

    Args:
        profile: TableProfile dict from the eda capability group. May have many
            keys set to None or missing (or be None itself); everything is read
            defensively and empty sections are omitted cleanly.

    Returns:
        A markdown string ending in a newline. Sections appear in a fixed
        order (title, overview, columns, numeric, categorical, quality,
        correlations, re-expression, time series, LLM analysis, caveats) and
        sections with no data are skipped.
    """
    if profile is None:
        profile = {}
    # Entries that are not dicts cannot be read with .get(); drop them once.
    columns = [col for col in (profile.get("columns") or []) if isinstance(col, dict)]

    parts: list[str] = []
    parts.extend(_title_section(profile))
    parts.extend(_overview_section(profile))
    parts.extend(_columns_section(columns))
    parts.extend(_numeric_section(columns))
    parts.extend(_categorical_section(columns))
    parts.extend(_quality_section(columns))
    parts.extend(_correlations_section(profile))
    parts.extend(_reexpression_section(columns))
    parts.extend(_series_section(columns))
    parts.extend(_llm_section(profile))
    parts.extend(_caveats_section(profile))
    return "\n\n".join(parts) + "\n"