c4cff5ed5b
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
943 lines
36 KiB
Python
943 lines
36 KiB
Python
"""render_eda_pdf — Portable, mobile-readable PDF report of a TableProfile (eda group).
|
||
|
||
Impure function (writes a file): takes a TableProfile dict from the `eda`
|
||
capability group and renders a MULTI-PAGE PDF designed to be read and explored
|
||
on a phone screen. It is the 4th output of the eda workflow, next to the
|
||
markdown report, the JSON sidecar and the executed Jupyter notebook.
|
||
|
||
Design follows Edward Tufte, "The Visual Display of Quantitative Information":
|
||
high data-ink ratio (no chartjunk, despined axes, light grids), small multiples
|
||
for per-column histograms, and graphical integrity (y-axes start at 0, no
|
||
misleading truncation). Pages are A5 portrait, single column, with a large,
|
||
legible typeface so the report stays readable on a small display.
|
||
|
||
Every key of the profile is read defensively with ``.get(...)`` and only the
sections actually present are rendered. The function is forward-compatible:
``models``, ``series`` and ``caveats`` have dedicated page builders, and any
other top-level block this renderer does not know about is dumped generically
on a final page instead of being ignored or crashing the render.
|
||
|
||
dict-no-throw contract of the eda group: it NEVER raises. Any failure of a single
|
||
section is caught and noted; the function always returns a dict with the path,
|
||
the page count and a human note.
|
||
|
||
Engine: matplotlib ``PdfPages`` (already in ``python/.venv``) — zero new deps.
|
||
"""
|
||
|
||
import os
|
||
import textwrap
|
||
from datetime import datetime, timezone
|
||
|
||
import matplotlib
|
||
|
||
# Headless backend: this runs in agents/CI without a display.
|
||
matplotlib.use("Agg")
|
||
|
||
import matplotlib.pyplot as plt # noqa: E402
|
||
import numpy as np # noqa: E402
|
||
from matplotlib.backends.backend_pdf import PdfPages # noqa: E402
|
||
|
||
# A5 portrait in inches (148 x 210 mm). Single column, tall, phone-friendly.
_A5_PORTRAIT = (5.83, 8.27)

# Number of per-column small multiples stacked vertically on one page.
_NUMERIC_PER_PAGE = 3
_CATEGORICAL_PER_PAGE = 3

# Top-of-profile keys this renderer handles explicitly. Anything else found at
# the top level of the profile is dumped on the forward-compat
# "Otras secciones" page so new sections added by sibling functions still
# reach the reader.
_KNOWN_TOP_KEYS = {
    "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
    "duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols",
    "all_null_cols", "quality_score", "type_breakdown", "key_candidates",
    "columns", "correlations", "llm",
    # Blocks with a dedicated builder (they do not fall through to the
    # generic str(dict) dump).
    "models", "series", "caveats",
}

# Restrained, high-contrast palette: a single accent reads cleanly on a phone.
_INK = "#1b1b1b"
_ACCENT = "#2a6f97"
_MUTED = "#8a8a8a"

# Tufte-ish render defaults shared by both public entry points.
_RC = {
    "font.size": 10,
    "font.family": "sans-serif",
    "axes.titlesize": 11,
    "axes.edgecolor": _MUTED,
    "figure.facecolor": "white",
    "savefig.facecolor": "white",
    "pdf.fonttype": 42,  # embed TrueType so text stays selectable on mobile.
}
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# Small formatting + Tufte helpers
|
||
# --------------------------------------------------------------------------- #
|
||
def _fmt_num(value, decimals: int = 3) -> str:
|
||
"""Format a number compactly; fall back to str for non-numerics/None."""
|
||
if value is None:
|
||
return "—"
|
||
if isinstance(value, bool):
|
||
return str(value)
|
||
if isinstance(value, int):
|
||
return f"{value:,}"
|
||
if isinstance(value, float):
|
||
if value != value: # NaN
|
||
return "NaN"
|
||
if value in (float("inf"), float("-inf")):
|
||
return str(value)
|
||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||
return text if text else "0"
|
||
return str(value)
|
||
|
||
|
||
def _fmt_pct(value, decimals: int = 1) -> str:
|
||
"""Format a fraction (0-1) as 'NN.N%'. Returns '—' for None."""
|
||
if value is None:
|
||
return "—"
|
||
try:
|
||
num = float(value)
|
||
except (TypeError, ValueError):
|
||
return str(value)
|
||
return f"{num * 100:.{decimals}f}%"
|
||
|
||
|
||
def _despine(ax) -> None:
    """Hide the top/right spines and mute the left/bottom ones on ``ax``.

    Also recolours the ticks with the muted palette colour and the axes title
    with the ink colour.
    """
    spines = ax.spines
    spines["top"].set_visible(False)
    spines["right"].set_visible(False)
    for name in ("left", "bottom"):
        spine = spines[name]
        spine.set_color(_MUTED)
        spine.set_linewidth(0.6)
    ax.tick_params(colors=_MUTED, labelsize=7, length=2)
    ax.title.set_color(_INK)
|
||
|
||
|
||
def _truncate(text, width: int = 22) -> str:
|
||
"""Clip an arbitrary value to a short label for tight phone layouts."""
|
||
s = str(text) if text is not None else "—"
|
||
return s if len(s) <= width else s[: width - 1] + "…"
|
||
|
||
|
||
def _text_page(pdf, title: str, lines: list, subtitle: str = None) -> int:
    """Render one text page (monospace body) and return 1 (pages written).

    Args:
        pdf: open ``PdfPages`` object the figure is saved into.
        title: bold page heading.
        lines: body lines; each item is converted with ``str`` and the items
            are joined with newlines.
        subtitle: optional muted line drawn under the heading.

    Returns:
        Always 1, the number of pages written.
    """
    def _literal(text) -> str:
        # Matplotlib parses text holding an even number of unescaped "$" as
        # mathtext; escaping keeps data such as "$5-$10" drawn verbatim.
        return str(text).replace("$", r"\$")

    fig = plt.figure(figsize=_A5_PORTRAIT)
    try:
        fig.text(0.08, 0.94, _literal(title), fontsize=16, fontweight="bold",
                 color=_INK)
        if subtitle:
            fig.text(0.08, 0.905, _literal(subtitle), fontsize=9, color=_MUTED)
        body = "\n".join(_literal(line) for line in lines)
        fig.text(
            0.08, 0.88, body, fontsize=9.5, color=_INK, family="monospace",
            va="top", ha="left", linespacing=1.5,
        )
        pdf.savefig(fig)
    finally:
        # Always release the figure, even when drawing or saving fails, so a
        # failed section does not leave figures registered in pyplot.
        plt.close(fig)
    return 1
|
||
|
||
|
||
def _kv_lines(rows: list, key_width: int = 18) -> list:
|
||
"""Format [label, value] rows as aligned 'label : value' monospace lines."""
|
||
out = []
|
||
for label, value in rows:
|
||
out.append(f"{str(label):<{key_width}}: {value}")
|
||
return out
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# Page builders (each fully defensive, each returns the number of pages it made)
|
||
# --------------------------------------------------------------------------- #
|
||
def _cover_page(pdf, profile: dict, title: str) -> int:
    """Cover: table name, date, shape and an oversized quality score.

    Args:
        pdf: open ``PdfPages`` object the figure is saved into.
        profile: TableProfile dict; read with ``.get`` only.
        title: heading override; when falsy, ``"EDA — <table>"`` is used.

    Returns:
        Always 1, the number of pages written.
    """
    fig = plt.figure(figsize=_A5_PORTRAIT)
    try:
        table = profile.get("table") or "(tabla sin nombre)"
        heading = title or f"EDA — {table}"
        fig.text(0.08, 0.82, heading, fontsize=22, fontweight="bold",
                 color=_INK, wrap=True)

        sub = []
        src = profile.get("source")
        if src:
            sub.append(f"fuente: {_truncate(src, 40)}")
        # Fall back to the render time when the profile carries no timestamp.
        when = profile.get("profiled_at") or datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M UTC"
        )
        sub.append(f"generado: {when}")
        fig.text(0.08, 0.76, "\n".join(sub), fontsize=10, color=_MUTED, va="top")

        n_rows = profile.get("n_rows")
        n_cols = profile.get("n_cols")
        shape = f"{_fmt_num(n_rows)} filas × {_fmt_num(n_cols)} columnas"
        fig.text(0.08, 0.60, shape, fontsize=15, color=_ACCENT, fontweight="bold")

        score = profile.get("quality_score")
        if score is not None:
            fig.text(0.08, 0.42, "calidad", fontsize=12, color=_MUTED)
            fig.text(0.08, 0.31, _fmt_num(score), fontsize=60,
                     fontweight="bold", color=_INK)
            fig.text(0.08, 0.25, "sobre 100", fontsize=12, color=_MUTED)

        fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil",
                 fontsize=8, color=_MUTED, style="italic")
        pdf.savefig(fig)
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)
    return 1
|
||
|
||
|
||
def _overview_page(pdf, profile: dict) -> int:
    """Overview key/value page: types, duplicates, nulls, constants, keys.

    Only metrics present (not None / not empty) in ``profile`` produce a row;
    when nothing is present a single placeholder row is rendered.

    Returns:
        The page count reported by ``_text_page``.
    """
    def _joined(key: str) -> str:
        # Column names are not guaranteed to be strings, so stringify each
        # item before joining.
        value = profile.get(key) or []
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return ""
        return ", ".join(str(item) for item in value)

    rows = []
    for key, label in (
        ("n_rows", "Filas"),
        ("n_cols", "Columnas"),
        ("size_bytes", "Tamaño (bytes)"),
    ):
        if profile.get(key) is not None:
            rows.append([label, _fmt_num(profile.get(key))])

    if profile.get("duplicate_rows") is not None:
        dup = _fmt_num(profile.get("duplicate_rows"))
        if profile.get("duplicate_pct") is not None:
            dup += f" ({_fmt_pct(profile.get('duplicate_pct'))})"
        rows.append(["Filas duplicadas", dup])
    if profile.get("null_cell_pct") is not None:
        rows.append(["Celdas nulas", _fmt_pct(profile.get("null_cell_pct"))])
    if profile.get("quality_score") is not None:
        rows.append(["Calidad", _fmt_num(profile.get("quality_score"))])

    type_breakdown = profile.get("type_breakdown")
    if not isinstance(type_breakdown, dict):
        type_breakdown = {}
    # Zero/empty counts are skipped to keep the line short.
    tb = ", ".join(f"{k}: {v}" for k, v in type_breakdown.items() if v)
    if tb:
        rows.append(["Tipos", tb])

    for key, label in (
        ("constant_cols", "Columnas constantes"),
        ("all_null_cols", "Columnas all-null"),
        ("key_candidates", "Candidatos a clave"),
    ):
        names = _joined(key)
        if names:
            rows.append([label, _truncate(names, 40)])

    if not rows:
        rows.append(["(sin métricas de overview)", ""])

    return _text_page(pdf, "Overview", _kv_lines(rows, key_width=20))
|
||
|
||
|
||
def _numeric_pages(pdf, columns: list) -> int:
|
||
"""Small multiples: a real histogram per numeric column, several per page."""
|
||
numeric_cols = [
|
||
c for c in columns
|
||
if isinstance(c, dict) and c.get("numeric") and c["numeric"].get("histogram")
|
||
]
|
||
if not numeric_cols:
|
||
return 0
|
||
|
||
pages = 0
|
||
for start in range(0, len(numeric_cols), _NUMERIC_PER_PAGE):
|
||
chunk = numeric_cols[start:start + _NUMERIC_PER_PAGE]
|
||
fig, axes = plt.subplots(
|
||
len(chunk), 1, figsize=_A5_PORTRAIT, squeeze=False,
|
||
)
|
||
fig.suptitle("Distribuciones numéricas", fontsize=14, fontweight="bold",
|
||
color=_INK, x=0.08, ha="left", y=0.98)
|
||
for ax, col in zip(axes[:, 0], chunk):
|
||
_draw_histogram(ax, col)
|
||
# Hide unused axes if the chunk is short (keeps spacing even).
|
||
for ax in axes[len(chunk):, 0]:
|
||
ax.axis("off")
|
||
fig.tight_layout(rect=[0, 0, 1, 0.95])
|
||
pdf.savefig(fig)
|
||
plt.close(fig)
|
||
pages += 1
|
||
return pages
|
||
|
||
|
||
def _draw_histogram(ax, col: dict) -> None:
    """Draw one column's histogram from its ``{lo, hi, count}`` bins.

    Bins that are not dicts, or whose ``lo`` / ``hi`` / ``count`` are missing,
    non-numeric or non-finite, are skipped. When no usable bin remains the
    axes is switched off and a short placeholder message is shown instead.

    Args:
        ax: matplotlib axes to draw on.
        col: column profile dict; its ``numeric`` entry is read with ``.get``.
    """
    num = col.get("numeric")
    if not isinstance(num, dict):
        num = {}
    hist = num.get("histogram")
    if not isinstance(hist, (list, tuple)):
        hist = []

    lefts, widths, counts = [], [], []
    for b in hist:
        if not isinstance(b, dict):
            continue
        try:
            lo = float(b.get("lo"))
            hi = float(b.get("hi"))
            cnt = float(b.get("count") or 0)
        except (TypeError, ValueError):
            # Missing (None) or non-numeric edge/count: skip this bin.
            continue
        if not (np.isfinite(lo) and np.isfinite(hi) and np.isfinite(cnt)):
            continue
        w = hi - lo
        if w <= 0:
            # Degenerate bin: give it a tiny positive width so it is drawable.
            w = max(abs(lo) * 1e-6, 1e-6)
        lefts.append(lo)
        widths.append(w)
        counts.append(cnt)

    name = col.get("name") or "(col)"
    if not counts:
        ax.axis("off")
        ax.text(0.5, 0.5, f"{name}: sin datos numéricos", ha="center",
                va="center", fontsize=8, color=_MUTED, transform=ax.transAxes)
        return

    ax.bar(lefts, counts, width=widths, align="edge", color=_ACCENT,
           edgecolor="white", linewidth=0.3)
    # Graphical integrity: count axis starts at 0, never truncated.
    ax.set_ylim(bottom=0)
    _despine(ax)
    ax.set_title(_truncate(name, 28), fontsize=10, loc="left", pad=4)
    ax.grid(axis="y", color=_MUTED, alpha=0.15, linewidth=0.5)
    ax.set_axisbelow(True)

    # Median reference line (a single light marker, no chartjunk).
    median = num.get("median")
    if (isinstance(median, (int, float)) and not isinstance(median, bool)
            and np.isfinite(median)):
        ax.axvline(median, color=_INK, linewidth=0.8, alpha=0.5)

    # One compact annotation line: mean / std / outliers.
    bits = []
    if num.get("mean") is not None:
        bits.append(f"μ={_fmt_num(num.get('mean'))}")
    if num.get("std") is not None:
        bits.append(f"σ={_fmt_num(num.get('std'))}")
    if num.get("outlier_pct") is not None:
        bits.append(f"outliers={_fmt_num(num.get('outlier_pct'), 1)}%")
    if bits:
        ax.text(0.99, 0.92, " ".join(bits), transform=ax.transAxes,
                ha="right", va="top", fontsize=7, color=_MUTED)
|
||
|
||
|
||
def _categorical_pages(pdf, columns: list) -> int:
|
||
"""Top-k horizontal bars per categorical column, several per page."""
|
||
cat_cols = [
|
||
c for c in columns
|
||
if isinstance(c, dict) and c.get("categorical")
|
||
and (c["categorical"].get("top"))
|
||
]
|
||
if not cat_cols:
|
||
return 0
|
||
|
||
pages = 0
|
||
for start in range(0, len(cat_cols), _CATEGORICAL_PER_PAGE):
|
||
chunk = cat_cols[start:start + _CATEGORICAL_PER_PAGE]
|
||
fig, axes = plt.subplots(
|
||
len(chunk), 1, figsize=_A5_PORTRAIT, squeeze=False,
|
||
)
|
||
fig.suptitle("Categóricas (top-k)", fontsize=14, fontweight="bold",
|
||
color=_INK, x=0.08, ha="left", y=0.98)
|
||
for ax, col in zip(axes[:, 0], chunk):
|
||
_draw_topk_bars(ax, col)
|
||
for ax in axes[len(chunk):, 0]:
|
||
ax.axis("off")
|
||
fig.tight_layout(rect=[0, 0, 1, 0.95])
|
||
pdf.savefig(fig)
|
||
plt.close(fig)
|
||
pages += 1
|
||
return pages
|
||
|
||
|
||
def _draw_topk_bars(ax, col: dict) -> None:
    """Plot up to ten top-value counts of a categorical column as ``barh``.

    Non-dict entries of ``top`` are ignored. With no usable entry the axes is
    switched off and a placeholder message is drawn instead.
    """
    cat = col.get("categorical") or {}
    entries = [
        entry for entry in (cat.get("top") or [])[:10]
        if isinstance(entry, dict)
    ]
    labels = [_truncate(entry.get("value"), 20) for entry in entries]
    values = [entry.get("count") or 0 for entry in entries]

    name = col.get("name") or "(col)"
    if not values:
        ax.axis("off")
        ax.text(0.5, 0.5, f"{name}: sin categorías", ha="center", va="center",
                fontsize=8, color=_MUTED, transform=ax.transAxes)
        return

    # barh stacks upwards, so flip the order to keep the first entry on top.
    labels.reverse()
    values.reverse()
    positions = np.arange(len(values))
    ax.barh(positions, values, color=_ACCENT, edgecolor="white", linewidth=0.3)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_xlim(left=0)  # bars start at 0 so length encodes the count.
    _despine(ax)
    ax.set_title(_truncate(name, 28), fontsize=10, loc="left", pad=4)
    ax.grid(axis="x", color=_MUTED, alpha=0.15, linewidth=0.5)
    ax.set_axisbelow(True)

    entropy = cat.get("entropy")
    if entropy is not None:
        ax.text(0.99, 1.02, f"entropía={_fmt_num(entropy)}",
                transform=ax.transAxes, ha="right", va="bottom", fontsize=7,
                color=_MUTED)
|
||
|
||
|
||
def _quality_page(pdf, columns: list) -> int:
|
||
"""Worst-quality columns first, with their issues/flags."""
|
||
scored = [
|
||
c for c in columns
|
||
if isinstance(c, dict) and c.get("quality_score") is not None
|
||
]
|
||
if not scored:
|
||
return 0
|
||
scored = sorted(scored, key=lambda c: c.get("quality_score"))
|
||
|
||
lines = [f"{'columna':<20} {'score':>6} problemas", "-" * 52]
|
||
for col in scored:
|
||
issues = col.get("issues") or col.get("flags") or []
|
||
issues_s = ", ".join(issues) if isinstance(issues, list) else str(issues)
|
||
lines.append(
|
||
f"{_truncate(col.get('name'), 20):<20} "
|
||
f"{_fmt_num(col.get('quality_score'), 1):>6} {_truncate(issues_s, 24)}"
|
||
)
|
||
return _text_page(pdf, "Calidad", lines,
|
||
subtitle="ordenado de peor a mejor calidad")
|
||
|
||
|
||
def _correlations_page(pdf, correlations) -> int:
|
||
"""Heatmap of the association matrix reconstructed from the pairs list."""
|
||
if not correlations:
|
||
return 0
|
||
pairs = correlations
|
||
if isinstance(correlations, dict):
|
||
pairs = correlations.get("pairs") or correlations.get("strong") or []
|
||
if not pairs:
|
||
return 0
|
||
|
||
# Build the symmetric label set and a value matrix from the pairs.
|
||
labels = []
|
||
for p in pairs:
|
||
if not isinstance(p, dict):
|
||
continue
|
||
for key in ("a", "col_a", "b", "col_b"):
|
||
v = p.get(key)
|
||
if v is not None and v not in labels:
|
||
labels.append(v)
|
||
if len(labels) < 2:
|
||
return 0
|
||
idx = {lab: i for i, lab in enumerate(labels)}
|
||
n = len(labels)
|
||
mat = np.full((n, n), np.nan)
|
||
for i in range(n):
|
||
mat[i, i] = 1.0
|
||
for p in pairs:
|
||
if not isinstance(p, dict):
|
||
continue
|
||
a = p.get("a") or p.get("col_a")
|
||
b = p.get("b") or p.get("col_b")
|
||
val = p.get("value")
|
||
if val is None:
|
||
val = p.get("corr")
|
||
if a in idx and b in idx and val is not None:
|
||
try:
|
||
fv = float(val)
|
||
except (TypeError, ValueError):
|
||
continue
|
||
mat[idx[a], idx[b]] = fv
|
||
mat[idx[b], idx[a]] = fv
|
||
|
||
fig, ax = plt.subplots(figsize=_A5_PORTRAIT)
|
||
fig.suptitle("Correlaciones / asociación", fontsize=14, fontweight="bold",
|
||
color=_INK, x=0.08, ha="left", y=0.97)
|
||
im = ax.imshow(mat, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
|
||
ax.set_xticks(np.arange(n))
|
||
ax.set_yticks(np.arange(n))
|
||
ax.set_xticklabels([_truncate(lab, 12) for lab in labels], rotation=60,
|
||
ha="right", fontsize=7, color=_INK)
|
||
ax.set_yticklabels([_truncate(lab, 14) for lab in labels], fontsize=7,
|
||
color=_INK)
|
||
ax.tick_params(length=0)
|
||
for side in ("top", "right", "left", "bottom"):
|
||
ax.spines[side].set_visible(False)
|
||
# Annotate cells only when few columns (keeps it legible on a phone).
|
||
if n <= 8:
|
||
for i in range(n):
|
||
for j in range(n):
|
||
if not np.isnan(mat[i, j]):
|
||
ax.text(j, i, _fmt_num(mat[i, j], 2), ha="center",
|
||
va="center", fontsize=6,
|
||
color=_INK if abs(mat[i, j]) < 0.6 else "white")
|
||
cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
|
||
cbar.ax.tick_params(labelsize=7)
|
||
fig.tight_layout(rect=[0, 0, 1, 0.94])
|
||
pdf.savefig(fig)
|
||
plt.close(fig)
|
||
return 1
|
||
|
||
|
||
def _llm_pages(pdf, llm) -> int:
|
||
"""Render the LLM block (data dictionary / summary) as wrapped text pages."""
|
||
if not llm:
|
||
return 0
|
||
lines = []
|
||
if isinstance(llm, dict):
|
||
for key, value in llm.items():
|
||
if value is None:
|
||
continue
|
||
lines.append(f"## {key}")
|
||
lines.extend(_wrap_value(value))
|
||
lines.append("")
|
||
else:
|
||
lines.extend(_wrap_value(llm))
|
||
if not lines:
|
||
return 0
|
||
return _paginate_text(pdf, "Análisis LLM", lines)
|
||
|
||
|
||
def _generic_pages(pdf, profile: dict) -> int:
|
||
"""Forward-compat: dump unknown top-level sections so they still reach the reader."""
|
||
extras = {
|
||
k: v for k, v in profile.items()
|
||
if k not in _KNOWN_TOP_KEYS and v is not None
|
||
}
|
||
if not extras:
|
||
return 0
|
||
lines = []
|
||
for key, value in extras.items():
|
||
lines.append(f"## {key}")
|
||
lines.extend(_wrap_value(value))
|
||
lines.append("")
|
||
if not lines:
|
||
return 0
|
||
return _paginate_text(pdf, "Otras secciones", lines,
|
||
subtitle="bloques nuevos del profile (forward-compat)")
|
||
|
||
|
||
def _wrap_value(value, width: int = 78) -> list:
|
||
"""Flatten an arbitrary value into wrapped, readable text lines."""
|
||
out = []
|
||
if isinstance(value, dict):
|
||
for k, v in value.items():
|
||
out.append(f"- {k}: {_truncate(_scalar(v), 64)}")
|
||
elif isinstance(value, (list, tuple)):
|
||
for item in value:
|
||
if isinstance(item, dict):
|
||
out.append("- " + _truncate(
|
||
", ".join(f"{k}={_scalar(v)}" for k, v in item.items()), 70))
|
||
else:
|
||
out.append(f"- {_truncate(_scalar(item), 72)}")
|
||
else:
|
||
for line in textwrap.wrap(str(value), width=width) or [""]:
|
||
out.append(line)
|
||
return out
|
||
|
||
|
||
def _scalar(v) -> str:
|
||
"""Compact one-line representation of a scalar/nested value."""
|
||
if isinstance(v, float):
|
||
return _fmt_num(v)
|
||
if isinstance(v, (dict, list, tuple)):
|
||
return _truncate(str(v), 60)
|
||
return str(v)
|
||
|
||
|
||
def _paginate_text(pdf, title: str, lines: list, subtitle: str = None,
|
||
per_page: int = 34) -> int:
|
||
"""Split a long list of text lines across several text pages."""
|
||
pages = 0
|
||
for start in range(0, len(lines), per_page):
|
||
chunk = lines[start:start + per_page]
|
||
page_title = title if pages == 0 else f"{title} (cont.)"
|
||
pages += _text_page(pdf, page_title, chunk,
|
||
subtitle=subtitle if pages == 0 else None)
|
||
return pages
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# Dedicated builders for forward-compat blocks (models / series / caveats).
|
||
# Before these existed, ``models``/``series``/``caveats`` fell to the generic
|
||
# dump and were rendered as truncated ``str(dict)``. Each builder is fully
|
||
# defensive, reads with ``.get`` and returns the number of pages it produced.
|
||
# --------------------------------------------------------------------------- #
|
||
def _models_pages(pdf, models) -> int:
|
||
"""Render the cheap-models block (PCA / KMeans / outliers / normality)."""
|
||
if not isinstance(models, dict):
|
||
return 0
|
||
lines = []
|
||
|
||
pca = models.get("pca")
|
||
if isinstance(pca, dict):
|
||
lines.append("## PCA")
|
||
n_used = pca.get("n_rows_used")
|
||
n_feat = pca.get("n_features")
|
||
if n_used is not None or n_feat is not None:
|
||
lines.append(
|
||
f" {pca.get('n_components')} comp · "
|
||
f"{_fmt_num(n_used)} filas · {_fmt_num(n_feat)} features"
|
||
)
|
||
evr = pca.get("explained_variance_ratio") or []
|
||
cum = pca.get("cumulative") or []
|
||
for i, var in enumerate(evr):
|
||
acc = cum[i] if i < len(cum) else None
|
||
lines.append(f" PC{i + 1}: var {_fmt_pct(var)} acum {_fmt_pct(acc)}")
|
||
loadings = pca.get("top_loadings") or []
|
||
if loadings:
|
||
lines.append(" cargas principales:")
|
||
for ld in loadings[:8]:
|
||
if not isinstance(ld, dict):
|
||
continue
|
||
comp = ld.get("component")
|
||
comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
|
||
lines.append(
|
||
f" {comp_label} {_truncate(ld.get('feature'), 18)}: "
|
||
f"{_fmt_num(ld.get('loading'), 3)}"
|
||
)
|
||
lines.append("")
|
||
|
||
km = models.get("kmeans")
|
||
if isinstance(km, dict):
|
||
lines.append("## KMeans")
|
||
head = f" mejor k = {_fmt_num(km.get('best_k'))}"
|
||
if km.get("silhouette") is not None:
|
||
head += f" silhouette {_fmt_num(km.get('silhouette'), 3)}"
|
||
lines.append(head)
|
||
sizes = km.get("cluster_sizes") or []
|
||
if sizes:
|
||
lines.append(" tamaños cluster: " + ", ".join(
|
||
_fmt_num(s) for s in sizes))
|
||
for sc in km.get("scores_by_k") or []:
|
||
if not isinstance(sc, dict):
|
||
continue
|
||
lines.append(
|
||
f" k={sc.get('k')}: silhouette {_fmt_num(sc.get('silhouette'), 3)}"
|
||
f" inertia {_fmt_num(sc.get('inertia'), 1)}"
|
||
)
|
||
lines.append("")
|
||
|
||
out = models.get("outliers")
|
||
if isinstance(out, dict):
|
||
lines.append("## Outliers multivariante (Isolation Forest)")
|
||
# outlier_pct del modelo ya viene en escala 0-100.
|
||
line = f" {_fmt_num(out.get('n_outliers'))} outliers"
|
||
if out.get("outlier_pct") is not None:
|
||
line += f" ({_fmt_num(out.get('outlier_pct'), 2)}%)"
|
||
if out.get("threshold") is not None:
|
||
line += f" umbral {_fmt_num(out.get('threshold'), 3)}"
|
||
lines.append(line)
|
||
lines.append("")
|
||
|
||
normality = models.get("normality")
|
||
if isinstance(normality, dict):
|
||
lines.append("## Normalidad (Jarque-Bera)")
|
||
for col_name, res in normality.items():
|
||
if not isinstance(res, dict):
|
||
continue
|
||
jb = res.get("jarque_bera") or {}
|
||
lines.append(
|
||
f" {_truncate(col_name, 18):<18} normal={res.get('is_normal')}"
|
||
f" JB p={_fmt_num(jb.get('p'), 4)}"
|
||
)
|
||
lines.append("")
|
||
|
||
note = models.get("note")
|
||
if note:
|
||
lines.append(f"nota: {note}")
|
||
|
||
if not [ln for ln in lines if ln.strip()]:
|
||
return 0
|
||
return _paginate_text(pdf, "Modelos", lines)
|
||
|
||
|
||
def _series_pages(pdf, series) -> int:
|
||
"""Render the time-series block: one compact summary per series column."""
|
||
if not isinstance(series, dict) or not series:
|
||
return 0
|
||
lines = []
|
||
for col, s in series.items():
|
||
if not isinstance(s, dict):
|
||
continue
|
||
lines.append(f"## {col}")
|
||
stat = s.get("stationarity") or {}
|
||
if stat.get("verdict") is not None:
|
||
lines.append(f" estacionariedad (ADF+KPSS): {stat.get('verdict')}")
|
||
acf = s.get("acf_pacf") or {}
|
||
if acf.get("is_autocorrelated") is not None:
|
||
lines.append(
|
||
" autocorrelada (Ljung-Box): "
|
||
+ ("sí" if acf.get("is_autocorrelated") else "no")
|
||
)
|
||
stl = s.get("stl") or {}
|
||
if stl.get("trend_strength") is not None:
|
||
lines.append(
|
||
f" fuerza tendencia (STL): {_fmt_num(stl.get('trend_strength'), 3)}")
|
||
if stl.get("seasonal_strength") is not None:
|
||
extra = (f" (periodo {stl.get('period')})"
|
||
if stl.get("period") is not None else "")
|
||
lines.append(
|
||
f" fuerza estacional (STL): "
|
||
f"{_fmt_num(stl.get('seasonal_strength'), 3)}{extra}")
|
||
elif stl.get("note"):
|
||
lines.append(f" STL: {_truncate(stl.get('note'), 60)}")
|
||
if s.get("levels_suggested"):
|
||
kind = s.get("levels_kind")
|
||
if kind == "returns":
|
||
lines.append(" sugerencia: convertir a retornos (serie financiera)")
|
||
elif kind == "differences":
|
||
lines.append(" sugerencia: trabajar sobre diferencias (serie física)")
|
||
else:
|
||
lines.append(" sugerencia: retornos o diferencias (serie de niveles)")
|
||
lines.append("")
|
||
if not [ln for ln in lines if ln.strip()]:
|
||
return 0
|
||
return _paginate_text(pdf, "Series temporales", lines)
|
||
|
||
|
||
def _caveats_pages(pdf, caveats) -> int:
|
||
"""Render the exploratory caveats block as a wrapped, readable list."""
|
||
cav_list = []
|
||
if isinstance(caveats, dict):
|
||
cav_list = caveats.get("caveats") or []
|
||
elif isinstance(caveats, list):
|
||
cav_list = caveats
|
||
lines = []
|
||
for cav in cav_list:
|
||
if not isinstance(cav, dict):
|
||
continue
|
||
topic = cav.get("topic") or cav.get("id") or ""
|
||
msg = cav.get("message") or ""
|
||
lines.append(f"## {topic}")
|
||
lines.extend(textwrap.wrap(str(msg), width=78) or [""])
|
||
lines.append("")
|
||
if not [ln for ln in lines if ln.strip()]:
|
||
return 0
|
||
return _paginate_text(pdf, "Avisos exploratorios", lines,
|
||
subtitle="el EDA genera hipótesis, no conclusiones")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# DB-level (relational) page builders — used by render_eda_pdf_relational.
|
||
# --------------------------------------------------------------------------- #
|
||
def _db_cover_page(pdf, db_profile: dict, title: str) -> int:
    """Cover for a DatabaseProfile: name, date, table count, FK count.

    Args:
        pdf: open ``PdfPages`` object the figure is saved into.
        db_profile: DatabaseProfile dict; read with ``.get`` only.
        title: heading override; when falsy, the heading is built from the
            base name of ``db_path``.

    Returns:
        Always 1, the number of pages written.
    """
    fig = plt.figure(figsize=_A5_PORTRAIT)
    try:
        db_path = db_profile.get("db_path") or "(base sin nombre)"
        heading = title or f"EDA base — {os.path.basename(str(db_path))}"
        fig.text(0.08, 0.82, heading, fontsize=20, fontweight="bold",
                 color=_INK, wrap=True)

        sub = [f"fuente: {_truncate(db_path, 44)}"]
        # Fall back to the render time when the profile carries no timestamp.
        when = db_profile.get("profiled_at") or datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M UTC")
        sub.append(f"generado: {when}")
        fig.text(0.08, 0.74, "\n".join(sub), fontsize=10, color=_MUTED, va="top")

        n_tables = db_profile.get("n_tables")
        fig.text(0.08, 0.58, f"{_fmt_num(n_tables)} tablas", fontsize=16,
                 color=_ACCENT, fontweight="bold")
        # Only a real list/tuple is counted; any other shape counts as zero.
        fks = db_profile.get("fk_candidates")
        n_fk = len(fks) if isinstance(fks, (list, tuple)) else 0
        fig.text(0.08, 0.51, f"{_fmt_num(n_fk)} relaciones FK candidatas",
                 fontsize=12, color=_INK)

        fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil",
                 fontsize=8, color=_MUTED, style="italic")
        pdf.savefig(fig)
    finally:
        # Release the figure even if drawing or saving raised.
        plt.close(fig)
    return 1
|
||
|
||
|
||
def _db_tables_page(pdf, db_profile: dict) -> int:
|
||
"""One text page summarising every table (rows / cols / quality)."""
|
||
tables = db_profile.get("tables") or []
|
||
if not isinstance(tables, list) or not tables:
|
||
return 0
|
||
lines = [f"{'tabla':<24}{'filas':>9}{'cols':>6}{'cal':>6}", "-" * 45]
|
||
for t in tables:
|
||
if not isinstance(t, dict):
|
||
continue
|
||
lines.append(
|
||
f"{_truncate(t.get('table'), 24):<24}"
|
||
f"{_fmt_num(t.get('n_rows')):>9}"
|
||
f"{_fmt_num(t.get('n_cols')):>6}"
|
||
f"{_fmt_num(t.get('quality_score'), 1):>6}"
|
||
)
|
||
return _paginate_text(pdf, "Tablas", lines, subtitle="resumen por tabla")
|
||
|
||
|
||
def _db_fk_page(pdf, db_profile: dict) -> int:
    """FK candidates table + the join-graph mermaid text.

    Args:
        pdf: open ``PdfPages`` object the pages are saved into.
        db_profile: DatabaseProfile dict; ``fk_candidates`` (list of dicts)
            and ``join_graph["mermaid"]`` are read with ``.get``.

    Returns:
        Number of pages written. A placeholder line is rendered when there
        are no FK candidates.
    """
    fks = db_profile.get("fk_candidates") or []
    lines = []
    if isinstance(fks, list) and fks:
        lines.append(f"{'from':<26}{'to':<26}{'incl':>7}")
        lines.append("-" * 59)
        for fk in fks:
            if not isinstance(fk, dict):
                continue
            frm = f"{fk.get('from_table')}.{fk.get('from_col')}"
            to = f"{fk.get('to_table')}.{fk.get('to_col')}"
            # _fmt_num formats numbers, stringifies anything else and shows a
            # missing inclusion as an em dash instead of the text "None".
            inc_s = _fmt_num(fk.get("inclusion"), 3)
            lines.append(
                f"{_truncate(frm, 25):<26}{_truncate(to, 25):<26}{inc_s:>7}")
    else:
        lines.append("(sin relaciones FK candidatas detectadas)")

    join_graph = db_profile.get("join_graph")
    mermaid = join_graph.get("mermaid") if isinstance(join_graph, dict) else None
    if mermaid:
        lines.append("")
        lines.append("## join graph (mermaid)")
        # Clip each mermaid source line individually.
        for raw in str(mermaid).splitlines():
            lines.append(_truncate(raw, 72))
    return _paginate_text(pdf, "Relaciones inter-tabla", lines,
                          subtitle="FK candidatas + join graph")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# Public entry point
|
||
# --------------------------------------------------------------------------- #
|
||
def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
    """Render a TableProfile dict into a portable, mobile-readable multi-page PDF.

    The report is laid out for reading on a phone: A5 portrait pages, single
    column, large type, Tufte-style high data-ink charts (real histograms as
    small multiples, top-k bars, an association heatmap). Every profile key is
    read defensively and only present sections are rendered; unknown top-level
    blocks are dumped on a forward-compat page rather than dropped.

    Args:
        profile: TableProfile dict from the `eda` capability group (the dict
            returned by ``profile_table`` under ``profile``). May have many keys
            absent or None; a None/empty profile still yields a 1-page PDF.
        out_path: filesystem path where the PDF is written. Parent directories
            are created if missing. A value that is not path-like (e.g. None)
            yields an error dict instead of an exception.
        title: optional report title for the cover. Defaults to
            ``"EDA — <table>"``.

    Returns:
        dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}.
        On a fatal write error, ``pdf_path`` is None and ``note`` explains why.
    """
    if profile is None:
        profile = {}
    if not isinstance(profile, dict):
        return {"pdf_path": None, "n_pages": 0,
                "note": f"profile no es dict: {type(profile).__name__}"}

    columns = profile.get("columns") or []
    if not isinstance(columns, list):
        columns = []

    notes = []
    n_pages = 0

    try:
        parent = os.path.dirname(os.path.abspath(out_path))
        os.makedirs(parent, exist_ok=True)
    except (OSError, TypeError, ValueError) as e:
        # TypeError: out_path is not str/bytes/PathLike (e.g. None).
        # ValueError: e.g. an embedded NUL byte in the path.
        # Both must become an error dict to honour the never-raises contract.
        return {"pdf_path": None, "n_pages": 0,
                "note": f"no se pudo crear el directorio destino: {e}"}

    # Each section is isolated: a failure in one never aborts the whole PDF.
    builders = [
        ("cover", lambda p: _cover_page(p, profile, title)),
        ("overview", lambda p: _overview_page(p, profile)),
        ("numeric", lambda p: _numeric_pages(p, columns)),
        ("categorical", lambda p: _categorical_pages(p, columns)),
        ("quality", lambda p: _quality_page(p, columns)),
        ("correlations", lambda p: _correlations_page(p, profile.get("correlations"))),
        ("models", lambda p: _models_pages(p, profile.get("models"))),
        ("series", lambda p: _series_pages(p, profile.get("series"))),
        ("llm", lambda p: _llm_pages(p, profile.get("llm"))),
        ("caveats", lambda p: _caveats_pages(p, profile.get("caveats"))),
        ("generic", lambda p: _generic_pages(p, profile)),
    ]

    try:
        # Tufte-ish defaults shared with the relational renderer (module-level _RC).
        with plt.rc_context(_RC):
            with PdfPages(out_path) as pdf:
                for name, build in builders:
                    try:
                        n_pages += build(pdf) or 0
                    except Exception as e:  # noqa: BLE001 — one bad section never aborts.
                        notes.append(f"sección '{name}' omitida: {e}")
                # Guarantee at least one page so the PDF is always valid.
                if n_pages == 0:
                    n_pages += _text_page(
                        pdf, title or "EDA", ["(perfil vacío — sin secciones)"]
                    )
    except Exception as e:  # noqa: BLE001
        return {"pdf_path": None, "n_pages": 0,
                "note": f"fallo al escribir el PDF: {e}"}

    note = f"{n_pages} páginas"
    if notes:
        note += " · " + "; ".join(notes)
    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|
||
|
||
|
||
def render_eda_pdf_relational(db_profile: dict, out_path: str,
                              title: str = None) -> dict:
    """Render a DatabaseProfile dict into a portable, mobile-readable PDF.

    DB-level sibling of :func:`render_eda_pdf`: instead of a single table it
    summarises a whole database (the dict ``profile_database`` returns under
    ``db_profile``). Pages are A5 portrait, single column, large type — built to
    be read on a phone. Three sections: a cover (table + FK counts), a per-table
    summary (rows / cols / quality) and the inter-table relations (FK candidates
    plus the join-graph mermaid text). Every key is read defensively and any
    section that fails is noted, never aborting the whole render.

    Args:
        db_profile: DatabaseProfile dict from ``profile_database`` (the value
            under ``db_profile``). May have keys absent or None; a None/empty
            profile still yields a 1-page PDF.
        out_path: filesystem path where the PDF is written. Parent directories
            are created if missing. A value that is not path-like (e.g. None)
            yields an error dict instead of an exception.
        title: optional cover title. Defaults to ``"EDA base — <db filename>"``.

    Returns:
        dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}.
        On a fatal write error, ``pdf_path`` is None and ``note`` explains why.
    """
    if db_profile is None:
        db_profile = {}
    if not isinstance(db_profile, dict):
        return {"pdf_path": None, "n_pages": 0,
                "note": f"db_profile no es dict: {type(db_profile).__name__}"}

    try:
        parent = os.path.dirname(os.path.abspath(out_path))
        os.makedirs(parent, exist_ok=True)
    except (OSError, TypeError, ValueError) as e:
        # TypeError: out_path is not str/bytes/PathLike (e.g. None).
        # ValueError: e.g. an embedded NUL byte in the path.
        # Both must become an error dict to honour the never-raises contract.
        return {"pdf_path": None, "n_pages": 0,
                "note": f"no se pudo crear el directorio destino: {e}"}

    notes = []
    n_pages = 0

    # Each section is isolated: a failure in one never aborts the whole PDF.
    builders = [
        ("cover", lambda p: _db_cover_page(p, db_profile, title)),
        ("tables", lambda p: _db_tables_page(p, db_profile)),
        ("relations", lambda p: _db_fk_page(p, db_profile)),
    ]

    try:
        with plt.rc_context(_RC):
            with PdfPages(out_path) as pdf:
                for name, build in builders:
                    try:
                        n_pages += build(pdf) or 0
                    except Exception as e:  # noqa: BLE001 — one bad section never aborts.
                        notes.append(f"sección '{name}' omitida: {e}")
                # Guarantee at least one page so the PDF is always valid.
                if n_pages == 0:
                    n_pages += _text_page(
                        pdf, title or "EDA base", ["(base vacía — sin secciones)"]
                    )
    except Exception as e:  # noqa: BLE001
        return {"pdf_path": None, "n_pages": 0,
                "note": f"fallo al escribir el PDF: {e}"}

    note = f"{n_pages} páginas"
    if notes:
        note += " · " + "; ".join(notes)
    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|