feat(eda): series temporales + rigor anti-data-mining + PDF movil + /eda + benchmark issues
Bloque del grupo eda (sesion ausente EDA-benchmark): - 8 funciones nuevas: adf_kpss_stationarity, acf_pacf, stl_decompose, to_returns, fdr_correction, suggest_reexpression, exploratory_caveats, render_eda_pdf - integracion: profile_table (run_series, emit_pdf), association_matrix (FDR Benjamini-Hochberg), render_eda_markdown (secciones series/reexpresion/caveats) - slash commands /eda y /capitulos - issues 0173-0177: mejoras del /eda derivadas del benchmark sobre 12 datasets reales (outlier_pct x100, periodo estacional, FK inference, render models, tipos id-like) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,626 @@
|
||||
"""render_eda_pdf — Portable, mobile-readable PDF report of a TableProfile (eda group).
|
||||
|
||||
Impure function (writes a file): takes a TableProfile dict from the `eda`
|
||||
capability group and renders a MULTI-PAGE PDF designed to be read and explored
|
||||
on a phone screen. It is the 4th output of the eda workflow, next to the
|
||||
markdown report, the JSON sidecar and the executed Jupyter notebook.
|
||||
|
||||
Design follows Edward Tufte, "The Visual Display of Quantitative Information":
|
||||
high data-ink ratio (no chartjunk, despined axes, light grids), small multiples
|
||||
for per-column histograms, and graphical integrity (y-axes start at 0, no
|
||||
misleading truncation). Pages are A5 portrait, single column, with a large,
|
||||
legible typeface so the report stays readable on a small display.
|
||||
|
||||
Every key of the profile is read defensively with ``.get(...)`` and only the
|
||||
sections actually present are rendered. The function is forward-compatible: if
|
||||
the profile carries blocks this renderer does not know about (e.g. ``models``,
|
||||
time series, ``caveats`` added by sibling functions), they are dumped generically
|
||||
on a final page instead of being ignored or crashing the render.
|
||||
|
||||
dict-no-throw contract of the eda group: it NEVER raises. Any failure of a single
|
||||
section is caught and noted; the function always returns a dict with the path,
|
||||
the page count and a human note.
|
||||
|
||||
Engine: matplotlib ``PdfPages`` (already in ``python/.venv``) — zero new deps.
|
||||
"""
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import matplotlib
|
||||
|
||||
# Headless backend: this runs in agents/CI without a display.
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
import numpy as np # noqa: E402
|
||||
from matplotlib.backends.backend_pdf import PdfPages # noqa: E402
|
||||
|
||||
# A5 portrait in inches (148 x 210 mm). Single column, tall, phone-friendly.
|
||||
_A5_PORTRAIT = (5.83, 8.27)
|
||||
|
||||
# Number of per-column small multiples stacked vertically on one page.
|
||||
_NUMERIC_PER_PAGE = 3
|
||||
_CATEGORICAL_PER_PAGE = 3
|
||||
|
||||
# Top-of-profile keys this renderer handles explicitly. Anything else found at
|
||||
# the top level of the profile is dumped on the forward-compat "Otros" page so
|
||||
# new sections added by sibling functions still reach the reader.
|
||||
_KNOWN_TOP_KEYS = {
|
||||
"table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
|
||||
"duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols",
|
||||
"all_null_cols", "quality_score", "type_breakdown", "key_candidates",
|
||||
"columns", "correlations", "llm",
|
||||
}
|
||||
|
||||
# Restrained, high-contrast palette: a single accent reads cleanly on a phone.
|
||||
_INK = "#1b1b1b"
|
||||
_ACCENT = "#2a6f97"
|
||||
_MUTED = "#8a8a8a"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Small formatting + Tufte helpers
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
"""Format a number compactly; fall back to str for non-numerics/None."""
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return str(value)
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}"
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "NaN"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 1) -> str:
|
||||
"""Format a fraction (0-1) as 'NN.N%'. Returns '—' for None."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
return f"{num * 100:.{decimals}f}%"
|
||||
|
||||
|
||||
def _despine(ax) -> None:
    """Remove the top/right spines and tone down the remaining ones.

    The left and bottom spines, and the ticks, are recoloured with the muted
    grey and thinned so the data marks carry the visual weight; the axes title
    is set to the ink colour.
    """
    spines = ax.spines
    spines["top"].set_visible(False)
    spines["right"].set_visible(False)
    for name in ("left", "bottom"):
        edge = spines[name]
        edge.set_color(_MUTED)
        edge.set_linewidth(0.6)
    ax.tick_params(colors=_MUTED, labelsize=7, length=2)
    ax.title.set_color(_INK)
|
||||
|
||||
|
||||
def _truncate(text, width: int = 22) -> str:
|
||||
"""Clip an arbitrary value to a short label for tight phone layouts."""
|
||||
s = str(text) if text is not None else "—"
|
||||
return s if len(s) <= width else s[: width - 1] + "…"
|
||||
|
||||
|
||||
def _text_page(pdf, title: str, lines: list, subtitle: str = None) -> int:
    """Render one text page (bold title, optional subtitle, monospace body).

    Args:
        pdf: object exposing ``savefig(fig)`` (the open ``PdfPages``).
        title: page heading.
        lines: body lines; each one is converted with ``str`` before joining.
        subtitle: optional muted line under the heading.

    Returns:
        1, the number of pages written.
    """

    def _literal(text) -> str:
        # Matplotlib parses text holding an even number of unescaped "$" as
        # mathtext; escaping them keeps data such as "$5 to $10" verbatim.
        return str(text).replace("$", r"\$")

    fig = plt.figure(figsize=_A5_PORTRAIT)
    try:
        fig.text(0.08, 0.94, _literal(title), fontsize=16, fontweight="bold",
                 color=_INK)
        if subtitle:
            fig.text(0.08, 0.905, _literal(subtitle), fontsize=9, color=_MUTED)
        body = "\n".join(_literal(line) for line in lines)
        fig.text(
            0.08, 0.88, body, fontsize=9.5, color=_INK, family="monospace",
            va="top", ha="left", linespacing=1.5,
        )
        pdf.savefig(fig)
    finally:
        # Always release the figure, even when drawing or saving fails, so a
        # failed section does not leave an open figure behind in pyplot.
        plt.close(fig)
    return 1
|
||||
|
||||
|
||||
def _kv_lines(rows: list, key_width: int = 18) -> list:
|
||||
"""Format [label, value] rows as aligned 'label : value' monospace lines."""
|
||||
out = []
|
||||
for label, value in rows:
|
||||
out.append(f"{str(label):<{key_width}}: {value}")
|
||||
return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Page builders (each fully defensive, each returns the number of pages it made)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _cover_page(pdf, profile: dict, title: str) -> int:
    """Write the cover page: heading, source/date, shape and quality score.

    Args:
        pdf: object exposing ``savefig(fig)`` (the open ``PdfPages``).
        profile: profile dict; only ``table``, ``source``, ``profiled_at``,
            ``n_rows``, ``n_cols`` and ``quality_score`` are read, all
            through ``.get``.
        title: heading for the cover; when falsy ``"EDA — <table>"`` is used.

    Returns:
        1, the number of pages written.
    """
    fig = plt.figure(figsize=_A5_PORTRAIT)
    try:
        table = profile.get("table") or "(tabla sin nombre)"
        heading = title or f"EDA — {table}"
        fig.text(0.08, 0.82, heading, fontsize=22, fontweight="bold",
                 color=_INK, wrap=True)

        sub = []
        src = profile.get("source")
        if src:
            sub.append(f"fuente: {_truncate(src, 40)}")
        # Fall back to the render time when the profile carries no timestamp.
        when = profile.get("profiled_at") or datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M UTC"
        )
        sub.append(f"generado: {when}")
        fig.text(0.08, 0.76, "\n".join(sub), fontsize=10, color=_MUTED,
                 va="top")

        n_rows = profile.get("n_rows")
        n_cols = profile.get("n_cols")
        shape = f"{_fmt_num(n_rows)} filas × {_fmt_num(n_cols)} columnas"
        fig.text(0.08, 0.60, shape, fontsize=15, color=_ACCENT,
                 fontweight="bold")

        # The oversized score block is drawn only when a score is present.
        score = profile.get("quality_score")
        if score is not None:
            fig.text(0.08, 0.42, "calidad", fontsize=12, color=_MUTED)
            fig.text(0.08, 0.31, _fmt_num(score), fontsize=60,
                     fontweight="bold", color=_INK)
            fig.text(0.08, 0.25, "sobre 100", fontsize=12, color=_MUTED)

        fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil",
                 fontsize=8, color=_MUTED, style="italic")
        pdf.savefig(fig)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    return 1
|
||||
|
||||
|
||||
def _overview_page(pdf, profile: dict) -> int:
    """Write the overview key/value page: shape, duplicates, nulls, types, keys.

    Only metrics present (not ``None`` / non-empty) in ``profile`` are listed.
    Name lists are stringified item by item, so non-string column names or
    composite key candidates do not make the page fail.

    Returns:
        The number of pages written (the value returned by ``_text_page``).
    """

    def _names(items) -> str:
        # Accept any collection of names (or a single value) without raising.
        if isinstance(items, (list, tuple, set, frozenset)):
            return ", ".join(str(item) for item in items)
        return str(items)

    rows = []
    if profile.get("n_rows") is not None:
        rows.append(["Filas", _fmt_num(profile.get("n_rows"))])
    if profile.get("n_cols") is not None:
        rows.append(["Columnas", _fmt_num(profile.get("n_cols"))])
    if profile.get("size_bytes") is not None:
        rows.append(["Tamaño (bytes)", _fmt_num(profile.get("size_bytes"))])
    if profile.get("duplicate_rows") is not None:
        dup = _fmt_num(profile.get("duplicate_rows"))
        if profile.get("duplicate_pct") is not None:
            dup += f" ({_fmt_pct(profile.get('duplicate_pct'))})"
        rows.append(["Filas duplicadas", dup])
    if profile.get("null_cell_pct") is not None:
        rows.append(["Celdas nulas", _fmt_pct(profile.get("null_cell_pct"))])
    if profile.get("quality_score") is not None:
        rows.append(["Calidad", _fmt_num(profile.get("quality_score"))])

    # Types with a zero/empty count are left out of the summary.
    type_breakdown = profile.get("type_breakdown")
    if isinstance(type_breakdown, dict):
        tb = ", ".join(f"{k}: {v}" for k, v in type_breakdown.items() if v)
        if tb:
            rows.append(["Tipos", tb])

    constant_cols = profile.get("constant_cols") or []
    if constant_cols:
        rows.append(["Columnas constantes", _truncate(_names(constant_cols), 40)])
    all_null_cols = profile.get("all_null_cols") or []
    if all_null_cols:
        rows.append(["Columnas all-null", _truncate(_names(all_null_cols), 40)])
    key_candidates = profile.get("key_candidates") or []
    if key_candidates:
        rows.append(["Candidatos a clave", _truncate(_names(key_candidates), 40)])

    if not rows:
        rows.append(["(sin métricas de overview)", ""])

    return _text_page(pdf, "Overview", _kv_lines(rows, key_width=20))
|
||||
|
||||
|
||||
def _numeric_pages(pdf, columns: list) -> int:
    """Write small-multiple histogram pages, one axes per numeric column.

    A column qualifies when it is a dict whose ``numeric`` entry is a dict
    with a non-empty ``histogram``. Columns are stacked ``_NUMERIC_PER_PAGE``
    per page; a shorter final chunk simply gets fewer (taller) axes.

    Returns:
        The number of pages written (0 when no column qualifies).
    """
    numeric_cols = [
        c for c in columns
        if isinstance(c, dict)
        # Guard the nested lookup: a non-dict "numeric" value would raise.
        and isinstance(c.get("numeric"), dict)
        and c["numeric"].get("histogram")
    ]
    if not numeric_cols:
        return 0

    pages = 0
    for start in range(0, len(numeric_cols), _NUMERIC_PER_PAGE):
        chunk = numeric_cols[start:start + _NUMERIC_PER_PAGE]
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(
            len(chunk), 1, figsize=_A5_PORTRAIT, squeeze=False,
        )
        try:
            fig.suptitle("Distribuciones numéricas", fontsize=14,
                         fontweight="bold", color=_INK, x=0.08, ha="left",
                         y=0.98)
            for ax, col in zip(axes[:, 0], chunk):
                _draw_histogram(ax, col)
            # Leave the top 5% of the page for the suptitle.
            fig.tight_layout(rect=[0, 0, 1, 0.95])
            pdf.savefig(fig)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)
        pages += 1
    return pages
|
||||
|
||||
|
||||
def _draw_histogram(ax, col: dict) -> None:
    """Draw one column's histogram from its ``{lo, hi, count}`` bins.

    Bins that are not dicts, or whose ``lo``/``hi``/``count`` are missing,
    non-numeric or non-finite, are skipped instead of aborting the chart. When
    no usable bin remains the axes is replaced by a short "no data" message.

    Args:
        ax: the matplotlib axes to draw on.
        col: column profile dict; reads ``name`` and the ``numeric`` sub-dict
            (``histogram``, ``median``, ``mean``, ``std``, ``outlier_pct``).
    """
    num = col.get("numeric") or {}
    hist = num.get("histogram") or []
    infinities = (float("inf"), float("-inf"))
    lefts, widths, counts = [], [], []
    for b in hist:
        if not isinstance(b, dict):
            continue
        try:
            lo = float(b.get("lo"))
            hi = float(b.get("hi"))
            cnt = float(b.get("count") or 0)
        except (TypeError, ValueError):
            # Missing (None) or non-numeric edge/count: skip this bin only.
            continue
        if any(v != v or v in infinities for v in (lo, hi, cnt)):
            continue
        w = hi - lo
        if w <= 0:
            # Degenerate bin (hi <= lo): give it a tiny positive width so the
            # bar can still be drawn.
            w = max(abs(lo) * 1e-6, 1e-6)
        lefts.append(lo)
        widths.append(w)
        counts.append(cnt)

    name = col.get("name") or "(col)"
    if not counts:
        ax.axis("off")
        ax.text(0.5, 0.5, f"{name}: sin datos numéricos", ha="center",
                va="center", fontsize=8, color=_MUTED, transform=ax.transAxes)
        return

    ax.bar(lefts, counts, width=widths, align="edge", color=_ACCENT,
           edgecolor="white", linewidth=0.3)
    # Graphical integrity: the count axis starts at 0, never truncated.
    ax.set_ylim(bottom=0)
    _despine(ax)
    ax.set_title(_truncate(name, 28), fontsize=10, loc="left", pad=4)
    ax.grid(axis="y", color=_MUTED, alpha=0.15, linewidth=0.5)
    ax.set_axisbelow(True)

    # Median reference line (a single light marker, no chartjunk).
    median = num.get("median")
    if isinstance(median, (int, float)) and not isinstance(median, bool):
        ax.axvline(median, color=_INK, linewidth=0.8, alpha=0.5)

    # One compact annotation line: mean / std / outliers.
    bits = []
    if num.get("mean") is not None:
        bits.append(f"μ={_fmt_num(num.get('mean'))}")
    if num.get("std") is not None:
        bits.append(f"σ={_fmt_num(num.get('std'))}")
    if num.get("outlier_pct") is not None:
        # NOTE(review): the value is printed as-is followed by "%", i.e. it is
        # assumed to be on a 0-100 scale already. If the profiler emits a 0-1
        # fraction here this should use _fmt_pct — confirm with the producer.
        bits.append(f"outliers={_fmt_num(num.get('outlier_pct'), 1)}%")
    if bits:
        ax.text(0.99, 0.92, "  ".join(bits), transform=ax.transAxes,
                ha="right", va="top", fontsize=7, color=_MUTED)
|
||||
|
||||
|
||||
def _categorical_pages(pdf, columns: list) -> int:
    """Write top-k bar-chart pages, one axes per categorical column.

    A column qualifies when it is a dict whose ``categorical`` entry is a dict
    with a non-empty ``top`` list. Columns are stacked
    ``_CATEGORICAL_PER_PAGE`` per page; a shorter final chunk gets fewer
    (taller) axes.

    Returns:
        The number of pages written (0 when no column qualifies).
    """
    cat_cols = [
        c for c in columns
        if isinstance(c, dict)
        # Guard the nested lookup: a non-dict "categorical" value would raise.
        and isinstance(c.get("categorical"), dict)
        and c["categorical"].get("top")
    ]
    if not cat_cols:
        return 0

    pages = 0
    for start in range(0, len(cat_cols), _CATEGORICAL_PER_PAGE):
        chunk = cat_cols[start:start + _CATEGORICAL_PER_PAGE]
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(
            len(chunk), 1, figsize=_A5_PORTRAIT, squeeze=False,
        )
        try:
            fig.suptitle("Categóricas (top-k)", fontsize=14, fontweight="bold",
                         color=_INK, x=0.08, ha="left", y=0.98)
            for ax, col in zip(axes[:, 0], chunk):
                _draw_topk_bars(ax, col)
            # Leave the top 5% of the page for the suptitle.
            fig.tight_layout(rect=[0, 0, 1, 0.95])
            pdf.savefig(fig)
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)
        pages += 1
    return pages
|
||||
|
||||
|
||||
def _draw_topk_bars(ax, col: dict) -> None:
    """Plot the first ten ``top`` entries of a categorical column as bars.

    Each entry is expected to be a dict with ``value`` and ``count``; entries
    of any other type are ignored. When nothing is left to plot, the axes is
    switched off and a short message is shown instead.
    """
    cat_block = col.get("categorical") or {}
    entries = [
        entry for entry in (cat_block.get("top") or [])[:10]
        if isinstance(entry, dict)
    ]
    col_name = col.get("name") or "(col)"

    if not entries:
        ax.axis("off")
        ax.text(0.5, 0.5, f"{col_name}: sin categorías", ha="center", va="center",
                fontsize=8, color=_MUTED, transform=ax.transAxes)
        return

    # barh draws position 0 at the bottom, so the incoming order is flipped to
    # put the first entry of the list at the top of the chart.
    flipped = list(reversed(entries))
    tick_labels = [_truncate(entry.get("value"), 20) for entry in flipped]
    bar_lengths = [entry.get("count") or 0 for entry in flipped]
    positions = np.arange(len(bar_lengths))

    ax.barh(positions, bar_lengths, color=_ACCENT, edgecolor="white",
            linewidth=0.3)
    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels, fontsize=7)
    # Bars are anchored at zero so their length encodes the count honestly.
    ax.set_xlim(left=0)
    _despine(ax)
    ax.set_title(_truncate(col_name, 28), fontsize=10, loc="left", pad=4)
    ax.grid(axis="x", color=_MUTED, alpha=0.15, linewidth=0.5)
    ax.set_axisbelow(True)

    entropy = cat_block.get("entropy")
    if entropy is not None:
        # Small annotation just above the top-right corner of the axes.
        ax.text(0.99, 1.02, f"entropía={_fmt_num(entropy)}",
                transform=ax.transAxes, ha="right", va="bottom", fontsize=7,
                color=_MUTED)
|
||||
|
||||
|
||||
def _quality_page(pdf, columns: list) -> int:
    """Write the per-column quality table, worst score first.

    Columns without a ``quality_score`` are left out. The table is split over
    as many pages as needed (the header row is repeated on each) so wide
    tables do not run off the bottom of a single page.

    Args:
        pdf: object exposing ``savefig(fig)`` (the open ``PdfPages``).
        columns: list of column profile dicts; reads ``name``,
            ``quality_score`` and ``issues`` (or ``flags``).

    Returns:
        The number of pages written (0 when no column has a score).
    """
    scored = [
        c for c in columns
        if isinstance(c, dict) and c.get("quality_score") is not None
    ]
    if not scored:
        return 0

    def _sort_key(col):
        # Numeric scores sort ascending; NaN or non-numeric scores go last
        # instead of making the comparison raise.
        try:
            score = float(col.get("quality_score"))
        except (TypeError, ValueError):
            return (1, 0.0)
        if score != score:
            return (1, 0.0)
        return (0, score)

    scored = sorted(scored, key=_sort_key)

    header = [f"{'columna':<20} {'score':>6}  problemas", "-" * 52]
    rows = []
    for col in scored:
        issues = col.get("issues") or col.get("flags") or []
        if isinstance(issues, (list, tuple)):
            issues_s = ", ".join(str(issue) for issue in issues)
        else:
            issues_s = str(issues)
        rows.append(
            f"{_truncate(col.get('name'), 20):<20} "
            f"{_fmt_num(col.get('quality_score'), 1):>6}  {_truncate(issues_s, 24)}"
        )

    # 32 rows plus the 2 header lines match the 34-line budget used by the
    # other paginated text pages.
    rows_per_page = 32
    pages = 0
    for start in range(0, len(rows), rows_per_page):
        first = pages == 0
        pages += _text_page(
            pdf,
            "Calidad" if first else "Calidad (cont.)",
            header + rows[start:start + rows_per_page],
            subtitle="ordenado de peor a mejor calidad" if first else None,
        )
    return pages
|
||||
|
||||
|
||||
def _correlations_page(pdf, correlations) -> int:
    """Write a heatmap of the association matrix rebuilt from a pairs list.

    Args:
        pdf: object exposing ``savefig(fig)`` (the open ``PdfPages``).
        correlations: either a list of pair dicts, or a dict holding that list
            under ``pairs`` (or ``strong``). Each pair names its two columns
            with ``a``/``b`` (or ``col_a``/``col_b``) and its coefficient with
            ``value`` (or ``corr``).

    Returns:
        1 when a page was written, 0 when there is nothing to plot (no pairs,
        or fewer than two distinct labels).
    """
    if not correlations:
        return 0
    pairs = correlations
    if isinstance(correlations, dict):
        pairs = correlations.get("pairs") or correlations.get("strong") or []
    if not pairs:
        return 0

    def _first_present(pair, *keys):
        # First key whose value is not None; falsy labels such as 0 or ""
        # count as present, matching how labels are collected below.
        for key in keys:
            found = pair.get(key)
            if found is not None:
                return found
        return None

    # Collect labels in first-seen order; the dict gives O(1) membership.
    labels = []
    idx = {}
    for p in pairs:
        if not isinstance(p, dict):
            continue
        for key in ("a", "col_a", "b", "col_b"):
            v = p.get(key)
            if v is not None and v not in idx:
                idx[v] = len(labels)
                labels.append(v)
    if len(labels) < 2:
        return 0

    n = len(labels)
    # Unknown cells stay NaN (drawn blank); the diagonal is set to 1.
    mat = np.full((n, n), np.nan)
    np.fill_diagonal(mat, 1.0)
    for p in pairs:
        if not isinstance(p, dict):
            continue
        a = _first_present(p, "a", "col_a")
        b = _first_present(p, "b", "col_b")
        val = _first_present(p, "value", "corr")
        if a in idx and b in idx and val is not None:
            try:
                fv = float(val)
            except (TypeError, ValueError):
                continue
            # Mirror the value so the matrix is symmetric.
            mat[idx[a], idx[b]] = fv
            mat[idx[b], idx[a]] = fv

    fig, ax = plt.subplots(figsize=_A5_PORTRAIT)
    try:
        fig.suptitle("Correlaciones / asociación", fontsize=14,
                     fontweight="bold", color=_INK, x=0.08, ha="left", y=0.97)
        # Diverging colour map pinned to [-1, 1] so 0 sits at the neutral colour.
        im = ax.imshow(mat, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
        ax.set_xticks(np.arange(n))
        ax.set_yticks(np.arange(n))
        ax.set_xticklabels([_truncate(lab, 12) for lab in labels], rotation=60,
                           ha="right", fontsize=7, color=_INK)
        ax.set_yticklabels([_truncate(lab, 14) for lab in labels], fontsize=7,
                           color=_INK)
        ax.tick_params(length=0)
        for side in ("top", "right", "left", "bottom"):
            ax.spines[side].set_visible(False)
        # Annotate cells only when few columns (keeps it legible on a phone).
        if n <= 8:
            for i in range(n):
                for j in range(n):
                    if not np.isnan(mat[i, j]):
                        # White text on the saturated (|value| >= 0.6) cells.
                        ax.text(j, i, _fmt_num(mat[i, j], 2), ha="center",
                                va="center", fontsize=6,
                                color=_INK if abs(mat[i, j]) < 0.6 else "white")
        cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        cbar.ax.tick_params(labelsize=7)
        fig.tight_layout(rect=[0, 0, 1, 0.94])
        pdf.savefig(fig)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    return 1
|
||||
|
||||
|
||||
def _llm_pages(pdf, llm) -> int:
|
||||
"""Render the LLM block (data dictionary / summary) as wrapped text pages."""
|
||||
if not llm:
|
||||
return 0
|
||||
lines = []
|
||||
if isinstance(llm, dict):
|
||||
for key, value in llm.items():
|
||||
if value is None:
|
||||
continue
|
||||
lines.append(f"## {key}")
|
||||
lines.extend(_wrap_value(value))
|
||||
lines.append("")
|
||||
else:
|
||||
lines.extend(_wrap_value(llm))
|
||||
if not lines:
|
||||
return 0
|
||||
return _paginate_text(pdf, "Análisis LLM", lines)
|
||||
|
||||
|
||||
def _generic_pages(pdf, profile: dict) -> int:
|
||||
"""Forward-compat: dump unknown top-level sections so they still reach the reader."""
|
||||
extras = {
|
||||
k: v for k, v in profile.items()
|
||||
if k not in _KNOWN_TOP_KEYS and v is not None
|
||||
}
|
||||
if not extras:
|
||||
return 0
|
||||
lines = []
|
||||
for key, value in extras.items():
|
||||
lines.append(f"## {key}")
|
||||
lines.extend(_wrap_value(value))
|
||||
lines.append("")
|
||||
if not lines:
|
||||
return 0
|
||||
return _paginate_text(pdf, "Otras secciones", lines,
|
||||
subtitle="bloques nuevos del profile (forward-compat)")
|
||||
|
||||
|
||||
def _wrap_value(value, width: int = 78) -> list:
|
||||
"""Flatten an arbitrary value into wrapped, readable text lines."""
|
||||
out = []
|
||||
if isinstance(value, dict):
|
||||
for k, v in value.items():
|
||||
out.append(f"- {k}: {_truncate(_scalar(v), 64)}")
|
||||
elif isinstance(value, (list, tuple)):
|
||||
for item in value:
|
||||
if isinstance(item, dict):
|
||||
out.append("- " + _truncate(
|
||||
", ".join(f"{k}={_scalar(v)}" for k, v in item.items()), 70))
|
||||
else:
|
||||
out.append(f"- {_truncate(_scalar(item), 72)}")
|
||||
else:
|
||||
for line in textwrap.wrap(str(value), width=width) or [""]:
|
||||
out.append(line)
|
||||
return out
|
||||
|
||||
|
||||
def _scalar(v) -> str:
|
||||
"""Compact one-line representation of a scalar/nested value."""
|
||||
if isinstance(v, float):
|
||||
return _fmt_num(v)
|
||||
if isinstance(v, (dict, list, tuple)):
|
||||
return _truncate(str(v), 60)
|
||||
return str(v)
|
||||
|
||||
|
||||
def _paginate_text(pdf, title: str, lines: list, subtitle: str = None,
|
||||
per_page: int = 34) -> int:
|
||||
"""Split a long list of text lines across several text pages."""
|
||||
pages = 0
|
||||
for start in range(0, len(lines), per_page):
|
||||
chunk = lines[start:start + per_page]
|
||||
page_title = title if pages == 0 else f"{title} (cont.)"
|
||||
pages += _text_page(pdf, page_title, chunk,
|
||||
subtitle=subtitle if pages == 0 else None)
|
||||
return pages
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Public entry point
|
||||
# --------------------------------------------------------------------------- #
|
||||
def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
    """Render a TableProfile dict into a portable, mobile-readable multi-page PDF.

    The report is laid out for reading on a phone: A5 portrait pages, single
    column, large type, Tufte-style high data-ink charts (real histograms as
    small multiples, top-k bars, an association heatmap). Every profile key is
    read defensively and only present sections are rendered; unknown top-level
    blocks are dumped on a forward-compat page rather than dropped.

    Args:
        profile: TableProfile dict from the `eda` capability group (the dict
            returned by ``profile_table`` under ``profile``). May have many keys
            absent or None; a None/empty profile still yields a PDF.
        out_path: filesystem path where the PDF is written. Parent directories
            are created if missing.
        title: optional report title for the cover. Defaults to
            ``"EDA — <table>"``.

    Returns:
        dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}.
        When the profile is not a dict, the destination is invalid or the
        write fails, ``pdf_path`` is None, ``n_pages`` is 0 and ``note``
        explains why. Sections that failed individually are listed in
        ``note`` after the page count.
    """
    if profile is None:
        profile = {}
    if not isinstance(profile, dict):
        return {"pdf_path": None, "n_pages": 0,
                "note": f"profile no es dict: {type(profile).__name__}"}

    columns = profile.get("columns") or []
    if not isinstance(columns, list):
        columns = []

    notes = []
    n_pages = 0

    try:
        parent = os.path.dirname(os.path.abspath(out_path))
        os.makedirs(parent, exist_ok=True)
    except (OSError, TypeError, ValueError) as e:
        # TypeError: out_path is None or not path-like; ValueError: embedded
        # NUL byte. Both must be reported, not raised (never-raises contract).
        return {"pdf_path": None, "n_pages": 0,
                "note": f"no se pudo crear el directorio destino: {e}"}

    # Tufte-ish defaults scoped to this render only.
    rc = {
        "font.size": 10,
        "font.family": "sans-serif",
        "axes.titlesize": 11,
        "axes.edgecolor": _MUTED,
        "figure.facecolor": "white",
        "savefig.facecolor": "white",
        "pdf.fonttype": 42,  # embed TrueType so text stays selectable on mobile.
    }

    # Each section is isolated: a failure in one never aborts the whole PDF.
    builders = [
        ("cover", lambda p: _cover_page(p, profile, title)),
        ("overview", lambda p: _overview_page(p, profile)),
        ("numeric", lambda p: _numeric_pages(p, columns)),
        ("categorical", lambda p: _categorical_pages(p, columns)),
        ("quality", lambda p: _quality_page(p, columns)),
        ("correlations", lambda p: _correlations_page(p, profile.get("correlations"))),
        ("llm", lambda p: _llm_pages(p, profile.get("llm"))),
        ("generic", lambda p: _generic_pages(p, profile)),
    ]

    try:
        with plt.rc_context(rc):
            with PdfPages(out_path) as pdf:
                for name, build in builders:
                    try:
                        n_pages += build(pdf) or 0
                    except Exception as e:  # noqa: BLE001 — one bad section never aborts.
                        notes.append(f"sección '{name}' omitida: {e}")
                # Guarantee at least one page so the PDF is always valid.
                if n_pages == 0:
                    n_pages += _text_page(
                        pdf, title or "EDA", ["(perfil vacío — sin secciones)"]
                    )
    except Exception as e:  # noqa: BLE001
        # Boundary of the never-raises contract: report the failure instead.
        return {"pdf_path": None, "n_pages": 0,
                "note": f"fallo al escribir el PDF: {e}"}

    note = f"{n_pages} páginas"
    if notes:
        note += " · " + "; ".join(notes)
    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|
||||
Reference in New Issue
Block a user