feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)
- H4: render_eda_markdown añade sección Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF móvil DB-level (resumen tablas + join graph) vía render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,7 @@ from .to_returns import to_returns
|
|||||||
from .fdr_correction import fdr_correction
|
from .fdr_correction import fdr_correction
|
||||||
from .suggest_reexpression import suggest_reexpression
|
from .suggest_reexpression import suggest_reexpression
|
||||||
from .exploratory_caveats import exploratory_caveats
|
from .exploratory_caveats import exploratory_caveats
|
||||||
from .render_eda_pdf import render_eda_pdf
|
from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"decode_qr_image",
|
"decode_qr_image",
|
||||||
@@ -64,6 +64,7 @@ __all__ = [
|
|||||||
"suggest_reexpression",
|
"suggest_reexpression",
|
||||||
"exploratory_caveats",
|
"exploratory_caveats",
|
||||||
"render_eda_pdf",
|
"render_eda_pdf",
|
||||||
|
"render_eda_pdf_relational",
|
||||||
"summarize_table_duckdb",
|
"summarize_table_duckdb",
|
||||||
"summarize_table_pg",
|
"summarize_table_pg",
|
||||||
"spearman_corr",
|
"spearman_corr",
|
||||||
|
|||||||
@@ -405,6 +405,110 @@ def render_eda_markdown(profile: dict) -> str:
|
|||||||
parts.append("## Series temporales")
|
parts.append("## Series temporales")
|
||||||
parts.extend(series_blocks)
|
parts.extend(series_blocks)
|
||||||
|
|
||||||
|
# 7d. Modelos baratos (PCA, KMeans, outliers multivariantes, normalidad). El
|
||||||
|
# pipeline corre `run_eda_models` cuando se pide con run_models; el bloque está
|
||||||
|
# completo en el JSON pero antes no tenía formatter en markdown y se omitía. Se
|
||||||
|
# lee todo defensivo con .get y cada submodelo se renderiza solo si está presente.
|
||||||
|
models = profile.get("models")
|
||||||
|
if isinstance(models, dict):
|
||||||
|
model_parts: list[str] = []
|
||||||
|
|
||||||
|
pca = models.get("pca")
|
||||||
|
if isinstance(pca, dict):
|
||||||
|
evr = pca.get("explained_variance_ratio") or []
|
||||||
|
cum = pca.get("cumulative") or []
|
||||||
|
pca_rows = []
|
||||||
|
for i, var in enumerate(evr):
|
||||||
|
acc = cum[i] if i < len(cum) else None
|
||||||
|
pca_rows.append([f"PC{i + 1}", _fmt_pct(var), _fmt_pct(acc)])
|
||||||
|
sub = ["### PCA"]
|
||||||
|
n_feat = pca.get("n_features")
|
||||||
|
n_used = pca.get("n_rows_used")
|
||||||
|
if n_feat is not None or n_used is not None:
|
||||||
|
sub.append(
|
||||||
|
f"{pca.get('n_components')} componentes sobre "
|
||||||
|
f"{n_used if n_used is not None else '?'} filas, "
|
||||||
|
f"{n_feat if n_feat is not None else '?'} features."
|
||||||
|
)
|
||||||
|
if pca_rows:
|
||||||
|
sub.append(_md_table(
|
||||||
|
["componente", "var. explicada", "acumulada"], pca_rows))
|
||||||
|
loadings = pca.get("top_loadings") or []
|
||||||
|
load_rows = []
|
||||||
|
for ld in loadings[:12]:
|
||||||
|
if not isinstance(ld, dict):
|
||||||
|
continue
|
||||||
|
comp = ld.get("component")
|
||||||
|
comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
|
||||||
|
load_rows.append([comp_label, ld.get("feature"),
|
||||||
|
_fmt_num(ld.get("loading"), 3)])
|
||||||
|
if load_rows:
|
||||||
|
sub.append("Cargas principales:")
|
||||||
|
sub.append(_md_table(["componente", "feature", "carga"], load_rows))
|
||||||
|
model_parts.append("\n\n".join(sub))
|
||||||
|
|
||||||
|
km = models.get("kmeans")
|
||||||
|
if isinstance(km, dict):
|
||||||
|
sub = ["### KMeans"]
|
||||||
|
best_k = km.get("best_k")
|
||||||
|
sil = km.get("silhouette")
|
||||||
|
sizes = km.get("cluster_sizes") or []
|
||||||
|
head = f"mejor k = {_fmt_num(best_k)}"
|
||||||
|
if sil is not None:
|
||||||
|
head += f" (silhouette {_fmt_num(sil, 3)})"
|
||||||
|
if sizes:
|
||||||
|
head += ". Tamaños de cluster: " + ", ".join(
|
||||||
|
_fmt_num(s) for s in sizes)
|
||||||
|
sub.append(head + ".")
|
||||||
|
score_rows = []
|
||||||
|
for sc in km.get("scores_by_k") or []:
|
||||||
|
if not isinstance(sc, dict):
|
||||||
|
continue
|
||||||
|
score_rows.append([sc.get("k"), _fmt_num(sc.get("silhouette"), 3),
|
||||||
|
_fmt_num(sc.get("inertia"), 2)])
|
||||||
|
if score_rows:
|
||||||
|
sub.append(_md_table(["k", "silhouette", "inertia"], score_rows))
|
||||||
|
model_parts.append("\n\n".join(sub))
|
||||||
|
|
||||||
|
out = models.get("outliers")
|
||||||
|
if isinstance(out, dict):
|
||||||
|
# outlier_pct del modelo multivariante ya viene en escala 0-100.
|
||||||
|
n_out = out.get("n_outliers")
|
||||||
|
pct = out.get("outlier_pct")
|
||||||
|
thr = out.get("threshold")
|
||||||
|
line = f"{_fmt_num(n_out)} filas marcadas como outlier"
|
||||||
|
if pct is not None:
|
||||||
|
line += f" ({_fmt_num(pct, 2)}%)"
|
||||||
|
if thr is not None:
|
||||||
|
line += f"; umbral de score {_fmt_num(thr, 3)}"
|
||||||
|
model_parts.append("### Outliers multivariante (Isolation Forest)\n\n"
|
||||||
|
+ line + ".")
|
||||||
|
|
||||||
|
normality = models.get("normality")
|
||||||
|
if isinstance(normality, dict):
|
||||||
|
norm_rows = []
|
||||||
|
for col_name, res in normality.items():
|
||||||
|
if not isinstance(res, dict):
|
||||||
|
continue
|
||||||
|
jb = res.get("jarque_bera") or {}
|
||||||
|
norm_rows.append([
|
||||||
|
col_name,
|
||||||
|
"sí" if res.get("is_normal") else "no",
|
||||||
|
_fmt_num(jb.get("p")) if jb.get("p") is not None else "",
|
||||||
|
])
|
||||||
|
if norm_rows:
|
||||||
|
model_parts.append(
|
||||||
|
"### Normalidad\n\n"
|
||||||
|
+ _md_table(["columna", "normal", "Jarque-Bera p"], norm_rows))
|
||||||
|
|
||||||
|
note = models.get("note")
|
||||||
|
if note:
|
||||||
|
model_parts.append(f"> {note}")
|
||||||
|
|
||||||
|
if model_parts:
|
||||||
|
parts.append("## Modelos")
|
||||||
|
parts.extend(model_parts)
|
||||||
|
|
||||||
# 8. LLM analysis (tolerate None for now).
|
# 8. LLM analysis (tolerate None for now).
|
||||||
llm = profile.get("llm")
|
llm = profile.get("llm")
|
||||||
if llm:
|
if llm:
|
||||||
|
|||||||
@@ -173,3 +173,62 @@ def test_tolerates_empty_profile():
|
|||||||
def test_tolerates_none_profile():
    """A ``None`` profile still renders, falling back to the unnamed heading."""
    rendered = render_eda_markdown(None)
    assert "# EDA — (unnamed)" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_models():
|
||||||
|
"""Bloque `models` como el que produce run_eda_models (PCA/KMeans/...)."""
|
||||||
|
return {
|
||||||
|
"n_numeric_cols": 3,
|
||||||
|
"pca": {
|
||||||
|
"n_components": 2,
|
||||||
|
"n_rows_used": 1000,
|
||||||
|
"n_features": 3,
|
||||||
|
"explained_variance_ratio": [0.62, 0.21],
|
||||||
|
"cumulative": [0.62, 0.83],
|
||||||
|
"top_loadings": [
|
||||||
|
{"component": 0, "feature": "price", "loading": 0.71},
|
||||||
|
{"component": 1, "feature": "qty", "loading": -0.55},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"kmeans": {
|
||||||
|
"best_k": 3,
|
||||||
|
"silhouette": 0.48,
|
||||||
|
"cluster_sizes": [500, 300, 200],
|
||||||
|
"scores_by_k": [
|
||||||
|
{"k": 2, "silhouette": 0.41, "inertia": 1200.0},
|
||||||
|
{"k": 3, "silhouette": 0.48, "inertia": 900.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"outliers": {
|
||||||
|
"n_outliers": 35,
|
||||||
|
"outlier_pct": 3.5,
|
||||||
|
"threshold": -0.51,
|
||||||
|
},
|
||||||
|
"normality": {
|
||||||
|
"price": {"jarque_bera": {"p": 0.0001}, "is_normal": False},
|
||||||
|
},
|
||||||
|
"note": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_models_section_rendered():
    """The markdown output contains the Modelos section and its sub-blocks."""
    # H4: the models block used to be dropped from markdown; it now has a formatter.
    sample = _sample_profile()
    sample["models"] = _sample_models()
    rendered = render_eda_markdown(sample)

    expected_headings = (
        "## Modelos",
        "### PCA",
        "### KMeans",
        "### Outliers multivariante (Isolation Forest)",
        "### Normalidad",
    )
    for heading in expected_headings:
        assert heading in rendered

    # Real PCA figures (explained variance ×100) and the KMeans summary.
    assert "62.0" in rendered  # explained_variance_ratio 0.62 -> 62.00%
    assert "mejor k = 3" in rendered
    # outlier_pct already arrives on a 0-100 scale: 3.5 -> "3.5%", not "350".
    assert "3.5%" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_models_absent_when_none():
    """Edge case: a profile without models yields no Modelos section and no error."""
    # The sample profile carries models=None.
    rendered = render_eda_markdown(_sample_profile())
    assert "## Modelos" not in rendered
|
||||||
|
|||||||
@@ -52,6 +52,8 @@ _KNOWN_TOP_KEYS = {
|
|||||||
"duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols",
|
"duplicate_rows", "duplicate_pct", "null_cell_pct", "constant_cols",
|
||||||
"all_null_cols", "quality_score", "type_breakdown", "key_candidates",
|
"all_null_cols", "quality_score", "type_breakdown", "key_candidates",
|
||||||
"columns", "correlations", "llm",
|
"columns", "correlations", "llm",
|
||||||
|
# Bloques con builder dedicado (no caen al volcado genérico str(dict)).
|
||||||
|
"models", "series", "caveats",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Restrained, high-contrast palette: a single accent reads cleanly on a phone.
|
# Restrained, high-contrast palette: a single accent reads cleanly on a phone.
|
||||||
@@ -59,6 +61,17 @@ _INK = "#1b1b1b"
|
|||||||
_ACCENT = "#2a6f97"
|
_ACCENT = "#2a6f97"
|
||||||
_MUTED = "#8a8a8a"
|
_MUTED = "#8a8a8a"
|
||||||
|
|
||||||
|
# Tufte-ish render defaults shared by both public entry points.
|
||||||
|
_RC = {
    # Typography.
    "font.size": 10,
    "font.family": "sans-serif",
    "axes.titlesize": 11,
    # Colours: muted axis edges on a plain white page.
    "axes.edgecolor": _MUTED,
    "figure.facecolor": "white",
    "savefig.facecolor": "white",
    # Embed TrueType so text stays selectable on mobile.
    "pdf.fonttype": 42,
}
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
# Small formatting + Tufte helpers
|
# Small formatting + Tufte helpers
|
||||||
@@ -535,6 +548,246 @@ def _paginate_text(pdf, title: str, lines: list, subtitle: str = None,
|
|||||||
return pages
|
return pages
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Dedicated builders for forward-compat blocks (models / series / caveats).
|
||||||
|
# Before these existed, ``models``/``series``/``caveats`` fell to the generic
|
||||||
|
# dump and were rendered as truncated ``str(dict)``. Each builder is fully
|
||||||
|
# defensive, reads with ``.get`` and returns the number of pages it produced.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _models_pages(pdf, models) -> int:
|
||||||
|
"""Render the cheap-models block (PCA / KMeans / outliers / normality)."""
|
||||||
|
if not isinstance(models, dict):
|
||||||
|
return 0
|
||||||
|
lines = []
|
||||||
|
|
||||||
|
pca = models.get("pca")
|
||||||
|
if isinstance(pca, dict):
|
||||||
|
lines.append("## PCA")
|
||||||
|
n_used = pca.get("n_rows_used")
|
||||||
|
n_feat = pca.get("n_features")
|
||||||
|
if n_used is not None or n_feat is not None:
|
||||||
|
lines.append(
|
||||||
|
f" {pca.get('n_components')} comp · "
|
||||||
|
f"{_fmt_num(n_used)} filas · {_fmt_num(n_feat)} features"
|
||||||
|
)
|
||||||
|
evr = pca.get("explained_variance_ratio") or []
|
||||||
|
cum = pca.get("cumulative") or []
|
||||||
|
for i, var in enumerate(evr):
|
||||||
|
acc = cum[i] if i < len(cum) else None
|
||||||
|
lines.append(f" PC{i + 1}: var {_fmt_pct(var)} acum {_fmt_pct(acc)}")
|
||||||
|
loadings = pca.get("top_loadings") or []
|
||||||
|
if loadings:
|
||||||
|
lines.append(" cargas principales:")
|
||||||
|
for ld in loadings[:8]:
|
||||||
|
if not isinstance(ld, dict):
|
||||||
|
continue
|
||||||
|
comp = ld.get("component")
|
||||||
|
comp_label = f"PC{comp + 1}" if isinstance(comp, int) else str(comp)
|
||||||
|
lines.append(
|
||||||
|
f" {comp_label} {_truncate(ld.get('feature'), 18)}: "
|
||||||
|
f"{_fmt_num(ld.get('loading'), 3)}"
|
||||||
|
)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
km = models.get("kmeans")
|
||||||
|
if isinstance(km, dict):
|
||||||
|
lines.append("## KMeans")
|
||||||
|
head = f" mejor k = {_fmt_num(km.get('best_k'))}"
|
||||||
|
if km.get("silhouette") is not None:
|
||||||
|
head += f" silhouette {_fmt_num(km.get('silhouette'), 3)}"
|
||||||
|
lines.append(head)
|
||||||
|
sizes = km.get("cluster_sizes") or []
|
||||||
|
if sizes:
|
||||||
|
lines.append(" tamaños cluster: " + ", ".join(
|
||||||
|
_fmt_num(s) for s in sizes))
|
||||||
|
for sc in km.get("scores_by_k") or []:
|
||||||
|
if not isinstance(sc, dict):
|
||||||
|
continue
|
||||||
|
lines.append(
|
||||||
|
f" k={sc.get('k')}: silhouette {_fmt_num(sc.get('silhouette'), 3)}"
|
||||||
|
f" inertia {_fmt_num(sc.get('inertia'), 1)}"
|
||||||
|
)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
out = models.get("outliers")
|
||||||
|
if isinstance(out, dict):
|
||||||
|
lines.append("## Outliers multivariante (Isolation Forest)")
|
||||||
|
# outlier_pct del modelo ya viene en escala 0-100.
|
||||||
|
line = f" {_fmt_num(out.get('n_outliers'))} outliers"
|
||||||
|
if out.get("outlier_pct") is not None:
|
||||||
|
line += f" ({_fmt_num(out.get('outlier_pct'), 2)}%)"
|
||||||
|
if out.get("threshold") is not None:
|
||||||
|
line += f" umbral {_fmt_num(out.get('threshold'), 3)}"
|
||||||
|
lines.append(line)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
normality = models.get("normality")
|
||||||
|
if isinstance(normality, dict):
|
||||||
|
lines.append("## Normalidad (Jarque-Bera)")
|
||||||
|
for col_name, res in normality.items():
|
||||||
|
if not isinstance(res, dict):
|
||||||
|
continue
|
||||||
|
jb = res.get("jarque_bera") or {}
|
||||||
|
lines.append(
|
||||||
|
f" {_truncate(col_name, 18):<18} normal={res.get('is_normal')}"
|
||||||
|
f" JB p={_fmt_num(jb.get('p'), 4)}"
|
||||||
|
)
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
note = models.get("note")
|
||||||
|
if note:
|
||||||
|
lines.append(f"nota: {note}")
|
||||||
|
|
||||||
|
if not [ln for ln in lines if ln.strip()]:
|
||||||
|
return 0
|
||||||
|
return _paginate_text(pdf, "Modelos", lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _series_pages(pdf, series) -> int:
|
||||||
|
"""Render the time-series block: one compact summary per series column."""
|
||||||
|
if not isinstance(series, dict) or not series:
|
||||||
|
return 0
|
||||||
|
lines = []
|
||||||
|
for col, s in series.items():
|
||||||
|
if not isinstance(s, dict):
|
||||||
|
continue
|
||||||
|
lines.append(f"## {col}")
|
||||||
|
stat = s.get("stationarity") or {}
|
||||||
|
if stat.get("verdict") is not None:
|
||||||
|
lines.append(f" estacionariedad (ADF+KPSS): {stat.get('verdict')}")
|
||||||
|
acf = s.get("acf_pacf") or {}
|
||||||
|
if acf.get("is_autocorrelated") is not None:
|
||||||
|
lines.append(
|
||||||
|
" autocorrelada (Ljung-Box): "
|
||||||
|
+ ("sí" if acf.get("is_autocorrelated") else "no")
|
||||||
|
)
|
||||||
|
stl = s.get("stl") or {}
|
||||||
|
if stl.get("trend_strength") is not None:
|
||||||
|
lines.append(
|
||||||
|
f" fuerza tendencia (STL): {_fmt_num(stl.get('trend_strength'), 3)}")
|
||||||
|
if stl.get("seasonal_strength") is not None:
|
||||||
|
extra = (f" (periodo {stl.get('period')})"
|
||||||
|
if stl.get("period") is not None else "")
|
||||||
|
lines.append(
|
||||||
|
f" fuerza estacional (STL): "
|
||||||
|
f"{_fmt_num(stl.get('seasonal_strength'), 3)}{extra}")
|
||||||
|
elif stl.get("note"):
|
||||||
|
lines.append(f" STL: {_truncate(stl.get('note'), 60)}")
|
||||||
|
if s.get("levels_suggested"):
|
||||||
|
kind = s.get("levels_kind")
|
||||||
|
if kind == "returns":
|
||||||
|
lines.append(" sugerencia: convertir a retornos (serie financiera)")
|
||||||
|
elif kind == "differences":
|
||||||
|
lines.append(" sugerencia: trabajar sobre diferencias (serie física)")
|
||||||
|
else:
|
||||||
|
lines.append(" sugerencia: retornos o diferencias (serie de niveles)")
|
||||||
|
lines.append("")
|
||||||
|
if not [ln for ln in lines if ln.strip()]:
|
||||||
|
return 0
|
||||||
|
return _paginate_text(pdf, "Series temporales", lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _caveats_pages(pdf, caveats) -> int:
|
||||||
|
"""Render the exploratory caveats block as a wrapped, readable list."""
|
||||||
|
cav_list = []
|
||||||
|
if isinstance(caveats, dict):
|
||||||
|
cav_list = caveats.get("caveats") or []
|
||||||
|
elif isinstance(caveats, list):
|
||||||
|
cav_list = caveats
|
||||||
|
lines = []
|
||||||
|
for cav in cav_list:
|
||||||
|
if not isinstance(cav, dict):
|
||||||
|
continue
|
||||||
|
topic = cav.get("topic") or cav.get("id") or ""
|
||||||
|
msg = cav.get("message") or ""
|
||||||
|
lines.append(f"## {topic}")
|
||||||
|
lines.extend(textwrap.wrap(str(msg), width=78) or [""])
|
||||||
|
lines.append("")
|
||||||
|
if not [ln for ln in lines if ln.strip()]:
|
||||||
|
return 0
|
||||||
|
return _paginate_text(pdf, "Avisos exploratorios", lines,
|
||||||
|
subtitle="el EDA genera hipótesis, no conclusiones")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# DB-level (relational) page builders — used by render_eda_pdf_relational.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _db_cover_page(pdf, db_profile: dict, title: str) -> int:
    """Cover for a DatabaseProfile: name, date, table count, FK count.

    Args:
        pdf: object exposing ``savefig(fig)``; receives the cover figure.
        db_profile: DatabaseProfile dict; every key is read with ``.get``.
        title: cover heading; when falsy a heading is derived from ``db_path``.

    Returns:
        1, the number of pages written.
    """
    fig = plt.figure(figsize=_A5_PORTRAIT)
    # The caller swallows per-section exceptions, so the figure must be closed
    # on every path; otherwise a failing cover leaves it open in pyplot.
    try:
        db_path = db_profile.get("db_path") or "(base sin nombre)"
        heading = title or f"EDA base — {os.path.basename(str(db_path))}"
        fig.text(0.08, 0.82, heading, fontsize=20, fontweight="bold", color=_INK,
                 wrap=True)

        sub = [f"fuente: {_truncate(db_path, 44)}"]
        when = db_profile.get("profiled_at") or datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M UTC")
        sub.append(f"generado: {when}")
        fig.text(0.08, 0.74, "\n".join(sub), fontsize=10, color=_MUTED, va="top")

        n_tables = db_profile.get("n_tables")
        fig.text(0.08, 0.58, f"{_fmt_num(n_tables)} tablas", fontsize=16,
                 color=_ACCENT, fontweight="bold")

        # Count only a real list, matching what the relations page renders;
        # any other value counts as zero instead of raising in len().
        fks = db_profile.get("fk_candidates")
        n_fk = len(fks) if isinstance(fks, list) else 0
        fig.text(0.08, 0.51, f"{_fmt_num(n_fk)} relaciones FK candidatas",
                 fontsize=12, color=_INK)

        fig.text(0.08, 0.06, "Tufte · alta densidad de datos · lectura en móvil",
                 fontsize=8, color=_MUTED, style="italic")
        pdf.savefig(fig)
    finally:
        plt.close(fig)
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
def _db_tables_page(pdf, db_profile: dict) -> int:
|
||||||
|
"""One text page summarising every table (rows / cols / quality)."""
|
||||||
|
tables = db_profile.get("tables") or []
|
||||||
|
if not isinstance(tables, list) or not tables:
|
||||||
|
return 0
|
||||||
|
lines = [f"{'tabla':<24}{'filas':>9}{'cols':>6}{'cal':>6}", "-" * 45]
|
||||||
|
for t in tables:
|
||||||
|
if not isinstance(t, dict):
|
||||||
|
continue
|
||||||
|
lines.append(
|
||||||
|
f"{_truncate(t.get('table'), 24):<24}"
|
||||||
|
f"{_fmt_num(t.get('n_rows')):>9}"
|
||||||
|
f"{_fmt_num(t.get('n_cols')):>6}"
|
||||||
|
f"{_fmt_num(t.get('quality_score'), 1):>6}"
|
||||||
|
)
|
||||||
|
return _paginate_text(pdf, "Tablas", lines, subtitle="resumen por tabla")
|
||||||
|
|
||||||
|
|
||||||
|
def _db_fk_page(pdf, db_profile: dict) -> int:
    """FK candidates table + the join-graph mermaid text.

    Args:
        pdf: object the pages are written to; handed on to ``_paginate_text``.
        db_profile: DatabaseProfile dict. ``fk_candidates`` is read as a list
            of dicts and ``join_graph`` as a dict with a ``mermaid`` string;
            values of any other shape are ignored.

    Returns:
        Number of pages produced by ``_paginate_text``.
    """
    fks = db_profile.get("fk_candidates") or []
    rows = []
    if isinstance(fks, list):
        for fk in fks:
            if not isinstance(fk, dict):
                continue
            frm = f"{fk.get('from_table')}.{fk.get('from_col')}"
            to = f"{fk.get('to_table')}.{fk.get('to_col')}"
            inc = fk.get("inclusion")
            if inc is None:
                # A missing inclusion shows as blank, not the literal "None".
                inc_s = ""
            elif isinstance(inc, (int, float)) and not isinstance(inc, bool):
                inc_s = _fmt_num(inc, 3)
            else:
                inc_s = str(inc)
            rows.append(
                f"{_truncate(frm, 25):<26}{_truncate(to, 25):<26}{inc_s:>7}")

    if rows:
        lines = [f"{'from':<26}{'to':<26}{'incl':>7}", "-" * 59, *rows]
    else:
        # Also covers a list holding only malformed entries, which would
        # otherwise produce a header with nothing below it.
        lines = ["(sin relaciones FK candidatas detectadas)"]

    # A non-dict join_graph is ignored so it cannot raise and cost the FK table.
    join_graph = db_profile.get("join_graph")
    mermaid = join_graph.get("mermaid") if isinstance(join_graph, dict) else None
    if mermaid:
        lines.append("")
        lines.append("## join graph (mermaid)")
        lines.extend(_truncate(raw, 72) for raw in str(mermaid).splitlines())

    return _paginate_text(pdf, "Relaciones inter-tabla", lines,
                          subtitle="FK candidatas + join graph")
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
# Public entry point
|
# Public entry point
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
@@ -580,16 +833,8 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
|
|||||||
return {"pdf_path": None, "n_pages": 0,
|
return {"pdf_path": None, "n_pages": 0,
|
||||||
"note": f"no se pudo crear el directorio destino: {e}"}
|
"note": f"no se pudo crear el directorio destino: {e}"}
|
||||||
|
|
||||||
# Tufte-ish defaults scoped to this render only.
|
# Tufte-ish defaults shared with the relational renderer (module-level _RC).
|
||||||
rc = {
|
rc = _RC
|
||||||
"font.size": 10,
|
|
||||||
"font.family": "sans-serif",
|
|
||||||
"axes.titlesize": 11,
|
|
||||||
"axes.edgecolor": _MUTED,
|
|
||||||
"figure.facecolor": "white",
|
|
||||||
"savefig.facecolor": "white",
|
|
||||||
"pdf.fonttype": 42, # embed TrueType so text stays selectable on mobile.
|
|
||||||
}
|
|
||||||
|
|
||||||
# Each section is isolated: a failure in one never aborts the whole PDF.
|
# Each section is isolated: a failure in one never aborts the whole PDF.
|
||||||
builders = [
|
builders = [
|
||||||
@@ -599,7 +844,10 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
|
|||||||
("categorical", lambda p: _categorical_pages(p, columns)),
|
("categorical", lambda p: _categorical_pages(p, columns)),
|
||||||
("quality", lambda p: _quality_page(p, columns)),
|
("quality", lambda p: _quality_page(p, columns)),
|
||||||
("correlations", lambda p: _correlations_page(p, profile.get("correlations"))),
|
("correlations", lambda p: _correlations_page(p, profile.get("correlations"))),
|
||||||
|
("models", lambda p: _models_pages(p, profile.get("models"))),
|
||||||
|
("series", lambda p: _series_pages(p, profile.get("series"))),
|
||||||
("llm", lambda p: _llm_pages(p, profile.get("llm"))),
|
("llm", lambda p: _llm_pages(p, profile.get("llm"))),
|
||||||
|
("caveats", lambda p: _caveats_pages(p, profile.get("caveats"))),
|
||||||
("generic", lambda p: _generic_pages(p, profile)),
|
("generic", lambda p: _generic_pages(p, profile)),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -624,3 +872,71 @@ def render_eda_pdf(profile: dict, out_path: str, title: str = None) -> dict:
|
|||||||
if notes:
|
if notes:
|
||||||
note += " · " + "; ".join(notes)
|
note += " · " + "; ".join(notes)
|
||||||
return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|
return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|
||||||
|
|
||||||
|
|
||||||
|
def render_eda_pdf_relational(db_profile: dict, out_path: str,
                              title: str = None) -> dict:
    """Render a DatabaseProfile dict into a portable, mobile-readable PDF.

    DB-level sibling of :func:`render_eda_pdf`: instead of a single table it
    summarises a whole database (the dict ``profile_database`` returns under
    ``db_profile``). Pages are A5 portrait, single column, large type — built to
    be read on a phone. Three sections: a cover (table + FK counts), a per-table
    summary (rows / cols / quality) and the inter-table relations (FK candidates
    plus the join-graph mermaid text). Every key is read defensively and any
    section that fails is noted, never aborting the whole render.

    Args:
        db_profile: DatabaseProfile dict from ``profile_database`` (the value
            under ``db_profile``). May have keys absent or None; a None/empty
            profile still yields a 1-page PDF.
        out_path: filesystem path where the PDF is written. Parent directories
            are created if missing.
        title: optional cover title. Defaults to ``"EDA base — <db filename>"``.

    Returns:
        dict (never raises): {"pdf_path": str, "n_pages": int, "note": str}.
        On a fatal write error, or an unusable ``out_path``, ``pdf_path`` is
        None and ``note`` explains why.
    """
    if db_profile is None:
        db_profile = {}
    if not isinstance(db_profile, dict):
        return {"pdf_path": None, "n_pages": 0,
                "note": f"db_profile no es dict: {type(db_profile).__name__}"}

    try:
        parent = os.path.dirname(os.path.abspath(out_path))
        os.makedirs(parent, exist_ok=True)
    except (OSError, TypeError, ValueError) as e:
        # TypeError: out_path is None or not path-like. ValueError: e.g. an
        # embedded NUL byte. Both must be reported, not raised, to honour the
        # "never raises" contract.
        return {"pdf_path": None, "n_pages": 0,
                "note": f"no se pudo crear el directorio destino: {e}"}

    notes = []
    n_pages = 0

    # Each section is isolated: a failure in one is noted and the rest still render.
    builders = [
        ("cover", lambda p: _db_cover_page(p, db_profile, title)),
        ("tables", lambda p: _db_tables_page(p, db_profile)),
        ("relations", lambda p: _db_fk_page(p, db_profile)),
    ]

    try:
        with plt.rc_context(_RC):
            with PdfPages(out_path) as pdf:
                for name, build in builders:
                    try:
                        n_pages += build(pdf) or 0
                    except Exception as e:  # noqa: BLE001 — one bad section never aborts.
                        notes.append(f"sección '{name}' omitida: {e}")
                if n_pages == 0:
                    # Guarantee at least one page even if every section failed.
                    n_pages += _text_page(
                        pdf, title or "EDA base", ["(base vacía — sin secciones)"]
                    )
    except Exception as e:  # noqa: BLE001
        return {"pdf_path": None, "n_pages": 0,
                "note": f"fallo al escribir el PDF: {e}"}

    note = f"{n_pages} páginas"
    if notes:
        note += " · " + "; ".join(notes)
    return {"pdf_path": out_path, "n_pages": n_pages, "note": note}
|
||||||
|
|||||||
@@ -9,7 +9,23 @@ import sys
|
|||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(__file__))
|
sys.path.insert(0, os.path.dirname(__file__))
|
||||||
|
|
||||||
from render_eda_pdf import render_eda_pdf
|
from render_eda_pdf import (
|
||||||
|
render_eda_pdf,
|
||||||
|
render_eda_pdf_relational,
|
||||||
|
_models_pages,
|
||||||
|
_series_pages,
|
||||||
|
_caveats_pages,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _StubPdf:
|
||||||
|
"""Captura pdf.savefig sin escribir nada — para testear builders aislados."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.figs = 0
|
||||||
|
|
||||||
|
def savefig(self, fig):
|
||||||
|
self.figs += 1
|
||||||
|
|
||||||
|
|
||||||
def _synthetic_profile() -> dict:
|
def _synthetic_profile() -> dict:
|
||||||
@@ -170,3 +186,144 @@ def test_forward_compat_seccion_desconocida(tmp_path):
|
|||||||
assert res["n_pages"] >= 1
|
assert res["n_pages"] >= 1
|
||||||
# No se perdió ninguna sección por error.
|
# No se perdió ninguna sección por error.
|
||||||
assert "omitida" not in res["note"]
|
assert "omitida" not in res["note"]
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# H4: builders dedicados para models / series / caveats (antes caían al volcado
|
||||||
|
# genérico como str(dict) truncado). Se testean aislados con un stub de pdf.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _sample_models() -> dict:
|
||||||
|
return {
|
||||||
|
"n_numeric_cols": 3,
|
||||||
|
"pca": {
|
||||||
|
"n_components": 2, "n_rows_used": 1000, "n_features": 3,
|
||||||
|
"explained_variance_ratio": [0.62, 0.21],
|
||||||
|
"cumulative": [0.62, 0.83],
|
||||||
|
"top_loadings": [
|
||||||
|
{"component": 0, "feature": "precio", "loading": 0.71},
|
||||||
|
{"component": 1, "feature": "unidades", "loading": -0.55},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
"kmeans": {
|
||||||
|
"best_k": 3, "silhouette": 0.48, "cluster_sizes": [500, 300, 200],
|
||||||
|
"scores_by_k": [{"k": 3, "silhouette": 0.48, "inertia": 900.0}],
|
||||||
|
},
|
||||||
|
"outliers": {"n_outliers": 35, "outlier_pct": 3.5, "threshold": -0.51},
|
||||||
|
"normality": {"precio": {"jarque_bera": {"p": 0.0001}, "is_normal": False}},
|
||||||
|
"note": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_series() -> dict:
|
||||||
|
return {
|
||||||
|
"precio": {
|
||||||
|
"stationarity": {"verdict": "non_stationary"},
|
||||||
|
"acf_pacf": {"is_autocorrelated": True},
|
||||||
|
"stl": {"trend_strength": 0.95, "seasonal_strength": 0.10, "period": 7},
|
||||||
|
"levels_suggested": True, "levels_kind": "returns",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_caveats() -> dict:
|
||||||
|
return {
|
||||||
|
"n": 1,
|
||||||
|
"caveats": [
|
||||||
|
{"id": "exploratory_nature", "topic": "naturaleza exploratoria",
|
||||||
|
"message": "El EDA genera hipótesis, no conclusiones."},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_models_builder_produces_pages():
    """The models builder emits at least one page for a populated block."""
    stub = _StubPdf()
    n_pages = _models_pages(stub, _sample_models())
    assert n_pages >= 1
    assert stub.figs >= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_series_builder_produces_pages():
    """The series builder emits at least one page for a populated block."""
    stub = _StubPdf()
    n_pages = _series_pages(stub, _sample_series())
    assert n_pages >= 1
    assert stub.figs >= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_caveats_builder_produces_pages():
    """The caveats builder emits at least one page for a populated block."""
    stub = _StubPdf()
    n_pages = _caveats_pages(stub, _sample_caveats())
    assert n_pages >= 1
    assert stub.figs >= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_builders_tolerate_none_and_empty():
    """None / empty input yields zero pages from every builder, without raising."""
    stub = _StubPdf()
    models_pages = _models_pages(stub, None)
    assert models_pages == 0
    series_pages = _series_pages(stub, {})
    assert series_pages == 0
    caveats_pages = _caveats_pages(stub, None)
    assert caveats_pages == 0
    # No figure may have reached the pdf object.
    assert stub.figs == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_models_series_caveats_no_caen_al_generico(tmp_path):
    """A full profile with models/series/caveats renders without dropped sections."""
    # With dedicated builders, models/series/caveats are NOT dumped into the
    # generic "other sections" pages. The whole profile renders without error.
    profile = _synthetic_profile()
    profile.update(
        models=_sample_models(),
        series=_sample_series(),
        caveats=_sample_caveats(),
    )
    target = str(tmp_path / "full.pdf")
    result = render_eda_pdf(profile, target)

    assert os.path.exists(target)
    assert os.path.getsize(target) > 0
    assert "omitida" not in result["note"]
    # Cover + overview + numeric + categorical + quality + correlations
    # + models + series + caveats.
    assert result["n_pages"] >= 8
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# H9: render_eda_pdf_relational — PDF DB-level (resumen de tablas + join graph).
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _synthetic_db_profile() -> dict:
    """Return a hand-written database-level profile used by the relational PDF tests.

    The fixture describes two tables (``customers`` and ``orders``), a single
    FK candidate from ``orders.customer_id`` to ``customers.id`` and a Mermaid
    join graph string. A fresh dict is built on every call.
    """
    customers = {
        "table": "customers",
        "n_rows": 4,
        "n_cols": 3,
        "quality_score": 98.0,
        "key_candidates": ["id"],
    }
    orders = {
        "table": "orders",
        "n_rows": 6,
        "n_cols": 3,
        "quality_score": 95.0,
        "key_candidates": ["order_id"],
    }
    orders_to_customers = {
        "from_table": "orders",
        "from_col": "customer_id",
        "to_table": "customers",
        "to_col": "id",
        "inclusion": 1.0,
        "cardinality": "N:1",
    }
    return {
        "db_path": "data/shop.duckdb",
        "profiled_at": "2026-06-29 01:00 UTC",
        "n_tables": 2,
        "tables": [customers, orders],
        "fk_candidates": [orders_to_customers],
        "join_graph": {"mermaid": "graph LR\n orders --> customers"},
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_relational_golden_genera_pdf(tmp_path):
    """Happy path: the relational renderer writes a real PDF and reports its metadata."""
    target = str(tmp_path / "eda_db.pdf")
    result = render_eda_pdf_relational(
        _synthetic_db_profile(),
        target,
        title="EDA base",
    )

    assert isinstance(result, dict)
    assert set(result.keys()) == {"pdf_path", "n_pages", "note"}
    assert result["pdf_path"] == target
    assert os.path.exists(target)
    assert os.path.getsize(target) > 0
    # cover + tables + relations >= 3.
    assert result["n_pages"] >= 3
    # The file must start with the PDF magic bytes.
    with open(target, "rb") as handle:
        assert handle.read(4) == b"%PDF"
|
||||||
|
|
||||||
|
|
||||||
|
def test_relational_edge_vacio_no_revienta(tmp_path):
    """Edge case: an empty profile dict still yields a PDF with at least one page."""
    target = str(tmp_path / "db_vacio.pdf")
    result = render_eda_pdf_relational({}, target)
    assert os.path.exists(target)
    assert result["n_pages"] >= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_relational_edge_none_no_revienta(tmp_path):
    """Edge case: a None profile still yields a PDF with at least one page."""
    target = str(tmp_path / "db_none.pdf")
    result = render_eda_pdf_relational(None, target)
    assert os.path.exists(target)
    assert result["n_pages"] >= 1
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ Funciones del registry compuestas (NO se reimplementa su logica):
|
|||||||
- build_join_graph : grafo de relaciones inter-tabla + diagrama Mermaid.
|
- build_join_graph : grafo de relaciones inter-tabla + diagrama Mermaid.
|
||||||
- duckdb_list_tables : introspeccion "que tablas hay" (read-only).
|
- duckdb_list_tables : introspeccion "que tablas hay" (read-only).
|
||||||
- render_eda_markdown : report legible de un TableProfile.
|
- render_eda_markdown : report legible de un TableProfile.
|
||||||
|
- render_eda_pdf_relational : PDF movil DB-level (resumen de tablas + join graph).
|
||||||
|
|
||||||
Aporta una capa propia de AGREGACION A NIVEL DE BASE: ensambla un DatabaseProfile
|
Aporta una capa propia de AGREGACION A NIVEL DE BASE: ensambla un DatabaseProfile
|
||||||
con el resumen de cada tabla, los TableProfiles completos, las FK candidatas y el
|
con el resumen de cada tabla, los TableProfiles completos, las FK candidatas y el
|
||||||
@@ -31,6 +32,7 @@ from datascience import (
|
|||||||
build_join_graph,
|
build_join_graph,
|
||||||
infer_fk_containment_duckdb,
|
infer_fk_containment_duckdb,
|
||||||
render_eda_markdown,
|
render_eda_markdown,
|
||||||
|
render_eda_pdf_relational,
|
||||||
)
|
)
|
||||||
from infra import duckdb_list_tables
|
from infra import duckdb_list_tables
|
||||||
from pipelines.profile_table import profile_table
|
from pipelines.profile_table import profile_table
|
||||||
@@ -118,6 +120,7 @@ def profile_database(
|
|||||||
report_dir: str = "reports",
|
report_dir: str = "reports",
|
||||||
write_report: bool = True,
|
write_report: bool = True,
|
||||||
min_inclusion: float = 0.9,
|
min_inclusion: float = 0.9,
|
||||||
|
emit_pdf: bool = False,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Perfila una base DuckDB entera + sus relaciones inter-tabla.
|
"""Perfila una base DuckDB entera + sus relaciones inter-tabla.
|
||||||
|
|
||||||
@@ -134,11 +137,16 @@ def profile_database(
|
|||||||
paths del retorno son None.
|
paths del retorno son None.
|
||||||
min_inclusion: umbral minimo de inclusion (0-1) para emitir una FK
|
min_inclusion: umbral minimo de inclusion (0-1) para emitir una FK
|
||||||
candidata (se pasa a infer_fk_containment_duckdb). Default 0.9.
|
candidata (se pasa a infer_fk_containment_duckdb). Default 0.9.
|
||||||
|
emit_pdf: si True (default False) renderiza un PDF movil DB-level con
|
||||||
|
render_eda_pdf_relational (resumen de tablas + relaciones FK + join
|
||||||
|
graph) junto a los reports y devuelve su ruta en report_pdf_path. Con
|
||||||
|
False no se toca el PDF (retrocompatible) y report_pdf_path es None.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict dict-no-throw. En exito:
|
dict dict-no-throw. En exito:
|
||||||
{status:'ok', db_profile:<DatabaseProfile>,
|
{status:'ok', db_profile:<DatabaseProfile>,
|
||||||
report_md_path:str|None, report_json_path:str|None}.
|
report_md_path:str|None, report_json_path:str|None,
|
||||||
|
report_pdf_path:str|None}.
|
||||||
En error (sin lanzar): {status:'error', error:str}.
|
En error (sin lanzar): {status:'error', error:str}.
|
||||||
|
|
||||||
DatabaseProfile = {
|
DatabaseProfile = {
|
||||||
@@ -204,12 +212,13 @@ def profile_database(
|
|||||||
"errors": errors,
|
"errors": errors,
|
||||||
}
|
}
|
||||||
|
|
||||||
# 6) Reports opcionales.
|
# 6) Reports opcionales (markdown + JSON sidecar + PDF movil DB-level).
|
||||||
report_md_path = None
|
report_md_path = None
|
||||||
report_json_path = None
|
report_json_path = None
|
||||||
|
report_pdf_path = None
|
||||||
|
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
||||||
if write_report:
|
if write_report:
|
||||||
os.makedirs(report_dir, exist_ok=True)
|
os.makedirs(report_dir, exist_ok=True)
|
||||||
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
|
||||||
report_json_path = os.path.join(report_dir, f"eda_db_{ts}.json")
|
report_json_path = os.path.join(report_dir, f"eda_db_{ts}.json")
|
||||||
report_md_path = os.path.join(report_dir, f"eda_db_{ts}.md")
|
report_md_path = os.path.join(report_dir, f"eda_db_{ts}.md")
|
||||||
with open(report_json_path, "w", encoding="utf-8") as fh:
|
with open(report_json_path, "w", encoding="utf-8") as fh:
|
||||||
@@ -219,11 +228,23 @@ def profile_database(
|
|||||||
with open(report_md_path, "w", encoding="utf-8") as fh:
|
with open(report_md_path, "w", encoding="utf-8") as fh:
|
||||||
fh.write(_render_db_markdown(db_profile))
|
fh.write(_render_db_markdown(db_profile))
|
||||||
|
|
||||||
|
# PDF DB-level (legible en movil): resumen de tablas + join graph. Se
|
||||||
|
# genera bajo demanda (emit_pdf) reusando el renderer relational del grupo.
|
||||||
|
if emit_pdf:
|
||||||
|
try:
|
||||||
|
os.makedirs(report_dir, exist_ok=True)
|
||||||
|
pdf_target = os.path.join(report_dir, f"eda_db_{ts}.pdf")
|
||||||
|
pres = render_eda_pdf_relational(db_profile, pdf_target)
|
||||||
|
report_pdf_path = pres.get("pdf_path")
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
report_pdf_path = None
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"db_profile": db_profile,
|
"db_profile": db_profile,
|
||||||
"report_md_path": report_md_path,
|
"report_md_path": report_md_path,
|
||||||
"report_json_path": report_json_path,
|
"report_json_path": report_json_path,
|
||||||
|
"report_pdf_path": report_pdf_path,
|
||||||
}
|
}
|
||||||
except Exception as e: # noqa: BLE001
|
except Exception as e: # noqa: BLE001
|
||||||
return {"status": "error", "error": str(e)}
|
return {"status": "error", "error": str(e)}
|
||||||
|
|||||||
@@ -165,3 +165,36 @@ def test_profile_database_writes_report(tmp_path):
|
|||||||
assert "# EDA base —" in md
|
assert "# EDA base —" in md
|
||||||
assert "## Relaciones inter-tabla" in md
|
assert "## Relaciones inter-tabla" in md
|
||||||
assert "```mermaid" in md
|
assert "```mermaid" in md
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_database_emit_pdf(tmp_path):
    """H9: emit_pdf=True makes profile_database produce a DB-level PDF.

    The PDF must exist, be non-empty and start with the %PDF header, in
    addition to the markdown + JSON reports.
    """
    base_dir = str(tmp_path)
    db_path = os.path.join(base_dir, "shop3.duckdb")
    _build_related_db(db_path)
    report_dir = os.path.join(base_dir, "reports")

    outcome = profile_database(
        db_path,
        report_dir=report_dir,
        write_report=True,
        emit_pdf=True,
    )

    assert outcome["status"] == "ok", outcome
    pdf_path = outcome.get("report_pdf_path")
    assert pdf_path is not None
    assert os.path.exists(pdf_path)
    assert os.path.getsize(pdf_path) > 0
    with open(pdf_path, "rb") as handle:
        assert handle.read(4) == b"%PDF"
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_database_emit_pdf_false_retrocompat(tmp_path):
    """Edge: with emit_pdf left at its default, no PDF path is reported.

    The call behaves as before the flag existed: status is ok and
    report_pdf_path is None.
    """
    base_dir = str(tmp_path)
    db_path = os.path.join(base_dir, "shop4.duckdb")
    _build_related_db(db_path)
    report_dir = os.path.join(base_dir, "reports")

    outcome = profile_database(db_path, report_dir=report_dir, write_report=True)

    assert outcome["status"] == "ok", outcome
    assert outcome.get("report_pdf_path") is None
|
||||||
|
|||||||
Reference in New Issue
Block a user