763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
228 lines
8.8 KiB
Python
228 lines
8.8 KiB
Python
"""profile_database — orquestador one-shot del grupo `eda` a nivel de BASE.
|
|
|
|
Pipeline impuro: perfila TODA una base DuckDB (todas las tablas o las indicadas)
|
|
componiendo el grupo de capacidad `eda` y, encima, infiere las relaciones FK
|
|
entre tablas y construye el join graph. Es la composicion canonica para "hazme
|
|
un EDA de esta base de datos": una sola llamada en vez de orquestar el perfil de
|
|
cada tabla + la inferencia de relaciones a mano.
|
|
|
|
Funciones del registry compuestas (NO se reimplementa su logica):
|
|
- profile_table : perfila UNA tabla end-to-end (a su vez compone el grupo eda).
|
|
- infer_fk_containment_duckdb : infiere FK candidatas por containment de valores.
|
|
- build_join_graph : grafo de relaciones inter-tabla + diagrama Mermaid.
|
|
- duckdb_list_tables : introspeccion "que tablas hay" (read-only).
|
|
- render_eda_markdown : report legible de un TableProfile.
|
|
|
|
Aporta una capa propia de AGREGACION A NIVEL DE BASE: ensambla un DatabaseProfile
|
|
con el resumen de cada tabla, los TableProfiles completos, las FK candidatas y el
|
|
join graph, y opcionalmente emite un report markdown DB-level (con un diagrama
|
|
Mermaid) + un JSON sidecar a disco.
|
|
|
|
Estilo dict-no-throw del grupo: nunca lanza; captura cualquier error y devuelve
|
|
{status:'error', error:str}. Los fallos por tabla individual se toleran: se anota
|
|
el error en errors[] y se sigue con las demas tablas.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from datetime import datetime, timezone
|
|
|
|
from datascience import (
|
|
build_join_graph,
|
|
infer_fk_containment_duckdb,
|
|
render_eda_markdown,
|
|
)
|
|
from infra import duckdb_list_tables
|
|
from pipelines.profile_table import profile_table
|
|
|
|
|
|
def _table_summary(prof: dict) -> dict:
|
|
"""Extrae el resumen de cabecera de un TableProfile para la vista DB-level."""
|
|
return {
|
|
"table": prof.get("table"),
|
|
"n_rows": prof.get("n_rows"),
|
|
"n_cols": prof.get("n_cols"),
|
|
"quality_score": prof.get("quality_score"),
|
|
"key_candidates": prof.get("key_candidates", []),
|
|
"type_breakdown": prof.get("type_breakdown", {}),
|
|
}
|
|
|
|
|
|
def _render_db_markdown(db_profile: dict) -> str:
|
|
"""Renderiza el report markdown a nivel de base.
|
|
|
|
Tabla resumen de tablas, tabla de relaciones inter-tabla (FK candidatas),
|
|
diagrama Mermaid del join graph, y un detalle por tabla reusando
|
|
render_eda_markdown sobre cada TableProfile completo.
|
|
"""
|
|
lines = []
|
|
lines.append(f"# EDA base — {db_profile.get('db_path')}")
|
|
lines.append("")
|
|
lines.append(f"- profiled_at: {db_profile.get('profiled_at')}")
|
|
lines.append(f"- n_tables: {db_profile.get('n_tables')}")
|
|
lines.append("")
|
|
|
|
# ## Tablas
|
|
lines.append("## Tablas")
|
|
lines.append("")
|
|
lines.append("| Tabla | Filas | Cols | Calidad | key_candidates |")
|
|
lines.append("|---|---|---|---|---|")
|
|
for t in db_profile.get("tables", []):
|
|
keys = ", ".join(t.get("key_candidates") or []) or "—"
|
|
lines.append(
|
|
f"| {t.get('table')} | {t.get('n_rows')} | {t.get('n_cols')} "
|
|
f"| {t.get('quality_score')} | {keys} |"
|
|
)
|
|
lines.append("")
|
|
|
|
# ## Relaciones inter-tabla
|
|
lines.append("## Relaciones inter-tabla")
|
|
lines.append("")
|
|
fks = db_profile.get("fk_candidates", [])
|
|
if fks:
|
|
lines.append("| From | To | Inclusion | Cardinalidad |")
|
|
lines.append("|---|---|---|---|")
|
|
for fk in fks:
|
|
frm = f"{fk.get('from_table')}.{fk.get('from_col')}"
|
|
to = f"{fk.get('to_table')}.{fk.get('to_col')}"
|
|
inc = fk.get("inclusion")
|
|
inc_s = f"{inc:.3f}" if isinstance(inc, (int, float)) else str(inc)
|
|
lines.append(f"| {frm} | {to} | {inc_s} | {fk.get('cardinality')} |")
|
|
else:
|
|
lines.append("_Sin relaciones FK candidatas detectadas._")
|
|
lines.append("")
|
|
|
|
# ## Diagrama
|
|
lines.append("## Diagrama")
|
|
lines.append("")
|
|
mermaid = (db_profile.get("join_graph") or {}).get("mermaid", "")
|
|
lines.append("```mermaid")
|
|
lines.append(mermaid)
|
|
lines.append("```")
|
|
lines.append("")
|
|
|
|
# ## Detalle por tabla
|
|
lines.append("## Detalle por tabla")
|
|
lines.append("")
|
|
for prof in db_profile.get("table_profiles", []):
|
|
lines.append(render_eda_markdown(prof))
|
|
lines.append("")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def profile_database(
    db_path: str,
    tables: list = None,
    sample: int = 5000,
    report_dir: str = "reports",
    write_report: bool = True,
    min_inclusion: float = 0.9,
) -> dict:
    """Profile a whole DuckDB database plus its inter-table relationships.

    Steps: resolve the table list, profile every table with ``profile_table``,
    infer candidate FKs with ``infer_fk_containment_duckdb``, build the join
    graph with ``build_join_graph``, assemble the DatabaseProfile and,
    optionally, write a Markdown report and a JSON sidecar.

    Failures of an individual table, of FK inference or of the join-graph step
    do not abort the run: whether the callee returns an error status or raises,
    the problem is recorded in ``errors`` and the remaining work continues, so
    already-computed profiles are never discarded.

    Args:
        db_path: path to the DuckDB file.
        tables: list of table names to profile. ``None`` (default) uses the
            tables reported by ``duckdb_list_tables(db_path)``.
        sample: forwarded to ``profile_table`` as ``sample``. Default 5000.
        report_dir: directory where the DB-level reports are written when
            ``write_report`` is true. Created if it does not exist.
            Default ``"reports"``.
        write_report: if True (default), write a Markdown report and a JSON
            sidecar, both timestamped, into ``report_dir``. If False nothing is
            written and both returned paths are ``None``.
        min_inclusion: forwarded to ``infer_fk_containment_duckdb`` as
            ``min_inclusion``. Default 0.9.

    Returns:
        A dict; this function never raises. On success::

            {status: 'ok', db_profile: <DatabaseProfile>,
             report_md_path: str | None, report_json_path: str | None}

        On failure: ``{status: 'error', error: str}``. This covers a failed
        table listing, a ``tables`` argument that is neither a list nor
        ``None``, and any error while serializing or writing the reports.

        DatabaseProfile = {
            db_path, profiled_at, n_tables,
            tables: [{table, n_rows, n_cols, quality_score, key_candidates,
                      type_breakdown}, ...],
            table_profiles: [<full TableProfile>, ...],
            fk_candidates: [...],
            join_graph: {nodes, edges, mermaid, hubs},
            errors: [{table, error} | {step, error}, ...]
        }

        ``n_tables`` counts only the tables that were profiled successfully.
    """
    try:
        # 1) Resolve the table list.
        if tables is None:
            listing = duckdb_list_tables(db_path)
            if listing.get("status") != "ok":
                return {
                    "status": "error",
                    "error": listing.get("error", "list failed"),
                }
            tables = listing.get("tables", [])

        if not isinstance(tables, list):
            return {"status": "error", "error": "tables debe ser una lista o None"}

        errors = []
        table_profiles = []
        table_summaries = []

        # 2) Profile every table. The per-table try/except makes a raising
        #    profile_table (or a malformed result) cost only that table instead
        #    of aborting the whole run through the outer handler.
        for table in tables:
            try:
                result = profile_table(
                    db_path, table, sample=sample, write_report=False
                )
                if result.get("status") != "ok":
                    errors.append(
                        {
                            "table": table,
                            "error": result.get("error", "profile failed"),
                        }
                    )
                    continue
                prof = result["profile"]
                summary = _table_summary(prof)
            except Exception as e:  # noqa: BLE001
                errors.append({"table": table, "error": str(e)})
                continue
            # Appended together so both lists always stay index-aligned.
            table_profiles.append(prof)
            table_summaries.append(summary)

        # 3) Infer candidate FKs by containment. An exception is treated the
        #    same way as an error status: no candidates, error recorded.
        fk_candidates = []
        try:
            fk = infer_fk_containment_duckdb(
                db_path, tables=tables, min_inclusion=min_inclusion
            )
            if fk.get("status") == "ok":
                fk_candidates = fk.get("fk_candidates") or []
            else:
                errors.append(
                    {"step": "infer_fk", "error": fk.get("error", "fk failed")}
                )
        except Exception as e:  # noqa: BLE001
            errors.append({"step": "infer_fk", "error": str(e)})

        # 4) Build the join graph. If the builder raises, fall back to an empty
        #    graph that still carries the documented keys.
        try:
            graph = build_join_graph(fk_candidates, tables=tables)
        except Exception as e:  # noqa: BLE001
            graph = {"nodes": [], "edges": [], "mermaid": "", "hubs": []}
            errors.append({"step": "join_graph", "error": str(e)})

        # 5) Assemble the DatabaseProfile. A single clock read feeds both
        #    `profiled_at` and the report file names so they always agree.
        now = datetime.now(timezone.utc)
        db_profile = {
            "db_path": db_path,
            "profiled_at": now.isoformat(),
            "n_tables": len(table_profiles),
            "tables": table_summaries,
            "table_profiles": table_profiles,
            "fk_candidates": fk_candidates,
            "join_graph": graph,
            "errors": errors,
        }

        # 6) Optional reports.
        report_md_path = None
        report_json_path = None
        if write_report:
            # Produce both payloads before touching disk, so a serialization or
            # rendering failure cannot leave a half-written report pair behind.
            json_text = json.dumps(
                db_profile, ensure_ascii=False, indent=1, default=str
            )
            md_text = _render_db_markdown(db_profile)

            os.makedirs(report_dir, exist_ok=True)
            ts = now.strftime("%Y%m%d-%H%M%S")
            report_json_path = os.path.join(report_dir, f"eda_db_{ts}.json")
            report_md_path = os.path.join(report_dir, f"eda_db_{ts}.md")
            with open(report_json_path, "w", encoding="utf-8") as fh:
                fh.write(json_text)
            with open(report_md_path, "w", encoding="utf-8") as fh:
                fh.write(md_text)

        return {
            "status": "ok",
            "db_profile": db_profile,
            "report_md_path": report_md_path,
            "report_json_path": report_json_path,
        }
    except Exception as e:  # noqa: BLE001
        # Never-raise contract: anything unexpected becomes an error result.
        return {"status": "error", "error": str(e)}
|