c4cff5ed5b
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
201 lines
7.3 KiB
Python
201 lines
7.3 KiB
Python
"""Tests para profile_database — perfilado de una base DuckDB + relaciones."""
|
|
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
|
|
import duckdb
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from pipelines.profile_database import profile_database
|
|
|
|
|
|
def _build_related_db(path: str) -> None:
    """Create a DuckDB file at ``path`` holding two related tables.

    ``customers`` receives four rows keyed by ``id``; ``orders`` receives six
    rows whose ``customer_id`` values all occur in ``customers.id``. The tests
    rely on that containment to expect an
    ``orders.customer_id -> customers.id`` foreign-key candidate.
    """
    statements = (
        "CREATE TABLE customers (id INTEGER, name VARCHAR, city VARCHAR)",
        (
            "INSERT INTO customers VALUES "
            "(1,'Ana','Madrid'),(2,'Luis','Sevilla'),"
            "(3,'Marta','Bilbao'),(4,'Jon','Vigo')"
        ),
        "CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, total DOUBLE)",
        (
            "INSERT INTO orders VALUES "
            "(10,1,99.5),(11,1,12.0),(12,2,45.0),"
            "(13,3,7.25),(14,4,200.0),(15,2,33.3)"
        ),
    )
    db = duckdb.connect(path)
    try:
        # Run the DDL/DML in declaration order: each INSERT needs its table.
        for sql in statements:
            db.execute(sql)
    finally:
        # Release the connection even if one of the statements fails.
        db.close()
|
|
|
|
|
|
def test_profile_database_two_related_tables():
    """Profile a two-table database without writing any report files."""
    with tempfile.TemporaryDirectory() as workdir:
        database_file = os.path.join(workdir, "shop.duckdb")
        _build_related_db(database_file)

        result = profile_database(database_file, write_report=False)

        # Status is ok and exactly two tables were profiled.
        assert result["status"] == "ok", result
        db_profile = result["db_profile"]
        assert db_profile["n_tables"] == 2

        # Full table profiles are returned for both tables.
        table_profiles = db_profile["table_profiles"]
        assert len(table_profiles) == 2
        names = {entry["table"] for entry in table_profiles}
        assert names == {"customers", "orders"}

        # The orders.customer_id -> customers.id relationship is detected.
        candidates = db_profile["fk_candidates"]
        wanted = ("orders", "customer_id", "customers", "id")
        seen = [
            (
                cand.get("from_table"),
                cand.get("from_col"),
                cand.get("to_table"),
                cand.get("to_col"),
            )
            for cand in candidates
        ]
        assert wanted in seen, candidates

        # The join graph carries a mermaid diagram.
        join_graph = db_profile["join_graph"]
        assert "mermaid" in join_graph
        diagram = join_graph["mermaid"]
        assert isinstance(diagram, str)
        assert diagram.startswith("graph LR")

        # No report paths are reported when write_report=False.
        assert result["report_md_path"] is None
        assert result["report_json_path"] is None
|
|
|
|
|
|
def test_profile_database_excluye_views(tmp_path):
    """Views must be excluded from the set of profiled tables."""
    # Regression H5: a VIEW is not a real table. profile_database must profile
    # only the BASE TABLEs and must not count VIEWs (they inflate n_tables and
    # multiply false FKs, since they are copies of base-table columns).
    db_path = os.path.join(str(tmp_path), "withviews.duckdb")
    _build_related_db(db_path)
    con = duckdb.connect(db_path)
    try:
        con.execute("CREATE VIEW customers_v AS SELECT id, name FROM customers")
        con.execute("CREATE VIEW orders_v AS SELECT order_id, total FROM orders")
    finally:
        # Close the handle even if a CREATE VIEW fails, mirroring
        # _build_related_db, so the connection is never leaked.
        con.close()

    res = profile_database(db_path, write_report=False)

    assert res["status"] == "ok", res
    prof = res["db_profile"]
    # Only the 2 base tables; the 2 views are left out.
    assert prof["n_tables"] == 2
    profiled = {tp["table"] for tp in prof["table_profiles"]}
    assert profiled == {"customers", "orders"}
    # Redundant with the equality above, but kept so a regression names the
    # offending view directly in the failure output.
    assert "customers_v" not in profiled
    assert "orders_v" not in profiled
|
|
|
|
|
|
def test_profile_database_attach_sqlite_no_usa_sqlite_master(tmp_path):
    """Materialize a SQLite database into DuckDB via ATTACH and profile it."""
    # Regression H14: materialize a SQLite database via ATTACH (using
    # information_schema, not sqlite_master) and profile it with
    # profile_database without failing. Guards against the original
    # 'sqlite_master does not exist' bug.
    import sqlite3

    sqlite_path = os.path.join(str(tmp_path), "shop.sqlite")
    sconn = sqlite3.connect(sqlite_path)
    try:
        sconn.execute("CREATE TABLE customers (id INTEGER PRIMARY KEY, name TEXT)")
        sconn.execute("INSERT INTO customers VALUES (1,'Ana'),(2,'Luis'),(3,'Marta')")
        sconn.execute(
            "CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, total REAL)"
        )
        sconn.execute(
            "INSERT INTO orders VALUES (10,1,99.5),(11,2,12.0),(12,3,7.25),(13,1,5.0)"
        )
        sconn.execute("CREATE VIEW big_orders AS SELECT * FROM orders WHERE total > 10")
        sconn.commit()
    finally:
        # Always release the SQLite file, even if the fixture setup fails.
        sconn.close()

    ddb_path = os.path.join(str(tmp_path), "shop_mat.duckdb")
    con = duckdb.connect(ddb_path)
    try:
        con.execute("INSTALL sqlite")
        con.execute("LOAD sqlite")
        # The path is embedded in a SQL string literal; double any single
        # quote so a temp directory containing an apostrophe cannot break it.
        attach_target = sqlite_path.replace("'", "''")
        con.execute(f"ATTACH '{attach_target}' AS src (TYPE sqlite)")
        rows = con.execute(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_catalog='src' AND table_type='BASE TABLE' "
            "AND table_name NOT LIKE 'sqlite_%'"
        ).fetchall()
        # Copy each base table into the DuckDB file under the same name.
        for (name,) in rows:
            con.execute(f'CREATE TABLE "{name}" AS SELECT * FROM src."{name}"')
        con.execute("DETACH src")
    finally:
        # Close the DuckDB handle on every path before profiling reopens it.
        con.close()

    res = profile_database(ddb_path, write_report=False)
    assert res["status"] == "ok", res
    prof = res["db_profile"]
    # Only the 2 materialized base tables (the VIEW was not materialized).
    profiled = {tp["table"] for tp in prof["table_profiles"]}
    assert profiled == {"customers", "orders"}
    # The orders -> customers FK must be detectable.
    assert any(
        fk.get("from_table") == "orders" and fk.get("to_table") == "customers"
        for fk in prof["fk_candidates"]
    ), prof["fk_candidates"]
|
|
|
|
|
|
def test_profile_database_writes_report(tmp_path):
    """With write_report=True both the markdown and JSON reports are written."""
    db_path = os.path.join(str(tmp_path), "shop2.duckdb")
    _build_related_db(db_path)
    report_dir = os.path.join(str(tmp_path), "reports")

    res = profile_database(db_path, report_dir=report_dir, write_report=True)

    assert res["status"] == "ok", res
    assert res["report_md_path"] is not None
    assert res["report_json_path"] is not None
    assert os.path.exists(res["report_md_path"])
    assert os.path.exists(res["report_json_path"])
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to garbage collection.
    with open(res["report_md_path"], encoding="utf-8") as fh:
        md = fh.read()
    assert "# EDA base —" in md
    assert "## Relaciones inter-tabla" in md
    assert "```mermaid" in md
|
|
|
|
|
|
def test_profile_database_emit_pdf(tmp_path):
    """With emit_pdf=True a non-empty file starting with %PDF is reported."""
    # H9: with emit_pdf=True, profile_database generates a DB-level PDF
    # (>0 bytes, %PDF header) in addition to the markdown + JSON.
    base_dir = str(tmp_path)
    database_file = os.path.join(base_dir, "shop3.duckdb")
    _build_related_db(database_file)
    output_dir = os.path.join(base_dir, "reports")

    result = profile_database(
        database_file,
        report_dir=output_dir,
        write_report=True,
        emit_pdf=True,
    )

    assert result["status"] == "ok", result
    pdf_path = result.get("report_pdf_path")
    assert pdf_path is not None
    assert os.path.exists(pdf_path)
    assert os.path.getsize(pdf_path) > 0
    # Check the magic bytes rather than parsing the whole document.
    with open(pdf_path, "rb") as stream:
        magic = stream.read(4)
        assert magic == b"%PDF"
|
|
|
|
|
|
def test_profile_database_emit_pdf_false_retrocompat(tmp_path):
    """Without emit_pdf the result carries no PDF path."""
    # Edge case: emit_pdf=False (the default) behaves as before — no PDF is
    # generated and report_pdf_path is None.
    base_dir = str(tmp_path)
    database_file = os.path.join(base_dir, "shop4.duckdb")
    _build_related_db(database_file)
    output_dir = os.path.join(base_dir, "reports")

    result = profile_database(
        database_file,
        report_dir=output_dir,
        write_report=True,
    )

    assert result["status"] == "ok", result
    pdf_path = result.get("report_pdf_path")
    assert pdf_path is None
|