"""Tests para profile_database — perfilado de una base DuckDB + relaciones.""" import os import sys import tempfile import duckdb sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) from pipelines.profile_database import profile_database def _build_related_db(path: str) -> None: """Crea una DuckDB con 2 tablas relacionadas: customers <- orders. customers.id es clave; orders.customer_id contiene solo ids de customers, de modo que orders.customer_id -> customers.id es una FK detectable por containment. """ conn = duckdb.connect(path) try: conn.execute( "CREATE TABLE customers (id INTEGER, name VARCHAR, city VARCHAR)" ) conn.execute( "INSERT INTO customers VALUES " "(1,'Ana','Madrid'),(2,'Luis','Sevilla')," "(3,'Marta','Bilbao'),(4,'Jon','Vigo')" ) conn.execute( "CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, total DOUBLE)" ) conn.execute( "INSERT INTO orders VALUES " "(10,1,99.5),(11,1,12.0),(12,2,45.0)," "(13,3,7.25),(14,4,200.0),(15,2,33.3)" ) finally: conn.close() def test_profile_database_two_related_tables(): with tempfile.TemporaryDirectory() as d: db_path = os.path.join(d, "shop.duckdb") _build_related_db(db_path) res = profile_database(db_path, write_report=False) # status ok y dos tablas perfiladas assert res["status"] == "ok", res prof = res["db_profile"] assert prof["n_tables"] == 2 # los TableProfiles completos llegan para ambas tablas assert len(prof["table_profiles"]) == 2 profiled_tables = {tp["table"] for tp in prof["table_profiles"]} assert profiled_tables == {"customers", "orders"} # se detecta la relacion orders.customer_id -> customers.id fks = prof["fk_candidates"] assert any( fk.get("from_table") == "orders" and fk.get("from_col") == "customer_id" and fk.get("to_table") == "customers" and fk.get("to_col") == "id" for fk in fks ), fks # el join graph trae un diagrama mermaid graph = prof["join_graph"] assert "mermaid" in graph assert isinstance(graph["mermaid"], str) assert graph["mermaid"].startswith("graph LR") # no se reportan paths cuando write_report=False assert res["report_md_path"] is None assert res["report_json_path"] is None def test_profile_database_excluye_views(tmp_path): # Regresión H5: una VIEW no es una tabla real. profile_database debe perfilar # solo las BASE TABLE y no contar las VIEWs (inflan n_tables y multiplican FK # falsas, al ser copias de columnas de las tablas base). db_path = os.path.join(str(tmp_path), "withviews.duckdb") _build_related_db(db_path) con = duckdb.connect(db_path) con.execute("CREATE VIEW customers_v AS SELECT id, name FROM customers") con.execute("CREATE VIEW orders_v AS SELECT order_id, total FROM orders") con.close() res = profile_database(db_path, write_report=False) assert res["status"] == "ok", res prof = res["db_profile"] # Solo las 2 tablas base; las 2 views quedan fuera. assert prof["n_tables"] == 2 profiled = {tp["table"] for tp in prof["table_profiles"]} assert profiled == {"customers", "orders"} assert "customers_v" not in profiled assert "orders_v" not in profiled def test_profile_database_attach_sqlite_no_usa_sqlite_master(tmp_path): # Regresión H14: materializar una base SQLite vía ATTACH (information_schema, # no sqlite_master) y perfilarla con profile_database sin que falle. Blinda el # bug original 'sqlite_master does not exist'. import sqlite3 sqlite_path = os.path.join(str(tmp_path), "shop.sqlite") sconn = sqlite3.connect(sqlite_path) sconn.execute("CREATE TABLE customers (id INTEGER PRIMARY KEY, name TEXT)") sconn.execute("INSERT INTO customers VALUES (1,'Ana'),(2,'Luis'),(3,'Marta')") sconn.execute( "CREATE TABLE orders (order_id INTEGER, customer_id INTEGER, total REAL)" ) sconn.execute( "INSERT INTO orders VALUES (10,1,99.5),(11,2,12.0),(12,3,7.25),(13,1,5.0)" ) sconn.execute("CREATE VIEW big_orders AS SELECT * FROM orders WHERE total > 10") sconn.commit() sconn.close() ddb_path = os.path.join(str(tmp_path), "shop_mat.duckdb") con = duckdb.connect(ddb_path) con.execute("INSTALL sqlite") con.execute("LOAD sqlite") con.execute(f"ATTACH '{sqlite_path}' AS src (TYPE sqlite)") rows = con.execute( "SELECT table_name FROM information_schema.tables " "WHERE table_catalog='src' AND table_type='BASE TABLE' " "AND table_name NOT LIKE 'sqlite_%'" ).fetchall() for (name,) in rows: con.execute(f'CREATE TABLE "{name}" AS SELECT * FROM src."{name}"') con.execute("DETACH src") con.close() res = profile_database(ddb_path, write_report=False) assert res["status"] == "ok", res prof = res["db_profile"] # Solo las 2 tablas base materializadas (la VIEW no se materializó). profiled = {tp["table"] for tp in prof["table_profiles"]} assert profiled == {"customers", "orders"} # FK orders.customer_id -> customers.id detectable. assert any( fk.get("from_table") == "orders" and fk.get("to_table") == "customers" for fk in prof["fk_candidates"] ), prof["fk_candidates"] def test_profile_database_writes_report(tmp_path): db_path = os.path.join(str(tmp_path), "shop2.duckdb") _build_related_db(db_path) report_dir = os.path.join(str(tmp_path), "reports") res = profile_database(db_path, report_dir=report_dir, write_report=True) assert res["status"] == "ok", res assert res["report_md_path"] is not None assert res["report_json_path"] is not None assert os.path.exists(res["report_md_path"]) assert os.path.exists(res["report_json_path"]) md = open(res["report_md_path"], encoding="utf-8").read() assert "# EDA base —" in md assert "## Relaciones inter-tabla" in md assert "```mermaid" in md