"""Tests para summarize_table_duckdb.""" import duckdb import pytest from .summarize_table_duckdb import summarize_table_duckdb @pytest.fixture def db(tmp_path): """Crea una DuckDB temporal con numerica + categorica + nulls + id unico.""" path = str(tmp_path / "eda_test.duckdb") con = duckdb.connect(path) con.execute( "CREATE TABLE ventas (" " id INTEGER," # unico, sin nulls -> possible_id " region VARCHAR," # categorica baja cardinalidad " total DOUBLE," # numerica con un null " pais VARCHAR" # constante ")" ) con.execute( "INSERT INTO ventas VALUES " "(1, 'norte', 120.5, 'ES'), " "(2, 'sur', 80.0, 'ES'), " "(3, 'norte', NULL, 'ES'), " "(4, 'este', 45.25, 'ES')" ) con.close() return path def test_shape_y_metadatos_tabla(db): res = summarize_table_duckdb(db, "ventas") assert res["status"] == "ok" profile = res["profile"] # Claves del TableProfile presentes. for key in ( "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes", "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols", "null_cell_pct", "type_breakdown", "columns", "correlations", "key_candidates", "quality_score", "llm", "models", ): assert key in profile, f"falta clave {key} en TableProfile" assert profile["table"] == "ventas" assert profile["source"] == "duckdb" assert profile["n_rows"] == 4 assert profile["n_cols"] == 4 assert len(profile["columns"]) == 4 assert profile["key_candidates"] == [] assert profile["quality_score"] is None assert profile["correlations"] is None def test_column_profile_shape(db): profile = summarize_table_duckdb(db, "ventas")["profile"] by_name = {c["name"]: c for c in profile["columns"]} for col in profile["columns"]: for key in ( "name", "physical_type", "inferred_type", "semantic_type", "count", "n_rows", "null_count", "null_pct", "empty_count", "empty_pct", "distinct_count", "unique_pct", "flags", "quality_score", "numeric", "categorical", "datetime", ): assert key in col, f"falta clave {key} en ColumnProfile {col['name']}" # id: numerica, sin nulls, unica. assert by_name["id"]["inferred_type"] == "numeric" assert by_name["id"]["null_count"] == 0 assert by_name["id"]["count"] == 4 assert by_name["id"]["distinct_count"] == 4 assert "possible_id" in by_name["id"]["flags"] # region: categorica baja cardinalidad. assert by_name["region"]["inferred_type"] == "categorical" assert by_name["region"]["distinct_count"] == 3 # total: numerica con un null. count no-nulo = 3. total = by_name["total"] assert total["inferred_type"] == "numeric" assert total["null_count"] == 1 assert total["count"] == 3 assert total["numeric"] is not None # SUMMARIZE rellena min/max/mean/std/p25/p50/p75; el resto queda en None. assert total["numeric"]["min"] == pytest.approx(45.25) assert total["numeric"]["max"] == pytest.approx(120.5) assert total["numeric"]["mean"] is not None assert total["numeric"]["skew"] is None assert total["numeric"]["histogram"] is None assert total["numeric"]["p99"] is None # pais: constante -> flag constant + aparece en constant_cols. assert "constant" in by_name["pais"]["flags"] assert "pais" in profile["constant_cols"] def test_distinct_no_excede_filas(db): """distinct_count exacto: nunca supera n_rows ni unique_pct pasa de 1.0. Regresion: SUMMARIZE.approx_unique (HyperLogLog) sobreestimaba y reportaba mas distintos que filas en tablas pequenas, inflando unique_pct > 1.0 y disparando flags possible_id falsos. """ profile = summarize_table_duckdb(db, "ventas")["profile"] n_rows = profile["n_rows"] for col in profile["columns"]: assert col["distinct_count"] <= n_rows, ( f"{col['name']}: distinct_count {col['distinct_count']} > n_rows {n_rows}" ) assert col["unique_pct"] <= 1.0, ( f"{col['name']}: unique_pct {col['unique_pct']} > 1.0" ) def test_columna_unica_da_possible_id(db): """Una columna con todos los valores unicos -> unique_pct == 1.0 + possible_id.""" profile = summarize_table_duckdb(db, "ventas")["profile"] by_name = {c["name"]: c for c in profile["columns"]} # id: 4 valores distintos sobre 4 filas, sin nulls. idc = by_name["id"] assert idc["distinct_count"] == 4 assert idc["unique_pct"] == 1.0 assert "possible_id" in idc["flags"] def test_type_breakdown(db): profile = summarize_table_duckdb(db, "ventas")["profile"] tb = profile["type_breakdown"] assert set(tb.keys()) == { "numeric", "categorical", "datetime", "text", "boolean" } assert tb["numeric"] == 2 # id, total assert tb["categorical"] == 2 # region, pais assert tb["datetime"] == 0 assert tb["boolean"] == 0 def test_tabla_invalida_devuelve_error(db): res = summarize_table_duckdb(db, "ventas; DROP TABLE ventas") assert res["status"] == "error" assert "invalido" in res["error"] def test_tabla_inexistente_devuelve_error(db): res = summarize_table_duckdb(db, "no_existe") assert res["status"] == "error"