Files
fn_registry/python/functions/datascience/summarize_table_duckdb_test.py
T
egutierrez 763e06c127 feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00

151 lines
5.3 KiB
Python

"""Tests para summarize_table_duckdb."""
import duckdb
import pytest
from .summarize_table_duckdb import summarize_table_duckdb
@pytest.fixture
def db(tmp_path):
    """Create a temporary DuckDB file holding a small ``ventas`` table.

    The table has four rows and four columns: a unique, non-null integer
    ``id``; a low-cardinality string ``region``; a numeric ``total`` with one
    NULL; and a constant string ``pais``.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        The path of the created database file, as a string.
    """
    path = str(tmp_path / "eda_test.duckdb")
    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE ventas ("
            " id INTEGER,"  # unique, no nulls -> possible_id
            " region VARCHAR,"  # low-cardinality categorical
            " total DOUBLE,"  # numeric with one null
            " pais VARCHAR"  # constant
            ")"
        )
        con.execute(
            "INSERT INTO ventas VALUES "
            "(1, 'norte', 120.5, 'ES'), "
            "(2, 'sur', 80.0, 'ES'), "
            "(3, 'norte', NULL, 'ES'), "
            "(4, 'este', 45.25, 'ES')"
        )
    finally:
        # Close the connection even when a statement fails, so a broken
        # setup does not leave an open connection behind.
        con.close()
    return path
def test_shape_y_metadatos_tabla(db):
    """Check the TableProfile key set and the table-level metadata values."""
    result = summarize_table_duckdb(db, "ventas")
    assert result["status"] == "ok"

    profile = result["profile"]

    # Every TableProfile key must be present.
    expected_keys = (
        "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
        "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols",
        "null_cell_pct", "type_breakdown", "columns", "correlations",
        "key_candidates", "quality_score", "llm", "models",
    )
    for key in expected_keys:
        assert key in profile, f"falta clave {key} en TableProfile"

    # Scalar metadata, checked in the same order as the fields are listed.
    expected_values = {
        "table": "ventas",
        "source": "duckdb",
        "n_rows": 4,
        "n_cols": 4,
    }
    for field, expected in expected_values.items():
        assert profile[field] == expected

    assert len(profile["columns"]) == 4
    assert profile["key_candidates"] == []

    # These fields are expected to be None in this function's output.
    for field in ("quality_score", "correlations"):
        assert profile[field] is None
def test_column_profile_shape(db):
    """Check the ColumnProfile key set and the per-column values for ``ventas``."""
    profile = summarize_table_duckdb(db, "ventas")["profile"]
    columns = profile["columns"]
    by_name = {c["name"]: c for c in columns}

    required_keys = (
        "name", "physical_type", "inferred_type", "semantic_type", "count",
        "n_rows", "null_count", "null_pct", "empty_count", "empty_pct",
        "distinct_count", "unique_pct", "flags", "quality_score",
        "numeric", "categorical", "datetime",
    )
    for col in columns:
        for key in required_keys:
            assert key in col, f"falta clave {key} en ColumnProfile {col['name']}"

    # id: numeric, no nulls, unique.
    id_col = by_name["id"]
    assert id_col["inferred_type"] == "numeric"
    assert id_col["null_count"] == 0
    assert id_col["count"] == 4
    assert id_col["distinct_count"] == 4
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical.
    region_col = by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3

    # total: numeric with one null, so the non-null count is 3.
    total_col = by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3
    stats = total_col["numeric"]
    assert stats is not None

    # SUMMARIZE populates min/max/mean/std/p25/p50/p75; the rest stays None.
    assert stats["min"] == pytest.approx(45.25)
    assert stats["max"] == pytest.approx(120.5)
    assert stats["mean"] is not None
    for stat in ("skew", "histogram", "p99"):
        assert stats[stat] is None

    # pais: constant -> "constant" flag, and listed in constant_cols.
    assert "constant" in by_name["pais"]["flags"]
    assert "pais" in profile["constant_cols"]
def test_distinct_no_excede_filas(db):
    """distinct_count never exceeds n_rows and unique_pct never exceeds 1.0.

    Regression: SUMMARIZE.approx_unique (HyperLogLog) overestimated and
    reported more distinct values than rows on small tables, inflating
    unique_pct above 1.0 and triggering false possible_id flags.
    """
    profile = summarize_table_duckdb(db, "ventas")["profile"]
    n_rows = profile["n_rows"]
    columns = profile["columns"]

    for col in columns:
        within_row_count = col["distinct_count"] <= n_rows
        assert within_row_count, (
            f"{col['name']}: distinct_count {col['distinct_count']} > n_rows {n_rows}"
        )

        within_unit_ratio = col["unique_pct"] <= 1.0
        assert within_unit_ratio, (
            f"{col['name']}: unique_pct {col['unique_pct']} > 1.0"
        )
def test_columna_unica_da_possible_id(db):
    """An all-unique column yields unique_pct == 1.0 and the possible_id flag."""
    profile = summarize_table_duckdb(db, "ventas")["profile"]
    columns_by_name = {column["name"]: column for column in profile["columns"]}

    # id: 4 distinct values over 4 rows, no nulls.
    id_column = columns_by_name["id"]
    assert id_column["distinct_count"] == 4
    assert id_column["unique_pct"] == 1.0
    assert "possible_id" in id_column["flags"]
def test_type_breakdown(db):
    """Check the type_breakdown buckets and their counts for ``ventas``."""
    breakdown = summarize_table_duckdb(db, "ventas")["profile"]["type_breakdown"]

    expected_buckets = {"numeric", "categorical", "datetime", "text", "boolean"}
    assert set(breakdown.keys()) == expected_buckets

    # The "text" count is intentionally not asserted, as in the key-set-only
    # check above.
    expected_counts = {
        "numeric": 2,  # id, total
        "categorical": 2,  # region, pais
        "datetime": 0,
        "boolean": 0,
    }
    for bucket, expected in expected_counts.items():
        assert breakdown[bucket] == expected
def test_tabla_invalida_devuelve_error(db):
    """A table name carrying an injected statement is reported as an error."""
    malicious_name = "ventas; DROP TABLE ventas"
    outcome = summarize_table_duckdb(db, malicious_name)

    assert outcome["status"] == "error"
    assert "invalido" in outcome["error"]
def test_tabla_inexistente_devuelve_error(db):
    """Profiling a table that is not in the database returns an error status."""
    outcome = summarize_table_duckdb(db, "no_existe")
    status = outcome["status"]
    assert status == "error"