feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,150 @@
|
||||
"""Tests para summarize_table_duckdb."""
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from .summarize_table_duckdb import summarize_table_duckdb
|
||||
|
||||
|
||||
@pytest.fixture
def db(tmp_path):
    """Create a temporary DuckDB file containing a small ``ventas`` table.

    The table holds four rows and four columns, each chosen to exercise a
    different profiling case: a unique, non-null integer ``id``; a
    low-cardinality ``region``; a numeric ``total`` with one NULL; and a
    constant ``pais``.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        The path of the database file, as a string.
    """
    path = str(tmp_path / "eda_test.duckdb")
    # The context manager closes the connection even when a statement
    # raises, so a failing setup does not leak an open handle.
    with duckdb.connect(path) as con:
        con.execute(
            "CREATE TABLE ventas ("
            " id INTEGER,"  # unique, no nulls -> possible_id
            " region VARCHAR,"  # low-cardinality categorical
            " total DOUBLE,"  # numeric with one NULL
            " pais VARCHAR"  # constant
            ")"
        )
        con.execute(
            "INSERT INTO ventas VALUES "
            "(1, 'norte', 120.5, 'ES'), "
            "(2, 'sur', 80.0, 'ES'), "
            "(3, 'norte', NULL, 'ES'), "
            "(4, 'este', 45.25, 'ES')"
        )
    return path
|
||||
|
||||
|
||||
def test_shape_y_metadatos_tabla(db):
    """Check the TableProfile key set and the table-level metadata of ``ventas``."""
    result = summarize_table_duckdb(db, "ventas")
    assert result["status"] == "ok"
    table_profile = result["profile"]

    # Every key a TableProfile is expected to expose.
    expected_keys = (
        "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
        "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols",
        "null_cell_pct", "type_breakdown", "columns", "correlations",
        "key_candidates", "quality_score", "llm", "models",
    )
    for key in expected_keys:
        assert key in table_profile, f"falta clave {key} en TableProfile"

    # Identity and dimensions of the profiled table.
    assert table_profile["table"] == "ventas"
    assert table_profile["source"] == "duckdb"
    assert table_profile["n_rows"] == 4
    assert table_profile["n_cols"] == 4
    assert len(table_profile["columns"]) == 4

    # Fields this test expects to be left unpopulated.
    assert table_profile["key_candidates"] == []
    for unset_field in ("quality_score", "correlations"):
        assert table_profile[unset_field] is None
|
||||
|
||||
|
||||
def test_column_profile_shape(db):
    """Check each ColumnProfile's key set and the per-column stats of ``ventas``."""
    table_profile = summarize_table_duckdb(db, "ventas")["profile"]
    columns_by_name = {entry["name"]: entry for entry in table_profile["columns"]}

    # Every key a ColumnProfile is expected to expose.
    expected_keys = (
        "name", "physical_type", "inferred_type", "semantic_type", "count",
        "n_rows", "null_count", "null_pct", "empty_count", "empty_pct",
        "distinct_count", "unique_pct", "flags", "quality_score",
        "numeric", "categorical", "datetime",
    )
    for col in table_profile["columns"]:
        for key in expected_keys:
            assert key in col, f"falta clave {key} en ColumnProfile {col['name']}"

    # id: numeric, no nulls, all values unique.
    id_col = columns_by_name["id"]
    assert id_col["inferred_type"] == "numeric"
    assert id_col["null_count"] == 0
    assert id_col["count"] == 4
    assert id_col["distinct_count"] == 4
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical.
    region_col = columns_by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3

    # total: numeric with one null, so the non-null count is 3.
    total_col = columns_by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3
    assert total_col["numeric"] is not None
    # SUMMARIZE fills min/max/mean/std/p25/p50/p75; the remaining stats stay None.
    numeric_stats = total_col["numeric"]
    assert numeric_stats["min"] == pytest.approx(45.25)
    assert numeric_stats["max"] == pytest.approx(120.5)
    assert numeric_stats["mean"] is not None
    assert numeric_stats["skew"] is None
    assert numeric_stats["histogram"] is None
    assert numeric_stats["p99"] is None

    # pais: constant -> carries the "constant" flag and is listed in constant_cols.
    assert "constant" in columns_by_name["pais"]["flags"]
    assert "pais" in table_profile["constant_cols"]
|
||||
|
||||
|
||||
def test_distinct_no_excede_filas(db):
    """Exact distinct_count: never above n_rows, and unique_pct never above 1.0.

    Regression: SUMMARIZE.approx_unique (HyperLogLog) overestimated and
    reported more distinct values than rows on small tables, inflating
    unique_pct past 1.0 and raising false possible_id flags.
    """
    table_profile = summarize_table_duckdb(db, "ventas")["profile"]
    row_total = table_profile["n_rows"]
    for column in table_profile["columns"]:
        assert column["distinct_count"] <= row_total, (
            f"{column['name']}: distinct_count {column['distinct_count']} > n_rows {row_total}"
        )
        assert column["unique_pct"] <= 1.0, (
            f"{column['name']}: unique_pct {column['unique_pct']} > 1.0"
        )
|
||||
|
||||
|
||||
def test_columna_unica_da_possible_id(db):
    """A column whose values are all unique -> unique_pct == 1.0 plus possible_id."""
    table_profile = summarize_table_duckdb(db, "ventas")["profile"]
    columns_by_name = {entry["name"]: entry for entry in table_profile["columns"]}

    # id: 4 distinct values over 4 rows, no nulls.
    id_col = columns_by_name["id"]
    assert id_col["distinct_count"] == 4
    assert id_col["unique_pct"] == 1.0
    assert "possible_id" in id_col["flags"]
|
||||
|
||||
|
||||
def test_type_breakdown(db):
    """Check the type_breakdown key set and the per-type column counts."""
    breakdown = summarize_table_duckdb(db, "ventas")["profile"]["type_breakdown"]

    expected_types = {"numeric", "categorical", "datetime", "text", "boolean"}
    assert set(breakdown.keys()) == expected_types

    assert breakdown["numeric"] == 2  # id, total
    assert breakdown["categorical"] == 2  # region, pais
    # The fixture table has no datetime or boolean columns.
    for absent_type in ("datetime", "boolean"):
        assert breakdown[absent_type] == 0
|
||||
|
||||
|
||||
def test_tabla_invalida_devuelve_error(db):
    """A table name carrying an extra statement yields an error mentioning 'invalido'."""
    malicious_name = "ventas; DROP TABLE ventas"
    outcome = summarize_table_duckdb(db, malicious_name)
    assert outcome["status"] == "error"
    assert "invalido" in outcome["error"]
|
||||
|
||||
|
||||
def test_tabla_inexistente_devuelve_error(db):
    """Profiling a table that is not in the database yields an error status."""
    outcome = summarize_table_duckdb(db, "no_existe")
    assert outcome["status"] == "error"
|
||||
Reference in New Issue
Block a user