Files
fn_registry/python/functions/datascience/summarize_table_duckdb_test.py
T
egutierrez a2074a0167 feat(eda): nueva fórmula de calidad de datos (report 2046) + capítulo calidad
Implementa el modelo de calidad del report 2046 en el grupo eda.

Score de columna: 0.6·completeness + 0.4·validity con renormalización por
aplicabilidad (si la validez no es medible —texto libre o columna 100% nula— el
score se basa solo en completeness). Validez = conformidad real al tipo: nativo
numérico/fecha/bool = 1.0; texto promovido a número/fecha = parse rate
(validity_rate); texto con semantic_type = match_rate; texto libre = no aplica.

Outliers, columnas constantes e identificadores salen del score a un bloque de
observaciones analíticas (no son defectos de calidad). Se elimina el doble
conteo de la falta de datos (mostly_null ya no castiga validez) y el bug de
escala de outliers (que además ya no entran en el score).

Score de dataset: 100·(0.85·cell_quality + 0.15·row_uniqueness) en vez de la
media simple. Se puebla duplicate_rows/duplicate_pct push-down en
summarize_table_duckdb (COUNT sobre DISTINCT *, sin RAM) para habilitar la
unicidad de registro; renormaliza a solo cell_quality si no se puede calcular.

Capítulo calidad (v2.0.0): intro de dos dimensiones (60/40) que declara que los
outliers no bajan el score; tabla de scores Columna|Calidad|Completitud|Validez
(sin Consistencia, n/a cuando no aplica); DOS tablas separadas (Problemas de
calidad vs Observaciones analíticas); resumen con Unicidad de registro; glosario
clicable de completitud, validez, unicidad de registro y calidad de datos.

Verificado: 123 tests verdes (automatic_eda + render_automatic_eda +
column_quality_score + summarize_table_duckdb + profile_table). Golden EDA de
titanic (run_models+run_llm) con score recomputado a mano, outliers separados en
observaciones y glosario clicable (5 links GOTO en el PDF).

column_quality_score v2.0.0, summarize_table_duckdb v1.1.0, profile_table v1.1.0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:10:23 +02:00

175 lines
6.2 KiB
Python

"""Tests para summarize_table_duckdb."""
import duckdb
import pytest
from .summarize_table_duckdb import summarize_table_duckdb
@pytest.fixture
def db(tmp_path):
    """Create a temporary DuckDB file containing the ``ventas`` table.

    The table holds four rows over four columns: an ``id`` that is unique and
    never NULL, a low-cardinality ``region``, a numeric ``total`` with one
    NULL, and a ``pais`` column whose value is the same in every row.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path to the database file, as a string.
    """
    path = str(tmp_path / "eda_test.duckdb")
    # Use the connection as a context manager so it is closed even when one of
    # the setup statements raises, instead of leaving the file handle open.
    with duckdb.connect(path) as con:
        con.execute(
            "CREATE TABLE ventas ("
            " id INTEGER,"  # unique, no nulls -> possible_id
            " region VARCHAR,"  # low-cardinality categorical
            " total DOUBLE,"  # numeric with one null
            " pais VARCHAR"  # constant
            ")"
        )
        con.execute(
            "INSERT INTO ventas VALUES "
            "(1, 'norte', 120.5, 'ES'), "
            "(2, 'sur', 80.0, 'ES'), "
            "(3, 'norte', NULL, 'ES'), "
            "(4, 'este', 45.25, 'ES')"
        )
    return path
def test_shape_y_metadatos_tabla(db):
    """Check the top-level keys and table metadata of the returned profile."""
    result = summarize_table_duckdb(db, "ventas")
    assert result["status"] == "ok"

    table_profile = result["profile"]

    # Every TableProfile key must be present.
    expected_keys = (
        "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
        "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols",
        "null_cell_pct", "type_breakdown", "columns", "correlations",
        "key_candidates", "quality_score", "llm", "models",
    )
    for key in expected_keys:
        assert key in table_profile, f"falta clave {key} en TableProfile"

    # Identity and dimensions of the profiled table.
    assert table_profile["table"] == "ventas"
    assert table_profile["source"] == "duckdb"
    assert table_profile["n_rows"] == 4
    assert table_profile["n_cols"] == 4
    assert len(table_profile["columns"]) == 4

    # Fields this function is expected to leave empty / unset.
    assert table_profile["key_candidates"] == []
    assert table_profile["quality_score"] is None
    assert table_profile["correlations"] is None
def test_duplicate_pct_sin_duplicados(db):
    """All rows distinct: the duplicate counters are zero, not None."""
    result = summarize_table_duckdb(db, "ventas")
    table_profile = result["profile"]

    assert table_profile["duplicate_rows"] == 0
    assert table_profile["duplicate_pct"] == 0.0
def test_duplicate_pct_con_duplicados(tmp_path):
    """Repeated rows: duplicate_rows / duplicate_pct are populated.

    Builds a two-column table with five rows, two of which repeat earlier
    rows, and checks the duplicate count and ratio reported in the profile.
    """
    path = str(tmp_path / "dups.duckdb")
    # Context manager guarantees the connection is closed even if the setup
    # statements raise, so the database file is never left open.
    with duckdb.connect(path) as con:
        con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")
        # 5 rows, 2 of them identical to others -> 2 duplicates out of 5 = 0.4.
        con.execute(
            "INSERT INTO t VALUES "
            "(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
        )

    profile = summarize_table_duckdb(path, "t")["profile"]
    assert profile["n_rows"] == 5
    assert profile["duplicate_rows"] == 2
    # The ratio is a computed float; compare with a tolerance rather than
    # relying on bit-exact equality.
    assert profile["duplicate_pct"] == pytest.approx(0.4)
def test_column_profile_shape(db):
    """Check the keys of every ColumnProfile and the values for each column."""
    profile = summarize_table_duckdb(db, "ventas")["profile"]
    columns = profile["columns"]
    by_name = {entry["name"]: entry for entry in columns}

    expected_keys = (
        "name", "physical_type", "inferred_type", "semantic_type", "count",
        "n_rows", "null_count", "null_pct", "empty_count", "empty_pct",
        "distinct_count", "unique_pct", "flags", "quality_score",
        "numeric", "categorical", "datetime",
    )
    for col in columns:
        for key in expected_keys:
            assert key in col, f"falta clave {key} en ColumnProfile {col['name']}"

    # id: numeric, no nulls, unique.
    id_col = by_name["id"]
    assert id_col["inferred_type"] == "numeric"
    assert id_col["null_count"] == 0
    assert id_col["count"] == 4
    assert id_col["distinct_count"] == 4
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical.
    region_col = by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3

    # total: numeric with one null, so the non-null count is 3.
    total_col = by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3
    numeric_stats = total_col["numeric"]
    assert numeric_stats is not None
    # SUMMARIZE fills min/max/mean/std/p25/p50/p75; the rest stays None.
    assert numeric_stats["min"] == pytest.approx(45.25)
    assert numeric_stats["max"] == pytest.approx(120.5)
    assert numeric_stats["mean"] is not None
    assert numeric_stats["skew"] is None
    assert numeric_stats["histogram"] is None
    assert numeric_stats["p99"] is None

    # pais: constant -> "constant" flag and listed in constant_cols.
    assert "constant" in by_name["pais"]["flags"]
    assert "pais" in profile["constant_cols"]
def test_distinct_no_excede_filas(db):
    """Exact distinct_count: never above n_rows, and unique_pct never above 1.0.

    Regression: SUMMARIZE.approx_unique (HyperLogLog) overestimated and
    reported more distinct values than rows on small tables, inflating
    unique_pct above 1.0 and triggering false possible_id flags.
    """
    table_profile = summarize_table_duckdb(db, "ventas")["profile"]
    n_rows = table_profile["n_rows"]

    for col in table_profile["columns"]:
        distinct = col["distinct_count"]
        assert distinct <= n_rows, (
            f"{col['name']}: distinct_count {col['distinct_count']} > n_rows {n_rows}"
        )

        uniqueness = col["unique_pct"]
        assert uniqueness <= 1.0, (
            f"{col['name']}: unique_pct {col['unique_pct']} > 1.0"
        )
def test_columna_unica_da_possible_id(db):
    """A column whose values are all unique -> unique_pct == 1.0 + possible_id."""
    result = summarize_table_duckdb(db, "ventas")
    columns_by_name = {entry["name"]: entry for entry in result["profile"]["columns"]}

    # id: 4 distinct values over 4 rows, no nulls.
    id_column = columns_by_name["id"]
    assert id_column["distinct_count"] == 4
    assert id_column["unique_pct"] == 1.0
    assert "possible_id" in id_column["flags"]
def test_type_breakdown(db):
    """type_breakdown exposes the five type buckets with the expected counts."""
    result = summarize_table_duckdb(db, "ventas")
    breakdown = result["profile"]["type_breakdown"]

    expected_buckets = {"numeric", "categorical", "datetime", "text", "boolean"}
    assert set(breakdown.keys()) == expected_buckets

    assert breakdown["numeric"] == 2  # id, total
    assert breakdown["categorical"] == 2  # region, pais
    assert breakdown["datetime"] == 0
    assert breakdown["boolean"] == 0
def test_tabla_invalida_devuelve_error(db):
    """A table name carrying an extra SQL statement yields an error result."""
    malicious_name = "ventas; DROP TABLE ventas"
    outcome = summarize_table_duckdb(db, malicious_name)

    assert outcome["status"] == "error"
    assert "invalido" in outcome["error"]
def test_tabla_inexistente_devuelve_error(db):
    """Profiling a table that is not in the database yields an error status."""
    outcome = summarize_table_duckdb(db, "no_existe")
    status = outcome["status"]
    assert status == "error"