Files
fn_registry/python/functions/pipelines/profile_table_test.py
T
Egutierrez caf8c25d99 fix(eda): bugs de bajo riesgo del benchmark (H1,H5,H12,H13,H14) + tests faltantes
- H1: render_eda_markdown ya no aplica doble x100 a outlier_pct (336% -> real)
- H5: profile_database filtra base_tables_only (excluye VIEWs; sakila 21->16)
- H12: suggest_reexpression salta columnas no-continuas
- H13: to_returns/profile_table elige retornos (financiera) vs diferencias (fisica)
- H14: test de regresion ATTACH sqlite via information_schema
- +8 tests de las funciones eda nuevas (acf_pacf, adf_kpss, ...). 77 tests verdes
- L/M (H2,H3,H4,H6,H7,H8,H9,H10,H11) quedan en issues 0174-0177 para revision

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 03:51:11 +02:00

189 lines
6.9 KiB
Python

"""Tests para profile_table — pipeline EDA one-shot del grupo `eda`.
Crea una DuckDB temporal con tres columnas representativas:
- id_str: enteros guardados como VARCHAR ('10','20',...) -> debe promocionarse
a inferred_type "numeric" y recibir un bloque col["numeric"].
- precio: numerica nativa (DOUBLE).
- categoria: categorica textual.
Luego corre profile_table(write_report=False) y verifica el contrato.
"""
import os
import tempfile
import duckdb
from pipelines.profile_table import (
_is_continuous_for_reexpr,
_looks_financial,
profile_table,
)
# --- H12: re-expresión solo para columnas continuas -------------------------
def test_is_continuous_for_reexpr_baja_cardinalidad():
    """Binary and low-cardinality ordinal columns are rejected as non-continuous."""
    low_cardinality_cases = (
        # Binary column: two distinct levels.
        ({"distinct_count": 2, "flags": []}, [0.0, 1.0, 0.0, 1.0]),
        # Ordinal column: three distinct levels.
        ({"distinct_count": 3, "flags": []}, [1.0, 2.0, 3.0, 2.0]),
    )
    for column_meta, sample in low_cardinality_cases:
        assert _is_continuous_for_reexpr(column_meta, sample) is False
def test_is_continuous_for_reexpr_id_entero():
    """An all-integer column flagged ``possible_id`` is rejected as non-continuous."""
    column_meta = {"distinct_count": 200, "flags": ["possible_id"]}
    # 1.0, 2.0, ..., 200.0: every value is a whole number.
    integer_ids = list(map(float, range(1, 201)))
    assert _is_continuous_for_reexpr(column_meta, integer_ids) is False
def test_is_continuous_for_reexpr_float_continuo():
    """A high-cardinality float column is continuous even when flagged ``possible_id``."""
    column_meta = {"distinct_count": 200, "flags": ["possible_id"]}
    # Multiples of 1.7 include non-integer values, so this is not an integer id.
    fractional_values = [step * 1.7 for step in range(200)]
    assert _is_continuous_for_reexpr(column_meta, fractional_values) is True
def test_reexpression_solo_para_columnas_continuas():
    """Only the continuous column of a mixed table carries a ``reexpression`` block.

    Builds a 300-row DuckDB table with an integer id (``pid``), a binary
    column (``surv``), a three-level ordinal (``pclass``) and a fractional
    ``fare`` column, profiles it with ``write_report=False`` and checks which
    column profiles expose ``reexpression``.
    """
    import shutil  # local import: only needed for the cleanup below

    tmp_dir = tempfile.mkdtemp(prefix="reexpr_test_")
    try:
        db_path = os.path.join(tmp_dir, "t.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute(
                "CREATE TABLE t (pid INTEGER, surv INTEGER, pclass INTEGER, fare DOUBLE)"
            )
            con.execute(
                "INSERT INTO t SELECT i, i%2, (i%3)+1, ((i*1.7)%50)+0.3 "
                "FROM range(300) tbl(i)"
            )
        finally:
            # Close the connection even if table setup fails, so the database
            # file is not left open.
            con.close()

        r = profile_table(db_path, "t", write_report=False)
        assert r["status"] == "ok", r
        prof = r["profile"]
        assert _col(prof, "pid").get("reexpression") is None  # integer id
        assert _col(prof, "surv").get("reexpression") is None  # binary
        assert _col(prof, "pclass").get("reexpression") is None  # low-cardinality ordinal
        assert _col(prof, "fare").get("reexpression") is not None  # continuous
    finally:
        # Best-effort removal so repeated test runs do not accumulate temp dirs.
        shutil.rmtree(tmp_dir, ignore_errors=True)
# --- H13: retornos (financiera) vs diferencias (física) ---------------------
def test_looks_financial_por_nombre_y_semantic():
    """``_looks_financial`` is checked against column names and a ``semantic_type``."""
    expectations = (
        ({"name": "Close"}, True),
        ({"name": "Adj Close"}, True),
        ({"name": "Volume"}, True),
        ({"name": "precio_cierre"}, True),
        ({"name": "temp_max"}, False),
        ({"name": "precipitation"}, False),
        # Non-financial name, but the semantic type is "currency".
        ({"name": "caudal", "semantic_type": "currency"}, True),
    )
    for column_meta, expected in expectations:
        assert _looks_financial(column_meta) is expected
def _make_series_db(value_col: str) -> str:
    """Create a temporary DuckDB file holding a strictly increasing level series.

    The table ``s`` has an INTEGER ``ts`` column (0..79) and a DOUBLE column
    named ``value_col``. Levels start from 100.0 and grow by a deterministic,
    strictly positive increment at every step.

    Args:
        value_col: Name of the value column. Embedded double quotes are
            doubled so that any name forms a valid quoted SQL identifier.

    Returns:
        Path of the created DuckDB file. The enclosing temporary directory
        is not removed by this helper.
    """
    tmp_dir = tempfile.mkdtemp(prefix="series_test_")
    db_path = os.path.join(tmp_dir, "s.duckdb")

    # Standard SQL identifier quoting: wrap in double quotes, double any inside.
    quoted_col = '"' + value_col.replace('"', '""') + '"'

    # Strictly positive levels with an upward trend.
    level = 100.0
    rows = []
    for t in range(80):
        level += 1.0 + (t % 7) * 0.3  # deterministic positive increments
        rows.append((t, level))

    con = duckdb.connect(db_path)
    try:
        con.execute(f"CREATE TABLE s (ts INTEGER, {quoted_col} DOUBLE)")
        con.executemany("INSERT INTO s VALUES (?, ?)", rows)
    finally:
        # Close the connection even if table creation or the insert fails.
        con.close()
    return db_path
def test_series_financiera_sugiere_retornos():
    """For a column named ``close``, a suggested level change must be of kind ``returns``."""
    result = profile_table(
        _make_series_db("close"), "s", run_series=True, write_report=False
    )
    assert result["status"] == "ok", result

    series_block = _col(result["profile"], "close").get("series")
    assert series_block is not None

    # The kind is only checked when ``levels_suggested`` is truthy.
    if series_block.get("levels_suggested"):
        assert series_block.get("levels_kind") == "returns"
def test_series_no_financiera_sugiere_diferencias():
    """For a column named ``temp_max``, a suggested level change must be ``differences``.

    Profiles the series built by ``_make_series_db`` with ``run_series=True``
    and inspects the ``series`` block of the column profile.
    """
    db_path = _make_series_db("temp_max")
    r = profile_table(db_path, "s", run_series=True, write_report=False)
    assert r["status"] == "ok", r
    s = _col(r["profile"], "temp_max").get("series")
    assert s is not None
    # The kind is only checked when ``levels_suggested`` is truthy.
    if s.get("levels_suggested"):
        assert s.get("levels_kind") == "differences"
        # For differences the returns block is not computed.
        # NOTE(review): this assert is assumed to belong inside the `if`;
        # confirm the intended nesting against the repository copy.
        assert "to_returns" not in s
def _make_db() -> str:
    """Create a temporary DuckDB file with the ``items`` test table.

    The table holds six rows and three columns: ``id_str`` (integers stored
    as VARCHAR), ``precio`` (native DOUBLE) and ``categoria`` (text labels).

    Returns:
        Path of the created DuckDB file. The enclosing temporary directory
        is not removed by this helper.
    """
    tmp_dir = tempfile.mkdtemp(prefix="profile_table_test_")
    db_path = os.path.join(tmp_dir, "t.duckdb")
    rows = [
        ("10", 9.5, "alfa"),
        ("20", 12.0, "beta"),
        ("30", 7.25, "alfa"),
        ("40", 15.75, "gamma"),
        ("50", 3.0, "beta"),
        ("60", 22.4, "alfa"),
    ]
    con = duckdb.connect(db_path)
    try:
        con.execute(
            "CREATE TABLE items ("
            " id_str VARCHAR,"  # integers stored as text
            " precio DOUBLE,"  # native numeric
            " categoria VARCHAR"  # categorical
            ")"
        )
        con.executemany("INSERT INTO items VALUES (?, ?, ?)", rows)
    finally:
        # Close the connection even if table creation or the insert fails.
        con.close()
    return db_path
def _col(profile: dict, name: str) -> dict:
return next(c for c in profile["columns"] if c["name"] == name)
def test_varchar_integer_promotes_to_numeric():
    """Profile the ``items`` table and check the type-promotion contract.

    Covers the result envelope (status and report paths), the per-column
    blocks for ``id_str``, ``precio`` and ``categoria``, and the table-level
    ``key_candidates``, ``quality_score`` and ``type_breakdown`` fields.
    """
    result = profile_table(_make_db(), "items", sample=5000, write_report=False)

    # Status is ok and no report paths are returned.
    assert result["status"] == "ok", result
    assert result["report_md_path"] is None
    assert result["report_json_path"] is None

    profile = result["profile"]

    # The integer-valued VARCHAR column is promoted to numeric with a numeric block.
    id_profile = _col(profile, "id_str")
    assert id_profile["inferred_type"] == "numeric", id_profile["inferred_type"]
    id_numeric = id_profile["numeric"]
    assert id_numeric is not None
    assert id_numeric["min"] == 10.0
    assert id_numeric["max"] == 60.0

    # The native numeric column stays numeric and keeps its block.
    price_profile = _col(profile, "precio")
    assert price_profile["inferred_type"] == "numeric"
    assert price_profile["numeric"] is not None

    # The categorical column gets its categorical block.
    category_profile = _col(profile, "categoria")
    assert category_profile["inferred_type"] in ("categorical", "text")
    category_block = category_profile["categorical"]
    assert category_block is not None
    assert category_block["mode"] == "alfa"

    # key_candidates is a list; quality_score exists for table and column.
    assert isinstance(profile["key_candidates"], list)
    assert profile["quality_score"] is not None
    assert id_profile["quality_score"] is not None

    # The recomputed type_breakdown reflects the promotion (>= 2 numeric).
    assert profile["type_breakdown"]["numeric"] >= 2