e142ef026d
Ronda 4 (verificada con re-corrida sobre los datasets afectados): - H2: stl_decompose deriva periodo de la frecuencia del indice (seattle period=365 seasonal_strength=0.84; fin del period=2 espurio) - H3+H10: infer_fk por senal de nombre (<X>Id->X.<X>Id) + excluir no-clave -> chinook 111->9 FK, todas reales, cero absurdas, 16-27x mas rapido; base intacta (flag off->111) - H6: association no computa eta2 si cardinalidad~=n (Ticket-Fare espurio fuera) - H7: id secuencial monotono excluido de correlacion y PCA/KMeans (PassengerId fuera) - H8: correlacion de series no estacionarias marcada espuria / sobre retornos - H11: distribution_type usa modos/cardinalidad/normalidad (quality->discrete) - 66 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""Tests para profile_table — pipeline EDA one-shot del grupo `eda`.
|
|
|
|
Crea una DuckDB temporal con tres columnas representativas:
|
|
- id_str: enteros guardados como VARCHAR ('10','20',...) -> debe promocionarse
|
|
a inferred_type "numeric" y recibir un bloque col["numeric"].
|
|
- precio: numerica nativa (DOUBLE).
|
|
- categoria: categorica textual.
|
|
Luego corre profile_table(write_report=False) y verifica el contrato.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
import duckdb
|
|
|
|
from pipelines.profile_table import (
|
|
_infer_period_from_dates,
|
|
_is_continuous_for_reexpr,
|
|
_is_sequential_id,
|
|
_looks_financial,
|
|
profile_table,
|
|
)
|
|
|
|
|
|
# --- H12: re-expresión solo para columnas continuas -------------------------
|
|
|
|
def test_is_continuous_for_reexpr_baja_cardinalidad():
    """Low-cardinality numeric columns must not count as continuous.

    Two cases are checked in order: a binary column (2 distinct values) and a
    low-cardinality ordinal column (3 distinct values).
    """
    cases = (
        # (column profile, sampled values)
        ({"distinct_count": 2, "flags": []}, [0.0, 1.0, 0.0, 1.0]),
        ({"distinct_count": 3, "flags": []}, [1.0, 2.0, 3.0, 2.0]),
    )
    for col_profile, sample in cases:
        assert _is_continuous_for_reexpr(col_profile, sample) is False
|
|
|
|
|
|
def test_is_continuous_for_reexpr_id_entero():
    """A column flagged ``possible_id`` whose values are all whole numbers is not continuous."""
    id_profile = {"distinct_count": 200, "flags": ["possible_id"]}
    # 1.0, 2.0, ..., 200.0: every value is integral.
    id_values = list(map(float, range(1, 201)))
    assert _is_continuous_for_reexpr(id_profile, id_values) is False
|
|
|
|
|
|
def test_is_continuous_for_reexpr_float_continuo():
    """A high-cardinality float column is continuous even when flagged ``possible_id``.

    The values carry a fractional part, so the column is expected not to be
    treated as an integer identifier.
    """
    price_profile = {"distinct_count": 200, "flags": ["possible_id"]}
    price_values = [step * 1.7 for step in range(200)]
    outcome = _is_continuous_for_reexpr(price_profile, price_values)
    assert outcome is True
|
|
|
|
|
|
def test_reexpression_solo_para_columnas_continuas():
    """Only the continuous column carries a ``reexpression`` block.

    Builds a 300-row table with an integer id (``pid``), a binary column
    (``surv``), a three-level ordinal (``pclass``) and a float column
    (``fare``), profiles it, and checks which column profiles expose
    ``reexpression``.
    """
    # TemporaryDirectory removes the scratch database when the block exits,
    # whether the assertions pass or fail (mkdtemp would leave it behind).
    with tempfile.TemporaryDirectory(prefix="reexpr_test_") as tmp_dir:
        db_path = os.path.join(tmp_dir, "t.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute(
                "CREATE TABLE t (pid INTEGER, surv INTEGER, pclass INTEGER, fare DOUBLE)"
            )
            con.execute(
                "INSERT INTO t SELECT i, i%2, (i%3)+1, ((i*1.7)%50)+0.3 "
                "FROM range(300) tbl(i)"
            )
        finally:
            # Release the connection even if one of the statements raises.
            con.close()

        r = profile_table(db_path, "t", write_report=False)
        assert r["status"] == "ok", r
        prof = r["profile"]

        assert _col(prof, "pid").get("reexpression") is None  # integer id
        assert _col(prof, "surv").get("reexpression") is None  # binary
        assert _col(prof, "pclass").get("reexpression") is None  # low-cardinality ordinal
        assert _col(prof, "fare").get("reexpression") is not None  # continuous
|
|
|
|
|
|
# --- H13: retornos (financiera) vs diferencias (física) ---------------------
|
|
|
|
def test_looks_financial_por_nombre_y_semantic():
    """``_looks_financial`` reacts to the column name and to ``semantic_type``."""
    # Names expected to be recognised as financial.
    for column_name in ("Close", "Adj Close", "Volume", "precio_cierre"):
        assert _looks_financial({"name": column_name}) is True

    # Physical measurements: not financial by name.
    for column_name in ("temp_max", "precipitation"):
        assert _looks_financial({"name": column_name}) is False

    # A non-financial name still qualifies when the semantic type is currency.
    currency_col = {"name": "caudal", "semantic_type": "currency"}
    assert _looks_financial(currency_col) is True
|
|
|
|
|
|
def _make_series_db(value_col: str) -> str:
    """Create a temporary DuckDB file holding a strictly increasing level series.

    The database contains one table ``s`` with an INTEGER ``ts`` column and a
    DOUBLE column named ``value_col``. It is filled with 80 rows whose level
    starts from 100.0 and grows by a positive, deterministic increment at
    every step (an upward-trending, non-stationary series).

    Args:
        value_col: Name given to the value column. It is interpolated inside
            double quotes, so it must not itself contain a double quote.

    Returns:
        Path of the DuckDB file. The enclosing directory is created with
        ``tempfile.mkdtemp`` and is NOT removed here because the path must
        outlive this call; cleanup is left to the caller / the OS temp policy.
    """
    tmp_dir = tempfile.mkdtemp(prefix="series_test_")
    db_path = os.path.join(tmp_dir, "s.duckdb")

    # Build the rows before opening the connection so the connection is held
    # only for the statements that need it.
    level = 100.0
    rows = []
    for t in range(80):
        level += 1.0 + (t % 7) * 0.3  # positive deterministic increments
        rows.append((t, level))

    con = duckdb.connect(db_path)
    try:
        con.execute(f'CREATE TABLE s (ts INTEGER, "{value_col}" DOUBLE)')
        # Plain literal: the statement has no placeholders to format.
        con.executemany("INSERT INTO s VALUES (?, ?)", rows)
    finally:
        # Close even when a statement fails so the file handle is released.
        con.close()
    return db_path
|
|
|
|
|
|
def test_series_financiera_sugiere_retornos():
    """A financially named series (``close``) must be re-expressed as returns.

    The kind is only checked when the profile reports ``levels_suggested``.
    """
    result = profile_table(
        _make_series_db("close"), "s", run_series=True, write_report=False
    )
    assert result["status"] == "ok", result

    series = _col(result["profile"], "close").get("series")
    assert series is not None

    if not series.get("levels_suggested"):
        return
    assert series.get("levels_kind") == "returns"
|
|
|
|
|
|
def test_series_no_financiera_sugiere_diferencias():
    """A non-financial series (``temp_max``) must be re-expressed as differences.

    The kind is only checked when the profile reports ``levels_suggested``; in
    that case the series block must not contain a ``to_returns`` entry.
    """
    result = profile_table(
        _make_series_db("temp_max"), "s", run_series=True, write_report=False
    )
    assert result["status"] == "ok", result

    series = _col(result["profile"], "temp_max").get("series")
    assert series is not None

    if not series.get("levels_suggested"):
        return
    assert series.get("levels_kind") == "differences"
    # The returns block is not computed for differences.
    assert "to_returns" not in series
|
|
|
|
|
|
# --- H2: periodo estacional derivado de la frecuencia del indice datetime ---
|
|
|
|
def test_infer_period_from_dates_mensual_y_diario():
    """Check the seasonal period inferred from monthly, daily and invalid date inputs."""
    from datetime import date as _date, timedelta

    def _daily(n_days):
        # n_days consecutive calendar days starting on 2010-01-01.
        start = _date(2010, 1, 1)
        return [start + timedelta(days=offset) for offset in range(n_days)]

    # Monthly spacing, 72 points -> period 12.
    monthly = []
    for idx in range(72):
        year_offset, month_index = divmod(idx, 12)
        monthly.append(_date(2000 + year_offset, month_index + 1, 1))
    assert _infer_period_from_dates(monthly, n_series=72) == 12

    # Daily data spanning at least two years -> yearly seasonality (365).
    assert _infer_period_from_dates(_daily(800), n_series=800) == 365

    # Short daily data (under two years) -> falls back to weekly (7).
    assert _infer_period_from_dates(_daily(100), n_series=100) == 7

    # No valid dates at all -> None.
    assert _infer_period_from_dates(["x", None, 3], n_series=50) is None
|
|
|
|
|
|
def test_h2_periodo_de_frecuencia_datetime_end_to_end():
    """End to end: the STL period comes from the frequency of the DATE column.

    Builds six years of monthly data (72 rows) with a linear trend plus a
    period-12 sinusoid, profiles it with ``run_series=True`` and checks that
    the series block reports ``period_source == "datetime_freq"``, an STL
    period of 12 and a seasonal strength above 0.3.
    """
    import math
    from datetime import date as _date

    rows = []
    for i in range(72):  # 6 years of monthly points, seasonal period 12
        dt = _date(2000 + i // 12, i % 12 + 1, 1)
        v = 10.0 + 0.1 * i + 5.0 * math.sin(2 * math.pi * (i % 12) / 12)
        rows.append((dt, v))

    # TemporaryDirectory removes the scratch database when the block exits,
    # whether the assertions pass or fail (mkdtemp would leave it behind).
    with tempfile.TemporaryDirectory(prefix="h2_period_test_") as tmp_dir:
        db_path = os.path.join(tmp_dir, "m.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute("CREATE TABLE m (d DATE, v DOUBLE)")
            con.executemany("INSERT INTO m VALUES (?, ?)", rows)
        finally:
            # Release the connection even if one of the statements raises.
            con.close()

        r = profile_table(db_path, "m", run_series=True, write_report=False)
        assert r["status"] == "ok", r

        s = _col(r["profile"], "v").get("series") or {}
        assert s.get("period_source") == "datetime_freq"

        stl = s.get("stl") or {}
        assert stl.get("period") == 12
        # A clear sinusoidal seasonality must yield a high seasonal strength.
        assert (stl.get("seasonal_strength") or 0) > 0.3
|
|
|
|
|
|
# --- H7: id entero secuencial fuera de correlacion y de PCA/KMeans -----------
|
|
|
|
def test_is_sequential_id_distingue_id_de_precio():
    """``_is_sequential_id`` accepts only a dense integer id flagged ``possible_id``."""

    def _numeric_profile(flags, distinct, lowest, highest):
        # Minimal numeric column profile with the keys used by these cases.
        return {
            "inferred_type": "numeric",
            "flags": flags,
            "distinct_count": distinct,
            "numeric": {"min": lowest, "max": highest},
        }

    # Dense sequential integer id (1..n): True.
    dense_id = _numeric_profile(["possible_id"], 300, 1.0, 300.0)
    assert _is_sequential_id(dense_id) is True

    # High-cardinality float (prices): min/max have decimals -> False.
    prices = _numeric_profile(["possible_id"], 300, 24.35, 189.7)
    assert _is_sequential_id(prices) is False

    # Sparse integers (years): not a dense index -> False.
    sparse = _numeric_profile(["possible_id"], 3, 1990.0, 2010.0)
    assert _is_sequential_id(sparse) is False

    # Without the possible_id flag it is never a sequential id.
    unflagged = _numeric_profile([], 300, 1.0, 300.0)
    assert _is_sequential_id(unflagged) is False
|
|
|
|
|
|
def test_h7_id_secuencial_fuera_de_correlacion_y_modelos():
    """A sequential row id stays out of strong correlations and model features.

    Builds a 300-row table where ``rid`` is the row index 0..299 and
    ``age``/``fare`` are floats, profiles it with ``run_models=True`` and
    checks that ``rid`` is absent from ``correlations.strong`` and from
    ``models.normality`` while ``age`` and ``fare`` remain present.
    """
    # TemporaryDirectory removes the scratch database when the block exits,
    # whether the assertions pass or fail (mkdtemp would leave it behind).
    with tempfile.TemporaryDirectory(prefix="h7_id_test_") as tmp_dir:
        db_path = os.path.join(tmp_dir, "t.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute("CREATE TABLE t (rid INTEGER, age DOUBLE, fare DOUBLE)")
            # rid 0..299: row index (sequential id). age/fare: float columns.
            con.execute(
                "INSERT INTO t SELECT i, ((i*0.13)%80)+1.5, ((i*1.7)%50)+0.3 "
                "FROM range(300) tbl(i)"
            )
        finally:
            # Release the connection even if one of the statements raises.
            con.close()

        r = profile_table(db_path, "t", run_models=True, write_report=False)
        assert r["status"] == "ok", r
        prof = r["profile"]

        # rid (sequential id) must not appear in any strong correlation pair.
        strong = (prof.get("correlations") or {}).get("strong", [])
        assert not any("rid" in (p["a"], p["b"]) for p in strong)

        # rid must not be a model feature; the normality block lists features.
        norm = (prof.get("models") or {}).get("normality") or {}
        assert "rid" not in norm
        # age/fare are kept as features.
        assert "age" in norm and "fare" in norm
|
|
|
|
|
|
# --- H8: correlacion sobre niveles no estacionarios marcada espuria ----------
|
|
|
|
def test_h8_correlacion_niveles_marcada_posible_espuria():
    """A correlation between two trending level series is flagged as possibly spurious.

    Builds 90 rows where ``close`` is a strictly increasing level and ``open``
    is ``close - 0.5``, profiles the table with ``run_series=True`` and checks
    that the close/open pair carries ``levels_possible_spurious`` and that the
    correlations block includes ``levels_caveat``.
    """
    rows = []
    level = 100.0
    for t in range(90):  # increasing (non-stationary) levels, close ~ open
        level += 1.0 + (t % 5) * 0.4
        rows.append((t, level, level - 0.5))

    # TemporaryDirectory removes the scratch database when the block exits,
    # whether the assertions pass or fail (mkdtemp would leave it behind).
    with tempfile.TemporaryDirectory(prefix="h8_levels_test_") as tmp_dir:
        db_path = os.path.join(tmp_dir, "s.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute('CREATE TABLE s (ts INTEGER, "close" DOUBLE, "open" DOUBLE)')
            con.executemany("INSERT INTO s VALUES (?, ?, ?)", rows)
        finally:
            # Release the connection even if one of the statements raises.
            con.close()

        r = profile_table(db_path, "s", run_series=True, write_report=False)
        assert r["status"] == "ok", r

        corr = r["profile"].get("correlations") or {}
        co = [p for p in corr.get("pairs", []) if {p["a"], p["b"]} == {"close", "open"}]
        assert co, "par close-open no encontrado"
        # Both are financial, non-stationary level series -> the pair is flagged.
        assert co[0].get("levels_possible_spurious") is True
        assert "levels_caveat" in corr
|
|
|
|
|
|
def _make_db() -> str:
    """Build a temporary DuckDB file with the ``items`` test table and return its path.

    The table has three columns: ``id_str`` (integers stored as VARCHAR),
    ``precio`` (native DOUBLE) and ``categoria`` (textual category), filled
    with six rows.
    """
    scratch_dir = tempfile.mkdtemp(prefix="profile_table_test_")
    db_file = os.path.join(scratch_dir, "t.duckdb")

    ddl = (
        "CREATE TABLE items ("
        " id_str VARCHAR,"  # integers stored as text
        " precio DOUBLE,"  # native numeric
        " categoria VARCHAR"  # categorical
        ")"
    )
    id_values = ("10", "20", "30", "40", "50", "60")
    prices = (9.5, 12.0, 7.25, 15.75, 3.0, 22.4)
    categories = ("alfa", "beta", "alfa", "gamma", "beta", "alfa")
    records = list(zip(id_values, prices, categories))

    connection = duckdb.connect(db_file)
    connection.execute(ddl)
    connection.executemany("INSERT INTO items VALUES (?, ?, ?)", records)
    connection.close()
    return db_file
|
|
|
|
|
|
def _col(profile: dict, name: str) -> dict:
|
|
return next(c for c in profile["columns"] if c["name"] == name)
|
|
|
|
|
|
def test_varchar_integer_promotes_to_numeric():
    """Contract check for ``profile_table`` on the three-column ``items`` table.

    Verifies that the VARCHAR column holding integers is promoted to numeric,
    that the native numeric and the categorical columns get their blocks, and
    that the table-level summary fields are present.
    """
    result = profile_table(_make_db(), "items", sample=5000, write_report=False)

    # Status is ok and no report was written to disk.
    assert result["status"] == "ok", result
    for path_key in ("report_md_path", "report_json_path"):
        assert result[path_key] is None

    profile = result["profile"]

    # The integer-valued VARCHAR column was promoted and got a numeric block.
    id_profile = _col(profile, "id_str")
    assert id_profile["inferred_type"] == "numeric", id_profile["inferred_type"]
    id_numeric = id_profile["numeric"]
    assert id_numeric is not None
    assert id_numeric["min"] == 10.0
    assert id_numeric["max"] == 60.0

    # The native numeric column keeps its type and numeric block.
    price_profile = _col(profile, "precio")
    assert price_profile["inferred_type"] == "numeric"
    assert price_profile["numeric"] is not None

    # The categorical column gets its categorical block.
    category_profile = _col(profile, "categoria")
    assert category_profile["inferred_type"] in ("categorical", "text")
    category_block = category_profile["categorical"]
    assert category_block is not None
    assert category_block["mode"] == "alfa"

    # key_candidates is a list; quality_score exists for table and column.
    assert isinstance(profile["key_candidates"], list)
    assert profile["quality_score"] is not None
    assert id_profile["quality_score"] is not None

    # The recomputed type_breakdown reflects the promotion (>= 2 numeric).
    assert profile["type_breakdown"]["numeric"] >= 2
|