fix(eda): hallazgos de comportamiento del benchmark (H2,H3,H6,H7,H8,H10,H11)

Ronda 4 (verificada con re-corrida sobre los datasets afectados):
- H2: stl_decompose deriva periodo de la frecuencia del indice (seattle period=365
  seasonal_strength=0.84; fin del period=2 espurio)
- H3+H10: infer_fk por senal de nombre (<X>Id->X.<X>Id) + excluir no-clave -> chinook
  111->9 FK, todas reales, cero absurdas, 16-27x mas rapido; base intacta (flag off->111)
- H6: association no computa eta2 si cardinalidad~=n (Ticket-Fare espurio fuera)
- H7: id secuencial monotono excluido de correlacion y PCA/KMeans (PassengerId fuera)
- H8: correlacion de series no estacionarias marcada espuria / sobre retornos
- H11: distribution_type usa modos/cardinalidad/normalidad (quality->discrete)
- 66 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-29 06:37:47 +02:00
parent c4cff5ed5b
commit e142ef026d
12 changed files with 1028 additions and 36 deletions
@@ -14,7 +14,9 @@ import tempfile
import duckdb
from pipelines.profile_table import (
_infer_period_from_dates,
_is_continuous_for_reexpr,
_is_sequential_id,
_looks_financial,
profile_table,
)
@@ -121,6 +123,142 @@ def test_series_no_financiera_sugiere_diferencias():
assert "to_returns" not in s
# --- H2: seasonal period derived from the datetime index frequency ---
def test_infer_period_from_dates_mensual_y_diario():
    """Check ``_infer_period_from_dates`` on monthly, daily and invalid input.

    Expected results: 12 for 72 monthly dates, 365 for 800 consecutive days,
    7 for 100 consecutive days, and ``None`` when no value is a valid date.
    """
    from datetime import date as _date, timedelta

    # Monthly (delta ~30 days) with 72 points -> period 12.
    monthly = []
    for k in range(72):
        year_offset, month_index = divmod(k, 12)
        monthly.append(_date(2000 + year_offset, month_index + 1, 1))
    assert _infer_period_from_dates(monthly, n_series=72) == 12

    # Daily with >= 2 years of data -> yearly seasonality (365);
    # short daily series (< 2 years) -> falls back to weekly (7).
    first_day = _date(2010, 1, 1)
    for n_days, expected_period in ((800, 365), (100, 7)):
        daily = [first_day + timedelta(days=offset) for offset in range(n_days)]
        assert _infer_period_from_dates(daily, n_series=n_days) == expected_period

    # No valid dates -> None (stl_decompose infers the period or warns).
    no_dates = ["x", None, 3]
    assert _infer_period_from_dates(no_dates, n_series=50) is None
def test_h2_periodo_de_frecuencia_datetime_end_to_end():
    """End-to-end check that the STL period comes from the datetime index.

    Creates a temporary DuckDB table ``m`` with 72 monthly rows whose value is
    a linear trend plus a period-12 sinusoid, profiles it with
    ``run_series=True`` and asserts that the series section reports
    ``period_source == "datetime_freq"``, an STL period of 12 and a seasonal
    strength above 0.3.

    The temporary directory is removed on exit (best effort) and the
    connection is closed even if table setup fails.
    """
    import math
    import shutil
    from datetime import date as _date

    tmp_dir = tempfile.mkdtemp(prefix="h2_period_test_")
    try:
        db_path = os.path.join(tmp_dir, "m.duckdb")
        # 6 years of monthly data with period-12 seasonality.
        rows = [
            (
                _date(2000 + i // 12, i % 12 + 1, 1),
                10.0 + 0.1 * i + 5.0 * math.sin(2 * math.pi * (i % 12) / 12),
            )
            for i in range(72)
        ]
        con = duckdb.connect(db_path)
        try:
            con.execute("CREATE TABLE m (d DATE, v DOUBLE)")
            con.executemany("INSERT INTO m VALUES (?, ?)", rows)
        finally:
            # Always release the file handle before profiling reopens the DB.
            con.close()

        r = profile_table(db_path, "m", run_series=True, write_report=False)
        assert r["status"] == "ok", r
        s = _col(r["profile"], "v").get("series") or {}
        assert s.get("period_source") == "datetime_freq"
        stl = s.get("stl") or {}
        assert stl.get("period") == 12
        # Clear sinusoidal seasonality -> high seasonal strength
        # (previously it came out ~0).
        assert (stl.get("seasonal_strength") or 0) > 0.3
    finally:
        # Best-effort cleanup: never let a locked file mask the test result.
        shutil.rmtree(tmp_dir, ignore_errors=True)
# --- H7: sequential integer id kept out of correlation and PCA/KMeans ---
def test_is_sequential_id_distingue_id_de_precio():
    """Check ``_is_sequential_id`` on four hand-built column profiles.

    Only the dense integer column flagged ``possible_id`` must yield ``True``;
    the float-valued, sparse and unflagged columns must yield ``False``.
    """

    def _numeric_profile(flags, distinct_count, lowest, highest):
        # Build the column-profile dict shape passed to _is_sequential_id.
        return {
            "inferred_type": "numeric",
            "flags": flags,
            "distinct_count": distinct_count,
            "numeric": {"min": lowest, "max": highest},
        }

    # Dense sequential integer id (1..n): True.
    dense_id = _numeric_profile(["possible_id"], 300, 1.0, 300.0)
    assert _is_sequential_id(dense_id) is True

    # High-cardinality continuous float (prices): min/max with decimals -> False.
    price = _numeric_profile(["possible_id"], 300, 24.35, 189.7)
    assert _is_sequential_id(price) is False

    # Sparse integer (years): not a dense index -> False.
    sparse = _numeric_profile(["possible_id"], 3, 1990.0, 2010.0)
    assert _is_sequential_id(sparse) is False

    # Without the possible_id flag -> never a sequential id.
    unflagged = _numeric_profile([], 300, 1.0, 300.0)
    assert _is_sequential_id(unflagged) is False
def test_h7_id_secuencial_fuera_de_correlacion_y_modelos():
    """End-to-end check that a sequential id column is excluded from analysis.

    Creates a temporary DuckDB table ``t`` with 300 rows (``rid`` = 0..299 plus
    two float columns ``age`` and ``fare``), profiles it with
    ``run_models=True`` and asserts that ``rid`` appears neither in the strong
    correlation pairs nor in the models' normality section, while ``age`` and
    ``fare`` do appear there.

    The temporary directory is removed on exit (best effort) and the
    connection is closed even if table setup fails.
    """
    import shutil

    tmp_dir = tempfile.mkdtemp(prefix="h7_id_test_")
    try:
        db_path = os.path.join(tmp_dir, "t.duckdb")
        con = duckdb.connect(db_path)
        try:
            con.execute("CREATE TABLE t (rid INTEGER, age DOUBLE, fare DOUBLE)")
            # rid 0..299: row index (sequential id). age/fare: continuous floats.
            con.execute(
                "INSERT INTO t SELECT i, ((i*0.13)%80)+1.5, ((i*1.7)%50)+0.3 "
                "FROM range(300) tbl(i)"
            )
        finally:
            # Always release the file handle before profiling reopens the DB.
            con.close()

        r = profile_table(db_path, "t", run_models=True, write_report=False)
        assert r["status"] == "ok", r
        prof = r["profile"]
        # rid (sequential id) must not show up among the strong correlations.
        strong = (prof.get("correlations") or {}).get("strong", [])
        assert not any("rid" in (p["a"], p["b"]) for p in strong)
        # rid must not be a model feature (normality only covers continuous ones).
        norm = (prof.get("models") or {}).get("normality") or {}
        assert "rid" not in norm
        # age/fare (continuous) ARE kept as features.
        assert "age" in norm and "fare" in norm
    finally:
        # Best-effort cleanup: never let a locked file mask the test result.
        shutil.rmtree(tmp_dir, ignore_errors=True)
# --- H8: correlation on non-stationary levels flagged as possibly spurious ---
def test_h8_correlacion_niveles_marcada_posible_espuria():
    """End-to-end check that a levels correlation is flagged as possibly spurious.

    Creates a temporary DuckDB table ``s`` with 90 rows where ``close`` is a
    steadily increasing level and ``open`` is that level minus 0.5, profiles it
    with ``run_series=True`` and asserts that the ``close``/``open`` pair in
    the correlations carries ``levels_possible_spurious is True`` and that the
    correlations section contains a ``levels_caveat`` key.

    The temporary directory is removed on exit (best effort) and the
    connection is closed even if table setup fails.
    """
    import shutil

    tmp_dir = tempfile.mkdtemp(prefix="h8_levels_test_")
    try:
        db_path = os.path.join(tmp_dir, "s.duckdb")
        # Increasing (non-stationary) levels, with close ~ open.
        rows = []
        level = 100.0
        for t in range(90):
            level += 1.0 + (t % 5) * 0.4
            rows.append((t, level, level - 0.5))

        con = duckdb.connect(db_path)
        try:
            con.execute('CREATE TABLE s (ts INTEGER, "close" DOUBLE, "open" DOUBLE)')
            con.executemany("INSERT INTO s VALUES (?, ?, ?)", rows)
        finally:
            # Always release the file handle before profiling reopens the DB.
            con.close()

        r = profile_table(db_path, "s", run_series=True, write_report=False)
        assert r["status"] == "ok", r
        corr = r["profile"].get("correlations") or {}
        co = [
            p for p in corr.get("pairs", [])
            if {p["a"], p["b"]} == {"close", "open"}
        ]
        assert co, "par close-open no encontrado"
        # Both are non-stationary financial level series -> pair is flagged.
        assert co[0].get("levels_possible_spurious") is True
        assert "levels_caveat" in corr
    finally:
        # Best-effort cleanup: never let a locked file mask the test result.
        shutil.rmtree(tmp_dir, ignore_errors=True)
def _make_db() -> str:
"""Crea una DuckDB temporal con la tabla de prueba y devuelve su path."""
tmp_dir = tempfile.mkdtemp(prefix="profile_table_test_")