"""Tests para profile_table — pipeline EDA one-shot del grupo `eda`. Crea una DuckDB temporal con tres columnas representativas: - id_str: enteros guardados como VARCHAR ('10','20',...) -> debe promocionarse a inferred_type "numeric" y recibir un bloque col["numeric"]. - precio: numerica nativa (DOUBLE). - categoria: categorica textual. Luego corre profile_table(write_report=False) y verifica el contrato. """ import os import tempfile import duckdb from pipelines.profile_table import ( _infer_period_from_dates, _is_continuous_for_reexpr, _is_sequential_id, _looks_financial, profile_table, ) # --- H12: re-expresión solo para columnas continuas ------------------------- def test_is_continuous_for_reexpr_baja_cardinalidad(): # Binaria (2 niveles) y ordinal de baja cardinalidad (3 niveles): NO continuas. binaria = {"distinct_count": 2, "flags": []} ordinal = {"distinct_count": 3, "flags": []} assert _is_continuous_for_reexpr(binaria, [0.0, 1.0, 0.0, 1.0]) is False assert _is_continuous_for_reexpr(ordinal, [1.0, 2.0, 3.0, 2.0]) is False def test_is_continuous_for_reexpr_id_entero(): # Identificador entero (possible_id + todos enteros): NO continua. idcol = {"distinct_count": 200, "flags": ["possible_id"]} vals = [float(i) for i in range(1, 201)] assert _is_continuous_for_reexpr(idcol, vals) is False def test_is_continuous_for_reexpr_float_continuo(): # Float continuo de alta cardinalidad, aunque lleve possible_id, SÍ es continuo # (tiene parte decimal, no es un id entero). precio = {"distinct_count": 200, "flags": ["possible_id"]} vals = [i * 1.7 for i in range(200)] assert _is_continuous_for_reexpr(precio, vals) is True def test_reexpression_solo_para_columnas_continuas(): # En una tabla con binaria/ordinal/id/continua, solo la continua trae el bloque # reexpression en su ColumnProfile. tmp_dir = tempfile.mkdtemp(prefix="reexpr_test_") db_path = os.path.join(tmp_dir, "t.duckdb") con = duckdb.connect(db_path) con.execute( "CREATE TABLE t (pid INTEGER, surv INTEGER, pclass INTEGER, fare DOUBLE)" ) con.execute( "INSERT INTO t SELECT i, i%2, (i%3)+1, ((i*1.7)%50)+0.3 " "FROM range(300) tbl(i)" ) con.close() r = profile_table(db_path, "t", write_report=False) assert r["status"] == "ok", r prof = r["profile"] assert _col(prof, "pid").get("reexpression") is None # id entero assert _col(prof, "surv").get("reexpression") is None # binaria assert _col(prof, "pclass").get("reexpression") is None # ordinal baja card assert _col(prof, "fare").get("reexpression") is not None # continua # --- H13: retornos (financiera) vs diferencias (física) --------------------- def test_looks_financial_por_nombre_y_semantic(): assert _looks_financial({"name": "Close"}) is True assert _looks_financial({"name": "Adj Close"}) is True assert _looks_financial({"name": "Volume"}) is True assert _looks_financial({"name": "precio_cierre"}) is True assert _looks_financial({"name": "temp_max"}) is False assert _looks_financial({"name": "precipitation"}) is False assert _looks_financial({"name": "caudal", "semantic_type": "currency"}) is True def _make_series_db(value_col: str) -> str: """DuckDB con una serie de niveles no estacionaria (random walk creciente).""" tmp_dir = tempfile.mkdtemp(prefix="series_test_") db_path = os.path.join(tmp_dir, "s.duckdb") con = duckdb.connect(db_path) con.execute(f'CREATE TABLE s (ts INTEGER, "{value_col}" DOUBLE)') # Niveles estrictamente positivos con tendencia creciente (no estacionaria). level = 100.0 rows = [] for t in range(80): level += 1.0 + (t % 7) * 0.3 # incrementos positivos deterministas rows.append((t, level)) con.executemany(f'INSERT INTO s VALUES (?, ?)', rows) con.close() return db_path def test_series_financiera_sugiere_retornos(): db_path = _make_series_db("close") r = profile_table(db_path, "s", run_series=True, write_report=False) assert r["status"] == "ok", r s = _col(r["profile"], "close").get("series") assert s is not None if s.get("levels_suggested"): assert s.get("levels_kind") == "returns" def test_series_no_financiera_sugiere_diferencias(): db_path = _make_series_db("temp_max") r = profile_table(db_path, "s", run_series=True, write_report=False) assert r["status"] == "ok", r s = _col(r["profile"], "temp_max").get("series") assert s is not None if s.get("levels_suggested"): assert s.get("levels_kind") == "differences" # Para diferencias no se computa el bloque de retornos. assert "to_returns" not in s # --- H2: periodo estacional derivado de la frecuencia del indice datetime --- def test_infer_period_from_dates_mensual_y_diario(): from datetime import date as _date, timedelta # Mensual (delta ~30 dias) con 72 puntos -> periodo 12. mensual = [_date(2000 + i // 12, i % 12 + 1, 1) for i in range(72)] assert _infer_period_from_dates(mensual, n_series=72) == 12 # Diario con >= 2 anios de datos -> estacionalidad anual (365). diario = [_date(2010, 1, 1) + timedelta(days=i) for i in range(800)] assert _infer_period_from_dates(diario, n_series=800) == 365 # Diario corto (< 2 anios) -> cae a semanal (7). diario_corto = [_date(2010, 1, 1) + timedelta(days=i) for i in range(100)] assert _infer_period_from_dates(diario_corto, n_series=100) == 7 # Sin fechas validas -> None (stl_decompose infiere o avisa). assert _infer_period_from_dates(["x", None, 3], n_series=50) is None def test_h2_periodo_de_frecuencia_datetime_end_to_end(): import math from datetime import date as _date tmp_dir = tempfile.mkdtemp(prefix="h2_period_test_") db_path = os.path.join(tmp_dir, "m.duckdb") con = duckdb.connect(db_path) con.execute("CREATE TABLE m (d DATE, v DOUBLE)") rows = [] for i in range(72): # 6 anios mensual con estacionalidad de periodo 12 dt = _date(2000 + i // 12, i % 12 + 1, 1) v = 10.0 + 0.1 * i + 5.0 * math.sin(2 * math.pi * (i % 12) / 12) rows.append((dt, v)) con.executemany("INSERT INTO m VALUES (?, ?)", rows) con.close() r = profile_table(db_path, "m", run_series=True, write_report=False) assert r["status"] == "ok", r s = _col(r["profile"], "v").get("series") or {} assert s.get("period_source") == "datetime_freq" stl = s.get("stl") or {} assert stl.get("period") == 12 # Estacionalidad sinusoidal clara -> fuerza estacional alta (antes salia ~0). assert (stl.get("seasonal_strength") or 0) > 0.3 # --- H7: id entero secuencial fuera de correlacion y de PCA/KMeans ----------- def test_is_sequential_id_distingue_id_de_precio(): # Id entero secuencial denso (1..n): True. idcol = { "inferred_type": "numeric", "flags": ["possible_id"], "distinct_count": 300, "numeric": {"min": 1.0, "max": 300.0}, } assert _is_sequential_id(idcol) is True # Float continuo de alta cardinalidad (precios): min/max con decimales -> False. precio = { "inferred_type": "numeric", "flags": ["possible_id"], "distinct_count": 300, "numeric": {"min": 24.35, "max": 189.7}, } assert _is_sequential_id(precio) is False # Entero disperso (anios): no es indice denso -> False. disperso = { "inferred_type": "numeric", "flags": ["possible_id"], "distinct_count": 3, "numeric": {"min": 1990.0, "max": 2010.0}, } assert _is_sequential_id(disperso) is False # Sin flag possible_id -> nunca id secuencial. sin_flag = { "inferred_type": "numeric", "flags": [], "distinct_count": 300, "numeric": {"min": 1.0, "max": 300.0}, } assert _is_sequential_id(sin_flag) is False def test_h7_id_secuencial_fuera_de_correlacion_y_modelos(): tmp_dir = tempfile.mkdtemp(prefix="h7_id_test_") db_path = os.path.join(tmp_dir, "t.duckdb") con = duckdb.connect(db_path) con.execute("CREATE TABLE t (rid INTEGER, age DOUBLE, fare DOUBLE)") # rid 0..299: indice de fila (id secuencial). age/fare: floats continuos. con.execute( "INSERT INTO t SELECT i, ((i*0.13)%80)+1.5, ((i*1.7)%50)+0.3 " "FROM range(300) tbl(i)" ) con.close() r = profile_table(db_path, "t", run_models=True, write_report=False) assert r["status"] == "ok", r prof = r["profile"] # rid (id secuencial) no entra en correlaciones fuertes. strong = (prof.get("correlations") or {}).get("strong", []) assert not any("rid" in (p["a"], p["b"]) for p in strong) # rid no entra como feature de los modelos (normality solo sobre continuas). norm = (prof.get("models") or {}).get("normality") or {} assert "rid" not in norm # age/fare (continuas) SI se mantienen como features. assert "age" in norm and "fare" in norm # --- H8: correlacion sobre niveles no estacionarios marcada espuria ---------- def test_h8_correlacion_niveles_marcada_posible_espuria(): tmp_dir = tempfile.mkdtemp(prefix="h8_levels_test_") db_path = os.path.join(tmp_dir, "s.duckdb") con = duckdb.connect(db_path) con.execute('CREATE TABLE s (ts INTEGER, "close" DOUBLE, "open" DOUBLE)') rows = [] level = 100.0 for t in range(90): # niveles crecientes (no estacionarios), close~open level += 1.0 + (t % 5) * 0.4 rows.append((t, level, level - 0.5)) con.executemany("INSERT INTO s VALUES (?, ?, ?)", rows) con.close() r = profile_table(db_path, "s", run_series=True, write_report=False) assert r["status"] == "ok", r corr = r["profile"].get("correlations") or {} co = [p for p in corr.get("pairs", []) if {p["a"], p["b"]} == {"close", "open"}] assert co, "par close-open no encontrado" # Ambas son series financieras de niveles no estacionarias -> par marcado. assert co[0].get("levels_possible_spurious") is True assert "levels_caveat" in corr def _make_db() -> str: """Crea una DuckDB temporal con la tabla de prueba y devuelve su path.""" tmp_dir = tempfile.mkdtemp(prefix="profile_table_test_") db_path = os.path.join(tmp_dir, "t.duckdb") con = duckdb.connect(db_path) con.execute( "CREATE TABLE items (" " id_str VARCHAR," # enteros guardados como texto " precio DOUBLE," # numerica nativa " categoria VARCHAR" # categorica ")" ) rows = [ ("10", 9.5, "alfa"), ("20", 12.0, "beta"), ("30", 7.25, "alfa"), ("40", 15.75, "gamma"), ("50", 3.0, "beta"), ("60", 22.4, "alfa"), ] con.executemany("INSERT INTO items VALUES (?, ?, ?)", rows) con.close() return db_path def _col(profile: dict, name: str) -> dict: return next(c for c in profile["columns"] if c["name"] == name) def test_varchar_integer_promotes_to_numeric(): db_path = _make_db() r = profile_table(db_path, "items", sample=5000, write_report=False) # status ok y sin tocar disco. assert r["status"] == "ok", r assert r["report_md_path"] is None assert r["report_json_path"] is None prof = r["profile"] # La columna VARCHAR-entera se promociono a numeric con bloque numeric. id_col = _col(prof, "id_str") assert id_col["inferred_type"] == "numeric", id_col["inferred_type"] assert id_col["numeric"] is not None assert id_col["numeric"]["min"] == 10.0 assert id_col["numeric"]["max"] == 60.0 # La numerica nativa sigue siendo numeric con su bloque. precio_col = _col(prof, "precio") assert precio_col["inferred_type"] == "numeric" assert precio_col["numeric"] is not None # La categorica recibe su bloque categorical. cat_col = _col(prof, "categoria") assert cat_col["inferred_type"] in ("categorical", "text") assert cat_col["categorical"] is not None assert cat_col["categorical"]["mode"] == "alfa" # key_candidates es una lista; quality_score existe (tabla y columnas). assert isinstance(prof["key_candidates"], list) assert prof["quality_score"] is not None assert id_col["quality_score"] is not None # type_breakdown recalculado refleja la promocion (>=2 numeric). assert prof["type_breakdown"]["numeric"] >= 2