763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
"""Tests para profile_table — pipeline EDA one-shot del grupo `eda`.
|
|
|
|
Crea una DuckDB temporal con tres columnas representativas:
|
|
- id_str: enteros guardados como VARCHAR ('10','20',...) -> debe promocionarse
|
|
a inferred_type "numeric" y recibir un bloque col["numeric"].
|
|
- precio: numerica nativa (DOUBLE).
|
|
- categoria: categorica textual.
|
|
Luego corre profile_table(write_report=False) y verifica el contrato.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
import duckdb
|
|
|
|
from pipelines.profile_table import profile_table
|
|
|
|
|
|
def _make_db() -> str:
    """Create a temporary DuckDB database with the test table and return its path.

    The database file ``t.duckdb`` is placed in a fresh directory created by
    ``tempfile.mkdtemp``. Nothing here removes that directory, so the caller
    owns it and is responsible for cleaning it up.

    The ``items`` table has three columns:

    - ``id_str``: integers stored as VARCHAR ('10', '20', ...).
    - ``precio``: native numeric column (DOUBLE).
    - ``categoria``: textual categorical column.

    Returns:
        Filesystem path of the DuckDB database file.
    """
    tmp_dir = tempfile.mkdtemp(prefix="profile_table_test_")
    db_path = os.path.join(tmp_dir, "t.duckdb")
    rows = [
        ("10", 9.5, "alfa"),
        ("20", 12.0, "beta"),
        ("30", 7.25, "alfa"),
        ("40", 15.75, "gamma"),
        ("50", 3.0, "beta"),
        ("60", 22.4, "alfa"),
    ]

    con = duckdb.connect(db_path)
    try:
        con.execute(
            "CREATE TABLE items ("
            " id_str VARCHAR,"  # integers stored as text
            " precio DOUBLE,"  # native numeric
            " categoria VARCHAR"  # categorical
            ")"
        )
        con.executemany("INSERT INTO items VALUES (?, ?, ?)", rows)
    finally:
        # Close the connection even when table creation or the inserts fail,
        # so a broken setup does not leave the database file held open.
        con.close()
    return db_path
|
|
|
|
|
|
def _col(profile: dict, name: str) -> dict:
|
|
return next(c for c in profile["columns"] if c["name"] == name)
|
|
|
|
|
|
def test_varchar_integer_promotes_to_numeric():
    """Check the profile_table contract on the three-column ``items`` table.

    Runs ``profile_table`` with ``write_report=False`` against the database
    built by ``_make_db`` and verifies that the VARCHAR column holding
    integers is reported as numeric, that the native numeric and the
    categorical columns get their respective blocks, and that the table-level
    fields (``key_candidates``, ``quality_score``, ``type_breakdown``) are
    present.
    """
    # Function-scope import: shutil is only needed for the cleanup below.
    import shutil

    db_path = _make_db()
    try:
        r = profile_table(db_path, "items", sample=5000, write_report=False)

        # Status is ok and no report paths were produced.
        assert r["status"] == "ok", r
        assert r["report_md_path"] is None
        assert r["report_json_path"] is None

        prof = r["profile"]

        # The VARCHAR-integer column was promoted to numeric with a numeric block.
        id_col = _col(prof, "id_str")
        assert id_col["inferred_type"] == "numeric", id_col["inferred_type"]
        assert id_col["numeric"] is not None
        assert id_col["numeric"]["min"] == 10.0
        assert id_col["numeric"]["max"] == 60.0

        # The native numeric column is still numeric and has its block.
        precio_col = _col(prof, "precio")
        assert precio_col["inferred_type"] == "numeric"
        assert precio_col["numeric"] is not None

        # The categorical column receives its categorical block.
        cat_col = _col(prof, "categoria")
        assert cat_col["inferred_type"] in ("categorical", "text")
        assert cat_col["categorical"] is not None
        assert cat_col["categorical"]["mode"] == "alfa"

        # key_candidates is a list; quality_score exists (table and columns).
        assert isinstance(prof["key_candidates"], list)
        assert prof["quality_score"] is not None
        assert id_col["quality_score"] is not None

        # The recomputed type_breakdown reflects the promotion (>= 2 numeric).
        assert prof["type_breakdown"]["numeric"] >= 2
    finally:
        # _make_db places the database in its own mkdtemp directory; remove it
        # so repeated test runs do not pile up temporary directories.
        shutil.rmtree(os.path.dirname(db_path), ignore_errors=True)
|