"""Tests for extract_text_sample.

Self-contained: builds a small temporary DuckDB database with one text column
(some rows NULL) and one numeric column, then checks that the text sample
contains only the non-null values, that an unknown backend and an empty
column list are reported through the returned dict instead of an exception,
and that ``sample`` bounds the number of rows read.
"""

import os
import sys

# Make the parent directory importable so the ``datascience`` package
# resolves when the tests are run directly from this folder.
_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, ".."))  # python/functions
if _FUNCTIONS not in sys.path:
    sys.path.insert(0, _FUNCTIONS)

import duckdb  # noqa: E402

from datascience.extract_text_sample import extract_text_sample  # noqa: E402

_TABLE = "t"

# 6 rows: txt VARCHAR with two NULLs, other INT always present.
_ROWS = [
    ("alpha", 1),
    ("beta", 2),
    (None, 3),
    ("gamma", 4),
    (None, 5),
    ("delta", 6),
]

# The non-null values of the txt column in _ROWS.
_TXT_NON_NULL = {"alpha", "beta", "gamma", "delta"}


def _make_db(tmp_path):
    """Create a temporary DuckDB file holding the test table; return its path.

    Args:
        tmp_path: Directory in which the database file is created (the tests
            pass pytest's ``tmp_path`` fixture).

    Returns:
        The path of the created ``text_sample.duckdb`` file, as a string.
    """
    db_path = os.path.join(str(tmp_path), "text_sample.duckdb")
    con = duckdb.connect(db_path)
    try:
        con.execute(f'CREATE TABLE "{_TABLE}" (txt VARCHAR, other INTEGER)')
        con.executemany(f'INSERT INTO "{_TABLE}" VALUES (?, ?)', _ROWS)
    finally:
        # Always release the connection, even if table creation fails.
        con.close()
    return db_path


def test_extract_basic(tmp_path):
    """A text column yields only its non-null strings; other columns are absent."""
    db_path = _make_db(tmp_path)
    res = extract_text_sample(db_path, _TABLE, ["txt"])

    assert res["status"] == "ok"
    # n = rows read by the query (6), before the None values are filtered out.
    assert res["n"] == len(_ROWS)

    # columns["txt"] holds only the non-null strings (the two NULLs dropped).
    assert "txt" in res["columns"]
    assert set(res["columns"]["txt"]) == _TXT_NON_NULL
    assert len(res["columns"]["txt"]) == len(_TXT_NON_NULL)

    # "other" was not requested, so it must not appear.
    assert "other" not in res["columns"]


def test_backend_desconocido(tmp_path):
    """An unknown backend produces an error dict rather than an exception."""
    db_path = _make_db(tmp_path)
    res = extract_text_sample(db_path, _TABLE, ["txt"], backend="mysql")

    assert res["status"] == "error"
    assert "backend desconocido" in res["error"]
    assert res["columns"] == {}
    assert res["n"] == 0


def test_columns_vacio(tmp_path):
    """An empty column list is accepted and yields an empty, zero-row result."""
    db_path = _make_db(tmp_path)
    res = extract_text_sample(db_path, _TABLE, [])

    assert res["status"] == "ok"
    assert res["columns"] == {}
    assert res["n"] == 0


def test_sample_limit(tmp_path):
    """``sample`` caps the number of rows read by the query."""
    db_path = _make_db(tmp_path)
    res = extract_text_sample(db_path, _TABLE, ["txt"], sample=2)

    assert res["status"] == "ok"
    # sample=2 -> the query reads at most 2 rows.
    assert res["n"] == 2
    assert len(res["columns"]["txt"]) <= 2