fn_registry/python/functions/datascience/extract_text_sample_test.py

"""Tests para extract_text_sample.

Self-contained: builds a small temporary DuckDB database with one text column
(some rows NULL) and one numeric column, then checks that the text sample holds
only the non-null values, that an unknown backend and an empty column list are
reported through the returned dict rather than by raising, and that `sample`
caps the number of rows read.
"""

import os
import sys

# Resolve the parent of this test's directory (python/functions) and put it at
# the front of sys.path, unless it is already listed, so that the
# `datascience` import further down can be resolved.
_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, ".."))  # python/functions
if _FUNCTIONS not in sys.path:
    sys.path.insert(0, _FUNCTIONS)

import duckdb  # noqa: E402

from datascience.extract_text_sample import extract_text_sample  # noqa: E402

# Name of the fixture table created by _make_db.
_TABLE = "t"
# 6 rows: txt VARCHAR with two NULLs, other INT always present.
_ROWS = [
    ("alpha", 1),
    ("beta", 2),
    (None, 3),
    ("gamma", 4),
    (None, 5),
    ("delta", 6),
]
# The non-NULL txt values found in _ROWS.
_TXT_NON_NULL = {"alpha", "beta", "gamma", "delta"}


def _make_db(tmp_path):
    """Build a temporary DuckDB file holding the fixture table; return its path.

    The database is written to ``text_sample.duckdb`` inside ``tmp_path``. The
    table named by ``_TABLE`` is created with a ``txt VARCHAR`` and an
    ``other INTEGER`` column and loaded with every tuple in ``_ROWS``.
    """
    target = os.path.join(str(tmp_path), "text_sample.duckdb")
    create_sql = f'CREATE TABLE "{_TABLE}" (txt VARCHAR, other INTEGER)'
    insert_sql = f'INSERT INTO "{_TABLE}" VALUES (?, ?)'

    connection = duckdb.connect(target)
    try:
        connection.execute(create_sql)
        connection.executemany(insert_sql, _ROWS)
    finally:
        # Close the connection even when table setup fails.
        connection.close()
    return target


def test_extract_basic(tmp_path):
    """Requesting only ``txt`` returns its non-NULL values and counts all rows."""
    result = extract_text_sample(_make_db(tmp_path), _TABLE, ["txt"])
    assert result["status"] == "ok"
    # n is the number of rows the query read (6), before None values are dropped.
    assert result["n"] == len(_ROWS)

    columns = result["columns"]
    assert "txt" in columns
    # Only the non-null strings are kept: same members, and the two NULLs
    # do not contribute extra entries.
    txt_values = columns["txt"]
    assert set(txt_values) == _TXT_NON_NULL
    assert len(txt_values) == len(_TXT_NON_NULL)
    # "other" was not requested, so it must not show up.
    assert "other" not in columns


def test_backend_desconocido(tmp_path):
    """An unknown backend yields an error dict with no columns and n == 0."""
    outcome = extract_text_sample(
        _make_db(tmp_path),
        _TABLE,
        ["txt"],
        backend="mysql",
    )
    assert outcome["status"] == "error"
    assert "backend desconocido" in outcome["error"]
    # The error result carries no sampled data.
    assert outcome["columns"] == {}
    assert outcome["n"] == 0


def test_columns_vacio(tmp_path):
    """An empty column list is accepted: status ok, no columns, n == 0."""
    no_columns = []
    outcome = extract_text_sample(_make_db(tmp_path), _TABLE, no_columns)
    assert outcome["status"] == "ok"
    assert outcome["columns"] == {}
    assert outcome["n"] == 0


def test_sample_limit(tmp_path):
    """``sample=2`` limits the rows read to two."""
    result = extract_text_sample(
        _make_db(tmp_path),
        _TABLE,
        ["txt"],
        sample=2,
    )
    assert result["status"] == "ok"
    # sample=2 -> the query reads at most 2 rows.
    assert result["n"] == 2
    # Either sampled row may be NULL, so the text list can be shorter than n.
    txt_values = result["columns"]["txt"]
    assert len(txt_values) <= 2