"""Tests para groupby_stats_duckdb.""" import os import sys import duckdb # Permitir importar funciones del registry (from infra import ..., from datascience import ...). sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "functions")) from datascience.groupby_stats_duckdb import groupby_stats_duckdb def _make_db(tmp_path, rows): """Crea una DuckDB con tabla t(g VARCHAR, x DOUBLE) e inserta `rows`.""" db = os.path.join(str(tmp_path), "t.duckdb") con = duckdb.connect(db) con.execute("CREATE TABLE t(g VARCHAR, x DOUBLE)") con.executemany("INSERT INTO t VALUES (?, ?)", rows) con.close() return db def test_agrega_por_grupo_con_valores_conocidos(tmp_path): # Grupo a: [10, 20, 30] -> n=3, mean=20, min=10, max=30, median=20, std=10. # Grupo b: [5, 15] -> n=2, mean=10, median=10. # Grupo c: [100] -> n=1, mean=100, std=None (1 sola fila). rows = [ ("a", 10.0), ("a", 20.0), ("a", 30.0), ("b", 5.0), ("b", 15.0), ("c", 100.0), ] db = _make_db(tmp_path, rows) res = groupby_stats_duckdb(db, "t", "g", ["x"]) assert res["status"] == "ok", res assert res["n_groups"] == 3 assert res["truncated"] is False assert res["aggs"] == ["count", "mean", "median", "std", "min", "max"] by_key = {g["key"]: g for g in res["groups"]} assert set(by_key) == {"a", "b", "c"} # Grupo a: comprobacion manual de mean/min/max/median/std. sa = by_key["a"]["stats"]["x"] assert by_key["a"]["n"] == 3 assert abs(sa["mean"] - 20.0) < 1e-9 assert abs(sa["min"] - 10.0) < 1e-9 assert abs(sa["max"] - 30.0) < 1e-9 assert abs(sa["median"] - 20.0) < 1e-9 assert "std" in sa and sa["std"] is not None assert abs(sa["std"] - 10.0) < 1e-9 # stddev_samp([10,20,30]) = 10 # Grupo b: mean y median conocidas. sb = by_key["b"]["stats"]["x"] assert by_key["b"]["n"] == 2 assert abs(sb["mean"] - 10.0) < 1e-9 assert abs(sb["median"] - 10.0) < 1e-9 assert "median" in sb and "std" in sb # Grupo c: una sola fila -> std None (stddev_samp NULL), mean/min/max definidos. sc = by_key["c"]["stats"]["x"] assert by_key["c"]["n"] == 1 assert abs(sc["mean"] - 100.0) < 1e-9 assert sc["std"] is None def test_db_inexistente_devuelve_error_sin_lanzar(tmp_path): db = os.path.join(str(tmp_path), "no_existe.duckdb") res = groupby_stats_duckdb(db, "t", "g", ["x"]) assert res["status"] == "error", res assert isinstance(res["error"], str) and res["error"] def test_measures_vacias_agrega_solo_count(tmp_path): rows = [("a", 1.0), ("a", 2.0), ("b", 3.0)] db = _make_db(tmp_path, rows) res = groupby_stats_duckdb(db, "t", "g", []) assert res["status"] == "ok", res by_key = {g["key"]: g for g in res["groups"]} assert by_key["a"]["n"] == 2 assert by_key["b"]["n"] == 1 # Sin measures, stats por grupo es un dict vacio (valido). assert by_key["a"]["stats"] == {} assert by_key["b"]["stats"] == {} def test_columna_con_espacio_agrupa_bien(tmp_path): # Tabla con nombres de columna con espacios -> prueba el quoting con dobles # comillas tanto en group_by como en la measure. db = os.path.join(str(tmp_path), "space.duckdb") con = duckdb.connect(db) con.execute('CREATE TABLE t("my col" VARCHAR, "the val" DOUBLE)') con.executemany( 'INSERT INTO t VALUES (?, ?)', [("x", 1.0), ("x", 3.0), ("y", 10.0)], ) con.close() res = groupby_stats_duckdb(db, "t", "my col", ["the val"]) assert res["status"] == "ok", res by_key = {g["key"]: g for g in res["groups"]} assert by_key["x"]["n"] == 2 assert abs(by_key["x"]["stats"]["the val"]["mean"] - 2.0) < 1e-9 assert by_key["y"]["n"] == 1 assert abs(by_key["y"]["stats"]["the val"]["mean"] - 10.0) < 1e-9