feat(eda): funciones de agregación/OLAP para AutomaticEDA (groupby/pivot push-down + selección LLM)
Cuatro funciones nuevas del grupo eda que nutren el capítulo AGREGACION: - select_groupby_keys (pure): elige categóricas agrupables + numéricas medida desde el TableProfile. - groupby_stats_duckdb (impure): GROUP BY push-down en DuckDB (count/mean/median/std/min/max por grupo). - pivot_table_duckdb (impure): pivot A×B push-down, limitado a top filas/cols para no cortar. - suggest_aggregations_llm (impure): el LLM elige las agregaciones interesantes con fallback determinista. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,106 @@
|
||||
"""Tests para groupby_stats_duckdb."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import duckdb
|
||||
|
||||
# Permitir importar funciones del registry (from infra import ..., from datascience import ...).
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "functions"))
|
||||
|
||||
from datascience.groupby_stats_duckdb import groupby_stats_duckdb
|
||||
|
||||
|
||||
def _make_db(tmp_path, rows):
    """Create a DuckDB file with table ``t(g VARCHAR, x DOUBLE)`` holding `rows`.

    Args:
        tmp_path: Directory in which ``t.duckdb`` is created; it is passed
            through ``str()`` before being joined, so path-like objects work.
        rows: Sequence of ``(g, x)`` tuples to insert. May be empty, in which
            case the table is created but left without rows.

    Returns:
        The filesystem path of the database file, as a string.
    """
    db = os.path.join(str(tmp_path), "t.duckdb")
    con = duckdb.connect(db)
    try:
        con.execute("CREATE TABLE t(g VARCHAR, x DOUBLE)")
        # Nothing to insert for an empty fixture. NOTE(review): DuckDB's
        # executemany is expected to reject an empty parameter list; confirm
        # against the installed version.
        if rows:
            con.executemany("INSERT INTO t VALUES (?, ?)", rows)
    finally:
        # Close even when setup fails so the database file handle is not
        # leaked into the rest of the test session.
        con.close()
    return db
|
||||
|
||||
|
||||
def test_agrega_por_grupo_con_valores_conocidos(tmp_path):
    """Check per-group aggregates against hand-computed values.

    Fixture groups and their expected statistics:
      * a: [10, 20, 30] -> n=3, mean=20, min=10, max=30, median=20, std=10
      * b: [5, 15]      -> n=2, mean=10, median=10
      * c: [100]        -> n=1, mean=100, std is None (single row)
    """
    tol = 1e-9
    samples = {"a": [10.0, 20.0, 30.0], "b": [5.0, 15.0], "c": [100.0]}
    rows = [(key, value) for key, values in samples.items() for value in values]

    result = groupby_stats_duckdb(_make_db(tmp_path, rows), "t", "g", ["x"])

    # Envelope of the response.
    assert result["status"] == "ok", result
    assert result["n_groups"] == 3
    assert result["truncated"] is False
    assert result["aggs"] == ["count", "mean", "median", "std", "min", "max"]

    groups = {}
    for group in result["groups"]:
        groups[group["key"]] = group
    assert set(groups) == {"a", "b", "c"}

    # Group a: every statistic is verified by hand.
    stats_a = groups["a"]["stats"]["x"]
    assert groups["a"]["n"] == 3
    expected_a = (("mean", 20.0), ("min", 10.0), ("max", 30.0), ("median", 20.0))
    for name, expected in expected_a:
        assert abs(stats_a[name] - expected) < tol
    assert "std" in stats_a and stats_a["std"] is not None
    # Sample standard deviation of [10, 20, 30] is 10.
    assert abs(stats_a["std"] - 10.0) < tol

    # Group b: known mean and median; std only needs to be present.
    stats_b = groups["b"]["stats"]["x"]
    assert groups["b"]["n"] == 2
    for name in ("mean", "median"):
        assert abs(stats_b[name] - 10.0) < tol
    assert "median" in stats_b and "std" in stats_b

    # Group c: a single row, so std is None while mean stays defined.
    stats_c = groups["c"]["stats"]["x"]
    assert groups["c"]["n"] == 1
    assert abs(stats_c["mean"] - 100.0) < tol
    assert stats_c["std"] is None
|
||||
|
||||
|
||||
def test_db_inexistente_devuelve_error_sin_lanzar(tmp_path):
    """A database path that was never created yields an error result, not an exception."""
    missing_db = os.path.join(str(tmp_path), "no_existe.duckdb")

    outcome = groupby_stats_duckdb(missing_db, "t", "g", ["x"])

    assert outcome["status"] == "error", outcome
    # The error must be reported as a non-empty string.
    message = outcome["error"]
    assert isinstance(message, str) and message
|
||||
|
||||
|
||||
def test_measures_vacias_agrega_solo_count(tmp_path):
    """With an empty measures list, only the per-group row count is reported."""
    db_path = _make_db(tmp_path, [("a", 1.0), ("a", 2.0), ("b", 3.0)])

    outcome = groupby_stats_duckdb(db_path, "t", "g", [])

    assert outcome["status"] == "ok", outcome
    groups = {entry["key"]: entry for entry in outcome["groups"]}

    for key, expected_n in (("a", 2), ("b", 1)):
        assert groups[key]["n"] == expected_n

    # Without measures, each group's stats is an empty (and valid) dict.
    for key in ("a", "b"):
        assert groups[key]["stats"] == {}
|
||||
|
||||
|
||||
def test_columna_con_espacio_agrupa_bien(tmp_path):
    """Column names containing spaces are grouped and aggregated correctly.

    The table uses spaces in both the group-by column and the measure column,
    which exercises identifier quoting for each of them.
    """
    db = os.path.join(str(tmp_path), "space.duckdb")
    con = duckdb.connect(db)
    try:
        con.execute('CREATE TABLE t("my col" VARCHAR, "the val" DOUBLE)')
        con.executemany(
            'INSERT INTO t VALUES (?, ?)',
            [("x", 1.0), ("x", 3.0), ("y", 10.0)],
        )
    finally:
        # Close even if setup fails so the database file handle is not leaked
        # before the function under test opens the same file.
        con.close()

    res = groupby_stats_duckdb(db, "t", "my col", ["the val"])
    assert res["status"] == "ok", res

    by_key = {g["key"]: g for g in res["groups"]}
    # Group x: rows 1.0 and 3.0 -> n=2, mean=2.0.
    assert by_key["x"]["n"] == 2
    assert abs(by_key["x"]["stats"]["the val"]["mean"] - 2.0) < 1e-9
    # Group y: single row 10.0 -> n=1, mean=10.0.
    assert by_key["y"]["n"] == 1
    assert abs(by_key["y"]["stats"]["the val"]["mean"] - 10.0) < 1e-9
|
||||
Reference in New Issue
Block a user