feat(eda): funciones de agregación/OLAP para AutomaticEDA (groupby/pivot push-down + selección LLM)
Cuatro funciones nuevas del grupo eda que nutren el capítulo AGREGACION:
- select_groupby_keys (pure): elige categóricas agrupables + numéricas medida desde el TableProfile.
- groupby_stats_duckdb (impure): GROUP BY push-down en DuckDB (count/mean/median/std/min/max por grupo).
- pivot_table_duckdb (impure): pivot A×B push-down, limitado a top filas/cols para no cortar.
- suggest_aggregations_llm (impure): el LLM elige las agregaciones interesantes con fallback determinista.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
"""Tests para select_groupby_keys (grupo eda, dominio datascience)."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from select_groupby_keys import select_groupby_keys
|
||||
|
||||
|
||||
def _cat_col(name, card, *, imbalance=2.0, flags=None, null_pct=0.0):
|
||||
"""ColumnProfile categorico minimo con bloque categorical."""
|
||||
return {
|
||||
"name": name,
|
||||
"inferred_type": "categorical",
|
||||
"distinct_count": card,
|
||||
"unique_pct": card / 1000.0,
|
||||
"null_pct": null_pct,
|
||||
"flags": flags or [],
|
||||
"numeric": None,
|
||||
"categorical": {"imbalance": imbalance, "mode_pct": 0.5, "n_distinct": card},
|
||||
}
|
||||
|
||||
|
||||
def _num_col(name, *, std, cv, flags=None, unique_pct=0.1):
|
||||
"""ColumnProfile numerico minimo con bloque numeric."""
|
||||
return {
|
||||
"name": name,
|
||||
"inferred_type": "numeric",
|
||||
"distinct_count": 200,
|
||||
"unique_pct": unique_pct,
|
||||
"null_pct": 0.0,
|
||||
"flags": flags or [],
|
||||
"numeric": {"std": std, "cv": cv},
|
||||
"categorical": None,
|
||||
}
|
||||
|
||||
|
||||
def _titanic_like_profile() -> dict:
    """Return a Titanic-style profile fixture.

    The profile holds two good categorical columns, two numeric columns, one
    id column and one constant column. A fresh dict is built on every call.
    """
    return {
        "n_rows": 891,
        "key_candidates": ["passenger_id"],
        "columns": [
            _cat_col("sex", 2, imbalance=1.8),
            _cat_col("pclass", 3, imbalance=2.5),
            _num_col("age", std=14.5, cv=0.49),
            _num_col("fare", std=49.7, cv=1.54),
            # Sequential id: possible_id flag plus a high unique_pct.
            {
                "name": "passenger_id",
                "inferred_type": "numeric",
                "distinct_count": 891,
                "unique_pct": 1.0,
                "null_pct": 0.0,
                "flags": ["possible_id"],
                "numeric": {"std": 257.4, "cv": 0.58},
                "categorical": None,
            },
            # Constant column: flagged "constant", single distinct value.
            {
                "name": "embarked_const",
                "inferred_type": "categorical",
                "distinct_count": 1,
                "unique_pct": 0.001,
                "null_pct": 0.0,
                "flags": ["constant"],
                "numeric": None,
                "categorical": {"imbalance": 1.0},
            },
        ],
    }
|
||||
|
||||
|
||||
def test_titanic_picks_good_cats_excludes_id_and_constant():
    """Group keys are exactly the two usable categoricals, well-formed and ranked."""
    out = select_groupby_keys(_titanic_like_profile())

    # Picks both good categorical columns.
    chosen_cols = {g["col"] for g in out["group_keys"]}
    assert chosen_cols == {"sex", "pclass"}

    # Excludes the constant column and the key candidate.
    assert "embarked_const" not in chosen_cols
    assert "passenger_id" not in chosen_cols

    # Every group key carries col, cardinality and score.
    for g in out["group_keys"]:
        assert set(g.keys()) == {"col", "cardinality", "score"}
        assert isinstance(g["score"], float)
    by_col = {g["col"]: g for g in out["group_keys"]}
    assert by_col["sex"]["cardinality"] == 2
    assert by_col["pclass"]["cardinality"] == 3

    # Sorted by descending score.
    scores = [g["score"] for g in out["group_keys"]]
    assert scores == sorted(scores, reverse=True)
|
||||
|
||||
|
||||
def test_titanic_measures_exclude_id_constant_and_keep_numerics():
    """Measures are the informative numeric column names, highest cv first."""
    out = select_groupby_keys(_titanic_like_profile())

    # Only names (strings) of informative numerics, without the sequential id.
    assert all(isinstance(m, str) for m in out["measures"])
    assert "passenger_id" not in out["measures"]
    assert set(out["measures"]) == {"age", "fare"}

    # fare has the larger cv (1.54 > 0.49) -> it comes first.
    assert out["measures"][0] == "fare"
|
||||
|
||||
|
||||
def test_titanic_generates_one_pivot():
    """Two group keys produce a single pivot over the first measure."""
    out = select_groupby_keys(_titanic_like_profile())

    # With 2 group keys -> exactly 1 pivot.
    assert len(out["pivots"]) == 1
    pivot = out["pivots"][0]
    assert set(pivot.keys()) == {"index", "columns", "value"}
    assert {pivot["index"], pivot["columns"]} == {"sex", "pclass"}
    # The value is the first measure (fare).
    assert pivot["value"] == "fare"
|
||||
|
||||
|
||||
def test_empty_profile_returns_all_empty_and_does_not_crash():
    """An empty profile dict yields empty lists and a string note."""
    out = select_groupby_keys({})
    assert out["group_keys"] == []
    assert out["measures"] == []
    assert out["pivots"] == []
    assert isinstance(out["note"], str)
|
||||
|
||||
|
||||
def test_none_profile_does_not_crash():
    """A ``None`` profile yields the empty result shape with a string note."""
    out = select_groupby_keys(None)
    # "note" is compared against itself, so this pins the exact key set and
    # the empty lists while leaving the note text free; its type is checked
    # separately below.
    assert out == {
        "group_keys": [],
        "measures": [],
        "pivots": [],
        "note": out["note"],
    }
    assert isinstance(out["note"], str)
|
||||
|
||||
|
||||
def test_only_numerics_yields_empty_group_keys_and_no_pivots():
    """A profile with only numeric columns has measures but no keys or pivots."""
    profile = {
        "n_rows": 500,
        "key_candidates": [],
        "columns": [
            _num_col("price", std=12.0, cv=0.6),
            _num_col("weight", std=3.0, cv=0.2),
        ],
    }
    out = select_groupby_keys(profile)
    assert out["group_keys"] == []
    assert out["pivots"] == []
    # The numeric columns are still chosen as measures.
    assert set(out["measures"]) == {"price", "weight"}
    assert out["measures"][0] == "price"  # larger cv.
|
||||
|
||||
|
||||
def test_high_cardinality_and_max_card_are_excluded():
    """Flagged high-cardinality columns and columns above ``max_card`` are dropped."""
    profile = {
        "n_rows": 1000,
        "key_candidates": [],
        "columns": [
            _cat_col("city", 50, flags=["high_cardinality"]),  # flagged -> out.
            _cat_col("zone", 35),  # card 35 > max_card 20 -> out.
            _cat_col("region", 5),  # valid.
        ],
    }
    out = select_groupby_keys(profile, max_card=20)
    assert {g["col"] for g in out["group_keys"]} == {"region"}
|
||||
|
||||
|
||||
def test_max_keys_limits_group_keys():
    """``max_keys=2`` keeps two of four candidate keys and a single pivot."""
    profile = {
        "n_rows": 1000,
        "key_candidates": [],
        "columns": [
            _cat_col("a", 4, imbalance=1.0),
            _cat_col("b", 5, imbalance=1.2),
            _cat_col("c", 6, imbalance=1.5),
            _cat_col("d", 7, imbalance=2.0),
        ],
    }
    out = select_groupby_keys(profile, max_keys=2)
    assert len(out["group_keys"]) == 2
    # Up to 2 pivots with >=2 keys (here exactly 1 pair is possible between
    # 2 keys).
    assert len(out["pivots"]) == 1
|
||||
|
||||
|
||||
def test_three_keys_cap_pivots_to_two():
    """Three group keys allow three pairs, but only two pivots are expected."""
    profile = {
        "n_rows": 1000,
        "key_candidates": [],
        "columns": [
            _cat_col("a", 4, imbalance=1.0),
            _cat_col("b", 5, imbalance=1.1),
            _cat_col("c", 6, imbalance=1.2),
            _num_col("m", std=10.0, cv=0.5),
        ],
    }
    out = select_groupby_keys(profile, max_keys=3)
    assert len(out["group_keys"]) == 3
    # 3 keys -> 3 possible pairs, capped to 2.
    assert len(out["pivots"]) == 2
    for p in out["pivots"]:
        assert p["value"] == "m"
|
||||
|
||||
|
||||
def test_does_not_mutate_input():
    """The input profile is left unchanged by ``select_groupby_keys``."""
    profile = _titanic_like_profile()
    # Snapshot via repr so changes to values or key order are both detected.
    before = repr(profile)
    select_groupby_keys(profile)
    assert repr(profile) == before
|
||||
Reference in New Issue
Block a user