"""Tests para select_groupby_keys (grupo eda, dominio datascience).""" import os import sys sys.path.insert(0, os.path.dirname(__file__)) from select_groupby_keys import select_groupby_keys def _cat_col(name, card, *, imbalance=2.0, flags=None, null_pct=0.0): """ColumnProfile categorico minimo con bloque categorical.""" return { "name": name, "inferred_type": "categorical", "distinct_count": card, "unique_pct": card / 1000.0, "null_pct": null_pct, "flags": flags or [], "numeric": None, "categorical": {"imbalance": imbalance, "mode_pct": 0.5, "n_distinct": card}, } def _num_col(name, *, std, cv, flags=None, unique_pct=0.1): """ColumnProfile numerico minimo con bloque numeric.""" return { "name": name, "inferred_type": "numeric", "distinct_count": 200, "unique_pct": unique_pct, "null_pct": 0.0, "flags": flags or [], "numeric": {"std": std, "cv": cv}, "categorical": None, } def _titanic_like_profile() -> dict: """Perfil estilo titanic: 2 categoricas buenas, 2 numericas, 1 id, 1 constante.""" return { "n_rows": 891, "key_candidates": ["passenger_id"], "columns": [ _cat_col("sex", 2, imbalance=1.8), _cat_col("pclass", 3, imbalance=2.5), _num_col("age", std=14.5, cv=0.49), _num_col("fare", std=49.7, cv=1.54), # id secuencial: flag possible_id + unique_pct alto. { "name": "passenger_id", "inferred_type": "numeric", "distinct_count": 891, "unique_pct": 1.0, "null_pct": 0.0, "flags": ["possible_id"], "numeric": {"std": 257.4, "cv": 0.58}, "categorical": None, }, # columna constante: flag constant + std 0. { "name": "embarked_const", "inferred_type": "categorical", "distinct_count": 1, "unique_pct": 0.001, "null_pct": 0.0, "flags": ["constant"], "numeric": None, "categorical": {"imbalance": 1.0}, }, ], } def test_titanic_picks_good_cats_excludes_id_and_constant(): out = select_groupby_keys(_titanic_like_profile()) # Elige las dos categoricas buenas. chosen_cols = {g["col"] for g in out["group_keys"]} assert chosen_cols == {"sex", "pclass"} # Excluye la constante y el key_candidate. assert "embarked_const" not in chosen_cols assert "passenger_id" not in chosen_cols # Cada group key trae col, cardinality y score. for g in out["group_keys"]: assert set(g.keys()) == {"col", "cardinality", "score"} assert isinstance(g["score"], float) by_col = {g["col"]: g for g in out["group_keys"]} assert by_col["sex"]["cardinality"] == 2 assert by_col["pclass"]["cardinality"] == 3 # Ordenadas por score descendente. scores = [g["score"] for g in out["group_keys"]] assert scores == sorted(scores, reverse=True) def test_titanic_measures_exclude_id_constant_and_keep_numerics(): out = select_groupby_keys(_titanic_like_profile()) # Solo nombres (strings) de numericas informativas, sin el id secuencial. assert all(isinstance(m, str) for m in out["measures"]) assert "passenger_id" not in out["measures"] assert set(out["measures"]) == {"age", "fare"} # fare tiene mayor cv (1.54 > 0.49) -> primero. assert out["measures"][0] == "fare" def test_titanic_generates_one_pivot(): out = select_groupby_keys(_titanic_like_profile()) # Con 2 group keys -> exactamente 1 pivot. assert len(out["pivots"]) == 1 pivot = out["pivots"][0] assert set(pivot.keys()) == {"index", "columns", "value"} assert {pivot["index"], pivot["columns"]} == {"sex", "pclass"} # El valor es la primera measure (fare). assert pivot["value"] == "fare" def test_empty_profile_returns_all_empty_and_does_not_crash(): out = select_groupby_keys({}) assert out["group_keys"] == [] assert out["measures"] == [] assert out["pivots"] == [] assert isinstance(out["note"], str) def test_none_profile_does_not_crash(): out = select_groupby_keys(None) assert out == { "group_keys": [], "measures": [], "pivots": [], "note": out["note"], } assert isinstance(out["note"], str) def test_only_numerics_yields_empty_group_keys_and_no_pivots(): profile = { "n_rows": 500, "key_candidates": [], "columns": [ _num_col("price", std=12.0, cv=0.6), _num_col("weight", std=3.0, cv=0.2), ], } out = select_groupby_keys(profile) assert out["group_keys"] == [] assert out["pivots"] == [] # Las numericas si se eligen como measures. assert set(out["measures"]) == {"price", "weight"} assert out["measures"][0] == "price" # mayor cv. def test_high_cardinality_and_max_card_are_excluded(): profile = { "n_rows": 1000, "key_candidates": [], "columns": [ _cat_col("city", 50, flags=["high_cardinality"]), # flag -> fuera. _cat_col("zone", 35), # card 35 > max_card 20 -> fuera. _cat_col("region", 5), # valida. ], } out = select_groupby_keys(profile, max_card=20) assert {g["col"] for g in out["group_keys"]} == {"region"} def test_max_keys_limits_group_keys(): profile = { "n_rows": 1000, "key_candidates": [], "columns": [ _cat_col("a", 4, imbalance=1.0), _cat_col("b", 5, imbalance=1.2), _cat_col("c", 6, imbalance=1.5), _cat_col("d", 7, imbalance=2.0), ], } out = select_groupby_keys(profile, max_keys=2) assert len(out["group_keys"]) == 2 # Hasta 2 pivots con >=2 keys (aqui exactamente 1 par posible entre 2 keys). assert len(out["pivots"]) == 1 def test_three_keys_cap_pivots_to_two(): profile = { "n_rows": 1000, "key_candidates": [], "columns": [ _cat_col("a", 4, imbalance=1.0), _cat_col("b", 5, imbalance=1.1), _cat_col("c", 6, imbalance=1.2), _num_col("m", std=10.0, cv=0.5), ], } out = select_groupby_keys(profile, max_keys=3) assert len(out["group_keys"]) == 3 # 3 keys -> 3 pares posibles, capado a 2. assert len(out["pivots"]) == 2 for p in out["pivots"]: assert p["value"] == "m" def test_does_not_mutate_input(): profile = _titanic_like_profile() before = repr(profile) select_groupby_keys(profile) assert repr(profile) == before