"""Tests para summarize_categorical.""" import sys import os sys.path.insert(0, os.path.dirname(__file__)) from summarize_categorical import summarize_categorical def test_summarize_categorical_repeated(): """Lista con repetidos: top ordenado por count desc, mode/n_distinct/entropy.""" values = ["a", "a", "b", "c", "a", None, ""] result = summarize_categorical(values) # None descartado; total no-nulo = 6 (a,a,b,c,a,""). assert [t["value"] for t in result["top"]] == ["a", "b", "c", ""] assert result["top"][0]["count"] == 3 # top ordenado por count descendente. counts = [t["count"] for t in result["top"]] assert counts == sorted(counts, reverse=True) assert abs(result["top"][0]["pct"] - 3 / 6) < 1e-12 assert result["mode"] == "a" assert abs(result["mode_pct"] - 3 / 6) < 1e-12 assert result["n_distinct"] == 4 assert result["entropy"] > 0 assert result["imbalance"] == 3 / 1 # max_count(3) / min_count(1) assert result["len_min"] == 0 # the "" value assert result["len_max"] == 1 def test_summarize_categorical_empty(): """Lista vacia: top=[] y resto de claves None.""" result = summarize_categorical([]) assert result["top"] == [] for key in ( "mode", "mode_pct", "n_distinct", "entropy", "imbalance", "len_mean", "len_min", "len_max", ): assert result[key] is None def test_summarize_categorical_all_none(): """Lista de solo None se trata como vacia.""" result = summarize_categorical([None, None, None]) assert result["top"] == [] assert result["n_distinct"] is None assert result["entropy"] is None def test_summarize_categorical_single_value(): """Un solo valor distinto: entropy 0.0, imbalance 1.0.""" result = summarize_categorical(["x", "x", "x"]) assert result["n_distinct"] == 1 assert result["entropy"] == 0.0 assert result["imbalance"] == 1.0 assert result["mode"] == "x" assert result["mode_pct"] == 1.0 assert result["len_mean"] == 1.0 def test_summarize_categorical_top_k(): """top_k limita el numero de entradas en top sin alterar n_distinct.""" values = ["a", "a", "b", "b", "c", "d", "e"] result = summarize_categorical(values, top_k=2) assert len(result["top"]) == 2 assert result["n_distinct"] == 5 def test_summarize_categorical_keys(): """El dict tiene exactamente las claves del contrato categorical_sub.""" result = summarize_categorical(["a", "b"]) assert set(result.keys()) == { "top", "mode", "mode_pct", "n_distinct", "entropy", "imbalance", "len_mean", "len_min", "len_max", }