763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
91 lines
2.7 KiB
Python
91 lines
2.7 KiB
Python
"""Tests para summarize_categorical."""
|
|
|
|
import sys
|
|
import os
|
|
|
|
# Put this file's own directory first on the import path; presumably so the
# sibling ``summarize_categorical`` module imported below resolves regardless
# of the working directory the tests are launched from (confirm with the
# project's test layout).
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from summarize_categorical import summarize_categorical
|
|
|
|
|
|
def test_summarize_categorical_repeated():
    """Repeated values: ``top`` is ranked by descending count; scalar stats match."""
    summary = summarize_categorical(["a", "a", "b", "c", "a", None, ""])

    # None is discarded, so six non-null values remain: a, a, b, c, a, "".
    ranking = summary["top"]
    assert [entry["value"] for entry in ranking] == ["a", "b", "c", ""]
    leader = ranking[0]
    assert leader["count"] == 3
    # The ranking must be in descending count order (no entry may outrank
    # the one before it).
    tallies = [entry["count"] for entry in ranking]
    assert all(prev >= nxt for prev, nxt in zip(tallies, tallies[1:]))
    expected_share = 3 / 6
    assert abs(leader["pct"] - expected_share) < 1e-12

    assert summary["mode"] == "a"
    assert abs(summary["mode_pct"] - expected_share) < 1e-12
    assert summary["n_distinct"] == 4
    assert summary["entropy"] > 0
    # Largest count (3) divided by smallest count (1).
    assert summary["imbalance"] == 3 / 1
    # The empty string contributes the minimum length.
    assert summary["len_min"] == 0
    assert summary["len_max"] == 1
|
|
|
|
|
|
def test_summarize_categorical_empty():
    """Empty list: ``top`` is ``[]`` and every other key is None."""
    summary = summarize_categorical([])
    assert summary["top"] == []

    scalar_keys = (
        "mode",
        "mode_pct",
        "n_distinct",
        "entropy",
        "imbalance",
        "len_mean",
        "len_min",
        "len_max",
    )
    # Collect offenders so a failure names every key that is not None.
    populated = [name for name in scalar_keys if summary[name] is not None]
    assert populated == []
|
|
|
|
|
|
def test_summarize_categorical_all_none():
    """A list containing only None is treated exactly like an empty list."""
    result = summarize_categorical([None, None, None])
    assert result["top"] == []
    # "Treated as empty" means every scalar statistic is None, so check the
    # whole contract rather than a sample of it; otherwise a stray value in
    # e.g. ``mode`` or ``len_min`` would slip through unnoticed.
    for key in (
        "mode",
        "mode_pct",
        "n_distinct",
        "entropy",
        "imbalance",
        "len_mean",
        "len_min",
        "len_max",
    ):
        assert result[key] is None, key
|
|
|
|
|
|
def test_summarize_categorical_single_value():
    """A single distinct value gives entropy 0.0 and imbalance 1.0."""
    summary = summarize_categorical(["x", "x", "x"])

    # Checked in insertion order, one key at a time.
    expected = {
        "n_distinct": 1,
        "entropy": 0.0,
        "imbalance": 1.0,
        "mode": "x",
        "mode_pct": 1.0,
        "len_mean": 1.0,
    }
    for name, wanted in expected.items():
        assert summary[name] == wanted
|
|
|
|
|
|
def test_summarize_categorical_top_k():
    """``top_k`` caps the number of ``top`` entries without changing n_distinct."""
    sample = ["a", "a", "b", "b", "c", "d", "e"]
    summary = summarize_categorical(sample, top_k=2)

    # Only two entries are reported ...
    shown = summary["top"]
    assert len(shown) == 2
    # ... while all five distinct values are still counted.
    assert summary["n_distinct"] == 5
|
|
|
|
|
|
def test_summarize_categorical_keys():
    """The returned dict has exactly the keys of the categorical_sub contract."""
    contract_keys = {
        "top",
        "mode",
        "mode_pct",
        "n_distinct",
        "entropy",
        "imbalance",
        "len_mean",
        "len_min",
        "len_max",
    }
    summary = summarize_categorical(["a", "b"])
    actual_keys = set(summary.keys())
    # Exact equality: neither missing nor extra keys are tolerated.
    assert actual_keys == contract_keys
|