a9a60cbf2c
describe_numeric emite una nueva clave aditiva histogram_clipped: un segundo histograma re-binado sobre el rango de vallas de Tukey [p25-1.5*IQR, p75+1.5*IQR], reutilizando los percentiles ya calculados. Es [] cuando el recorte no excluye nada (sin outliers), la columna es constante (iqr==0) o la sub-muestra recortada pierde dispersion, de modo que el renderer no duplica el histograma completo. El capitulo num_distr consume histogram_clipped como una segunda figura DENTRO del mismo grupo keep-together de la columna: la vista central se lee cuando una cola larga aplasta la escala del histograma completo. Bump describe_numeric 1.0.0->1.1.0 (aditivo) y CHAPTER_VERSION num_distr 1.3.0->1.4.0. Tests: golden (recorta la cola), edges (sin outliers -> [], constante -> []), contrato de claves y smoke e2e de render.
141 lines
5.4 KiB
Python
141 lines
5.4 KiB
Python
"""Tests para describe_numeric."""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from describe_numeric import describe_numeric
|
|
|
|
# The eda numeric_sub contract: the exact set of keys the tests below expect
# in every dict returned by describe_numeric.
_EXPECTED_KEYS = {
    # Location and spread.
    "min",
    "max",
    "mean",
    "median",
    "mode",
    "std",
    "variance",
    "cv",
    # Percentiles and interquartile range.
    "p1",
    "p5",
    "p25",
    "p50",
    "p75",
    "p95",
    "p99",
    "iqr",
    # Shape and outliers.
    "skew",
    "kurtosis",
    "n_outliers",
    "outlier_pct",
    # Zero / negative shares and distribution label.
    "zero_pct",
    "negative_pct",
    "distribution_type",
    # Histograms: full range and clipped view.
    "histogram",
    "histogram_clipped",
}
|
|
|
|
|
|
def test_lista_con_outlier_y_none():
    """Summarise a tight cluster plus one extreme value and a ``None``.

    Checks that the ``None`` is dropped, that the extreme value is reported as
    an outlier, and that the full histogram accounts for every numeric value.
    """
    # A large (n=40), tight cluster in [1, 4] keeps std small, so the extreme
    # value's z-score exceeds the 3.0 threshold used by detect_outliers.
    cluster = [1, 2, 2, 3, 4] * 8  # 40 numeric values, mode == 2
    values = cluster + [None, 1000]
    n_numeric = len(cluster) + 1  # cluster + the 1000; the None does not count
    result = describe_numeric(values)

    # Contract: all keys present.
    assert set(result.keys()) == _EXPECTED_KEYS

    # The range spans the cluster minimum up to the extreme value.
    assert result["min"] == 1.0
    assert result["max"] == 1000.0

    # The median stays inside the cluster while the mean is pulled up by 1000.
    assert result["median"] < result["mean"]
    assert 0.0 < result["median"] <= 5.0

    # mode = most frequent value (2 appears twice per 5-value block).
    assert result["mode"] == 2.0

    # At least one outlier detected (the 1000).
    assert result["n_outliers"] >= 1
    assert result["outlier_pct"] > 0.0

    # Histogram is non-empty and its counts cover every numeric value, which
    # also proves the None was dropped rather than counted.
    histogram = result["histogram"]
    assert len(histogram) > 0
    total = sum(bucket["count"] for bucket in histogram)
    assert total == n_numeric
    for bucket in histogram:
        assert "lo" in bucket and "hi" in bucket and "count" in bucket

    # No zeros and no negatives in this sample.
    assert result["zero_pct"] == 0.0
    assert result["negative_pct"] == 0.0
|
|
|
|
|
|
def test_lista_vacia_todo_none():
    """Without any usable numeric value, scalars are None and histograms empty."""
    # None, a non-numeric string and NaN: nothing numeric is left to describe.
    unusable_values = [None, "abc", float("nan")]
    result = describe_numeric(unusable_values)

    # The key contract holds even when there is nothing to summarise.
    assert set(result.keys()) == _EXPECTED_KEYS

    # Every key except the two histograms must be None.
    scalar_keys = _EXPECTED_KEYS - {"histogram", "histogram_clipped"}
    for key in scalar_keys:
        assert result[key] is None, f"{key} debe ser None"

    # Both histogram views are empty lists rather than None.
    for histogram_key in ("histogram", "histogram_clipped"):
        assert result[histogram_key] == []
|
|
|
|
|
|
def test_cv_none_cuando_mean_cero():
    """cv must be None when the mean is 0."""
    # Values mirrored around zero, so the mean is exactly 0.
    symmetric_values = [-2, -1, 0, 1, 2]
    result = describe_numeric(symmetric_values)

    assert result["mean"] == 0.0
    assert result["cv"] is None

    # One zero and two negatives out of five values.
    assert result["zero_pct"] == 20.0
    assert result["negative_pct"] == 40.0
|
|
|
|
|
|
def test_iqr_y_percentiles():
    """iqr equals p75 - p25 and the percentiles are ordered."""
    one_to_hundred = list(range(1, 101))  # 1..100
    result = describe_numeric(one_to_hundred)

    p25 = result["p25"]
    p75 = result["p75"]

    # The interquartile range is the distance between the quartiles.
    assert result["iqr"] == p75 - p25

    # Percentiles never decrease as the rank grows.
    assert result["p1"] <= p25 <= result["p50"] <= p75 <= result["p99"]

    # The extremes match the ends of the range.
    assert result["min"] == 1.0
    assert result["max"] == 100.0
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# histogram_clipped: second view of the central mass, outliers trimmed.
|
|
# --------------------------------------------------------------------------- #
|
|
def test_histogram_clipped_trims_the_tail():
    """Golden: a long high tail is left out of the clipped histogram.

    The sample is a tight cluster in [1, 5] plus a few extreme values. The full
    histogram spans min..max and therefore reaches the extremes; the clipped
    one is re-binned over the Tukey inner fences, so its upper edge must stay
    far below the extremes and it must count fewer values than the full sample.
    """
    cluster = [1, 2, 3, 4, 5] * 20  # 100 values in [1, 5]
    far_outliers = [500, 800, 1000]
    values = cluster + far_outliers
    result = describe_numeric(values)

    full = result["histogram"]
    clipped = result["histogram_clipped"]

    # Both views must be present for this sample.
    assert full
    assert clipped
    for bucket in clipped:
        assert all(key in bucket for key in ("lo", "hi", "count"))

    # The full histogram reaches the extreme; the clipped one does not.
    assert full[-1]["hi"] >= 900
    assert clipped[-1]["hi"] < 100

    # The clip removed the tail: fewer values counted than the full sample,
    # yet the whole cluster survives the clip.
    total_full = sum(bucket["count"] for bucket in full)
    total_clipped = sum(bucket["count"] for bucket in clipped)
    assert total_full == 103
    assert total_clipped < total_full
    assert total_clipped >= 100
|
|
|
|
|
|
def test_histogram_clipped_empty_when_no_outliers():
    """Edge: with no fence outliers the clipped view is empty.

    If the inner-fence range already covers every value there is nothing to
    trim, so histogram_clipped is [] and the renderer can skip the second view
    rather than repeat the full histogram.
    """
    uniform_values = list(range(1, 101))  # uniform 1..100, no outliers
    result = describe_numeric(uniform_values)

    assert result["n_outliers"] == 0
    # The full histogram is still produced...
    assert result["histogram"]
    # ...but nothing was trimmed, so there is no clipped view.
    assert result["histogram_clipped"] == []
|
|
|
|
|
|
def test_histogram_clipped_empty_when_constant():
    """Edge: a constant column (iqr == 0) yields no clipped view."""
    constant_column = [7] * 30
    result = describe_numeric(constant_column)

    # All values are identical, so the interquartile range collapses to zero.
    assert result["iqr"] == 0
    assert result["histogram_clipped"] == []
|