"""Tests para describe_numeric.""" import os import sys sys.path.insert(0, os.path.dirname(__file__)) from describe_numeric import describe_numeric # Keys that every result dict must always contain (the eda numeric_sub contract). _EXPECTED_KEYS = { "min", "max", "mean", "median", "mode", "std", "variance", "cv", "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr", "skew", "kurtosis", "n_outliers", "outlier_pct", "zero_pct", "negative_pct", "distribution_type", "histogram", "histogram_clipped", } def test_lista_con_outlier_y_none(): """Lista con outlier claro y None descartado.""" # Tight cluster around 2-4 plus a None to drop and a clear extreme outlier. # A wide cluster (n=40) keeps std small so the extreme value's z-score # exceeds the 3.0 threshold used by detect_outliers. cluster = [1, 2, 2, 3, 4] * 8 # 40 numeric values, mode == 2 values = cluster + [None, 1000] result = describe_numeric(values) # Contract: all keys present. assert set(result.keys()) == _EXPECTED_KEYS # Non-numeric / missing dropped: 41 numeric values remain. assert result["min"] == 1.0 assert result["max"] == 1000.0 # mean/median reasonable: median sits in the cluster, mean pulled up by 1000. assert result["median"] < result["mean"] assert 0.0 < result["median"] <= 5.0 assert result["mean"] > result["median"] # mode = most frequent (2 appears twice per block). assert result["mode"] == 2.0 # At least one z-score outlier detected (the 1000). assert result["n_outliers"] >= 1 assert result["outlier_pct"] > 0.0 # Histogram non-empty and counts cover every numeric value. assert len(result["histogram"]) > 0 total = sum(bucket["count"] for bucket in result["histogram"]) assert total == 41 for bucket in result["histogram"]: assert "lo" in bucket and "hi" in bucket and "count" in bucket # No zeros, no negatives in this sample. assert result["zero_pct"] == 0.0 assert result["negative_pct"] == 0.0 def test_lista_vacia_todo_none(): """Lista vacia (o sin numericos) devuelve todas las claves en None.""" result = describe_numeric([None, "abc", float("nan")]) assert set(result.keys()) == _EXPECTED_KEYS for key in _EXPECTED_KEYS - {"histogram", "histogram_clipped"}: assert result[key] is None, f"{key} debe ser None" assert result["histogram"] == [] assert result["histogram_clipped"] == [] def test_cv_none_cuando_mean_cero(): """cv es None cuando la media es 0.""" # Symmetric around zero so mean == 0. result = describe_numeric([-2, -1, 0, 1, 2]) assert result["mean"] == 0.0 assert result["cv"] is None assert result["zero_pct"] == 20.0 assert result["negative_pct"] == 40.0 def test_iqr_y_percentiles(): """iqr = p75 - p25 y percentiles coherentes.""" result = describe_numeric(list(range(1, 101))) # 1..100 assert result["iqr"] == result["p75"] - result["p25"] assert result["p1"] <= result["p25"] <= result["p50"] <= result["p75"] <= result["p99"] assert result["min"] == 1.0 assert result["max"] == 100.0 # --------------------------------------------------------------------------- # # histogram_clipped: second view of the central mass, outliers trimmed. # --------------------------------------------------------------------------- # def test_histogram_clipped_trims_the_tail(): """Golden: with a long high tail, the clipped histogram excludes the outliers. A tight cluster in [1, 5] plus a handful of extreme values. The full histogram stretches to the extreme (min..max); the clipped one is re-binned over the Tukey inner fences, so its upper edge stays far below the extreme and it holds fewer values than the full sample. """ cluster = [1, 2, 3, 4, 5] * 20 # 100 values in [1, 5] values = cluster + [500, 800, 1000] # 3 far outliers result = describe_numeric(values) full = result["histogram"] clipped = result["histogram_clipped"] assert full and clipped # both present for bucket in clipped: assert "lo" in bucket and "hi" in bucket and "count" in bucket # The full histogram reaches the extreme; the clipped one does not. assert full[-1]["hi"] >= 900 assert clipped[-1]["hi"] < 100 # The clip removed the tail: fewer values counted than the full sample. total_full = sum(b["count"] for b in full) total_clipped = sum(b["count"] for b in clipped) assert total_full == 103 assert total_clipped < total_full assert total_clipped >= 100 # the whole cluster survives the clip def test_histogram_clipped_empty_when_no_outliers(): """Edge: a clean spread with no fence outliers yields an empty clipped view. When the inner-fence range already covers every value, there is nothing to trim, so histogram_clipped is [] and the renderer skips the redundant second view instead of duplicating the full histogram. """ result = describe_numeric(list(range(1, 101))) # uniform 1..100, no outliers assert result["n_outliers"] == 0 assert result["histogram"] # full histogram present assert result["histogram_clipped"] == [] # nothing trimmed def test_histogram_clipped_empty_when_constant(): """Edge: a constant column (iqr == 0) never produces a clipped view.""" result = describe_numeric([7] * 30) assert result["iqr"] == 0 assert result["histogram_clipped"] == []