fn_registry/python/functions/datascience/describe_numeric_test.py

"""Tests para describe_numeric."""

import os
import sys

sys.path.insert(0, os.path.dirname(__file__))

from describe_numeric import describe_numeric

# Contract keys: every result dict must expose exactly this set (the eda
# numeric_sub contract). Grouped below by the kind of statistic they carry.
_EXPECTED_KEYS = {
    # Location and spread.
    "min",
    "max",
    "mean",
    "median",
    "mode",
    "std",
    "variance",
    "cv",
    # Percentiles and interquartile range.
    "p1",
    "p5",
    "p25",
    "p50",
    "p75",
    "p95",
    "p99",
    "iqr",
    # Shape and outliers.
    "skew",
    "kurtosis",
    "n_outliers",
    "outlier_pct",
    # Value shares and distribution summary.
    "zero_pct",
    "negative_pct",
    "distribution_type",
    # Histogram views.
    "histogram",
    "histogram_clipped",
}


def test_lista_con_outlier_y_none():
    """Describe a list with one clear outlier; the ``None`` entry is dropped.

    Covers the key contract, min/max, the mean/median ordering, the mode, the
    outlier counters, the histogram shape and coverage, and the zero/negative
    percentages.
    """
    # Tight cluster in [1, 4] plus a None to drop and a clear extreme outlier.
    # Repeating the cluster (n=40) keeps std small so the extreme value's
    # z-score exceeds the 3.0 threshold used by detect_outliers.
    cluster = [1, 2, 2, 3, 4] * 8  # 40 numeric values, mode == 2
    values = cluster + [None, 1000]
    result = describe_numeric(values)

    # Contract: all keys present.
    assert set(result.keys()) == _EXPECTED_KEYS

    # Extremes are taken from the numeric values only.
    assert result["min"] == 1.0
    assert result["max"] == 1000.0

    # Median sits in the cluster while the mean is pulled up by the 1000.
    assert result["median"] < result["mean"]
    assert 0.0 < result["median"] <= 5.0

    # mode = most frequent (2 appears twice per block).
    assert result["mode"] == 2.0

    # At least one z-score outlier detected (the 1000).
    assert result["n_outliers"] >= 1
    assert result["outlier_pct"] > 0.0

    histogram = result["histogram"]
    assert len(histogram) > 0
    # Validate each bucket's shape before reading its count, so a malformed
    # bucket fails on this assertion rather than on a KeyError below.
    for bucket in histogram:
        assert "lo" in bucket and "hi" in bucket and "count" in bucket
    # The None was dropped: the 41 remaining numeric values are all binned.
    total = sum(bucket["count"] for bucket in histogram)
    assert total == 41

    # No zeros, no negatives in this sample.
    assert result["zero_pct"] == 0.0
    assert result["negative_pct"] == 0.0


def test_lista_vacia_todo_none():
    """With no usable numeric value, scalar keys are None and histograms empty.

    The input holds only a None, a string and a NaN; the result must still
    expose the full key set.
    """
    no_numeric_input = [None, "abc", float("nan")]
    summary = describe_numeric(no_numeric_input)

    assert set(summary.keys()) == _EXPECTED_KEYS

    # Every key except the two list-valued histogram views must be None.
    list_valued = {"histogram", "histogram_clipped"}
    scalar_keys = _EXPECTED_KEYS - list_valued
    for key in scalar_keys:
        assert summary[key] is None, f"{key} debe ser None"

    # The histogram views degrade to empty lists rather than None.
    for view in ("histogram", "histogram_clipped"):
        assert summary[view] == []


def test_cv_none_cuando_mean_cero():
    """cv must be None when the mean is 0; zero/negative shares are checked too."""
    # Sample is symmetric around zero, so its mean is exactly 0.
    symmetric_sample = [-2, -1, 0, 1, 2]
    stats = describe_numeric(symmetric_sample)

    assert stats["mean"] == 0.0
    assert stats["cv"] is None

    # One zero and two negatives out of five values.
    assert stats["zero_pct"] == 20.0
    assert stats["negative_pct"] == 40.0


def test_iqr_y_percentiles():
    """Check that iqr equals p75 - p25 and that the percentiles are ordered."""
    result = describe_numeric(list(range(1, 101)))  # 1..100

    # Compare with a tolerance: iqr and the quartiles are floats, and exact
    # equality on a derived float is brittle against rounding differences.
    tolerance = 1e-9
    assert abs(result["iqr"] - (result["p75"] - result["p25"])) <= tolerance

    # Every reported percentile takes part in the ordering check, bracketed by
    # the sample extremes.
    ordered_keys = ("min", "p1", "p5", "p25", "p50", "p75", "p95", "p99", "max")
    for lower, upper in zip(ordered_keys, ordered_keys[1:]):
        assert result[lower] <= result[upper], f"{lower} > {upper}"

    assert result["min"] == 1.0
    assert result["max"] == 100.0


# --------------------------------------------------------------------------- #
# histogram_clipped: second view of the central mass, outliers trimmed.
# --------------------------------------------------------------------------- #
def test_histogram_clipped_trims_the_tail():
    """Golden case: a long high tail is absent from the clipped histogram.

    The sample is a tight cluster in [1, 5] plus three extreme values. The
    test expects the full histogram's top edge to reach the extremes, the
    clipped histogram's top edge to stay far below them, and the clipped view
    to count fewer values than the full one while keeping the whole cluster.
    """
    core = [1, 2, 3, 4, 5] * 20        # 100 values in [1, 5]
    far_tail = [500, 800, 1000]        # 3 far outliers
    stats = describe_numeric(core + far_tail)

    full_view = stats["histogram"]
    clipped_view = stats["histogram_clipped"]

    # Both views must be populated.
    assert full_view and clipped_view
    for bucket in clipped_view:
        assert all(field in bucket for field in ("lo", "hi", "count"))

    # Upper edges: the full view reaches the extreme, the clipped one does not.
    full_top_edge = full_view[-1]["hi"]
    clipped_top_edge = clipped_view[-1]["hi"]
    assert full_top_edge >= 900
    assert clipped_top_edge < 100

    # The clip dropped the tail but kept every value of the cluster.
    full_count = sum(bucket["count"] for bucket in full_view)
    clipped_count = sum(bucket["count"] for bucket in clipped_view)
    assert full_count == 103
    assert clipped_count < full_count
    assert clipped_count >= 100


def test_histogram_clipped_empty_when_no_outliers():
    """Edge case: a clean spread with no outliers gives an empty clipped view.

    For the uniform sample 1..100 the test expects zero outliers, a populated
    full histogram, and ``histogram_clipped == []`` because nothing is trimmed.
    """
    uniform_sample = list(range(1, 101))  # 1..100, no outliers
    stats = describe_numeric(uniform_sample)

    assert stats["n_outliers"] == 0
    # The full histogram is still produced.
    assert stats["histogram"]
    # Nothing was trimmed, so there is no second view.
    assert stats["histogram_clipped"] == []


def test_histogram_clipped_empty_when_constant():
    """Edge case: a constant column (iqr == 0) has no clipped histogram."""
    constant_column = [7] * 30
    stats = describe_numeric(constant_column)

    # All values are identical, so the interquartile range collapses to zero.
    assert stats["iqr"] == 0
    assert stats["histogram_clipped"] == []