# fn_registry/python/functions/datascience/build_boxplot_stats.py

"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.

Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
(group `eda`, the output of describe_numeric) and derives the figures needed to
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.

It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function NEVER
raises: if the key percentiles (p25 / p50 / p75) are missing it returns {} so the
caller can simply skip the boxplot.
"""


def _num(value):
    """Coerce to float defensively; return None for None/bool/non-numeric."""
    # bool is a subclass of int; a percentile value is never a real bool, so
    # treat True/False as missing instead of silently coercing to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already computed by describe_numeric and applies the
    classic 1.5 * IQR fence rule to obtain the whisker extremes and outlier
    flags of a horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile (output of
            describe_numeric). Every value may be None; read defensively.
            Anything that is not a dict yields {}.

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
         min, max, has_low_outliers, has_high_outliers, n_outliers}.
        `min` / `max` are None when absent from the input. `n_outliers` is 0
        when absent, not an int/float, a bool, or a non-finite float.
        If p25, p50/median or p75 are missing (None) returns {} (empty dict) so
        the caller omits the plot.
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without the three quartiles a boxplot cannot be drawn.
    if q1 is None or q3 is None or median is None:
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = mn is not None and mn < lower_fence
    has_high_outliers = mx is not None and mx > upper_fence

    # Informative only: these outliers come from the z-score block of the
    # profile, not from this IQR fence computation.
    raw_n = numeric.get("n_outliers")
    n_outliers = 0
    if isinstance(raw_n, (int, float)) and not isinstance(raw_n, bool):
        try:
            n_outliers = int(raw_n)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
            # keep the "never raises" contract and report no outliers.
            n_outliers = 0

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }