Files
fn_registry/python/functions/datascience/build_boxplot_stats.py
T

95 lines
3.6 KiB
Python

"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.
Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
(group `eda`, the output of describe_numeric) and derives the figures needed to
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.
It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function NEVER
raises: if the key percentiles (p25 / p50 / p75) are missing it returns {} so the
caller can simply skip the boxplot.
"""
def _num(value):
"""Coerce to float defensively; return None for None/bool/non-numeric."""
# bool is a subclass of int; a percentile value is never a real bool, so
# treat True/False as missing instead of silently coercing to 1.0/0.0.
if value is None or isinstance(value, bool):
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the already-computed percentiles (``p25``, ``median`` or ``p50``,
    ``p75``) plus ``min`` / ``max`` / ``n_outliers`` with ``.get`` and applies
    the classic 1.5 * IQR fence rule to obtain the whisker extremes and the
    outlier flags of a horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile (output of
            describe_numeric). Every value may be None; read defensively.

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        Returns {} (empty dict) so the caller can omit the plot when
        ``numeric`` is not a dict, or when any of p25, median/p50 or p75 is
        missing or not a finite number (NaN / +-inf).
        ``min`` / ``max`` are None when absent or NaN; ``n_outliers`` is 0
        when absent, non-numeric or a non-finite float.
    """
    # Kept local so this function does not depend on a module-level import.
    import math

    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))

    # Prefer the explicit median; fall back to p50 (they are the same quantile)
    # when the median is missing or unusable.
    median = _num(numeric.get("median"))
    if median is None or not math.isfinite(median):
        median = _num(numeric.get("p50"))

    # Without three finite quartiles a boxplot cannot be drawn: NaN or infinite
    # quartiles would only yield NaN fences and whiskers.
    if any(q is None or not math.isfinite(q) for q in (q1, median, q3)):
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))
    # A NaN extreme carries no information and would make max()/min() below
    # return NaN, so treat it as missing. Infinite extremes are kept: they
    # still compare correctly against the fences.
    if mn is not None and math.isnan(mn):
        mn = None
    if mx is not None and math.isnan(mx):
        mx = None

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = mn is not None and mn < lower_fence
    has_high_outliers = mx is not None and mx > upper_fence

    # Informative only: per the profile's convention this count comes from the
    # z-score block, not from this IQR fence computation.
    raw_n = numeric.get("n_outliers")
    if isinstance(raw_n, bool) or not isinstance(raw_n, (int, float)):
        n_outliers = 0
    elif isinstance(raw_n, float) and not math.isfinite(raw_n):
        # int(nan) raises ValueError and int(inf) raises OverflowError.
        n_outliers = 0
    else:
        n_outliers = int(raw_n)

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }