fcf5a4c6a3
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
95 lines
3.6 KiB
Python
95 lines
3.6 KiB
Python
"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.
|
|
|
|
Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
|
|
(group `eda`, the output of describe_numeric) and derives the figures needed to
|
|
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.
|
|
|
|
It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function is
designed never to raise: if any key quartile (p25, the median — read from
`median`, falling back to `p50` — or p75) is missing it returns {} so the
caller can simply skip the boxplot.
|
|
"""
|
|
|
|
|
|
def _num(value):
|
|
"""Coerce to float defensively; return None for None/bool/non-numeric."""
|
|
# bool is a subclass of int; a percentile value is never a real bool, so
|
|
# treat True/False as missing instead of silently coercing to 1.0/0.0.
|
|
if value is None or isinstance(value, bool):
|
|
return None
|
|
try:
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already present in ``numeric`` and applies the
    classic 1.5 * IQR fence rule to obtain the whisker extremes and outlier
    flags of a boxplot. No raw values are needed.

    Args:
        numeric: Mapping with the keys ``p25``, ``p75``, ``median`` (or
            ``p50``), and optionally ``min``, ``max`` and ``n_outliers``.
            Every value may be None, NaN or absent; anything that is not a
            dict yields ``{}``.

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        Returns {} (empty dict) when p25, median/p50 or p75 is missing or NaN,
        or when the IQR itself is undefined (NaN), so the caller can omit
        the plot. ``min`` / ``max`` are None when unavailable.
    """
    if not isinstance(numeric, dict):
        return {}

    def _read(key):
        # NaN is the only float unequal to itself; treat it as missing so it
        # cannot leak into the fences, whiskers or outlier flags.
        value = _num(numeric.get(key))
        if value is None or value != value:
            return None
        return value

    q1 = _read("p25")
    q3 = _read("p75")
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _read("median")
    if median is None:
        median = _read("p50")

    # Without the three quartiles a boxplot cannot be drawn.
    if q1 is None or q3 is None or median is None:
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    if iqr != iqr:
        # Both quartiles are the same infinity (inf - inf is NaN): no
        # meaningful box can be derived.
        return {}
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _read("min")
    mx = _read("max")

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = mn is not None and mn < lower_fence
    has_high_outliers = mx is not None and mx > upper_fence

    # n_outliers is passed through from the input for information only; it is
    # not derived from the IQR fences computed above.
    raw_n = numeric.get("n_outliers")
    n_outliers = 0
    if isinstance(raw_n, (int, float)) and not isinstance(raw_n, bool):
        try:
            n_outliers = int(raw_n)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
            # report 0 rather than break the never-raise contract.
            n_outliers = 0

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }
|