"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.

Pure function: no I/O, deterministic.

Takes the `numeric` dict of a ColumnProfile (group `eda`, the output of
describe_numeric) and derives the figures needed to draw a horizontal Tukey
boxplot using the 1.5 * IQR rule. It only derives numbers from
already-computed percentiles; it never sees the raw column values.

Reading is defensive (.get throughout) and the function NEVER raises: if the
key percentiles (p25 / p50 / p75) are missing, NaN or infinite it returns {}
so the caller can simply skip the boxplot.
"""

import math


def _num(value):
    """Coerce to float defensively; return None for anything unusable.

    Returns None for None, bool, NaN, values that cannot be converted to
    float, and integers too large to be represented as a float.
    """
    # bool is a subclass of int; a percentile value is never a real bool, so
    # treat True/False as missing instead of silently coercing to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        result = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: e.g. float(10 ** 400). Must not escape, because the
        # public entry point promises never to raise.
        return None
    # NaN means "no value"; letting it through would poison every derived
    # figure (IQR, fences, whiskers) with NaN.
    if math.isnan(result):
        return None
    return result


def _count(value):
    """Coerce an outlier count to int; return 0 when it is not usable.

    Only real int/float values are accepted (bool excluded). NaN and
    +/-inf map to 0, because int() raises on them.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    if isinstance(value, float) and not math.isfinite(value):
        return 0
    return int(value)


def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already computed by describe_numeric and applies the
    classic 1.5 * IQR fence rule to obtain the whisker extremes and outlier
    flags of a horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile (output of
            describe_numeric). Every value may be None; read defensively.

    Returns:
        Dict with the boxplot figures {q1, median, q3, iqr, lower_fence,
        upper_fence, whisker_lo, whisker_hi, min, max, has_low_outliers,
        has_high_outliers, n_outliers}. `min` / `max` are None when absent
        or NaN in the input. If p25, p50/median or p75 are missing (None),
        NaN or infinite, returns {} (empty dict) so the caller omits the plot.
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))

    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without the three quartiles a boxplot cannot be drawn. Infinite
    # quartiles are rejected too: inf - inf would give a NaN IQR.
    if any(q is None or not math.isfinite(q) for q in (q1, median, q3)):
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = mn is not None and mn < lower_fence
    has_high_outliers = mx is not None and mx > upper_fence

    # Informative only: these outliers come from the z-score block of the
    # profile, not from this IQR fence computation.
    n_outliers = _count(numeric.get("n_outliers"))

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }