feat(eda): build_boxplot_stats — estadísticas de boxplot Tukey desde sub-bloque numeric
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,94 @@
|
||||
"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.
|
||||
|
||||
Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
|
||||
(group `eda`, the output of describe_numeric) and derives the figures needed to
|
||||
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.
|
||||
|
||||
It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function is
designed never to raise: if the key percentiles (p25 / p50 / p75) are missing it
returns {} so the caller can simply skip the boxplot.
|
||||
"""
|
||||
|
||||
|
||||
def _num(value):
    """Coerce *value* to a finite float defensively; return None when unusable.

    Returns None for:
      * None and bool (bool is a subclass of int; a percentile value is never
        a real bool, so True/False are treated as missing instead of being
        silently coerced to 1.0/0.0),
      * anything ``float()`` rejects (TypeError / ValueError),
      * integers too large for a float (``float()`` raises OverflowError),
      * NaN and +/-infinity, which cannot be placed on a boxplot axis and
        would poison every later comparison and min/max clamp.

    Otherwise returns ``float(value)``.
    """
    # Function-scope import keeps this helper self-contained.
    import math

    if value is None or isinstance(value, bool):
        return None
    try:
        result = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: e.g. float(10 ** 400) -- int too large for a float.
        return None
    # NaN / inf are "numbers" to float() but are not drawable statistics.
    return result if math.isfinite(result) else None
|
||||
|
||||
|
||||
def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already present in ``numeric`` and applies the
    classic 1.5 * IQR fence rule to obtain the whisker extremes and outlier
    flags of a horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile (output of
            describe_numeric). Every value may be None; read defensively.
            Anything that is not a dict is treated as "no data".

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        If p25, median/p50 or p75 are missing (as judged by ``_num``) returns
        {} (empty dict) so the caller omits the plot. ``min`` / ``max`` are
        None when absent. ``n_outliers`` is 0 when absent, not an int/float,
        a bool, or a float that cannot be converted to an integer
        (NaN / infinity).
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without the three quartiles a boxplot cannot be drawn.
    if q1 is None or q3 is None or median is None:
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the real data range clamped to the fences. Only min/max
    # are known here (no raw values), so when an extreme lies beyond its fence
    # the whisker is drawn at the fence. When the corresponding extreme is
    # missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = bool(mn is not None and mn < lower_fence)
    has_high_outliers = bool(mx is not None and mx > upper_fence)

    # Informative only: these outliers come from the z-score block of the
    # profile, not from this IQR fence computation.
    raw_n = numeric.get("n_outliers")
    n_outliers = 0
    if isinstance(raw_n, (int, float)) and not isinstance(raw_n, bool):
        try:
            n_outliers = int(raw_n)
        except (ValueError, OverflowError):
            # int(float('nan')) raises ValueError and int(float('inf')) raises
            # OverflowError; treat such counts as "unknown" instead of crashing.
            n_outliers = 0

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }
|
||||
Reference in New Issue
Block a user