diff --git a/python/functions/datascience/build_boxplot_stats.md b/python/functions/datascience/build_boxplot_stats.md new file mode 100644 index 00000000..05dd5fab --- /dev/null +++ b/python/functions/datascience/build_boxplot_stats.md @@ -0,0 +1,58 @@ +--- +name: build_boxplot_stats +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def build_boxplot_stats(numeric: dict) -> dict" +description: "Deriva las estadisticas de un boxplot de Tukey desde el sub-bloque numeric de un ColumnProfile del grupo eda (salida de describe_numeric). Aplica la regla del 1.5*IQR a los percentiles p25/p50/p75 para obtener cuartiles, fences, bigotes reales y flags de outliers. Lectura defensiva con .get; NUNCA lanza. Si faltan los percentiles clave devuelve {} para que el caller omita el grafico." +tags: [eda, statistics, profiling, boxplot, tukey, iqr, datascience] +params: + - name: numeric + desc: "Sub-bloque numeric de un ColumnProfile del grupo eda (la salida de describe_numeric). Claves esperadas (todas pueden ser None): min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram. Solo se usan p25, median/p50, p75, min, max y n_outliers." +output: "Dict con las cifras de un boxplot horizontal de Tukey: {q1=p25, median=median(o p50), q3=p75, iqr=q3-q1, lower_fence=q1-1.5*iqr, upper_fence=q3+1.5*iqr, whisker_lo=max(min,lower_fence), whisker_hi=min(max,upper_fence), min, max, has_low_outliers=(min < lower_fence), has_high_outliers=(max > upper_fence), n_outliers}. Numericos en float, flags en bool nativo, n_outliers en int. Si faltan p25/median(o p50)/p75 devuelve {} (dict vacio). Cuando min/max faltan, los bigotes caen a la fence correspondiente." 
+uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: ["test_boxplot_tukey_basico", "test_percentiles_faltan_devuelve_vacio", "test_median_cae_a_p50", "test_whiskers_usan_fence_si_falta_min_max", "test_tipos_salida_float_bool_int"] +test_file_path: "python/functions/datascience/build_boxplot_stats_test.py" +file_path: "python/functions/datascience/build_boxplot_stats.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience.build_boxplot_stats import build_boxplot_stats + +# Sub-bloque numeric tal y como lo produce describe_numeric: +numeric = { + "min": 1.0, "max": 100.0, + "p25": 10.0, "median": 25.0, "p75": 40.0, + "iqr": 30.0, "n_outliers": 3, +} +box = build_boxplot_stats(numeric) +print(box["lower_fence"], box["upper_fence"]) # -35.0 85.0 +print(box["whisker_lo"], box["whisker_hi"]) # 1.0 85.0 +print(box["has_low_outliers"], box["has_high_outliers"]) # False True +``` + +## Cuando usarla + +- Usala al dibujar un boxplot horizontal bajo el histograma en el capitulo `num_distr` de `AutomaticEDA`: convierte el bloque `numeric` de un `ColumnProfile` en las cifras exactas que el renderer necesita (cuartiles, fences, extremos de los bigotes y flags de outliers). +- Cuando ya tengas los percentiles calculados (salida de `describe_numeric`) y solo necesites derivar la geometria del boxplot de Tukey sin volver a tocar los valores crudos. +- Cuando quieras decidir si una columna tiene cola alta/baja (`has_high_outliers` / `has_low_outliers`) antes de proponer una transformacion (log, winsorize). + +## Gotchas + +- Funcion pura, sin I/O y determinista. Lectura defensiva con `.get`: NUNCA lanza. Si faltan `p25`, `median`/`p50` o `p75` devuelve `{}` (dict vacio) — el caller debe omitir el boxplot. +- Los `n_outliers` que se propagan vienen del bloque z-score del profile (`detect_outliers`, threshold 3.0), NO de la regla IQR. 
"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.

Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
(group `eda`, the output of describe_numeric) and derives the figures needed to
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.

It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function NEVER
raises: if the key percentiles (p25 / p50 / p75) are missing, NaN or infinite it
returns {} so the caller can simply skip the boxplot.
"""

import math

# Tukey's whisker multiplier: fences sit 1.5 * IQR beyond the quartiles.
_FENCE_FACTOR = 1.5


def _num(value):
    """Coerce ``value`` to float defensively; return None when it is unusable.

    None, bool, NaN and anything ``float()`` rejects are reported as missing.
    Infinite values are returned unchanged; callers decide whether they are
    acceptable for their purpose.
    """
    # bool is a subclass of int; a percentile value is never a real bool, so
    # treat True/False as missing instead of silently coercing to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a float (e.g. 10 ** 400).
        return None
    # NaN compares False against everything and would poison every derived
    # figure (fences, whiskers, flags), so it counts as "missing".
    return None if math.isnan(number) else number


def _count(value):
    """Coerce an outlier count to int; return 0 when missing or non-finite."""
    number = _num(value)
    # int(inf) raises OverflowError (and int(nan) ValueError), so guard first.
    if number is None or not math.isfinite(number):
        return 0
    return int(number)


def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles found in ``numeric`` and applies the classic
    1.5 * IQR fence rule to obtain the whisker extremes and outlier flags of a
    horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile. Every value
            may be None; keys are read defensively with ``.get``. Only p25,
            median (falling back to p50), p75, min, max and n_outliers are
            used.

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        Returns {} (empty dict) when ``numeric`` is not a dict or when p25,
        median/p50 or p75 is missing, NaN or infinite, so the caller omits
        the plot. ``min``/``max`` are None when missing or NaN; in that case
        the matching whisker falls back to the fence and the matching outlier
        flag is False. ``n_outliers`` is 0 when missing or not a finite number.
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without three finite quartiles a boxplot cannot be drawn: an infinite
    # quartile would make the IQR (and so the fences) infinite or NaN.
    if any(q is None or not math.isfinite(q) for q in (q1, median, q3)):
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - _FENCE_FACTOR * iqr
    upper_fence = q3 + _FENCE_FACTOR * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    # Without a real extreme no tail is claimed, so the flag stays False.
    has_low_outliers = mn is not None and mn < lower_fence
    has_high_outliers = mx is not None and mx > upper_fence

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        # Propagated as-is from the input block; it is not derived from the
        # IQR fences computed above.
        "n_outliers": _count(numeric.get("n_outliers")),
    }
+ assert box["whisker_lo"] == 1.0 + assert box["whisker_hi"] == 85.0 + assert box["min"] == 1.0 + assert box["max"] == 100.0 + # Solo hay outliers altos (100 > 85), no bajos (1 no < -35). + assert box["has_low_outliers"] is False + assert box["has_high_outliers"] is True + # n_outliers se propaga del bloque z-score (informativo). + assert box["n_outliers"] == 3 + + +def test_percentiles_faltan_devuelve_vacio(): + """Si falta p25/median/p75 -> {} (caller omite el boxplot).""" + # Falta p25. + assert build_boxplot_stats({"median": 25.0, "p75": 40.0}) == {} + # Falta p75. + assert build_boxplot_stats({"p25": 10.0, "median": 25.0}) == {} + # Falta median y p50. + assert build_boxplot_stats({"p25": 10.0, "p75": 40.0}) == {} + # numeric None / no dict tambien es vacio, nunca lanza. + assert build_boxplot_stats(None) == {} + assert build_boxplot_stats({}) == {} + + +def test_median_cae_a_p50(): + """median ausente cae a p50.""" + numeric = {"min": 0.0, "max": 10.0, "p25": 2.0, "p50": 5.0, "p75": 8.0} + box = build_boxplot_stats(numeric) + assert box["median"] == 5.0 + assert box["q1"] == 2.0 + assert box["q3"] == 8.0 + + +def test_whiskers_usan_fence_si_falta_min_max(): + """Sin min/max los bigotes caen a las fences y no hay outliers marcados.""" + numeric = {"p25": 10.0, "median": 25.0, "p75": 40.0} # sin min ni max + box = build_boxplot_stats(numeric) + + assert box["min"] is None + assert box["max"] is None + # iqr = 30, fences -35 / 85; los bigotes caen a las fences. + assert box["whisker_lo"] == box["lower_fence"] == -35.0 + assert box["whisker_hi"] == box["upper_fence"] == 85.0 + # Sin extremos reales, no se afirma que haya outliers. + assert box["has_low_outliers"] is False + assert box["has_high_outliers"] is False + # n_outliers ausente -> 0. 
+ assert box["n_outliers"] == 0 + + +def test_tipos_salida_float_bool_int(): + """Numericos en float, flags bool nativos, n_outliers int.""" + numeric = { + "min": -50.0, "max": 200.0, + "p25": 10.0, "median": 25.0, "p75": 40.0, + "n_outliers": 7, + } + box = build_boxplot_stats(numeric) + + for key in ("q1", "median", "q3", "iqr", "lower_fence", "upper_fence", + "whisker_lo", "whisker_hi", "min", "max"): + assert isinstance(box[key], float), f"{key} debe ser float" + + assert isinstance(box["has_low_outliers"], bool) + assert isinstance(box["has_high_outliers"], bool) + assert isinstance(box["n_outliers"], int) and not isinstance(box["n_outliers"], bool) + + # min=-50 < lower_fence=-35 -> outlier bajo ; max=200 > upper_fence=85 -> alto. + assert box["has_low_outliers"] is True + assert box["has_high_outliers"] is True + assert box["n_outliers"] == 7