def _make_hist_clipped(name: str, numeric: dict):
    """Draw the histogram of the central mass, with the outliers trimmed away.

    Companion to :func:`_make_hist_box` for the same column. The bars come from
    ``numeric["histogram_clipped"]``, which ``describe_numeric`` builds from the
    values that fall inside the Tukey inner fences [Q1-1.5*IQR, Q3+1.5*IQR]; its
    bin edges run from the smallest to the largest surviving value, so the bulk
    of the distribution stays readable when a long tail would otherwise crush
    the scale. Only the median is drawn as a reference line, and only when it
    lies inside the span of the bars that were actually drawn.

    Args:
        name: Column name, used for the x-axis label and the figure title.
        numeric: The column's numeric profile. Reads ``"histogram_clipped"``
            (an iterable of ``{"lo", "hi", "count"}`` dicts) and ``"median"``.

    Returns:
        The matplotlib figure. When no bucket could be drawn, the axes carry a
        centred placeholder text instead of bars.
    """
    import matplotlib

    # Select the non-interactive Agg backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6.4, 2.6))
    buckets = numeric.get("histogram_clipped") or []

    # Span covered by the bars that were really drawn. It is tracked here rather
    # than read back from buckets[0] / buckets[-1] because malformed buckets are
    # skipped below, so the first/last entries may be non-dicts or lack edges.
    drawn_lo = None
    drawn_hi = None
    for bucket in buckets:
        if not isinstance(bucket, dict):
            continue
        lo = bucket.get("lo")
        hi = bucket.get("hi")
        if lo is None or hi is None:
            continue
        count = bucket.get("count") or 0
        # A degenerate bucket (hi <= lo) still gets a tiny positive width so
        # the bar remains visible.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax.bar(lo, count, width=width, align="edge", color="#b7d7a8",
               edgecolor="#6a9a5b", linewidth=0.4, zorder=2)
        drawn_lo = lo if drawn_lo is None else min(drawn_lo, lo)
        drawn_hi = hi if drawn_hi is None else max(drawn_hi, hi)
    drew_bars = drawn_lo is not None

    median = numeric.get("median")
    if drew_bars and median is not None and drawn_lo <= median <= drawn_hi:
        ax.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.4,
                   zorder=4, label=f"mediana = {_fmt_num(median)}")
        ax.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
    if not drew_bars:
        ax.text(0.5, 0.5, "(sin histograma recortado)", ha="center",
                va="center", fontsize=9, color="#8a8a8a",
                transform=ax.transAxes)

    ax.set_ylabel("frecuencia", fontsize=8)
    ax.set_xlabel(name, fontsize=8)
    ax.tick_params(labelsize=7)
    for spine in ("top", "right"):
        ax.spines[spine].set_visible(False)
    fig.suptitle(f"{name} — vista central (sin outliers)", fontsize=10,
                 fontweight="bold", x=0.02, ha="left")
    fig.tight_layout()
    return fig


def _clipped_figure_maker(name: str, numeric: dict):
    """Return a zero-argument callable that renders the clipped histogram.

    The per-column arguments are bound here, as function parameters, so the
    deferred call keeps pointing at this column's data even when the maker is
    created inside a loop that later rebinds ``name`` / ``numeric``.
    """
    def _make():
        return _make_hist_clipped(name, numeric)

    return _make
the outliers trimmed (Tukey fences). + # Only added when describe_numeric produced a non-empty histogram_clipped + # (i.e. the clip actually removed tail values), and stays inside the same + # keep-together Group so it never drifts to another page from its heading. + if numeric.get("histogram_clipped"): + col_blocks.append(model.Figure( + make=_clipped_figure_maker(name, numeric), + caption=f"«{name}» — vista central con los atípicos recortados " + f"(vallas de Tukey 1,5·IQR); útil cuando la cola larga " + f"aplasta la escala del histograma completo.")) col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box))) blocks.append(model.Group(blocks=col_blocks)) diff --git a/python/functions/datascience/describe_numeric.md b/python/functions/datascience/describe_numeric.md index 27b76e66..810c7a98 100644 --- a/python/functions/datascience/describe_numeric.md +++ b/python/functions/datascience/describe_numeric.md @@ -3,17 +3,17 @@ name: describe_numeric kind: function lang: py domain: datascience -version: "1.0.0" +version: "1.1.0" purity: pure signature: "def describe_numeric(values: list, bins: int = 20) -> dict" -description: "Calcula el bloque estadistico fino numeric de un ColumnProfile del grupo eda sobre una MUESTRA de una columna numerica. Descarta None/NaN/no-numericos y devuelve min/max/mean/median/mode/std/variance/cv, percentiles, iqr, skew, kurtosis, outliers, zero_pct, negative_pct, distribution_type e histogram. Reusa detect_distribution_type, detect_outliers y histogram del registry." +description: "Calcula el bloque estadistico fino numeric de un ColumnProfile del grupo eda sobre una MUESTRA de una columna numerica. Descarta None/NaN/no-numericos y devuelve min/max/mean/median/mode/std/variance/cv, percentiles, iqr, skew, kurtosis, outliers, zero_pct, negative_pct, distribution_type, histogram e histogram_clipped (segunda vista del histograma con los outliers recortados a las vallas de Tukey). 
Reusa detect_distribution_type, detect_outliers y histogram del registry." tags: [eda, statistics, profiling, distribution, histogram, datascience] params: - name: values desc: "Lista de valores crudos de una columna (muestra). Puede contener None, NaN, infinitos y strings no numericos: se descartan antes de calcular. bool se trata como no numerico." - name: bins desc: "Numero de buckets equiespaciados del histograma. Default 20." -output: "Dict con las claves exactas del contrato numeric_sub del grupo eda: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram}. cv = std/mean (None si mean==0). iqr = p75-p25. mode = valor mas frecuente (menor en empate). histogram = lista de {lo, hi, count}. Si tras limpiar quedan 0 valores: todas las claves None y histogram=[]." +output: "Dict con las claves exactas del contrato numeric_sub del grupo eda: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram, histogram_clipped}. cv = std/mean (None si mean==0). iqr = p75-p25. mode = valor mas frecuente (menor en empate). histogram = lista de {lo, hi, count} sobre el rango completo min..max. histogram_clipped = misma estructura pero re-binado solo sobre los valores que caen dentro de las vallas de Tukey [p25-1.5*iqr, p75+1.5*iqr] (vista central sin outliers; los bordes van del minimo al maximo de esa sub-muestra, no de valla a valla); es [] cuando el recorte no excluye nada (ningun valor fuera de las vallas; criterio independiente del z-score de n_outliers), cuando iqr==0 (columna constante o con la mitad central de los valores identica) o cuando el recorte deja la muestra sin dispersion. Si tras limpiar quedan 0 valores: todas las claves None, histogram=[] e histogram_clipped=[]." uses_functions: - detect_distribution_type_py_datascience - detect_outliers_py_datascience @@ -56,3 +56,8 @@ print(prof["histogram"][:2]) # [{'lo': 1.0, 'hi': 5.95, 'count': ...}, ...] 
- `distribution_type`, `skew` y `kurtosis` vienen de `detect_distribution_type`, que devuelve `too_few_samples` (y skew/kurtosis None) cuando la muestra limpia tiene **menos de 30 valores**. - Los outliers usan z-score con `std` poblacional y threshold 3.0 (de `detect_outliers`): en muestras muy pequeñas un unico valor extremo puede inflar la `std` y no marcarse como outlier (efecto masking). Para deteccion fiable, pasa una muestra suficientemente grande. - `cv` es `None` cuando `mean == 0` (division indefinida). +- `histogram_clipped` NO recalcula media/mediana/std: reutiliza los percentiles ya calculados (`p25`, `p75`, `iqr`) para definir el rango de recorte y solo re-bina la sub-muestra dentro de las vallas. El recorte usa las vallas de Tukey, no el z-score de `n_outliers`: puede ser no vacio aunque `n_outliers == 0`. Es aditivo: los consumidores que solo miran `histogram` no se ven afectados. + +## Capability growth log + +- v1.1.0 (2026-07-03) — añade la clave `histogram_clipped`: segunda vista del histograma re-binada sobre los valores dentro de las vallas de Tukey [p25-1.5·IQR, p75+1.5·IQR] para leer la masa central cuando una cola larga aplasta la escala. Aditivo (los consumidores de `histogram` no cambian); `[]` cuando el recorte no excluye nada, cuando iqr==0 (columna constante o con la mitad central de los valores identica) o cuando la sub-muestra recortada pierde dispersion. Lo consume el capitulo `num_distr` del motor AutomaticEDA como figura adicional dentro del mismo grupo keep-together de la columna. diff --git a/python/functions/datascience/describe_numeric.py b/python/functions/datascience/describe_numeric.py index 5654141a..a501c5fd 100644 --- a/python/functions/datascience/describe_numeric.py +++ b/python/functions/datascience/describe_numeric.py @@ -69,7 +69,9 @@ def describe_numeric(values: list, bins: int = 20) -> dict: Dict with the exact keys of the eda `numeric_sub` contract: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram}. + negative_pct, distribution_type, histogram, histogram_clipped}. 
+ histogram_clipped is a second histogram over the Tukey inner-fence + range (outliers trimmed) or [] when the clip removes nothing. """ clean = _clean(values) n = len(clean) @@ -77,6 +79,7 @@ def describe_numeric(values: list, bins: int = 20) -> dict: if n == 0: result = {k: None for k in _NULL_KEYS} result["histogram"] = [] + result["histogram_clipped"] = [] return result arr = np.array(clean, dtype=float) @@ -131,6 +134,32 @@ def describe_numeric(values: list, bins: int = 20) -> dict: hi = minimum + (i + 1) * width hist.append({"lo": float(lo), "hi": float(hi), "count": int(count)}) + # Clipped histogram: a second view of the central mass with the outliers + # trimmed away, re-binned over the Tukey inner-fence range [Q1-1.5*IQR, + # Q3+1.5*IQR] (coherent with the boxplot already drawn below the histogram). + # It answers "what does the bulk look like when the long tail no longer + # crushes the scale". Computed here because the raw sample (`clean`) is only + # alive at this point — the profile keeps aggregated bins, not raw values. + # Only emitted when the clip actually removes something *and* the trimmed + # sample still has spread; otherwise it degrades to [] and the renderer skips + # the second view (no redundant duplicate of the full histogram). 
+ hist_clipped: list = [] + lower_fence = p25 - 1.5 * iqr + upper_fence = p75 + 1.5 * iqr + if iqr > 0: + clipped = [v for v in clean if lower_fence <= v <= upper_fence] + if clipped and len(clipped) < len(clean): + c_counts = histogram(clipped, bins) + c_min = float(min(clipped)) + c_max = float(max(clipped)) + if c_counts and c_max > c_min: + c_width = (c_max - c_min) / bins + for i, count in enumerate(c_counts): + lo = c_min + i * c_width + hi = c_min + (i + 1) * c_width + hist_clipped.append( + {"lo": float(lo), "hi": float(hi), "count": int(count)}) + return { "min": minimum, "max": maximum, @@ -156,4 +185,5 @@ def describe_numeric(values: list, bins: int = 20) -> dict: "negative_pct": negative_pct, "distribution_type": distribution_type, "histogram": hist, + "histogram_clipped": hist_clipped, } diff --git a/python/functions/datascience/describe_numeric_test.py b/python/functions/datascience/describe_numeric_test.py index 6ec20442..1ffe12c8 100644 --- a/python/functions/datascience/describe_numeric_test.py +++ b/python/functions/datascience/describe_numeric_test.py @@ -13,6 +13,7 @@ _EXPECTED_KEYS = { "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr", "skew", "kurtosis", "n_outliers", "outlier_pct", "zero_pct", "negative_pct", "distribution_type", "histogram", + "histogram_clipped", } @@ -61,9 +62,10 @@ def test_lista_vacia_todo_none(): result = describe_numeric([None, "abc", float("nan")]) assert set(result.keys()) == _EXPECTED_KEYS - for key in _EXPECTED_KEYS - {"histogram"}: + for key in _EXPECTED_KEYS - {"histogram", "histogram_clipped"}: assert result[key] is None, f"{key} debe ser None" assert result["histogram"] == [] + assert result["histogram_clipped"] == [] def test_cv_none_cuando_mean_cero(): @@ -83,3 +85,56 @@ def test_iqr_y_percentiles(): assert result["p1"] <= result["p25"] <= result["p50"] <= result["p75"] <= result["p99"] assert result["min"] == 1.0 assert result["max"] == 100.0 + + +# 
# --------------------------------------------------------------------------- #
# histogram_clipped: second view of the central mass, outliers trimmed.
# --------------------------------------------------------------------------- #
def _tukey_fences(result):
    """Return the (lower, upper) Tukey inner fences implied by a profile dict."""
    reach = 1.5 * result["iqr"]
    return result["p25"] - reach, result["p75"] + reach


def test_histogram_clipped_trims_the_tail():
    """Golden: with a long high tail, the clipped histogram excludes the outliers.

    A tight cluster in [1, 5] plus a handful of extreme values. The full
    histogram stretches to the extreme (min..max); the clipped one only bins the
    values inside the Tukey inner fences, so it holds exactly the cluster and
    its edges hug the cluster instead of the extreme.
    """
    cluster = [1, 2, 3, 4, 5] * 20          # 100 values in [1, 5]
    values = cluster + [500, 800, 1000]     # 3 far outliers
    result = describe_numeric(values)

    full = result["histogram"]
    clipped = result["histogram_clipped"]
    assert full and clipped                 # both present
    for bucket in clipped:
        assert "lo" in bucket and "hi" in bucket and "count" in bucket

    # The fences computed from the reported percentiles keep the whole cluster
    # and nothing else.
    lower, upper = _tukey_fences(result)
    inside = [v for v in values if lower <= v <= upper]
    assert len(inside) == len(cluster)

    # The full histogram reaches the extreme; the clipped one does not, and its
    # edges run from the smallest to the largest surviving value.
    assert full[-1]["hi"] >= 900
    assert clipped[-1]["hi"] < 100
    assert clipped[0]["lo"] == min(inside)
    assert abs(clipped[-1]["hi"] - max(inside)) < 1e-9

    # The clip removed the tail and only the tail.
    total_full = sum(b["count"] for b in full)
    total_clipped = sum(b["count"] for b in clipped)
    assert total_full == len(values)
    assert total_clipped == len(inside)


def test_histogram_clipped_empty_when_no_outliers():
    """Edge: a clean spread with no fence outliers yields an empty clipped view.

    When the inner-fence range already covers every value, there is nothing to
    trim, so histogram_clipped is [] and the renderer skips the redundant second
    view instead of duplicating the full histogram.
    """
    result = describe_numeric(list(range(1, 101)))  # uniform 1..100

    # Precondition stated in terms of the criterion that gates the clipped
    # view: every value lies inside the Tukey fences.
    lower, upper = _tukey_fences(result)
    assert lower <= result["min"]
    assert result["max"] <= upper

    assert result["n_outliers"] == 0
    assert result["histogram"]                      # full histogram present
    assert result["histogram_clipped"] == []        # nothing trimmed


def test_histogram_clipped_empty_when_constant():
    """Edge: a constant column (iqr == 0) never produces a clipped view."""
    result = describe_numeric([7] * 30)
    assert result["iqr"] == 0
    assert result["histogram_clipped"] == []


def test_histogram_clipped_empty_when_iqr_zero_but_not_constant():
    """Edge: iqr == 0 disables the clipped view even if the column has a tail.

    With most of the sample on a single value, p25 == p75, the fences collapse
    onto that value, and no clipped view is emitted.
    """
    result = describe_numeric([0] * 80 + list(range(1, 21)))
    assert result["iqr"] == 0
    assert result["min"] != result["max"]           # not a constant column
    assert result["histogram_clipped"] == []