feat(eda): histograma sin outliers (vista central) en num_distr

describe_numeric emite una nueva clave aditiva histogram_clipped: un segundo histograma que re-bina solo los valores dentro de las vallas de Tukey [p25-1.5*IQR, p75+1.5*IQR] (los bins cubren el min..max observado de esa sub-muestra, no necesariamente las vallas exactas), reutilizando los percentiles ya calculados. Es [] cuando el recorte no excluye nada (ningun valor fuera de las vallas), cuando iqr==0 (p. ej. columna constante) o cuando la sub-muestra recortada pierde dispersion, de modo que el renderer no duplica el histograma completo. El capitulo num_distr consume histogram_clipped como una segunda figura DENTRO del mismo grupo keep-together de la columna: la vista central se lee cuando una cola larga aplasta la escala del histograma completo. Bump describe_numeric 1.0.0->1.1.0 (aditivo) y CHAPTER_VERSION num_distr 1.3.0->1.4.0. Tests: golden (recorta la cola), edges (sin outliers -> [], constante -> []), contrato de claves y smoke e2e de render.
2026-07-03 20:34:08 +02:00
parent 1fee225bff
commit a9a60cbf2c
4 changed files with 169 additions and 6 deletions
@@ -35,7 +35,7 @@ try:
 except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
    build_boxplot_stats = None  # type: ignore[assignment]

-CHAPTER_VERSION = "1.3.0"
+CHAPTER_VERSION = "1.4.0"
 CHAPTER_ID = "num_distr"
 CHAPTER_TITLE = "Distribuciones numéricas"

@@ -275,6 +275,69 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
    return fig


+def _make_hist_clipped(name: str, numeric: dict):
+    """Histogram of the central mass with the outliers trimmed away.
+
+    Companion to :func:`_make_hist_box`: same column, re-binned over the Tukey
+    inner-fence range [Q1-1.5*IQR, Q3+1.5*IQR] (precomputed in ``describe_numeric``
+    as ``histogram_clipped``), so the bulk of the distribution stays readable when
+    a long tail would otherwise crush the scale. Only the reference median is drawn
+    — it always falls inside the fence range by construction — because mean/±σ were
+    already shown on the full histogram above and could sit outside the clip.
+    """
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots(figsize=(6.4, 2.6))
+    hist = numeric.get("histogram_clipped") or []
+    drew_bars = False
+    for b in hist:
+        if not isinstance(b, dict):
+            continue
+        lo = b.get("lo")
+        hi = b.get("hi")
+        count = b.get("count") or 0
+        if lo is None or hi is None:
+            continue
+        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
+        ax.bar(lo, count, width=width, align="edge", color="#b7d7a8",
+               edgecolor="#6a9a5b", linewidth=0.4, zorder=2)
+        drew_bars = True
+
+    median = numeric.get("median")
+    if drew_bars and median is not None:
+        lo0 = hist[0].get("lo")
+        hi1 = hist[-1].get("hi")
+        if lo0 is not None and hi1 is not None and lo0 <= median <= hi1:
+            ax.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.4,
+                       zorder=4, label=f"mediana = {_fmt_num(median)}")
+            ax.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
+    if not drew_bars:
+        ax.text(0.5, 0.5, "(sin histograma recortado)", ha="center",
+                va="center", fontsize=9, color="#8a8a8a",
+                transform=ax.transAxes)
+
+    ax.set_ylabel("frecuencia", fontsize=8)
+    ax.set_xlabel(name, fontsize=8)
+    ax.tick_params(labelsize=7)
+    for spine in ("top", "right"):
+        ax.spines[spine].set_visible(False)
+    fig.suptitle(f"{name} — vista central (sin outliers)", fontsize=10,
+                 fontweight="bold", x=0.02, ha="left")
+    fig.tight_layout()
+    return fig
+
+
+def _clipped_figure_maker(name: str, numeric: dict):
+    """Bind the per-column arguments so the lazy closure is loop-safe."""
+    def _make():
+        return _make_hist_clipped(name, numeric)
+
+    return _make
+
+
 def _stats_note(name: str, numeric: dict, box: dict) -> str:
    """One compact line of the key numbers + a plain-Spanish shape gloss."""
    bits = [
@@ -374,6 +437,16 @@ def build_num_distr(profile: dict, ctx: dict):
            make=_figure_maker(name, numeric, box),
            caption=f"Distribución de «{name}» — histograma "
                    f"(media/mediana/±σ) y boxplot."))
+        # Second view: the central mass with the outliers trimmed (Tukey fences).
+        # Only added when describe_numeric produced a non-empty histogram_clipped
+        # (i.e. the clip actually removed tail values), and stays inside the same
+        # keep-together Group so it never drifts to another page from its heading.
+        if numeric.get("histogram_clipped"):
+            col_blocks.append(model.Figure(
+                make=_clipped_figure_maker(name, numeric),
+                caption=f"«{name}» — vista central con los atípicos recortados "
+                        f"(vallas de Tukey 1,5·IQR); útil cuando la cola larga "
+                        f"aplasta la escala del histograma completo."))
        col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
        blocks.append(model.Group(blocks=col_blocks))

@@ -3,17 +3,17 @@ name: describe_numeric
 kind: function
 lang: py
 domain: datascience
-version: "1.0.0"
+version: "1.1.0"
 purity: pure
 signature: "def describe_numeric(values: list, bins: int = 20) -> dict"
-description: "Calcula el bloque estadistico fino numeric de un ColumnProfile del grupo eda sobre una MUESTRA de una columna numerica. Descarta None/NaN/no-numericos y devuelve min/max/mean/median/mode/std/variance/cv, percentiles, iqr, skew, kurtosis, outliers, zero_pct, negative_pct, distribution_type e histogram. Reusa detect_distribution_type, detect_outliers y histogram del registry."
+description: "Calcula el bloque estadistico fino numeric de un ColumnProfile del grupo eda sobre una MUESTRA de una columna numerica. Descarta None/NaN/no-numericos y devuelve min/max/mean/median/mode/std/variance/cv, percentiles, iqr, skew, kurtosis, outliers, zero_pct, negative_pct, distribution_type, histogram e histogram_clipped (segunda vista del histograma con los outliers recortados a las vallas de Tukey). Reusa detect_distribution_type, detect_outliers y histogram del registry."
 tags: [eda, statistics, profiling, distribution, histogram, datascience]
 params:
  - name: values
    desc: "Lista de valores crudos de una columna (muestra). Puede contener None, NaN, infinitos y strings no numericos: se descartan antes de calcular. bool se trata como no numerico."
  - name: bins
    desc: "Numero de buckets equiespaciados del histograma. Default 20."
-output: "Dict con las claves exactas del contrato numeric_sub del grupo eda: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram}. cv = std/mean (None si mean==0). iqr = p75-p25. mode = valor mas frecuente (menor en empate). histogram = lista de {lo, hi, count}. Si tras limpiar quedan 0 valores: todas las claves None y histogram=[]."
+output: "Dict con las claves exactas del contrato numeric_sub del grupo eda: {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram, histogram_clipped}. cv = std/mean (None si mean==0). iqr = p75-p25. mode = valor mas frecuente (menor en empate). histogram = lista de {lo, hi, count} sobre el rango completo min..max. histogram_clipped = misma estructura, pero solo con los valores que caen dentro de las vallas de Tukey [p25-1.5*iqr, p75+1.5*iqr] (vista central sin colas), re-binados sobre el rango observado min..max de esa sub-muestra (los bordes de los bins no tienen por que coincidir con las vallas); es [] cuando el recorte no excluye nada (ningun valor fuera de las vallas; criterio independiente de n_outliers/outlier_pct, que usan z-score via detect_outliers), cuando iqr==0 (p. ej. columna constante) o cuando la sub-muestra recortada queda sin dispersion. Si tras limpiar quedan 0 valores: todas las claves None, histogram=[] e histogram_clipped=[]."
 uses_functions:
  - detect_distribution_type_py_datascience
  - detect_outliers_py_datascience
@@ -56,3 +56,8 @@ print(prof["histogram"][:2])      # [{'lo': 1.0, 'hi': 5.95, 'count': ...}, ...]
 - `distribution_type`, `skew` y `kurtosis` vienen de `detect_distribution_type`, que devuelve `too_few_samples` (y skew/kurtosis None) cuando la muestra limpia tiene **menos de 30 valores**.
 - Los outliers usan z-score con `std` poblacional y threshold 3.0 (de `detect_outliers`): en muestras muy pequeñas un unico valor extremo puede inflar la `std` y no marcarse como outlier (efecto masking). Para deteccion fiable, pasa una muestra suficientemente grande.
 - `cv` es `None` cuando `mean == 0` (division indefinida).
+- `histogram_clipped` NO recalcula media/mediana/std: reutiliza los percentiles ya calculados (`p25`, `p75`, `iqr`) para definir el rango de recorte y solo re-bina la sub-muestra dentro de las vallas. Es aditivo: los consumidores que solo miran `histogram` no se ven afectados.
+
+## Capability growth log
+
+- v1.1.0 (2026-07-03) — añade la clave `histogram_clipped`: segunda vista del histograma que re-bina solo los valores dentro de las vallas de Tukey [p25-1.5·IQR, p75+1.5·IQR] (los bins cubren el min..max observado de esa sub-muestra, no necesariamente las vallas exactas) para leer la masa central cuando una cola larga aplasta la escala. Aditivo (los consumidores de `histogram` no cambian); `[]` cuando el recorte no excluye nada, cuando iqr==0 (p. ej. columna constante) o cuando la sub-muestra recortada pierde dispersion. Lo consume el capitulo `num_distr` del motor AutomaticEDA como figura adicional dentro del mismo grupo keep-together de la columna.
@@ -69,7 +69,9 @@ def describe_numeric(values: list, bins: int = 20) -> dict:
        Dict with the exact keys of the eda `numeric_sub` contract:
        {min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50,
         p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct,
-         negative_pct, distribution_type, histogram}.
+         negative_pct, distribution_type, histogram, histogram_clipped}.
+        histogram_clipped is a second histogram over the Tukey inner-fence
+        range (outliers trimmed) or [] when the clip removes nothing.
    """
    clean = _clean(values)
    n = len(clean)
@@ -77,6 +79,7 @@ def describe_numeric(values: list, bins: int = 20) -> dict:
    if n == 0:
        result = {k: None for k in _NULL_KEYS}
        result["histogram"] = []
+        result["histogram_clipped"] = []
        return result

    arr = np.array(clean, dtype=float)
@@ -131,6 +134,32 @@ def describe_numeric(values: list, bins: int = 20) -> dict:
                hi = minimum + (i + 1) * width
                hist.append({"lo": float(lo), "hi": float(hi), "count": int(count)})

+    # Clipped histogram: a second view of the central mass with the outliers
+    # trimmed away, re-binned over the Tukey inner-fence range [Q1-1.5*IQR,
+    # Q3+1.5*IQR] (coherent with the boxplot already drawn below the histogram).
+    # It answers "what does the bulk look like when the long tail no longer
+    # crushes the scale". Computed here because the raw sample (`clean`) is only
+    # alive at this point — the profile keeps aggregated bins, not raw values.
+    # Only emitted when the clip actually removes something *and* the trimmed
+    # sample still has spread; otherwise it degrades to [] and the renderer skips
+    # the second view (no redundant duplicate of the full histogram).
+    hist_clipped: list = []
+    lower_fence = p25 - 1.5 * iqr
+    upper_fence = p75 + 1.5 * iqr
+    if iqr > 0:
+        clipped = [v for v in clean if lower_fence <= v <= upper_fence]
+        if clipped and len(clipped) < len(clean):
+            c_counts = histogram(clipped, bins)
+            c_min = float(min(clipped))
+            c_max = float(max(clipped))
+            if c_counts and c_max > c_min:
+                c_width = (c_max - c_min) / bins
+                for i, count in enumerate(c_counts):
+                    lo = c_min + i * c_width
+                    hi = c_min + (i + 1) * c_width
+                    hist_clipped.append(
+                        {"lo": float(lo), "hi": float(hi), "count": int(count)})
+
    return {
        "min": minimum,
        "max": maximum,
@@ -156,4 +185,5 @@ def describe_numeric(values: list, bins: int = 20) -> dict:
        "negative_pct": negative_pct,
        "distribution_type": distribution_type,
        "histogram": hist,
+        "histogram_clipped": hist_clipped,
    }
@@ -13,6 +13,7 @@ _EXPECTED_KEYS = {
    "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
    "skew", "kurtosis", "n_outliers", "outlier_pct",
    "zero_pct", "negative_pct", "distribution_type", "histogram",
+    "histogram_clipped",
 }


@@ -61,9 +62,10 @@ def test_lista_vacia_todo_none():
    result = describe_numeric([None, "abc", float("nan")])

    assert set(result.keys()) == _EXPECTED_KEYS
-    for key in _EXPECTED_KEYS - {"histogram"}:
+    for key in _EXPECTED_KEYS - {"histogram", "histogram_clipped"}:
        assert result[key] is None, f"{key} debe ser None"
    assert result["histogram"] == []
+    assert result["histogram_clipped"] == []


 def test_cv_none_cuando_mean_cero():
@@ -83,3 +85,56 @@ def test_iqr_y_percentiles():
    assert result["p1"] <= result["p25"] <= result["p50"] <= result["p75"] <= result["p99"]
    assert result["min"] == 1.0
    assert result["max"] == 100.0
+
+
+# --------------------------------------------------------------------------- #
+# histogram_clipped: second view of the central mass, outliers trimmed.
+# --------------------------------------------------------------------------- #
+def test_histogram_clipped_trims_the_tail():
+    """Golden: with a long high tail, the clipped histogram excludes the outliers.
+
+    A tight cluster in [1, 5] plus a handful of extreme values. The full histogram
+    stretches to the extreme (min..max); the clipped one is re-binned over the
+    Tukey inner fences, so its upper edge stays far below the extreme and it holds
+    fewer values than the full sample.
+    """
+    cluster = [1, 2, 3, 4, 5] * 20          # 100 values in [1, 5]
+    values = cluster + [500, 800, 1000]     # 3 far outliers
+    result = describe_numeric(values)
+
+    full = result["histogram"]
+    clipped = result["histogram_clipped"]
+    assert full and clipped                                   # both present
+    for bucket in clipped:
+        assert "lo" in bucket and "hi" in bucket and "count" in bucket
+
+    # The full histogram reaches the extreme; the clipped one does not.
+    assert full[-1]["hi"] >= 900
+    assert clipped[-1]["hi"] < 100
+
+    # The clip removed the tail: fewer values counted than the full sample.
+    total_full = sum(b["count"] for b in full)
+    total_clipped = sum(b["count"] for b in clipped)
+    assert total_full == 103
+    assert total_clipped < total_full
+    assert total_clipped >= 100               # the whole cluster survives the clip
+
+
+def test_histogram_clipped_empty_when_no_outliers():
+    """Edge: a clean spread with no fence outliers yields an empty clipped view.
+
+    When the inner-fence range already covers every value, there is nothing to
+    trim, so histogram_clipped is [] and the renderer skips the redundant second
+    view instead of duplicating the full histogram.
+    """
+    result = describe_numeric(list(range(1, 101)))  # uniform 1..100, no outliers
+    assert result["n_outliers"] == 0
+    assert result["histogram"]                       # full histogram present
+    assert result["histogram_clipped"] == []         # nothing trimmed
+
+
+def test_histogram_clipped_empty_when_constant():
+    """Edge: a constant column (iqr == 0) never produces a clipped view."""
+    result = describe_numeric([7] * 30)
+    assert result["iqr"] == 0
+    assert result["histogram_clipped"] == []