feat(eda): histograma sin outliers (vista central) en num_distr

describe_numeric emite una nueva clave aditiva histogram_clipped: un segundo histograma re-binado sobre el rango de vallas de Tukey [p25-1.5*IQR, p75+1.5*IQR], reutilizando los percentiles ya calculados. Es [] cuando el recorte no excluye nada (sin outliers), la columna es constante (iqr==0) o la sub-muestra recortada pierde dispersión, de modo que el renderer no duplica el histograma completo.

El capítulo num_distr consume histogram_clipped como una segunda figura DENTRO del mismo grupo keep-together de la columna: la vista central se lee cuando una cola larga aplasta la escala del histograma completo. Bump describe_numeric 1.0.0->1.1.0 (aditivo) y CHAPTER_VERSION num_distr 1.3.0->1.4.0. Tests: golden (recorta la cola), edges (sin outliers -> [], constante -> []), contrato de claves y smoke e2e de render.
This commit is contained in:
2026-07-03 20:34:08 +02:00
parent 1fee225bff
commit a9a60cbf2c
4 changed files with 169 additions and 6 deletions
@@ -35,7 +35,7 @@ try:
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
build_boxplot_stats = None # type: ignore[assignment]
CHAPTER_VERSION = "1.3.0"
CHAPTER_VERSION = "1.4.0"
CHAPTER_ID = "num_distr"
CHAPTER_TITLE = "Distribuciones numéricas"
@@ -275,6 +275,69 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
return fig
def _make_hist_clipped(name: str, numeric: dict):
    """Build the "central view" histogram figure with the outliers trimmed.

    Companion to :func:`_make_hist_box`: same column, drawn from the
    ``histogram_clipped`` bins (described in this module as re-binned over the
    Tukey inner-fence range [Q1-1.5*IQR, Q3+1.5*IQR]), so the bulk of the
    distribution stays readable when a long tail would otherwise crush the
    scale. Only the median reference line is drawn, because mean/±σ belong to
    the full histogram and could sit outside the clipped range.

    Args:
        name: Column name; used for the x-axis label and the figure title.
        numeric: Numeric-stats mapping. Reads ``histogram_clipped`` (a list of
            bin dicts with ``lo``, ``hi`` and ``count`` keys) and ``median``.

    Returns:
        The matplotlib figure. When no bin could be drawn, the axes carry a
        centred placeholder text instead of bars.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6.4, 2.6))
    hist = numeric.get("histogram_clipped") or []

    # Track the x-range actually covered by drawn bars. The median check
    # below must use this range rather than hist[0]/hist[-1], because those
    # entries may be malformed bins that the loop skips (a non-dict entry
    # would raise AttributeError on ``.get``).
    drawn_lo = None
    drawn_hi = None
    for b in hist:
        if not isinstance(b, dict):
            continue
        lo = b.get("lo")
        hi = b.get("hi")
        if lo is None or hi is None:
            continue
        count = b.get("count") or 0
        # Degenerate bins (hi <= lo) still get a thin visible bar.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax.bar(lo, count, width=width, align="edge", color="#b7d7a8",
               edgecolor="#6a9a5b", linewidth=0.4, zorder=2)
        drawn_lo = lo if drawn_lo is None else min(drawn_lo, lo)
        drawn_hi = hi if drawn_hi is None else max(drawn_hi, hi)
    drew_bars = drawn_lo is not None

    median = numeric.get("median")
    if drew_bars and median is not None and drawn_lo <= median <= drawn_hi:
        # Draw the median only when it lies inside the plotted range, so the
        # reference line never stretches the x-axis beyond the bars.
        ax.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.4,
                   zorder=4, label=f"mediana = {_fmt_num(median)}")
        ax.legend(fontsize=6.5, loc="upper right", framealpha=0.85)

    if not drew_bars:
        ax.text(0.5, 0.5, "(sin histograma recortado)", ha="center",
                va="center", fontsize=9, color="#8a8a8a",
                transform=ax.transAxes)

    ax.set_ylabel("frecuencia", fontsize=8)
    ax.set_xlabel(name, fontsize=8)
    ax.tick_params(labelsize=7)
    for spine in ("top", "right"):
        ax.spines[spine].set_visible(False)
    fig.suptitle(f"{name} — vista central (sin outliers)", fontsize=10,
                 fontweight="bold", x=0.02, ha="left")
    fig.tight_layout()
    return fig
def _clipped_figure_maker(name: str, numeric: dict):
    """Return a zero-argument factory for the clipped-histogram figure.

    ``name`` and ``numeric`` are captured as parameters of this function, so
    each returned callable keeps its own column even when the makers are
    created inside a loop and invoked later.
    """
    return lambda: _make_hist_clipped(name, numeric)
def _stats_note(name: str, numeric: dict, box: dict) -> str:
"""One compact line of the key numbers + a plain-Spanish shape gloss."""
bits = [
@@ -374,6 +437,16 @@ def build_num_distr(profile: dict, ctx: dict):
make=_figure_maker(name, numeric, box),
caption=f"Distribución de «{name}» — histograma "
f"(media/mediana/±σ) y boxplot."))
# Second view: the central mass with the outliers trimmed (Tukey fences).
# Only added when describe_numeric produced a non-empty histogram_clipped
# (i.e. the clip actually removed tail values), and stays inside the same
# keep-together Group so it never drifts to another page from its heading.
if numeric.get("histogram_clipped"):
col_blocks.append(model.Figure(
make=_clipped_figure_maker(name, numeric),
caption=f"«{name}» — vista central con los atípicos recortados "
f"(vallas de Tukey 1,5·IQR); útil cuando la cola larga "
f"aplasta la escala del histograma completo."))
col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
blocks.append(model.Group(blocks=col_blocks))