feat(eda): histograma sin outliers (vista central) en num_distr
describe_numeric emite una nueva clave aditiva histogram_clipped: un segundo histograma re-binado sobre el rango de vallas de Tukey [p25-1.5*IQR, p75+1.5*IQR], reutilizando los percentiles ya calculados. Es [] cuando el recorte no excluye nada (sin outliers), la columna es constante (iqr==0) o la sub-muestra recortada pierde dispersión, de modo que el renderer no duplica el histograma completo. El capítulo num_distr consume histogram_clipped como una segunda figura DENTRO del mismo grupo keep-together de la columna: la vista central sigue siendo legible cuando una cola larga aplasta la escala del histograma completo. Bump describe_numeric 1.0.0->1.1.0 (aditivo) y CHAPTER_VERSION num_distr 1.3.0->1.4.0. Tests: golden (recorta la cola), edges (sin outliers -> [], constante -> []), contrato de claves y smoke e2e de render.
This commit is contained in:
@@ -35,7 +35,7 @@ try:
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
build_boxplot_stats = None # type: ignore[assignment]
|
||||
|
||||
CHAPTER_VERSION = "1.3.0"
|
||||
CHAPTER_VERSION = "1.4.0"
|
||||
CHAPTER_ID = "num_distr"
|
||||
CHAPTER_TITLE = "Distribuciones numéricas"
|
||||
|
||||
@@ -275,6 +275,69 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
|
||||
return fig
|
||||
|
||||
|
||||
def _make_hist_clipped(name: str, numeric: dict):
    """Draw the histogram of the central mass with the outliers trimmed away.

    Companion to :func:`_make_hist_box`: same column, re-binned over the Tukey
    inner-fence range [Q1-1.5*IQR, Q3+1.5*IQR] (precomputed in
    ``describe_numeric`` as ``histogram_clipped``), so the bulk of the
    distribution stays readable when a long tail would otherwise crush the
    scale. Only the reference median is drawn, because mean/±σ were already
    shown on the full histogram and could sit outside the clip.

    Args:
        name: Column name; used as the x-axis label and in the figure title.
        numeric: Per-column stats mapping. Reads ``histogram_clipped`` (a list
            of bins, each a dict with ``lo``, ``hi`` and ``count``) and
            ``median``.

    Returns:
        The matplotlib figure. When no usable bin is found, the axes carry a
        placeholder text instead of bars.
    """
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6.4, 2.6))
    hist = numeric.get("histogram_clipped") or []

    # Extent covered by the bars that were actually drawn. The median guard
    # below uses this instead of hist[0]/hist[-1], which may be the very
    # entries the loop skipped (non-dict, or missing lo/hi) and would
    # otherwise raise AttributeError or silently hide the median line.
    span_lo = None
    span_hi = None
    for b in hist:
        if not isinstance(b, dict):
            continue
        lo = b.get("lo")
        hi = b.get("hi")
        count = b.get("count") or 0
        if lo is None or hi is None:
            continue
        # A degenerate bin (hi <= lo) still gets a thin, visible bar.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax.bar(lo, count, width=width, align="edge", color="#b7d7a8",
               edgecolor="#6a9a5b", linewidth=0.4, zorder=2)
        span_lo = lo if span_lo is None else min(span_lo, lo)
        span_hi = hi if span_hi is None else max(span_hi, hi)
    drew_bars = span_lo is not None

    median = numeric.get("median")
    # Only mark the median when it lies within the drawn range, so the
    # reference line never stretches the x-axis beyond the clipped view.
    if drew_bars and median is not None and span_lo <= median <= span_hi:
        ax.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.4,
                   zorder=4, label=f"mediana = {_fmt_num(median)}")
        ax.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
    if not drew_bars:
        ax.text(0.5, 0.5, "(sin histograma recortado)", ha="center",
                va="center", fontsize=9, color="#8a8a8a",
                transform=ax.transAxes)

    ax.set_ylabel("frecuencia", fontsize=8)
    ax.set_xlabel(name, fontsize=8)
    ax.tick_params(labelsize=7)
    for spine in ("top", "right"):
        ax.spines[spine].set_visible(False)
    fig.suptitle(f"{name} — vista central (sin outliers)", fontsize=10,
                 fontweight="bold", x=0.02, ha="left")
    fig.tight_layout()
    return fig
|
||||
|
||||
|
||||
def _clipped_figure_maker(name: str, numeric: dict):
    """Return a zero-argument callable that renders the clipped histogram.

    The column name and its stats are captured as this call's own locals, so
    each returned callable keeps the arguments it was created with even when
    the factory is invoked repeatedly from a loop. The figure is only built
    when the returned callable is invoked.
    """
    column, stats = name, numeric

    def _make():
        # Deferred render: delegates to the clipped-histogram builder.
        return _make_hist_clipped(column, stats)

    return _make
|
||||
|
||||
|
||||
def _stats_note(name: str, numeric: dict, box: dict) -> str:
|
||||
"""One compact line of the key numbers + a plain-Spanish shape gloss."""
|
||||
bits = [
|
||||
@@ -374,6 +437,16 @@ def build_num_distr(profile: dict, ctx: dict):
|
||||
make=_figure_maker(name, numeric, box),
|
||||
caption=f"Distribución de «{name}» — histograma "
|
||||
f"(media/mediana/±σ) y boxplot."))
|
||||
# Second view: the central mass with the outliers trimmed (Tukey fences).
|
||||
# Only added when describe_numeric produced a non-empty histogram_clipped
|
||||
# (i.e. the clip actually removed tail values), and stays inside the same
|
||||
# keep-together Group so it never drifts to another page from its heading.
|
||||
if numeric.get("histogram_clipped"):
|
||||
col_blocks.append(model.Figure(
|
||||
make=_clipped_figure_maker(name, numeric),
|
||||
caption=f"«{name}» — vista central con los atípicos recortados "
|
||||
f"(vallas de Tukey 1,5·IQR); útil cuando la cola larga "
|
||||
f"aplasta la escala del histograma completo."))
|
||||
col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
|
||||
blocks.append(model.Group(blocks=col_blocks))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user