feat(eda): capítulo num_distr — histograma con media/mediana/±σ + boxplot Tukey

Capítulo NUM DISTR del motor AutomaticEDA. Por cada columna numérica emite, como una sola Figure indivisible de dos ejes compartiendo X, un histograma con la media (línea roja discontinua), la mediana (línea verde continua) y la banda ±1σ dibujadas como referencias, y un boxplot de Tukey debajo (caja P25–P75, bigotes a 1,5·IQR, marca de valores fuera de las vallas). Una nota por columna traduce el distribution_type a lenguaje llano (MUST-4.1/4.2/4.3 del report 2043). Consume el profile del grupo eda sin recalcular: el histograma usa los bins {lo,hi,count} de describe_numeric y las vallas del boxplot las deriva la función pura build_boxplot_stats_py_datascience. Lectura defensiva: sin columna numérica devuelve None; profile None/{} no lanza. Test self-contained: golden + edges + anti-corte (8 columnas no cortan en PDF ni PPTX).
2026-06-30 14:58:03 +02:00
parent fcf5a4c6a3
commit c1a4a83717
2 changed files with 440 additions and 0 deletions
@@ -0,0 +1,289 @@
+"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
+
+For every numeric column the chapter draws, as a single indivisible figure, a
+histogram with the **mean, median and ±1σ band drawn as reference lines** and a
+**Tukey boxplot right below it** sharing the same X axis — exactly the user
+requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
+so the renderers rasterize and scale it to fit a whole page/slide and nothing is
+ever cut; datasets with many numeric columns simply flow across pages as
+small multiples.
+
+Data comes from the ``eda`` group profile and is never recomputed here:
+
+- ``columns[i]['numeric']`` (the output of ``describe_numeric``) gives
+  ``mean, median, std, min, max, p25, p75, iqr, n_outliers, outlier_pct,
+  distribution_type`` and the ``histogram`` bins ``[{lo, hi, count}]``.
+- The boxplot five-number summary + Tukey 1.5·IQR fences are derived by the
+  pure registry function ``build_boxplot_stats`` (group ``eda``); this chapter
+  only consumes its output, it does not reimplement the statistics.
+
+Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+Reads everything defensively (``.get``) and never raises: a column whose figure
+cannot be built is degraded to a short note instead of aborting the chapter.
+"""
+
+from __future__ import annotations
+
+from .. import model
+
+# Pure registry function (group ``eda``) that derives the Tukey boxplot stats
+# from a ``numeric`` sub-block. Imported defensively so the chapter still builds
+# (degrading the boxplot to a note) if the function is somehow unavailable.
+try:
+    from datascience.build_boxplot_stats import build_boxplot_stats
+except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
+    build_boxplot_stats = None  # type: ignore[assignment]
+
+# Chapter metadata; ``build_num_distr`` passes all three to ``model.Chapter``.
+CHAPTER_VERSION = "1.0.0"
+CHAPTER_ID = "num_distr"
+CHAPTER_TITLE = "Distribuciones numéricas"
+
+# Plain-Spanish gloss for every label ``detect_distribution_type`` can emit, so a
+# non-expert reader understands the shape and the suggested next step (MUST-4.3).
+# ``_stats_note`` looks labels up with ``.get``: a label missing from this table
+# simply gets no shape paragraph in the per-column note.
+_DIST_GLOSS = {
+    "normal-ish": "aproximadamente simétrica (campana); media y mediana casi "
+                  "coinciden.",
+    "right-skewed": "asimétrica a la derecha (cola larga hacia valores altos); "
+                    "la media supera a la mediana — considera una transformación "
+                    "logarítmica.",
+    "left-skewed": "asimétrica a la izquierda (cola larga hacia valores bajos); "
+                   "la media queda por debajo de la mediana.",
+    "heavy-tail": "colas pesadas (curtosis alta): más valores extremos de lo "
+                  "que esperaría una normal — vigila los outliers.",
+    "lognormal-ish": "compatible con lognormal (simétrica al tomar logaritmos); "
+                     "la re-expresión log suele normalizarla.",
+    "multimodal": "varios picos: probablemente mezcla de subgrupos — conviene "
+                  "segmentar antes de resumir con una sola media.",
+    "discrete": "pocos valores distintos (discreta/ordinal); el histograma "
+                "cuenta niveles, no un continuo.",
+    "too_few_samples": "muestra demasiado pequeña para clasificar la forma con "
+                       "fiabilidad.",
+    "other": "forma no encuadrada en las categorías estándar.",
+}
+
+
+def _fmt_num(value, decimals: int = 3) -> str:
+    """Format a scalar compactly for the notes and legends of this chapter.
+
+    Args:
+        value: the value to render. ``None``, ``bool``, ``int`` and ``float``
+            get dedicated handling; anything else goes through ``str``.
+        decimals: maximum number of decimal places kept for floats.
+
+    Returns:
+        ``"—"`` for ``None``; ``str(value)`` for booleans, infinities and
+        unknown types; integers with ``.`` as thousands separator; ``"NaN"``
+        for NaN; other floats in fixed-point notation without trailing zeros.
+    """
+    if value is None:
+        return "—"
+    if isinstance(value, bool):  # bool is an int subclass: test it first.
+        return str(value)
+    if isinstance(value, int):
+        return f"{value:,}".replace(",", ".")
+    if isinstance(value, float):
+        if value != value:  # NaN is the only float unequal to itself.
+            return "NaN"
+        if value in (float("inf"), float("-inf")):
+            return str(value)
+        text = f"{value:.{decimals}f}"
+        # Trim zeros only after a decimal point; with ``decimals=0`` there is
+        # none and a blind rstrip would turn "100" into "1".
+        if "." in text:
+            text = text.rstrip("0").rstrip(".")
+        # A value that rounds to zero must not keep a stray sign ("-0").
+        return "0" if text == "-0" else text
+    return str(value)
+
+
+def _numeric_columns(profile: dict) -> list:
+    """Collect ``(name, numeric_dict)`` pairs for the columns worth plotting.
+
+    A column qualifies when it is a dict whose ``inferred_type`` is
+    ``"numeric"``, its ``numeric`` entry is a non-empty dict, and that dict
+    carries a ``mean`` or a ``median``. A falsy ``name`` is replaced by the
+    placeholder ``"(columna)"``.
+    """
+    selected = []
+    for column in profile.get("columns") or []:
+        is_numeric_col = (isinstance(column, dict)
+                          and column.get("inferred_type") == "numeric")
+        if not is_numeric_col:
+            continue
+        stats = column.get("numeric")
+        if not (isinstance(stats, dict) and stats):
+            continue
+        # Without any measure of center there is nothing meaningful to draw.
+        has_center = (stats.get("mean") is not None
+                      or stats.get("median") is not None)
+        if has_center:
+            selected.append((column.get("name") or "(columna)", stats))
+    return selected
+
+
+def _finite_or_none(value):
+    """Coerce ``value`` to a finite float; return None when that is impossible."""
+    import math
+
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+    return number if math.isfinite(number) else None
+
+
+def _make_hist_box(name: str, numeric: dict, box: dict):
+    """Build the histogram (mean/median/±σ references) + Tukey boxplot figure.
+
+    Meant to be called lazily (``_figure_maker`` wraps it in a zero-argument
+    callable handed to ``model.Figure``), so matplotlib is only imported and
+    the figure only drawn when that callable is invoked. Both stacked axes
+    share the X axis and live in a single figure.
+
+    Every number read from ``numeric`` and ``box`` is validated first: a
+    missing, non-numeric or non-finite value makes the corresponding artist
+    be skipped (or replaced by a short grey note) instead of raising.
+
+    Args:
+        name: column name, used as X label and figure title.
+        numeric: the column's ``numeric`` sub-block (``histogram``, ``mean``,
+            ``median``, ``std`` are read).
+        box: boxplot stats dict (``median``, ``q1``, ``q3``, ``whisker_lo``,
+            ``whisker_hi``, ``has_low_outliers``, ``has_high_outliers``,
+            ``min``, ``max`` are read); may be empty.
+
+    Returns:
+        The matplotlib figure holding the two axes.
+    """
+    import matplotlib
+
+    # Select the Agg backend before drawing; note this is a process-wide
+    # matplotlib setting, not local to this figure.
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    fig, (ax_h, ax_b) = plt.subplots(
+        2, 1, figsize=(6.4, 3.4), sharex=True,
+        gridspec_kw={"height_ratios": [3.2, 1.0], "hspace": 0.08})
+
+    # ---- Histogram from the precomputed equal-width bins {lo, hi, count}. ----
+    hist = numeric.get("histogram") or []
+    drew_bars = False
+    for b in hist:
+        if not isinstance(b, dict):
+            continue
+        lo = _finite_or_none(b.get("lo"))
+        hi = _finite_or_none(b.get("hi"))
+        if lo is None or hi is None:
+            continue
+        count = _finite_or_none(b.get("count")) or 0
+        # Degenerate bin (hi <= lo): give it a tiny visible width.
+        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
+        ax_h.bar(lo, count, width=width, align="edge", color="#9ec6df",
+                 edgecolor="#5b8aa6", linewidth=0.4, zorder=2)
+        drew_bars = True
+    if not drew_bars:
+        ax_h.text(0.5, 0.5, "(sin histograma)", ha="center", va="center",
+                  fontsize=9, color="#8a8a8a", transform=ax_h.transAxes)
+
+    # Raw values feed the legend text; validated floats feed the drawing.
+    raw_mean = numeric.get("mean")
+    raw_median = numeric.get("median")
+    mean = _finite_or_none(raw_mean)
+    median = _finite_or_none(raw_median)
+    std = _finite_or_none(numeric.get("std"))
+
+    # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
+    if mean is not None and std is not None and std > 0:
+        ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
+                     zorder=1, label="±1σ")
+    if median is not None:
+        ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
+                     zorder=4, label=f"mediana = {_fmt_num(raw_median)}")
+    if mean is not None:
+        ax_h.axvline(mean, color="#c0392b", linestyle="--", linewidth=1.6,
+                     zorder=4, label=f"media = {_fmt_num(raw_mean)}")
+
+    ax_h.set_ylabel("frecuencia", fontsize=8)
+    ax_h.tick_params(labelsize=7)
+    # Only build a legend when a labelled artist exists; an empty legend
+    # request makes matplotlib complain.
+    handles, _ = ax_h.get_legend_handles_labels()
+    if handles:
+        ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
+    for spine in ("top", "right"):
+        ax_h.spines[spine].set_visible(False)
+
+    # ---- Tukey boxplot below, sharing the X axis (MUST-4.2). ----
+    drew_box = False
+    if isinstance(box, dict) and box:
+        five = {
+            "med": _finite_or_none(box.get("median")),
+            "q1": _finite_or_none(box.get("q1")),
+            "q3": _finite_or_none(box.get("q3")),
+            "whislo": _finite_or_none(box.get("whisker_lo")),
+            "whishi": _finite_or_none(box.get("whisker_hi")),
+        }
+        if all(v is not None for v in five.values()):
+            # Raw outlier values are not in the profile, hence no fliers.
+            stats = [dict(five, fliers=[], label="")]
+            bxp_kw = dict(
+                showfliers=False, widths=0.5, patch_artist=True,
+                boxprops={"facecolor": "#9ec6df", "edgecolor": "#5b8aa6"},
+                medianprops={"color": "#2e8b57", "linewidth": 1.6},
+                whiskerprops={"color": "#5b8aa6"},
+                capprops={"color": "#5b8aa6"})
+            try:
+                # ``orientation`` is the current API; older matplotlib only
+                # knows ``vert`` and rejects the new keyword with TypeError.
+                try:
+                    ax_b.bxp(stats, orientation="horizontal", **bxp_kw)
+                except TypeError:
+                    ax_b.bxp(stats, vert=False, **bxp_kw)
+                drew_box = True
+            except Exception:  # noqa: BLE001 — never let one axis kill the figure.
+                drew_box = False  # the fallback note below reports it.
+        if drew_box:
+            # Mark the presence of out-of-fence points (raw values unknown)
+            # at the observed extremes, on the box's Y position (1).
+            low_pt = _finite_or_none(box.get("min"))
+            high_pt = _finite_or_none(box.get("max"))
+            if box.get("has_low_outliers") and low_pt is not None:
+                ax_b.plot([low_pt], [1], marker="o", markersize=3.5,
+                          color="#c0392b", zorder=5)
+            if box.get("has_high_outliers") and high_pt is not None:
+                ax_b.plot([high_pt], [1], marker="o", markersize=3.5,
+                          color="#c0392b", zorder=5)
+    if not drew_box:
+        # Degrade to a visible note instead of leaving a silent blank axis.
+        ax_b.text(0.5, 0.5, "(boxplot no disponible)", ha="center", va="center",
+                  fontsize=8, color="#8a8a8a", transform=ax_b.transAxes)
+
+    ax_b.set_yticks([])
+    ax_b.set_xlabel(name, fontsize=8)
+    ax_b.tick_params(labelsize=7)
+    for spine in ("top", "right", "left"):
+        ax_b.spines[spine].set_visible(False)
+
+    fig.suptitle(name, fontsize=10, fontweight="bold", x=0.02, ha="left")
+    return fig
+
+
+def _stats_note(name: str, numeric: dict, box: dict) -> str:
+    """Render the per-column Markdown note: key figures, then the shape gloss.
+
+    The first paragraph joins with `` · `` the mean, median, σ, min, max and
+    IQR, plus the outlier count (with its percentage when available) and the
+    Tukey fences when ``box`` provides a lower fence. A second paragraph
+    explains ``distribution_type`` when ``_DIST_GLOSS`` has an entry for it.
+    ``name`` is accepted for signature symmetry and is not used.
+    """
+    labelled = (
+        ("media", "mean"), ("mediana", "median"), ("σ", "std"),
+        ("min", "min"), ("max", "max"), ("IQR", "iqr"),
+    )
+    parts = [f"{label} {_fmt_num(numeric.get(key))}" for label, key in labelled]
+
+    outlier_count = numeric.get("n_outliers")
+    share = numeric.get("outlier_pct")
+    if outlier_count is not None:
+        suffix = "" if share is None else f" ({_fmt_num(share, 2)}%)"
+        parts.append(f"outliers {outlier_count}{suffix}")
+
+    if box and box.get("lower_fence") is not None:
+        low = _fmt_num(box.get("lower_fence"))
+        high = _fmt_num(box.get("upper_fence"))
+        parts.append(f"vallas Tukey [{low}, {high}]")
+
+    note = " · ".join(parts)
+
+    shape = numeric.get("distribution_type")
+    explanation = _DIST_GLOSS.get(shape)
+    if shape and explanation:
+        note += f"\n\n**Forma ({shape}):** {explanation}"
+    return note
+
+
+def _figure_maker(name: str, numeric: dict, box: dict):
+    """Return a zero-argument callable that draws one column's figure.
+
+    The column data is captured as this factory's own parameters, so callables
+    created inside a loop each keep their own column instead of sharing the
+    loop variables.
+    """
+    return lambda: _make_hist_box(name, numeric, box)
+
+
+def build_num_distr(profile: dict, ctx: dict):
+    """Assemble the numeric-distributions chapter from an ``eda`` profile.
+
+    Args:
+        profile: the ``eda`` group TableProfile dict; ``None`` is treated as
+            an empty profile.
+        ctx: presentation context; normalised but otherwise unused here.
+
+    Returns:
+        A ``model.Chapter`` whose blocks are the chapter heading, an intro and
+        then, for each numeric column, a level-2 heading, the lazy
+        histogram+boxplot figure and a stats note; ``None`` when no numeric
+        column qualifies.
+    """
+    profile = profile or {}
+    ctx = ctx or {}
+
+    def _box_for(stats):
+        # Boxplot stats from the registry function; any failure (or its
+        # absence) degrades to an empty dict rather than raising.
+        if build_boxplot_stats is None:
+            return {}
+        try:
+            return build_boxplot_stats(stats) or {}
+        except Exception:  # noqa: BLE001 — degrade, never raise.
+            return {}
+
+    columns = _numeric_columns(profile)
+    if not columns:
+        # The chapter does not apply to a dataset without numeric columns.
+        return None
+
+    intro = (
+        "Para cada columna numérica se muestra su **histograma** con tres líneas "
+        "de referencia: la **media** (línea roja discontinua), la **mediana** "
+        "(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, "
+        "alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del "
+        "primer al tercer cuartil (P25–P75), la línea interior es la mediana y "
+        "los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
+        "valores más allá de las vallas. Comparar media y mediana revela la "
+        "asimetría de la distribución.")
+
+    blocks = [
+        model.Heading(text=CHAPTER_TITLE, level=1),
+        model.Markdown(text=intro),
+    ]
+
+    for col_name, stats in columns:
+        fences = _box_for(stats)
+        blocks.extend([
+            model.Heading(text=str(col_name), level=2),
+            model.Figure(
+                make=_figure_maker(col_name, stats, fences),
+                caption=f"Distribución de «{col_name}» — histograma "
+                        f"(media/mediana/±σ) y boxplot."),
+            model.Markdown(text=_stats_note(col_name, stats, fences)),
+        ])
+
+    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,151 @@
+"""Tests for the NUM DISTR chapter — DoD: golden + edges + anti-cut.
+
+Self-contained: builds synthetic ``numeric`` blocks (no DuckDB) so the suite is
+fast and deterministic. Verifies that the chapter emits, per numeric column, a
+histogram+boxplot figure plus a stats note; that the mean/median/±σ requirement
+and the boxplot are present; that a profile with no numeric column yields None;
+that None/empty never raises; and that with many numeric columns and long text
+both the PDF and the PPTX render without cutting anything (every column heading
+survives in the rendered output).
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+
+from datascience.automatic_eda.chapters.num_distr import (
+    build_num_distr, CHAPTER_VERSION, _DIST_GLOSS,
+)
+from datascience.automatic_eda import model
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _numeric_block(mean, median, std, mn, mx, dist="normal-ish",
+                   n_outliers=0, nbins=10):
+    """Fabricate a ``numeric`` sub-block shaped like describe_numeric's output.
+
+    The histogram has ``nbins`` equal-width bins over ``[mn, mx]`` (width 1.0
+    when the range is empty) with counts 3, 6, 9, …; the quartiles sit at 25 %
+    and 75 % of the range and the outlier percentage assumes 300 rows.
+    """
+    span = mx - mn
+    step = span / nbins if mx > mn else 1.0
+    edges = [mn + k * step for k in range(nbins + 1)]
+    histogram = [
+        {"lo": edges[k], "hi": edges[k + 1], "count": (k + 1) * 3}
+        for k in range(nbins)
+    ]
+    q1 = mn + span * 0.25
+    q3 = mn + span * 0.75
+    block = {"min": mn, "max": mx, "mean": mean, "median": median, "std": std}
+    block.update({"p25": q1, "p50": median, "p75": q3, "iqr": q3 - q1})
+    block["n_outliers"] = n_outliers
+    block["outlier_pct"] = 100.0 * n_outliers / 300.0
+    block["distribution_type"] = dist
+    block["histogram"] = histogram
+    return block
+
+
+def _profile(n_numeric=2, extra_categorical=True):
+    """Build a synthetic table profile with ``n_numeric`` numeric columns.
+
+    Columns cycle through four presets; once the presets are exhausted the
+    name gets the column index as suffix (``precio_4``, ``alcohol_5``, …).
+    A trailing categorical column is added unless ``extra_categorical`` is
+    false.
+    """
+    presets = [
+        ("precio", 42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5),
+        ("alcohol", 10.4, 10.3, 1.1, 8.0, 14.9, "normal-ish", 0),
+        ("sulfatos", 0.66, 0.62, 0.17, 0.33, 2.0, "heavy-tail", 9),
+        ("calidad", 5.6, 6.0, 0.8, 3.0, 8.0, "discrete", 0),
+    ]
+    columns = []
+    for idx in range(n_numeric):
+        base, mean, med, std, mn, mx, dist, n_out = presets[idx % len(presets)]
+        col_name = base if idx < len(presets) else f"{base}_{idx}"
+        stats = _numeric_block(mean, med, std, mn, mx, dist, n_out)
+        columns.append({"name": col_name, "inferred_type": "numeric",
+                        "numeric": stats})
+    if extra_categorical:
+        top = [{"value": "tinto", "count": 200}]
+        columns.append({"name": "categoria", "inferred_type": "categorical",
+                        "categorical": {"top": top}})
+    return {"table": "vinos", "n_rows": 300, "n_cols": len(columns),
+            "columns": columns}
+
+
+def _pdf_text(path: str) -> str:
+    """Return the text of every page of the PDF with whitespace collapsed.
+
+    Pages are joined with a newline before collapsing, so the last word of a
+    page is never glued to the first word of the next one (which could
+    fabricate or hide a match in the substring checks of the tests).
+    """
+    pages = PdfReader(path).pages
+    txt = "\n".join((pg.extract_text() or "") for pg in pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def test_golden_chapter_estructura_y_bloques():
+    """Golden path: chapter metadata, exact block layout and a real figure."""
+    import matplotlib.pyplot as plt
+
+    ch = build_num_distr(_profile(n_numeric=2), {})
+    assert ch is not None
+    assert ch.id == "num_distr"
+    assert ch.version == CHAPTER_VERSION
+    kinds = [b.kind for b in ch.blocks]
+    # Chapter heading + intro Markdown, then Heading + Figure + Markdown per
+    # numeric column; the categorical column of the fixture adds no block.
+    assert kinds == (["heading", "markdown"]
+                     + ["heading", "figure", "markdown"] * 2)
+    # Each figure has a lazy maker that produces a real matplotlib figure.
+    figs = [b for b in ch.blocks if b.kind == "figure"]
+    fig = figs[0].make()
+    try:
+        assert fig is not None
+        # Two stacked axes: histogram + boxplot share the figure.
+        assert len(fig.axes) == 2
+    finally:
+        # Release the figure even when an assertion above fails.
+        plt.close(fig)
+
+
+def test_golden_media_mediana_sigma_y_boxplot_presentes():
+    """The intro explains the references; the note carries the real numbers.
+
+    Intro and per-column note are checked separately: a single check over the
+    joined text would be satisfied by the intro alone, and ``"media"`` is a
+    substring of ``"mediana"``.
+    """
+    ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
+    assert ch is not None
+    md_blocks = [b.text for b in ch.blocks if b.kind == "markdown"]
+    # One numeric column => exactly the intro plus that column's note.
+    assert len(md_blocks) == 2
+    intro, note = md_blocks
+    # The intro documents the three reference lines and the Tukey boxplot.
+    assert "**media**" in intro and "**mediana**" in intro
+    assert "±1σ" in intro
+    assert "boxplot de Tukey" in intro
+    # The note carries the actual mean/median/σ of the "precio" preset.
+    assert "media 42.5" in note
+    assert "mediana 40" in note
+    assert "σ 12.3" in note
+    # distribution_type gloss surfaced for the column (right-skewed preset).
+    assert "right-skewed" in note
+    assert _DIST_GLOSS["right-skewed"].split(";")[0][:20] in note
+
+
+def test_boxplot_stats_se_consumen_del_registry():
+    # Calls the registry function (group eda) directly on a synthetic numeric
+    # block and checks its output exposes the Tukey fences and both quartiles.
+    from datascience.build_boxplot_stats import build_boxplot_stats
+    numeric = _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
+    box = build_boxplot_stats(numeric)
+    assert box
+    for fence in ("lower_fence", "upper_fence"):
+        assert fence in box
+    for quartile in ("q1", "q3"):
+        assert box[quartile] is not None
+
+
+def test_edge_sin_columnas_numericas_devuelve_none():
+    # A profile whose only column is categorical produces no chapter at all.
+    only_categorical = {"name": "c", "inferred_type": "categorical",
+                        "categorical": {"top": []}}
+    assert build_num_distr({"columns": [only_categorical]}, {}) is None
+
+
+def test_edge_profile_none_y_vacio_no_revienta():
+    # None, empty and column-less profiles must yield None without raising.
+    cases = [(None, None), ({}, {}), ({"columns": []}, {})]
+    for profile, ctx in cases:
+        assert build_num_distr(profile, ctx) is None
+
+
+def test_anti_corte_muchas_columnas_pdf_y_pptx():
+    # Anti-cut guard with 8 numeric columns. PDF: the text of every level-2
+    # column heading must appear in the extracted text. PPTX: the deck must
+    # hold at least as many slides as there are column figures.
+    chapter = build_num_distr(_profile(n_numeric=8), {})
+    names = [blk.text for blk in chapter.blocks
+             if blk.kind == "heading" and blk.level == 2]
+    assert len(names) == 8
+    with tempfile.TemporaryDirectory() as workdir:
+        # --- PDF: render, then look every column name up in the text. ---
+        pdf = os.path.join(workdir, "num.pdf")
+        pdf_result = render_automatic_eda_pdf(
+            _profile(n_numeric=8), pdf, {"write_manifest": False})
+        assert pdf_result["path"] == pdf
+        txt = _pdf_text(pdf)
+        for name in names:
+            assert name in txt, f"columna '{name}' cortada/ausente en el PDF"
+        # --- PPTX: render and check the slide count only. ---
+        pptx = os.path.join(workdir, "num.pptx")
+        pptx_result = render_automatic_eda_pptx(
+            _profile(n_numeric=8), pptx, {"write_manifest": False})
+        assert pptx_result["path"] == pptx
+        # At least one slide per column figure.
+        assert pptx_result["n_slides"] >= 8
+
+
+def test_distribution_gloss_cubre_todas_las_etiquetas():
+    # Each distribution label listed here must map to a non-empty Spanish
+    # gloss in ``_DIST_GLOSS``.
+    expected_labels = (
+        "normal-ish", "right-skewed", "left-skewed", "heavy-tail",
+        "lognormal-ish", "multimodal", "discrete", "too_few_samples",
+        "other",
+    )
+    for label in expected_labels:
+        assert _DIST_GLOSS.get(label)