diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr.py b/python/functions/datascience/automatic_eda/chapters/num_distr.py new file mode 100644 index 00000000..6c105dc6 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/num_distr.py @@ -0,0 +1,289 @@ +"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA. + +For every numeric column the chapter draws, as a single indivisible figure, a +histogram with the **mean, median and ±1σ band drawn as reference lines** and a +**Tukey boxplot right below it** sharing the same X axis — exactly the user +requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block +so the renderers rasterize and scale it to fit a whole page/slide and nothing is +ever cut; datasets with many numeric columns simply flow across pages as small +multiples. + +Data comes from the ``eda`` group profile and is never recomputed here: + +- ``columns[i]['numeric']`` (the output of ``describe_numeric``) gives + ``mean, median, std, min, max, p25, p75, iqr, n_outliers, outlier_pct, + distribution_type`` and the ``histogram`` bins ``[{lo, hi, count}]``. +- The boxplot five-number summary + Tukey 1.5·IQR fences are derived by the + pure registry function ``build_boxplot_stats`` (group ``eda``); this chapter + only consumes its output, it does not reimplement the statistics. + +Contract: build_num_distr(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +Reads everything defensively (``.get``) and never raises: a column whose boxplot +stats cannot be derived gets a placeholder boxplot instead of aborting the chapter. +""" + +from __future__ import annotations + +from .. import model + +# Pure registry function (group ``eda``) that derives the Tukey boxplot stats +# from a ``numeric`` sub-block. Imported defensively so the chapter still builds +# (degrading the boxplot to a placeholder) if the function is somehow unavailable. 
# Guarded import of the registry function that derives the Tukey boxplot stats
# from a ``numeric`` sub-block: if it cannot be imported the chapter still
# builds and the boxplot axis shows a placeholder instead.
try:
    from datascience.build_boxplot_stats import build_boxplot_stats
except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
    build_boxplot_stats = None  # type: ignore[assignment]

CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "num_distr"
CHAPTER_TITLE = "Distribuciones numéricas"

# Plain-Spanish gloss for every distribution label, so a non-expert reader
# understands the shape and the suggested next step (MUST-4.3).
_DIST_GLOSS = {
    "normal-ish": "aproximadamente simétrica (campana); media y mediana casi "
                  "coinciden.",
    "right-skewed": "asimétrica a la derecha (cola larga hacia valores altos); "
                    "la media supera a la mediana — considera una transformación "
                    "logarítmica.",
    "left-skewed": "asimétrica a la izquierda (cola larga hacia valores bajos); "
                   "la media queda por debajo de la mediana.",
    "heavy-tail": "colas pesadas (curtosis alta): más valores extremos de lo "
                  "que esperaría una normal — vigila los outliers.",
    "lognormal-ish": "compatible con lognormal (simétrica al tomar logaritmos); "
                     "la re-expresión log suele normalizarla.",
    "multimodal": "varios picos: probablemente mezcla de subgrupos — conviene "
                  "segmentar antes de resumir con una sola media.",
    "discrete": "pocos valores distintos (discreta/ordinal); el histograma "
                "cuenta niveles, no un continuo.",
    "too_few_samples": "muestra demasiado pequeña para clasificar la forma con "
                       "fiabilidad.",
    "other": "forma no encuadrada en las categorías estándar.",
}


def _fmt_num(value, decimals: int = 3) -> str:
    """Format a number compactly for display.

    Args:
        value: the value to format; ``None`` renders as an em dash.
        decimals: maximum number of decimals kept for floats.

    Returns:
        ``"—"`` for ``None``; ``str(value)`` for bools, infinities and any
        non-numeric value; ints with ``.`` as thousands separator; floats
        rounded to ``decimals`` places with trailing zeros removed.
    """
    if value is None:
        return "—"
    if isinstance(value, bool):  # bool is an int subclass: handle it first.
        return str(value)
    if isinstance(value, int):
        return f"{value:,}".replace(",", ".")
    if isinstance(value, float):
        if value != value:  # NaN is the only value unequal to itself.
            return "NaN"
        if value in (float("inf"), float("-inf")):
            return str(value)
        text = f"{value:.{decimals}f}"
        # Only strip zeros that belong to a fractional part; with decimals=0
        # there is no "." and stripping would eat integer digits (120 -> 12).
        if "." in text:
            text = text.rstrip("0").rstrip(".")
        # A tiny negative rounds to "-0"; show a plain zero instead.
        if text in ("", "-0"):
            return "0"
        return text
    return str(value)


def _finite(value):
    """Return ``value`` as a finite float, or ``None`` if it is not one.

    Used to validate every coordinate handed to matplotlib, so that a missing,
    non-numeric, NaN or infinite entry is skipped instead of raising while the
    figure is being drawn.
    """
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if number != number or number in (float("inf"), float("-inf")):
        return None
    return number


def _numeric_columns(profile: dict) -> list:
    """Return ``(name, numeric_dict)`` pairs for columns with usable stats.

    A column qualifies when it is a dict whose ``inferred_type`` is
    ``"numeric"`` and whose ``numeric`` entry is a non-empty dict carrying at
    least a ``mean`` or a ``median``. A column without a name is reported as
    ``"(columna)"``. Anything malformed (non-dict profile, ``columns`` that is
    not a list/tuple, non-dict column) is skipped rather than raising.
    """
    columns = profile.get("columns") if isinstance(profile, dict) else None
    if not isinstance(columns, (list, tuple)):
        return []

    usable = []
    for col in columns:
        if not isinstance(col, dict) or col.get("inferred_type") != "numeric":
            continue
        num = col.get("numeric")
        if not isinstance(num, dict) or not num:
            continue
        # A numeric block is renderable when it carries at least a center.
        if num.get("mean") is None and num.get("median") is None:
            continue
        usable.append((col.get("name") or "(columna)", num))
    return usable


def _draw_histogram(ax_h, numeric: dict) -> None:
    """Draw the precomputed bins plus the ±1σ band, median and mean lines."""
    bins = numeric.get("histogram")
    drew_bars = False
    for entry in bins if isinstance(bins, (list, tuple)) else ():
        if not isinstance(entry, dict):
            continue
        lo = _finite(entry.get("lo"))
        hi = _finite(entry.get("hi"))
        if lo is None or hi is None:
            continue
        count = _finite(entry.get("count")) or 0.0
        # A degenerate (zero-width) bin still gets a thin visible bar.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax_h.bar(lo, count, width=width, align="edge", color="#9ec6df",
                 edgecolor="#5b8aa6", linewidth=0.4, zorder=2)
        drew_bars = True
    if not drew_bars:
        ax_h.text(0.5, 0.5, "(sin histograma)", ha="center", va="center",
                  fontsize=9, color="#8a8a8a", transform=ax_h.transAxes)

    mean = _finite(numeric.get("mean"))
    median = _finite(numeric.get("median"))
    std = _finite(numeric.get("std"))

    # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
    if mean is not None and std is not None and std > 0:
        ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
                     zorder=1, label="±1σ")
    if median is not None:
        ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
                     zorder=4,
                     label=f"mediana = {_fmt_num(numeric.get('median'))}")
    if mean is not None:
        ax_h.axvline(mean, color="#c0392b", linestyle="--", linewidth=1.6,
                     zorder=4, label=f"media = {_fmt_num(numeric.get('mean'))}")

    ax_h.set_ylabel("frecuencia", fontsize=8)
    ax_h.tick_params(labelsize=7)
    # Only build a legend when something is labelled; an empty legend makes
    # matplotlib emit a warning.
    handles, _labels = ax_h.get_legend_handles_labels()
    if handles:
        ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
    for spine in ("top", "right"):
        ax_h.spines[spine].set_visible(False)


def _draw_boxplot(ax_b, box: dict) -> None:
    """Draw the horizontal Tukey boxplot, or a placeholder when impossible."""
    five = {
        "med": _finite(box.get("median")),
        "q1": _finite(box.get("q1")),
        "q3": _finite(box.get("q3")),
        "whislo": _finite(box.get("whisker_lo")),
        "whishi": _finite(box.get("whisker_hi")),
    }
    drawn = False
    if all(v is not None for v in five.values()):
        # Raw outlier values are not in the profile, hence no fliers.
        stats = [dict(five, fliers=[], label="")]
        bxp_kw = dict(
            showfliers=False, widths=0.5, patch_artist=True,
            boxprops={"facecolor": "#9ec6df", "edgecolor": "#5b8aa6"},
            medianprops={"color": "#2e8b57", "linewidth": 1.6},
            whiskerprops={"color": "#5b8aa6"},
            capprops={"color": "#5b8aa6"})
        try:
            # ``orientation`` is the current API; older matplotlib uses ``vert``.
            try:
                ax_b.bxp(stats, orientation="horizontal", **bxp_kw)
            except TypeError:
                ax_b.bxp(stats, vert=False, **bxp_kw)
            drawn = True
        except Exception:  # noqa: BLE001 — never let one axis kill the figure.
            drawn = False

    if drawn:
        # Mark the presence of out-of-fence points (the raw values are unknown).
        for flag, key in (("has_low_outliers", "min"),
                          ("has_high_outliers", "max")):
            edge = _finite(box.get(key))
            if box.get(flag) and edge is not None:
                ax_b.plot([edge], [1], marker="o", markersize=3.5,
                          color="#c0392b", zorder=5)
    else:
        # Say so explicitly instead of leaving a silently blank axis.
        ax_b.text(0.5, 0.5, "(boxplot no disponible)", ha="center", va="center",
                  fontsize=8, color="#8a8a8a", transform=ax_b.transAxes)

    ax_b.set_yticks([])
    ax_b.tick_params(labelsize=7)
    for spine in ("top", "right", "left"):
        ax_b.spines[spine].set_visible(False)


def _make_hist_box(name: str, numeric: dict, box: dict):
    """Build the histogram (with mean/median/±σ lines) + boxplot figure.

    Args:
        name: column name, used as X label and figure title.
        numeric: the column's ``numeric`` stats dict (``histogram`` bins plus
            ``mean``/``median``/``std``).
        box: boxplot stats dict (``median``, ``q1``, ``q3``, ``whisker_lo``,
            ``whisker_hi`` and optional outlier flags); may be empty.

    Returns:
        A matplotlib figure with two stacked axes sharing the X axis. The
        caller owns the figure and is responsible for closing it.

    Raises:
        Whatever matplotlib raises while drawing; the half-built figure is
        closed first so it does not stay registered in pyplot.
    """
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    numeric = numeric if isinstance(numeric, dict) else {}
    box = box if isinstance(box, dict) else {}
    label = str(name)

    fig, (ax_h, ax_b) = plt.subplots(
        2, 1, figsize=(6.4, 3.4), sharex=True,
        gridspec_kw={"height_ratios": [3.2, 1.0], "hspace": 0.08})
    try:
        _draw_histogram(ax_h, numeric)
        # Tukey boxplot below, sharing the X axis (MUST-4.2).
        _draw_boxplot(ax_b, box)
        ax_b.set_xlabel(label, fontsize=8)
        fig.suptitle(label, fontsize=10, fontweight="bold", x=0.02, ha="left")
    except Exception:
        # pyplot keeps a reference to every figure it creates; release it.
        plt.close(fig)
        raise
    return fig


def _stats_note(name: str, numeric: dict, box: dict) -> str:
    """Return one compact line of key numbers plus a plain-Spanish shape gloss.

    Args:
        name: column name (accepted for signature symmetry; not used).
        numeric: the column's ``numeric`` stats dict.
        box: boxplot stats dict; the Tukey fences are appended when
            ``lower_fence`` is present.

    Returns:
        Markdown text: the numbers joined by `` · `` and, when the
        ``distribution_type`` label has a gloss, a second paragraph with it.
    """
    bits = [
        f"{label} {_fmt_num(numeric.get(key))}"
        for label, key in (("media", "mean"), ("mediana", "median"),
                           ("σ", "std"), ("min", "min"), ("max", "max"),
                           ("IQR", "iqr"))
    ]
    n_out = numeric.get("n_outliers")
    if n_out is not None:
        out_pct = numeric.get("outlier_pct")
        pct = f" ({_fmt_num(out_pct, 2)}%)" if out_pct is not None else ""
        # Same formatter as every other number in the line.
        bits.append(f"outliers {_fmt_num(n_out)}{pct}")
    if isinstance(box, dict) and box.get("lower_fence") is not None:
        bits.append(
            f"vallas Tukey [{_fmt_num(box.get('lower_fence'))}, "
            f"{_fmt_num(box.get('upper_fence'))}]")
    line = " · ".join(bits)

    dist = numeric.get("distribution_type")
    # Only strings can be gloss keys; an unhashable label must not raise.
    gloss = _DIST_GLOSS.get(dist) if isinstance(dist, str) else None
    if dist and gloss:
        line += f"\n\n**Forma ({dist}):** {gloss}"
    return line


def _figure_maker(name: str, numeric: dict, box: dict):
    """Return a zero-arg callable that draws this column's figure on demand.

    Binding the arguments through a function call (rather than a closure
    created directly inside the caller's loop) gives each callable its own
    values.
    """
    def _make():
        return _make_hist_box(name, numeric, box)

    return _make


def _safe_boxplot_stats(numeric: dict) -> dict:
    """Call ``build_boxplot_stats`` defensively; return ``{}`` on any failure."""
    if build_boxplot_stats is None:
        return {}
    try:
        box = build_boxplot_stats(numeric)
    except Exception:  # noqa: BLE001 — degrade, never raise.
        return {}
    return box if isinstance(box, dict) else {}


def build_num_distr(profile: dict, ctx: dict):
    """Build the numeric-distributions Chapter, or None if no numeric column.

    Args:
        profile: the table profile dict; ``None`` or a non-dict is treated as
            an empty profile.
        ctx: presentation context. Accepted for the chapter contract; this
            chapter does not read it.

    Returns:
        A ``model.Chapter`` with, per numeric column, a level-2 heading, a
        lazy histogram+boxplot figure and a stats note; or ``None`` when the
        profile has no usable numeric column.
    """
    if not isinstance(profile, dict):
        profile = {}

    numerics = _numeric_columns(profile)
    if not numerics:
        return None  # chapter does not apply to a dataset with no numerics.

    intro = (
        "Para cada columna numérica se muestra su **histograma** con tres líneas "
        "de referencia: la **media** (línea roja discontinua), la **mediana** "
        "(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, "
        "alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del "
        "primer al tercer cuartil (P25–P75), la línea interior es la mediana y "
        "los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
        "valores más allá de las vallas. Comparar media y mediana revela la "
        "asimetría de la distribución.")

    blocks = [
        model.Heading(text=CHAPTER_TITLE, level=1),
        model.Markdown(text=intro),
    ]

    for name, numeric in numerics:
        box = _safe_boxplot_stats(numeric)
        blocks.append(model.Heading(text=str(name), level=2))
        blocks.append(model.Figure(
            make=_figure_maker(name, numeric, box),
            caption=f"Distribución de «{name}» — histograma (media/mediana/±σ) "
                    f"y boxplot."))
        blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
# --- python/functions/datascience/automatic_eda/chapters/num_distr_test.py ---
"""Tests for the NUM DISTR chapter — DoD: golden + edges + anti-cut.

Self-contained: builds synthetic ``numeric`` blocks (no DuckDB) so the suite is
fast and deterministic. Verifies that the chapter emits, per numeric column, a
histogram+boxplot figure plus a stats note; that the mean/median/±σ requirement
and the boxplot are present; that a profile with no numeric column yields None;
that None/empty never raises; and that with many numeric columns and long text
both the PDF and the PPTX render without cutting anything (every column heading
survives in the rendered PDF text and the deck has at least one slide per
column).
"""

import os
import re
import tempfile

from pypdf import PdfReader

from datascience.automatic_eda.chapters.num_distr import (
    build_num_distr, CHAPTER_VERSION, _DIST_GLOSS,
)
from datascience.automatic_eda import model
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx


def _numeric_block(mean, median, std, mn, mx, dist="normal-ish",
                   n_outliers=0, nbins=10, n_rows=300):
    """Build a synthetic ``numeric`` sub-block for the chapter under test.

    The histogram has ``nbins`` equal-width bins over ``[mn, mx]`` with
    increasing counts; quartiles are placed at 25 % / 75 % of the range and
    ``outlier_pct`` is ``n_outliers`` relative to ``n_rows``.
    """
    width = (mx - mn) / nbins if mx > mn else 1.0
    hist = [{"lo": mn + i * width, "hi": mn + (i + 1) * width,
             "count": (i + 1) * 3} for i in range(nbins)]
    p25 = mn + (mx - mn) * 0.25
    p75 = mn + (mx - mn) * 0.75
    return {
        "min": mn, "max": mx, "mean": mean, "median": median, "std": std,
        "p25": p25, "p50": median, "p75": p75, "iqr": p75 - p25,
        "n_outliers": n_outliers, "outlier_pct": 100.0 * n_outliers / n_rows,
        "distribution_type": dist, "histogram": hist,
    }


def _profile(n_numeric=2, extra_categorical=True):
    """Build a synthetic table profile with ``n_numeric`` numeric columns.

    Columns cycle through four presets; once the presets are exhausted the
    name gets an ``_<index>`` suffix so every column name stays unique.
    """
    cols = []
    presets = [
        ("precio", 42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5),
        ("alcohol", 10.4, 10.3, 1.1, 8.0, 14.9, "normal-ish", 0),
        ("sulfatos", 0.66, 0.62, 0.17, 0.33, 2.0, "heavy-tail", 9),
        ("calidad", 5.6, 6.0, 0.8, 3.0, 8.0, "discrete", 0),
    ]
    for i in range(n_numeric):
        name, mean, med, std, mn, mx, dist, no = presets[i % len(presets)]
        if i >= len(presets):
            name = f"{name}_{i}"
        cols.append({"name": name, "inferred_type": "numeric",
                     "numeric": _numeric_block(mean, med, std, mn, mx, dist, no)})
    if extra_categorical:
        cols.append({"name": "categoria", "inferred_type": "categorical",
                     "categorical": {"top": [{"value": "tinto", "count": 200}]}})
    return {"table": "vinos", "n_rows": 300, "n_cols": len(cols),
            "columns": cols}


def _pdf_text(path: str) -> str:
    """Return the whole PDF text with every whitespace run collapsed to a space."""
    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
    return re.sub(r"\s+", " ", txt)


def test_golden_chapter_estructura_y_bloques():
    ch = build_num_distr(_profile(n_numeric=2), {})
    assert ch is not None
    assert ch.id == "num_distr"
    assert ch.version == CHAPTER_VERSION
    kinds = [b.kind for b in ch.blocks]
    # Heading + intro Markdown, then per column: Heading + Figure + Markdown.
    assert kinds[0] == "heading"
    assert kinds[1] == "markdown"
    assert kinds.count("figure") == 2  # one figure per numeric column.
    assert kinds.count("heading") == 1 + 2  # chapter title + one per column.
    # Each figure has a lazy maker that produces a real matplotlib figure.
    figs = [b for b in ch.blocks if b.kind == "figure"]
    import matplotlib.pyplot as plt
    fig = figs[0].make()
    try:
        assert fig is not None
        # Two stacked axes: histogram + boxplot share the figure.
        assert len(fig.axes) == 2
    finally:
        # Close even when an assertion fails so the figure is not leaked.
        plt.close(fig)


def test_golden_media_mediana_sigma_y_boxplot_presentes():
    # The intro documents the three reference lines and the Tukey boxplot; the
    # per-column note carries the actual mean/median/σ numbers and the shape.
    ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
    assert ch is not None
    md_texts = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
    assert "media" in md_texts and "mediana" in md_texts
    assert "±1σ" in md_texts
    assert "boxplot" in md_texts.lower()
    assert "Tukey" in md_texts
    # distribution_type gloss surfaced for the column (right-skewed preset).
    assert _DIST_GLOSS["right-skewed"].split(";")[0][:20] in md_texts


def test_boxplot_stats_se_consumen_del_registry():
    # build_boxplot_stats (group eda) must return a box carrying the Tukey
    # fences and quartiles that the chapter's figure relies on.
    from datascience.build_boxplot_stats import build_boxplot_stats
    box = build_boxplot_stats(
        _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5))
    assert box
    assert "lower_fence" in box and "upper_fence" in box
    assert box["q1"] is not None and box["q3"] is not None


def test_edge_sin_columnas_numericas_devuelve_none():
    prof = {"columns": [{"name": "c", "inferred_type": "categorical",
                         "categorical": {"top": []}}]}
    assert build_num_distr(prof, {}) is None


def test_edge_profile_none_y_vacio_no_revienta():
    assert build_num_distr(None, None) is None
    assert build_num_distr({}, {}) is None
    assert build_num_distr({"columns": []}, {}) is None


def test_anti_corte_muchas_columnas_pdf_y_pptx():
    # 8 numeric columns + long note text: nothing may be cut. Every column
    # heading must survive in the PDF text, and the PPTX deck must have at
    # least one slide per column figure.
    ch = build_num_distr(_profile(n_numeric=8), {})
    assert ch is not None
    names = [b.text for b in ch.blocks if b.kind == "heading" and b.level == 2]
    assert len(names) == 8
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "num.pdf")
        res_pdf = render_automatic_eda_pdf(_profile(n_numeric=8), pdf,
                                           {"write_manifest": False})
        assert res_pdf["path"] == pdf
        txt = _pdf_text(pdf)
        for name in names:
            assert name in txt, f"columna '{name}' cortada/ausente en el PDF"
        pptx = os.path.join(d, "num.pptx")
        res_pptx = render_automatic_eda_pptx(_profile(n_numeric=8), pptx,
                                             {"write_manifest": False})
        assert res_pptx["path"] == pptx
        assert res_pptx["n_slides"] >= 8  # at least one slide per column figure.


def test_distribution_gloss_cubre_todas_las_etiquetas():
    # Every label listed here must have a non-empty Spanish gloss.
    for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail",
                  "lognormal-ish", "multimodal", "discrete", "too_few_samples",
                  "other"):
        assert label in _DIST_GLOSS and _DIST_GLOSS[label]