diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr.py b/python/functions/datascience/automatic_eda/chapters/num_distr.py
index 67a47779..5890b123 100644
--- a/python/functions/datascience/automatic_eda/chapters/num_distr.py
+++ b/python/functions/datascience/automatic_eda/chapters/num_distr.py
@@ -1,9 +1,10 @@
 """Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
 
 For every numeric column the chapter draws, as a single indivisible figure, a
-histogram with the **mean, median and ±1σ band drawn as reference lines** and a
-**Tukey boxplot right below it** sharing the same X axis — exactly the user
-requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
+histogram with the **mean, median and ±1σ band drawn as reference lines** (the
+legend reports the numeric value of the mean, the median **and the standard
+deviation σ**) and a **Tukey boxplot right below it** sharing the same X axis —
+exactly the user requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
 so the renderers rasterize and scale it to fit a whole page/slide and nothing is
 ever cut; columns with many numerics simply flow across pages as small
 multiples.
@@ -34,7 +35,7 @@ try:
 except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
     build_boxplot_stats = None  # type: ignore[assignment]
 
-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "num_distr"
 CHAPTER_TITLE = "Distribuciones numéricas"
 
@@ -140,9 +141,11 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
     std = numeric.get("std")
 
     # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
+    # The band's legend entry also reports the numeric value of the standard
+    # deviation, so the reader sees mean, median AND σ at a glance.
     if mean is not None and std is not None and std > 0:
         ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
-                     zorder=1, label="±1σ")
+                     zorder=1, label=f"±1σ (σ = {_fmt_num(std)})")
     if median is not None:
         ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
                      zorder=4, label=f"mediana = {_fmt_num(median)}")
@@ -152,7 +155,19 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
 
     ax_h.set_ylabel("frecuencia", fontsize=8)
     ax_h.tick_params(labelsize=7)
-    ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
+    # Always surface σ in the legend: if the ±1σ band could not be drawn (no mean
+    # or std<=0) but σ is still known, add a label-only proxy handle so the value
+    # of the standard deviation is reported regardless of the band.
+    handles, labels = ax_h.get_legend_handles_labels()
+    if std is not None and not any("σ =" in lbl for lbl in labels):
+        from matplotlib.lines import Line2D
+        sigma_label = f"σ = {_fmt_num(std)}"  # built once: handle and text agree.
+        handles.append(Line2D([], [], linestyle="none", marker="",
+                              label=sigma_label))
+        labels.append(sigma_label)
+    if handles:
+        ax_h.legend(handles, labels, fontsize=6.5, loc="upper right",
+                    framealpha=0.85)
     for spine in ("top", "right"):
         ax_h.spines[spine].set_visible(False)
 
diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py
index 71793ad1..280cff17 100644
--- a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py
+++ b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py
@@ -159,6 +159,50 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx():
     assert res_pptx["n_slides"] >= 8  # at least one slide per column figure.
 
 
+def _hist_legend_texts(numeric, box=None):
+    """Build the per-column figure and return its histogram-legend label texts."""
+    from datascience.automatic_eda.chapters.num_distr import _make_hist_box
+    import matplotlib.pyplot as plt
+    fig = _make_hist_box("col", numeric, box or {})
+    try:
+        # The histogram is the top axis; with no legend the result is empty.
+        leg = fig.axes[0].get_legend()
+        return [t.get_text() for t in leg.get_texts()] if leg else []
+    finally:
+        plt.close(fig)  # release the figure even if the lookup above raises.
+
+
+def test_golden_leyenda_histograma_reporta_valor_std():
+    # The histogram legend must report the numeric value of the standard
+    # deviation σ next to mean and median.
+    numeric = _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
+    texts = _hist_legend_texts(numeric)
+    joined = " ".join(texts)
+    assert any("σ =" in t for t in texts), f"σ value missing in legend: {texts}"
+    assert "12.3" in joined, f"std value 12.3 not in legend: {texts}"
+    assert any("media =" in t for t in texts)
+    assert any("mediana =" in t for t in texts)
+
+
+def test_edge_std_en_leyenda_aunque_no_haya_banda():
+    # When the ±1σ band cannot be drawn (no mean) but σ is known, the legend
+    # still surfaces the σ value via a label-only proxy handle.
+    numeric = _numeric_block(42.5, 40.0, 7.5, 1.0, 100.0, "right-skewed", 0)
+    numeric["mean"] = None  # forces the band off; σ must still appear.
+    texts = _hist_legend_texts(numeric)
+    assert any("σ = 7.5" in t for t in texts), f"σ proxy missing: {texts}"
+
+
+def test_edge_sin_std_no_revienta_la_figura():
+    # A numeric block without σ must not raise and simply omits the σ entry.
+    numeric = _numeric_block(42.5, 40.0, 0.0, 1.0, 100.0, "discrete", 0)
+    numeric["std"] = None
+    texts = _hist_legend_texts(numeric)
+    assert not any("σ =" in t for t in texts)
+    # mean/median lines still produce their own legend entries.
+    assert any("media =" in t for t in texts)
+
+
 def test_distribution_gloss_cubre_todas_las_etiquetas():
     # Every label detect_distribution_type can emit has a Spanish gloss.
     for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail",