merge: 4b num_distr — desv std (sigma) en leyenda del histograma (verificado met)

This commit is contained in:
2026-06-30 18:06:46 +02:00
2 changed files with 65 additions and 6 deletions
@@ -1,9 +1,10 @@
"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA. """Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
For every numeric column the chapter draws, as a single indivisible figure, a For every numeric column the chapter draws, as a single indivisible figure, a
histogram with the **mean, median and ±1σ band drawn as reference lines** and a histogram with the **mean, median and ±1σ band drawn as reference lines** (the
**Tukey boxplot right below it** sharing the same X axis — exactly the user legend reports the numeric value of the mean, the median **and the standard
requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block deviation σ**) and a **Tukey boxplot right below it** sharing the same X axis —
exactly the user requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
so the renderers rasterize and scale it to fit a whole page/slide and nothing is so the renderers rasterize and scale it to fit a whole page/slide and nothing is
ever cut; columns with many numerics simply flow across pages as small ever cut; columns with many numerics simply flow across pages as small
multiples. multiples.
@@ -34,7 +35,7 @@ try:
except Exception: # noqa: BLE001 — keep the chapter importable no matter what. except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
build_boxplot_stats = None # type: ignore[assignment] build_boxplot_stats = None # type: ignore[assignment]
CHAPTER_VERSION = "1.1.0" CHAPTER_VERSION = "1.2.0"
CHAPTER_ID = "num_distr" CHAPTER_ID = "num_distr"
CHAPTER_TITLE = "Distribuciones numéricas" CHAPTER_TITLE = "Distribuciones numéricas"
@@ -140,9 +141,11 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
std = numeric.get("std") std = numeric.get("std")
# ±1σ band first (behind the lines), then median (solid) and mean (dashed). # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
# The band's legend entry also reports the numeric value of the standard
# deviation, so the reader sees mean, median AND σ at a glance.
if mean is not None and std is not None and std > 0: if mean is not None and std is not None and std > 0:
ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22, ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
zorder=1, label="±1σ") zorder=1, label=f"±1σ (σ = {_fmt_num(std)})")
if median is not None: if median is not None:
ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6, ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
zorder=4, label=f"mediana = {_fmt_num(median)}") zorder=4, label=f"mediana = {_fmt_num(median)}")
@@ -152,7 +155,19 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
ax_h.set_ylabel("frecuencia", fontsize=8) ax_h.set_ylabel("frecuencia", fontsize=8)
ax_h.tick_params(labelsize=7) ax_h.tick_params(labelsize=7)
ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85) # Always surface σ in the legend: if the ±1σ band could not be drawn (no mean
# or std<=0) but σ is still known, add a label-only proxy handle so the value
# of the standard deviation is reported regardless of the band.
handles, labels = ax_h.get_legend_handles_labels()
if std is not None and not any("σ =" in lbl for lbl in labels):
from matplotlib.lines import Line2D
proxy = Line2D([], [], linestyle="none", marker="",
label=f"σ = {_fmt_num(std)}")
handles.append(proxy)
labels.append(f"σ = {_fmt_num(std)}")
if handles:
ax_h.legend(handles, labels, fontsize=6.5, loc="upper right",
framealpha=0.85)
for spine in ("top", "right"): for spine in ("top", "right"):
ax_h.spines[spine].set_visible(False) ax_h.spines[spine].set_visible(False)
@@ -159,6 +159,50 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx():
assert res_pptx["n_slides"] >= 8 # at least one slide per column figure. assert res_pptx["n_slides"] >= 8 # at least one slide per column figure.
def _hist_legend_texts(numeric, box=None):
    """Build the per-column figure and return its histogram-legend label texts.

    Args:
        numeric: Numeric-summary dict forwarded to ``_make_hist_box``.
        box: Optional boxplot-stats dict; an empty dict is forwarded when this
            is ``None`` (or otherwise falsy).

    Returns:
        The label strings of the legend attached to the figure's first axis,
        or an empty list when that axis has no legend.
    """
    from datascience.automatic_eda.chapters.num_distr import _make_hist_box
    import matplotlib.pyplot as plt

    fig = _make_hist_box("col", numeric, box or {})
    try:
        ax_h = fig.axes[0]  # the histogram is the top axis.
        legend = ax_h.get_legend()
        if legend is None:
            return []
        return [text.get_text() for text in legend.get_texts()]
    finally:
        # Close the figure even when the inspection above raises, so a failing
        # test does not leave an open figure behind for the rest of the run.
        plt.close(fig)
def test_golden_leyenda_histograma_reporta_valor_std():
    """Golden case: the legend shows the σ value alongside mean and median."""
    legend_entries = _hist_legend_texts(
        _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
    )

    # The σ entry must exist and the expected number must be printed somewhere
    # in the legend.
    sigma_reported = any("σ =" in entry for entry in legend_entries)
    assert sigma_reported, f"σ value missing in legend: {legend_entries}"
    value_reported = any("12.3" in entry for entry in legend_entries)
    assert value_reported, f"std value 12.3 not in legend: {legend_entries}"

    # Mean and median keep their own legend entries.
    for prefix in ("media =", "mediana ="):
        assert any(prefix in entry for entry in legend_entries)
def test_edge_std_en_leyenda_aunque_no_haya_banda():
    """Edge case: σ is still listed in the legend when the ±1σ band is off."""
    block = _numeric_block(42.5, 40.0, 7.5, 1.0, 100.0, "right-skewed", 0)
    # Dropping the mean disables the band; the σ value is known and must
    # nevertheless show up as a legend entry.
    block["mean"] = None

    legend_entries = _hist_legend_texts(block)
    sigma_entries = [entry for entry in legend_entries if "σ = 7.5" in entry]
    assert sigma_entries, f"σ proxy missing: {legend_entries}"
def test_edge_sin_std_no_revienta_la_figura():
    """Edge case: a numeric block without σ builds fine and omits the σ entry."""
    numeric = _numeric_block(42.5, 40.0, 0.0, 1.0, 100.0, "discrete", 0)
    numeric["std"] = None  # σ unknown: no band and no σ legend entry expected.

    # Building the figure must not raise; the helper also closes it.
    texts = _hist_legend_texts(numeric)

    assert not any("σ =" in t for t in texts)
    # mean/median lines still produce their own legend entries.
    assert any("media =" in t for t in texts)
def test_distribution_gloss_cubre_todas_las_etiquetas(): def test_distribution_gloss_cubre_todas_las_etiquetas():
# Every label detect_distribution_type can emit has a Spanish gloss. # Every label detect_distribution_type can emit has a Spanish gloss.
for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail", for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail",