Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c1a4a83717 | |||
| fcf5a4c6a3 |
@@ -1,352 +0,0 @@
|
||||
"""Correlation chapter — association matrix plus top positive/negative pairs.
|
||||
|
||||
Builds the CORRELACION chapter of an AutomaticEDA document from a TableProfile.
|
||||
It renders exactly what the user asked for:
|
||||
|
||||
1. A correlation/association **matrix** (heatmap) reconstructed from the evaluated
|
||||
pairs, signed for numeric-numeric pairs (Pearson/Spearman, ``[-1, 1]``) and as
|
||||
magnitude for the mixed-type metrics (Cramér's V, correlation ratio, mutual
|
||||
information, ``[0, 1]``). Labels are ordered by total connectivity so strong
|
||||
associations cluster together instead of being scattered alphabetically.
|
||||
2. The **TOP positive** pairs and the **TOP negative** pairs as two separate
|
||||
tables. Only numeric-numeric metrics carry a sign, so negative pairs are by
|
||||
construction Pearson/Spearman; positive pairs may use any method.
|
||||
3. The methods legend and the multiple-testing (FDR) summary, so the reader sees
|
||||
how many pairs survive the correction.
|
||||
4. A spuriousness caveat when the profile flags level-based correlations on
|
||||
non-stationary series (Granger–Newbold).
|
||||
|
||||
All data comes from ``profile['correlations']`` — the output of the ``eda`` group
|
||||
function ``association_matrix`` (optionally enriched by ``profile_table``). The
|
||||
chapter never recomputes any statistic; it only lays the existing values out as
|
||||
format-independent blocks. The renderers paginate tables (repeating the header)
|
||||
and scale the heatmap to fit entirely, so nothing is ever cut.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "correlacion"
|
||||
CHAPTER_TITLE = "Correlación"
|
||||
|
||||
# Methods whose value carries a sign (direction). Everything else is a magnitude
|
||||
# in [0, 1] and therefore only ever contributes to the positive side.
|
||||
_SIGNED_METHODS = ("pearson", "spearman")
|
||||
|
||||
# Cap the heatmap to the most-connected variables so it stays legible on a phone
|
||||
# screen / a slide. The renderer would scale a bigger matrix to fit, but the
|
||||
# cells become unreadable; we instead show the top-N and say so.
|
||||
_MAX_MATRIX_LABELS = 16
|
||||
|
||||
# How many pairs to show in each of the top-positive / top-negative tables.
|
||||
_TOP_N = 10
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True for a real, finite int/float (not bool, not NaN/inf)."""
|
||||
return (
|
||||
isinstance(v, (int, float))
|
||||
and not isinstance(v, bool)
|
||||
and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v)))
|
||||
)
|
||||
|
||||
|
||||
def _fmt_val(value, decimals: int = 2) -> str:
    """Format an association value with an explicit sign.

    Args:
        value: the association value; anything ``_is_num`` rejects (None,
            bool, NaN, inf, non-numbers) is rendered as an em dash.
        decimals: number of decimal places to keep.

    Returns:
        The signed fixed-point text (e.g. ``"+0.48"``, ``"-0.78"``), or an
        unsigned zero (e.g. ``"0.00"``) when the value rounds to zero.
    """
    if not _is_num(value):
        return "—"
    text = f"{float(value):+.{decimals}f}"
    # A value that rounds to zero would print as "+0.00" / "-0.00"; drop the
    # sign. The comparison is built from ``decimals`` so the clean-up also
    # works when the caller asks for a precision other than 2.
    unsigned_zero = f"{0.0:.{decimals}f}"
    if text[1:] == unsigned_zero:
        return unsigned_zero
    return text
|
||||
|
||||
|
||||
def _fmt_p(value) -> str:
    """Render an adjusted p-value with three decimals.

    Values below 0.001 are shown as the threshold ``"<0.001"``; anything that
    ``_is_num`` rejects is shown as an em dash.
    """
    if _is_num(value):
        number = float(value)
        return f"{number:.3f}" if number >= 0.001 else "<0.001"
    return "—"
|
||||
|
||||
|
||||
def _is_signed(pair: dict) -> bool:
    """Report whether the pair's method name mentions a signed metric.

    The ``method`` field is lower-cased and searched for each entry of
    ``_SIGNED_METHODS`` as a substring, so a combined name such as
    ``"pearson/spearman"`` matches.
    """
    raw = pair.get("method") or ""
    name = str(raw).lower()
    for token in _SIGNED_METHODS:
        if token in name:
            return True
    return False
|
||||
|
||||
|
||||
def _significant(pair: dict) -> bool:
|
||||
"""True if the pair is significant after FDR (or has no test to correct)."""
|
||||
if pair.get("significant") is True:
|
||||
return True
|
||||
# Pairs without an applicable test (p_value None) are not penalised: they are
|
||||
# admitted on magnitude alone upstream, so treat missing as "not rejected".
|
||||
return pair.get("p_value") is None and pair.get("significant") is None
|
||||
|
||||
|
||||
def _label(pair: dict) -> str:
    """Join both variable names of a pair into one label ('a ↔ b')."""
    left = model._safe_str(pair.get("a"))
    right = model._safe_str(pair.get("b"))
    return f"{left} ↔ {right}"
|
||||
|
||||
|
||||
def _split_top(pairs: list, top_n: int = _TOP_N):
    """Rank the evaluated pairs into a top-positive and a top-negative list.

    Entries that are not dicts or whose ``value`` is not a finite number are
    ignored. Any pair with a value above zero is a positive candidate; a pair
    with a value below zero is a negative candidate only when its method is
    signed. Positives are ordered from largest to smallest, negatives from
    most negative upwards; each list is cut to ``top_n`` entries.

    Returns:
        A ``(positive, negative)`` tuple of lists of pair dicts.
    """
    def _value(item):
        return float(item.get("value", 0.0))

    usable = [
        item for item in pairs
        if isinstance(item, dict) and _is_num(item.get("value"))
    ]
    above = [item for item in usable if item.get("value") > 0]
    below = [
        item for item in usable
        if item.get("value") < 0 and _is_signed(item)
    ]
    ranked_up = sorted(above, key=_value, reverse=True)
    ranked_down = sorted(below, key=_value)
    return ranked_up[:top_n], ranked_down[:top_n]
|
||||
|
||||
|
||||
def _top_table(pairs: list, title: str):
    """Turn a ranked list of pairs into a ``model.DataTable``.

    Returns ``None`` for an empty list. Each row holds the pair label, the
    method name (em dash when empty), the signed value, the adjusted p-value
    and a yes/no significance marker.
    """
    if not pairs:
        return None

    def _row(item):
        method_name = model._safe_str(item.get("method")) or "—"
        verdict = "sí" if _significant(item) else "no"
        return [
            _label(item),
            method_name,
            _fmt_val(item.get("value")),
            _fmt_p(item.get("p_value_adjusted")),
            verdict,
        ]

    return model.DataTable(
        header=["Par", "Método", "Valor", "p (FDR)", "Sig."],
        rows=[_row(item) for item in pairs],
        title=title,
    )
|
||||
|
||||
|
||||
def _ordered_labels(pairs: list):
    """Choose the matrix axis labels, strongest-connected variables first.

    For every dict pair with a finite value, the absolute value is added to
    the running total of both of its variables (``a`` and ``b``; a ``None``
    name is skipped). Variables are then sorted by that total, descending,
    and the list is cut to ``_MAX_MATRIX_LABELS`` entries.

    Returns:
        ``(labels, trimmed)`` where ``trimmed`` is True when the cap removed
        at least one variable. ``([], False)`` when nothing was accumulated.
    """
    totals = {}
    for item in pairs:
        if isinstance(item, dict) and _is_num(item.get("value")):
            weight = abs(float(item.get("value")))
            for endpoint in (item.get("a"), item.get("b")):
                if endpoint is not None:
                    totals[endpoint] = totals.get(endpoint, 0.0) + weight
    if not totals:
        return [], False
    ranking = sorted(totals, key=lambda var: totals[var], reverse=True)
    return ranking[:_MAX_MATRIX_LABELS], len(ranking) > _MAX_MATRIX_LABELS
|
||||
|
||||
|
||||
def _matrix_figure(pairs: list, labels: list):
    """Return a lazy ``model.Figure`` with the association heatmap, or None.

    Args:
        pairs: the evaluated pairs (dicts with ``a``, ``b`` and ``value``).
        labels: the variable names to place on both axes, in display order.

    Returns:
        ``None`` when fewer than two labels are given; otherwise a
        ``model.Figure`` whose ``make`` callable builds the matplotlib figure.
        numpy and matplotlib are imported inside ``make`` so importing this
        module never requires them.
    """
    if len(labels) < 2:
        return None

    index = {name: i for i, name in enumerate(labels)}

    def make():
        import matplotlib
        import numpy as np
        from matplotlib.figure import Figure

        n = len(labels)
        # NaN marks "pair not evaluated"; the diagonal is a variable with itself.
        grid = np.full((n, n), np.nan, dtype=float)
        np.fill_diagonal(grid, 1.0)
        for pair in pairs:
            if not isinstance(pair, dict):
                continue
            a = pair.get("a")
            b = pair.get("b")
            value = pair.get("value")
            if a not in index or b not in index or not _is_num(value):
                continue
            v = float(value)
            # The matrix is symmetric: fill both triangles. Unsigned metrics
            # are non-negative and are stored as-is.
            ia, ib = index[a], index[b]
            grid[ia, ib] = v
            grid[ib, ia] = v

        masked = np.ma.masked_invalid(grid)
        fig = Figure(figsize=(6.2, 5.6))
        ax = fig.add_subplot(111)
        # "RdBu" runs red (low) -> blue (high), which is what the caption
        # promises (blue = positive, red = negative). The reversed "RdBu_r"
        # map paints the opposite and contradicts the caption.
        cmap = matplotlib.colormaps["RdBu"].copy()
        cmap.set_bad(color="#eeeeee")
        im = ax.imshow(masked, cmap=cmap, vmin=-1.0, vmax=1.0, aspect="auto")
        short = [str(s)[:14] for s in labels]
        ax.set_xticks(range(n))
        ax.set_yticks(range(n))
        ax.set_xticklabels(short, rotation=90, fontsize=7)
        ax.set_yticklabels(short, fontsize=7)
        # Annotate cells only when the matrix is small enough to stay legible.
        if n <= 8:
            for i in range(n):
                for j in range(n):
                    cell = grid[i, j]
                    if np.isfinite(cell):
                        ax.text(j, i, f"{cell:.2f}",
                                ha="center", va="center", fontsize=6,
                                color="#222222")
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04,
                     label="asociación (signo en num-num)")
        fig.tight_layout()
        return fig

    return model.Figure(make=make,
                        caption="Matriz de asociación. Azul = positiva, rojo = "
                                "negativa (sólo num-num lleva signo); gris = par "
                                "no evaluado.")
|
||||
|
||||
|
||||
def _methods_block(corr: dict):
    """Build the methods legend as a ``model.KVTable``.

    Reads ``corr['methods_legend']``; returns ``None`` unless it is a
    non-empty dict. Keys and descriptions are passed through
    ``model._safe_str``.
    """
    legend = corr.get("methods_legend")
    if isinstance(legend, dict) and legend:
        entries = []
        for key, description in legend.items():
            entries.append((model._safe_str(key), model._safe_str(description)))
        return model.KVTable(rows=entries, title="Métodos de asociación")
    return None
|
||||
|
||||
|
||||
def _fdr_text(corr: dict) -> str | None:
    """Summarise the multiple-testing correction in one or two sentences.

    Reads ``corr['multiple_testing']`` (keys ``method``, ``alpha``,
    ``n_tests``, ``n_rejected``).

    Returns:
        ``None`` when the block is missing, empty or not a dict; otherwise the
        summary text. The counts sentence is added only when ``n_tests`` is a
        finite number.
    """
    mt = corr.get("multiple_testing")
    if not isinstance(mt, dict) or not mt:
        return None
    method = model._safe_str(mt.get("method")).upper() or "FDR"
    alpha = mt.get("alpha")
    n_tests = mt.get("n_tests")
    n_rej = mt.get("n_rejected")

    lead = f"Corrección por comparaciones múltiples ({method}"
    if _is_num(alpha):
        lead += f", α={float(alpha):g}"
    lead += ")."
    sentences = [lead]

    if _is_num(n_tests):
        # Both figures are counts: format them the same way so a float such
        # as 5.0 does not leak into the text as "5.0".
        rejected = int(n_rej) if _is_num(n_rej) else "—"
        sentences.append(
            f"De {int(n_tests)} pares con test, {rejected} siguen siendo "
            f"significativos tras la corrección.")
    return " ".join(sentences)
|
||||
|
||||
|
||||
def build_correlacion(profile: dict, ctx: dict):
    """Build the correlation Chapter, or None if there is nothing to show.

    Args:
        profile: the TableProfile dict; only ``profile['correlations']`` is
            read (keys ``pairs``, ``levels_caveat``,
            ``levels_possible_spurious``, ``multiple_testing``,
            ``methods_legend``).
        ctx: presentation context; accepted for the chapter contract but not
            consulted here.

    Returns:
        ``None`` when the profile is not a dict, carries no ``correlations``
        dict, has no non-empty ``pairs`` list, or when none of the pairs is
        usable (no matrix and no top table can be built). Otherwise a
        ``model.Chapter`` with, in order: intro, association matrix, top
        positive / negative tables, spuriousness caveat, FDR summary and
        methods legend — each present only when its data is.
    """
    # A non-dict profile (including None) simply has no chapter.
    if not isinstance(profile, dict):
        return None
    corr = profile.get("correlations")
    if not isinstance(corr, dict):
        return None
    pairs = corr.get("pairs")
    if not isinstance(pairs, list) or not pairs:
        return None

    labels, trimmed = _ordered_labels(pairs)
    fig = _matrix_figure(pairs, labels)
    positive, negative = _split_top(pairs, _TOP_N)
    pos_table = _top_table(positive, f"Top {len(positive)} positivas")
    neg_table = _top_table(negative, f"Top {len(negative)} negativas")

    # The intro alone is not a chapter: without a matrix or a table there is
    # no association to report, so omit the section instead of emitting
    # boilerplate around nothing.
    if fig is None and pos_table is None and neg_table is None:
        return None

    blocks: list = []

    # Intro: what this chapter shows and how to read the sign.
    blocks.append(model.Markdown(text=(
        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
        "entre categóricas; razón de correlación num-categórica; información mutua "
        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
        "dirección: por eso los pares **negativos** son siempre num-num.")))

    # 1) Association matrix (heatmap).
    if fig is not None:
        blocks.append(model.Heading(text="Matriz de asociación", level=2))
        blocks.append(fig)
        if trimmed:
            blocks.append(model.Note(text=(
                f"Se muestran las {len(labels)} variables más conectadas de la "
                "matriz para mantenerla legible; el resto de pares siguen en las "
                "tablas de abajo.")))

    # 2) Top positive / top negative pairs.
    if pos_table is not None:
        blocks.append(model.Heading(text="Pares más correlacionados (positivos)",
                                    level=2))
        blocks.append(pos_table)
    if neg_table is not None:
        blocks.append(model.Heading(text="Pares más correlacionados (negativos)",
                                    level=2))
        blocks.append(neg_table)
    elif pos_table is not None:
        # No signed-negative pairs at all: say so rather than omit silently.
        blocks.append(model.Note(text=(
            "No se han hallado correlaciones negativas significativas entre "
            "columnas numéricas.")))

    # 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
    caveat = corr.get("levels_caveat")
    if isinstance(caveat, str) and caveat.strip():
        blocks.append(model.Note(text=caveat.strip()))
    elif corr.get("levels_possible_spurious"):
        blocks.append(model.Note(text=(
            "Aviso: algunas correlaciones se calcularon sobre niveles de series "
            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
            "sobre los retornos/diferencias antes de interpretarlas.")))

    # 4) FDR summary + methods legend.
    fdr_text = _fdr_text(corr)
    if fdr_text:
        blocks.append(model.Markdown(text=fdr_text))
    methods = _methods_block(corr)
    if methods is not None:
        blocks.append(model.Heading(text="Métodos y leyenda", level=2))
        blocks.append(methods)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -1,175 +0,0 @@
|
||||
"""Tests for the CORRELACION chapter — DoD: golden + edges + error/anti-cut.
|
||||
|
||||
Self-contained: builds a synthetic TableProfile carrying a ``correlations`` block
|
||||
shaped exactly like ``association_matrix`` output (no DuckDB), so the suite is
|
||||
fast and deterministic. Verifies that the chapter emits the association-matrix
|
||||
figure plus separate top-positive / top-negative tables with the right pairs,
|
||||
that it returns None when the profile has no pairs, that a None/empty profile
|
||||
does not raise, and that a wide matrix with long labels renders to PDF *and* PPTX
|
||||
without cutting anything.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from datascience.automatic_eda.chapters.correlacion import (
|
||||
CHAPTER_VERSION,
|
||||
build_correlacion,
|
||||
)
|
||||
from datascience.automatic_eda.model import DataTable, Figure
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _pair(a, b, value, method, padj, sig, p=0.0001):
|
||||
return {
|
||||
"a": a, "b": b, "a_type": "numeric", "b_type": "numeric",
|
||||
"method": method, "value": value, "extra": {"mi": abs(value) * 0.5},
|
||||
"p_value": p, "p_value_adjusted": padj, "significant": sig,
|
||||
}
|
||||
|
||||
|
||||
def _profile() -> dict:
    """Return a synthetic wine-like profile mixing signed and unsigned pairs."""
    signed = "pearson/spearman"
    # Unsigned mixed-type metric: only ever positive, never in the neg table.
    categorical_pair = {
        "a": "region", "b": "type", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.55,
        "extra": {"mi": 0.3}, "p_value": 0.001, "p_value_adjusted": 0.004,
        "significant": True,
    }
    pairs = [
        _pair("alcohol", "quality", 0.48, signed, 0.0005, True),
        _pair("density", "alcohol", -0.78, signed, 0.0001, True),
        _pair("ph", "fixed_acidity", -0.68, signed, 0.0002, True),
        _pair("sulphates", "quality", 0.25, signed, 0.03, True),
        categorical_pair,
    ]
    correlations = {
        "pairs": pairs,
        "strong": [item for item in pairs if abs(item["value"]) >= 0.5],
        "methods_legend": {
            "pearson": "num-num lineal (Pearson r), [-1, 1]",
            "cramers_v": "cat-cat simétrica (Cramér's V), [0, 1]",
        },
        "multiple_testing": {"method": "bh", "alpha": 0.05,
                             "n_tests": 5, "n_rejected": 5},
    }
    return {
        "table": "wine",
        "source": "/data/wine.csv",
        "n_rows": 1599,
        "n_cols": 12,
        "correlations": correlations,
    }
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Extract the text of every PDF page, with whitespace runs collapsed."""
    reader = PdfReader(path)
    chunks = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(chunks))
|
||||
|
||||
|
||||
def test_golden_chapter_tiene_matriz_y_top_positivos_y_negativos():
    """Golden path: the chapter carries the heatmap and both top tables."""
    chapter = build_correlacion(_profile(), {})
    assert chapter is not None
    assert chapter.id == "correlacion"
    assert chapter.version == CHAPTER_VERSION

    # Association matrix heatmap, emitted as a lazy figure.
    assert "figure" in [block.kind for block in chapter.blocks]
    figures = [block for block in chapter.blocks if isinstance(block, Figure)]
    assert figures and figures[0].make is not None

    # Top positive + top negative tables.
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    assert len(tables) >= 2
    cells = [str(cell) for table in tables for row in table.rows for cell in row]
    flat = " ".join(cells)
    # Strongest positive present and signed +, strongest negative present and -.
    assert "alcohol" in flat and "quality" in flat
    assert "+0.48" in flat
    assert "density" in flat and "-0.78" in flat
|
||||
|
||||
|
||||
def test_golden_render_pdf_y_pptx_muestran_lo_exigido():
    """Golden path end to end: both renderers include the chapter content."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "corr.pdf")
        pptx_path = os.path.join(workdir, "corr.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA — wine"})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA — wine"})

        assert pdf_report["path"] == pdf_path and pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path and pptx_report["n_slides"] >= 1
        for report in (pdf_report, pptx_report):
            assert "correlacion" in [ch["id"] for ch in report["chapters"]]

        text = _pdf_text(pdf_path)
        # The requirement: matrix + top positive/negative pairs, all visible.
        assert "Correlaci" in text  # chapter title (accents may vary in extract).
        for token in ("density", "alcohol", "quality", "0.78", "0.48"):
            assert token in text
        # Both signs surfaced as separate sections.
        lowered = text.lower()
        assert "positiv" in lowered and "negativ" in lowered
|
||||
|
||||
|
||||
def test_edge_sin_pares_devuelve_none():
    """Missing key, empty pairs and wrong types all yield None, never an error."""
    pairless_profiles = (
        {"table": "x"},
        {"correlations": {}},
        {"correlations": {"pairs": []}},
        {"correlations": {"pairs": "nope"}},
        {},
    )
    for candidate in pairless_profiles:
        assert build_correlacion(candidate, {}) is None
    assert build_correlacion(None, None) is None
|
||||
|
||||
|
||||
def test_edge_solo_positivos_emite_nota_sin_tabla_negativa():
    """Only positive pairs: one table plus a 'no negatives' note."""
    categorical_pair = {
        "a": "c", "b": "d", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.7,
        "extra": {"mi": 0.4}, "p_value": 0.001,
        "p_value_adjusted": 0.003, "significant": True,
    }
    numeric_pair = _pair("a", "b", 0.6, "pearson/spearman", 0.001, True)
    profile = {"correlations": {"pairs": [numeric_pair, categorical_pair]}}

    chapter = build_correlacion(profile, {})
    assert chapter is not None
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    assert len(tables) == 1  # only the positive table.
    note_text = " ".join(
        block.text for block in chapter.blocks if block.kind == "note")
    assert "negativas" in note_text  # the "no negative correlations" note.
|
||||
|
||||
|
||||
def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
    """Wide matrix with long labels: trimmed note shown, nothing truncated."""
    # 20 numeric vars with long names -> matrix trimmed to top-N, and both
    # renderers must lay the chapter out without raising.
    long_a = "concentracion_de_dioxido_de_azufre_libre"
    long_b = "concentracion_de_dioxido_de_azufre_total"
    pairs = [_pair(long_a, long_b, -0.72, "pearson/spearman", 0.0001, True)]
    pairs.extend(
        _pair(f"variable_numerica_larga_{i:02d}",
              f"variable_numerica_larga_{(i + 1) % 20:02d}",
              0.55 - i * 0.02, "pearson/spearman", 0.01, True)
        for i in range(20))
    testing = {"method": "bh", "alpha": 0.05,
               "n_tests": len(pairs), "n_rejected": len(pairs)}
    profile = {"correlations": {"pairs": pairs, "multiple_testing": testing}}

    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # A "showing top-N most connected" note appears when the matrix is trimmed.
    note_text = " ".join(
        block.text for block in chapter.blocks if block.kind == "note")
    assert "más conectadas" in note_text

    # Anti-cut guarantee at the block level: both long labels reach the
    # renderer whole, together, inside a single table cell.
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    cells = [str(cell) for table in tables for row in table.rows for cell in row]
    assert any(long_a in cell and long_b in cell for cell in cells)

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "wide.pdf")
        pptx_path = os.path.join(workdir, "wide.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"write_manifest": False})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"write_manifest": False})
        # Both renderers produce a non-empty document without raising.
        assert pdf_report["path"] == pdf_path
        assert os.path.exists(pdf_path) and pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path
        assert os.path.exists(pptx_path) and pptx_report["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf_path)
|
||||
@@ -0,0 +1,289 @@
|
||||
"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
|
||||
|
||||
For every numeric column the chapter draws, as a single indivisible figure, a
|
||||
histogram with the **mean, median and ±1σ band drawn as reference lines** and a
|
||||
**Tukey boxplot right below it** sharing the same X axis — exactly the user
|
||||
requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
|
||||
so the renderers rasterize and scale it to fit a whole page/slide and nothing is
|
||||
ever cut; datasets with many numeric columns simply flow across pages as small
|
||||
multiples.
|
||||
|
||||
Data comes from the ``eda`` group profile and is never recomputed here:
|
||||
|
||||
- ``columns[i]['numeric']`` (the output of ``describe_numeric``) gives
|
||||
``mean, median, std, min, max, p25, p75, iqr, n_outliers, outlier_pct,
|
||||
distribution_type`` and the ``histogram`` bins ``[{lo, hi, count}]``.
|
||||
- The boxplot five-number summary + Tukey 1.5·IQR fences are derived by the
|
||||
pure registry function ``build_boxplot_stats`` (group ``eda``); this chapter
|
||||
only consumes its output, it does not reimplement the statistics.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
Reads everything defensively (``.get``) and never raises: a column whose figure
|
||||
cannot be built is degraded to a short note instead of aborting the chapter.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
# Pure registry function (group ``eda``) that derives the Tukey boxplot stats
|
||||
# from a ``numeric`` sub-block. Imported defensively so the chapter still builds
|
||||
# (degrading the boxplot to a note) if the function is somehow unavailable.
|
||||
try:
|
||||
from datascience.build_boxplot_stats import build_boxplot_stats
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
build_boxplot_stats = None # type: ignore[assignment]
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "num_distr"
|
||||
CHAPTER_TITLE = "Distribuciones numéricas"
|
||||
|
||||
# Plain-Spanish gloss for every label ``detect_distribution_type`` can emit, so a
|
||||
# non-expert reader understands the shape and the suggested next step (MUST-4.3).
|
||||
_DIST_GLOSS = {
|
||||
"normal-ish": "aproximadamente simétrica (campana); media y mediana casi "
|
||||
"coinciden.",
|
||||
"right-skewed": "asimétrica a la derecha (cola larga hacia valores altos); "
|
||||
"la media supera a la mediana — considera una transformación "
|
||||
"logarítmica.",
|
||||
"left-skewed": "asimétrica a la izquierda (cola larga hacia valores bajos); "
|
||||
"la media queda por debajo de la mediana.",
|
||||
"heavy-tail": "colas pesadas (curtosis alta): más valores extremos de lo "
|
||||
"que esperaría una normal — vigila los outliers.",
|
||||
"lognormal-ish": "compatible con lognormal (simétrica al tomar logaritmos); "
|
||||
"la re-expresión log suele normalizarla.",
|
||||
"multimodal": "varios picos: probablemente mezcla de subgrupos — conviene "
|
||||
"segmentar antes de resumir con una sola media.",
|
||||
"discrete": "pocos valores distintos (discreta/ordinal); el histograma "
|
||||
"cuenta niveles, no un continuo.",
|
||||
"too_few_samples": "muestra demasiado pequeña para clasificar la forma con "
|
||||
"fiabilidad.",
|
||||
"other": "forma no encuadrada en las categorías estándar.",
|
||||
}
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
"""Compact, defensive number formatting shared with the other chapters."""
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return str(value)
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}".replace(",", ".")
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "NaN"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _numeric_columns(profile: dict) -> list:
|
||||
"""Return the list of (name, numeric_dict) for columns with usable stats."""
|
||||
out = []
|
||||
for col in profile.get("columns") or []:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
if col.get("inferred_type") != "numeric":
|
||||
continue
|
||||
num = col.get("numeric")
|
||||
if not isinstance(num, dict) or not num:
|
||||
continue
|
||||
# A numeric block is renderable when it carries at least a center.
|
||||
if num.get("mean") is None and num.get("median") is None:
|
||||
continue
|
||||
out.append((col.get("name") or "(columna)", num))
|
||||
return out
|
||||
|
||||
|
||||
def _make_hist_box(name: str, numeric: dict, box: dict):
    """Build the histogram (mean/median/±1σ references) + boxplot figure.

    The figure has two stacked axes sharing the X axis: the histogram on top
    and a horizontal boxplot below.

    Args:
        name: column name, used as the figure title and X label.
        numeric: the column's ``numeric`` stats; reads ``histogram`` (bins
            with ``lo``/``hi``/``count``), ``mean``, ``median`` and ``std``.
        box: boxplot stats; reads ``median``, ``q1``, ``q3``, ``whisker_lo``,
            ``whisker_hi``, ``has_low_outliers``/``min`` and
            ``has_high_outliers``/``max``. May be empty or ``None``.

    Returns:
        The matplotlib figure. Missing or unusable data degrades to an
        in-axes placeholder text instead of raising.
    """
    import math

    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    def _num(value):
        """Return ``value`` as a finite float, or None when it is unusable."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return number if math.isfinite(number) else None

    fig, (ax_h, ax_b) = plt.subplots(
        2, 1, figsize=(6.4, 3.4), sharex=True,
        gridspec_kw={"height_ratios": [3.2, 1.0], "hspace": 0.08})

    # ---- Histogram from the precomputed bins {lo, hi, count}. ----
    hist = numeric.get("histogram")
    if not isinstance(hist, (list, tuple)):
        hist = []
    drew_bars = False
    for b in hist:
        if not isinstance(b, dict):
            continue
        lo = _num(b.get("lo"))
        hi = _num(b.get("hi"))
        if lo is None or hi is None:
            continue  # a bin without usable edges cannot be placed.
        count = _num(b.get("count")) or 0
        # Degenerate bins (hi <= lo) still get a thin visible bar.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax_h.bar(lo, count, width=width, align="edge", color="#9ec6df",
                 edgecolor="#5b8aa6", linewidth=0.4, zorder=2)
        drew_bars = True
    if not drew_bars:
        ax_h.text(0.5, 0.5, "(sin histograma)", ha="center", va="center",
                  fontsize=9, color="#8a8a8a", transform=ax_h.transAxes)

    mean = _num(numeric.get("mean"))
    median = _num(numeric.get("median"))
    std = _num(numeric.get("std"))

    # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
    # Labels show the profile's own values; only the positions are coerced.
    if mean is not None and std is not None and std > 0:
        ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
                     zorder=1, label="±1σ")
    if median is not None:
        ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
                     zorder=4,
                     label=f"mediana = {_fmt_num(numeric.get('median'))}")
    if mean is not None:
        ax_h.axvline(mean, color="#c0392b", linestyle="--", linewidth=1.6,
                     zorder=4,
                     label=f"media = {_fmt_num(numeric.get('mean'))}")

    ax_h.set_ylabel("frecuencia", fontsize=8)
    ax_h.tick_params(labelsize=7)
    # Calling legend() with no labelled artist only produces a warning.
    handles, _unused_labels = ax_h.get_legend_handles_labels()
    if handles:
        ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
    for spine in ("top", "right"):
        ax_h.spines[spine].set_visible(False)

    # ---- Boxplot below, sharing the X axis. ----
    drew_box = False
    if isinstance(box, dict) and box:
        summary = {
            "med": _num(box.get("median")),
            "q1": _num(box.get("q1")),
            "q3": _num(box.get("q3")),
            "whislo": _num(box.get("whisker_lo")),
            "whishi": _num(box.get("whisker_hi")),
        }
        if all(v is not None for v in summary.values()):
            # No fliers are passed to bxp; outliers are marked separately
            # below from the min/max flags.
            stats = [dict(summary, fliers=[], label="")]
            bxp_kw = dict(
                showfliers=False, widths=0.5, patch_artist=True,
                boxprops={"facecolor": "#9ec6df", "edgecolor": "#5b8aa6"},
                medianprops={"color": "#2e8b57", "linewidth": 1.6},
                whiskerprops={"color": "#5b8aa6"},
                capprops={"color": "#5b8aa6"})
            try:
                # ``orientation`` is the current API; older matplotlib
                # rejects it with TypeError and takes ``vert`` instead.
                try:
                    ax_b.bxp(stats, orientation="horizontal", **bxp_kw)
                except TypeError:
                    ax_b.bxp(stats, vert=False, **bxp_kw)
                drew_box = True
            except Exception:  # noqa: BLE001 — never let one axis kill the figure.
                drew_box = False
        if drew_box:
            # Mark the presence of out-of-fence points at the extremes.
            for flag, key in (("has_low_outliers", "min"),
                              ("has_high_outliers", "max")):
                extreme = _num(box.get(key))
                if box.get(flag) and extreme is not None:
                    ax_b.plot([extreme], [1], marker="o", markersize=3.5,
                              color="#c0392b", zorder=5)
    if not drew_box:
        # Covers a missing box, an incomplete summary and a failed draw, so
        # the reader never gets an unexplained blank axis.
        ax_b.text(0.5, 0.5, "(boxplot no disponible)", ha="center", va="center",
                  fontsize=8, color="#8a8a8a", transform=ax_b.transAxes)

    ax_b.set_yticks([])
    ax_b.set_xlabel(name, fontsize=8)
    ax_b.tick_params(labelsize=7)
    for spine in ("top", "right", "left"):
        ax_b.spines[spine].set_visible(False)

    fig.suptitle(name, fontsize=10, fontweight="bold", x=0.02, ha="left")
    return fig
|
||||
|
||||
|
||||
def _stats_note(name: str, numeric: dict, box: dict) -> str:
    """Summarise one numeric column as a single text line plus an optional gloss.

    Args:
        name: column name; accepted by the signature but not read in the body.
        numeric: the column's ``numeric`` sub-block. The keys read here are
            ``mean``, ``median``, ``std``, ``min``, ``max``, ``iqr``,
            ``n_outliers``, ``outlier_pct`` and ``distribution_type``.
        box: boxplot statistics for the column; may be empty/falsy.

    Returns:
        The key figures joined with `` · ``. When ``distribution_type`` has a
        non-empty entry in ``_DIST_GLOSS``, a second Markdown paragraph with
        that gloss is appended.
    """
    # (label shown to the reader, key in ``numeric``), in display order.
    labelled_keys = (
        ("media", "mean"),
        ("mediana", "median"),
        ("σ", "std"),
        ("min", "min"),
        ("max", "max"),
        ("IQR", "iqr"),
    )
    parts = [f"{label} {_fmt_num(numeric.get(key))}"
             for label, key in labelled_keys]

    # The outlier count is only reported when present; the percentage is an
    # optional suffix (second ``_fmt_num`` argument presumably sets decimals).
    outlier_count = numeric.get("n_outliers")
    outlier_share = numeric.get("outlier_pct")
    if outlier_count is not None:
        suffix = "" if outlier_share is None else f" ({_fmt_num(outlier_share, 2)}%)"
        parts.append(f"outliers {outlier_count}{suffix}")

    # Fences are shown only when the box carries a lower fence.
    if box and box.get("lower_fence") is not None:
        low = _fmt_num(box.get("lower_fence"))
        high = _fmt_num(box.get("upper_fence"))
        parts.append(f"vallas Tukey [{low}, {high}]")

    note = " · ".join(parts)

    shape = numeric.get("distribution_type")
    explanation = _DIST_GLOSS.get(shape)
    if shape and explanation:
        note += f"\n\n**Forma ({shape}):** {explanation}"
    return note
|
||||
|
||||
|
||||
def _figure_maker(name: str, numeric: dict, box: dict):
    """Return a zero-argument callable that builds the figure for one column.

    The three values are captured as parameters of this factory call, so every
    returned callable keeps its own column even when the factory is invoked
    inside a loop whose variables are rebound afterwards. Nothing is drawn
    until the returned callable is invoked.
    """
    return lambda: _make_hist_box(name, numeric, box)
|
||||
|
||||
|
||||
def build_num_distr(profile: dict, ctx: dict):
    """Assemble the numeric-distributions chapter from a table profile.

    Args:
        profile: the ``eda`` group TableProfile dict; ``None`` is treated as an
            empty profile.
        ctx: presentation context; normalised but not read by this builder.

    Returns:
        A ``model.Chapter`` whose blocks are the chapter heading, an intro
        paragraph and, per numeric column, a level-2 heading, a lazily built
        histogram+boxplot figure and a stats note. ``None`` when
        ``_numeric_columns`` finds no numeric column.
    """
    profile = profile or {}
    ctx = ctx or {}  # kept for defensive symmetry with ``profile``; unused below.

    columns = _numeric_columns(profile)
    if not columns:
        # The chapter does not apply to a dataset without numeric columns.
        return None

    explanation = (
        "Para cada columna numérica se muestra su **histograma** con tres líneas "
        "de referencia: la **media** (línea roja discontinua), la **mediana** "
        "(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, "
        "alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del "
        "primer al tercer cuartil (P25–P75), la línea interior es la mediana y "
        "los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
        "valores más allá de las vallas. Comparar media y mediana revela la "
        "asimetría de la distribución.")

    chapter_blocks = [
        model.Heading(text=CHAPTER_TITLE, level=1),
        model.Markdown(text=explanation),
    ]

    for col_name, col_stats in columns:
        # Boxplot statistics are best-effort: the ``is not None`` guard suggests
        # the helper may be unavailable, and any failure degrades to "no box".
        tukey = {}
        if build_boxplot_stats is not None:
            try:
                tukey = build_boxplot_stats(col_stats) or {}
            except Exception:  # noqa: BLE001 — degrade, never raise.
                tukey = {}

        chapter_blocks.extend((
            model.Heading(text=str(col_name), level=2),
            model.Figure(
                make=_figure_maker(col_name, col_stats, tukey),
                caption=f"Distribución de «{col_name}» — histograma (media/mediana/±σ) "
                        f"y boxplot."),
            model.Markdown(text=_stats_note(col_name, col_stats, tukey)),
        ))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=chapter_blocks)
|
||||
@@ -0,0 +1,151 @@
|
||||
"""Tests for the NUM DISTR chapter — DoD: golden + edges + anti-cut.
|
||||
|
||||
Self-contained: builds synthetic ``numeric`` blocks (no DuckDB) so the suite is
|
||||
fast and deterministic. Verifies that the chapter emits, per numeric column, a
|
||||
histogram+boxplot figure plus a stats note; that the mean/median/±σ requirement
|
||||
and the boxplot are present; that a profile with no numeric column yields None;
|
||||
that None/empty never raises; and that with many numeric columns and long text
|
||||
both the PDF and the PPTX render without cutting anything (every column heading
|
||||
survives in the rendered output).
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from datascience.automatic_eda.chapters.num_distr import (
|
||||
build_num_distr, CHAPTER_VERSION, _DIST_GLOSS,
|
||||
)
|
||||
from datascience.automatic_eda import model
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _numeric_block(mean, median, std, mn, mx, dist="normal-ish",
                   n_outliers=0, nbins=10):
    """Build a synthetic ``numeric`` sub-block for the chapter tests.

    The histogram has ``nbins`` equal-width bins over ``[mn, mx]`` (width 1.0
    when the range is empty or inverted) with counts 3, 6, 9, ... The quartiles
    sit at 25 % and 75 % of the range, and ``outlier_pct`` is computed against a
    fixed total of 300 rows.
    """
    span = mx - mn
    step = span / nbins if mx > mn else 1.0

    bins = []
    for idx in range(nbins):
        bins.append({
            "lo": mn + idx * step,
            "hi": mn + (idx + 1) * step,
            "count": (idx + 1) * 3,
        })

    lower_q = mn + span * 0.25
    upper_q = mn + span * 0.75

    block = {"min": mn, "max": mx, "mean": mean, "median": median, "std": std}
    block.update({"p25": lower_q, "p50": median, "p75": upper_q,
                  "iqr": upper_q - lower_q})
    block.update({"n_outliers": n_outliers,
                  "outlier_pct": 100.0 * n_outliers / 300.0})
    block.update({"distribution_type": dist, "histogram": bins})
    return block
|
||||
|
||||
|
||||
def _profile(n_numeric=2, extra_categorical=True):
    """Build a synthetic table profile with ``n_numeric`` numeric columns.

    Columns cycle through four presets; once the presets are exhausted the
    column index is appended to the name so names stay unique. When
    ``extra_categorical`` is true, one categorical column is appended.
    """
    # (name, mean, median, std, min, max, distribution type, n_outliers)
    presets = [
        ("precio", 42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5),
        ("alcohol", 10.4, 10.3, 1.1, 8.0, 14.9, "normal-ish", 0),
        ("sulfatos", 0.66, 0.62, 0.17, 0.33, 2.0, "heavy-tail", 9),
        ("calidad", 5.6, 6.0, 0.8, 3.0, 8.0, "discrete", 0),
    ]

    columns = []
    for idx in range(n_numeric):
        base, *stats = presets[idx % len(presets)]
        label = base if idx < len(presets) else f"{base}_{idx}"
        columns.append({
            "name": label,
            "inferred_type": "numeric",
            "numeric": _numeric_block(*stats),
        })

    if extra_categorical:
        columns.append({
            "name": "categoria",
            "inferred_type": "categorical",
            "categorical": {"top": [{"value": "tinto", "count": 200}]},
        })

    return {"table": "vinos", "n_rows": 300, "n_cols": len(columns),
            "columns": columns}
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, whitespace-collapsed.

    Pages whose extraction yields nothing contribute an empty string; every run
    of whitespace in the concatenated text becomes a single space.
    """
    pages = PdfReader(path).pages
    raw = "".join(page.extract_text() or "" for page in pages)
    return re.sub(r"\s+", " ", raw)
|
||||
|
||||
|
||||
def test_golden_chapter_estructura_y_bloques():
    """Golden path: chapter identity, block layout and a real two-axes figure."""
    import matplotlib.pyplot as plt

    ch = build_num_distr(_profile(n_numeric=2), {})
    assert ch is not None
    assert ch.id == "num_distr"
    assert ch.version == CHAPTER_VERSION

    kinds = [b.kind for b in ch.blocks]
    # Heading + intro Markdown, then per column: Heading + Figure + Markdown.
    assert kinds[0] == "heading"
    assert kinds[1] == "markdown"
    assert kinds.count("figure") == 2  # one figure per numeric column.
    assert kinds.count("heading") == 1 + 2  # chapter title + one per column.

    # Each figure has a lazy maker that produces a real matplotlib figure.
    figs = [b for b in ch.blocks if b.kind == "figure"]
    fig = figs[0].make()
    try:
        assert fig is not None
        # Two stacked axes: histogram + boxplot share the figure.
        assert len(fig.axes) == 2
    finally:
        # Close even when an assertion fails, so a failing run does not leave
        # an open figure behind for the rest of the test session.
        if fig is not None:
            plt.close(fig)
|
||||
|
||||
|
||||
def test_golden_media_mediana_sigma_y_boxplot_presentes():
    """The Markdown blocks mention mean, median, σ, the Tukey boxplot and the gloss.

    The intro documents the three reference lines and the Tukey boxplot; the
    per-column note carries the actual mean/median/σ numbers and the shape.
    """
    ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
    md_texts = " ".join(b.text for b in ch.blocks if b.kind == "markdown")

    # Word boundaries matter: a plain ``"media" in md_texts`` is already
    # satisfied by "mediana", so it would pass with the mean missing.
    assert re.search(r"\bmedia\b", md_texts)
    assert re.search(r"\bmediana\b", md_texts)
    # "±1σ" contains "σ", so one membership test covers both spellings.
    assert "σ" in md_texts
    assert "boxplot" in md_texts.lower()
    assert "Tukey" in md_texts
    # distribution_type gloss surfaced for the column (right-skewed preset).
    assert _DIST_GLOSS["right-skewed"].split(";")[0][:20] in md_texts
|
||||
|
||||
|
||||
def test_boxplot_stats_se_consumen_del_registry():
    """``build_boxplot_stats`` accepts a chapter-style numeric block.

    Calls the helper directly on a synthetic block and checks that the result
    is non-empty and carries both Tukey fences and both quartiles.
    """
    from datascience.build_boxplot_stats import build_boxplot_stats

    sample = _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
    stats = build_boxplot_stats(sample)

    assert stats
    for fence in ("lower_fence", "upper_fence"):
        assert fence in stats
    for quartile in ("q1", "q3"):
        assert stats[quartile] is not None
|
||||
|
||||
|
||||
def test_edge_sin_columnas_numericas_devuelve_none():
    """A profile holding only a categorical column produces no chapter."""
    only_categorical = {
        "columns": [
            {"name": "c", "inferred_type": "categorical",
             "categorical": {"top": []}},
        ],
    }
    result = build_num_distr(only_categorical, {})
    assert result is None
|
||||
|
||||
|
||||
def test_edge_profile_none_y_vacio_no_revienta():
    """None, empty and column-less profiles return None instead of raising."""
    degenerate_inputs = (
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
    )
    for prof, context in degenerate_inputs:
        assert build_num_distr(prof, context) is None
|
||||
|
||||
|
||||
def test_anti_corte_muchas_columnas_pdf_y_pptx():
    """With 8 numeric columns, neither renderer drops a column.

    Every level-2 column heading must appear in the text extracted from the
    PDF, and the PPTX deck must report at least one slide per column figure.
    """
    chapter = build_num_distr(_profile(n_numeric=8), {})
    column_titles = [blk.text for blk in chapter.blocks
                     if blk.kind == "heading" and blk.level == 2]
    assert len(column_titles) == 8

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "num.pdf")
        pdf_result = render_automatic_eda_pdf(
            _profile(n_numeric=8), pdf_path, {"write_manifest": False})
        assert pdf_result["path"] == pdf_path

        extracted = _pdf_text(pdf_path)
        for title in column_titles:
            assert title in extracted, f"columna '{title}' cortada/ausente en el PDF"

        pptx_path = os.path.join(workdir, "num.pptx")
        pptx_result = render_automatic_eda_pptx(
            _profile(n_numeric=8), pptx_path, {"write_manifest": False})
        assert pptx_result["path"] == pptx_path
        # At least one slide per column figure.
        assert pptx_result["n_slides"] >= 8
|
||||
|
||||
|
||||
def test_distribution_gloss_cubre_todas_las_etiquetas():
    """Each distribution label listed here has a non-empty gloss.

    The list is meant to mirror what ``detect_distribution_type`` can emit.
    """
    expected_labels = (
        "normal-ish", "right-skewed", "left-skewed", "heavy-tail",
        "lognormal-ish", "multimodal", "discrete", "too_few_samples",
        "other",
    )
    for label in expected_labels:
        assert label in _DIST_GLOSS and _DIST_GLOSS[label]
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: build_boxplot_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def build_boxplot_stats(numeric: dict) -> dict"
|
||||
description: "Deriva las estadisticas de un boxplot de Tukey desde el sub-bloque numeric de un ColumnProfile del grupo eda (salida de describe_numeric). Aplica la regla del 1.5*IQR a los percentiles p25/p50/p75 para obtener cuartiles, fences, bigotes reales y flags de outliers. Lectura defensiva con .get; NUNCA lanza. Si faltan los percentiles clave devuelve {} para que el caller omita el grafico."
|
||||
tags: [eda, statistics, profiling, boxplot, tukey, iqr, datascience]
|
||||
params:
|
||||
- name: numeric
|
||||
desc: "Sub-bloque numeric de un ColumnProfile del grupo eda (la salida de describe_numeric). Claves esperadas (todas pueden ser None): min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram. Solo se usan p25, median/p50, p75, min, max y n_outliers."
|
||||
output: "Dict con las cifras de un boxplot horizontal de Tukey: {q1=p25, median=median(o p50), q3=p75, iqr=q3-q1, lower_fence=q1-1.5*iqr, upper_fence=q3+1.5*iqr, whisker_lo=max(min,lower_fence), whisker_hi=min(max,upper_fence), min, max, has_low_outliers=min<lower_fence, has_high_outliers=max>upper_fence, n_outliers}. Numericos en float, flags en bool nativo, n_outliers en int. Si faltan p25/median(o p50)/p75 devuelve {} (dict vacio). Cuando min/max faltan, los bigotes caen a la fence correspondiente."
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_boxplot_tukey_basico", "test_percentiles_faltan_devuelve_vacio", "test_median_cae_a_p50", "test_whiskers_usan_fence_si_falta_min_max", "test_tipos_salida_float_bool_int"]
|
||||
test_file_path: "python/functions/datascience/build_boxplot_stats_test.py"
|
||||
file_path: "python/functions/datascience/build_boxplot_stats.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.build_boxplot_stats import build_boxplot_stats
|
||||
|
||||
# Sub-bloque numeric tal y como lo produce describe_numeric:
|
||||
numeric = {
|
||||
"min": 1.0, "max": 100.0,
|
||||
"p25": 10.0, "median": 25.0, "p75": 40.0,
|
||||
"iqr": 30.0, "n_outliers": 3,
|
||||
}
|
||||
box = build_boxplot_stats(numeric)
|
||||
print(box["lower_fence"], box["upper_fence"]) # -35.0 85.0
|
||||
print(box["whisker_lo"], box["whisker_hi"]) # 1.0 85.0
|
||||
print(box["has_low_outliers"], box["has_high_outliers"]) # False True
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Usala al dibujar un boxplot horizontal bajo el histograma en el capitulo `num_distr` de `AutomaticEDA`: convierte el bloque `numeric` de un `ColumnProfile` en las cifras exactas que el renderer necesita (cuartiles, fences, extremos de los bigotes y flags de outliers).
|
||||
- Cuando ya tengas los percentiles calculados (salida de `describe_numeric`) y solo necesites derivar la geometria del boxplot de Tukey sin volver a tocar los valores crudos.
|
||||
- Cuando quieras decidir si una columna tiene cola alta/baja (`has_high_outliers` / `has_low_outliers`) antes de proponer una transformacion (log, winsorize).
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Funcion pura, sin I/O y determinista. Lectura defensiva con `.get`: NUNCA lanza. Si faltan `p25`, `median`/`p50` o `p75` devuelve `{}` (dict vacio) — el caller debe omitir el boxplot.
|
||||
- Los `n_outliers` que se propagan vienen del bloque z-score del profile (`detect_outliers`, threshold 3.0), NO de la regla IQR. Son informativos: lo que esta funcion calcula con la regla de Tukey son las **fences** (`lower_fence`/`upper_fence`), no un recuento de puntos.
|
||||
- No recibe los valores crudos de la columna, solo deriva cifras desde los percentiles ya calculados. Por eso no puede contar cuantos puntos caen fuera de las fences, solo si los extremos (`min`/`max`) las superan.
|
||||
- `iqr` se recalcula como `q3 - q1` aunque el bloque traiga `numeric['iqr']`: asi funciona aunque esa clave falte.
|
||||
- Cuando `min`/`max` faltan, los bigotes caen a la fence correspondiente y los flags de outliers quedan en `False` (sin extremo real no se afirma cola).
|
||||
@@ -0,0 +1,94 @@
|
||||
"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.
|
||||
|
||||
Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
|
||||
(group `eda`, the output of describe_numeric) and derives the figures needed to
|
||||
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.
|
||||
|
||||
It only derives numbers from already-computed percentiles; it never sees the raw
|
||||
column values. Reading is defensive (.get throughout) and the function NEVER
|
||||
raises: if the key percentiles (p25 / p50 / p75) are missing it returns {} so the
|
||||
caller can simply skip the boxplot.
|
||||
"""
|
||||
|
||||
|
||||
def _num(value):
    """Coerce ``value`` to a finite float, or return None when it is unusable.

    Returns None for None, for bool (a subclass of int that never represents a
    real statistic), for anything ``float()`` rejects, and for NaN / ±inf.
    """
    import math  # function-scope: this module has no top-level imports.

    # bool is a subclass of int; treat True/False as missing instead of
    # silently coercing them to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a float (e.g. 10 ** 400).
        return None
    # NaN/inf would poison the fence arithmetic and the min()/max() clamps
    # downstream, so they count as missing just like None.
    return number if math.isfinite(number) else None


def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already present in ``numeric`` and applies the
    1.5 * IQR fence rule to obtain whisker extremes and outlier flags for a
    boxplot. No raw column values are needed, and the function never raises.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile. Every value
            may be None. Only ``p25``, ``median``/``p50``, ``p75``, ``min``,
            ``max`` and ``n_outliers`` are read. Anything that is not a dict
            yields ``{}``.

    Returns:
        Dict with the keys
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        Returns ``{}`` when ``p25``, ``median``/``p50`` or ``p75`` is missing or
        not a finite number, so the caller can omit the plot. ``min``/``max``
        are None when absent; ``n_outliers`` falls back to 0 when it is absent
        or not a finite number.
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without the three quartiles a boxplot cannot be drawn.
    if q1 is None or q3 is None or median is None:
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the data range clamped to the fences. Without raw
    # values the last in-fence observation is unknown, so a whisker stops at
    # the fence whenever the extreme lies beyond it or is missing.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    has_low_outliers = bool(mn is not None and mn < lower_fence)
    has_high_outliers = bool(mx is not None and mx > upper_fence)

    # Informative only: this count is copied from the profile, not derived
    # from the fences computed above. Going through _num keeps int() from
    # raising on NaN/inf and also accepts int-like values that are not `int`.
    raw_n = _num(numeric.get("n_outliers"))
    n_outliers = int(raw_n) if raw_n is not None else 0

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }
|
||||
@@ -0,0 +1,108 @@
|
||||
"""Tests para build_boxplot_stats."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from build_boxplot_stats import build_boxplot_stats
|
||||
|
||||
# Keys that a non-empty result dict must always contain.
# The golden test compares this with the result's keys using strict set
# equality, so a key added to or removed from the result must be mirrored here.
_EXPECTED_KEYS = {
    "q1", "median", "q3", "iqr", "lower_fence", "upper_fence",
    "whisker_lo", "whisker_hi", "min", "max",
    "has_low_outliers", "has_high_outliers", "n_outliers",
}
|
||||
|
||||
|
||||
def test_boxplot_tukey_basico():
    """Golden: numeric block with a clear high outlier -> Tukey IQR fences."""
    box = build_boxplot_stats({
        "min": 1.0, "max": 100.0,
        "p25": 10.0, "median": 25.0, "p75": 40.0,
        "iqr": 30.0, "n_outliers": 3,
    })

    assert set(box.keys()) == _EXPECTED_KEYS

    expected_values = {
        "q1": 10.0,
        "median": 25.0,
        "q3": 40.0,
        # IQR recomputed from the quartiles.
        "iqr": 30.0,
        # lower = 10 - 1.5*30 = -35 ; upper = 40 + 1.5*30 = 85.
        "lower_fence": -35.0,
        "upper_fence": 85.0,
        # whisker_lo = max(min=1, -35) = 1 ; whisker_hi = min(max=100, 85) = 85.
        "whisker_lo": 1.0,
        "whisker_hi": 85.0,
        "min": 1.0,
        "max": 100.0,
        # n_outliers is propagated from the input block (informative).
        "n_outliers": 3,
    }
    for key, want in expected_values.items():
        assert box[key] == want

    # Only high outliers exist (100 > 85); none on the low side (1 is not < -35).
    assert box["has_low_outliers"] is False
    assert box["has_high_outliers"] is True
|
||||
|
||||
|
||||
def test_percentiles_faltan_devuelve_vacio():
    """A missing p25/median/p75 yields {} so the caller can skip the boxplot."""
    incomplete_inputs = (
        {"median": 25.0, "p75": 40.0},   # p25 missing.
        {"p25": 10.0, "median": 25.0},   # p75 missing.
        {"p25": 10.0, "p75": 40.0},      # both median and p50 missing.
        None,                            # not a dict: still empty, never raises.
        {},
    )
    for candidate in incomplete_inputs:
        assert build_boxplot_stats(candidate) == {}
|
||||
|
||||
|
||||
def test_median_cae_a_p50():
    """When ``median`` is absent the function falls back to ``p50``."""
    box = build_boxplot_stats(
        {"min": 0.0, "max": 10.0, "p25": 2.0, "p50": 5.0, "p75": 8.0})

    for key, want in (("median", 5.0), ("q1", 2.0), ("q3", 8.0)):
        assert box[key] == want
|
||||
|
||||
|
||||
def test_whiskers_usan_fence_si_falta_min_max():
    """Without min/max the whiskers sit on the fences and no outlier is flagged."""
    quartiles_only = {"p25": 10.0, "median": 25.0, "p75": 40.0}  # no min, no max
    box = build_boxplot_stats(quartiles_only)

    for extreme in ("min", "max"):
        assert box[extreme] is None

    # iqr = 30, fences -35 / 85; the whiskers fall back to the fences.
    assert box["whisker_lo"] == box["lower_fence"] == -35.0
    assert box["whisker_hi"] == box["upper_fence"] == 85.0

    # Without real extremes no outlier is claimed.
    for flag in ("has_low_outliers", "has_high_outliers"):
        assert box[flag] is False

    # Missing n_outliers -> 0.
    assert box["n_outliers"] == 0
|
||||
|
||||
|
||||
def test_tipos_salida_float_bool_int():
    """Numeric fields are float, flags are native bool, n_outliers is int."""
    box = build_boxplot_stats({
        "min": -50.0, "max": 200.0,
        "p25": 10.0, "median": 25.0, "p75": 40.0,
        "n_outliers": 7,
    })

    float_fields = ("q1", "median", "q3", "iqr", "lower_fence", "upper_fence",
                    "whisker_lo", "whisker_hi", "min", "max")
    for key in float_fields:
        assert isinstance(box[key], float), f"{key} debe ser float"

    low_flag = box["has_low_outliers"]
    high_flag = box["has_high_outliers"]
    count = box["n_outliers"]

    assert isinstance(low_flag, bool)
    assert isinstance(high_flag, bool)
    assert isinstance(count, int) and not isinstance(count, bool)

    # min=-50 < lower_fence=-35 -> low outlier ; max=200 > upper_fence=85 -> high.
    assert low_flag is True
    assert high_flag is True
    assert count == 7
|
||||
Reference in New Issue
Block a user