Compare commits

..

1 Commits

Author SHA1 Message Date
egutierrez d412522db9 feat(eda): capítulo CALIDAD del AutomaticEDA (criterios + scores + problemas ES)
Añade el capítulo de calidad de datos al motor AutomaticEDA, siguiendo el
contrato de capítulos (build_calidad(profile, ctx) -> Chapter | None,
CHAPTER_VERSION). El capítulo responde lo que pidió el usuario, en español y
en formato de tabla:

- Intro "Cómo se calcula la calidad": explica los tres criterios y sus pesos
  (completitud 50%, validez 30%, consistencia 20%) antes de cualquier número,
  más una KVTable de resumen a nivel tabla (calidad global y agregados).
- Tabla "Scores por columna": score total más su desglose en completitud /
  validez / consistencia, ordenada de peor a mejor.
- Tabla "Problemas detectados": los issues en español por columna, separados de
  los flags de tipo. Cuando no hay problemas, una nota honesta.

Registry-first: el desglose y los issues NO se recalculan aquí; se consumen de
la función pura del registry column_quality_score (grupo eda), que ya deriva
{score, completeness, validity, consistency, issues} del ColumnProfile. El
capítulo es render-only y compone bloques del modelo; los renderers paginan las
tablas (parten por filas repitiendo cabecera) y envuelven celdas largas, de modo
que nada se corta en PDF ni en PPTX. La lista de issues por celda se acota a
160 caracteres con "(+N más)" para que una fila nunca crezca más que una página.

Test self-contained (sin DuckDB): golden con desglose + issues ES, edges
(None/{}/sin columnas -> None; perfil limpio -> nota), y anti-cortes (perfil de
22 columnas con nombres largos renderizado a PDF y PPTX: el nombre completo
sobrevive al envolverse, sin marcador de truncado).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 14:59:10 +02:00
7 changed files with 460 additions and 700 deletions
@@ -0,0 +1,266 @@
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
chapter answers, in Spanish and as tables, the three things the user asked for:
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
their weights (completeness, validity, consistency) before any number, plus a
table-level summary (global score and aggregates).
2. **Scores por columna** — a table with, per column, the total quality score and
its breakdown into completeness / validity / consistency.
3. **Problemas en español** — a second table listing, per column, the readable
issues in Spanish (kept separate from the type ``flags``).
The breakdown and the issues are NOT recomputed here: they come from the registry
function ``column_quality_score`` (group ``eda``), which already derives
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
This chapter is render-only — it consumes that function and lays the result out
as model blocks; the renderers paginate tables (splitting by rows, repeating the
header) and wrap long cells so nothing is ever cut.
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""
from __future__ import annotations
from .. import model
# Reuse the registry's pure quality function (group ``eda``). Import defensively:
# if the package cannot be imported for any reason the chapter degrades to the
# per-column ``quality_score`` already present in the profile instead of failing.
try: # pragma: no cover - import wiring
from ...column_quality_score import column_quality_score as _column_quality_score
except Exception: # noqa: BLE001 - never let an import error abort the document.
_column_quality_score = None
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
# consistency 0.2. Kept here only to render the human explanation; the actual
# numbers always come from the function so the two never drift in computation.
_CRITERIA_INTRO = (
"La calidad de cada columna es un score de 0 a 100 que combina tres "
"criterios, cada uno con un peso:\n\n"
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
"ni vacíos). Una columna con muchos nulos baja de score.\n"
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
"Los problemas detectados por columna se listan en español más abajo."
)
# Cap for the joined issues cell so a single row never grows taller than a page;
# the remainder is summarized as "(+N más)" instead of being silently dropped.
_ISSUES_MAXLEN = 160
def _fmt_score(value) -> str:
"""Format a 0-100 score as ``NN / 100`` (or a placeholder)."""
if value is None:
return ""
try:
num = float(value)
except (TypeError, ValueError):
return str(value)
if num != num: # NaN
return ""
text = f"{num:.1f}".rstrip("0").rstrip(".")
return f"{text} / 100"
def _fmt_unit_pct(value) -> str:
"""Format a 0-1 fraction as a percentage (``95%``)."""
if value is None:
return ""
try:
return f"{float(value) * 100:.0f}%"
except (TypeError, ValueError):
return str(value)
def _quality_of(col: dict) -> dict:
    """Return the quality dict of one column without ever raising.

    When the registry function ``_column_quality_score`` was imported and
    returns a dict, that dict is returned untouched. In every other case (the
    function is unavailable, it raised, or it returned something that is not a
    dict) the result carries only the ``quality_score`` stored in the column,
    with an empty breakdown and no issues. A non-dict ``col`` is treated as an
    empty column.
    """
    column = col if isinstance(col, dict) else {}
    scorer = _column_quality_score
    if scorer is not None:
        try:
            computed = scorer(column)
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            computed = None
        if isinstance(computed, dict):
            return computed
    # Fallback: only the final score is available pre-computed in the profile.
    return {
        "score": column.get("quality_score"),
        **dict.fromkeys(("completeness", "validity", "consistency")),
        "issues": [],
    }
def _join_issues(issues) -> str:
"""Join Spanish issue strings into one cell, truncating overly long lists.
The renderer wraps cell text, but a column with many long issues could make a
single row taller than a whole page; cap the length and append ``(+N más)``
so the count of hidden issues is honest rather than silently lost.
"""
if not isinstance(issues, (list, tuple)) or not issues:
return ""
parts = [model._safe_str(i).strip() for i in issues]
parts = [p for p in parts if p]
if not parts:
return ""
out = []
used = 0
for idx, part in enumerate(parts):
extra = len(part) + (2 if out else 0)
if used + extra > _ISSUES_MAXLEN and out:
remaining = len(parts) - idx
out.append(f"(+{remaining} más)")
return "; ".join(out)
out.append(part)
used += extra
return "; ".join(out)
def _columns_with_quality(profile: dict):
"""Yield ``(col, quality_dict)`` for every column dict in the profile."""
cols = profile.get("columns") or []
for c in cols:
if isinstance(c, dict):
yield c, _quality_of(c)
def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable`` with the global score and aggregates.

    Rows, in order: global score, number of evaluated columns, the mean of
    each criterion (only when at least one column has a numeric value for it),
    the number of columns with issues, and then the table-wide signals found
    in the profile (duplicate rows, null cells, constant and all-null columns).
    """
    rows = [
        ("Calidad global", _fmt_score(profile.get("quality_score"))),
        ("Columnas evaluadas", str(len(evaluated))),
    ]

    criteria = (
        ("Completitud media", "completeness"),
        ("Validez media", "validity"),
        ("Consistencia media", "consistency"),
    )
    for label, key in criteria:
        numbers = [quality.get(key) for _, quality in evaluated
                   if isinstance(quality.get(key), (int, float))]
        if numbers:
            rows.append((label, _fmt_unit_pct(sum(numbers) / len(numbers))))

    flagged = [quality for _, quality in evaluated if quality.get("issues")]
    rows.append(("Columnas con problemas", str(len(flagged))))

    # Extra table-wide quality signals already in the profile, when present.
    for label, key in (("Filas duplicadas", "duplicate_pct"),
                       ("Celdas nulas (global)", "null_cell_pct")):
        share = profile.get(key)
        if share is not None:
            rows.append((label, _fmt_unit_pct_or_pct(share)))
    for label, key in (("Columnas constantes", "constant_cols"),
                       ("Columnas 100% nulas", "all_null_cols")):
        names = profile.get(key)
        if isinstance(names, (list, tuple)) and names:
            rows.append((label, str(len(names))))

    return model.KVTable(rows=rows, title="Resumen de calidad")
def _fmt_unit_pct_or_pct(value) -> str:
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
try:
num = float(value)
except (TypeError, ValueError):
return model._safe_str(value)
if num != num: # NaN
return ""
pct = num * 100 if num <= 1.0 else num
text = f"{pct:.1f}".rstrip("0").rstrip(".")
return f"{text}%"
def _scores_block(evaluated: list):
"""DataTable with per-column score and its three-criteria breakdown."""
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
rows = []
# Worst columns first so the reader sees the problems at the top.
ordered = sorted(
evaluated,
key=lambda cq: (cq[1].get("score")
if isinstance(cq[1].get("score"), (int, float)) else 101.0),
)
for col, q in ordered:
rows.append([
col.get("name") or "(col)",
_fmt_score(q.get("score")),
_fmt_unit_pct(q.get("completeness")),
_fmt_unit_pct(q.get("validity")),
_fmt_unit_pct(q.get("consistency")),
])
if not rows:
return None
return model.DataTable(header=header, rows=rows,
title="Scores de calidad por columna",
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
def _issues_block(evaluated: list):
    """Build the per-column issues table, or a note when nothing was found.

    Only columns whose joined issue text is non-empty get a row. With no rows
    at all a ``model.Note`` stating that no problems were detected is returned
    instead of an empty table.
    """
    named_issues = (
        (col.get("name") or "(col)", _join_issues(quality.get("issues")))
        for col, quality in evaluated
    )
    rows = [[name, text] for name, text in named_issues if text]
    if rows:
        return model.DataTable(
            header=["Columna", "Problemas detectados (español)"],
            rows=rows,
            title="Problemas de calidad por columna")
    return model.Note(
        "No se detectaron problemas de calidad en las columnas evaluadas.")
def build_calidad(profile: dict, ctx: dict):
    """Assemble the data-quality chapter from a table profile.

    A profile that is falsy or not a dict is treated as empty. When no column
    can be evaluated the chapter does not apply and ``None`` is returned.
    Otherwise the chapter contains, in order: the criteria explanation, the
    table-level summary, the per-column scores and the per-column issues (or a
    note when there are none).
    """
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}

    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        # No columns to score -> chapter does not apply.
        return None

    blocks = []
    blocks.append(model.Heading(text="Cómo se calcula la calidad", level=2))
    blocks.append(model.Markdown(text=_CRITERIA_INTRO))
    blocks.append(_summary_block(profile, evaluated))
    blocks.append(model.Heading(text="Scores por columna", level=2))

    scores_table = _scores_block(evaluated)
    if scores_table is not None:
        blocks.append(scores_table)

    blocks += [
        model.Heading(text="Problemas detectados", level=2),
        _issues_block(evaluated),
    ]
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,194 @@
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
and deterministic. Verifies that the chapter explains the quality criteria, shows
per-column scores with the completeness/validity/consistency breakdown, lists the
issues in Spanish (separate from the type flags), returns None when it does not
apply, and that a wide profile with long names renders to PDF and PPTX without
cutting any cell text (long content wraps, it is never truncated).
"""
import os
import re
import tempfile
from pypdf import PdfReader
from pptx import Presentation
from datascience.automatic_eda.chapters.calidad import (
build_calidad,
CHAPTER_VERSION,
)
from datascience.automatic_eda import build_document, render_pdf, render_pptx
def _profile() -> dict:
"""A small profile with one column per quality problem (nulls, outliers,
constant, high-cardinality id) plus one clean column."""
return {
"table": "demo",
"quality_score": 72.5,
"duplicate_pct": 0.04,
"null_cell_pct": 0.11,
"constant_cols": ["flag_const"],
"all_null_cols": [],
"columns": [
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
"quality_score": 60},
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
"unique_pct": 0.98, "quality_score": 80},
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
"flags": ["constant"], "quality_score": 50},
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
],
}
def _tables(chapter):
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
def _scores_table(chapter):
    """Return the first data table whose title contains ``Scores``, else None."""
    matching = (t for t in _tables(chapter) if "Scores" in (t.title or ""))
    return next(matching, None)
def _issues_table(chapter):
    """Return the first data table whose title contains ``Problemas``, else None."""
    matching = (t for t in _tables(chapter) if "Problemas" in (t.title or ""))
    return next(matching, None)
# --------------------------------------------------------------------------- #
# Golden
# --------------------------------------------------------------------------- #
def test_golden_chapter_estructura_y_version():
    """The chapter is built with the expected id, version and block kinds."""
    chapter = build_calidad(_profile(), {})
    assert chapter is not None
    assert chapter.id == "calidad"
    assert chapter.version == CHAPTER_VERSION
    present = {block.kind for block in chapter.blocks}
    # Criteria markdown + summary KV table + at least one data table.
    for expected in ("markdown", "kv_table", "data_table"):
        assert expected in present
def test_golden_intro_explica_criterios_y_pesos():
    """The first markdown block names the three criteria and their weights."""
    chapter = build_calidad(_profile(), {})
    markdown_blocks = [b for b in chapter.blocks if b.kind == "markdown"]
    intro = markdown_blocks[0].text
    needles = ("Completitud", "Validez", "Consistencia", "50%", "30%", "20%")
    for needle in needles:
        assert needle in intro, f"falta {needle!r} en la intro de criterios"
def test_golden_scores_incluyen_desglose_por_criterio():
    """The scores table has the breakdown header and one row per column."""
    table = _scores_table(build_calidad(_profile(), {}))
    assert table is not None
    expected_header = ["Columna", "Calidad", "Completitud",
                       "Validez", "Consistencia"]
    assert table.header == expected_header
    # 4 columns scored, none dropped.
    assert len(table.rows) == 4
    scored_names = {row[0] for row in table.rows}
    assert scored_names == {"edad", "nombre", "flag_const", "limpia"}
def test_golden_issues_en_espanol_separados_de_flags():
    """The issues table lists Spanish problems and never the raw type flag."""
    ch = build_calidad(_profile(), {})
    issues = _issues_table(ch)
    assert issues is not None
    flat = " | ".join(" ".join(r) for r in issues.rows)
    assert "nulos" in flat  # completeness issue (ES)
    assert "outliers" in flat  # validity issue (ES)
    assert "columna constante" in flat
    assert "posible id de alta cardinalidad" in flat
    # The raw type flag string must NOT leak as a "problem". The word boundary
    # keeps the Spanish "constante" and the column name "flag_const" from
    # matching, so only a standalone "constant" token fails the test.
    assert re.search(r"\bconstant\b", flat) is None
# --------------------------------------------------------------------------- #
# Edges
# --------------------------------------------------------------------------- #
def test_edge_none_vacio_sin_columnas_devuelve_none():
    """Profiles without usable columns make the builder return ``None``."""
    cases = (
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
        ("not a dict", {}),
    )
    for profile, ctx in cases:
        assert build_calidad(profile, ctx) is None
def test_edge_perfil_limpio_sin_problemas_usa_nota():
    """A clean profile yields a note instead of an issues table."""
    clean_columns = [
        {"name": name, "inferred_type": "float", "null_pct": 0.0,
         "numeric": {"outlier_pct": 0.0}}
        for name in ("a", "b")
    ]
    chapter = build_calidad(
        {"quality_score": 100, "columns": clean_columns}, {})
    assert chapter is not None
    # No issues table is emitted for a clean profile.
    assert _issues_table(chapter) is None
    notes = [block for block in chapter.blocks if block.kind == "note"]
    assert notes and "No se detectaron problemas" in notes[0].text
# --------------------------------------------------------------------------- #
# Anti-cut: a wide profile with long names renders without truncation
# --------------------------------------------------------------------------- #
def _wide_profile(ncols: int = 22) -> dict:
cols = [
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
]
for k in range(ncols - 2):
cols.append({
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
})
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    """A wide profile renders to PDF and PPTX without truncating long names.

    Renders only the ``calidad`` chapter of a 22-column profile, then checks
    that the PDF spans at least two pages, carries no truncation marker, and
    that one long column name is found whole (whitespace removed) in both the
    PDF text and the PPTX text.
    """
    prof = _wide_profile(22)
    full = build_document(prof, {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in full)
    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    chapters = [c for c in full if c.id == "calidad"]
    long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "q.pdf")
        pptx = os.path.join(d, "q.pptx")
        rp = render_pdf(chapters, pdf, {"title": "EDA"})
        rx = render_pptx(chapters, pptx, {"title": "EDA"})
        assert os.path.exists(pdf) and os.path.exists(pptx)
        # The wide table forces pagination across several pages/slides.
        assert (rp or {}).get("n_pages", 0) >= 2
        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
        # and there is no truncation marker. The marker must be a real
        # character: an empty string is contained in every string, which
        # would make this assertion fail unconditionally.
        pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
        assert "…" not in pdf_txt and "..." not in pdf_txt
        norm = re.sub(r"\s+", "", pdf_txt)
        assert long_name in norm, "el nombre largo se cortó en el PDF"
        # PPTX: long name present in some cell, untruncated.
        allt = []
        for s in Presentation(pptx).slides:
            for sh in s.shapes:
                if sh.has_text_frame:
                    allt.append(sh.text_frame.text)
                if sh.has_table:
                    for row in sh.table.rows:
                        for c in row.cells:
                            allt.append(c.text)
        joined = re.sub(r"\s+", "", "\n".join(allt))
        assert long_name in joined, "el nombre largo se cortó en el PPTX"
@@ -1,289 +0,0 @@
"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
For every numeric column the chapter draws, as a single indivisible figure, a
histogram with the **mean, median and ±1σ band drawn as reference lines** and a
**Tukey boxplot right below it** sharing the same X axis — exactly the user
requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
so the renderers rasterize and scale it to fit a whole page/slide and nothing is
ever cut; columns with many numerics simply flow across pages as small
multiples.
Data comes from the ``eda`` group profile and is never recomputed here:
- ``columns[i]['numeric']`` (the output of ``describe_numeric``) gives
``mean, median, std, min, max, p25, p75, iqr, n_outliers, outlier_pct,
distribution_type`` and the ``histogram`` bins ``[{lo, hi, count}]``.
- The boxplot five-number summary + Tukey 1.5·IQR fences are derived by the
pure registry function ``build_boxplot_stats`` (group ``eda``); this chapter
only consumes its output, it does not reimplement the statistics.
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
Reads everything defensively (``.get``) and never raises: a column whose figure
cannot be built is degraded to a short note instead of aborting the chapter.
"""
from __future__ import annotations
from .. import model
# Pure registry function (group ``eda``) that derives the Tukey boxplot stats
# from a ``numeric`` sub-block. Imported defensively so the chapter still builds
# (degrading the boxplot to a note) if the function is somehow unavailable.
try:
from datascience.build_boxplot_stats import build_boxplot_stats
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
build_boxplot_stats = None # type: ignore[assignment]
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "num_distr"
CHAPTER_TITLE = "Distribuciones numéricas"
# Plain-Spanish gloss for every label ``detect_distribution_type`` can emit, so a
# non-expert reader understands the shape and the suggested next step (MUST-4.3).
_DIST_GLOSS = {
"normal-ish": "aproximadamente simétrica (campana); media y mediana casi "
"coinciden.",
"right-skewed": "asimétrica a la derecha (cola larga hacia valores altos); "
"la media supera a la mediana — considera una transformación "
"logarítmica.",
"left-skewed": "asimétrica a la izquierda (cola larga hacia valores bajos); "
"la media queda por debajo de la mediana.",
"heavy-tail": "colas pesadas (curtosis alta): más valores extremos de lo "
"que esperaría una normal — vigila los outliers.",
"lognormal-ish": "compatible con lognormal (simétrica al tomar logaritmos); "
"la re-expresión log suele normalizarla.",
"multimodal": "varios picos: probablemente mezcla de subgrupos — conviene "
"segmentar antes de resumir con una sola media.",
"discrete": "pocos valores distintos (discreta/ordinal); el histograma "
"cuenta niveles, no un continuo.",
"too_few_samples": "muestra demasiado pequeña para clasificar la forma con "
"fiabilidad.",
"other": "forma no encuadrada en las categorías estándar.",
}
def _fmt_num(value, decimals: int = 3) -> str:
"""Compact, defensive number formatting shared with the other chapters."""
if value is None:
return ""
if isinstance(value, bool):
return str(value)
if isinstance(value, int):
return f"{value:,}".replace(",", ".")
if isinstance(value, float):
if value != value: # NaN
return "NaN"
if value in (float("inf"), float("-inf")):
return str(value)
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
return text if text else "0"
return str(value)
def _numeric_columns(profile: dict) -> list:
"""Return the list of (name, numeric_dict) for columns with usable stats."""
out = []
for col in profile.get("columns") or []:
if not isinstance(col, dict):
continue
if col.get("inferred_type") != "numeric":
continue
num = col.get("numeric")
if not isinstance(num, dict) or not num:
continue
# A numeric block is renderable when it carries at least a center.
if num.get("mean") is None and num.get("median") is None:
continue
out.append((col.get("name") or "(columna)", num))
return out
def _make_hist_box(name: str, numeric: dict, box: dict):
    """Build the histogram (with mean/median/±σ lines) + boxplot figure.

    Returned lazily to the renderer (a zero-arg callable via ``Figure.make``) so
    matplotlib is only imported and the figure only drawn when a renderer needs
    it. The two stacked axes share the X axis and are produced as a single
    figure, which both renderers treat as one indivisible unit (scaled whole,
    never cut).

    Args:
        name: column name; used as the X label of the lower axis and as the
            figure title.
        numeric: dict read for ``histogram`` (bins with ``lo`` / ``hi`` /
            ``count``), ``mean``, ``median`` and ``std``.
        box: dict read for ``median``, ``q1``, ``q3``, ``whisker_lo``,
            ``whisker_hi``, ``has_low_outliers``, ``has_high_outliers``,
            ``min`` and ``max``. A falsy value draws a placeholder text
            instead of the boxplot.

    Returns:
        The matplotlib figure holding the two stacked axes.
    """
    import matplotlib
    # Select the non-interactive Agg backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    fig, (ax_h, ax_b) = plt.subplots(
        2, 1, figsize=(6.4, 3.4), sharex=True,
        gridspec_kw={"height_ratios": [3.2, 1.0], "hspace": 0.08})
    # ---- Histogram from the precomputed equal-width bins {lo, hi, count}. ----
    hist = numeric.get("histogram") or []
    drew_bars = False
    for b in hist:
        # Skip malformed bins instead of failing the whole figure.
        if not isinstance(b, dict):
            continue
        lo = b.get("lo")
        hi = b.get("hi")
        count = b.get("count") or 0
        if lo is None or hi is None:
            continue
        # A bin whose upper edge is not above its lower edge gets a small
        # positive width so it is still drawn.
        width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
        ax_h.bar(lo, count, width=width, align="edge", color="#9ec6df",
                 edgecolor="#5b8aa6", linewidth=0.4, zorder=2)
        drew_bars = True
    if not drew_bars:
        # No usable bin: show a placeholder in the middle of the axis.
        ax_h.text(0.5, 0.5, "(sin histograma)", ha="center", va="center",
                  fontsize=9, color="#8a8a8a", transform=ax_h.transAxes)
    mean = numeric.get("mean")
    median = numeric.get("median")
    std = numeric.get("std")
    # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
    if mean is not None and std is not None and std > 0:
        ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
                     zorder=1, label="±1σ")
    if median is not None:
        ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
                     zorder=4, label=f"mediana = {_fmt_num(median)}")
    if mean is not None:
        ax_h.axvline(mean, color="#c0392b", linestyle="--", linewidth=1.6,
                     zorder=4, label=f"media = {_fmt_num(mean)}")
    ax_h.set_ylabel("frecuencia", fontsize=8)
    ax_h.tick_params(labelsize=7)
    ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
    for spine in ("top", "right"):
        ax_h.spines[spine].set_visible(False)
    # ---- Tukey boxplot below, sharing the X axis (MUST-4.2). ----
    if box:
        # Single-entry stats list in the key layout passed to ``Axes.bxp``.
        stats = [{
            "med": box.get("median"),
            "q1": box.get("q1"),
            "q3": box.get("q3"),
            "whislo": box.get("whisker_lo"),
            "whishi": box.get("whisker_hi"),
            "fliers": [],  # raw outlier values are not in the profile.
            "label": "",
        }]
        bxp_kw = dict(
            showfliers=False, widths=0.5, patch_artist=True,
            boxprops={"facecolor": "#9ec6df", "edgecolor": "#5b8aa6"},
            medianprops={"color": "#2e8b57", "linewidth": 1.6},
            whiskerprops={"color": "#5b8aa6"},
            capprops={"color": "#5b8aa6"})
        try:
            # ``orientation`` is the current API; older matplotlib uses ``vert``.
            try:
                ax_b.bxp(stats, orientation="horizontal", **bxp_kw)
            except TypeError:
                ax_b.bxp(stats, vert=False, **bxp_kw)
        except Exception:  # noqa: BLE001 — never let one axis kill the figure.
            pass
        # Mark the presence of out-of-fence points (the raw values are unknown).
        if box.get("has_low_outliers") and box.get("min") is not None:
            ax_b.plot([box["min"]], [1], marker="o", markersize=3.5,
                      color="#c0392b", zorder=5)
        if box.get("has_high_outliers") and box.get("max") is not None:
            ax_b.plot([box["max"]], [1], marker="o", markersize=3.5,
                      color="#c0392b", zorder=5)
    else:
        # No boxplot stats available: placeholder text in the lower axis.
        ax_b.text(0.5, 0.5, "(boxplot no disponible)", ha="center", va="center",
                  fontsize=8, color="#8a8a8a", transform=ax_b.transAxes)
    ax_b.set_yticks([])
    ax_b.set_xlabel(name, fontsize=8)
    ax_b.tick_params(labelsize=7)
    for spine in ("top", "right", "left"):
        ax_b.spines[spine].set_visible(False)
    fig.suptitle(name, fontsize=10, fontweight="bold", x=0.02, ha="left")
    return fig
def _stats_note(name: str, numeric: dict, box: dict) -> str:
    """Build the one-line stats summary of a column, plus its shape gloss.

    The line joins mean, median, σ, min, max and IQR with `` · ``; the outlier
    count (with its percentage when known) and the Tukey fences are appended
    when available. When the column's ``distribution_type`` has an entry in
    ``_DIST_GLOSS``, a second paragraph with that gloss follows.
    """
    labelled_keys = (
        ("media", "mean"),
        ("mediana", "median"),
        ("σ", "std"),
        ("min", "min"),
        ("max", "max"),
        ("IQR", "iqr"),
    )
    bits = [f"{label} {_fmt_num(numeric.get(key))}"
            for label, key in labelled_keys]

    n_out = numeric.get("n_outliers")
    if n_out is not None:
        out_pct = numeric.get("outlier_pct")
        pct = "" if out_pct is None else f" ({_fmt_num(out_pct, 2)}%)"
        bits.append(f"outliers {n_out}{pct}")

    if box and box.get("lower_fence") is not None:
        low = _fmt_num(box.get("lower_fence"))
        high = _fmt_num(box.get("upper_fence"))
        bits.append(f"vallas Tukey [{low}, {high}]")

    line = " · ".join(bits)
    dist = numeric.get("distribution_type")
    gloss = _DIST_GLOSS.get(dist)
    if dist and gloss:
        return f"{line}\n\n**Forma ({dist}):** {gloss}"
    return line
def _figure_maker(name: str, numeric: dict, box: dict):
"""Bind the per-column arguments so the lazy closure is loop-safe."""
def _make():
return _make_hist_box(name, numeric, box)
return _make
def build_num_distr(profile: dict, ctx: dict):
    """Build the numeric-distributions Chapter, or None if no numeric column.

    Args:
        profile: the ``eda`` group TableProfile dict. Anything that is not a
            dict (including ``None``) is treated as an empty profile.
        ctx: presentation context (unused here beyond defensive handling).

    Returns:
        A ``model.Chapter`` with, per numeric column, a histogram+boxplot figure
        and a stats note; or ``None`` when the dataset has no numeric column.
    """
    # A truthy non-dict profile would otherwise fail on ``.get`` further down;
    # the chapter contract is to never raise on a malformed profile.
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}
    numerics = _numeric_columns(profile)
    if not numerics:
        return None  # chapter does not apply to a dataset with no numerics.
    intro = (
        "Para cada columna numérica se muestra su **histograma** con tres líneas "
        "de referencia: la **media** (línea roja discontinua), la **mediana** "
        "(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, "
        "alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del "
        "primer al tercer cuartil (P25–P75), la línea interior es la mediana y "
        "los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
        "valores más allá de las vallas. Comparar media y mediana revela la "
        "asimetría de la distribución.")
    blocks = [
        model.Heading(text=CHAPTER_TITLE, level=1),
        model.Markdown(text=intro),
    ]
    for name, numeric in numerics:
        # The boxplot stats degrade to an empty dict when the registry
        # function is unavailable or fails; the figure then shows a
        # placeholder instead of the boxplot.
        box = {}
        if build_boxplot_stats is not None:
            try:
                box = build_boxplot_stats(numeric) or {}
            except Exception:  # noqa: BLE001 — degrade, never raise.
                box = {}
        blocks.append(model.Heading(text=str(name), level=2))
        blocks.append(model.Figure(
            make=_figure_maker(name, numeric, box),
            caption=f"Distribución de «{name}» — histograma (media/mediana/±σ) "
                    f"y boxplot."))
        blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -1,151 +0,0 @@
"""Tests for the NUM DISTR chapter — DoD: golden + edges + anti-cut.
Self-contained: builds synthetic ``numeric`` blocks (no DuckDB) so the suite is
fast and deterministic. Verifies that the chapter emits, per numeric column, a
histogram+boxplot figure plus a stats note; that the mean/median/±σ requirement
and the boxplot are present; that a profile with no numeric column yields None;
that None/empty never raises; and that with many numeric columns and long text
both the PDF and the PPTX render without cutting anything (every column heading
survives in the rendered output).
"""
import os
import re
import tempfile
from pypdf import PdfReader
from datascience.automatic_eda.chapters.num_distr import (
build_num_distr, CHAPTER_VERSION, _DIST_GLOSS,
)
from datascience.automatic_eda import model
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
def _numeric_block(mean, median, std, mn, mx, dist="normal-ish",
n_outliers=0, nbins=10):
"""A synthetic ``numeric`` sub-block shaped like describe_numeric's output."""
width = (mx - mn) / nbins if mx > mn else 1.0
hist = [{"lo": mn + i * width, "hi": mn + (i + 1) * width,
"count": (i + 1) * 3} for i in range(nbins)]
p25 = mn + (mx - mn) * 0.25
p75 = mn + (mx - mn) * 0.75
return {
"min": mn, "max": mx, "mean": mean, "median": median, "std": std,
"p25": p25, "p50": median, "p75": p75, "iqr": p75 - p25,
"n_outliers": n_outliers, "outlier_pct": 100.0 * n_outliers / 300.0,
"distribution_type": dist, "histogram": hist,
}
def _profile(n_numeric=2, extra_categorical=True):
cols = []
presets = [
("precio", 42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5),
("alcohol", 10.4, 10.3, 1.1, 8.0, 14.9, "normal-ish", 0),
("sulfatos", 0.66, 0.62, 0.17, 0.33, 2.0, "heavy-tail", 9),
("calidad", 5.6, 6.0, 0.8, 3.0, 8.0, "discrete", 0),
]
for i in range(n_numeric):
name, mean, med, std, mn, mx, dist, no = presets[i % len(presets)]
if i >= len(presets):
name = f"{name}_{i}"
cols.append({"name": name, "inferred_type": "numeric",
"numeric": _numeric_block(mean, med, std, mn, mx, dist, no)})
if extra_categorical:
cols.append({"name": "categoria", "inferred_type": "categorical",
"categorical": {"top": [{"value": "tinto", "count": 200}]}})
return {"table": "vinos", "n_rows": 300, "n_cols": len(cols),
"columns": cols}
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, concatenated,
    with each run of whitespace collapsed to a single space."""
    pages = PdfReader(path).pages
    raw = "".join((page.extract_text() or "") for page in pages)
    return re.sub(r"\s+", " ", raw)
def test_golden_chapter_estructura_y_bloques():
    """Two numeric columns give a title, an intro and one figure per column."""
    chapter = build_num_distr(_profile(n_numeric=2), {})
    assert chapter is not None
    assert chapter.id == "num_distr"
    assert chapter.version == CHAPTER_VERSION
    kinds = [block.kind for block in chapter.blocks]
    # Heading + intro Markdown, then per column: Heading + Figure + Markdown.
    assert kinds[:2] == ["heading", "markdown"]
    assert kinds.count("figure") == 2  # one figure per numeric column.
    assert kinds.count("heading") == 1 + 2  # chapter title + one per column.
    # Each figure has a lazy maker that produces a real matplotlib figure.
    first_figure = next(b for b in chapter.blocks if b.kind == "figure")
    fig = first_figure.make()
    assert fig is not None
    # Two stacked axes: histogram + boxplot share the figure.
    assert len(fig.axes) == 2
    import matplotlib.pyplot as plt
    plt.close(fig)
def test_golden_media_mediana_sigma_y_boxplot_presentes():
    """The chapter's Markdown mentions mean, median, sigma, the Tukey boxplot and the gloss."""
    # The intro documents the three reference lines and the Tukey boxplot; the
    # per-column note carries the actual mean/median/σ numbers and the shape.
    ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
    # Fail with a clear assertion (as the structure test does) rather than an
    # AttributeError on ch.blocks when no chapter is produced.
    assert ch is not None
    md_texts = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
    # NOTE(review): "media" is a substring of "mediana", so the first operand
    # cannot fail on its own; kept as-is to avoid tightening the contract here.
    assert "media" in md_texts and "mediana" in md_texts
    # Any occurrence of "±1σ" also contains "σ", so one membership test covers
    # both spellings that the original disjunction accepted.
    assert "σ" in md_texts
    assert "boxplot" in md_texts.lower()
    assert "Tukey" in md_texts
    # distribution_type gloss surfaced for the column (right-skewed preset):
    # the first 20 characters of the gloss text before its first ";".
    assert _DIST_GLOSS["right-skewed"].split(";")[0][:20] in md_texts
def test_boxplot_stats_se_consumen_del_registry():
    """build_boxplot_stats yields fences and quartiles for a synthetic numeric block."""
    # The chapter must feed build_boxplot_stats (group eda) and the resulting
    # box must carry the Tukey fences for the figure.
    from datascience.build_boxplot_stats import build_boxplot_stats

    numeric = _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
    box = build_boxplot_stats(numeric)

    assert box
    for fence_key in ("lower_fence", "upper_fence"):
        assert fence_key in box
    for quartile_key in ("q1", "q3"):
        assert box[quartile_key] is not None
def test_edge_sin_columnas_numericas_devuelve_none():
    """A profile whose only column is categorical produces no chapter."""
    categorical_only = {
        "name": "c",
        "inferred_type": "categorical",
        "categorical": {"top": []},
    }
    chapter = build_num_distr({"columns": [categorical_only]}, {})
    assert chapter is None
def test_edge_profile_none_y_vacio_no_revienta():
    """None, empty and column-less profiles all return None instead of raising."""
    degenerate_calls = (
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
    )
    for profile, ctx in degenerate_calls:
        assert build_num_distr(profile, ctx) is None
def test_anti_corte_muchas_columnas_pdf_y_pptx():
    """Render an 8-column profile to PDF and PPTX and check nothing is dropped."""
    # 8 numeric columns + long note text: nothing may be cut. Every column
    # heading must appear in the extracted PDF text; the PPTX deck is checked
    # for a minimum slide count.
    n_columns = 8
    render_opts = {"write_manifest": False}

    chapter = build_num_distr(_profile(n_numeric=n_columns), {})
    names = [
        blk.text
        for blk in chapter.blocks
        if blk.kind == "heading" and blk.level == 2
    ]
    assert len(names) == n_columns

    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf = os.path.join(tmp_dir, "num.pdf")
        res_pdf = render_automatic_eda_pdf(
            _profile(n_numeric=n_columns), pdf, render_opts)
        assert res_pdf["path"] == pdf
        txt = _pdf_text(pdf)
        for name in names:
            assert name in txt, f"columna '{name}' cortada/ausente en el PDF"

        pptx = os.path.join(tmp_dir, "num.pptx")
        res_pptx = render_automatic_eda_pptx(
            _profile(n_numeric=n_columns), pptx, render_opts)
        assert res_pptx["path"] == pptx
        # At least one slide per column figure.
        assert res_pptx["n_slides"] >= n_columns
def test_distribution_gloss_cubre_todas_las_etiquetas():
    """Every expected distribution label has a non-empty entry in _DIST_GLOSS."""
    # Labels that detect_distribution_type is expected to emit; each one needs
    # a Spanish gloss.
    expected_labels = (
        "normal-ish",
        "right-skewed",
        "left-skewed",
        "heavy-tail",
        "lognormal-ish",
        "multimodal",
        "discrete",
        "too_few_samples",
        "other",
    )
    for label in expected_labels:
        assert label in _DIST_GLOSS
        assert _DIST_GLOSS[label]
@@ -1,58 +0,0 @@
---
name: build_boxplot_stats
kind: function
lang: py
domain: datascience
version: "1.0.0"
purity: pure
signature: "def build_boxplot_stats(numeric: dict) -> dict"
description: "Deriva las estadisticas de un boxplot de Tukey desde el sub-bloque numeric de un ColumnProfile del grupo eda (salida de describe_numeric). Aplica la regla del 1.5*IQR a los percentiles p25/p50/p75 para obtener cuartiles, fences, bigotes reales y flags de outliers. Lectura defensiva con .get; NUNCA lanza. Si faltan los percentiles clave devuelve {} para que el caller omita el grafico."
tags: [eda, statistics, profiling, boxplot, tukey, iqr, datascience]
params:
- name: numeric
desc: "Sub-bloque numeric de un ColumnProfile del grupo eda (la salida de describe_numeric). Claves esperadas (todas pueden ser None): min, max, mean, median, mode, std, variance, cv, p1, p5, p25, p50, p75, p95, p99, iqr, skew, kurtosis, n_outliers, outlier_pct, zero_pct, negative_pct, distribution_type, histogram. Solo se usan p25, median/p50, p75, min, max y n_outliers."
output: "Dict con las cifras de un boxplot horizontal de Tukey: {q1=p25, median=median(o p50), q3=p75, iqr=q3-q1, lower_fence=q1-1.5*iqr, upper_fence=q3+1.5*iqr, whisker_lo=max(min,lower_fence), whisker_hi=min(max,upper_fence), min, max, has_low_outliers=min<lower_fence, has_high_outliers=max>upper_fence, n_outliers}. Numericos en float (min/max quedan en None si faltan), flags en bool nativo, n_outliers en int (0 si falta o no es numerico). Si faltan p25/median(o p50)/p75 devuelve {} (dict vacio). Cuando min/max faltan, los bigotes caen a la fence correspondiente y el flag de outliers de ese lado queda en False."
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: []
tested: true
tests: ["test_boxplot_tukey_basico", "test_percentiles_faltan_devuelve_vacio", "test_median_cae_a_p50", "test_whiskers_usan_fence_si_falta_min_max", "test_tipos_salida_float_bool_int"]
test_file_path: "python/functions/datascience/build_boxplot_stats_test.py"
file_path: "python/functions/datascience/build_boxplot_stats.py"
---
## Ejemplo
```python
import sys, os
sys.path.insert(0, os.path.join("python", "functions"))
from datascience.build_boxplot_stats import build_boxplot_stats
# Sub-bloque numeric tal y como lo produce describe_numeric:
numeric = {
"min": 1.0, "max": 100.0,
"p25": 10.0, "median": 25.0, "p75": 40.0,
"iqr": 30.0, "n_outliers": 3,
}
box = build_boxplot_stats(numeric)
print(box["lower_fence"], box["upper_fence"]) # -35.0 85.0
print(box["whisker_lo"], box["whisker_hi"]) # 1.0 85.0
print(box["has_low_outliers"], box["has_high_outliers"]) # False True
```
## Cuando usarla
- Usala al dibujar un boxplot horizontal bajo el histograma en el capitulo `num_distr` de `AutomaticEDA`: convierte el bloque `numeric` de un `ColumnProfile` en las cifras exactas que el renderer necesita (cuartiles, fences, extremos de los bigotes y flags de outliers).
- Cuando ya tengas los percentiles calculados (salida de `describe_numeric`) y solo necesites derivar la geometria del boxplot de Tukey sin volver a tocar los valores crudos.
- Cuando quieras decidir si una columna tiene cola alta/baja (`has_high_outliers` / `has_low_outliers`) antes de proponer una transformacion (log, winsorize).
## Gotchas
- Funcion pura, sin I/O y determinista. Lectura defensiva con `.get`: NUNCA lanza. Si faltan `p25`, `median`/`p50` o `p75` devuelve `{}` (dict vacio) — el caller debe omitir el boxplot.
- Los `n_outliers` que se propagan vienen del bloque z-score del profile (`detect_outliers`, threshold 3.0), NO de la regla IQR. Son informativos: lo que esta funcion calcula con la regla de Tukey son las **fences** (`lower_fence`/`upper_fence`), no un recuento de puntos.
- No recibe los valores crudos de la columna, solo deriva cifras desde los percentiles ya calculados. Por eso no puede contar cuantos puntos caen fuera de las fences, solo si los extremos (`min`/`max`) las superan.
- `iqr` se recalcula como `q3 - q1` aunque el bloque traiga `numeric['iqr']`: asi funciona aunque esa clave falte.
- Cuando `min`/`max` faltan, los bigotes caen a la fence correspondiente y los flags de outliers quedan en `False` (sin extremo real no se afirma cola).
@@ -1,94 +0,0 @@
"""build_boxplot_stats — Tukey boxplot statistics from an EDA `numeric` sub-block.
Pure function: no I/O, deterministic. Takes the `numeric` dict of a ColumnProfile
(group `eda`, the output of describe_numeric) and derives the figures needed to
draw a horizontal Tukey boxplot using the 1.5 * IQR rule.
It only derives numbers from already-computed percentiles; it never sees the raw
column values. Reading is defensive (.get throughout) and the function NEVER
raises: if the key percentiles (p25 / p50 / p75) are missing it returns {} so the
caller can simply skip the boxplot.
"""
def _num(value):
    """Coerce to float defensively; return None when the value is unusable.

    None, bools, values that ``float()`` rejects, ints too large for a float
    and NaN are all reported as missing (None). Never raises for those inputs.
    """
    # bool is a subclass of int; a percentile value is never a real bool, so
    # treat True/False as missing instead of silently coercing to 1.0/0.0.
    if value is None or isinstance(value, bool):
        return None
    try:
        result = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers ints that do not fit in a float (e.g. 10**400),
        # which would otherwise break the "never raises" contract.
        return None
    # NaN is the only float that is not equal to itself. A NaN statistic is a
    # missing statistic: letting it through would poison every fence and
    # comparison below. (Self-comparison avoids adding an import.)
    if result != result:
        return None
    return result


def build_boxplot_stats(numeric: dict) -> dict:
    """Derive Tukey boxplot statistics from the `numeric` sub-block of a profile.

    Reads the percentiles already present in the block and applies the classic
    1.5 * IQR fence rule to obtain the whisker extremes and outlier flags of a
    horizontal boxplot. No raw values are needed.

    Args:
        numeric: The `numeric` sub-block of an eda ColumnProfile. Every value
            may be None (or NaN); it is read defensively with ``.get``.

    Returns:
        Dict with the boxplot figures
        {q1, median, q3, iqr, lower_fence, upper_fence, whisker_lo, whisker_hi,
        min, max, has_low_outliers, has_high_outliers, n_outliers}.
        ``min``/``max`` are None when missing; ``n_outliers`` is 0 when it is
        missing, non-numeric or non-finite.
        If `numeric` is not a dict, or p25, median/p50 or p75 are missing,
        returns {} (empty dict) so the caller omits the plot.
    """
    if not isinstance(numeric, dict):
        return {}

    q1 = _num(numeric.get("p25"))
    q3 = _num(numeric.get("p75"))
    # Prefer the explicit median; fall back to p50 (they are the same quantile).
    median = _num(numeric.get("median"))
    if median is None:
        median = _num(numeric.get("p50"))

    # Without the three quartiles a boxplot cannot be drawn.
    if q1 is None or q3 is None or median is None:
        return {}

    # Recompute the IQR from the quartiles rather than trusting numeric['iqr'],
    # which may be missing even when the percentiles are present.
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    mn = _num(numeric.get("min"))
    mx = _num(numeric.get("max"))

    # Whisker extremes: the real data range clamped to the fences. When the
    # corresponding extreme is missing, fall back to the fence itself.
    whisker_lo = max(mn, lower_fence) if mn is not None else lower_fence
    whisker_hi = min(mx, upper_fence) if mx is not None else upper_fence

    # Without a real extreme no tail is claimed on that side.
    has_low_outliers = bool(mn is not None and mn < lower_fence)
    has_high_outliers = bool(mx is not None and mx > upper_fence)

    # Informative only: this count is copied from the input block, it is not
    # derived from the IQR fences computed here.
    raw_n = numeric.get("n_outliers")
    n_outliers = 0
    if isinstance(raw_n, (int, float)) and not isinstance(raw_n, bool):
        try:
            n_outliers = int(raw_n)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(+/-inf) raises OverflowError;
            # treat a non-finite count as absent instead of propagating.
            n_outliers = 0

    return {
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr,
        "lower_fence": lower_fence,
        "upper_fence": upper_fence,
        "whisker_lo": whisker_lo,
        "whisker_hi": whisker_hi,
        "min": mn,
        "max": mx,
        "has_low_outliers": has_low_outliers,
        "has_high_outliers": has_high_outliers,
        "n_outliers": n_outliers,
    }
@@ -1,108 +0,0 @@
"""Tests para build_boxplot_stats."""
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
from build_boxplot_stats import build_boxplot_stats
# Exact key set of any non-empty dict returned by build_boxplot_stats.
_EXPECTED_KEYS = {
    # quartiles and spread
    "q1", "median", "q3", "iqr",
    # Tukey fences and whisker ends
    "lower_fence", "upper_fence", "whisker_lo", "whisker_hi",
    # data extremes
    "min", "max",
    # outlier information
    "has_low_outliers", "has_high_outliers", "n_outliers",
}
def test_boxplot_tukey_basico():
    """Golden: numeric block with a clear high outlier -> Tukey IQR fences."""
    box = build_boxplot_stats({
        "min": 1.0, "max": 100.0,
        "p25": 10.0, "median": 25.0, "p75": 40.0,
        "iqr": 30.0, "n_outliers": 3,
    })

    assert set(box.keys()) == _EXPECTED_KEYS

    expected_values = {
        "q1": 10.0,
        "median": 25.0,
        "q3": 40.0,
        # iqr is recomputed from the quartiles.
        "iqr": 30.0,
        # lower = 10 - 1.5*30 = -35 ; upper = 40 + 1.5*30 = 85.
        "lower_fence": -35.0,
        "upper_fence": 85.0,
        # whisker_lo = max(min=1, -35) = 1 ; whisker_hi = min(max=100, 85) = 85.
        "whisker_lo": 1.0,
        "whisker_hi": 85.0,
        "min": 1.0,
        "max": 100.0,
        # n_outliers is passed through from the input block (informative).
        "n_outliers": 3,
    }
    for key, wanted in expected_values.items():
        assert box[key] == wanted

    # Only high outliers (100 > 85); none on the low side (1 is not < -35).
    assert box["has_low_outliers"] is False
    assert box["has_high_outliers"] is True
def test_percentiles_faltan_devuelve_vacio():
    """Missing p25/median/p75 -> {} (the caller skips the boxplot)."""
    unusable_inputs = (
        {"median": 25.0, "p75": 40.0},  # p25 missing
        {"p25": 10.0, "median": 25.0},  # p75 missing
        {"p25": 10.0, "p75": 40.0},     # neither median nor p50
        None,                           # not a dict: still empty, never raises
        {},                             # empty block
    )
    for numeric in unusable_inputs:
        assert build_boxplot_stats(numeric) == {}
def test_median_cae_a_p50():
    """An absent median falls back to p50."""
    box = build_boxplot_stats(
        {"min": 0.0, "max": 10.0, "p25": 2.0, "p50": 5.0, "p75": 8.0})
    observed = (box["median"], box["q1"], box["q3"])
    assert observed == (5.0, 2.0, 8.0)
def test_whiskers_usan_fence_si_falta_min_max():
    """Without min/max the whiskers fall to the fences and no outliers are flagged."""
    # Quartiles only: neither min nor max is supplied.
    box = build_boxplot_stats({"p25": 10.0, "median": 25.0, "p75": 40.0})

    for extreme in ("min", "max"):
        assert box[extreme] is None

    # iqr = 30, fences -35 / 85; the whiskers land on the fences.
    assert box["whisker_lo"] == box["lower_fence"] == -35.0
    assert box["whisker_hi"] == box["upper_fence"] == 85.0

    # With no real extremes, no outliers are claimed on either side.
    for flag in ("has_low_outliers", "has_high_outliers"):
        assert box[flag] is False

    # Absent n_outliers -> 0.
    assert box["n_outliers"] == 0
def test_tipos_salida_float_bool_int():
    """Numeric fields are float, flags are native bool, n_outliers is int."""
    box = build_boxplot_stats({
        "min": -50.0, "max": 200.0,
        "p25": 10.0, "median": 25.0, "p75": 40.0,
        "n_outliers": 7,
    })

    float_keys = (
        "q1", "median", "q3", "iqr",
        "lower_fence", "upper_fence",
        "whisker_lo", "whisker_hi",
        "min", "max",
    )
    for key in float_keys:
        assert isinstance(box[key], float), f"{key} debe ser float"

    for flag in ("has_low_outliers", "has_high_outliers"):
        assert isinstance(box[flag], bool)

    n_outliers = box["n_outliers"]
    assert isinstance(n_outliers, int) and not isinstance(n_outliers, bool)

    # min=-50 < lower_fence=-35 -> low outlier ; max=200 > upper_fence=85 -> high.
    assert box["has_low_outliers"] is True
    assert box["has_high_outliers"] is True
    assert n_outliers == 7