feat(eda): capítulo CALIDAD del AutomaticEDA (criterios + scores + problemas ES)
Añade el capítulo de calidad de datos al motor AutomaticEDA, siguiendo el
contrato de capítulos (build_calidad(profile, ctx) -> Chapter | None,
CHAPTER_VERSION). El capítulo responde lo que pidió el usuario, en español y
en formato de tabla:
- Intro "Cómo se calcula la calidad": explica los tres criterios y sus pesos
(completitud 50%, validez 30%, consistencia 20%) antes de cualquier número,
más una KVTable de resumen a nivel tabla (calidad global y agregados).
- Tabla "Scores por columna": score total más su desglose en completitud /
validez / consistencia, ordenada de peor a mejor.
- Tabla "Problemas detectados": los issues en español por columna, separados de
los flags de tipo. Cuando no hay problemas, una nota honesta.
Registry-first: el desglose y los issues NO se recalculan aquí; se consumen de
la función pura del registry column_quality_score (grupo eda), que ya deriva
{score, completeness, validity, consistency, issues} del ColumnProfile. El
capítulo es render-only y compone bloques del modelo; los renderers paginan las
tablas (parten por filas repitiendo cabecera) y envuelven celdas largas, de modo
que nada se corta en PDF ni en PPTX. La lista de issues por celda se acota a
160 caracteres con "(+N más)" para que una fila nunca crezca más que una página.
Test self-contained (sin DuckDB): golden con desglose + issues ES, edges
(None/{}/sin columnas -> None; perfil limpio -> nota), y anti-cortes (perfil de
22 columnas con nombres largos renderizado a PDF y PPTX: el nombre completo
sobrevive al envolverse, sin marcador de truncado).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,266 @@
|
||||
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||
|
||||
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||
chapter answers, in Spanish and as tables, the three things the user asked for:
|
||||
|
||||
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
||||
their weights (completeness, validity, consistency) before any number, plus a
|
||||
table-level summary (global score and aggregates).
|
||||
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||
its breakdown into completeness / validity / consistency.
|
||||
3. **Problemas en español** — a second table listing, per column, the readable
|
||||
issues in Spanish (kept separate from the type ``flags``).
|
||||
|
||||
The breakdown and the issues are NOT recomputed here: they come from the registry
|
||||
function ``column_quality_score`` (group ``eda``), which already derives
|
||||
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
||||
This chapter is render-only — it consumes that function and lays the result out
|
||||
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
||||
header) and wrap long cells so nothing is ever cut.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
# Reuse the registry's pure quality function (group ``eda``). Import defensively:
|
||||
# if the package cannot be imported for any reason the chapter degrades to the
|
||||
# per-column ``quality_score`` already present in the profile instead of failing.
|
||||
try: # pragma: no cover - import wiring
|
||||
from ...column_quality_score import column_quality_score as _column_quality_score
|
||||
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||
_column_quality_score = None
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "calidad"
|
||||
CHAPTER_TITLE = "Calidad"
|
||||
|
||||
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
|
||||
# consistency 0.2. Kept here only to render the human explanation; the actual
|
||||
# numbers always come from the function so the two never drift in computation.
|
||||
_CRITERIA_INTRO = (
|
||||
"La calidad de cada columna es un score de 0 a 100 que combina tres "
|
||||
"criterios, cada uno con un peso:\n\n"
|
||||
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
|
||||
"ni vacíos). Una columna con muchos nulos baja de score.\n"
|
||||
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
|
||||
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
|
||||
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
|
||||
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
|
||||
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
|
||||
"Los problemas detectados por columna se listan en español más abajo."
|
||||
)
|
||||
|
||||
# Cap for the joined issues cell so a single row never grows taller than a page;
|
||||
# the remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
_ISSUES_MAXLEN = 160
|
||||
|
||||
|
||||
def _fmt_score(value) -> str:
|
||||
"""Format a 0-100 score as ``NN / 100`` (or a placeholder)."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
text = f"{num:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text} / 100"
|
||||
|
||||
|
||||
def _fmt_unit_pct(value) -> str:
|
||||
"""Format a 0-1 fraction as a percentage (``95%``)."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value) * 100:.0f}%"
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _quality_of(col: dict) -> dict:
|
||||
"""Return ``{score, completeness, validity, consistency, issues}`` for a column.
|
||||
|
||||
Uses the registry ``column_quality_score`` when available; otherwise falls
|
||||
back to the per-column ``quality_score`` already in the profile (number only,
|
||||
empty breakdown/issues). Never raises.
|
||||
"""
|
||||
if not isinstance(col, dict):
|
||||
col = {}
|
||||
if _column_quality_score is not None:
|
||||
try:
|
||||
res = _column_quality_score(col)
|
||||
if isinstance(res, dict):
|
||||
return res
|
||||
except Exception: # noqa: BLE001 - degrade instead of aborting.
|
||||
pass
|
||||
# Fallback: only the final score is available pre-computed in the profile.
|
||||
return {
|
||||
"score": col.get("quality_score"),
|
||||
"completeness": None,
|
||||
"validity": None,
|
||||
"consistency": None,
|
||||
"issues": [],
|
||||
}
|
||||
|
||||
|
||||
def _join_issues(issues) -> str:
|
||||
"""Join Spanish issue strings into one cell, truncating overly long lists.
|
||||
|
||||
The renderer wraps cell text, but a column with many long issues could make a
|
||||
single row taller than a whole page; cap the length and append ``(+N más)``
|
||||
so the count of hidden issues is honest rather than silently lost.
|
||||
"""
|
||||
if not isinstance(issues, (list, tuple)) or not issues:
|
||||
return ""
|
||||
parts = [model._safe_str(i).strip() for i in issues]
|
||||
parts = [p for p in parts if p]
|
||||
if not parts:
|
||||
return ""
|
||||
out = []
|
||||
used = 0
|
||||
for idx, part in enumerate(parts):
|
||||
extra = len(part) + (2 if out else 0)
|
||||
if used + extra > _ISSUES_MAXLEN and out:
|
||||
remaining = len(parts) - idx
|
||||
out.append(f"(+{remaining} más)")
|
||||
return "; ".join(out)
|
||||
out.append(part)
|
||||
used += extra
|
||||
return "; ".join(out)
|
||||
|
||||
|
||||
def _columns_with_quality(profile: dict):
|
||||
"""Yield ``(col, quality_dict)`` for every column dict in the profile."""
|
||||
cols = profile.get("columns") or []
|
||||
for c in cols:
|
||||
if isinstance(c, dict):
|
||||
yield c, _quality_of(c)
|
||||
|
||||
|
||||
def _summary_block(profile: dict, evaluated: list):
|
||||
"""Table-level KVTable: global score and quality aggregates."""
|
||||
rows = []
|
||||
score = profile.get("quality_score")
|
||||
rows.append(("Calidad global", _fmt_score(score)))
|
||||
rows.append(("Columnas evaluadas", str(len(evaluated))))
|
||||
|
||||
comps = [q.get("completeness") for _, q in evaluated
|
||||
if isinstance(q.get("completeness"), (int, float))]
|
||||
vals = [q.get("validity") for _, q in evaluated
|
||||
if isinstance(q.get("validity"), (int, float))]
|
||||
cons = [q.get("consistency") for _, q in evaluated
|
||||
if isinstance(q.get("consistency"), (int, float))]
|
||||
if comps:
|
||||
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
|
||||
if vals:
|
||||
rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
|
||||
if cons:
|
||||
rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
|
||||
|
||||
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
|
||||
rows.append(("Columnas con problemas", str(n_problem)))
|
||||
|
||||
# Extra table-wide quality signals already in the profile, when present.
|
||||
dup_pct = profile.get("duplicate_pct")
|
||||
if dup_pct is not None:
|
||||
rows.append(("Filas duplicadas", _fmt_unit_pct_or_pct(dup_pct)))
|
||||
null_cell_pct = profile.get("null_cell_pct")
|
||||
if null_cell_pct is not None:
|
||||
rows.append(("Celdas nulas (global)", _fmt_unit_pct_or_pct(null_cell_pct)))
|
||||
constant_cols = profile.get("constant_cols")
|
||||
if isinstance(constant_cols, (list, tuple)) and constant_cols:
|
||||
rows.append(("Columnas constantes", str(len(constant_cols))))
|
||||
all_null_cols = profile.get("all_null_cols")
|
||||
if isinstance(all_null_cols, (list, tuple)) and all_null_cols:
|
||||
rows.append(("Columnas 100% nulas", str(len(all_null_cols))))
|
||||
|
||||
return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _scores_block(evaluated: list):
|
||||
"""DataTable with per-column score and its three-criteria breakdown."""
|
||||
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
|
||||
rows = []
|
||||
# Worst columns first so the reader sees the problems at the top.
|
||||
ordered = sorted(
|
||||
evaluated,
|
||||
key=lambda cq: (cq[1].get("score")
|
||||
if isinstance(cq[1].get("score"), (int, float)) else 101.0),
|
||||
)
|
||||
for col, q in ordered:
|
||||
rows.append([
|
||||
col.get("name") or "(col)",
|
||||
_fmt_score(q.get("score")),
|
||||
_fmt_unit_pct(q.get("completeness")),
|
||||
_fmt_unit_pct(q.get("validity")),
|
||||
_fmt_unit_pct(q.get("consistency")),
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows,
|
||||
title="Scores de calidad por columna",
|
||||
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
|
||||
|
||||
|
||||
def _issues_block(evaluated: list):
|
||||
"""DataTable listing Spanish issues per column, or a Note when there are none."""
|
||||
header = ["Columna", "Problemas detectados (español)"]
|
||||
rows = []
|
||||
for col, q in evaluated:
|
||||
joined = _join_issues(q.get("issues"))
|
||||
if joined:
|
||||
rows.append([col.get("name") or "(col)", joined])
|
||||
if not rows:
|
||||
return model.Note(
|
||||
"No se detectaron problemas de calidad en las columnas evaluadas.")
|
||||
return model.DataTable(header=header, rows=rows,
|
||||
title="Problemas de calidad por columna")
|
||||
|
||||
|
||||
def build_calidad(profile: dict, ctx: dict):
|
||||
"""Build the data-quality Chapter, or None if the profile has no columns.
|
||||
|
||||
Reads everything defensively; returns ``None`` when there are no columns to
|
||||
score (the chapter does not apply), and never raises on a malformed profile.
|
||||
"""
|
||||
profile = profile or {}
|
||||
if not isinstance(profile, dict):
|
||||
profile = {}
|
||||
ctx = ctx or {}
|
||||
|
||||
evaluated = list(_columns_with_quality(profile))
|
||||
if not evaluated:
|
||||
return None # no columns to score -> chapter does not apply.
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Cómo se calcula la calidad", level=2),
|
||||
model.Markdown(text=_CRITERIA_INTRO),
|
||||
_summary_block(profile, evaluated),
|
||||
model.Heading(text="Scores por columna", level=2),
|
||||
]
|
||||
scores = _scores_block(evaluated)
|
||||
if scores is not None:
|
||||
blocks.append(scores)
|
||||
blocks.append(model.Heading(text="Problemas detectados", level=2))
|
||||
blocks.append(_issues_block(evaluated))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
Reference in New Issue
Block a user