"""Data-quality chapter (CALIDAD) for AutomaticEDA. Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The chapter answers, in Spanish and as tables, the three things the user asked for: 1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and their weights (completeness, validity, consistency) before any number, plus a table-level summary (global score and aggregates). 2. **Scores por columna** — a table with, per column, the total quality score and its breakdown into completeness / validity / consistency. 3. **Problemas en español** — a second table listing, per column, the readable issues in Spanish (kept separate from the type ``flags``). The breakdown and the issues are NOT recomputed here: they come from the registry function ``column_quality_score`` (group ``eda``), which already derives ``{score, completeness, validity, consistency, issues}`` from the ColumnProfile. This chapter is render-only — it consumes that function and lays the result out as model blocks; the renderers paginate tables (splitting by rows, repeating the header) and wrap long cells so nothing is ever cut. Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". """ from __future__ import annotations from .. import model # Reuse the registry's pure quality function (group ``eda``). Import defensively: # if the package cannot be imported for any reason the chapter degrades to the # per-column ``quality_score`` already present in the profile instead of failing. try: # pragma: no cover - import wiring from ...column_quality_score import column_quality_score as _column_quality_score except Exception: # noqa: BLE001 - never let an import error abort the document. _column_quality_score = None CHAPTER_VERSION = "1.0.0" CHAPTER_ID = "calidad" CHAPTER_TITLE = "Calidad" # Weights mirror column_quality_score: completeness 0.5, validity 0.3, # consistency 0.2. Kept here only to render the human explanation; the actual # numbers always come from the function so the two never drift in computation. _CRITERIA_INTRO = ( "La calidad de cada columna es un score de 0 a 100 que combina tres " "criterios, cada uno con un peso:\n\n" "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos " "ni vacíos). Una columna con muchos nulos baja de score.\n" "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango " "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n" "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza " "columnas constantes o identificadores de cardinalidad muy alta).\n\n" "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). " "Los problemas detectados por columna se listan en español más abajo." ) # Cap for the joined issues cell so a single row never grows taller than a page; # the remainder is summarized as "(+N más)" instead of being silently dropped. _ISSUES_MAXLEN = 160 def _fmt_score(value) -> str: """Format a 0-100 score as ``NN / 100`` (or a placeholder).""" if value is None: return "—" try: num = float(value) except (TypeError, ValueError): return str(value) if num != num: # NaN return "—" text = f"{num:.1f}".rstrip("0").rstrip(".") return f"{text} / 100" def _fmt_unit_pct(value) -> str: """Format a 0-1 fraction as a percentage (``95%``).""" if value is None: return "—" try: return f"{float(value) * 100:.0f}%" except (TypeError, ValueError): return str(value) def _quality_of(col: dict) -> dict: """Return ``{score, completeness, validity, consistency, issues}`` for a column. Uses the registry ``column_quality_score`` when available; otherwise falls back to the per-column ``quality_score`` already in the profile (number only, empty breakdown/issues). Never raises. """ if not isinstance(col, dict): col = {} if _column_quality_score is not None: try: res = _column_quality_score(col) if isinstance(res, dict): return res except Exception: # noqa: BLE001 - degrade instead of aborting. pass # Fallback: only the final score is available pre-computed in the profile. return { "score": col.get("quality_score"), "completeness": None, "validity": None, "consistency": None, "issues": [], } def _join_issues(issues) -> str: """Join Spanish issue strings into one cell, truncating overly long lists. The renderer wraps cell text, but a column with many long issues could make a single row taller than a whole page; cap the length and append ``(+N más)`` so the count of hidden issues is honest rather than silently lost. """ if not isinstance(issues, (list, tuple)) or not issues: return "" parts = [model._safe_str(i).strip() for i in issues] parts = [p for p in parts if p] if not parts: return "" out = [] used = 0 for idx, part in enumerate(parts): extra = len(part) + (2 if out else 0) if used + extra > _ISSUES_MAXLEN and out: remaining = len(parts) - idx out.append(f"(+{remaining} más)") return "; ".join(out) out.append(part) used += extra return "; ".join(out) def _columns_with_quality(profile: dict): """Yield ``(col, quality_dict)`` for every column dict in the profile.""" cols = profile.get("columns") or [] for c in cols: if isinstance(c, dict): yield c, _quality_of(c) def _summary_block(profile: dict, evaluated: list): """Table-level KVTable: global score and quality aggregates.""" rows = [] score = profile.get("quality_score") rows.append(("Calidad global", _fmt_score(score))) rows.append(("Columnas evaluadas", str(len(evaluated)))) comps = [q.get("completeness") for _, q in evaluated if isinstance(q.get("completeness"), (int, float))] vals = [q.get("validity") for _, q in evaluated if isinstance(q.get("validity"), (int, float))] cons = [q.get("consistency") for _, q in evaluated if isinstance(q.get("consistency"), (int, float))] if comps: rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps)))) if vals: rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals)))) if cons: rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons)))) n_problem = sum(1 for _, q in evaluated if q.get("issues")) rows.append(("Columnas con problemas", str(n_problem))) # Extra table-wide quality signals already in the profile, when present. dup_pct = profile.get("duplicate_pct") if dup_pct is not None: rows.append(("Filas duplicadas", _fmt_unit_pct_or_pct(dup_pct))) null_cell_pct = profile.get("null_cell_pct") if null_cell_pct is not None: rows.append(("Celdas nulas (global)", _fmt_unit_pct_or_pct(null_cell_pct))) constant_cols = profile.get("constant_cols") if isinstance(constant_cols, (list, tuple)) and constant_cols: rows.append(("Columnas constantes", str(len(constant_cols)))) all_null_cols = profile.get("all_null_cols") if isinstance(all_null_cols, (list, tuple)) and all_null_cols: rows.append(("Columnas 100% nulas", str(len(all_null_cols)))) return model.KVTable(rows=rows, title="Resumen de calidad") def _fmt_unit_pct_or_pct(value) -> str: """Format a value that may be a 0-1 fraction or an already-0-100 percentage.""" try: num = float(value) except (TypeError, ValueError): return model._safe_str(value) if num != num: # NaN return "—" pct = num * 100 if num <= 1.0 else num text = f"{pct:.1f}".rstrip("0").rstrip(".") return f"{text}%" def _scores_block(evaluated: list): """DataTable with per-column score and its three-criteria breakdown.""" header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"] rows = [] # Worst columns first so the reader sees the problems at the top. ordered = sorted( evaluated, key=lambda cq: (cq[1].get("score") if isinstance(cq[1].get("score"), (int, float)) else 101.0), ) for col, q in ordered: rows.append([ col.get("name") or "(col)", _fmt_score(q.get("score")), _fmt_unit_pct(q.get("completeness")), _fmt_unit_pct(q.get("validity")), _fmt_unit_pct(q.get("consistency")), ]) if not rows: return None return model.DataTable(header=header, rows=rows, title="Scores de calidad por columna", note="0 = peor, 100 = mejor; ordenado de peor a mejor") def _issues_block(evaluated: list): """DataTable listing Spanish issues per column, or a Note when there are none.""" header = ["Columna", "Problemas detectados (español)"] rows = [] for col, q in evaluated: joined = _join_issues(q.get("issues")) if joined: rows.append([col.get("name") or "(col)", joined]) if not rows: return model.Note( "No se detectaron problemas de calidad en las columnas evaluadas.") return model.DataTable(header=header, rows=rows, title="Problemas de calidad por columna") def build_calidad(profile: dict, ctx: dict): """Build the data-quality Chapter, or None if the profile has no columns. Reads everything defensively; returns ``None`` when there are no columns to score (the chapter does not apply), and never raises on a malformed profile. """ profile = profile or {} if not isinstance(profile, dict): profile = {} ctx = ctx or {} evaluated = list(_columns_with_quality(profile)) if not evaluated: return None # no columns to score -> chapter does not apply. blocks = [ model.Heading(text="Cómo se calcula la calidad", level=2), model.Markdown(text=_CRITERIA_INTRO), _summary_block(profile, evaluated), model.Heading(text="Scores por columna", level=2), ] scores = _scores_block(evaluated) if scores is not None: blocks.append(scores) blocks.append(model.Heading(text="Problemas detectados", level=2)) blocks.append(_issues_block(evaluated)) return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks)