"""Data-quality chapter (CALIDAD) for AutomaticEDA. Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The chapter implements the quality model of report 2046: 1. **En qué se basa la calidad** — a concise intro naming the two scored dimensions and their weights (completitud 60%, validez 40%) plus the table-level row uniqueness, BEFORE any number, and stating that outliers are reported as observations and do **not** lower the score. The criteria terms (calidad de datos, completitud, validez, unicidad de registro) are hooked into the shared glossary as clickable jumps; their full definitions live in the GLOSARIO chapter, not inline here. 2. **Scores por columna** — a table with, per column, the total quality score and its breakdown into completeness / validity (no consistency dimension). 3. **Problemas de calidad** — a table listing ONLY real quality defects (nulls, empty cells, values not conforming to their type/semantics). 4. **Observaciones analíticas** — a SEPARATE table for outliers, constant columns, high-cardinality ids and strong skew, with an explicit note that these do not affect the score. The breakdown, issues and observations are NOT recomputed here: they come from the registry function ``column_quality_score`` (group ``eda``), which derives ``{score, completeness, validity, dimensions, applicable, issues, observations}`` from the ColumnProfile. This chapter is render-only. Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". """ from __future__ import annotations from .. import model # Reuse the registry's pure quality function (group ``eda``). Import defensively: # if the package cannot be imported for any reason the chapter degrades to the # per-column ``quality_score`` already present in the profile instead of failing. 
try:  # pragma: no cover - import wiring
    from ...column_quality_score import column_quality_score as _column_quality_score
except Exception:  # noqa: BLE001 - never let an import error abort the document.
    _column_quality_score = None

CHAPTER_VERSION = "2.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"

# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
# collector and marked clickable on their first appearance (contract §11.1).
# Each entry maps a glossary key to a ``(label, definition)`` pair.
_TERMS = {
    "calidad_datos": (
        "Calidad de datos (score 0-100)",
        "Mide hasta qué punto los datos están presentes y son utilizables tal "
        "cual, no si son «buenos para el análisis». Se compone solo de "
        "dimensiones medibles automáticamente desde el perfil de la tabla, sin "
        "fuente externa de verdad: completitud (60%), validez (40%, cuando es "
        "medible) y, a nivel de tabla, unicidad de registro. Los valores "
        "atípicos NO bajan la calidad: se listan aparte como observaciones.",
    ),
    "completitud": (
        "Completitud",
        "Proporción de valores realmente presentes en una columna (1 − % de "
        "nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
        "nulos y vacíos bajan el score porque falta información que debería "
        "estar. Pesa el 60% del score de columna.",
    ),
    "validez": (
        "Validez",
        "Proporción de valores que encajan con su tipo o formato esperado: un "
        "número que parsea, una fecha legible, un email con forma de email. Los "
        "valores que no parsean a su tipo bajan el score. Si la columna es texto "
        "libre sin formato esperado, la validez no se puede medir y el score se "
        "basa solo en la completitud. Pesa el 40% del score cuando es medible.",
    ),
    "unicidad_registro": (
        "Unicidad de registro",
        "A nivel de tabla, las filas duplicadas restan calidad al conjunto "
        "(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
        "repita valores, que no es un defecto de calidad.",
    ),
}

# Cap for the joined cell so a single row never grows taller than a page; the
# remainder is summarized as "(+N más)" instead of being silently dropped.
_ISSUES_MAXLEN = 160

# Placeholder shown wherever a number is missing or not representable.
_PLACEHOLDER = "—"


def _is_finite(num) -> bool:
    """Return True when ``num`` (an int or float) is neither NaN nor ±inf."""
    # NaN is the only value that differs from itself.
    return num == num and num not in (float("inf"), float("-inf"))


def _is_measured(value) -> bool:
    """Return True for int/float values that are finite.

    Used to decide whether a score or dimension can take part in sorting and
    averaging; NaN/inf are treated exactly like a missing value.
    """
    return isinstance(value, (int, float)) and _is_finite(value)


def _fmt_score(value) -> str:
    """Format a 0-100 score as ``NN / 100`` (or a placeholder).

    ``None`` and non-finite numbers render as ``—``; values that cannot be
    converted to a float are shown verbatim via ``str``.
    """
    if value is None:
        return _PLACEHOLDER
    try:
        num = float(value)
    except (TypeError, ValueError, OverflowError):
        return str(value)
    if not _is_finite(num):
        return _PLACEHOLDER
    # One decimal at most, with a trailing ".0" removed (87.0 -> "87").
    text = f"{num:.1f}".rstrip("0").rstrip(".")
    return f"{text} / 100"


def _fmt_unit_pct(value) -> str:
    """Format a 0-1 fraction as a whole-number percentage (``95%``).

    ``None`` and non-finite numbers render as ``—`` (consistent with
    ``_fmt_score``); unconvertible values are shown verbatim via ``str``.
    """
    if value is None:
        return _PLACEHOLDER
    try:
        num = float(value)
    except (TypeError, ValueError, OverflowError):
        return str(value)
    if not _is_finite(num):
        return _PLACEHOLDER
    return f"{num * 100:.0f}%"


def _fmt_validity(value) -> str:
    """Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
    if value is None:
        return "n/a"
    return _fmt_unit_pct(value)


def _quality_of(col: dict) -> dict:
    """Return the quality dict for a column.

    Uses the registry ``column_quality_score`` when available; otherwise falls
    back to the per-column ``quality_score`` already in the profile (number
    only, empty breakdown/issues/observations). Never raises.
    """
    if not isinstance(col, dict):
        col = {}
    if _column_quality_score is not None:
        try:
            res = _column_quality_score(col)
            if isinstance(res, dict):
                return res
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            pass
    return {
        "score": col.get("quality_score"),
        "completeness": None,
        "validity": None,
        "issues": [],
        "observations": [],
    }


def _join_cells(items) -> str:
    """Join Spanish strings into one cell, truncating overly long lists.

    The renderer wraps cell text, but a column with many long entries could
    make a single row taller than a whole page; cap the length and append
    ``(+N más)`` so the count of hidden entries is honest rather than silently
    lost. The first entry is always kept whole, even when it alone exceeds the
    cap. Anything that is not a non-empty list/tuple yields ``""``.
    """
    if not isinstance(items, (list, tuple)) or not items:
        return ""
    parts = [model._safe_str(i).strip() for i in items]
    parts = [p for p in parts if p]
    if not parts:
        return ""
    out = []
    used = 0
    for idx, part in enumerate(parts):
        # Account for the "; " separator on every entry but the first.
        extra = len(part) + (2 if out else 0)
        if used + extra > _ISSUES_MAXLEN and out:
            remaining = len(parts) - idx
            out.append(f"(+{remaining} más)")
            return "; ".join(out)
        out.append(part)
        used += extra
    return "; ".join(out)


def _columns_with_quality(profile: dict):
    """Yield ``(col, quality_dict)`` for every column dict in the profile.

    Entries that are not dicts are skipped. A ``columns`` value that is not
    iterable at all (malformed profile) yields nothing instead of raising.
    """
    cols = profile.get("columns") or []
    try:
        iterator = iter(cols)
    except TypeError:
        return
    for c in iterator:
        if isinstance(c, dict):
            yield c, _quality_of(c)


def _fmt_unit_pct_or_pct(value) -> str:
    """Format a value that may be a 0-1 fraction or an already-0-100 percentage.

    Values ``<= 1.0`` are read as fractions and scaled by 100; larger values
    are taken as percentages already. Non-finite numbers render as ``—``;
    unconvertible values go through ``model._safe_str``.
    """
    try:
        num = float(value)
    except (TypeError, ValueError, OverflowError):
        return model._safe_str(value)
    if not _is_finite(num):
        return _PLACEHOLDER
    pct = num * 100 if num <= 1.0 else num
    text = f"{pct:.1f}".rstrip("0").rstrip(".")
    return f"{text}%"


def _row_uniqueness(profile: dict):
    """Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown.

    ``duplicate_pct`` may be on a 0-1 or a 0-100 scale. A missing,
    unconvertible or non-finite value returns ``None``: a NaN must not be
    clamped into a made-up 100% uniqueness.
    """
    dup = profile.get("duplicate_pct")
    if dup is None:
        return None
    try:
        d = float(dup)
    except (TypeError, ValueError, OverflowError):
        return None
    if not _is_finite(d):
        return None
    if d > 1.0:  # tolerate a 0-100 scale
        d = d / 100.0
    return max(0.0, min(1.0, 1.0 - d))


def _summary_block(profile: dict, evaluated: list):
    """Table-level KVTable: global score and quality aggregates.

    ``evaluated`` is the list of ``(col, quality_dict)`` pairs. Means are
    computed only over finite numeric values, so one NaN cannot turn the whole
    average into NaN. Optional rows appear only when the profile has the
    corresponding key.
    """
    rows = []
    score = profile.get("quality_score")
    rows.append(("Calidad global", _fmt_score(score)))
    rows.append(("Columnas evaluadas", str(len(evaluated))))

    comps = [q.get("completeness") for _, q in evaluated
             if _is_measured(q.get("completeness"))]
    vals = [q.get("validity") for _, q in evaluated
            if _is_measured(q.get("validity"))]
    if comps:
        rows.append(("Completitud media",
                     _fmt_unit_pct(sum(comps) / len(comps))))
    if vals:
        rows.append(("Validez media (donde aplica)",
                     _fmt_unit_pct(sum(vals) / len(vals))))

    ru = _row_uniqueness(profile)
    if ru is not None:
        rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))

    n_problem = sum(1 for _, q in evaluated if q.get("issues"))
    rows.append(("Columnas con problemas", str(n_problem)))

    # Extra table-wide quality signals already in the profile, when present.
    dup_pct = profile.get("duplicate_pct")
    if dup_pct is not None:
        rows.append(("Filas duplicadas", _fmt_unit_pct_or_pct(dup_pct)))
    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(("Celdas nulas (global)",
                     _fmt_unit_pct_or_pct(null_cell_pct)))
    constant_cols = profile.get("constant_cols")
    if isinstance(constant_cols, (list, tuple)) and constant_cols:
        rows.append(("Columnas constantes", str(len(constant_cols))))
    all_null_cols = profile.get("all_null_cols")
    if isinstance(all_null_cols, (list, tuple)) and all_null_cols:
        rows.append(("Columnas 100% nulas", str(len(all_null_cols))))

    return model.KVTable(rows=rows, title="Resumen de calidad")


def _scores_block(evaluated: list):
    """DataTable with per-column score and its completeness/validity breakdown.

    Returns ``None`` when there are no rows. Columns are ordered worst score
    first; columns without a usable (finite numeric) score sort last.
    """
    header = ["Columna", "Calidad", "Completitud", "Validez"]

    def _rank(pair):
        score = pair[1].get("score")
        # 101 sits just above the best possible score, so unknowns go last.
        # NaN is treated as unknown because it cannot be ordered.
        return score if _is_measured(score) else 101.0

    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(evaluated, key=_rank)
    rows = [
        [
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
            _fmt_validity(q.get("validity")),
        ]
        for col, q in ordered
    ]
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
                           note="0 = peor, 100 = mejor; «n/a» = dimensión no "
                                "medible; ordenado de peor a mejor")


def _issues_block(evaluated: list):
    """DataTable listing ONLY real quality defects per column, or a Note.

    Columns whose ``issues`` join to an empty string are omitted; when no
    column has issues a ``Note`` saying so is returned instead of a table.
    """
    header = ["Columna", "Problemas de calidad (español)"]
    rows = []
    for col, q in evaluated:
        joined = _join_cells(q.get("issues"))
        if joined:
            rows.append([col.get("name") or "(col)", joined])
    if not rows:
        return model.Note(
            "No se detectaron problemas de calidad en las columnas evaluadas.")
    return model.DataTable(header=header, rows=rows,
                           title="Problemas de calidad por columna")


def _observations_block(evaluated: list):
    """DataTable listing analytical observations per column, or None.

    Observations (outliers, constant columns, ids, strong skew) are NOT quality
    defects: they do not affect the score. Returned as a separate table from
    the issues so the report never presents a legitimate outlier as a problem.
    """
    header = ["Columna", "Observaciones analíticas"]
    rows = []
    for col, q in evaluated:
        joined = _join_cells(q.get("observations"))
        if joined:
            rows.append([col.get("name") or "(col)", joined])
    if not rows:
        return None
    return model.DataTable(
        header=header, rows=rows,
        title="Observaciones analíticas por columna",
        note="No son defectos de calidad y NO afectan al score; orientan el "
             "análisis (atípicos, columnas constantes, identificadores).")


def _term(key: str, label: str, mark: bool) -> str:
    """Render a term as a clickable glossary span when marking is enabled."""
    if mark:
        return f"[[term:{key}]]**{label}**[[/term]]"
    return f"**{label}**"


def _criteria_intro(mark: bool) -> str:
    """Intro: how the score is composed, with every term marked clickable.

    Concise on purpose: the definitions of each term (calidad de datos,
    completitud, validez, unicidad de registro) now live in the GLOSARIO
    chapter, so the body no longer repeats them — it only states how the score
    is composed and keeps each term marked so it stays a clickable jump.
    """
    calidad = _term("calidad_datos", "calidad de datos", mark)
    completitud = _term("completitud", "completitud", mark)
    validez = _term("validez", "validez", mark)
    unicidad = _term("unicidad_registro", "unicidad de registro", mark)
    return (
        f"La {calidad} de cada columna es un score de 0 a 100 que combina "
        f"{completitud} (peso 60%) y {validez} (peso 40%, cuando es medible); "
        f"a nivel de tabla se añade la {unicidad}. Los valores atípicos no "
        "bajan el score: se listan aparte como **observaciones analíticas**."
    )


def build_calidad(profile: dict, ctx: dict):
    """Build the data-quality Chapter, or None if the profile has no columns.

    Reads everything defensively; returns ``None`` when there are no columns to
    score (the chapter does not apply), and never raises on a malformed profile
    or context: a non-dict ``profile`` is treated as empty and a ``ctx``
    without a ``get`` method is treated as having no glossary.
    """
    profile = profile or {}
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}

    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        return None  # no columns to score -> chapter does not apply.

    # Register the criteria terms in the shared glossary (if present) and mark
    # their first appearance clickable. Contract §11.1.
    glossary = ctx.get("glossary") if hasattr(ctx, "get") else None
    mark = False
    if isinstance(glossary, model.GlossaryCollector):
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)
        mark = True

    blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
        model.Markdown(text=_criteria_intro(mark)),
        _summary_block(profile, evaluated),
        model.Heading(text="Scores por columna", level=2),
    ]
    scores = _scores_block(evaluated)
    if scores is not None:
        blocks.append(scores)

    blocks.append(model.Heading(text="Problemas de calidad", level=2))
    blocks.append(_issues_block(evaluated))

    observations = _observations_block(evaluated)
    if observations is not None:
        blocks.append(model.Heading(text="Observaciones analíticas", level=2))
        blocks.append(model.Note(
            "Las observaciones siguientes NO son defectos de calidad y no "
            "afectan al score: son señales para orientar el análisis."))
        blocks.append(observations)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)