diff --git a/python/functions/datascience/automatic_eda/chapters/calidad.py b/python/functions/datascience/automatic_eda/chapters/calidad.py new file mode 100644 index 00000000..dcedcf6f --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/calidad.py @@ -0,0 +1,266 @@ +"""Data-quality chapter (CALIDAD) for AutomaticEDA. + +Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The +chapter answers, in Spanish and as tables, the three things the user asked for: + +1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and + their weights (completeness, validity, consistency) before any number, plus a + table-level summary (global score and aggregates). +2. **Scores por columna** — a table with, per column, the total quality score and + its breakdown into completeness / validity / consistency. +3. **Problemas en español** — a second table listing, per column, the readable + issues in Spanish (kept separate from the type ``flags``). + +The breakdown and the issues are NOT recomputed here: they come from the registry +function ``column_quality_score`` (group ``eda``), which already derives +``{score, completeness, validity, consistency, issues}`` from the ColumnProfile. +This chapter is render-only — it consumes that function and lays the result out +as model blocks; the renderers paginate tables (splitting by rows, repeating the +header) and wrap long cells so nothing is ever cut. + +Contract: build_calidad(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +# Reuse the registry's pure quality function (group ``eda``). Import defensively: +# if the package cannot be imported for any reason the chapter degrades to the +# per-column ``quality_score`` already present in the profile instead of failing. 
# Best-effort import of the registry quality function. Any failure is swallowed
# on purpose: the chapter then falls back to the per-column ``quality_score``
# stored in the profile (see ``_quality_of``).
try:  # pragma: no cover - import wiring
    from ...column_quality_score import column_quality_score as _column_quality_score
except Exception:  # noqa: BLE001 - never let an import error abort the document.
    _column_quality_score = None

CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"

# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
# consistency 0.2. Kept here only to render the human explanation; the actual
# numbers always come from the function so the two never drift in computation.
_CRITERIA_INTRO = (
    "La calidad de cada columna es un score de 0 a 100 que combina tres "
    "criterios, cada uno con un peso:\n\n"
    "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
    "ni vacíos). Una columna con muchos nulos baja de score.\n"
    "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
    "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
    "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
    "columnas constantes o identificadores de cardinalidad muy alta).\n\n"
    "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
    "Los problemas detectados por columna se listan en español más abajo."
)

# Cap for the joined issues cell so a single row never grows taller than a page;
# the remainder is summarized as "(+N más)" instead of being silently dropped.
_ISSUES_MAXLEN = 160


def _is_number(value) -> bool:
    """Return True when ``value`` is an ``int``/``float`` that is not NaN.

    NaN is rejected because it would poison averages and make sort keys
    incomparable (every comparison against NaN is False).
    """
    return isinstance(value, (int, float)) and value == value


def _fmt_score(value) -> str:
    """Format a 0-100 score as ``NN / 100``.

    Returns the placeholder ``"—"`` for ``None`` and NaN, and ``str(value)``
    when the value cannot be converted to ``float``.
    """
    if value is None:
        return "—"
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num != num:  # NaN
        return "—"
    # One decimal at most; drop a trailing ".0" so 100.0 renders as "100".
    text = f"{num:.1f}".rstrip("0").rstrip(".")
    return f"{text} / 100"


def _fmt_unit_pct(value) -> str:
    """Format a 0-1 fraction as a whole percentage (``0.95`` -> ``95%``).

    Returns ``"—"`` for ``None`` and NaN (consistent with ``_fmt_score``), and
    ``str(value)`` when the value cannot be converted to ``float``.
    """
    if value is None:
        return "—"
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num != num:  # NaN would otherwise render as "nan%".
        return "—"
    return f"{num * 100:.0f}%"


def _fmt_unit_pct_or_pct(value) -> str:
    """Format a value that may be a 0-1 fraction or an already-0-100 percentage.

    Values ``<= 1.0`` are treated as fractions and multiplied by 100; larger
    values are assumed to be percentages already. NaN renders as ``"—"``;
    non-numeric input is passed through ``model._safe_str``.
    """
    try:
        num = float(value)
    except (TypeError, ValueError):
        return model._safe_str(value)
    if num != num:  # NaN
        return "—"
    pct = num * 100 if num <= 1.0 else num
    text = f"{pct:.1f}".rstrip("0").rstrip(".")
    return f"{text}%"


def _quality_of(col: dict) -> dict:
    """Return ``{score, completeness, validity, consistency, issues}`` for a column.

    Uses the registry ``column_quality_score`` when it was importable and
    returns a dict; otherwise falls back to the per-column ``quality_score``
    already in the profile (number only, empty breakdown/issues). Errors raised
    by the registry function are swallowed so one bad column cannot abort the
    chapter.
    """
    if not isinstance(col, dict):
        col = {}
    if _column_quality_score is not None:
        try:
            res = _column_quality_score(col)
            if isinstance(res, dict):
                return res
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            pass
    # Fallback: only the final score is available pre-computed in the profile.
    return {
        "score": col.get("quality_score"),
        "completeness": None,
        "validity": None,
        "consistency": None,
        "issues": [],
    }


def _join_issues(issues) -> str:
    """Join issue strings into one ``"; "``-separated cell, capping its length.

    Only ``list``/``tuple`` inputs are accepted; anything else yields ``""``.
    Blank entries are dropped. The first issue is always kept; once adding the
    next one would exceed ``_ISSUES_MAXLEN`` characters, the rest are replaced
    by a single ``(+N más)`` marker so the hidden count stays visible.
    """
    if not isinstance(issues, (list, tuple)) or not issues:
        return ""
    parts = [model._safe_str(i).strip() for i in issues]
    parts = [p for p in parts if p]
    if not parts:
        return ""
    out = []
    used = 0
    for idx, part in enumerate(parts):
        # +2 accounts for the "; " separator before every part but the first.
        extra = len(part) + (2 if out else 0)
        if used + extra > _ISSUES_MAXLEN and out:
            remaining = len(parts) - idx
            out.append(f"(+{remaining} más)")
            return "; ".join(out)
        out.append(part)
        used += extra
    return "; ".join(out)


def _columns_with_quality(profile: dict):
    """Yield ``(col, quality_dict)`` for every column dict in the profile.

    Non-dict entries are skipped. A ``columns`` value that is not iterable at
    all (e.g. a number) yields nothing instead of raising, so the caller's
    "never raises on a malformed profile" contract holds.
    """
    cols = profile.get("columns") or []
    try:
        iterator = iter(cols)
    except TypeError:
        return
    for c in iterator:
        if isinstance(c, dict):
            yield c, _quality_of(c)


def _mean_of(evaluated: list, key: str):
    """Return the mean of ``quality[key]`` over ``evaluated``, or ``None``.

    Only real, non-NaN numbers take part; ``None`` is returned when no column
    provides a usable value for ``key``.
    """
    values = [q.get(key) for _, q in evaluated if _is_number(q.get(key))]
    if not values:
        return None
    return sum(values) / len(values)


def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable``: global score and quality aggregates.

    ``evaluated`` is the list of ``(col, quality_dict)`` pairs. Mean rows are
    emitted only for criteria that have at least one numeric value; the extra
    table-wide signals are emitted only when present in ``profile``.
    """
    rows = [
        ("Calidad global", _fmt_score(profile.get("quality_score"))),
        ("Columnas evaluadas", str(len(evaluated))),
    ]

    for label, key in (
        ("Completitud media", "completeness"),
        ("Validez media", "validity"),
        ("Consistencia media", "consistency"),
    ):
        mean = _mean_of(evaluated, key)
        if mean is not None:
            rows.append((label, _fmt_unit_pct(mean)))

    n_problem = sum(1 for _, q in evaluated if q.get("issues"))
    rows.append(("Columnas con problemas", str(n_problem)))

    # Extra table-wide quality signals already in the profile, when present.
    dup_pct = profile.get("duplicate_pct")
    if dup_pct is not None:
        rows.append(("Filas duplicadas", _fmt_unit_pct_or_pct(dup_pct)))
    null_cell_pct = profile.get("null_cell_pct")
    if null_cell_pct is not None:
        rows.append(("Celdas nulas (global)", _fmt_unit_pct_or_pct(null_cell_pct)))
    constant_cols = profile.get("constant_cols")
    if isinstance(constant_cols, (list, tuple)) and constant_cols:
        rows.append(("Columnas constantes", str(len(constant_cols))))
    all_null_cols = profile.get("all_null_cols")
    if isinstance(all_null_cols, (list, tuple)) and all_null_cols:
        rows.append(("Columnas 100% nulas", str(len(all_null_cols))))

    return model.KVTable(rows=rows, title="Resumen de calidad")


def _score_sort_key(pair):
    """Sort key for ``(col, quality_dict)``: the score, or 101.0 when unusable.

    Missing, non-numeric and NaN scores sort after every valid 0-100 score.
    """
    score = pair[1].get("score")
    return score if _is_number(score) else 101.0


def _scores_block(evaluated: list):
    """Build the per-column scores ``DataTable``, or ``None`` without rows.

    Rows carry the total score plus its completeness / validity / consistency
    breakdown and are ordered worst score first.
    """
    header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(evaluated, key=_score_sort_key)
    rows = [
        [
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
            _fmt_unit_pct(q.get("validity")),
            _fmt_unit_pct(q.get("consistency")),
        ]
        for col, q in ordered
    ]
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
                           note="0 = peor, 100 = mejor; ordenado de peor a mejor")


def _issues_block(evaluated: list):
    """Build the issues ``DataTable``, or a ``Note`` when no column has issues.

    Columns whose joined issue text is empty are left out of the table.
    """
    header = ["Columna", "Problemas detectados (español)"]
    rows = []
    for col, q in evaluated:
        joined = _join_issues(q.get("issues"))
        if joined:
            rows.append([col.get("name") or "(col)", joined])
    if not rows:
        return model.Note(
            "No se detectaron problemas de calidad en las columnas evaluadas.")
    return model.DataTable(header=header, rows=rows,
                           title="Problemas de calidad por columna")


def build_calidad(profile: dict, ctx: dict):
    """Build the data-quality Chapter, or None if the profile has no columns.

    Args:
        profile: Table profile; anything that is not a dict is treated as empty.
        ctx: Build context. Accepted for the chapter contract; this chapter
            does not read it.

    Returns:
        A ``model.Chapter`` with the criteria intro, the summary table, the
        per-column scores table and the issues table/note, or ``None`` when
        there are no column dicts to score.
    """
    if not isinstance(profile, dict):
        profile = {}

    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        return None  # no columns to score -> chapter does not apply.

    blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
        model.Markdown(text=_CRITERIA_INTRO),
        _summary_block(profile, evaluated),
        model.Heading(text="Scores por columna", level=2),
    ]
    scores = _scores_block(evaluated)
    if scores is not None:
        blocks.append(scores)
    blocks.append(model.Heading(text="Problemas detectados", level=2))
    blocks.append(_issues_block(evaluated))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.

Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
and deterministic. Verifies that the chapter explains the quality criteria, shows
per-column scores with the completeness/validity/consistency breakdown, lists the
issues in Spanish (separate from the type flags), returns None when it does not
apply, and that a wide profile with long names renders to PDF and PPTX without
cutting any cell text (long content wraps, it is never truncated).
"""

import os
import re
import tempfile

from pypdf import PdfReader
from pptx import Presentation

from datascience.automatic_eda.chapters.calidad import (
    build_calidad,
    CHAPTER_VERSION,
)
from datascience.automatic_eda import build_document, render_pdf, render_pptx


def _profile() -> dict:
    """Return a small profile with one column per quality problem.

    Covers nulls, outliers, a constant column and a high-cardinality id, plus
    one clean column.
    """
    return {
        "table": "demo",
        "quality_score": 72.5,
        "duplicate_pct": 0.04,
        "null_cell_pct": 0.11,
        "constant_cols": ["flag_const"],
        "all_null_cols": [],
        "columns": [
            {"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
             "numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
             "quality_score": 60},
            {"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
             "unique_pct": 0.98, "quality_score": 80},
            {"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
             "flags": ["constant"], "quality_score": 50},
            {"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}, "quality_score": 100},
        ],
    }


def _tables(chapter):
    """Return the chapter blocks whose ``kind`` is ``"data_table"``."""
    return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]


def _scores_table(chapter):
    """Return the first data table whose title contains "Scores", else None."""
    for t in _tables(chapter):
        if "Scores" in (t.title or ""):
            return t
    return None


def _issues_table(chapter):
    """Return the first data table whose title contains "Problemas", else None."""
    for t in _tables(chapter):
        if "Problemas" in (t.title or ""):
            return t
    return None


# --------------------------------------------------------------------------- #
# Golden
# --------------------------------------------------------------------------- #
def test_golden_chapter_estructura_y_version():
    ch = build_calidad(_profile(), {})
    assert ch is not None
    assert ch.id == "calidad"
    assert ch.version == CHAPTER_VERSION
    kinds = [b.kind for b in ch.blocks]
    # intro heading + markdown criteria + summary kv + scores table + issues table
    assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds


def test_golden_intro_explica_criterios_y_pesos():
    ch = build_calidad(_profile(), {})
    intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
    for needle in ("Completitud", "Validez", "Consistencia",
                   "50%", "30%", "20%"):
        assert needle in intro, f"falta {needle!r} en la intro de criterios"


def test_golden_scores_incluyen_desglose_por_criterio():
    ch = build_calidad(_profile(), {})
    scores = _scores_table(ch)
    assert scores is not None
    assert scores.header == ["Columna", "Calidad", "Completitud",
                             "Validez", "Consistencia"]
    # 4 columns scored, none dropped.
    assert len(scores.rows) == 4
    names = {r[0] for r in scores.rows}
    assert names == {"edad", "nombre", "flag_const", "limpia"}


def test_golden_issues_en_espanol_separados_de_flags():
    ch = build_calidad(_profile(), {})
    issues = _issues_table(ch)
    assert issues is not None
    flat = " | ".join(" ".join(r) for r in issues.rows)
    assert "nulos" in flat            # completeness issue (ES)
    assert "outliers" in flat         # validity issue (ES)
    assert "columna constante" in flat
    assert "posible id de alta cardinalidad" in flat
    # The raw type flag string must NOT leak as a "problem". A plain substring
    # check cannot tell the flag from the Spanish word "constante", so match
    # the flag only as a standalone word.
    assert not re.search(r"\bconstant\b", flat)


# --------------------------------------------------------------------------- #
# Edges
# --------------------------------------------------------------------------- #
def test_edge_none_vacio_sin_columnas_devuelve_none():
    assert build_calidad(None, None) is None
    assert build_calidad({}, {}) is None
    assert build_calidad({"columns": []}, {}) is None
    assert build_calidad("not a dict", {}) is None


def test_edge_perfil_limpio_sin_problemas_usa_nota():
    prof = {
        "quality_score": 100,
        "columns": [
            {"name": "a", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}},
            {"name": "b", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}},
        ],
    }
    ch = build_calidad(prof, {})
    assert ch is not None
    assert _issues_table(ch) is None  # no issues table
    notes = [b for b in ch.blocks if b.kind == "note"]
    assert notes and "No se detectaron problemas" in notes[0].text


# --------------------------------------------------------------------------- #
# Anti-cut: a wide profile with long names renders without truncation
# --------------------------------------------------------------------------- #
def _wide_profile(ncols: int = 22) -> dict:
    """Return a profile with ``ncols`` long-named columns.

    The first two are an id-like text column and a constant column; the rest
    are numeric columns with nulls and outliers.
    """
    cols = [
        {"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
        {"name": "columna_constante_sin_ninguna_variacion_de_valor",
         "inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
    ]
    for k in range(ncols - 2):
        cols.append({
            "name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
            "inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
            "numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
        })
    return {"table": "ancha", "quality_score": 70.0, "columns": cols}


def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    prof = _wide_profile(22)
    full = build_document(prof, {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in full)
    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    chapters = [c for c in full if c.id == "calidad"]
    long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "q.pdf")
        pptx = os.path.join(d, "q.pptx")
        rp = render_pdf(chapters, pdf, {"title": "EDA"})
        render_pptx(chapters, pptx, {"title": "EDA"})
        assert os.path.exists(pdf) and os.path.exists(pptx)
        # The wide table forces pagination across several pages/slides.
        assert (rp or {}).get("n_pages", 0) >= 2

        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
        # and there is no truncation marker.
        pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
        assert "…" not in pdf_txt and "..." not in pdf_txt
        norm = re.sub(r"\s+", "", pdf_txt)
        assert long_name in norm, "el nombre largo se cortó en el PDF"

        # PPTX: long name present in some cell, untruncated.
        allt = []
        for s in Presentation(pptx).slides:
            for sh in s.shapes:
                if sh.has_text_frame:
                    allt.append(sh.text_frame.text)
                if sh.has_table:
                    for row in sh.table.rows:
                        for c in row.cells:
                            allt.append(c.text)
        joined = re.sub(r"\s+", "", "\n".join(allt))
        assert long_name in joined, "el nombre largo se cortó en el PPTX"