merge: capitulo AutomaticEDA calidad (verificado met)

2026-06-30 15:10:22 +02:00
parent 9286e3b6b1 d412522db9
commit d479a8e4e2
2 changed files with 460 additions and 0 deletions
@@ -0,0 +1,266 @@
 """Data-quality chapter (CALIDAD) for AutomaticEDA.
 Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
 chapter answers, in Spanish and as tables, the three things the user asked for:
 1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
   their weights (completeness, validity, consistency) before any number, plus a
   table-level summary (global score and aggregates).
 2. **Scores por columna** — a table with, per column, the total quality score and
   its breakdown into completeness / validity / consistency.
 3. **Problemas en español** — a second table listing, per column, the readable
   issues in Spanish (kept separate from the type ``flags``).
 The breakdown and the issues are NOT recomputed here: they come from the registry
 function ``column_quality_score`` (group ``eda``), which already derives
 ``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
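 This chapter is render-only — it consumes that function and lays the result out
 as model blocks; the renderers paginate tables (splitting by rows, repeating the
 header) and wrap long cells so cell text is not cut by the layout. The one
 deliberate exception is the joined issues cell: ``_join_issues`` stops adding
 issues once the cell would exceed ``_ISSUES_MAXLEN`` characters and reports the
 omitted ones explicitly as ``(+N más)``.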
 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
 from __future__ import annotations
 from .. import model
 # Reuse the registry's pure quality function (group ``eda``). Import defensively:
 # if the package cannot be imported for any reason the chapter degrades to the
 # per-column ``quality_score`` already present in the profile instead of failing.
 # The handler catches every ``Exception`` (not only ``ImportError``), and leaves
 # ``_column_quality_score`` set to ``None``; ``_quality_of`` tests for that
 # ``None`` to choose its fallback path.
 try:  # pragma: no cover - import wiring
    from ...column_quality_score import column_quality_score as _column_quality_score
 except Exception:  # noqa: BLE001 - never let an import error abort the document.
    _column_quality_score = None
 # Chapter identity handed to ``model.Chapter`` by ``build_calidad``: the
 # ``"x.y.z"`` version string, the chapter id and the title shown to the reader.
 CHAPTER_VERSION = "1.0.0"
 CHAPTER_ID = "calidad"
 CHAPTER_TITLE = "Calidad"
 # Weights mirror column_quality_score: completeness 0.5, validity 0.3,
 # consistency 0.2. Kept here only to render the human explanation; the actual
 # numbers come from the function (or, on the fallback path, from the profile's
 # pre-computed ``quality_score``) so the two never drift in computation.
 # NOTE(review): the weights are duplicated here as user-facing text — confirm
 # they still match column_quality_score whenever that function changes.
 _CRITERIA_INTRO = (
    "La calidad de cada columna es un score de 0 a 100 que combina tres "
    "criterios, cada uno con un peso:\n\n"
    "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
    "ni vacíos). Una columna con muchos nulos baja de score.\n"
    "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
    "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
    "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
    "columnas constantes o identificadores de cardinalidad muy alta).\n\n"
    "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
    "Los problemas detectados por columna se listan en español más abajo."
 )
 # Cap (in characters) for the joined issues cell so a single row never grows
 # taller than a page; the remainder is summarized as "(+N más)" instead of being
 # silently dropped. ``_join_issues`` always keeps the first issue whole, even if
 # that issue alone is longer than this cap.
 _ISSUES_MAXLEN = 160
 def _fmt_score(value) -> str:
    """Format a 0-100 score as ``NN / 100`` (or a placeholder)."""
    if value is None:
        return "—"
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num != num:  # NaN
        return "—"
    text = f"{num:.1f}".rstrip("0").rstrip(".")
    return f"{text} / 100"
 def _fmt_unit_pct(value) -> str:
    """Format a 0-1 fraction as a percentage (``95%``)."""
    if value is None:
        return "—"
    try:
        return f"{float(value) * 100:.0f}%"
    except (TypeError, ValueError):
        return str(value)
 def _quality_of(col: dict) -> dict:
    """Return ``{score, completeness, validity, consistency, issues}`` for a column.
    Prefers the registry ``column_quality_score`` when it was imported and
    returns a dict. If it is unavailable, raises, or returns something else,
    the result is built from the profile's own ``quality_score`` with an empty
    breakdown and no issues. A non-dict ``col`` is treated as an empty column.
    Never raises.
    """
    column = col if isinstance(col, dict) else {}
    scorer = _column_quality_score
    if scorer is not None:
        try:
            computed = scorer(column)
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            computed = None
        if isinstance(computed, dict):
            return computed
    # Fallback: the profile only carries the final number, not its breakdown.
    return dict(
        score=column.get("quality_score"),
        completeness=None,
        validity=None,
        consistency=None,
        issues=[],
    )
 def _join_issues(issues) -> str:
    """Join Spanish issue strings into one cell, truncating overly long lists.
    The renderer wraps cell text, but a column with many long issues could make a
    single row taller than a whole page; cap the length and append ``(+N más)``
    so the count of hidden issues is honest rather than silently lost.
    """
    if not isinstance(issues, (list, tuple)) or not issues:
        return ""
    parts = [model._safe_str(i).strip() for i in issues]
    parts = [p for p in parts if p]
    if not parts:
        return ""
    out = []
    used = 0
    for idx, part in enumerate(parts):
        extra = len(part) + (2 if out else 0)
        if used + extra > _ISSUES_MAXLEN and out:
            remaining = len(parts) - idx
            out.append(f"(+{remaining} más)")
            return "; ".join(out)
        out.append(part)
        used += extra
    return "; ".join(out)
 def _columns_with_quality(profile: dict):
    """Yield ``(col, quality_dict)`` for every column dict in the profile."""
    cols = profile.get("columns") or []
    for c in cols:
        if isinstance(c, dict):
            yield c, _quality_of(c)
 def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable`` with the global score and aggregates.
    Rows, in order: global score, number of evaluated columns, the mean of each
    criterion (only when at least one column has a numeric value for it), the
    number of columns with issues, and then the optional table-wide signals
    read from the profile (duplicate rows, null cells, constant columns,
    all-null columns) when they are present.
    """

    def _mean_of(key):
        # Average of the numeric values stored under ``key``; None if there are none.
        nums = [q.get(key) for _, q in evaluated
                if isinstance(q.get(key), (int, float))]
        return sum(nums) / len(nums) if nums else None

    rows = [
        ("Calidad global", _fmt_score(profile.get("quality_score"))),
        ("Columnas evaluadas", str(len(evaluated))),
    ]
    criteria = (
        ("Completitud media", "completeness"),
        ("Validez media", "validity"),
        ("Consistencia media", "consistency"),
    )
    for label, key in criteria:
        mean = _mean_of(key)
        if mean is not None:
            rows.append((label, _fmt_unit_pct(mean)))
    with_issues = [q for _, q in evaluated if q.get("issues")]
    rows.append(("Columnas con problemas", str(len(with_issues))))
    # Extra table-wide quality signals already in the profile, when present.
    ratios = (
        ("Filas duplicadas", "duplicate_pct"),
        ("Celdas nulas (global)", "null_cell_pct"),
    )
    for label, key in ratios:
        ratio = profile.get(key)
        if ratio is not None:
            rows.append((label, _fmt_unit_pct_or_pct(ratio)))
    name_lists = (
        ("Columnas constantes", "constant_cols"),
        ("Columnas 100% nulas", "all_null_cols"),
    )
    for label, key in name_lists:
        names = profile.get(key)
        if isinstance(names, (list, tuple)) and names:
            rows.append((label, str(len(names))))
    return model.KVTable(rows=rows, title="Resumen de calidad")
 def _fmt_unit_pct_or_pct(value) -> str:
    """Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
    try:
        num = float(value)
    except (TypeError, ValueError):
        return model._safe_str(value)
    if num != num:  # NaN
        return "—"
    pct = num * 100 if num <= 1.0 else num
    text = f"{pct:.1f}".rstrip("0").rstrip(".")
    return f"{text}%"
 def _scores_block(evaluated: list):
    """DataTable with per-column score and its three-criteria breakdown."""
    header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
    rows = []
    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(
        evaluated,
        key=lambda cq: (cq[1].get("score")
                        if isinstance(cq[1].get("score"), (int, float)) else 101.0),
    )
    for col, q in ordered:
        rows.append([
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
            _fmt_unit_pct(q.get("validity")),
            _fmt_unit_pct(q.get("consistency")),
        ])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
                           note="0 = peor, 100 = mejor; ordenado de peor a mejor")
 def _issues_block(evaluated: list):
    """Build the per-column issues table, or a Note when no column has issues.
    A column gets a row only if ``_join_issues`` returns non-empty text for its
    ``issues``; columns without a name are labelled ``(col)``.
    """
    rows = []
    for col, q in evaluated:
        text = _join_issues(q.get("issues"))
        if not text:
            continue
        rows.append([col.get("name") or "(col)", text])
    if rows:
        return model.DataTable(
            header=["Columna", "Problemas detectados (español)"],
            rows=rows,
            title="Problemas de calidad por columna",
        )
    return model.Note(
        "No se detectaron problemas de calidad en las columnas evaluadas.")
 def build_calidad(profile: dict, ctx: dict):
    """Assemble the data-quality Chapter, or return ``None`` without columns.
    A falsy or non-dict ``profile`` is treated as empty. When no column dict can
    be evaluated the chapter does not apply and ``None`` is returned. Otherwise
    the chapter contains, in order: the criteria explanation, the summary
    table, the per-column scores table (when it has rows) and the issues table
    or note.
    """
    if not profile or not isinstance(profile, dict):
        profile = {}
    # ``ctx`` is normalised for the build_<id>(profile, ctx) contract but is not
    # read anywhere in this function.
    ctx = ctx or {}
    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        return None  # no columns to score -> chapter does not apply.
    blocks = []
    blocks.append(model.Heading(text="Cómo se calcula la calidad", level=2))
    blocks.append(model.Markdown(text=_CRITERIA_INTRO))
    blocks.append(_summary_block(profile, evaluated))
    blocks.append(model.Heading(text="Scores por columna", level=2))
    scores = _scores_block(evaluated)
    if scores is not None:
        blocks.append(scores)
    blocks.extend([
        model.Heading(text="Problemas detectados", level=2),
        _issues_block(evaluated),
    ])
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,194 @@
 """Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
 Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
 and deterministic. Verifies that the chapter explains the quality criteria, shows
 per-column scores with the completeness/validity/consistency breakdown, lists the
 issues in Spanish (separate from the type flags), returns None when it does not
 apply, and that a wide profile with long names renders to PDF and PPTX without
 cutting any cell text (long content wraps, it is never truncated).
 """
 import os
 import re
 import tempfile
 from pypdf import PdfReader
 from pptx import Presentation
 from datascience.automatic_eda.chapters.calidad import (
    build_calidad,
    CHAPTER_VERSION,
 )
 from datascience.automatic_eda import build_document, render_pdf, render_pptx
 def _profile() -> dict:
    """A small profile with one column per quality problem (nulls, outliers,
    constant, high-cardinality id) plus one clean column."""
    return {
        "table": "demo",
        "quality_score": 72.5,
        "duplicate_pct": 0.04,
        "null_cell_pct": 0.11,
        "constant_cols": ["flag_const"],
        "all_null_cols": [],
        "columns": [
            {"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
             "numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
             "quality_score": 60},
            {"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
             "unique_pct": 0.98, "quality_score": 80},
            {"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
             "flags": ["constant"], "quality_score": 50},
            {"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}, "quality_score": 100},
        ],
    }
 def _tables(chapter):
    return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
 def _scores_table(chapter):
    """Return the first data table whose title contains ``"Scores"``, else None."""
    matches = (t for t in _tables(chapter) if "Scores" in (t.title or ""))
    return next(matches, None)
 def _issues_table(chapter):
    """Return the first data table whose title contains ``"Problemas"``, else None."""
    matches = (t for t in _tables(chapter) if "Problemas" in (t.title or ""))
    return next(matches, None)
 # --------------------------------------------------------------------------- #
 # Golden
 # --------------------------------------------------------------------------- #
 def test_golden_chapter_estructura_y_version():
    """The golden profile yields a chapter with the expected id, version and block kinds."""
    chapter = build_calidad(_profile(), {})
    assert chapter is not None
    assert chapter.id == "calidad"
    assert chapter.version == CHAPTER_VERSION
    kinds = [block.kind for block in chapter.blocks]
    # intro heading + markdown criteria + summary kv + scores table + issues table
    required = ("markdown", "kv_table", "data_table")
    assert all(kind in kinds for kind in required)
 def test_golden_intro_explica_criterios_y_pesos():
    """The markdown intro names the three criteria and states their weights."""
    chapter = build_calidad(_profile(), {})
    markdown_blocks = [b for b in chapter.blocks if b.kind == "markdown"]
    intro = markdown_blocks[0].text
    expected = ("Completitud", "Validez", "Consistencia", "50%", "30%", "20%")
    for needle in expected:
        assert needle in intro, f"falta {needle!r} en la intro de criterios"
 def test_golden_scores_incluyen_desglose_por_criterio():
    """The scores table has the five-column header and one row per profile column."""
    table = _scores_table(build_calidad(_profile(), {}))
    assert table is not None
    expected_header = ["Columna", "Calidad", "Completitud",
                       "Validez", "Consistencia"]
    assert table.header == expected_header
    # 4 columns scored, none dropped.
    assert len(table.rows) == 4
    first_cells = {row[0] for row in table.rows}
    assert first_cells == {"edad", "nombre", "flag_const", "limpia"}
 def test_golden_issues_en_espanol_separados_de_flags():
    """The issues table lists Spanish messages and never the raw type flag.
    The flag check uses a word-boundary regex because the plain substring
    ``"constant"`` also occurs inside the legitimate Spanish text
    ``"columna constante"``; a substring test could therefore never fail.
    """
    ch = build_calidad(_profile(), {})
    issues = _issues_table(ch)
    assert issues is not None
    flat = " | ".join(" ".join(r) for r in issues.rows)
    assert "nulos" in flat            # completeness issue (ES)
    assert "outliers" in flat         # validity issue (ES)
    assert "columna constante" in flat
    assert "posible id de alta cardinalidad" in flat
    # The raw type flag string must NOT leak as a "problem": match it only as a
    # standalone word so that "constante" does not count.
    assert re.search(r"\bconstant\b", flat) is None, (
        "la flag de tipo 'constant' se filtró como problema")
 # --------------------------------------------------------------------------- #
 # Edges
 # --------------------------------------------------------------------------- #
 def test_edge_none_vacio_sin_columnas_devuelve_none():
    """Profiles with nothing to score (None, empty, no columns, non-dict) give None."""
    cases = (
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
        ("not a dict", {}),
    )
    for profile, ctx in cases:
        assert build_calidad(profile, ctx) is None
 def test_edge_perfil_limpio_sin_problemas_usa_nota():
    """A profile with no issues renders a Note in place of the issues table."""
    clean_columns = [
        {"name": name, "inferred_type": "float", "null_pct": 0.0,
         "numeric": {"outlier_pct": 0.0}}
        for name in ("a", "b")
    ]
    chapter = build_calidad(
        {"quality_score": 100, "columns": clean_columns}, {})
    assert chapter is not None
    assert _issues_table(chapter) is None  # no issues table
    notes = [block for block in chapter.blocks if block.kind == "note"]
    assert notes and "No se detectaron problemas" in notes[0].text
 # --------------------------------------------------------------------------- #
 # Anti-cut: a wide profile with long names renders without truncation
 # --------------------------------------------------------------------------- #
 def _wide_profile(ncols: int = 22) -> dict:
    cols = [
        {"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
        {"name": "columna_constante_sin_ninguna_variacion_de_valor",
         "inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
    ]
    for k in range(ncols - 2):
        cols.append({
            "name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
            "inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
            "numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
        })
    return {"table": "ancha", "quality_score": 70.0, "columns": cols}
 def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    """Render the calidad chapter of a wide profile and check no name is cut.
    The chapter is rendered on its own to PDF and PPTX. The PDF must span at
    least two pages, contain no ellipsis marker, and contain the long column
    name once whitespace is removed; the PPTX must contain the same name in
    the text of some shape or table cell.
    """
    document = build_document(_wide_profile(22), {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in document)
    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    only_calidad = [c for c in document if c.id == "calidad"]
    expected_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "q.pdf")
        pptx_path = os.path.join(workdir, "q.pptx")
        pdf_info = render_pdf(only_calidad, pdf_path, {"title": "EDA"})
        _pptx_info = render_pptx(only_calidad, pptx_path, {"title": "EDA"})
        assert os.path.exists(pdf_path) and os.path.exists(pptx_path)
        # The wide table forces pagination across several pages/slides.
        assert (pdf_info or {}).get("n_pages", 0) >= 2
        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
        # and there is no truncation marker.
        page_texts = [(page.extract_text() or "")
                      for page in PdfReader(pdf_path).pages]
        pdf_text = "".join(page_texts)
        assert "…" not in pdf_text and "..." not in pdf_text
        pdf_compact = re.sub(r"\s+", "", pdf_text)
        assert expected_name in pdf_compact, "el nombre largo se cortó en el PDF"
        # PPTX: long name present in some cell, untruncated.
        fragments = []
        for slide in Presentation(pptx_path).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    fragments.append(shape.text_frame.text)
                if shape.has_table:
                    fragments.extend(
                        cell.text
                        for row in shape.table.rows
                        for cell in row.cells
                    )
        pptx_compact = re.sub(r"\s+", "", "\n".join(fragments))
        assert expected_name in pptx_compact, "el nombre largo se cortó en el PPTX"