# fn_registry/python/functions/datascience/automatic_eda/chapters/calidad.py

"""Data-quality chapter (CALIDAD) for AutomaticEDA.

Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
chapter implements the quality model of report 2046:

1. **En qué se basa la calidad** — an intro paragraph explaining the two scored
   dimensions and their weights (completitud 60%, validez 40%) plus the
   table-level row uniqueness, BEFORE any number, and stating explicitly that
   outliers are reported as observations and do **not** lower the score. The
   criteria terms (calidad de datos, completitud, validez, unicidad de registro)
   are hooked into the shared glossary as clickable jumps.
2. **Scores por columna** — a table with, per column, the total quality score and
   its breakdown into completeness / validity (no consistency dimension).
3. **Problemas de calidad** — a table listing ONLY real quality defects
   (nulls, empty cells, values not conforming to their type/semantics).
4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
   columns, high-cardinality ids and strong skew, with an explicit note that
   these do not affect the score.

The breakdown, issues and observations are NOT recomputed here: they come from
the registry function ``column_quality_score`` (group ``eda``), which derives
``{score, completeness, validity, dimensions, applicable, issues,
observations}`` from the ColumnProfile. This chapter is render-only.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""

from __future__ import annotations

from .. import model

# Reuse the registry's pure quality function (group ``eda``). Import defensively:
# if the package cannot be imported for any reason the chapter degrades to the
# per-column ``quality_score`` already present in the profile instead of failing.
try:  # pragma: no cover - import wiring
    from ...column_quality_score import column_quality_score as _column_quality_score
except Exception:  # noqa: BLE001 - never let an import error abort the document.
    _column_quality_score = None

# Chapter identity: build_calidad passes these three values to model.Chapter
# as ``version``, ``id`` and ``title`` respectively.
CHAPTER_VERSION = "2.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"

# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
# collector and marked clickable on their first appearance (contract §11.1).
#
# Shape: glossary key -> (label, definition). build_calidad iterates the mapping
# and calls ``glossary.add(key, label, definition)`` for every entry; the same
# keys are the ones _criteria_intro hands to _term for the ``[[term:key]]``
# markers, so a key renamed here must be renamed there too.
_TERMS = {
    "calidad_datos": (
        "Calidad de datos (score 0-100)",
        "Mide hasta qué punto los datos están presentes y son utilizables tal "
        "cual, no si son «buenos para el análisis». Se compone solo de "
        "dimensiones medibles automáticamente desde el perfil de la tabla, sin "
        "fuente externa de verdad: completitud (60%), validez (40%, cuando es "
        "medible) y, a nivel de tabla, unicidad de registro. Los valores "
        "atípicos NO bajan la calidad: se listan aparte como observaciones.",
    ),
    "completitud": (
        "Completitud",
        "Proporción de valores realmente presentes en una columna (1 − % de "
        "nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
        "nulos y vacíos bajan el score porque falta información que debería "
        "estar. Pesa el 60% del score de columna.",
    ),
    "validez": (
        "Validez",
        "Proporción de valores que encajan con su tipo o formato esperado: un "
        "número que parsea, una fecha legible, un email con forma de email. Los "
        "valores que no parsean a su tipo bajan el score. Si la columna es texto "
        "libre sin formato esperado, la validez no se puede medir y el score se "
        "basa solo en la completitud. Pesa el 40% del score cuando es medible.",
    ),
    "unicidad_registro": (
        "Unicidad de registro",
        "A nivel de tabla, las filas duplicadas restan calidad al conjunto "
        "(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
        "repita valores, que no es un defecto de calidad.",
    ),
}

# Cap for the joined cell so a single row never grows taller than a page; the
# remainder is summarized as "(+N más)" instead of being silently dropped.
# Measured in characters by _join_cells (entry lengths plus 2 per "; "
# separator). It is a soft cap: the first entry is always kept whole and the
# "(+N más)" suffix itself is not counted. Despite the name, the same limit
# applies to observations cells, which are also built with _join_cells.
_ISSUES_MAXLEN = 160


def _fmt_score(value) -> str:
    """Format a 0-100 score as ``NN / 100`` (or a placeholder)."""
    if value is None:
        return "—"
    try:
        num = float(value)
    except (TypeError, ValueError):
        return str(value)
    if num != num:  # NaN
        return "—"
    text = f"{num:.1f}".rstrip("0").rstrip(".")
    return f"{text} / 100"


def _fmt_unit_pct(value) -> str:
    """Format a 0-1 fraction as a percentage (``95%``)."""
    if value is None:
        return "—"
    try:
        return f"{float(value) * 100:.0f}%"
    except (TypeError, ValueError):
        return str(value)


def _fmt_validity(value) -> str:
    """Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
    if value is None:
        return "n/a"
    return _fmt_unit_pct(value)


def _quality_of(col: dict) -> dict:
    """Return the quality dict for one column profile.

    When the registry scorer was imported, its result is used as long as it is a
    dict. If the scorer is missing, raises, or returns something else, the
    function degrades to a minimal dict carrying only the ``quality_score``
    already stored on the column, with no breakdown, issues or observations.
    A non-dict ``col`` is treated as an empty column. Never raises.
    """
    column = col if isinstance(col, dict) else {}
    scorer = _column_quality_score
    if scorer is not None:
        try:
            computed = scorer(column)
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            computed = None
        if isinstance(computed, dict):
            return computed
    fallback = {
        "score": column.get("quality_score"),
        "completeness": None,
        "validity": None,
        "issues": [],
        "observations": [],
    }
    return fallback


def _join_cells(items) -> str:
    """Join Spanish strings into one cell, truncating overly long lists.

    The renderer wraps cell text, but a column with many long entries could make
    a single row taller than a whole page; cap the length and append ``(+N más)``
    so the count of hidden entries is honest rather than silently lost.
    """
    if not isinstance(items, (list, tuple)) or not items:
        return ""
    parts = [model._safe_str(i).strip() for i in items]
    parts = [p for p in parts if p]
    if not parts:
        return ""
    out = []
    used = 0
    for idx, part in enumerate(parts):
        extra = len(part) + (2 if out else 0)
        if used + extra > _ISSUES_MAXLEN and out:
            remaining = len(parts) - idx
            out.append(f"(+{remaining} más)")
            return "; ".join(out)
        out.append(part)
        used += extra
    return "; ".join(out)


def _columns_with_quality(profile: dict):
    """Yield ``(col, quality_dict)`` for every column dict in the profile."""
    cols = profile.get("columns") or []
    for c in cols:
        if isinstance(c, dict):
            yield c, _quality_of(c)


def _fmt_unit_pct_or_pct(value) -> str:
    """Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
    try:
        num = float(value)
    except (TypeError, ValueError):
        return model._safe_str(value)
    if num != num:  # NaN
        return "—"
    pct = num * 100 if num <= 1.0 else num
    text = f"{pct:.1f}".rstrip("0").rstrip(".")
    return f"{text}%"


def _row_uniqueness(profile: dict):
    """Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
    dup = profile.get("duplicate_pct")
    if dup is None:
        return None
    try:
        d = float(dup)
    except (TypeError, ValueError):
        return None
    if d > 1.0:  # tolerate a 0-100 scale
        d = d / 100.0
    return max(0.0, min(1.0, 1.0 - d))


def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable`` with the global score and aggregates.

    Rows appear in a fixed order: global score, number of evaluated columns,
    mean completeness, mean validity (only over columns where it is numeric),
    row uniqueness, number of columns with issues, and finally the optional
    table-wide signals found in the profile (duplicate rows, null cells,
    constant columns, all-null columns). Optional rows are omitted when their
    source value is absent.
    """
    def _mean_of(key):
        # Average of the numeric values stored under ``key``; None when no
        # column has one (e.g. validity is None wherever it does not apply).
        numbers = [q.get(key) for _, q in evaluated
                   if isinstance(q.get(key), (int, float))]
        return sum(numbers) / len(numbers) if numbers else None

    summary = [
        ("Calidad global", _fmt_score(profile.get("quality_score"))),
        ("Columnas evaluadas", str(len(evaluated))),
    ]

    for label, key in (("Completitud media", "completeness"),
                       ("Validez media (donde aplica)", "validity")):
        mean = _mean_of(key)
        if mean is not None:
            summary.append((label, _fmt_unit_pct(mean)))

    uniqueness = _row_uniqueness(profile)
    if uniqueness is not None:
        summary.append(("Unicidad de registro", _fmt_unit_pct(uniqueness)))

    with_issues = [q for _, q in evaluated if q.get("issues")]
    summary.append(("Columnas con problemas", str(len(with_issues))))

    # Table-wide ratios already present in the profile (fraction or percent).
    for label, key in (("Filas duplicadas", "duplicate_pct"),
                       ("Celdas nulas (global)", "null_cell_pct")):
        raw = profile.get(key)
        if raw is not None:
            summary.append((label, _fmt_unit_pct_or_pct(raw)))

    # Column-name lists are reported only as counts, and only when non-empty.
    for label, key in (("Columnas constantes", "constant_cols"),
                       ("Columnas 100% nulas", "all_null_cols")):
        names = profile.get(key)
        if isinstance(names, (list, tuple)) and names:
            summary.append((label, str(len(names))))

    return model.KVTable(rows=summary, title="Resumen de calidad")


def _scores_block(evaluated: list):
    """DataTable with per-column score and its completeness/validity breakdown."""
    header = ["Columna", "Calidad", "Completitud", "Validez"]
    rows = []
    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(
        evaluated,
        key=lambda cq: (cq[1].get("score")
                        if isinstance(cq[1].get("score"), (int, float)) else 101.0),
    )
    for col, q in ordered:
        rows.append([
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
            _fmt_validity(q.get("validity")),
        ])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
                           note="0 = peor, 100 = mejor; «n/a» = dimensión no "
                                "medible; ordenado de peor a mejor")


def _issues_block(evaluated: list):
    """Build the table of real quality defects, or a ``Note`` if there are none.

    A column contributes a row only when joining its ``issues`` produces
    non-empty text. When no column has any issue, a note stating so is returned
    in place of the table.
    """
    defect_rows = []
    for column, quality in evaluated:
        cell = _join_cells(quality.get("issues"))
        if not cell:
            continue
        defect_rows.append([column.get("name") or "(col)", cell])
    if defect_rows:
        return model.DataTable(
            header=["Columna", "Problemas de calidad (español)"],
            rows=defect_rows,
            title="Problemas de calidad por columna",
        )
    return model.Note(
        "No se detectaron problemas de calidad en las columnas evaluadas.")


def _observations_block(evaluated: list):
    """DataTable listing analytical observations per column, or None.

    Observations (outliers, constant columns, ids, strong skew) are NOT quality
    defects: they do not affect the score. Returned as a separate table from the
    issues so the report never presents a legitimate outlier as a problem.
    """
    header = ["Columna", "Observaciones analíticas"]
    rows = []
    for col, q in evaluated:
        joined = _join_cells(q.get("observations"))
        if joined:
            rows.append([col.get("name") or "(col)", joined])
    if not rows:
        return None
    return model.DataTable(
        header=header, rows=rows,
        title="Observaciones analíticas por columna",
        note="No son defectos de calidad y NO afectan al score; orientan el "
             "análisis (atípicos, columnas constantes, identificadores).")


def _term(key: str, label: str, mark: bool) -> str:
    """Render a term as a clickable glossary span when marking is enabled."""
    if mark:
        return f"[[term:{key}]]**{label}**[[/term]]"
    return f"**{label}**"


def _criteria_intro(mark: bool) -> str:
    """Return the Markdown intro that explains how the quality score is built.

    The text has four paragraphs separated by blank lines: a lead sentence, a
    two-item bullet list (completeness and validity), the score formula with
    the table-level row uniqueness, and the statement that outliers do not
    lower the score. The four criteria terms are rendered through ``_term`` and
    so become glossary spans when ``mark`` is true.
    """
    quality_term = _term("calidad_datos", "calidad de datos", mark)
    completeness_term = _term("completitud", "Completitud (peso 60%)", mark)
    validity_term = _term("validez", "Validez (peso 40%, cuando es medible)", mark)
    uniqueness_term = _term("unicidad_registro", "unicidad de registro", mark)

    lead = (
        f"La {quality_term} de cada columna es un score de 0 a 100 que combina solo "
        "dimensiones medibles desde el perfil de la tabla, sin fuente externa "
        "de verdad:"
    )
    completeness_bullet = (
        f"- {completeness_term}: proporción de valores presentes (1 − % de nulos; en "
        "texto, las celdas vacías cuentan como faltantes). Los nulos y vacíos "
        "bajan el score."
    )
    validity_bullet = (
        f"- {validity_term}: proporción de valores que encajan con su tipo o formato "
        "(un número que parsea, una fecha legible, un email con forma de email). "
        "Si una columna es texto libre sin formato esperado, la validez no se "
        "mide y el score se basa solo en la completitud."
    )
    formula = (
        "Score de columna = 100 × (0,6·completitud + 0,4·validez), "
        "renormalizado cuando la validez no aplica. A nivel de tabla se añade "
        f"la {uniqueness_term} (1 − % de filas duplicadas)."
    )
    outliers = (
        "**Los valores atípicos (outliers) NO bajan la calidad.** Un valor "
        "extremo puede ser real y correcto; detectar atípicos es parte del "
        "análisis de la distribución, no un juicio de corrección. Por eso, junto "
        "con las columnas constantes y los identificadores, se listan aparte "
        "como **observaciones analíticas** que no afectan al score."
    )

    # Bullets are one Markdown list (single newline between them); paragraphs
    # are separated by a blank line.
    bullets = "\n".join((completeness_bullet, validity_bullet))
    return "\n\n".join((lead, bullets, formula, outliers))


def build_calidad(profile: dict, ctx: dict):
    """Assemble the data-quality ``Chapter`` for a table profile.

    Returns ``None`` when the profile has no column dicts to score, since the
    chapter does not apply then. A falsy or non-dict ``profile`` is treated as
    empty and a falsy ``ctx`` as an empty dict. If ``ctx["glossary"]`` is a
    ``model.GlossaryCollector``, every entry of ``_TERMS`` is registered in it
    and the intro marks the terms as clickable.

    Block order: criteria intro, summary table, per-column scores, quality
    issues, and, only when there are any, the analytical observations.
    """
    if not profile or not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}

    scored = list(_columns_with_quality(profile))
    if not scored:
        return None  # no columns to score -> chapter does not apply.

    # Glossary registration happens only once we know the chapter will exist
    # (contract §11.1); without a collector the terms are rendered plain bold.
    collector = ctx.get("glossary")
    clickable = isinstance(collector, model.GlossaryCollector)
    if clickable:
        for term_key, term_info in _TERMS.items():
            term_label, term_definition = term_info
            collector.add(term_key, term_label, term_definition)

    chapter_blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
        model.Markdown(text=_criteria_intro(clickable)),
        _summary_block(profile, scored),
        model.Heading(text="Scores por columna", level=2),
    ]
    score_table = _scores_block(scored)
    if score_table is not None:
        chapter_blocks.append(score_table)

    chapter_blocks.append(model.Heading(text="Problemas de calidad", level=2))
    chapter_blocks.append(_issues_block(scored))

    observation_table = _observations_block(scored)
    if observation_table is not None:
        chapter_blocks.extend([
            model.Heading(text="Observaciones analíticas", level=2),
            model.Note(
                "Las observaciones siguientes NO son defectos de calidad y no "
                "afectan al score: son señales para orientar el análisis."),
            observation_table,
        ])

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=chapter_blocks)