fn_registry/python/functions/datascience/automatic_eda/chapters/missingness.py

"""Missingness chapter (MISSINGNESS) — patterns of missing data.

Complements the CALIDAD chapter: where CALIDAD reports *how much* is missing per
column (the null percentage that lowers the completeness score), this chapter
reports the **pattern** of the missing data — whether columns tend to be missing
*together* (co-occurrence of absences) or independently. That distinction is what
separates data that is missing completely at random ([[term:mcar]]MCAR[[/term]])
from data missing as a function of another variable ([[term:mar]]MAR[[/term]]),
which is the key question to settle before imputing or modelling.

The chapter activates only when the table actually has missing data (at least one
column with a null in the aggregated profile); otherwise it returns ``None`` and
disappears from the document.

Sections, in order:

1. **Resumen global** — % of missing cells in the dataset, number of columns with
   nulls, and complete rows (no missing) vs incomplete rows (≥1 missing).
2. **Ranking por columna** — columns sorted by their null percentage, with a
   horizontal bar figure.
3. **Co-ocurrencia de ausencias** — the correlation of the binary is-null masks
   between columns (which columns tend to be missing together): a heatmap plus a
   table of the top column pairs that co-miss.
4. **Patrones de fila** — the most frequent "which columns are missing together"
   row patterns, in the style of missingno's pattern matrix.
5. **Lectura MCAR/MAR** — an interpretive, *exploratory* note (not a confirmatory
   test such as Little's) reading the absence correlations as a hint of MCAR
   (independent absences) vs MAR (co-occurring absences).

The aggregate per-column null counts come from the ``eda`` group ``TableProfile``
(``columns[i]['null_count'] / 'null_pct'`` and the table-level ``null_cell_pct``).
The per-row is-null mask needed for co-occurrence is built from raw data: a single
DuckDB push-down over ``ctx['db_path'] / ctx['table']`` (same pattern as the
AGREGACION chapter) covering ALL columns, with a fallback to the numeric-only
``ctx['raw_numeric']`` when no database is reachable. All the heavy lifting is
delegated to pure registry functions (``missingness_overview``,
``missingness_correlation``, ``missingness_row_patterns``) and two figure helpers
(``missingness_rank_bar_figure``, ``missingness_corr_heatmap_figure``); every one
is imported lazily and degrades to an honest note so this chapter never raises.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""

from __future__ import annotations

from .. import model

# 1.0.1 — keep-together: the "Faltantes por columna" ranking (its Heading + table +
# figure) is wrapped in a model.Group so the paginator does not separate the
# figure from its title/table (the co-occurrence heatmap was already grouped).
CHAPTER_VERSION = "1.0.1"
CHAPTER_ID = "missingness"
CHAPTER_TITLE = "Datos faltantes"

# Sample cap for the per-row is-null mask push-down. Co-occurrence and row
# patterns are computed on this sample; the global % of missing cells and the
# per-column ranking come from the (exact) aggregated profile instead.
# Passed as ``max_rows`` to the mask extractor in ``_null_mask``.
MASK_SAMPLE = 5000
# Thresholds for the MCAR/MAR heuristic note. A pair counts as a *strong*
# co-occurrence when the absence correlation alone is high; as a *partial*
# co-occurrence when the absences overlap materially (high Jaccard) even if the
# Pearson correlation is modest — the usual case when one column is missing far
# more often than the other (e.g. Cabin 77% vs Age 20% in Titanic), which dilutes
# the correlation while the rows still co-miss in absolute terms.
# ``_CORR_STRONG`` is compared against the absolute correlation;
# ``_JACCARD_NOTABLE`` only counts for pairs whose correlation is positive.
_CORR_STRONG = 0.30
_JACCARD_NOTABLE = 0.20
# Rows shown in the top-pairs and row-patterns tables (bounded, never silently
# truncated: the table note reports the full count). The same values are passed
# as ``top_k`` / ``top_n`` to the correlation and row-pattern helpers.
_TOP_PAIRS = 12
_TOP_PATTERNS = 12
# Truncate long column names in tables (the renderer also wraps). This is the
# default limit of ``_truncate``; the heatmap labels and row-pattern labels pass
# tighter limits (16 and 18 characters).
_LABEL_MAX = 28

# Glossary terms this chapter explains (contract §11.1). Registered in the shared
# collector and marked clickable on their first appearance.
# Shape: ``key -> (label, definition)``. ``build_missingness`` feeds every entry
# to ``glossary.add(key, label, definition)``; the keys are the ones ``_term``
# embeds in its ``[[term:<key>]]`` markers. The label/definition texts are
# user-facing (Spanish) and are emitted as-is.
_TERMS = {
    "missingness": (
        "Patrón de datos faltantes (missingness)",
        "El patrón con el que faltan los datos: cuánto falta, en qué columnas y "
        "si las ausencias de unas columnas coinciden (co-ocurren) con las de "
        "otras. Analizarlo —no solo contar nulos— distingue datos que faltan al "
        "azar (MCAR) de los que faltan en función de otra variable (MAR), lo que "
        "decide cómo imputar o si descartar filas sin sesgar el análisis.",
    ),
    "mcar": (
        "MCAR (Missing Completely At Random)",
        "Los valores faltan de forma independiente de cualquier dato, observado o "
        "no: las ausencias de unas columnas no se relacionan entre sí ni con los "
        "valores. Es el caso más benigno —descartar filas o imputar la media no "
        "introduce sesgo—, pero rara vez se cumple del todo en datos reales.",
    ),
    "mar": (
        "MAR (Missing At Random)",
        "La probabilidad de que un valor falte depende de OTRAS variables "
        "observadas (p. ej. una medición que falta más en cierto grupo). Las "
        "ausencias co-ocurren entre columnas o se relacionan con los valores de "
        "otras; imputar exige condicionar en esas variables para no sesgar. La "
        "co-ocurrencia fuerte de ausencias es un indicio (exploratorio) de MAR.",
    ),
}


# --------------------------------------------------------------------------- #
# Small defensive formatters (own copy: the chapter never imports siblings).
# --------------------------------------------------------------------------- #
def _fmt_int(value) -> str:
    if value is None:
        return "—"
    try:
        return f"{int(round(float(value))):,}".replace(",", ".")
    except (TypeError, ValueError):
        return model._safe_str(value)


def _fmt_pct(value, decimals: int = 1) -> str:
    """Format an already-0-100 value as a percentage. None -> placeholder."""
    if value is None:
        return "—"
    try:
        return f"{float(value):.{decimals}f}%"
    except (TypeError, ValueError):
        return model._safe_str(value)


def _fmt_num(value, decimals: int = 3) -> str:
    if value is None:
        return "—"
    try:
        f = float(value)
    except (TypeError, ValueError):
        return model._safe_str(value)
    if f != f:  # NaN
        return "—"
    text = f"{f:.{decimals}f}".rstrip("0").rstrip(".")
    return text if text else "0"


def _truncate(text, limit: int = _LABEL_MAX) -> str:
    """Return ``text`` as a safe string, shortened with "…" when over ``limit``.

    Strings that already fit are returned unchanged. Longer ones keep their
    first ``limit - 1`` characters (at least one), lose any trailing
    whitespace left by the cut, and get an ellipsis appended.
    """
    label = model._safe_str(text)
    if len(label) > limit:
        keep = max(1, limit - 1)
        label = label[:keep].rstrip() + "…"
    return label


def _term(key: str, label: str, mark: bool) -> str:
    if mark:
        return f"[[term:{key}]]**{label}**[[/term]]"
    return f"**{label}**"


# --------------------------------------------------------------------------- #
# Profile reads (exact, all rows).
# --------------------------------------------------------------------------- #
def _null_count_of(col: dict):
    """Best-effort null count of a column: ``null_count`` or null_pct*n_rows."""
    nc = col.get("null_count")
    if isinstance(nc, (int, float)) and not isinstance(nc, bool):
        return int(nc)
    np_ = col.get("null_pct")
    nr = col.get("n_rows")
    if isinstance(np_, (int, float)) and isinstance(nr, (int, float)):
        return int(round(float(np_) * float(nr)))
    return 0


def _columns_with_nulls(profile: dict):
    """Return ``[(name, null_count, null_pct_0_100)]`` for columns with nulls,
    sorted by null percentage descending. Reads the aggregated profile (exact)."""
    cols = profile.get("columns") or []
    out = []
    for c in cols:
        if not isinstance(c, dict):
            continue
        nc = _null_count_of(c)
        if nc <= 0:
            continue
        np_ = c.get("null_pct")
        nr = c.get("n_rows") or profile.get("n_rows")
        if isinstance(np_, (int, float)) and not isinstance(np_, bool):
            pct = float(np_) * 100.0 if np_ <= 1.0 else float(np_)
        elif nr:
            pct = nc / float(nr) * 100.0
        else:
            pct = None
        out.append((c.get("name") or "(col)", nc, pct))
    out.sort(key=lambda t: (t[2] if t[2] is not None else -1.0), reverse=True)
    return out


def _global_missing_pct(profile: dict):
    """Table-level % of missing cells (0-100), exact, from the profile."""
    v = profile.get("null_cell_pct")
    if isinstance(v, (int, float)) and not isinstance(v, bool):
        return float(v) * 100.0 if v <= 1.0 else float(v)
    return None


# --------------------------------------------------------------------------- #
# Per-row is-null mask (sample): DuckDB push-down, fallback to raw_numeric.
# --------------------------------------------------------------------------- #
def _build_query_fn(ctx: dict):
    """Return ``(query_fn, table)`` for a DuckDB-backed ctx, or ``(None, None)``.

    Mirrors build_eda_render_ctx: a read-only closure over the registry wrapper.
    Only DuckDB is supported here; any other backend degrades to raw_numeric."""
    db_path = ctx.get("db_path")
    table = ctx.get("table")
    if not db_path or not table:
        return None, None
    try:
        from infra import duckdb_query_readonly
    except Exception:  # noqa: BLE001 — wrapper unavailable -> degrade.
        return None, None

    def query_fn(sql):
        return duckdb_query_readonly(db_path, sql)

    return query_fn, table


def _null_mask(profile: dict, ctx: dict):
    """Build the per-row is-null mask ``{col: [0/1, ...]}``.

    Two sources are tried in order and the first non-empty mask wins:

    1. ``extract_null_mask`` run through the query function from
       ``_build_query_fn`` over every named column of the profile, capped at
       ``MASK_SAMPLE`` rows; accepted only when its result is a dict with
       ``status == "ok"`` and a non-empty ``mask``.
    2. ``ctx['raw_numeric']``: each list/tuple column becomes a 0/1 list where
       ``None`` counts as missing.

    Returns ``(mask, n_sampled, source)`` with ``source`` in
    ``{"db", "raw_numeric"}``, or ``(None, 0, None)`` when neither source
    yields a mask. Failures of the first source are swallowed on purpose so the
    fallback can run.
    """
    column_names = []
    for col in profile.get("columns") or []:
        if isinstance(col, dict) and col.get("name"):
            column_names.append(col.get("name"))

    # 1) Push-down over every column (not only the numeric ones).
    query_fn, table = _build_query_fn(ctx)
    if column_names and query_fn is not None:
        try:
            from datascience.extract_null_mask import extract_null_mask

            result = extract_null_mask(
                query_fn, table, column_names, max_rows=MASK_SAMPLE)
            if isinstance(result, dict) and result.get("status") == "ok":
                db_mask = result.get("mask") or {}
                if db_mask:
                    return db_mask, int(result.get("n") or 0), "db"
        except Exception:  # noqa: BLE001 — degrade to raw_numeric.
            pass

    # 2) Numeric-only fallback derived from the already-loaded raw values.
    raw = ctx.get("raw_numeric")
    if isinstance(raw, dict) and raw:
        derived = {
            col: [1 if value is None else 0 for value in values]
            for col, values in raw.items()
            if isinstance(values, (list, tuple))
        }
        if derived:
            longest = max(len(flags) for flags in derived.values())
            return derived, longest, "raw_numeric"
    return None, 0, None


# --------------------------------------------------------------------------- #
# Lazy registry delegations (each degrades to None on any failure).
# --------------------------------------------------------------------------- #
def _overview(mask: dict):
    try:
        from datascience.missingness_overview import missingness_overview

        out = missingness_overview(mask)
        return out if isinstance(out, dict) else None
    except Exception:  # noqa: BLE001
        return None


def _correlation(mask: dict, top_k: int):
    try:
        from datascience.missingness_correlation import missingness_correlation

        out = missingness_correlation(mask, top_k=top_k)
        return out if isinstance(out, dict) else None
    except Exception:  # noqa: BLE001
        return None


def _row_patterns(mask: dict, top_n: int):
    try:
        from datascience.missingness_row_patterns import missingness_row_patterns

        out = missingness_row_patterns(mask, top_n=top_n)
        return out if isinstance(out, dict) else None
    except Exception:  # noqa: BLE001
        return None


def _rank_bar_make(names, pcts, title):
    def make():
        try:
            from datascience.missingness_rank_bar_figure import (
                missingness_rank_bar_figure,
            )

            return missingness_rank_bar_figure(names, pcts, title=title)
        except Exception:  # noqa: BLE001 — minimal fallback figure.
            return _fallback_fig("ranking de nulos no disponible")

    return make


def _heatmap_make(matrix, labels, title):
    def make():
        try:
            from datascience.missingness_corr_heatmap_figure import (
                missingness_corr_heatmap_figure,
            )

            return missingness_corr_heatmap_figure(matrix, labels, title=title)
        except Exception:  # noqa: BLE001 — minimal fallback figure.
            return _fallback_fig("heatmap de co-ocurrencia no disponible")

    return make


def _fallback_fig(message: str):
    """Return a small placeholder figure that shows ``message`` centred.

    Used when a real figure cannot be produced. matplotlib is imported here,
    at call time, and the "Agg" backend is selected before the figure is
    created. The axes are hidden so only the text is visible.
    """
    import matplotlib

    matplotlib.use("Agg")
    from matplotlib.figure import Figure as MplFigure

    placeholder = MplFigure(figsize=(5.0, 2.2))
    axes = placeholder.add_subplot(111)
    axes.text(0.5, 0.5, message, ha="center", va="center")
    axes.axis("off")
    return placeholder


# --------------------------------------------------------------------------- #
# Block builders.
# --------------------------------------------------------------------------- #
def _summary_block(profile: dict, with_nulls: list, overview, sampled, n_total):
    """Build the key/value summary table of the chapter.

    Always lists the global share of missing cells and the number of columns
    with nulls; adds the number of fully-null columns when the profile reports
    a non-empty ``all_null_cols``. When ``overview`` is a dict, the complete /
    incomplete row counts and their percentages are added, each tagged with a
    "sample" suffix if ``sampled`` is smaller than ``n_total``.
    """
    entries = [
        ("Celdas faltantes (global)", _fmt_pct(_global_missing_pct(profile))),
        ("Columnas con faltantes", str(len(with_nulls))),
    ]
    fully_null = profile.get("all_null_cols")
    if isinstance(fully_null, (list, tuple)) and fully_null:
        entries.append(("Columnas 100% faltantes", str(len(fully_null))))

    if isinstance(overview, dict):
        # Row figures come from the mask, which may cover only a sample.
        on_sample = (isinstance(sampled, int)
                     and isinstance(n_total, (int, float))
                     and sampled and n_total and sampled < n_total)
        sample_tag = (f" (sobre muestra de {_fmt_int(sampled)} filas)"
                      if on_sample else "")
        row_kinds = (
            ("Filas completas (sin faltantes)", "complete_rows", "complete_pct"),
            ("Filas con ≥1 faltante", "incomplete_rows", "incomplete_pct"),
        )
        for label, count_key, pct_key in row_kinds:
            count = overview.get(count_key)
            if count is None:
                continue
            pct_text = _fmt_pct(overview.get(pct_key))
            entries.append(
                (label, f"{_fmt_int(count)} ({pct_text})" + sample_tag))
    return model.KVTable(rows=entries, title="Resumen de datos faltantes")


def _ranking_block(with_nulls: list):
    header = ["Columna", "Faltantes", "% faltante"]
    rows = [[_truncate(n), _fmt_int(c), _fmt_pct(p)] for (n, c, p) in with_nulls]
    if not rows:
        return None
    return model.DataTable(
        header=header, rows=rows, title="Faltantes por columna",
        note="ordenado de más a menos faltante")


def _ranking_figure(with_nulls: list):
    names = [n for (n, _, p) in with_nulls if p is not None]
    pcts = [p for (_, _, p) in with_nulls if p is not None]
    if not names:
        return None
    return model.Figure(
        make=_rank_bar_make(names, pcts, "% de valores faltantes por columna"),
        caption="Porcentaje de valores faltantes por columna (barras).")


def _pairs_block(corr: dict):
    """Build the table of top co-missing column pairs, or ``None`` if empty.

    Reads ``corr['pairs']`` and renders at most ``_TOP_PAIRS`` of them,
    skipping entries that are not dicts. When more pairs were received than
    rows are shown, the table note states how many were shown out of the total.
    """
    all_pairs = (corr or {}).get("pairs") or []
    table_rows = [
        [
            _truncate(pair.get("a")),
            _truncate(pair.get("b")),
            _fmt_num(pair.get("corr")),
            _fmt_int(pair.get("co_missing")),
            _fmt_num(pair.get("jaccard")),
        ]
        for pair in all_pairs[:_TOP_PAIRS]
        if isinstance(pair, dict)
    ]
    if not table_rows:
        return None
    note = ("correlación de las máscaras is-null entre columnas; "
            "«Co-faltan» = nº de filas en que ambas faltan a la vez")
    n_shown, n_total = len(table_rows), len(all_pairs)
    if n_total > n_shown:
        note += f" — top {n_shown} de {n_total} pares"
    return model.DataTable(
        header=["Columna A", "Columna B", "Corr. ausencia", "Co-faltan",
                "Jaccard"],
        rows=table_rows,
        title="Pares de columnas que co-faltan",
        note=note,
    )


def _heatmap_block(corr: dict):
    cols = (corr or {}).get("columns") or []
    matrix = (corr or {}).get("matrix") or []
    if len(cols) < 2 or not matrix:
        return None
    labels = [_truncate(c, 16) for c in cols]
    return model.Figure(
        make=_heatmap_make(matrix, labels, "Co-ocurrencia de ausencias"),
        caption=("Correlación de las ausencias entre columnas (azul = faltan "
                 "juntas; rojo = cuando una falta la otra tiende a estar)."))


def _patterns_block(patterns_res: dict):
    """Build the table of most common row patterns, or ``None`` if empty.

    Reads ``patterns_res['patterns']`` and renders at most ``_TOP_PATTERNS``
    of them, skipping entries that are not dicts. Each row lists the columns
    missing together (or a "complete row" label when none are), the number of
    rows and their percentage. If ``patterns_res['n_patterns']`` is an int
    larger than the number of rows shown, the note reports both figures.
    """
    result = patterns_res or {}
    table_rows = []
    for pattern in (result.get("patterns") or [])[:_TOP_PATTERNS]:
        if not isinstance(pattern, dict):
            continue
        missing = pattern.get("missing_cols") or []
        label = (", ".join(_truncate(name, 18) for name in missing)
                 if missing else "(fila completa — sin faltantes)")
        table_rows.append(
            [label, _fmt_int(pattern.get("n_rows")), _fmt_pct(pattern.get("pct"))])
    if not table_rows:
        return None
    note = "cada fila es un patrón de «qué columnas faltan juntas»"
    n_distinct = result.get("n_patterns")
    n_shown = len(table_rows)
    if isinstance(n_distinct, int) and n_distinct > n_shown:
        note += f" — top {n_shown} de {n_distinct} patrones distintos"
    return model.DataTable(
        header=["Columnas que faltan juntas", "Filas", "%"],
        rows=table_rows,
        title="Patrones de fila más comunes",
        note=note,
    )


def _mcar_mar_note(corr: dict, mark: bool):
    """Interpretive, exploratory MCAR/MAR note from the absence correlations.

    Reads ``corr['pairs']`` at two levels so the verdict never contradicts the
    visible evidence:

    * *strong* — any pair with ``|corr| >= _CORR_STRONG``. The pair with the
      largest absolute correlation is cited (no assumption is made about the
      order of ``pairs``). A positive correlation is worded as absences that
      co-occur; a negative one as an inverse relation (when one column is
      missing the other tends to be present). Both are non-independent
      absences and therefore a hint of MAR.
    * *partial* — pairs with positive correlation and
      ``jaccard >= _JACCARD_NOTABLE`` (many rows co-miss even though the
      correlation is diluted); the pair with the highest Jaccard is cited.
    * otherwise the absences are read as compatible with MCAR.

    ``mark`` controls whether the MCAR/MAR labels carry glossary markers.
    Returns a ``model.Markdown`` block.
    """

    def _cf(v):
        # Coerce to float; anything unreadable counts as 0 (never qualifies).
        try:
            return float(v)
        except (TypeError, ValueError):
            return 0.0

    pairs = [p for p in ((corr or {}).get("pairs") or [])
             if isinstance(p, dict)]
    strong = [p for p in pairs if abs(_cf(p.get("corr"))) >= _CORR_STRONG]
    partial = [
        p for p in pairs
        if _cf(p.get("corr")) > 0 and _cf(p.get("jaccard")) >= _JACCARD_NOTABLE
    ]
    mcar = _term("mcar", "MCAR", mark)
    mar = _term("mar", "MAR", mark)
    head = (
        "**Lectura exploratoria MCAR/MAR.** Esta es una heurística basada en la "
        "correlación de las ausencias entre columnas, NO un test confirmatorio "
        "(como el de Little); orienta, no demuestra. ")
    if strong:
        # Cite the strongest pair explicitly instead of trusting list order.
        top = max(strong, key=lambda p: abs(_cf(p.get("corr"))))
        ev = (f"«{model._safe_str(top.get('a'))}» y "
              f"«{model._safe_str(top.get('b'))}» "
              f"(corr {_fmt_num(top.get('corr'))})")
        if _cf(top.get("corr")) > 0:
            lead = (
                f"Hay ausencias que co-ocurren con fuerza —{ev}—: las columnas no "
                f"faltan de forma independiente, ")
        else:
            # A strong negative correlation is not co-occurrence: say so.
            lead = (
                f"Hay ausencias fuertemente relacionadas en sentido inverso "
                f"—{ev}—: cuando una columna falta la otra tiende a estar "
                f"presente, así que las columnas no faltan de forma "
                f"independiente, ")
        body = lead + (
            f"lo que es un indicio de un patrón no "
            f"aleatorio ({mar}). Antes de imputar o descartar filas conviene "
            f"comprobar si la ausencia depende de otra variable observada; en ese "
            f"caso la imputación debería condicionar en ella para no sesgar.")
    elif partial:
        top = max(partial, key=lambda p: _cf(p.get("jaccard")))
        ev = (f"«{model._safe_str(top.get('a'))}» y "
              f"«{model._safe_str(top.get('b'))}» faltan a la vez en "
              f"{_fmt_int(top.get('co_missing'))} filas "
              f"(Jaccard {_fmt_num(top.get('jaccard'))})")
        body = (
            f"Hay co-ocurrencia parcial de ausencias —{ev}—: algunas columnas "
            f"tienden a faltar juntas aunque la correlación global sea modesta "
            f"(habitual cuando una columna falta mucho más que la otra). Es un "
            f"indicio de un posible patrón localizado no aleatorio ({mar}); "
            f"conviene revisar si esa ausencia depende de otra variable observada "
            f"antes de imputar, en lugar de asumir que faltan al azar.")
    else:
        body = (
            f"Las ausencias entre columnas no muestran correlación ni solape "
            f"relevante: parecen independientes, lo que es compatible con que "
            f"falten al azar ({mcar}). Aun así, la ausencia podría depender de "
            f"variables no observadas (la heurística no lo descarta).")
    return model.Markdown(text=head + body)


def _intro_block(mark: bool, source):
    """Build the introductory paragraph of the chapter.

    ``mark`` controls whether the "missingness" term carries a glossary
    marker. When ``source`` is ``"raw_numeric"`` a sentence is appended warning
    that co-occurrence is limited to the numeric columns.
    """
    term = _term("missingness", "missingness", mark)
    parts = [
        f"Este capítulo analiza el {term} de la tabla: no solo cuánto "
        "falta (eso lo cubre la calidad), sino DÓNDE falta y si las columnas "
        "faltan juntas. La co-ocurrencia de ausencias se calcula sobre la matriz "
        "binaria «is-null» por fila."
    ]
    if source == "raw_numeric":
        parts.append(
            " Nota: no se pudo leer la tabla cruda completa, así que la "
            "co-ocurrencia se limita a las columnas numéricas disponibles.")
    return model.Markdown(text="".join(parts))


# --------------------------------------------------------------------------- #
# Entry point.
# --------------------------------------------------------------------------- #
def build_missingness(profile: dict, ctx: dict):
    """Build the missingness Chapter, or None if the table has no missing data.

    Args:
        profile: aggregated table profile; anything that is not a dict is
            treated as empty.
        ctx: render context; anything that is not a dict (including ``None``)
            is treated as empty. Keys read here: ``glossary``; the mask
            builder additionally reads ``db_path``, ``table`` and
            ``raw_numeric``.

    Returns:
        A ``model.Chapter`` with, in order: intro + summary, the per-column
        ranking, the co-occurrence section, the row patterns and the MCAR/MAR
        reading; or ``None`` when no column of the profile has nulls.

    Degradation: without a per-row mask the chapter ends after the ranking with
    a note. When the absence correlation itself could not be computed, the
    chapter says so and does NOT emit an MCAR/MAR verdict — "could not compute"
    must not be reported as "absences look independent".
    """
    if not isinstance(profile, dict):
        profile = {}
    if not isinstance(ctx, dict):
        ctx = {}

    with_nulls = _columns_with_nulls(profile)
    if not with_nulls:
        return None  # no missing data anywhere -> chapter does not apply.

    # Register glossary terms (if a collector is present) and mark them clickable.
    glossary = ctx.get("glossary")
    mark = False
    if isinstance(glossary, model.GlossaryCollector):
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)
        mark = True

    # Per-row is-null mask (sample) for co-occurrence and row patterns.
    mask, sampled, source = _null_mask(profile, ctx)
    overview = _overview(mask) if mask else None
    n_total = profile.get("n_rows")

    blocks = [
        model.Heading(text="Cuánto y dónde faltan datos", level=2),
        _intro_block(mark, source),
        _summary_block(profile, with_nulls, overview, sampled, n_total),
    ]

    # Ranking "Faltantes por columna": keep the heading, its table and the bar
    # figure together on the same page/slide (keep-together) so the paginator never
    # strands the figure from its title/table. When there is no figure to draw, the
    # unit degrades honestly and stays flat (never a Group around a missing figure).
    rank_unit = [model.Heading(text="Faltantes por columna", level=2)]
    ranking = _ranking_block(with_nulls)
    if ranking is not None:
        rank_unit.append(ranking)
    rank_fig = _ranking_figure(with_nulls)
    if rank_fig is not None:
        rank_unit.append(rank_fig)
        blocks.append(model.Group(blocks=rank_unit))
    else:
        blocks.extend(rank_unit)

    # Co-occurrence + row patterns need the per-row mask. Without it, say so.
    if not mask:
        blocks.append(model.Note(
            "No se pudo construir la matriz «is-null» por fila (sin acceso a los "
            "datos crudos), así que no se analiza la co-ocurrencia de ausencias "
            "ni los patrones de fila en este informe."))
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

    # None means the correlation could not be computed at all; an empty dict
    # means it ran and found nothing. The two cases must read differently.
    corr_res = _correlation(mask, _TOP_PAIRS)
    corr_failed = corr_res is None
    corr = corr_res or {}
    co_blocks = [model.Heading(text="Co-ocurrencia de ausencias", level=2)]
    heatmap = _heatmap_block(corr)
    if heatmap is not None:
        co_blocks.append(heatmap)
    pairs = _pairs_block(corr)
    if pairs is not None:
        co_blocks.append(pairs)
    if heatmap is None and pairs is None:
        if corr_failed:
            co_blocks.append(model.Note(
                "No se pudo calcular la correlación de ausencias entre "
                "columnas, así que no hay co-ocurrencia que mostrar en este "
                "informe."))
        else:
            co_blocks.append(model.Note(
                "Ninguna pareja de columnas comparte ausencias con variación "
                "suficiente para correlacionarlas (p. ej. una sola columna con "
                "faltantes), así que no hay co-ocurrencia que mostrar."))
    # Keep the co-occurrence heading next to its heatmap and table.
    blocks.append(model.Group(blocks=co_blocks))

    patterns_res = _row_patterns(mask, _TOP_PATTERNS) or {}
    patterns = _patterns_block(patterns_res)
    if patterns is not None:
        blocks.append(model.Heading(text="Patrones de fila", level=2))
        blocks.append(patterns)

    blocks.append(model.Heading(text="Lectura MCAR / MAR", level=2))
    if corr_failed:
        # No evidence either way: do not claim independence (MCAR).
        blocks.append(model.Note(
            "Sin la correlación de ausencias no es posible ofrecer una lectura "
            "MCAR/MAR en este informe."))
    else:
        blocks.append(_mcar_mar_note(corr, mark))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)