# fn_registry/python/functions/datascience/missingness_overview.py

"""Pure EDA helper: dataset-level missingness overview from a 0/1 null mask.

Part of the `eda` capability group. Consumes a per-column null mask
(``{col_name: [int 0/1, ...]}`` aligned by row, ``1`` = value is missing,
``0`` = value is present) and derives dataset-wide missingness metrics: cell
count and percentage of missing data, how many columns carry any null, and how
many rows are complete vs. incomplete.

Dict-no-throw style of the `eda` group: it NEVER raises. A non-dict, an empty
dict, malformed columns, ragged lists or non-int cell values all degrade
gracefully to the zero/contract output. Stdlib only.

Ragged-length policy: columns are allowed to have different lengths. ``n_rows``
is the **maximum** column length; positions that don't exist in a shorter
column are treated as present (``0``). This keeps the ``n_rows * n_cols`` cell
grid well defined without dropping rows.
"""


def _is_missing(value) -> int:
    """Classify a single mask cell: ``1`` when it marks a missing value, else ``0``.

    A cell is "missing" only when it compares equal to ``1``; this is true for
    the ``int`` ``1``, the ``float`` ``1.0`` and ``True``. Anything else
    (``None``, ``0``, strings, containers, ...) is reported as present.

    If the equality test or its truth evaluation raises, the cell is reported
    as present (``0``) instead of propagating the error.
    """
    try:
        # Both the comparison and its truth test stay inside the ``try`` so an
        # exotic ``__eq__`` / ``__bool__`` cannot escape.
        if value == 1:
            return 1
    except Exception:
        return 0
    return 0


def missingness_overview(null_mask) -> dict:
    """Compute dataset-wide missingness figures from a per-column 0/1 mask.

    Args:
        null_mask: Mapping ``{col_name: [0/1, ...]}``; every sequence is
            row-aligned and a cell equal to ``1`` marks a missing value.
            Tolerated inputs:

            * anything that is not a dict, or an empty dict, yields the
              all-zero result;
            * a column that is neither a ``list`` nor a ``tuple`` is handled
              as a column with no cells (it still counts towards ``n_cols``);
            * columns of unequal length are allowed: the row count is the
              longest column, and rows a shorter column does not reach are
              regarded as present;
            * cells that are not equal to ``1`` (``None``, strings, ...) are
              regarded as present.

    Returns:
        A dict that always holds exactly these keys:

        * ``n_rows`` / ``n_cols``: grid dimensions;
        * ``n_cols_with_null``: columns with at least one missing cell;
        * ``n_missing_cells`` and ``missing_cell_pct`` (0-100, relative to
          ``n_rows * n_cols``);
        * ``complete_rows`` / ``incomplete_rows``: rows without / with at
          least one missing cell;
        * ``complete_pct`` / ``incomplete_pct`` (0-100, relative to
          ``n_rows``).

        When there are no rows, every field except ``n_cols`` stays at its
        zero value, so no division by zero can happen.
    """
    # Start from the zero contract and fill it in as facts become known.
    summary = {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "missing_cell_pct": 0.0,
        "complete_rows": 0,
        "incomplete_rows": 0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }

    if not (isinstance(null_mask, dict) and null_mask):
        return summary

    # Anything that is not a list/tuple is replaced by an empty column.
    columns = [
        column if isinstance(column, (list, tuple)) else []
        for column in null_mask.values()
    ]
    summary["n_cols"] = len(columns)

    row_total = max(map(len, columns), default=0)
    if row_total == 0:
        # Columns without a single row: only ``n_cols`` is non-zero.
        return summary

    dirty_rows = [False] * row_total
    missing_cells = 0
    null_columns = 0

    for column in columns:
        missing_before = missing_cells
        # Only the cells that really exist are visited; the implicit tail of
        # a short column is "present" and therefore needs no bookkeeping.
        for row_idx, cell in enumerate(column):
            if _is_missing(cell):
                missing_cells += 1
                dirty_rows[row_idx] = True
        if missing_cells > missing_before:
            null_columns += 1

    incomplete = sum(dirty_rows)  # each True contributes 1
    complete = row_total - incomplete
    cell_total = row_total * summary["n_cols"]

    if cell_total:
        missing_share = missing_cells / cell_total * 100.0
    else:
        missing_share = 0.0

    summary.update(
        {
            "n_rows": row_total,
            "n_cols_with_null": null_columns,
            "n_missing_cells": missing_cells,
            "missing_cell_pct": missing_share,
            "complete_rows": complete,
            "incomplete_rows": incomplete,
            "complete_pct": complete / row_total * 100.0,
            "incomplete_pct": incomplete / row_total * 100.0,
        }
    )
    return summary