# fn_registry/python/functions/datascience/missingness_row_patterns.py

"""missingness_row_patterns — distinct per-row missingness patterns (missingno matrix style).

Pure function: no I/O, deterministic, NEVER raises. Given a per-column null mask
aligned by row ({col: [0/1, ...]}, 1 = missing), it groups rows by their missing
"pattern" — the sorted tuple of column names that are missing in that row — and
counts how often each distinct pattern occurs.

This mirrors the missingno matrix idea: instead of plotting per-cell nullity, it
collapses each row to the SET of columns it lacks, surfacing co-missing structure
(e.g. "A and B always go missing together"). The empty pattern (a fully complete
row) is a first-class pattern and may appear in the result with missing_cols=[];
the caller labels it "(fila completa)".
"""


def _is_missing(cell) -> bool:
    """A cell counts as missing when it equals 1 (truthy 0/1 mask).

    None / 0 / non-numeric are treated as present. Defensive: never raises.
    """
    try:
        return int(cell) == 1
    except (TypeError, ValueError):
        return bool(cell)


def missingness_row_patterns(null_mask, top_n=10) -> dict:
    """Count distinct per-row missingness patterns from a column null mask.

    For each row, its pattern is the sorted tuple of column names missing in that
    row (as decided by `_is_missing`, i.e. the columns whose value is 1). The
    frequency of each distinct pattern is counted, including the empty pattern
    (a complete row with nothing missing).

    Args:
        null_mask: Dict {col: [0/1, ...]} aligned by row, where 1 means the cell
            is missing in that row. Read defensively: column names are
            stringified, values that are not a list/tuple are treated as empty
            columns, and columns with differing lengths are tolerated (n_rows is
            the longest list; out-of-range cells count as present).
        top_n: Maximum number of patterns returned in `patterns`, ranked by
            n_rows desc (tiebreak: fewer columns first, then column names).
            Negative values clamp to 0, positive infinity means "no limit", and
            values that cannot be read as an integer fall back to 10. The full
            count of distinct patterns is always reported in `n_patterns`.

    Returns:
        Dict:
        {
          "n_rows": int,            # total rows
          "n_patterns": int,        # distinct patterns (incl. the empty pattern)
          "complete_rows": int,     # rows with the empty pattern (nothing missing)
          "patterns": [             # top_n patterns, n_rows desc
             {"missing_cols": [col, ...], "n_rows": int, "pct": float}  # [] = complete row
          ],
        }
        For {} (or a non-dict), or when every column is empty, returns n_rows 0
        and patterns [].
    """
    empty = {"n_rows": 0, "n_patterns": 0, "complete_rows": 0, "patterns": []}
    if not isinstance(null_mask, dict) or not null_mask:
        return empty

    # Canonical column order (by stringified name) so that every pattern tuple
    # built below comes out already sorted. Non-sequence values become empty
    # columns instead of raising.
    columns = sorted(
        (
            (str(name), cells if isinstance(cells, (list, tuple)) else [])
            for name, cells in null_mask.items()
        ),
        key=lambda col: col[0],
    )

    n_rows = max((len(cells) for _, cells in columns), default=0)
    if n_rows == 0:
        return empty

    # Defensive parsing of top_n. int() raises OverflowError (not ValueError)
    # for infinities, so that case is handled explicitly: +inf lifts the limit
    # (a None slice bound keeps everything), -inf behaves like any negative.
    try:
        limit = max(int(top_n), 0)
    except OverflowError:
        limit = None if top_n > 0 else 0
    except (TypeError, ValueError):
        limit = 10

    counts: dict = {}
    for row in range(n_rows):
        # Cells past the end of a short column count as present.
        pattern = tuple(
            name
            for name, cells in columns
            if row < len(cells) and _is_missing(cells[row])
        )
        counts[pattern] = counts.get(pattern, 0) + 1

    # Rank: n_rows desc, then fewer columns first, then column names (deterministic).
    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))

    return {
        "n_rows": n_rows,
        "n_patterns": len(counts),
        "complete_rows": counts.get((), 0),
        "patterns": [
            {
                "missing_cols": list(pattern),
                "n_rows": count,
                "pct": round(100.0 * count / n_rows, 2),
            }
            for pattern, count in ranked[:limit]
        ],
    }