"""Pure EDA helper: dataset-level missingness overview from a 0/1 null mask. Part of the `eda` capability group. Consumes a per-column null mask (``{col_name: [int 0/1, ...]}`` aligned by row, ``1`` = value is missing, ``0`` = value is present) and derives dataset-wide missingness metrics: cell count and percentage of missing data, how many columns carry any null, and how many rows are complete vs. incomplete. Dict-no-throw style of the `eda` group: it NEVER raises. A non-dict, an empty dict, malformed columns, ragged lists or non-int cell values all degrade gracefully to the zero/contract output. Stdlib only. Ragged-length policy: columns are allowed to have different lengths. ``n_rows`` is the **maximum** column length; positions that don't exist in a shorter column are treated as present (``0``). This keeps the ``n_rows * n_cols`` cell grid well defined without dropping rows. """ def _is_missing(value) -> int: """Return ``1`` iff ``value`` denotes a missing cell, else ``0``. Only an exact equality to ``1`` (covers ``int`` ``1`` and ``float`` ``1.0``) counts as missing. ``None``, ``0``, strings and any other value are treated as present. The comparison cannot raise for standard inputs. """ try: return 1 if value == 1 else 0 except Exception: return 0 def missingness_overview(null_mask) -> dict: """Summarize dataset-level missingness from a 0/1 null mask. Args: null_mask: Dict ``{col_name: [int 0/1, ...]}`` where each list is aligned by row (``1`` = missing, ``0`` = present). Lists are normally all the same length (= number of rows). Defensive: a non-dict or empty dict returns the all-zero contract; non-list columns are treated as empty; ragged lists are aligned to the maximum length, padding the missing tail of shorter columns as present (``0``); ``None`` / non-int cells count as present. Returns: Dict with exactly these keys, all always present (the function never raises): ``n_rows``, ``n_cols``, ``n_cols_with_null``, ``n_missing_cells``, ``missing_cell_pct`` (0-100), ``complete_rows``, ``incomplete_rows``, ``complete_pct`` (0-100), ``incomplete_pct`` (0-100). Percentages are ``0.0`` when the denominator is zero (no ``ZeroDivisionError``). """ zero = { "n_rows": 0, "n_cols": 0, "n_cols_with_null": 0, "n_missing_cells": 0, "missing_cell_pct": 0.0, "complete_rows": 0, "incomplete_rows": 0, "complete_pct": 0.0, "incomplete_pct": 0.0, } if not isinstance(null_mask, dict) or not null_mask: return dict(zero) # Normalize every column to a list; non-list columns become empty. cols = {} for name, seq in null_mask.items(): cols[name] = seq if isinstance(seq, (list, tuple)) else [] n_cols = len(cols) lengths = [len(seq) for seq in cols.values()] n_rows = max(lengths) if lengths else 0 if n_rows == 0: # Columns exist but carry no rows: everything zero except n_cols. out = dict(zero) out["n_cols"] = n_cols return out n_missing_cells = 0 n_cols_with_null = 0 row_has_missing = [False] * n_rows for seq in cols.values(): col_len = len(seq) col_has_null = False for r in range(n_rows): if r < col_len and _is_missing(seq[r]): n_missing_cells += 1 row_has_missing[r] = True col_has_null = True if col_has_null: n_cols_with_null += 1 incomplete_rows = sum(1 for flag in row_has_missing if flag) complete_rows = n_rows - incomplete_rows total_cells = n_rows * n_cols missing_cell_pct = (n_missing_cells / total_cells * 100.0) if total_cells else 0.0 complete_pct = complete_rows / n_rows * 100.0 incomplete_pct = incomplete_rows / n_rows * 100.0 return { "n_rows": n_rows, "n_cols": n_cols, "n_cols_with_null": n_cols_with_null, "n_missing_cells": n_missing_cells, "missing_cell_pct": missing_cell_pct, "complete_rows": complete_rows, "incomplete_rows": incomplete_rows, "complete_pct": complete_pct, "incomplete_pct": incomplete_pct, }