fn_registry/python/functions/datascience/missingness_overview_test.py

"""Tests for missingness_overview."""

import sys
import os

import pytest

sys.path.insert(0, os.path.dirname(__file__))

from missingness_overview import missingness_overview


# Output contract: the exact set of 9 keys each call is expected to return,
# grouped here by the kind of metric they name.
EXPECTED_KEYS = {
    # Dimensions of the mask.
    "n_rows", "n_cols",
    # Column-level and cell-level missingness.
    "n_cols_with_null", "n_missing_cells", "missing_cell_pct",
    # Row-level completeness, as counts and as percentages.
    "complete_rows", "incomplete_rows",
    "complete_pct", "incomplete_pct",
}


def test_cooccurrence_three_cols_exact():
    """Check every metric on a 5-row, 3-column mask against hand-computed values."""
    # Missing positions per column (1 marks a missing cell):
    #   a -> rows 0 and 4   (2 cells)
    #   b -> rows 0 and 2   (2 cells)
    #   c -> row 4          (1 cell)
    # Cell totals: 5 missing out of 5 * 3 = 15 cells -> 33.333...%
    # Row status:
    #   row 0: a and b missing together -> incomplete
    #   row 1: nothing missing          -> complete
    #   row 2: only b missing           -> incomplete
    #   row 3: nothing missing          -> complete
    #   row 4: a and c missing together -> incomplete
    columns = {
        "a": [1, 0, 0, 0, 1],
        "b": [1, 0, 1, 0, 0],
        "c": [0, 0, 0, 0, 1],
    }
    result = missingness_overview(columns)

    # Dimensions.
    assert (result["n_rows"], result["n_cols"]) == (5, 3)

    # Column-level and cell-level missingness.
    assert result["n_cols_with_null"] == 3
    assert result["n_missing_cells"] == 5
    assert result["missing_cell_pct"] == pytest.approx(33.33333333, abs=1e-6)

    # Row-level completeness: 2 of 5 complete, 3 of 5 incomplete.
    assert (result["complete_rows"], result["incomplete_rows"]) == (2, 3)
    assert result["complete_pct"] == pytest.approx(40.0)
    assert result["incomplete_pct"] == pytest.approx(60.0)


def test_empty_dict_all_zero():
    """An empty mask produces the full result dict with every metric at zero."""
    zero_counts = {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "complete_rows": 0,
        "incomplete_rows": 0,
    }
    zero_percentages = {
        "missing_cell_pct": 0.0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }
    # Dict equality ignores key order, so merging the two groups is equivalent
    # to spelling out one flat expected dict.
    expected = {**zero_counts, **zero_percentages}
    assert missingness_overview({}) == expected


def test_output_keys_contract():
    """The result exposes exactly EXPECTED_KEYS for an empty and a non-empty mask."""
    # Zero path: an empty mask must still honour the 9-key contract.
    empty_result = missingness_overview({})
    assert set(empty_result.keys()) == EXPECTED_KEYS

    # Regular path: a single two-row column.
    small_result = missingness_overview({"a": [1, 0]})
    assert set(small_result.keys()) == EXPECTED_KEYS


def test_not_a_dict_returns_zero():
    """Non-dict inputs yield zeroed size and missing-cell metrics.

    Covers None, a list, an int, a str and a float. Each assertion carries the
    repr of the input under test so a failure identifies the offending value.
    """
    for bad in (None, [1, 0, 1], 42, "nope", 3.14):
        out = missingness_overview(bad)
        # All inputs share these assert lines, so without a message a failure
        # would not say which loop iteration triggered it.
        which = f"input={bad!r}"
        assert out["n_rows"] == 0, which
        assert out["n_cols"] == 0, which
        assert out["n_missing_cells"] == 0, which
        assert out["missing_cell_pct"] == 0.0, which


def test_no_nulls_all_complete():
    """An all-zero mask reports no missing cells and every row complete."""
    result = missingness_overview({"a": [0, 0, 0], "b": [0, 0, 0]})

    # Dimensions: 3 rows by 2 columns.
    assert (result["n_rows"], result["n_cols"]) == (3, 2)

    # Nothing is flagged as missing at column or cell level.
    assert result["n_cols_with_null"] == 0
    assert result["n_missing_cells"] == 0
    assert result["missing_cell_pct"] == 0.0

    # Every row is complete.
    assert (result["complete_rows"], result["incomplete_rows"]) == (3, 0)
    assert result["complete_pct"] == pytest.approx(100.0)
    assert result["incomplete_pct"] == pytest.approx(0.0)


def test_none_values_treated_as_present():
    """Only an explicit 1 counts as missing; None, "x" and 0 count as present."""
    column = [None, 1, None, "x", 0]
    result = missingness_overview({"a": column})

    assert (result["n_rows"], result["n_cols"]) == (5, 1)
    # The single explicit 1 sits at row 1, so one cell, one column and one row
    # are affected; the other four rows stay complete.
    assert result["n_missing_cells"] == 1
    assert result["n_cols_with_null"] == 1
    assert (result["complete_rows"], result["incomplete_rows"]) == (4, 1)


def test_unequal_lengths_pad_with_max():
    """Ragged columns: n_rows is the longest length, short columns pad as present."""
    # Expected arithmetic:
    #   a = [1, 1] -> missing at rows 0 and 1
    #   b = [0]    -> present at row 0, padded to present at row 1
    #   n_rows = 2, n_cols = 2 -> 4 cells in total, 2 of them missing -> 50%
    ragged = {"a": [1, 1], "b": [0]}
    result = missingness_overview(ragged)

    assert (result["n_rows"], result["n_cols"]) == (2, 2)

    # Only column a contributes missing cells.
    assert result["n_cols_with_null"] == 1
    assert result["n_missing_cells"] == 2
    assert result["missing_cell_pct"] == pytest.approx(50.0)

    # Column a is missing in both rows, so no row is complete.
    assert (result["complete_rows"], result["incomplete_rows"]) == (0, 2)
    assert result["incomplete_pct"] == pytest.approx(100.0)


def test_columns_present_but_no_rows():
    """Empty columns keep their column count while every other metric is zero."""
    result = missingness_overview({"a": [], "b": []})

    # The two columns are still counted even though they hold no rows.
    assert (result["n_rows"], result["n_cols"]) == (0, 2)
    assert result["n_missing_cells"] == 0
    # With no rows, both percentages checked here are expected to be 0.0.
    assert result["missing_cell_pct"] == 0.0
    assert result["complete_pct"] == 0.0


def test_never_raises_on_garbage():
    """Non-list column values are tolerated: no exception, full key contract."""
    # Columns a and b are not lists; only c is a usable mask column.
    junk = {"a": "not a list", "b": 123, "c": [1, 0, 1]}
    result = missingness_overview(junk)

    assert set(result.keys()) == EXPECTED_KEYS
    # All three columns are counted; the row count is the length of c.
    assert (result["n_rows"], result["n_cols"]) == (3, 3)
    # Only column c contributes missing cells (rows 0 and 2).
    assert result["n_missing_cells"] == 2
    assert result["n_cols_with_null"] == 1