fn_registry/python/functions/datascience/association_matrix_test.py

"""Tests para association_matrix."""

from datascience import association_matrix


def _find_pair(pairs, a, b):
    """Return the first entry of ``pairs`` that links ``a`` and ``b``, or None.

    The lookup is order-insensitive: an entry matches when the set built from
    its ``"a"`` and ``"b"`` values equals ``{a, b}``.
    """
    matches = (
        entry for entry in pairs if {entry["a"], entry["b"]} == {a, b}
    )
    return next(matches, None)


def test_two_correlated_numerics_strong_pearson():
    """A numeric/numeric pair with near-proportional values is reported as strong."""
    sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    prices = [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1]
    report = association_matrix(
        {
            "size": {"values": sizes, "type": "numeric"},
            "price": {"values": prices, "type": "numeric"},
        },
        strong_threshold=0.5,
    )

    match = _find_pair(report["pairs"], "size", "price")
    assert match is not None
    assert match["method"] == "pearson/spearman"
    assert abs(match["value"]) > 0.95
    # Both coefficients must be exposed in the pair's "extra" payload.
    assert "pearson" in match["extra"]
    assert "spearman" in match["extra"]
    # The strongly correlated pair must also be listed under "strong".
    assert _find_pair(report["strong"], "size", "price") is not None


def test_numeric_explained_by_category_strong_correlation_ratio():
    """A categorical/numeric pair is scored with correlation_ratio and is strong."""
    regions = ["N", "N", "S", "S", "E", "E", "W", "W"]
    scores = [10.0, 11.0, 50.0, 49.0, 90.0, 91.0, 30.0, 31.0]
    report = association_matrix(
        {
            "region": {"values": regions, "type": "categorical"},
            "score": {"values": scores, "type": "numeric"},
        },
        strong_threshold=0.5,
    )

    match = _find_pair(report["pairs"], "region", "score")
    assert match is not None
    assert match["method"] == "correlation_ratio"
    # The category explains almost all of the numeric column's variance.
    assert match["value"] > 0.9
    assert _find_pair(report["strong"], "region", "score") is not None


def test_independent_pair_not_strong():
    """A weakly related numeric pair is evaluated but stays out of "strong"."""
    # x and y are built to be practically independent (no relationship).
    x_values = [1, 2, 1, 2, 1, 2, 1, 2]
    y_values = [5, 5, 5, 5, 5, 5, 5, 6]
    report = association_matrix(
        {
            "x": {"values": x_values, "type": "numeric"},
            "y": {"values": y_values, "type": "numeric"},
        },
        strong_threshold=0.5,
    )

    match = _find_pair(report["pairs"], "x", "y")
    assert match is not None
    # Neither the main metric nor the MI reaches the strong threshold.
    assert abs(match["value"]) < 0.5
    assert match["extra"]["mi"] < 0.5
    assert _find_pair(report["strong"], "x", "y") is None


def test_empty_dict_does_not_crash():
    """An empty input yields empty pair lists and still carries the legend."""
    report = association_matrix({})
    for key in ("pairs", "strong"):
        assert report[key] == []
    assert "methods_legend" in report
    legend = report["methods_legend"]
    assert "pearson" in legend


def test_single_column_returns_empty():
    """With a single column there is nothing to pair, so both lists are empty."""
    report = association_matrix(
        {"only": {"values": [1, 2, 3, 4], "type": "numeric"}}
    )
    for key in ("pairs", "strong"):
        assert report[key] == []


def test_pairs_carry_significance_fields():
    """Every evaluated pair exposes p_value, p_value_adjusted and significant.

    After the FDR correction each evaluated pair carries those three fields,
    and a strongly correlated numeric/numeric pair is expected to be flagged
    as significant.
    """
    columns = {
        "size": {"values": [1, 2, 3, 4, 5, 6, 7, 8], "type": "numeric"},
        "price": {
            "values": [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1],
            "type": "numeric",
        },
    }
    result = association_matrix(columns, strong_threshold=0.5)
    pair = _find_pair(result["pairs"], "size", "price")
    # Fail with a clear assertion (instead of a TypeError on None) when the
    # pair is missing, matching the guard used by the other tests in this file.
    assert pair is not None
    for field in ("p_value", "p_value_adjusted", "significant"):
        assert field in pair
    assert pair["p_value"] is not None and pair["p_value"] < 0.05
    assert pair["significant"] is True
    # The adjusted p-value is never below the raw one (tolerance for float noise).
    assert pair["p_value_adjusted"] >= pair["p_value"] - 1e-12


def test_result_reports_multiple_testing_summary():
    """The result carries n_tests and a multiple_testing summary with defaults."""
    sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    prices = [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1]
    report = association_matrix(
        {
            "size": {"values": sizes, "type": "numeric"},
            "price": {"values": prices, "type": "numeric"},
        }
    )
    # n_tests equals the total number of evaluated pairs.
    assert report["n_tests"] == len(report["pairs"])
    summary = report["multiple_testing"]
    assert summary["method"] == "bh"
    assert summary["alpha"] == 0.05
    assert summary["n_rejected"] >= 1
    assert summary["n_tests"] >= 1


def test_strong_requires_corrected_significance():
    """Membership in "strong" depends on corrected significance, not magnitude alone.

    A numeric/numeric pair with high magnitude is strong under the usual
    alpha. With an alpha stricter than the pair's own p-value it stops being
    significant and leaves "strong" even though its magnitude is still above
    the threshold.
    """
    columns = {
        "a": {"values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "type": "numeric"},
        "b": {"values": [2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12], "type": "numeric"},
    }
    relaxed = association_matrix(columns, strong_threshold=0.5, alpha=0.05)
    pair = _find_pair(relaxed["pairs"], "a", "b")
    # Guard against a missing pair so the failure is an assertion, not a
    # TypeError raised when subscripting None.
    assert pair is not None
    assert pair["p_value"] is not None and pair["p_value"] < 0.05
    assert abs(pair["value"]) >= 0.5
    assert _find_pair(relaxed["strong"], "a", "b") is not None

    # An alpha stricter than the pair's p-value -> no longer significant.
    strict = association_matrix(
        columns, strong_threshold=0.5, alpha=pair["p_value"] / 10.0
    )
    sp = _find_pair(strict["pairs"], "a", "b")
    assert sp is not None
    assert abs(sp["value"]) >= 0.5  # magnitude unchanged
    assert sp["significant"] is False
    assert _find_pair(strict["strong"], "a", "b") is None


def test_bonferroni_method_is_accepted():
    """Passing fdr_method="bonferroni" is accepted and echoed in the summary."""
    columns = {
        "size": {"values": [1, 2, 3, 4, 5, 6, 7, 8], "type": "numeric"},
        "price": {
            "values": [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1],
            "type": "numeric",
        },
    }
    result = association_matrix(columns, fdr_method="bonferroni")
    assert result["multiple_testing"]["method"] == "bonferroni"
    pair = _find_pair(result["pairs"], "size", "price")
    # Guard against a missing pair so the failure is an assertion, not a
    # TypeError raised when subscripting None.
    assert pair is not None
    assert pair["p_value_adjusted"] is not None


# --- H6: spurious correlation_ratio caused by near-unique cardinality --------

def test_h6_categorica_casi_unica_excluida():
    """A categorical with one distinct value per row is not paired at all.

    A categorical whose cardinality is ~ n (an id / free-text column such as
    Ticket) leaves a single value per group -> within-group variance ~= 0 ->
    a trivial correlation_ratio of 1. It must appear neither among the
    evaluated pairs nor among the strong ones.
    """
    n = 60
    tickets = [f"T{i}" for i in range(n)]
    fares = [float(i) * 1.3 for i in range(n)]
    report = association_matrix(
        {
            "ticket": {"values": tickets, "type": "categorical"},
            "fare": {"values": fares, "type": "numeric"},
        }
    )
    for bucket in ("pairs", "strong"):
        assert _find_pair(report[bucket], "ticket", "fare") is None


def test_h6_categorica_dispersa_con_nulos_excluida():
    """A sparse categorical whose present values are all distinct is not paired.

    Models a column with many None entries (like Cabin: 147 distinct over 204
    present): the few present values are almost all different -> singleton
    groups. Per the original note, cardinality is measured over the PRESENT
    values rather than over the n rows so that this case is caught.
    """
    # Every fourth row holds a distinct label; the other rows are None.
    cabins = [None if i % 4 != 0 else f"C{i}" for i in range(80)]
    fares = [float(i) for i in range(80)]
    report = association_matrix(
        {
            "cabin": {"values": cabins, "type": "categorical"},
            "fare": {"values": fares, "type": "numeric"},
        }
    )
    assert _find_pair(report["pairs"], "cabin", "fare") is None


def test_h6_datetime_excluido_de_pares():
    """A datetime column is left out of the association pairs.

    A datetime behaves like a unique-ish per-row index -> a spurious
    correlation_ratio of 1 against any numeric column. Per the original note,
    it is excluded from association pairs (time series are analysed
    elsewhere, not here).
    """
    dates = [f"2020-01-{i + 1:02d}" for i in range(10)]
    readings = [float(i) for i in range(10)]
    report = association_matrix(
        {
            "date": {"values": dates, "type": "datetime"},
            "value": {"values": readings, "type": "numeric"},
        }
    )
    assert _find_pair(report["pairs"], "date", "value") is None


def test_h6_categorica_legitima_se_conserva():
    """Anti-over-filtering edge case: a low-cardinality categorical is kept.

    A categorical with large groups (mean group size >= 1.5, per the original
    note) is STILL evaluated, and its strong association is preserved.
    """
    regions = ["N", "N", "S", "S", "E", "E", "W", "W"]
    scores = [10.0, 11.0, 50.0, 49.0, 90.0, 91.0, 30.0, 31.0]
    report = association_matrix(
        {
            "region": {"values": regions, "type": "categorical"},
            "score": {"values": scores, "type": "numeric"},
        }
    )
    for bucket in ("pairs", "strong"):
        assert _find_pair(report[bucket], "region", "score") is not None