Files
fn_registry/python/functions/datascience/association_matrix_test.py
T
Egutierrez e142ef026d fix(eda): hallazgos de comportamiento del benchmark (H2,H3,H6,H7,H8,H10,H11)
Ronda 4 (verificada con re-corrida sobre los datasets afectados):
- H2: stl_decompose deriva periodo de la frecuencia del indice (seattle period=365
  seasonal_strength=0.84; fin del period=2 espurio)
- H3+H10: infer_fk por senal de nombre (<X>Id->X.<X>Id) + excluir no-clave -> chinook
  111->9 FK, todas reales, cero absurdas, 16-27x mas rapido; base intacta (flag off->111)
- H6: association no computa eta2 si cardinalidad~=n (Ticket-Fare espurio fuera)
- H7: id secuencial monotono excluido de correlacion y PCA/KMeans (PassengerId fuera)
- H8: correlacion de series no estacionarias marcada espuria / sobre retornos
- H11: distribution_type usa modos/cardinalidad/normalidad (quality->discrete)
- 66 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 06:37:47 +02:00

221 lines
8.4 KiB
Python

"""Tests para association_matrix."""
from datascience import association_matrix
def _find_pair(pairs, a, b):
"""Devuelve el par (a, b) sin importar el orden en que aparezca, o None."""
for p in pairs:
if {p["a"], p["b"]} == {a, b}:
return p
return None
def test_two_correlated_numerics_strong_pearson():
    """Two near-linear numeric columns produce a strong pearson/spearman pair."""
    sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    prices = [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1]
    columns = {
        "size": {"values": sizes, "type": "numeric"},
        "price": {"values": prices, "type": "numeric"},
    }

    outcome = association_matrix(columns, strong_threshold=0.5)

    found = _find_pair(outcome["pairs"], "size", "price")
    assert found is not None
    assert found["method"] == "pearson/spearman"
    assert abs(found["value"]) > 0.95
    for coefficient in ("pearson", "spearman"):
        assert coefficient in found["extra"]
    # The strongly correlated pair must also be listed under "strong".
    assert _find_pair(outcome["strong"], "size", "price") is not None
def test_numeric_explained_by_category_strong_correlation_ratio():
    """A categorical that separates a numeric column gives a strong correlation_ratio."""
    regions = ["N", "N", "S", "S", "E", "E", "W", "W"]
    scores = [10.0, 11.0, 50.0, 49.0, 90.0, 91.0, 30.0, 31.0]
    columns = {
        "region": {"values": regions, "type": "categorical"},
        "score": {"values": scores, "type": "numeric"},
    }

    outcome = association_matrix(columns, strong_threshold=0.5)

    found = _find_pair(outcome["pairs"], "region", "score")
    assert found is not None
    assert found["method"] == "correlation_ratio"
    # The category accounts for almost all of the numeric column's variance.
    assert found["value"] > 0.9
    assert _find_pair(outcome["strong"], "region", "score") is not None
def test_independent_pair_not_strong():
    """A pair with no meaningful relationship stays out of the strong list."""
    # x and y are built to be practically unrelated.
    xs = [1, 2, 1, 2, 1, 2, 1, 2]
    ys = [5, 5, 5, 5, 5, 5, 5, 6]
    columns = {
        "x": {"values": xs, "type": "numeric"},
        "y": {"values": ys, "type": "numeric"},
    }

    outcome = association_matrix(columns, strong_threshold=0.5)

    found = _find_pair(outcome["pairs"], "x", "y")
    assert found is not None
    # Neither the main metric nor the mutual information reaches the
    # strong threshold.
    assert abs(found["value"]) < 0.5
    assert found["extra"]["mi"] < 0.5
    assert _find_pair(outcome["strong"], "x", "y") is None
def test_empty_dict_does_not_crash():
    """An empty column mapping yields empty pair lists and still reports the legend."""
    outcome = association_matrix({})
    for key in ("pairs", "strong"):
        assert outcome[key] == []
    assert "methods_legend" in outcome
    legend = outcome["methods_legend"]
    assert "pearson" in legend
def test_single_column_returns_empty():
    """A single column cannot form any pair, so both lists are empty."""
    lone_column = {"values": [1, 2, 3, 4], "type": "numeric"}
    outcome = association_matrix({"only": lone_column})
    for key in ("pairs", "strong"):
        assert outcome[key] == []
def test_pairs_carry_significance_fields():
    """Each evaluated pair exposes p_value, p_value_adjusted and significant.

    After the FDR correction, a strongly correlated numeric/numeric pair is
    expected to be flagged as significant.
    """
    columns = {
        "size": {"values": [1, 2, 3, 4, 5, 6, 7, 8], "type": "numeric"},
        "price": {
            "values": [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1],
            "type": "numeric",
        },
    }
    result = association_matrix(columns, strong_threshold=0.5)
    pair = _find_pair(result["pairs"], "size", "price")
    # Guard first: a missing pair should fail as a readable assertion rather
    # than as a TypeError when indexing/membership-testing None below.
    assert pair is not None
    assert "p_value" in pair and "p_value_adjusted" in pair and "significant" in pair
    assert pair["p_value"] is not None and pair["p_value"] < 0.05
    assert pair["significant"] is True
    # The adjusted p-value is never below the raw one (float tolerance).
    assert pair["p_value_adjusted"] >= pair["p_value"] - 1e-12
def test_result_reports_multiple_testing_summary():
    """The result carries n_tests and a multiple_testing summary with defaults."""
    sizes = [1, 2, 3, 4, 5, 6, 7, 8]
    prices = [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1]
    columns = {
        "size": {"values": sizes, "type": "numeric"},
        "price": {"values": prices, "type": "numeric"},
    }

    outcome = association_matrix(columns)

    # n_tests equals the total number of evaluated pairs.
    evaluated = len(outcome["pairs"])
    assert outcome["n_tests"] == evaluated

    summary = outcome["multiple_testing"]
    assert summary["method"] == "bh"
    assert summary["alpha"] == 0.05
    assert summary["n_rejected"] >= 1
    assert summary["n_tests"] >= 1
def test_strong_requires_corrected_significance():
    """Membership in "strong" depends on corrected significance, not only magnitude.

    A numeric/numeric pair with high magnitude is strong under the usual
    alpha. With an alpha stricter than the pair's own p-value it stops being
    significant and must leave "strong" even though its magnitude is still
    above the threshold.
    """
    columns = {
        "a": {"values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], "type": "numeric"},
        "b": {"values": [2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12], "type": "numeric"},
    }
    relaxed = association_matrix(columns, strong_threshold=0.5, alpha=0.05)
    pair = _find_pair(relaxed["pairs"], "a", "b")
    # Guard first so a missing pair fails as a readable assertion instead of
    # a TypeError when subscripting None.
    assert pair is not None
    assert pair["p_value"] is not None and pair["p_value"] < 0.05
    assert abs(pair["value"]) >= 0.5
    assert _find_pair(relaxed["strong"], "a", "b") is not None
    # An alpha stricter than the pair's p-value -> no longer significant.
    strict = association_matrix(
        columns, strong_threshold=0.5, alpha=pair["p_value"] / 10.0
    )
    sp = _find_pair(strict["pairs"], "a", "b")
    assert sp is not None
    assert abs(sp["value"]) >= 0.5  # magnitude unchanged
    assert sp["significant"] is False
    assert _find_pair(strict["strong"], "a", "b") is None
def test_bonferroni_method_is_accepted():
    """Passing fdr_method="bonferroni" is accepted and echoed in the summary."""
    columns = {
        "size": {"values": [1, 2, 3, 4, 5, 6, 7, 8], "type": "numeric"},
        "price": {
            "values": [2.1, 4.0, 5.9, 8.1, 10.0, 12.2, 13.8, 16.1],
            "type": "numeric",
        },
    }
    result = association_matrix(columns, fdr_method="bonferroni")
    assert result["multiple_testing"]["method"] == "bonferroni"
    pair = _find_pair(result["pairs"], "size", "price")
    # Guard first so a missing pair fails as a readable assertion instead of
    # a TypeError when subscripting None.
    assert pair is not None
    assert pair["p_value_adjusted"] is not None
# --- H6: correlation_ratio espurio por cardinalidad casi-unica ---------------
def test_h6_categorica_casi_unica_excluida():
    """A near-unique categorical must not be paired with a numeric column.

    When a categorical has cardinality ~ n (an id or free text such as a
    ticket number), every group holds a single value, so within-group
    variance is ~0 and correlation_ratio is trivially 1. The pair must be
    neither evaluated nor reported as strong.
    """
    n = 60
    tickets = [f"T{i}" for i in range(n)]
    fares = [float(i) * 1.3 for i in range(n)]
    columns = {
        "ticket": {"values": tickets, "type": "categorical"},
        "fare": {"values": fares, "type": "numeric"},
    }

    outcome = association_matrix(columns)

    for section in ("pairs", "strong"):
        assert _find_pair(outcome[section], "ticket", "fare") is None
def test_h6_categorica_dispersa_con_nulos_excluida():
    """A sparse categorical with mostly-None values is excluded as well.

    Models a column like Cabin (147 distinct over 204 present values): the
    few present values are almost all distinct, giving singleton groups.
    Cardinality is meant to be measured over PRESENT values, not over the
    row count, so that this case is caught.
    """
    # Every fourth row carries a distinct label (20 present); the rest are None.
    cabins = [f"C{i}" if i % 4 == 0 else None for i in range(80)]
    fares = [float(i) for i in range(80)]
    columns = {
        "cabin": {"values": cabins, "type": "categorical"},
        "fare": {"values": fares, "type": "numeric"},
    }

    outcome = association_matrix(columns)

    assert _find_pair(outcome["pairs"], "cabin", "fare") is None
def test_h6_datetime_excluido_de_pares():
    """Datetime columns are left out of the association pairs.

    A datetime behaves like a near-unique per-row index, which would give a
    spurious correlation_ratio of 1 against any numeric column. Time series
    are analysed elsewhere, not here.
    """
    dates = [f"2020-01-{i + 1:02d}" for i in range(10)]
    readings = [float(i) for i in range(10)]
    columns = {
        "date": {"values": dates, "type": "datetime"},
        "value": {"values": readings, "type": "numeric"},
    }

    outcome = association_matrix(columns)

    assert _find_pair(outcome["pairs"], "date", "value") is None
def test_h6_categorica_legitima_se_conserva():
    """Anti-overfiltering edge case: a low-cardinality categorical is kept.

    With large groups (mean group size >= 1.5) the categorical is still
    evaluated and its strong association is preserved.
    """
    regions = ["N", "N", "S", "S", "E", "E", "W", "W"]
    scores = [10.0, 11.0, 50.0, 49.0, 90.0, 91.0, 30.0, 31.0]
    columns = {
        "region": {"values": regions, "type": "categorical"},
        "score": {"values": scores, "type": "numeric"},
    }

    outcome = association_matrix(columns)

    for section in ("pairs", "strong"):
        assert _find_pair(outcome[section], "region", "score") is not None