fix(eda): hallazgos de comportamiento del benchmark (H2,H3,H6,H7,H8,H10,H11)
Ronda 4 (verificada con re-corrida sobre los datasets afectados):
- H2: stl_decompose deriva el periodo de la frecuencia del índice (seattle period=365, seasonal_strength=0.84; fin del period=2 espurio)
- H3+H10: infer_fk por señal de nombre (<X>Id -> X.<X>Id) + excluir no-clave -> chinook 111->9 FK, todas reales, cero absurdas, 16-27x más rápido; base intacta (flag off -> 111)
- H6: association no computa eta2 si cardinalidad ~= n (Ticket-Fare espurio fuera)
- H7: id secuencial monótono excluido de correlación y PCA/KMeans (PassengerId fuera)
- H8: correlación de series no estacionarias marcada espuria / sobre retornos
- H11: distribution_type usa modos/cardinalidad/normalidad (quality -> discrete)
- 66 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -43,3 +43,57 @@ def test_detect_exactly_30():
|
||||
values = rng.normal(0, 1, 30).tolist()
|
||||
result = detect_distribution_type(values)
|
||||
assert result["type"] != "too_few_samples"
|
||||
|
||||
|
||||
# --- H11: discrete / multimodal no deben etiquetarse "normal-ish" ---
|
||||
|
||||
|
||||
def test_detect_discrete_low_cardinality():
    """A low-cardinality ordinal sample must be labelled "discrete" (H11)."""
    # Ordinal rating with 6 levels (like wine `quality`): its skewness is
    # small, so it used to fall into "normal-ish"; it must now be "discrete".
    rng = np.random.default_rng(3)
    # integers(3, 9) samples the half-open range [3, 9): 6 possible values.
    values = rng.integers(3, 9, size=1500).astype(float).tolist()
    result = detect_distribution_type(values)
    assert result["type"] == "discrete", f"Got {result['type']}"
    assert result["stats"]["n_unique"] <= 15
|
||||
|
||||
|
||||
def test_detect_multimodal():
    """A well-separated two-component mixture must be labelled "multimodal"."""
    # Clearly separated bimodal mixture with skewness ~0: previously
    # "normal-ish", now "multimodal".
    rng = np.random.default_rng(4)
    values = np.concatenate(
        [rng.normal(-4, 0.6, 1000), rng.normal(4, 0.6, 1000)]
    ).tolist()
    result = detect_distribution_type(values)
    assert result["type"] == "multimodal", f"Got {result['type']}"
    assert result["stats"]["n_modes"] >= 2
|
||||
|
||||
|
||||
def test_detect_normal_still_normal_after_fix():
    """A genuine continuous normal sample must keep the "normal-ish" label."""
    # Backward compatibility: a genuine continuous normal stays "normal-ish"
    # despite the new cardinality / mode checks.
    rng = np.random.default_rng(5)
    values = rng.normal(10, 2, 2000).tolist()
    result = detect_distribution_type(values)
    assert result["type"] == "normal-ish", f"Got {result['type']}"
    assert result["stats"]["n_modes"] == 1
    assert result["stats"]["n_unique"] > 15
|
||||
|
||||
|
||||
def test_detect_stats_has_new_keys():
    """The returned "stats" mapping must expose the newly added keys."""
    rng = np.random.default_rng(6)
    values = rng.normal(0, 1, 200).tolist()
    stats = detect_distribution_type(values)["stats"]
    # Check each key separately so a failure names the missing one.
    for key in ("n_unique", "n_modes", "jb_stat", "jb_pvalue"):
        assert key in stats, f"missing {key}"
|
||||
|
||||
|
||||
def test_detect_unimodal_skewed_not_multimodal():
    """A skewed unimodal sample must not be reported as "multimodal"."""
    # Skewed unimodal continuous sample (exponential): the mode detector must
    # not invent spurious modes and the label must not be "multimodal".
    rng = np.random.default_rng(8)
    values = rng.exponential(1.0, 2000).tolist()
    result = detect_distribution_type(values)
    assert result["type"] != "multimodal", f"Got {result['type']}"
    assert result["stats"]["n_modes"] == 1
|
||||
|
||||
Reference in New Issue
Block a user