763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
"""Tests para kmeans_segments."""
|
|
|
|
import numpy as np
|
|
|
|
from kmeans_segments import kmeans_segments
|
|
|
|
|
|
def _three_blobs(seed: int = 0, per_blob: int = 40) -> dict[str, list[float]]:
    """Generate three well-separated 2-D Gaussian blobs as row-aligned columns.

    Args:
        seed: Seed passed to ``np.random.default_rng``; the same seed always
            produces the same points.
        per_blob: Number of points drawn around each of the three centers.

    Returns:
        A dict with keys ``"x"`` and ``"y"``, each a list of ``3 * per_blob``
        Python floats. Index ``i`` of both lists describes the same point,
        and points are grouped blob by blob in the order of ``centers``.
    """
    rng = np.random.default_rng(seed)
    centers = [(0.0, 0.0), (12.0, 12.0), (0.0, 12.0)]
    xs: list[float] = []
    ys: list[float] = []
    for cx, cy in centers:
        # A 0.4 standard deviation is tiny next to the 12-unit spacing of the
        # centers, which is what keeps the blobs clearly separated.
        pts = rng.normal(loc=(cx, cy), scale=0.4, size=(per_blob, 2))
        # Column slices + tolist() yield plain Python floats, one per row.
        xs.extend(pts[:, 0].tolist())
        ys.extend(pts[:, 1].tolist())
    return {"x": xs, "y": ys}
|
|
|
|
|
|
def test_three_separated_blobs_finds_k3() -> None:
    """Three well-separated blobs should be segmented with ``best_k == 3``.

    Feeds 120 rows (3 blobs x 40 points) in two numeric columns to
    ``kmeans_segments`` with ``k_min=2`` and ``k_max=8``, then checks the
    selected k, the silhouette score, the bookkeeping fields and that
    ``scores_by_k`` holds one entry for every k that was tried.
    """
    columns = _three_blobs(seed=0, per_blob=40)
    result = kmeans_segments(columns, k_min=2, k_max=8)

    assert result["best_k"] == 3
    assert result["silhouette"] > 0.5
    assert result["n_features"] == 2
    assert result["n_rows_used"] == 120
    assert sum(result["cluster_sizes"]) == 120
    assert len(result["centers"]) == 3
    # scores_by_k covers the whole tested range (k_min..k_max inclusive).
    ks = [s["k"] for s in result["scores_by_k"]]
    assert ks == list(range(2, 9))
|
|
|
|
|
|
def test_insufficient_rows_returns_note() -> None:
    """Too few rows should yield ``best_k == 0`` and the insufficient-data note."""
    # Only 3 valid rows, while k_min * 2 = 4 -> not enough data.
    columns = {"x": [1.0, 2.0, 3.0], "y": [1.0, 2.0, 3.0]}
    result = kmeans_segments(columns, k_min=2, k_max=8)

    assert result["best_k"] == 0
    assert result["note"] == "datos insuficientes"
|
|
|
|
|
|
def test_insufficient_numeric_columns_returns_note() -> None:
    """A single numeric column should yield ``best_k == 0`` and the note."""
    # Only one numeric column; the other holds text -> fewer than 2 numeric.
    columns = {
        "x": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        "label": ["a", "b", "c", "d", "e", "f"],
    }
    result = kmeans_segments(columns, k_min=2, k_max=8)

    assert result["best_k"] == 0
    assert result["note"] == "datos insuficientes"
|
|
|
|
|
|
def test_rows_with_none_are_dropped() -> None:
    """A row containing ``None`` should be excluded from the rows used."""
    columns = _three_blobs(seed=1, per_blob=40)
    # Inject None into one row; it must be discarded, leaving 119 of 120.
    columns["x"][0] = None
    result = kmeans_segments(columns, k_min=2, k_max=8)

    assert result["best_k"] == 3
    assert result["n_rows_used"] == 119
|