4de071f2f9
project_clusters_2d (pura): PCA(2)+KMeans sobre el MISMO subset estandarizado, devolviendo proyeccion 2D y labels alineados por fila + centroides en espacio PCA + perfiles de cluster desestandarizados. Es la pieza que garantiza la alineacion points<->labels que pca_explained y kmeans_segments no cubren (estandarizan por separado y kmeans descarta los labels). Habilita el scatter PCA coloreado por cluster (MUST-8.1). describe_clusters_llm (impura): micro-analisis LLM de los clusters en una sola llamada a ask_llm (grupo claude-direct), devuelve titulo + descripcion por cluster con degradacion dict-no-throw a titulos genericos si el LLM no responde (MUST-8.2). Ambas re-exportadas en datascience/__init__.py. Tests: 6/6 y 9/9 (sin red). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
161 lines
5.4 KiB
Python
161 lines
5.4 KiB
Python
"""Tests for describe_clusters_llm.

They do NOT touch the network or credentials: _parse_clusters_json can be tested in
isolation, and the only path that would call the LLM (describe_clusters_llm) is exercised
by monkeypatching ask_llm with simulated responses. Covers golden (LLM ok), edge (missing
cluster, array wrapped in junk, empty list / non-list input) and error (LLM down,
unparseable text) cases — all without touching the network.
"""
|
|
|
|
import importlib
|
|
import json
|
|
|
|
from datascience.describe_clusters_llm import (
|
|
_parse_clusters_json,
|
|
describe_clusters_llm,
|
|
)
|
|
|
|
# Sample cluster profiles, shaped like the ones project_clusters_2d produces.
_PROFILES = [
    {
        "cluster": 0,
        "size": 60,
        "pct": 60.0,
        "centroid_original": {"acidez": 8.5, "alcohol": 9.2},
        "distinctive": ["acidez", "alcohol"],
        "centroid_z": {"acidez": 1.4, "alcohol": -0.9},
    },
    {
        "cluster": 1,
        "size": 40,
        "pct": 40.0,
        "centroid_original": {"acidez": 5.1, "alcohol": 13.0},
        "distinctive": ["alcohol"],
        "centroid_z": {"acidez": -0.7, "alcohol": 1.6},
    },
]

# Feature names passed alongside the profiles; note that "azucar" has no entry
# in any of the centroids above.
_FEATURES = ["acidez", "alcohol", "azucar"]
|
|
|
|
|
|
def _patch_ask_llm(monkeypatch, returner):
    """Swap ``ask_llm`` in the module under test for a canned-response stub.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; it undoes the patch
            when the calling test finishes.
        returner: the value the stub returns on every call. Despite the name
            it is returned as-is, never invoked.
    """
    target = importlib.import_module("datascience.describe_clusters_llm")

    def _stub(prompt, model="x", system="", echo=True):
        # All arguments are accepted and ignored; only the canned value matters.
        return returner

    monkeypatch.setattr(target, "ask_llm", _stub)
|
|
|
|
|
|
# --- _parse_clusters_json (parser puro, sin red) ---
|
|
|
|
|
|
def test_parse_clusters_json_valid_array():
    """A well-formed two-entry JSON array parses to the equivalent list of dicts."""
    expected = [
        {"cluster": 0, "title": "A", "description": "desc a"},
        {"cluster": 1, "title": "B", "description": "desc b"},
    ]
    serialized = json.dumps(expected)
    assert _parse_clusters_json(serialized, 2) == expected
|
|
|
|
|
|
def test_parse_clusters_json_wrapped_in_junk_text():
    """The JSON array is still recovered when chatty prose surrounds it."""
    array_text = json.dumps(
        [{"cluster": 0, "title": "Solo uno", "description": "d"}]
    )
    noisy = f"Claro, aqui tienes el resultado:\n{array_text}\nEspero que sirva."
    first = _parse_clusters_json(noisy, 1)[0]
    assert first["title"] == "Solo uno"
    assert first["cluster"] == 0
|
|
|
|
|
|
def test_parse_clusters_json_non_json_returns_none():
    """Text that holds no JSON array degrades to ``None`` instead of raising."""
    not_arrays = ("no hay json aqui", "", "{solo un objeto}")
    for candidate in not_arrays:
        assert _parse_clusters_json(candidate, 2) is None
|
|
|
|
|
|
def test_parse_clusters_json_fills_missing_cluster_by_index():
    """Entries without a ``cluster`` key get their list position as the id."""
    entries = [
        {"title": "A", "description": "d"},
        {"title": "B", "description": "e"},
    ]
    result = _parse_clusters_json(json.dumps(entries), 2)
    for position in (0, 1):
        assert result[position]["cluster"] == position
    assert result[0]["title"] == "A"
|
|
|
|
|
|
# --- describe_clusters_llm (con ask_llm monkeypatcheado, sin red) ---
|
|
|
|
|
|
def test_describe_clusters_llm_ok_with_monkeypatched_llm(monkeypatch):
    """Golden path: a valid JSON reply from the stubbed LLM is passed through."""
    canned_reply = [
        {
            "cluster": 0,
            "title": "Vinos de alta acidez",
            "description": "Acidez por encima de la media y graduacion baja.",
        },
        {
            "cluster": 1,
            "title": "Vinos de alta graduacion",
            "description": "Alcohol claramente por encima de la media.",
        },
    ]
    _patch_ask_llm(monkeypatch, json.dumps(canned_reply))

    result = describe_clusters_llm(_PROFILES, _FEATURES)

    assert result["note"] == ""
    assert result["model"] == "claude-haiku-4-5-20251001"
    described = result["clusters"]
    assert len(described) == 2
    assert described[0]["title"] == "Vinos de alta acidez"
    # Each entry must expose exactly these three keys, nothing more.
    assert set(described[0].keys()) == {"cluster", "title", "description"}
|
|
|
|
|
|
def test_describe_clusters_llm_degrades_on_empty_response(monkeypatch):
    """An empty LLM reply falls back to generic titles and sets a note."""
    # ask_llm returning "" stands in for an error / network outage.
    _patch_ask_llm(monkeypatch, "")

    result = describe_clusters_llm(_PROFILES, _FEATURES)
    described = result["clusters"]

    assert described[0]["title"] == "Cluster 0"
    assert described[1]["title"] == "Cluster 1"
    assert described[0]["description"] == ""
    assert result["note"] == "LLM no disponible"
    assert result["model"] == "claude-haiku-4-5-20251001"
|
|
|
|
|
|
def test_describe_clusters_llm_degrades_on_unparseable_response(monkeypatch):
    """A reply with no parseable JSON falls back to generic titles and a parse note."""
    refusal = "lo siento, no puedo ayudarte con eso"
    _patch_ask_llm(monkeypatch, refusal)

    result = describe_clusters_llm(_PROFILES, _FEATURES)
    described = result["clusters"]

    assert described[0]["title"] == "Cluster 0"
    assert described[1]["title"] == "Cluster 1"
    assert result["note"] == "parse fallido"
|
|
|
|
|
|
def test_describe_clusters_llm_empty_list_skips_llm(monkeypatch):
    """With an empty profile list the LLM must not be called at all."""
    target = importlib.import_module("datascience.describe_clusters_llm")

    def _fail_if_called(*args, **kwargs):
        # Any call here means the empty-input short-circuit was bypassed.
        raise AssertionError("ask_llm no debe llamarse con lista vacia")

    monkeypatch.setattr(target, "ask_llm", _fail_if_called)

    result = describe_clusters_llm([], _FEATURES)

    assert result["clusters"] == []
    assert result["note"] == "sin clusters"
|
|
|
|
|
|
def test_describe_clusters_llm_non_list_input_skips_llm(monkeypatch):
    """Non-list input (``None``) yields an empty result without calling the LLM.

    ``ask_llm`` is replaced with a stub that fails on any call, so the
    "no network" promise is enforced by the test itself: a regression that
    reached the LLM would fail here instead of silently issuing a real request.
    """
    target = importlib.import_module("datascience.describe_clusters_llm")

    def _fail_if_called(*args, **kwargs):
        # Reaching this stub means the non-list short-circuit was bypassed.
        raise AssertionError("ask_llm must not be called for non-list input")

    monkeypatch.setattr(target, "ask_llm", _fail_if_called)

    result = describe_clusters_llm(None, _FEATURES)

    assert result["clusters"] == []
    assert result["note"] == "sin clusters"
    assert result["model"] == "claude-haiku-4-5-20251001"
|