feat(eda): project_clusters_2d + describe_clusters_llm para el capitulo MODELOS

project_clusters_2d (pura): PCA(2)+KMeans sobre el MISMO subset estandarizado, devolviendo proyeccion 2D y labels alineados por fila + centroides en espacio PCA + perfiles de cluster desestandarizados. Es la pieza que garantiza la alineacion points<->labels que pca_explained y kmeans_segments no cubren (estandarizan por separado y kmeans descarta los labels). Habilita el scatter PCA coloreado por cluster (MUST-8.1). describe_clusters_llm (impura): micro-analisis LLM de los clusters en una sola llamada a ask_llm (grupo claude-direct), devuelve titulo + descripcion por cluster con degradacion dict-no-throw a titulos genericos si el LLM no responde (MUST-8.2). Ambas re-exportadas en datascience/__init__.py. Tests: 6/6 y 9/9 (sin red). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 14:57:27 +02:00
parent cb7a7fc1fd
commit 4de071f2f9
7 changed files with 931 additions and 0 deletions
@@ -0,0 +1,160 @@
+"""Tests para describe_clusters_llm.
+
+NO acceden a red ni a credenciales: _parse_clusters_json es testeable aislada y la
+unica via que llamaria al LLM (describe_clusters_llm) se prueba monkeypatcheando
+ask_llm con respuestas simuladas. Cubre golden (LLM ok), edge (cluster faltante,
+array envuelto en basura, lista vacia / input no-lista) y error (LLM caido, texto
+no parseable) — todos sin tocar la red.
+"""
+
+import importlib
+import json
+
+from datascience.describe_clusters_llm import (
+    _parse_clusters_json,
+    describe_clusters_llm,
+)
+
+# Sample profiles shaped like the output of project_clusters_2d.
+_PROFILES = [
+    {
+        "cluster": 0,
+        "size": 60,
+        "pct": 60.0,
+        "centroid_original": {"acidez": 8.5, "alcohol": 9.2},
+        "distinctive": ["acidez", "alcohol"],
+        "centroid_z": {"acidez": 1.4, "alcohol": -0.9},
+    },
+    {
+        "cluster": 1,
+        "size": 40,
+        "pct": 40.0,
+        "centroid_original": {"acidez": 5.1, "alcohol": 13.0},
+        "distinctive": ["alcohol"],
+        "centroid_z": {"acidez": -0.7, "alcohol": 1.6},
+    },
+]
+# Feature names passed alongside the profiles; note that "azucar" does not
+# appear in any centroid of _PROFILES above.
+_FEATURES = ["acidez", "alcohol", "azucar"]
+
+
+def _patch_ask_llm(monkeypatch, returner):
+    """Replace ask_llm in the module under test with a stub.
+
+    The stub accepts the same arguments the lambda-based original did
+    (prompt, model, system, echo) and always returns ``returner``, which is
+    the canned response value itself (not a callable).
+    """
+
+    def _fake_ask_llm(prompt, model="x", system="", echo=True):
+        return returner
+
+    target = importlib.import_module("datascience.describe_clusters_llm")
+    monkeypatch.setattr(target, "ask_llm", _fake_ask_llm)
+
+
+# --- _parse_clusters_json (parser puro, sin red) ---
+
+
+def test_parse_clusters_json_valid_array():
+    """A clean JSON array of two cluster entries is returned as-is."""
+    expected = [
+        {"cluster": 0, "title": "A", "description": "desc a"},
+        {"cluster": 1, "title": "B", "description": "desc b"},
+    ]
+    # Serialise the expected structure and check the parser gives it back.
+    assert _parse_clusters_json(json.dumps(expected), 2) == expected
+
+
+def test_parse_clusters_json_wrapped_in_junk_text():
+    """The JSON array is still extracted when surrounded by chatty text."""
+    array_json = json.dumps(
+        [{"cluster": 0, "title": "Solo uno", "description": "d"}]
+    )
+    noisy = f"Claro, aqui tienes el resultado:\n{array_json}\nEspero que sirva."
+    first = _parse_clusters_json(noisy, 1)[0]
+    assert first["title"] == "Solo uno"
+    assert first["cluster"] == 0
+
+
+def test_parse_clusters_json_non_json_returns_none():
+    """Inputs without a JSON array yield None instead of raising."""
+    not_arrays = ("no hay json aqui", "", "{solo un objeto}")
+    for text in not_arrays:
+        assert _parse_clusters_json(text, 2) is None
+
+
+def test_parse_clusters_json_fills_missing_cluster_by_index():
+    """Entries lacking a "cluster" key receive their position in the array."""
+    entries = [
+        {"title": "A", "description": "d"},
+        {"title": "B", "description": "e"},
+    ]
+    result = _parse_clusters_json(json.dumps(entries), 2)
+    assert (result[0]["cluster"], result[1]["cluster"]) == (0, 1)
+    assert result[0]["title"] == "A"
+
+
+# --- describe_clusters_llm (con ask_llm monkeypatcheado, sin red) ---
+
+
+def test_describe_clusters_llm_ok_with_monkeypatched_llm(monkeypatch):
+    """Golden path: a valid stubbed LLM answer is surfaced with an empty note."""
+    llm_answer = [
+        {
+            "cluster": 0,
+            "title": "Vinos de alta acidez",
+            "description": "Acidez por encima de la media y graduacion baja.",
+        },
+        {
+            "cluster": 1,
+            "title": "Vinos de alta graduacion",
+            "description": "Alcohol claramente por encima de la media.",
+        },
+    ]
+    _patch_ask_llm(monkeypatch, json.dumps(llm_answer))
+
+    result = describe_clusters_llm(_PROFILES, _FEATURES)
+    assert result["note"] == ""
+    assert result["model"] == "claude-haiku-4-5-20251001"
+    clusters = result["clusters"]
+    assert len(clusters) == 2
+    assert clusters[0]["title"] == "Vinos de alta acidez"
+    # Each returned entry exposes exactly these three keys.
+    assert set(clusters[0]) == {"cluster", "title", "description"}
+
+
+def test_describe_clusters_llm_degrades_on_empty_response(monkeypatch):
+    """An empty ask_llm answer falls back to generic titles plus a note."""
+    # The stub returns "" to simulate an LLM error / network outage.
+    _patch_ask_llm(monkeypatch, "")
+
+    result = describe_clusters_llm(_PROFILES, _FEATURES)
+    fallback = result["clusters"]
+    assert fallback[0]["title"] == "Cluster 0"
+    assert fallback[1]["title"] == "Cluster 1"
+    assert fallback[0]["description"] == ""
+    assert result["note"] == "LLM no disponible"
+    assert result["model"] == "claude-haiku-4-5-20251001"
+
+
+def test_describe_clusters_llm_degrades_on_unparseable_response(monkeypatch):
+    """A non-JSON LLM answer falls back to generic titles and a parse note."""
+    refusal = "lo siento, no puedo ayudarte con eso"
+    _patch_ask_llm(monkeypatch, refusal)
+
+    result = describe_clusters_llm(_PROFILES, _FEATURES)
+    fallback = result["clusters"]
+    assert fallback[0]["title"] == "Cluster 0"
+    assert fallback[1]["title"] == "Cluster 1"
+    assert result["note"] == "parse fallido"
+
+
+def test_describe_clusters_llm_empty_list_skips_llm(monkeypatch):
+    """With an empty profile list the LLM must not be called at all."""
+
+    def _must_not_be_called(*args, **kwargs):
+        # Any invocation of ask_llm fails the test immediately.
+        raise AssertionError("ask_llm no debe llamarse con lista vacia")
+
+    monkeypatch.setattr(
+        importlib.import_module("datascience.describe_clusters_llm"),
+        "ask_llm",
+        _must_not_be_called,
+    )
+
+    result = describe_clusters_llm([], _FEATURES)
+    assert result["clusters"] == []
+    assert result["note"] == "sin clusters"
+
+
+def test_describe_clusters_llm_non_list_input_skips_llm(monkeypatch):
+    """A non-list input (None) yields an empty result without calling the LLM.
+
+    ask_llm is replaced by a stub that raises, so the "skips LLM" claim in the
+    test name is actually enforced: an accidental call fails the test instead
+    of silently reaching the real LLM.
+    """
+
+    def boom(*args, **kwargs):
+        raise AssertionError("ask_llm no debe llamarse con input no-lista")
+
+    mod = importlib.import_module("datascience.describe_clusters_llm")
+    monkeypatch.setattr(mod, "ask_llm", boom)
+
+    out = describe_clusters_llm(None, _FEATURES)
+    assert out["clusters"] == []
+    assert out["note"] == "sin clusters"
+    assert out["model"] == "claude-haiku-4-5-20251001"