32c7336bf6
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
212 lines
6.0 KiB
Python
212 lines
6.0 KiB
Python
"""Tests para eda_llm_insights.
|
|
|
|
NO acceden a red ni a credenciales: _build_prompt y _parse_llm_json son puras y
|
|
testeables aisladas; la unica via que llamaria al LLM (eda_llm_insights) se
|
|
prueba monkeypatcheando ask_llm con una respuesta simulada.
|
|
"""
|
|
|
|
import json
|
|
|
|
from datascience.eda_llm_insights import (
|
|
_build_prompt,
|
|
_parse_llm_json,
|
|
eda_llm_insights,
|
|
)
|
|
|
|
# Sample profile shaped like the output of profile_table.
_PROFILE = {
    "table": "ventas",
    "n_rows": 1000,
    "columns": [
        # Numeric column: stats live under "numeric", "categorical" is unset.
        {
            "name": "importe",
            "inferred_type": "numeric",
            "semantic_type": "currency",
            "null_pct": 0.0,
            "distinct_count": 950,
            "numeric": {
                "min": 1.0,
                "max": 999.0,
                "mean": 50.5,
                "p50": 42.0,
            },
            "categorical": None,
        },
        # Categorical column: top values live under "categorical", "numeric" is unset.
        {
            "name": "categoria",
            "inferred_type": "categorical",
            "semantic_type": "",
            "null_pct": 0.05,
            "distinct_count": 3,
            "numeric": None,
            "categorical": {
                "top": [
                    {"value": "neumaticos", "count": 600, "pct": 0.6},
                    {"value": "frenos", "count": 300, "pct": 0.3},
                    {"value": "aceite", "count": 100, "pct": 0.1},
                ],
                "mode": "neumaticos",
            },
        },
    ],
    "correlations": {
        "strong": [
            {
                "a": "importe",
                "b": "categoria",
                "method": "correlation_ratio",
                "value": 0.72,
            },
        ],
    },
}
|
|
|
|
|
|
def test_build_prompt_includes_table_and_columns():
    """The prompt is a string that mentions the table, its columns and n_rows."""
    rendered = _build_prompt(_PROFILE)
    assert isinstance(rendered, str)
    # Table name, both column names, and the row count (n_rows) must appear.
    for fragment in ("ventas", "importe", "categoria", "1000"):
        assert fragment in rendered
|
|
|
|
|
|
def test_build_prompt_includes_numeric_stats_and_top_values():
    """The prompt carries numeric stats, top categorical values and correlations."""
    rendered = _build_prompt(_PROFILE)
    expected_fragments = (
        "stats[",  # numeric stats section for importe
        "mean=50.5",
        "neumaticos",  # top value of the categorical column
        "correlation_ratio",  # method of the strong correlation
    )
    missing = [fragment for fragment in expected_fragments if fragment not in rendered]
    assert not missing
|
|
|
|
|
|
def test_build_prompt_handles_empty_profile():
    """An empty profile still yields a string prompt that reports zero columns."""
    empty_profile = {}
    rendered = _build_prompt(empty_profile)
    assert isinstance(rendered, str)
    assert "Columnas: 0" in rendered
|
|
|
|
|
|
def test_parse_llm_json_plain():
    """A bare JSON document is parsed and its summary is readable."""
    raw = json.dumps({"summary": "una tabla", "dictionary": [], "pii": []})
    result = _parse_llm_json(raw)
    assert result["summary"] == "una tabla"
|
|
|
|
|
|
def test_parse_llm_json_with_fences():
    """JSON wrapped in a ```json code fence is still parsed."""
    body = json.dumps({"summary": "con fences", "analyses": ["a1"]})
    fenced = f"```json\n{body}\n```"
    result = _parse_llm_json(fenced)
    assert result["summary"] == "con fences"
    assert result["analyses"] == ["a1"]
|
|
|
|
|
|
def test_parse_llm_json_with_surrounding_text():
    """Prose before and after the JSON object does not prevent parsing."""
    wrapped = "\n".join(
        [
            "Aqui tienes el resultado:",
            json.dumps({"summary": "rodeado"}),
            "Espero que sirva.",
        ]
    )
    result = _parse_llm_json(wrapped)
    assert result["summary"] == "rodeado"
|
|
|
|
|
|
def test_parse_llm_json_nested_braces_in_strings():
    """Braces inside string values must not break the object matching."""
    raw = '{"summary": "usa {placeholders}", "cleaning": ["fix {x}"]}'
    result = _parse_llm_json(raw)
    observed = (result["summary"], result["cleaning"])
    assert observed == ("usa {placeholders}", ["fix {x}"])
|
|
|
|
|
|
def test_parse_llm_json_raises_without_object():
    """_parse_llm_json must raise ValueError when the text holds no JSON object.

    Raises:
        AssertionError: if the call returns without raising ValueError.
    """
    try:
        _parse_llm_json("no hay json aqui")
    except ValueError:
        return  # Expected outcome.
    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would let this test pass without checking anything.
    raise AssertionError("esperaba ValueError")
|
|
|
|
|
|
def test_eda_llm_insights_ok_with_monkeypatched_llm(monkeypatch):
    """Simulate the LLM reply and verify the shape of the output (no network)."""
    import importlib

    canned = {
        "summary": "Tabla de ventas",
        "row_meaning": "Una fila = una venta",
        "dictionary": [
            {
                "column": "importe",
                "description": "monto",
                "business_meaning": "ingreso",
                "unit": "EUR",
            }
        ],
        "pii": [],
        "cleaning": ["normalizar categoria"],
        "analyses": ["ventas por categoria"],
    }

    def _fake_ask_llm(prompt, model="x", system="", echo=True):
        # Stand-in for ask_llm: always answers with the canned JSON payload.
        return json.dumps(canned)

    target = importlib.import_module("datascience.eda_llm_insights")
    monkeypatch.setattr(target, "ask_llm", _fake_ask_llm)

    result = eda_llm_insights(_PROFILE)
    assert result["status"] == "ok"

    insights = result["llm"]
    expected_keys = {
        "summary",
        "row_meaning",
        "dictionary",
        "pii",
        "cleaning",
        "analyses",
    }
    assert set(insights.keys()) == expected_keys
    assert insights["summary"] == "Tabla de ventas"
    assert insights["dictionary"][0]["unit"] == "EUR"
|
|
|
|
|
|
def test_eda_llm_insights_fills_missing_keys(monkeypatch):
    """Keys the LLM leaves out are filled in with empty defaults."""
    import importlib

    def _summary_only(prompt, model="x", system="", echo=True):
        # Stand-in for ask_llm: a reply that carries nothing but "summary".
        return '{"summary": "solo summary"}'

    target = importlib.import_module("datascience.eda_llm_insights")
    monkeypatch.setattr(target, "ask_llm", _summary_only)

    result = eda_llm_insights(_PROFILE)
    assert result["status"] == "ok"

    insights = result["llm"]
    assert insights["summary"] == "solo summary"
    # List-valued sections default to an empty list...
    for list_key in ("dictionary", "pii", "cleaning", "analyses"):
        assert insights[list_key] == []
    # ...and the text-valued one to an empty string.
    assert insights["row_meaning"] == ""
|
|
|
|
|
|
def test_eda_llm_insights_error_on_empty_profile():
    """An empty profile yields an error result whose message mentions 'profile'."""
    result = eda_llm_insights({})
    assert result["status"] == "error"
    message = result["error"]
    assert "profile" in message
|
|
|
|
|
|
def test_eda_llm_insights_error_on_empty_llm_response(monkeypatch):
    """An empty string from the LLM is reported as an error status."""
    import importlib

    def _empty_reply(prompt, model="x", system="", echo=True):
        # Stand-in for ask_llm that answers with nothing at all.
        return ""

    target = importlib.import_module("datascience.eda_llm_insights")
    monkeypatch.setattr(target, "ask_llm", _empty_reply)

    result = eda_llm_insights(_PROFILE)
    assert result["status"] == "error"
|
|
|
|
|
|
def test_eda_llm_insights_error_on_unparseable_llm_response(monkeypatch):
    """An LLM reply with no JSON in it is reported as an error status."""
    import importlib

    def _garbage_reply(prompt, model="x", system="", echo=True):
        # Stand-in for ask_llm that answers with plain text and no JSON.
        return "sin json"

    target = importlib.import_module("datascience.eda_llm_insights")
    monkeypatch.setattr(target, "ask_llm", _garbage_reply)

    result = eda_llm_insights(_PROFILE)
    assert result["status"] == "error"
|