# fn_registry/python/functions/datascience/eda_llm_insights.py

"""eda_llm_insights — capa LLM interpretativa del grupo de capacidad `eda`.

Toma un TableProfile YA CALCULADO (el dict que produce `profile_table`) y, con
UNA sola llamada al LLM, genera el bloque interpretativo "llm": resumen de la
tabla, significado de una fila, diccionario de datos, deteccion de PII (RGPD),
sugerencias de limpieza y analisis sugeridos.

Clave de coste y privacidad: NO se envian filas crudas al LLM. Solo viaja el
perfil AGREGADO (nombres, tipos, % nulos, distinct, top valores ya agregados de
categoricas, stats de numericas y pares de correlacion fuertes). Asi el coste es
minimo y ningun dato fila-a-fila sale del proceso.

Reusa `ask_llm` del registry (grupo claude-direct, API directa con el token
OAuth de Claude en ~/.claude/.credentials.json, arranque 0). Impura: una llamada
de red. Estilo dict-no-throw del grupo: nunca lanza; ante cualquier fallo (red,
LLM, parseo) devuelve {status:'error', error:str}.
"""

import json

from core.ask_llm import ask_llm

# Keys the LLM is expected to return, mapped to the default value that
# `_normalize` substitutes when a key is missing (or null) in the response.
_EXPECTED_KEYS = {
    "summary": "",
    "row_meaning": "",
    "dictionary": [],
    "pii": [],
    "cleaning": [],
    "analyses": [],
}

# System prompt handed to `ask_llm` by `eda_llm_insights`. It fixes the analyst
# role and asks for a single bare JSON object (no markdown fences, no
# surrounding text) carrying the same six keys listed in `_EXPECTED_KEYS`.
# The text is sent to the model verbatim, so any edit here changes behavior.
_SYSTEM = (
    "Eres un analista de datos senior. Recibes el PERFIL AGREGADO de una tabla "
    "(nunca filas crudas) y lo interpretas de forma util para un humano de "
    "negocio. Detectas datos personales/sensibles segun el RGPD. Respondes "
    "SIEMPRE y SOLO con un unico objeto JSON valido, sin texto alrededor, sin "
    "fences de markdown, con EXACTAMENTE estas claves: "
    '"summary" (str: que es la tabla, 2-3 frases), '
    '"row_meaning" (str: que representa una fila y su granularidad), '
    '"dictionary" (lista de objetos {"column","description","business_meaning","unit"}), '
    '"pii" (lista de objetos {"column","kind","severity"} con severity en '
    'low|medium|high, solo columnas con datos personales/sensibles), '
    '"cleaning" (lista de strings con sugerencias de limpieza/transformacion), '
    '"analyses" (lista de strings con preguntas/analisis sugeridos e hipotesis '
    "de relaciones). Responde en el mismo idioma que los nombres de columna."
)


def _fmt_num(value) -> str:
    """Formatea un numero de forma compacta para el prompt (None -> '?')."""
    if value is None:
        return "?"
    if isinstance(value, float):
        if value == int(value):
            return str(int(value))
        return f"{value:.4g}"
    return str(value)


def _build_prompt(profile: dict) -> str:
    """Render a table profile as a compact text prompt for the LLM.

    Pure helper: it only reads the given dict and builds a string, so it can
    be exercised without credentials or network access.

    The prompt has a short header (table name, row count, column count), one
    line per column, an optional block listing ``correlations["strong"]``
    pairs, and a closing instruction. Each column line carries the name,
    inferred type, semantic type (when set), null percentage and distinct
    count, followed by min/max/mean/median for numeric columns or the top
    three values otherwise (when a ``categorical`` dict is present).

    Args:
        profile: The table profile dict; ``None`` or an empty dict yields a
            prompt with no column lines.

    Returns:
        The prompt text, lines joined with ``"\\n"``.
    """

    def _column_line(column) -> str:
        """Render a single column entry as one ' | '-separated line."""
        name = column.get("name", "?")
        inferred = column.get("inferred_type") or "?"
        semantic = column.get("semantic_type") or ""
        null_pct = column.get("null_pct")
        if isinstance(null_pct, (int, float)):
            null_text = f"{null_pct * 100:.1f}%"
        else:
            null_text = "?"
        distinct = column.get("distinct_count")

        fields = [f"- {name}", f"tipo={inferred}"]
        if semantic:
            fields.append(f"semantic={semantic}")
        fields += [f"nulos={null_text}", f"distinct={_fmt_num(distinct)}"]

        numeric = column.get("numeric")
        categorical = column.get("categorical")
        if inferred == "numeric" and isinstance(numeric, dict):
            low, high, mean = (
                _fmt_num(numeric.get(key)) for key in ("min", "max", "mean")
            )
            # "p50" takes precedence; "median" is only the fallback key.
            median = numeric.get("p50")
            if median is None:
                median = numeric.get("median")
            fields.append(
                f"stats[min={low} max={high} mean={mean} "
                f"median={_fmt_num(median)}]"
            )
        elif isinstance(categorical, dict):
            top_values = categorical.get("top") or []
            joined = ", ".join(
                f"{item.get('value')!r}({_fmt_num(item.get('count'))})"
                for item in top_values[:3]
            )
            if joined:
                fields.append(f"top3=[{joined}]")

        return " | ".join(fields)

    def _pair_line(pair) -> str:
        """Render one strong-correlation pair as a bullet line."""
        left = pair.get("a", "?")
        right = pair.get("b", "?")
        method = pair.get("method", "?")
        value = _fmt_num(pair.get("value"))
        return f"- {left} ~ {right} ({method}={value})"

    profile = profile or {}
    table_name = profile.get("table", "(desconocida)")
    row_count = profile.get("n_rows")
    columns = profile.get("columns") or []

    out = [
        "Perfil agregado de una tabla. No hay filas crudas, solo metricas.",
        f"Tabla: {table_name}",
        f"Filas (n_rows): {_fmt_num(row_count)}",
        f"Columnas: {len(columns)}",
        "",
        "Columnas:",
    ]
    out.extend(_column_line(column) for column in columns)

    correlations = profile.get("correlations")
    strong_pairs = (
        correlations.get("strong") if isinstance(correlations, dict) else None
    )
    if strong_pairs:
        out += ["", "Correlaciones/asociaciones fuertes:"]
        out.extend(_pair_line(pair) for pair in strong_pairs)

    out += [
        "",
        "Devuelve el objeto JSON descrito en las instrucciones del sistema.",
    ]
    return "\n".join(out)


def _parse_llm_json(text: str) -> dict:
    """Extrae el primer objeto JSON de la respuesta del LLM.

    Funcion interna testeable sin red. Tolera fences ```json ... ``` y texto
    alrededor del objeto. Localiza el primer '{' y hace matching de llaves
    (respetando strings/escapes) hasta cerrar el objeto, luego json.loads.

    Args:
        text: respuesta cruda del LLM.

    Returns:
        El dict parseado.

    Raises:
        ValueError: si no se encuentra un objeto JSON valido.
    """
    if not text or not isinstance(text, str):
        raise ValueError("empty LLM response")

    s = text.strip()
    # Quita fences de markdown si los hay.
    if s.startswith("```"):
        # Elimina la primera linea de fence (```json o ```) y un posible cierre.
        first_nl = s.find("\n")
        if first_nl != -1:
            s = s[first_nl + 1 :]
        if s.rstrip().endswith("```"):
            s = s.rstrip()[:-3]
        s = s.strip()

    start = s.find("{")
    if start == -1:
        raise ValueError("no JSON object found in LLM response")

    depth = 0
    in_str = False
    escape = False
    end = -1
    for i in range(start, len(s)):
        ch = s[i]
        if in_str:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = i + 1
                break

    if end == -1:
        raise ValueError("unbalanced JSON object in LLM response")

    return json.loads(s[start:end])


def _normalize(parsed: dict) -> dict:
    """Return a dict holding exactly the keys of ``_EXPECTED_KEYS``.

    Values present in ``parsed`` are kept as they are. A key that is missing
    or maps to ``None`` receives its default from ``_EXPECTED_KEYS``. Keys of
    ``parsed`` that are not expected are dropped.

    Args:
        parsed: The decoded LLM response.

    Returns:
        A new dict with every expected key filled in.
    """

    def _resolve(key, default):
        """Pick the parsed value for ``key`` or fall back to its default."""
        current = parsed.get(key)
        if current is not None:
            return current
        # Hand out a fresh list each time so results never share the
        # module-level default object.
        return [] if isinstance(default, list) else default

    return {
        key: _resolve(key, default) for key, default in _EXPECTED_KEYS.items()
    }


def eda_llm_insights(
    profile: dict, model: str = "claude-haiku-4-5-20251001"
) -> dict:
    """Interpret a table profile with a single LLM call.

    The profile is turned into a text prompt by ``_build_prompt`` and sent
    through ``ask_llm`` together with the ``_SYSTEM`` prompt. The reply is
    decoded by ``_parse_llm_json`` and completed by ``_normalize``.

    The function never raises: every failure (invalid input, LLM/network
    error, unparsable reply) is reported in the returned dict.

    Args:
        profile: Non-empty table profile dict.
        model: Model id forwarded to ``ask_llm``.

    Returns:
        On success ``{"status": "ok", "llm": {...}}`` where ``llm`` carries the
        keys ``summary``, ``row_meaning``, ``dictionary``, ``pii``,
        ``cleaning`` and ``analyses``. On failure
        ``{"status": "error", "error": <non-empty message>}``.
    """
    try:
        if not isinstance(profile, dict) or not profile:
            return {"status": "error", "error": "profile vacio o no es dict"}

        prompt = _build_prompt(profile)
        text = ask_llm(prompt, model=model, system=_SYSTEM, echo=False)
        if not text:
            return {"status": "error", "error": "respuesta vacia del LLM"}

        parsed = _parse_llm_json(text)
        if not isinstance(parsed, dict):
            return {"status": "error", "error": "el LLM no devolvio un objeto JSON"}

        return {"status": "ok", "llm": _normalize(parsed)}
    except Exception as e:  # noqa: BLE001
        # Exceptions raised without arguments stringify to "", which would
        # give callers an empty (and falsy) error field. Fall back to the
        # exception class name so the error is always informative.
        return {"status": "error", "error": str(e) or type(e).__name__}