# fn_registry/python/functions/datascience/extract_entities_llm.py

"""Extrae entidades de un chunk de texto usando un LLM inyectado."""

import sys
import os
import warnings
from typing import Callable

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from python.types.datascience.entity_candidate import EntityCandidate


def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
    """Construye el system prompt para extraccion de entidades."""
    lines = [
        "You are an entity extraction expert. Given text, extract all entities",
        "matching these types. For each entity, provide: name, type_ref,",
        "attributes (matching the metadata_fields for that type), and a",
        "confidence score (0.0-1.0).",
        "",
        "Entity types:",
    ]

    for schema_entry in entity_schema:
        label = schema_entry.get("label", "Unknown")
        type_ref = schema_entry.get("type_ref", "")
        metadata_fields = schema_entry.get("metadata_fields", [])
        lines.append(f"- {label} (type_ref: {type_ref})")
        if metadata_fields:
            lines.append(f"  fields: {', '.join(metadata_fields)}")

    lines += [
        "",
        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
        "",
        "Rules:",
        "- Only extract entities explicitly mentioned in the text",
        "- Use the exact type_ref from the schema",
        "- Leave unknown attributes as null",
        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
        f"- {language_instruction}",
    ]

    return "\n".join(lines)


def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity type schema, calls the LLM with
    that prompt plus the text as the user message, and validates the response
    item by item, returning only well-formed entities as EntityCandidate
    objects. Malformed LLM output never raises: it is reported through
    ``warnings.warn`` and the offending item (or the whole response) is
    discarded.

    Args:
        text: Text chunk to analyse.
        entity_schema: Entity types with their metadata fields. Each entry is
            a dict with the keys 'type_ref', 'label' and optionally
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: Callable that receives a list of OpenAI-style messages
            and returns the LLM's JSON response as a dict. Interface:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: Language instruction appended to the prompt.
            Defaults to "Respond in English."

    Returns:
        The extracted EntityCandidate list. Empty when the LLM call raises,
        when the response is not a dict, when its 'entities' value is not a
        list, or when no valid entity is found.

    Raises:
        ValueError: If entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # The keys of this mapping double as the set of accepted type_refs.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    system_prompt = _build_system_prompt(entity_schema, language_instruction)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # Best-effort boundary: any failure of the injected callable is reported
    # as a warning and yields an empty result instead of propagating.
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # The callable may hand back None, a list or a string when the model did
    # not produce a JSON object; treat that like a missing 'entities' list
    # rather than failing on `.get`.
    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        name = item.get("name", "")
        if not name:
            continue

        type_ref = item.get("type_ref", "")
        try:
            known_type = type_ref in type_ref_to_label
        except TypeError:
            # Unhashable type_ref (e.g. a list or dict emitted by the model):
            # it cannot match any schema entry.
            known_type = False
        if not known_type:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the model left as null so only known values remain.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        # `confidence != confidence` is true only for NaN; without this check
        # min(1.0, nan) would turn a NaN score into full confidence.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        # Clamp into [0.0, 1.0].
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label[type_ref],
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates