fn_registry/python/functions/datascience/extract_relations_llm.py

"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""

import logging
import math
import os
import sys
from typing import Callable

# Prepend the directory three levels above this file's directory to sys.path
# (the trailing "" makes os.path.join end the path with a separator).
# Presumably this is the package root that makes the absolute ``python.*``
# imports below resolvable when the module is loaded directly — confirm
# against how the registry loads function modules.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))

# Project-local types: the entity input type and the relation output type
# used by extract_relations_llm.
from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.relation_candidate import RelationCandidate

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between already-extracted entities using an LLM.

    Builds a system prompt listing the entities and the allowed relation
    types, sends it together with ``text`` to the injected LLM callable, and
    validates every relation in the reply:

    * relations whose ``from_name`` or ``to_name`` is not the exact name of
      one of ``entities`` are discarded;
    * a ``relation_type`` outside ``relation_types`` is replaced by
      ``"related_to"`` (even if ``"related_to"`` itself is not in the list);
    * ``confidence`` is coerced to a float in ``[0.0, 1.0]``; non-numeric,
      boolean and NaN values become ``0.0``;
    * a missing, null or non-string ``description`` becomes ``""``.

    The LLM output is treated as untrusted: malformed items are skipped
    individually instead of aborting the whole extraction.

    Args:
        text: Text chunk (the same one the entities were extracted from).
        entities: Entities already extracted from the chunk. Each must expose
            ``name``, ``type_label`` and ``type_ref`` attributes.
        relation_types: Allowed relation types, e.g. ``["funds", "employs",
            "communicates_with", "owns", "related_to"]``.
        llm_chat_json: Injected function that receives a list of messages
            (dicts with ``"role"`` and ``"content"``) and returns the LLM's
            JSON reply as a dict.
        language_instruction: Language instruction appended to the prompt.

    Returns:
        The validated ``RelationCandidate`` list. Empty when fewer than two
        entities are given, when the LLM call raises, when the reply is not
        a dict or its ``"relations"`` value is not a list, or when no
        relation survives validation.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list for the prompt; fall back to a generic label when an
    # entity carries neither a type label nor a type reference.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )

    # Allowed relation types for the prompt.
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
  {{"from_name": "Entity A", "to_name": "Entity B",
   "relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    # Best-effort boundary: any failure of the injected LLM call yields an
    # empty result instead of propagating.
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # The callable is annotated as returning a dict, but a reply such as
    # None or a bare list would otherwise crash on ``.get``.
    if not isinstance(response, dict):
        logger.warning(
            "extract_relations_llm: LLM response is not a dict (got %s)",
            type(response).__name__,
        )
        return []

    raw_relations = response.get("relations", [])
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both endpoints must be existing entity names. The isinstance check
        # comes first because an unhashable JSON value (list/object) would
        # raise TypeError on the set membership test.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        # Unknown (or non-string) relation types degrade to the generic one.
        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        # A JSON null or non-string description is normalized to the same
        # empty string used when the key is missing.
        description = item.get("description", "")
        if not isinstance(description, str):
            description = ""

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=_normalize_confidence(item.get("confidence", 0.0)),
            )
        )

    return results


def _normalize_confidence(value: object) -> float:
    """Coerce a raw LLM confidence value into a float within [0.0, 1.0]."""
    # bool is a subclass of int; a JSON true/false is not a confidence score.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0.0
    # NaN compares false against everything, so min(1.0, nan) would return
    # 1.0 and silently turn an invalid score into full confidence.
    if isinstance(value, float) and math.isnan(value):
        return 0.0
    return float(max(0.0, min(1.0, value)))