"""extract_relations_llm — extrae relaciones entre entidades usando un LLM.""" import logging import sys import os from typing import Callable sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "")) from python.types.datascience.entity_candidate import EntityCandidate from python.types.datascience.relation_candidate import RelationCandidate logger = logging.getLogger(__name__) def extract_relations_llm( text: str, entities: list[EntityCandidate], relation_types: list[str], llm_chat_json: Callable[[list[dict]], dict], language_instruction: str = "Respond in English.", ) -> list[RelationCandidate]: """Extrae relaciones entre entidades de un chunk de texto usando un LLM. Dado el texto original y las entidades ya extraidas, pide al LLM que identifique relaciones entre pares de entidades. Las relaciones cuyo from_name o to_name no coincidan con ninguna entidad existente se descartan. Los tipos de relacion no permitidos se reemplazan por "related_to". Args: text: chunk de texto (el mismo que se uso para extraer las entidades). entities: entidades ya extraidas del chunk. relation_types: tipos de relacion permitidos, ej: ["funds", "employs", "communicates_with", "owns", "related_to"]. llm_chat_json: funcion inyectada que recibe una lista de mensajes (dicts con "role" y "content") y retorna un dict con la respuesta JSON del LLM. language_instruction: instruccion de idioma para el LLM. Returns: Lista de RelationCandidate validados. Vacia si hay menos de 2 entidades o si el LLM no encuentra relaciones. """ if len(entities) < 2: return [] entity_names = {e.name for e in entities} relation_types_set = set(relation_types) # Construir lista de entidades para el prompt entity_lines = "\n".join( f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities ) # Construir tipos de relacion para el prompt relation_types_str = ", ".join(relation_types) system_prompt = f"""\ You are a relation extraction expert. Given text and a list of entities already \ extracted, identify relationships between them. Entities found in this text: {entity_lines} Allowed relation types: {relation_types_str} Output JSON: {{"relations": [ {{"from_name": "Entity A", "to_name": "Entity B", "relation_type": "employs", "description": "...", "confidence": 0.8}} ]}} Rules: - Only extract relations explicitly stated or strongly implied in the text - from_name and to_name must match entity names exactly as listed above - relation_type must be one of the allowed types - Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied - Do not invent entities not in the list above - {language_instruction}""" messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": text}, ] try: response = llm_chat_json(messages) except Exception as exc: logger.warning("extract_relations_llm: LLM call failed: %s", exc) return [] raw_relations = response.get("relations", []) if not isinstance(raw_relations, list): logger.warning("extract_relations_llm: 'relations' is not a list in LLM response") return [] results: list[RelationCandidate] = [] for item in raw_relations: if not isinstance(item, dict): continue from_name = item.get("from_name", "") to_name = item.get("to_name", "") # Validar que ambos nombres corresponden a entidades existentes if from_name not in entity_names: logger.debug( "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando", from_name, ) continue if to_name not in entity_names: logger.debug( "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando", to_name, ) continue relation_type = item.get("relation_type", "") if relation_type not in relation_types_set: logger.debug( "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'", relation_type, ) relation_type = "related_to" confidence = item.get("confidence", 0.0) if not isinstance(confidence, (int, float)): confidence = 0.0 confidence = float(max(0.0, min(1.0, confidence))) results.append( RelationCandidate( from_name=from_name, to_name=to_name, relation_type=relation_type, description=item.get("description", ""), confidence=confidence, ) ) return results