63a9cb5273
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift, diff_entities/relations, extract_entities/relations_llm, hotness_score, melt, merge_graphs, pivot, build_entity/relation_schema_prompt. Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order, hawkes_intensity + módulo finance.py. Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py. Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
142 lines
4.8 KiB
Python
142 lines
4.8 KiB
Python
"""extract_relations_llm — extract relations between entities using an LLM."""
|
|
|
|
import logging
|
|
import sys
|
|
import os
|
|
from typing import Callable
|
|
|
|
# Prepend the directory three levels above this file's directory to sys.path.
# NOTE(review): presumably that directory is the repository root, so that the
# absolute `python.types...` imports in this module resolve when it is loaded
# outside of an installed package — confirm against the project layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
|
|
|
|
from python.types.datascience.entity_candidate import EntityCandidate
|
|
from python.types.datascience.relation_candidate import RelationCandidate
|
|
|
|
logger = logging.getLogger(__name__)


def _build_system_prompt(entities, relation_types, language_instruction):
    """Render the system prompt listing the known entities and allowed relation types."""
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    relation_types_str = ", ".join(relation_types)

    return f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""


def _normalize_confidence(value):
    """Coerce an LLM-provided confidence into a float within [0.0, 1.0]."""
    if not isinstance(value, (int, float)):
        return 0.0
    # NaN is the only number that compares unequal to itself. Without this
    # guard, min(1.0, nan) evaluates to 1.0 and a NaN confidence would be
    # reported as "explicitly stated".
    if value != value:
        return 0.0
    return float(max(0.0, min(1.0, value)))


def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between entities of a text chunk using an LLM.

    Given the original text and the entities already extracted from it, asks
    the LLM to identify relations between pairs of entities. The LLM output is
    treated as untrusted: malformed items are discarded one by one instead of
    failing the whole extraction.

    Validation applied to each returned relation:
      * ``from_name`` and ``to_name`` must be strings that exactly match the
        ``name`` of one of ``entities``; otherwise the relation is discarded.
      * A ``relation_type`` that is not in ``relation_types`` is replaced by
        ``"related_to"`` (even if ``"related_to"`` itself is not listed).
      * ``confidence`` is clamped to ``[0.0, 1.0]``; missing, non-numeric or
        NaN values become ``0.0``.
      * ``description`` defaults to ``""`` when missing or ``None``; other
        non-string values are converted with ``str()``.

    Args:
        text: The text chunk (the same one used to extract the entities).
        entities: Entities already extracted from the chunk. Each one must
            expose ``name``, ``type_label`` and ``type_ref`` attributes.
        relation_types: Allowed relation types, e.g. ``["funds", "employs",
            "communicates_with", "owns", "related_to"]``.
        llm_chat_json: Injected callable that receives a list of messages
            (dicts with ``"role"`` and ``"content"``) and returns a dict with
            the LLM's JSON response.
        language_instruction: Language instruction appended to the prompt.

    Returns:
        The validated RelationCandidate list. Empty when there are fewer than
        two entities, when the LLM call raises, when the response is not a
        dict with a ``"relations"`` list, or when no relation survives
        validation.
    """
    # A relation needs two endpoints; skip the LLM call entirely otherwise.
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    messages = [
        {
            "role": "system",
            "content": _build_system_prompt(entities, relation_types, language_instruction),
        },
        {"role": "user", "content": text},
    ]

    # Best-effort boundary: any failure of the injected callable yields "no relations".
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # The callable is typed to return a dict, but a misbehaving backend may
    # hand back None or a bare list; guard before calling .get().
    if not isinstance(response, dict):
        logger.warning(
            "extract_relations_llm: LLM response is not a dict (got %s)",
            type(response).__name__,
        )
        return []

    raw_relations = response.get("relations", [])
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # The isinstance checks come first because JSON lists/objects are
        # unhashable and would raise TypeError on the set membership test.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        description = item.get("description", "")
        if description is None:
            description = ""
        elif not isinstance(description, str):
            description = str(description)

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=_normalize_confidence(item.get("confidence", 0.0)),
            )
        )

    return results
|