63a9cb5273
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift, diff_entities/relations, extract_entities/relations_llm, hotness_score, melt, merge_graphs, pivot, build_entity/relation_schema_prompt. Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order, hawkes_intensity + módulo finance.py. Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py. Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
121 lines
4.3 KiB
Python
121 lines
4.3 KiB
Python
"""Tests for deduplicate_relations."""
|
|
|
|
import os
|
|
import sys
|
|
|
|
# Make RelationCandidate importable from python/types/datascience/
|
|
_here = os.path.dirname(os.path.abspath(__file__))
|
|
_types_path = os.path.join(_here, "..", "..", "..", "python", "types", "datascience")
|
|
if _types_path not in sys.path:
|
|
sys.path.insert(0, _types_path)
|
|
|
|
from relation_candidate import RelationCandidate
|
|
from deduplicate_relations import deduplicate_relations
|
|
|
|
|
|
def _make_rel(
    from_name: str,
    to_name: str,
    relation_type: str = "works_at",
    description: str = "",
    confidence: float = 0.8,
    source_chunk_index: int = 0,
) -> RelationCandidate:
    """Build a RelationCandidate for the tests.

    Only the two endpoint names are required; every other field falls back
    to the default given in the signature.
    """
    fields = {
        "from_name": from_name,
        "to_name": to_name,
        "relation_type": relation_type,
        "description": description,
        "confidence": confidence,
        "source_chunk_index": source_chunk_index,
    }
    return RelationCandidate(**fields)
|
|
|
|
|
|
# entity_id_map tipico: claves en lowercase normalizado
|
|
_ENTITY_MAP: dict[str, str] = {
|
|
"john smith": "entity_001",
|
|
"acme corp": "entity_002",
|
|
"jane doe": "entity_003",
|
|
"google": "entity_004",
|
|
}
|
|
|
|
|
|
def test_dos_relaciones_identicas_se_colapsan_en_una():
    """Two identical relations (same from, to, type) collapse into one."""
    duplicates = [
        _make_rel(
            "John Smith",
            "Acme Corp",
            description="John es CEO",
            confidence=score,
        )
        for score in (0.9, 0.7)
    ]

    deduped = deduplicate_relations(duplicates, _ENTITY_MAP)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.from_id == "entity_001"
    assert survivor.to_id == "entity_002"
    # The surviving relation is expected to carry the maximum confidence.
    assert survivor.confidence == 0.9
|
|
|
|
|
|
def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto():
    """A relation that uses a merged name resolves to the correct ID."""
    # Extend the base map so that "smith, john" is an alias of entity_001.
    alias_map = dict(_ENTITY_MAP)
    alias_map["smith, john"] = "entity_001"
    candidate = _make_rel("Smith, John", "Acme Corp")

    resolved = deduplicate_relations([candidate], alias_map)

    assert len(resolved) == 1
    only = resolved[0]
    assert only.from_id == "entity_001"
    assert only.to_id == "entity_002"
|
|
|
|
|
|
def test_self_loop_se_descarta():
    """A self-loop (from_id == to_id) is discarded."""
    loop = _make_rel("John Smith", "John Smith", relation_type="knows")

    kept = deduplicate_relations([loop], _ENTITY_MAP)

    assert len(kept) == 0
|
|
|
|
|
|
def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta():
    """A relation whose name is unmapped and has no fuzzy match is discarded."""
    orphan = _make_rel("Unknown Entity XYZ", "Acme Corp")

    kept = deduplicate_relations([orphan], _ENTITY_MAP)

    assert len(kept) == 0
|
|
|
|
|
|
def test_relaciones_distintas_se_mantienen():
    """Relations with distinct (from, to, type) triples are all kept."""
    triples = (
        ("John Smith", "Acme Corp", "works_at"),
        ("Jane Doe", "Acme Corp", "works_at"),
        ("John Smith", "Google", "invested_in"),
    )
    distinct = [
        _make_rel(source, target, relation_type=kind)
        for source, target, kind in triples
    ]

    kept = deduplicate_relations(distinct, _ENTITY_MAP)

    assert len(kept) == 3
|
|
|
|
|
|
def test_merge_descripcion_concatena_unicas():
    """Merging relations concatenates their unique descriptions."""
    variants = (
        ("John es CEO", 0.9),
        ("Acme fue fundada por John", 0.7),
        ("John es CEO", 0.6),
    )
    candidates = [
        _make_rel("John Smith", "Acme Corp", description=text, confidence=score)
        for text, score in variants
    ]

    merged = deduplicate_relations(candidates, _ENTITY_MAP)

    assert len(merged) == 1
    combined = merged[0].description
    assert "John es CEO" in combined
    assert "Acme fue fundada por John" in combined
    # The repeated description ("John es CEO") must not appear twice.
    assert combined.count("John es CEO") == 1
    assert merged[0].confidence == 0.9
|
|
|
|
|
|
def test_lista_vacia_retorna_lista_vacia():
    """An empty list of relations yields an empty list."""
    no_relations = []

    assert deduplicate_relations(no_relations, _ENTITY_MAP) == []
|
|
|
|
|
|
def test_fuzzy_match_resuelve_nombre_cercano():
    """A name with a small typo is resolved through fuzzy matching."""
    # "john smit" is at distance 1 from "john smith".
    typo = _make_rel("John Smit", "Acme Corp")

    matched = deduplicate_relations([typo], _ENTITY_MAP)

    assert len(matched) == 1
    assert matched[0].from_id == "entity_001"
|