"""Tests para deduplicate_relations.""" import os import sys # Permitir importar RelationCandidate desde python/types/datascience/ _here = os.path.dirname(os.path.abspath(__file__)) _types_path = os.path.join(_here, "..", "..", "..", "python", "types", "datascience") if _types_path not in sys.path: sys.path.insert(0, _types_path) from relation_candidate import RelationCandidate from deduplicate_relations import deduplicate_relations def _make_rel( from_name: str, to_name: str, relation_type: str = "works_at", description: str = "", confidence: float = 0.8, source_chunk_index: int = 0, ) -> RelationCandidate: return RelationCandidate( from_name=from_name, to_name=to_name, relation_type=relation_type, description=description, confidence=confidence, source_chunk_index=source_chunk_index, ) # entity_id_map tipico: claves en lowercase normalizado _ENTITY_MAP: dict[str, str] = { "john smith": "entity_001", "acme corp": "entity_002", "jane doe": "entity_003", "google": "entity_004", } def test_dos_relaciones_identicas_se_colapsan_en_una(): """2 relaciones identicas (from, to, type) → 1.""" rels = [ _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9), _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.7), ] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 1 assert result[0].from_id == "entity_001" assert result[0].to_id == "entity_002" assert result[0].confidence == 0.9 # max def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto(): """Relacion con nombre mergeado → se resuelve al ID correcto.""" # entity_id_map incluye "smith, john" como alias de entity_001 merged_map = {**_ENTITY_MAP, "smith, john": "entity_001"} rels = [_make_rel("Smith, John", "Acme Corp")] result = deduplicate_relations(rels, merged_map) assert len(result) == 1 assert result[0].from_id == "entity_001" assert result[0].to_id == "entity_002" def test_self_loop_se_descarta(): """Self-loop (from_id == to_id) → descartado.""" rels = [_make_rel("John Smith", "John Smith", relation_type="knows")] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 0 def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta(): """Relacion con nombre no mapeado y sin fuzzy match → descartada.""" rels = [_make_rel("Unknown Entity XYZ", "Acme Corp")] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 0 def test_relaciones_distintas_se_mantienen(): """Relaciones con (from, to, type) distintos → todas se mantienen.""" rels = [ _make_rel("John Smith", "Acme Corp", relation_type="works_at"), _make_rel("Jane Doe", "Acme Corp", relation_type="works_at"), _make_rel("John Smith", "Google", relation_type="invested_in"), ] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 3 def test_merge_descripcion_concatena_unicas(): """Merge de relaciones: descripciones unicas se concatenan.""" rels = [ _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.9), _make_rel("John Smith", "Acme Corp", description="Acme fue fundada por John", confidence=0.7), _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=0.6), ] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 1 assert "John es CEO" in result[0].description assert "Acme fue fundada por John" in result[0].description # La descripcion duplicada ("John es CEO") no aparece dos veces assert result[0].description.count("John es CEO") == 1 assert result[0].confidence == 0.9 def test_lista_vacia_retorna_lista_vacia(): """Lista vacia de relaciones → lista vacia.""" result = deduplicate_relations([], _ENTITY_MAP) assert result == [] def test_fuzzy_match_resuelve_nombre_cercano(): """Nombre con typo pequeño → fuzzy match lo resuelve.""" # "john smit" tiene distancia 1 de "john smith" rels = [_make_rel("John Smit", "Acme Corp")] result = deduplicate_relations(rels, _ENTITY_MAP) assert len(result) == 1 assert result[0].from_id == "entity_001"