"""Tests para deduplicate_entities.""" import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..")) from python.types.datascience.entity_candidate import EntityCandidate from python.functions.datascience.deduplicate_entities import deduplicate_entities def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate: return EntityCandidate( name=name, type_ref=type_ref, type_label=type_ref.capitalize(), attributes=attrs, confidence=confidence, source_chunk_indices=[0], ) def test_john_smith_y_smith_john_merge(): """John Smith y Smith, John se mergean.""" a = _make("John Smith", type_ref="person") b = _make("Smith, John", type_ref="person") result = deduplicate_entities([a, b]) assert result.total_before == 2 assert result.total_after == 1 assert len(result.entities) == 1 assert len(result.merge_log) == 1 def test_google_y_google_llc_merge(): """Google y Google LLC se mergean.""" a = _make("Google", type_ref="organization") b = _make("Google LLC", type_ref="organization") result = deduplicate_entities([a, b]) assert result.total_after == 1 assert len(result.entities) == 1 def test_ip_matching_exacto(): """192.168.1.1 y 192.168.1.1 se mergean por matching exacto.""" a = _make("192.168.1.1", type_ref="ip", confidence=0.8) b = _make("192.168.1.1", type_ref="ip", confidence=0.9) result = deduplicate_entities([a, b]) assert result.total_after == 1 def test_same_name_different_type_no_merge(): """John Smith (person) y John Smith (organization) NO se mergean.""" a = _make("John Smith", type_ref="person") b = _make("John Smith", type_ref="organization") result = deduplicate_entities([a, b], same_type_only=True) assert result.total_after == 2 def test_clusters_transitivos(): """Clusters transitivos: A~B, B~C -> {A, B, C} en un solo cluster.""" a = _make("Alice Johnson", type_ref="person") b = _make("Alice Johnso", type_ref="person") # muy similar a A c = _make("Alice Johns", type_ref="person") # muy similar a B result = deduplicate_entities([a, b, c], name_threshold=0.80) assert result.total_after == 1 def test_sin_duplicados_sin_cambios(): """Entidades sin duplicados pasan sin modificacion.""" a = _make("Alice Smith", type_ref="person") b = _make("Bob Jones", type_ref="person") c = _make("Charlie Brown", type_ref="person") result = deduplicate_entities([a, b, c]) assert result.total_before == 3 assert result.total_after == 3 assert len(result.merge_log) == 0 def test_confidence_y_atributos_merge_correctos(): """Confidence toma el max del cluster; atributos se fusionan.""" a = _make("John Smith", type_ref="person", confidence=0.7, role="CEO") b = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme") result = deduplicate_entities([a, b]) assert result.total_after == 1 entity = result.entities[0] # confidence = max(0.7, 0.95) assert entity.confidence == 0.95 # atributos de ambos candidatos presentes assert "role" in entity.attributes assert "company" in entity.attributes def test_lista_vacia(): """Lista vacia retorna resultado vacio.""" result = deduplicate_entities([]) assert result.total_before == 0 assert result.total_after == 0 assert result.entities == [] assert result.merge_log == [] def test_name_to_id_resolucion(): """name_to_id contiene todos los nombres originales del cluster.""" a = _make("John Smith", type_ref="person") b = _make("Smith, John", type_ref="person") result = deduplicate_entities([a, b]) # Ambos nombres deben apuntar al mismo ID ids = list(result.entity_id_map.values()) assert len(ids) == 1 ent_id = ids[0] # name_to_id debe tener entradas para los nombres originales assert any(v == ent_id for v in result.name_to_id.values()) assert len(result.name_to_id) >= 2