# fn_registry/python/functions/datascience/deduplicate_entities_test.py

"""Tests for deduplicate_entities."""

import sys
import os

# Prepend the directory three levels above this file to the module search
# path; presumably this is what lets the absolute `python.*` imports below
# resolve when the tests are run from this directory (verify against the
# repository layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from python.types.datascience.entity_candidate import EntityCandidate
from python.functions.datascience.deduplicate_entities import deduplicate_entities


def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
    """Build an EntityCandidate fixture for the tests in this module.

    Args:
        name: Name given to the candidate.
        type_ref: Type reference; its capitalized form is used as the label.
        confidence: Confidence score stored on the candidate.
        **attrs: Extra keyword arguments, stored as the candidate's attributes.

    Returns:
        A candidate whose ``source_chunk_indices`` is always ``[0]``.
    """
    label = type_ref.capitalize()
    fields = {
        "name": name,
        "type_ref": type_ref,
        "type_label": label,
        "attributes": attrs,
        "confidence": confidence,
        "source_chunk_indices": [0],
    }
    return EntityCandidate(**fields)


def test_john_smith_y_smith_john_merge():
    """'John Smith' and 'Smith, John' are merged into one entity."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]
    outcome = deduplicate_entities(candidates)
    # Two candidates go in; one entity and one merge-log entry come out.
    assert outcome.total_before == 2
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
    assert len(outcome.merge_log) == 1


def test_google_y_google_llc_merge():
    """'Google' and 'Google LLC' are merged into one entity."""
    plain = _make("Google", type_ref="organization")
    with_suffix = _make("Google LLC", type_ref="organization")
    outcome = deduplicate_entities([plain, with_suffix])
    # Both the reported count and the entity list must show a single entity.
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1


def test_ip_matching_exacto():
    """Two candidates named 192.168.1.1 are merged via exact matching."""
    address = "192.168.1.1"
    # Same name and type; only the confidence differs between the two.
    low = _make(address, type_ref="ip", confidence=0.8)
    high = _make(address, type_ref="ip", confidence=0.9)
    outcome = deduplicate_entities([low, high])
    assert outcome.total_after == 1


def test_same_name_different_type_no_merge():
    """John Smith (person) and John Smith (organization) are NOT merged."""
    shared_name = "John Smith"
    as_person = _make(shared_name, type_ref="person")
    as_org = _make(shared_name, type_ref="organization")
    # With same_type_only=True, both candidates are expected to survive.
    outcome = deduplicate_entities([as_person, as_org], same_type_only=True)
    assert outcome.total_after == 2


def test_clusters_transitivos():
    """Transitive clusters: A~B, B~C -> {A, B, C} end up in a single cluster."""
    # Each name drops one trailing character from the previous one, so
    # neighbouring names in the chain are very similar to each other.
    chain = [
        _make("Alice Johnson", type_ref="person"),
        _make("Alice Johnso", type_ref="person"),
        _make("Alice Johns", type_ref="person"),
    ]
    outcome = deduplicate_entities(chain, name_threshold=0.80)
    assert outcome.total_after == 1


def test_sin_duplicados_sin_cambios():
    """Entities without duplicates pass through unmodified."""
    distinct_names = ("Alice Smith", "Bob Jones", "Charlie Brown")
    people = [_make(person_name, type_ref="person") for person_name in distinct_names]
    outcome = deduplicate_entities(people)
    # Counts are unchanged and no merge is recorded.
    assert outcome.total_before == 3
    assert outcome.total_after == 3
    assert len(outcome.merge_log) == 0


def test_confidence_y_atributos_merge_correctos():
    """Confidence takes the cluster maximum; attributes are merged together."""
    weaker = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
    stronger = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")
    outcome = deduplicate_entities([weaker, stronger])
    assert outcome.total_after == 1
    merged = outcome.entities[0]
    # The merged confidence is max(0.7, 0.95).
    assert merged.confidence == 0.95
    # Attribute keys contributed by both candidates must be present.
    assert "role" in merged.attributes
    assert "company" in merged.attributes


def test_lista_vacia():
    """An empty input list yields an empty result."""
    no_candidates = []
    outcome = deduplicate_entities(no_candidates)
    # Both counters are zero and both collections are empty lists.
    assert outcome.total_before == 0
    assert outcome.total_after == 0
    assert outcome.entities == []
    assert outcome.merge_log == []


def test_name_to_id_resolucion():
    """name_to_id covers the original names of the merged cluster."""
    pair = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]
    outcome = deduplicate_entities(pair)
    # After the merge, the entity id map must hold exactly one id.
    mapped_ids = list(outcome.entity_id_map.values())
    assert len(mapped_ids) == 1
    merged_id = mapped_ids[0]
    # At least one name resolves to that id, and name_to_id has at least
    # two entries (one per original name is the expectation).
    assert any(target == merged_id for target in outcome.name_to_id.values())
    assert len(outcome.name_to_id) >= 2