# fn_registry/python/functions/datascience/deduplicate_relations_test.py

"""Tests for deduplicate_relations."""

import os
import sys

# Allow importing RelationCandidate from python/types/datascience/.
# _here is the directory that contains this test file; the types directory is
# located by climbing three levels and descending into python/types/datascience.
_here = os.path.dirname(os.path.abspath(__file__))
_types_path = os.path.join(_here, "..", "..", "..", "python", "types", "datascience")
# Prepend the path only if it is not already present, so the entry is not duplicated.
if _types_path not in sys.path:
    sys.path.insert(0, _types_path)

from relation_candidate import RelationCandidate
from deduplicate_relations import deduplicate_relations


def _make_rel(
    from_name: str,
    to_name: str,
    relation_type: str = "works_at",
    description: str = "",
    confidence: float = 0.8,
    source_chunk_index: int = 0,
) -> RelationCandidate:
    """Build a RelationCandidate for tests, defaulting every field but the names.

    Args:
        from_name: Name of the source entity of the relation.
        to_name: Name of the target entity of the relation.
        relation_type: Relation label; defaults to "works_at".
        description: Free-text description; defaults to the empty string.
        confidence: Confidence score; defaults to 0.8.
        source_chunk_index: Index of the originating chunk; defaults to 0.

    Returns:
        A RelationCandidate populated with exactly the given values.
    """
    # Collect the fields first, then forward them all as keyword arguments.
    fields = {
        "from_name": from_name,
        "to_name": to_name,
        "relation_type": relation_type,
        "description": description,
        "confidence": confidence,
        "source_chunk_index": source_chunk_index,
    }
    return RelationCandidate(**fields)


# Typical entity_id_map used by the tests: keys are normalized lowercase
# entity names, values are the entity IDs the tests assert against.
_ENTITY_MAP: dict[str, str] = {
    "john smith": "entity_001",
    "acme corp": "entity_002",
    "jane doe": "entity_003",
    "google": "entity_004",
}


def test_dos_relaciones_identicas_se_colapsan_en_una():
    """Two relations with the same (from, to, type) collapse into one."""
    # Same endpoints, type and description; only the confidence differs.
    duplicates = [
        _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=score)
        for score in (0.9, 0.7)
    ]
    merged = deduplicate_relations(duplicates, _ENTITY_MAP)
    assert len(merged) == 1
    survivor = merged[0]
    assert survivor.from_id == "entity_001"
    assert survivor.to_id == "entity_002"
    # The surviving relation is expected to carry the maximum confidence.
    assert survivor.confidence == 0.9


def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto():
    """A relation that uses a merged (alias) name resolves to the right ID."""
    # Extend the base map with "smith, john" as an alias of entity_001.
    alias_map = dict(_ENTITY_MAP)
    alias_map["smith, john"] = "entity_001"
    aliased = _make_rel("Smith, John", "Acme Corp")
    resolved = deduplicate_relations([aliased], alias_map)
    assert len(resolved) == 1
    only = resolved[0]
    assert only.from_id == "entity_001"
    assert only.to_id == "entity_002"


def test_self_loop_se_descarta():
    """A self-loop (from_id == to_id) is discarded."""
    # Both endpoints name the same entity, so they map to the same ID.
    loop = _make_rel("John Smith", "John Smith", relation_type="knows")
    kept = deduplicate_relations([loop], _ENTITY_MAP)
    assert len(kept) == 0


def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta():
    """A relation whose name is unmapped and has no fuzzy match is discarded."""
    # "Unknown Entity XYZ" is not a key of the entity map.
    orphan = _make_rel("Unknown Entity XYZ", "Acme Corp")
    kept = deduplicate_relations([orphan], _ENTITY_MAP)
    assert len(kept) == 0


def test_relaciones_distintas_se_mantienen():
    """Relations with distinct (from, to, type) triples are all kept."""
    # Each triple differs from the others in at least one component.
    triples = (
        ("John Smith", "Acme Corp", "works_at"),
        ("Jane Doe", "Acme Corp", "works_at"),
        ("John Smith", "Google", "invested_in"),
    )
    distinct = [
        _make_rel(source, target, relation_type=kind)
        for source, target, kind in triples
    ]
    kept = deduplicate_relations(distinct, _ENTITY_MAP)
    assert len(kept) == 3


def test_merge_descripcion_concatena_unicas():
    """Merging relations concatenates their unique descriptions."""
    # Three candidates for the same relation; the first and last share a description.
    described = (
        ("John es CEO", 0.9),
        ("Acme fue fundada por John", 0.7),
        ("John es CEO", 0.6),
    )
    candidates = [
        _make_rel("John Smith", "Acme Corp", description=text, confidence=score)
        for text, score in described
    ]
    merged = deduplicate_relations(candidates, _ENTITY_MAP)
    assert len(merged) == 1
    survivor = merged[0]
    assert "John es CEO" in survivor.description
    assert "Acme fue fundada por John" in survivor.description
    # The repeated description ("John es CEO") must appear only once.
    assert survivor.description.count("John es CEO") == 1
    assert survivor.confidence == 0.9


def test_lista_vacia_retorna_lista_vacia():
    """An empty list of relations yields an empty list."""
    no_relations = []
    assert deduplicate_relations(no_relations, _ENTITY_MAP) == []


def test_fuzzy_match_resuelve_nombre_cercano():
    """A name with a small typo is resolved through fuzzy matching."""
    # "john smit" is at distance 1 from the mapped key "john smith".
    typo = _make_rel("John Smit", "Acme Corp")
    matched = deduplicate_relations([typo], _ENTITY_MAP)
    assert len(matched) == 1
    first = matched[0]
    assert first.from_id == "entity_001"