Files
fn_registry/python/functions/datascience/deduplicate_entities_test.py
T
egutierrez 63a9cb5273 feat: funciones Python datascience, finance, cybersecurity y pipelines
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift,
diff_entities/relations, extract_entities/relations_llm, hotness_score, melt,
merge_graphs, pivot, build_entity/relation_schema_prompt.
Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order,
hawkes_intensity + módulo finance.py.
Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py.
Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 17:11:32 +02:00

114 lines
4.0 KiB
Python

"""Tests para deduplicate_entities."""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from python.types.datascience.entity_candidate import EntityCandidate
from python.functions.datascience.deduplicate_entities import deduplicate_entities
def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
    """Build an EntityCandidate fixture for the deduplication tests.

    Args:
        name: Name given to the candidate.
        type_ref: Type reference; its capitalized form is used as the label.
        confidence: Confidence value stored on the candidate.
        **attrs: Extra keyword arguments, stored as the candidate's attributes.

    Returns:
        A candidate whose ``source_chunk_indices`` is ``[0]``.
    """
    label = type_ref.capitalize()
    chunk_indices = [0]
    return EntityCandidate(
        name=name,
        type_ref=type_ref,
        type_label=label,
        attributes=attrs,
        confidence=confidence,
        source_chunk_indices=chunk_indices,
    )
def test_john_smith_y_smith_john_merge():
    """'John Smith' and 'Smith, John' collapse into a single entity."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]

    outcome = deduplicate_entities(candidates)

    # Two candidates go in, one entity and one merge record come out.
    assert outcome.total_before == 2
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
    assert len(outcome.merge_log) == 1
def test_google_y_google_llc_merge():
    """'Google' and 'Google LLC' collapse into a single organization."""
    candidates = [
        _make(org_name, type_ref="organization")
        for org_name in ("Google", "Google LLC")
    ]

    outcome = deduplicate_entities(candidates)

    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
def test_ip_matching_exacto():
    """Two candidates with the identical IP 192.168.1.1 are merged."""
    # Same name and type; only the confidence differs between the two.
    candidates = [
        _make("192.168.1.1", type_ref="ip", confidence=score)
        for score in (0.8, 0.9)
    ]

    outcome = deduplicate_entities(candidates)

    assert outcome.total_after == 1
def test_same_name_different_type_no_merge():
    """John Smith (person) and John Smith (organization) are NOT merged."""
    candidates = [
        _make("John Smith", type_ref=kind)
        for kind in ("person", "organization")
    ]

    outcome = deduplicate_entities(candidates, same_type_only=True)

    # With same_type_only=True both candidates must survive.
    assert outcome.total_after == 2
def test_clusters_transitivos():
    """Transitive clusters: A~B and B~C yield the single cluster {A, B, C}."""
    # Each name drops one trailing character from the previous one, so
    # neighbours in the chain are very similar to each other.
    chained_names = ("Alice Johnson", "Alice Johnso", "Alice Johns")
    candidates = [_make(person, type_ref="person") for person in chained_names]

    outcome = deduplicate_entities(candidates, name_threshold=0.80)

    assert outcome.total_after == 1
def test_sin_duplicados_sin_cambios():
    """Entities without duplicates pass through unmodified."""
    distinct_names = ("Alice Smith", "Bob Jones", "Charlie Brown")
    candidates = [_make(person, type_ref="person") for person in distinct_names]

    outcome = deduplicate_entities(candidates)

    # Nothing is merged: counts are unchanged and the merge log stays empty.
    assert outcome.total_before == 3
    assert outcome.total_after == 3
    assert len(outcome.merge_log) == 0
def test_confidence_y_atributos_merge_correctos():
    """Merged confidence is the cluster maximum; attributes are combined."""
    low = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
    high = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")

    outcome = deduplicate_entities([low, high])

    assert outcome.total_after == 1
    merged = outcome.entities[0]
    # Expected confidence is max(0.7, 0.95).
    assert merged.confidence == 0.95
    # Attribute keys from both candidates must be present on the merged entity.
    for key in ("role", "company"):
        assert key in merged.attributes
def test_lista_vacia():
    """An empty input list produces an empty result."""
    outcome = deduplicate_entities([])

    # Both counters are zero and both collections are empty lists.
    assert outcome.total_before == 0
    assert outcome.total_after == 0
    assert outcome.entities == []
    assert outcome.merge_log == []
def test_name_to_id_resolucion():
    """Every name recorded for a merged cluster resolves to the same entity ID.

    The two spellings are expected to merge into one entity, so
    ``entity_id_map`` must expose exactly one ID and every entry in
    ``name_to_id`` must point at it.
    """
    a = _make("John Smith", type_ref="person")
    b = _make("Smith, John", type_ref="person")

    result = deduplicate_entities([a, b])

    # A single merged entity means a single ID.
    ids = list(result.entity_id_map.values())
    assert len(ids) == 1
    ent_id = ids[0]

    # name_to_id must carry entries for (at least) the two original names.
    # Checked first so the all() below cannot pass vacuously on an empty map.
    assert len(result.name_to_id) >= 2
    # Both names must point to the same ID. The previous any() check would
    # still pass if one of the names resolved to a different ID.
    assert all(v == ent_id for v in result.name_to_id.values())