63a9cb5273
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift, diff_entities/relations, extract_entities/relations_llm, hotness_score, melt, merge_graphs, pivot, build_entity/relation_schema_prompt. Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order, hawkes_intensity + módulo finance.py. Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py. Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
114 lines
4.0 KiB
Python
114 lines
4.0 KiB
Python
"""Tests para deduplicate_entities."""
|
|
|
|
import sys
|
|
import os
|
|
|
|
# Put the directory three levels above this file's folder at the front of
# sys.path so the absolute `python.*` imports below can be resolved
# (presumably this is the repository root -- confirm against the layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
|
|
|
from python.types.datascience.entity_candidate import EntityCandidate
|
|
from python.functions.datascience.deduplicate_entities import deduplicate_entities
|
|
|
|
|
|
def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
    """Build an EntityCandidate fixture for the tests in this module.

    Args:
        name: Value passed as the candidate's ``name``.
        type_ref: Value passed as ``type_ref``; its capitalized form is
            passed as ``type_label``.
        confidence: Value passed as ``confidence``.
        **attrs: Extra keyword arguments, passed as the ``attributes`` dict.

    Returns:
        A new EntityCandidate whose ``source_chunk_indices`` is ``[0]``.
    """
    candidate_fields = {
        "name": name,
        "type_ref": type_ref,
        "type_label": type_ref.capitalize(),
        "attributes": attrs,
        "confidence": confidence,
        "source_chunk_indices": [0],
    }
    return EntityCandidate(**candidate_fields)
|
|
|
|
|
|
def test_john_smith_y_smith_john_merge():
    """The names John Smith and Smith, John (same type) merge into one entity."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]

    outcome = deduplicate_entities(candidates)

    # Two candidates go in, one entity comes out, with one merge logged.
    assert outcome.total_before == 2
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
    assert len(outcome.merge_log) == 1
|
|
|
|
|
|
def test_google_y_google_llc_merge():
    """The organization names Google and Google LLC merge into one entity."""
    candidates = [
        _make("Google", type_ref="organization"),
        _make("Google LLC", type_ref="organization"),
    ]

    outcome = deduplicate_entities(candidates)

    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
|
|
|
|
|
|
def test_ip_matching_exacto():
    """Two ip candidates with the identical name 192.168.1.1 collapse to one entity."""
    lower_confidence = _make("192.168.1.1", type_ref="ip", confidence=0.8)
    higher_confidence = _make("192.168.1.1", type_ref="ip", confidence=0.9)

    outcome = deduplicate_entities([lower_confidence, higher_confidence])

    assert outcome.total_after == 1
|
|
|
|
|
|
def test_same_name_different_type_no_merge():
    """With same_type_only=True, a person and an organization both named John Smith stay separate."""
    candidates = [
        _make("John Smith", type_ref=kind)
        for kind in ("person", "organization")
    ]

    outcome = deduplicate_entities(candidates, same_type_only=True)

    assert outcome.total_after == 2
|
|
|
|
|
|
def test_clusters_transitivos():
    """Transitive clustering: A~B and B~C should yield a single cluster {A, B, C}."""
    full_name = _make("Alice Johnson", type_ref="person")
    one_char_short = _make("Alice Johnso", type_ref="person")  # very close to the full name
    two_chars_short = _make("Alice Johns", type_ref="person")  # very close to the previous one

    outcome = deduplicate_entities(
        [full_name, one_char_short, two_chars_short],
        name_threshold=0.80,
    )

    assert outcome.total_after == 1
|
|
|
|
|
|
def test_sin_duplicados_sin_cambios():
    """Three distinct person names pass through with no merges."""
    distinct_names = ("Alice Smith", "Bob Jones", "Charlie Brown")
    candidates = [_make(person, type_ref="person") for person in distinct_names]

    outcome = deduplicate_entities(candidates)

    # Counts are unchanged and the merge log stays empty.
    assert outcome.total_before == 3
    assert outcome.total_after == 3
    assert len(outcome.merge_log) == 0
|
|
|
|
|
|
def test_confidence_y_atributos_merge_correctos():
    """The merged entity keeps the highest confidence and the attributes of both candidates."""
    with_role = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
    with_company = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")

    outcome = deduplicate_entities([with_role, with_company])
    assert outcome.total_after == 1

    merged = outcome.entities[0]

    # Expected confidence is max(0.7, 0.95).
    assert merged.confidence == 0.95

    # Attribute keys contributed by each candidate must both be present.
    assert "role" in merged.attributes
    assert "company" in merged.attributes
|
|
|
|
|
|
def test_lista_vacia():
    """An empty candidate list produces an empty result."""
    outcome = deduplicate_entities([])

    # Both counters are zero and both collections are empty lists.
    assert outcome.total_before == 0
    assert outcome.total_after == 0
    assert outcome.entities == []
    assert outcome.merge_log == []
|
|
|
|
|
|
def test_name_to_id_resolucion():
    """After a merge, name_to_id has entries for the cluster's original names."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]

    outcome = deduplicate_entities(candidates)

    # The merge leaves exactly one ID in entity_id_map.
    mapped_ids = list(outcome.entity_id_map.values())
    assert len(mapped_ids) == 1
    merged_id = mapped_ids[0]

    # name_to_id must reference that ID and hold at least the two original names.
    assert any(target == merged_id for target in outcome.name_to_id.values())
    assert len(outcome.name_to_id) >= 2
|