feat: funciones Python datascience, finance, cybersecurity y pipelines
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift, diff_entities/relations, extract_entities/relations_llm, hotness_score, melt, merge_graphs, pivot, build_entity/relation_schema_prompt. Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order, hawkes_intensity + módulo finance.py. Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py. Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
"""Tests para deduplicate_entities."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from python.types.datascience.entity_candidate import EntityCandidate
|
||||
from python.functions.datascience.deduplicate_entities import deduplicate_entities
|
||||
|
||||
|
||||
def _make(name: str, type_ref: str = "person", confidence: float = 0.9, **attrs) -> EntityCandidate:
    """Build an EntityCandidate fixture for the deduplication tests.

    Args:
        name: Entity name as it would appear in the extracted text.
        type_ref: Entity type reference; its capitalized form is used as the label.
        confidence: Confidence value stored on the candidate.
        **attrs: Extra keyword arguments, stored as the candidate's attributes.

    Returns:
        A candidate whose source chunk indices are always ``[0]``.
    """
    label = type_ref.capitalize()
    chunk_indices = [0]
    return EntityCandidate(
        name=name,
        type_ref=type_ref,
        type_label=label,
        attributes=attrs,
        confidence=confidence,
        source_chunk_indices=chunk_indices,
    )
|
||||
|
||||
|
||||
def test_john_smith_y_smith_john_merge():
    """'John Smith' and 'Smith, John' are merged into one entity."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]
    outcome = deduplicate_entities(candidates)

    assert outcome.total_before == 2
    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
    assert len(outcome.merge_log) == 1
|
||||
|
||||
|
||||
def test_google_y_google_llc_merge():
    """'Google' and 'Google LLC' are merged into one entity."""
    candidates = [
        _make("Google", type_ref="organization"),
        _make("Google LLC", type_ref="organization"),
    ]
    outcome = deduplicate_entities(candidates)

    assert outcome.total_after == 1
    assert len(outcome.entities) == 1
|
||||
|
||||
|
||||
def test_ip_matching_exacto():
    """Two candidates named 192.168.1.1 are merged (exact name match)."""
    lower_confidence = _make("192.168.1.1", type_ref="ip", confidence=0.8)
    higher_confidence = _make("192.168.1.1", type_ref="ip", confidence=0.9)

    outcome = deduplicate_entities([lower_confidence, higher_confidence])

    assert outcome.total_after == 1
|
||||
|
||||
|
||||
def test_same_name_different_type_no_merge():
    """'John Smith' as person and as organization are NOT merged."""
    as_person = _make("John Smith", type_ref="person")
    as_organization = _make("John Smith", type_ref="organization")

    outcome = deduplicate_entities([as_person, as_organization], same_type_only=True)

    assert outcome.total_after == 2
|
||||
|
||||
|
||||
def test_clusters_transitivos():
    """Transitive clusters: A~B and B~C put {A, B, C} in a single cluster."""
    full_name = _make("Alice Johnson", type_ref="person")
    one_char_short = _make("Alice Johnso", type_ref="person")  # very similar to the full name
    two_chars_short = _make("Alice Johns", type_ref="person")  # very similar to the previous one

    outcome = deduplicate_entities(
        [full_name, one_char_short, two_chars_short],
        name_threshold=0.80,
    )

    assert outcome.total_after == 1
|
||||
|
||||
|
||||
def test_sin_duplicados_sin_cambios():
    """Entities without duplicates pass through unmodified."""
    distinct_people = [
        _make("Alice Smith", type_ref="person"),
        _make("Bob Jones", type_ref="person"),
        _make("Charlie Brown", type_ref="person"),
    ]
    outcome = deduplicate_entities(distinct_people)

    assert outcome.total_before == 3
    assert outcome.total_after == 3
    assert len(outcome.merge_log) == 0
|
||||
|
||||
|
||||
def test_confidence_y_atributos_merge_correctos():
    """Merged confidence is the cluster maximum; attributes are combined."""
    with_role = _make("John Smith", type_ref="person", confidence=0.7, role="CEO")
    with_company = _make("Smith, John", type_ref="person", confidence=0.95, company="Acme")

    outcome = deduplicate_entities([with_role, with_company])
    assert outcome.total_after == 1

    merged = outcome.entities[0]
    # Expected confidence is max(0.7, 0.95).
    assert merged.confidence == 0.95
    # Attributes contributed by both candidates must be present.
    assert "role" in merged.attributes
    assert "company" in merged.attributes
|
||||
|
||||
|
||||
def test_lista_vacia():
    """An empty input list yields an empty result."""
    outcome = deduplicate_entities([])

    assert outcome.total_before == 0
    assert outcome.total_after == 0
    assert outcome.entities == []
    assert outcome.merge_log == []
|
||||
|
||||
|
||||
def test_name_to_id_resolucion():
    """name_to_id exposes the original names of a merged cluster."""
    candidates = [
        _make("John Smith", type_ref="person"),
        _make("Smith, John", type_ref="person"),
    ]
    outcome = deduplicate_entities(candidates)

    # One merged entity means entity_id_map holds exactly one ID.
    merged_ids = list(outcome.entity_id_map.values())
    assert len(merged_ids) == 1
    merged_id = merged_ids[0]

    # name_to_id must reference that ID and carry at least two name entries.
    assert any(mapped_id == merged_id for mapped_id in outcome.name_to_id.values())
    assert len(outcome.name_to_id) >= 2
|
||||
Reference in New Issue
Block a user