Files
fn_registry/python/functions/datascience/deduplicate_relations_test.py
egutierrez 63a9cb5273 feat: funciones Python datascience, finance, cybersecurity y pipelines
Datascience: aggregate_by_group, deduplicate_entities/relations, detect_drift,
diff_entities/relations, extract_entities/relations_llm, hotness_score, melt,
merge_graphs, pivot, build_entity/relation_schema_prompt.
Finance: avellaneda_stoikov_quotes, generate_gbm_prices, generate_taker_order,
hawkes_intensity + módulo finance.py.
Cybersecurity: envelope_encrypt/decrypt + módulo cybersecurity.py.
Pipelines: extraction_pipeline, monte_carlo_market, run_market_sim.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 17:11:32 +02:00

121 lines
4.3 KiB
Python

"""Tests para deduplicate_relations."""
import os
import sys
# Make RelationCandidate importable: its module lives in python/types/datascience/,
# which is reached by walking three directories up from this test file's folder.
_here = os.path.dirname(os.path.abspath(__file__))
_types_path = os.path.join(
    _here,
    *([".."] * 3),
    "python",
    "types",
    "datascience",
)
# Register the directory at the front of sys.path only once, so repeated imports
# of this module do not stack duplicate entries.
if _types_path not in sys.path:
    sys.path.insert(0, _types_path)
from relation_candidate import RelationCandidate
from deduplicate_relations import deduplicate_relations
def _make_rel(
    from_name: str,
    to_name: str,
    relation_type: str = "works_at",
    description: str = "",
    confidence: float = 0.8,
    source_chunk_index: int = 0,
) -> RelationCandidate:
    """Build a RelationCandidate for the tests, filling optional fields with defaults.

    Every argument is forwarded unchanged to the RelationCandidate field of the
    same name.

    Args:
        from_name: Value for the ``from_name`` field.
        to_name: Value for the ``to_name`` field.
        relation_type: Value for the ``relation_type`` field (default ``"works_at"``).
        description: Value for the ``description`` field (default empty string).
        confidence: Value for the ``confidence`` field (default ``0.8``).
        source_chunk_index: Value for the ``source_chunk_index`` field (default ``0``).

    Returns:
        The newly constructed RelationCandidate.
    """
    fields = {
        "from_name": from_name,
        "to_name": to_name,
        "relation_type": relation_type,
        "description": description,
        "confidence": confidence,
        "source_chunk_index": source_chunk_index,
    }
    return RelationCandidate(**fields)
# Typical entity_id_map fixture shared by the tests below: each key is a
# normalized, lowercase entity name and each value is that entity's ID.
_ENTITY_MAP: dict[str, str] = dict(
    (
        ("john smith", "entity_001"),
        ("acme corp", "entity_002"),
        ("jane doe", "entity_003"),
        ("google", "entity_004"),
    )
)
def test_dos_relaciones_identicas_se_colapsan_en_una():
    """Two relations with the same (from, to, type) collapse into a single one."""
    # Same endpoints, type and description; only the confidence differs.
    duplicates = [
        _make_rel("John Smith", "Acme Corp", description="John es CEO", confidence=score)
        for score in (0.9, 0.7)
    ]

    result = deduplicate_relations(duplicates, _ENTITY_MAP)

    assert len(result) == 1
    merged = result[0]
    assert merged.from_id == "entity_001"
    assert merged.to_id == "entity_002"
    # Expected confidence is the maximum of the two inputs.
    assert merged.confidence == 0.9
def test_relacion_con_nombre_mergeado_se_resuelve_al_id_correcto():
    """A relation that uses a merged (alias) name resolves to the correct ID."""
    # Extend a copy of the shared map so "smith, john" is an alias of entity_001.
    alias_map = dict(_ENTITY_MAP)
    alias_map["smith, john"] = "entity_001"
    aliased = _make_rel("Smith, John", "Acme Corp")

    result = deduplicate_relations([aliased], alias_map)

    assert len(result) == 1
    resolved = result[0]
    assert resolved.from_id == "entity_001"
    assert resolved.to_id == "entity_002"
def test_self_loop_se_descarta():
    """A self-loop (from_id == to_id) is discarded."""
    # Both endpoints name the same entity.
    self_loop = _make_rel("John Smith", "John Smith", relation_type="knows")

    result = deduplicate_relations([self_loop], _ENTITY_MAP)

    assert len(result) == 0
def test_nombre_no_mapeado_sin_fuzzy_match_se_descarta():
    """A relation whose name is unmapped and has no fuzzy match is discarded."""
    # "Unknown Entity XYZ" is not a key of _ENTITY_MAP.
    unmapped = _make_rel("Unknown Entity XYZ", "Acme Corp")

    result = deduplicate_relations([unmapped], _ENTITY_MAP)

    assert len(result) == 0
def test_relaciones_distintas_se_mantienen():
    """Relations with distinct (from, to, type) are all kept."""
    # Each triple differs from the others in at least one component.
    triples = (
        ("John Smith", "Acme Corp", "works_at"),
        ("Jane Doe", "Acme Corp", "works_at"),
        ("John Smith", "Google", "invested_in"),
    )
    rels = [_make_rel(src, dst, relation_type=kind) for src, dst, kind in triples]

    result = deduplicate_relations(rels, _ENTITY_MAP)

    assert len(result) == 3
def test_merge_descripcion_concatena_unicas():
    """Merging relations concatenates their unique descriptions."""
    # Three candidates for the same relation; the first and last share a description.
    variants = (
        ("John es CEO", 0.9),
        ("Acme fue fundada por John", 0.7),
        ("John es CEO", 0.6),
    )
    rels = [
        _make_rel("John Smith", "Acme Corp", description=text, confidence=score)
        for text, score in variants
    ]

    result = deduplicate_relations(rels, _ENTITY_MAP)

    assert len(result) == 1
    merged = result[0]
    assert "John es CEO" in merged.description
    assert "Acme fue fundada por John" in merged.description
    # The repeated description ("John es CEO") must not appear twice.
    assert merged.description.count("John es CEO") == 1
    assert merged.confidence == 0.9
def test_lista_vacia_retorna_lista_vacia():
    """An empty list of relations yields an empty list."""
    no_relations = []

    assert deduplicate_relations(no_relations, _ENTITY_MAP) == []
def test_fuzzy_match_resuelve_nombre_cercano():
    """A name with a small typo is resolved through fuzzy matching."""
    # "John Smit" is "John Smith" without the final "h" (distance 1).
    typo_rel = _make_rel("John Smit", "Acme Corp")

    result = deduplicate_relations([typo_rel], _ENTITY_MAP)

    assert len(result) == 1
    assert result[0].from_id == "entity_001"