fn_registry/python/functions/datascience/deduplicate_entities.py

"""Deduplica entidades candidatas usando fuzzy matching de nombres."""

from __future__ import annotations

import sys
import os
import uuid

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from python.types.datascience.entity_candidate import EntityCandidate
from python.types.datascience.deduplication_result import DeduplicationResult
from python.functions.core.normalize_entity_name import normalize_entity_name
from python.functions.core.merge_entity_attributes import merge_entity_attributes


# ── Similitud helpers ──────────────────────────────────────────────────────────

def _levenshtein(a: str, b: str) -> int:
    """Distancia de edicion Levenshtein entre dos strings."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
    """Similitud de Jaccard entre dos conjuntos de tokens."""
    set_a = set(tokens_a)
    set_b = set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    inter = len(set_a & set_b)
    union = len(set_a | set_b)
    return inter / union if union else 0.0


def _name_similarity(a: str, b: str) -> float:
    """Return a similarity score in [0, 1] for two normalized names.

    The base score is the larger of the Levenshtein similarity
    (``1 - distance / longest_length``) and the Jaccard similarity of the
    whitespace-separated tokens. Two independent bonuses of +0.3 (capped at
    1.0) are then applied:

    * containment: one non-empty name is a substring of the other;
    * acronym: one name equals the initials of the other's tokens.

    Args:
        a: first normalized name.
        b: second normalized name.

    Returns:
        1.0 for identical names, otherwise the combined score.
    """
    if a == b:
        return 1.0

    # a != b, so at least one name is non-empty and the divisor is >= 1.
    longest = max(len(a), len(b))
    lev_sim = 1.0 - _levenshtein(a, b) / longest

    tokens_a = a.split()
    tokens_b = b.split()
    jac_sim = _jaccard(tokens_a, tokens_b)

    score = max(lev_sim, jac_sim)

    # Containment bonus. The emptiness guard matters: `"" in s` is always
    # True, so an empty name would otherwise get the bonus against every
    # other name.
    if a and b and (a in b or b in a):
        score = min(1.0, score + 0.3)

    # Acronym bonus: either name matches the initials of the other's tokens.
    if _is_acronym_of(a, tokens_b) or _is_acronym_of(b, tokens_a):
        score = min(1.0, score + 0.3)

    return score


def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
    """Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
    if not candidate or not tokens:
        return False
    initials = "".join(t[0] for t in tokens if t).upper()
    return candidate.upper() == initials


_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}


def _is_exact_type(entity_type: str) -> bool:
    """Tipos tecnicos donde solo se acepta matching exacto."""
    return entity_type.lower() in _EXACT_TYPES


# ── Union-Find ─────────────────────────────────────────────────────────────────

class _UnionFind:
    def __init__(self, n: int) -> None:
        self._parent = list(range(n))
        self._rank = [0] * n

    def find(self, x: int) -> int:
        while self._parent[x] != x:
            self._parent[x] = self._parent[self._parent[x]]
            x = self._parent[x]
        return x

    def union(self, x: int, y: int) -> None:
        rx, ry = self.find(x), self.find(y)
        if rx == ry:
            return
        if self._rank[rx] < self._rank[ry]:
            rx, ry = ry, rx
        self._parent[ry] = rx
        if self._rank[rx] == self._rank[ry]:
            self._rank[rx] += 1


# ── Implementacion principal ────────────────────────────────────────────────────

def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are normalized, compared pairwise with ``_name_similarity`` and
    linked through a Union-Find structure, so clusters are transitive: if A
    matches B and B matches C, all three end up in the same cluster. Each
    cluster yields one canonical entity whose name, normalized name, type and
    confidence come from the member with the highest confidence, and whose
    attributes are merged from every member.

    For technical types (ip, email, domain, crypto_wallet, phone) only an
    exact match of the normalized names is accepted and ``name_threshold`` is
    ignored. When ``same_type_only`` is False this rule applies as soon as
    either candidate of a pair has a technical type.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: minimum similarity score (0-1) for two names to be
            considered the same entity.
        same_type_only: if True, only candidates with an identical
            ``type_ref`` are compared.

    Returns:
        DeduplicationResult with the deduplicated entities (ordered by the
        first member of each cluster), ``entity_id_map`` (canonical
        normalized name -> generated UUID string), ``name_to_id`` (every
        merged original name and the canonical normalized name -> that same
        id), the merge log and the before/after counts.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: normalize every name once; reused for matching and for the
    # canonical normalized name of each cluster.
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]

    # Step 2: pairwise comparison, linking matches in a Union-Find.
    uf = _UnionFind(n)
    merge_pairs: list[tuple[int, float]] = []  # (member index, score)

    for i in range(n):
        type_i = candidates[i].type_ref
        name_i = normalized[i]
        for j in range(i + 1, n):
            type_j = candidates[j].type_ref
            if same_type_only and type_i != type_j:
                continue

            name_j = normalized[j]

            # Technical identifiers must match exactly. Both sides are
            # checked so that a cross-type pair (same_type_only=False) never
            # fuzzy-merges a technical entity.
            if _is_exact_type(type_i) or _is_exact_type(type_j):
                if name_i == name_j:
                    uf.union(i, j)
                    merge_pairs.append((i, 1.0))
                continue

            score = _name_similarity(name_i, name_j)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, score))

    # Step 3: group indices by Union-Find root. Insertion order follows the
    # first member of each cluster, which fixes the output order.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Scores of the pairs that produced each cluster, for the merge log.
    scores_by_root: dict[int, list[float]] = {}
    for i, score in merge_pairs:
        scores_by_root.setdefault(uf.find(i), []).append(score)

    # Step 4: build one canonical entity per cluster.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    for root, indices in clusters.items():
        members = [candidates[idx] for idx in indices]

        if len(members) == 1:
            best = members[0]
            canonical_norm = normalized[indices[0]]
            merged_attrs = best.attributes
            merged_chunks = list(best.source_chunk_indices)
            merged_from = list(best.merged_from) if best.merged_from else [best.name]
        else:
            # The highest-confidence member is canonical (first one on ties).
            best_idx = max(indices, key=lambda idx: candidates[idx].confidence)
            best = candidates[best_idx]
            canonical_norm = normalized[best_idx]

            merged_attrs = merge_entity_attributes([m.attributes for m in members])

            # dict.fromkeys removes duplicates while keeping first-seen order.
            merged_chunks = list(
                dict.fromkeys(
                    chunk for m in members for chunk in m.source_chunk_indices
                )
            )
            merged_from = list(
                dict.fromkeys(
                    name
                    for m in members
                    for name in (m.merged_from if m.merged_from else [m.name])
                )
            )

            merge_log.append(
                {
                    "canonical": best.name,
                    "merged": [m.name for m in members if m is not best],
                    "score": round(max(scores_by_root.get(root, []), default=1.0), 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        merged_entities.append(
            EntityCandidate(
                name=best.name,
                name_normalized=canonical_norm,
                # Type comes from the canonical member so that it stays
                # consistent with the name it was normalized with.
                type_ref=best.type_ref,
                type_label=best.type_label,
                attributes=merged_attrs,
                confidence=best.confidence,
                source_chunk_indices=merged_chunks,
                merged_from=merged_from,
            )
        )

        # Resolution maps: canonical normalized name and every original name
        # point at the generated id.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )