# fn_registry/python/functions/datascience/align_relations_to_entities.py

"""Align REBEL / mREBEL triplets to canonical entity names."""

from __future__ import annotations


def align_relations_to_entities(
    triplets: list[dict],
    entity_names: list[str],
) -> list[dict]:
    """Align REBEL triplets to a set of canonical entity names.

    For each triplet, the ``head`` and ``tail`` spans are resolved to a
    canonical entity name from ``entity_names`` using the following strategy
    (in order). Spans and names are compared after stripping surrounding
    whitespace and lowercasing:

    1. **Exact case-insensitive match** -- ``"Inditex" == "inditex"``.
    2. **Substring match** -- either the span contains an entity name, or an
       entity name contains the span.  When multiple entity names match, the
       *longest* one wins (most specific); names of equal length are tried in
       the order they appear in ``entity_names``.

    Triplets are dropped when:
    - ``head`` or ``tail`` (either one) cannot be resolved to an entity name.
      Missing, ``None``, non-string, empty and whitespace-only spans never
      resolve.
    - The resolved ``from`` and ``to`` are the same name (self-loop).

    Args:
        triplets: List of dicts, each read through the keys ``head``,
            ``head_type``, ``type``, ``tail``, ``tail_type``.  Missing keys
            default to ``""``.
        entity_names: Canonical entity names to match against.  Matching is
            case-insensitive.  Blank (empty / whitespace-only) and non-string
            entries are ignored, because an empty name is a substring of
            every span and would otherwise match everything.

    Returns:
        List of dicts with keys:
            ``from`` (str), ``kind`` (str), ``to`` (str),
            ``head_type`` (str), ``tail_type`` (str).
        ``from`` and ``to`` are values taken verbatim from ``entity_names``.
        Empty list if no triplet survives alignment.
    """
    if not triplets or not entity_names:
        return []

    # Normalise every usable name once, keeping the verbatim original next to
    # its comparison key so the hot loop never re-lowercases a name.
    candidates: list[tuple[str, str]] = []
    for name in entity_names:
        if not isinstance(name, str):
            continue
        key = name.strip().lower()
        if key:  # Skip blank names: "" is contained in every span.
            candidates.append((key, name))
    if not candidates:
        return []

    # Exact lookup: normalised key -> original name (last duplicate wins).
    exact: dict[str, str] = dict(candidates)
    # Longest key first so the most specific entity wins the substring pass;
    # the sort is stable, so equal-length names keep their input order.
    by_length = sorted(candidates, key=lambda pair: len(pair[0]), reverse=True)

    def _resolve(span: object) -> str | None:
        """Return the canonical entity name for ``span``, or None if no match."""
        if not isinstance(span, str):
            return None
        needle = span.strip().lower()
        if not needle:
            # A blank span would be a substring of any name containing a
            # space, so it must never reach the substring pass.
            return None

        # 1. Exact case-insensitive match.
        hit = exact.get(needle)
        if hit is not None:
            return hit

        # 2. Substring match in either direction, longest entity first.
        for key, name in by_length:
            if key in needle or needle in key:
                return name

        return None

    aligned: list[dict] = []
    for triplet in triplets:
        from_name = _resolve(triplet.get("head", ""))
        to_name = _resolve(triplet.get("tail", ""))

        # Both endpoints must resolve, and to two different entities.
        if from_name is None or to_name is None:
            continue
        if from_name == to_name:
            continue

        aligned.append(
            {
                "from": from_name,
                "kind": triplet.get("type", ""),
                "to": to_name,
                "head_type": triplet.get("head_type", ""),
                "tail_type": triplet.get("tail_type", ""),
            }
        )

    return aligned