"""Alinea triplets REBEL / mREBEL a nombres canonicos de entidades.""" from __future__ import annotations def align_relations_to_entities( triplets: list[dict], entity_names: list[str], ) -> list[dict]: """Align REBEL triplets to a set of canonical entity names. For each triplet produced by ``parse_rebel_output``, tries to resolve the ``head`` and ``tail`` spans to a canonical entity name from ``entity_names`` using the following strategy (in order): 1. **Exact case-insensitive match** — ``"Inditex" == "inditex"``. 2. **Substring match** — either the span contains an entity name, or an entity name contains the span. When multiple entity names match, the *longest* one wins (most specific). Triplets are dropped when: - Neither ``head`` nor ``tail`` can be resolved to any entity name. - The resolved ``from`` and ``to`` are the same name (self-loop). Args: triplets: List of dicts produced by ``parse_rebel_output``, each with keys ``head``, ``head_type``, ``type``, ``tail``, ``tail_type``. entity_names: Canonical entity names to match against. Typically ``[e.name for e in entities]``. Order does not matter; matching is case-insensitive. Returns: List of dicts with keys: ``from`` (str), ``kind`` (str), ``to`` (str), ``head_type`` (str), ``tail_type`` (str). ``from`` and ``to`` are values taken verbatim from ``entity_names``. Empty list if no triplet survives alignment. """ if not triplets or not entity_names: return [] # Pre-build lookup: lowercased -> original for O(1) exact lookup. lower_to_name: dict[str, str] = {n.lower(): n for n in entity_names} # Sort by length DESC for substring match (longest entity wins). names_by_len: list[str] = sorted(entity_names, key=len, reverse=True) def _resolve(span: str) -> str | None: """Return a canonical entity name for `span`, or None if no match.""" if not span: return None span_lower = span.lower() # 1. Exact case-insensitive. if span_lower in lower_to_name: return lower_to_name[span_lower] # 2. Substring: longest entity that is contained in span, or whose # name contains span (both directions), longest-wins. for name in names_by_len: name_lower = name.lower() if name_lower in span_lower or span_lower in name_lower: return name return None aligned: list[dict] = [] for triplet in triplets: head_span = triplet.get("head", "") tail_span = triplet.get("tail", "") relation = triplet.get("type", "") from_name = _resolve(head_span) to_name = _resolve(tail_span) if from_name is None or to_name is None: continue if from_name == to_name: continue aligned.append( { "from": from_name, "kind": relation, "to": to_name, "head_type": triplet.get("head_type", ""), "tail_type": triplet.get("tail_type", ""), } ) return aligned