fn_registry/python/functions/datascience/parse_rebel_output.py

"""Parser puro del wire format de REBEL / mREBEL."""

from __future__ import annotations

import re


# Matches one angle-bracket marker such as ``<triplet>`` or ``<per>``.
_MARKER_RE = re.compile(r"<[^<>\s]+>")

# Wrapper / language tokens that carry no triplet information.
_NOISE_TOKENS = ("<s>", "<pad>", "</s>", "tp_XX", "__en__")


def parse_rebel_output(decoded_text: str) -> list[dict]:
    """Parse REBEL / mREBEL decoded output into typed triplets.

    The input is the string produced by the HuggingFace tokenizer with
    ``skip_special_tokens=False``, e.g.::

        tp_XX<triplet> Pablo Isla <per> Inditex <org> employer<triplet> ...

    The stream is read as ``<triplet> head <head_type> tail <tail_type>
    relation``. A further ``<type>`` marker right after a relation starts
    another tail for the same head. Markers are recognized even when they
    are glued to a neighbouring word (``employer<triplet>``).

    Args:
        decoded_text: Raw decoded string from the seq2seq model, including
            special tokens like ``<triplet>``, ``<relation>``, ``<per>``,
            ``<org>``, ``<loc>``, etc.

    Returns:
        List of dicts with keys:
            ``head`` (str), ``head_type`` (str),
            ``type`` (str), ``tail`` (str), ``tail_type`` (str).
        Only triplets whose five fields are all non-empty are returned.
        Returns an empty list on empty input or if no complete triplet is
        found.
    """
    if not decoded_text or not decoded_text.strip():
        return []

    text = decoded_text
    for noise in _NOISE_TOKENS:
        text = text.replace(noise, "")
    # Pad every marker with spaces so that ``split`` isolates it even when
    # the tokenizer emitted it without surrounding whitespace.
    text = _MARKER_RE.sub(r" \g<0> ", text)

    triplets: list[dict] = []
    head_words: list[str] = []
    tail_words: list[str] = []
    relation_words: list[str] = []
    head_type = ""
    tail_type = ""
    state = "x"  # x=init, t=head span, s=tail span, o=relation span

    def emit() -> None:
        """Append the triplet under construction if every field is set."""
        if (
            head_words
            and relation_words
            and tail_words
            and head_type
            and tail_type
        ):
            triplets.append(
                {
                    "head": " ".join(head_words),
                    "head_type": head_type,
                    "type": " ".join(relation_words),
                    "tail": " ".join(tail_words),
                    "tail_type": tail_type,
                }
            )

    for token in text.split():
        if token in ("<triplet>", "<relation>"):
            # A new head begins; close whatever triplet was in progress.
            state = "t"
            if relation_words:
                emit()
                relation_words = []
            head_words = []
        elif token.startswith("<") and token.endswith(">"):
            if state in ("t", "o"):
                # Type marker after a head (or after a relation, when the
                # head is shared): the tail span starts now.
                state = "s"
                if relation_words:
                    # The relation is deliberately kept here; it is reset
                    # when the next tail-type marker is read.
                    emit()
                tail_words = []
                head_type = token[1:-1]
            else:
                # Type marker after a tail: the relation span starts now.
                state = "o"
                tail_type = token[1:-1]
                relation_words = []
        elif state == "t":
            head_words.append(token)
        elif state == "s":
            tail_words.append(token)
        elif state == "o":
            relation_words.append(token)

    # Flush the trailing triplet, which has no closing marker.
    emit()
    return triplets