"""Pure parser for the REBEL / mREBEL wire format."""

from __future__ import annotations

# Substrings removed before tokenising: sequence start / padding / end
# tokens plus the language markers handled by this parser.
_NOISE_TOKENS = ("<s>", "<pad>", "</s>", "tp_XX", "__en__")

# Markers that open a new triplet; the tokens that follow form the head span.
_TRIPLET_MARKERS = ("<triplet>", "<relation>")


def _build_triplet(
    head_parts: list[str],
    head_type: str,
    relation_parts: list[str],
    tail_parts: list[str],
    tail_type: str,
) -> dict:
    """Snapshot the current parser state as one output triplet dict."""
    return {
        "head": " ".join(head_parts),
        "head_type": head_type,
        "type": " ".join(relation_parts),
        "tail": " ".join(tail_parts),
        "tail_type": tail_type,
    }


def parse_rebel_output(decoded_text: str) -> list[dict]:
    """Parse REBEL / mREBEL decoded output into typed triplets.

    The input is the string produced by the HuggingFace tokenizer with
    ``skip_special_tokens=False``, e.g.::

        tp_XX<triplet> Pablo Isla <per> Inditex <org> employer ...

    Each triplet is laid out as
    ``<triplet> head <head_type> tail <tail_type> relation``. Any
    whitespace-delimited token of the form ``<...>`` other than the triplet
    markers is treated as an entity-type tag, and the text between the angle
    brackets becomes the type value (``<per>`` -> ``"per"``).

    Args:
        decoded_text: Raw decoded string from the seq2seq model, including
            special tokens like ``<s>``, ``</s>``, ``<pad>``, ``<triplet>``
            and entity-type tags.

    Returns:
        List of dicts with keys: ``head`` (str), ``head_type`` (str),
        ``type`` (str), ``tail`` (str), ``tail_type`` (str).
        Returns an empty list on empty input or if no triplet is found.
        The trailing triplet is only emitted when all five fields are
        non-empty; triplets closed mid-stream are emitted as soon as a
        relation has been read.
    """
    if not decoded_text or not decoded_text.strip():
        return []

    text = decoded_text
    for noise in _NOISE_TOKENS:
        text = text.replace(noise, "")

    triplets: list[dict] = []

    # Parser state: "init" (nothing seen yet), "head", "tail" or "relation"
    # names the span that plain tokens are currently appended to.
    state = "init"
    head: list[str] = []
    tail: list[str] = []
    relation: list[str] = []
    head_type = ""
    tail_type = ""

    for token in text.split():
        if token in _TRIPLET_MARKERS:
            # A new triplet starts: emit the pending one, then read a head.
            state = "head"
            if relation:
                triplets.append(
                    _build_triplet(head, head_type, relation, tail, tail_type)
                )
                relation = []
            head = []
        elif token.startswith("<") and token.endswith(">"):
            if state in ("head", "relation"):
                # Type tag seen while reading the head or the relation:
                # it types the head, and the tail span starts next.
                state = "tail"
                if relation:
                    # The relation is intentionally left untouched here; it
                    # is only reset by the next tail-type tag or marker.
                    triplets.append(
                        _build_triplet(head, head_type, relation, tail, tail_type)
                    )
                tail = []
                head_type = token[1:-1]
            else:
                # Type tag closing the tail span: the relation starts next.
                state = "relation"
                tail_type = token[1:-1]
                relation = []
        elif state == "head":
            head.append(token)
        elif state == "tail":
            tail.append(token)
        elif state == "relation":
            relation.append(token)
        # Plain tokens seen before any marker or tag ("init") are ignored.

    # Flush the last triplet only if all fields are present.
    if head and relation and tail and tail_type and head_type:
        triplets.append(_build_triplet(head, head_type, relation, tail, tail_type))

    return triplets