chore: initial sync — gliner+glirel benchmark notebooks
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,154 @@
|
||||
"""Quick test of Babelscape/mREBEL on Spanish business text.
|
||||
|
||||
Compara directamente con GLiREL sobre el mismo texto. Si mREBEL produce
|
||||
tripletas semanticamente correctas en castellano, lo proponemos como
|
||||
sustituto/complemento de GLiREL en el pipeline `extract_graph_hybrid`.
|
||||
|
||||
Licencia mREBEL: CC BY-NC-SA 4.0 (no comercial). OK para uso personal/
|
||||
investigacion; revisar si pasa a produccion comercial.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
# Same sys.path cleanup as the notebook (avoid bigquery/datasets.py shadow)
|
||||
import os
|
||||
_pf = "/home/lucas/fn_registry/python/functions"
|
||||
sys.path = [p for p in sys.path if not p.startswith(_pf + "/")]
|
||||
if _pf not in sys.path:
|
||||
sys.path.insert(0, _pf)
|
||||
|
||||
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
|
||||
TEXT_ES = (
|
||||
"Pablo Isla, expresidente de Inditex, ha sido nombrado consejero de Telefonica. "
|
||||
"La operacion fue anunciada por el presidente Jose Maria Alvarez-Pallete en Madrid el pasado lunes. "
|
||||
"Inditex factura mas de 30.000 millones anuales y tiene su sede en Arteixo, A Coruna. "
|
||||
"En paralelo, Iberdrola y Endesa firmaron un acuerdo de colaboracion en proyectos eolicos en Galicia. "
|
||||
"El presidente de Iberdrola, Ignacio Galan, se reunio con la CEO de Endesa, Marina Serrano, en Bilbao. "
|
||||
"El acuerdo movilizara 2.000 millones de euros en cinco anos. "
|
||||
"El BBVA, presidido por Carlos Torres, mostro interes en participar en la financiacion del proyecto. "
|
||||
"Su sede central esta en Bilbao."
|
||||
)
|
||||
|
||||
|
||||
def extract_triplets_typed(text: str) -> list[dict]:
|
||||
"""Parse mREBEL output (decoded with skip_special_tokens=False) into triplets.
|
||||
Format: <triplet> head <subj> head_type <rel> rel_type <obj> tail_type ...
|
||||
Adapted from the README example.
|
||||
"""
|
||||
triplets = []
|
||||
relation = ""
|
||||
text = text.strip()
|
||||
current = "x"
|
||||
subject, relation, object_, object_type, subject_type = "", "", "", "", ""
|
||||
|
||||
for token in (
|
||||
text.replace("<s>", "")
|
||||
.replace("<pad>", "")
|
||||
.replace("</s>", "")
|
||||
.replace("tp_XX", "")
|
||||
.replace("__en__", "")
|
||||
.split()
|
||||
):
|
||||
if token == "<triplet>" or token == "<relation>":
|
||||
current = "t"
|
||||
if relation != "":
|
||||
triplets.append(
|
||||
{
|
||||
"head": subject.strip(),
|
||||
"head_type": subject_type,
|
||||
"type": relation.strip(),
|
||||
"tail": object_.strip(),
|
||||
"tail_type": object_type,
|
||||
}
|
||||
)
|
||||
relation = ""
|
||||
subject = ""
|
||||
elif token.startswith("<") and token.endswith(">"):
|
||||
if current == "t" or current == "o":
|
||||
current = "s"
|
||||
if relation != "":
|
||||
triplets.append(
|
||||
{
|
||||
"head": subject.strip(),
|
||||
"head_type": subject_type,
|
||||
"type": relation.strip(),
|
||||
"tail": object_.strip(),
|
||||
"tail_type": object_type,
|
||||
}
|
||||
)
|
||||
object_ = ""
|
||||
subject_type = token[1:-1]
|
||||
else:
|
||||
current = "o"
|
||||
object_type = token[1:-1]
|
||||
relation = ""
|
||||
else:
|
||||
if current == "t":
|
||||
subject += " " + token
|
||||
elif current == "s":
|
||||
object_ += " " + token
|
||||
elif current == "o":
|
||||
relation += " " + token
|
||||
if subject != "" and relation != "" and object_ != "" and object_type != "" and subject_type != "":
|
||||
triplets.append(
|
||||
{
|
||||
"head": subject.strip(),
|
||||
"head_type": subject_type,
|
||||
"type": relation.strip(),
|
||||
"tail": object_.strip(),
|
||||
"tail_type": object_type,
|
||||
}
|
||||
)
|
||||
return triplets
|
||||
|
||||
|
||||
def main():
|
||||
print("[load] mREBEL...", flush=True)
|
||||
t0 = time.time()
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
"Babelscape/mrebel-large", src_lang="es_XX", tgt_lang="tp_XX"
|
||||
)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")
|
||||
print(f"[load] mREBEL ready in {time.time()-t0:.1f}s")
|
||||
|
||||
print(f"\n[input ES] {len(TEXT_ES)} chars")
|
||||
inputs = tokenizer(TEXT_ES, max_length=512, padding=True, truncation=True, return_tensors="pt")
|
||||
print("[generate]")
|
||||
t0 = time.time()
|
||||
out = model.generate(
|
||||
inputs["input_ids"].to(model.device),
|
||||
attention_mask=inputs["attention_mask"].to(model.device),
|
||||
decoder_start_token_id=tokenizer.convert_tokens_to_ids("tp_XX"),
|
||||
max_length=512,
|
||||
num_beams=4,
|
||||
length_penalty=0.0,
|
||||
)
|
||||
print(f"[generate] {time.time()-t0:.1f}s")
|
||||
decoded = tokenizer.batch_decode(out, skip_special_tokens=False)
|
||||
print("\n=== RAW DECODED ===")
|
||||
print(decoded[0][:2000])
|
||||
print("\n=== TRIPLETS ===")
|
||||
triplets = extract_triplets_typed(decoded[0])
|
||||
print(f"n={len(triplets)}\n")
|
||||
for t in triplets:
|
||||
print(f" ({t['head']:32s} : {t['head_type']:15s}) --[{t['type']:25s}]--> ({t['tail']:32s} : {t['tail_type']:15s})")
|
||||
# Save for the notebook
|
||||
import json
|
||||
out_path = Path(__file__).resolve().parent / "mrebel_results.json"
|
||||
out_path.write_text(json.dumps({
|
||||
"text": TEXT_ES,
|
||||
"raw_decoded": decoded[0],
|
||||
"triplets": triplets,
|
||||
}, indent=2, ensure_ascii=False))
|
||||
print(f"\n[saved] {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user