chore: initial sync — gliner+glirel benchmark notebooks
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,188 @@
|
||||
"""Study of OpenIE / schema-less extraction.

Compares 3 paradigms over the same kind of texts:
  A. triplet-extract (EN)            — pip install triplet-extract, modern spaCy-based OpenIE
  B. spaCy ES dependency rules       — home-grown version for Spanish
  C. GLiNER2 with a universal schema — schema-driven with broad relations

Paradigm A runs on CORPUS_EN; paradigms B and C run on CORPUS_ES.

Dumps everything to openie_results.json so that the notebook can load it
without reloading the models.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
# Directory that contains this script; main() writes its JSON output here.
HERE = Path(__file__).resolve().parent

# Ignore every warning and ask the Hugging Face hub to skip progress bars
# (the variable is only set when the caller has not already chosen a value).
warnings.filterwarnings("ignore")
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

# Drop any sys.path entry located *under* the registry functions directory,
# then make sure the directory itself is importable (first on the path when
# it had to be added).
# NOTE(review): presumably the sub-directories shadow installed packages —
# confirm the reason with the author.
_pf = "/home/lucas/fn_registry/python/functions"
sys.path = [entry for entry in sys.path if not entry.startswith(f"{_pf}/")]
if _pf not in sys.path:
    sys.path.insert(0, _pf)
|
||||
|
||||
|
||||
# ── EN corpus (where triplet-extract can run natively) ──
# Maps a short scenario key to the English sentence to analyse.
CORPUS_EN: dict[str, str] = {
    "personal_simple":
        "John kissed Mary at the park.",
    "personal_love":
        "Anna loves Bob and Bob admires Anna.",
    "corporate_short":
        "Carlos Torres chairs BBVA which has its headquarters in Bilbao.",
    "corporate_history":
        "Pablo Isla chaired Inditex from 2011 to 2022 and now serves on the board of Telefonica.",
    "mixed_emotional":
        "After the meeting, Sarah hugged her brother Tom who had just graduated.",
}
|
||||
|
||||
# ── ES corpus (exercising the native spaCy version + schema-driven GLiNER2) ──
# Maps a short scenario key to the Spanish sentence to analyse.
CORPUS_ES: dict[str, str] = {
    "personal_simple":
        "Enmanuel quiere a Ashlly desde hace anos.",
    "personal_family":
        "Maria abrazo a su hermano Tomas tras la reunion.",
    "corporate_short":
        "Carlos Torres preside BBVA, con sede central en Bilbao.",
    "corporate_history":
        "Pablo Isla presidio Inditex de 2011 a 2022 y ahora forma parte del consejo de Telefonica.",
    "mixed_emotional":
        "Despues de la cena, Sara llamo a su madre Lucia para contarle las noticias.",
}
|
||||
|
||||
|
||||
def run_triplet_extract_en():
    """Run paradigm A: ``triplet-extract`` over the English corpus.

    Every sentence in ``CORPUS_EN`` is passed to ``triplet_extract.extract``
    and timed individually; a one-line summary per sentence is printed.

    Returns:
        dict keyed by corpus entry name. Each value holds the source
        ``text``, ``elapsed_s`` (seconds, rounded to 3 decimals),
        ``n_triples`` and ``triples`` — a list of dicts with ``subject``,
        ``relation``, ``object`` and ``confidence`` (rounded to 2 decimals).
    """
    # Imported lazily so the dependency is only needed when this paradigm runs.
    from triplet_extract import extract

    results = {}
    print("\n[A] triplet-extract EN...", flush=True)
    for key in CORPUS_EN:
        sentence = CORPUS_EN[key]

        started = time.time()
        extracted = extract(sentence)
        took = time.time() - started

        record = {
            "text": sentence,
            "elapsed_s": round(took, 3),
            "n_triples": len(extracted),
            "triples": [],
        }
        # Flatten each triple object into plain JSON-serialisable values.
        for item in extracted:
            record["triples"].append({
                "subject": item.subject,
                "relation": item.relation,
                "object": item.object,
                "confidence": round(float(item.confidence), 2),
            })
        results[key] = record

        print(f" {key}: {len(extracted)} triples en {took:.2f}s", flush=True)
    return results
|
||||
|
||||
|
||||
def run_spacy_es_dep_rules():
    """Run paradigm B: spaCy ``es_core_news_md`` + dependency rules -> triples.

    Loads the Spanish pipeline, parses every sentence in ``CORPUS_ES`` and
    derives (subject, relation, object) triples from the dependents of each
    verb. The entities in ``doc.ents`` are reported alongside the triples.

    Returns:
        dict keyed by corpus entry name. Each value holds the source
        ``text``, ``elapsed_s`` (parse + rule extraction, rounded to 3
        decimals), ``n_triples``, ``n_ents``, ``triples`` (list of dicts
        with ``subject``, ``relation``, ``object``, ``verb_form``) and
        ``entities`` (list of dicts with ``text`` and ``label``).
    """
    # Imported lazily so the dependency is only needed when this paradigm runs.
    import spacy

    print("\n[B] spaCy ES dep-rules...", flush=True)
    t0 = time.time()
    nlp = spacy.load("es_core_news_md")
    print(f" load: {time.time()-t0:.1f}s", flush=True)

    # Dependency labels accepted for each role, defined once outside the loops.
    subject_deps = ("nsubj", "nsubj:pass", "csubj")
    direct_deps = ("obj", "dobj", "iobj", "attr")
    oblique_deps = ("obl", "obl:agent", "nmod")

    def extract_triples_es(doc):
        """Build one triple per (subject, object) pair of every verb.

        For each VERB/AUX token:
          - subject = child whose dependency is nsubj / nsubj:pass / csubj
          - object  = child that is a direct object (obj, dobj, iobj, attr)
                      or an oblique/nominal modifier (obl, obl:agent, nmod),
                      whatever preposition introduces it

        Subject and object texts are the space-joined tokens of the child's
        whole subtree, so modifiers are included. The relation is the verb
        lemma; the surface form is kept in ``verb_form``.
        """
        triples = []
        for token in doc:
            if token.pos_ not in ("VERB", "AUX"):
                continue
            children = list(token.children)
            subjs = [c for c in children if c.dep_ in subject_deps]
            if not subjs:
                continue
            # Direct objects first, then obliques. Each child is taken exactly
            # once: prepositional "X a Y" objects are already tagged obl, so
            # appending obl children a second time only duplicated triples.
            objs = [c for c in children if c.dep_ in direct_deps]
            objs += [c for c in children if c.dep_ in oblique_deps]
            obj_texts = [" ".join(t.text for t in o.subtree) for o in objs]

            for s in subjs:
                # Subject span, modifiers included.
                s_text = " ".join(t.text for t in s.subtree)
                for o_text in obj_texts:
                    triples.append({
                        "subject": s_text,
                        "relation": token.lemma_,
                        "object": o_text,
                        "verb_form": token.text,
                    })
        return triples

    out = {}
    for name, text in CORPUS_ES.items():
        t0 = time.time()
        doc = nlp(text)
        triples = extract_triples_es(doc)
        elapsed = time.time() - t0
        # Also report the NER entities found by the pipeline.
        ents = [{"text": e.text, "label": e.label_} for e in doc.ents]
        out[name] = {
            "text": text,
            "elapsed_s": round(elapsed, 3),
            "n_triples": len(triples),
            "n_ents": len(ents),
            "triples": triples,
            "entities": ents,
        }
        print(f" {name}: {len(triples)} triples + {len(ents)} ents en {elapsed:.3f}s", flush=True)
    return out
|
||||
|
||||
|
||||
def run_gliner2_universal():
    """Run paradigm C: GLiNER2 with a universal schema on the Spanish corpus.

    Loads ``fastino/gliner2-large-v1``, builds one schema from a broad list
    of entity types and relation labels, and calls ``model.extract`` on every
    sentence in ``CORPUS_ES`` with ``threshold=0.3``.

    Returns:
        dict keyed by corpus entry name. Each value holds the source
        ``text``, ``elapsed_s`` (rounded to 3 decimals), ``n_ents``,
        ``n_rels``, and the non-empty ``entities`` / ``relations`` groups
        copied into plain lists.
    """
    # Imported lazily so the dependency is only needed when this paradigm runs.
    from gliner2 import GLiNER2

    print("\n[C] GLiNER2 universal schema (ES)...", flush=True)
    load_started = time.time()
    model = GLiNER2.from_pretrained("fastino/gliner2-large-v1")
    print(f" load: {time.time()-load_started:.1f}s", flush=True)

    entity_labels = [
        "person", "organization", "location", "place",
        "date", "money", "product", "event",
    ]

    # Relation labels, grouped by theme and concatenated in this order.
    personal = [
        "loves", "knows", "married_to", "parent_of", "child_of",
        "sibling_of", "friend_of", "kissed", "hugged",
    ]
    work = ["works_at", "ceo_of", "president_of", "employed_by", "member_of"]
    spatial = ["located_in", "headquartered_in", "born_in", "lives_in", "from"]
    corporate = ["subsidiary_of", "founded_by", "agreement_with", "acquired"]
    generic = ["related_to", "mentions", "part_of", "owns"]
    relation_labels = personal + work + spatial + corporate + generic

    schema_builder = model.create_schema()
    schema = schema_builder.entities(entity_labels).relations(relation_labels)

    results = {}
    for key, sentence in CORPUS_ES.items():
        started = time.time()
        prediction = model.extract(sentence, schema=schema, threshold=0.3)
        took = time.time() - started

        found_entities = prediction["entities"]
        found_relations = prediction["relation_extraction"]
        n_ents = sum(map(len, found_entities.values()))
        n_rels = sum(map(len, found_relations.values()))

        results[key] = {
            "text": sentence,
            "elapsed_s": round(took, 3),
            "n_ents": n_ents,
            "n_rels": n_rels,
            # Keep only the labels that actually produced something.
            "entities": {
                label: list(spans)
                for label, spans in found_entities.items()
                if spans
            },
            "relations": {
                label: list(pairs)
                for label, pairs in found_relations.items()
                if pairs
            },
        }
        print(f" {key}: {n_ents} ents + {n_rels} rels en {took:.2f}s", flush=True)
    return results
|
||||
|
||||
|
||||
def main():
    """Run the three extraction benchmarks and save the combined results.

    Paradigms A, B and C are executed in that order. Their outputs, together
    with both corpora, are written as indented JSON to
    ``openie_results.json`` in the directory of this script (``HERE``), and
    the output path is printed.
    """
    # Dict values are evaluated top to bottom, so the runs keep the A, B, C order.
    out: dict = {
        "corpus_en": CORPUS_EN,
        "corpus_es": CORPUS_ES,
        "A_triplet_extract_en": run_triplet_extract_en(),
        "B_spacy_es_dep": run_spacy_es_dep_rules(),
        "C_gliner2_universal_es": run_gliner2_universal(),
    }

    out_path = HERE / "openie_results.json"
    # ensure_ascii=False emits non-ASCII characters verbatim, so pin the file
    # encoding to UTF-8 instead of relying on the platform's locale default.
    out_path.write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"\n[saved] {out_path}", flush=True)
|
||||
|
||||
|
||||
# Run the full benchmark only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user