"""Estudio de OpenIE / extraccion schema-less. Compara 3 paradigmas sobre el mismo conjunto de textos: A. triplet-extract (EN) — pip install triplet-extract, OpenIE moderno spaCy-based B. spaCy ES dependency rules — version casera para castellano C. GLiNER2 con schema universal — schema-driven con relaciones amplias Vuelca a openie_results.json para que el notebook lo cargue sin recargar modelos. """ from __future__ import annotations import json import os import sys import time import warnings from pathlib import Path warnings.filterwarnings("ignore") os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") HERE = Path(__file__).resolve().parent _pf = "/home/lucas/fn_registry/python/functions" sys.path = [p for p in sys.path if not p.startswith(_pf + "/")] if _pf not in sys.path: sys.path.insert(0, _pf) # ── Corpus EN (donde triplet-extract puede correr nativo) ── CORPUS_EN = { "personal_simple": "John kissed Mary at the park.", "personal_love": "Anna loves Bob and Bob admires Anna.", "corporate_short": "Carlos Torres chairs BBVA which has its headquarters in Bilbao.", "corporate_history": "Pablo Isla chaired Inditex from 2011 to 2022 and now serves on the board of Telefonica.", "mixed_emotional": "After the meeting, Sarah hugged her brother Tom who had just graduated.", } # ── Corpus ES (probando version nativa spaCy + schema-driven GLiNER2) ── CORPUS_ES = { "personal_simple": "Enmanuel quiere a Ashlly desde hace anos.", "personal_family": "Maria abrazo a su hermano Tomas tras la reunion.", "corporate_short": "Carlos Torres preside BBVA, con sede central en Bilbao.", "corporate_history": "Pablo Isla presidio Inditex de 2011 a 2022 y ahora forma parte del consejo de Telefonica.", "mixed_emotional": "Despues de la cena, Sara llamo a su madre Lucia para contarle las noticias.", } def run_triplet_extract_en(): """A. triplet-extract sobre corpus EN.""" from triplet_extract import extract out = {} print("\n[A] triplet-extract EN...", flush=True) for name, text in CORPUS_EN.items(): t0 = time.time() triples = extract(text) elapsed = time.time() - t0 out[name] = { "text": text, "elapsed_s": round(elapsed, 3), "n_triples": len(triples), "triples": [ {"subject": t.subject, "relation": t.relation, "object": t.object, "confidence": round(float(t.confidence), 2)} for t in triples ], } print(f" {name}: {len(triples)} triples en {elapsed:.2f}s", flush=True) return out def run_spacy_es_dep_rules(): """B. spaCy es_core_news_md + reglas de dependencia → tripletas.""" import spacy print("\n[B] spaCy ES dep-rules...", flush=True) t0 = time.time() nlp = spacy.load("es_core_news_md") print(f" load: {time.time()-t0:.1f}s", flush=True) def extract_triples_es(doc): """Para cada verbo: - subject = token con dep nsubj/nsubj:pass (o el sujeto pronominal implicito) - object = nsubj+obj+obl (acepta diferentes preps) """ triples = [] for token in doc: if token.pos_ != "VERB" and token.pos_ != "AUX": continue # encontrar sujeto subjs = [c for c in token.children if c.dep_ in ("nsubj", "nsubj:pass", "csubj")] # objetos directos / oblicuos / atributos objs_direct = [c for c in token.children if c.dep_ in ("obj", "dobj", "iobj", "attr")] objs_oblique = [c for c in token.children if c.dep_ in ("obl", "obl:agent", "nmod")] # tambien capturar "X a Y" (objeto preposicional con "a") for c in token.children: if c.dep_ == "obl" or c.dep_ == "obl:agent": objs_oblique.append(c) for s in subjs: # span del sujeto (incluye modificadores) s_text = " ".join([t.text for t in s.subtree]) # primero objetos directos for o in objs_direct + objs_oblique: o_text = " ".join([t.text for t in o.subtree]) triples.append({ "subject": s_text, "relation": token.lemma_, "object": o_text, "verb_form": token.text, }) return triples out = {} for name, text in CORPUS_ES.items(): t0 = time.time() doc = nlp(text) triples = extract_triples_es(doc) elapsed = time.time() - t0 # tambien NER para reportar entidades ents = [{"text": e.text, "label": e.label_} for e in doc.ents] out[name] = { "text": text, "elapsed_s": round(elapsed, 3), "n_triples": len(triples), "n_ents": len(ents), "triples": triples, "entities": ents, } print(f" {name}: {len(triples)} triples + {len(ents)} ents en {elapsed:.3f}s", flush=True) return out def run_gliner2_universal(): """C. GLiNER2 con schema universal (entity types amplios + relaciones diversas).""" from gliner2 import GLiNER2 print("\n[C] GLiNER2 universal schema (ES)...", flush=True) t0 = time.time() model = GLiNER2.from_pretrained("fastino/gliner2-large-v1") print(f" load: {time.time()-t0:.1f}s", flush=True) UNIVERSAL_ENT_LABELS = [ "person", "organization", "location", "place", "date", "money", "product", "event", ] UNIVERSAL_REL_LABELS = [ # personal "loves", "knows", "married_to", "parent_of", "child_of", "sibling_of", "friend_of", "kissed", "hugged", # work "works_at", "ceo_of", "president_of", "employed_by", "member_of", # spatial "located_in", "headquartered_in", "born_in", "lives_in", "from", # corporate "subsidiary_of", "founded_by", "agreement_with", "acquired", # generic "related_to", "mentions", "part_of", "owns", ] schema = model.create_schema().entities(UNIVERSAL_ENT_LABELS).relations(UNIVERSAL_REL_LABELS) out = {} for name, text in CORPUS_ES.items(): t0 = time.time() r = model.extract(text, schema=schema, threshold=0.3) elapsed = time.time() - t0 n_ents = sum(len(v) for v in r["entities"].values()) n_rels = sum(len(v) for v in r["relation_extraction"].values()) out[name] = { "text": text, "elapsed_s": round(elapsed, 3), "n_ents": n_ents, "n_rels": n_rels, "entities": {k: list(v) for k, v in r["entities"].items() if v}, "relations": {k: list(v) for k, v in r["relation_extraction"].items() if v}, } print(f" {name}: {n_ents} ents + {n_rels} rels en {elapsed:.2f}s", flush=True) return out def main(): out: dict = {"corpus_en": CORPUS_EN, "corpus_es": CORPUS_ES} out["A_triplet_extract_en"] = run_triplet_extract_en() out["B_spacy_es_dep"] = run_spacy_es_dep_rules() out["C_gliner2_universal_es"] = run_gliner2_universal() out_path = HERE / "openie_results.json" out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False)) print(f"\n[saved] {out_path}", flush=True) if __name__ == "__main__": main()