"""Extraccion de tripletas OpenIE schema-less en castellano via reglas de dependencia. Validado en notebook 09 del analisis gliner_glirel_tuning. LICENSE: spaCy MIT + es_core_news_md CC BY-SA 4.0. """ from __future__ import annotations import time from typing import Any # Determinantes y pronombres que no son entidades significativas STOP_TOKENS = { "el", "la", "los", "las", "un", "una", "unos", "unas", "esto", "eso", "aquello", "esta", "este", "estos", "estas", "que", "quien", "cual", "cuales", } def _clean_span(span_tokens) -> str: # type: ignore[type-arg] """Extrae texto de un span de tokens, eliminando preposiciones iniciales.""" toks = list(span_tokens) while toks and toks[0].pos_ == "ADP": toks = toks[1:] return " ".join(t.text for t in toks).strip() def _is_meaningful(text: str) -> bool: """Comprueba que un span no es vacio ni una stopword.""" if not text or not text.strip(): return False if text.lower() in STOP_TOKENS: return False return True def extract_triples_spacy_es(text: str, nlp: Any) -> dict: """Extract OpenIE-style (subject, relation, object) triples from Spanish text. Uses spaCy dependency rules to find subject-verb-object patterns. Schema-LESS: the relation is the verb's lemma (no fixed vocabulary). Also extracts spaCy NER entities (PER, ORG, LOC, MISC). Args: text: Spanish text to analyze. Works best with complete sentences. nlp: spaCy Language instance loaded with spacy_es_load_model. Returns: { "text": str, "triples": [ {"subject": str, "relation": str, "object": str, "verb_form": str, "object_dep": str, "prep": str|None}, ... ], "entities": [{"text": str, "label": str}, ...], "elapsed_s": float } """ t0 = time.time() doc = nlp(text) triples: list[dict] = [] for tok in doc: if tok.pos_ not in ("VERB", "AUX"): continue verb_lemma = tok.lemma_ verb_form = tok.text subjs = [ c for c in tok.children if c.dep_ in ("nsubj", "nsubj:pass", "csubj") ] if not subjs: continue objects: list[tuple] = [] for c in tok.children: if c.dep_ in ("obj", "dobj", "iobj", "attr", "xcomp", "ccomp"): objects.append((c, c.dep_, None)) elif c.dep_ in ("obl", "obl:agent", "nmod"): prep = None for cc in c.children: if cc.dep_ == "case" and cc.pos_ == "ADP": prep = cc.text.lower() break objects.append((c, c.dep_, prep)) for s in subjs: s_text = _clean_span(s.subtree) if not _is_meaningful(s_text): continue for o, dep, prep in objects: o_text = _clean_span(o.subtree) if not _is_meaningful(o_text): continue # Construir etiqueta de relacion rel = verb_lemma # Pasiva: marcar con [pass] if any(c.dep_ == "nsubj:pass" for c in tok.children): rel = f"{verb_lemma}[pass]" # Oblicuo con preposicion (excl. agente y "a" directa) elif prep and dep != "obl:agent" and prep != "a": rel = f"{verb_lemma}_{prep}" triples.append({ "subject": s_text, "relation": rel, "object": o_text, "verb_form": verb_form, "object_dep": dep, "prep": prep, }) ents = [{"text": e.text, "label": e.label_} for e in doc.ents] return { "text": text, "triples": triples, "entities": ents, "elapsed_s": round(time.time() - t0, 3), }