# fn_registry/python/functions/datascience/extract_triples_spacy_es.py

"""Schema-less OpenIE triple extraction for Spanish text via dependency rules.

Validated in notebook 09 of the gliner_glirel_tuning analysis.
LICENSE: spaCy MIT + es_core_news_md CC BY-SA 4.0.
"""

from __future__ import annotations

import time
from typing import Any

# Determiners and pronouns that, on their own, are not meaningful entities.
# Matching is done against lower-cased span text, so every entry is lowercase.
STOP_TOKENS = set(
    "el la los las un una unos unas "          # articles
    "esto eso aquello esta este estos estas "  # demonstratives
    "que quien cual cuales".split()            # relative / interrogative pronouns
)


def _clean_span(span_tokens) -> str:  # type: ignore[type-arg]
    """Extrae texto de un span de tokens, eliminando preposiciones iniciales."""
    toks = list(span_tokens)
    while toks and toks[0].pos_ == "ADP":
        toks = toks[1:]
    return " ".join(t.text for t in toks).strip()


def _is_meaningful(text: str) -> bool:
    """Comprueba que un span no es vacio ni una stopword."""
    if not text or not text.strip():
        return False
    if text.lower() in STOP_TOKENS:
        return False
    return True


def extract_triples_spacy_es(text: str, nlp: Any) -> dict[str, Any]:
    """Extract OpenIE-style (subject, relation, object) triples from Spanish text.

    Walks the dependency parse and, for every VERB/AUX token that has at
    least one subject child (``nsubj``, ``nsubj:pass``, ``csubj``), pairs each
    subject with each object-like child of that verb. Schema-less: there is
    no fixed relation vocabulary; the relation label is derived from the
    verb's lemma:

    * ``"<lemma>[pass]"`` when the verb has an ``nsubj:pass`` child;
    * ``"<lemma>_<prep>"`` for an oblique/nmod argument introduced by an
      adposition other than ``"a"`` (and not an ``obl:agent``);
    * ``"<lemma>"`` otherwise.

    Subject and object texts are the token's full subtree with leading
    adpositions removed; arguments that are empty or a bare stopword are
    skipped. Named entities are copied from ``doc.ents`` as-is.

    Args:
        text: Spanish text to analyze. Works best with complete sentences.
        nlp: Callable pipeline returning a spaCy-style ``Doc`` (per the
            original note: a Language instance loaded with
            ``spacy_es_load_model``).

    Returns:
        {
          "text": str,
          "triples": [
            {"subject": str, "relation": str, "object": str,
             "verb_form": str, "object_dep": str, "prep": str|None},
            ...
          ],
          "entities": [{"text": str, "label": str}, ...],
          "elapsed_s": float   # seconds, rounded to 3 decimals
        }
    """
    # perf_counter is monotonic, so the measured interval cannot go negative
    # or jump if the system clock is adjusted while the pipeline runs.
    t0 = time.perf_counter()
    doc = nlp(text)
    triples: list[dict[str, Any]] = []

    for tok in doc:
        if tok.pos_ not in ("VERB", "AUX"):
            continue

        # Materialize once; the children are scanned more than once below.
        children = list(tok.children)
        subjs = [
            c for c in children
            if c.dep_ in ("nsubj", "nsubj:pass", "csubj")
        ]
        if not subjs:
            continue

        verb_lemma = tok.lemma_
        verb_form = tok.text
        # Property of the verb, not of a (subject, object) pair: compute once.
        is_passive = any(s.dep_ == "nsubj:pass" for s in subjs)

        # Resolve every usable object up front so its span text and relation
        # label are built once instead of once per subject.
        candidates: list[tuple[str, str, Any, str]] = []
        for c in children:
            dep = c.dep_
            if dep in ("obj", "dobj", "iobj", "attr", "xcomp", "ccomp"):
                prep = None
            elif dep in ("obl", "obl:agent", "nmod"):
                # First adposition attached as a `case` marker, if any.
                prep = next(
                    (
                        g.text.lower()
                        for g in c.children
                        if g.dep_ == "case" and g.pos_ == "ADP"
                    ),
                    None,
                )
            else:
                continue

            o_text = _clean_span(c.subtree)
            if not _is_meaningful(o_text):
                continue

            if is_passive:
                # Passive: flag with [pass]; takes precedence over the prep.
                rel = f"{verb_lemma}[pass]"
            elif prep and dep != "obl:agent" and prep != "a":
                # Oblique with preposition (excluding agent and direct "a").
                rel = f"{verb_lemma}_{prep}"
            else:
                rel = verb_lemma
            candidates.append((o_text, dep, prep, rel))

        if not candidates:
            continue

        for s in subjs:
            s_text = _clean_span(s.subtree)
            if not _is_meaningful(s_text):
                continue
            for o_text, dep, prep, rel in candidates:
                triples.append({
                    "subject": s_text,
                    "relation": rel,
                    "object": o_text,
                    "verb_form": verb_form,
                    "object_dep": dep,
                    "prep": prep,
                })

    ents = [{"text": e.text, "label": e.label_} for e in doc.ents]

    return {
        "text": text,
        "triples": triples,
        "entities": ents,
        "elapsed_s": round(time.perf_counter() - t0, 3),
    }