Files
gliner_glirel_tuning/build_notebook_09_spacy_es.py
2026-05-04 23:44:11 +02:00

330 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Construye notebooks/09_spacy_es_openie.ipynb — extraccion OpenIE-style
schema-less en castellano usando spaCy es_core_news_md + reglas de dependencia.
Live execution (spaCy es rapidisimo).
"""
from __future__ import annotations
from pathlib import Path
import nbformat as nbf
HERE = Path(__file__).resolve().parent
NB_PATH = HERE / "notebooks" / "09_spacy_es_openie.ipynb"
def _md(t: str): return nbf.v4.new_markdown_cell(t)
def _code(s: str):
cell = nbf.v4.new_code_cell(s); cell.outputs = []; cell.execution_count = None
return cell
def build():
cells = []
cells.append(_md(
"# OpenIE en castellano — spaCy ES + reglas de dependencia\n\n"
"**Paradigma:** schema-less. El predicado es **el verbo del propio texto**, no de un vocabulario fijo.\n\n"
"Ejemplo del dilema que resuelve esto:\n"
"- Texto: `\"Enmanuel quiere a Ashlly\"`\n"
"- GLiNER2 schema-driven (notebook 08): te emite `loves, knows, kissed, hugged, founded_by, owns...` — fuerza relaciones del schema\n"
"- spaCy ES dep-rules: `(Enmanuel, querer, Ashlly)` — el verbo `querer` viene del texto\n\n"
"## Por que spaCy ES nativo y NO 'translate + triplet-extract EN'\n\n"
"| | spaCy ES nativo | Translate + triplet-extract EN |\n"
"|---|---|---|\n"
"| Velocidad | ~5ms / frase | ~500ms-1s / frase (MarianMT + extract) |\n"
"| Predicado | Verbo original (`querer`, `abrazar`) | Verbo en EN (`loves`, `hugs`) — perdida del original |\n"
"| Riesgo nombres propios | Cero | Traduccion puede romperlos (Enmanuel → Emmanuel) |\n"
"| RAM extra | 50MB (es_core_news_md) | 300MB extra (MarianMT) |\n"
"| Schema-less de verdad | SI | SI |\n"
"| Maturity | Reglas hay que escribirlas | triplet-extract maduro pero EN-only |"
))
cells.append(_md("## 1. Setup"))
cells.append(_code(
"import warnings; warnings.filterwarnings('ignore')\n"
"import sys, json, time\n"
"from pathlib import Path\n"
"_pf = '/home/lucas/fn_registry/python/functions'\n"
"sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
"if _pf not in sys.path: sys.path.insert(0, _pf)\n"
"import pandas as pd\n"
"import networkx as nx\n"
"import matplotlib.pyplot as plt\n"
"from matplotlib.patches import Patch\n"
"import spacy\n"
"\n"
"t0 = time.time()\n"
"nlp = spacy.load('es_core_news_md')\n"
"print(f'spaCy es_core_news_md ready in {time.time()-t0:.2f}s ({sum(1 for _ in nlp.pipeline)} pipes)')"
))
cells.append(_md(
"## 2. Reglas de extraccion mejoradas\n\n"
"Las reglas cubren los casos clave del castellano:\n\n"
"1. **Sujeto + verbo + objeto directo** (`obj`)\n"
"2. **\"a\" personal** (`obl:agent` o `obl` con prep `a` sobre persona) — `abrazo a Tomas`\n"
"3. **Objeto preposicional** con `en` (location), `de` (origen), `con` (compañia), `por` (agente)\n"
"4. **Copular** (`ser`, `estar`) — `Pablo es presidente`\n"
"5. **Verbos pronominales** (`se firmo`)\n"
"6. **Filtrar tripletas con sujeto/objeto vacio o solo determinantes**"
))
cells.append(_code(
"STOPS = {'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas',\n"
" 'esto', 'eso', 'aquello', 'esta', 'este', 'estos', 'estas',\n"
" 'que', 'quien', 'cual'}\n"
"\n"
"def clean_span(span_tokens):\n"
" \"\"\"Devuelve el texto del span quitando determinantes/preps al inicio si hace falta.\"\"\"\n"
" toks = list(span_tokens)\n"
" # quitar preposiciones iniciales (a, en, de, con, por...)\n"
" while toks and toks[0].pos_ == 'ADP':\n"
" toks = toks[1:]\n"
" return ' '.join(t.text for t in toks).strip()\n"
"\n"
"def is_meaningful(text):\n"
" if not text or not text.strip(): return False\n"
" if text.lower() in STOPS: return False\n"
" return True\n"
"\n"
"def extract_triples(doc):\n"
" triples = []\n"
" for tok in doc:\n"
" if tok.pos_ not in ('VERB', 'AUX'):\n"
" continue\n"
" verb_lemma = tok.lemma_\n"
" verb_form = tok.text\n"
"\n"
" # SUJETO\n"
" subjs = [c for c in tok.children if c.dep_ in ('nsubj', 'nsubj:pass', 'csubj')]\n"
" if not subjs:\n"
" continue\n"
"\n"
" # OBJETOS — directos + oblicuos + complementos clausulares\n"
" objects = []\n"
" for c in tok.children:\n"
" if c.dep_ in ('obj', 'dobj', 'iobj', 'attr', 'xcomp', 'ccomp'):\n"
" objects.append((c, c.dep_, None))\n"
" elif c.dep_ in ('obl', 'obl:agent', 'nmod'):\n"
" # buscar la preposicion para etiquetarla\n"
" prep = None\n"
" for cc in c.children:\n"
" if cc.dep_ == 'case' and cc.pos_ == 'ADP':\n"
" prep = cc.text.lower(); break\n"
" objects.append((c, c.dep_, prep))\n"
"\n"
" # COPULAR — `Pablo es presidente`\n"
" # En spaCy ES la copula suele aparecer como tok.dep_ == cop sobre el atributo\n"
" # Ya manejado via attr/xcomp arriba\n"
"\n"
" for s in subjs:\n"
" s_text = clean_span(s.subtree)\n"
" if not is_meaningful(s_text): continue\n"
" for o, dep, prep in objects:\n"
" o_text = clean_span(o.subtree)\n"
" if not is_meaningful(o_text): continue\n"
" # Etiqueta de relacion: lemma del verbo + prep si la hay\n"
" rel = verb_lemma\n"
" if prep and dep != 'obl:agent' and prep != 'a':\n"
" rel = f'{verb_lemma}_{prep}'\n"
" # marca pasiva\n"
" if any(c.dep_ == 'nsubj:pass' for c in tok.children):\n"
" rel = f'{verb_lemma}[pass]'\n"
" triples.append({\n"
" 'subject': s_text,\n"
" 'relation': rel,\n"
" 'object': o_text,\n"
" 'verb_form': verb_form,\n"
" 'object_dep': dep,\n"
" 'prep': prep,\n"
" })\n"
" return triples\n"
"\n"
"print('extract_triples ready')"
))
cells.append(_md(
"## 3. Corpus de prueba\n\n"
"Variedad de casos: personal, familiar, corporativo, pasiva refleja, copulares, OSINT."
))
cells.append(_code(
"CORPUS = {\n"
" 'personal_amor': 'Enmanuel quiere a Ashlly desde hace anos.',\n"
" 'personal_familia': 'Maria abrazo a su hermano Tomas tras la reunion.',\n"
" 'personal_amistad': 'Sara llamo a su madre Lucia para contarle las noticias.',\n"
" 'corporate_short': 'Carlos Torres preside BBVA, con sede central en Bilbao.',\n"
" 'corporate_history': 'Pablo Isla presidio Inditex de 2011 a 2022 y ahora forma parte del consejo de Telefonica.',\n"
" 'pasiva_refleja': 'Se firmaron acuerdos entre Iberdrola y Endesa.',\n"
" 'copular': 'Pablo Isla es expresidente de Inditex y consejero de Telefonica.',\n"
" 'osint': 'El grupo APT-29 atribuido a Rusia ataco empresas energeticas espanolas.',\n"
" 'biografico': 'Amancio Ortega fundo Inditex en 1985 en Arteixo.',\n"
" 'evento': 'El acuerdo movilizara dos mil millones en cinco anos.',\n"
"}\n"
"for k, v in CORPUS.items():\n"
" print(f'{k:20s}{v}')"
))
cells.append(_md("## 4. Ejecutar — un texto, ver tripletas y entidades NER"))
cells.append(_code(
"results = {}\n"
"for name, text in CORPUS.items():\n"
" t0 = time.time()\n"
" doc = nlp(text)\n"
" triples = extract_triples(doc)\n"
" elapsed = time.time() - t0\n"
" ents = [{'text': e.text, 'label': e.label_} for e in doc.ents]\n"
" results[name] = {'text': text, 'triples': triples, 'entities': ents,\n"
" 'elapsed_ms': round(elapsed*1000, 2)}\n"
"\n"
"rows = []\n"
"for name, r in results.items():\n"
" rows.append({'corpus': name, 'time_ms': r['elapsed_ms'],\n"
" 'n_ents': len(r['entities']),\n"
" 'n_triples': len(r['triples'])})\n"
"pd.DataFrame(rows)"
))
cells.append(_md("## 5. Tripletas extraidas por texto"))
cells.append(_code(
"for name, r in results.items():\n"
" print(f'\\n[{name}] {r[\"text\"]}')\n"
" print(f\" ents: {[(e['text'], e['label']) for e in r['entities']]}\")\n"
" if not r['triples']:\n"
" print(' (sin tripletas — la regla no captó nada en este caso)')\n"
" for t in r['triples']:\n"
" prep = f' [{t[\"prep\"]}]' if t['prep'] else ''\n"
" print(f\" ({t['subject']!r}, {t['relation']!r}{prep}, {t['object']!r})\")"
))
cells.append(_md(
"## 6. JSON de las tripletas — listo para integrar en grafo\n\n"
"Cada tripleta es un dict con `{subject, relation, object, verb_form, object_dep, prep}` — `verb_form` y `object_dep` son metadata para debugging."
))
cells.append(_code(
"all_triples = []\n"
"for name, r in results.items():\n"
" for t in r['triples']:\n"
" all_triples.append({**t, 'source': name})\n"
"df = pd.DataFrame(all_triples)\n"
"print(f'TOTAL: {len(df)} tripletas en {len(results)} textos')\n"
"df[['subject', 'relation', 'object', 'verb_form', 'prep', 'source']]"
))
cells.append(_md("## 7. Visualizacion — grafo combinado de todas las tripletas"))
cells.append(_code(
"G = nx.DiGraph()\n"
"for t in all_triples:\n"
" s = t['subject']; o = t['object']\n"
" G.add_node(s); G.add_node(o)\n"
" if not G.has_edge(s, o):\n"
" G.add_edge(s, o, kind=t['relation'])\n"
"\n"
"fig, ax = plt.subplots(figsize=(15, 11))\n"
"if G.number_of_nodes():\n"
" pos = nx.spring_layout(G, k=2.0, iterations=100, seed=42)\n"
" nx.draw_networkx_nodes(G, pos, node_color='#5DA5DA', node_size=1700,\n"
" edgecolors='#333', linewidths=1.3, ax=ax)\n"
" labels = {n: (n if len(n) <= 22 else n[:21]+'') for n in G.nodes}\n"
" nx.draw_networkx_labels(G, pos, labels=labels, font_size=8, font_weight='bold', ax=ax)\n"
" nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=14,\n"
" width=1.2, alpha=0.7, ax=ax, connectionstyle='arc3,rad=0.08')\n"
" el = {(u, v): d['kind'] for u, v, d in G.edges(data=True)}\n"
" nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=7, ax=ax,\n"
" bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
"ax.set_title(f'spaCy ES OpenIE — {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=12)\n"
"ax.axis('off'); plt.tight_layout(); plt.show()"
))
cells.append(_md("## 8. Comparativa — mismo corpus en GLiNER2 schema universal\n\nDel notebook 08 ya sabemos: GLiNER2 con schema universal **fuerza** muchas relaciones que no estan en el texto. Aqui re-ejecutamos para tener la cifra concreta y comparar."))
cells.append(_code(
"# Cargar GLiNER2 una sola vez si no esta cargado\n"
"from gliner2 import GLiNER2\n"
"t0 = time.time()\n"
"gl2 = GLiNER2.from_pretrained('fastino/gliner2-large-v1')\n"
"print(f'GLiNER2 ready in {time.time()-t0:.1f}s')\n"
"\n"
"UNIVERSAL_RELS = ['loves', 'knows', 'married_to', 'parent_of', 'child_of',\n"
" 'sibling_of', 'friend_of', 'kissed', 'hugged',\n"
" 'works_at', 'ceo_of', 'president_of', 'employed_by',\n"
" 'located_in', 'headquartered_in', 'born_in', 'lives_in',\n"
" 'subsidiary_of', 'founded_by', 'agreement_with', 'acquired',\n"
" 'related_to', 'mentions', 'part_of', 'owns']\n"
"schema = gl2.create_schema().entities(['person', 'organization', 'location', 'date', 'event']).relations(UNIVERSAL_RELS)\n"
"\n"
"comp = []\n"
"for name, text in CORPUS.items():\n"
" t0 = time.time()\n"
" g = gl2.extract(text, schema=schema, threshold=0.3)\n"
" g_time = time.time() - t0\n"
" n_g_rels = sum(len(v) for v in g['relation_extraction'].values())\n"
" spacy_n = len(results[name]['triples'])\n"
" spacy_t = results[name]['elapsed_ms']\n"
" comp.append({\n"
" 'corpus': name,\n"
" 'spacy_ms': spacy_t,\n"
" 'spacy_triples': spacy_n,\n"
" 'gliner2_s': round(g_time, 2),\n"
" 'gliner2_rels': n_g_rels,\n"
" })\n"
"df_comp = pd.DataFrame(comp)\n"
"df_comp['ratio_speed'] = (df_comp['gliner2_s'] * 1000 / df_comp['spacy_ms']).round(1)\n"
"df_comp"
))
cells.append(_md(
"## 9. Lectura final\n\n"
"**spaCy ES wins on:**\n"
"- ⭐ Velocidad: 200-1000× mas rapido que GLiNER2\n"
"- ⭐ Schema-less: predicado = verbo del texto, no del schema (`querer`, `abrazar`, `presidir` salen literales)\n"
"- ⭐ Sin alucinaciones: si la regla no encaja, devuelve vacio (mejor que inventarse)\n\n"
"**GLiNER2 universal wins on:**\n"
"- Recall (encuentra mas \"posibles\" relaciones, aunque sean discutibles)\n"
"- Output normalizado a un vocabulario controlado\n"
"- NER multilabel mas rico\n\n"
"**Limitaciones de spaCy ES dep-rules (mejorables):**\n"
"- Pasiva refleja (`se firmaron acuerdos`) — la regla la captura pero el sujeto puede salir vacio\n"
"- Pronombres (`su madre Lucia`) — no se resuelve `su` al sujeto previo (necesita coref)\n"
"- Verbos compuestos (`ha sido nombrado`) — auxiliar mas participio puede confundir\n"
"- Frases con `que` subordinado (`Pablo que dirige Inditex`)\n\n"
"## Stack hibrido recomendado para `graph_explorer`\n\n"
"```\n"
"spaCy ES dep-rules → relaciones schema-less (verbos del texto, ~5ms)\n"
" +\n"
"GLiNER2 universal → entidades tipadas + relaciones de schema controlado\n"
" +\n"
"merge: para cada par (s, o), preferir el predicado de spaCy si existe;\n"
" si no, usar el de GLiNER2 (con post-filter typed)\n"
"```\n\n"
"Esto da el mejor de ambos mundos:\n"
"- Verbos del texto cuando estan claros (alta confianza linguistica)\n"
"- Schema controlado como respaldo para casos donde la sintaxis es ambigua"
))
cells.append(_md(
"## 10. Funciones a promover al registry (proximo fn-constructor)\n\n"
"1. `spacy_es_load_model_py_datascience` (impure) — wrapper cacheado\n"
"2. `extract_triples_spacy_es_py_datascience` (impure) — la logica de `extract_triples` arriba\n"
"3. `merge_openie_with_typed_py_core` (pure) — merge GLiNER2 + spaCy ES con preferencia"
))
nb = nbf.v4.new_notebook()
nb.cells = cells
nb.metadata = {
"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
"language_info": {"name": "python"},
}
NB_PATH.parent.mkdir(parents=True, exist_ok=True)
nbf.write(nb, NB_PATH)
print(f"[done] {NB_PATH} cells={len(cells)}")
if __name__ == "__main__":
build()