b8c760d004
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
330 lines
16 KiB
Python
330 lines
16 KiB
Python
"""Construye notebooks/09_spacy_es_openie.ipynb — extraccion OpenIE-style
schema-less en castellano usando spaCy es_core_news_md + reglas de dependencia.

Live execution (spaCy es rapidisimo).
"""
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
import nbformat as nbf
|
||
|
||
# Directory that contains this builder script; the output path is anchored
# to it rather than to the current working directory.
HERE = Path(__file__).resolve().parent
# Destination of the generated notebook.
NB_PATH = HERE.joinpath("notebooks", "09_spacy_es_openie.ipynb")
|
||
|
||
|
||
def _md(t: str):
    """Wrap the markdown text ``t`` in a new nbformat v4 markdown cell."""
    markdown_cell = nbf.v4.new_markdown_cell(t)
    return markdown_cell
|
||
def _code(s: str):
    """Create an nbformat v4 code cell from source ``s`` with no stored results.

    The cell's ``outputs`` list is emptied and its ``execution_count`` is set
    to ``None`` before the cell is returned.
    """
    fresh = nbf.v4.new_code_cell(s)
    fresh.outputs = []
    fresh.execution_count = None
    return fresh
|
||
|
||
|
||
def build() -> None:
    """Assemble the spaCy-ES OpenIE notebook and write it to ``NB_PATH``.

    The notebook is built as an ordered list of markdown and code cells
    (created through ``_md`` and ``_code``). The parent directory of
    ``NB_PATH`` is created if missing, the notebook is written with
    ``nbf.write`` and a one-line summary is printed.

    The source of every code cell is stored as adjacent string literals, one
    literal per notebook line. The leading whitespace inside those literals is
    the indentation of the generated Python code, so it must be kept
    consistent (4 spaces per level) for the cells to be runnable.
    """
    cells = []

    # --- Intro: motivation and comparison against translate + EN extraction ---
    cells.append(_md(
        "# OpenIE en castellano — spaCy ES + reglas de dependencia\n\n"
        "**Paradigma:** schema-less. El predicado es **el verbo del propio texto**, no de un vocabulario fijo.\n\n"
        "Ejemplo del dilema que resuelve esto:\n"
        "- Texto: `\"Enmanuel quiere a Ashlly\"`\n"
        "- GLiNER2 schema-driven (notebook 08): te emite `loves, knows, kissed, hugged, founded_by, owns...` — fuerza relaciones del schema\n"
        "- spaCy ES dep-rules: `(Enmanuel, querer, Ashlly)` — el verbo `querer` viene del texto\n\n"
        "## Por que spaCy ES nativo y NO 'translate + triplet-extract EN'\n\n"
        "| | spaCy ES nativo | Translate + triplet-extract EN |\n"
        "|---|---|---|\n"
        "| Velocidad | ~5ms / frase | ~500ms-1s / frase (MarianMT + extract) |\n"
        "| Predicado | Verbo original (`querer`, `abrazar`) | Verbo en EN (`loves`, `hugs`) — perdida del original |\n"
        "| Riesgo nombres propios | Cero | Traduccion puede romperlos (Enmanuel → Emmanuel) |\n"
        "| RAM extra | 50MB (es_core_news_md) | 300MB extra (MarianMT) |\n"
        "| Schema-less de verdad | SI | SI |\n"
        "| Maturity | Reglas hay que escribirlas | triplet-extract maduro pero EN-only |"
    ))

    # --- 1. Setup: imports, registry path tweak and spaCy model load ---
    cells.append(_md("## 1. Setup"))

    cells.append(_code(
        "import warnings; warnings.filterwarnings('ignore')\n"
        "import sys, json, time\n"
        "from pathlib import Path\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "import spacy\n"
        "\n"
        "t0 = time.time()\n"
        "nlp = spacy.load('es_core_news_md')\n"
        "print(f'spaCy es_core_news_md ready in {time.time()-t0:.2f}s ({sum(1 for _ in nlp.pipeline)} pipes)')"
    ))

    # --- 2. Dependency-rule triple extraction ---
    cells.append(_md(
        "## 2. Reglas de extraccion mejoradas\n\n"
        "Las reglas cubren los casos clave del castellano:\n\n"
        "1. **Sujeto + verbo + objeto directo** (`obj`)\n"
        "2. **\"a\" personal** (`obl:agent` o `obl` con prep `a` sobre persona) — `abrazo a Tomas`\n"
        "3. **Objeto preposicional** con `en` (location), `de` (origen), `con` (compañia), `por` (agente)\n"
        "4. **Copular** (`ser`, `estar`) — `Pablo es presidente`\n"
        "5. **Verbos pronominales** (`se firmo`)\n"
        "6. **Filtrar tripletas con sujeto/objeto vacio o solo determinantes**"
    ))

    # NOTE: the leading spaces inside these literals are the indentation of
    # the generated cell code; keep them at 4 spaces per nesting level.
    cells.append(_code(
        "STOPS = {'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas',\n"
        "         'esto', 'eso', 'aquello', 'esta', 'este', 'estos', 'estas',\n"
        "         'que', 'quien', 'cual'}\n"
        "\n"
        "def clean_span(span_tokens):\n"
        "    \"\"\"Devuelve el texto del span quitando determinantes/preps al inicio si hace falta.\"\"\"\n"
        "    toks = list(span_tokens)\n"
        "    # quitar preposiciones iniciales (a, en, de, con, por...)\n"
        "    while toks and toks[0].pos_ == 'ADP':\n"
        "        toks = toks[1:]\n"
        "    return ' '.join(t.text for t in toks).strip()\n"
        "\n"
        "def is_meaningful(text):\n"
        "    if not text or not text.strip(): return False\n"
        "    if text.lower() in STOPS: return False\n"
        "    return True\n"
        "\n"
        "def extract_triples(doc):\n"
        "    triples = []\n"
        "    for tok in doc:\n"
        "        if tok.pos_ not in ('VERB', 'AUX'):\n"
        "            continue\n"
        "        verb_lemma = tok.lemma_\n"
        "        verb_form = tok.text\n"
        "\n"
        "        # SUJETO\n"
        "        subjs = [c for c in tok.children if c.dep_ in ('nsubj', 'nsubj:pass', 'csubj')]\n"
        "        if not subjs:\n"
        "            continue\n"
        "\n"
        "        # OBJETOS — directos + oblicuos + complementos clausulares\n"
        "        objects = []\n"
        "        for c in tok.children:\n"
        "            if c.dep_ in ('obj', 'dobj', 'iobj', 'attr', 'xcomp', 'ccomp'):\n"
        "                objects.append((c, c.dep_, None))\n"
        "            elif c.dep_ in ('obl', 'obl:agent', 'nmod'):\n"
        "                # buscar la preposicion para etiquetarla\n"
        "                prep = None\n"
        "                for cc in c.children:\n"
        "                    if cc.dep_ == 'case' and cc.pos_ == 'ADP':\n"
        "                        prep = cc.text.lower(); break\n"
        "                objects.append((c, c.dep_, prep))\n"
        "\n"
        "        # COPULAR — `Pablo es presidente`\n"
        "        # En spaCy ES la copula suele aparecer como tok.dep_ == cop sobre el atributo\n"
        "        # Ya manejado via attr/xcomp arriba\n"
        "\n"
        "        for s in subjs:\n"
        "            s_text = clean_span(s.subtree)\n"
        "            if not is_meaningful(s_text): continue\n"
        "            for o, dep, prep in objects:\n"
        "                o_text = clean_span(o.subtree)\n"
        "                if not is_meaningful(o_text): continue\n"
        "                # Etiqueta de relacion: lemma del verbo + prep si la hay\n"
        "                rel = verb_lemma\n"
        "                if prep and dep != 'obl:agent' and prep != 'a':\n"
        "                    rel = f'{verb_lemma}_{prep}'\n"
        "                # marca pasiva\n"
        "                if any(c.dep_ == 'nsubj:pass' for c in tok.children):\n"
        "                    rel = f'{verb_lemma}[pass]'\n"
        "                triples.append({\n"
        "                    'subject': s_text,\n"
        "                    'relation': rel,\n"
        "                    'object': o_text,\n"
        "                    'verb_form': verb_form,\n"
        "                    'object_dep': dep,\n"
        "                    'prep': prep,\n"
        "                })\n"
        "    return triples\n"
        "\n"
        "print('extract_triples ready')"
    ))

    # --- 3. Test corpus ---
    cells.append(_md(
        "## 3. Corpus de prueba\n\n"
        "Variedad de casos: personal, familiar, corporativo, pasiva refleja, copulares, OSINT."
    ))

    cells.append(_code(
        "CORPUS = {\n"
        "    'personal_amor': 'Enmanuel quiere a Ashlly desde hace anos.',\n"
        "    'personal_familia': 'Maria abrazo a su hermano Tomas tras la reunion.',\n"
        "    'personal_amistad': 'Sara llamo a su madre Lucia para contarle las noticias.',\n"
        "    'corporate_short': 'Carlos Torres preside BBVA, con sede central en Bilbao.',\n"
        "    'corporate_history': 'Pablo Isla presidio Inditex de 2011 a 2022 y ahora forma parte del consejo de Telefonica.',\n"
        "    'pasiva_refleja': 'Se firmaron acuerdos entre Iberdrola y Endesa.',\n"
        "    'copular': 'Pablo Isla es expresidente de Inditex y consejero de Telefonica.',\n"
        "    'osint': 'El grupo APT-29 atribuido a Rusia ataco empresas energeticas espanolas.',\n"
        "    'biografico': 'Amancio Ortega fundo Inditex en 1985 en Arteixo.',\n"
        "    'evento': 'El acuerdo movilizara dos mil millones en cinco anos.',\n"
        "}\n"
        "for k, v in CORPUS.items():\n"
        "    print(f'{k:20s} → {v}')"
    ))

    # --- 4. Run the extractor over the corpus and summarise counts/timings ---
    cells.append(_md("## 4. Ejecutar — un texto, ver tripletas y entidades NER"))

    cells.append(_code(
        "results = {}\n"
        "for name, text in CORPUS.items():\n"
        "    t0 = time.time()\n"
        "    doc = nlp(text)\n"
        "    triples = extract_triples(doc)\n"
        "    elapsed = time.time() - t0\n"
        "    ents = [{'text': e.text, 'label': e.label_} for e in doc.ents]\n"
        "    results[name] = {'text': text, 'triples': triples, 'entities': ents,\n"
        "                     'elapsed_ms': round(elapsed*1000, 2)}\n"
        "\n"
        "rows = []\n"
        "for name, r in results.items():\n"
        "    rows.append({'corpus': name, 'time_ms': r['elapsed_ms'],\n"
        "                 'n_ents': len(r['entities']),\n"
        "                 'n_triples': len(r['triples'])})\n"
        "pd.DataFrame(rows)"
    ))

    # --- 5. Per-text dump of entities and triples ---
    cells.append(_md("## 5. Tripletas extraidas por texto"))

    cells.append(_code(
        "for name, r in results.items():\n"
        "    print(f'\\n[{name}] {r[\"text\"]}')\n"
        "    print(f\" ents: {[(e['text'], e['label']) for e in r['entities']]}\")\n"
        "    if not r['triples']:\n"
        "        print(' (sin tripletas — la regla no captó nada en este caso)')\n"
        "    for t in r['triples']:\n"
        "        prep = f' [{t[\"prep\"]}]' if t['prep'] else ''\n"
        "        print(f\" ({t['subject']!r}, {t['relation']!r}{prep}, {t['object']!r})\")"
    ))

    # --- 6. Flattened triples as a DataFrame ---
    cells.append(_md(
        "## 6. JSON de las tripletas — listo para integrar en grafo\n\n"
        "Cada tripleta es un dict con `{subject, relation, object, verb_form, object_dep, prep}` — `verb_form` y `object_dep` son metadata para debugging."
    ))

    cells.append(_code(
        "all_triples = []\n"
        "for name, r in results.items():\n"
        "    for t in r['triples']:\n"
        "        all_triples.append({**t, 'source': name})\n"
        "df = pd.DataFrame(all_triples)\n"
        "print(f'TOTAL: {len(df)} tripletas en {len(results)} textos')\n"
        "df[['subject', 'relation', 'object', 'verb_form', 'prep', 'source']]"
    ))

    # --- 7. Combined graph visualisation ---
    cells.append(_md("## 7. Visualizacion — grafo combinado de todas las tripletas"))

    cells.append(_code(
        "G = nx.DiGraph()\n"
        "for t in all_triples:\n"
        "    s = t['subject']; o = t['object']\n"
        "    G.add_node(s); G.add_node(o)\n"
        "    if not G.has_edge(s, o):\n"
        "        G.add_edge(s, o, kind=t['relation'])\n"
        "\n"
        "fig, ax = plt.subplots(figsize=(15, 11))\n"
        "if G.number_of_nodes():\n"
        "    pos = nx.spring_layout(G, k=2.0, iterations=100, seed=42)\n"
        "    nx.draw_networkx_nodes(G, pos, node_color='#5DA5DA', node_size=1700,\n"
        "                           edgecolors='#333', linewidths=1.3, ax=ax)\n"
        "    labels = {n: (n if len(n) <= 22 else n[:21]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=8, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=14,\n"
        "                           width=1.2, alpha=0.7, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u, v): d['kind'] for u, v, d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=7, ax=ax,\n"
        "                                 bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "ax.set_title(f'spaCy ES OpenIE — {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=12)\n"
        "ax.axis('off'); plt.tight_layout(); plt.show()"
    ))

    # --- 8. Side-by-side comparison against GLiNER2 with a fixed schema ---
    cells.append(_md(
        "## 8. Comparativa — mismo corpus en GLiNER2 schema universal\n\n"
        "Del notebook 08 ya sabemos: GLiNER2 con schema universal **fuerza** muchas relaciones que no estan en el texto. Aqui re-ejecutamos para tener la cifra concreta y comparar."
    ))

    cells.append(_code(
        "# Cargar GLiNER2 una sola vez si no esta cargado\n"
        "from gliner2 import GLiNER2\n"
        "t0 = time.time()\n"
        "gl2 = GLiNER2.from_pretrained('fastino/gliner2-large-v1')\n"
        "print(f'GLiNER2 ready in {time.time()-t0:.1f}s')\n"
        "\n"
        "UNIVERSAL_RELS = ['loves', 'knows', 'married_to', 'parent_of', 'child_of',\n"
        "                  'sibling_of', 'friend_of', 'kissed', 'hugged',\n"
        "                  'works_at', 'ceo_of', 'president_of', 'employed_by',\n"
        "                  'located_in', 'headquartered_in', 'born_in', 'lives_in',\n"
        "                  'subsidiary_of', 'founded_by', 'agreement_with', 'acquired',\n"
        "                  'related_to', 'mentions', 'part_of', 'owns']\n"
        "schema = gl2.create_schema().entities(['person', 'organization', 'location', 'date', 'event']).relations(UNIVERSAL_RELS)\n"
        "\n"
        "comp = []\n"
        "for name, text in CORPUS.items():\n"
        "    t0 = time.time()\n"
        "    g = gl2.extract(text, schema=schema, threshold=0.3)\n"
        "    g_time = time.time() - t0\n"
        "    n_g_rels = sum(len(v) for v in g['relation_extraction'].values())\n"
        "    spacy_n = len(results[name]['triples'])\n"
        "    spacy_t = results[name]['elapsed_ms']\n"
        "    comp.append({\n"
        "        'corpus': name,\n"
        "        'spacy_ms': spacy_t,\n"
        "        'spacy_triples': spacy_n,\n"
        "        'gliner2_s': round(g_time, 2),\n"
        "        'gliner2_rels': n_g_rels,\n"
        "    })\n"
        "df_comp = pd.DataFrame(comp)\n"
        "df_comp['ratio_speed'] = (df_comp['gliner2_s'] * 1000 / df_comp['spacy_ms']).round(1)\n"
        "df_comp"
    ))

    # --- 9. Conclusions and recommended hybrid stack ---
    cells.append(_md(
        "## 9. Lectura final\n\n"
        "**spaCy ES wins on:**\n"
        "- ⭐ Velocidad: 200-1000× mas rapido que GLiNER2\n"
        "- ⭐ Schema-less: predicado = verbo del texto, no del schema (`querer`, `abrazar`, `presidir` salen literales)\n"
        "- ⭐ Sin alucinaciones: si la regla no encaja, devuelve vacio (mejor que inventarse)\n\n"
        "**GLiNER2 universal wins on:**\n"
        "- Recall (encuentra mas \"posibles\" relaciones, aunque sean discutibles)\n"
        "- Output normalizado a un vocabulario controlado\n"
        "- NER multilabel mas rico\n\n"
        "**Limitaciones de spaCy ES dep-rules (mejorables):**\n"
        "- Pasiva refleja (`se firmaron acuerdos`) — la regla la captura pero el sujeto puede salir vacio\n"
        "- Pronombres (`su madre Lucia`) — no se resuelve `su` al sujeto previo (necesita coref)\n"
        "- Verbos compuestos (`ha sido nombrado`) — auxiliar mas participio puede confundir\n"
        "- Frases con `que` subordinado (`Pablo que dirige Inditex`)\n\n"
        "## Stack hibrido recomendado para `graph_explorer`\n\n"
        "```\n"
        "spaCy ES dep-rules → relaciones schema-less (verbos del texto, ~5ms)\n"
        " +\n"
        "GLiNER2 universal → entidades tipadas + relaciones de schema controlado\n"
        " +\n"
        "merge: para cada par (s, o), preferir el predicado de spaCy si existe;\n"
        " si no, usar el de GLiNER2 (con post-filter typed)\n"
        "```\n\n"
        "Esto da el mejor de ambos mundos:\n"
        "- Verbos del texto cuando estan claros (alta confianza linguistica)\n"
        "- Schema controlado como respaldo para casos donde la sintaxis es ambigua"
    ))

    # --- 10. Follow-up: functions to promote to the registry ---
    cells.append(_md(
        "## 10. Funciones a promover al registry (proximo fn-constructor)\n\n"
        "1. `spacy_es_load_model_py_datascience` (impure) — wrapper cacheado\n"
        "2. `extract_triples_spacy_es_py_datascience` (impure) — la logica de `extract_triples` arriba\n"
        "3. `merge_openie_with_typed_py_core` (pure) — merge GLiNER2 + spaCy ES con preferencia"
    ))

    # Wrap the cells in a v4 notebook with a plain Python 3 kernelspec.
    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    # Make sure the output directory exists before writing the notebook file.
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH} cells={len(cells)}")
|
||
|
||
|
||
# Generate the notebook only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    build()
|