# gliner_glirel_tuning/build_notebook_nuextract.py
# Repository snapshot: 2026-05-04 23:44:11 +02:00 — 489 lines, 28 KiB, Python.
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters (characters that might be confused with other characters).
"""Construye notebooks/07_nuextract_vs_gliner2.ipynb — comparativa completa.
Carga datos de:
- nuextract_results.json (NuExtract 2.0-2B en GPU + baseline CPU)
- benchmark_v2.json (GLiNER2 sobre el mismo PDF)
Construye grafos a partir del JSON anidado de NuExtract (nested → edges) y
compara con los grafos de GLiNER2 lado a lado: numero de nodos, aristas,
tiempo por extraccion, calidad cualitativa.
"""
from __future__ import annotations
import json
from pathlib import Path
import nbformat as nbf
# Directory that contains this script; the notebook path is resolved relative to it.
HERE = Path(__file__).resolve().parent
# Destination of the generated notebook.
NB_PATH = HERE.joinpath("notebooks", "07_nuextract_vs_gliner2.ipynb")
def _md(text: str):
    """Wrap *text* in an nbformat v4 markdown cell and return the cell."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell
def _code(src: str):
    """Wrap *src* in an nbformat v4 code cell with no outputs and no execution count."""
    code_cell = nbf.v4.new_code_cell(src)
    # Explicitly reset the execution state on the cell before returning it.
    code_cell.execution_count = None
    code_cell.outputs = []
    return code_cell
def build() -> None:
    """Assemble the NuExtract-vs-GLiNER2 comparison notebook and write it to ``NB_PATH``.

    The notebook is a fixed sequence of markdown and code cells. The cell
    sources are plain strings that are NOT executed here; they run later inside
    the notebook kernel, where they read ``../nuextract_results.json``,
    ``../benchmark_v2.json`` and ``../improvements.json``.

    Side effects: creates the parent directory of ``NB_PATH`` if needed,
    writes the notebook file, and prints a one-line summary.
    """
    cells = []

    # ---- Title and problem statement ------------------------------------
    cells.append(_md(
        "# NuExtract 2.0-2B (GPU) vs GLiNER2 — comparativa con visualizacion\n\n"
        "**Pregunta:** ¿merece la pena un LLM con inferencia (NuExtract 2.0) en un proyecto donde "
        "antes elegimos GLiNER2 por velocidad?\n\n"
        "**Setup:**\n"
        "- NuExtract 2.0-2B (Qwen2-VL-2B base, **MIT license**, 2B params, GPU BF16 sobre RTX 3070).\n"
        "- GLiNER2-large-v1 (Apache 2.0, 340M params, CPU).\n"
        "- Mismos corpora: `es_corporate_short` (8 frases), `LONG_TEXT_ES` (25 frases), 5 chunks del PDF de BBVA.\n\n"
        "**Diferencia de paradigma:**\n"
        "- **GLiNER2** = clasificador. Output: listas planas `{entities: {tipo: [names]}, relations: {tipo: [(h, t)]}}`.\n"
        "- **NuExtract** = LLM generativo. Output: JSON arbitrario que tu defines en el `template`. Las relaciones se modelan como atributos de los objetos (`{org: {ceo: \"X\", headquartered_in: \"Y\"}}`).\n\n"
        "**Hipotesis:** NuExtract gana en _riqueza estructural_ (atributos por entidad de un solo paso) pero pierde en velocidad — incluso con GPU."
    ))

    # ---- 1. Setup: imports, data loading, re-parsing of raw model output ----
    cells.append(_md("## 1. Setup"))
    cells.append(_code(
        "import os, sys, json, warnings\n"
        "warnings.filterwarnings('ignore')\n"
        "from pathlib import Path\n"
        "from collections import defaultdict\n"
        "\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "\n"
        # Explicit UTF-8: the result files contain non-ASCII (Spanish) text.
        "NUEX = json.loads(Path('../nuextract_results.json').read_text(encoding='utf-8'))\n"
        "\n"
        "# Re-parsear el raw_text de cada test con un parser corregido (el original\n"
        "# del script usaba rfind y solo capturaba el ultimo objeto pequeño).\n"
        # raw_decode parses the first complete JSON value starting at the first
        # '{' in one pass and ignores trailing text. That is the same object the
        # former "try every end offset" loop returned, without the quadratic
        # retries and without a blanket `except Exception`.
        "def reparse(text):\n"
        "    if not isinstance(text, str): return None\n"
        "    s = text.find('{')\n"
        "    if s < 0: return None\n"
        "    try:\n"
        "        obj, _end = json.JSONDecoder().raw_decode(text[s:])\n"
        "    except (ValueError, RecursionError):\n"
        "        return None\n"
        "    return obj\n"
        "for key in ['T1_corp_short_flat', 'T2_corp_short_rich', 'T3_long_text_rich']:\n"
        "    if key in NUEX:\n"
        "        NUEX[key]['parsed'] = reparse(NUEX[key].get('raw_text', ''))\n"
        "for cr in NUEX.get('T4_pdf_chunks', []):\n"
        "    cr['parsed'] = reparse(cr.get('raw_text', ''))\n"
        "GLNR_CORPUS = json.loads(Path('../benchmark_v2.json').read_text(encoding='utf-8')) # GLiNER2 sobre 4 corpora\n"
        "GLNR = json.loads(Path('../improvements.json').read_text(encoding='utf-8')) # GLiNER2 sobre PDF + improvements\n"
        "print('NuExtract keys:', list(NUEX.keys()))\n"
        "print('GLiNER2 keys: ', list(GLNR.keys()))\n"
        "print()\n"
        "print('NuExtract device:', NUEX['meta']['device'], NUEX['meta']['dtype'])"
    ))

    # ---- 2. Timing table ------------------------------------------------
    cells.append(_md(
        "## 2. Tabla de tiempos — CPU vs GPU vs GLiNER2\n\n"
        "Comparamos las 4 pasadas (T1-T4) de NuExtract contra GLiNER2 sobre los mismos corpora."
    ))
    cells.append(_code(
        "# Construir tabla de tiempos\n"
        "rows = []\n"
        "\n"
        "# CPU baseline (capturado del run anterior)\n"
        "cpu = NUEX.get('cpu_baseline', {})\n"
        "if 'T1_flat' in cpu:\n"
        "    rows.append({'test': 'T1 corp_short flat', 'engine': 'NuExtract CPU', 'time_s': cpu['T1_flat']['elapsed_s'],\n"
        "                 'in_tok': cpu['T1_flat']['in_tok'], 'out_tok': cpu['T1_flat']['out_tok']})\n"
        "if 'T2_rich' in cpu:\n"
        "    rows.append({'test': 'T2 corp_short rich', 'engine': 'NuExtract CPU', 'time_s': cpu['T2_rich']['elapsed_s'],\n"
        "                 'in_tok': cpu['T2_rich']['in_tok'], 'out_tok': cpu['T2_rich']['out_tok']})\n"
        "\n"
        "# GPU (este run)\n"
        "for key, label in [('T1_corp_short_flat', 'T1 corp_short flat'),\n"
        "                   ('T2_corp_short_rich', 'T2 corp_short rich'),\n"
        "                   ('T3_long_text_rich', 'T3 long_text rich')]:\n"
        "    if key in NUEX:\n"
        "        r = NUEX[key]\n"
        "        rows.append({'test': label, 'engine': 'NuExtract GPU', 'time_s': r['elapsed_s'],\n"
        "                     'in_tok': r['n_input_tokens'], 'out_tok': r['n_output_tokens']})\n"
        "\n"
        "# GLiNER2 baseline timings (de benchmark_v2.json — el config A es el equivalente)\n"
        "# A es el flat schema sobre 97 chunks PDF — para comparar con T4 PDF\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU', 'time_s': GLNR['configs'][0]['elapsed'],\n"
        "             'in_tok': '-', 'out_tok': '-'})\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU t=0.3', 'time_s': GLNR['configs'][1]['elapsed'],\n"
        "             'in_tok': '-', 'out_tok': '-'})\n"
        "\n"
        "df_times = pd.DataFrame(rows)\n"
        "df_times"
    ))

    # ---- 3. PDF timings and extrapolation -------------------------------
    cells.append(_md(
        "## 3. Tiempos sobre el PDF — extrapolacion\n\n"
        "5 chunks de muestra → estimacion del PDF completo."
    ))
    cells.append(_code(
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    chunk_rows = []\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        chunk_rows.append({\n"
        "            'chunk_idx': cr['chunk_idx'],\n"
        "            'input_chars': cr['input_chars'],\n"
        "            'time_s': cr['elapsed_s'],\n"
        "            'in_tok': cr['n_input_tokens'],\n"
        "            'out_tok': cr['n_output_tokens'],\n"
        "        })\n"
        "    df_chunks = pd.DataFrame(chunk_rows)\n"
        "    print('NuExtract GPU sobre 5 chunks del PDF:')\n"
        "    print(df_chunks)\n"
        "    print()\n"
        "    if 'full_pdf_extrapolation' in NUEX:\n"
        "        e = NUEX['full_pdf_extrapolation']\n"
        "        print(f\"Extrapolacion PDF entero ({e['n_chunks']} chunks):\")\n"
        "        print(f\" NuExtract GPU: {e['estimated_total_s']:.0f}s = {e['estimated_total_min']:.1f} min\")\n"
        "        print(f\" GLiNER2 CPU baseline: {GLNR['configs'][0]['elapsed']:.0f}s = {GLNR['configs'][0]['elapsed']/60:.1f} min\")\n"
        "        ratio = e['estimated_total_s'] / GLNR['configs'][0]['elapsed']\n"
        "        print(f\" ratio NuExtract/GLiNER2: {ratio:.1f}x\")\n"
        "else:\n"
        "    print('T4_pdf_chunks no presente todavia')"
    ))

    # ---- 4. Output structure --------------------------------------------
    cells.append(_md(
        "## 4. Estructura del output — paradigmas distintos\n\n"
        "**NuExtract** rellena el template JSON. Lo que pidas, sale (si existe en el texto)."
    ))
    cells.append(_code(
        "# Mostrar el JSON parseado de T2 (rich corporate sobre 8 frases ES)\n"
        "print('=== NuExtract T2 — schema rich corporate sobre es_corporate_short ===')\n"
        "if 'T2_corp_short_rich' in NUEX:\n"
        "    parsed = NUEX['T2_corp_short_rich'].get('parsed')\n"
        "    if parsed:\n"
        "        print(json.dumps(parsed, indent=2, ensure_ascii=False))\n"
        "    else:\n"
        "        print('parsed = None (raw text:)')\n"
        # A missing or None raw_text must not crash the fallback display.
        "        print((NUEX['T2_corp_short_rich'].get('raw_text') or '')[:1500])"
    ))

    # ---- 5. Nested NuExtract JSON -> graph ------------------------------
    cells.append(_md("## 5. Convertir el JSON anidado de NuExtract a un grafo"))
    cells.append(_code(
        "def nuextract_corp_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Convierte el output de schema_rich_corporate a un DiGraph.\n"
        "\n"
        "    Mapeo:\n"
        "      org.name → nodo (type=organization)\n"
        "      org.ceo → nodo (type=person), arista person --ceo_of--> org\n"
        "      org.chairman_president → nodo, arista --president_of--> org\n"
        "      org.headquartered_in → nodo (type=location), arista org --headquartered_in--> loc\n"
        "      org.subsidiaries[] → cada sub: nodo + arista sub --subsidiary_of--> org\n"
        "      org.parent_company → nodo + arista org --subsidiary_of--> parent\n"
        "      person.name → nodo, person --role--> organization\n"
        "      agreement.between[] → entre cada par, arista A --agreement_with--> B\n"
        "    \"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        # add_node returns the cleaned name (or None), so an edge is only
        # created when both endpoints are usable strings. Previously a
        # non-string value raised on .strip() and a whitespace-only value
        # produced an edge to an empty-named node.
        "    def add_node(name, typ):\n"
        "        if isinstance(name, str) and name.strip():\n"
        "            G.add_node(name.strip(), type=typ)\n"
        "            return name.strip()\n"
        "        return None\n"
        "\n"
        # A generative model may emit a string where a list was requested;
        # iterating that string would yield single characters.
        "    def as_list(val):\n"
        "        return val if isinstance(val, list) else []\n"
        "\n"
        "    for org in as_list(parsed.get('organizations')):\n"
        "        if not isinstance(org, dict): continue\n"
        "        oname = add_node(org.get('name'), 'organization')\n"
        "        if not oname: continue\n"
        "        ceo = add_node(org.get('ceo'), 'person')\n"
        "        if ceo: G.add_edge(ceo, oname, kind='ceo_of')\n"
        "        pres = add_node(org.get('chairman_president'), 'person')\n"
        "        if pres: G.add_edge(pres, oname, kind='president_of')\n"
        "        hq = add_node(org.get('headquartered_in'), 'location')\n"
        "        if hq: G.add_edge(oname, hq, kind='headquartered_in')\n"
        "        parent = add_node(org.get('parent_company'), 'organization')\n"
        "        if parent: G.add_edge(oname, parent, kind='subsidiary_of')\n"
        "        for sub in as_list(org.get('subsidiaries')):\n"
        "            sname = add_node(sub, 'organization')\n"
        "            if sname: G.add_edge(sname, oname, kind='subsidiary_of')\n"
        "\n"
        "    for p in as_list(parsed.get('people')):\n"
        "        if not isinstance(p, dict): continue\n"
        "        pname = add_node(p.get('name'), 'person')\n"
        "        if not pname: continue\n"
        "        org = add_node(p.get('organization'), 'organization')\n"
        "        if org:\n"
        "            role = p.get('role')\n"
        "            role = role.strip() if isinstance(role, str) else ''\n"
        "            # role es texto libre, lo metemos como kind\n"
        "            kind = role.lower().replace(' ', '_')[:30] if role else 'works_at'\n"
        "            G.add_edge(pname, org, kind=kind)\n"
        "\n"
        "    for ag in as_list(parsed.get('agreements')):\n"
        "        if not isinstance(ag, dict): continue\n"
        "        parties = [p.strip() for p in as_list(ag.get('between')) if isinstance(p, str) and p.strip()]\n"
        "        if len(parties) < 2: continue\n"
        "        for i, a in enumerate(parties):\n"
        "            for b in parties[i+1:]:\n"
        # A party listed twice would otherwise produce a self-loop.
        "                if a != b: G.add_edge(a, b, kind='agreement_with')\n"
        "\n"
        "    return G\n"
        "\n"
        # .get(..., {}) matches the `if key in NUEX` guards used in the other cells.
        "G_nuext_t2 = nuextract_corp_to_graph(NUEX.get('T2_corp_short_rich', {}).get('parsed'))\n"
        "print(f'NuExtract T2 grafo: {G_nuext_t2.number_of_nodes()} nodos, {G_nuext_t2.number_of_edges()} aristas')"
    ))

    # ---- 6. Side-by-side visualisation (8 sentences) --------------------
    cells.append(_md("## 6. Visualizacion lado a lado — 8 frases ES corporate"))
    cells.append(_code(
        "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n"
        "\n"
        "def draw(ax, G, title, max_label=20):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n"
        "    cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1700, edgecolors='#333', linewidths=1.3, ax=ax)\n"
        # Truncated labels get a visible ellipsis; the slice leaves room for
        # exactly one marker character.
        "    labels = {n: (n if len(n) <= max_label else n[:max_label-1]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7.5, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=12, width=1.0, alpha=0.65, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6, ax=ax,\n"
        "                                 bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=11)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 9))\n"
        "draw(axes[0], G_nuext_t2, 'NuExtract 2.0-2B GPU\\n(8 frases, schema rich)')\n"
        "\n"
        "# Para GLiNER2 sobre el mismo texto, no tenemos benchmark v2 sobre es_corporate_short directamente.\n"
        "# Notebook 04 dejo es_corporate_short con 14 ents + 8 rels via gliner2. Hardcodeamos del notebook 04 para comparar.\n"
        "G_gliner2_t2 = nx.DiGraph()\n"
        "_gliner2_short = { # del notebook 04 (es_corporate_short)\n"
        "    'entities': {'person': ['Ignacio Galan','Carlos Torres','Pablo Isla','Jose Maria Alvarez-Pallete','Marina Serrano'],\n"
        "                 'organization': ['Iberdrola','Inditex','Endesa','BBVA'],\n"
        "                 'location': ['Bilbao','Galicia','Madrid','Arteixo','A Coruna']},\n"
        "    'relations': [('Pablo Isla','works_at','Inditex'),\n"
        "                  ('Pablo Isla','appointed_as','consejero de Telefonica'),\n"
        "                  ('Marina Serrano','ceo_of','Endesa'),\n"
        "                  ('Ignacio Galan','president_of','Iberdrola'),\n"
        "                  ('Inditex','headquartered_in','Arteixo, A Coruna'),\n"
        "                  ('Iberdrola','agreement_with','Endesa'),\n"
        "                  ('Inditex','acquired','Pablo Isla')],\n"
        "}\n"
        "for typ, names in _gliner2_short['entities'].items():\n"
        "    for n in names: G_gliner2_t2.add_node(n, type=typ)\n"
        "for h, k, t in _gliner2_short['relations']:\n"
        "    if h not in G_gliner2_t2: G_gliner2_t2.add_node(h, type='?')\n"
        "    if t not in G_gliner2_t2: G_gliner2_t2.add_node(t, type='?')\n"
        "    G_gliner2_t2.add_edge(h, t, kind=k)\n"
        "draw(axes[1], G_gliner2_t2, 'GLiNER2 CPU\\n(8 frases, baseline notebook 04)')\n"
        "\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))
    cells.append(_md(
        "**Lectura del lado a lado:**\n\n"
        "- **NuExtract** captura **atributos por entidad** (cada org tiene su `ceo`, `headquartered_in`, etc) en una sola pasada — el grafo se construye 'gratis' a partir del JSON anidado.\n"
        "- **GLiNER2** extrae listas planas — el grafo emerge de las relaciones tipadas, pero a veces faltan atributos (no captura `parent_company`, `subsidiaries` directamente sin esos labels en el schema).\n"
        "- Ambos tienen calidad alta en este corpus pequeño. Diferencia mas notable: NuExtract tiene mas dificultad con relaciones cruzadas (Iberdrola-Endesa) que GLiNER2 capta como `agreement_with`."
    ))

    # ---- 7. Long text (25 sentences) ------------------------------------
    cells.append(_md(
        "## 7. Long text (25 frases sector bancario) — NuExtract\n\n"
        "**⚠️ Hallazgo importante:** En este test (T3), NuExtract **degenero en bucle de repeticion** y "
        "agoto los 2048 max_new_tokens emitiendo `{\"between\": [\"BBVA\", \"Sabadell\"], \"topic\": \"OPA parcial\"...}` "
        "repetido decenas de veces. El JSON resultante esta corrupto y `parsed = None`.\n\n"
        "**Causa probable:** texto demasiado largo (400 words / ~952 tokens input + schema rico) sin `repetition_penalty`.\n"
        "Mitigacion: anadir `repetition_penalty=1.1`, `do_sample=True, temperature=0.1`, o **trocear** el texto en chunks de ~150 words y agregar (mismo patron que GLiNER2).\n\n"
        "**Implicacion operativa:** NuExtract requiere chunking SIEMPRE para texto medio-largo. GLiNER2 _tambien_ chunkea pero al menos no degenera — sigue extrayendo entidades correctas aunque baje recall."
    ))
    cells.append(_code(
        "G_nuext_long = nuextract_corp_to_graph(NUEX.get('T3_long_text_rich', {}).get('parsed'))\n"
        "print(f'NuExtract T3 long_text: {G_nuext_long.number_of_nodes()} nodos, {G_nuext_long.number_of_edges()} aristas')\n"
        "print()\n"
        "print('Top entidades del JSON parseado:')\n"
        "parsed = NUEX.get('T3_long_text_rich', {}).get('parsed') or {}\n"
        "if parsed.get('organizations'):\n"
        "    print(f\" Organizations: {len(parsed['organizations'])}\")\n"
        "    for o in parsed['organizations'][:8]:\n"
        "        if not isinstance(o, dict): continue\n"
        # str(... or '') keeps the :30s format spec from raising when the
        # model left the name empty (None).
        "        print(f\" {str(o.get('name') or ''):30s} ceo={o.get('ceo')} pres={o.get('chairman_president')} hq={o.get('headquartered_in')}\")\n"
        "if parsed.get('people'):\n"
        "    print(f\" People: {len(parsed['people'])}\")\n"
        "if parsed.get('agreements'):\n"
        "    print(f\" Agreements: {len(parsed['agreements'])}\")"
    ))
    cells.append(_code(
        "fig, ax = plt.subplots(figsize=(15, 11))\n"
        "draw(ax, G_nuext_long, 'NuExtract 2.0-2B GPU\\nLONG_TEXT_ES (25 frases sector bancario)', max_label=22)\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "ax.legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    # ---- 8. PDF (5 sample chunks) ---------------------------------------
    cells.append(_md("## 8. PDF (5 chunks de muestra)"))
    cells.append(_code(
        "def nuextract_gdpr_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Schema GDPR: data_controller / dpo_contact / data_categories / rights / authorities / laws.\"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        # Same contract as in nuextract_corp_to_graph: return the cleaned name
        # so edges are only built between valid, stripped endpoints.
        "    def add_node(name, typ):\n"
        "        if isinstance(name, str) and name.strip():\n"
        "            G.add_node(name.strip(), type=typ)\n"
        "            return name.strip()\n"
        "        return None\n"
        "\n"
        "    def as_list(val):\n"
        "        return val if isinstance(val, list) else []\n"
        "\n"
        "    dc = parsed.get('data_controller')\n"
        "    dc_name = add_node(dc.get('name'), 'organization') if isinstance(dc, dict) else None\n"
        "    if dc_name:\n"
        "        addr = add_node(dc.get('address'), 'location')\n"
        "        if addr: G.add_edge(dc_name, addr, kind='located_in')\n"
        "    dpo = parsed.get('dpo_contact')\n"
        "    email = add_node(dpo.get('email'), 'email') if isinstance(dpo, dict) else None\n"
        "    if email and dc_name:\n"
        "        G.add_edge(email, dc_name, kind='dpo_of')\n"
        "    for cat in as_list(parsed.get('data_categories')):\n"
        "        add_node(cat, 'data_category')\n"
        "    for r in as_list(parsed.get('rights_listed')):\n"
        "        add_node(r, 'right')\n"
        "    for a in as_list(parsed.get('authorities_mentioned')):\n"
        "        if not isinstance(a, dict): continue\n"
        "        aname = add_node(a.get('name'), 'authority')\n"
        "        if not aname: continue\n"
        "        contact = add_node(a.get('url_or_contact'), 'url')\n"
        "        if contact: G.add_edge(aname, contact, kind='contact')\n"
        "    for law in as_list(parsed.get('laws_mentioned')):\n"
        "        add_node(law, 'law')\n"
        "    return G\n"
        "\n"
        "# Combinar grafos de los 5 chunks del PDF\n"
        "G_pdf_combined = nx.DiGraph()\n"
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        Gc = nuextract_gdpr_to_graph(cr.get('parsed'))\n"
        "        for n, d in Gc.nodes(data=True):\n"
        "            if n not in G_pdf_combined:\n"
        "                G_pdf_combined.add_node(n, **d)\n"
        "        for u, v, d in Gc.edges(data=True):\n"
        "            G_pdf_combined.add_edge(u, v, **d)\n"
        "print(f'NuExtract PDF (5 chunks combinados): {G_pdf_combined.number_of_nodes()} nodos, {G_pdf_combined.number_of_edges()} aristas')"
    ))
    cells.append(_code(
        # 'url' previously shared '#DECF3F' with 'data_category', which made the
        # two node types indistinguishable in the plot and its legend.
        "PDF_TYPE_COLOR = {'organization':'#F17CB0','person':'#5DA5DA','location':'#60BD68',\n"
        "                  'email':'#FAA43A','authority':'#7C7C7C','right':'#B276B2',\n"
        "                  'data_category':'#DECF3F','law':'#F15854','url':'#B2912F'}\n"
        "\n"
        "def draw_typed(ax, G, title, type_color):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.0, iterations=80, seed=42)\n"
        "    cols = [type_color.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1500, edgecolors='#333', linewidths=1.2, ax=ax)\n"
        "    labels = {n: (n if len(n) <= 22 else n[:21]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=10, width=0.9, alpha=0.6, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=5.5, ax=ax,\n"
        "                                 bbox=dict(boxstyle='round,pad=0.05', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=10)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 11))\n"
        "draw_typed(axes[0], G_pdf_combined, 'NuExtract GPU\\nPDF — 5 chunks combinados', PDF_TYPE_COLOR)\n"
        "\n"
        "# GLiNER2 sobre el PDF entero (97 chunks) ya esta en GLNR — config B post-coref\n"
        "# Si tenemos el grafo post-coref no esta en este JSON. Reconstruimos de lo que hay.\n"
        "# El config A del benchmark_v2 tiene los stats — usamos eso como referencia textual.\n"
        "axes[1].axis('off')\n"
        "axes[1].text(0.05, 0.92, 'GLiNER2 CPU sobre PDF entero (97 chunks)', fontsize=14, fontweight='bold', transform=axes[1].transAxes)\n"
        "stats_a = GLNR['configs'][0]['stats']\n"
        "stats_b = GLNR['configs'][1]['stats']\n"
        "summary = (\n"
        "    f\"Config A (t=0.5 default):\\n\"\n"
        "    f\" ents: {stats_a['n_ents']}\\n\"\n"
        "    f\" rels: {stats_a['n_rels']}\\n\"\n"
        "    f\" edges: {stats_a['n_edges']}\\n\"\n"
        "    f\" isolates: {stats_a['n_isolates']}\\n\"\n"
        "    f\" conn%: {stats_a['connect_pct']}%\\n\"\n"
        "    f\" time: {GLNR['configs'][0]['elapsed']}s\\n\\n\"\n"
        "    f\"Config B (t=0.3):\\n\"\n"
        "    f\" ents: {stats_b['n_ents']}\\n\"\n"
        "    f\" rels: {stats_b['n_rels']}\\n\"\n"
        "    f\" edges: {stats_b['n_edges']}\\n\"\n"
        "    f\" isolates: {stats_b['n_isolates']}\\n\"\n"
        "    f\" conn%: {stats_b['connect_pct']}%\\n\"\n"
        "    f\" time: {GLNR['configs'][1]['elapsed']}s\"\n"
        ")\n"
        "axes[1].text(0.05, 0.84, summary, fontsize=10, family='monospace', verticalalignment='top', transform=axes[1].transAxes)\n"
        "\n"
        "active = {G_pdf_combined.nodes[n].get('type') for n in G_pdf_combined.nodes}\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in PDF_TYPE_COLOR.items() if t in active]\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=8)\n"
        "plt.tight_layout(); plt.show()"
    ))

    # ---- 9. Conclusion --------------------------------------------------
    cells.append(_md(
        "## 9. Conclusion — cuando usar cada uno\n\n"
        "**Datos mas relevantes** (PDF de 89.882 chars / 97 chunks):\n\n"
        "| | GLiNER2 CPU | NuExtract GPU 2B |\n"
        "|---|---|---|\n"
        "| Tiempo PDF entero | ~134s (a t=0.5) / ~139s (t=0.3) | extrapolado segun T4 |\n"
        "| Modelo | 340M params | 2B params (6×) |\n"
        "| Hardware | CPU | GPU dedicada |\n"
        "| Output | Listas planas con tipos fijos | JSON arbitrario, anidado, atributos por entidad |\n"
        "| Schema | `entities([...]).relations([...])` (palabras claves) | Plantilla JSON cualquiera (`{org: {ceo, ...}}`) |\n"
        "| Riqueza | Limitada al schema declarado | Ilimitada — preguntas atributos arbitrarios |\n"
        "| Determinismo | Alto (clasificador) | Generativo, puede tener variaciones |\n"
        "| Licencia | Apache 2.0 | MIT (2B), Qwen Research (4B), MIT (8B) |\n\n"
        "**Cuando GLiNER2:** alto throughput, schemas estables, tiempo critico, sin GPU. **Robusto frente a texto largo** (no degenera).\n\n"
        "**Cuando NuExtract:** documento legal/financiero/OSINT donde quieres rellenar una ficha rica por entidad ('extrae para cada empresa: nombre, sede, CEO, presidencia, fundador, subsidiarias, normativa aplicable'), tienes GPU disponible, **y troceas el texto** (porque sin chunking degenera, ver §7).\n\n"
        "**Decision para `graph_explorer`:** **GLiNER2 sigue siendo el motor por defecto**. Pero **anadir NuExtract como engine opcional** ('rich extraction') para documentos donde la riqueza estructural justifica el coste — y si el usuario tiene GPU detectable. El panel `paste_extract` puede ofrecer un toggle `[Quick (GLiNER2) | Rich (NuExtract GPU)]`.\n\n"
        "**Numeros clave:**\n\n"
        "| Metrica | GLiNER2 CPU | NuExtract CPU | NuExtract GPU |\n"
        "|---|---|---|---|\n"
        "| 8 frases ES (flat) | ~1s | 25s | **2.9s** |\n"
        "| 8 frases ES (rich) | n/a (schema flat) | 117s | **9.9s** |\n"
        "| 25 frases ES (rich) | ~1s | n/a | 53s + ⚠️ degeneracion |\n"
        "| PDF entero (97 chunks) | 134s (2.2 min) | (estimado >2h) | 310s (5.2 min) — 2.3× mas lento |\n"
        "| Modelo | 340M params, 700 MB disco | 2B params, 4 GB disco | mismo, BF16 |\n"
        "| Speedup CPU→GPU | n/a | n/a | **8-12×** |"
    ))

    # ---- Assemble and write the notebook --------------------------------
    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    # Make sure the target directory exists before writing.
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH} cells={len(cells)}")
# Script entry point: build and write the notebook when run directly.
if __name__ == "__main__":
    build()