b8c760d004
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
489 lines
28 KiB
Python
489 lines
28 KiB
Python
"""Build notebooks/07_nuextract_vs_gliner2.ipynb — the full comparison.

Loads data from:
  - nuextract_results.json (NuExtract 2.0-2B on GPU + CPU baseline)
  - benchmark_v2.json (GLiNER2 on the same PDF)
  - improvements.json (also read by the generated notebook for the GLiNER2
    PDF configs)

Builds graphs from NuExtract's nested JSON (nested → edges) and compares
them side by side with the GLiNER2 graphs: number of nodes, edges,
time per extraction, qualitative quality.
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import nbformat as nbf
|
||
|
||
HERE = Path(__file__).resolve().parent
|
||
NB_PATH = HERE / "notebooks" / "07_nuextract_vs_gliner2.ipynb"
|
||
|
||
|
||
def _md(text: str):
    """Wrap ``text`` in a new nbformat v4 markdown cell and return it."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell
|
||
|
||
|
||
def _code(src: str):
    """Return a new nbformat v4 code cell holding ``src``.

    The cell is explicitly reset to an un-executed state (no outputs, no
    execution count) before it is returned.
    """
    fresh = nbf.v4.new_code_cell(src)
    fresh.execution_count = None
    fresh.outputs = []
    return fresh
|
||
|
||
|
||
def build() -> None:
    """Assemble the NuExtract-vs-GLiNER2 comparison notebook and write it to NB_PATH.

    The notebook is built as a flat list of markdown and code cells (created
    with ``_md`` / ``_code``), wrapped in a v4 notebook with a Python 3
    kernelspec, and written with ``nbformat``. The parent directory of
    ``NB_PATH`` is created if missing. Cell sources are kept as adjacent
    string literals so the generated text is easy to diff line by line.

    Hardening applied to the embedded cell code (the data comes from an LLM,
    so field types are not guaranteed):
      * ``reparse`` decodes the first JSON object with ``raw_decode`` instead
        of retrying ``json.loads`` on every possible suffix length.
      * graph builders only call ``.strip()`` on real strings and only add an
        edge when both endpoints survived cleaning.
      * list-valued fields are only iterated when they really are lists.
      * optional result keys are read with ``.get`` instead of indexing.
    """
    cells = []

    cells.append(_md(
        "# NuExtract 2.0-2B (GPU) vs GLiNER2 — comparativa con visualizacion\n\n"
        "**Pregunta:** ¿merece la pena un LLM con inferencia (NuExtract 2.0) en un proyecto donde "
        "antes elegimos GLiNER2 por velocidad?\n\n"
        "**Setup:**\n"
        "- NuExtract 2.0-2B (Qwen2-VL-2B base, **MIT license**, 2B params, GPU BF16 sobre RTX 3070).\n"
        "- GLiNER2-large-v1 (Apache 2.0, 340M params, CPU).\n"
        "- Mismos corpora: `es_corporate_short` (8 frases), `LONG_TEXT_ES` (25 frases), 5 chunks del PDF de BBVA.\n\n"
        "**Diferencia de paradigma:**\n"
        "- **GLiNER2** = clasificador. Output: listas planas `{entities: {tipo: [names]}, relations: {tipo: [(h, t)]}}`.\n"
        "- **NuExtract** = LLM generativo. Output: JSON arbitrario que tu defines en el `template`. Las relaciones se modelan como atributos de los objetos (`{org: {ceo: \"X\", headquartered_in: \"Y\"}}`).\n\n"
        "**Hipotesis:** NuExtract gana en _riqueza estructural_ (atributos por entidad de un solo paso) pero pierde en velocidad — incluso con GPU."
    ))

    cells.append(_md("## 1. Setup"))

    cells.append(_code(
        "import os, sys, json, warnings\n"
        "warnings.filterwarnings('ignore')\n"
        "from pathlib import Path\n"
        "from collections import defaultdict\n"
        "\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "\n"
        "NUEX = json.loads(Path('../nuextract_results.json').read_text())\n"
        "\n"
        "# Re-parsear el raw_text de cada test con un parser corregido (el original\n"
        "# del script usaba rfind y solo capturaba el ultimo objeto pequeño).\n"
        # raw_decode parses the single JSON value starting at the first '{' and
        # ignores trailing text: same result as trying every suffix, in one pass.
        "def reparse(text):\n"
        "    if not text: return None\n"
        "    s = text.find('{')\n"
        "    if s < 0: return None\n"
        "    try: return json.JSONDecoder().raw_decode(text, s)[0]\n"
        "    except (ValueError, RecursionError): return None\n"
        "for key in ['T1_corp_short_flat', 'T2_corp_short_rich', 'T3_long_text_rich']:\n"
        "    if key in NUEX:\n"
        "        NUEX[key]['parsed'] = reparse(NUEX[key].get('raw_text', ''))\n"
        "for cr in NUEX.get('T4_pdf_chunks', []):\n"
        "    cr['parsed'] = reparse(cr.get('raw_text', ''))\n"
        "GLNR_CORPUS = json.loads(Path('../benchmark_v2.json').read_text())  # GLiNER2 sobre 4 corpora\n"
        "GLNR = json.loads(Path('../improvements.json').read_text())  # GLiNER2 sobre PDF + improvements\n"
        "print('NuExtract keys:', list(NUEX.keys()))\n"
        "print('GLiNER2 keys: ', list(GLNR.keys()))\n"
        "print()\n"
        "print('NuExtract device:', NUEX['meta']['device'], NUEX['meta']['dtype'])"
    ))

    cells.append(_md(
        "## 2. Tabla de tiempos — CPU vs GPU vs GLiNER2\n\n"
        "Comparamos las 4 pasadas (T1-T4) de NuExtract contra GLiNER2 sobre los mismos corpora."
    ))

    cells.append(_code(
        "# Construir tabla de tiempos\n"
        "rows = []\n"
        "\n"
        "# CPU baseline (capturado del run anterior)\n"
        "cpu = NUEX.get('cpu_baseline', {})\n"
        "if 'T1_flat' in cpu:\n"
        "    rows.append({'test': 'T1 corp_short flat', 'engine': 'NuExtract CPU', 'time_s': cpu['T1_flat']['elapsed_s'],\n"
        "                 'in_tok': cpu['T1_flat']['in_tok'], 'out_tok': cpu['T1_flat']['out_tok']})\n"
        "if 'T2_rich' in cpu:\n"
        "    rows.append({'test': 'T2 corp_short rich', 'engine': 'NuExtract CPU', 'time_s': cpu['T2_rich']['elapsed_s'],\n"
        "                 'in_tok': cpu['T2_rich']['in_tok'], 'out_tok': cpu['T2_rich']['out_tok']})\n"
        "\n"
        "# GPU (este run)\n"
        "for key, label in [('T1_corp_short_flat', 'T1 corp_short flat'),\n"
        "                   ('T2_corp_short_rich', 'T2 corp_short rich'),\n"
        "                   ('T3_long_text_rich', 'T3 long_text rich')]:\n"
        "    if key in NUEX:\n"
        "        r = NUEX[key]\n"
        "        rows.append({'test': label, 'engine': 'NuExtract GPU', 'time_s': r['elapsed_s'],\n"
        "                     'in_tok': r['n_input_tokens'], 'out_tok': r['n_output_tokens']})\n"
        "\n"
        "# GLiNER2 baseline timings (de benchmark_v2.json — el config A es el equivalente)\n"
        "# A es el flat schema sobre 97 chunks PDF — para comparar con T4 PDF\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU', 'time_s': GLNR['configs'][0]['elapsed'],\n"
        "             'in_tok': '-', 'out_tok': '-'})\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU t=0.3', 'time_s': GLNR['configs'][1]['elapsed'],\n"
        "             'in_tok': '-', 'out_tok': '-'})\n"
        "\n"
        "df_times = pd.DataFrame(rows)\n"
        "df_times"
    ))

    cells.append(_md(
        "## 3. Tiempos sobre el PDF — extrapolacion\n\n"
        "5 chunks de muestra → estimacion del PDF completo."
    ))

    cells.append(_code(
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    chunk_rows = []\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        chunk_rows.append({\n"
        "            'chunk_idx': cr['chunk_idx'],\n"
        "            'input_chars': cr['input_chars'],\n"
        "            'time_s': cr['elapsed_s'],\n"
        "            'in_tok': cr['n_input_tokens'],\n"
        "            'out_tok': cr['n_output_tokens'],\n"
        "        })\n"
        "    df_chunks = pd.DataFrame(chunk_rows)\n"
        "    print('NuExtract GPU sobre 5 chunks del PDF:')\n"
        "    print(df_chunks)\n"
        "    print()\n"
        # The extrapolation block is nested so the trailing `else` pairs with
        # the T4_pdf_chunks check, matching its message.
        "    if 'full_pdf_extrapolation' in NUEX:\n"
        "        e = NUEX['full_pdf_extrapolation']\n"
        "        print(f\"Extrapolacion PDF entero ({e['n_chunks']} chunks):\")\n"
        "        print(f\" NuExtract GPU: {e['estimated_total_s']:.0f}s = {e['estimated_total_min']:.1f} min\")\n"
        "        print(f\" GLiNER2 CPU baseline: {GLNR['configs'][0]['elapsed']:.0f}s = {GLNR['configs'][0]['elapsed']/60:.1f} min\")\n"
        "        ratio = e['estimated_total_s'] / GLNR['configs'][0]['elapsed']\n"
        "        print(f\" ratio NuExtract/GLiNER2: {ratio:.1f}x\")\n"
        "else:\n"
        "    print('T4_pdf_chunks no presente todavia')"
    ))

    cells.append(_md(
        "## 4. Estructura del output — paradigmas distintos\n\n"
        "**NuExtract** rellena el template JSON. Lo que pidas, sale (si existe en el texto)."
    ))

    cells.append(_code(
        "# Mostrar el JSON parseado de T2 (rich corporate sobre 8 frases ES)\n"
        "print('=== NuExtract T2 — schema rich corporate sobre es_corporate_short ===')\n"
        "if 'T2_corp_short_rich' in NUEX:\n"
        "    parsed = NUEX['T2_corp_short_rich'].get('parsed')\n"
        "    if parsed:\n"
        "        print(json.dumps(parsed, indent=2, ensure_ascii=False))\n"
        "    else:\n"
        "        print('parsed = None (raw text:)')\n"
        # raw_text is read with .get elsewhere in the notebook, so it may be absent.
        "        print((NUEX['T2_corp_short_rich'].get('raw_text') or '')[:1500])"
    ))

    cells.append(_md("## 5. Convertir el JSON anidado de NuExtract a un grafo"))

    cells.append(_code(
        "def nuextract_corp_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Convierte el output de schema_rich_corporate a un DiGraph.\n"
        "\n"
        "    Mapeo:\n"
        "      org.name → nodo (type=organization)\n"
        "      org.ceo → nodo (type=person), arista person --ceo_of--> org\n"
        "      org.chairman_president → nodo, arista --president_of--> org\n"
        "      org.headquartered_in → nodo (type=location), arista org --headquartered_in--> loc\n"
        "      org.subsidiaries[] → cada sub: nodo + arista sub --subsidiary_of--> org\n"
        "      org.parent_company → nodo + arista org --subsidiary_of--> parent\n"
        "      person.name → nodo, person --role--> organization\n"
        "      agreement.between[] → entre cada par, arista A --agreement_with--> B\n"
        "    \"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        # Generated values may be null, numbers, lists or dicts where text is
        # expected; anything that is not a non-blank string is treated as absent.
        "    def clean(value):\n"
        "        return value.strip() if isinstance(value, str) else ''\n"
        "\n"
        # A string where a list is expected would otherwise be iterated
        # character by character.
        "    def as_list(value):\n"
        "        return value if isinstance(value, list) else []\n"
        "\n"
        # Returns the cleaned name ('' when rejected) so callers only add an
        # edge for endpoints that really became nodes.
        "    def add_node(name, typ):\n"
        "        name = clean(name)\n"
        "        if name: G.add_node(name, type=typ)\n"
        "        return name\n"
        "\n"
        "    for org in as_list(parsed.get('organizations')):\n"
        "        if not isinstance(org, dict): continue\n"
        "        oname = add_node(org.get('name'), 'organization')\n"
        "        if not oname: continue\n"
        "        ceo = add_node(org.get('ceo'), 'person')\n"
        "        if ceo: G.add_edge(ceo, oname, kind='ceo_of')\n"
        "        pres = add_node(org.get('chairman_president'), 'person')\n"
        "        if pres: G.add_edge(pres, oname, kind='president_of')\n"
        "        hq = add_node(org.get('headquartered_in'), 'location')\n"
        "        if hq: G.add_edge(oname, hq, kind='headquartered_in')\n"
        "        parent = add_node(org.get('parent_company'), 'organization')\n"
        "        if parent: G.add_edge(oname, parent, kind='subsidiary_of')\n"
        "        for sub in as_list(org.get('subsidiaries')):\n"
        "            sname = add_node(sub, 'organization')\n"
        "            if sname: G.add_edge(sname, oname, kind='subsidiary_of')\n"
        "\n"
        "    for p in as_list(parsed.get('people')):\n"
        "        if not isinstance(p, dict): continue\n"
        "        pname = add_node(p.get('name'), 'person')\n"
        "        if not pname: continue\n"
        "        org = add_node(p.get('organization'), 'organization')\n"
        "        if org:\n"
        "            # role es texto libre, lo metemos como kind\n"
        "            kind = (clean(p.get('role')) or 'works_at').lower().replace(' ', '_')[:30]\n"
        "            G.add_edge(pname, org, kind=kind)\n"
        "\n"
        "    for ag in as_list(parsed.get('agreements')):\n"
        "        if not isinstance(ag, dict): continue\n"
        "        parties = [clean(x) for x in as_list(ag.get('between'))]\n"
        "        parties = [x for x in parties if x]\n"
        "        for i, a in enumerate(parties):\n"
        "            for b in parties[i+1:]:\n"
        # Repeated party names would otherwise produce a self-loop.
        "                if a != b: G.add_edge(a, b, kind='agreement_with')\n"
        "\n"
        "    return G\n"
        "\n"
        "G_nuext_t2 = nuextract_corp_to_graph(NUEX.get('T2_corp_short_rich', {}).get('parsed'))\n"
        "print(f'NuExtract T2 grafo: {G_nuext_t2.number_of_nodes()} nodos, {G_nuext_t2.number_of_edges()} aristas')"
    ))

    cells.append(_md("## 6. Visualizacion lado a lado — 8 frases ES corporate"))

    cells.append(_code(
        "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n"
        "\n"
        "def draw(ax, G, title, max_label=20):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n"
        "    cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1700, edgecolors='#333', linewidths=1.3, ax=ax)\n"
        "    labels = {n: (n if len(n) <= max_label else n[:max_label-1]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7.5, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=12, width=1.0, alpha=0.65, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6, ax=ax,\n"
        "                                 bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=11)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 9))\n"
        "draw(axes[0], G_nuext_t2, 'NuExtract 2.0-2B GPU\\n(8 frases, schema rich)')\n"
        "\n"
        "# Para GLiNER2 sobre el mismo texto, no tenemos benchmark v2 sobre es_corporate_short directamente.\n"
        "# Notebook 04 dejo es_corporate_short con 14 ents + 8 rels via gliner2. Hardcodeamos del notebook 04 para comparar.\n"
        "G_gliner2_t2 = nx.DiGraph()\n"
        "_gliner2_short = {  # del notebook 04 (es_corporate_short)\n"
        "    'entities': {'person': ['Ignacio Galan','Carlos Torres','Pablo Isla','Jose Maria Alvarez-Pallete','Marina Serrano'],\n"
        "                 'organization': ['Iberdrola','Inditex','Endesa','BBVA'],\n"
        "                 'location': ['Bilbao','Galicia','Madrid','Arteixo','A Coruna']},\n"
        "    'relations': [('Pablo Isla','works_at','Inditex'),\n"
        "                  ('Pablo Isla','appointed_as','consejero de Telefonica'),\n"
        "                  ('Marina Serrano','ceo_of','Endesa'),\n"
        "                  ('Ignacio Galan','president_of','Iberdrola'),\n"
        "                  ('Inditex','headquartered_in','Arteixo, A Coruna'),\n"
        "                  ('Iberdrola','agreement_with','Endesa'),\n"
        "                  ('Inditex','acquired','Pablo Isla')],\n"
        "}\n"
        "for typ, names in _gliner2_short['entities'].items():\n"
        "    for n in names: G_gliner2_t2.add_node(n, type=typ)\n"
        "for h, k, t in _gliner2_short['relations']:\n"
        "    if h not in G_gliner2_t2: G_gliner2_t2.add_node(h, type='?')\n"
        "    if t not in G_gliner2_t2: G_gliner2_t2.add_node(t, type='?')\n"
        "    G_gliner2_t2.add_edge(h, t, kind=k)\n"
        "draw(axes[1], G_gliner2_t2, 'GLiNER2 CPU\\n(8 frases, baseline notebook 04)')\n"
        "\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md(
        "**Lectura del lado a lado:**\n\n"
        "- **NuExtract** captura **atributos por entidad** (cada org tiene su `ceo`, `headquartered_in`, etc) en una sola pasada — el grafo se construye 'gratis' a partir del JSON anidado.\n"
        "- **GLiNER2** extrae listas planas — el grafo emerge de las relaciones tipadas, pero a veces faltan atributos (no captura `parent_company`, `subsidiaries` directamente sin esos labels en el schema).\n"
        "- Ambos tienen calidad alta en este corpus pequeño. Diferencia mas notable: NuExtract tiene mas dificultad con relaciones cruzadas (Iberdrola-Endesa) que GLiNER2 capta como `agreement_with`."
    ))

    cells.append(_md(
        "## 7. Long text (25 frases sector bancario) — NuExtract\n\n"
        "**⚠️ Hallazgo importante:** En este test (T3), NuExtract **degenero en bucle de repeticion** y "
        "agoto los 2048 max_new_tokens emitiendo `{\"between\": [\"BBVA\", \"Sabadell\"], \"topic\": \"OPA parcial\"...}` "
        "repetido decenas de veces. El JSON resultante esta corrupto y `parsed = None`.\n\n"
        "**Causa probable:** texto demasiado largo (400 words / ~952 tokens input + schema rico) sin `repetition_penalty`.\n"
        "Mitigacion: anadir `repetition_penalty=1.1`, `do_sample=True, temperature=0.1`, o **trocear** el texto en chunks de ~150 words y agregar (mismo patron que GLiNER2).\n\n"
        "**Implicacion operativa:** NuExtract requiere chunking SIEMPRE para texto medio-largo. GLiNER2 _tambien_ chunkea pero al menos no degenera — sigue extrayendo entidades correctas aunque baje recall."
    ))

    cells.append(_code(
        "G_nuext_long = nuextract_corp_to_graph(NUEX.get('T3_long_text_rich', {}).get('parsed'))\n"
        "print(f'NuExtract T3 long_text: {G_nuext_long.number_of_nodes()} nodos, {G_nuext_long.number_of_edges()} aristas')\n"
        "print()\n"
        "print('Top entidades del JSON parseado:')\n"
        "parsed = NUEX.get('T3_long_text_rich', {}).get('parsed') or {}\n"
        "if parsed.get('organizations'):\n"
        "    print(f\" Organizations: {len(parsed['organizations'])}\")\n"
        "    for o in parsed['organizations'][:8]:\n"
        "        if not isinstance(o, dict): continue\n"
        # str() because the ':30s' format spec raises TypeError on None.
        "        print(f\" {str(o.get('name')):30s} ceo={o.get('ceo')} pres={o.get('chairman_president')} hq={o.get('headquartered_in')}\")\n"
        "if parsed.get('people'):\n"
        "    print(f\" People: {len(parsed['people'])}\")\n"
        "if parsed.get('agreements'):\n"
        "    print(f\" Agreements: {len(parsed['agreements'])}\")"
    ))

    cells.append(_code(
        "fig, ax = plt.subplots(figsize=(15, 11))\n"
        "draw(ax, G_nuext_long, 'NuExtract 2.0-2B GPU\\nLONG_TEXT_ES (25 frases sector bancario)', max_label=22)\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "ax.legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md("## 8. PDF (5 chunks de muestra)"))

    cells.append(_code(
        "def nuextract_gdpr_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Schema GDPR: data_controller / dpo_contact / data_categories / rights / authorities / laws.\"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        # Same contract as in the corporate builder: returns the cleaned name,
        # or '' when the value is not a non-blank string.
        "    def add_node(name, typ):\n"
        "        name = name.strip() if isinstance(name, str) else ''\n"
        "        if name: G.add_node(name, type=typ)\n"
        "        return name\n"
        "\n"
        "    def as_list(value):\n"
        "        return value if isinstance(value, list) else []\n"
        "\n"
        "    dc = parsed.get('data_controller')\n"
        "    controller = add_node(dc.get('name'), 'organization') if isinstance(dc, dict) else ''\n"
        "    if controller:\n"
        "        address = add_node(dc.get('address'), 'location')\n"
        "        if address: G.add_edge(controller, address, kind='located_in')\n"
        "    dpo = parsed.get('dpo_contact')\n"
        "    email = add_node(dpo.get('email'), 'email') if isinstance(dpo, dict) else ''\n"
        "    if email and controller:\n"
        "        G.add_edge(email, controller, kind='dpo_of')\n"
        "    for cat in as_list(parsed.get('data_categories')):\n"
        "        add_node(cat, 'data_category')\n"
        "    for r in as_list(parsed.get('rights_listed')):\n"
        "        add_node(r, 'right')\n"
        "    for a in as_list(parsed.get('authorities_mentioned')):\n"
        "        if not isinstance(a, dict): continue\n"
        "        aname = add_node(a.get('name'), 'authority')\n"
        "        if not aname: continue\n"
        "        contact = add_node(a.get('url_or_contact'), 'url')\n"
        "        if contact: G.add_edge(aname, contact, kind='contact')\n"
        "    for l in as_list(parsed.get('laws_mentioned')):\n"
        "        add_node(l, 'law')\n"
        "    return G\n"
        "\n"
        "# Combinar grafos de los 5 chunks del PDF\n"
        "G_pdf_combined = nx.DiGraph()\n"
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        Gc = nuextract_gdpr_to_graph(cr.get('parsed'))\n"
        "        for n, d in Gc.nodes(data=True):\n"
        "            if n not in G_pdf_combined:\n"
        "                G_pdf_combined.add_node(n, **d)\n"
        "        for u, v, d in Gc.edges(data=True):\n"
        "            G_pdf_combined.add_edge(u, v, **d)\n"
        "print(f'NuExtract PDF (5 chunks combinados): {G_pdf_combined.number_of_nodes()} nodos, {G_pdf_combined.number_of_edges()} aristas')"
    ))

    cells.append(_code(
        # 'url' used to share '#DECF3F' with 'data_category', which made the two
        # node types indistinguishable in the plot and its legend.
        "PDF_TYPE_COLOR = {'organization':'#F17CB0','person':'#5DA5DA','location':'#60BD68',\n"
        "                  'email':'#FAA43A','authority':'#7C7C7C','right':'#B276B2',\n"
        "                  'data_category':'#DECF3F','law':'#F15854','url':'#B2912F'}\n"
        "\n"
        "def draw_typed(ax, G, title, type_color):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.0, iterations=80, seed=42)\n"
        "    cols = [type_color.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1500, edgecolors='#333', linewidths=1.2, ax=ax)\n"
        "    labels = {n: (n if len(n) <= 22 else n[:21]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=10, width=0.9, alpha=0.6, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=5.5, ax=ax,\n"
        "                                 bbox=dict(boxstyle='round,pad=0.05', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=10)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 11))\n"
        "draw_typed(axes[0], G_pdf_combined, 'NuExtract GPU\\nPDF — 5 chunks combinados', PDF_TYPE_COLOR)\n"
        "\n"
        "# GLiNER2 sobre el PDF entero (97 chunks) ya esta en GLNR — config B post-coref\n"
        "# Si tenemos el grafo post-coref no esta en este JSON. Reconstruimos de lo que hay.\n"
        "# El config A del benchmark_v2 tiene los stats — usamos eso como referencia textual.\n"
        "axes[1].axis('off')\n"
        "axes[1].text(0.05, 0.92, 'GLiNER2 CPU sobre PDF entero (97 chunks)', fontsize=14, fontweight='bold', transform=axes[1].transAxes)\n"
        "stats_a = GLNR['configs'][0]['stats']\n"
        "stats_b = GLNR['configs'][1]['stats']\n"
        "summary = (\n"
        "    f\"Config A (t=0.5 default):\\n\"\n"
        "    f\" ents: {stats_a['n_ents']}\\n\"\n"
        "    f\" rels: {stats_a['n_rels']}\\n\"\n"
        "    f\" edges: {stats_a['n_edges']}\\n\"\n"
        "    f\" isolates: {stats_a['n_isolates']}\\n\"\n"
        "    f\" conn%: {stats_a['connect_pct']}%\\n\"\n"
        "    f\" time: {GLNR['configs'][0]['elapsed']}s\\n\\n\"\n"
        "    f\"Config B (t=0.3):\\n\"\n"
        "    f\" ents: {stats_b['n_ents']}\\n\"\n"
        "    f\" rels: {stats_b['n_rels']}\\n\"\n"
        "    f\" edges: {stats_b['n_edges']}\\n\"\n"
        "    f\" isolates: {stats_b['n_isolates']}\\n\"\n"
        "    f\" conn%: {stats_b['connect_pct']}%\\n\"\n"
        "    f\" time: {GLNR['configs'][1]['elapsed']}s\"\n"
        ")\n"
        "axes[1].text(0.05, 0.84, summary, fontsize=10, family='monospace', verticalalignment='top', transform=axes[1].transAxes)\n"
        "\n"
        "active = {G_pdf_combined.nodes[n].get('type') for n in G_pdf_combined.nodes}\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in PDF_TYPE_COLOR.items() if t in active]\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=8)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md(
        "## 9. Conclusion — cuando usar cada uno\n\n"
        "**Datos mas relevantes** (PDF de 89.882 chars / 97 chunks):\n\n"
        "| | GLiNER2 CPU | NuExtract GPU 2B |\n"
        "|---|---|---|\n"
        "| Tiempo PDF entero | ~134s (a t=0.5) / ~139s (t=0.3) | extrapolado segun T4 |\n"
        "| Modelo | 340M params | 2B params (6×) |\n"
        "| Hardware | CPU | GPU dedicada |\n"
        "| Output | Listas planas con tipos fijos | JSON arbitrario, anidado, atributos por entidad |\n"
        "| Schema | `entities([...]).relations([...])` (palabras claves) | Plantilla JSON cualquiera (`{org: {ceo, ...}}`) |\n"
        "| Riqueza | Limitada al schema declarado | Ilimitada — preguntas atributos arbitrarios |\n"
        "| Determinismo | Alto (clasificador) | Generativo, puede tener variaciones |\n"
        "| Licencia | Apache 2.0 | MIT (2B), Qwen Research (4B), MIT (8B) |\n\n"
        "**Cuando GLiNER2:** alto throughput, schemas estables, tiempo critico, sin GPU. **Robusto frente a texto largo** (no degenera).\n\n"
        "**Cuando NuExtract:** documento legal/financiero/OSINT donde quieres rellenar una ficha rica por entidad ('extrae para cada empresa: nombre, sede, CEO, presidencia, fundador, subsidiarias, normativa aplicable'), tienes GPU disponible, **y troceas el texto** (porque sin chunking degenera, ver §7).\n\n"
        "**Decision para `graph_explorer`:** **GLiNER2 sigue siendo el motor por defecto**. Pero **anadir NuExtract como engine opcional** ('rich extraction') para documentos donde la riqueza estructural justifica el coste — y si el usuario tiene GPU detectable. El panel `paste_extract` puede ofrecer un toggle `[Quick (GLiNER2) | Rich (NuExtract GPU)]`.\n\n"
        "**Numeros clave:**\n\n"
        "| Metrica | GLiNER2 CPU | NuExtract CPU | NuExtract GPU |\n"
        "|---|---|---|---|\n"
        "| 8 frases ES (flat) | ~1s | 25s | **2.9s** |\n"
        "| 8 frases ES (rich) | n/a (schema flat) | 117s | **9.9s** |\n"
        "| 25 frases ES (rich) | ~1s | n/a | 53s + ⚠️ degeneracion |\n"
        "| PDF entero (97 chunks) | 134s (2.2 min) | (estimado >2h) | 310s (5.2 min) — 2.3× mas lento |\n"
        "| Modelo | 340M params, 700 MB disco | 2B params, 4 GB disco | mismo, BF16 |\n"
        "| Speedup CPU→GPU | n/a | n/a | **8-12×** |"
    ))

    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    # The notebooks/ directory may not exist on a fresh checkout.
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH} cells={len(cells)}")
|
||
|
||
|
||
# Script entry point: regenerate the notebook when this file is run directly.
if __name__ == "__main__":
    build()
|