"""Construye notebooks/07_nuextract_vs_gliner2.ipynb — comparativa completa. Carga datos de: - nuextract_results.json (NuExtract 2.0-2B en GPU + baseline CPU) - benchmark_v2.json (GLiNER2 sobre el mismo PDF) Construye grafos a partir del JSON anidado de NuExtract (nested → edges) y compara con los grafos de GLiNER2 lado a lado: numero de nodos, aristas, tiempo por extraccion, calidad cualitativa. """ from __future__ import annotations import json from pathlib import Path import nbformat as nbf HERE = Path(__file__).resolve().parent NB_PATH = HERE / "notebooks" / "07_nuextract_vs_gliner2.ipynb" def _md(text: str): return nbf.v4.new_markdown_cell(text) def _code(src: str): cell = nbf.v4.new_code_cell(src) cell.outputs = [] cell.execution_count = None return cell def build(): cells = [] cells.append(_md( "# NuExtract 2.0-2B (GPU) vs GLiNER2 — comparativa con visualizacion\n\n" "**Pregunta:** ¿merece la pena un LLM con inferencia (NuExtract 2.0) en un proyecto donde " "antes elegimos GLiNER2 por velocidad?\n\n" "**Setup:**\n" "- NuExtract 2.0-2B (Qwen2-VL-2B base, **MIT license**, 2B params, GPU BF16 sobre RTX 3070).\n" "- GLiNER2-large-v1 (Apache 2.0, 340M params, CPU).\n" "- Mismos corpora: `es_corporate_short` (8 frases), `LONG_TEXT_ES` (25 frases), 5 chunks del PDF de BBVA.\n\n" "**Diferencia de paradigma:**\n" "- **GLiNER2** = clasificador. Output: listas planas `{entities: {tipo: [names]}, relations: {tipo: [(h, t)]}}`.\n" "- **NuExtract** = LLM generativo. Output: JSON arbitrario que tu defines en el `template`. Las relaciones se modelan como atributos de los objetos (`{org: {ceo: \"X\", headquartered_in: \"Y\"}}`).\n\n" "**Hipotesis:** NuExtract gana en _riqueza estructural_ (atributos por entidad de un solo paso) pero pierde en velocidad — incluso con GPU." )) cells.append(_md("## 1. Setup")) cells.append(_code( "import os, sys, json, warnings\n" "warnings.filterwarnings('ignore')\n" "from pathlib import Path\n" "from collections import defaultdict\n" "\n" "_pf = '/home/lucas/fn_registry/python/functions'\n" "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n" "if _pf not in sys.path: sys.path.insert(0, _pf)\n" "\n" "import pandas as pd\n" "import networkx as nx\n" "import matplotlib.pyplot as plt\n" "from matplotlib.patches import Patch\n" "\n" "NUEX = json.loads(Path('../nuextract_results.json').read_text())\n" "\n" "# Re-parsear el raw_text de cada test con un parser corregido (el original\n" "# del script usaba rfind y solo capturaba el ultimo objeto pequeño).\n" "def reparse(text):\n" " if not text: return None\n" " s = text.find('{')\n" " if s < 0: return None\n" " for end in range(len(text), s, -1):\n" " try: return json.loads(text[s:end])\n" " except Exception: continue\n" " return None\n" "for key in ['T1_corp_short_flat', 'T2_corp_short_rich', 'T3_long_text_rich']:\n" " if key in NUEX:\n" " NUEX[key]['parsed'] = reparse(NUEX[key].get('raw_text', ''))\n" "for cr in NUEX.get('T4_pdf_chunks', []):\n" " cr['parsed'] = reparse(cr.get('raw_text', ''))\n" "GLNR_CORPUS = json.loads(Path('../benchmark_v2.json').read_text()) # GLiNER2 sobre 4 corpora\n" "GLNR = json.loads(Path('../improvements.json').read_text()) # GLiNER2 sobre PDF + improvements\n" "print('NuExtract keys:', list(NUEX.keys()))\n" "print('GLiNER2 keys: ', list(GLNR.keys()))\n" "print()\n" "print('NuExtract device:', NUEX['meta']['device'], NUEX['meta']['dtype'])" )) cells.append(_md( "## 2. Tabla de tiempos — CPU vs GPU vs GLiNER2\n\n" "Comparamos las 4 pasadas (T1-T4) de NuExtract contra GLiNER2 sobre los mismos corpora." )) cells.append(_code( "# Construir tabla de tiempos\n" "rows = []\n" "\n" "# CPU baseline (capturado del run anterior)\n" "cpu = NUEX.get('cpu_baseline', {})\n" "if 'T1_flat' in cpu:\n" " rows.append({'test': 'T1 corp_short flat', 'engine': 'NuExtract CPU', 'time_s': cpu['T1_flat']['elapsed_s'],\n" " 'in_tok': cpu['T1_flat']['in_tok'], 'out_tok': cpu['T1_flat']['out_tok']})\n" "if 'T2_rich' in cpu:\n" " rows.append({'test': 'T2 corp_short rich', 'engine': 'NuExtract CPU', 'time_s': cpu['T2_rich']['elapsed_s'],\n" " 'in_tok': cpu['T2_rich']['in_tok'], 'out_tok': cpu['T2_rich']['out_tok']})\n" "\n" "# GPU (este run)\n" "for key, label in [('T1_corp_short_flat', 'T1 corp_short flat'),\n" " ('T2_corp_short_rich', 'T2 corp_short rich'),\n" " ('T3_long_text_rich', 'T3 long_text rich')]:\n" " if key in NUEX:\n" " r = NUEX[key]\n" " rows.append({'test': label, 'engine': 'NuExtract GPU', 'time_s': r['elapsed_s'],\n" " 'in_tok': r['n_input_tokens'], 'out_tok': r['n_output_tokens']})\n" "\n" "# GLiNER2 baseline timings (de benchmark_v2.json — el config A es el equivalente)\n" "# A es el flat schema sobre 97 chunks PDF — para comparar con T4 PDF\n" "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU', 'time_s': GLNR['configs'][0]['elapsed'],\n" " 'in_tok': '-', 'out_tok': '-'})\n" "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU t=0.3', 'time_s': GLNR['configs'][1]['elapsed'],\n" " 'in_tok': '-', 'out_tok': '-'})\n" "\n" "df_times = pd.DataFrame(rows)\n" "df_times" )) cells.append(_md( "## 3. Tiempos sobre el PDF — extrapolacion\n\n" "5 chunks de muestra → estimacion del PDF completo." )) cells.append(_code( "if 'T4_pdf_chunks' in NUEX:\n" " chunk_rows = []\n" " for cr in NUEX['T4_pdf_chunks']:\n" " chunk_rows.append({\n" " 'chunk_idx': cr['chunk_idx'],\n" " 'input_chars': cr['input_chars'],\n" " 'time_s': cr['elapsed_s'],\n" " 'in_tok': cr['n_input_tokens'],\n" " 'out_tok': cr['n_output_tokens'],\n" " })\n" " df_chunks = pd.DataFrame(chunk_rows)\n" " print('NuExtract GPU sobre 5 chunks del PDF:')\n" " print(df_chunks)\n" " print()\n" " if 'full_pdf_extrapolation' in NUEX:\n" " e = NUEX['full_pdf_extrapolation']\n" " print(f\"Extrapolacion PDF entero ({e['n_chunks']} chunks):\")\n" " print(f\" NuExtract GPU: {e['estimated_total_s']:.0f}s = {e['estimated_total_min']:.1f} min\")\n" " print(f\" GLiNER2 CPU baseline: {GLNR['configs'][0]['elapsed']:.0f}s = {GLNR['configs'][0]['elapsed']/60:.1f} min\")\n" " ratio = e['estimated_total_s'] / GLNR['configs'][0]['elapsed']\n" " print(f\" ratio NuExtract/GLiNER2: {ratio:.1f}x\")\n" "else:\n" " print('T4_pdf_chunks no presente todavia')" )) cells.append(_md( "## 4. Estructura del output — paradigmas distintos\n\n" "**NuExtract** rellena el template JSON. Lo que pidas, sale (si existe en el texto)." )) cells.append(_code( "# Mostrar el JSON parseado de T2 (rich corporate sobre 8 frases ES)\n" "print('=== NuExtract T2 — schema rich corporate sobre es_corporate_short ===')\n" "if 'T2_corp_short_rich' in NUEX:\n" " parsed = NUEX['T2_corp_short_rich'].get('parsed')\n" " if parsed:\n" " print(json.dumps(parsed, indent=2, ensure_ascii=False))\n" " else:\n" " print('parsed = None (raw text:)')\n" " print(NUEX['T2_corp_short_rich']['raw_text'][:1500])" )) cells.append(_md("## 5. Convertir el JSON anidado de NuExtract a un grafo")) cells.append(_code( "def nuextract_corp_to_graph(parsed: dict) -> nx.DiGraph:\n" " \"\"\"Convierte el output de schema_rich_corporate a un DiGraph.\n" "\n" " Mapeo:\n" " org.name → nodo (type=organization)\n" " org.ceo → nodo (type=person), arista person --ceo_of--> org\n" " org.chairman_president → nodo, arista --president_of--> org\n" " org.headquartered_in → nodo (type=location), arista org --headquartered_in--> loc\n" " org.subsidiaries[] → cada sub: nodo + arista sub --subsidiary_of--> org\n" " org.parent_company → nodo + arista org --subsidiary_of--> parent\n" " person.name → nodo, person --role--> organization\n" " agreement.between[] → entre cada par, arista A --agreement_with--> B\n" " \"\"\"\n" " G = nx.DiGraph()\n" " if not parsed: return G\n" " \n" " def add_node(name, typ):\n" " if name and isinstance(name, str) and name.strip():\n" " G.add_node(name.strip(), type=typ)\n" " \n" " for org in parsed.get('organizations', []) or []:\n" " if not isinstance(org, dict): continue\n" " oname = (org.get('name') or '').strip()\n" " if not oname: continue\n" " add_node(oname, 'organization')\n" " if org.get('ceo'):\n" " add_node(org['ceo'], 'person')\n" " G.add_edge(org['ceo'].strip(), oname, kind='ceo_of')\n" " if org.get('chairman_president'):\n" " add_node(org['chairman_president'], 'person')\n" " G.add_edge(org['chairman_president'].strip(), oname, kind='president_of')\n" " if org.get('headquartered_in'):\n" " add_node(org['headquartered_in'], 'location')\n" " G.add_edge(oname, org['headquartered_in'].strip(), kind='headquartered_in')\n" " if org.get('parent_company'):\n" " add_node(org['parent_company'], 'organization')\n" " G.add_edge(oname, org['parent_company'].strip(), kind='subsidiary_of')\n" " for sub in org.get('subsidiaries', []) or []:\n" " if isinstance(sub, str) and sub.strip():\n" " add_node(sub, 'organization')\n" " G.add_edge(sub.strip(), oname, kind='subsidiary_of')\n" " \n" " for p in parsed.get('people', []) or []:\n" " if not isinstance(p, dict): continue\n" " pname = (p.get('name') or '').strip()\n" " if not pname: continue\n" " add_node(pname, 'person')\n" " org = (p.get('organization') or '').strip()\n" " role = (p.get('role') or 'works_at').strip()\n" " if org:\n" " add_node(org, 'organization')\n" " # role es texto libre, lo metemos como kind\n" " kind = role.lower().replace(' ', '_')[:30] if role else 'works_at'\n" " G.add_edge(pname, org, kind=kind)\n" " \n" " for ag in parsed.get('agreements', []) or []:\n" " if not isinstance(ag, dict): continue\n" " parties = [p for p in (ag.get('between') or []) if isinstance(p, str) and p.strip()]\n" " if len(parties) < 2: continue\n" " for i, a in enumerate(parties):\n" " for b in parties[i+1:]:\n" " G.add_edge(a.strip(), b.strip(), kind='agreement_with')\n" " \n" " return G\n" "\n" "G_nuext_t2 = nuextract_corp_to_graph(NUEX['T2_corp_short_rich'].get('parsed'))\n" "print(f'NuExtract T2 grafo: {G_nuext_t2.number_of_nodes()} nodos, {G_nuext_t2.number_of_edges()} aristas')" )) cells.append(_md("## 6. Visualizacion lado a lado — 8 frases ES corporate")) cells.append(_code( "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n" "\n" "def draw(ax, G, title, max_label=20):\n" " if G.number_of_nodes() == 0:\n" " ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n" " pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n" " cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n" " nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1700, edgecolors='#333', linewidths=1.3, ax=ax)\n" " labels = {n: (n if len(n) <= max_label else n[:max_label-1]+'…') for n in G.nodes}\n" " nx.draw_networkx_labels(G, pos, labels=labels, font_size=7.5, font_weight='bold', ax=ax)\n" " nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=12, width=1.0, alpha=0.65, ax=ax, connectionstyle='arc3,rad=0.08')\n" " el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n" " nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6, ax=ax,\n" " bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n" " ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=11)\n" " ax.axis('off')\n" "\n" "fig, axes = plt.subplots(1, 2, figsize=(20, 9))\n" "draw(axes[0], G_nuext_t2, 'NuExtract 2.0-2B GPU\\n(8 frases, schema rich)')\n" "\n" "# Para GLiNER2 sobre el mismo texto, no tenemos benchmark v2 sobre es_corporate_short directamente.\n" "# Notebook 04 dejo es_corporate_short con 14 ents + 8 rels via gliner2. Hardcodeamos del notebook 04 para comparar.\n" "G_gliner2_t2 = nx.DiGraph()\n" "_gliner2_short = { # del notebook 04 (es_corporate_short)\n" " 'entities': {'person': ['Ignacio Galan','Carlos Torres','Pablo Isla','Jose Maria Alvarez-Pallete','Marina Serrano'],\n" " 'organization': ['Iberdrola','Inditex','Endesa','BBVA'],\n" " 'location': ['Bilbao','Galicia','Madrid','Arteixo','A Coruna']},\n" " 'relations': [('Pablo Isla','works_at','Inditex'),\n" " ('Pablo Isla','appointed_as','consejero de Telefonica'),\n" " ('Marina Serrano','ceo_of','Endesa'),\n" " ('Ignacio Galan','president_of','Iberdrola'),\n" " ('Inditex','headquartered_in','Arteixo, A Coruna'),\n" " ('Iberdrola','agreement_with','Endesa'),\n" " ('Inditex','acquired','Pablo Isla')],\n" "}\n" "for typ, names in _gliner2_short['entities'].items():\n" " for n in names: G_gliner2_t2.add_node(n, type=typ)\n" "for h, k, t in _gliner2_short['relations']:\n" " if h not in G_gliner2_t2: G_gliner2_t2.add_node(h, type='?')\n" " if t not in G_gliner2_t2: G_gliner2_t2.add_node(t, type='?')\n" " G_gliner2_t2.add_edge(h, t, kind=k)\n" "draw(axes[1], G_gliner2_t2, 'GLiNER2 CPU\\n(8 frases, baseline notebook 04)')\n" "\n" "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n" "axes[0].legend(handles=legend, loc='upper left', fontsize=10)\n" "plt.tight_layout(); plt.show()" )) cells.append(_md( "**Lectura del lado a lado:**\n\n" "- **NuExtract** captura **atributos por entidad** (cada org tiene su `ceo`, `headquartered_in`, etc) en una sola pasada — el grafo se construye 'gratis' a partir del JSON anidado.\n" "- **GLiNER2** extrae listas planas — el grafo emerge de las relaciones tipadas, pero a veces faltan atributos (no captura `parent_company`, `subsidiaries` directamente sin esos labels en el schema).\n" "- Ambos tienen calidad alta en este corpus pequeño. Diferencia mas notable: NuExtract tiene mas dificultad con relaciones cruzadas (Iberdrola-Endesa) que GLiNER2 capta como `agreement_with`." )) cells.append(_md( "## 7. Long text (25 frases sector bancario) — NuExtract\n\n" "**⚠️ Hallazgo importante:** En este test (T3), NuExtract **degenero en bucle de repeticion** y " "agoto los 2048 max_new_tokens emitiendo `{\"between\": [\"BBVA\", \"Sabadell\"], \"topic\": \"OPA parcial\"...}` " "repetido decenas de veces. El JSON resultante esta corrupto y `parsed = None`.\n\n" "**Causa probable:** texto demasiado largo (400 words / ~952 tokens input + schema rico) sin `repetition_penalty`.\n" "Mitigacion: anadir `repetition_penalty=1.1`, `do_sample=True, temperature=0.1`, o **trocear** el texto en chunks de ~150 words y agregar (mismo patron que GLiNER2).\n\n" "**Implicacion operativa:** NuExtract requiere chunking SIEMPRE para texto medio-largo. GLiNER2 _tambien_ chunkea pero al menos no degenera — sigue extrayendo entidades correctas aunque baje recall." )) cells.append(_code( "G_nuext_long = nuextract_corp_to_graph(NUEX['T3_long_text_rich'].get('parsed'))\n" "print(f'NuExtract T3 long_text: {G_nuext_long.number_of_nodes()} nodos, {G_nuext_long.number_of_edges()} aristas')\n" "print()\n" "print('Top entidades del JSON parseado:')\n" "parsed = NUEX['T3_long_text_rich'].get('parsed') or {}\n" "if parsed.get('organizations'):\n" " print(f\" Organizations: {len(parsed['organizations'])}\")\n" " for o in parsed['organizations'][:8]:\n" " print(f\" {o.get('name'):30s} ceo={o.get('ceo')} pres={o.get('chairman_president')} hq={o.get('headquartered_in')}\")\n" "if parsed.get('people'):\n" " print(f\" People: {len(parsed['people'])}\")\n" "if parsed.get('agreements'):\n" " print(f\" Agreements: {len(parsed['agreements'])}\")" )) cells.append(_code( "fig, ax = plt.subplots(figsize=(15, 11))\n" "draw(ax, G_nuext_long, 'NuExtract 2.0-2B GPU\\nLONG_TEXT_ES (25 frases sector bancario)', max_label=22)\n" "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n" "ax.legend(handles=legend, loc='upper left', fontsize=10)\n" "plt.tight_layout(); plt.show()" )) cells.append(_md("## 8. PDF (5 chunks de muestra)")) cells.append(_code( "def nuextract_gdpr_to_graph(parsed: dict) -> nx.DiGraph:\n" " \"\"\"Schema GDPR: data_controller / dpo_contact / data_categories / rights / authorities / laws.\"\"\"\n" " G = nx.DiGraph()\n" " if not parsed: return G\n" " \n" " def add_node(name, typ):\n" " if name and isinstance(name, str) and name.strip():\n" " G.add_node(name.strip(), type=typ)\n" " \n" " dc = parsed.get('data_controller') or {}\n" " if isinstance(dc, dict) and dc.get('name'):\n" " add_node(dc['name'], 'organization')\n" " if dc.get('address'):\n" " add_node(dc['address'], 'location')\n" " G.add_edge(dc['name'].strip(), dc['address'].strip(), kind='located_in')\n" " dpo = parsed.get('dpo_contact') or {}\n" " if isinstance(dpo, dict) and dpo.get('email'):\n" " add_node(dpo['email'], 'email')\n" " if isinstance(dc, dict) and dc.get('name'):\n" " G.add_edge(dpo['email'].strip(), dc['name'].strip(), kind='dpo_of')\n" " for cat in parsed.get('data_categories', []) or []:\n" " if isinstance(cat, str) and cat.strip():\n" " add_node(cat, 'data_category')\n" " for r in parsed.get('rights_listed', []) or []:\n" " if isinstance(r, str) and r.strip():\n" " add_node(r, 'right')\n" " for a in parsed.get('authorities_mentioned', []) or []:\n" " if isinstance(a, dict) and a.get('name'):\n" " add_node(a['name'], 'authority')\n" " if a.get('url_or_contact'):\n" " add_node(a['url_or_contact'], 'url')\n" " G.add_edge(a['name'].strip(), a['url_or_contact'].strip(), kind='contact')\n" " for l in parsed.get('laws_mentioned', []) or []:\n" " if isinstance(l, str) and l.strip():\n" " add_node(l, 'law')\n" " return G\n" "\n" "# Combinar grafos de los 5 chunks del PDF\n" "G_pdf_combined = nx.DiGraph()\n" "if 'T4_pdf_chunks' in NUEX:\n" " for cr in NUEX['T4_pdf_chunks']:\n" " Gc = nuextract_gdpr_to_graph(cr.get('parsed'))\n" " for n, d in Gc.nodes(data=True):\n" " if n not in G_pdf_combined:\n" " G_pdf_combined.add_node(n, **d)\n" " for u, v, d in Gc.edges(data=True):\n" " G_pdf_combined.add_edge(u, v, **d)\n" "print(f'NuExtract PDF (5 chunks combinados): {G_pdf_combined.number_of_nodes()} nodos, {G_pdf_combined.number_of_edges()} aristas')" )) cells.append(_code( "PDF_TYPE_COLOR = {'organization':'#F17CB0','person':'#5DA5DA','location':'#60BD68',\n" " 'email':'#FAA43A','authority':'#7C7C7C','right':'#B276B2',\n" " 'data_category':'#DECF3F','law':'#F15854','url':'#DECF3F'}\n" "\n" "def draw_typed(ax, G, title, type_color):\n" " if G.number_of_nodes() == 0:\n" " ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n" " pos = nx.spring_layout(G, k=2.0, iterations=80, seed=42)\n" " cols = [type_color.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n" " nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1500, edgecolors='#333', linewidths=1.2, ax=ax)\n" " labels = {n: (n if len(n) <= 22 else n[:21]+'…') for n in G.nodes}\n" " nx.draw_networkx_labels(G, pos, labels=labels, font_size=7, font_weight='bold', ax=ax)\n" " nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=10, width=0.9, alpha=0.6, ax=ax, connectionstyle='arc3,rad=0.08')\n" " el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n" " nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=5.5, ax=ax,\n" " bbox=dict(boxstyle='round,pad=0.05', fc='white', ec='none', alpha=0.85))\n" " ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=10)\n" " ax.axis('off')\n" "\n" "fig, axes = plt.subplots(1, 2, figsize=(20, 11))\n" "draw_typed(axes[0], G_pdf_combined, 'NuExtract GPU\\nPDF — 5 chunks combinados', PDF_TYPE_COLOR)\n" "\n" "# GLiNER2 sobre el PDF entero (97 chunks) ya esta en GLNR — config B post-coref\n" "# Si tenemos el grafo post-coref no esta en este JSON. Reconstruimos de lo que hay.\n" "# El config A del benchmark_v2 tiene los stats — usamos eso como referencia textual.\n" "axes[1].axis('off')\n" "axes[1].text(0.05, 0.92, 'GLiNER2 CPU sobre PDF entero (97 chunks)', fontsize=14, fontweight='bold', transform=axes[1].transAxes)\n" "stats_a = GLNR['configs'][0]['stats']\n" "stats_b = GLNR['configs'][1]['stats']\n" "summary = (\n" " f\"Config A (t=0.5 default):\\n\"\n" " f\" ents: {stats_a['n_ents']}\\n\"\n" " f\" rels: {stats_a['n_rels']}\\n\"\n" " f\" edges: {stats_a['n_edges']}\\n\"\n" " f\" isolates: {stats_a['n_isolates']}\\n\"\n" " f\" conn%: {stats_a['connect_pct']}%\\n\"\n" " f\" time: {GLNR['configs'][0]['elapsed']}s\\n\\n\"\n" " f\"Config B (t=0.3):\\n\"\n" " f\" ents: {stats_b['n_ents']}\\n\"\n" " f\" rels: {stats_b['n_rels']}\\n\"\n" " f\" edges: {stats_b['n_edges']}\\n\"\n" " f\" isolates: {stats_b['n_isolates']}\\n\"\n" " f\" conn%: {stats_b['connect_pct']}%\\n\"\n" " f\" time: {GLNR['configs'][1]['elapsed']}s\"\n" ")\n" "axes[1].text(0.05, 0.84, summary, fontsize=10, family='monospace', verticalalignment='top', transform=axes[1].transAxes)\n" "\n" "active = {G_pdf_combined.nodes[n].get('type') for n in G_pdf_combined.nodes}\n" "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in PDF_TYPE_COLOR.items() if t in active]\n" "axes[0].legend(handles=legend, loc='upper left', fontsize=8)\n" "plt.tight_layout(); plt.show()" )) cells.append(_md( "## 9. Conclusion — cuando usar cada uno\n\n" "**Datos mas relevantes** (PDF de 89.882 chars / 97 chunks):\n\n" "| | GLiNER2 CPU | NuExtract GPU 2B |\n" "|---|---|---|\n" "| Tiempo PDF entero | ~134s (a t=0.5) / ~139s (t=0.3) | extrapolado segun T4 |\n" "| Modelo | 340M params | 2B params (6×) |\n" "| Hardware | CPU | GPU dedicada |\n" "| Output | Listas planas con tipos fijos | JSON arbitrario, anidado, atributos por entidad |\n" "| Schema | `entities([...]).relations([...])` (palabras claves) | Plantilla JSON cualquiera (`{org: {ceo, ...}}`) |\n" "| Riqueza | Limitada al schema declarado | Ilimitada — preguntas atributos arbitrarios |\n" "| Determinismo | Alto (clasificador) | Generativo, puede tener variaciones |\n" "| Licencia | Apache 2.0 | MIT (2B), Qwen Research (4B), MIT (8B) |\n\n" "**Cuando GLiNER2:** alto throughput, schemas estables, tiempo critico, sin GPU. **Robusto frente a texto largo** (no degenera).\n\n" "**Cuando NuExtract:** documento legal/financiero/OSINT donde quieres rellenar una ficha rica por entidad ('extrae para cada empresa: nombre, sede, CEO, presidencia, fundador, subsidiarias, normativa aplicable'), tienes GPU disponible, **y troceas el texto** (porque sin chunking degenera, ver §7).\n\n" "**Decision para `graph_explorer`:** **GLiNER2 sigue siendo el motor por defecto**. Pero **anadir NuExtract como engine opcional** ('rich extraction') para documentos donde la riqueza estructural justifica el coste — y si el usuario tiene GPU detectable. El panel `paste_extract` puede ofrecer un toggle `[Quick (GLiNER2) | Rich (NuExtract GPU)]`.\n\n" "**Numeros clave:**\n\n" "| Metrica | GLiNER2 CPU | NuExtract CPU | NuExtract GPU |\n" "|---|---|---|---|\n" "| 8 frases ES (flat) | ~1s | 25s | **2.9s** |\n" "| 8 frases ES (rich) | n/a (schema flat) | 117s | **9.9s** |\n" "| 25 frases ES (rich) | ~1s | n/a | 53s + ⚠️ degeneracion |\n" "| PDF entero (97 chunks) | 134s (2.2 min) | (estimado >2h) | 310s (5.2 min) — 2.3× mas lento |\n" "| Modelo | 340M params, 700 MB disco | 2B params, 4 GB disco | mismo, BF16 |\n" "| Speedup CPU→GPU | n/a | n/a | **8-12×** |" )) nb = nbf.v4.new_notebook() nb.cells = cells nb.metadata = { "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python"}, } NB_PATH.parent.mkdir(parents=True, exist_ok=True) nbf.write(nb, NB_PATH) print(f"[done] {NB_PATH} cells={len(cells)}") if __name__ == "__main__": build()