# gliner_glirel_tuning/build_notebook_nuextract.py

"""Build notebooks/07_nuextract_vs_gliner2.ipynb — the full comparison notebook.

The generated notebook loads data from:
  - nuextract_results.json  (NuExtract 2.0-2B on GPU + CPU baseline)
  - benchmark_v2.json       (GLiNER2 benchmark, loaded as GLNR_CORPUS)
  - improvements.json       (GLiNER2 run on the PDF, loaded as GLNR; its
                             `configs` entries feed the timing/stat tables)

It builds graphs from NuExtract's nested JSON (nested → edges) and compares
them side by side with the GLiNER2 graphs: number of nodes, number of edges,
time per extraction, and qualitative quality.
"""
from __future__ import annotations

import json
from pathlib import Path

import nbformat as nbf

# Directory containing this script; the notebook path is resolved relative to it.
HERE = Path(__file__).resolve().parent
# Destination of the generated notebook (written by build()).
NB_PATH = HERE / "notebooks" / "07_nuextract_vs_gliner2.ipynb"


def _md(text: str):
    """Return a new nbformat v4 markdown cell whose source is *text*."""
    make_markdown_cell = nbf.v4.new_markdown_cell
    return make_markdown_cell(text)


def _code(src: str):
    """Return a new nbformat v4 code cell for *src* in an unexecuted state.

    The cell's execution count is set to None and its outputs to an empty
    list before it is returned.
    """
    code_cell = nbf.v4.new_code_cell(src)
    code_cell.execution_count = None
    code_cell.outputs = []
    return code_cell


def build():
    """Assemble the NuExtract-vs-GLiNER2 comparison notebook and write it to NB_PATH.

    The notebook is an ordered list of markdown and code cells whose sources
    are embedded below as string literals (the notebook text is in Spanish and
    is emitted verbatim). The code cells are not executed here; they run when
    the notebook is opened, and they read ``../nuextract_results.json``,
    ``../benchmark_v2.json`` and ``../improvements.json`` relative to the
    notebook's working directory.

    Side effects: creates ``NB_PATH.parent`` if needed, writes the notebook
    file, and prints a one-line summary with the number of cells.
    """
    cells = []

    # --- Title / motivation -------------------------------------------------
    cells.append(_md(
        "# NuExtract 2.0-2B (GPU) vs GLiNER2 — comparativa con visualizacion\n\n"
        "**Pregunta:** ¿merece la pena un LLM con inferencia (NuExtract 2.0) en un proyecto donde "
        "antes elegimos GLiNER2 por velocidad?\n\n"
        "**Setup:**\n"
        "- NuExtract 2.0-2B (Qwen2-VL-2B base, **MIT license**, 2B params, GPU BF16 sobre RTX 3070).\n"
        "- GLiNER2-large-v1 (Apache 2.0, 340M params, CPU).\n"
        "- Mismos corpora: `es_corporate_short` (8 frases), `LONG_TEXT_ES` (25 frases), 5 chunks del PDF de BBVA.\n\n"
        "**Diferencia de paradigma:**\n"
        "- **GLiNER2** = clasificador. Output: listas planas `{entities: {tipo: [names]}, relations: {tipo: [(h, t)]}}`.\n"
        "- **NuExtract** = LLM generativo. Output: JSON arbitrario que tu defines en el `template`. Las relaciones se modelan como atributos de los objetos (`{org: {ceo: \"X\", headquartered_in: \"Y\"}}`).\n\n"
        "**Hipotesis:** NuExtract gana en _riqueza estructural_ (atributos por entidad de un solo paso) pero pierde en velocidad — incluso con GPU."
    ))

    # --- 1. Setup cell: imports, data loading, re-parsing of raw model output
    cells.append(_md("## 1. Setup"))

    # NOTE(review): the setup cell hard-codes an absolute, machine-specific
    # path (`_pf`); the generated notebook is only portable where it exists.
    # NOTE(review): later cell comments say the GLiNER2 `configs` come from
    # benchmark_v2.json, but GLNR is loaded from improvements.json and
    # GLNR_CORPUS is never used afterwards — confirm which file is intended.
    # `reparse` uses JSONDecoder.raw_decode to read the first complete JSON
    # object starting at the first '{' in a single pass, instead of retrying
    # json.loads on every possible suffix length.
    cells.append(_code(
        "import os, sys, json, warnings\n"
        "warnings.filterwarnings('ignore')\n"
        "from pathlib import Path\n"
        "from collections import defaultdict\n"
        "\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "\n"
        "NUEX = json.loads(Path('../nuextract_results.json').read_text())\n"
        "\n"
        "# Re-parsear el raw_text de cada test con un parser corregido (el original\n"
        "# del script usaba rfind y solo capturaba el ultimo objeto pequeño).\n"
        "def reparse(text):\n"
        "    if not text: return None\n"
        "    s = text.find('{')\n"
        "    if s < 0: return None\n"
        "    try: return json.JSONDecoder().raw_decode(text, s)[0]\n"
        "    except (ValueError, RecursionError): return None\n"
        "for key in ['T1_corp_short_flat', 'T2_corp_short_rich', 'T3_long_text_rich']:\n"
        "    if key in NUEX:\n"
        "        NUEX[key]['parsed'] = reparse(NUEX[key].get('raw_text', ''))\n"
        "for cr in NUEX.get('T4_pdf_chunks', []):\n"
        "    cr['parsed'] = reparse(cr.get('raw_text', ''))\n"
        "GLNR_CORPUS = json.loads(Path('../benchmark_v2.json').read_text())     # GLiNER2 sobre 4 corpora\n"
        "GLNR = json.loads(Path('../improvements.json').read_text())            # GLiNER2 sobre PDF + improvements\n"
        "print('NuExtract keys:', list(NUEX.keys()))\n"
        "print('GLiNER2 keys: ', list(GLNR.keys()))\n"
        "print()\n"
        "print('NuExtract device:', NUEX['meta']['device'], NUEX['meta']['dtype'])"
    ))

    # --- 2. Timing table ----------------------------------------------------
    cells.append(_md(
        "## 2. Tabla de tiempos — CPU vs GPU vs GLiNER2\n\n"
        "Comparamos las 4 pasadas (T1-T4) de NuExtract contra GLiNER2 sobre los mismos corpora."
    ))

    cells.append(_code(
        "# Construir tabla de tiempos\n"
        "rows = []\n"
        "\n"
        "# CPU baseline (capturado del run anterior)\n"
        "cpu = NUEX.get('cpu_baseline', {})\n"
        "if 'T1_flat' in cpu:\n"
        "    rows.append({'test': 'T1 corp_short flat', 'engine': 'NuExtract CPU', 'time_s': cpu['T1_flat']['elapsed_s'],\n"
        "                  'in_tok': cpu['T1_flat']['in_tok'], 'out_tok': cpu['T1_flat']['out_tok']})\n"
        "if 'T2_rich' in cpu:\n"
        "    rows.append({'test': 'T2 corp_short rich', 'engine': 'NuExtract CPU', 'time_s': cpu['T2_rich']['elapsed_s'],\n"
        "                  'in_tok': cpu['T2_rich']['in_tok'], 'out_tok': cpu['T2_rich']['out_tok']})\n"
        "\n"
        "# GPU (este run)\n"
        "for key, label in [('T1_corp_short_flat',  'T1 corp_short flat'),\n"
        "                    ('T2_corp_short_rich',  'T2 corp_short rich'),\n"
        "                    ('T3_long_text_rich',   'T3 long_text rich')]:\n"
        "    if key in NUEX:\n"
        "        r = NUEX[key]\n"
        "        rows.append({'test': label, 'engine': 'NuExtract GPU', 'time_s': r['elapsed_s'],\n"
        "                      'in_tok': r['n_input_tokens'], 'out_tok': r['n_output_tokens']})\n"
        "\n"
        "# GLiNER2 baseline timings (de benchmark_v2.json — el config A es el equivalente)\n"
        "# A es el flat schema sobre 97 chunks PDF — para comparar con T4 PDF\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU', 'time_s': GLNR['configs'][0]['elapsed'],\n"
        "              'in_tok': '-', 'out_tok': '-'})\n"
        "rows.append({'test': 'PDF (97 chunks)', 'engine': 'GLiNER2 CPU t=0.3', 'time_s': GLNR['configs'][1]['elapsed'],\n"
        "              'in_tok': '-', 'out_tok': '-'})\n"
        "\n"
        "df_times = pd.DataFrame(rows)\n"
        "df_times"
    ))

    # --- 3. Per-chunk PDF timings and full-PDF extrapolation ----------------
    cells.append(_md(
        "## 3. Tiempos sobre el PDF — extrapolacion\n\n"
        "5 chunks de muestra → estimacion del PDF completo."
    ))

    cells.append(_code(
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    chunk_rows = []\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        chunk_rows.append({\n"
        "            'chunk_idx': cr['chunk_idx'],\n"
        "            'input_chars': cr['input_chars'],\n"
        "            'time_s': cr['elapsed_s'],\n"
        "            'in_tok': cr['n_input_tokens'],\n"
        "            'out_tok': cr['n_output_tokens'],\n"
        "        })\n"
        "    df_chunks = pd.DataFrame(chunk_rows)\n"
        "    print('NuExtract GPU sobre 5 chunks del PDF:')\n"
        "    print(df_chunks)\n"
        "    print()\n"
        "    if 'full_pdf_extrapolation' in NUEX:\n"
        "        e = NUEX['full_pdf_extrapolation']\n"
        "        print(f\"Extrapolacion PDF entero ({e['n_chunks']} chunks):\")\n"
        "        print(f\"  NuExtract GPU: {e['estimated_total_s']:.0f}s = {e['estimated_total_min']:.1f} min\")\n"
        "        print(f\"  GLiNER2 CPU baseline: {GLNR['configs'][0]['elapsed']:.0f}s = {GLNR['configs'][0]['elapsed']/60:.1f} min\")\n"
        "        ratio = e['estimated_total_s'] / GLNR['configs'][0]['elapsed']\n"
        "        print(f\"  ratio NuExtract/GLiNER2: {ratio:.1f}x\")\n"
        "else:\n"
        "    print('T4_pdf_chunks no presente todavia')"
    ))

    # --- 4. Raw structure of the NuExtract output ---------------------------
    cells.append(_md(
        "## 4. Estructura del output — paradigmas distintos\n\n"
        "**NuExtract** rellena el template JSON. Lo que pidas, sale (si existe en el texto)."
    ))

    # raw_text is read with .get because the setup cell also treats it as optional.
    cells.append(_code(
        "# Mostrar el JSON parseado de T2 (rich corporate sobre 8 frases ES)\n"
        "print('=== NuExtract T2 — schema rich corporate sobre es_corporate_short ===')\n"
        "if 'T2_corp_short_rich' in NUEX:\n"
        "    parsed = NUEX['T2_corp_short_rich'].get('parsed')\n"
        "    if parsed:\n"
        "        print(json.dumps(parsed, indent=2, ensure_ascii=False))\n"
        "    else:\n"
        "        print('parsed = None  (raw text:)')\n"
        "        print((NUEX['T2_corp_short_rich'].get('raw_text') or '')[:1500])"
    ))

    # --- 5. Nested JSON → graph ---------------------------------------------
    cells.append(_md("## 5. Convertir el JSON anidado de NuExtract a un grafo"))

    # The converter validates every attribute once (add_node returns the
    # stripped name or '') so a non-string or whitespace-only value can neither
    # raise on .strip() nor create an edge to an untyped '' node. The T2 lookup
    # uses .get because the setup cell treats the key as optional.
    cells.append(_code(
        "def nuextract_corp_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Convierte el output de schema_rich_corporate a un DiGraph.\n"
        "\n"
        "    Mapeo:\n"
        "      org.name → nodo (type=organization)\n"
        "      org.ceo → nodo (type=person), arista person --ceo_of--> org\n"
        "      org.chairman_president → nodo, arista --president_of--> org\n"
        "      org.headquartered_in → nodo (type=location), arista org --headquartered_in--> loc\n"
        "      org.subsidiaries[] → cada sub: nodo + arista sub --subsidiary_of--> org\n"
        "      org.parent_company → nodo + arista org --subsidiary_of--> parent\n"
        "      person.name → nodo, person --role--> organization\n"
        "      agreement.between[] → entre cada par, arista A --agreement_with--> B\n"
        "    \"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        "    def add_node(name, typ):\n"
        "        name = name.strip() if isinstance(name, str) else ''\n"
        "        if name:\n"
        "            G.add_node(name, type=typ)\n"
        "        return name\n"
        "\n"
        "    def as_list(value):\n"
        "        return value if isinstance(value, list) else []\n"
        "\n"
        "    for org in as_list(parsed.get('organizations')):\n"
        "        if not isinstance(org, dict): continue\n"
        "        oname = add_node(org.get('name'), 'organization')\n"
        "        if not oname: continue\n"
        "        ceo = add_node(org.get('ceo'), 'person')\n"
        "        if ceo: G.add_edge(ceo, oname, kind='ceo_of')\n"
        "        pres = add_node(org.get('chairman_president'), 'person')\n"
        "        if pres: G.add_edge(pres, oname, kind='president_of')\n"
        "        hq = add_node(org.get('headquartered_in'), 'location')\n"
        "        if hq: G.add_edge(oname, hq, kind='headquartered_in')\n"
        "        parent = add_node(org.get('parent_company'), 'organization')\n"
        "        if parent: G.add_edge(oname, parent, kind='subsidiary_of')\n"
        "        for sub in as_list(org.get('subsidiaries')):\n"
        "            sub = add_node(sub, 'organization')\n"
        "            if sub: G.add_edge(sub, oname, kind='subsidiary_of')\n"
        "\n"
        "    for p in as_list(parsed.get('people')):\n"
        "        if not isinstance(p, dict): continue\n"
        "        pname = add_node(p.get('name'), 'person')\n"
        "        if not pname: continue\n"
        "        role = p.get('role')\n"
        "        role = role.strip() if isinstance(role, str) else ''\n"
        "        org = add_node(p.get('organization'), 'organization')\n"
        "        if org:\n"
        "            # role es texto libre, lo metemos como kind\n"
        "            kind = role.lower().replace(' ', '_')[:30] if role else 'works_at'\n"
        "            G.add_edge(pname, org, kind=kind)\n"
        "\n"
        "    for ag in as_list(parsed.get('agreements')):\n"
        "        if not isinstance(ag, dict): continue\n"
        "        parties = [p.strip() for p in as_list(ag.get('between')) if isinstance(p, str) and p.strip()]\n"
        "        if len(parties) < 2: continue\n"
        "        for i, a in enumerate(parties):\n"
        "            for b in parties[i+1:]:\n"
        "                G.add_edge(a, b, kind='agreement_with')\n"
        "\n"
        "    return G\n"
        "\n"
        "G_nuext_t2 = nuextract_corp_to_graph(NUEX.get('T2_corp_short_rich', {}).get('parsed'))\n"
        "print(f'NuExtract T2 grafo: {G_nuext_t2.number_of_nodes()} nodos, {G_nuext_t2.number_of_edges()} aristas')"
    ))

    # --- 6. Side-by-side drawing (NuExtract vs hard-coded GLiNER2 result) ---
    cells.append(_md("## 6. Visualizacion lado a lado — 8 frases ES corporate"))

    cells.append(_code(
        "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n"
        "\n"
        "def draw(ax, G, title, max_label=20):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n"
        "    cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1700, edgecolors='#333', linewidths=1.3, ax=ax)\n"
        "    labels = {n: (n if len(n) <= max_label else n[:max_label-1]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7.5, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=12, width=1.0, alpha=0.65, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6, ax=ax,\n"
        "                                  bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=11)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 9))\n"
        "draw(axes[0], G_nuext_t2, 'NuExtract 2.0-2B GPU\\n(8 frases, schema rich)')\n"
        "\n"
        "# Para GLiNER2 sobre el mismo texto, no tenemos benchmark v2 sobre es_corporate_short directamente.\n"
        "# Notebook 04 dejo es_corporate_short con 14 ents + 8 rels via gliner2. Hardcodeamos del notebook 04 para comparar.\n"
        "G_gliner2_t2 = nx.DiGraph()\n"
        "_gliner2_short = {  # del notebook 04 (es_corporate_short)\n"
        "    'entities': {'person': ['Ignacio Galan','Carlos Torres','Pablo Isla','Jose Maria Alvarez-Pallete','Marina Serrano'],\n"
        "                  'organization': ['Iberdrola','Inditex','Endesa','BBVA'],\n"
        "                  'location': ['Bilbao','Galicia','Madrid','Arteixo','A Coruna']},\n"
        "    'relations': [('Pablo Isla','works_at','Inditex'),\n"
        "                   ('Pablo Isla','appointed_as','consejero de Telefonica'),\n"
        "                   ('Marina Serrano','ceo_of','Endesa'),\n"
        "                   ('Ignacio Galan','president_of','Iberdrola'),\n"
        "                   ('Inditex','headquartered_in','Arteixo, A Coruna'),\n"
        "                   ('Iberdrola','agreement_with','Endesa'),\n"
        "                   ('Inditex','acquired','Pablo Isla')],\n"
        "}\n"
        "for typ, names in _gliner2_short['entities'].items():\n"
        "    for n in names: G_gliner2_t2.add_node(n, type=typ)\n"
        "for h, k, t in _gliner2_short['relations']:\n"
        "    if h not in G_gliner2_t2: G_gliner2_t2.add_node(h, type='?')\n"
        "    if t not in G_gliner2_t2: G_gliner2_t2.add_node(t, type='?')\n"
        "    G_gliner2_t2.add_edge(h, t, kind=k)\n"
        "draw(axes[1], G_gliner2_t2, 'GLiNER2 CPU\\n(8 frases, baseline notebook 04)')\n"
        "\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md(
        "**Lectura del lado a lado:**\n\n"
        "- **NuExtract** captura **atributos por entidad** (cada org tiene su `ceo`, `headquartered_in`, etc) en una sola pasada — el grafo se construye 'gratis' a partir del JSON anidado.\n"
        "- **GLiNER2** extrae listas planas — el grafo emerge de las relaciones tipadas, pero a veces faltan atributos (no captura `parent_company`, `subsidiaries` directamente sin esos labels en el schema).\n"
        "- Ambos tienen calidad alta en este corpus pequeño. Diferencia mas notable: NuExtract tiene mas dificultad con relaciones cruzadas (Iberdrola-Endesa) que GLiNER2 capta como `agreement_with`."
    ))

    # --- 7. Long text (T3) ---------------------------------------------------
    cells.append(_md(
        "## 7. Long text (25 frases sector bancario) — NuExtract\n\n"
        "**⚠️ Hallazgo importante:** En este test (T3), NuExtract **degenero en bucle de repeticion** y "
        "agoto los 2048 max_new_tokens emitiendo `{\"between\": [\"BBVA\", \"Sabadell\"], \"topic\": \"OPA parcial\"...}` "
        "repetido decenas de veces. El JSON resultante esta corrupto y `parsed = None`.\n\n"
        "**Causa probable:** texto demasiado largo (400 words / ~952 tokens input + schema rico) sin `repetition_penalty`.\n"
        "Mitigacion: anadir `repetition_penalty=1.1`, `do_sample=True, temperature=0.1`, o **trocear** el texto en chunks de ~150 words y agregar (mismo patron que GLiNER2).\n\n"
        "**Implicacion operativa:** NuExtract requiere chunking SIEMPRE para texto medio-largo. GLiNER2 _tambien_ chunkea pero al menos no degenera — sigue extrayendo entidades correctas aunque baje recall."
    ))

    # The T3 entry is looked up with .get (optional key), non-dict organization
    # entries are skipped, and the name is passed through str() because the
    # ':30s' format spec raises TypeError when applied to None.
    cells.append(_code(
        "_t3 = NUEX.get('T3_long_text_rich', {})\n"
        "G_nuext_long = nuextract_corp_to_graph(_t3.get('parsed'))\n"
        "print(f'NuExtract T3 long_text: {G_nuext_long.number_of_nodes()} nodos, {G_nuext_long.number_of_edges()} aristas')\n"
        "print()\n"
        "print('Top entidades del JSON parseado:')\n"
        "parsed = _t3.get('parsed') or {}\n"
        "if parsed.get('organizations'):\n"
        "    print(f\"  Organizations: {len(parsed['organizations'])}\")\n"
        "    for o in parsed['organizations'][:8]:\n"
        "        if not isinstance(o, dict): continue\n"
        "        print(f\"    {str(o.get('name')):30s}  ceo={o.get('ceo')}  pres={o.get('chairman_president')}  hq={o.get('headquartered_in')}\")\n"
        "if parsed.get('people'):\n"
        "    print(f\"  People: {len(parsed['people'])}\")\n"
        "if parsed.get('agreements'):\n"
        "    print(f\"  Agreements: {len(parsed['agreements'])}\")"
    ))

    cells.append(_code(
        "fig, ax = plt.subplots(figsize=(15, 11))\n"
        "draw(ax, G_nuext_long, 'NuExtract 2.0-2B GPU\\nLONG_TEXT_ES (25 frases sector bancario)', max_label=22)\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "ax.legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    # --- 8. PDF sample chunks (GDPR schema) ---------------------------------
    cells.append(_md("## 8. PDF (5 chunks de muestra)"))

    # Same validation approach as the corporate converter: add_node returns the
    # cleaned name, and edges are only created between names that passed it.
    cells.append(_code(
        "def nuextract_gdpr_to_graph(parsed: dict) -> nx.DiGraph:\n"
        "    \"\"\"Schema GDPR: data_controller / dpo_contact / data_categories / rights / authorities / laws.\"\"\"\n"
        "    G = nx.DiGraph()\n"
        "    if not isinstance(parsed, dict): return G\n"
        "\n"
        "    def add_node(name, typ):\n"
        "        name = name.strip() if isinstance(name, str) else ''\n"
        "        if name:\n"
        "            G.add_node(name, type=typ)\n"
        "        return name\n"
        "\n"
        "    def as_list(value):\n"
        "        return value if isinstance(value, list) else []\n"
        "\n"
        "    dc = parsed.get('data_controller')\n"
        "    dc_name = add_node(dc.get('name'), 'organization') if isinstance(dc, dict) else ''\n"
        "    if dc_name:\n"
        "        addr = add_node(dc.get('address'), 'location')\n"
        "        if addr: G.add_edge(dc_name, addr, kind='located_in')\n"
        "    dpo = parsed.get('dpo_contact')\n"
        "    if isinstance(dpo, dict):\n"
        "        email = add_node(dpo.get('email'), 'email')\n"
        "        if email and dc_name: G.add_edge(email, dc_name, kind='dpo_of')\n"
        "    for cat in as_list(parsed.get('data_categories')):\n"
        "        add_node(cat, 'data_category')\n"
        "    for r in as_list(parsed.get('rights_listed')):\n"
        "        add_node(r, 'right')\n"
        "    for a in as_list(parsed.get('authorities_mentioned')):\n"
        "        if not isinstance(a, dict): continue\n"
        "        aname = add_node(a.get('name'), 'authority')\n"
        "        if not aname: continue\n"
        "        contact = add_node(a.get('url_or_contact'), 'url')\n"
        "        if contact: G.add_edge(aname, contact, kind='contact')\n"
        "    for l in as_list(parsed.get('laws_mentioned')):\n"
        "        add_node(l, 'law')\n"
        "    return G\n"
        "\n"
        "# Combinar grafos de los 5 chunks del PDF\n"
        "G_pdf_combined = nx.DiGraph()\n"
        "if 'T4_pdf_chunks' in NUEX:\n"
        "    for cr in NUEX['T4_pdf_chunks']:\n"
        "        Gc = nuextract_gdpr_to_graph(cr.get('parsed'))\n"
        "        for n, d in Gc.nodes(data=True):\n"
        "            if n not in G_pdf_combined:\n"
        "                G_pdf_combined.add_node(n, **d)\n"
        "        for u, v, d in Gc.edges(data=True):\n"
        "            G_pdf_combined.add_edge(u, v, **d)\n"
        "print(f'NuExtract PDF (5 chunks combinados): {G_pdf_combined.number_of_nodes()} nodos, {G_pdf_combined.number_of_edges()} aristas')"
    ))

    # 'url' previously shared '#DECF3F' with 'data_category', making the two
    # node types indistinguishable in the plot and legend; it now has its own colour.
    cells.append(_code(
        "PDF_TYPE_COLOR = {'organization':'#F17CB0','person':'#5DA5DA','location':'#60BD68',\n"
        "                  'email':'#FAA43A','authority':'#7C7C7C','right':'#B276B2',\n"
        "                  'data_category':'#DECF3F','law':'#F15854','url':'#B2912F'}\n"
        "\n"
        "def draw_typed(ax, G, title, type_color):\n"
        "    if G.number_of_nodes() == 0:\n"
        "        ax.set_title(f'{title} (empty)'); ax.axis('off'); return\n"
        "    pos = nx.spring_layout(G, k=2.0, iterations=80, seed=42)\n"
        "    cols = [type_color.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "    nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1500, edgecolors='#333', linewidths=1.2, ax=ax)\n"
        "    labels = {n: (n if len(n) <= 22 else n[:21]+'…') for n in G.nodes}\n"
        "    nx.draw_networkx_labels(G, pos, labels=labels, font_size=7, font_weight='bold', ax=ax)\n"
        "    nx.draw_networkx_edges(G, pos, edge_color='#888', arrows=True, arrowsize=10, width=0.9, alpha=0.6, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "    el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "    nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=5.5, ax=ax,\n"
        "                                  bbox=dict(boxstyle='round,pad=0.05', fc='white', ec='none', alpha=0.85))\n"
        "    ax.set_title(f'{title}: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=10)\n"
        "    ax.axis('off')\n"
        "\n"
        "fig, axes = plt.subplots(1, 2, figsize=(20, 11))\n"
        "draw_typed(axes[0], G_pdf_combined, 'NuExtract GPU\\nPDF — 5 chunks combinados', PDF_TYPE_COLOR)\n"
        "\n"
        "# GLiNER2 sobre el PDF entero (97 chunks) ya esta en GLNR — config B post-coref\n"
        "# Si tenemos el grafo post-coref no esta en este JSON. Reconstruimos de lo que hay.\n"
        "# El config A del benchmark_v2 tiene los stats — usamos eso como referencia textual.\n"
        "axes[1].axis('off')\n"
        "axes[1].text(0.05, 0.92, 'GLiNER2 CPU sobre PDF entero (97 chunks)', fontsize=14, fontweight='bold', transform=axes[1].transAxes)\n"
        "stats_a = GLNR['configs'][0]['stats']\n"
        "stats_b = GLNR['configs'][1]['stats']\n"
        "summary = (\n"
        "    f\"Config A (t=0.5 default):\\n\"\n"
        "    f\"  ents:     {stats_a['n_ents']}\\n\"\n"
        "    f\"  rels:     {stats_a['n_rels']}\\n\"\n"
        "    f\"  edges:    {stats_a['n_edges']}\\n\"\n"
        "    f\"  isolates: {stats_a['n_isolates']}\\n\"\n"
        "    f\"  conn%:    {stats_a['connect_pct']}%\\n\"\n"
        "    f\"  time:     {GLNR['configs'][0]['elapsed']}s\\n\\n\"\n"
        "    f\"Config B (t=0.3):\\n\"\n"
        "    f\"  ents:     {stats_b['n_ents']}\\n\"\n"
        "    f\"  rels:     {stats_b['n_rels']}\\n\"\n"
        "    f\"  edges:    {stats_b['n_edges']}\\n\"\n"
        "    f\"  isolates: {stats_b['n_isolates']}\\n\"\n"
        "    f\"  conn%:    {stats_b['connect_pct']}%\\n\"\n"
        "    f\"  time:     {GLNR['configs'][1]['elapsed']}s\"\n"
        ")\n"
        "axes[1].text(0.05, 0.84, summary, fontsize=10, family='monospace', verticalalignment='top', transform=axes[1].transAxes)\n"
        "\n"
        "active = {G_pdf_combined.nodes[n].get('type') for n in G_pdf_combined.nodes}\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in PDF_TYPE_COLOR.items() if t in active]\n"
        "axes[0].legend(handles=legend, loc='upper left', fontsize=8)\n"
        "plt.tight_layout(); plt.show()"
    ))

    # --- 9. Conclusions (static markdown; figures are hard-coded text) ------
    cells.append(_md(
        "## 9. Conclusion — cuando usar cada uno\n\n"
        "**Datos mas relevantes** (PDF de 89.882 chars / 97 chunks):\n\n"
        "| | GLiNER2 CPU | NuExtract GPU 2B |\n"
        "|---|---|---|\n"
        "| Tiempo PDF entero | ~134s (a t=0.5) / ~139s (t=0.3) | extrapolado segun T4 |\n"
        "| Modelo | 340M params | 2B params (6×) |\n"
        "| Hardware | CPU | GPU dedicada |\n"
        "| Output | Listas planas con tipos fijos | JSON arbitrario, anidado, atributos por entidad |\n"
        "| Schema | `entities([...]).relations([...])` (palabras claves) | Plantilla JSON cualquiera (`{org: {ceo, ...}}`) |\n"
        "| Riqueza | Limitada al schema declarado | Ilimitada — preguntas atributos arbitrarios |\n"
        "| Determinismo | Alto (clasificador) | Generativo, puede tener variaciones |\n"
        "| Licencia | Apache 2.0 | MIT (2B), Qwen Research (4B), MIT (8B) |\n\n"
        "**Cuando GLiNER2:** alto throughput, schemas estables, tiempo critico, sin GPU. **Robusto frente a texto largo** (no degenera).\n\n"
        "**Cuando NuExtract:** documento legal/financiero/OSINT donde quieres rellenar una ficha rica por entidad ('extrae para cada empresa: nombre, sede, CEO, presidencia, fundador, subsidiarias, normativa aplicable'), tienes GPU disponible, **y troceas el texto** (porque sin chunking degenera, ver §7).\n\n"
        "**Decision para `graph_explorer`:** **GLiNER2 sigue siendo el motor por defecto**. Pero **anadir NuExtract como engine opcional** ('rich extraction') para documentos donde la riqueza estructural justifica el coste — y si el usuario tiene GPU detectable. El panel `paste_extract` puede ofrecer un toggle `[Quick (GLiNER2) | Rich (NuExtract GPU)]`.\n\n"
        "**Numeros clave:**\n\n"
        "| Metrica | GLiNER2 CPU | NuExtract CPU | NuExtract GPU |\n"
        "|---|---|---|---|\n"
        "| 8 frases ES (flat) | ~1s | 25s | **2.9s** |\n"
        "| 8 frases ES (rich) | n/a (schema flat) | 117s | **9.9s** |\n"
        "| 25 frases ES (rich) | ~1s | n/a | 53s + ⚠️ degeneracion |\n"
        "| PDF entero (97 chunks) | 134s (2.2 min) | (estimado >2h) | 310s (5.2 min) — 2.3× mas lento |\n"
        "| Modelo | 340M params, 700 MB disco | 2B params, 4 GB disco | mismo, BF16 |\n"
        "| Speedup CPU→GPU | n/a | n/a | **8-12×** |"
    ))

    # --- Assemble and write the notebook ------------------------------------
    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH}  cells={len(cells)}")


# Script entry point: regenerate the notebook when run directly.
if __name__ == "__main__":
    build()