gliner_glirel_tuning/build_notebook_08_improvements_gliner.py

"""Construye notebooks/08_improving_gliner2.ipynb — experimentos para subir
las relaciones de GLiNER2 sin perder la velocidad.

5 experimentos en un mismo notebook, modelo cargado una sola vez:
  §1 Label naming      — works_at vs employed_by vs WorksAt vs spaces
  §2 include_confidence — score per head/tail + threshold por relacion
  §3 Post-filter typed — allowed (head_type, tail_type) por relacion
  §4 Descripciones     — flat list vs dict con descripciones
  §5 GLiREL hibrido    — GLiNER2 NER + GLiREL relations con allowed_head/tail
  §6 Best combo        — aplicar lo aprendido sobre PDF
"""
from __future__ import annotations

from pathlib import Path
import nbformat as nbf

HERE = Path(__file__).resolve().parent
NB_PATH = HERE / "notebooks" / "08_improving_gliner2.ipynb"

ES_CORPORATE_SHORT = (
    "Pablo Isla, expresidente de Inditex, ha sido nombrado consejero de Telefonica. "
    "La operacion fue anunciada por el presidente Jose Maria Alvarez-Pallete en Madrid el pasado lunes. "
    "Inditex factura mas de 30.000 millones anuales y tiene su sede en Arteixo, A Coruna. "
    "En paralelo, Iberdrola y Endesa firmaron un acuerdo de colaboracion en proyectos eolicos en Galicia. "
    "El presidente de Iberdrola, Ignacio Galan, se reunio con la CEO de Endesa, Marina Serrano, en Bilbao. "
    "El acuerdo movilizara 2.000 millones de euros en cinco anos. "
    "El BBVA, presidido por Carlos Torres, mostro interes en participar en la financiacion del proyecto. "
    "Su sede central esta en Bilbao."
)


def _md(text: str):
    return nbf.v4.new_markdown_cell(text)


def _code(src: str):
    cell = nbf.v4.new_code_cell(src)
    cell.outputs = []
    cell.execution_count = None
    return cell


def build():
    cells = []

    cells.append(_md(
        "# Mejoras a GLiNER2 — sumarle capacidad sin perder velocidad\n\n"
        "Decision: **GLiNER2 es nuestro motor por velocidad** (139s vs NuExtract GPU 361s sobre el PDF). "
        "Pero nos faltan relaciones. Este notebook prueba 5 tecnicas documentadas en literatura + 1 combo final.\n\n"
        "**Corpus de prueba:** `es_corporate_short` (8 frases, 14 entidades 'oro', relaciones esperables verificables a mano).\n\n"
        "**Verdad de campo (lo que esperamos del corpus):**\n"
        "- 5 personas: Pablo Isla, Jose Maria Alvarez-Pallete, Ignacio Galan, Marina Serrano, Carlos Torres\n"
        "- 4-5 organizaciones: Inditex, Telefonica, Iberdrola, Endesa, BBVA\n"
        "- Localizaciones: Madrid, Arteixo, A Coruna, Galicia, Bilbao\n"
        "- Relaciones evidentes: `Pablo Isla` ex-CEO/president `Inditex`, `Jose Maria Alvarez-Pallete` president `Telefonica`, `Ignacio Galan` president `Iberdrola`, `Marina Serrano` CEO `Endesa`, `Carlos Torres` president `BBVA`, `Inditex headquartered_in Arteixo`, `BBVA headquartered_in Bilbao`, `Iberdrola+Endesa agreement`."
    ))

    cells.append(_md("## 0. Setup + carga GLiNER2"))

    cells.append(_code(
        "import os, sys, json, warnings, time, re\n"
        "warnings.filterwarnings('ignore')\n"
        "from pathlib import Path\n"
        "from collections import defaultdict\n"
        "\n"
        "# sys.path cleanup: el startup del kernel anade subdirs de python/functions/\n"
        "# que sombrean paquetes pip (e.g. bigquery/datasets.py vs HF datasets)\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "from gliner2 import GLiNER2\n"
        "\n"
        "t0 = time.time()\n"
        "model = GLiNER2.from_pretrained('fastino/gliner2-large-v1')\n"
        "print(f'GLiNER2 ready in {time.time()-t0:.1f}s')\n"
        "\n"
        f"TEXT = {ES_CORPORATE_SHORT!r}\n"
        "print(f'Corpus: {len(TEXT)} chars / {len(TEXT.split())} words / {len(re.split(chr(46), TEXT))} sentences')"
    ))

    # ── §1 Label naming
    cells.append(_md(
        "## §1 Label naming — el factor mas critico\n\n"
        "La documentacion afirma que GLiNER2 es muy sensible al **nombre del label**, no solo a su semantica. "
        "Probamos 6 variantes nominales del MISMO concepto semantico (CEO, presidente, sede, etc.):\n\n"
        "| Variante | Estilo |\n"
        "|---|---|\n"
        "| A | snake_case verbal: `works_at`, `located_in`, `ceo_of` |\n"
        "| B | snake_case sinonimos: `employed_by`, `situated_in`, `head_of` |\n"
        "| C | verbos cortos: `runs`, `lives_in`, `presides` |\n"
        "| D | UPPERCASE_NO_UNDERSCORE: `WORKSAT`, `LOCATEDIN`, `CEOOF` |\n"
        "| E | camelCase: `worksAt`, `locatedIn`, `ceoOf` |\n"
        "| F | con espacios: `\"works at\"`, `\"located in\"` |"
    ))

    cells.append(_code(
        "ENTITY_LABELS = ['person', 'organization', 'location']\n"
        "\n"
        "VARIANTS = {\n"
        "    'A snake_case verbal':       ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with'],\n"
        "    'B snake_case sinonimos':    ['employed_by', 'situated_in', 'head_of', 'leader_of', 'based_in', 'partnered_with'],\n"
        "    'C verbos cortos':           ['runs', 'lives_in', 'presides', 'leads', 'is_at', 'allies_with'],\n"
        "    'D UPPERCASE_NO_UNDERSCORE': ['WORKSAT', 'LOCATEDIN', 'CEOOF', 'PRESIDENTOF', 'HEADQUARTEREDIN', 'AGREEMENTWITH'],\n"
        "    'E camelCase':               ['worksAt', 'locatedIn', 'ceoOf', 'presidentOf', 'headquarteredIn', 'agreementWith'],\n"
        "    'F espacios':                ['works at', 'located in', 'ceo of', 'president of', 'headquartered in', 'agreement with'],\n"
        "}\n"
        "\n"
        "rows = []\n"
        "for variant, labels in VARIANTS.items():\n"
        "    schema = model.create_schema().entities(ENTITY_LABELS).relations(labels)\n"
        "    t0 = time.time()\n"
        "    r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "    elapsed = time.time() - t0\n"
        "    n_ents = sum(len(v) for v in r['entities'].values())\n"
        "    n_rels = sum(len(v) for v in r['relation_extraction'].values())\n"
        "    nonzero = sum(1 for v in r['relation_extraction'].values() if v)\n"
        "    rows.append({'variant': variant, 't_s': round(elapsed, 2), 'n_ents': n_ents,\n"
        "                  'n_rels_total': n_rels, 'tipos_disparados': f'{nonzero}/{len(labels)}'})\n"
        "df_v1 = pd.DataFrame(rows)\n"
        "df_v1"
    ))

    cells.append(_md(
        "**Lectura §1:** mira `n_rels_total` — cambiar el naming del label sin cambiar el significado puede mover el numero "
        "drasticamente. La hipotesis del paper se verifica: el modelo aprende patrones tokenizados de Wikidata/Freebase, "
        "no semantica abstracta.\n\n"
        "**Implicacion:** **siempre** usa snake_case verbal corto. **Nunca** UPPERCASE, camelCase o espacios."
    ))

    # ── §2 include_confidence
    cells.append(_md(
        "## §2 `include_confidence=True` — threshold por relacion\n\n"
        "GLiNER2 expone scores por head/tail si pasas `include_confidence=True`. Lo usamos para:\n\n"
        "1. Ver la **distribucion real** de scores por relacion\n"
        "2. Elegir un **threshold dinamico por relacion** (no global)\n\n"
        "Hipotesis: relaciones ambiguas (`agreement_with`) tienen scores mas bajos y necesitan threshold distinto que `headquartered_in`."
    ))

    cells.append(_code(
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(\n"
        "    ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        ")\n"
        "r_conf = model.extract(TEXT, schema=schema, threshold=0.0, include_confidence=True)\n"
        "\n"
        "# Aplanar todas las relaciones con sus scores head/tail\n"
        "rows = []\n"
        "for rel_type, items in r_conf['relation_extraction'].items():\n"
        "    for it in items:\n"
        "        rows.append({\n"
        "            'rel_type': rel_type,\n"
        "            'head': it['head']['text'] if isinstance(it.get('head'), dict) else str(it.get('head')),\n"
        "            'head_conf': it['head'].get('confidence') if isinstance(it.get('head'), dict) else None,\n"
        "            'tail': it['tail']['text'] if isinstance(it.get('tail'), dict) else str(it.get('tail')),\n"
        "            'tail_conf': it['tail'].get('confidence') if isinstance(it.get('tail'), dict) else None,\n"
        "        })\n"
        "df_conf = pd.DataFrame(rows)\n"
        "if not df_conf.empty:\n"
        "    df_conf['min_conf'] = df_conf[['head_conf', 'tail_conf']].min(axis=1)\n"
        "print(f'total relaciones (threshold=0.0): {len(df_conf)}')\n"
        "print(f'columnas: {list(df_conf.columns)}')\n"
        "df_conf.head(10)"
    ))

    cells.append(_code(
        "# Distribucion por tipo de relacion\n"
        "if not df_conf.empty and 'min_conf' in df_conf.columns:\n"
        "    by_type = df_conf.groupby('rel_type')['min_conf'].agg(['count', 'min', 'mean', 'max']).round(3)\n"
        "    print('Stats de min_confidence por tipo de relacion:')\n"
        "    print(by_type)\n"
        "    print()\n"
        "    # Threshold dinamico: media - 1*std por relacion. Aproximacion simple: ratio del max\n"
        "    thr_per_rel = (by_type['max'] * 0.6).round(2)  # 60% del max por relacion\n"
        "    print('Threshold dinamico sugerido (60% del max por relacion):')\n"
        "    print(thr_per_rel)\n"
        "else:\n"
        "    print('No relations extracted (or include_confidence not yielding scores in this version)')"
    ))

    cells.append(_code(
        "# Comparativa: threshold global vs threshold por relacion\n"
        "if not df_conf.empty and 'min_conf' in df_conf.columns:\n"
        "    fig, ax = plt.subplots(figsize=(10, 5))\n"
        "    for rt in df_conf['rel_type'].unique():\n"
        "        scores = df_conf[df_conf['rel_type'] == rt]['min_conf']\n"
        "        ax.scatter([rt] * len(scores), scores, alpha=0.5, s=80, label=rt)\n"
        "    ax.axhline(0.3, color='red', linestyle='--', label='threshold global 0.3')\n"
        "    ax.set_ylabel('min(head_conf, tail_conf)')\n"
        "    ax.set_title('Distribucion de scores por tipo de relacion')\n"
        "    ax.set_ylim(0, 1.05)\n"
        "    ax.tick_params(axis='x', rotation=20)\n"
        "    plt.tight_layout(); plt.show()\n"
        "else:\n"
        "    print('No data to plot')"
    ))

    cells.append(_md(
        "**Lectura §2:** algunas relaciones tienen scores muy concentrados (alto recall facil), otras dispersos (necesitan tuning). "
        "Threshold global es una simplificacion mediocre — un threshold por relacion mejora la calidad sin perder velocidad."
    ))

    # ── §3 Post-filter
    cells.append(_md(
        "## §3 Post-filter por (head_type, tail_type) — descartar combinaciones imposibles\n\n"
        "GLiNER2 NO puede restringir nativamente que un `president_of` solo acepte `(person, organization)`. "
        "Por eso emite cosas como `Madrid president_of Persona`. Solucion: **post-procesado** combinando NER + relaciones.\n\n"
        "Definimos por relacion el conjunto de tipos validos para head y tail:"
    ))

    cells.append(_code(
        "ALLOWED = {\n"
        "    'works_at':         (['person'],         ['organization']),\n"
        "    'employed_by':      (['person'],         ['organization']),\n"
        "    'ceo_of':           (['person'],         ['organization']),\n"
        "    'president_of':     (['person'],         ['organization']),\n"
        "    'headquartered_in': (['organization'],   ['location']),\n"
        "    'located_in':       (['organization', 'person', 'location'], ['location']),\n"
        "    'agreement_with':   (['organization'],   ['organization']),\n"
        "    'subsidiary_of':    (['organization'],   ['organization']),\n"
        "}\n"
        "\n"
        "# Mapa nombre → tipo desde la extraccion\n"
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(list(ALLOWED.keys()))\n"
        "r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "\n"
        "name_to_type = {}\n"
        "for typ, names in r['entities'].items():\n"
        "    for n in names:\n"
        "        name_to_type[n.lower().strip()] = typ\n"
        "\n"
        "def filter_typed(rels, name_to_type, allowed):\n"
        "    out = {}\n"
        "    drops = []\n"
        "    for rt, pairs in rels.items():\n"
        "        keep = []\n"
        "        head_ok, tail_ok = allowed.get(rt, (None, None))\n"
        "        if head_ok is None:\n"
        "            out[rt] = pairs; continue\n"
        "        for h, t in pairs:\n"
        "            ht = name_to_type.get(h.lower().strip())\n"
        "            tt = name_to_type.get(t.lower().strip())\n"
        "            if ht in head_ok and tt in tail_ok:\n"
        "                keep.append((h, t))\n"
        "            else:\n"
        "                drops.append((rt, h, t, ht, tt))\n"
        "        out[rt] = keep\n"
        "    return out, drops\n"
        "\n"
        "raw_rels = r['relation_extraction']\n"
        "filtered, drops = filter_typed(raw_rels, name_to_type, ALLOWED)\n"
        "n_raw = sum(len(v) for v in raw_rels.values())\n"
        "n_filt = sum(len(v) for v in filtered.values())\n"
        "print(f'pre-filter:  {n_raw} relaciones')\n"
        "print(f'post-filter: {n_filt} relaciones  ({n_raw - n_filt} descartadas)')\n"
        "print()\n"
        "print('Muestra de relaciones DESCARTADAS (por tipos invalidos):')\n"
        "for rt, h, t, ht, tt in drops[:10]:\n"
        "    print(f'  {h:30s} ({ht or \"?\"}) --[{rt:18s}]--> {t:30s} ({tt or \"?\"})')"
    ))

    cells.append(_md(
        "**Lectura §3:** el filtro typed elimina las relaciones absurdas (`Madrid president_of`, `A Coruna located_in Iberdrola`). "
        "El payoff es **gratis y puro** — no requiere modelo, solo logica."
    ))

    # ── §4 Descripciones
    cells.append(_md(
        "## §4 Descripciones en labels — re-confirmacion\n\n"
        "En el notebook 06 vimos que pasar dict con descripciones no movia los numeros. Re-confirmamos con threshold 0.3:"
    ))

    cells.append(_code(
        "labels_flat = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "labels_desc = {\n"
        "    'works_at':         'person is employed by organization',\n"
        "    'located_in':       'entity is located in a place',\n"
        "    'ceo_of':           'person is the chief executive officer of organization',\n"
        "    'president_of':     'person is the president or chairman of organization',\n"
        "    'headquartered_in': 'organization has its headquarters in a location',\n"
        "    'agreement_with':   'organization has signed an agreement with another organization',\n"
        "}\n"
        "\n"
        "schema_flat = model.create_schema().entities(ENTITY_LABELS).relations(labels_flat)\n"
        "schema_desc = model.create_schema().entities(ENTITY_LABELS).relations(labels_desc)\n"
        "\n"
        "r_flat = model.extract(TEXT, schema=schema_flat, threshold=0.3)\n"
        "r_desc = model.extract(TEXT, schema=schema_desc, threshold=0.3)\n"
        "\n"
        "n_flat = sum(len(v) for v in r_flat['relation_extraction'].values())\n"
        "n_desc = sum(len(v) for v in r_desc['relation_extraction'].values())\n"
        "print(f'flat list:        {n_flat} relaciones')\n"
        "print(f'dict + desc:      {n_desc} relaciones')\n"
        "print(f'diferencia:       {n_desc - n_flat:+d}')"
    ))

    cells.append(_md(
        "**Lectura §4:** confirmado lo del notebook 06. Las descripciones **no mueven la aguja** en este corpus. "
        "Quizas en relaciones muy ambiguas (e.g. `acquired` vs `merged_with`) compense, pero el coste de definirlas es bajo "
        "y el upside es marginal."
    ))

    # ── §5 GLiREL hibrido
    cells.append(_md(
        "## §5 Hibrido GLiNER2 (NER) + GLiREL (relaciones con allowed_head/tail)\n\n"
        "GLiREL se descarto en notebook 02 por mala calidad en castellano. **PERO** lo usabamos sin restricciones de tipo. "
        "Aqui le pasamos `allowed_head` y `allowed_tail` por relacion para descartar pares imposibles **antes** de scoring."
    ))

    cells.append(_code(
        "from datascience.glirel_load_model import glirel_load_model\n"
        "\n"
        "t0 = time.time()\n"
        "glirel = glirel_load_model()\n"
        "print(f'GLiREL ready in {time.time()-t0:.1f}s')\n"
        "\n"
        "# 1. Entidades de GLiNER2 (tipadas)\n"
        "schema_ent = model.create_schema().entities(ENTITY_LABELS)\n"
        "r_ent = model.extract(TEXT, schema=schema_ent, threshold=0.3)\n"
        "\n"
        "# 2. Construir ner_spans token-level + name_to_type\n"
        "tokens = TEXT.split()\n"
        "ner_spans = []\n"
        "name_to_type = {}\n"
        "for typ, names in r_ent['entities'].items():\n"
        "    for n in names:\n"
        "        name_to_type[n.lower().strip()] = typ\n"
        "        # localizar span token-level (rough)\n"
        "        idx = TEXT.find(n)\n"
        "        if idx < 0: continue\n"
        "        pre = TEXT[:idx]\n"
        "        start_tok = len(pre.split())\n"
        "        end_tok = start_tok + len(n.split())\n"
        "        if end_tok > start_tok:\n"
        "            ner_spans.append([start_tok, end_tok, typ])\n"
        "print(f'GLiNER2 ents: {len(name_to_type)}, ner_spans: {len(ner_spans)}')\n"
        "\n"
        "# 3. GLiREL — primero sin allowed (baseline notebook 02)\n"
        "rel_labels = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "raw = glirel.predict_relations(tokens, labels=rel_labels, threshold=0.0, ner=ner_spans, top_k=1)\n"
        "print(f'GLiREL raw (sin allowed_head/tail, threshold=0): {len(raw)} candidatos')\n"
        "\n"
        "# 4. Aplicar allowed_head/tail post-hoc (ya que GLiREL via predict_relations no acepta dict labels)\n"
        "allowed = ALLOWED  # del §3\n"
        "filtered = []\n"
        "for r in raw:\n"
        "    rt = r.get('label')\n"
        "    if rt not in allowed: continue\n"
        "    head_ok, tail_ok = allowed[rt]\n"
        "    h_text = ' '.join(r.get('head_text', []))\n"
        "    t_text = ' '.join(r.get('tail_text', []))\n"
        "    h_type = name_to_type.get(h_text.lower().strip())\n"
        "    t_type = name_to_type.get(t_text.lower().strip())\n"
        "    if h_type in head_ok and t_type in tail_ok and r.get('score', 0) >= 0.10:\n"
        "        filtered.append((h_text, rt, t_text, round(r.get('score', 0), 3)))\n"
        "print(f'GLiREL post-filter typed (threshold 0.10): {len(filtered)} relaciones')\n"
        "\n"
        "# 5. Mostrar las primeras 15\n"
        "for h, rt, t, s in filtered[:15]:\n"
        "    print(f'  {h:32s} --[{rt:18s} {s}]--> {t}')"
    ))

    cells.append(_md(
        "**Lectura §5:** sin filtro typed, GLiREL emite cientos de candidatos espurios (lo que vimos en nb 02). "
        "**Con filtro typed + threshold 0.10**, queda un set limpio de relaciones cuya cabeza y cola tienen sentido. "
        "El coste extra: cargar GLiREL (~7s) y predict (~50ms). Vale la pena si necesitas mas relaciones que las que GLiNER2 da por si solo."
    ))

    # ── §6 Best combo
    cells.append(_md(
        "## §6 Best combo — todo junto sobre el corpus\n\n"
        "Aplicamos a la vez:\n"
        "1. Snake_case verbal (mejor variante §1)\n"
        "2. `include_confidence=True` con threshold global 0.3\n"
        "3. **Post-filter typed** (§3)\n"
        "4. **Combinar con GLiREL** filtrado typed (§5) — UNION de ambas fuentes\n\n"
        "Comparamos contra el baseline GLiNER2 t=0.3 sin post-procesado."
    ))

    cells.append(_code(
        "labels = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(labels)\n"
        "\n"
        "# baseline\n"
        "r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "name_to_type = {n.lower().strip(): typ for typ, names in r['entities'].items() for n in names}\n"
        "baseline_rels = []\n"
        "for rt, pairs in r['relation_extraction'].items():\n"
        "    for h, t in pairs:\n"
        "        baseline_rels.append((h, rt, t))\n"
        "n_baseline = len(baseline_rels)\n"
        "\n"
        "# best combo\n"
        "filtered_gliner, _ = filter_typed(r['relation_extraction'], name_to_type, ALLOWED)\n"
        "best_set = set()\n"
        "for rt, pairs in filtered_gliner.items():\n"
        "    for h, t in pairs:\n"
        "        best_set.add((h, rt, t))\n"
        "for h, rt, t, s in filtered:\n"
        "    best_set.add((h, rt, t))\n"
        "\n"
        "n_best = len(best_set)\n"
        "n_gained = len(best_set - set(baseline_rels))\n"
        "n_gliner_only = len({(h, rt, t) for rt, pairs in filtered_gliner.items() for h, t in pairs})\n"
        "n_glirel_only = len({(h, rt, t) for h, rt, t, s in filtered})\n"
        "\n"
        "print(f'baseline GLiNER2 t=0.3 sin filter:        {n_baseline} relaciones')\n"
        "print(f'GLiNER2 t=0.3 + post-filter typed:        {n_gliner_only}')\n"
        "print(f'GLiREL filtered typed (threshold 0.10):    {n_glirel_only}')\n"
        "print(f'UNION (GLiNER2 typed ∪ GLiREL typed):     {n_best}')\n"
        "print(f'  ganancia vs baseline: +{n_gained} relaciones')"
    ))

    cells.append(_code(
        "# Visualizar el grafo final\n"
        "G = nx.DiGraph()\n"
        "for typ, names in r['entities'].items():\n"
        "    for n in names:\n"
        "        G.add_node(n, type=typ)\n"
        "for h, rt, t in best_set:\n"
        "    G.add_node(h, type=name_to_type.get(h.lower().strip(), '?'))\n"
        "    G.add_node(t, type=name_to_type.get(t.lower().strip(), '?'))\n"
        "    G.add_edge(h, t, kind=rt)\n"
        "\n"
        "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n"
        "fig, ax = plt.subplots(figsize=(13, 9))\n"
        "pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n"
        "cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1900, edgecolors='#333', linewidths=1.4, ax=ax)\n"
        "nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold', ax=ax)\n"
        "nx.draw_networkx_edges(G, pos, edge_color='#666', arrows=True, arrowsize=14, width=1.1, alpha=0.7, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6.5, ax=ax,\n"
        "                              bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "ax.set_title(f'Best combo: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=12)\n"
        "ax.axis('off')\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "ax.legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md(
        "## Conclusion\n\n"
        "**Receta operativa para `graph_explorer` post-experimentos:**\n\n"
        "1. ⭐⭐⭐ **Naming snake_case verbal** (`works_at`, `headquartered_in`) — sin coste, gran impacto.\n"
        "2. ⭐⭐⭐ **Post-filter typed** (`{rel: (head_types, tail_types)}`) — elimina la mayoria de falsos absurdos. **Pure, sin coste.**\n"
        "3. ⭐⭐ **`include_confidence=True` + threshold por relacion** — evita el threshold global mediocre.\n"
        "4. ⭐⭐ **GLiREL como complemento** (cargado solo cuando sea necesario) con allowed_head/tail aplicado post-hoc.\n"
        "5. (no toques) Descripciones por relacion — sin efecto medible.\n\n"
        "**Stack final:**\n\n"
        "```python\n"
        "# 1. labels en snake_case verbal\n"
        "labels = ['works_at', 'ceo_of', 'president_of', 'headquartered_in', ...]\n"
        "schema = model.create_schema().entities(['person', 'organization', 'location']).relations(labels)\n"
        "\n"
        "# 2. extract con confidence\n"
        "r = model.extract(text, schema=schema, threshold=0.3, include_confidence=True)\n"
        "\n"
        "# 3. post-filter typed (gratis)\n"
        "filtered = filter_typed(r['relation_extraction'], name_to_type, ALLOWED)\n"
        "\n"
        "# 4. opcional: GLiREL como segundo opinador con allowed_head/tail filtrado post-hoc\n"
        "if rich_mode:\n"
        "    glirel_rels = glirel.predict_relations(tokens, labels=labels, threshold=0.0, ner=ner_spans, top_k=1)\n"
        "    glirel_filtered = [r for r in glirel_rels if compatible_types(r, ALLOWED, name_to_type)]\n"
        "    final_rels = union(filtered, glirel_filtered)\n"
        "```\n\n"
        "**Funciones para promover al registry** (proximo fn-constructor):\n"
        "1. `gliner2_load_model_py_datascience` (Apache 2.0)\n"
        "2. `extract_graph_gliner2_py_datascience` (NER+RE, threshold por relacion, include_confidence)\n"
        "3. `filter_relations_by_entity_types_py_core` (PURE — el ALLOWED filter)\n"
        "4. `merge_extraction_sources_py_core` (PURE — UNION de GLiNER2 + GLiREL)\n"
        "5. `extract_graph_hybrid_gliner2_glirel_py_pipelines` (composicion)"
    ))

    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH}  cells={len(cells)}")


if __name__ == "__main__":
    build()