b8c760d004
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
492 lines
26 KiB
Python
492 lines
26 KiB
Python
"""Build notebooks/08_improving_gliner2.ipynb: experiments to improve
GLiNER2 relation extraction without losing its speed.

Five experiments plus a final combination in a single notebook, with the
model loaded only once:
  §1 Label naming       — works_at vs employed_by vs WorksAt vs spaces
  §2 include_confidence — score per head/tail + per-relation threshold
  §3 Typed post-filter  — allowed (head_type, tail_type) per relation
  §4 Descriptions       — flat list vs dict with descriptions
  §5 Hybrid GLiREL      — GLiNER2 NER + GLiREL relations with allowed_head/tail
  §6 Best combo         — apply what was learned on the PDF
"""
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
import nbformat as nbf
|
||
|
||
# Directory that contains this builder script; the notebook is written below it.
HERE = Path(__file__).resolve().parent
NB_PATH = HERE.joinpath("notebooks", "08_improving_gliner2.ipynb")

# Short Spanish corporate-news corpus used by every experiment in the notebook.
# Sentences are kept one per entry and joined with a single space.
ES_CORPORATE_SHORT = " ".join(
    (
        "Pablo Isla, expresidente de Inditex, ha sido nombrado consejero de Telefonica.",
        "La operacion fue anunciada por el presidente Jose Maria Alvarez-Pallete en Madrid el pasado lunes.",
        "Inditex factura mas de 30.000 millones anuales y tiene su sede en Arteixo, A Coruna.",
        "En paralelo, Iberdrola y Endesa firmaron un acuerdo de colaboracion en proyectos eolicos en Galicia.",
        "El presidente de Iberdrola, Ignacio Galan, se reunio con la CEO de Endesa, Marina Serrano, en Bilbao.",
        "El acuerdo movilizara 2.000 millones de euros en cinco anos.",
        "El BBVA, presidido por Carlos Torres, mostro interes en participar en la financiacion del proyecto.",
        "Su sede central esta en Bilbao.",
    )
)
|
||
|
||
|
||
def _md(text: str):
    """Wrap *text* in an nbformat v4 markdown cell and return it."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell
|
||
|
||
|
||
def _code(src: str):
    """Return an nbformat v4 code cell holding *src* with no recorded run state."""
    code_cell = nbf.v4.new_code_cell(src)
    # Explicitly reset the execution counter and outputs so the written
    # notebook carries no results from a previous run.
    code_cell.execution_count = None
    code_cell.outputs = []
    return code_cell
|
||
|
||
|
||
def build():
    """Assemble the experiment notebook and write it to ``NB_PATH``.

    The notebook is built as a flat list of markdown and code cells
    (intro, setup, sections §1-§6, conclusion). Cell sources are plain string
    literals; only the ``TEXT = ...`` line is an f-string, which injects
    ``ES_CORPORATE_SHORT`` into the setup cell. The parent directory of
    ``NB_PATH`` is created if missing, the notebook is written with
    ``nbf.write`` and a one-line summary is printed.

    The embedded cell sources use 4-space indentation per nesting level so
    that each generated cell is valid Python.
    """
    cells = []

    cells.append(_md(
        "# Mejoras a GLiNER2 — sumarle capacidad sin perder velocidad\n\n"
        "Decision: **GLiNER2 es nuestro motor por velocidad** (139s vs NuExtract GPU 361s sobre el PDF). "
        "Pero nos faltan relaciones. Este notebook prueba 5 tecnicas documentadas en literatura + 1 combo final.\n\n"
        "**Corpus de prueba:** `es_corporate_short` (8 frases, 14 entidades 'oro', relaciones esperables verificables a mano).\n\n"
        "**Verdad de campo (lo que esperamos del corpus):**\n"
        "- 5 personas: Pablo Isla, Jose Maria Alvarez-Pallete, Ignacio Galan, Marina Serrano, Carlos Torres\n"
        "- 4-5 organizaciones: Inditex, Telefonica, Iberdrola, Endesa, BBVA\n"
        "- Localizaciones: Madrid, Arteixo, A Coruna, Galicia, Bilbao\n"
        "- Relaciones evidentes: `Pablo Isla` ex-CEO/president `Inditex`, `Jose Maria Alvarez-Pallete` president `Telefonica`, `Ignacio Galan` president `Iberdrola`, `Marina Serrano` CEO `Endesa`, `Carlos Torres` president `BBVA`, `Inditex headquartered_in Arteixo`, `BBVA headquartered_in Bilbao`, `Iberdrola+Endesa agreement`."
    ))

    cells.append(_md("## 0. Setup + carga GLiNER2"))

    cells.append(_code(
        "import os, sys, json, warnings, time, re\n"
        "warnings.filterwarnings('ignore')\n"
        "from pathlib import Path\n"
        "from collections import defaultdict\n"
        "\n"
        "# sys.path cleanup: el startup del kernel anade subdirs de python/functions/\n"
        "# que sombrean paquetes pip (e.g. bigquery/datasets.py vs HF datasets)\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not p.startswith(_pf + '/')]\n"
        "if _pf not in sys.path: sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "import networkx as nx\n"
        "import matplotlib.pyplot as plt\n"
        "from matplotlib.patches import Patch\n"
        "from gliner2 import GLiNER2\n"
        "\n"
        "t0 = time.time()\n"
        "model = GLiNER2.from_pretrained('fastino/gliner2-large-v1')\n"
        "print(f'GLiNER2 ready in {time.time()-t0:.1f}s')\n"
        "\n"
        f"TEXT = {ES_CORPORATE_SHORT!r}\n"
        # A bare '.' used as a regex matches every character, so sentence ends
        # are counted with an escaped class instead. The count is computed
        # outside the f-string because backslashes are not allowed inside
        # f-string expressions before Python 3.12.
        "n_sentences = len(re.findall(r'[.!?](?=\\s|$)', TEXT))\n"
        "print(f'Corpus: {len(TEXT)} chars / {len(TEXT.split())} words / {n_sentences} sentences')"
    ))

    # ── §1 Label naming
    cells.append(_md(
        "## §1 Label naming — el factor mas critico\n\n"
        "La documentacion afirma que GLiNER2 es muy sensible al **nombre del label**, no solo a su semantica. "
        "Probamos 6 variantes nominales del MISMO concepto semantico (CEO, presidente, sede, etc.):\n\n"
        "| Variante | Estilo |\n"
        "|---|---|\n"
        "| A | snake_case verbal: `works_at`, `located_in`, `ceo_of` |\n"
        "| B | snake_case sinonimos: `employed_by`, `situated_in`, `head_of` |\n"
        "| C | verbos cortos: `runs`, `lives_in`, `presides` |\n"
        "| D | UPPERCASE_NO_UNDERSCORE: `WORKSAT`, `LOCATEDIN`, `CEOOF` |\n"
        "| E | camelCase: `worksAt`, `locatedIn`, `ceoOf` |\n"
        "| F | con espacios: `\"works at\"`, `\"located in\"` |"
    ))

    cells.append(_code(
        "ENTITY_LABELS = ['person', 'organization', 'location']\n"
        "\n"
        "VARIANTS = {\n"
        "    'A snake_case verbal': ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with'],\n"
        "    'B snake_case sinonimos': ['employed_by', 'situated_in', 'head_of', 'leader_of', 'based_in', 'partnered_with'],\n"
        "    'C verbos cortos': ['runs', 'lives_in', 'presides', 'leads', 'is_at', 'allies_with'],\n"
        "    'D UPPERCASE_NO_UNDERSCORE': ['WORKSAT', 'LOCATEDIN', 'CEOOF', 'PRESIDENTOF', 'HEADQUARTEREDIN', 'AGREEMENTWITH'],\n"
        "    'E camelCase': ['worksAt', 'locatedIn', 'ceoOf', 'presidentOf', 'headquarteredIn', 'agreementWith'],\n"
        "    'F espacios': ['works at', 'located in', 'ceo of', 'president of', 'headquartered in', 'agreement with'],\n"
        "}\n"
        "\n"
        "rows = []\n"
        "for variant, labels in VARIANTS.items():\n"
        "    schema = model.create_schema().entities(ENTITY_LABELS).relations(labels)\n"
        "    t0 = time.time()\n"
        "    r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "    elapsed = time.time() - t0\n"
        "    n_ents = sum(len(v) for v in r['entities'].values())\n"
        "    n_rels = sum(len(v) for v in r['relation_extraction'].values())\n"
        "    nonzero = sum(1 for v in r['relation_extraction'].values() if v)\n"
        "    rows.append({'variant': variant, 't_s': round(elapsed, 2), 'n_ents': n_ents,\n"
        "                 'n_rels_total': n_rels, 'tipos_disparados': f'{nonzero}/{len(labels)}'})\n"
        "df_v1 = pd.DataFrame(rows)\n"
        "df_v1"
    ))

    cells.append(_md(
        "**Lectura §1:** mira `n_rels_total` — cambiar el naming del label sin cambiar el significado puede mover el numero "
        "drasticamente. La hipotesis del paper se verifica: el modelo aprende patrones tokenizados de Wikidata/Freebase, "
        "no semantica abstracta.\n\n"
        "**Implicacion:** **siempre** usa snake_case verbal corto. **Nunca** UPPERCASE, camelCase o espacios."
    ))

    # ── §2 include_confidence
    cells.append(_md(
        "## §2 `include_confidence=True` — threshold por relacion\n\n"
        "GLiNER2 expone scores por head/tail si pasas `include_confidence=True`. Lo usamos para:\n\n"
        "1. Ver la **distribucion real** de scores por relacion\n"
        "2. Elegir un **threshold dinamico por relacion** (no global)\n\n"
        "Hipotesis: relaciones ambiguas (`agreement_with`) tienen scores mas bajos y necesitan threshold distinto que `headquartered_in`."
    ))

    cells.append(_code(
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(\n"
        "    ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        ")\n"
        "r_conf = model.extract(TEXT, schema=schema, threshold=0.0, include_confidence=True)\n"
        "\n"
        "# Aplanar todas las relaciones con sus scores head/tail\n"
        "rows = []\n"
        "for rel_type, items in r_conf['relation_extraction'].items():\n"
        "    for it in items:\n"
        "        rows.append({\n"
        "            'rel_type': rel_type,\n"
        "            'head': it['head']['text'] if isinstance(it.get('head'), dict) else str(it.get('head')),\n"
        "            'head_conf': it['head'].get('confidence') if isinstance(it.get('head'), dict) else None,\n"
        "            'tail': it['tail']['text'] if isinstance(it.get('tail'), dict) else str(it.get('tail')),\n"
        "            'tail_conf': it['tail'].get('confidence') if isinstance(it.get('tail'), dict) else None,\n"
        "        })\n"
        "df_conf = pd.DataFrame(rows)\n"
        "if not df_conf.empty:\n"
        "    df_conf['min_conf'] = df_conf[['head_conf', 'tail_conf']].min(axis=1)\n"
        "print(f'total relaciones (threshold=0.0): {len(df_conf)}')\n"
        "print(f'columnas: {list(df_conf.columns)}')\n"
        "df_conf.head(10)"
    ))

    cells.append(_code(
        "# Distribucion por tipo de relacion\n"
        "if not df_conf.empty and 'min_conf' in df_conf.columns:\n"
        "    by_type = df_conf.groupby('rel_type')['min_conf'].agg(['count', 'min', 'mean', 'max']).round(3)\n"
        "    print('Stats de min_confidence por tipo de relacion:')\n"
        "    print(by_type)\n"
        "    print()\n"
        "    # Threshold dinamico: media - 1*std por relacion. Aproximacion simple: ratio del max\n"
        "    thr_per_rel = (by_type['max'] * 0.6).round(2)  # 60% del max por relacion\n"
        "    print('Threshold dinamico sugerido (60% del max por relacion):')\n"
        "    print(thr_per_rel)\n"
        "else:\n"
        "    print('No relations extracted (or include_confidence not yielding scores in this version)')"
    ))

    cells.append(_code(
        "# Comparativa: threshold global vs threshold por relacion\n"
        "if not df_conf.empty and 'min_conf' in df_conf.columns:\n"
        "    fig, ax = plt.subplots(figsize=(10, 5))\n"
        "    for rt in df_conf['rel_type'].unique():\n"
        "        scores = df_conf[df_conf['rel_type'] == rt]['min_conf']\n"
        "        ax.scatter([rt] * len(scores), scores, alpha=0.5, s=80, label=rt)\n"
        "    ax.axhline(0.3, color='red', linestyle='--', label='threshold global 0.3')\n"
        "    ax.set_ylabel('min(head_conf, tail_conf)')\n"
        "    ax.set_title('Distribucion de scores por tipo de relacion')\n"
        "    ax.set_ylim(0, 1.05)\n"
        "    ax.tick_params(axis='x', rotation=20)\n"
        "    plt.tight_layout(); plt.show()\n"
        "else:\n"
        "    print('No data to plot')"
    ))

    cells.append(_md(
        "**Lectura §2:** algunas relaciones tienen scores muy concentrados (alto recall facil), otras dispersos (necesitan tuning). "
        "Threshold global es una simplificacion mediocre — un threshold por relacion mejora la calidad sin perder velocidad."
    ))

    # ── §3 Post-filter
    cells.append(_md(
        "## §3 Post-filter por (head_type, tail_type) — descartar combinaciones imposibles\n\n"
        "GLiNER2 NO puede restringir nativamente que un `president_of` solo acepte `(person, organization)`. "
        "Por eso emite cosas como `Madrid president_of Persona`. Solucion: **post-procesado** combinando NER + relaciones.\n\n"
        "Definimos por relacion el conjunto de tipos validos para head y tail:"
    ))

    cells.append(_code(
        "ALLOWED = {\n"
        "    'works_at': (['person'], ['organization']),\n"
        "    'employed_by': (['person'], ['organization']),\n"
        "    'ceo_of': (['person'], ['organization']),\n"
        "    'president_of': (['person'], ['organization']),\n"
        "    'headquartered_in': (['organization'], ['location']),\n"
        "    'located_in': (['organization', 'person', 'location'], ['location']),\n"
        "    'agreement_with': (['organization'], ['organization']),\n"
        "    'subsidiary_of': (['organization'], ['organization']),\n"
        "}\n"
        "\n"
        "# Mapa nombre → tipo desde la extraccion\n"
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(list(ALLOWED.keys()))\n"
        "r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "\n"
        "name_to_type = {}\n"
        "for typ, names in r['entities'].items():\n"
        "    for n in names:\n"
        "        name_to_type[n.lower().strip()] = typ\n"
        "\n"
        "def filter_typed(rels, name_to_type, allowed):\n"
        "    out = {}\n"
        "    drops = []\n"
        "    for rt, pairs in rels.items():\n"
        "        keep = []\n"
        "        head_ok, tail_ok = allowed.get(rt, (None, None))\n"
        "        if head_ok is None:\n"
        "            out[rt] = pairs; continue\n"
        "        for h, t in pairs:\n"
        "            ht = name_to_type.get(h.lower().strip())\n"
        "            tt = name_to_type.get(t.lower().strip())\n"
        "            if ht in head_ok and tt in tail_ok:\n"
        "                keep.append((h, t))\n"
        "            else:\n"
        "                drops.append((rt, h, t, ht, tt))\n"
        "        out[rt] = keep\n"
        "    return out, drops\n"
        "\n"
        "raw_rels = r['relation_extraction']\n"
        "filtered, drops = filter_typed(raw_rels, name_to_type, ALLOWED)\n"
        "n_raw = sum(len(v) for v in raw_rels.values())\n"
        "n_filt = sum(len(v) for v in filtered.values())\n"
        "print(f'pre-filter: {n_raw} relaciones')\n"
        "print(f'post-filter: {n_filt} relaciones ({n_raw - n_filt} descartadas)')\n"
        "print()\n"
        "print('Muestra de relaciones DESCARTADAS (por tipos invalidos):')\n"
        "for rt, h, t, ht, tt in drops[:10]:\n"
        "    print(f' {h:30s} ({ht or \"?\"}) --[{rt:18s}]--> {t:30s} ({tt or \"?\"})')"
    ))

    cells.append(_md(
        "**Lectura §3:** el filtro typed elimina las relaciones absurdas (`Madrid president_of`, `A Coruna located_in Iberdrola`). "
        "El payoff es **gratis y puro** — no requiere modelo, solo logica."
    ))

    # ── §4 Descripciones
    cells.append(_md(
        "## §4 Descripciones en labels — re-confirmacion\n\n"
        "En el notebook 06 vimos que pasar dict con descripciones no movia los numeros. Re-confirmamos con threshold 0.3:"
    ))

    cells.append(_code(
        "labels_flat = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "labels_desc = {\n"
        "    'works_at': 'person is employed by organization',\n"
        "    'located_in': 'entity is located in a place',\n"
        "    'ceo_of': 'person is the chief executive officer of organization',\n"
        "    'president_of': 'person is the president or chairman of organization',\n"
        "    'headquartered_in': 'organization has its headquarters in a location',\n"
        "    'agreement_with': 'organization has signed an agreement with another organization',\n"
        "}\n"
        "\n"
        "schema_flat = model.create_schema().entities(ENTITY_LABELS).relations(labels_flat)\n"
        "schema_desc = model.create_schema().entities(ENTITY_LABELS).relations(labels_desc)\n"
        "\n"
        "r_flat = model.extract(TEXT, schema=schema_flat, threshold=0.3)\n"
        "r_desc = model.extract(TEXT, schema=schema_desc, threshold=0.3)\n"
        "\n"
        "n_flat = sum(len(v) for v in r_flat['relation_extraction'].values())\n"
        "n_desc = sum(len(v) for v in r_desc['relation_extraction'].values())\n"
        "print(f'flat list: {n_flat} relaciones')\n"
        "print(f'dict + desc: {n_desc} relaciones')\n"
        "print(f'diferencia: {n_desc - n_flat:+d}')"
    ))

    cells.append(_md(
        "**Lectura §4:** confirmado lo del notebook 06. Las descripciones **no mueven la aguja** en este corpus. "
        "Quizas en relaciones muy ambiguas (e.g. `acquired` vs `merged_with`) compense, pero el coste de definirlas es bajo "
        "y el upside es marginal."
    ))

    # ── §5 GLiREL hibrido
    cells.append(_md(
        "## §5 Hibrido GLiNER2 (NER) + GLiREL (relaciones con allowed_head/tail)\n\n"
        "GLiREL se descarto en notebook 02 por mala calidad en castellano. **PERO** lo usabamos sin restricciones de tipo. "
        "Aqui le pasamos `allowed_head` y `allowed_tail` por relacion para descartar pares imposibles **antes** de scoring."
    ))

    # NOTE(review): the spans below use an exclusive end index
    # (start_tok + number of words); confirm against the GLiREL
    # `predict_relations` docs whether it expects inclusive ends.
    cells.append(_code(
        "from datascience.glirel_load_model import glirel_load_model\n"
        "\n"
        "t0 = time.time()\n"
        "glirel = glirel_load_model()\n"
        "print(f'GLiREL ready in {time.time()-t0:.1f}s')\n"
        "\n"
        "# 1. Entidades de GLiNER2 (tipadas)\n"
        "schema_ent = model.create_schema().entities(ENTITY_LABELS)\n"
        "r_ent = model.extract(TEXT, schema=schema_ent, threshold=0.3)\n"
        "\n"
        "# 2. Construir ner_spans token-level + name_to_type\n"
        "tokens = TEXT.split()\n"
        "ner_spans = []\n"
        "name_to_type = {}\n"
        "for typ, names in r_ent['entities'].items():\n"
        "    for n in names:\n"
        "        name_to_type[n.lower().strip()] = typ\n"
        "        # localizar span token-level (rough)\n"
        "        idx = TEXT.find(n)\n"
        "        if idx < 0: continue\n"
        "        pre = TEXT[:idx]\n"
        "        start_tok = len(pre.split())\n"
        "        end_tok = start_tok + len(n.split())\n"
        "        if end_tok > start_tok:\n"
        "            ner_spans.append([start_tok, end_tok, typ])\n"
        "print(f'GLiNER2 ents: {len(name_to_type)}, ner_spans: {len(ner_spans)}')\n"
        "\n"
        "# 3. GLiREL — primero sin allowed (baseline notebook 02)\n"
        "rel_labels = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "raw = glirel.predict_relations(tokens, labels=rel_labels, threshold=0.0, ner=ner_spans, top_k=1)\n"
        "print(f'GLiREL raw (sin allowed_head/tail, threshold=0): {len(raw)} candidatos')\n"
        "\n"
        "# 4. Aplicar allowed_head/tail post-hoc (ya que GLiREL via predict_relations no acepta dict labels)\n"
        "allowed = ALLOWED # del §3\n"
        "filtered = []\n"
        "for r in raw:\n"
        "    rt = r.get('label')\n"
        "    if rt not in allowed: continue\n"
        "    head_ok, tail_ok = allowed[rt]\n"
        "    # TEXT.split() deja la puntuacion pegada al token (p.ej. 'Inditex,'):\n"
        "    # la quitamos para casar con name_to_type y con las tripletas de GLiNER2\n"
        "    h_text = ' '.join(r.get('head_text', [])).strip(' .,;:')\n"
        "    t_text = ' '.join(r.get('tail_text', [])).strip(' .,;:')\n"
        "    h_type = name_to_type.get(h_text.lower().strip())\n"
        "    t_type = name_to_type.get(t_text.lower().strip())\n"
        "    if h_type in head_ok and t_type in tail_ok and r.get('score', 0) >= 0.10:\n"
        "        filtered.append((h_text, rt, t_text, round(r.get('score', 0), 3)))\n"
        "print(f'GLiREL post-filter typed (threshold 0.10): {len(filtered)} relaciones')\n"
        "\n"
        "# 5. Mostrar las primeras 15\n"
        "for h, rt, t, s in filtered[:15]:\n"
        "    print(f' {h:32s} --[{rt:18s} {s}]--> {t}')"
    ))

    cells.append(_md(
        "**Lectura §5:** sin filtro typed, GLiREL emite cientos de candidatos espurios (lo que vimos en nb 02). "
        "**Con filtro typed + threshold 0.10**, queda un set limpio de relaciones cuya cabeza y cola tienen sentido. "
        "El coste extra: cargar GLiREL (~7s) y predict (~50ms). Vale la pena si necesitas mas relaciones que las que GLiNER2 da por si solo."
    ))

    # ── §6 Best combo
    cells.append(_md(
        "## §6 Best combo — todo junto sobre el corpus\n\n"
        "Aplicamos a la vez:\n"
        "1. Snake_case verbal (mejor variante §1)\n"
        "2. `include_confidence=True` con threshold global 0.3\n"
        "3. **Post-filter typed** (§3)\n"
        "4. **Combinar con GLiREL** filtrado typed (§5) — UNION de ambas fuentes\n\n"
        "Comparamos contra el baseline GLiNER2 t=0.3 sin post-procesado."
    ))

    cells.append(_code(
        "labels = ['works_at', 'located_in', 'ceo_of', 'president_of', 'headquartered_in', 'agreement_with']\n"
        "schema = model.create_schema().entities(ENTITY_LABELS).relations(labels)\n"
        "\n"
        "# baseline\n"
        "r = model.extract(TEXT, schema=schema, threshold=0.3)\n"
        "name_to_type = {n.lower().strip(): typ for typ, names in r['entities'].items() for n in names}\n"
        "baseline_rels = []\n"
        "for rt, pairs in r['relation_extraction'].items():\n"
        "    for h, t in pairs:\n"
        "        baseline_rels.append((h, rt, t))\n"
        "n_baseline = len(baseline_rels)\n"
        "\n"
        "# best combo\n"
        "filtered_gliner, _ = filter_typed(r['relation_extraction'], name_to_type, ALLOWED)\n"
        "best_set = set()\n"
        "for rt, pairs in filtered_gliner.items():\n"
        "    for h, t in pairs:\n"
        "        best_set.add((h, rt, t))\n"
        "for h, rt, t, s in filtered:\n"
        "    best_set.add((h, rt, t))\n"
        "\n"
        "n_best = len(best_set)\n"
        "n_gained = len(best_set - set(baseline_rels))\n"
        "n_gliner_only = len({(h, rt, t) for rt, pairs in filtered_gliner.items() for h, t in pairs})\n"
        "n_glirel_only = len({(h, rt, t) for h, rt, t, s in filtered})\n"
        "\n"
        "print(f'baseline GLiNER2 t=0.3 sin filter: {n_baseline} relaciones')\n"
        "print(f'GLiNER2 t=0.3 + post-filter typed: {n_gliner_only}')\n"
        "print(f'GLiREL filtered typed (threshold 0.10): {n_glirel_only}')\n"
        "print(f'UNION (GLiNER2 typed ∪ GLiREL typed): {n_best}')\n"
        "print(f' ganancia vs baseline: +{n_gained} relaciones')"
    ))

    cells.append(_code(
        "# Visualizar el grafo final\n"
        "G = nx.DiGraph()\n"
        "for typ, names in r['entities'].items():\n"
        "    for n in names:\n"
        "        G.add_node(n, type=typ)\n"
        "for h, rt, t in best_set:\n"
        "    G.add_node(h, type=name_to_type.get(h.lower().strip(), '?'))\n"
        "    G.add_node(t, type=name_to_type.get(t.lower().strip(), '?'))\n"
        "    G.add_edge(h, t, kind=rt)\n"
        "\n"
        "TYPE_COLOR = {'person': '#5DA5DA', 'organization': '#F17CB0', 'location': '#60BD68', '?': '#bbb'}\n"
        "fig, ax = plt.subplots(figsize=(13, 9))\n"
        "pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)\n"
        "cols = [TYPE_COLOR.get(G.nodes[n].get('type'), '#bbb') for n in G.nodes]\n"
        "nx.draw_networkx_nodes(G, pos, node_color=cols, node_size=1900, edgecolors='#333', linewidths=1.4, ax=ax)\n"
        "nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold', ax=ax)\n"
        "nx.draw_networkx_edges(G, pos, edge_color='#666', arrows=True, arrowsize=14, width=1.1, alpha=0.7, ax=ax, connectionstyle='arc3,rad=0.08')\n"
        "el = {(u,v): d['kind'] for u,v,d in G.edges(data=True)}\n"
        "nx.draw_networkx_edge_labels(G, pos, edge_labels=el, font_size=6.5, ax=ax,\n"
        "    bbox=dict(boxstyle='round,pad=0.1', fc='white', ec='none', alpha=0.85))\n"
        "ax.set_title(f'Best combo: {G.number_of_nodes()} nodos, {G.number_of_edges()} aristas', fontsize=12)\n"
        "ax.axis('off')\n"
        "legend = [Patch(facecolor=c, edgecolor='#333', label=t) for t, c in TYPE_COLOR.items() if t != '?']\n"
        "ax.legend(handles=legend, loc='upper left', fontsize=10)\n"
        "plt.tight_layout(); plt.show()"
    ))

    cells.append(_md(
        "## Conclusion\n\n"
        "**Receta operativa para `graph_explorer` post-experimentos:**\n\n"
        "1. ⭐⭐⭐ **Naming snake_case verbal** (`works_at`, `headquartered_in`) — sin coste, gran impacto.\n"
        "2. ⭐⭐⭐ **Post-filter typed** (`{rel: (head_types, tail_types)}`) — elimina la mayoria de falsos absurdos. **Pure, sin coste.**\n"
        "3. ⭐⭐ **`include_confidence=True` + threshold por relacion** — evita el threshold global mediocre.\n"
        "4. ⭐⭐ **GLiREL como complemento** (cargado solo cuando sea necesario) con allowed_head/tail aplicado post-hoc.\n"
        "5. (no toques) Descripciones por relacion — sin efecto medible.\n\n"
        "**Stack final:**\n\n"
        "```python\n"
        "# 1. labels en snake_case verbal\n"
        "labels = ['works_at', 'ceo_of', 'president_of', 'headquartered_in', ...]\n"
        "schema = model.create_schema().entities(['person', 'organization', 'location']).relations(labels)\n"
        "\n"
        "# 2. extract con confidence\n"
        "r = model.extract(text, schema=schema, threshold=0.3, include_confidence=True)\n"
        "\n"
        "# 3. post-filter typed (gratis)\n"
        # filter_typed returns (kept, dropped); the snippet unpacks it so that
        # `filtered` holds only the kept relations.
        "filtered, _drops = filter_typed(r['relation_extraction'], name_to_type, ALLOWED)\n"
        "\n"
        "# 4. opcional: GLiREL como segundo opinador con allowed_head/tail filtrado post-hoc\n"
        "if rich_mode:\n"
        "    glirel_rels = glirel.predict_relations(tokens, labels=labels, threshold=0.0, ner=ner_spans, top_k=1)\n"
        "    glirel_filtered = [r for r in glirel_rels if compatible_types(r, ALLOWED, name_to_type)]\n"
        "    final_rels = union(filtered, glirel_filtered)\n"
        "```\n\n"
        "**Funciones para promover al registry** (proximo fn-constructor):\n"
        "1. `gliner2_load_model_py_datascience` (Apache 2.0)\n"
        "2. `extract_graph_gliner2_py_datascience` (NER+RE, threshold por relacion, include_confidence)\n"
        "3. `filter_relations_by_entity_types_py_core` (PURE — el ALLOWED filter)\n"
        "4. `merge_extraction_sources_py_core` (PURE — UNION de GLiNER2 + GLiREL)\n"
        "5. `extract_graph_hybrid_gliner2_glirel_py_pipelines` (composicion)"
    ))

    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    # Make sure the target directory exists before writing the notebook.
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH} cells={len(cells)}")
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: regenerate the notebook on disk.
    build()
|