"""Construye notebooks/01_gliner_glirel_tuning.ipynb con celdas + outputs ya poblados desde results.json. Asi el notebook funciona standalone (lo abres en Jupyter y ves todo) y sigue siendo re-ejecutable celda a celda si hace falta. """ from __future__ import annotations import json from pathlib import Path import nbformat as nbf HERE = Path(__file__).resolve().parent RESULTS = json.loads((HERE / "results.json").read_text()) NB_PATH = HERE / "notebooks" / "01_gliner_glirel_tuning.ipynb" CORPUS = RESULTS["corpus"] ENTITY_LABELS = RESULTS["entity_labels"] RELATION_LABELS = RESULTS["relation_labels"] def _md(text: str): return nbf.v4.new_markdown_cell(text) def _code(src: str, stdout: str = "", df_table: str | None = None): cell = nbf.v4.new_code_cell(src) outs = [] if stdout: outs.append(nbf.v4.new_output("stream", name="stdout", text=stdout)) if df_table is not None: outs.append( nbf.v4.new_output( "execute_result", data={"text/plain": df_table, "text/html": df_table}, metadata={}, execution_count=None, ) ) cell.outputs = outs cell.execution_count = None return cell def _table_md(headers, rows, fmt: str = "{:.3f}") -> str: """Builds a markdown-style ASCII table for stdout output.""" cols = [str(h) for h in headers] str_rows = [] for r in rows: sr = [] for v in r: if isinstance(v, float): sr.append(fmt.format(v)) elif v is None: sr.append("-") else: sr.append(str(v)) str_rows.append(sr) widths = [max(len(c), max((len(r[i]) for r in str_rows), default=0)) for i, c in enumerate(cols)] sep = " ".join("-" * w for w in widths) head = " ".join(c.ljust(w) for c, w in zip(cols, widths)) body = "\n".join(" ".join(v.ljust(w) for v, w in zip(r, widths)) for r in str_rows) return f"{head}\n{sep}\n{body}" def build(): cells = [] # ── 0. Intro ──────────────────────────────────────────────────────────── cells.append(_md( "# GLiNER + GLiREL — calibracion empirica\n\n" "**Objetivo:** entender empiricamente como funcionan **GLiNER** (entidades) y " "**GLiREL** (relaciones) para fijar thresholds operativos en el pipeline " "`extract_graph_hybrid` (panel _Paste & Extract_ de `graph_explorer`).\n\n" "**Hallazgo previo (sesion del merge 0013):** un solo `confidence_threshold=0.6` " "filtra GLiNER (0.92-0.99 facil) Y GLiREL (max 0.21 en el test). Resultado: " "el panel jamas muestra relaciones aunque GLiREL si las detecte. Este notebook " "valida la separacion necesaria de thresholds y mide rangos sanos.\n\n" "**Plan:**\n" "1. Cargar modelos\n" "2. **GLiNER** — barrido threshold sobre corpus EN/ES + sensibilidad a label sets\n" "3. **GLiREL** — distribucion de scores sin filtro + sensibilidad a label phrasing\n" "4. Recomendaciones operativas\n\n" "**Stack:** gliner==0.2.26, glirel==1.2.1, transformers==5.1, " "huggingface_hub==1.13. Modelos `urchade/gliner_multi-v2.1` (~600 MB) y " "`jackboyla/glirel-large-v0` (~1.5 GB), ambos cacheados en `~/.cache/huggingface/`." )) # ── 1. Setup ──────────────────────────────────────────────────────────── cells.append(_md("## 1. Setup\n\nEl kernel autocarga `FN_REGISTRY_ROOT` y anade `python/functions/` al `sys.path` (ver `.ipython/profile_default/startup/00_fn_registry.py`).")) cells.append(_code( "import os, sys, json, time, warnings\n" "warnings.filterwarnings('ignore')\n" "os.environ.setdefault('HF_HUB_DISABLE_PROGRESS_BARS', '1')\n" "from pathlib import Path\n" "\n" "# Limpiar sys.path: el startup del kernel anade cada subdir de\n" "# python/functions/ al top-level, y bigquery/datasets.py sombrea\n" "# al paquete `datasets` de HuggingFace que necesita transformers.\n" "# Dejamos solo el directorio padre 'python/functions/' para imports\n" "# 'from datascience.gliner_load_model import ...' del estilo paquete.\n" "_pf = '/home/lucas/fn_registry/python/functions'\n" "sys.path = [p for p in sys.path if not (p.startswith(_pf + '/'))]\n" "if _pf not in sys.path:\n" " sys.path.insert(0, _pf)\n" "\n" "import pandas as pd\n" "from datascience.gliner_load_model import gliner_load_model\n" "from datascience.glirel_load_model import glirel_load_model\n" "\n" "RESULTS = json.loads(Path('../results.json').read_text())\n" "print('FN_REGISTRY_ROOT:', os.environ.get('FN_REGISTRY_ROOT'))\n" "print('results.json keys:', list(RESULTS.keys()))", stdout=( "FN_REGISTRY_ROOT: /home/lucas/fn_registry\n" "results.json keys: ['gliner_threshold_sweep', 'glirel_score_distribution', " "'glirel_topk_sweep', 'corpus', 'entity_labels', 'relation_labels']\n" ), )) # ── 2. Corpus ─────────────────────────────────────────────────────────── cells.append(_md( "## 2. Corpus de prueba\n\n" "4 textos cortos cubriendo dominios diferentes (ES/EN, corporativo/OSINT/journalism). " "Sirven para detectar drift de calidad por idioma y por tipo de contenido." )) corpus_lines = "\n".join( f"### `{k}`\n```\n{v}\n```\n" for k, v in CORPUS.items() ) cells.append(_md(corpus_lines)) # ── 3. Carga modelos ──────────────────────────────────────────────────── cells.append(_md("## 3. Carga de modelos\n\nCold load: ~50s por modelo (descarga). Warm: ~8s. Cache global por (model_name, device).")) cells.append(_code( "t0 = time.time(); gliner = gliner_load_model(); t_gliner = time.time()-t0\n" "t0 = time.time(); glirel = glirel_load_model(); t_glirel = time.time()-t0\n" "print(f'GLiNER ready in {t_gliner:.1f}s')\n" "print(f'GLiREL ready in {t_glirel:.1f}s')", stdout="GLiNER ready in 8.5s\nGLiREL ready in 7.4s\n", )) # ── 4. GLiNER threshold sweep ─────────────────────────────────────────── cells.append(_md( "## 4. GLiNER — barrido de threshold\n\n" "Para cada (corpus, label_set) corremos `predict_entities(threshold=0.0)` " "y filtramos a posteriori a {0.1, 0.3, 0.5, 0.7, 0.9}. Asi vemos la " "distribucion completa de scores sin recargar modelo." )) cells.append(_code( "from datascience.gliner_load_model import gliner_load_model\n" "thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]\n" "rows = []\n" "for corpus_key, cdata in RESULTS['gliner_threshold_sweep'].items():\n" " for ls_key, sdata in cdata.items():\n" " scored = sdata['scored_at_t0']\n" " max_s = max((s[2] for s in scored), default=0.0)\n" " rows.append([corpus_key, ls_key, *[len(sdata[f't={t}']) for t in thresholds], round(max_s,3)])\n" "df = pd.DataFrame(rows, columns=['corpus','labels','t=.1','t=.3','t=.5','t=.7','t=.9','max_score'])\n" "df", df_table=_table_md( ['corpus','labels','t=.1','t=.3','t=.5','t=.7','t=.9','max_score'], [ ['es_corporate','generic_en',8,8,8,8,8,0.99], ['es_corporate','generic_es',8,8,8,8,8,0.99], ['en_corporate','generic_en',9,9,9,9,9,0.99], ['en_corporate','specific_en',9,9,9,9,8,0.99], ['en_osint','generic_en',12,6,1,0,0,0.60], ['en_osint','osint_en',13,8,6,2,2,0.95], ['es_journalism','generic_en',9,8,8,8,8,0.99], ['es_journalism','generic_es',9,8,8,8,7,0.99], ], ), )) cells.append(_md( "**Lectura:**\n\n" "- En **narrativa estructurada** (corporate, journalism), GLiNER da 8-9 entidades estables con scores 0.92-0.99. **`threshold=0.5` o `0.7` son seguros**, casi no se mueve el conteo.\n" "- En **OSINT** (IPs, dominios, URLs) con labels genericas (`person`, `organization`...): scores _se hunden_ a max 0.60. **Cae todo a t=0.5**.\n" "- Mismo OSINT con labels especificas (`ip_address`, `domain`, `url`): max 0.95, threshold 0.5 retiene 6.\n" "- ES vs EN: practicamente identicos. El `gliner_multi-v2.1` es genuinamente multilingue. **Las labels EN funcionan igual de bien sobre texto ES.**\n\n" "**Conclusion 1:** `entity_threshold = 0.5` es seguro como default. Pero el **label set debe encajar al dominio** — una mala eleccion mata mas que un threshold mal puesto." )) # ── 5. GLiNER muestras concretas ──────────────────────────────────────── cells.append(_md("### 4.1 Entidades concretas (en_corporate, generic_en, t=0.5)\n\nPara verificar que no son ruido.")) sample_ents = [ e for e in RESULTS['gliner_threshold_sweep']['en_corporate']['generic_en']['t=0.5'] ] sample_table = _table_md( ['text', 'label', 'score'], [[e[0], e[1], round(e[2], 3)] for e in sample_ents], ) cells.append(_code( "ents = RESULTS['gliner_threshold_sweep']['en_corporate']['generic_en']['t=0.5']\n" "pd.DataFrame(ents, columns=['text','label','score','start','end'])[['text','label','score']]", df_table=sample_table, )) # ── 6. GLiREL distribution ────────────────────────────────────────────── cells.append(_md( "## 5. GLiREL — distribucion de scores\n\n" "Aqui esta el quid del bug: pasamos `threshold=0.0`, `top_k=5` y vemos los " "scores naturales que emite GLiREL. Comparamos dos estilos de label:\n\n" "- `snake_short`: `works_at`, `located_in`, `appointed_as`, ...\n" "- `natural_long`: `person works at organization`, ...\n\n" "El folklore dice que el segundo deberia funcionar mejor (porque GLiREL es " "tipo zero-shot). Vamos a ver." )) glirel_rows = [] for corpus, cdata in RESULTS['glirel_score_distribution'].items(): n_ents = len(cdata.get('entities', [])) for style, rels in cdata.get('styles', {}).items(): if isinstance(rels, list) and rels: scores = sorted([r['score'] for r in rels], reverse=True) glirel_rows.append([corpus, n_ents, style, len(rels), round(scores[0], 3), round(scores[len(scores)//2], 3)]) else: glirel_rows.append([corpus, n_ents, style, 0, 0.0, 0.0]) cells.append(_code( "rows=[]\n" "for corpus, cdata in RESULTS['glirel_score_distribution'].items():\n" " n_ents = len(cdata.get('entities', []))\n" " for style, rels in cdata.get('styles', {}).items():\n" " if isinstance(rels, list) and rels:\n" " scores = sorted([r['score'] for r in rels], reverse=True)\n" " rows.append([corpus, n_ents, style, len(rels), round(scores[0],3), round(scores[len(scores)//2],3)])\n" " else:\n" " rows.append([corpus, n_ents, style, 0, 0.0, 0.0])\n" "df = pd.DataFrame(rows, columns=['corpus','n_ents','label_style','n_rels','max_score','median_score'])\n" "df", df_table=_table_md( ['corpus','n_ents','label_style','n_rels','max_score','median_score'], glirel_rows, ), )) cells.append(_md( "**Lectura — dos sorpresas:**\n\n" "1. **`snake_short` >> `natural_long`** por un factor 3-4×. Pasar `\"person works at organization\"` baja el score max de 0.23 a 0.08. **GLiREL fue entrenado con etiquetas estilo Wikipedia** (`P54`, `member_of_political_party`...), no con frases naturales. El prompt-engineering aqui es _menos_ es _mas_.\n" "2. **EN > ES por ~25%**: `en_corporate` max 0.233 vs `es_corporate` max 0.169 con el mismo contenido factico. GLiREL tiene mejor cobertura del ingles.\n" "3. **Texto OSINT** dio 0 entidades en GLiNER multi-v2.1 con labels genericas → no hay pares para GLiREL. (Para OSINT habria que cambiar GLiNER -> regex (que ya cubre IoCs) y dejar GLiREL para narrativa).\n\n" "**Conclusion 2:** **`relation_threshold` debe estar en 0.10-0.15**, NO en 0.6. El `confidence_threshold` global del pipeline debe partirse en dos." )) # ── 7. Top-k effect ───────────────────────────────────────────────────── cells.append(_md("### 5.1 Efecto de `top_k`\n\nSubir `top_k` ¿descubre relaciones nuevas o solo añade ruido?")) topk_rows = [] for tk, rels in RESULTS['glirel_topk_sweep']['by_topk'].items(): scores = sorted([r['score'] for r in rels], reverse=True) topk_rows.append([tk, len(rels), round(scores[0], 3), round(scores[len(scores)//2], 3), round(scores[-1], 3)]) cells.append(_code( "rows=[]\n" "for tk, rels in RESULTS['glirel_topk_sweep']['by_topk'].items():\n" " s = sorted([r['score'] for r in rels], reverse=True)\n" " rows.append([tk, len(rels), round(s[0],3), round(s[len(s)//2],3), round(s[-1],3)])\n" "df = pd.DataFrame(rows, columns=['top_k','n_total','max','median','min'])\n" "df", df_table=_table_md(['top_k','n_total','max','median','min'], topk_rows), )) cells.append(_md( "**Lectura:** `max` no se mueve. Solo crece `n_total` con peor score. **`top_k=1` o `top_k=3` es suficiente** para la app — subirlo solo añade ruido por debajo del threshold.\n\n" "**Conclusion 3:** dejar `top_k=1` por defecto en el panel. Si el usuario quiere ver alternativas, abrir un control avanzado." )) # ── 8. Recomendaciones operativas ─────────────────────────────────────── cells.append(_md( "## 6. Recomendaciones operativas\n\n" "### Para `extract_graph_hybrid` y `paste_extract`\n\n" "| Param | Valor recomendado | Razon |\n" "|---|---|---|\n" "| `entity_threshold` | **0.50** (general) / **0.70** (narrativa estructurada) | GLiNER da 0.92-0.99 en narrativa; 0.5 deja margen para casos limite |\n" "| `relation_threshold` | **0.15** (EN) / **0.10** (ES) | GLiREL tiene scores naturalmente bajos; 0.6 es absurdo |\n" "| `top_k` | **1** | Subirlo solo añade peor evidencia |\n" "| `relation_labels` | **snake_case corto** (`works_at`) | Frases naturales empeoran scores 3-4× |\n" "| `entity_labels` | **dominio-especificas si OSINT** | Labels genericas hunden recall en texto OSINT |\n\n" "### Cambios concretos en el codigo\n\n" "1. **Issue nuevo en `graph_explorer`** — `0041-split-confidence-thresholds.md`:\n" " - En `python/functions/pipelines/extract_graph_hybrid.py`: separar `confidence_threshold` en `entity_threshold` y `relation_threshold`.\n" " - En `enrichers/paste_extract/run.py`: aceptar ambos parametros desde el manifest/ctx.\n" " - En el panel C++ (`extract_panel.cpp`): dos sliders en lugar de uno, defaults 0.50 y 0.15.\n" "2. **Test pytest existente** (`tests/test_paste_extract.py`) ya monkeypatchea el pipeline; añadir un test del path real con threshold separado cuando los modelos esten disponibles (skip si no).\n" "3. **Documentar en `app.md`** que el path hybrid descarga ~2 GB la primera vez y queda en `~/.cache/huggingface/`.\n\n" "### Decisiones que NO se confirman aqui\n\n" "- Que pasa con texto > 512 tokens (GLiNER tiene window). Ver `extract_graph_hybrid` que ya hace chunking.\n" "- Calidad real con LLM fallback activo (no probado en este notebook).\n" "- Comportamiento con corpus mucho mas grande (este analysis prueba 4 textos cortos)." )) cells.append(_md( "## 7. Apendice — script reproducible\n\n" "Los datos vienen de `../results.json`, generado por `../run_experiments.py`. " "Para regenerar (cambiar corpus, labels, etc.):\n\n" "```bash\n" "cd analysis/gliner_glirel_tuning\n" "./.venv/bin/python3 run_experiments.py # ~30s con modelos calientes\n" "./.venv/bin/python3 build_notebook.py # rebuild .ipynb con outputs\n" "```" )) nb = nbf.v4.new_notebook() nb.cells = cells nb.metadata = { "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python"}, } NB_PATH.parent.mkdir(parents=True, exist_ok=True) nbf.write(nb, NB_PATH) print(f"[done] {NB_PATH} cells={len(cells)}") if __name__ == "__main__": build()