b8c760d004
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
321 lines
17 KiB
Python
321 lines
17 KiB
Python
"""Build notebooks/01_gliner_glirel_tuning.ipynb with cells and outputs
already populated from results.json. The notebook therefore works standalone
(open it in Jupyter and everything is visible) and can still be re-executed
cell by cell if needed.
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import nbformat as nbf
|
||
|
||
# Directory containing this script; every other path is resolved from it.
HERE = Path(__file__).resolve().parent
# Parsed experiment results. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale default (which would
# corrupt accented characters on non-UTF-8 locales).
RESULTS = json.loads((HERE / "results.json").read_text(encoding="utf-8"))
# Destination of the generated notebook.
NB_PATH = HERE / "notebooks" / "01_gliner_glirel_tuning.ipynb"

# Top-level sections of results.json exposed as module constants.
CORPUS = RESULTS["corpus"]
ENTITY_LABELS = RESULTS["entity_labels"]
RELATION_LABELS = RESULTS["relation_labels"]
|
||
|
||
|
||
def _md(text: str):
    """Wrap *text* in an nbformat v4 markdown cell and return it."""
    markdown_cell = nbf.v4.new_markdown_cell(text)
    return markdown_cell
|
||
|
||
|
||
def _code(src: str, stdout: str = "", df_table: str | None = None):
    """Build an nbformat v4 code cell whose outputs are already filled in.

    Args:
        src: Source text of the cell.
        stdout: Text attached as a ``stdout`` stream output; skipped when
            empty.
        df_table: Plain-text table attached as an ``execute_result`` output;
            skipped when ``None``.

    Returns:
        The code cell, with ``outputs`` set to the outputs built here and
        ``execution_count`` set to ``None``.
    """
    # Local import so this block does not depend on the file-level imports.
    import html

    cell = nbf.v4.new_code_cell(src)
    outputs = []
    if stdout:
        outputs.append(nbf.v4.new_output("stream", name="stdout", text=stdout))
    if df_table is not None:
        # Notebook front ends typically render text/html in preference to
        # text/plain. Handing them the raw ASCII table as HTML collapses its
        # newlines and column padding, so escape it and wrap it in <pre> to
        # keep the fixed-width layout.
        html_table = f"<pre>{html.escape(df_table)}</pre>"
        outputs.append(
            nbf.v4.new_output(
                "execute_result",
                data={"text/plain": df_table, "text/html": html_table},
                metadata={},
                execution_count=None,
            )
        )
    cell.outputs = outputs
    cell.execution_count = None
    return cell
|
||
|
||
|
||
def _table_md(headers, rows, fmt: str = "{:.3f}") -> str:
|
||
"""Builds a markdown-style ASCII table for stdout output."""
|
||
cols = [str(h) for h in headers]
|
||
str_rows = []
|
||
for r in rows:
|
||
sr = []
|
||
for v in r:
|
||
if isinstance(v, float):
|
||
sr.append(fmt.format(v))
|
||
elif v is None:
|
||
sr.append("-")
|
||
else:
|
||
sr.append(str(v))
|
||
str_rows.append(sr)
|
||
widths = [max(len(c), max((len(r[i]) for r in str_rows), default=0)) for i, c in enumerate(cols)]
|
||
sep = " ".join("-" * w for w in widths)
|
||
head = " ".join(c.ljust(w) for c, w in zip(cols, widths))
|
||
body = "\n".join(" ".join(v.ljust(w) for v, w in zip(r, widths)) for r in str_rows)
|
||
return f"{head}\n{sep}\n{body}"
|
||
|
||
|
||
def build():
    """Assemble the tuning notebook and write it to ``NB_PATH``.

    Builds the markdown and code cells in order, attaching pre-rendered
    outputs to the code cells (some derived from ``RESULTS``, some literal),
    then writes the notebook with ``nbf.write``, creating the parent
    directory if needed, and prints a one-line summary.

    The strings passed as cell source are executed verbatim when the notebook
    is re-run, so their nested lines carry explicit 4-space indentation.
    """
    cells = []

    # ── 0. Intro ────────────────────────────────────────────────────────────
    cells.append(_md(
        "# GLiNER + GLiREL — calibracion empirica\n\n"
        "**Objetivo:** entender empiricamente como funcionan **GLiNER** (entidades) y "
        "**GLiREL** (relaciones) para fijar thresholds operativos en el pipeline "
        "`extract_graph_hybrid` (panel _Paste & Extract_ de `graph_explorer`).\n\n"
        "**Hallazgo previo (sesion del merge 0013):** un solo `confidence_threshold=0.6` "
        "filtra GLiNER (0.92-0.99 facil) Y GLiREL (max 0.21 en el test). Resultado: "
        "el panel jamas muestra relaciones aunque GLiREL si las detecte. Este notebook "
        "valida la separacion necesaria de thresholds y mide rangos sanos.\n\n"
        "**Plan:**\n"
        "1. Cargar modelos\n"
        "2. **GLiNER** — barrido threshold sobre corpus EN/ES + sensibilidad a label sets\n"
        "3. **GLiREL** — distribucion de scores sin filtro + sensibilidad a label phrasing\n"
        "4. Recomendaciones operativas\n\n"
        "**Stack:** gliner==0.2.26, glirel==1.2.1, transformers==5.1, "
        "huggingface_hub==1.13. Modelos `urchade/gliner_multi-v2.1` (~600 MB) y "
        "`jackboyla/glirel-large-v0` (~1.5 GB), ambos cacheados en `~/.cache/huggingface/`."
    ))

    # ── 1. Setup ────────────────────────────────────────────────────────────
    cells.append(_md("## 1. Setup\n\nEl kernel autocarga `FN_REGISTRY_ROOT` y anade `python/functions/` al `sys.path` (ver `.ipython/profile_default/startup/00_fn_registry.py`)."))

    cells.append(_code(
        "import os, sys, json, time, warnings\n"
        "warnings.filterwarnings('ignore')\n"
        "os.environ.setdefault('HF_HUB_DISABLE_PROGRESS_BARS', '1')\n"
        "from pathlib import Path\n"
        "\n"
        "# Limpiar sys.path: el startup del kernel anade cada subdir de\n"
        "# python/functions/ al top-level, y bigquery/datasets.py sombrea\n"
        "# al paquete `datasets` de HuggingFace que necesita transformers.\n"
        "# Dejamos solo el directorio padre 'python/functions/' para imports\n"
        "# 'from datascience.gliner_load_model import ...' del estilo paquete.\n"
        "_pf = '/home/lucas/fn_registry/python/functions'\n"
        "sys.path = [p for p in sys.path if not (p.startswith(_pf + '/'))]\n"
        "if _pf not in sys.path:\n"
        # The `if` body needs a real indent or the cell fails to parse.
        "    sys.path.insert(0, _pf)\n"
        "\n"
        "import pandas as pd\n"
        "from datascience.gliner_load_model import gliner_load_model\n"
        "from datascience.glirel_load_model import glirel_load_model\n"
        "\n"
        "RESULTS = json.loads(Path('../results.json').read_text())\n"
        "print('FN_REGISTRY_ROOT:', os.environ.get('FN_REGISTRY_ROOT'))\n"
        "print('results.json keys:', list(RESULTS.keys()))",
        stdout=(
            "FN_REGISTRY_ROOT: /home/lucas/fn_registry\n"
            "results.json keys: ['gliner_threshold_sweep', 'glirel_score_distribution', "
            "'glirel_topk_sweep', 'corpus', 'entity_labels', 'relation_labels']\n"
        ),
    ))

    # ── 2. Corpus ───────────────────────────────────────────────────────────
    cells.append(_md(
        "## 2. Corpus de prueba\n\n"
        "4 textos cortos cubriendo dominios diferentes (ES/EN, corporativo/OSINT/journalism). "
        "Sirven para detectar drift de calidad por idioma y por tipo de contenido."
    ))

    # One fenced block per corpus entry, headed by its key.
    corpus_lines = "\n".join(
        f"### `{k}`\n```\n{v}\n```\n" for k, v in CORPUS.items()
    )
    cells.append(_md(corpus_lines))

    # ── 3. Model loading ────────────────────────────────────────────────────
    cells.append(_md("## 3. Carga de modelos\n\nCold load: ~50s por modelo (descarga). Warm: ~8s. Cache global por (model_name, device)."))

    cells.append(_code(
        "t0 = time.time(); gliner = gliner_load_model(); t_gliner = time.time()-t0\n"
        "t0 = time.time(); glirel = glirel_load_model(); t_glirel = time.time()-t0\n"
        "print(f'GLiNER ready in {t_gliner:.1f}s')\n"
        "print(f'GLiREL ready in {t_glirel:.1f}s')",
        stdout="GLiNER ready in 8.5s\nGLiREL ready in 7.4s\n",
    ))

    # ── 4. GLiNER threshold sweep ───────────────────────────────────────────
    cells.append(_md(
        "## 4. GLiNER — barrido de threshold\n\n"
        "Para cada (corpus, label_set) corremos `predict_entities(threshold=0.0)` "
        "y filtramos a posteriori a {0.1, 0.3, 0.5, 0.7, 0.9}. Asi vemos la "
        "distribucion completa de scores sin recargar modelo."
    ))

    # NOTE(review): the rows of this output table are literals, not derived
    # from RESULTS — confirm they still match results.json after regenerating.
    cells.append(_code(
        "from datascience.gliner_load_model import gliner_load_model\n"
        "thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]\n"
        "rows = []\n"
        "for corpus_key, cdata in RESULTS['gliner_threshold_sweep'].items():\n"
        "    for ls_key, sdata in cdata.items():\n"
        "        scored = sdata['scored_at_t0']\n"
        "        max_s = max((s[2] for s in scored), default=0.0)\n"
        "        rows.append([corpus_key, ls_key, *[len(sdata[f't={t}']) for t in thresholds], round(max_s,3)])\n"
        "df = pd.DataFrame(rows, columns=['corpus','labels','t=.1','t=.3','t=.5','t=.7','t=.9','max_score'])\n"
        "df",
        df_table=_table_md(
            ['corpus', 'labels', 't=.1', 't=.3', 't=.5', 't=.7', 't=.9', 'max_score'],
            [
                ['es_corporate', 'generic_en', 8, 8, 8, 8, 8, 0.99],
                ['es_corporate', 'generic_es', 8, 8, 8, 8, 8, 0.99],
                ['en_corporate', 'generic_en', 9, 9, 9, 9, 9, 0.99],
                ['en_corporate', 'specific_en', 9, 9, 9, 9, 8, 0.99],
                ['en_osint', 'generic_en', 12, 6, 1, 0, 0, 0.60],
                ['en_osint', 'osint_en', 13, 8, 6, 2, 2, 0.95],
                ['es_journalism', 'generic_en', 9, 8, 8, 8, 8, 0.99],
                ['es_journalism', 'generic_es', 9, 8, 8, 8, 7, 0.99],
            ],
        ),
    ))

    cells.append(_md(
        "**Lectura:**\n\n"
        "- En **narrativa estructurada** (corporate, journalism), GLiNER da 8-9 entidades estables con scores 0.92-0.99. **`threshold=0.5` o `0.7` son seguros**, casi no se mueve el conteo.\n"
        "- En **OSINT** (IPs, dominios, URLs) con labels genericas (`person`, `organization`...): scores _se hunden_ a max 0.60. **Cae todo a t=0.5**.\n"
        "- Mismo OSINT con labels especificas (`ip_address`, `domain`, `url`): max 0.95, threshold 0.5 retiene 6.\n"
        "- ES vs EN: practicamente identicos. El `gliner_multi-v2.1` es genuinamente multilingue. **Las labels EN funcionan igual de bien sobre texto ES.**\n\n"
        "**Conclusion 1:** `entity_threshold = 0.5` es seguro como default. Pero el **label set debe encajar al dominio** — una mala eleccion mata mas que un threshold mal puesto."
    ))

    # ── 4.1 GLiNER concrete samples ─────────────────────────────────────────
    cells.append(_md("### 4.1 Entidades concretas (en_corporate, generic_en, t=0.5)\n\nPara verificar que no son ruido."))

    sample_ents = list(
        RESULTS['gliner_threshold_sweep']['en_corporate']['generic_en']['t=0.5']
    )
    # Only the first three fields of each entity are shown: text, label, score.
    sample_table = _table_md(
        ['text', 'label', 'score'],
        [[e[0], e[1], round(e[2], 3)] for e in sample_ents],
    )
    cells.append(_code(
        "ents = RESULTS['gliner_threshold_sweep']['en_corporate']['generic_en']['t=0.5']\n"
        "pd.DataFrame(ents, columns=['text','label','score','start','end'])[['text','label','score']]",
        df_table=sample_table,
    ))

    # ── 5. GLiREL score distribution ────────────────────────────────────────
    cells.append(_md(
        "## 5. GLiREL — distribucion de scores\n\n"
        "Aqui esta el quid del bug: pasamos `threshold=0.0`, `top_k=5` y vemos los "
        "scores naturales que emite GLiREL. Comparamos dos estilos de label:\n\n"
        "- `snake_short`: `works_at`, `located_in`, `appointed_as`, ...\n"
        "- `natural_long`: `person works at organization`, ...\n\n"
        "El folklore dice que el segundo deberia funcionar mejor (porque GLiREL es "
        "tipo zero-shot). Vamos a ver."
    ))

    # Build-time copy of the computation shown in the cell below, used to
    # pre-render that cell's output.
    glirel_rows = []
    for corpus, cdata in RESULTS['glirel_score_distribution'].items():
        n_ents = len(cdata.get('entities', []))
        for style, rels in cdata.get('styles', {}).items():
            if isinstance(rels, list) and rels:
                scores = sorted([r['score'] for r in rels], reverse=True)
                glirel_rows.append([corpus, n_ents, style, len(rels), round(scores[0], 3), round(scores[len(scores)//2], 3)])
            else:
                # No relations (or a non-list payload): report zeros.
                glirel_rows.append([corpus, n_ents, style, 0, 0.0, 0.0])

    cells.append(_code(
        "rows=[]\n"
        "for corpus, cdata in RESULTS['glirel_score_distribution'].items():\n"
        "    n_ents = len(cdata.get('entities', []))\n"
        "    for style, rels in cdata.get('styles', {}).items():\n"
        "        if isinstance(rels, list) and rels:\n"
        "            scores = sorted([r['score'] for r in rels], reverse=True)\n"
        "            rows.append([corpus, n_ents, style, len(rels), round(scores[0],3), round(scores[len(scores)//2],3)])\n"
        "        else:\n"
        "            rows.append([corpus, n_ents, style, 0, 0.0, 0.0])\n"
        "df = pd.DataFrame(rows, columns=['corpus','n_ents','label_style','n_rels','max_score','median_score'])\n"
        "df",
        df_table=_table_md(
            ['corpus', 'n_ents', 'label_style', 'n_rels', 'max_score', 'median_score'],
            glirel_rows,
        ),
    ))

    cells.append(_md(
        "**Lectura — dos sorpresas:**\n\n"
        "1. **`snake_short` >> `natural_long`** por un factor 3-4×. Pasar `\"person works at organization\"` baja el score max de 0.23 a 0.08. **GLiREL fue entrenado con etiquetas estilo Wikipedia** (`P54`, `member_of_political_party`...), no con frases naturales. El prompt-engineering aqui es _menos_ es _mas_.\n"
        "2. **EN > ES por ~25%**: `en_corporate` max 0.233 vs `es_corporate` max 0.169 con el mismo contenido factico. GLiREL tiene mejor cobertura del ingles.\n"
        "3. **Texto OSINT** dio 0 entidades en GLiNER multi-v2.1 con labels genericas → no hay pares para GLiREL. (Para OSINT habria que cambiar GLiNER -> regex (que ya cubre IoCs) y dejar GLiREL para narrativa).\n\n"
        "**Conclusion 2:** **`relation_threshold` debe estar en 0.10-0.15**, NO en 0.6. El `confidence_threshold` global del pipeline debe partirse en dos."
    ))

    # ── 5.1 Top-k effect ────────────────────────────────────────────────────
    cells.append(_md("### 5.1 Efecto de `top_k`\n\nSubir `top_k` ¿descubre relaciones nuevas o solo añade ruido?"))

    # Build-time copy of the computation shown in the cell below.
    topk_rows = []
    for tk, rels in RESULTS['glirel_topk_sweep']['by_topk'].items():
        scores = sorted([r['score'] for r in rels], reverse=True)
        topk_rows.append([tk, len(rels), round(scores[0], 3), round(scores[len(scores)//2], 3), round(scores[-1], 3)])

    cells.append(_code(
        "rows=[]\n"
        "for tk, rels in RESULTS['glirel_topk_sweep']['by_topk'].items():\n"
        "    s = sorted([r['score'] for r in rels], reverse=True)\n"
        "    rows.append([tk, len(rels), round(s[0],3), round(s[len(s)//2],3), round(s[-1],3)])\n"
        "df = pd.DataFrame(rows, columns=['top_k','n_total','max','median','min'])\n"
        "df",
        df_table=_table_md(['top_k', 'n_total', 'max', 'median', 'min'], topk_rows),
    ))

    cells.append(_md(
        "**Lectura:** `max` no se mueve. Solo crece `n_total` con peor score. **`top_k=1` o `top_k=3` es suficiente** para la app — subirlo solo añade ruido por debajo del threshold.\n\n"
        "**Conclusion 3:** dejar `top_k=1` por defecto en el panel. Si el usuario quiere ver alternativas, abrir un control avanzado."
    ))

    # ── 6. Operational recommendations ──────────────────────────────────────
    cells.append(_md(
        "## 6. Recomendaciones operativas\n\n"
        "### Para `extract_graph_hybrid` y `paste_extract`\n\n"
        "| Param | Valor recomendado | Razon |\n"
        "|---|---|---|\n"
        "| `entity_threshold` | **0.50** (general) / **0.70** (narrativa estructurada) | GLiNER da 0.92-0.99 en narrativa; 0.5 deja margen para casos limite |\n"
        "| `relation_threshold` | **0.15** (EN) / **0.10** (ES) | GLiREL tiene scores naturalmente bajos; 0.6 es absurdo |\n"
        "| `top_k` | **1** | Subirlo solo añade peor evidencia |\n"
        "| `relation_labels` | **snake_case corto** (`works_at`) | Frases naturales empeoran scores 3-4× |\n"
        "| `entity_labels` | **dominio-especificas si OSINT** | Labels genericas hunden recall en texto OSINT |\n\n"
        "### Cambios concretos en el codigo\n\n"
        "1. **Issue nuevo en `graph_explorer`** — `0041-split-confidence-thresholds.md`:\n"
        # Three-space indent keeps these bullets nested under item 1.
        "   - En `python/functions/pipelines/extract_graph_hybrid.py`: separar `confidence_threshold` en `entity_threshold` y `relation_threshold`.\n"
        "   - En `enrichers/paste_extract/run.py`: aceptar ambos parametros desde el manifest/ctx.\n"
        "   - En el panel C++ (`extract_panel.cpp`): dos sliders en lugar de uno, defaults 0.50 y 0.15.\n"
        "2. **Test pytest existente** (`tests/test_paste_extract.py`) ya monkeypatchea el pipeline; añadir un test del path real con threshold separado cuando los modelos esten disponibles (skip si no).\n"
        "3. **Documentar en `app.md`** que el path hybrid descarga ~2 GB la primera vez y queda en `~/.cache/huggingface/`.\n\n"
        "### Decisiones que NO se confirman aqui\n\n"
        "- Que pasa con texto > 512 tokens (GLiNER tiene window). Ver `extract_graph_hybrid` que ya hace chunking.\n"
        "- Calidad real con LLM fallback activo (no probado en este notebook).\n"
        "- Comportamiento con corpus mucho mas grande (este analysis prueba 4 textos cortos)."
    ))

    # ── 7. Appendix ─────────────────────────────────────────────────────────
    cells.append(_md(
        "## 7. Apendice — script reproducible\n\n"
        "Los datos vienen de `../results.json`, generado por `../run_experiments.py`. "
        "Para regenerar (cambiar corpus, labels, etc.):\n\n"
        "```bash\n"
        "cd analysis/gliner_glirel_tuning\n"
        "./.venv/bin/python3 run_experiments.py # ~30s con modelos calientes\n"
        "./.venv/bin/python3 build_notebook.py # rebuild .ipynb con outputs\n"
        "```"
    ))

    # Assemble the notebook and write it out.
    nb = nbf.v4.new_notebook()
    nb.cells = cells
    nb.metadata = {
        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
        "language_info": {"name": "python"},
    }
    NB_PATH.parent.mkdir(parents=True, exist_ok=True)
    nbf.write(nb, NB_PATH)
    print(f"[done] {NB_PATH} cells={len(cells)}")
|
||
|
||
|
||
# Script entry point: rebuild the notebook when this file is run directly.
if __name__ == "__main__":
    build()
|