chore: initial sync — gliner+glirel benchmark notebooks

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 23:44:11 +02:00
commit b8c760d004
49 changed files with 47850 additions and 0 deletions
+291
View File
@@ -0,0 +1,291 @@
<!DOCTYPE html>
<html lang="es">
<head>
<meta charset="utf-8">
<title>GLiNER2 Playground — graph_explorer</title>
<script src="/static/graphology.umd.min.js"></script>
<script src="/static/sigma.min.js"></script>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
html, body { height: 100%; font-family: -apple-system, "Segoe UI", Roboto, sans-serif;
background: #181a1f; color: #ddd; }
.app { display: grid; grid-template-columns: 420px 1fr; height: 100%; gap: 0; }
.left { padding: 16px; border-right: 1px solid #2a2d34; display: flex; flex-direction: column; gap: 12px; overflow-y: auto; }
h1 { font-size: 14px; font-weight: 600; letter-spacing: 0.02em; color: #fff; }
h1 .badge { background: #2c2f3a; color: #9aa0ad; padding: 2px 8px; border-radius: 4px;
font-size: 11px; margin-left: 8px; font-weight: 400; }
textarea { width: 100%; height: 320px; padding: 10px; font-family: ui-monospace, monospace;
font-size: 12px; line-height: 1.45; background: #14161b; color: #d8dadf;
border: 1px solid #2a2d34; border-radius: 6px; resize: vertical; }
textarea:focus { outline: none; border-color: #3d6cb8; }
.controls { display: flex; gap: 8px; align-items: center; }
button { background: #3d6cb8; color: #fff; border: none; padding: 8px 14px;
border-radius: 6px; font-weight: 600; cursor: pointer; font-size: 13px; }
button:hover { background: #4d7cc8; }
button:disabled { background: #555; cursor: not-allowed; }
label { font-size: 12px; color: #9aa0ad; display: flex; align-items: center; gap: 6px; }
input[type="number"] { width: 60px; padding: 4px 6px; background: #14161b; color: #d8dadf;
border: 1px solid #2a2d34; border-radius: 4px; font-size: 12px; }
.kpis { display: grid; grid-template-columns: 1fr 1fr; gap: 8px; margin-top: 4px; }
.kpi { background: #14161b; border: 1px solid #2a2d34; border-radius: 6px;
padding: 10px 12px; }
.kpi .num { font-size: 28px; font-weight: 700; color: #fff; }
.kpi .lbl { font-size: 11px; color: #9aa0ad; text-transform: uppercase; letter-spacing: 0.06em; }
.kpi.full { grid-column: span 2; }
.legend { display: flex; gap: 12px; flex-wrap: wrap; font-size: 11px; }
.legend-item { display: flex; align-items: center; gap: 4px; }
.swatch { width: 10px; height: 10px; border-radius: 50%; border: 1px solid #fff3; }
.right { background: #0e1015; position: relative; }
#graph { width: 100%; height: 100%; }
.empty-msg { position: absolute; inset: 0; display: flex; align-items: center;
justify-content: center; color: #4c5060; font-size: 14px; pointer-events: none; }
details { background: #14161b; border: 1px solid #2a2d34; border-radius: 6px; padding: 8px 10px;
font-size: 11px; color: #9aa0ad; }
details summary { cursor: pointer; color: #d8dadf; font-weight: 500; }
details pre { margin-top: 6px; font-size: 10px; line-height: 1.4; max-height: 280px; overflow: auto;
color: #d8dadf; font-family: ui-monospace, "JetBrains Mono", monospace;
background: #0e1015; padding: 6px; border-radius: 4px; white-space: pre; }
details[open] summary { color: #fff; margin-bottom: 4px; }
.examples { display: flex; flex-direction: column; gap: 4px; }
.examples a { color: #9aa0ad; font-size: 11px; cursor: pointer; padding: 4px 6px;
background: #14161b; border: 1px solid #2a2d34; border-radius: 4px; text-decoration: none; }
.examples a:hover { background: #1e2027; color: #d8dadf; }
</style>
</head>
<body>
<div class="app">
<div class="left">
<h1>GLiNER2 Playground <span class="badge">graph_explorer</span></h1>
<textarea id="input" placeholder="Pega aqui un texto en castellano (sector empresarial, OSINT, legal...)"></textarea>
<div class="controls">
<button id="btn">Procesar</button>
<label>threshold
<input id="threshold" type="number" value="0.3" step="0.05" min="0.1" max="0.9">
</label>
<span id="status" style="font-size: 11px; color: #6c7080;"></span>
</div>
<div class="kpis">
<div class="kpi"><div class="num" id="kpi-nodes"></div><div class="lbl">nodos</div></div>
<div class="kpi"><div class="num" id="kpi-edges"></div><div class="lbl">relaciones</div></div>
<div class="kpi full"><div class="num" id="kpi-time"></div><div class="lbl">tiempo (s)</div></div>
</div>
<div class="legend">
<div class="legend-item"><div class="swatch" style="background:#5DA5DA"></div>person</div>
<div class="legend-item"><div class="swatch" style="background:#F17CB0"></div>organization</div>
<div class="legend-item"><div class="swatch" style="background:#60BD68"></div>location</div>
</div>
<div class="examples">
<a data-ex="corp">📰 Ej: corporate ES (Pablo Isla / Inditex)</a>
<a data-ex="osint">🔒 Ej: OSINT ES (APT-29 / CozyBear)</a>
<a data-ex="banking">🏦 Ej: banca ES (BBVA / Sabadell / OPA)</a>
</div>
<details>
<summary>Stack aplicado</summary>
<pre>1. snake_case verbal labels
2. threshold (configurable)
3. post-filter typed (head_type, tail_type)
4. coreferencia normalize+substring
5. chunking automatico > 1500 chars
6. layout server-side (networkx spring_layout)
7. render: sigma.js + graphology</pre>
</details>
<details open>
<summary>Relaciones extraidas (texto)</summary>
<pre id="relations-text">(corre una extraccion para verlo)</pre>
</details>
<details>
<summary>Entidades extraidas por tipo</summary>
<pre id="entities-text">(corre una extraccion para verlo)</pre>
</details>
<details>
<summary>JSON completo</summary>
<pre id="raw-json">(corre una extraccion para verlo)</pre>
</details>
<details>
<summary>Relaciones descartadas por filtro typed</summary>
<pre id="dropped">(corre una extraccion para verlo)</pre>
</details>
</div>
<div class="right">
<div id="graph"></div>
<div class="empty-msg" id="empty">Pega un texto y pulsa Procesar</div>
</div>
</div>
<script>
// Swallow benign ResizeObserver warnings (vis-network used to trigger them; sigma may too).
window.addEventListener('error', (event) => {
  const isResizeObserverNoise = Boolean(event.message) && event.message.includes('ResizeObserver');
  if (!isResizeObserverNoise) return;
  // Keep later 'error' listeners on window from seeing this event.
  event.stopImmediatePropagation();
  return false;
});
// Node colour per entity type; '?' is the colour for nodes whose type is unknown.
const TYPE_COLOR = { person:'#5DA5DA', organization:'#F17CB0', location:'#60BD68', '?':'#888' };
// Sample texts loaded into the textarea by the example links, keyed by each link's data-ex value.
const EXAMPLES = {
  corp: `Pablo Isla, expresidente de Inditex, ha sido nombrado consejero de Telefonica. La operacion fue anunciada por el presidente Jose Maria Alvarez-Pallete en Madrid el pasado lunes. Inditex factura mas de 30.000 millones anuales y tiene su sede en Arteixo, A Coruna. En paralelo, Iberdrola y Endesa firmaron un acuerdo de colaboracion en proyectos eolicos en Galicia. El presidente de Iberdrola, Ignacio Galan, se reunio con la CEO de Endesa, Marina Serrano, en Bilbao. El BBVA, presidido por Carlos Torres, mostro interes en participar en la financiacion del proyecto. Su sede central esta en Bilbao.`,
  osint: `El 15 de agosto de 2024, el grupo APT-29 (atribuido a Rusia) lanzo una campana de phishing contra empresas energeticas espanolas. El servidor de comando y control 185.220.101.45 conectaba con sistemas internos de Iberdrola via TLS. El malware utilizado, identificado como CozyBear, exploto la vulnerabilidad CVE-2024-21412 en Microsoft Defender. El operador @phantomzero reivindico el ataque en un foro de la dark web. El analista Carlos Garcia, del CCN-CERT, publico un informe tecnico. Telefonica Tech alerto a sus clientes sobre indicadores de compromiso adicionales en el dominio cloudfront-cdn[.]net.`,
  banking: `BBVA, presidido por Carlos Torres, anuncio en mayo de 2024 una OPA hostil sobre Banco Sabadell. Onur Genc, consejero delegado del banco desde 2018, lidero el proceso desde la sede central en Bilbao. Cesar Gonzalez-Bueno, CEO de Sabadell, defendio la independencia junto con su presidente Josep Oliu. Banco Santander, dirigido por Ana Botin, sigue siendo el primer banco espanol. CaixaBank, presidida por Jose Ignacio Goirigolzarri y con sede en Valencia, completo la fusion con Bankia. El Banco de Espana, gobernado por Pablo Hernandez de Cos, supervisa el sector. Luis de Guindos, vicepresidente del Banco Central Europeo, fue ministro de Economia en el gobierno de Mariano Rajoy.`
};
// Clicking an example link copies its sample text into the input textarea
// (an unknown data-ex key clears the textarea).
for (const link of document.querySelectorAll('.examples a')) {
  link.onclick = () => {
    const sample = EXAMPLES[link.dataset.ex];
    document.getElementById('input').value = sample || '';
  };
}
// Active Sigma instance; kept so it can be destroyed before the next render.
let renderer = null;

/**
 * Draw an extraction result inside the #graph container.
 *
 * Builds a directed graphology graph from `data.nodes` / `data.edges` and hands
 * it to a fresh Sigma renderer. Any previous renderer is killed first.
 *
 * @param {{nodes?: Array<{id: string, label: string, type: string, x: number, y: number}>,
 *          edges?: Array<{from: string, to: string, label: string}>}} data
 *   Payload returned by POST /extract.
 */
function renderGraph(data) {
  const empty = document.getElementById('empty');
  const container = document.getElementById('graph');
  if (typeof graphology === 'undefined' || typeof Sigma === 'undefined') {
    empty.textContent = 'Sigma o graphology no cargaron — verifica /static/';
    return;
  }
  if (!data.nodes || !data.nodes.length) {
    empty.style.display = 'flex';
    empty.textContent = 'Sin nodos extraidos';
    if (renderer) { renderer.kill(); renderer = null; }
    return;
  }
  empty.style.display = 'none';

  // Build the graphology graph (the UMD bundle may expose the class in several ways).
  const Graph = graphology.Graph || graphology.default || graphology;
  const g = new Graph({ multi: false, type: 'directed', allowSelfLoops: false });
  data.nodes.forEach(n => {
    if (g.hasNode(n.id)) return;
    g.addNode(n.id, {
      label: n.label,
      // A missing or exactly-zero coordinate falls back to a random position,
      // which also spreads the nodes when they all arrive at the origin.
      x: n.x || Math.random() * 10,
      y: n.y || Math.random() * 10,
      size: 10,
      color: TYPE_COLOR[n.type] || '#888',
    });
  });

  // The graph is not a multigraph, so several relations between the same ordered
  // pair share one drawn edge. Their labels are merged so none is silently lost.
  const edgesByPair = new Map();
  (data.edges || []).forEach(e => {
    if (!g.hasNode(e.from) || !g.hasNode(e.to)) return;
    if (e.from === e.to) return;
    const pairKey = JSON.stringify([e.from, e.to]);
    if (!edgesByPair.has(pairKey)) {
      edgesByPair.set(pairKey, { from: e.from, to: e.to, labels: [] });
    }
    const label = e.label || '';
    const labels = edgesByPair.get(pairKey).labels;
    if (label && !labels.includes(label)) labels.push(label);
  });
  let edgeIndex = 0;
  edgesByPair.forEach(({ from, to, labels }) => {
    g.addEdgeWithKey(`e${edgeIndex}`, from, to, {
      label: labels.join(', '),
      size: 1.5,
      color: '#666',
      type: 'arrow',
    });
    edgeIndex += 1;
  });

  // Re-instantiate the renderer on a clean container.
  if (renderer) { renderer.kill(); renderer = null; }
  container.innerHTML = '';
  renderer = new Sigma(g, container, {
    renderEdgeLabels: true,
    defaultEdgeType: 'arrow',
    edgeLabelSize: 9,
    edgeLabelColor: { color: '#aaa' },
    labelColor: { color: '#fff' },
    labelSize: 12,
    labelDensity: 1.0,
    labelGridCellSize: 80,
    labelRenderedSizeThreshold: 6,
    minCameraRatio: 0.05,
    maxCameraRatio: 6,
  });
}
/**
 * "Procesar" click handler: POST the textarea content to /extract, then refresh
 * the KPIs, the text panels and the graph. The button stays disabled while the
 * request is in flight and is always re-enabled afterwards.
 */
document.getElementById('btn').onclick = async () => {
  const text = document.getElementById('input').value.trim();
  if (!text) { alert('Pega algo de texto'); return; }
  // An empty or non-numeric field parses to NaN, which JSON.stringify would send
  // as null. Leaving the key out lets the server apply its own default instead.
  const parsedThreshold = parseFloat(document.getElementById('threshold').value);
  const threshold = Number.isFinite(parsedThreshold) ? parsedThreshold : undefined;
  const btn = document.getElementById('btn');
  const status = document.getElementById('status');
  btn.disabled = true;
  // Rough progress hint: one chunk per 1500 characters, assuming ~1.5 s per chunk.
  const estChunks = Math.max(1, Math.ceil(text.length / 1500));
  status.textContent = estChunks > 1
    ? `procesando ${estChunks} chunks (~${(estChunks * 1.5).toFixed(0)}s)…`
    : 'procesando...';
  try {
    const res = await fetch('/extract', {
      method: 'POST', headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text, threshold }),
    });
    // Error responses are not guaranteed to be JSON, so a parse failure must
    // not hide the HTTP failure itself.
    let data = null;
    try {
      data = await res.json();
    } catch (parseError) {
      if (res.ok) throw parseError;
    }
    if (!res.ok) {
      throw new Error((data && data.error) || `extract failed (HTTP ${res.status})`);
    }
    document.getElementById('kpi-nodes').textContent = data.n_nodes;
    document.getElementById('kpi-edges').textContent = data.n_edges;
    document.getElementById('kpi-time').textContent = data.elapsed_s + 's';

    // Relations as text, padded into columns for readability.
    const relsText = (data.edges || []).length
      ? (() => {
          const padFrom = Math.max(...data.edges.map(e => e.from.length));
          const padKind = Math.max(...data.edges.map(e => (e.label || '').length));
          return data.edges.map(e =>
            `${e.from.padEnd(padFrom)} --[${(e.label || '').padEnd(padKind)}]--> ${e.to}`
          ).join('\n');
        })()
      : '(sin relaciones — prueba a bajar threshold o cambiar el texto)';
    document.getElementById('relations-text').textContent = relsText;

    // Entities grouped by type.
    const byType = {};
    (data.nodes || []).forEach(n => {
      const t = n.type || '?';
      if (!byType[t]) byType[t] = [];
      byType[t].push(n.id);
    });
    document.getElementById('entities-text').textContent =
      Object.keys(byType).sort().map(t =>
        `${t} (${byType[t].length}):\n ${byType[t].sort().join(', ')}`
      ).join('\n\n') || '(sin entidades)';

    // Full JSON (pretty-printed); node coordinates are left out.
    document.getElementById('raw-json').textContent = JSON.stringify({
      n_nodes: data.n_nodes,
      n_edges: data.n_edges,
      n_chunks: data.n_chunks,
      n_dropped_typed: data.n_dropped_typed,
      elapsed_s: data.elapsed_s,
      nodes: (data.nodes || []).map(n => ({ id: n.id, type: n.type })),
      edges: data.edges,
    }, null, 2);

    document.getElementById('dropped').textContent = (data.dropped || []).length
      ? data.dropped.map(d => `${d.from} (${d.head_type}) -[${d.kind}]-> ${d.to} (${d.tail_type})`).join('\n')
      : '(ninguna — el filtro typed no descarto nada)';

    const chunkInfo = data.n_chunks > 1 ? ` · ${data.n_chunks} chunks` : '';
    status.textContent = `${data.n_nodes} nodos · ${data.n_edges} aristas · ${data.elapsed_s}s${chunkInfo}`;
    renderGraph(data);
  } catch (e) {
    console.error('[playground] extract failed:', e);
    alert('Error: ' + e.message);
    status.textContent = 'error';
  } finally {
    btn.disabled = false;
  }
};
// Ctrl+Enter or Meta+Enter inside the textarea triggers the Procesar button.
document.getElementById('input').addEventListener('keydown', (event) => {
  const withModifier = event.ctrlKey || event.metaKey;
  if (withModifier && event.key === 'Enter') {
    document.getElementById('btn').click();
  }
});
</script>
</body>
</html>
+264
View File
@@ -0,0 +1,264 @@
"""Playground server — GLiNER2 + post-filter typed sobre cualquier texto.
Aplica las recetas del notebook 08:
- snake_case verbal labels
- threshold=0.3
- post-filter por (head_type, tail_type)
- coreference simple normalize+substring
Run:
cd playground && ../.venv/bin/python3 server.py
Luego: http://localhost:7878
"""
from __future__ import annotations
import os
import re
import sys
import time
import warnings
from collections import defaultdict
from pathlib import Path
# Ignore every Python warning raised from here on.
warnings.filterwarnings("ignore")
# Sets HF_HUB_DISABLE_PROGRESS_BARS=1 unless the variable is already defined —
# presumably to keep Hugging Face download progress bars out of the console.
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
# sys.path cleanup (same workaround documented in notebook 08): drop every entry
# located below _pf, then put _pf itself at the front if it is not already listed.
# NOTE(review): _pf is a machine-specific absolute path — confirm before running elsewhere.
_pf = "/home/lucas/fn_registry/python/functions"
sys.path = [p for p in sys.path if not p.startswith(_pf + "/")]
if _pf not in sys.path:
    sys.path.insert(0, _pf)
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from gliner2 import GLiNER2
# Directory containing this file; index.html and static/ are resolved against it.
HERE = Path(__file__).resolve().parent
# ── load the model once, at import time ──
print("[load] GLiNER2-large-v1 (CPU)...", flush=True)
t0 = time.time()
MODEL = GLiNER2.from_pretrained("fastino/gliner2-large-v1")
print(f"[load] done in {time.time()-t0:.1f}s", flush=True)
# ── recipes from notebook 08 ──
# Entity types requested from the model.
ENTITY_LABELS = ["person", "organization", "location"]
# Relation types requested from the model (snake_case verbal labels).
RELATION_LABELS = [
    "works_at", "located_in", "ceo_of", "president_of",
    "headquartered_in", "agreement_with", "subsidiary_of", "founded_by",
]
# Typed post-filter: relation label -> (allowed head entity types, allowed tail
# entity types). filter_typed keeps relation labels that are missing from this table.
ALLOWED = {
    "works_at": (["person"], ["organization"]),
    "ceo_of": (["person"], ["organization"]),
    "president_of": (["person"], ["organization"]),
    "headquartered_in": (["organization"], ["location"]),
    "located_in": (["organization", "person", "location"], ["location"]),
    "agreement_with": (["organization"], ["organization"]),
    "subsidiary_of": (["organization"], ["organization"]),
    "founded_by": (["organization"], ["person"]),
}
def normalize_name(s: str) -> str:
    """Return a comparison key for an entity name.

    The key is *s* with the punctuation characters ``. , ; : " ' ` ( ) [ ]``
    removed, whitespace runs collapsed to a single space, surrounding
    whitespace trimmed, and everything lowercased.
    """
    without_punctuation = re.sub(r"[\.,;:\"'`()\[\]]", "", s.strip())
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip().lower()
def merge_aliases(names: list[str]) -> dict[str, str]:
    """Map every name in *names* to a single canonical spelling.

    Two passes are applied:

    1. Names sharing the same ``normalize_name`` key are grouped; the longest
       spelling (ties: the lexicographically greatest) becomes the canonical
       name of the group.
    2. A canonical name whose normalized form has at least 4 characters and
       occurs as a whole-word substring of another canonical's normalized
       form is absorbed by that other name. Candidates are tried longest
       first, so the longest containing name wins.

    Args:
        names: Entity names as extracted (any casing / punctuation).

    Returns:
        Dict from each original name to its final canonical name.
    """
    groups: dict[str, list[str]] = defaultdict(list)
    for name in names:
        groups[normalize_name(name)].append(name)

    canonical: dict[str, str] = {}
    for group in groups.values():
        winner = max(group, key=lambda x: (len(x), x))
        for name in group:
            canonical[name] = winner

    # Longest first. Ties are broken alphabetically so that the result does not
    # depend on set iteration order, which varies between interpreter runs.
    ordered = sorted(set(canonical.values()), key=lambda x: (-len(x), x))

    # Normalize once per name and compile one whole-word pattern per candidate
    # short name (names shorter than 4 normalized chars never get absorbed).
    norms = {name: normalize_name(name) for name in ordered}
    patterns = {
        name: re.compile(r"\b" + re.escape(norm) + r"\b")
        for name, norm in norms.items()
        if len(norm) >= 4
    }

    absorbed: dict[str, str] = {}
    for long_name in ordered:
        long_norm = norms[long_name]
        for short_name, pattern in patterns.items():
            if short_name == long_name or short_name in absorbed:
                continue
            if pattern.search(long_norm):
                absorbed[short_name] = long_name

    return {orig: absorbed.get(canon, canon) for orig, canon in canonical.items()}
def filter_typed(rels: dict, name_to_type: dict, allowed: dict) -> tuple[list, list]:
    """Split relations into kept and dropped according to endpoint types.

    Args:
        rels: Relation label -> iterable of ``(head, tail)`` name pairs.
        name_to_type: Lowercased, stripped entity name -> entity type.
        allowed: Relation label -> ``(head types, tail types)``. A label that
            is missing from this table is not filtered at all.

    Returns:
        ``(keep, drop)``: two lists of dicts with the keys ``from``, ``kind``,
        ``to``, ``head_type`` and ``tail_type`` (types are ``None`` when the
        name is unknown).
    """
    kept: list = []
    dropped: list = []
    for rel_type, pairs in rels.items():
        head_types, tail_types = allowed.get(rel_type, (None, None))
        unconstrained = head_types is None
        for head, tail in pairs:
            head_type = name_to_type.get(head.lower().strip())
            tail_type = name_to_type.get(tail.lower().strip())
            record = {
                "from": head,
                "kind": rel_type,
                "to": tail,
                "head_type": head_type,
                "tail_type": tail_type,
            }
            passes = unconstrained or (head_type in head_types and tail_type in tail_types)
            target = kept if passes else dropped
            target.append(record)
    return kept, dropped
def chunk_text(text: str, max_chars: int = 1500, overlap_sentences: int = 2):
    """Split a long text into sentence-aligned chunks with a sliding overlap.

    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``.
    Each chunk is filled greedily up to *max_chars* (counting one separator
    per sentence). Every chunk after the first starts with the last
    *overlap_sentences* sentences of the previous chunk, provided they still
    leave room for the next new sentence. A chunk always takes at least one
    new sentence, so a single sentence longer than *max_chars* yields an
    oversized chunk rather than being cut.

    Returns:
        List of chunk strings (sentences joined by a single space); empty
        when *text* contains no sentences.
    """
    pieces = [part.strip() for part in re.split(r"(?<=[\.!?])\s+", text)]
    pieces = [part for part in pieces if part]
    total = len(pieces)

    grouped: list = []
    pos = 0
    while pos < total:
        window: list = []
        used = 0

        # Seed the chunk with the tail of the previous one when it fits.
        if grouped and overlap_sentences > 0:
            carry = grouped[-1][-overlap_sentences:]
            carry_len = sum(len(sentence) + 1 for sentence in carry)
            if carry_len + len(pieces[pos]) + 1 <= max_chars:
                window = list(carry)
                used = carry_len

        # Always consume one new sentence so the loop makes progress.
        window.append(pieces[pos])
        used += len(pieces[pos]) + 1
        pos += 1

        # Then keep adding sentences while they fit.
        while pos < total:
            cost = len(pieces[pos]) + 1
            if used + cost > max_chars:
                break
            window.append(pieces[pos])
            used += cost
            pos += 1

        grouped.append(window)

    return [" ".join(window) for window in grouped]
def extract_graph(text: str, threshold: float = 0.3, max_chars_per_chunk: int = 1500) -> dict:
    """Extract entities and relations from *text* and return a graph payload.

    Pipeline: split texts longer than *max_chars_per_chunk* with
    ``chunk_text``, run ``MODEL.extract`` on every chunk, drop relations whose
    endpoint types are not permitted by ``ALLOWED``, merge name aliases, and
    compute node positions with networkx so the client only has to render.

    Args:
        text: Text to analyse.
        threshold: Passed through to ``MODEL.extract``.
        max_chars_per_chunk: Texts up to this length are processed as a
            single chunk.

    Returns:
        Dict with ``elapsed_s``, ``n_chunks``, ``n_nodes``, ``n_edges``,
        ``n_dropped_typed``, ``nodes`` (``id``/``label``/``type``/``x``/``y``),
        ``edges`` (``from``/``to``/``label``, sorted) and ``dropped`` (at most
        the first 10 relations rejected by the typed filter).
    """
    schema = MODEL.create_schema().entities(ENTITY_LABELS).relations(RELATION_LABELS)

    # Automatic chunking when the text is long.
    if len(text) <= max_chars_per_chunk:
        chunks = [text]
    else:
        chunks = chunk_text(text, max_chars=max_chars_per_chunk, overlap_sentences=2)
    print(f"[extract] {len(text)}c → {len(chunks)} chunks", flush=True)
    t0 = time.time()

    # Deduplicated accumulators, keyed by the lowercased, stripped name.
    name_to_type: dict = {}    # name key -> entity type (first seen wins)
    name_canonical: dict = {}  # name key -> spelling used for the node
    raw_relations: dict = {}   # relation label -> list of (head, tail)

    for idx, chunk in enumerate(chunks):
        r = MODEL.extract(chunk, schema=schema, threshold=threshold)
        for typ, names in r["entities"].items():
            for n in names:
                key = n.lower().strip()
                if not key:
                    continue
                if key not in name_to_type:
                    name_to_type[key] = typ
                    name_canonical[key] = n.strip()
                # Same key seen again with a longer spelling: keep the longer one.
                elif len(n.strip()) > len(name_canonical[key]):
                    name_canonical[key] = n.strip()
        for rt, pairs in r["relation_extraction"].items():
            bucket = raw_relations.setdefault(rt, [])
            for h, t in pairs:
                bucket.append((h.strip(), t.strip()))
        if (idx + 1) % 10 == 0:
            print(f"[extract] chunk {idx+1}/{len(chunks)} ents acum={len(name_to_type)} rels acum={sum(len(v) for v in raw_relations.values())}", flush=True)

    # Typed post-filter.
    keep, drop = filter_typed(raw_relations, name_to_type, ALLOWED)

    # Coreference: alias map over the canonical spellings.
    alias = merge_aliases(list(name_canonical.values()))

    def resolve(name: str) -> str:
        """Return the node id for a relation endpoint.

        The endpoint is first mapped to the canonical spelling of the matching
        entity (case-insensitively, like the type lookup in filter_typed) and
        then through the alias map. Without the first step an endpoint that
        differs from its entity only by casing would create a duplicate node.
        """
        canon = name_canonical.get(name.lower().strip(), name)
        return alias.get(canon, canon)

    # Nodes, with aliases applied (first type seen for a resolved name wins).
    nodes_dict: dict = {}
    for key, typ in name_to_type.items():
        canon_orig = name_canonical[key]
        nodes_dict.setdefault(alias.get(canon_orig, canon_orig), typ)

    # Edges, deduplicated after alias resolution; self-loops are discarded.
    edges_set: set = set()
    for e in keep:
        h_canon = resolve(e["from"])
        t_canon = resolve(e["to"])
        if h_canon == t_canon:
            continue
        # Endpoints the entity pass never produced still become nodes.
        if h_canon not in nodes_dict:
            nodes_dict[h_canon] = e.get("head_type") or "?"
        if t_canon not in nodes_dict:
            nodes_dict[t_canon] = e.get("tail_type") or "?"
        edges_set.add((h_canon, e["kind"], t_canon))
    # Sorted so the response (and the graph fed to the layout) does not depend
    # on set iteration order.
    edges = sorted(edges_set)

    # Server-side layout (sigma only renders).
    import networkx as nx
    G = nx.DiGraph()
    for n in nodes_dict:
        G.add_node(n)
    for h, k, t in edges:
        G.add_edge(h, t, kind=k)
    if G.number_of_nodes() > 0:
        try:
            pos = nx.spring_layout(G, k=2.0, iterations=80, seed=42)
        except Exception as exc:
            # Best effort: the extraction result is still useful without a layout.
            print(f"[extract] layout failed ({exc!r}); using origin for all nodes", flush=True)
            pos = {n: (0.0, 0.0) for n in G.nodes}
    else:
        pos = {}

    elapsed = time.time() - t0
    print(f"[extract] done {elapsed:.2f}s nodos={len(nodes_dict)} aristas={len(edges_set)}", flush=True)
    return {
        "elapsed_s": round(elapsed, 2),
        "n_chunks": len(chunks),
        "n_nodes": len(nodes_dict),
        "n_edges": len(edges_set),
        "n_dropped_typed": len(drop),
        "nodes": [
            {"id": n, "label": n, "type": t,
             "x": float(pos.get(n, (0.0, 0.0))[0]),
             "y": float(pos.get(n, (0.0, 0.0))[1])}
            for n, t in nodes_dict.items()
        ],
        "edges": [{"from": h, "to": t, "label": k} for h, k, t in edges],
        "dropped": drop[:10],
    }
# ── API ──
app = FastAPI(title="GLiNER2 Playground")
# Serve the files in the static/ directory next to this module under /static.
app.mount("/static", StaticFiles(directory=HERE / "static"), name="static")
# Request body of POST /extract.
class ExtractReq(BaseModel):
    # Text to analyse.
    text: str
    # Threshold forwarded to extract_graph; 0.3 when the client omits it.
    threshold: float = 0.3
@app.get("/")
def index():
    """Serve the playground page (index.html next to this module)."""
    page = HERE / "index.html"
    return FileResponse(page)
@app.post("/extract")
def extract(req: ExtractReq):
    """Run the extraction pipeline on the posted text.

    Responds with HTTP 400 and ``{"error": "empty text"}`` when the text is
    empty or whitespace-only; otherwise returns the ``extract_graph`` payload.
    """
    has_content = bool(req.text.strip())
    if has_content:
        return extract_graph(req.text, threshold=req.threshold)
    return JSONResponse({"error": "empty text"}, status_code=400)
# Script entry point: serve the app with uvicorn on port 7878.
if __name__ == "__main__":
    import uvicorn
    print("\nServing at http://localhost:7878\n", flush=True)
    # NOTE(review): host="0.0.0.0" listens on every network interface, not only
    # on localhost as the banner suggests — confirm this exposure is intended.
    uvicorn.run(app, host="0.0.0.0", port=7878, log_level="warning")
File diff suppressed because one or more lines are too long
+1351
View File
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long