b8c760d004
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
308 lines
13 KiB
Python
308 lines
13 KiB
Python
"""Benchmark NuExtract 2.0-2B (MIT) sobre nuestros corpora.
|
|
|
|
Mide tiempo y calidad sobre:
|
|
T1. es_corporate_short (8 frases) con schema simple (paridad con notebook 02)
|
|
T2. es_corporate_short con schema rico anidado (lo que NuExtract hace mejor)
|
|
T3. LONG_TEXT_ES del notebook 05/06 (25 frases, sector bancario)
|
|
T4. 5 chunks del PDF de BBVA (extrapolar al PDF completo)
|
|
|
|
Vuelca a nuextract_results.json para que el notebook lo cargue sin recargar el modelo.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import warnings
|
|
from pathlib import Path
|
|
|
|
# Silence every Python warning for the duration of the benchmark run.
warnings.filterwarnings("ignore")
# Turn off Hugging Face Hub progress bars unless the caller already set the variable.
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

# Directory that contains this script; the results JSON is written next to it.
HERE = Path(__file__).resolve().parent

# Root of the function registry that provides the `core` package imported below.
_pf = "/home/lucas/fn_registry/python/functions"
# Drop any sys.path entry located *below* the registry root, then make sure
# the root itself is importable (inserted first when it is missing).
sys.path = [entry for entry in sys.path if not entry.startswith(_pf + "/")]
if _pf not in sys.path:
    sys.path.insert(0, _pf)
|
|
|
|
from core.extract_pdf_text import extract_pdf_text
|
|
|
|
|
|
# Vault directory holding the test documents used by the PDF benchmark (T4).
VAULT = Path("/home/lucas/vaults/osint_nlp_models")
# PDF whose extracted text is cleaned, chunked and benchmarked in main().
PDF_PATH = VAULT.joinpath("test_documents", "politica_proteccion_datos.pdf")
|
|
|
|
|
|
def clean_pdf_text(text: str) -> str:
    """Normalise raw PDF text into sentence-friendly lines.

    Steps, applied in order:
      1. replace ``N/M`` tokens of one or two digits each (presumably page
         counters such as ``3/12`` -- confirm against the PDF) with a space;
      2. turn tabs into spaces;
      3. glue words that were hyphenated across a line break;
      4. replace newlines that do not follow ``.``, ``!`` or ``?`` with a space;
      5. collapse runs of spaces;
      6. strip every line and drop the empty ones.

    Returns the cleaned text with no leading or trailing whitespace.
    """
    without_counters = re.sub(r"\b\d{1,2}/\d{1,2}\b", " ", text)
    untabbed = without_counters.replace("\t", " ")
    dehyphenated = re.sub(r"-\s*\n\s*", "", untabbed)
    # Only a newline right after sentence punctuation survives as a break.
    unwrapped = re.sub(r"(?<![\.!?])\n+", " ", dehyphenated)
    single_spaced = re.sub(r" {2,}", " ", unwrapped)

    kept_lines = []
    for raw_line in single_spaced.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines).strip()
|
|
|
|
|
|
def chunk_with_overlap(text: str, max_chars: int = 1500, overlap_sentences: int = 2):
    """Split *text* into sentence-aligned chunks that share trailing sentences.

    Sentences are obtained by splitting on whitespace that follows ``.``,
    ``!`` or ``?``. Each chunk is filled greedily while its budget (sentence
    length plus one separator character per sentence) stays within
    ``max_chars``. From the second chunk on, the last ``overlap_sentences``
    sentences of the previous chunk are repeated at the start, but only when
    they fit together with the next new sentence. Every chunk takes at least
    one new sentence, so a single over-long sentence still gets its own chunk.

    Returns a list of dicts with keys ``"text"`` (sentences joined by a
    space) and ``"sentences"`` (the list of sentences in the chunk).
    """
    pieces = [part.strip() for part in re.split(r"(?<=[\.!?])\s+", text)]
    pieces = [part for part in pieces if part]
    total = len(pieces)

    result = []
    pos = 0
    while pos < total:
        bucket: list[str] = []
        used = 0

        # Seed the chunk with the tail of the previous one when it fits.
        if result and overlap_sentences > 0:
            carried = result[-1]["sentences"][-overlap_sentences:]
            carried_cost = sum(len(part) + 1 for part in carried)
            if carried_cost + len(pieces[pos]) + 1 <= max_chars:
                bucket = list(carried)
                used = carried_cost

        # Always consume one new sentence so the outer loop makes progress.
        bucket.append(pieces[pos])
        used += len(pieces[pos]) + 1
        pos += 1

        # Keep adding sentences until the next one would exceed the budget.
        while pos < total:
            cost = len(pieces[pos]) + 1
            if used + cost > max_chars:
                break
            bucket.append(pieces[pos])
            used += cost
            pos += 1

        result.append({"text": " ".join(bucket), "sentences": bucket})
    return result
|
|
|
|
|
|
# ── corpora ──
|
|
# Short Spanish corporate-news sample (eight sentences) used by benchmarks T1 and T2.
# Each fragment carries its own trailing space, so the pieces are joined as-is.
ES_CORPORATE_SHORT = "".join((
    "Pablo Isla, expresidente de Inditex, ha sido nombrado consejero de Telefonica. ",
    "La operacion fue anunciada por el presidente Jose Maria Alvarez-Pallete en Madrid el pasado lunes. ",
    "Inditex factura mas de 30.000 millones anuales y tiene su sede en Arteixo, A Coruna. ",
    "En paralelo, Iberdrola y Endesa firmaron un acuerdo de colaboracion en proyectos eolicos en Galicia. ",
    "El presidente de Iberdrola, Ignacio Galan, se reunio con la CEO de Endesa, Marina Serrano, en Bilbao. ",
    "El acuerdo movilizara 2.000 millones de euros en cinco anos. ",
    "El BBVA, presidido por Carlos Torres, mostro interes en participar en la financiacion del proyecto. ",
    "Su sede central esta en Bilbao.",
))
|
|
|
|
# Longer Spanish sample (25 sentences) used by benchmark T3.
# Each fragment carries its own trailing space, so the pieces are joined as-is.
LONG_TEXT_ES = "".join((
    "BBVA, presidido por Carlos Torres, completo en 2024 la integracion operativa de Banco Sabadell tras la fusion. ",
    "Onur Genc, consejero delegado del banco desde 2018, lidero el proceso desde la sede central en Bilbao. ",
    "El banco mantiene oficinas en Plaza San Nicolas 4 y opera en mas de 25 paises. ",
    "Banco Santander, dirigido por Ana Botin, sigue siendo el primer banco espanol por capitalizacion bursatil. ",
    "Hector Grisi asumio el cargo de CEO global de Santander en enero de 2023, reemplazando a Jose Antonio Alvarez. ",
    "CaixaBank, presidida por Jose Ignacio Goirigolzarri y con sede en Valencia desde 2017, completo la fusion con Bankia. ",
    "Gonzalo Gortazar es el consejero delegado de CaixaBank y reporta al consejo formado en parte por La Caixa. ",
    "El Banco de Espana, gobernado por Pablo Hernandez de Cos hasta 2024 y por Margarita Delgado en 2025, supervisa el sector. ",
    "Luis de Guindos, vicepresidente del Banco Central Europeo, fue ministro de Economia en el gobierno de Mariano Rajoy. ",
    "La Comision Nacional del Mercado de Valores, presidida por Rodrigo Buenaventura, regula los mercados financieros. ",
    "BBVA anuncio en mayo de 2024 una OPA hostil sobre Banco Sabadell que el consejo del banco rechazo inicialmente. ",
    "Cesar Gonzalez-Bueno, CEO de Sabadell, defendio la independencia del banco junto con su presidente Josep Oliu. ",
    "Repsol, presidida por Antonio Brufau y con CEO Josu Jon Imaz, vendio su filial mexicana a Macquarie. ",
    "Iberdrola, liderada por Ignacio Galan, opera Avangrid en EEUU y firmo un acuerdo PPA con Amazon. ",
    "Andy Jassy, CEO de Amazon desde Seattle, agradecio el contrato a Iberdrola en una nota publica. ",
    "Endesa, filial de la italiana Enel, tiene como CEO a Marina Serrano y opera en Espana, Portugal y Marruecos. ",
    "Ferrovial, presidida por Rafael del Pino, traslado su sede social a Holanda en 2022 generando polemica politica. ",
    "ACS, presidida por Florentino Perez, sigue siendo lider mundial en concesiones de infraestructura. ",
    "Inditex, fundada por Amancio Ortega y presidida por Marta Ortega desde 2022, tiene su sede en Arteixo, A Coruna. ",
    "Pablo Isla, expresidente de Inditex y actual consejero de Telefonica, se incorporo al consejo en 2024. ",
    "Telefonica, presidida por Jose Maria Alvarez-Pallete, sufrio la entrada del estado en su capital con SEPI. ",
    "Saudi Telecom Company adquirio un 9.9% de Telefonica en 2023, lo que motivo la respuesta del gobierno espanol. ",
    "Cristina Aldamiz-Echevarria fue nombrada directora de Recursos Humanos del Grupo Mapfre, dirigido por Antonio Huertas. ",
    "Naturgy, presidida por Francisco Reynes, recibio una OPA parcial del fondo emirati IFM en 2021 que se cancelo. ",
    "Indra, con Marc Murtra como presidente, se ha posicionado como contratista clave de Defensa para el ministerio de Margarita Robles.",
))
|
|
|
|
# ── schemas ──
|
|
SCHEMA_FLAT = """{
|
|
"people": ["string"],
|
|
"organizations": ["string"],
|
|
"locations": ["string"]
|
|
}"""
|
|
|
|
SCHEMA_RICH_CORPORATE = """{
|
|
"organizations": [
|
|
{
|
|
"name": "string",
|
|
"ceo": "string",
|
|
"chairman_president": "string",
|
|
"headquartered_in": "string",
|
|
"subsidiaries": ["string"],
|
|
"parent_company": "string"
|
|
}
|
|
],
|
|
"people": [
|
|
{
|
|
"name": "string",
|
|
"role": "string",
|
|
"organization": "string"
|
|
}
|
|
],
|
|
"agreements": [
|
|
{
|
|
"between": ["string"],
|
|
"topic": "string",
|
|
"amount": "string"
|
|
}
|
|
]
|
|
}"""
|
|
|
|
SCHEMA_RICH_GDPR = """{
|
|
"data_controller": {
|
|
"name": "string",
|
|
"address": "string",
|
|
"registration": "string"
|
|
},
|
|
"dpo_contact": {
|
|
"email": "string",
|
|
"address": "string"
|
|
},
|
|
"data_categories": ["string"],
|
|
"rights_listed": ["string"],
|
|
"authorities_mentioned": [
|
|
{
|
|
"name": "string",
|
|
"url_or_contact": "string"
|
|
}
|
|
],
|
|
"laws_mentioned": ["string"]
|
|
}"""
|
|
|
|
|
|
def build_messages(tokenizer, document: str, template: str) -> str:
    """Render the chat prompt for one extraction request.

    Wraps *document* in a single user turn and asks the tokenizer's chat
    template for the untokenized prompt text, forwarding *template* (the
    extraction schema) and requesting the generation prompt suffix.

    Returns whatever ``tokenizer.apply_chat_template`` returns; with
    ``tokenize=False`` that is expected to be the prompt string.
    """
    user_turn = {"role": "user", "content": document}
    prompt = tokenizer.apply_chat_template(
        [user_turn],
        template=template,
        tokenize=False,
        add_generation_prompt=True,
    )
    return prompt
|
|
|
|
|
|
def run_extract(model, tokenizer, device: str, document: str, template: str, max_new_tokens: int = 2048):
    """Run one greedy extraction and report timing plus token counts.

    Args:
        model: object exposing ``generate(**inputs, ...)``.
        tokenizer: object used to build the prompt, tokenize it and decode
            the generated ids; its ``eos_token_id`` is used as pad token.
        device: device string the tokenized inputs are moved to.
        document: text to extract from.
        template: extraction schema forwarded to the chat template.
        max_new_tokens: upper bound on generated tokens.

    Returns:
        dict with ``elapsed_s`` (seconds spent inside ``model.generate``,
        rounded to 2 decimals), ``n_input_tokens``, ``n_output_tokens`` and
        ``raw_text`` (the decoded generated portion only, special tokens
        skipped).
    """
    text = build_messages(tokenizer, document, template)
    inputs = tokenizer([text], padding=True, return_tensors="pt").to(device)

    # perf_counter is monotonic and high resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    generated = model.generate(
        **inputs, do_sample=False, num_beams=1, max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.perf_counter() - t0

    n_input_tokens = inputs["input_ids"].shape[1]
    # Everything past the prompt length counts as generated output.
    n_output_tokens = generated.shape[1] - n_input_tokens
    # Decode only the generated portion, not the echoed prompt.
    out_text = tokenizer.decode(generated[0][n_input_tokens:], skip_special_tokens=True)
    return {
        "elapsed_s": round(elapsed, 2),
        "n_input_tokens": int(n_input_tokens),
        "n_output_tokens": int(n_output_tokens),
        "raw_text": out_text,
    }
|
|
|
|
|
|
def parse_json_safe(text: str):
    """Extract the last top-level JSON object embedded in *text*.

    The text is scanned left to right; at every ``{`` that is not inside an
    already decoded object a decode is attempted. Trailing text after an
    object (for example a closing markdown fence) is ignored. When several
    top-level objects are present the last one wins.

    Starting from the *first* brace matters: starting from the last ``{``
    would land on the innermost nested object and silently return only that
    fragment for nested output.

    Returns the decoded dict, or ``None`` when no ``{`` starts a valid JSON
    object. If the outer object is truncated, a complete nested object may
    still be returned.
    """
    decoder = json.JSONDecoder()
    found = None
    pos = text.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except (ValueError, RecursionError):
            # Not a valid object start; try the next brace.
            pos = text.find("{", pos + 1)
        else:
            found = obj
            # Skip past the decoded span so nested braces are not revisited.
            pos = text.find("{", end)
    return found
|
|
|
|
|
|
def _run_benchmark_case(model, tokenizer, device: str, document: str, template: str) -> dict:
    """Run one extraction, print its timing line and return the result record."""
    r = run_extract(model, tokenizer, device, document, template)
    parsed = parse_json_safe(r["raw_text"])
    print(f" {r['elapsed_s']}s in_tok={r['n_input_tokens']} out_tok={r['n_output_tokens']}", flush=True)
    return {**r, "parsed": parsed, "input_chars": len(document)}


def main():
    """Load NuExtract, run benchmarks T1-T4 and dump the results as JSON.

    T1-T3 run the in-file corpora against the flat and rich corporate
    schemas. T4 extracts, cleans and chunks the PDF at ``PDF_PATH``, runs a
    fixed selection of chunks against the GDPR schema and extrapolates the
    time for the whole PDF. Everything is written to
    ``nuextract_results.json`` next to this script.
    """
    print("[load] loading model + tokenizer...", flush=True)
    t0 = time.perf_counter()
    # Heavy imports are deferred so importing this module stays cheap.
    import torch
    from transformers import AutoTokenizer, AutoModelForImageTextToText

    model_id = "numind/NuExtract-2.0-2B"
    use_gpu = torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    dtype = torch.bfloat16 if use_gpu else torch.float32
    print(f"[device] {device} dtype={dtype}", flush=True)
    if use_gpu:
        print(f"[gpu] {torch.cuda.get_device_name(0)} {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB", flush=True)

    tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=True, padding_side="left",
    )
    # SDPA attention on GPU, eager attention on CPU; flash_attn would need an extra install.
    attn_impl = "sdpa" if use_gpu else "eager"
    model = AutoModelForImageTextToText.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=dtype,
        attn_implementation=attn_impl,
    )
    if use_gpu:
        model = model.to(device)
    model.eval()
    print(f"[load] done in {time.perf_counter()-t0:.1f}s", flush=True)

    out: dict = {
        "meta": {"device": device, "dtype": str(dtype), "model": model_id},
        "cpu_baseline": {  # captured in a previous run, before switching to GPU
            "T1_flat": {"elapsed_s": 24.98, "in_tok": 245, "out_tok": 79},
            "T2_rich": {"elapsed_s": 117.51, "in_tok": 351, "out_tok": 370},
        },
    }

    # T1-T3: in-file corpora. Each entry is (result key, banner, document, schema).
    text_cases = [
        ("T1_corp_short_flat", "\n[T1] es_corporate_short + SCHEMA_FLAT...",
         ES_CORPORATE_SHORT, SCHEMA_FLAT),
        ("T2_corp_short_rich", "\n[T2] es_corporate_short + SCHEMA_RICH_CORPORATE...",
         ES_CORPORATE_SHORT, SCHEMA_RICH_CORPORATE),
        ("T3_long_text_rich", "\n[T3] LONG_TEXT_ES (25 frases, 400 words) + SCHEMA_RICH_CORPORATE...",
         LONG_TEXT_ES, SCHEMA_RICH_CORPORATE),
    ]
    for key, banner, document, schema in text_cases:
        print(banner, flush=True)
        out[key] = _run_benchmark_case(model, tokenizer, device, document, schema)

    # T4: a handful of chunks from the PDF.
    print("\n[T4] preparing PDF...", flush=True)
    raw = extract_pdf_text(str(PDF_PATH))
    clean = clean_pdf_text(raw)
    chunks = chunk_with_overlap(clean, max_chars=1500, overlap_sentences=2)
    out["pdf_meta"] = {"n_chunks": len(chunks), "clean_chars": len(clean)}
    print(f" PDF: {len(clean):,} chars / {len(chunks)} chunks total — corremos solo 5 representativos", flush=True)

    chunk_indices = [0, 5, 15, 30, 60]  # representative: beginning / middle / end
    chunk_results = []
    for idx in chunk_indices:
        # Shorter PDFs simply yield fewer sampled chunks.
        if idx >= len(chunks):
            continue
        c = chunks[idx]
        print(f" [chunk {idx}] {len(c['text'])}c — running...", flush=True)
        record = _run_benchmark_case(model, tokenizer, device, c["text"], SCHEMA_RICH_GDPR)
        chunk_results.append({"chunk_idx": idx, **record})

    out["T4_pdf_chunks"] = chunk_results

    # Extrapolate the time for the full PDF from the sampled chunks.
    if chunk_results:
        avg_per_chunk = sum(cr["elapsed_s"] for cr in chunk_results) / len(chunk_results)
        full_pdf_estimate = avg_per_chunk * len(chunks)
        out["full_pdf_extrapolation"] = {
            "avg_per_chunk_s": round(avg_per_chunk, 2),
            "n_chunks": len(chunks),
            "estimated_total_s": round(full_pdf_estimate, 1),
            "estimated_total_min": round(full_pdf_estimate / 60, 1),
        }
        print(f"\n[extrapolation] PDF entero estimado: {full_pdf_estimate:.0f}s = {full_pdf_estimate/60:.1f} min", flush=True)

    out_path = HERE / "nuextract_results.json"
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # instead of relying on the platform's locale default.
    out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\n[saved] {out_path}", flush=True)
|
|
|
|
|
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
|
main()
|