chore: initial sync — gliner+glirel benchmark notebooks
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,535 @@
|
||||
"""NuExtract 2.0-2B GPU — version 'production' con todas las mejoras:
|
||||
- repetition_penalty=1.15 (evita bucles)
|
||||
- chunking forzado (max 800 chars / ~250 tokens) en TODO texto
|
||||
- 97 chunks completos del PDF (no muestra)
|
||||
- 25 frases ES troceadas adecuadamente
|
||||
- agregacion deduplicada con conteo
|
||||
- coreferencia simple (normalize + substring)
|
||||
|
||||
Vuelca a nuextract_full.json — listo para notebook 08.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Suppress every Python warning for the rest of the process.
warnings.filterwarnings("ignore")
# Default HF_HUB_DISABLE_PROGRESS_BARS to "1" without overriding a value
# that is already present in the environment.
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

# Directory containing this script; main() writes nuextract_full.json here.
HERE = Path(__file__).resolve().parent
# Root of a local function registry (presumably where the `core` package
# imported right after this block lives — confirm on the target machine).
_pf = "/home/lucas/fn_registry/python/functions"
# Remove any sys.path entry that points *inside* the registry root ...
sys.path = [p for p in sys.path if not p.startswith(_pf + "/")]
# ... and put the registry root itself at the front if it is not listed yet.
if _pf not in sys.path:
    sys.path.insert(0, _pf)
|
||||
|
||||
from core.extract_pdf_text import extract_pdf_text
|
||||
|
||||
|
||||
# Root folder of the local vault; the test document below is resolved against it.
VAULT = Path("/home/lucas/vaults/osint_nlp_models")
# PDF that section B of main() extracts, cleans, chunks and runs through the model.
PDF_PATH = VAULT / "test_documents" / "politica_proteccion_datos.pdf"
|
||||
|
||||
|
||||
def clean_pdf_text(text: str) -> str:
    """Normalise raw PDF text so it can be split into sentences.

    The rewrite steps run in this fixed order:

    1. One- or two-digit ``n/m`` tokens (presumably page counters such as
       ``3/12``) are replaced by a space.
    2. Tabs are replaced by spaces.
    3. A hyphen followed by a line break (plus surrounding whitespace) is
       removed, which re-joins words hyphenated across lines.
    4. Line breaks that do not directly follow ``.``, ``!`` or ``?`` are
       replaced by a space.
    5. Runs of two or more spaces collapse into one.

    Finally each remaining line is stripped, empty lines are dropped and the
    joined result is stripped once more.
    """
    text = re.sub(r"\b\d{1,2}/\d{1,2}\b", " ", text).replace("\t", " ")
    for pattern, replacement in (
        (r"-\s*\n\s*", ""),
        (r"(?<![\.!?])\n+", " "),
        (r" {2,}", " "),
    ):
        text = re.sub(pattern, replacement, text)

    kept_lines = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines).strip()
|
||||
|
||||
|
||||
def chunk_with_overlap(text: str, max_chars: int = 800, overlap_sentences: int = 1):
    """Split *text* into sentence-aligned chunks that share trailing sentences.

    Sentences are obtained by splitting on whitespace that follows ``.``,
    ``!`` or ``?``. Each chunk is filled greedily while the running cost
    (sentence length + 1 per sentence) stays within *max_chars*.

    Every chunk after the first starts with the last *overlap_sentences*
    sentences of the previous chunk, but only when those sentences plus the
    next new sentence still fit in *max_chars*. A chunk always receives at
    least one new sentence, so a single sentence longer than *max_chars*
    produces an oversized chunk rather than being cut.

    Returns a list of ``{"text": str, "sentences": list[str]}`` dicts, where
    ``text`` is the sentences joined by a single space.
    """
    pieces = [piece.strip() for piece in re.split(r"(?<=[\.!?])\s+", text)]
    pieces = [piece for piece in pieces if piece]
    total = len(pieces)

    result = []
    pos = 0
    while pos < total:
        bucket: list[str] = []
        used = 0

        # Seed the chunk with the tail of the previous one when it fits.
        if result and overlap_sentences > 0:
            carried = result[-1]["sentences"][-overlap_sentences:]
            carried_cost = sum(len(s) + 1 for s in carried)
            if carried_cost + len(pieces[pos]) + 1 <= max_chars:
                bucket = list(carried)
                used = carried_cost

        # Always consume one new sentence so the loop makes progress.
        bucket.append(pieces[pos])
        used += len(pieces[pos]) + 1
        pos += 1

        # Keep adding sentences while the budget allows it.
        while pos < total:
            cost = len(pieces[pos]) + 1
            if used + cost > max_chars:
                break
            bucket.append(pieces[pos])
            used += cost
            pos += 1

        result.append({"text": " ".join(bucket), "sentences": bucket})
    return result
|
||||
|
||||
|
||||
LONG_TEXT_ES = (
|
||||
"BBVA, presidido por Carlos Torres, completo en 2024 la integracion operativa de Banco Sabadell tras la fusion. "
|
||||
"Onur Genc, consejero delegado del banco desde 2018, lidero el proceso desde la sede central en Bilbao. "
|
||||
"El banco mantiene oficinas en Plaza San Nicolas 4 y opera en mas de 25 paises. "
|
||||
"Banco Santander, dirigido por Ana Botin, sigue siendo el primer banco espanol por capitalizacion bursatil. "
|
||||
"Hector Grisi asumio el cargo de CEO global de Santander en enero de 2023, reemplazando a Jose Antonio Alvarez. "
|
||||
"CaixaBank, presidida por Jose Ignacio Goirigolzarri y con sede en Valencia desde 2017, completo la fusion con Bankia. "
|
||||
"Gonzalo Gortazar es el consejero delegado de CaixaBank y reporta al consejo formado en parte por La Caixa. "
|
||||
"El Banco de Espana, gobernado por Pablo Hernandez de Cos hasta 2024 y por Margarita Delgado en 2025, supervisa el sector. "
|
||||
"Luis de Guindos, vicepresidente del Banco Central Europeo, fue ministro de Economia en el gobierno de Mariano Rajoy. "
|
||||
"La Comision Nacional del Mercado de Valores, presidida por Rodrigo Buenaventura, regula los mercados financieros. "
|
||||
"BBVA anuncio en mayo de 2024 una OPA hostil sobre Banco Sabadell que el consejo del banco rechazo inicialmente. "
|
||||
"Cesar Gonzalez-Bueno, CEO de Sabadell, defendio la independencia del banco junto con su presidente Josep Oliu. "
|
||||
"Repsol, presidida por Antonio Brufau y con CEO Josu Jon Imaz, vendio su filial mexicana a Macquarie. "
|
||||
"Iberdrola, liderada por Ignacio Galan, opera Avangrid en EEUU y firmo un acuerdo PPA con Amazon. "
|
||||
"Andy Jassy, CEO de Amazon desde Seattle, agradecio el contrato a Iberdrola en una nota publica. "
|
||||
"Endesa, filial de la italiana Enel, tiene como CEO a Marina Serrano y opera en Espana, Portugal y Marruecos. "
|
||||
"Ferrovial, presidida por Rafael del Pino, traslado su sede social a Holanda en 2022 generando polemica politica. "
|
||||
"ACS, presidida por Florentino Perez, sigue siendo lider mundial en concesiones de infraestructura. "
|
||||
"Inditex, fundada por Amancio Ortega y presidida por Marta Ortega desde 2022, tiene su sede en Arteixo, A Coruna. "
|
||||
"Pablo Isla, expresidente de Inditex y actual consejero de Telefonica, se incorporo al consejo en 2024. "
|
||||
"Telefonica, presidida por Jose Maria Alvarez-Pallete, sufrio la entrada del estado en su capital con SEPI. "
|
||||
"Saudi Telecom Company adquirio un 9.9% de Telefonica en 2023, lo que motivo la respuesta del gobierno espanol. "
|
||||
"Cristina Aldamiz-Echevarria fue nombrada directora de Recursos Humanos del Grupo Mapfre, dirigido por Antonio Huertas. "
|
||||
"Naturgy, presidida por Francisco Reynes, recibio una OPA parcial del fondo emirati IFM en 2021 que se cancelo. "
|
||||
"Indra, con Marc Murtra como presidente, se ha posicionado como contratista clave de Defensa para el ministerio de Margarita Robles."
|
||||
)
|
||||
|
||||
|
||||
SCHEMA_RICH_CORPORATE = """{
|
||||
"organizations": [
|
||||
{
|
||||
"name": "string",
|
||||
"ceo": "string",
|
||||
"chairman_president": "string",
|
||||
"headquartered_in": "string",
|
||||
"subsidiaries": ["string"],
|
||||
"parent_company": "string"
|
||||
}
|
||||
],
|
||||
"people": [
|
||||
{
|
||||
"name": "string",
|
||||
"role": "string",
|
||||
"organization": "string"
|
||||
}
|
||||
],
|
||||
"agreements": [
|
||||
{
|
||||
"between": ["string"],
|
||||
"topic": "string",
|
||||
"amount": "string"
|
||||
}
|
||||
]
|
||||
}"""
|
||||
|
||||
SCHEMA_RICH_GDPR = """{
|
||||
"data_controller": {
|
||||
"name": "string",
|
||||
"address": "string",
|
||||
"registration": "string"
|
||||
},
|
||||
"dpo_contact": {
|
||||
"email": "string",
|
||||
"address": "string"
|
||||
},
|
||||
"data_categories": ["string"],
|
||||
"rights_listed": ["string"],
|
||||
"authorities_mentioned": [
|
||||
{
|
||||
"name": "string",
|
||||
"url_or_contact": "string"
|
||||
}
|
||||
],
|
||||
"laws_mentioned": ["string"]
|
||||
}"""
|
||||
|
||||
|
||||
def parse_json_safe(text: str):
    """Return the JSON object that starts at the first ``{`` in *text*.

    Anything before the first ``{`` and anything after the end of the object
    is ignored, so model output with a preamble or trailing chatter still
    parses.

    Returns ``None`` when *text* is empty, contains no ``{``, or the text
    starting at the first ``{`` is not a complete, valid JSON object. Later
    ``{`` characters are not tried.
    """
    if not text:
        return None
    start = text.find("{")
    if start < 0:
        return None
    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates trailing data, so a single pass replaces retrying
        # json.loads on every shorter slice.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError covers
        # pathologically nested input.
        return None
    return parsed
|
||||
|
||||
|
||||
def run_extract(model, tokenizer, device, document, template, max_new_tokens=1024,
                repetition_penalty=1.15):
    """Run one greedy extraction pass over *document* and parse the JSON reply.

    Args:
        model: object exposing ``generate(**inputs, ...)`` (presumably the
            Hugging Face model loaded in ``main`` — confirm at the call site).
        tokenizer: object exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        device: target passed to ``.to()`` on the tokenised inputs.
        document: text sent as the single user message.
        template: value forwarded to the chat template as ``template``
            (presumably the JSON extraction schema).
        max_new_tokens: generation budget.
        repetition_penalty: penalty forwarded to ``generate``; the default
            keeps the value that used to be hard-coded here.

    Returns:
        dict with ``elapsed_s`` (generation wall time rounded to 2 decimals),
        ``n_input_tokens``, ``n_output_tokens``, ``raw_text`` (decoded
        continuation) and ``parsed`` (result of ``parse_json_safe``).
    """
    messages = [{"role": "user", "content": document}]
    prompt = tokenizer.apply_chat_template(
        messages, template=template, tokenize=False, add_generation_prompt=True,
    )
    inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(device)

    t0 = time.time()
    generated = model.generate(
        **inputs,
        do_sample=False,
        num_beams=1,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,  # discourages repetition loops
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.time() - t0

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the continuation is counted and decoded.
    n_input = inputs["input_ids"].shape[1]
    n_output = generated.shape[1] - n_input
    out_text = tokenizer.decode(generated[0][n_input:], skip_special_tokens=True)

    return {
        "elapsed_s": round(elapsed, 2),
        "n_input_tokens": int(n_input),
        "n_output_tokens": int(n_output),
        "raw_text": out_text,
        "parsed": parse_json_safe(out_text),
    }
|
||||
|
||||
|
||||
# ── agregadores y coreferencia ──
|
||||
|
||||
def aggregate_corporate(results: list[dict]) -> dict:
    """Merge the organizations / people / agreements extracted from N chunks.

    Each element of *results* is expected to carry a ``"parsed"`` entry (a
    dict following the corporate schema, or ``None``). Entries whose
    ``"parsed"`` is not a dict are ignored.

    Organizations and people are de-duplicated by lower-cased name. The first
    spelling seen is kept as ``name``, ``count`` tracks the number of
    mentions, and scalar attributes are accumulated as lists (duplicates
    kept). Subsidiaries are de-duplicated and returned sorted. Agreements are
    kept only when they name at least two non-empty parties.

    Because the input is model-generated JSON, malformed shapes are tolerated:
    a string where a list is expected is treated as a one-item list instead
    of being iterated character by character, and non-string names or values
    are skipped instead of raising.

    Returns:
        ``{"organizations": [...], "people": [...], "agreements": [...]}``.
    """

    def text_of(value) -> str:
        # Stripped text for strings, "" for anything else (None, numbers, ...).
        return value.strip() if isinstance(value, str) else ""

    def items_of(value) -> list:
        # Coerce a model-produced field into a list without splitting strings.
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, (str, dict)):
            return [value]
        return []

    orgs: dict[str, dict] = {}    # lower-cased name -> aggregated record
    people: dict[str, dict] = {}  # lower-cased name -> aggregated record
    agreements: list[dict] = []

    for result in results:
        parsed = result.get("parsed")
        if not isinstance(parsed, dict):
            continue

        for org in items_of(parsed.get("organizations")):
            if not isinstance(org, dict):
                continue
            name = text_of(org.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in orgs:
                orgs[key] = {"name": name, "count": 0, "ceo": [], "chairman_president": [],
                             "headquartered_in": [], "subsidiaries": set(), "parent_company": []}
            record = orgs[key]
            record["count"] += 1
            for field in ("ceo", "chairman_president", "headquartered_in", "parent_company"):
                value = text_of(org.get(field))
                if value:
                    record[field].append(value)
            for sub in items_of(org.get("subsidiaries")):
                sub_name = text_of(sub)
                if sub_name:
                    record["subsidiaries"].add(sub_name)

        for person in items_of(parsed.get("people")):
            if not isinstance(person, dict):
                continue
            name = text_of(person.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in people:
                people[key] = {"name": name, "count": 0, "roles": [], "organizations": []}
            record = people[key]
            record["count"] += 1
            role = text_of(person.get("role"))
            if role:
                record["roles"].append(role)
            organization = text_of(person.get("organization"))
            if organization:
                record["organizations"].append(organization)

        for agreement in items_of(parsed.get("agreements")):
            if not isinstance(agreement, dict):
                continue
            parties = [text_of(p) for p in items_of(agreement.get("between"))]
            parties = [p for p in parties if p]
            if len(parties) >= 2:
                agreements.append({"between": parties,
                                   "topic": agreement.get("topic"),
                                   "amount": agreement.get("amount")})

    # Sets are not JSON-serialisable; expose subsidiaries as sorted lists.
    for record in orgs.values():
        record["subsidiaries"] = sorted(record["subsidiaries"])
    return {"organizations": list(orgs.values()), "people": list(people.values()), "agreements": agreements}
|
||||
|
||||
|
||||
def aggregate_gdpr(results: list[dict]) -> dict:
    """Merge the GDPR-schema extractions of N chunks into one summary.

    Each element of *results* is expected to carry a ``"parsed"`` entry (a
    dict following the GDPR schema, or ``None``). Entries whose ``"parsed"``
    is not a dict are ignored.

    Returns a dict with:
        ``data_controllers``: controller dicts, one per chunk that named one
            (kept as extracted, in chunk order).
        ``dpo_contacts``: DPO dicts that carry an email or an address.
        ``data_categories`` / ``rights_listed`` / ``laws``: mention counts
            keyed by the stripped label.
        ``authorities``: one record per lower-cased authority name with
            ``name`` (first spelling seen), ``contact_options`` and ``count``.

    Because the input is model-generated JSON, malformed shapes are tolerated:
    a string where a list is expected counts as a single item instead of
    being iterated character by character, and non-string names or labels
    are skipped instead of raising.
    """

    def text_of(value) -> str:
        # Stripped text for strings, "" for anything else (None, numbers, ...).
        return value.strip() if isinstance(value, str) else ""

    def items_of(value) -> list:
        # Coerce a model-produced field into a list without splitting strings.
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, (str, dict)):
            return [value]
        return []

    data_controllers: list[dict] = []
    dpo_contacts: list[dict] = []
    categories: Counter = Counter()
    rights: Counter = Counter()
    laws: Counter = Counter()
    authorities: dict[str, dict] = {}  # lower-cased name -> aggregated record

    counted_fields = (
        ("data_categories", categories),
        ("rights_listed", rights),
        ("laws_mentioned", laws),
    )

    for result in results:
        parsed = result.get("parsed")
        if not isinstance(parsed, dict):
            continue

        controller = parsed.get("data_controller")
        if isinstance(controller, dict) and text_of(controller.get("name")):
            data_controllers.append(controller)

        dpo = parsed.get("dpo_contact")
        if isinstance(dpo, dict) and (text_of(dpo.get("email")) or text_of(dpo.get("address"))):
            dpo_contacts.append(dpo)

        for field, counter in counted_fields:
            for item in items_of(parsed.get(field)):
                label = text_of(item)
                if label:
                    counter[label] += 1

        for authority in items_of(parsed.get("authorities_mentioned")):
            if not isinstance(authority, dict):
                continue
            name = text_of(authority.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in authorities:
                authorities[key] = {"name": name, "contact_options": [], "count": 0}
            authorities[key]["count"] += 1
            contact = text_of(authority.get("url_or_contact"))
            if contact:
                authorities[key]["contact_options"].append(contact)

    # Plain dicts / lists so the result can be dumped to JSON directly.
    return {
        "data_controllers": data_controllers,
        "dpo_contacts": dpo_contacts,
        "data_categories": dict(categories),
        "rights_listed": dict(rights),
        "authorities": list(authorities.values()),
        "laws": dict(laws),
    }
|
||||
|
||||
|
||||
def normalize_name(s: str) -> str:
    """Return a comparison key for an entity name.

    Surrounding whitespace is trimmed, the punctuation characters
    ``. , ; : " ' ` ( ) [ ]`` are deleted, internal whitespace runs collapse
    to a single space, and the result is lower-cased.
    """
    without_punct = re.sub(r"[\.,;:\"'`()\[\]]", "", s.strip())
    single_spaced = re.sub(r"\s+", " ", without_punct)
    return single_spaced.strip().lower()
|
||||
|
||||
|
||||
def merge_aliases(entity_names: list[str]) -> dict:
    """Return a dict mapping every original name to its canonical name.

    Two passes are applied:

    1. Names with the same ``normalize_name`` key form a group; the group's
       canonical spelling is its longest member (ties broken by string order).
    2. A canonical name whose normalised form has at least 4 characters and
       occurs as a whole word inside another canonical name's normalised form
       is absorbed by that longer name. Candidates are visited from longest to
       shortest raw spelling and the first absorber wins.

    Absorption chains are followed to the end, so a name never maps to a
    canonical name that was itself absorbed.
    """
    groups = defaultdict(list)
    for name in entity_names:
        groups[normalize_name(name)].append(name)

    canonical: dict = {}  # original spelling -> winner of its group
    norm_of: dict = {}    # winner -> normalised form (computed once per group)
    for norm, members in groups.items():
        winner = max(members, key=lambda m: (len(m), m))
        norm_of[winner] = norm
        for member in members:
            canonical[member] = winner

    winners = sorted(norm_of, key=len, reverse=True)
    # Whole-word matchers, only for winners long enough to be absorbed.
    matcher_of = {
        w: re.compile(r"\b" + re.escape(norm_of[w]) + r"\b")
        for w in winners
        if len(norm_of[w]) >= 4
    }

    absorbed: dict = {}  # shorter winner -> longer winner that contains it
    for long_name in winners:
        long_norm = norm_of[long_name]
        for short_name in winners:
            if short_name == long_name or short_name in absorbed:
                continue
            matcher = matcher_of.get(short_name)
            if matcher is not None and matcher.search(long_norm):
                absorbed[short_name] = long_name

    def root(name):
        # Follow the chain; it terminates because every absorber has a
        # strictly longer normalised form than the name it absorbs.
        while name in absorbed:
            name = absorbed[name]
        return name

    return {original: root(winner) for original, winner in canonical.items()}
|
||||
|
||||
|
||||
def build_corporate_graph(agg: dict, alias_map: dict | None = None) -> dict:
    """Build the nodes and edges of the corporate graph.

    Args:
        agg: output of ``aggregate_corporate`` (keys ``organizations``,
            ``people`` and ``agreements``).
        alias_map: optional ``{name: canonical_name}`` mapping; names missing
            from it are used unchanged.

    Returns:
        ``{"nodes": {name: type}, "edges": [(head, relation, tail), ...]}``.
        Names listed under ``organizations`` are always typed
        ``"organization"``; any other node keeps the first type assigned to
        it. Edges are unique and returned sorted, so the output is
        reproducible between runs. ``agreement_with`` is treated as
        undirected: each pair is stored once with its endpoints in sorted
        order, and pairs that collapse to the same name after alias
        resolution are dropped.
    """
    aliases = {} if alias_map is None else alias_map

    def resolve(name):
        return aliases.get(name, name)

    nodes: dict = {}    # name -> node type
    edges: set = set()  # (head, relation, tail)

    # (field on the organization, node type of the value, relation,
    #  True when the value is the head of the edge)
    org_relations = (
        ("ceo", "person", "ceo_of", True),
        ("chairman_president", "person", "president_of", True),
        ("headquartered_in", "location", "headquartered_in", False),
        ("parent_company", "organization", "subsidiary_of", False),
        ("subsidiaries", "organization", "subsidiary_of", True),
    )

    for org in agg["organizations"]:
        org_name = resolve(org["name"])
        nodes[org_name] = "organization"
        for field, node_type, relation, value_is_head in org_relations:
            for raw in org[field]:
                other = resolve(raw)
                nodes.setdefault(other, node_type)
                if value_is_head:
                    edges.add((other, relation, org_name))
                else:
                    edges.add((org_name, relation, other))

    for person in agg["people"]:
        person_name = resolve(person["name"])
        nodes.setdefault(person_name, "person")
        for raw_org in person["organizations"]:
            employer = resolve(raw_org)
            nodes.setdefault(employer, "organization")
            edges.add((person_name, "works_at", employer))

    for agreement in agg["agreements"]:
        parties = [resolve(p) for p in agreement["between"]]
        for party in parties:
            nodes.setdefault(party, "organization")
        for idx, left in enumerate(parties):
            for right in parties[idx + 1:]:
                if left == right:
                    # Two spellings of the same entity: no self-loop.
                    continue
                head, tail = sorted((left, right))
                edges.add((head, "agreement_with", tail))

    return {"nodes": nodes, "edges": sorted(edges)}
|
||||
|
||||
|
||||
def build_gdpr_graph(agg: dict, alias_map: dict | None = None) -> dict:
    """Build the nodes and edges of the GDPR graph.

    Args:
        agg: output of ``aggregate_gdpr``.
        alias_map: optional ``{name: canonical_name}`` mapping; names missing
            from it are used unchanged.

    Returns:
        ``{"nodes": {name: type}, "edges": [(head, relation, tail), ...]}``.
        Only the first data controller with a usable name is added (plus a
        ``located_in`` edge when it has an address). DPO emails and addresses,
        data categories, rights and laws become nodes without edges. Each
        authority gets a ``contact`` edge to its first contact option only.
        Edges are returned sorted so the output is reproducible between runs.

    Controller and DPO fields come straight from model-generated JSON, so
    values that are not non-blank strings are ignored instead of raising.
    """
    aliases = {} if alias_map is None else alias_map

    def resolve(name):
        return aliases.get(name, name)

    def text_of(value) -> str:
        # Stripped text for strings, "" for anything else (None, numbers, ...).
        return value.strip() if isinstance(value, str) else ""

    nodes: dict = {}
    edges: set = set()

    # Data controller: take the first one with a non-empty name, then stop.
    for controller in agg["data_controllers"]:
        raw_name = text_of(controller.get("name"))
        if not raw_name:
            continue
        name = resolve(raw_name)
        nodes[name] = "data_controller"
        raw_address = text_of(controller.get("address"))
        if raw_address:
            address = resolve(raw_address)
            nodes.setdefault(address, "location")
            edges.add((name, "located_in", address))
        break

    for dpo in agg["dpo_contacts"]:
        email = text_of(dpo.get("email"))
        if email:
            nodes.setdefault(email, "email")
        address = text_of(dpo.get("address"))
        if address:
            nodes.setdefault(address, "location")

    for category in agg["data_categories"]:
        nodes.setdefault(category, "data_category")
    for right in agg["rights_listed"]:
        nodes.setdefault(right, "right")
    for authority in agg["authorities"]:
        name = resolve(authority["name"].strip())
        nodes.setdefault(name, "authority")
        for contact in authority["contact_options"][:1]:  # one contact per authority
            nodes.setdefault(contact, "url")
            edges.add((name, "contact", contact))
    for law in agg["laws"]:
        nodes.setdefault(law, "law")

    return {"nodes": nodes, "edges": sorted(edges)}
|
||||
|
||||
|
||||
# ── main ──
|
||||
|
||||
def main():
    """Run the full NuExtract benchmark and dump the results to JSON.

    Steps:
        1. Load the tokenizer and model on CUDA in bfloat16; print a message
           and return early when CUDA is unavailable.
        2. Section A: chunk ``LONG_TEXT_ES``, extract with the corporate
           schema, aggregate, merge aliases and build the corporate graph.
        3. Section B: extract text from ``PDF_PATH``, clean and chunk it,
           extract with the GDPR schema, aggregate, merge aliases and build
           the GDPR graph.
        4. Write everything to ``nuextract_full.json`` next to this script.
    """
    print("[load] loading model + tokenizer...", flush=True)
    t0 = time.time()
    # Heavy third-party imports stay inside main() so importing this module
    # does not require them.
    import torch
    from transformers import AutoTokenizer, AutoModelForImageTextToText

    if not torch.cuda.is_available():
        print("CUDA not available — exiting", flush=True)
        return
    device = "cuda"
    dtype = torch.bfloat16
    print(f"[device] {device} dtype={dtype}", flush=True)

    tokenizer = AutoTokenizer.from_pretrained(
        "numind/NuExtract-2.0-2B", trust_remote_code=True, padding_side="left",
    )
    model = AutoModelForImageTextToText.from_pretrained(
        "numind/NuExtract-2.0-2B",
        trust_remote_code=True,
        torch_dtype=dtype,
        attn_implementation="sdpa",
    ).to(device)
    model.eval()
    print(f"[load] done in {time.time()-t0:.1f}s", flush=True)

    out: dict = {"meta": {"device": device, "dtype": str(dtype),
                          "model": "numind/NuExtract-2.0-2B",
                          "repetition_penalty": 1.15, "max_chars_chunk": 800}}

    # ── A. LONG_TEXT_ES with chunking
    print("\n[A] LONG_TEXT_ES — chunking + run...", flush=True)
    long_chunks = chunk_with_overlap(LONG_TEXT_ES, max_chars=800, overlap_sentences=1)
    n_long = len(long_chunks)
    print(f" {len(LONG_TEXT_ES)} chars → {n_long} chunks", flush=True)
    long_results = []
    t_start = time.time()
    for i, c in enumerate(long_chunks):
        r = run_extract(model, tokenizer, device, c["text"], SCHEMA_RICH_CORPORATE)
        # Same success criterion as the summary counters: an empty object
        # still counts as successfully parsed.
        ok = "OK" if r["parsed"] is not None else "FAIL"
        print(f" [chunk {i+1}/{n_long}] {len(c['text'])}c {r['elapsed_s']}s out={r['n_output_tokens']} {ok}", flush=True)
        long_results.append(r)
    long_elapsed = time.time() - t_start
    long_ok = sum(1 for r in long_results if r["parsed"] is not None)
    long_agg = aggregate_corporate(long_results)
    # Alias map over the names mentioned as organizations, people,
    # headquarters and subsidiaries.
    all_names_long = ([o["name"] for o in long_agg["organizations"]]
                      + [p["name"] for p in long_agg["people"]]
                      + [hq for o in long_agg["organizations"] for hq in o["headquartered_in"]]
                      + [s for o in long_agg["organizations"] for s in o["subsidiaries"]])
    alias_long = merge_aliases(list(set(all_names_long)))
    long_graph = build_corporate_graph(long_agg, alias_long)
    print(f" total {long_elapsed:.1f}s agregado: orgs={len(long_agg['organizations'])} people={len(long_agg['people'])} agreements={len(long_agg['agreements'])}", flush=True)
    print(f" grafo: nodos={len(long_graph['nodes'])} aristas={len(long_graph['edges'])}", flush=True)
    # Build the endpoint set once instead of once per node.
    connected = ({head for head, _, _ in long_graph["edges"]}
                 | {tail for _, _, tail in long_graph["edges"]})
    out["long_text"] = {
        "elapsed_s": round(long_elapsed, 1),
        "n_chunks": n_long,
        "n_chunks_parsed_ok": long_ok,
        "agg": long_agg,
        "graph": {"nodes": long_graph["nodes"], "edges": long_graph["edges"]},
        "n_nodes": len(long_graph["nodes"]),
        "n_edges": len(long_graph["edges"]),
        "n_isolates": sum(1 for n in long_graph["nodes"] if n not in connected),
    }
    del long_results
    gc.collect()

    # ── B. Whole PDF
    print("\n[B] PDF — extract + clean + chunk + run all chunks...", flush=True)
    raw = extract_pdf_text(str(PDF_PATH))
    clean = clean_pdf_text(raw)
    pdf_chunks = chunk_with_overlap(clean, max_chars=800, overlap_sentences=1)
    n_pdf = len(pdf_chunks)
    print(f" PDF: {len(raw):,} → {len(clean):,} chars → {n_pdf} chunks", flush=True)
    pdf_results = []
    pdf_ok = 0
    t_start = time.time()
    for i, c in enumerate(pdf_chunks):
        r = run_extract(model, tokenizer, device, c["text"], SCHEMA_RICH_GDPR)
        # Record the result before reporting so the OK count covers all
        # i+1 chunks processed so far.
        pdf_results.append(r)
        if r["parsed"] is not None:
            pdf_ok += 1
        if (i+1) % 10 == 0:
            print(f" [chunk {i+1}/{n_pdf}] {pdf_ok}/{i+1} parsed OK ({time.time()-t_start:.0f}s acumulado)", flush=True)
    pdf_elapsed = time.time() - t_start
    pdf_agg = aggregate_gdpr(pdf_results)
    # Alias map for authorities + data controllers.
    all_names_pdf = ([dc["name"] for dc in pdf_agg["data_controllers"] if dc.get("name")]
                     + [a["name"] for a in pdf_agg["authorities"]])
    alias_pdf = merge_aliases(list(set(all_names_pdf)))
    pdf_graph = build_gdpr_graph(pdf_agg, alias_pdf)
    print(f" total {pdf_elapsed:.1f}s = {pdf_elapsed/60:.1f} min", flush=True)
    print(f" parsed OK: {pdf_ok}/{n_pdf}", flush=True)
    print(f" grafo: nodos={len(pdf_graph['nodes'])} aristas={len(pdf_graph['edges'])}", flush=True)
    out["pdf"] = {
        "elapsed_s": round(pdf_elapsed, 1),
        "n_chunks": n_pdf,
        "n_chunks_parsed_ok": pdf_ok,
        "agg_summary": {
            "n_data_controllers": len(pdf_agg["data_controllers"]),
            "n_dpo_contacts": len(pdf_agg["dpo_contacts"]),
            "n_data_categories": len(pdf_agg["data_categories"]),
            "n_rights": len(pdf_agg["rights_listed"]),
            "n_authorities": len(pdf_agg["authorities"]),
            "n_laws": len(pdf_agg["laws"]),
        },
        "agg_full": pdf_agg,
        "graph": {"nodes": pdf_graph["nodes"], "edges": pdf_graph["edges"]},
        "n_nodes": len(pdf_graph["nodes"]),
        "n_edges": len(pdf_graph["edges"]),
    }

    out_path = HERE / "nuextract_full.json"
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of depending on the locale default.
    out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\n[saved] {out_path}", flush=True)
|
||||
|
||||
|
||||
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user