"""NuExtract 2.0-2B GPU — 'production' version with all the improvements.

- repetition_penalty=1.15 (avoids generation loops)
- forced chunking (max 800 chars / ~250 tokens) on ALL text
- all 97 chunks of the PDF (not a sample)
- 25 Spanish sentences, chunked properly
- de-duplicated aggregation with counts
- simple coreference (normalize + substring)

Dumps everything to nuextract_full.json — ready for notebook 08.
"""
from __future__ import annotations

import gc
import json
import os
import re
import sys
import time
import warnings
from collections import Counter, defaultdict
from pathlib import Path

warnings.filterwarnings("ignore")
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

HERE = Path(__file__).resolve().parent

# Make the local function registry importable, and drop any sys.path entry
# that points *inside* it so that `core` resolves from the registry root.
_pf = "/home/lucas/fn_registry/python/functions"
sys.path = [p for p in sys.path if not p.startswith(_pf + "/")]
if _pf not in sys.path:
    sys.path.insert(0, _pf)

# The helpers below are pure Python; tolerate a missing registry checkout at
# import time so they stay importable, and fail fast in main() instead.
try:
    from core.extract_pdf_text import extract_pdf_text
except ImportError as _exc:
    extract_pdf_text = None
    _EXTRACT_PDF_IMPORT_ERROR = _exc
else:
    _EXTRACT_PDF_IMPORT_ERROR = None

VAULT = Path("/home/lucas/vaults/osint_nlp_models")
PDF_PATH = VAULT / "test_documents" / "politica_proteccion_datos.pdf"

# Single source of truth for values that are both used and reported in "meta".
MODEL_ID = "numind/NuExtract-2.0-2B"
REPETITION_PENALTY = 1.15
MAX_CHARS_CHUNK = 800


def clean_pdf_text(text: str) -> str:
    """Normalize raw PDF text before sentence chunking.

    NOTE(review): the reviewed copy of this file was truncated in the middle
    of the fourth substitution (right after ``re.sub(r"(?``). The first three
    steps are original; everything from the marked line on is a best-effort
    reconstruction and must be checked against the real file.
    """
    # Drop "N/M" tokens (presumably page counters such as "3/12" — this also
    # removes short dates like "12/05"; confirm that is intended).
    text = re.sub(r"\b\d{1,2}/\d{1,2}\b", " ", text)
    text = text.replace("\t", " ")
    # Re-join words that were hyphenated across a line break.
    text = re.sub(r"-\s*\n\s*", "", text)
    # --- reconstructed from here on (see NOTE above) ---
    # Single newlines are soft wraps; keep blank lines as paragraph breaks.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    text = re.sub(r"[ ]{2,}", " ", text)
    return text.strip()


def chunk_with_overlap(text, max_chars=MAX_CHARS_CHUNK, overlap_sentences=1):
    """Split *text* into sentence-aligned chunks of at most *max_chars*.

    Each chunk after the first starts with the last *overlap_sentences*
    sentences of the previous chunk, provided that the overlap plus the next
    new sentence still fits in *max_chars*. A single sentence longer than
    *max_chars* is emitted as its own (oversized) chunk rather than split.

    Returns a list of ``{"text": str, "sentences": list[str]}`` dicts.

    NOTE(review): the function header, the sentence splitter and the loop
    set-up were lost in the reviewed copy (truncated source). They are
    reconstructed here from the surviving loop body and from the call sites
    ``chunk_with_overlap(text, max_chars=800, overlap_sentences=1)``; verify
    the splitting regex and the defaults against the real file.
    """
    sentences = [
        s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()
    ]
    chunks = []
    i = 0
    while i < len(sentences):
        current_sents = []
        current_len = 0
        if chunks and overlap_sentences > 0:
            prev_sents = chunks[-1]["sentences"][-overlap_sentences:]
            overlap_len = sum(len(s) + 1 for s in prev_sents)
            next_sentence_len = len(sentences[i]) + 1
            # Only carry the overlap when it leaves room for new content.
            if overlap_len + next_sentence_len <= max_chars:
                current_sents = list(prev_sents)
                current_len = overlap_len
        # Always consume at least one new sentence so the loop advances.
        current_sents.append(sentences[i])
        current_len += len(sentences[i]) + 1
        i += 1
        while (i < len(sentences)
               and current_len + len(sentences[i]) + 1 <= max_chars):
            current_sents.append(sentences[i])
            current_len += len(sentences[i]) + 1
            i += 1
        chunks.append({"text": " ".join(current_sents),
                       "sentences": current_sents})
    return chunks


# 25 Spanish sentences (ASCII only, no accents) about Spanish corporations.
LONG_TEXT_ES = (
    "BBVA, presidido por Carlos Torres, completo en 2024 la integracion operativa de Banco Sabadell tras la fusion. "
    "Onur Genc, consejero delegado del banco desde 2018, lidero el proceso desde la sede central en Bilbao. "
    "El banco mantiene oficinas en Plaza San Nicolas 4 y opera en mas de 25 paises. "
    "Banco Santander, dirigido por Ana Botin, sigue siendo el primer banco espanol por capitalizacion bursatil. "
    "Hector Grisi asumio el cargo de CEO global de Santander en enero de 2023, reemplazando a Jose Antonio Alvarez. "
    "CaixaBank, presidida por Jose Ignacio Goirigolzarri y con sede en Valencia desde 2017, completo la fusion con Bankia. "
    "Gonzalo Gortazar es el consejero delegado de CaixaBank y reporta al consejo formado en parte por La Caixa. "
    "El Banco de Espana, gobernado por Pablo Hernandez de Cos hasta 2024 y por Margarita Delgado en 2025, supervisa el sector. "
    "Luis de Guindos, vicepresidente del Banco Central Europeo, fue ministro de Economia en el gobierno de Mariano Rajoy. "
    "La Comision Nacional del Mercado de Valores, presidida por Rodrigo Buenaventura, regula los mercados financieros. "
    "BBVA anuncio en mayo de 2024 una OPA hostil sobre Banco Sabadell que el consejo del banco rechazo inicialmente. "
    "Cesar Gonzalez-Bueno, CEO de Sabadell, defendio la independencia del banco junto con su presidente Josep Oliu. "
    "Repsol, presidida por Antonio Brufau y con CEO Josu Jon Imaz, vendio su filial mexicana a Macquarie. "
    "Iberdrola, liderada por Ignacio Galan, opera Avangrid en EEUU y firmo un acuerdo PPA con Amazon. "
    "Andy Jassy, CEO de Amazon desde Seattle, agradecio el contrato a Iberdrola en una nota publica. "
    "Endesa, filial de la italiana Enel, tiene como CEO a Marina Serrano y opera en Espana, Portugal y Marruecos. "
    "Ferrovial, presidida por Rafael del Pino, traslado su sede social a Holanda en 2022 generando polemica politica. "
    "ACS, presidida por Florentino Perez, sigue siendo lider mundial en concesiones de infraestructura. "
    "Inditex, fundada por Amancio Ortega y presidida por Marta Ortega desde 2022, tiene su sede en Arteixo, A Coruna. "
    "Pablo Isla, expresidente de Inditex y actual consejero de Telefonica, se incorporo al consejo en 2024. "
    "Telefonica, presidida por Jose Maria Alvarez-Pallete, sufrio la entrada del estado en su capital con SEPI. "
    "Saudi Telecom Company adquirio un 9.9% de Telefonica en 2023, lo que motivo la respuesta del gobierno espanol. "
    "Cristina Aldamiz-Echevarria fue nombrada directora de Recursos Humanos del Grupo Mapfre, dirigido por Antonio Huertas. "
    "Naturgy, presidida por Francisco Reynes, recibio una OPA parcial del fondo emirati IFM en 2021 que se cancelo. "
    "Indra, con Marc Murtra como presidente, se ha posicionado como contratista clave de Defensa para el ministerio de Margarita Robles."
)

# Extraction templates handed to the tokenizer's chat template. The text is
# kept exactly as found in the reviewed file (single-line, space separated).
SCHEMA_RICH_CORPORATE = (
    '{ "organizations": [ { "name": "string", "ceo": "string", '
    '"chairman_president": "string", "headquartered_in": "string", '
    '"subsidiaries": ["string"], "parent_company": "string" } ], '
    '"people": [ { "name": "string", "role": "string", '
    '"organization": "string" } ], '
    '"agreements": [ { "between": ["string"], "topic": "string", '
    '"amount": "string" } ] }'
)

SCHEMA_RICH_GDPR = (
    '{ "data_controller": { "name": "string", "address": "string", '
    '"registration": "string" }, '
    '"dpo_contact": { "email": "string", "address": "string" }, '
    '"data_categories": ["string"], "rights_listed": ["string"], '
    '"authorities_mentioned": [ { "name": "string", '
    '"url_or_contact": "string" } ], '
    '"laws_mentioned": ["string"] }'
)


def parse_json_safe(text: str):
    """Parse the JSON object that starts at the FIRST ``{`` in *text*.

    Anything before that brace and anything after the end of the object is
    ignored. Returns the decoded object, or ``None`` when *text* is empty,
    contains no ``{``, or the object starting there is not valid JSON (for
    example because generation was cut off).
    """
    if not text:
        return None
    start = text.find("{")
    if start < 0:
        return None
    try:
        # raw_decode stops at the end of the first complete value, which is
        # what the previous "shrink the tail and retry" loop searched for,
        # without re-parsing the text once per character.
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        return None
    return obj


def run_extract(model, tokenizer, device, document, template,
                max_new_tokens=1024, repetition_penalty=REPETITION_PENALTY):
    """Run one greedy extraction of *template* over *document*.

    Builds a single-turn chat prompt via ``tokenizer.apply_chat_template``
    (passing *template* through as a keyword), generates deterministically
    (no sampling, one beam) and parses the completion with
    ``parse_json_safe``.

    Returns a dict with ``elapsed_s`` (generation wall time, 2 decimals),
    ``n_input_tokens``, ``n_output_tokens``, ``raw_text`` (decoded
    completion) and ``parsed`` (decoded JSON or ``None``).
    """
    messages = [{"role": "user", "content": document}]
    text = tokenizer.apply_chat_template(
        messages,
        template=template,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], padding=True, return_tensors="pt").to(device)
    t0 = time.time()
    generated = model.generate(
        **inputs,
        do_sample=False,
        num_beams=1,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,  # avoids generation loops
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.time() - t0
    n_input = inputs["input_ids"].shape[1]
    n_output = generated.shape[1] - n_input
    # Decode only the newly generated tail, not the echoed prompt.
    out_text = tokenizer.decode(generated[0][n_input:],
                                skip_special_tokens=True)
    parsed = parse_json_safe(out_text)
    return {
        "elapsed_s": round(elapsed, 2),
        "n_input_tokens": int(n_input),
        "n_output_tokens": int(n_output),
        "raw_text": out_text,
        "parsed": parsed,
    }


# ── aggregators and coreference ──

def _clean_str(value) -> str:
    """Return *value* stripped if it is a string, otherwise ``""``."""
    return value.strip() if isinstance(value, str) else ""


def _as_list(value) -> list:
    """Return *value* if it is a list, otherwise an empty list.

    Model output is untrusted: a field declared as a list may come back as a
    string, and iterating that would yield single characters.
    """
    return value if isinstance(value, list) else []


def aggregate_corporate(results: list[dict]) -> dict:
    """Accumulate organizations / people / agreements over N chunk results.

    Organizations and people are keyed by lower-cased name; every mention
    increments ``count`` and appends the non-empty attribute values seen
    (duplicates kept, so frequencies can be derived). Subsidiaries are
    de-duplicated and returned sorted. Agreements are kept only when they
    list at least two parties.

    Results whose ``parsed`` is missing or not a dict are skipped, as are
    malformed entries (non-dict items, non-string names, non-list lists).
    """
    orgs: dict[str, dict] = {}
    people: dict[str, dict] = {}
    agreements: list[dict] = []
    org_fields = ("ceo", "chairman_president", "headquartered_in",
                  "parent_company")

    for r in results:
        parsed = r.get("parsed")
        if not isinstance(parsed, dict):
            continue

        for o in _as_list(parsed.get("organizations")):
            if not isinstance(o, dict):
                continue
            name = _clean_str(o.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in orgs:
                orgs[key] = {"name": name, "count": 0, "ceo": [],
                             "chairman_president": [],
                             "headquartered_in": [],
                             "subsidiaries": set(), "parent_company": []}
            entry = orgs[key]
            entry["count"] += 1
            for f in org_fields:
                v = _clean_str(o.get(f))
                if v:
                    entry[f].append(v)
            for sub in _as_list(o.get("subsidiaries")):
                sub = _clean_str(sub)
                if sub:
                    entry["subsidiaries"].add(sub)

        for p in _as_list(parsed.get("people")):
            if not isinstance(p, dict):
                continue
            name = _clean_str(p.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in people:
                people[key] = {"name": name, "count": 0, "roles": [],
                               "organizations": []}
            entry = people[key]
            entry["count"] += 1
            role = _clean_str(p.get("role"))
            if role:
                entry["roles"].append(role)
            org_name = _clean_str(p.get("organization"))
            if org_name:
                entry["organizations"].append(org_name)

        for ag in _as_list(parsed.get("agreements")):
            if not isinstance(ag, dict):
                continue
            parties = [s for s in map(_clean_str,
                                      _as_list(ag.get("between"))) if s]
            if len(parties) >= 2:
                agreements.append({"between": parties,
                                   "topic": ag.get("topic"),
                                   "amount": ag.get("amount")})

    # Sets are not JSON-serializable; expose them as sorted lists.
    for o in orgs.values():
        o["subsidiaries"] = sorted(o["subsidiaries"])
    return {"organizations": list(orgs.values()),
            "people": list(people.values()),
            "agreements": agreements}


def aggregate_gdpr(results: list[dict]) -> dict:
    """Accumulate GDPR-template extractions over N chunk results.

    Returns a dict with:
      - ``data_controllers``: the raw controller dicts that carry a non-empty
        string name (one per chunk at most);
      - ``dpo_contacts``: raw DPO dicts with a non-empty email or address;
      - ``data_categories`` / ``rights_listed`` / ``laws``: value -> count;
      - ``authorities``: list of ``{name, contact_options, count}`` keyed
        internally by lower-cased name.

    Malformed values (non-dict objects, non-list lists, non-string items)
    are ignored.
    """
    data_controllers: list[dict] = []
    dpo_contacts: list[dict] = []
    data_categories: Counter = Counter()
    rights_listed: Counter = Counter()
    laws: Counter = Counter()
    authorities: dict[str, dict] = {}

    for r in results:
        parsed = r.get("parsed")
        if not isinstance(parsed, dict):
            continue

        dc = parsed.get("data_controller")
        if isinstance(dc, dict) and _clean_str(dc.get("name")):
            data_controllers.append(dc)

        dpo = parsed.get("dpo_contact")
        if isinstance(dpo, dict) and (_clean_str(dpo.get("email"))
                                      or _clean_str(dpo.get("address"))):
            dpo_contacts.append(dpo)

        for field, counter in (("data_categories", data_categories),
                               ("rights_listed", rights_listed),
                               ("laws_mentioned", laws)):
            for item in _as_list(parsed.get(field)):
                item = _clean_str(item)
                if item:
                    counter[item] += 1

        for a in _as_list(parsed.get("authorities_mentioned")):
            if not isinstance(a, dict):
                continue
            name = _clean_str(a.get("name"))
            if not name:
                continue
            key = name.lower()
            if key not in authorities:
                authorities[key] = {"name": name, "contact_options": [],
                                    "count": 0}
            authorities[key]["count"] += 1
            contact = _clean_str(a.get("url_or_contact"))
            if contact:
                authorities[key]["contact_options"].append(contact)

    return {
        "data_controllers": data_controllers,
        "dpo_contacts": dpo_contacts,
        "data_categories": dict(data_categories),
        "rights_listed": dict(rights_listed),
        "authorities": list(authorities.values()),
        "laws": dict(laws),
    }


def normalize_name(s: str) -> str:
    """Lower-case *s*, drop common punctuation and collapse whitespace."""
    s = s.strip()
    s = re.sub(r"[\.,;:\"'`()\[\]]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip().lower()


def merge_aliases(entity_names: list[str]) -> dict:
    """Return a dict ``{original name -> canonical name}``.

    Two passes:
      1. Names with the same normalized form are grouped; the longest
         spelling (ties broken lexicographically) becomes the canonical one.
      2. A canonical name whose normalized form (>= 4 chars) appears as a
         whole-word substring of a longer canonical name is absorbed by it,
         longest container first.

    The result is deterministic for a given set of names, regardless of the
    order in which they are passed, and absorption chains are followed to
    their final target.
    """
    norm_groups = defaultdict(list)
    for n in entity_names:
        norm_groups[normalize_name(n)].append(n)

    canonical: dict = {}
    norms: dict = {}  # canonical spelling -> its normalized form
    for nrm, group in norm_groups.items():
        winner = max(group, key=lambda x: (len(x), x))
        norms[winner] = nrm
        for n in group:
            canonical[n] = winner

    # Longest normalized form first so a short name lands on its longest
    # container; the name itself breaks ties to keep the order stable.
    canon_names = sorted(norms, key=lambda n: (-len(norms[n]), n))
    absorbed: dict = {}
    for long_n in canon_names:
        long_norm = norms[long_n]
        for short_n in canon_names:
            if short_n == long_n or short_n in absorbed:
                continue
            short_norm = norms[short_n]
            if len(short_norm) < 4:
                # Too short to be a safe alias ("SA", "de", initials...).
                continue
            if re.search(r"\b" + re.escape(short_norm) + r"\b", long_norm):
                absorbed[short_n] = long_n

    def final_target(name):
        # Follow A -> B -> C; `seen` guards against an accidental cycle.
        seen = set()
        while name in absorbed and name not in seen:
            seen.add(name)
            name = absorbed[name]
        return name

    return {orig: final_target(canon) for orig, canon in canonical.items()}


def build_corporate_graph(agg: dict, alias_map: dict | None = None) -> dict:
    """Build the nodes and edges of the corporate graph.

    *agg* is the output of ``aggregate_corporate``; *alias_map* (optional)
    maps raw names to canonical ones. Returns ``{"nodes": {name: type},
    "edges": [(head, kind, tail), ...]}`` with edges de-duplicated and
    sorted so the output is reproducible.
    """
    if alias_map is None:
        alias_map = {}

    def resolve(n):
        return alias_map.get(n, n)

    nodes = {}     # name -> type
    edges = set()  # (head, kind, tail)

    for org in agg["organizations"]:
        name = resolve(org["name"])
        # An explicit organization entry wins over a type guessed earlier.
        nodes[name] = "organization"
        for ceo in org["ceo"]:
            ceo_r = resolve(ceo)
            nodes.setdefault(ceo_r, "person")
            edges.add((ceo_r, "ceo_of", name))
        for pres in org["chairman_president"]:
            pres_r = resolve(pres)
            nodes.setdefault(pres_r, "person")
            edges.add((pres_r, "president_of", name))
        for hq in org["headquartered_in"]:
            hq_r = resolve(hq)
            nodes.setdefault(hq_r, "location")
            edges.add((name, "headquartered_in", hq_r))
        for parent in org["parent_company"]:
            parent_r = resolve(parent)
            nodes.setdefault(parent_r, "organization")
            edges.add((name, "subsidiary_of", parent_r))
        for sub in org["subsidiaries"]:
            sub_r = resolve(sub)
            nodes.setdefault(sub_r, "organization")
            edges.add((sub_r, "subsidiary_of", name))

    for p in agg["people"]:
        name = resolve(p["name"])
        nodes.setdefault(name, "person")
        for org in p["organizations"]:
            org_r = resolve(org)
            nodes.setdefault(org_r, "organization")
            edges.add((name, "works_at", org_r))

    for ag in agg["agreements"]:
        parties = [resolve(p) for p in ag["between"]]
        for p in parties:
            nodes.setdefault(p, "organization")
        for idx, a in enumerate(parties):
            for b in parties[idx + 1:]:
                # The relation is symmetric: skip self-loops created by alias
                # merging and pairs already recorded in the other direction.
                if a == b or (b, "agreement_with", a) in edges:
                    continue
                edges.add((a, "agreement_with", b))

    return {"nodes": nodes, "edges": sorted(edges)}


def build_gdpr_graph(agg: dict, alias_map: dict | None = None) -> dict:
    """Build the nodes and edges of the GDPR graph.

    *agg* is the output of ``aggregate_gdpr``. Only the first data controller
    with a usable name is kept, and only the first contact option of each
    authority. DPO emails and addresses become nodes without edges. Returns
    ``{"nodes": {name: type}, "edges": [(head, kind, tail), ...]}`` with
    sorted edges.
    """
    if alias_map is None:
        alias_map = {}

    def resolve(n):
        # The alias map may be keyed by the raw or by the stripped spelling.
        stripped = n.strip()
        return alias_map.get(n, alias_map.get(stripped, stripped))

    nodes = {}
    edges = set()

    # data_controller — pick the first one with a non-empty name
    for dc in agg["data_controllers"]:
        if not _clean_str(dc.get("name")):
            continue
        name = resolve(dc["name"])
        nodes[name] = "data_controller"
        addr_raw = _clean_str(dc.get("address"))
        if addr_raw:
            addr = resolve(addr_raw)
            nodes.setdefault(addr, "location")
            edges.add((name, "located_in", addr))
        break  # only the first one

    for dpo in agg["dpo_contacts"]:
        email = _clean_str(dpo.get("email"))
        if email:
            nodes.setdefault(email, "email")
        addr = _clean_str(dpo.get("address"))
        if addr:
            nodes.setdefault(addr, "location")

    for c in agg["data_categories"]:
        nodes.setdefault(c, "data_category")
    for r in agg["rights_listed"]:
        nodes.setdefault(r, "right")

    for a in agg["authorities"]:
        name = resolve(a["name"])
        nodes.setdefault(name, "authority")
        for c in a["contact_options"][:1]:  # one contact per authority
            nodes.setdefault(c, "url")
            edges.add((name, "contact", c))

    for law in agg["laws"]:
        nodes.setdefault(law, "law")

    return {"nodes": nodes, "edges": sorted(edges)}


# ── main ──

def _count_parsed_ok(results: list[dict]) -> int:
    """Number of results whose completion decoded to JSON."""
    return sum(1 for r in results if r["parsed"] is not None)


def _count_isolates(graph: dict) -> int:
    """Number of nodes that do not appear as an endpoint of any edge."""
    connected = set()
    for head, _kind, tail in graph["edges"]:
        connected.add(head)
        connected.add(tail)
    return sum(1 for n in graph["nodes"] if n not in connected)


def _corporate_alias_pool(agg: dict) -> list[str]:
    """Every name that build_corporate_graph will try to resolve."""
    names = set()
    for o in agg["organizations"]:
        names.add(o["name"])
        for field in ("ceo", "chairman_president", "headquartered_in",
                      "parent_company", "subsidiaries"):
            names.update(o[field])
    for p in agg["people"]:
        names.add(p["name"])
        names.update(p["organizations"])
    for ag in agg["agreements"]:
        names.update(ag["between"])
    return sorted(names)


def main():
    """Run both benchmarks on GPU and dump the results to JSON.

    A. ``LONG_TEXT_ES`` with the corporate template.
    B. The whole data-protection PDF with the GDPR template.

    Prints progress to stdout, returns early (without writing anything) when
    CUDA is unavailable, and writes ``nuextract_full.json`` next to this
    file otherwise.
    """
    if extract_pdf_text is None:
        # Fail before the (slow) model load, as the module-level import did.
        raise ImportError(
            "core.extract_pdf_text could not be imported from " + _pf
        ) from _EXTRACT_PDF_IMPORT_ERROR

    print("[load] loading model + tokenizer...", flush=True)
    t0 = time.time()
    import torch
    from transformers import AutoTokenizer, AutoModelForImageTextToText

    if not torch.cuda.is_available():
        print("CUDA not available — exiting", flush=True)
        return
    device = "cuda"
    dtype = torch.bfloat16
    print(f"[device] {device} dtype={dtype}", flush=True)

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        padding_side="left",
    )
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=dtype,
        attn_implementation="sdpa",
    ).to(device)
    model.eval()
    print(f"[load] done in {time.time()-t0:.1f}s", flush=True)

    out: dict = {"meta": {"device": device, "dtype": str(dtype),
                          "model": MODEL_ID,
                          "repetition_penalty": REPETITION_PENALTY,
                          "max_chars_chunk": MAX_CHARS_CHUNK}}

    # ── A. LONG_TEXT_ES with chunking
    print("\n[A] LONG_TEXT_ES — chunking + run...", flush=True)
    long_chunks = chunk_with_overlap(LONG_TEXT_ES, max_chars=MAX_CHARS_CHUNK,
                                     overlap_sentences=1)
    print(f" {len(LONG_TEXT_ES)} chars → {len(long_chunks)} chunks", flush=True)
    long_results = []
    t_start = time.time()
    for i, c in enumerate(long_chunks):
        r = run_extract(model, tokenizer, device, c["text"],
                        SCHEMA_RICH_CORPORATE)
        # Same success criterion as n_chunks_parsed_ok below.
        ok = "OK" if r["parsed"] is not None else "FAIL"
        print(f" [chunk {i+1}/{len(long_chunks)}] {len(c['text'])}c {r['elapsed_s']}s out={r['n_output_tokens']} {ok}", flush=True)
        long_results.append(r)
    long_elapsed = time.time() - t_start

    long_agg = aggregate_corporate(long_results)
    # alias map over all the names mentioned
    alias_long = merge_aliases(_corporate_alias_pool(long_agg))
    long_graph = build_corporate_graph(long_agg, alias_long)
    print(f" total {long_elapsed:.1f}s agregado: orgs={len(long_agg['organizations'])} people={len(long_agg['people'])} agreements={len(long_agg['agreements'])}", flush=True)
    print(f" grafo: nodos={len(long_graph['nodes'])} aristas={len(long_graph['edges'])}", flush=True)
    out["long_text"] = {
        "elapsed_s": round(long_elapsed, 1),
        "n_chunks": len(long_chunks),
        "n_chunks_parsed_ok": _count_parsed_ok(long_results),
        "agg": long_agg,
        "graph": {"nodes": long_graph["nodes"],
                  "edges": long_graph["edges"]},
        "n_nodes": len(long_graph["nodes"]),
        "n_edges": len(long_graph["edges"]),
        "n_isolates": _count_isolates(long_graph),
    }
    del long_results
    gc.collect()

    # ── B. Whole PDF
    print("\n[B] PDF — extract + clean + chunk + run all chunks...", flush=True)
    raw = extract_pdf_text(str(PDF_PATH))
    clean = clean_pdf_text(raw)
    pdf_chunks = chunk_with_overlap(clean, max_chars=MAX_CHARS_CHUNK,
                                    overlap_sentences=1)
    print(f" PDF: {len(raw):,} → {len(clean):,} chars → {len(pdf_chunks)} chunks", flush=True)
    pdf_results = []
    t_start = time.time()
    for i, c in enumerate(pdf_chunks):
        # Record the result first so the progress line counts this chunk too.
        pdf_results.append(run_extract(model, tokenizer, device, c["text"],
                                       SCHEMA_RICH_GDPR))
        if (i + 1) % 10 == 0:
            ok_count = _count_parsed_ok(pdf_results)
            print(f" [chunk {i+1}/{len(pdf_chunks)}] {ok_count}/{i+1} parsed OK ({time.time()-t_start:.0f}s acumulado)", flush=True)
    pdf_elapsed = time.time() - t_start

    pdf_agg = aggregate_gdpr(pdf_results)
    # alias map for authorities + data controllers
    all_names_pdf = ([dc["name"] for dc in pdf_agg["data_controllers"]]
                     + [a["name"] for a in pdf_agg["authorities"]])
    alias_pdf = merge_aliases(sorted(set(all_names_pdf)))
    pdf_graph = build_gdpr_graph(pdf_agg, alias_pdf)
    pdf_ok = _count_parsed_ok(pdf_results)
    print(f" total {pdf_elapsed:.1f}s = {pdf_elapsed/60:.1f} min", flush=True)
    print(f" parsed OK: {pdf_ok}/{len(pdf_chunks)}", flush=True)
    print(f" grafo: nodos={len(pdf_graph['nodes'])} aristas={len(pdf_graph['edges'])}", flush=True)
    out["pdf"] = {
        "elapsed_s": round(pdf_elapsed, 1),
        "n_chunks": len(pdf_chunks),
        "n_chunks_parsed_ok": pdf_ok,
        "agg_summary": {
            "n_data_controllers": len(pdf_agg["data_controllers"]),
            "n_dpo_contacts": len(pdf_agg["dpo_contacts"]),
            "n_data_categories": len(pdf_agg["data_categories"]),
            "n_rights": len(pdf_agg["rights_listed"]),
            "n_authorities": len(pdf_agg["authorities"]),
            "n_laws": len(pdf_agg["laws"]),
        },
        "agg_full": pdf_agg,
        "graph": {"nodes": pdf_graph["nodes"], "edges": pdf_graph["edges"]},
        "n_nodes": len(pdf_graph["nodes"]),
        "n_edges": len(pdf_graph["edges"]),
    }

    out_path = HERE / "nuextract_full.json"
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # instead of depending on the platform default.
    out_path.write_text(json.dumps(out, indent=2, ensure_ascii=False),
                        encoding="utf-8")
    print(f"\n[saved] {out_path}", flush=True)


if __name__ == "__main__":
    main()