"""Extracción de grafo ontológico desde un documento. Uso: python extract.py python extract.py data/condiciones-generales-bizum.pdf Optimizaciones vs extraction_pipeline: - 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos) - Chunks de 2000 chars - Paralelizado con ThreadPoolExecutor """ import sys import os import json import subprocess import time from concurrent.futures import ThreadPoolExecutor, as_completed sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib")) from extract_text_from_file import extract_text_from_file from core_functions import preprocess_text, extract_json_from_llm from split_text_into_chunks import split_text_into_chunks from deduplicate_entities import deduplicate_entities from deduplicate_relations import deduplicate_relations from entity_candidate import EntityCandidate from relation_candidate import RelationCandidate from render_sigma_html import render_sigma_html # ── Presets ──────────────────────────────────────────────────────────────────── OSINT_PRESETS = [ {"type_ref": "person", "label": "Person", "metadata_fields": ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]}, {"type_ref": "organization", "label": "Organization", "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"]}, {"type_ref": "location", "label": "Location", "metadata_fields": ["lat", "lon", "address", "country", "city"]}, {"type_ref": "event", "label": "Event", "metadata_fields": ["event_type", "date", "location", "description", "severity"]}, {"type_ref": "email", "label": "Email", "metadata_fields": ["address", "provider", "verified", "breached"]}, {"type_ref": "domain", "label": "Domain", "metadata_fields": ["fqdn", "registrar", "created_date", "expires_date"]}, {"type_ref": "ip_address", "label": "IP Address", "metadata_fields": ["ip", "asn", "country", "isp", "geolocation"]}, {"type_ref": "phone", "label": "Phone", "metadata_fields": ["number", "country_code", "carrier", "phone_type"]}, 
{"type_ref": "social_media", "label": "Social Media Account", "metadata_fields": ["platform", "username", "url", "followers", "verified"]}, {"type_ref": "document", "label": "Document", "metadata_fields": ["title", "format", "classification", "source"]}, {"type_ref": "crypto_wallet", "label": "Crypto Wallet", "metadata_fields": ["address", "blockchain", "balance"]}, {"type_ref": "malware", "label": "Malware", "metadata_fields": ["family", "hash_sha256", "threat_level"]}, {"type_ref": "vulnerability", "label": "Vulnerability", "metadata_fields": ["cve_id", "cvss", "affected_product", "exploited"]}, ] GENERIC_PRESETS = [ {"type_ref": "concept", "label": "Concept", "metadata_fields": ["name", "category", "definition"]}, {"type_ref": "url", "label": "URL/Link", "metadata_fields": ["url", "domain", "context"]}, {"type_ref": "date_reference", "label": "Date/Time", "metadata_fields": ["date", "precision", "context"]}, {"type_ref": "quantity", "label": "Quantity/Amount", "metadata_fields": ["value", "unit", "context"]}, {"type_ref": "coordinates", "label": "Coordinates", "metadata_fields": ["lat", "lon", "label"]}, {"type_ref": "text_fragment", "label": "Key Text Fragment", "metadata_fields": ["text", "category", "relevance"]}, ] # ── Custom presets (acumulativo, pensado para promoción al registry) ─────────── CUSTOM_PRESETS_PATH = os.path.join(os.path.dirname(__file__), "data", "custom_presets.json") def load_custom_presets() -> list[dict]: """Carga presets custom desde data/custom_presets.json si existe.""" if not os.path.exists(CUSTOM_PRESETS_PATH): return [] with open(CUSTOM_PRESETS_PATH) as f: data = json.load(f) return data.get("presets", []) def save_custom_presets(presets: list[dict]) -> None: """Guarda presets custom en data/custom_presets.json. 
Formato pensado para promoción al registry: { "presets": [ { "type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["field1", "field2"], "reason": "why this type exists", "source_doc": "document where it was first discovered", "promoted": false // true cuando se registre en el registry } ] } """ os.makedirs(os.path.dirname(CUSTOM_PRESETS_PATH), exist_ok=True) with open(CUSTOM_PRESETS_PATH, "w") as f: json.dump({"presets": presets}, f, ensure_ascii=False, indent=2) def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]: """Mergea tipos sugeridos con custom existentes. Dedup por type_ref.""" existing = load_custom_presets() existing_refs = {p["type_ref"] for p in existing} added = [] for s in suggested: ref = s.get("type_ref", "") if not ref or ref in existing_refs: continue existing_refs.add(ref) preset = { "type_ref": ref, "label": s.get("label", ref), "metadata_fields": s.get("metadata_fields", []), "reason": s.get("reason", ""), "source_doc": source_doc, "promoted": False, } existing.append(preset) added.append(preset) if added: save_custom_presets(existing) return added RELATION_TYPES = [ "employs", "works_for", "founded", "owns", "controls", "member_of", "affiliated_with", "collaborates_with", "communicates_with", "sent_to", "received_from", "located_in", "headquartered_in", "traveled_to", "operates_in", "participated_in", "caused", "occurred_at", "occurred_on", "mentions", "references", "describes", "authored", "published", "funds", "transacted_with", "invested_in", "hosts", "resolves_to", "exploits", "targets", "related_to", "part_of", "instance_of", "has_attribute", ] # ── LLM wrapper ─────────────────────────────────────────────────────────────── def claude_haiku_json(messages: list[dict]) -> dict: parts = [] for msg in messages: if msg["role"] == "system": parts.append(f"[SYSTEM]\n{msg['content']}") elif msg["role"] == "user": parts.append(f"[USER]\n{msg['content']}") prompt = "\n\n".join(parts) result = 
subprocess.run( ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt], capture_output=True, text=True, timeout=120, ) if result.returncode != 0: raise RuntimeError(f"claude -p failed: {result.stderr[:200]}") envelope = json.loads(result.stdout) return extract_json_from_llm(envelope.get("result", "")) # ── Unified prompt ───────────────────────────────────────────────────────────── def build_unified_prompt(presets, rel_types): type_lines = [] for p in presets: fields = ", ".join(p.get("metadata_fields", [])) type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]") return ( "You are an entity and relation extraction expert. " "Given text, extract ALL entities and relations in a single pass.\n\n" "ENTITY TYPES:\n" + "\n".join(type_lines) + "\n\n" "RELATION TYPES: " + ", ".join(rel_types) + "\n\n" 'OUTPUT FORMAT (strict JSON):\n' '{\n' ' "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],\n' ' "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],\n' ' "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]\n' '}\n\n' "RULES:\n" "- Extract ALL entities explicitly mentioned\n" "- Use exact type_ref from schema. Unknown attributes = null\n" "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied\n" "- Relations: from_name/to_name MUST match entity names exactly\n" "- suggested_types: for important entities that do NOT fit any type, suggest a new type. 
" "Use those suggested type_refs for those entities in the entities array.\n" '- If no new types needed: "suggested_types": []\n' "- Respond in the same language as the text for descriptions" ) # ── Process one chunk ────────────────────────────────────────────────────────── def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str): """Procesa un chunk: extrae entities + relations + suggested_types.""" try: resp = claude_haiku_json([ {"role": "system", "content": system_prompt}, {"role": "user", "content": chunk_text}, ]) except Exception as e: print(f" [WARN] chunk {chunk_idx}: {e}") return [], [], [] raw_entities = resp.get("entities", []) raw_relations = resp.get("relations", []) suggested = resp.get("suggested_types", []) entities = [] for ent in raw_entities: name = ent.get("name", "").strip() if not name: continue entities.append(EntityCandidate( name=name, type_ref=ent.get("type_ref", "concept"), attributes=ent.get("attributes", {}), confidence=float(ent.get("confidence", 0.5)), source_chunk_indices=[chunk_idx], )) relations = [] for rel in raw_relations: fn = rel.get("from_name", "").strip() tn = rel.get("to_name", "").strip() if not fn or not tn: continue relations.append(RelationCandidate( from_name=fn, to_name=tn, relation_type=rel.get("relation_type", "related_to"), confidence=float(rel.get("confidence", 0.5)), description=rel.get("description", ""), source_chunk_index=chunk_idx, )) return entities, relations, suggested # ── Sigma conversion ─────────────────────────────────────────────────────────── TYPE_COLORS = { "person": "#e74c3c", "organization": "#3498db", "location": "#2ecc71", "event": "#f39c12", "email": "#9b59b6", "domain": "#1abc9c", "ip_address": "#e67e22", "phone": "#95a5a6", "social_media": "#e91e63", "document": "#607d8b", "crypto_wallet": "#ff9800", "malware": "#f44336", "vulnerability": "#ff5722", "concept": "#00bcd4", "url": "#8bc34a", "date_reference": "#cddc39", "quantity": "#ffc107", "coordinates": "#4caf50", 
"text_fragment": "#78909c", } def to_sigma(entities, relations, entity_id_map): # Build name→UUID lookup from dedup map # entity_id_map: {name_variant -> uuid, ...} # Invert to uuid→canonical_name using entities list uuid_to_name = {} name_to_uuid = {} for e in entities: # Find this entity's UUID in the map uuid = entity_id_map.get(e.name, entity_id_map.get(e.name.lower().strip(), e.name)) uuid_to_name[uuid] = e.name name_to_uuid[e.name] = uuid degree = {} for r in relations: fid = r.from_id or r.from_name tid = r.to_id or r.to_name degree[fid] = degree.get(fid, 0) + 1 degree[tid] = degree.get(tid, 0) + 1 nodes = [] seen_uuids = set() for e in entities: uuid = name_to_uuid.get(e.name, e.name) if uuid in seen_uuids: continue seen_uuids.add(uuid) # Filter out 'type' — sigma.js reserves it for node render program reserved = {"type", "hidden", "x", "y"} attrs = {k: str(v) for k, v in (e.attributes or {}).items() if v is not None and k not in reserved} nodes.append({ "key": uuid, "attributes": { "label": e.name, "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"), "size": 4 + min(degree.get(uuid, 0) * 2, 20), "entity_type": e.type_ref, **attrs, }, }) node_keys = {n["key"] for n in nodes} edges = [] seen_edges = set() for i, r in enumerate(relations): fid = r.from_id or r.from_name tid = r.to_id or r.to_name if fid in node_keys and tid in node_keys and fid != tid: edge_key = (fid, tid, r.relation_type) if edge_key in seen_edges: continue seen_edges.add(edge_key) edges.append({ "key": f"e{i}", "source": fid, "target": tid, "attributes": {"label": r.relation_type}, }) return {"nodes": nodes, "edges": edges} # ── Reclasificación de entidades genéricas ───────────────────────────────────── GENERIC_TYPE_REFS = {"concept", "text_fragment", "url", "date_reference", "quantity", "coordinates"} def reclassify_generic_entities(entities, new_presets, workers=4): """Reclasifica entidades genéricas usando los tipos recién descubiertos. 
En vez de re-procesar chunks, hace 1 llamada batch a haiku con las entidades genéricas y los nuevos presets para reclasificarlas in-place. """ generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS] if not generic or not new_presets: return 0 # Construir prompt de reclasificación type_lines = [] for p in new_presets: fields = ", ".join(p.get("metadata_fields", [])) type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]") system = ( "You reclassify entities into more specific types. " "For each entity, decide if it fits one of the NEW types below better than its current generic type. " "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n" "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n" 'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n' "Only include entities that should change. Omit those that should stay as-is." ) # Procesar en batches de 30 entidades para no exceder contexto batch_size = 30 total_changed = 0 def _reclassify_batch(batch): items = [{"index": idx, "name": e.name, "current_type": e.type_ref, "attributes": e.attributes} for idx, e in batch] try: resp = claude_haiku_json([ {"role": "system", "content": system}, {"role": "user", "content": json.dumps(items, ensure_ascii=False)}, ]) return resp.get("reclassified", []) except Exception: return [] batches = [generic[i:i+batch_size] for i in range(0, len(generic), batch_size)] with ThreadPoolExecutor(max_workers=workers) as pool: futures = {pool.submit(_reclassify_batch, b): b for b in batches} for future in as_completed(futures): for item in future.result(): idx = item.get("index") new_ref = item.get("type_ref", "") if idx is not None and new_ref and 0 <= idx < len(entities): entities[idx].type_ref = new_ref if item.get("attributes"): entities[idx].attributes.update(item["attributes"]) total_changed += 1 return total_changed # ── Main 
# ── Main ───────────────────────────────────────────────────────────────────────


def main():
    """Run the full pipeline: extract text, chunk, extract, deduplicate, render.

    Command line: ``extract.py <file> [workers]``. A relative <file> is
    resolved against this script's directory. Outputs are written to the
    ``data`` directory next to the script: ``ontology_graph.html`` and
    ``extraction_result.json``. Newly suggested entity types are recorded via
    merge_suggested_into_custom().

    Exits with status 1 when the arguments are missing or invalid.
    """
    usage = "Uso: python extract.py <file> [workers]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.isabs(file_path):
        file_path = os.path.join(os.path.dirname(__file__), file_path)

    workers = 4
    if len(sys.argv) > 2:
        try:
            workers = int(sys.argv[2])
        except ValueError:
            workers = 0
        # ThreadPoolExecutor rejects max_workers <= 0; fail early with usage.
        if workers < 1:
            print(usage)
            sys.exit(1)

    print("=== Ontology Graph Extraction ===")
    print(f"File: {file_path}")
    print(f"Workers: {workers}")
    start = time.monotonic()

    # 1. Extract and preprocess the text
    print("\n[1/5] Extracting text...")
    raw = extract_text_from_file(file_path)
    text = preprocess_text(raw)
    print(f" {len(text)} chars")

    # 2. Chunking
    print("[2/5] Chunking...")
    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
    print(f" {len(chunks)} chunks")

    # 3. Parallel extraction
    custom = load_custom_presets()
    # Only use custom presets not yet promoted (promoted ones will already be
    # in the registry).
    active_custom = [p for p in custom if not p.get("promoted", False)]
    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
    print(f" Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)

    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
    all_entities = []
    all_relations = []
    all_suggested = []

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(process_chunk, i, chunk, system_prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in as_completed(futures):
            idx = futures[future]
            ents, rels, sugg = future.result()
            all_entities.extend(ents)
            all_relations.extend(rels)
            all_suggested.extend(sugg)
            print(f" chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations"
                  + (f", {len(sugg)} new types" if sugg else ""))

    # 4. Deduplication
    print("\n[4/5] Deduplicating...")
    print(f" Raw: {len(all_entities)} entities, {len(all_relations)} relations")

    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
    final_entities = dedup.entities
    entity_id_map = dedup.name_to_id

    final_relations = deduplicate_relations(all_relations, entity_id_map)

    print(f" Final: {len(final_entities)} entities, {len(final_relations)} relations")
    print(f" Merged: {dedup.total_before - dedup.total_after} entities, "
          f"{len(all_relations) - len(final_relations)} relations")

    # Register suggested types in custom_presets.json
    unique_suggested = []
    if all_suggested:
        seen = set()
        for s in all_suggested:
            # Suggestions come straight from the LLM; ignore malformed ones.
            if not isinstance(s, dict):
                continue
            key = s.get("type_ref", "")
            if isinstance(key, str) and key and key not in seen:
                seen.add(key)
                unique_suggested.append(s)

        source_doc = os.path.basename(file_path)
        added = merge_suggested_into_custom(unique_suggested, source_doc)
        total_custom = len(load_custom_presets())

        if added:
            print(f"\n New types registered ({len(added)}):")
            for p in added:
                print(f" + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
                print(f" Reason: {p['reason']}")
            print(f" Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")

            # Reclassify generic entities using the newly discovered types
            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
            if n_generic > 0:
                print(f"\n Reclassifying {n_generic} generic entities with new types...")
                changed = reclassify_generic_entities(final_entities, added, workers=workers)
                print(f" Reclassified: {changed}/{n_generic}")
        else:
            print(f"\n {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")

    # Per-type stats, most frequent first
    type_counts = {}
    for e in final_entities:
        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
    print("\n Entity types:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    rel_counts = {}
    for r in final_relations:
        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
    print(" Relation types:")
    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")

    # 5. Visualization
    print("\n[5/5] Generating graph...")
    graph = to_sigma(final_entities, final_relations, entity_id_map)

    out_dir = os.path.join(os.path.dirname(__file__), "data")
    # The data directory may not exist yet (e.g. the input was given as an
    # absolute path and no custom presets have ever been saved).
    os.makedirs(out_dir, exist_ok=True)
    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
    print(f" {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
    print(f" HTML: file://{html_path}")

    # Save the intermediate JSON. ensure_ascii=False emits raw non-ASCII
    # characters, so the file encoding is pinned to UTF-8.
    json_path = os.path.join(out_dir, "extraction_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "entities": [
                {"name": e.name, "type_ref": e.type_ref, "confidence": e.confidence,
                 "attributes": e.attributes}
                for e in final_entities
            ],
            "relations": [
                {"from": r.from_name, "to": r.to_name, "type": r.relation_type,
                 "confidence": r.confidence, "description": r.description}
                for r in final_relations
            ],
            "suggested_types": [dict(s) for s in unique_suggested],
        }, f, ensure_ascii=False, indent=2)
    print(f" JSON: {json_path}")

    elapsed = time.monotonic() - start
    print(f"\nDone in {elapsed:.1f}s")


if __name__ == "__main__":
    main()