{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Ontology Graph Extraction\n", "\n", "Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n", "- LLM: `claude -p --model haiku`\n", "- Tipos: OSINT del registry + genéricos (concept, url, date, quantity, text_fragment, coordinates)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'python.functions.core.extract_json_from_llm'", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m 4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 10\u001b[39m \n", "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'" ] } ], "source": [ "import sys, os, json, subprocess\n", "\n", "ROOT = '/home/lucas/fn_registry'\n", "os.environ['FN_REGISTRY_ROOT'] = ROOT\n", "sys.path.insert(0, ROOT)\n", "\n", "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n", "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n", "from python.functions.datascience.render_sigma_html import render_sigma_html\n", "\n", "print('Registry root:', ROOT)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'FN_REGISTRY_ROOT'", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m 4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n", "\u001b[36mFile \u001b[39m\u001b[32m:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n", "\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'" ] } ], "source": [ "import sys, os, json, subprocess\n", "\n", "ROOT = os.environ['FN_REGISTRY_ROOT']\n", "sys.path.insert(0, ROOT)\n", "\n", "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n", "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n", "from python.functions.datascience.render_sigma_html import render_sigma_html\n", "\n", "print('Registry root:', ROOT)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LLM wrapper: claude -p + haiku" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def claude_haiku_json(messages: list[dict]) -> dict:\n", " \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n", " # Construir prompt desde messages\n", " parts = []\n", " for msg in messages:\n", " role = msg['role']\n", " content = msg['content']\n", " if role == 'system':\n", " parts.append(f\"[SYSTEM]\\n{content}\")\n", " elif role == 'user':\n", " parts.append(f\"[USER]\\n{content}\")\n", " prompt = \"\\n\\n\".join(parts)\n", " \n", " result = subprocess.run(\n", " ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n", " capture_output=True, text=True, timeout=120\n", " )\n", " \n", " if result.returncode != 0:\n", " raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n", " \n", " # Extraer el campo 'result' del JSON envelope de claude\n", " envelope = json.loads(result.stdout)\n", " raw_text = envelope.get('result', '')\n", " \n", " # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n", " return extract_json_from_llm(raw_text)\n", "\n", "# Test rapido\n", "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n", "print('LLM wrapper OK:', test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Entity presets: OSINT + genéricos" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# --- Presets OSINT (del registry) ---\n", "OSINT_PRESETS = [\n", " {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n", " \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n", " {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n", " \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n", " {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n", " \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n", " {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n", " \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n", " {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n", " \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n", " {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n", " \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n", " {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n", " \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n", " {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n", " \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n", " {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n", " \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n", " {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n", " \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n", " {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n", " \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n", " {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n", " \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n", " {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n", " \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n", "]\n", "\n", "# --- Presets genéricos (sin tipo Go, inline) ---\n", "GENERIC_PRESETS = [\n", " {\"type_ref\": \"concept\", \"label\": \"Concept\",\n", " \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n", " {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n", " \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n", " {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n", " \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n", " {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n", " \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n", " {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n", " \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n", " {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n", " \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n", "]\n", "\n", "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n", "print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Relation types" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "RELATION_TYPES = [\n", " # Personas / orgs\n", " \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n", " \"member_of\", \"affiliated_with\", \"collaborates_with\",\n", " # Comunicacion\n", " \"communicates_with\", \"sent_to\", \"received_from\",\n", " # Ubicacion\n", " \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n", " # Eventos\n", " \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n", " # Documentos / conceptos\n", " \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n", " # Financiero\n", " \"funds\", \"transacted_with\", \"invested_in\",\n", " # Tecnico\n", " \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n", " # Generico\n", " \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n", "]\n", "\n", "print(f'{len(RELATION_TYPES)} relation types')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extraer documento\n", "\n", "Pon tu documento en `data/` y cambia el path." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf') # <-- cambiar\n", "\n", "# Progreso visible\n", "def on_progress(msg, pct):\n", " print(f' [{pct*100:5.1f}%] {msg}')\n", "\n", "result = extraction_pipeline(\n", " file_path=DOC_PATH,\n", " entity_presets=ALL_PRESETS,\n", " relation_types=RELATION_TYPES,\n", " llm_chat_json=claude_haiku_json,\n", " chunk_size=800,\n", " chunk_overlap=100,\n", " confidence_threshold=0.5,\n", " dedup_threshold=0.85,\n", " on_progress=on_progress,\n", ")\n", "\n", "print(f'\\nEntities: {result.stats.final_entities_count}')\n", "print(f'Relations: {result.stats.final_relations_count}')\n", "print(f'Chunks: {result.stats.total_chunks}')\n", "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n", "print(f'Entity types: {result.stats.entity_types_found}')\n", "print(f'Relation types: {result.stats.relation_types_found}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explorar resultados" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Entities\n", "ent_rows = []\n", "for e in result.entities:\n", " ent_rows.append({\n", " 'id': e.id,\n", " 'name': e.name,\n", " 'type': e.type_ref,\n", " 'confidence': e.confidence,\n", " 'attributes': e.attributes,\n", " })\n", "df_entities = pd.DataFrame(ent_rows)\n", "print(f'=== Entities ({len(df_entities)}) ===')\n", "df_entities.sort_values('type')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Relations\n", "rel_rows = []\n", "for r in result.relations:\n", " rel_rows.append({\n", " 'from_name': r.from_name,\n", " 'relation': r.relation_type,\n", " 'to_name': r.to_name,\n", " 'confidence': r.confidence,\n", " 'description': r.description,\n", " })\n", "df_relations = pd.DataFrame(rel_rows)\n", "print(f'=== Relations ({len(df_relations)}) ===')\n", "df_relations.sort_values('relation')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualizar grafo con sigma.js" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Colores por tipo de entidad\n", "TYPE_COLORS = {\n", " 'osint_person_go_cybersecurity': '#e74c3c',\n", " 'osint_organization_go_cybersecurity': '#3498db',\n", " 'osint_location_go_cybersecurity': '#2ecc71',\n", " 'osint_event_go_cybersecurity': '#f39c12',\n", " 'osint_email_go_cybersecurity': '#9b59b6',\n", " 'osint_domain_go_cybersecurity': '#1abc9c',\n", " 'osint_ip_address_go_cybersecurity': '#e67e22',\n", " 'osint_phone_go_cybersecurity': '#95a5a6',\n", " 'osint_social_media_go_cybersecurity': '#e91e63',\n", " 'osint_document_go_cybersecurity': '#607d8b',\n", " 'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n", " 'osint_malware_go_cybersecurity': '#f44336',\n", " 'osint_vulnerability_go_cybersecurity': '#ff5722',\n", " 'concept': '#00bcd4',\n", " 'url': '#8bc34a',\n", " 'date_reference': '#cddc39',\n", " 'quantity': '#ffc107',\n", " 'coordinates': '#4caf50',\n", " 'text_fragment': '#78909c',\n", "}\n", "DEFAULT_COLOR = '#aaaaaa'\n", "\n", "def extraction_to_sigma(result) -> dict:\n", " \"\"\"Convierte ExtractionResult a formato sigma.js/graphology.\"\"\"\n", " # Contar degree para tamaño de nodo\n", " degree = {}\n", " for r in result.relations:\n", " from_id = r.from_id or r.from_name\n", " to_id = r.to_id or r.to_name\n", " degree[from_id] = degree.get(from_id, 0) + 1\n", " degree[to_id] = degree.get(to_id, 0) + 1\n", "\n", " nodes = []\n", " for e in result.entities:\n", " eid = e.id or e.name\n", " nodes.append({\n", " 'key': eid,\n", " 'attributes': {\n", " 'label': e.name,\n", " 'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n", " 'size': 4 + min(degree.get(eid, 0) * 2, 20),\n", " 'type': e.type_ref,\n", " **{k: str(v) for k, v in (e.attributes or {}).items() if v is not None},\n", " }\n", " })\n", "\n", " edges = []\n", " node_keys = {n['key'] for n in nodes}\n", " for i, r in enumerate(result.relations):\n", " from_id = r.from_id or r.from_name\n", " to_id = r.to_id or r.to_name\n", " if from_id in node_keys and to_id in node_keys:\n", " edges.append({\n", " 'key': f'e{i}',\n", " 'source': from_id,\n", " 'target': to_id,\n", " 'attributes': {\n", " 'label': r.relation_type,\n", " 'type': r.relation_type,\n", " }\n", " })\n", "\n", " return {'nodes': nodes, 'edges': edges}\n", "\n", "graph_data = extraction_to_sigma(result)\n", "print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n", "html_path = render_sigma_html(\n", " graph_data=graph_data,\n", " output_path=os.path.join(output_dir, 'ontology_graph.html'),\n", " title='Ontology Graph',\n", ")\n", "print(f'Graph saved: {html_path}')\n", "print(f'Open in browser: file://{html_path}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Auto-discovery de nuevos tipos\n", "\n", "Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n", " \"\"\"Pide a haiku que sugiera tipos nuevos basandose en entidades de baja confianza o genericas.\"\"\"\n", " # Recopilar entidades clasificadas como concept/text_fragment (genéricos fallback)\n", " generic_entities = [\n", " {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n", " for e in result.entities\n", " if e.type_ref in ('concept', 'text_fragment', 'related_to')\n", " ]\n", " \n", " if not generic_entities:\n", " print('No hay entidades genéricas — los presets cubren todo.')\n", " return []\n", "\n", " existing_labels = [p['label'] for p in existing_presets]\n", " \n", " prompt_msg = [\n", " {'role': 'system', 'content': (\n", " 'You analyze entities extracted from a document and suggest new entity type presets. '\n", " 'Existing types: ' + ', '.join(existing_labels) + '. '\n", " 'For entities that dont fit existing types, suggest new type presets. '\n", " 'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n", " '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n", " 'Only suggest types that are genuinely different from existing ones. '\n", " 'Return {\"new_presets\": []} if no new types are needed.'\n", " )},\n", " {'role': 'user', 'content': (\n", " 'These entities were classified as generic (concept/text_fragment) '\n", " 'because they didnt fit existing types:\\n\\n'\n", " + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n", " )}\n", " ]\n", " \n", " resp = claude_haiku_json(prompt_msg)\n", " new_presets = resp.get('new_presets', [])\n", " \n", " if new_presets:\n", " print(f'Discovered {len(new_presets)} new types:')\n", " for p in new_presets:\n", " print(f\" - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n", " else:\n", " print('No new types needed.')\n", " \n", " return new_presets\n", "\n", "new_types = discover_new_types(result, ALL_PRESETS)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n", "if new_types:\n", " EXPANDED_PRESETS = ALL_PRESETS + new_types\n", " print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n", " \n", " result = extraction_pipeline(\n", " file_path=DOC_PATH,\n", " entity_presets=EXPANDED_PRESETS,\n", " relation_types=RELATION_TYPES,\n", " llm_chat_json=claude_haiku_json,\n", " chunk_size=800,\n", " chunk_overlap=100,\n", " confidence_threshold=0.5,\n", " dedup_threshold=0.85,\n", " on_progress=on_progress,\n", " )\n", " \n", " print(f'\\nEntities: {result.stats.final_entities_count}')\n", " print(f'Relations: {result.stats.final_relations_count}')\n", " \n", " # Re-generar grafo\n", " graph_data = extraction_to_sigma(result)\n", " html_path = render_sigma_html(\n", " graph_data=graph_data,\n", " output_path=os.path.join(output_dir, 'ontology_graph.html'),\n", " title='Ontology Graph (expanded)',\n", " )\n", " print(f'Updated graph: file://{html_path}')\n", "else:\n", " print('No re-extraction needed.')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'python.functions.core.extract_json_from_llm'", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m 8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 12\u001b[39m \n", "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'" ] } ], "source": [ "import sys, os, json, subprocess\n", "from pathlib import Path\n", "\n", "ROOT = '/home/lucas/fn_registry'\n", "os.environ['FN_REGISTRY_ROOT'] = ROOT\n", "sys.path.insert(0, ROOT)\n", "sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n", "\n", "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n", "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n", "from python.functions.datascience.render_sigma_html import render_sigma_html\n", "\n", "print('OK: imports loaded')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "imports OK\n" ] } ], "source": [ "import sys, os, json, subprocess\n", "\n", "# Añadir lib/ al path\n", "sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n", "\n", "from core_functions import extract_json_from_llm\n", "from extraction_pipeline import extraction_pipeline\n", "from render_sigma_html import render_sigma_html\n", "\n", "print('imports OK')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LLM wrapper OK: {'ok': True}\n" ] } ], "source": [ "def claude_haiku_json(messages: list[dict]) -> dict:\n", " \"\"\"Wrapper: messages OpenAI-style -> claude -p --model haiku -> dict.\"\"\"\n", " parts = []\n", " for msg in messages:\n", " role = msg['role']\n", " content = msg['content']\n", " if role == 'system':\n", " parts.append(f'[SYSTEM]\\n{content}')\n", " elif role == 'user':\n", " parts.append(f'[USER]\\n{content}')\n", " prompt = '\\n\\n'.join(parts)\n", " \n", " result = subprocess.run(\n", " ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n", " capture_output=True, text=True, timeout=120\n", " )\n", " if result.returncode != 0:\n", " raise RuntimeError(f'claude -p failed: {result.stderr}')\n", " \n", " envelope = json.loads(result.stdout)\n", " raw_text = envelope.get('result', '')\n", " return extract_json_from_llm(raw_text)\n", "\n", "# Test\n", "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n", "print('LLM wrapper OK:', test)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "19 presets, 35 relation types\n" ] } ], "source": [ "OSINT_PRESETS = [\n", " {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n", " 'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n", " {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n", " 'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n", " {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n", " 'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n", " {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n", " 'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n", " {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n", " 'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n", " {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n", " 'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n", " {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n", " 'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n", " {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n", " 'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n", " {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n", " 'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n", " {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n", " 'metadata_fields': ['title', 'format', 'classification', 'source']},\n", " {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n", " 'metadata_fields': ['address', 'blockchain', 'balance']},\n", " {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n", " 'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n", " {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n", " 'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n", "]\n", "\n", "GENERIC_PRESETS = [\n", " {'type_ref': 'concept', 'label': 'Concept',\n", " 'metadata_fields': ['name', 'category', 'definition']},\n", " {'type_ref': 'url', 'label': 'URL/Link',\n", " 'metadata_fields': ['url', 'domain', 'context']},\n", " {'type_ref': 'date_reference', 'label': 'Date/Time',\n", " 'metadata_fields': ['date', 'precision', 'context']},\n", " {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n", " 'metadata_fields': ['value', 'unit', 'context']},\n", " {'type_ref': 'coordinates', 'label': 'Coordinates',\n", " 'metadata_fields': ['lat', 'lon', 'label']},\n", " {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n", " 'metadata_fields': ['text', 'category', 'relevance']},\n", "]\n", "\n", "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n", "\n", "RELATION_TYPES = [\n", " 'employs', 'works_for', 'founded', 'owns', 'controls',\n", " 'member_of', 'affiliated_with', 'collaborates_with',\n", " 'communicates_with', 'sent_to', 'received_from',\n", " 'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n", " 'participated_in', 'caused', 'occurred_at', 'occurred_on',\n", " 'mentions', 'references', 'describes', 'authored', 'published',\n", " 'funds', 'transacted_with', 'invested_in',\n", " 'hosts', 'resolves_to', 'exploits', 'targets',\n", " 'related_to', 'part_of', 'instance_of', 'has_attribute',\n", "]\n", "\n", "print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " [ 0.0%] Extracting text from file...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 0.0%] Extracting entities from chunk 1/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 0.7%] Extracting entities from chunk 2/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 1.5%] Extracting entities from chunk 3/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 2.2%] Extracting entities from chunk 4/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 3.0%] Extracting entities from chunk 5/54\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n", " candidates = extract_entities_llm(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 3.7%] Extracting entities from chunk 6/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 4.4%] Extracting entities from chunk 7/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 5.2%] Extracting entities from chunk 8/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 5.9%] Extracting entities from chunk 9/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 6.7%] Extracting entities from chunk 10/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 7.4%] Extracting entities from chunk 11/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 8.1%] Extracting entities from chunk 12/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 8.9%] Extracting entities from chunk 13/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 9.6%] Extracting entities from chunk 14/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 10.4%] Extracting entities from chunk 15/54\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " [ 11.1%] Extracting entities from chunk 16/54\n" ] } ], "source": [ "DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n", "\n", "def on_progress(msg, pct):\n", " print(f' [{pct*100:5.1f}%] {msg}')\n", "\n", "result = extraction_pipeline(\n", " file_path=DOC_PATH,\n", " entity_presets=ALL_PRESETS,\n", " relation_types=RELATION_TYPES,\n", " llm_chat_json=claude_haiku_json,\n", " chunk_size=800,\n", " chunk_overlap=100,\n", " confidence_threshold=0.5,\n", " dedup_threshold=0.85,\n", " on_progress=on_progress,\n", ")\n", "\n", "print(f'\\nEntities: {result.stats.final_entities_count}')\n", "print(f'Relations: {result.stats.final_relations_count}')\n", "print(f'Chunks: {result.stats.total_chunks}')\n", "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n", "print(f'Entity types: {result.stats.entity_types_found}')\n", "print(f'Relation types: {result.stats.relation_types_found}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pipeline optimizado\n", "\n", "- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n", "- Chunks de 2000 chars\n", "- Paralelizado con ThreadPoolExecutor" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "from extract_text_from_file import extract_text_from_file\n", "from core_functions import preprocess_text\n", "from split_text_into_chunks import split_text_into_chunks\n", "from deduplicate_entities import deduplicate_entities\n", "from deduplicate_relations import deduplicate_relations\n", "from entity_candidate import EntityCandidate\n", "from relation_candidate import RelationCandidate\n", "\n", "def build_unified_prompt(entity_presets, relation_types):\n", " \"\"\"System prompt que pide entities + relations + tipos nuevos en 1 sola llamada.\"\"\"\n", " type_lines = []\n", " for p in entity_presets:\n", " fields = ', '.join(p.get('metadata_fields', []))\n", " type_lines.append(f\"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]\")\n", "\n", " return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n", "\n", "ENTITY TYPES:\n", "{chr(10).join(type_lines)}\n", "\n", "RELATION TYPES: {', '.join(relation_types)}\n", "\n", "OUTPUT FORMAT (strict JSON):\n", "{{\n", " \"entities\": [\n", " {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n", " ],\n", " \"relations\": [\n", " {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n", " ],\n", " \"suggested_types\": [\n", " {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n", " ]\n", "}}\n", "\n", "RULES:\n", "- Extract ALL entities explicitly mentioned in the text\n", "- Use exact type_ref from the schema. Leave unknown attributes as null\n", "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n", "- Relations: from_name and to_name MUST match extracted entity names exactly\n", "- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n", "- If no suggested types are needed, return \"suggested_types\": []\n", "- Respond in the same language as the text for descriptions'''\n", "\n", "UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n", "print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.7" } }, "nbformat": 4, "nbformat_minor": 4 }