Files
fn-registry agent 40bea81603 chore: initial sync
2026-04-28 22:13:08 +02:00

936 lines
36 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ontology Graph Extraction\n",
"\n",
"Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
"- LLM: `claude -p --model haiku`\n",
"- Tipos: presets OSINT del registry + genéricos (`concept`, `url`, `date_reference`, `quantity`, `coordinates`, `text_fragment`)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m 4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 10\u001b[39m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"ROOT = '/home/lucas/fn_registry'\n",
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
"sys.path.insert(0, ROOT)\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('Registry root:', ROOT)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'FN_REGISTRY_ROOT'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m 2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m 4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 5\u001b[39m \n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
"\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
"\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"ROOT = os.environ['FN_REGISTRY_ROOT']\n",
"sys.path.insert(0, ROOT)\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('Registry root:', ROOT)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LLM wrapper: claude -p + haiku"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "def claude_haiku_json(messages: list[dict]) -> dict:\n",
  "    \"\"\"Send OpenAI-style chat messages to `claude -p --model haiku` and return the parsed JSON reply.\n",
  "\n",
  "    Args:\n",
  "        messages: Dicts with 'role' and 'content' keys. Only 'system' and 'user'\n",
  "            messages are forwarded; any other role is ignored.\n",
  "\n",
  "    Returns:\n",
  "        Whatever `extract_json_from_llm` produces from the model's reply text.\n",
  "\n",
  "    Raises:\n",
  "        RuntimeError: If the CLI exits with a non-zero status, or if its JSON\n",
  "            envelope reports an error or carries no 'result' text.\n",
  "        subprocess.TimeoutExpired: If the CLI does not finish within 120 seconds.\n",
  "    \"\"\"\n",
  "    # Flatten the conversation into a single prompt with role markers.\n",
  "    role_tags = {'system': '[SYSTEM]', 'user': '[USER]'}\n",
  "    parts = [\n",
  "        f\"{role_tags[msg['role']]}\\n{msg['content']}\"\n",
  "        for msg in messages\n",
  "        if msg['role'] in role_tags\n",
  "    ]\n",
  "    prompt = '\\n\\n'.join(parts)\n",
  "\n",
  "    proc = subprocess.run(\n",
  "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
  "        capture_output=True, text=True, timeout=120\n",
  "    )\n",
  "    if proc.returncode != 0:\n",
  "        raise RuntimeError(f\"claude -p failed: {proc.stderr}\")\n",
  "\n",
  "    # With --output-format json the reply text sits in the envelope's 'result' field.\n",
  "    envelope = json.loads(proc.stdout)\n",
  "    raw_text = envelope.get('result', '')\n",
  "    if envelope.get('is_error') or not raw_text:\n",
  "        # Fail loudly instead of handing an error message or empty text to the JSON parser.\n",
  "        raise RuntimeError(f\"claude -p returned no usable result: {proc.stdout[:500]}\")\n",
  "\n",
  "    # Parse the JSON embedded in the model's text reply.\n",
  "    return extract_json_from_llm(raw_text)\n",
  "\n",
  "# Quick smoke test\n",
  "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
  "print('LLM wrapper OK:', test)"
 ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Entity presets: OSINT + genéricos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- Presets OSINT (del registry) ---\n",
"OSINT_PRESETS = [\n",
" {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
" \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
" {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
" \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
" {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
" \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
" {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
" \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
" {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
" \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
" {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
" \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
" {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
" \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
" {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
" \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
" {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
" \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
" {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
" \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
" {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
" \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
" {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
" \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
" {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
" \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
"]\n",
"\n",
"# --- Presets genéricos (sin tipo Go, inline) ---\n",
"GENERIC_PRESETS = [\n",
" {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
" \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
" {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
" \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
" {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
" \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
" {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
" \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
" {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
" \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
" {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
" \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
"]\n",
"\n",
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
"print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Relation types"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"RELATION_TYPES = [\n",
" # Personas / orgs\n",
" \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
" \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
" # Comunicacion\n",
" \"communicates_with\", \"sent_to\", \"received_from\",\n",
" # Ubicacion\n",
" \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
" # Eventos\n",
" \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
" # Documentos / conceptos\n",
" \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
" # Financiero\n",
" \"funds\", \"transacted_with\", \"invested_in\",\n",
" # Tecnico\n",
" \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
" # Generico\n",
" \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
"]\n",
"\n",
"print(f'{len(RELATION_TYPES)} relation types')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extraer documento\n",
"\n",
"Pon tu documento en `data/` y cambia el path."
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf')  # <-- change this\n",
  "\n",
  "# Fail early with a clear message instead of an error raised from inside the pipeline.\n",
  "if not os.path.isfile(DOC_PATH):\n",
  "    raise FileNotFoundError(f'Document not found: {DOC_PATH}')\n",
  "\n",
  "def on_progress(msg, pct):\n",
  "    \"\"\"Print one progress line; `pct` is multiplied by 100 and shown as a percentage.\"\"\"\n",
  "    print(f' [{pct*100:5.1f}%] {msg}')\n",
  "\n",
  "result = extraction_pipeline(\n",
  "    file_path=DOC_PATH,\n",
  "    entity_presets=ALL_PRESETS,\n",
  "    relation_types=RELATION_TYPES,\n",
  "    llm_chat_json=claude_haiku_json,\n",
  "    chunk_size=800,\n",
  "    chunk_overlap=100,\n",
  "    confidence_threshold=0.5,\n",
  "    dedup_threshold=0.85,\n",
  "    on_progress=on_progress,\n",
  ")\n",
  "\n",
  "# Summary of the run, read from the stats object returned by the pipeline.\n",
  "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
  "print(f'Relations: {result.stats.final_relations_count}')\n",
  "print(f'Chunks: {result.stats.total_chunks}')\n",
  "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
  "print(f'Entity types: {result.stats.entity_types_found}')\n",
  "print(f'Relation types: {result.stats.relation_types_found}')"
 ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explorar resultados"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import pandas as pd\n",
  "\n",
  "# One row per extracted entity.\n",
  "ent_rows = [\n",
  "    {\n",
  "        'id': e.id,\n",
  "        'name': e.name,\n",
  "        'type': e.type_ref,\n",
  "        'confidence': e.confidence,\n",
  "        'attributes': e.attributes,\n",
  "    }\n",
  "    for e in result.entities\n",
  "]\n",
  "# Columns are given explicitly so an empty extraction still produces a 'type'\n",
  "# column; a column-less empty frame makes sort_values('type') raise KeyError.\n",
  "df_entities = pd.DataFrame(ent_rows, columns=['id', 'name', 'type', 'confidence', 'attributes'])\n",
  "print(f'=== Entities ({len(df_entities)}) ===')\n",
  "df_entities.sort_values('type')"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# One row per extracted relation.\n",
  "rel_rows = [\n",
  "    {\n",
  "        'from_name': r.from_name,\n",
  "        'relation': r.relation_type,\n",
  "        'to_name': r.to_name,\n",
  "        'confidence': r.confidence,\n",
  "        'description': r.description,\n",
  "    }\n",
  "    for r in result.relations\n",
  "]\n",
  "# Columns are given explicitly so an empty extraction still produces a 'relation'\n",
  "# column; a column-less empty frame makes sort_values('relation') raise KeyError.\n",
  "df_relations = pd.DataFrame(\n",
  "    rel_rows, columns=['from_name', 'relation', 'to_name', 'confidence', 'description']\n",
  ")\n",
  "print(f'=== Relations ({len(df_relations)}) ===')\n",
  "df_relations.sort_values('relation')"
 ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualizar grafo con sigma.js"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Node colour per entity type\n",
  "TYPE_COLORS = {\n",
  "    'osint_person_go_cybersecurity': '#e74c3c',\n",
  "    'osint_organization_go_cybersecurity': '#3498db',\n",
  "    'osint_location_go_cybersecurity': '#2ecc71',\n",
  "    'osint_event_go_cybersecurity': '#f39c12',\n",
  "    'osint_email_go_cybersecurity': '#9b59b6',\n",
  "    'osint_domain_go_cybersecurity': '#1abc9c',\n",
  "    'osint_ip_address_go_cybersecurity': '#e67e22',\n",
  "    'osint_phone_go_cybersecurity': '#95a5a6',\n",
  "    'osint_social_media_go_cybersecurity': '#e91e63',\n",
  "    'osint_document_go_cybersecurity': '#607d8b',\n",
  "    'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
  "    'osint_malware_go_cybersecurity': '#f44336',\n",
  "    'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
  "    'concept': '#00bcd4',\n",
  "    'url': '#8bc34a',\n",
  "    'date_reference': '#cddc39',\n",
  "    'quantity': '#ffc107',\n",
  "    'coordinates': '#4caf50',\n",
  "    'text_fragment': '#78909c',\n",
  "}\n",
  "DEFAULT_COLOR = '#aaaaaa'  # used for any type_ref missing from TYPE_COLORS\n",
  "\n",
  "def extraction_to_sigma(result) -> dict:\n",
  "    \"\"\"Convert an extraction result into a sigma.js/graphology-style dict.\n",
  "\n",
  "    Args:\n",
  "        result: Object exposing `entities` (items with id, name, type_ref,\n",
  "            attributes) and `relations` (items with from_id, from_name, to_id,\n",
  "            to_name, relation_type).\n",
  "\n",
  "    Returns:\n",
  "        {'nodes': [...], 'edges': [...]}. Node size is 4 plus 2 per relation\n",
  "        touching the node, with the bonus capped at 20. Relations whose two\n",
  "        endpoints are not both present as nodes are left out of 'edges'.\n",
  "    \"\"\"\n",
  "    # Attribute names that drive rendering; entity metadata must not replace them.\n",
  "    reserved = ('label', 'color', 'size', 'type')\n",
  "\n",
  "    # Count the relations touching each endpoint (id, or name when the id is empty).\n",
  "    degree = {}\n",
  "    for r in result.relations:\n",
  "        for endpoint in (r.from_id or r.from_name, r.to_id or r.to_name):\n",
  "            degree[endpoint] = degree.get(endpoint, 0) + 1\n",
  "\n",
  "    nodes = []\n",
  "    for e in result.entities:\n",
  "        eid = e.id or e.name\n",
  "        attrs = {}\n",
  "        for k, v in (e.attributes or {}).items():\n",
  "            if v is None:\n",
  "                continue\n",
  "            # A metadata field such as the Coordinates preset's 'label' would\n",
  "            # otherwise overwrite the display attributes, so it is prefixed.\n",
  "            attrs[f'attr_{k}' if k in reserved else k] = str(v)\n",
  "        # Reserved keys are written last so they always hold the rendering values.\n",
  "        attrs.update({\n",
  "            'label': e.name,\n",
  "            'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
  "            'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
  "            'type': e.type_ref,\n",
  "        })\n",
  "        nodes.append({'key': eid, 'attributes': attrs})\n",
  "\n",
  "    edges = []\n",
  "    node_keys = {n['key'] for n in nodes}\n",
  "    for i, r in enumerate(result.relations):\n",
  "        from_id = r.from_id or r.from_name\n",
  "        to_id = r.to_id or r.to_name\n",
  "        # Skip relations that point at an entity that is not a node.\n",
  "        if from_id in node_keys and to_id in node_keys:\n",
  "            edges.append({\n",
  "                'key': f'e{i}',\n",
  "                'source': from_id,\n",
  "                'target': to_id,\n",
  "                'attributes': {\n",
  "                    'label': r.relation_type,\n",
  "                    'type': r.relation_type,\n",
  "                }\n",
  "            })\n",
  "\n",
  "    return {'nodes': nodes, 'edges': edges}\n",
  "\n",
  "graph_data = extraction_to_sigma(result)\n",
  "print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
  "# Create the target folder up front so saving does not depend on it already existing.\n",
  "os.makedirs(output_dir, exist_ok=True)\n",
  "\n",
  "html_path = render_sigma_html(\n",
  "    graph_data=graph_data,\n",
  "    output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
  "    title='Ontology Graph',\n",
  ")\n",
  "print(f'Graph saved: {html_path}')\n",
  "print(f'Open in browser: file://{html_path}')"
 ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Auto-discovery de nuevos tipos\n",
"\n",
"Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
  "    \"\"\"Ask haiku to propose entity presets for entities that only matched the generic fallback types.\n",
  "\n",
  "    Args:\n",
  "        result: Extraction result; only `result.entities` (name, type_ref,\n",
  "            attributes) is read.\n",
  "        existing_presets: Presets already in use; each needs a 'label', and its\n",
  "            'type_ref' (when present) is used to reject duplicate suggestions.\n",
  "\n",
  "    Returns:\n",
  "        New preset dicts with 'type_ref', 'label' and a 'metadata_fields' list.\n",
  "        Empty when there are no generic entities or nothing usable was suggested.\n",
  "    \"\"\"\n",
  "    # Entities that fell back to the generic entity types. ('related_to' is a\n",
  "    # relation type, never an entity type_ref, so it is not part of this filter.)\n",
  "    generic_entities = [\n",
  "        {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
  "        for e in result.entities\n",
  "        if e.type_ref in ('concept', 'text_fragment')\n",
  "    ]\n",
  "\n",
  "    if not generic_entities:\n",
  "        print('No hay entidades genéricas — los presets cubren todo.')\n",
  "        return []\n",
  "\n",
  "    existing_labels = [p['label'] for p in existing_presets]\n",
  "    known_refs = {p.get('type_ref') for p in existing_presets}\n",
  "\n",
  "    prompt_msg = [\n",
  "        {'role': 'system', 'content': (\n",
  "            'You analyze entities extracted from a document and suggest new entity type presets. '\n",
  "            'Existing types: ' + ', '.join(existing_labels) + '. '\n",
  "            'For entities that dont fit existing types, suggest new type presets. '\n",
  "            'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
  "            '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
  "            'Only suggest types that are genuinely different from existing ones. '\n",
  "            'Return {\"new_presets\": []} if no new types are needed.'\n",
  "        )},\n",
  "        {'role': 'user', 'content': (\n",
  "            'These entities were classified as generic (concept/text_fragment) '\n",
  "            'because they didnt fit existing types:\\n\\n'\n",
  "            # Only the first 30 entities are sent; default=str keeps attribute\n",
  "            # values that are not JSON-serializable from aborting the call.\n",
  "            + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2, default=str)\n",
  "        )}\n",
  "    ]\n",
  "\n",
  "    resp = claude_haiku_json(prompt_msg)\n",
  "    suggested = resp.get('new_presets') if isinstance(resp, dict) else None\n",
  "\n",
  "    # The reply is model output: keep only well-formed presets whose type_ref is new.\n",
  "    new_presets = []\n",
  "    for p in suggested or []:\n",
  "        if not isinstance(p, dict) or not p.get('type_ref') or not p.get('label'):\n",
  "            continue\n",
  "        if p['type_ref'] in known_refs:\n",
  "            continue\n",
  "        known_refs.add(p['type_ref'])\n",
  "        fields = p.get('metadata_fields')\n",
  "        new_presets.append({**p, 'metadata_fields': fields if isinstance(fields, list) else []})\n",
  "\n",
  "    if new_presets:\n",
  "        print(f'Discovered {len(new_presets)} new types:')\n",
  "        for p in new_presets:\n",
  "            print(f\" - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
  "    else:\n",
  "        print('No new types needed.')\n",
  "\n",
  "    return new_presets\n",
  "\n",
  "new_types = discover_new_types(result, ALL_PRESETS)"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# When new types were discovered, run the extraction again with the enlarged preset list\n",
  "if not new_types:\n",
  "    print('No re-extraction needed.')\n",
  "else:\n",
  "    EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
  "    print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
  "\n",
  "    # Same settings as the first extraction; only the preset list differs.\n",
  "    result = extraction_pipeline(\n",
  "        file_path=DOC_PATH,\n",
  "        entity_presets=EXPANDED_PRESETS,\n",
  "        relation_types=RELATION_TYPES,\n",
  "        llm_chat_json=claude_haiku_json,\n",
  "        chunk_size=800, chunk_overlap=100,\n",
  "        confidence_threshold=0.5, dedup_threshold=0.85,\n",
  "        on_progress=on_progress,\n",
  "    )\n",
  "\n",
  "    print(f'\\nEntities: {result.stats.final_entities_count}')\n",
  "    print(f'Relations: {result.stats.final_relations_count}')\n",
  "\n",
  "    # Rebuild the graph and overwrite the previously rendered HTML file\n",
  "    graph_data = extraction_to_sigma(result)\n",
  "    expanded_html = os.path.join(output_dir, 'ontology_graph.html')\n",
  "    html_path = render_sigma_html(\n",
  "        graph_data=graph_data, output_path=expanded_html, title='Ontology Graph (expanded)'\n",
  "    )\n",
  "    print(f'Updated graph: file://{html_path}')"
 ]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'python.functions.core.extract_json_from_llm'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m 6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m 7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m 8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m 12\u001b[39m \n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"from pathlib import Path\n",
"\n",
"ROOT = '/home/lucas/fn_registry'\n",
"os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
"sys.path.insert(0, ROOT)\n",
"sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
"\n",
"from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
"from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
"from python.functions.datascience.render_sigma_html import render_sigma_html\n",
"\n",
"print('OK: imports loaded')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"imports OK\n"
]
}
],
"source": [
"import sys, os, json, subprocess\n",
"\n",
"# Añadir lib/ al path\n",
"sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
"\n",
"from core_functions import extract_json_from_llm\n",
"from extraction_pipeline import extraction_pipeline\n",
"from render_sigma_html import render_sigma_html\n",
"\n",
"print('imports OK')"
]
},
{
 "cell_type": "code",
 "execution_count": 5,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "LLM wrapper OK: {'ok': True}\n"
   ]
  }
 ],
 "source": [
  "def claude_haiku_json(messages: list[dict]) -> dict:\n",
  "    \"\"\"Send OpenAI-style chat messages to `claude -p --model haiku` and return the parsed JSON reply.\n",
  "\n",
  "    Args:\n",
  "        messages: Dicts with 'role' and 'content' keys. Only 'system' and 'user'\n",
  "            messages are forwarded; any other role is ignored.\n",
  "\n",
  "    Returns:\n",
  "        Whatever `extract_json_from_llm` produces from the model's reply text.\n",
  "\n",
  "    Raises:\n",
  "        RuntimeError: If the CLI exits with a non-zero status, or if its JSON\n",
  "            envelope reports an error or carries no 'result' text.\n",
  "        subprocess.TimeoutExpired: If the CLI does not finish within 120 seconds.\n",
  "    \"\"\"\n",
  "    # Flatten the conversation into a single prompt with role markers.\n",
  "    role_tags = {'system': '[SYSTEM]', 'user': '[USER]'}\n",
  "    parts = [\n",
  "        f\"{role_tags[msg['role']]}\\n{msg['content']}\"\n",
  "        for msg in messages\n",
  "        if msg['role'] in role_tags\n",
  "    ]\n",
  "    prompt = '\\n\\n'.join(parts)\n",
  "\n",
  "    proc = subprocess.run(\n",
  "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
  "        capture_output=True, text=True, timeout=120\n",
  "    )\n",
  "    if proc.returncode != 0:\n",
  "        raise RuntimeError(f'claude -p failed: {proc.stderr}')\n",
  "\n",
  "    # With --output-format json the reply text sits in the envelope's 'result' field.\n",
  "    envelope = json.loads(proc.stdout)\n",
  "    raw_text = envelope.get('result', '')\n",
  "    if envelope.get('is_error') or not raw_text:\n",
  "        # Fail loudly instead of handing an error message or empty text to the JSON parser.\n",
  "        raise RuntimeError(f'claude -p returned no usable result: {proc.stdout[:500]}')\n",
  "\n",
  "    # Parse the JSON embedded in the model's text reply.\n",
  "    return extract_json_from_llm(raw_text)\n",
  "\n",
  "# Quick smoke test\n",
  "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
  "print('LLM wrapper OK:', test)"
 ]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"19 presets, 35 relation types\n"
]
}
],
"source": [
"OSINT_PRESETS = [\n",
" {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
" 'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
" {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
" 'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
" {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
" 'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
" {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
" 'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
" {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
" 'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
" {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
" 'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
" {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
" 'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
" {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
" 'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
" {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
" 'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
" {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
" 'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
" {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
" 'metadata_fields': ['address', 'blockchain', 'balance']},\n",
" {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
" 'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
" {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
" 'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
"]\n",
"\n",
"GENERIC_PRESETS = [\n",
" {'type_ref': 'concept', 'label': 'Concept',\n",
" 'metadata_fields': ['name', 'category', 'definition']},\n",
" {'type_ref': 'url', 'label': 'URL/Link',\n",
" 'metadata_fields': ['url', 'domain', 'context']},\n",
" {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
" 'metadata_fields': ['date', 'precision', 'context']},\n",
" {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
" 'metadata_fields': ['value', 'unit', 'context']},\n",
" {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
" 'metadata_fields': ['lat', 'lon', 'label']},\n",
" {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
" 'metadata_fields': ['text', 'category', 'relevance']},\n",
"]\n",
"\n",
"ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
"\n",
"RELATION_TYPES = [\n",
" 'employs', 'works_for', 'founded', 'owns', 'controls',\n",
" 'member_of', 'affiliated_with', 'collaborates_with',\n",
" 'communicates_with', 'sent_to', 'received_from',\n",
" 'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
" 'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
" 'mentions', 'references', 'describes', 'authored', 'published',\n",
" 'funds', 'transacted_with', 'invested_in',\n",
" 'hosts', 'resolves_to', 'exploits', 'targets',\n",
" 'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
"]\n",
"\n",
"print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.0%] Extracting text from file...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.0%] Extracting entities from chunk 1/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 0.7%] Extracting entities from chunk 2/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 1.5%] Extracting entities from chunk 3/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 2.2%] Extracting entities from chunk 4/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 3.0%] Extracting entities from chunk 5/54\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
" candidates = extract_entities_llm(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 3.7%] Extracting entities from chunk 6/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 4.4%] Extracting entities from chunk 7/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 5.2%] Extracting entities from chunk 8/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 5.9%] Extracting entities from chunk 9/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 6.7%] Extracting entities from chunk 10/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 7.4%] Extracting entities from chunk 11/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 8.1%] Extracting entities from chunk 12/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 8.9%] Extracting entities from chunk 13/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 9.6%] Extracting entities from chunk 14/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 10.4%] Extracting entities from chunk 15/54\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" [ 11.1%] Extracting entities from chunk 16/54\n"
]
}
],
"source": [
"DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
"\n",
"def on_progress(msg, pct):\n",
" print(f' [{pct*100:5.1f}%] {msg}')\n",
"\n",
"result = extraction_pipeline(\n",
" file_path=DOC_PATH,\n",
" entity_presets=ALL_PRESETS,\n",
" relation_types=RELATION_TYPES,\n",
" llm_chat_json=claude_haiku_json,\n",
" chunk_size=800,\n",
" chunk_overlap=100,\n",
" confidence_threshold=0.5,\n",
" dedup_threshold=0.85,\n",
" on_progress=on_progress,\n",
")\n",
"\n",
"print(f'\\nEntities: {result.stats.final_entities_count}')\n",
"print(f'Relations: {result.stats.final_relations_count}')\n",
"print(f'Chunks: {result.stats.total_chunks}')\n",
"print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
"print(f'Entity types: {result.stats.entity_types_found}')\n",
"print(f'Relation types: {result.stats.relation_types_found}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline optimizado\n",
"\n",
"- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
"- Chunks de 2000 chars\n",
"- Paralelizado con ThreadPoolExecutor"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
  "from extract_text_from_file import extract_text_from_file\n",
  "from core_functions import preprocess_text\n",
  "from split_text_into_chunks import split_text_into_chunks\n",
  "from deduplicate_entities import deduplicate_entities\n",
  "from deduplicate_relations import deduplicate_relations\n",
  "from entity_candidate import EntityCandidate\n",
  "from relation_candidate import RelationCandidate\n",
  "\n",
  "def build_unified_prompt(entity_presets, relation_types):\n",
  "    \"\"\"Build the system prompt that requests entities, relations and suggested new types in one LLM call.\n",
  "\n",
  "    Args:\n",
  "        entity_presets: Preset dicts with 'label' and 'type_ref', plus an\n",
  "            optional 'metadata_fields' list.\n",
  "        relation_types: Relation type names; joined with ', ' in the prompt.\n",
  "\n",
  "    Returns:\n",
  "        The prompt text: one line per entity type, the relation type list, the\n",
  "        expected JSON output shape and the extraction rules.\n",
  "    \"\"\"\n",
  "    type_lines = []\n",
  "    for p in entity_presets:\n",
  "        fields = ', '.join(p.get('metadata_fields', []))\n",
  "        type_lines.append(f\"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]\")\n",
  "\n",
  "    # Doubled braces are literal braces in the f-string; chr(10) is a newline,\n",
  "    # written this way to keep a backslash out of the replacement field.\n",
  "    return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
  "\n",
  "ENTITY TYPES:\n",
  "{chr(10).join(type_lines)}\n",
  "\n",
  "RELATION TYPES: {', '.join(relation_types)}\n",
  "\n",
  "OUTPUT FORMAT (strict JSON):\n",
  "{{\n",
  " \"entities\": [\n",
  " {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
  " ],\n",
  " \"relations\": [\n",
  " {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
  " ],\n",
  " \"suggested_types\": [\n",
  " {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
  " ]\n",
  "}}\n",
  "\n",
  "RULES:\n",
  "- Extract ALL entities explicitly mentioned in the text\n",
  "- Use exact type_ref from the schema. Leave unknown attributes as null\n",
  "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
  "- Relations: from_name and to_name MUST match extracted entity names exactly\n",
  "- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
  "- If no suggested types are needed, return \"suggested_types\": []\n",
  "- Respond in the same language as the text for descriptions'''\n",
  "\n",
  "UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
  "print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}