ontology_graph/notebooks/01_extraction.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ontology Graph Extraction\n",
    "\n",
    "Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
    "- LLM: `claude -p --model haiku`\n",
    "- Tipos: OSINT del registry + genéricos (concept, url, date_reference, quantity, text_fragment, coordinates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m      3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m      4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m      8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m      9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     10\u001b[39m \n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "ROOT = '/home/lucas/fn_registry'\n",
    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
    "sys.path.insert(0, ROOT)\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('Registry root:', ROOT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'FN_REGISTRY_ROOT'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m      2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m      4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      5\u001b[39m \n\u001b[32m      6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
      "\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
      "\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "ROOT = os.environ['FN_REGISTRY_ROOT']\n",
    "sys.path.insert(0, ROOT)\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('Registry root:', ROOT)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LLM wrapper: claude -p + haiku"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
    "    \"\"\"Send OpenAI-style chat messages to `claude -p --model haiku` and parse the JSON reply.\n",
    "\n",
    "    Every message is rendered as an upper-cased `[ROLE]` header followed by its\n",
    "    content, and the rendered messages are joined by blank lines into one prompt.\n",
    "\n",
    "    Args:\n",
    "        messages: dicts with 'role' and 'content' keys.\n",
    "\n",
    "    Returns:\n",
    "        The value `extract_json_from_llm` parses out of the envelope's 'result' text.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: the CLI exited non-zero, or its JSON envelope has 'is_error' set.\n",
    "        subprocess.TimeoutExpired: the CLI ran longer than 120 seconds.\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for msg in messages:\n",
    "        role, content = msg['role'], msg['content']\n",
    "        # Keep every role (not only system/user) so no message is silently dropped.\n",
    "        parts.append(f'[{str(role).upper()}]\\n{content}')\n",
    "    prompt = '\\n\\n'.join(parts)\n",
    "\n",
    "    result = subprocess.run(\n",
    "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
    "        capture_output=True, text=True, timeout=120\n",
    "    )\n",
    "\n",
    "    if result.returncode != 0:\n",
    "        raise RuntimeError(f'claude -p failed: {result.stderr}')\n",
    "\n",
    "    # The CLI wraps the model reply in a JSON envelope; the reply text is under 'result'.\n",
    "    envelope = json.loads(result.stdout)\n",
    "    raw_text = envelope.get('result', '')\n",
    "    if envelope.get('is_error'):\n",
    "        # A zero exit code can still carry a failed run; do not parse its message as output.\n",
    "        raise RuntimeError(f'claude -p reported an error: {raw_text}')\n",
    "\n",
    "    # Extract the JSON payload from the model's free-form text.\n",
    "    return extract_json_from_llm(raw_text)\n",
    "\n",
    "# Test rapido\n",
    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
    "print('LLM wrapper OK:', test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Entity presets: OSINT + genéricos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Presets OSINT (del registry) ---\n",
    "OSINT_PRESETS = [\n",
    "    {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
    "     \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
    "    {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
    "     \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
    "    {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
    "     \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
    "    {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
    "     \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
    "    {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
    "     \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
    "    {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
    "     \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
    "    {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
    "     \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
    "    {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
    "     \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
    "    {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
    "     \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
    "    {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
    "     \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
    "    {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
    "     \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
    "    {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
    "     \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
    "    {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
    "     \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
    "]\n",
    "\n",
    "# --- Presets genéricos (sin tipo Go, inline) ---\n",
    "GENERIC_PRESETS = [\n",
    "    {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
    "     \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
    "    {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
    "     \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
    "    {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
    "     \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
    "    {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
    "     \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
    "    {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
    "     \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
    "    {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
    "     \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
    "]\n",
    "\n",
    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
    "print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Relation types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RELATION_TYPES = [\n",
    "    # Personas / orgs\n",
    "    \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
    "    \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
    "    # Comunicacion\n",
    "    \"communicates_with\", \"sent_to\", \"received_from\",\n",
    "    # Ubicacion\n",
    "    \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
    "    # Eventos\n",
    "    \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
    "    # Documentos / conceptos\n",
    "    \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
    "    # Financiero\n",
    "    \"funds\", \"transacted_with\", \"invested_in\",\n",
    "    # Tecnico\n",
    "    \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
    "    # Generico\n",
    "    \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
    "]\n",
    "\n",
    "print(f'{len(RELATION_TYPES)} relation types')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extraer documento\n",
    "\n",
    "Pon tu documento en `data/` y cambia el path."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf')  # <-- cambiar\n",
    "\n",
    "# Progreso visible\n",
    "def on_progress(msg, pct):\n",
    "    \"\"\"Print one progress line: pct scaled by 100 as a percentage, followed by msg.\"\"\"\n",
    "    percent = pct * 100\n",
    "    print(f'  [{percent:5.1f}%] {msg}')\n",
    "\n",
    "result = extraction_pipeline(\n",
    "    file_path=DOC_PATH,\n",
    "    entity_presets=ALL_PRESETS,\n",
    "    relation_types=RELATION_TYPES,\n",
    "    llm_chat_json=claude_haiku_json,\n",
    "    chunk_size=800,\n",
    "    chunk_overlap=100,\n",
    "    confidence_threshold=0.5,\n",
    "    dedup_threshold=0.85,\n",
    "    on_progress=on_progress,\n",
    ")\n",
    "\n",
    "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
    "print(f'Relations: {result.stats.final_relations_count}')\n",
    "print(f'Chunks: {result.stats.total_chunks}')\n",
    "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
    "print(f'Entity types: {result.stats.entity_types_found}')\n",
    "print(f'Relation types: {result.stats.relation_types_found}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explorar resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Entities: one row per extracted entity.\n",
    "ent_rows = [\n",
    "    {\n",
    "        'id': e.id,\n",
    "        'name': e.name,\n",
    "        'type': e.type_ref,\n",
    "        'confidence': e.confidence,\n",
    "        'attributes': e.attributes,\n",
    "    }\n",
    "    for e in result.entities\n",
    "]\n",
    "# Explicit columns keep the frame well-formed when nothing was extracted; without them\n",
    "# an empty frame has no 'type' column and sort_values('type') raises KeyError.\n",
    "df_entities = pd.DataFrame(\n",
    "    ent_rows, columns=['id', 'name', 'type', 'confidence', 'attributes']\n",
    ")\n",
    "print(f'=== Entities ({len(df_entities)}) ===')\n",
    "df_entities.sort_values('type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relations: one row per extracted relation.\n",
    "rel_rows = [\n",
    "    {\n",
    "        'from_name': r.from_name,\n",
    "        'relation': r.relation_type,\n",
    "        'to_name': r.to_name,\n",
    "        'confidence': r.confidence,\n",
    "        'description': r.description,\n",
    "    }\n",
    "    for r in result.relations\n",
    "]\n",
    "# Explicit columns keep the frame well-formed when nothing was extracted; without them\n",
    "# an empty frame has no 'relation' column and sort_values('relation') raises KeyError.\n",
    "df_relations = pd.DataFrame(\n",
    "    rel_rows, columns=['from_name', 'relation', 'to_name', 'confidence', 'description']\n",
    ")\n",
    "print(f'=== Relations ({len(df_relations)}) ===')\n",
    "df_relations.sort_values('relation')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizar grafo con sigma.js"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Colores por tipo de entidad\n",
    "TYPE_COLORS = {\n",
    "    'osint_person_go_cybersecurity': '#e74c3c',\n",
    "    'osint_organization_go_cybersecurity': '#3498db',\n",
    "    'osint_location_go_cybersecurity': '#2ecc71',\n",
    "    'osint_event_go_cybersecurity': '#f39c12',\n",
    "    'osint_email_go_cybersecurity': '#9b59b6',\n",
    "    'osint_domain_go_cybersecurity': '#1abc9c',\n",
    "    'osint_ip_address_go_cybersecurity': '#e67e22',\n",
    "    'osint_phone_go_cybersecurity': '#95a5a6',\n",
    "    'osint_social_media_go_cybersecurity': '#e91e63',\n",
    "    'osint_document_go_cybersecurity': '#607d8b',\n",
    "    'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
    "    'osint_malware_go_cybersecurity': '#f44336',\n",
    "    'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
    "    'concept': '#00bcd4',\n",
    "    'url': '#8bc34a',\n",
    "    'date_reference': '#cddc39',\n",
    "    'quantity': '#ffc107',\n",
    "    'coordinates': '#4caf50',\n",
    "    'text_fragment': '#78909c',\n",
    "}\n",
    "DEFAULT_COLOR = '#aaaaaa'\n",
    "\n",
    "def extraction_to_sigma(result) -> dict:\n",
    "    \"\"\"Convert an extraction result into a sigma.js/graphology graph dict.\n",
    "\n",
    "    Nodes come from `result.entities` and edges from `result.relations`; an edge is\n",
    "    kept only when both of its endpoints exist as nodes. Node size is 4 plus 2 per\n",
    "    relation that mentions the node, with the added part capped at 20.\n",
    "\n",
    "    Non-None entity attributes are copied onto the node as strings. An attribute named\n",
    "    like one of the display keys set here ('label', 'color', 'size', 'type') is stored\n",
    "    under an 'attr_' prefix so it cannot overwrite the node's display value (the\n",
    "    'coordinates' preset, for example, declares a 'label' field).\n",
    "\n",
    "    Returns:\n",
    "        {'nodes': [...], 'edges': [...]} where each node has 'key' and 'attributes' and\n",
    "        each edge has 'key', 'source', 'target' and 'attributes'.\n",
    "    \"\"\"\n",
    "    display_keys = ('label', 'color', 'size', 'type')\n",
    "\n",
    "    # Count how many relations mention each endpoint; this drives node size.\n",
    "    degree = {}\n",
    "    for r in result.relations:\n",
    "        from_id = r.from_id or r.from_name\n",
    "        to_id = r.to_id or r.to_name\n",
    "        degree[from_id] = degree.get(from_id, 0) + 1\n",
    "        degree[to_id] = degree.get(to_id, 0) + 1\n",
    "\n",
    "    nodes = []\n",
    "    for e in result.entities:\n",
    "        eid = e.id or e.name\n",
    "        attrs = {\n",
    "            'label': e.name,\n",
    "            'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
    "            'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
    "            'type': e.type_ref,\n",
    "        }\n",
    "        for k, v in (e.attributes or {}).items():\n",
    "            if v is None:\n",
    "                continue\n",
    "            # Namespace colliding names instead of letting them clobber display values.\n",
    "            attrs[f'attr_{k}' if k in display_keys else k] = str(v)\n",
    "        nodes.append({'key': eid, 'attributes': attrs})\n",
    "\n",
    "    edges = []\n",
    "    node_keys = {n['key'] for n in nodes}\n",
    "    for i, r in enumerate(result.relations):\n",
    "        from_id = r.from_id or r.from_name\n",
    "        to_id = r.to_id or r.to_name\n",
    "        # Skip relations whose endpoints are not present as nodes.\n",
    "        if from_id in node_keys and to_id in node_keys:\n",
    "            edges.append({\n",
    "                'key': f'e{i}',\n",
    "                'source': from_id,\n",
    "                'target': to_id,\n",
    "                'attributes': {\n",
    "                    'label': r.relation_type,\n",
    "                    'type': r.relation_type,\n",
    "                }\n",
    "            })\n",
    "\n",
    "    return {'nodes': nodes, 'edges': edges}\n",
    "\n",
    "graph_data = extraction_to_sigma(result)\n",
    "print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
    "html_path = render_sigma_html(\n",
    "    graph_data=graph_data,\n",
    "    output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
    "    title='Ontology Graph',\n",
    ")\n",
    "print(f'Graph saved: {html_path}')\n",
    "print(f'Open in browser: file://{html_path}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Auto-discovery de nuevos tipos\n",
    "\n",
    "Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
    "    \"\"\"Ask haiku to propose entity presets for entities that fell back to generic types.\n",
    "\n",
    "    Entities typed 'concept' or 'text_fragment' (at most the first 30) are sent to the\n",
    "    model together with the labels of the existing presets.\n",
    "\n",
    "    Args:\n",
    "        result: extraction result; its `entities` expose `name`, `type_ref` and `attributes`.\n",
    "        existing_presets: preset dicts currently in use ('type_ref', 'label', ...).\n",
    "\n",
    "    Returns:\n",
    "        Suggested presets that are well-formed (non-empty string 'type_ref', string\n",
    "        'label', list 'metadata_fields') and whose 'type_ref' is not already taken.\n",
    "        Empty when there are no generic entities or no usable suggestion.\n",
    "    \"\"\"\n",
    "    # Only the generic fallback entity types are candidates. 'related_to' is a relation\n",
    "    # type, never an entity type_ref, so it is not listed here.\n",
    "    generic_entities = [\n",
    "        {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
    "        for e in result.entities\n",
    "        if e.type_ref in ('concept', 'text_fragment')\n",
    "    ]\n",
    "\n",
    "    if not generic_entities:\n",
    "        print('No hay entidades genéricas — los presets cubren todo.')\n",
    "        return []\n",
    "\n",
    "    existing_labels = [p['label'] for p in existing_presets]\n",
    "\n",
    "    prompt_msg = [\n",
    "        {'role': 'system', 'content': (\n",
    "            'You analyze entities extracted from a document and suggest new entity type presets. '\n",
    "            'Existing types: ' + ', '.join(existing_labels) + '. '\n",
    "            'For entities that dont fit existing types, suggest new type presets. '\n",
    "            'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
    "            '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
    "            'Only suggest types that are genuinely different from existing ones. '\n",
    "            'Return {\"new_presets\": []} if no new types are needed.'\n",
    "        )},\n",
    "        {'role': 'user', 'content': (\n",
    "            'These entities were classified as generic (concept/text_fragment) '\n",
    "            'because they didnt fit existing types:\\n\\n'\n",
    "            + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n",
    "        )}\n",
    "    ]\n",
    "\n",
    "    resp = claude_haiku_json(prompt_msg)\n",
    "    # The reply is model-generated: tolerate a non-dict reply or a non-list payload.\n",
    "    suggested = resp.get('new_presets') if isinstance(resp, dict) else None\n",
    "    if not isinstance(suggested, list):\n",
    "        suggested = []\n",
    "\n",
    "    # Keep only well-formed suggestions whose type_ref is not already in use, so the\n",
    "    # caller can append them to the preset list without creating duplicates.\n",
    "    known_refs = {p.get('type_ref') for p in existing_presets}\n",
    "    new_presets = []\n",
    "    for p in suggested:\n",
    "        if not isinstance(p, dict):\n",
    "            continue\n",
    "        type_ref = p.get('type_ref')\n",
    "        well_formed = (\n",
    "            isinstance(type_ref, str) and type_ref != ''\n",
    "            and isinstance(p.get('label'), str)\n",
    "            and isinstance(p.get('metadata_fields'), list)\n",
    "        )\n",
    "        if not well_formed or type_ref in known_refs:\n",
    "            continue\n",
    "        known_refs.add(type_ref)\n",
    "        new_presets.append(p)\n",
    "\n",
    "    skipped = len(suggested) - len(new_presets)\n",
    "    if skipped:\n",
    "        print(f'Ignored {skipped} malformed or duplicate suggestions.')\n",
    "\n",
    "    if new_presets:\n",
    "        print(f'Discovered {len(new_presets)} new types:')\n",
    "        for p in new_presets:\n",
    "            print(f\"  - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
    "    else:\n",
    "        print('No new types needed.')\n",
    "\n",
    "    return new_presets\n",
    "\n",
    "new_types = discover_new_types(result, ALL_PRESETS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n",
    "if new_types:\n",
    "    EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
    "    print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
    "    \n",
    "    result = extraction_pipeline(\n",
    "        file_path=DOC_PATH,\n",
    "        entity_presets=EXPANDED_PRESETS,\n",
    "        relation_types=RELATION_TYPES,\n",
    "        llm_chat_json=claude_haiku_json,\n",
    "        chunk_size=800,\n",
    "        chunk_overlap=100,\n",
    "        confidence_threshold=0.5,\n",
    "        dedup_threshold=0.85,\n",
    "        on_progress=on_progress,\n",
    "    )\n",
    "    \n",
    "    print(f'\\nEntities: {result.stats.final_entities_count}')\n",
    "    print(f'Relations: {result.stats.final_relations_count}')\n",
    "    \n",
    "    # Re-generar grafo\n",
    "    graph_data = extraction_to_sigma(result)\n",
    "    html_path = render_sigma_html(\n",
    "        graph_data=graph_data,\n",
    "        output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
    "        title='Ontology Graph (expanded)',\n",
    "    )\n",
    "    print(f'Updated graph: file://{html_path}')\n",
    "else:\n",
    "    print('No re-extraction needed.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m      5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m      8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m     10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m     11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     12\u001b[39m \n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "from pathlib import Path\n",
    "\n",
    "ROOT = '/home/lucas/fn_registry'\n",
    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
    "sys.path.insert(0, ROOT)\n",
    "sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('OK: imports loaded')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "imports OK\n"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "# Add lib/ to the import path. The registry root can be overridden through the\n",
    "# FN_REGISTRY_ROOT environment variable; the fallback is the original hard-coded location.\n",
    "LIB_DIR = os.path.join(\n",
    "    os.environ.get('FN_REGISTRY_ROOT') or '/home/lucas/fn_registry',\n",
    "    'analysis', 'ontology_graph', 'lib',\n",
    ")\n",
    "sys.path.insert(0, LIB_DIR)\n",
    "\n",
    "from core_functions import extract_json_from_llm\n",
    "from extraction_pipeline import extraction_pipeline\n",
    "from render_sigma_html import render_sigma_html\n",
    "\n",
    "print('imports OK')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM wrapper OK: {'ok': True}\n"
     ]
    }
   ],
   "source": [
    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
    "    \"\"\"Send OpenAI-style chat messages to `claude -p --model haiku` and parse the JSON reply.\n",
    "\n",
    "    Every message is rendered as an upper-cased `[ROLE]` header followed by its\n",
    "    content, and the rendered messages are joined by blank lines into one prompt.\n",
    "\n",
    "    Args:\n",
    "        messages: dicts with 'role' and 'content' keys.\n",
    "\n",
    "    Returns:\n",
    "        The value `extract_json_from_llm` parses out of the envelope's 'result' text.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: the CLI exited non-zero, or its JSON envelope has 'is_error' set.\n",
    "        subprocess.TimeoutExpired: the CLI ran longer than 120 seconds.\n",
    "    \"\"\"\n",
    "    parts = []\n",
    "    for msg in messages:\n",
    "        role, content = msg['role'], msg['content']\n",
    "        # Keep every role (not only system/user) so no message is silently dropped.\n",
    "        parts.append(f'[{str(role).upper()}]\\n{content}')\n",
    "    prompt = '\\n\\n'.join(parts)\n",
    "\n",
    "    result = subprocess.run(\n",
    "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
    "        capture_output=True, text=True, timeout=120\n",
    "    )\n",
    "    if result.returncode != 0:\n",
    "        raise RuntimeError(f'claude -p failed: {result.stderr}')\n",
    "\n",
    "    # The CLI wraps the model reply in a JSON envelope; the reply text is under 'result'.\n",
    "    envelope = json.loads(result.stdout)\n",
    "    raw_text = envelope.get('result', '')\n",
    "    if envelope.get('is_error'):\n",
    "        # A zero exit code can still carry a failed run; do not parse its message as output.\n",
    "        raise RuntimeError(f'claude -p reported an error: {raw_text}')\n",
    "    return extract_json_from_llm(raw_text)\n",
    "\n",
    "# Test\n",
    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
    "print('LLM wrapper OK:', test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19 presets, 35 relation types\n"
     ]
    }
   ],
   "source": [
    "OSINT_PRESETS = [\n",
    "    {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
    "     'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
    "    {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
    "     'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
    "    {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
    "     'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
    "    {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
    "     'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
    "    {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
    "     'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
    "    {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
    "     'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
    "    {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
    "     'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
    "    {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
    "     'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
    "    {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
    "     'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
    "    {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
    "     'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
    "    {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
    "     'metadata_fields': ['address', 'blockchain', 'balance']},\n",
    "    {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
    "     'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
    "    {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
    "     'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
    "]\n",
    "\n",
    "GENERIC_PRESETS = [\n",
    "    {'type_ref': 'concept', 'label': 'Concept',\n",
    "     'metadata_fields': ['name', 'category', 'definition']},\n",
    "    {'type_ref': 'url', 'label': 'URL/Link',\n",
    "     'metadata_fields': ['url', 'domain', 'context']},\n",
    "    {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
    "     'metadata_fields': ['date', 'precision', 'context']},\n",
    "    {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
    "     'metadata_fields': ['value', 'unit', 'context']},\n",
    "    {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
    "     'metadata_fields': ['lat', 'lon', 'label']},\n",
    "    {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
    "     'metadata_fields': ['text', 'category', 'relevance']},\n",
    "]\n",
    "\n",
    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
    "\n",
    "RELATION_TYPES = [\n",
    "    'employs', 'works_for', 'founded', 'owns', 'controls',\n",
    "    'member_of', 'affiliated_with', 'collaborates_with',\n",
    "    'communicates_with', 'sent_to', 'received_from',\n",
    "    'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
    "    'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
    "    'mentions', 'references', 'describes', 'authored', 'published',\n",
    "    'funds', 'transacted_with', 'invested_in',\n",
    "    'hosts', 'resolves_to', 'exploits', 'targets',\n",
    "    'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
    "]\n",
    "\n",
    "print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.0%] Extracting text from file...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.0%] Extracting entities from chunk 1/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.7%] Extracting entities from chunk 2/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  1.5%] Extracting entities from chunk 3/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  2.2%] Extracting entities from chunk 4/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  3.0%] Extracting entities from chunk 5/54\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
      "  candidates = extract_entities_llm(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  3.7%] Extracting entities from chunk 6/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  4.4%] Extracting entities from chunk 7/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  5.2%] Extracting entities from chunk 8/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  5.9%] Extracting entities from chunk 9/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  6.7%] Extracting entities from chunk 10/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  7.4%] Extracting entities from chunk 11/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  8.1%] Extracting entities from chunk 12/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  8.9%] Extracting entities from chunk 13/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  9.6%] Extracting entities from chunk 14/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [ 10.4%] Extracting entities from chunk 15/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [ 11.1%] Extracting entities from chunk 16/54\n"
     ]
    }
   ],
   "source": [
    "# Resolve the document under the registry root instead of hard-coding one machine's path.\n",
    "# FN_REGISTRY_ROOT is honoured when set; the fallback keeps the previous location.\n",
    "DOC_PATH = os.path.join(\n",
    "    os.environ.get('FN_REGISTRY_ROOT', '/home/lucas/fn_registry'),\n",
    "    'analysis', 'ontology_graph', 'data', 'condiciones-generales-bizum.pdf',\n",
    ")\n",
    "\n",
    "def on_progress(msg, pct):\n",
    "    \"\"\"Print one progress line; `pct` is multiplied by 100 and shown with one decimal.\"\"\"\n",
    "    percent = pct * 100\n",
    "    print(f'  [{percent:5.1f}%] {msg}')\n",
    "\n",
    "# Extraction settings collected in one mapping, then handed to the pipeline as keywords.\n",
    "pipeline_kwargs = {\n",
    "    'file_path': DOC_PATH,\n",
    "    'entity_presets': ALL_PRESETS,\n",
    "    'relation_types': RELATION_TYPES,\n",
    "    'llm_chat_json': claude_haiku_json,\n",
    "    'chunk_size': 800,\n",
    "    'chunk_overlap': 100,\n",
    "    'confidence_threshold': 0.5,\n",
    "    'dedup_threshold': 0.85,\n",
    "    'on_progress': on_progress,\n",
    "}\n",
    "result = extraction_pipeline(**pipeline_kwargs)\n",
    "\n",
    "# Summary of the run, read from the stats object attached to the result.\n",
    "run_stats = result.stats\n",
    "print(f'\\nEntities: {run_stats.final_entities_count}')\n",
    "print(f'Relations: {run_stats.final_relations_count}')\n",
    "print(f'Chunks: {run_stats.total_chunks}')\n",
    "print(f'Time: {run_stats.processing_time_seconds:.1f}s')\n",
    "print(f'Entity types: {run_stats.entity_types_found}')\n",
    "print(f'Relation types: {run_stats.relation_types_found}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline optimizado\n",
    "\n",
    "- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
    "- Chunks de 2000 chars\n",
    "- Paralelizado con ThreadPoolExecutor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "from extract_text_from_file import extract_text_from_file\n",
    "from core_functions import preprocess_text\n",
    "from split_text_into_chunks import split_text_into_chunks\n",
    "from deduplicate_entities import deduplicate_entities\n",
    "from deduplicate_relations import deduplicate_relations\n",
    "from entity_candidate import EntityCandidate\n",
    "from relation_candidate import RelationCandidate\n",
    "\n",
    "def build_unified_prompt(entity_presets, relation_types):\n",
    "    \"\"\"Build the system prompt for single-pass extraction.\n",
    "\n",
    "    The prompt asks the LLM to return entities, relations and suggested new\n",
    "    entity types in one strict-JSON response.\n",
    "\n",
    "    Args:\n",
    "        entity_presets: iterable of dicts with 'label' and 'type_ref' keys and an\n",
    "            optional 'metadata_fields' list (a missing key or None means no fields).\n",
    "        relation_types: iterable of relation type names.\n",
    "\n",
    "    Returns:\n",
    "        The prompt text as a single string.\n",
    "    \"\"\"\n",
    "    type_lines = []\n",
    "    for preset in entity_presets:\n",
    "        # `or []` also covers an explicit None, which .get()'s default lets through.\n",
    "        fields = ', '.join(str(f) for f in (preset.get('metadata_fields') or []))\n",
    "        type_lines.append(f\"- {preset['label']} (type_ref: {preset['type_ref']}): [{fields}]\")\n",
    "\n",
    "    # Joined outside the f-string so no backslash is needed inside a replacement field.\n",
    "    type_block = '\\n'.join(type_lines)\n",
    "    relation_block = ', '.join(str(r) for r in relation_types)\n",
    "\n",
    "    return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
    "\n",
    "ENTITY TYPES:\n",
    "{type_block}\n",
    "\n",
    "RELATION TYPES: {relation_block}\n",
    "\n",
    "OUTPUT FORMAT (strict JSON):\n",
    "{{\n",
    "  \"entities\": [\n",
    "    {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
    "  ],\n",
    "  \"relations\": [\n",
    "    {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
    "  ],\n",
    "  \"suggested_types\": [\n",
    "    {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
    "  ]\n",
    "}}\n",
    "\n",
    "RULES:\n",
    "- Extract ALL entities explicitly mentioned in the text\n",
    "- Use exact type_ref from the schema. Leave unknown attributes as null\n",
    "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
    "- Relations: from_name and to_name MUST match extracted entity names exactly\n",
    "- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
    "- If no suggested types are needed, return \"suggested_types\": []\n",
    "- Respond in the same language as the text for descriptions'''\n",
    "\n",
    "# Build the prompt once from ALL_PRESETS / RELATION_TYPES and report its size.\n",
    "UNIFIED_PROMPT = build_unified_prompt(\n",
    "    entity_presets=ALL_PRESETS,\n",
    "    relation_types=RELATION_TYPES,\n",
    ")\n",
    "print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}