chore: initial sync

2026-04-28 22:13:08 +02:00
commit 40bea81603
30 changed files with 6675 additions and 0 deletions
@@ -0,0 +1,40 @@
+# JUPYTER HABILITADO EN ESTE ANALISIS
+
+## Reglas OBLIGATORIAS para Claude
+
+### 1. CODIGO INMUTABLE — NUNCA MODIFICAR CELDAS EXISTENTES
+- **PROHIBIDO** usar NotebookEdit para reemplazar celdas existentes
+- **SIEMPRE** anadir celdas NUEVAS al final del notebook
+- Si hay un error en una celda, crear celda nueva con la correccion
+- El historial de trabajo debe quedar intacto para trazabilidad
+
+### 2. PROGRAMACION FUNCIONAL OBLIGATORIA
+- **Funciones puras**: sin efectos secundarios, mismo input -> mismo output
+- **Inmutabilidad**: nunca mutar datos, crear copias transformadas
+- **Composicion**: funciones pequenas que se combinan
+- Preferir: `map`, `filter`, `reduce`, list comprehensions
+- Evitar: loops con mutacion, `global`, modificar argumentos in-place
+
+### 3. SIEMPRE usar MCP jupyter para ejecutar codigo Python
+- Las ejecuciones se ven en tiempo real en Jupyter Lab del usuario
+- Compartimos variables y estado del kernel
+- **NUNCA usar bash para ejecutar Python en este analisis**
+
+### 4. Verificar Jupyter activo ANTES de ejecutar
+- Si no esta activo: pedir al usuario que ejecute `./run-jupyter-lab.sh`
+
+### 5. Gestion de notebooks
+- Notebooks en la carpeta `notebooks/` o subcarpetas
+- Si un notebook tiene >50 celdas, crear uno nuevo
+- Nombrar descriptivamente: `01_exploracion.ipynb`, `02_limpieza.ipynb`
+
+### 6. Gestion de Python
+- **SIEMPRE usar `uv`** para gestionar dependencias
+- Anadir paquetes con `uv add nombre_paquete`
+
+### 7. Acceso al fn_registry
+- `FN_REGISTRY_ROOT` apunta a la raiz del registry
+- Para importar funciones Python: `sys.path.insert(0, os.path.join(os.environ["FN_REGISTRY_ROOT"], "python", "functions"))`
+- Para consultar registry.db: usar el CLI `sqlite3` o el modulo Python `sqlite3` (`import sqlite3`) con la ruta `$FN_REGISTRY_ROOT/registry.db`
+
+
@@ -0,0 +1,12 @@
+.venv/
+.mcp.json
+.jupyter-port
+.jupyter/
+.jupyter_ystore.db
+.ipython/
+__pycache__/
+*.pyc
+.ipynb_checkpoints/
+bin/
+data/
+.DS_Store
@@ -0,0 +1 @@
+3.13
@@ -0,0 +1,540 @@
+"""Extracción de grafo ontológico desde un documento.
+
+Uso: python extract.py <archivo>
+     python extract.py data/condiciones-generales-bizum.pdf
+
+Optimizaciones vs extraction_pipeline:
+- 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos)
+- Chunks de 2000 chars
+- Paralelizado con ThreadPoolExecutor
+"""
+
+import sys
+import os
+import json
+import subprocess
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib"))
+
+from extract_text_from_file import extract_text_from_file
+from core_functions import preprocess_text, extract_json_from_llm
+from split_text_into_chunks import split_text_into_chunks
+from deduplicate_entities import deduplicate_entities
+from deduplicate_relations import deduplicate_relations
+from entity_candidate import EntityCandidate
+from relation_candidate import RelationCandidate
+from render_sigma_html import render_sigma_html
+
+# ── Presets ────────────────────────────────────────────────────────────────────
+
+# Built-in entity type presets for OSINT-style documents. Each preset has a
+# `type_ref` identifier, a human-readable `label`, and the `metadata_fields`
+# that build_unified_prompt lists to the LLM as that type's attributes.
+OSINT_PRESETS = [
+    {"type_ref": "person", "label": "Person",
+     "metadata_fields": ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]},
+    {"type_ref": "organization", "label": "Organization",
+     "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"]},
+    {"type_ref": "location", "label": "Location",
+     "metadata_fields": ["lat", "lon", "address", "country", "city"]},
+    {"type_ref": "event", "label": "Event",
+     "metadata_fields": ["event_type", "date", "location", "description", "severity"]},
+    {"type_ref": "email", "label": "Email",
+     "metadata_fields": ["address", "provider", "verified", "breached"]},
+    {"type_ref": "domain", "label": "Domain",
+     "metadata_fields": ["fqdn", "registrar", "created_date", "expires_date"]},
+    {"type_ref": "ip_address", "label": "IP Address",
+     "metadata_fields": ["ip", "asn", "country", "isp", "geolocation"]},
+    {"type_ref": "phone", "label": "Phone",
+     "metadata_fields": ["number", "country_code", "carrier", "phone_type"]},
+    {"type_ref": "social_media", "label": "Social Media Account",
+     "metadata_fields": ["platform", "username", "url", "followers", "verified"]},
+    {"type_ref": "document", "label": "Document",
+     "metadata_fields": ["title", "format", "classification", "source"]},
+    {"type_ref": "crypto_wallet", "label": "Crypto Wallet",
+     "metadata_fields": ["address", "blockchain", "balance"]},
+    {"type_ref": "malware", "label": "Malware",
+     "metadata_fields": ["family", "hash_sha256", "threat_level"]},
+    {"type_ref": "vulnerability", "label": "Vulnerability",
+     "metadata_fields": ["cve_id", "cvss", "affected_product", "exploited"]},
+]
+
+# Generic catch-all entity types, same shape as OSINT_PRESETS. Their type_refs
+# are the ones listed in GENERIC_TYPE_REFS, i.e. the entities that
+# reclassify_generic_entities may later move to a more specific type.
+GENERIC_PRESETS = [
+    {"type_ref": "concept", "label": "Concept",
+     "metadata_fields": ["name", "category", "definition"]},
+    {"type_ref": "url", "label": "URL/Link",
+     "metadata_fields": ["url", "domain", "context"]},
+    {"type_ref": "date_reference", "label": "Date/Time",
+     "metadata_fields": ["date", "precision", "context"]},
+    {"type_ref": "quantity", "label": "Quantity/Amount",
+     "metadata_fields": ["value", "unit", "context"]},
+    {"type_ref": "coordinates", "label": "Coordinates",
+     "metadata_fields": ["lat", "lon", "label"]},
+    {"type_ref": "text_fragment", "label": "Key Text Fragment",
+     "metadata_fields": ["text", "category", "relevance"]},
+]
+
+# ── Custom presets (cumulative, intended for promotion to the registry) ────────
+
+# JSON file under data/ (next to this script) where custom presets are persisted.
+CUSTOM_PRESETS_PATH = os.path.join(os.path.dirname(__file__), "data", "custom_presets.json")
+
+
+def load_custom_presets() -> list[dict]:
+    """Carga presets custom desde data/custom_presets.json si existe."""
+    if not os.path.exists(CUSTOM_PRESETS_PATH):
+        return []
+    with open(CUSTOM_PRESETS_PATH) as f:
+        data = json.load(f)
+    return data.get("presets", [])
+
+
+def save_custom_presets(presets: list[dict]) -> None:
+    """Guarda presets custom en data/custom_presets.json.
+
+    Formato pensado para promoción al registry:
+    {
+      "presets": [
+        {
+          "type_ref": "snake_case_id",
+          "label": "Human Label",
+          "metadata_fields": ["field1", "field2"],
+          "reason": "why this type exists",
+          "source_doc": "document where it was first discovered",
+          "promoted": false  // true cuando se registre en el registry
+        }
+      ]
+    }
+    """
+    os.makedirs(os.path.dirname(CUSTOM_PRESETS_PATH), exist_ok=True)
+    with open(CUSTOM_PRESETS_PATH, "w") as f:
+        json.dump({"presets": presets}, f, ensure_ascii=False, indent=2)
+
+
+def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]:
+    """Mergea tipos sugeridos con custom existentes. Dedup por type_ref."""
+    existing = load_custom_presets()
+    existing_refs = {p["type_ref"] for p in existing}
+
+    added = []
+    for s in suggested:
+        ref = s.get("type_ref", "")
+        if not ref or ref in existing_refs:
+            continue
+        existing_refs.add(ref)
+        preset = {
+            "type_ref": ref,
+            "label": s.get("label", ref),
+            "metadata_fields": s.get("metadata_fields", []),
+            "reason": s.get("reason", ""),
+            "source_doc": source_doc,
+            "promoted": False,
+        }
+        existing.append(preset)
+        added.append(preset)
+
+    if added:
+        save_custom_presets(existing)
+
+    return added
+
+
+# Relation type vocabulary offered to the LLM in the extraction prompt
+# (joined into the "RELATION TYPES:" line by build_unified_prompt).
+RELATION_TYPES = [
+    "employs", "works_for", "founded", "owns", "controls",
+    "member_of", "affiliated_with", "collaborates_with",
+    "communicates_with", "sent_to", "received_from",
+    "located_in", "headquartered_in", "traveled_to", "operates_in",
+    "participated_in", "caused", "occurred_at", "occurred_on",
+    "mentions", "references", "describes", "authored", "published",
+    "funds", "transacted_with", "invested_in",
+    "hosts", "resolves_to", "exploits", "targets",
+    "related_to", "part_of", "instance_of", "has_attribute",
+]
+
+# ── LLM wrapper ───────────────────────────────────────────────────────────────
+
+def claude_haiku_json(messages: list[dict]) -> dict:
+    parts = []
+    for msg in messages:
+        if msg["role"] == "system":
+            parts.append(f"[SYSTEM]\n{msg['content']}")
+        elif msg["role"] == "user":
+            parts.append(f"[USER]\n{msg['content']}")
+    prompt = "\n\n".join(parts)
+
+    result = subprocess.run(
+        ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt],
+        capture_output=True, text=True, timeout=120,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"claude -p failed: {result.stderr[:200]}")
+
+    envelope = json.loads(result.stdout)
+    return extract_json_from_llm(envelope.get("result", ""))
+
+# ── Unified prompt ─────────────────────────────────────────────────────────────
+
+def build_unified_prompt(presets, rel_types):
+    type_lines = []
+    for p in presets:
+        fields = ", ".join(p.get("metadata_fields", []))
+        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")
+
+    return (
+        "You are an entity and relation extraction expert. "
+        "Given text, extract ALL entities and relations in a single pass.\n\n"
+        "ENTITY TYPES:\n" + "\n".join(type_lines) + "\n\n"
+        "RELATION TYPES: " + ", ".join(rel_types) + "\n\n"
+        'OUTPUT FORMAT (strict JSON):\n'
+        '{\n'
+        '  "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],\n'
+        '  "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],\n'
+        '  "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]\n'
+        '}\n\n'
+        "RULES:\n"
+        "- Extract ALL entities explicitly mentioned\n"
+        "- Use exact type_ref from schema. Unknown attributes = null\n"
+        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied\n"
+        "- Relations: from_name/to_name MUST match entity names exactly\n"
+        "- suggested_types: for important entities that do NOT fit any type, suggest a new type. "
+        "Use those suggested type_refs for those entities in the entities array.\n"
+        '- If no new types needed: "suggested_types": []\n'
+        "- Respond in the same language as the text for descriptions"
+    )
+
+# ── Process one chunk ──────────────────────────────────────────────────────────
+
+def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str):
+    """Procesa un chunk: extrae entities + relations + suggested_types."""
+    try:
+        resp = claude_haiku_json([
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": chunk_text},
+        ])
+    except Exception as e:
+        print(f"  [WARN] chunk {chunk_idx}: {e}")
+        return [], [], []
+
+    raw_entities = resp.get("entities", [])
+    raw_relations = resp.get("relations", [])
+    suggested = resp.get("suggested_types", [])
+
+    entities = []
+    for ent in raw_entities:
+        name = ent.get("name", "").strip()
+        if not name:
+            continue
+        entities.append(EntityCandidate(
+            name=name,
+            type_ref=ent.get("type_ref", "concept"),
+            attributes=ent.get("attributes", {}),
+            confidence=float(ent.get("confidence", 0.5)),
+            source_chunk_indices=[chunk_idx],
+        ))
+
+    relations = []
+    for rel in raw_relations:
+        fn = rel.get("from_name", "").strip()
+        tn = rel.get("to_name", "").strip()
+        if not fn or not tn:
+            continue
+        relations.append(RelationCandidate(
+            from_name=fn,
+            to_name=tn,
+            relation_type=rel.get("relation_type", "related_to"),
+            confidence=float(rel.get("confidence", 0.5)),
+            description=rel.get("description", ""),
+            source_chunk_index=chunk_idx,
+        ))
+
+    return entities, relations, suggested
+
+# ── Sigma conversion ───────────────────────────────────────────────────────────
+
+# Node color (hex) per entity type_ref. to_sigma falls back to "#aaaaaa"
+# for any type_ref that is not listed here.
+TYPE_COLORS = {
+    "person": "#e74c3c",
+    "organization": "#3498db",
+    "location": "#2ecc71",
+    "event": "#f39c12",
+    "email": "#9b59b6",
+    "domain": "#1abc9c",
+    "ip_address": "#e67e22",
+    "phone": "#95a5a6",
+    "social_media": "#e91e63",
+    "document": "#607d8b",
+    "crypto_wallet": "#ff9800",
+    "malware": "#f44336",
+    "vulnerability": "#ff5722",
+    "concept": "#00bcd4",
+    "url": "#8bc34a",
+    "date_reference": "#cddc39",
+    "quantity": "#ffc107",
+    "coordinates": "#4caf50",
+    "text_fragment": "#78909c",
+}
+
+def to_sigma(entities, relations, entity_id_map):
+    # Build name→UUID lookup from dedup map
+    # entity_id_map: {name_variant -> uuid, ...}
+    # Invert to uuid→canonical_name using entities list
+    uuid_to_name = {}
+    name_to_uuid = {}
+    for e in entities:
+        # Find this entity's UUID in the map
+        uuid = entity_id_map.get(e.name, entity_id_map.get(e.name.lower().strip(), e.name))
+        uuid_to_name[uuid] = e.name
+        name_to_uuid[e.name] = uuid
+
+    degree = {}
+    for r in relations:
+        fid = r.from_id or r.from_name
+        tid = r.to_id or r.to_name
+        degree[fid] = degree.get(fid, 0) + 1
+        degree[tid] = degree.get(tid, 0) + 1
+
+    nodes = []
+    seen_uuids = set()
+    for e in entities:
+        uuid = name_to_uuid.get(e.name, e.name)
+        if uuid in seen_uuids:
+            continue
+        seen_uuids.add(uuid)
+        # Filter out 'type' — sigma.js reserves it for node render program
+        reserved = {"type", "hidden", "x", "y"}
+        attrs = {k: str(v) for k, v in (e.attributes or {}).items() if v is not None and k not in reserved}
+        nodes.append({
+            "key": uuid,
+            "attributes": {
+                "label": e.name,
+                "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"),
+                "size": 4 + min(degree.get(uuid, 0) * 2, 20),
+                "entity_type": e.type_ref,
+                **attrs,
+            },
+        })
+
+    node_keys = {n["key"] for n in nodes}
+    edges = []
+    seen_edges = set()
+    for i, r in enumerate(relations):
+        fid = r.from_id or r.from_name
+        tid = r.to_id or r.to_name
+        if fid in node_keys and tid in node_keys and fid != tid:
+            edge_key = (fid, tid, r.relation_type)
+            if edge_key in seen_edges:
+                continue
+            seen_edges.add(edge_key)
+            edges.append({
+                "key": f"e{i}",
+                "source": fid,
+                "target": tid,
+                "attributes": {"label": r.relation_type},
+            })
+
+    return {"nodes": nodes, "edges": edges}
+
+# ── Reclasificación de entidades genéricas ─────────────────────────────────────
+
+# type_refs treated as generic: entities with one of these types are the
+# candidates that reclassify_generic_entities offers for reclassification.
+GENERIC_TYPE_REFS = {"concept", "text_fragment", "url", "date_reference", "quantity", "coordinates"}
+
+
+def reclassify_generic_entities(entities, new_presets, workers=4):
+    """Reclasifica entidades genéricas usando los tipos recién descubiertos.
+
+    En vez de re-procesar chunks, hace 1 llamada batch a haiku con las entidades
+    genéricas y los nuevos presets para reclasificarlas in-place.
+    """
+    generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS]
+    if not generic or not new_presets:
+        return 0
+
+    # Construir prompt de reclasificación
+    type_lines = []
+    for p in new_presets:
+        fields = ", ".join(p.get("metadata_fields", []))
+        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")
+
+    system = (
+        "You reclassify entities into more specific types. "
+        "For each entity, decide if it fits one of the NEW types below better than its current generic type. "
+        "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n"
+        "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n"
+        'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n'
+        "Only include entities that should change. Omit those that should stay as-is."
+    )
+
+    # Procesar en batches de 30 entidades para no exceder contexto
+    batch_size = 30
+    total_changed = 0
+
+    def _reclassify_batch(batch):
+        items = [{"index": idx, "name": e.name, "current_type": e.type_ref,
+                   "attributes": e.attributes} for idx, e in batch]
+        try:
+            resp = claude_haiku_json([
+                {"role": "system", "content": system},
+                {"role": "user", "content": json.dumps(items, ensure_ascii=False)},
+            ])
+            return resp.get("reclassified", [])
+        except Exception:
+            return []
+
+    batches = [generic[i:i+batch_size] for i in range(0, len(generic), batch_size)]
+
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = {pool.submit(_reclassify_batch, b): b for b in batches}
+        for future in as_completed(futures):
+            for item in future.result():
+                idx = item.get("index")
+                new_ref = item.get("type_ref", "")
+                if idx is not None and new_ref and 0 <= idx < len(entities):
+                    entities[idx].type_ref = new_ref
+                    if item.get("attributes"):
+                        entities[idx].attributes.update(item["attributes"])
+                    total_changed += 1
+
+    return total_changed
+
+
+# ── Main ───────────────────────────────────────────────────────────────────────
+
+def main():
+    if len(sys.argv) < 2:
+        print("Uso: python extract.py <archivo>")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    if not os.path.isabs(file_path):
+        file_path = os.path.join(os.path.dirname(__file__), file_path)
+
+    workers = int(sys.argv[2]) if len(sys.argv) > 2 else 4
+
+    print(f"=== Ontology Graph Extraction ===")
+    print(f"File: {file_path}")
+    print(f"Workers: {workers}")
+    start = time.monotonic()
+
+    # 1. Extraer y preprocesar texto
+    print("\n[1/5] Extracting text...")
+    raw = extract_text_from_file(file_path)
+    text = preprocess_text(raw)
+    print(f"  {len(text)} chars")
+
+    # 2. Chunking
+    print("[2/5] Chunking...")
+    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
+    print(f"  {len(chunks)} chunks")
+
+    # 3. Extracción paralela
+    custom = load_custom_presets()
+    # Solo usar custom no promovidos (los promovidos ya estarán en el registry)
+    active_custom = [p for p in custom if not p.get("promoted", False)]
+    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
+    print(f"  Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
+    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)
+
+    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
+    all_entities = []
+    all_relations = []
+    all_suggested = []
+
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = {
+            pool.submit(process_chunk, i, chunk, system_prompt): i
+            for i, chunk in enumerate(chunks)
+        }
+        for future in as_completed(futures):
+            idx = futures[future]
+            ents, rels, sugg = future.result()
+            all_entities.extend(ents)
+            all_relations.extend(rels)
+            all_suggested.extend(sugg)
+            print(f"  chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations" +
+                  (f", {len(sugg)} new types" if sugg else ""))
+
+    # 4. Deduplicación
+    print(f"\n[4/5] Deduplicating...")
+    print(f"  Raw: {len(all_entities)} entities, {len(all_relations)} relations")
+
+    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
+    final_entities = dedup.entities
+    entity_id_map = dedup.name_to_id
+
+    final_relations = deduplicate_relations(all_relations, entity_id_map)
+
+    print(f"  Final: {len(final_entities)} entities, {len(final_relations)} relations")
+    print(f"  Merged: {dedup.total_before - dedup.total_after} entities, "
+          f"{len(all_relations) - len(final_relations)} relations")
+
+    # Registrar tipos sugeridos en custom_presets.json
+    unique_suggested = []
+    if all_suggested:
+        seen = set()
+        for s in all_suggested:
+            key = s.get("type_ref", "")
+            if key and key not in seen:
+                seen.add(key)
+                unique_suggested.append(s)
+
+        source_doc = os.path.basename(file_path)
+        added = merge_suggested_into_custom(unique_suggested, source_doc)
+        total_custom = len(load_custom_presets())
+
+        if added:
+            print(f"\n  New types registered ({len(added)}):")
+            for p in added:
+                print(f"    + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
+                print(f"      Reason: {p['reason']}")
+            print(f"  Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")
+
+            # Reclasificar entidades genéricas con los tipos recién descubiertos
+            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
+            if n_generic > 0:
+                print(f"\n  Reclassifying {n_generic} generic entities with new types...")
+                changed = reclassify_generic_entities(final_entities, added, workers=workers)
+                print(f"  Reclassified: {changed}/{n_generic}")
+        else:
+            print(f"\n  {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")
+
+    # Stats por tipo
+    type_counts = {}
+    for e in final_entities:
+        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
+    print(f"\n  Entity types:")
+    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
+        print(f"    {t}: {c}")
+
+    rel_counts = {}
+    for r in final_relations:
+        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
+    print(f"  Relation types:")
+    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
+        print(f"    {t}: {c}")
+
+    # 5. Visualización
+    print(f"\n[5/5] Generating graph...")
+    graph = to_sigma(final_entities, final_relations, entity_id_map)
+    out_dir = os.path.join(os.path.dirname(__file__), "data")
+    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
+    print(f"  {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
+    print(f"  HTML: file://{html_path}")
+
+    # Guardar JSON intermedio
+    json_path = os.path.join(out_dir, "extraction_result.json")
+    with open(json_path, "w") as f:
+        json.dump({
+            "entities": [{"name": e.name, "type_ref": e.type_ref,
+                          "confidence": e.confidence, "attributes": e.attributes}
+                         for e in final_entities],
+            "relations": [{"from": r.from_name, "to": r.to_name,
+                           "type": r.relation_type, "confidence": r.confidence,
+                           "description": r.description}
+                          for r in final_relations],
+            "suggested_types": [dict(s) for s in (unique_suggested if all_suggested else [])],
+        }, f, ensure_ascii=False, indent=2)
+    print(f"  JSON: {json_path}")
+
+    elapsed = time.monotonic() - start
+    print(f"\nDone in {elapsed:.1f}s")
+
+
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,43 @@
+"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
+
+
+def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
+    """Genera texto legible para el LLM describiendo los entity types disponibles.
+
+    Formatea los presets del registry en una seccion del system prompt que indica
+    al LLM que tipos de entidades puede extraer y que atributos tiene cada uno.
+
+    Args:
+        entity_presets: Lista de presets con campos 'label', 'type_ref' y
+                        opcionalmente 'metadata_fields'. Ejemplo:
+                        [{"type_ref": "osint_person_go_cybersecurity",
+                          "label": "Person",
+                          "metadata_fields": ["full_name", "alias"]}]
+
+    Returns:
+        String formateado con la seccion del prompt. Retorna string vacio si
+        la lista de presets esta vacia.
+    """
+    if not entity_presets:
+        return ""
+
+    lines = ["Entity types available for extraction:", ""]
+
+    for i, preset in enumerate(entity_presets, start=1):
+        label = preset.get("label", "Unknown")
+        type_ref = preset.get("type_ref", "")
+        metadata_fields = preset.get("metadata_fields", [])
+
+        lines.append(f"{i}. {label} (type_ref: {type_ref})")
+
+        if metadata_fields:
+            attrs = ", ".join(metadata_fields)
+            lines.append(f"   Attributes: {attrs}")
+
+        lines.append("")
+
+    # Remove trailing blank line
+    if lines and lines[-1] == "":
+        lines.pop()
+
+    return "\n".join(lines)
@@ -0,0 +1,22 @@
+"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
+
+
+def build_relation_schema_prompt(relation_types: list[str]) -> str:
+    """Genera texto legible para el LLM describiendo los tipos de relacion permitidos.
+
+    Formatea la lista de tipos de relacion en una seccion del system prompt que
+    indica al LLM que relaciones puede extraer entre entidades.
+
+    Args:
+        relation_types: Lista de strings con los tipos de relacion permitidos.
+                        Ejemplo: ["funds", "employs", "communicates_with"]
+
+    Returns:
+        String formateado con la seccion del prompt. Retorna string vacio si
+        la lista esta vacia.
+    """
+    if not relation_types:
+        return ""
+
+    joined = ", ".join(relation_types)
+    return f"Allowed relation types:\n{joined}"
@@ -0,0 +1,814 @@
+"""Core functional programming utilities — pure functions for list/collection operations."""
+
+import hashlib
+import re
+from functools import reduce as _reduce
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+
+def filter_list(xs: list, pred: Callable) -> list:
+    """Filter list by predicate. Does not mutate the original."""
+    return [x for x in xs if pred(x)]
+
+
+def map_list(xs: list, fn: Callable) -> list:
+    """Map function over list. Does not mutate the original."""
+    return [fn(x) for x in xs]
+
+
+def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
+    """Reduce list with accumulator. fn(acc, x) -> acc."""
+    return _reduce(fn, xs, initial)
+
+
+def flat_map(xs: list, fn: Callable) -> list:
+    """Map function over list then flatten one level."""
+    result = []
+    for x in xs:
+        result.extend(fn(x))
+    return result
+
+
+def flatten(xss: list) -> list:
+    """Flatten a list of lists one level."""
+    result = []
+    for xs in xss:
+        result.extend(xs)
+    return result
+
+
+def chunk(xs: list, size: int) -> list:
+    """Split list into chunks of given size. Last chunk may be smaller."""
+    if size <= 0:
+        return []
+    return [xs[i : i + size] for i in range(0, len(xs), size)]
+
+
+def take(xs: list, n: int) -> list:
+    """Take first n elements from list."""
+    return xs[:n]
+
+
+def drop(xs: list, n: int) -> list:
+    """Drop first n elements from list."""
+    return xs[n:]
+
+
+def unique(xs: list) -> list:
+    """Remove duplicates preserving order. Uses identity for hashable elements."""
+    seen = set()
+    result = []
+    for x in xs:
+        if x not in seen:
+            seen.add(x)
+            result.append(x)
+    return result
+
+
+def group_by(xs: list, key_fn: Callable) -> Dict:
+    """Group elements by key function. Returns dict of key -> list."""
+    groups: Dict = {}
+    for x in xs:
+        k = key_fn(x)
+        if k not in groups:
+            groups[k] = []
+        groups[k].append(x)
+    return groups
+
+
+def partition(xs: list, pred: Callable) -> Tuple[list, list]:
+    """Split list into (matches, non_matches) based on predicate."""
+    matches = []
+    non_matches = []
+    for x in xs:
+        if pred(x):
+            matches.append(x)
+        else:
+            non_matches.append(x)
+    return (matches, non_matches)
+
+
+def find(xs: list, pred: Callable) -> Any:
+    """Find first element matching predicate. Returns None if not found."""
+    for x in xs:
+        if pred(x):
+            return x
+    return None
+
+
+def find_index(xs: list, pred: Callable) -> int:
+    """Find index of first element matching predicate. Returns -1 if not found."""
+    for i, x in enumerate(xs):
+        if pred(x):
+            return i
+    return -1
+
+
+def zip_with(xs: list, ys: list, fn: Callable) -> list:
+    """Zip two lists with a combining function. Stops at shorter list."""
+    return [fn(x, y) for x, y in zip(xs, ys)]
+
+
+def all_of(xs: list, pred: Callable) -> bool:
+    """Return True if all elements match predicate."""
+    return all(pred(x) for x in xs)
+
+
+def any_of(xs: list, pred: Callable) -> bool:
+    """Return True if any element matches predicate."""
+    return any(pred(x) for x in xs)
+
+
+def pipe(value: Any, *fns: Callable) -> Any:
+    """Pipe a value through a sequence of functions left-to-right."""
+    result = value
+    for fn in fns:
+        result = fn(result)
+    return result
+
+
+def compose(*fns: Callable) -> Callable:
+    """Compose functions right-to-left. compose(f, g)(x) == f(g(x))."""
+    def composed(x: Any) -> Any:
+        result = x
+        for fn in reversed(fns):
+            result = fn(result)
+        return result
+    return composed
+
+
+# ── Tree manipulation ────────────────────────────────────────────────────────
+
+
+def flatten_tree(structure: Any) -> List[Dict]:
+    """Flatten a hierarchical tree (dict with 'nodes') to a list without children."""
+    import copy
+    if isinstance(structure, dict):
+        node = copy.deepcopy(structure)
+        node.pop('nodes', None)
+        nodes = [node]
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                nodes.extend(flatten_tree(structure[key]))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(flatten_tree(item))
+        return nodes
+    return []
+
+
+def tree_to_flat_list(structure: Any) -> List[Dict]:
+    """Convert hierarchical tree to flat list preserving DFS order (keeps internal nodes)."""
+    if isinstance(structure, dict):
+        nodes = [structure]
+        if 'nodes' in structure:
+            nodes.extend(tree_to_flat_list(structure['nodes']))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(tree_to_flat_list(item))
+        return nodes
+    return []
+
+
+def get_leaf_nodes(structure: Any) -> List[Dict]:
+    """Extract only leaf nodes (no children) from a hierarchical tree."""
+    import copy
+    if isinstance(structure, dict):
+        if not structure.get('nodes'):
+            node = copy.deepcopy(structure)
+            node.pop('nodes', None)
+            return [node]
+        leaf_nodes = []
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                leaf_nodes.extend(get_leaf_nodes(structure[key]))
+        return leaf_nodes
+    elif isinstance(structure, list):
+        leaf_nodes = []
+        for item in structure:
+            leaf_nodes.extend(get_leaf_nodes(item))
+        return leaf_nodes
+    return []
+
+
+def write_node_ids(data: Any, node_id: int = 0) -> int:
+    """Assign sequential zero-padded IDs (0001, 0002...) to all nodes in a tree. Returns next counter."""
+    if isinstance(data, dict):
+        data['node_id'] = str(node_id).zfill(4)
+        node_id += 1
+        for key in list(data.keys()):
+            if 'nodes' in key:
+                node_id = write_node_ids(data[key], node_id)
+    elif isinstance(data, list):
+        for item in data:
+            node_id = write_node_ids(item, node_id)
+    return node_id
+
+
+def list_to_tree(data: List[Dict]) -> List[Dict]:
+    """Convert flat list with structure codes ('1.2.3') to nested tree."""
+    def get_parent_structure(structure):
+        if not structure:
+            return None
+        parts = str(structure).split('.')
+        return '.'.join(parts[:-1]) if len(parts) > 1 else None
+
+    nodes = {}
+    root_nodes = []
+
+    for item in data:
+        structure = item.get('structure')
+        node = {
+            'title': item.get('title'),
+            'start_index': item.get('start_index'),
+            'end_index': item.get('end_index'),
+            'nodes': []
+        }
+        nodes[structure] = node
+        parent_structure = get_parent_structure(structure)
+
+        if parent_structure and parent_structure in nodes:
+            nodes[parent_structure]['nodes'].append(node)
+        else:
+            root_nodes.append(node)
+
+    def clean_node(node):
+        if not node['nodes']:
+            del node['nodes']
+        else:
+            for child in node['nodes']:
+                clean_node(child)
+        return node
+
+    return [clean_node(node) for node in root_nodes]
+
+
+def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
+    """Recursively remove specified fields from a tree (dict/list)."""
+    if fields is None:
+        fields = ['text']
+    if isinstance(data, dict):
+        return {k: remove_tree_fields(v, fields) for k, v in data.items() if k not in fields}
+    elif isinstance(data, list):
+        return [remove_tree_fields(item, fields) for item in data]
+    return data
+
+
+def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
+    """Reorder fields of each node in a tree according to specified key order."""
+    if not order:
+        return structure
+    if isinstance(structure, dict):
+        if 'nodes' in structure:
+            structure['nodes'] = format_tree_structure(structure['nodes'], order)
+        if not structure.get('nodes'):
+            structure.pop('nodes', None)
+        return {key: structure[key] for key in order if key in structure}
+    elif isinstance(structure, list):
+        return [format_tree_structure(item, order) for item in structure]
+    return structure
+
+
+def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
+    """Create flat dict mapping node_id to node for O(1) lookup."""
+    mapping = {}
+    def _traverse(nodes):
+        for node in nodes:
+            if node.get('node_id'):
+                mapping[node['node_id']] = node
+            if node.get('nodes'):
+                _traverse(node['nodes'])
+    _traverse(tree)
+    return mapping
+
+
+# ── Text / JSON extraction ───────────────────────────────────────────────────
+
+
+def extract_json_from_llm(content: str) -> Dict:
+    """Extract and parse JSON from LLM responses. Handles ```json blocks, trailing commas, None->null."""
+    import json
+    try:
+        start_idx = content.find("```json")
+        if start_idx != -1:
+            start_idx += 7
+            end_idx = content.rfind("```")
+            json_content = content[start_idx:end_idx].strip()
+        else:
+            json_content = content.strip()
+
+        json_content = json_content.replace('None', 'null')
+        json_content = json_content.replace('\n', ' ').replace('\r', ' ')
+        json_content = ' '.join(json_content.split())
+
+        return json.loads(json_content)
+    except (json.JSONDecodeError, Exception):
+        try:
+            json_content = json_content.replace(',]', ']').replace(',}', '}')
+            return json.loads(json_content)
+        except Exception:
+            return {}
+
+
+def parse_page_range(pages: str) -> List[int]:
+    """Parse page range string ('5-7', '3,8', '12') into sorted list of unique ints."""
+    result = []
+    for part in pages.split(','):
+        part = part.strip()
+        if '-' in part:
+            start, end = int(part.split('-', 1)[0].strip()), int(part.split('-', 1)[1].strip())
+            if start > end:
+                raise ValueError(f"Invalid range '{part}': start must be <= end")
+            result.extend(range(start, end + 1))
+        else:
+            result.append(int(part))
+    return sorted(set(result))
+
+
+# ── Markdown parsing ─────────────────────────────────────────────────────────
+
+
+def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
+    """Extract all headers (h1-h6) from markdown with line numbers, skipping code blocks."""
+    import re
+    header_pattern = r'^(#{1,6})\s+(.+)$'
+    code_block_pattern = r'^```'
+    node_list = []
+    lines = markdown_content.split('\n')
+    in_code_block = False
+
+    for line_num, line in enumerate(lines, 1):
+        stripped_line = line.strip()
+        if re.match(code_block_pattern, stripped_line):
+            in_code_block = not in_code_block
+            continue
+        if not stripped_line:
+            continue
+        if not in_code_block:
+            match = re.match(header_pattern, stripped_line)
+            if match:
+                level = len(match.group(1))
+                title = match.group(2).strip()
+                node_list.append({'title': title, 'level': level, 'line_num': line_num})
+
+    return node_list, lines
+
+
+def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
+    """Build nested tree from flat list of headers with levels (h1>h2>h3)."""
+    if not node_list:
+        return []
+
+    stack = []
+    root_nodes = []
+    node_counter = 1
+
+    for node in node_list:
+        current_level = node['level']
+        tree_node = {
+            'title': node['title'],
+            'node_id': str(node_counter).zfill(4),
+            'line_num': node['line_num'],
+            'nodes': []
+        }
+        node_counter += 1
+
+        while stack and stack[-1][1] >= current_level:
+            stack.pop()
+
+        if not stack:
+            root_nodes.append(tree_node)
+        else:
+            parent_node, _ = stack[-1]
+            parent_node['nodes'].append(tree_node)
+
+        stack.append((tree_node, current_level))
+
+    def clean_empty_nodes(nodes):
+        for n in nodes:
+            if n['nodes']:
+                clean_empty_nodes(n['nodes'])
+            else:
+                del n['nodes']
+        return nodes
+
+    return clean_empty_nodes(root_nodes)
+
+
+# ── Pagination / chunking ────────────────────────────────────────────────────
+
+
+def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
+                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
+    """Group pages into text chunks respecting token limit with configurable overlap."""
+    import math
+    num_tokens = sum(token_lengths)
+
+    if num_tokens <= max_tokens:
+        return ["".join(page_contents)]
+
+    subsets = []
+    current_subset = []
+    current_token_count = 0
+
+    expected_parts = math.ceil(num_tokens / max_tokens)
+    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
+
+    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
+        if current_token_count + page_tokens > avg_tokens:
+            subsets.append(''.join(current_subset))
+            overlap_start = max(i - overlap_pages, 0)
+            current_subset = list(page_contents[overlap_start:i])
+            current_token_count = sum(token_lengths[overlap_start:i])
+
+        current_subset.append(page_content)
+        current_token_count += page_tokens
+
+    if current_subset:
+        subsets.append(''.join(current_subset))
+
+    return subsets
+
+
+def calculate_page_offset(pairs: List[Dict]) -> int:
+    """Calculate offset between logical page numbers and physical indices using reference pairs."""
+    differences = []
+    for pair in pairs:
+        try:
+            difference = pair['physical_index'] - pair['page']
+            differences.append(difference)
+        except (KeyError, TypeError):
+            continue
+
+    if not differences:
+        return 0
+
+    counts: Dict[int, int] = {}
+    for diff in differences:
+        counts[diff] = counts.get(diff, 0) + 1
+
+    return max(counts.items(), key=lambda x: x[1])[0]
+
+
+# ── Text preprocessing ───────────────────────────────────────────────────────
+
+
+def preprocess_text(text: str) -> str:
+    """Normalize whitespace and newlines in raw text.
+
+    Args:
+        text: Raw text to normalize.
+
+    Returns:
+        Normalized text with consistent newlines, stripped lines, and no
+        excessive blank lines.
+    """
+    # Normalize line endings: \r\n and \r -> \n
+    text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Reduce 3+ consecutive newlines to at most 2
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Strip whitespace from each line
+    text = '\n'.join(line.strip() for line in text.split('\n'))
+    # Strip globally
+    return text.strip()
+
+
+def get_text_stats(text: str) -> dict:
+    """Compute basic statistics of a text: characters, lines, words.
+
+    Args:
+        text: Input text to analyze.
+
+    Returns:
+        Dict with keys total_chars (int), total_lines (int), total_words (int).
+    """
+    return {
+        'total_chars': len(text),
+        'total_lines': text.count('\n') + 1,
+        'total_words': len(text.split()),
+    }
+
+
+# ── Git URL parsing ──────────────────────────────────────────────────────────
+
+# Hostnames accepted by the git URL helpers when the caller passes no
+# ``known_hosts`` list.
+_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
+
+
+def _sanitize_git_segment(segment: str) -> str:
+    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
+    if segment.endswith(".git"):
+        segment = segment[:-4]
+    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
+
+
+def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
+    """Parse a code-hosting URL and return the 'org/repo' path component.
+
+    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
+    Returns None if the URL does not match any known host or is malformed.
+
+    Args:
+        url: Repository URL in any supported format.
+        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        'org/repo' string or None.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    if url.startswith("git@"):
+        # git@github.com:org/repo.git
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return None
+        host, path = rest.split(":", 1)
+        if host not in hosts:
+            return None
+        segments = [s for s in path.split("/") if s]
+        if len(segments) < 2:
+            return None
+        org = _sanitize_git_segment(segments[0])
+        repo = _sanitize_git_segment(segments[1])
+        if not org or not repo:
+            return None
+        return f"{org}/{repo}"
+
+    for prefix in ("http://", "https://", "git://", "ssh://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            netloc = parsed.hostname or ""
+            if netloc not in hosts:
+                return None
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) < 2:
+                return None
+            org = _sanitize_git_segment(segments[0])
+            repo = _sanitize_git_segment(segments[1])
+            if not org or not repo:
+                return None
+            return f"{org}/{repo}"
+
+    return None
+
+
+def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
+    """Return True only if url points to a clonable git repository.
+
+    Accepts org/repo and org/repo/tree/<ref> paths.
+    Rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
+
+    Args:
+        url: URL to verify.
+        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        True if url is a clonable repository URL.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    # SSH shorthand — always repo-level if host matches
+    if url.startswith("git@"):
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return False
+        host, _ = rest.split(":", 1)
+        return host in hosts
+
+    # git:// and ssh:// — always repo-level if host matches
+    for prefix in ("ssh://", "git://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            return (parsed.hostname or "") in hosts
+
+    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
+    for prefix in ("http://", "https://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            if (parsed.hostname or "") not in hosts:
+                return False
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) == 2:
+                return True
+            if len(segments) == 4 and segments[2] == "tree":
+                return True
+            return False
+
+    return False
+
+
+def validate_git_ssh_uri(url: str) -> None:
+    """Validate a git SSH URI of the form git@host:path.
+
+    Raises ValueError with a descriptive message if the URI is malformed.
+
+    Args:
+        url: URI string to validate.
+
+    Raises:
+        ValueError: If the URI does not conform to git SSH format.
+    """
+    if not url.startswith("git@"):
+        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
+    rest = url[len("git@"):]
+    if ":" not in rest:
+        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
+    _, path = rest.split(":", 1)
+    if not path:
+        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
+
+
+# ---------------------------------------------------------------------------
+# Markdown parsing utilities
+# ---------------------------------------------------------------------------
+
+
+def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
+    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.
+
+    Args:
+        content: Raw markdown string, optionally starting with YAML frontmatter.
+
+    Returns:
+        Tuple of (content_without_frontmatter, frontmatter_dict).
+        frontmatter_dict is None when no frontmatter is found.
+    """
+    pattern = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
+    match = pattern.match(content)
+    if not match:
+        return content, None
+
+    raw = match.group(1)
+    remaining = content[match.end():]
+
+    try:
+        import yaml  # type: ignore
+        data = yaml.safe_load(raw)
+        if not isinstance(data, dict):
+            data = None
+    except Exception:
+        # Fallback: simple key: value parser (no yaml dependency)
+        data = {}
+        for line in raw.splitlines():
+            if ':' in line:
+                key, _, value = line.partition(':')
+                data[key.strip()] = value.strip()
+
+    return remaining, data
+
+
+def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
+    """Find all markdown headings (# to ######), excluding those inside code blocks,
+    HTML comments, and indented blocks.
+
+    Args:
+        content: Markdown text to search.
+
+    Returns:
+        List of (start_pos, end_pos, title, level) for each heading found.
+    """
+    excluded: List[Tuple[int, int]] = []
+
+    # Code blocks (triple backtick)
+    for m in re.finditer(r'```.*?```', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # HTML comments
+    for m in re.finditer(r'<!--.*?-->', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # Indented blocks (lines starting with 4 spaces or a tab)
+    for m in re.finditer(r'^(    |\t).+$', content, re.MULTILINE):
+        excluded.append((m.start(), m.end()))
+
+    def is_excluded(pos: int) -> bool:
+        return any(start <= pos < end for start, end in excluded)
+
+    results: List[Tuple[int, int, str, int]] = []
+    for m in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
+        # Skip escaped headings (\#)
+        before = content[m.start() - 1] if m.start() > 0 else ''
+        if before == '\\':
+            continue
+        if is_excluded(m.start()):
+            continue
+        level = len(m.group(1))
+        title = m.group(2).strip()
+        results.append((m.start(), m.end(), title, level))
+
+    return results
+
+
+def estimate_token_count(content: str) -> int:
+    """Estimate token count without a tokenizer.
+
+    CJK characters count as ~0.7 tokens each; other non-whitespace characters
+    count as ~0.3 tokens each.
+
+    Args:
+        content: Text to estimate.
+
+    Returns:
+        Estimated integer token count.
+    """
+    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content)
+    without_cjk = re.sub(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', '', content)
+    others = re.findall(r'\S', without_cjk)
+    return int(len(cjk) * 0.7 + len(others) * 0.3)
+
+
+def smart_split_content(
+    content: str,
+    max_tokens: int = 1024,
+    max_chars: int = 8000,
+) -> List[str]:
+    """Split large content into parts respecting token and character limits.
+
+    Splits by paragraphs (double newline). If a single paragraph exceeds the
+    limit it is force-cut into chunks of max_chars.
+
+    Args:
+        content: Text to split.
+        max_tokens: Maximum estimated tokens per part.
+        max_chars: Maximum characters per part.
+
+    Returns:
+        List of string parts.
+    """
+    paragraphs = content.split('\n\n')
+    parts: List[str] = []
+    current_parts: List[str] = []
+    current_tokens = 0
+    current_chars = 0
+
+    def flush() -> None:
+        if current_parts:
+            parts.append('\n\n'.join(current_parts))
+            current_parts.clear()
+
+    for para in paragraphs:
+        para_tokens = estimate_token_count(para)
+        para_chars = len(para)
+
+        # Single paragraph exceeds limits — force-cut it
+        if para_tokens > max_tokens or para_chars > max_chars:
+            flush()
+            current_tokens = 0
+            current_chars = 0
+            for i in range(0, len(para), max_chars):
+                parts.append(para[i:i + max_chars])
+            continue
+
+        # Would exceed limits if added — flush first
+        if (current_tokens + para_tokens > max_tokens or
+                current_chars + para_chars > max_chars):
+            flush()
+            current_tokens = 0
+            current_chars = 0
+
+        current_parts.append(para)
+        current_tokens += para_tokens
+        current_chars += para_chars
+
+    flush()
+    return parts if parts else [content]
+
+
+def sanitize_for_path(text: str, max_length: int = 50) -> str:
+    """Convert text to a safe string for use in file paths.
+
+    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
+    with underscores. Truncates with a sha256 suffix if the result exceeds
+    max_length.
+
+    Args:
+        text: Input text to sanitize.
+        max_length: Maximum length of the returned string.
+
+    Returns:
+        Safe path-friendly string.
+    """
+    cleaned = re.sub(
+        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
+        '',
+        text,
+    )
+    cleaned = cleaned.replace(' ', '_').strip('_')
+
+    if not cleaned:
+        return 'section'
+
+    if len(cleaned) <= max_length:
+        return cleaned
+
+    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
+    return cleaned[:max_length - len(suffix)] + suffix
@@ -0,0 +1,283 @@
+"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
+
+from __future__ import annotations
+
+import sys
+import os
+import uuid
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from entity_candidate import EntityCandidate
+from deduplication_result import DeduplicationResult
+from normalize_entity_name import normalize_entity_name
+from merge_entity_attributes import merge_entity_attributes
+
+
+# ── Similitud helpers ──────────────────────────────────────────────────────────
+
+def _levenshtein(a: str, b: str) -> int:
+    """Distancia de edicion Levenshtein entre dos strings."""
+    if a == b:
+        return 0
+    if not a:
+        return len(b)
+    if not b:
+        return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        curr = [i]
+        for j, cb in enumerate(b, 1):
+            cost = 0 if ca == cb else 1
+            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
+        prev = curr
+    return prev[-1]
+
+
+def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
+    """Similitud de Jaccard entre dos conjuntos de tokens."""
+    set_a = set(tokens_a)
+    set_b = set(tokens_b)
+    if not set_a and not set_b:
+        return 1.0
+    inter = len(set_a & set_b)
+    union = len(set_a | set_b)
+    return inter / union if union else 0.0
+
+
+def _name_similarity(a: str, b: str) -> float:
+    """Score de similitud entre dos nombres normalizados.
+
+    Combina similitud de Levenshtein y Jaccard sobre tokens.
+    Aplica bonus de contencion (+0.3) y deteccion de acronimos.
+    """
+    if a == b:
+        return 1.0
+
+    # Similitud Levenshtein
+    max_len = max(len(a), len(b))
+    lev_sim = 1.0 - (_levenshtein(a, b) / max_len) if max_len else 1.0
+
+    # Similitud Jaccard sobre tokens
+    tokens_a = a.split()
+    tokens_b = b.split()
+    jac_sim = _jaccard(tokens_a, tokens_b)
+
+    score = max(lev_sim, jac_sim)
+
+    # Bonus de contencion: un nombre contiene al otro
+    if a in b or b in a:
+        score = min(1.0, score + 0.3)
+
+    # Deteccion de acronimo: "FBI" ~ "Federal Bureau of Investigation"
+    if _is_acronym_of(a, tokens_b) or _is_acronym_of(b, tokens_a):
+        score = min(1.0, score + 0.3)
+
+    return score
+
+
+def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
+    """Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
+    if not candidate or not tokens:
+        return False
+    initials = "".join(t[0] for t in tokens if t).upper()
+    return candidate.upper() == initials
+
+
+_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
+
+
+def _is_exact_type(entity_type: str) -> bool:
+    """Tipos tecnicos donde solo se acepta matching exacto."""
+    return entity_type.lower() in _EXACT_TYPES
+
+
+# ── Union-Find ─────────────────────────────────────────────────────────────────
+
+class _UnionFind:
+    def __init__(self, n: int) -> None:
+        self._parent = list(range(n))
+        self._rank = [0] * n
+
+    def find(self, x: int) -> int:
+        while self._parent[x] != x:
+            self._parent[x] = self._parent[self._parent[x]]
+            x = self._parent[x]
+        return x
+
+    def union(self, x: int, y: int) -> None:
+        rx, ry = self.find(x), self.find(y)
+        if rx == ry:
+            return
+        if self._rank[rx] < self._rank[ry]:
+            rx, ry = ry, rx
+        self._parent[ry] = rx
+        if self._rank[rx] == self._rank[ry]:
+            self._rank[rx] += 1
+
+
+# ── Implementacion principal ────────────────────────────────────────────────────
+
+def deduplicate_entities(
+    candidates: list[EntityCandidate],
+    name_threshold: float = 0.85,
+    same_type_only: bool = True,
+) -> DeduplicationResult:
+    """Group candidate entities that refer to the same real-world entity.
+
+    Names are normalized with ``normalize_entity_name`` and compared pairwise
+    with ``_name_similarity`` (Levenshtein + Jaccard). Matching pairs are
+    joined in a Union-Find so that clusters are transitive. Each cluster
+    yields one canonical entity whose attributes are merged from all members
+    via ``merge_entity_attributes``.
+
+    For technical types (ip, email, domain, crypto_wallet, phone) only an
+    exact match of the normalized names is accepted and the name threshold
+    is ignored. The type consulted for this decision is that of the first
+    candidate of each pair.
+
+    Args:
+        candidates: List of EntityCandidate to deduplicate.
+        name_threshold: Minimum score (0-1) for two names to be considered equal.
+        same_type_only: If True, only candidates with the same type_ref are compared.
+
+    Returns:
+        DeduplicationResult with the deduplicated entities, the resolution
+        maps (normalized canonical name -> id, original names -> id), the
+        merge log, and the candidate counts before and after.
+    """
+    if not candidates:
+        return DeduplicationResult(
+            entities=[],
+            entity_id_map={},
+            name_to_id={},
+            merge_log=[],
+            total_before=0,
+            total_after=0,
+        )
+
+    n = len(candidates)
+
+    # Step 1: normalize names (index-aligned with ``candidates``)
+    normalized: list[str] = []
+    for c in candidates:
+        norm = normalize_entity_name(c.name, c.type_ref)
+        normalized.append(norm)
+
+    # Step 2: Union-Find over all candidate indices
+    uf = _UnionFind(n)
+
+    # Step 3: pairwise comparison (pairs of different type are skipped when
+    # same_type_only is set)
+    merge_pairs: list[tuple[int, int, float]] = []
+
+    for i in range(n):
+        for j in range(i + 1, n):
+            if same_type_only and candidates[i].type_ref != candidates[j].type_ref:
+                continue
+
+            ni, nj = normalized[i], normalized[j]
+            et = candidates[i].type_ref.lower()
+
+            # Technical types: exact match only, never fuzzy.
+            if _is_exact_type(et):
+                if ni == nj:
+                    uf.union(i, j)
+                    merge_pairs.append((i, j, 1.0))
+                continue
+
+            score = _name_similarity(ni, nj)
+            if score >= name_threshold:
+                uf.union(i, j)
+                merge_pairs.append((i, j, score))
+
+    # Step 4: group indices by their Union-Find root
+    clusters: dict[int, list[int]] = {}
+    for i in range(n):
+        root = uf.find(i)
+        clusters.setdefault(root, []).append(i)
+
+    # Step 5: merge each cluster
+    merged_entities: list[EntityCandidate] = []
+    entity_id_map: dict[str, str] = {}
+    name_to_id: dict[str, str] = {}
+    merge_log: list[dict] = []
+
+    # Matched pairs grouped by final root, used to build the log
+    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
+    for i, j, score in merge_pairs:
+        root = uf.find(i)
+        merged_pairs_by_root.setdefault(root, []).append((i, j, score))
+
+    for root, indices in clusters.items():
+        cluster_candidates = [candidates[idx] for idx in indices]
+
+        if len(cluster_candidates) == 1:
+            # Singleton: carried over as-is (attributes are not copied).
+            c = cluster_candidates[0]
+            canonical_name = c.name
+            canonical_norm = normalized[indices[0]]
+            merged_attrs = c.attributes
+            merged_confidence = c.confidence
+            merged_chunks = list(c.source_chunk_indices)
+            merged_from = list(c.merged_from) if c.merged_from else [c.name]
+        else:
+            # The candidate with the highest confidence is the canonical one
+            best = max(cluster_candidates, key=lambda c: c.confidence)
+            canonical_name = best.name
+            canonical_norm = normalize_entity_name(best.name, best.type_ref)
+
+            merged_attrs = merge_entity_attributes(
+                [c.attributes for c in cluster_candidates]
+            )
+            merged_confidence = max(c.confidence for c in cluster_candidates)
+
+            # Union of source chunk indices, first-seen order, no duplicates
+            merged_chunks: list[int] = []
+            seen_chunks: set[int] = set()
+            for c in cluster_candidates:
+                for idx in c.source_chunk_indices:
+                    if idx not in seen_chunks:
+                        merged_chunks.append(idx)
+                        seen_chunks.add(idx)
+
+            # Union of original names, first-seen order, no duplicates; a
+            # member that already has a merged_from contributes that list.
+            merged_from: list[str] = []
+            seen_names: set[str] = set()
+            for c in cluster_candidates:
+                names_to_add = c.merged_from if c.merged_from else [c.name]
+                for nm in names_to_add:
+                    if nm not in seen_names:
+                        merged_from.append(nm)
+                        seen_names.add(nm)
+
+            # Merge log entry; the score recorded is the highest pair score
+            # within the cluster.
+            other_names = [c.name for c in cluster_candidates if c is not best]
+            pairs = merged_pairs_by_root.get(root, [])
+            max_score = max((s for _, _, s in pairs), default=1.0)
+            merge_log.append(
+                {
+                    "canonical": canonical_name,
+                    "merged": other_names,
+                    "score": round(max_score, 4),
+                    "reason": "fuzzy_name",
+                }
+            )
+
+        # A fresh random id per resulting entity; it is stored only in the
+        # resolution maps below, not on the EntityCandidate itself.
+        ent_id = str(uuid.uuid4())
+        # NOTE(review): type_ref/type_label come from the first cluster member,
+        # not from ``best`` — confirm this is intended when same_type_only=False.
+        entity = EntityCandidate(
+            name=canonical_name,
+            name_normalized=canonical_norm,
+            type_ref=cluster_candidates[0].type_ref,
+            type_label=cluster_candidates[0].type_label,
+            attributes=merged_attrs,
+            confidence=merged_confidence,
+            source_chunk_indices=merged_chunks,
+            merged_from=merged_from,
+        )
+        merged_entities.append(entity)
+
+        # Populate the resolution maps
+        entity_id_map[canonical_norm] = ent_id
+        for orig_name in merged_from:
+            name_to_id[orig_name] = ent_id
+        name_to_id[canonical_norm] = ent_id
+
+    return DeduplicationResult(
+        entities=merged_entities,
+        entity_id_map=entity_id_map,
+        name_to_id=name_to_id,
+        merge_log=merge_log,
+        total_before=n,
+        total_after=len(merged_entities),
+    )
@@ -0,0 +1,189 @@
+"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
+
+import logging
+import os
+import sys
+
+logger = logging.getLogger(__name__)
+
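+# --- levenshtein_distance: local fallback first, then optional import ---
+# The fallback implementation is defined right below; after it, the shared
+# version from cybersecurity is imported when available. Two contexts are
+# supported:
+#   1. Run from python/functions/datascience/ (local pytest)
+#   2. Run from the registry root (fn run)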
+def _levenshtein_distance(a: str, b: str) -> int:
+    """Return the Levenshtein edit distance between two strings.
+
+    Two-row dynamic programme: only the previous row of the edit matrix is
+    kept while the next one is being built.
+
+    Args:
+        a: first string.
+        b: second string.
+
+    Returns:
+        Minimum number of single-character insertions, deletions and
+        substitutions needed to turn one string into the other.
+    """
+    # Walk the longer string in the outer loop so rows have the shorter width.
+    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
+    if not shorter:
+        return len(longer)
+
+    previous = list(range(len(shorter) + 1))
+    for row, ch_long in enumerate(longer, start=1):
+        current = [row]
+        for col, ch_short in enumerate(shorter, start=1):
+            substitution = previous[col - 1] + (ch_long != ch_short)
+            insertion = current[col - 1] + 1
+            deletion = previous[col] + 1
+            current.append(min(insertion, deletion, substitution))
+        previous = current
+    return previous[-1]
+
+
+# Prefer the shared implementation when it can be imported; otherwise fall
+# back to the local _levenshtein_distance.
+try:
+    _here = os.path.dirname(os.path.abspath(__file__))
+    # Sibling "cybersecurity" directory, resolved relative to this file.
+    _cyber_path = os.path.join(_here, "..", "cybersecurity")
+    if _cyber_path not in sys.path:
+        sys.path.insert(0, _cyber_path)
+    # NOTE(review): this imports a module named "cybersecurity" found on
+    # sys.path; confirm that _cyber_path (and not its parent directory) is the
+    # entry that makes it importable.
+    from cybersecurity import levenshtein_distance as _lev
+except ImportError:
+    _lev = None  # type: ignore
+
+# Name used by the rest of this module: the imported function when available,
+# the local fallback otherwise.
+levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
+
+
+def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
+    """Resolve a name against the map keys by fuzzy (edit-distance) matching.
+
+    Scans every key of ``entity_id_map`` and keeps the closest one according
+    to ``levenshtein_distance``. On ties the first key in map order wins.
+
+    A candidate is only accepted when its distance is <= ``threshold`` AND
+    strictly smaller than the length of the longer string. Without the second
+    condition an empty or very short name would "match" any short key (for
+    example ``""`` vs ``"abc"`` is distance 3), linking relations to unrelated
+    entities.
+
+    Args:
+        name: name to resolve (already lowercased and stripped).
+        entity_id_map: normalized name -> entity_id.
+        threshold: maximum edit distance accepted as a match (default 3).
+
+    Returns:
+        entity_id of the best match, or '' when there is no acceptable match.
+    """
+    if not name:
+        # Nothing to compare: never resolve an empty name to an entity.
+        return ""
+
+    best_id = ""
+    best_dist = threshold + 1
+    for key, entity_id in entity_id_map.items():
+        # The length difference is a lower bound on the edit distance, so keys
+        # that cannot beat the current best are skipped without computing it.
+        if abs(len(name) - len(key)) >= best_dist:
+            continue
+        dist = levenshtein_distance(name, key)
+        # dist == longer length means the whole string had to be rewritten;
+        # that is not a match, whatever the threshold says.
+        if dist < best_dist and dist < max(len(name), len(key)):
+            best_dist = dist
+            best_id = entity_id
+    # best_id is only ever set for distances <= threshold.
+    return best_id
+
+
+def deduplicate_relations(
+    relations: list,
+    entity_id_map: dict[str, str],
+) -> list:
+    """Deduplicate candidate relations, resolving endpoint names to entity IDs.
+
+    Algorithm:
+    1. Resolve ``from_name`` and ``to_name`` of every RelationCandidate to an
+       entity_id: case-insensitive exact lookup first (both the query and the
+       map keys are lowercased and stripped), then fuzzy match through
+       ``_fuzzy_resolve``. Relations with an unresolvable endpoint are dropped
+       with a warning.
+    2. Drop self-loops (from_id == to_id).
+    3. Collapse relations sharing (from_id, to_id, relation_type):
+       - description: unique non-empty descriptions joined with '; '
+       - confidence: maximum of the group
+       - names and source_chunk_index: taken from the first relation
+    4. Return the cleaned list with from_id and to_id populated.
+
+    Args:
+        relations: RelationCandidate objects carrying the original
+            from_name/to_name.
+        entity_id_map: name -> entity_id (output of deduplicate_entities).
+            Keys are normalized here, so original-case keys are accepted.
+
+    Returns:
+        Deduplicated list of RelationCandidate with from_id and to_id resolved.
+
+    Raises:
+        ImportError: if the ``relation_candidate`` module cannot be imported.
+    """
+    # Lazy import so the function works both from datascience/ and from the
+    # registry root: the types directory is put on sys.path on first use.
+    _types_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "..", "..", "..", "python", "types", "datascience",
+    )
+    if _types_path not in sys.path:
+        sys.path.insert(0, _types_path)
+    from relation_candidate import RelationCandidate
+
+    # Normalize the keys exactly like the queries, otherwise the "exact"
+    # lookup silently misses every key that is not already lowercase.
+    # The first key wins when two keys normalize to the same string.
+    normalized_map: dict[str, str] = {}
+    for raw_key, entity_id in entity_id_map.items():
+        normalized_map.setdefault(raw_key.lower().strip(), entity_id)
+
+    # The same endpoint shows up in many relations and the fuzzy scan is linear
+    # in the number of keys, so each distinct name is resolved only once.
+    resolution_cache: dict[str, str] = {}
+
+    def _resolve(raw_name: str) -> str:
+        """Return the entity_id for raw_name, or '' when it cannot be resolved."""
+        key = raw_name.lower().strip()
+        if key not in resolution_cache:
+            resolution_cache[key] = normalized_map.get(key) or _fuzzy_resolve(
+                key, normalized_map
+            )
+        return resolution_cache[key]
+
+    resolved: list = []
+    for rel in relations:
+        from_id = _resolve(rel.from_name)
+        if not from_id:
+            logger.warning(
+                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
+                rel.from_name,
+            )
+            continue
+
+        to_id = _resolve(rel.to_name)
+        if not to_id:
+            logger.warning(
+                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
+                rel.to_name,
+            )
+            continue
+
+        # Both names point to the same final entity: not a real relation.
+        if from_id == to_id:
+            logger.debug(
+                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
+                rel.from_name,
+                rel.to_name,
+                rel.relation_type,
+            )
+            continue
+
+        resolved.append(
+            RelationCandidate(
+                from_name=rel.from_name,
+                to_name=rel.to_name,
+                from_id=from_id,
+                to_id=to_id,
+                relation_type=rel.relation_type,
+                description=rel.description,
+                confidence=rel.confidence,
+                source_chunk_index=rel.source_chunk_index,
+            )
+        )
+
+    # Group by (from_id, to_id, relation_type); dict order keeps first-seen order.
+    groups: dict[tuple, list] = {}
+    for rel in resolved:
+        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)
+
+    result: list = []
+    for (from_id, to_id, rel_type), group in groups.items():
+        first = group[0]
+        if len(group) == 1:
+            result.append(first)
+            continue
+
+        # dict.fromkeys de-duplicates while preserving first-seen order.
+        descriptions = list(dict.fromkeys(r.description for r in group if r.description))
+
+        result.append(
+            RelationCandidate(
+                from_name=first.from_name,
+                to_name=first.to_name,
+                from_id=from_id,
+                to_id=to_id,
+                relation_type=rel_type,
+                description="; ".join(descriptions),
+                confidence=max(r.confidence for r in group),
+                source_chunk_index=first.source_chunk_index,
+            )
+        )
+
+    return result
@@ -0,0 +1,22 @@
+"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
+
+from dataclasses import dataclass, field
+
+from entity_candidate import EntityCandidate
+
+
+@dataclass
+class DeduplicationResult:
+    """Result of the entity deduplication step.
+
+    ``name_to_id`` maps ALL original names (including the merged ones) to
+    their final ID, so relations that use any variant of a name can still be
+    resolved.
+    """
+
+    # Entities left after merging duplicates.
+    entities: list[EntityCandidate]
+    # Normalized canonical name -> entity ID.
+    entity_id_map: dict[str, str]
+    # Every original name variant, plus the normalized canonical name -> entity ID.
+    name_to_id: dict[str, str]
+    # One dict per merge performed (canonical name, merged names, score, reason).
+    merge_log: list[dict] = field(default_factory=list)
+    # Count before deduplication (presumably the number of input candidates).
+    total_before: int = 0
+    # Number of entities after deduplication.
+    total_after: int = 0
@@ -0,0 +1,34 @@
+"""EntityCandidate — candidato de entidad extraido por el LLM."""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EntityCandidate:
+    """Entity candidate extracted by the LLM.
+
+    It can come from a single chunk or be the result of merging several
+    extractions. ``merged_from`` keeps the original names for debugging.
+    """
+
+    # Name as extracted (or the canonical name after a merge).
+    name: str
+    # Normalized form of the name used for matching; empty until computed.
+    name_normalized: str = ""
+    # Schema type reference and its human-readable label.
+    type_ref: str = ""
+    type_label: str = ""
+    # Free-form attributes reported for the entity.
+    attributes: dict = field(default_factory=dict)
+    # Extraction confidence.
+    confidence: float = 0.0
+    # Indices of the chunks where the entity was found.
+    source_chunk_indices: list[int] = field(default_factory=list)
+    # Original names this candidate was merged from.
+    merged_from: list[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        """Serialize the candidate to a plain dictionary.
+
+        The mutable members are copied, so changing the returned dictionary
+        (or anything nested inside ``attributes``) never alters this candidate.
+
+        Returns:
+            Dict with one key per field, in declaration order.
+        """
+        import copy
+
+        return {
+            "name": self.name,
+            "name_normalized": self.name_normalized,
+            "type_ref": self.type_ref,
+            "type_label": self.type_label,
+            # Deep copy: attribute values may themselves be lists or dicts.
+            "attributes": copy.deepcopy(self.attributes),
+            "confidence": self.confidence,
+            "source_chunk_indices": list(self.source_chunk_indices),
+            "merged_from": list(self.merged_from),
+        }
@@ -0,0 +1,145 @@
+"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
+
+import sys
+import os
+import warnings
+from typing import Callable
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from entity_candidate import EntityCandidate
+
+
+def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
+    """Build the system prompt used for entity extraction.
+
+    The prompt has three parts: a fixed task description, one bullet per
+    entity type (followed by its metadata fields when it has any) and a fixed
+    block with the output format and the rules, ending with the language
+    instruction.
+
+    Args:
+        entity_schema: dicts with 'label', 'type_ref' and optionally
+            'metadata_fields'. A missing label is rendered as "Unknown".
+        language_instruction: sentence appended as the last rule.
+
+    Returns:
+        The prompt as a single newline-joined string.
+    """
+    header = [
+        "You are an entity extraction expert. Given text, extract all entities",
+        "matching these types. For each entity, provide: name, type_ref,",
+        "attributes (matching the metadata_fields for that type), and a",
+        "confidence score (0.0-1.0).",
+        "",
+        "Entity types:",
+    ]
+
+    type_lines: list[str] = []
+    for entry in entity_schema:
+        type_lines.append(
+            f"- {entry.get('label', 'Unknown')} (type_ref: {entry.get('type_ref', '')})"
+        )
+        fields = entry.get("metadata_fields", [])
+        if fields:
+            type_lines.append(f"  fields: {', '.join(fields)}")
+
+    footer = [
+        "",
+        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
+        "",
+        "Rules:",
+        "- Only extract entities explicitly mentioned in the text",
+        "- Use the exact type_ref from the schema",
+        "- Leave unknown attributes as null",
+        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
+        f"- {language_instruction}",
+    ]
+
+    return "\n".join([*header, *type_lines, *footer])
+
+
+def extract_entities_llm(
+    text: str,
+    entity_schema: list[dict],
+    llm_chat_json: Callable[[list[dict]], dict],
+    language_instruction: str = "Respond in English.",
+) -> list[EntityCandidate]:
+    """Extract entities from a text chunk using an injected LLM.
+
+    Builds a system prompt from the entity type schema, calls the LLM and
+    validates the answer item by item. Malformed items are skipped instead of
+    aborting the whole chunk.
+
+    Args:
+        text: text chunk to analyse.
+        entity_schema: entity types with their metadata fields. Each entry is
+            a dict with 'type_ref', 'label' and optionally 'metadata_fields',
+            e.g. [{"type_ref": "osint_person_go_cybersecurity",
+                   "label": "Person",
+                   "metadata_fields": ["full_name", "alias"]}]
+        llm_chat_json: function that receives OpenAI-style messages and
+            returns the parsed JSON answer as a dict:
+            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
+        language_instruction: language instruction appended to the prompt.
+
+    Returns:
+        List of validated EntityCandidate. Empty when the LLM call fails, the
+        answer is not a dict with an 'entities' list, or nothing is found.
+
+    Raises:
+        ValueError: if entity_schema is empty.
+    """
+    if not entity_schema:
+        raise ValueError("entity_schema no puede estar vacio")
+
+    type_ref_to_label = {
+        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
+    }
+    valid_type_refs = set(type_ref_to_label)
+
+    messages = [
+        {"role": "system", "content": _build_system_prompt(entity_schema, language_instruction)},
+        {"role": "user", "content": text},
+    ]
+
+    try:
+        response = llm_chat_json(messages)
+    except Exception as exc:
+        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
+        return []
+
+    # The callable is external: it may hand back None, a list or a string when
+    # the model does not produce a JSON object.
+    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
+    if not isinstance(raw_entities, list):
+        warnings.warn(
+            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
+            stacklevel=2,
+        )
+        return []
+
+    candidates: list[EntityCandidate] = []
+    for item in raw_entities:
+        if not isinstance(item, dict):
+            continue
+
+        name = item.get("name", "")
+        # Downstream code calls str methods on the name, so reject non-strings
+        # and names that are only whitespace.
+        if not isinstance(name, str) or not name.strip():
+            continue
+        name = name.strip()
+
+        type_ref = item.get("type_ref", "")
+        # The isinstance check also protects the set lookup from unhashable
+        # values such as a list.
+        if not isinstance(type_ref, str) or type_ref not in valid_type_refs:
+            warnings.warn(
+                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
+                stacklevel=2,
+            )
+            continue
+
+        attributes = item.get("attributes", {})
+        if not isinstance(attributes, dict):
+            attributes = {}
+        # Drop attributes the model left as null.
+        attributes = {k: v for k, v in attributes.items() if v is not None}
+
+        confidence = item.get("confidence", 0.0)
+        # NaN compares unequal to itself; without this check the clamp below
+        # would turn NaN into 1.0.
+        if not isinstance(confidence, (int, float)) or confidence != confidence:
+            confidence = 0.0
+        confidence = float(max(0.0, min(1.0, confidence)))
+
+        candidates.append(
+            EntityCandidate(
+                name=name,
+                type_ref=type_ref,
+                type_label=type_ref_to_label.get(type_ref, ""),
+                attributes=attributes,
+                confidence=confidence,
+            )
+        )
+
+    return candidates
@@ -0,0 +1,141 @@
+"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
+
+import logging
+import sys
+import os
+from typing import Callable
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
+
+from entity_candidate import EntityCandidate
+from relation_candidate import RelationCandidate
+
+logger = logging.getLogger(__name__)
+
+
+def extract_relations_llm(
+    text: str,
+    entities: list[EntityCandidate],
+    relation_types: list[str],
+    llm_chat_json: Callable[[list[dict]], dict],
+    language_instruction: str = "Respond in English.",
+) -> list[RelationCandidate]:
+    """Extract relations between entities of a text chunk using an LLM.
+
+    Given the original text and the entities already extracted from it, asks
+    the LLM for relations between pairs of entities. Relations whose from_name
+    or to_name is not exactly the name of one of ``entities`` are discarded.
+    Relation types outside ``relation_types`` are replaced by "related_to".
+    Malformed items are skipped instead of aborting the whole chunk.
+
+    Args:
+        text: text chunk (the same one used to extract the entities).
+        entities: entities already extracted from the chunk.
+        relation_types: allowed relation types, e.g. ["funds", "employs",
+            "communicates_with", "owns", "related_to"].
+        llm_chat_json: injected function that receives a list of messages
+            (dicts with "role" and "content") and returns the parsed JSON
+            answer as a dict.
+        language_instruction: language instruction appended to the prompt.
+
+    Returns:
+        List of validated RelationCandidate. Empty when there are fewer than
+        two entities, the LLM call fails, the answer is not a dict with a
+        'relations' list, or no relation is found.
+    """
+    if len(entities) < 2:
+        return []
+
+    entity_names = {e.name for e in entities}
+    relation_types_set = set(relation_types)
+
+    # Entity list shown to the model.
+    entity_lines = "\n".join(
+        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
+    )
+
+    # Allowed relation types shown to the model.
+    relation_types_str = ", ".join(relation_types)
+
+    system_prompt = f"""\
+You are a relation extraction expert. Given text and a list of entities already \
+extracted, identify relationships between them.
+
+Entities found in this text:
+{entity_lines}
+
+Allowed relation types: {relation_types_str}
+
+Output JSON: {{"relations": [
+  {{"from_name": "Entity A", "to_name": "Entity B",
+   "relation_type": "employs", "description": "...", "confidence": 0.8}}
+]}}
+
+Rules:
+- Only extract relations explicitly stated or strongly implied in the text
+- from_name and to_name must match entity names exactly as listed above
+- relation_type must be one of the allowed types
+- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
+- Do not invent entities not in the list above
+- {language_instruction}"""
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": text},
+    ]
+
+    try:
+        response = llm_chat_json(messages)
+    except Exception as exc:
+        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
+        return []
+
+    # The callable is external: it may hand back None, a list or a string when
+    # the model does not produce a JSON object.
+    raw_relations = response.get("relations", []) if isinstance(response, dict) else None
+    if not isinstance(raw_relations, list):
+        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
+        return []
+
+    results: list[RelationCandidate] = []
+    for item in raw_relations:
+        if not isinstance(item, dict):
+            continue
+
+        from_name = item.get("from_name", "")
+        to_name = item.get("to_name", "")
+
+        # Both endpoints must be known entities. The isinstance checks also
+        # protect the set lookups from unhashable values such as a list.
+        if not isinstance(from_name, str) or from_name not in entity_names:
+            logger.debug(
+                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
+                from_name,
+            )
+            continue
+        if not isinstance(to_name, str) or to_name not in entity_names:
+            logger.debug(
+                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
+                to_name,
+            )
+            continue
+
+        relation_type = item.get("relation_type", "")
+        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
+            logger.debug(
+                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
+                relation_type,
+            )
+            relation_type = "related_to"
+
+        # Descriptions are joined as strings later on, so coerce here.
+        description = item.get("description", "")
+        if not isinstance(description, str):
+            description = "" if description is None else str(description)
+
+        confidence = item.get("confidence", 0.0)
+        # NaN compares unequal to itself; without this check the clamp below
+        # would turn NaN into 1.0.
+        if not isinstance(confidence, (int, float)) or confidence != confidence:
+            confidence = 0.0
+        confidence = float(max(0.0, min(1.0, confidence)))
+
+        results.append(
+            RelationCandidate(
+                from_name=from_name,
+                to_name=to_name,
+                relation_type=relation_type,
+                description=description,
+                confidence=confidence,
+            )
+        )
+
+    return results
@@ -0,0 +1,92 @@
+"""Extract plain text from PDF, Markdown, or TXT files."""
+
+
+SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
+
+
+def _detect_encoding(data: bytes) -> str:
+    """Detect the encoding of raw bytes using several fallback strategies.
+
+    Order of the strategies:
+    0. Byte-order mark (UTF-8, UTF-32, UTF-16). The BOM-aware codec name is
+       returned so that decoding also strips the mark.
+    1. Strict UTF-8 decoding.
+    2. ``charset_normalizer``, when installed.
+    3. ``chardet``, when installed.
+
+    Args:
+        data: raw file content.
+
+    Returns:
+        Name of a codec to decode ``data`` with. Falls back to "utf-8" when
+        nothing could be detected; the caller then decodes with replacement.
+    """
+    import codecs
+
+    # Strategy 0: BOM sniffing. UTF-32 goes before UTF-16 because the UTF-32 LE
+    # mark begins with the UTF-16 LE mark.
+    bom_codecs = (
+        (codecs.BOM_UTF8, "utf-8-sig"),
+        (codecs.BOM_UTF32_LE, "utf-32"),
+        (codecs.BOM_UTF32_BE, "utf-32"),
+        (codecs.BOM_UTF16_LE, "utf-16"),
+        (codecs.BOM_UTF16_BE, "utf-16"),
+    )
+    for bom, bom_encoding in bom_codecs:
+        if data.startswith(bom):
+            try:
+                data.decode(bom_encoding)
+                return bom_encoding
+            except UnicodeDecodeError:
+                # Bytes only look like this BOM; try the remaining strategies.
+                continue
+
+    # Strategy 1: UTF-8
+    try:
+        data.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Strategy 2: charset_normalizer
+    try:
+        from charset_normalizer import from_bytes
+
+        result = from_bytes(data).best()
+        if result is not None and result.encoding:
+            return result.encoding
+    except ImportError:
+        pass
+
+    # Strategy 3: chardet
+    try:
+        import chardet
+
+        detected = chardet.detect(data)
+        if detected and detected.get("encoding"):
+            return detected["encoding"]
+    except ImportError:
+        pass
+
+    # Last resort: the caller decodes as UTF-8 with replacement characters.
+    return "utf-8"
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extract plain text from a file. Supports PDF, Markdown and TXT.
+
+    For PDF files uses PyMuPDF (fitz) to extract the text of each page and
+    joins the pages with double newlines; the document is always closed
+    afterwards. Text-based files (.md, .markdown, .txt) are read as bytes and
+    decoded with the encoding reported by ``_detect_encoding``, falling back
+    to UTF-8 with replacement characters if that decoding fails.
+
+    Args:
+        file_path: Absolute or relative path to the file. The extension is
+            matched case-insensitively.
+
+    Returns:
+        str: Extracted plain text content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: If the file extension is not supported.
+        ImportError: If PyMuPDF is not installed and a PDF is provided.
+    """
+    import os
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    _, ext = os.path.splitext(file_path.lower())
+
+    if ext == ".pdf":
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is required for PDF extraction. "
+                "Install it with: pip install PyMuPDF"
+            ) from e
+
+        doc = fitz.open(file_path)
+        try:
+            return "\n\n".join(page.get_text() for page in doc)
+        finally:
+            # Release the file handle even when a page fails to extract.
+            doc.close()
+
+    if ext in {".md", ".markdown", ".txt"}:
+        with open(file_path, "rb") as f:
+            raw = f.read()
+
+        encoding = _detect_encoding(raw)
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            # Wrong guess or a codec name Python does not know.
+            return raw.decode("utf-8", errors="replace")
+
+    raise ValueError(
+        f"Unsupported file extension: '{ext}'. "
+        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+    )
@@ -0,0 +1,208 @@
+"""Pipeline de extraccion de entidades y relaciones desde un documento."""
+
+from __future__ import annotations
+
+import sys
+import os
+import time
+import warnings
+from typing import Callable
+
+# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
+
+from extract_text_from_file import extract_text_from_file
+from core_functions import preprocess_text
+from split_text_into_chunks import split_text_into_chunks
+from build_entity_schema_prompt import build_entity_schema_prompt
+from build_relation_schema_prompt import build_relation_schema_prompt
+from extract_entities_llm import extract_entities_llm
+from extract_relations_llm import extract_relations_llm
+from deduplicate_entities import deduplicate_entities
+from deduplicate_relations import deduplicate_relations
+from entity_candidate import EntityCandidate
+from extraction_result import ExtractionResult
+from extraction_stats import ExtractionStats
+
+
+def extraction_pipeline(
+    file_path: str,
+    entity_presets: list[dict],
+    relation_types: list[str],
+    llm_chat_json: Callable[[list[dict]], dict],
+    chunk_size: int = 500,
+    chunk_overlap: int = 50,
+    confidence_threshold: float = 0.5,
+    dedup_threshold: float = 0.85,
+    on_progress: Callable[[str, float], None] | None = None,
+    language_instruction: str = "Respond in English.",
+) -> ExtractionResult:
+    """Full entity and relation extraction pipeline for one document.
+
+    Orchestrates extract_text_from_file -> preprocess_text ->
+    split_text_into_chunks -> extract_entities_llm per chunk ->
+    deduplicate_entities -> extract_relations_llm per chunk ->
+    deduplicate_relations.
+
+    Failures of a single step (text extraction, one LLM call) are reported
+    with ``warnings.warn`` and treated as "nothing extracted" for that step.
+
+    Args:
+        file_path: path of the file to process (PDF, Markdown, TXT).
+        entity_presets: dicts with type_ref, label and metadata_fields, e.g.
+            [{"type_ref": "osint_person_go_cybersecurity",
+              "label": "Person",
+              "metadata_fields": ["full_name", "nationality"]}]
+        relation_types: relation types allowed for extraction, e.g.
+            ["funds", "employs", "communicates_with", "owns"]
+        llm_chat_json: injected function that receives OpenAI-style messages
+            and returns the already parsed JSON answer as a dict.
+        chunk_size: chunk size passed to split_text_into_chunks (default 500).
+        chunk_overlap: overlap between consecutive chunks (default 50).
+        confidence_threshold: minimum confidence for an entity candidate to
+            reach deduplication (default 0.5).
+        dedup_threshold: minimum similarity score to merge entities
+            (default 0.85).
+        on_progress: optional progress callback (message: str, pct: float 0-1).
+            0-40%: entity extraction, 40-80%: relation extraction,
+            80-100%: deduplication. A failing callback never aborts the run.
+        language_instruction: language instruction forwarded to both LLM
+            extraction steps (default "Respond in English.").
+
+    Returns:
+        ExtractionResult with the deduplicated entities and relations and the
+        statistics of the run.
+
+    Raises:
+        FileNotFoundError: if file_path does not exist.
+        ValueError: if entity_presets is empty.
+    """
+    if not entity_presets:
+        raise ValueError("entity_presets no puede estar vacio")
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
+
+    def _progress(msg: str, pct: float) -> None:
+        """Report progress; a broken callback is reported but never fatal."""
+        if on_progress is None:
+            return
+        try:
+            on_progress(msg, pct)
+        except Exception as exc:
+            warnings.warn(f"extraction_pipeline: on_progress callback failed: {exc}")
+
+    start_time = time.monotonic()
+    stats = ExtractionStats()
+
+    # Step 1: extract text
+    _progress("Extracting text from file...", 0.0)
+    try:
+        raw_text = extract_text_from_file(file_path)
+    except Exception as exc:
+        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
+        raw_text = ""
+
+    # Step 2: preprocess
+    clean_text = preprocess_text(raw_text)
+    stats.total_chars = len(clean_text)
+
+    # Step 3: split into chunks
+    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
+    n = len(chunks)
+    stats.total_chunks = n
+
+    if n == 0:
+        stats.processing_time_seconds = time.monotonic() - start_time
+        return ExtractionResult(entities=[], relations=[], stats=stats)
+
+    # Step 4: extract entities chunk by chunk
+    all_raw_entities: list[EntityCandidate] = []
+
+    for i, chunk in enumerate(chunks):
+        _progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
+        try:
+            candidates = extract_entities_llm(
+                text=chunk,
+                entity_schema=entity_presets,
+                llm_chat_json=llm_chat_json,
+                language_instruction=language_instruction,
+            )
+        except Exception as exc:
+            warnings.warn(
+                f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
+            )
+            candidates = []
+
+        for candidate in candidates:
+            # Record the chunk the candidate came from.
+            if i not in candidate.source_chunk_indices:
+                candidate.source_chunk_indices.append(i)
+            all_raw_entities.append(candidate)
+
+    # Step 5: filter by confidence
+    filtered_entities = [
+        e for e in all_raw_entities if e.confidence >= confidence_threshold
+    ]
+    # Counted after the filter so it matches the deduplication input.
+    stats.raw_entities_count = len(filtered_entities)
+
+    for ent in filtered_entities:
+        stats.entity_types_found[ent.type_ref] = (
+            stats.entity_types_found.get(ent.type_ref, 0) + 1
+        )
+
+    # Step 6: deduplicate entities
+    _progress("Deduplicating entities...", 0.4)
+    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
+
+    stats.final_entities_count = dedup_result.total_after
+    stats.entities_merged = dedup_result.total_before - dedup_result.total_after
+
+    final_entities = dedup_result.entities
+
+    # deduplicate_relations looks names up lowercased and stripped, while
+    # name_to_id also holds original-case names. Normalize the keys here so
+    # those names resolve by exact lookup instead of depending on fuzzy match.
+    entity_id_map: dict[str, str] = {}
+    for original_name, ent_id in dedup_result.name_to_id.items():
+        entity_id_map.setdefault(original_name.lower().strip(), ent_id)
+
+    # Step 7: extract relations chunk by chunk
+    all_raw_relations = []
+
+    for i, chunk in enumerate(chunks):
+        _progress("Extracting relations...", 0.4 + (i / n) * 0.4)
+
+        # Entities found in this chunk; when there are none, offer all of them.
+        chunk_entities = [
+            e for e in final_entities if i in e.source_chunk_indices
+        ]
+        if not chunk_entities:
+            chunk_entities = final_entities
+
+        # A relation needs two endpoints.
+        if len(chunk_entities) < 2:
+            continue
+
+        try:
+            chunk_relations = extract_relations_llm(
+                text=chunk,
+                entities=chunk_entities,
+                relation_types=relation_types,
+                llm_chat_json=llm_chat_json,
+                language_instruction=language_instruction,
+            )
+        except Exception as exc:
+            warnings.warn(
+                f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
+            )
+            chunk_relations = []
+
+        for rel in chunk_relations:
+            rel.source_chunk_index = i
+        all_raw_relations.extend(chunk_relations)
+
+    stats.raw_relations_count = len(all_raw_relations)
+
+    for rel in all_raw_relations:
+        stats.relation_types_found[rel.relation_type] = (
+            stats.relation_types_found.get(rel.relation_type, 0) + 1
+        )
+
+    # Step 8: deduplicate relations
+    _progress("Deduplicating relations...", 0.8)
+    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
+
+    stats.final_relations_count = len(final_relations)
+    # NOTE(review): this difference also includes relations dropped as
+    # unresolved or self-loops; deduplicate_relations only returns the final
+    # list, so stats.relations_discarded cannot be filled in from here.
+    stats.relations_merged = stats.raw_relations_count - len(final_relations)
+    stats.processing_time_seconds = time.monotonic() - start_time
+
+    _progress("Done", 1.0)
+
+    return ExtractionResult(
+        entities=final_entities,
+        relations=final_relations,
+        stats=stats,
+    )
@@ -0,0 +1,20 @@
+"""ExtractionResult — resultado final del pipeline de extraccion."""
+
+from dataclasses import dataclass, field
+
+from entity_candidate import EntityCandidate
+from extraction_stats import ExtractionStats
+from relation_candidate import RelationCandidate
+
+
+@dataclass
+class ExtractionResult:
+    """Final result of the entity and relation extraction pipeline.
+
+    Holds the deduplicated lists of entities and relations together with the
+    statistics of the whole run.
+    """
+
+    # Entities after deduplication.
+    entities: list[EntityCandidate]
+    # Relations after deduplication.
+    relations: list[RelationCandidate]
+    # Counters and timing of the run; a fresh ExtractionStats by default.
+    stats: ExtractionStats = field(default_factory=ExtractionStats)
@@ -0,0 +1,25 @@
+"""ExtractionStats — estadisticas del proceso de extraccion."""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ExtractionStats:
+    """Statistics of the extraction process.
+
+    Useful for reporting and debugging. Records counts before and after
+    deduplication, processing time and the distribution of the types found.
+    """
+
+    # Number of chunks the text was split into.
+    total_chunks: int = 0
+    # Length in characters of the preprocessed text.
+    total_chars: int = 0
+    # Entity candidates entering deduplication (extraction_pipeline counts
+    # them after its confidence filter).
+    raw_entities_count: int = 0
+    # Entities left after deduplication.
+    final_entities_count: int = 0
+    # Candidates before deduplication minus entities after it.
+    entities_merged: int = 0
+    # Relations returned for all chunks, before deduplication.
+    raw_relations_count: int = 0
+    # Relations left after deduplication.
+    final_relations_count: int = 0
+    # raw_relations_count minus final_relations_count, as set by extraction_pipeline.
+    relations_merged: int = 0
+    # NOTE(review): extraction_pipeline never assigns this field; confirm
+    # whether another caller populates it.
+    relations_discarded: int = 0
+    # type_ref -> number of entity candidates of that type.
+    entity_types_found: dict[str, int] = field(default_factory=dict)
+    # relation_type -> number of raw relations of that type.
+    relation_types_found: dict[str, int] = field(default_factory=dict)
+    # Duration of the run in seconds.
+    processing_time_seconds: float = 0.0
@@ -0,0 +1,78 @@
+"""Combina atributos de multiples candidatos de la misma entidad."""
+
+from __future__ import annotations
+
+_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
+_DATE_MIN_FIELDS = {"first_seen", "created_date"}
+_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
+_BOOL_FIELDS = {"verified", "exploited"}
+
+
+def merge_entity_attributes(attr_list: list[dict]) -> dict:
+    """Combina atributos de multiples candidatos de la misma entidad.
+
+    Para cada campo presente en cualquier candidato recopila todos los valores
+    non-null y aplica heuristicas de resolucion por tipo de campo:
+    - Numerico (risk_score, balance, cvss): max
+    - Fecha min (first_seen, created_date): min (mas antigua)
+    - Fecha max (last_seen, expires_date): max (mas reciente)
+    - Lista (cualquier valor de tipo list): union sin duplicados
+    - Boolean (verified, exploited): OR logico
+    - String: el mas largo
+
+    Args:
+        attr_list: Lista de dicts con los atributos de cada candidato.
+
+    Returns:
+        Dict con los atributos fusionados.
+    """
+    if not attr_list:
+        return {}
+
+    # Recopilar todas las claves presentes en cualquier candidato
+    all_keys: set[str] = set()
+    for attrs in attr_list:
+        all_keys.update(attrs.keys())
+
+    merged: dict = {}
+
+    for key in all_keys:
+        # Recopilar valores non-null
+        values = [attrs[key] for attrs in attr_list if key in attrs and attrs[key] is not None]
+
+        if not values:
+            merged[key] = None
+            continue
+
+        if len(values) == 1:
+            merged[key] = values[0]
+            continue
+
+        # Todos iguales
+        if all(v == values[0] for v in values):
+            merged[key] = values[0]
+            continue
+
+        # Resolver conflicto segun tipo de campo
+        if key in _NUMERIC_FIELDS:
+            merged[key] = max(values)
+        elif key in _DATE_MIN_FIELDS:
+            merged[key] = min(values)
+        elif key in _DATE_MAX_FIELDS:
+            merged[key] = max(values)
+        elif key in _BOOL_FIELDS:
+            merged[key] = any(values)
+        elif isinstance(values[0], list):
+            # Union de listas sin duplicados, preservando orden de aparicion
+            seen: list = []
+            for lst in values:
+                for item in lst:
+                    if item not in seen:
+                        seen.append(item)
+            merged[key] = seen
+        else:
+            # String u otro: usar el mas largo
+            str_values = [str(v) for v in values]
+            merged[key] = max(str_values, key=len)
+
+    return merged
@@ -0,0 +1,81 @@
+"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
+
+import re
+
+
+_TITLES = re.compile(
+    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
+    re.IGNORECASE,
+)
+
+_LEGAL_SUFFIXES = re.compile(
+    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
+    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
+    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
+    re.IGNORECASE,
+)
+
+_MULTI_SPACE = re.compile(r"\s+")
+
+
+def normalize_entity_name(name: str, entity_type: str = "") -> str:
+    """Normaliza el nombre de una entidad para comparacion y deduplicacion.
+
+    Aplica reglas diferentes segun el tipo de entidad:
+    - ip / email / domain / crypto_wallet / phone: normalizacion tecnica
+    - person: normalizacion de nombre humano (titulos, formato apellido-nombre)
+    - organization: normalizacion corporativa (sufijos legales)
+    - default: lower + strip + colapsar espacios
+
+    Args:
+        name: nombre de la entidad a normalizar.
+        entity_type: tipo de entidad (ip, email, domain, crypto_wallet, phone,
+                     person, organization). Vacio = default.
+
+    Returns:
+        nombre normalizado como string.
+    """
+    name = name.strip()
+    et = entity_type.lower().strip()
+
+    if et == "ip":
+        return name.lower()
+
+    if et == "email":
+        return name.lower()
+
+    if et == "domain":
+        result = name.lower().rstrip(".")
+        if result.startswith("www."):
+            result = result[4:]
+        return result
+
+    if et == "crypto_wallet":
+        # Bitcoin addresses son case-sensitive — solo strip
+        return name
+
+    if et == "phone":
+        # Mantener solo digitos y el signo +
+        return re.sub(r"[^\d+]", "", name)
+
+    if et == "person":
+        # Remover titulos al inicio
+        result = _TITLES.sub("", name).strip()
+        # Detectar formato "Apellido, Nombre"
+        if "," in result:
+            parts = result.split(",", 1)
+            last = parts[0].strip()
+            first = parts[1].strip()
+            result = f"{first} {last}"
+        # Colapsar espacios y title case
+        result = _MULTI_SPACE.sub(" ", result).strip()
+        return result.title()
+
+    if et == "organization":
+        result = _LEGAL_SUFFIXES.sub("", name).strip()
+        result = _MULTI_SPACE.sub(" ", result).strip()
+        # Title case para consistencia
+        return result.title()
+
+    # Default: lower, strip, colapsar espacios
+    return _MULTI_SPACE.sub(" ", name.lower()).strip()
@@ -0,0 +1,35 @@
+"""RelationCandidate — candidato de relacion extraido por el LLM."""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class RelationCandidate:
+    """Candidato de relacion entre dos entidades extraido por el LLM.
+
+    `from_name` y `to_name` contienen los nombres crudos del texto. `from_id`
+    y `to_id` se llenan durante la fase de deduplicacion cuando se resuelven
+    contra los EntityCandidate finales.
+    """
+
+    from_name: str
+    to_name: str
+    from_id: str = ""
+    to_id: str = ""
+    relation_type: str = ""
+    description: str = ""
+    confidence: float = 0.0
+    source_chunk_index: int = -1
+
+    def to_dict(self) -> dict:
+        """Serializa el candidato a un diccionario."""
+        return {
+            "from_name": self.from_name,
+            "to_name": self.to_name,
+            "from_id": self.from_id,
+            "to_id": self.to_id,
+            "relation_type": self.relation_type,
+            "description": self.description,
+            "confidence": self.confidence,
+            "source_chunk_index": self.source_chunk_index,
+        }
@@ -0,0 +1,234 @@
+"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
+
+import json
+import os
+
+
+_HTML_TEMPLATE = """\
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>{title}</title>
+    <script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
+    <style>
+        * {{ box-sizing: border-box; margin: 0; padding: 0; }}
+        body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
+        #container {{ width: 100vw; height: 100vh; }}
+        #panel {{
+            position: absolute; top: 12px; right: 12px;
+            background: rgba(10, 10, 30, 0.88);
+            border: 1px solid rgba(255,255,255,0.12);
+            padding: 16px; border-radius: 10px;
+            z-index: 10; min-width: 200px; max-width: 260px;
+            backdrop-filter: blur(6px);
+        }}
+        #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
+        #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
+        #filters {{ display: flex; flex-direction: column; gap: 6px; }}
+        .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
+        .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
+        .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
+        #tooltip {{
+            position: absolute; display: none;
+            background: rgba(5, 5, 20, 0.95);
+            border: 1px solid rgba(255,255,255,0.15);
+            padding: 10px 14px; border-radius: 8px;
+            pointer-events: none; z-index: 20;
+            max-width: 300px; font-size: 12px; line-height: 1.6;
+        }}
+        #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
+        #tooltip .tt-row {{ display: flex; gap: 6px; }}
+        #tooltip .tt-key {{ color: #888; min-width: 80px; }}
+        #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
+    </style>
+</head>
+<body>
+    <div id="container"></div>
+    <div id="panel">
+        <h3>{title}</h3>
+        <div id="stats"></div>
+        <div id="filters"></div>
+    </div>
+    <div id="tooltip"></div>
+
+    <script>
+    (function () {{
+        const graphData = {json_data};
+
+        // ── Build graphology graph ──────────────────────────────────────────────
+        const Graph = graphology.Graph || graphology;
+        const g = new Graph({{ multi: true, type: 'directed' }});
+
+        // Assign random initial positions
+        graphData.nodes.forEach(function (n) {{
+            g.addNode(n.key, Object.assign({{
+                x: (Math.random() - 0.5) * 10,
+                y: (Math.random() - 0.5) * 10,
+            }}, n.attributes));
+        }});
+
+        graphData.edges.forEach(function (e) {{
+            try {{
+                g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
+            }} catch (err) {{
+                // skip duplicate edge keys gracefully
+            }}
+        }});
+
+        // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
+        const FA2 = graphologyLibrary.layoutForceAtlas2;
+        FA2.assign(g, {{
+            iterations: 500,
+            settings: {{
+                gravity: 1,
+                scalingRatio: 2,
+                slowDown: 5,
+                barnesHutOptimize: g.order > 300,
+            }},
+        }});
+
+        // ── Sigma renderer ──────────────────────────────────────────────────────
+        const renderer = new Sigma(g, document.getElementById('container'), {{
+            renderEdgeLabels: false,
+            defaultEdgeColor: '#444',
+            defaultNodeColor: '#95a5a6',
+            labelColor: {{ color: '#ccc' }},
+            labelSize: 11,
+            edgeReducer: function (edge, data) {{
+                return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
+            }},
+        }});
+
+        // ── Stats panel ─────────────────────────────────────────────────────────
+        document.getElementById('stats').textContent =
+            graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';
+
+        // ── Filter panel by node type ───────────────────────────────────────────
+        const typeColors = {{}};
+        graphData.nodes.forEach(function (n) {{
+            const t = n.attributes.entity_type || 'unknown';
+            typeColors[t] = n.attributes.color || '#95a5a6';
+        }});
+
+        const hiddenTypes = new Set();
+        const filtersDiv = document.getElementById('filters');
+
+        Object.keys(typeColors).sort().forEach(function (type) {{
+            const color = typeColors[type];
+            const label = document.createElement('label');
+            label.className = 'filter-item';
+
+            const cb = document.createElement('input');
+            cb.type = 'checkbox';
+            cb.checked = true;
+            cb.addEventListener('change', function () {{
+                if (cb.checked) hiddenTypes.delete(type);
+                else hiddenTypes.add(type);
+                renderer.refresh();
+            }});
+
+            const dot = document.createElement('span');
+            dot.className = 'color-dot';
+            dot.style.background = color;
+
+            label.appendChild(cb);
+            label.appendChild(dot);
+            label.appendChild(document.createTextNode(type));
+            filtersDiv.appendChild(label);
+        }});
+
+        // Node reducer applies type filter
+        renderer.setSetting('nodeReducer', function (node, data) {{
+            if (hiddenTypes.has(data.entity_type)) return Object.assign({{}}, data, {{ hidden: true }});
+            return data;
+        }});
+
+        // ── Tooltip on hover ────────────────────────────────────────────────────
+        const tooltip = document.getElementById('tooltip');
+
+        renderer.on('enterNode', function (ref) {{
+            const nodeAttrs = g.getNodeAttributes(ref.node);
+            const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);
+
+            let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
+            html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
+            html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
+            html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';
+
+            Object.keys(nodeAttrs).sort().forEach(function (k) {{
+                if (!reserved.has(k) && !['status', 'domain', 'type', 'label'].includes(k)) {{
+                    html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
+                }}
+            }});
+
+            tooltip.innerHTML = html;
+            tooltip.style.display = 'block';
+        }});
+
+        renderer.on('leaveNode', function () {{
+            tooltip.style.display = 'none';
+        }});
+
+        document.getElementById('container').addEventListener('mousemove', function (e) {{
+            tooltip.style.left = (e.clientX + 16) + 'px';
+            tooltip.style.top = (e.clientY + 16) + 'px';
+        }});
+
+        function escHtml(str) {{
+            return String(str)
+                .replace(/&/g, '&amp;')
+                .replace(/</g, '&lt;')
+                .replace(/>/g, '&gt;')
+                .replace(/"/g, '&quot;');
+        }}
+    }})();
+    </script>
+</body>
+</html>
+"""
+
+
+def render_sigma_html(
+    graph_data: dict,
+    output_path: str,
+    title: str = "OSINT Graph",
+) -> str:
+    """Genera un HTML standalone con sigma.js que visualiza el grafo OSINT.
+
+    Recibe el dict producido por ops_to_sigma_json, embebe los datos como JSON
+    en el HTML, aplica ForceAtlas2 (500 iteraciones sincrono) y renderiza con
+    sigma.js v2.4. Incluye dark theme, panel de filtros por tipo de nodo y
+    tooltip con metadata al hacer hover.
+
+    Args:
+        graph_data: Dict con claves 'nodes' y 'edges' en formato graphology/sigma.
+        output_path: Ruta del archivo HTML a escribir.
+        title: Titulo del grafo mostrado en el panel y la pestana.
+
+    Returns:
+        Ruta absoluta del archivo HTML escrito.
+
+    Raises:
+        Exception: Si no se puede escribir el archivo en output_path.
+    """
+    json_data = json.dumps(graph_data, ensure_ascii=False)
+
+    html = _HTML_TEMPLATE.format(
+        title=title,
+        json_data=json_data,
+    )
+
+    abs_path = os.path.abspath(output_path)
+    os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
+
+    try:
+        with open(abs_path, "w", encoding="utf-8") as f:
+            f.write(html)
+    except OSError as exc:
+        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc
+
+    return abs_path
@@ -0,0 +1,66 @@
+"""Split text into overlapping chunks with sentence-boundary awareness."""
+
+
+def split_text_into_chunks(
+    text: str, chunk_size: int = 500, overlap: int = 50
+) -> list[str]:
+    """Divide texto en chunks de tamaño fijo con overlap, cortando en límites de oración.
+
+    Args:
+        text: Texto a dividir.
+        chunk_size: Tamaño máximo de cada chunk en caracteres.
+        overlap: Número de caracteres de solapamiento entre chunks consecutivos.
+
+    Returns:
+        Lista de chunks. Vacía si el texto es vacío.
+    """
+    if not text:
+        return []
+
+    if len(text) <= chunk_size:
+        stripped = text.strip()
+        return [stripped] if stripped else []
+
+    # Separadores en orden de prioridad (más específicos primero)
+    separators = ["。", "！", "？", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? "]
+
+    chunks: list[str] = []
+    start = 0
+    text_len = len(text)
+
+    while start < text_len:
+        end = start + chunk_size
+
+        if end < text_len:
+            # Buscar el último separador de oración dentro de text[start:end]
+            # Solo aceptar si está después del 30% del chunk
+            min_pos = start + int(chunk_size * 0.30)
+            best_end = None
+
+            for sep in separators:
+                sep_len = len(sep)
+                # Buscar la última ocurrencia del separador en text[start:end]
+                search_region = text[start:end]
+                pos = search_region.rfind(sep)
+                if pos == -1:
+                    continue
+                abs_pos = start + pos + sep_len
+                if abs_pos > min_pos:
+                    # Usar este separador solo si produce un corte más tarde que el mínimo
+                    # y más temprano que chunk_size (ya garantizado por rfind en [start:end])
+                    if best_end is None or abs_pos > best_end:
+                        best_end = abs_pos
+
+            if best_end is not None:
+                end = best_end
+
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+
+        start = end - overlap
+        # Protección contra bucle infinito si overlap >= chunk_size o end no avanza
+        if start >= end:
+            start = end
+
+    return chunks
@@ -0,0 +1,6 @@
+def main():
+    """Print a greeting to standard output."""
+    print("Hello from ontology-graph!")
+
+
+# Run main() only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,935 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Ontology Graph Extraction\n",
+    "\n",
+    "Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
+    "- LLM: `claude -p --model haiku`\n",
+    "- Tipos: OSINT del registry + genéricos (concept, url, date, quantity, text_fragment, coordinates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m      3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m      4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m      8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m      9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     10\u001b[39m \n",
+      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
+     ]
+    }
+   ],
+   "source": [
+    "import sys, os, json, subprocess\n",
+    "\n",
+    "ROOT = '/home/lucas/fn_registry'\n",
+    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
+    "sys.path.insert(0, ROOT)\n",
+    "\n",
+    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
+    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
+    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
+    "\n",
+    "print('Registry root:', ROOT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "'FN_REGISTRY_ROOT'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m      2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m      4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      5\u001b[39m \n\u001b[32m      6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
+      "\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
+     ]
+    }
+   ],
+   "source": [
+    "import sys, os, json, subprocess\n",
+    "\n",
+    "ROOT = os.environ['FN_REGISTRY_ROOT']\n",
+    "sys.path.insert(0, ROOT)\n",
+    "\n",
+    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
+    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
+    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
+    "\n",
+    "print('Registry root:', ROOT)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## LLM wrapper: claude -p + haiku"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
+    "    \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n",
+    "    # Construir prompt desde messages\n",
+    "    parts = []\n",
+    "    for msg in messages:\n",
+    "        role = msg['role']\n",
+    "        content = msg['content']\n",
+    "        if role == 'system':\n",
+    "            parts.append(f\"[SYSTEM]\\n{content}\")\n",
+    "        elif role == 'user':\n",
+    "            parts.append(f\"[USER]\\n{content}\")\n",
+    "    prompt = \"\\n\\n\".join(parts)\n",
+    "    \n",
+    "    result = subprocess.run(\n",
+    "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
+    "        capture_output=True, text=True, timeout=120\n",
+    "    )\n",
+    "    \n",
+    "    if result.returncode != 0:\n",
+    "        raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n",
+    "    \n",
+    "    # Extraer el campo 'result' del JSON envelope de claude\n",
+    "    envelope = json.loads(result.stdout)\n",
+    "    raw_text = envelope.get('result', '')\n",
+    "    \n",
+    "    # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n",
+    "    return extract_json_from_llm(raw_text)\n",
+    "\n",
+    "# Test rapido\n",
+    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
+    "print('LLM wrapper OK:', test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Entity presets: OSINT + genéricos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- Presets OSINT (del registry) ---\n",
+    "OSINT_PRESETS = [\n",
+    "    {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
+    "     \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
+    "    {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
+    "     \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
+    "    {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
+    "     \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
+    "    {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
+    "     \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
+    "    {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
+    "     \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
+    "    {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
+    "     \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
+    "    {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
+    "     \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
+    "    {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
+    "     \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
+    "    {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
+    "     \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
+    "    {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
+    "     \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
+    "    {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
+    "     \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
+    "    {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
+    "     \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
+    "    {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
+    "     \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
+    "]\n",
+    "\n",
+    "# --- Presets genéricos (sin tipo Go, inline) ---\n",
+    "GENERIC_PRESETS = [\n",
+    "    {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
+    "     \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
+    "    {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
+    "     \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
+    "    {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
+    "     \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
+    "    {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
+    "     \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
+    "    {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
+    "     \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
+    "    {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
+    "     \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
+    "]\n",
+    "\n",
+    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
+    "print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Relation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RELATION_TYPES = [\n",
+    "    # Personas / orgs\n",
+    "    \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
+    "    \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
+    "    # Comunicacion\n",
+    "    \"communicates_with\", \"sent_to\", \"received_from\",\n",
+    "    # Ubicacion\n",
+    "    \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
+    "    # Eventos\n",
+    "    \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
+    "    # Documentos / conceptos\n",
+    "    \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
+    "    # Financiero\n",
+    "    \"funds\", \"transacted_with\", \"invested_in\",\n",
+    "    # Tecnico\n",
+    "    \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
+    "    # Generico\n",
+    "    \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
+    "]\n",
+    "\n",
+    "print(f'{len(RELATION_TYPES)} relation types')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extraer documento\n",
+    "\n",
+    "Pon tu documento en `data/` y cambia el path."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf')  # <-- cambiar\n",
+    "\n",
+    "# Progreso visible\n",
+    "def on_progress(msg, pct):\n",
+    "    print(f'  [{pct*100:5.1f}%] {msg}')\n",
+    "\n",
+    "result = extraction_pipeline(\n",
+    "    file_path=DOC_PATH,\n",
+    "    entity_presets=ALL_PRESETS,\n",
+    "    relation_types=RELATION_TYPES,\n",
+    "    llm_chat_json=claude_haiku_json,\n",
+    "    chunk_size=800,\n",
+    "    chunk_overlap=100,\n",
+    "    confidence_threshold=0.5,\n",
+    "    dedup_threshold=0.85,\n",
+    "    on_progress=on_progress,\n",
+    ")\n",
+    "\n",
+    "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
+    "print(f'Relations: {result.stats.final_relations_count}')\n",
+    "print(f'Chunks: {result.stats.total_chunks}')\n",
+    "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
+    "print(f'Entity types: {result.stats.entity_types_found}')\n",
+    "print(f'Relation types: {result.stats.relation_types_found}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explorar resultados"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Entities\n",
+    "ent_rows = []\n",
+    "for e in result.entities:\n",
+    "    ent_rows.append({\n",
+    "        'id': e.id,\n",
+    "        'name': e.name,\n",
+    "        'type': e.type_ref,\n",
+    "        'confidence': e.confidence,\n",
+    "        'attributes': e.attributes,\n",
+    "    })\n",
+    "df_entities = pd.DataFrame(ent_rows)\n",
+    "print(f'=== Entities ({len(df_entities)}) ===')\n",
+    "df_entities.sort_values('type')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relations\n",
+    "rel_rows = []\n",
+    "for r in result.relations:\n",
+    "    rel_rows.append({\n",
+    "        'from_name': r.from_name,\n",
+    "        'relation': r.relation_type,\n",
+    "        'to_name': r.to_name,\n",
+    "        'confidence': r.confidence,\n",
+    "        'description': r.description,\n",
+    "    })\n",
+    "df_relations = pd.DataFrame(rel_rows)\n",
+    "print(f'=== Relations ({len(df_relations)}) ===')\n",
+    "df_relations.sort_values('relation')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Visualizar grafo con sigma.js"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Colores por tipo de entidad\n",
+    "TYPE_COLORS = {\n",
+    "    'osint_person_go_cybersecurity': '#e74c3c',\n",
+    "    'osint_organization_go_cybersecurity': '#3498db',\n",
+    "    'osint_location_go_cybersecurity': '#2ecc71',\n",
+    "    'osint_event_go_cybersecurity': '#f39c12',\n",
+    "    'osint_email_go_cybersecurity': '#9b59b6',\n",
+    "    'osint_domain_go_cybersecurity': '#1abc9c',\n",
+    "    'osint_ip_address_go_cybersecurity': '#e67e22',\n",
+    "    'osint_phone_go_cybersecurity': '#95a5a6',\n",
+    "    'osint_social_media_go_cybersecurity': '#e91e63',\n",
+    "    'osint_document_go_cybersecurity': '#607d8b',\n",
+    "    'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
+    "    'osint_malware_go_cybersecurity': '#f44336',\n",
+    "    'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
+    "    'concept': '#00bcd4',\n",
+    "    'url': '#8bc34a',\n",
+    "    'date_reference': '#cddc39',\n",
+    "    'quantity': '#ffc107',\n",
+    "    'coordinates': '#4caf50',\n",
+    "    'text_fragment': '#78909c',\n",
+    "}\n",
+    "DEFAULT_COLOR = '#aaaaaa'\n",
+    "\n",
+    "def extraction_to_sigma(result) -> dict:\n",
+    "    \"\"\"Convert an ExtractionResult into a sigma.js/graphology-style graph dict.\n",
+    "\n",
+    "    Nodes are built from result.entities and edges from result.relations.\n",
+    "    Entities and relation endpoints are keyed by id, falling back to the\n",
+    "    name when the id is empty. Node size is 4 plus 2 per relation touching\n",
+    "    the node, with the degree bonus capped at 20.\n",
+    "\n",
+    "    Entity attributes are copied onto the node as strings (None values are\n",
+    "    skipped). The rendering keys label, color, size and type always keep\n",
+    "    their computed value: an entity attribute with one of those names is\n",
+    "    stored as attr_<name> instead of overwriting it. Entities whose key was\n",
+    "    already emitted are skipped so node keys stay unique.\n",
+    "\n",
+    "    Returns:\n",
+    "        {'nodes': [...], 'edges': [...]}; relations whose endpoints are not\n",
+    "        both present as nodes are dropped.\n",
+    "    \"\"\"\n",
+    "    # Keys the node dict computes itself; entity attributes must not clobber them\n",
+    "    reserved = ('label', 'color', 'size', 'type')\n",
+    "\n",
+    "    def endpoint_ids(rel):\n",
+    "        # Same id-or-name fallback that is used for node keys\n",
+    "        return rel.from_id or rel.from_name, rel.to_id or rel.to_name\n",
+    "\n",
+    "    # Count how many relation endpoints touch each key (drives node size)\n",
+    "    degree = {}\n",
+    "    for r in result.relations:\n",
+    "        for endpoint in endpoint_ids(r):\n",
+    "            degree[endpoint] = degree.get(endpoint, 0) + 1\n",
+    "\n",
+    "    nodes = []\n",
+    "    node_keys = set()\n",
+    "    for e in result.entities:\n",
+    "        eid = e.id or e.name\n",
+    "        if eid in node_keys:\n",
+    "            # graphology refuses duplicate node keys; keep the first entity\n",
+    "            continue\n",
+    "        node_keys.add(eid)\n",
+    "        extra = {\n",
+    "            (f'attr_{k}' if k in reserved else k): str(v)\n",
+    "            for k, v in (e.attributes or {}).items()\n",
+    "            if v is not None\n",
+    "        }\n",
+    "        nodes.append({\n",
+    "            'key': eid,\n",
+    "            'attributes': {\n",
+    "                **extra,\n",
+    "                'label': e.name,\n",
+    "                'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
+    "                'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
+    "                'type': e.type_ref,\n",
+    "            }\n",
+    "        })\n",
+    "\n",
+    "    edges = []\n",
+    "    for i, r in enumerate(result.relations):\n",
+    "        from_id, to_id = endpoint_ids(r)\n",
+    "        # Only keep edges whose two endpoints exist as nodes\n",
+    "        if from_id in node_keys and to_id in node_keys:\n",
+    "            edges.append({\n",
+    "                'key': f'e{i}',\n",
+    "                'source': from_id,\n",
+    "                'target': to_id,\n",
+    "                'attributes': {\n",
+    "                    'label': r.relation_type,\n",
+    "                    'type': r.relation_type,\n",
+    "                }\n",
+    "            })\n",
+    "\n",
+    "    return {'nodes': nodes, 'edges': edges}\n",
+    "\n",
+    "graph_data = extraction_to_sigma(result)\n",
+    "print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
+    "html_path = render_sigma_html(\n",
+    "    graph_data=graph_data,\n",
+    "    output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
+    "    title='Ontology Graph',\n",
+    ")\n",
+    "print(f'Graph saved: {html_path}')\n",
+    "print(f'Open in browser: file://{html_path}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Auto-discovery de nuevos tipos\n",
+    "\n",
+    "Si el documento contiene entidades que no encajan en los presets, haiku las detecta y sugiere nuevos presets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
+    "    \"\"\"Ask haiku to suggest new entity presets for entities that landed in generic types.\n",
+    "\n",
+    "    Entities whose type_ref is one of the generic fallbacks are sent (at most\n",
+    "    30 of them) to claude_haiku_json together with the labels of the existing\n",
+    "    presets. The reply is treated as untrusted: suggestions that are not\n",
+    "    dicts, lack a string type_ref or a label, or repeat a type_ref that\n",
+    "    already exists are dropped, and a missing or non-list metadata_fields is\n",
+    "    replaced by an empty list.\n",
+    "\n",
+    "    Args:\n",
+    "        result: extraction result exposing .entities (name, type_ref, attributes).\n",
+    "        existing_presets: preset dicts with at least 'label' and 'type_ref'.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of new preset dicts (possibly empty). Progress is printed.\n",
+    "    \"\"\"\n",
+    "    # Collect entities classified under the generic fallback types.\n",
+    "    # NOTE(review): 'related_to' is listed in RELATION_TYPES, not among the\n",
+    "    # entity presets - kept as in the original filter, confirm it is intended.\n",
+    "    generic_entities = [\n",
+    "        {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
+    "        for e in result.entities\n",
+    "        if e.type_ref in ('concept', 'text_fragment', 'related_to')\n",
+    "    ]\n",
+    "\n",
+    "    if not generic_entities:\n",
+    "        print('No hay entidades genéricas — los presets cubren todo.')\n",
+    "        return []\n",
+    "\n",
+    "    existing_labels = [p['label'] for p in existing_presets]\n",
+    "    existing_refs = {p['type_ref'] for p in existing_presets}\n",
+    "\n",
+    "    prompt_msg = [\n",
+    "        {'role': 'system', 'content': (\n",
+    "            'You analyze entities extracted from a document and suggest new entity type presets. '\n",
+    "            'Existing types: ' + ', '.join(existing_labels) + '. '\n",
+    "            'For entities that dont fit existing types, suggest new type presets. '\n",
+    "            'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
+    "            '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
+    "            'Only suggest types that are genuinely different from existing ones. '\n",
+    "            'Return {\"new_presets\": []} if no new types are needed.'\n",
+    "        )},\n",
+    "        {'role': 'user', 'content': (\n",
+    "            'These entities were classified as generic (concept/text_fragment) '\n",
+    "            'because they didnt fit existing types:\\n\\n'\n",
+    "            + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n",
+    "        )}\n",
+    "    ]\n",
+    "\n",
+    "    resp = claude_haiku_json(prompt_msg)\n",
+    "\n",
+    "    # Tolerate a reply that is not a dict or whose 'new_presets' is not a list\n",
+    "    suggested = resp.get('new_presets') if isinstance(resp, dict) else None\n",
+    "    if not isinstance(suggested, list):\n",
+    "        suggested = []\n",
+    "\n",
+    "    new_presets = []\n",
+    "    seen_refs = set(existing_refs)\n",
+    "    for p in suggested:\n",
+    "        if not isinstance(p, dict):\n",
+    "            continue\n",
+    "        type_ref = p.get('type_ref')\n",
+    "        if not isinstance(type_ref, str) or not type_ref or not p.get('label'):\n",
+    "            continue\n",
+    "        if type_ref in seen_refs:\n",
+    "            # Already covered by an existing preset or an earlier suggestion\n",
+    "            continue\n",
+    "        seen_refs.add(type_ref)\n",
+    "        fields = p.get('metadata_fields')\n",
+    "        # Build a new dict instead of mutating the parsed reply\n",
+    "        new_presets.append({**p, 'metadata_fields': fields if isinstance(fields, list) else []})\n",
+    "\n",
+    "    if new_presets:\n",
+    "        print(f'Discovered {len(new_presets)} new types:')\n",
+    "        for p in new_presets:\n",
+    "            print(f\"  - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
+    "    else:\n",
+    "        print('No new types needed.')\n",
+    "\n",
+    "    return new_presets\n",
+    "\n",
+    "new_types = discover_new_types(result, ALL_PRESETS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n",
+    "if new_types:\n",
+    "    EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
+    "    print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
+    "    \n",
+    "    result = extraction_pipeline(\n",
+    "        file_path=DOC_PATH,\n",
+    "        entity_presets=EXPANDED_PRESETS,\n",
+    "        relation_types=RELATION_TYPES,\n",
+    "        llm_chat_json=claude_haiku_json,\n",
+    "        chunk_size=800,\n",
+    "        chunk_overlap=100,\n",
+    "        confidence_threshold=0.5,\n",
+    "        dedup_threshold=0.85,\n",
+    "        on_progress=on_progress,\n",
+    "    )\n",
+    "    \n",
+    "    print(f'\\nEntities: {result.stats.final_entities_count}')\n",
+    "    print(f'Relations: {result.stats.final_relations_count}')\n",
+    "    \n",
+    "    # Re-generar grafo\n",
+    "    graph_data = extraction_to_sigma(result)\n",
+    "    html_path = render_sigma_html(\n",
+    "        graph_data=graph_data,\n",
+    "        output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
+    "        title='Ontology Graph (expanded)',\n",
+    "    )\n",
+    "    print(f'Updated graph: file://{html_path}')\n",
+    "else:\n",
+    "    print('No re-extraction needed.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m      5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m      8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m     10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m     11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     12\u001b[39m \n",
+      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
+     ]
+    }
+   ],
+   "source": [
+    "import sys, os, json, subprocess\n",
+    "from pathlib import Path\n",
+    "\n",
+    "ROOT = '/home/lucas/fn_registry'\n",
+    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
+    "sys.path.insert(0, ROOT)\n",
+    "sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
+    "\n",
+    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
+    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
+    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
+    "\n",
+    "print('OK: imports loaded')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "imports OK\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys, os, json, subprocess\n",
+    "\n",
+    "# Añadir lib/ al path\n",
+    "sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
+    "\n",
+    "from core_functions import extract_json_from_llm\n",
+    "from extraction_pipeline import extraction_pipeline\n",
+    "from render_sigma_html import render_sigma_html\n",
+    "\n",
+    "print('imports OK')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LLM wrapper OK: {'ok': True}\n"
+     ]
+    }
+   ],
+   "source": [
+    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
+    "    \"\"\"Send OpenAI-style messages through `claude -p --model haiku` and parse the reply.\n",
+    "\n",
+    "    System and user messages become [SYSTEM] / [USER] sections joined by blank\n",
+    "    lines; messages with any other role are left out of the prompt. The CLI\n",
+    "    output is read as a JSON envelope and its 'result' text is handed to\n",
+    "    extract_json_from_llm, whose return value is returned as-is.\n",
+    "\n",
+    "    Raises:\n",
+    "        RuntimeError: when the CLI exits with a non-zero status.\n",
+    "    \"\"\"\n",
+    "    section_tags = {'system': '[SYSTEM]', 'user': '[USER]'}\n",
+    "    sections = []\n",
+    "    for message in messages:\n",
+    "        role, content = message['role'], message['content']\n",
+    "        tag = section_tags.get(role)\n",
+    "        if tag is not None:\n",
+    "            sections.append(f'{tag}\\n{content}')\n",
+    "    prompt = '\\n\\n'.join(sections)\n",
+    "\n",
+    "    command = ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt]\n",
+    "    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)\n",
+    "    if completed.returncode != 0:\n",
+    "        raise RuntimeError(f'claude -p failed: {completed.stderr}')\n",
+    "\n",
+    "    # stdout is a JSON envelope; the model text sits under 'result'\n",
+    "    envelope = json.loads(completed.stdout)\n",
+    "    return extract_json_from_llm(envelope.get('result', ''))\n",
+    "\n",
+    "# Test\n",
+    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
+    "print('LLM wrapper OK:', test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "19 presets, 35 relation types\n"
+     ]
+    }
+   ],
+   "source": [
+    "OSINT_PRESETS = [\n",
+    "    {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
+    "     'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
+    "    {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
+    "     'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
+    "    {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
+    "     'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
+    "    {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
+    "     'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
+    "    {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
+    "     'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
+    "    {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
+    "     'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
+    "    {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
+    "     'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
+    "    {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
+    "     'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
+    "    {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
+    "     'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
+    "    {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
+    "     'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
+    "    {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
+    "     'metadata_fields': ['address', 'blockchain', 'balance']},\n",
+    "    {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
+    "     'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
+    "    {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
+    "     'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
+    "]\n",
+    "\n",
+    "GENERIC_PRESETS = [\n",
+    "    {'type_ref': 'concept', 'label': 'Concept',\n",
+    "     'metadata_fields': ['name', 'category', 'definition']},\n",
+    "    {'type_ref': 'url', 'label': 'URL/Link',\n",
+    "     'metadata_fields': ['url', 'domain', 'context']},\n",
+    "    {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
+    "     'metadata_fields': ['date', 'precision', 'context']},\n",
+    "    {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
+    "     'metadata_fields': ['value', 'unit', 'context']},\n",
+    "    {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
+    "     'metadata_fields': ['lat', 'lon', 'label']},\n",
+    "    {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
+    "     'metadata_fields': ['text', 'category', 'relevance']},\n",
+    "]\n",
+    "\n",
+    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
+    "\n",
+    "RELATION_TYPES = [\n",
+    "    'employs', 'works_for', 'founded', 'owns', 'controls',\n",
+    "    'member_of', 'affiliated_with', 'collaborates_with',\n",
+    "    'communicates_with', 'sent_to', 'received_from',\n",
+    "    'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
+    "    'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
+    "    'mentions', 'references', 'describes', 'authored', 'published',\n",
+    "    'funds', 'transacted_with', 'invested_in',\n",
+    "    'hosts', 'resolves_to', 'exploits', 'targets',\n",
+    "    'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
+    "]\n",
+    "\n",
+    "print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  0.0%] Extracting text from file...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  0.0%] Extracting entities from chunk 1/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  0.7%] Extracting entities from chunk 2/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  1.5%] Extracting entities from chunk 3/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  2.2%] Extracting entities from chunk 4/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  3.0%] Extracting entities from chunk 5/54\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
+      "  candidates = extract_entities_llm(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  3.7%] Extracting entities from chunk 6/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  4.4%] Extracting entities from chunk 7/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  5.2%] Extracting entities from chunk 8/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  5.9%] Extracting entities from chunk 9/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  6.7%] Extracting entities from chunk 10/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  7.4%] Extracting entities from chunk 11/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  8.1%] Extracting entities from chunk 12/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  8.9%] Extracting entities from chunk 13/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [  9.6%] Extracting entities from chunk 14/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [ 10.4%] Extracting entities from chunk 15/54\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  [ 11.1%] Extracting entities from chunk 16/54\n"
+     ]
+    }
+   ],
+   "source": [
+    "DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
+    "\n",
+    "def on_progress(msg, pct):\n",
+    "    \"\"\"Print one progress line: pct scaled by 100 as a percentage, then msg.\"\"\"\n",
+    "    percent = pct * 100\n",
+    "    print(f'  [{percent:5.1f}%] {msg}')\n",
+    "\n",
+    "result = extraction_pipeline(\n",
+    "    file_path=DOC_PATH,\n",
+    "    entity_presets=ALL_PRESETS,\n",
+    "    relation_types=RELATION_TYPES,\n",
+    "    llm_chat_json=claude_haiku_json,\n",
+    "    chunk_size=800,\n",
+    "    chunk_overlap=100,\n",
+    "    confidence_threshold=0.5,\n",
+    "    dedup_threshold=0.85,\n",
+    "    on_progress=on_progress,\n",
+    ")\n",
+    "\n",
+    "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
+    "print(f'Relations: {result.stats.final_relations_count}')\n",
+    "print(f'Chunks: {result.stats.total_chunks}')\n",
+    "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
+    "print(f'Entity types: {result.stats.entity_types_found}')\n",
+    "print(f'Relation types: {result.stats.relation_types_found}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pipeline optimizado\n",
+    "\n",
+    "- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
+    "- Chunks de 2000 chars\n",
+    "- Paralelizado con ThreadPoolExecutor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "from extract_text_from_file import extract_text_from_file\n",
+    "from core_functions import preprocess_text\n",
+    "from split_text_into_chunks import split_text_into_chunks\n",
+    "from deduplicate_entities import deduplicate_entities\n",
+    "from deduplicate_relations import deduplicate_relations\n",
+    "from entity_candidate import EntityCandidate\n",
+    "from relation_candidate import RelationCandidate\n",
+    "\n",
+    "def build_unified_prompt(entity_presets, relation_types):\n",
+    "    \"\"\"Build the system prompt asking for entities, relations and suggested types in one call.\n",
+    "\n",
+    "    Each preset contributes one line with its label, type_ref and\n",
+    "    comma-separated metadata_fields (empty when the key is absent); the\n",
+    "    relation types are listed comma-separated.\n",
+    "    \"\"\"\n",
+    "    def describe(preset):\n",
+    "        fields = ', '.join(preset.get('metadata_fields', []))\n",
+    "        return f\"- {preset['label']} (type_ref: {preset['type_ref']}): [{fields}]\"\n",
+    "\n",
+    "    types_block = '\\n'.join(describe(preset) for preset in entity_presets)\n",
+    "    relations_csv = ', '.join(relation_types)\n",
+    "\n",
+    "    return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
+    "\n",
+    "ENTITY TYPES:\n",
+    "{types_block}\n",
+    "\n",
+    "RELATION TYPES: {relations_csv}\n",
+    "\n",
+    "OUTPUT FORMAT (strict JSON):\n",
+    "{{\n",
+    "  \"entities\": [\n",
+    "    {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
+    "  ],\n",
+    "  \"relations\": [\n",
+    "    {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
+    "  ],\n",
+    "  \"suggested_types\": [\n",
+    "    {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
+    "  ]\n",
+    "}}\n",
+    "\n",
+    "RULES:\n",
+    "- Extract ALL entities explicitly mentioned in the text\n",
+    "- Use exact type_ref from the schema. Leave unknown attributes as null\n",
+    "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
+    "- Relations: from_name and to_name MUST match extracted entity names exactly\n",
+    "- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
+    "- If no suggested types are needed, return \"suggested_types\": []\n",
+    "- Respond in the same language as the text for descriptions'''\n",
+    "\n",
+    "UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
+    "print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,15 @@
+[project]
+name = "ontology-graph"
+version = "0.1.0"
+description = "Ontology graph analysis: LLM entity/relation extraction from documents with sigma.js visualization"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "jupyter>=1.1.1",
+    "jupyter-collaboration>=4.3.0",
+    "jupyter-mcp-server>=0.4.0",
+    "jupyterlab>=4.5.6",
+    "matplotlib>=3.10.8",
+    "numpy>=2.4.4",
+    "pandas>=3.0.2",
+]
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Jupyter Lab — modo colaborativo con autodeteccion de puerto
+# Generado por write_jupyter_launcher (fn_registry)
+
+# Print the first port in 8888-8899 that neither `ss` nor `lsof` reports as
+# in use. If every candidate is busy, fall back to 8888 and warn on stderr;
+# stdout carries only the port number so callers can capture it with $(...).
+find_free_port() {
+    local port
+    for port in {8888..8899}; do
+        if ! ss -tln 2>/dev/null | grep -q ":${port} " && \
+           ! lsof -i:"$port" >/dev/null 2>&1; then
+            echo "$port"
+            return
+        fi
+    done
+    # Best-effort fallback: the port may be busy, so make that visible
+    echo "AVISO: ningun puerto libre entre 8888 y 8899; se usa 8888" >&2
+    echo 8888
+}
+
+# Port: first CLI argument when given, otherwise whatever find_free_port prints
+PORT=${1:-$(find_free_port)}
+# Work from the directory that contains this script
+cd "$(dirname "$0")"
+
+# Record the chosen port in .jupyter-port
+# NOTE(review): presumably read by other tooling to locate the server - confirm
+echo $PORT > .jupyter-port
+
+# Activate the local virtualenv if it exists; a missing .venv is not fatal
+source .venv/bin/activate 2>/dev/null || true
+
+# Abort early when the jupyter_collaboration module cannot be imported
+if ! python -c "import jupyter_collaboration" 2>/dev/null; then
+    echo "ERROR: jupyter-collaboration no esta instalado"
+    echo "Instala con: uv add jupyter-collaboration"
+    exit 1
+fi
+
+echo "════════════════════════════════════════════════"
+echo "  Jupyter Lab + Colaboracion en puerto $PORT"
+echo "════════════════════════════════════════════════"
+echo ""
+echo "  Abre: http://localhost:$PORT"
+echo "  Ctrl+C para detener"
+echo ""
+
+jupyter lab \
+    --port=$PORT \
+    --no-browser \
+    --ServerApp.token='' \
+    --ServerApp.password='' \
+    --ServerApp.disable_check_xsrf=True \
+    --ServerApp.allow_origin='*' \
+    --ServerApp.root_dir="$(pwd)" \
+    --collaborative