chore: initial sync

2026-04-28 22:13:08 +02:00
commit 40bea81603
30 changed files with 6675 additions and 0 deletions
@@ -0,0 +1,40 @@
 # JUPYTER HABILITADO EN ESTE ANALISIS
 ## Reglas OBLIGATORIAS para Claude
 ### 1. CODIGO INMUTABLE — NUNCA MODIFICAR CELDAS EXISTENTES
 - **PROHIBIDO** usar NotebookEdit para reemplazar celdas existentes
 - **SIEMPRE** anadir celdas NUEVAS al final del notebook
 - Si hay un error en una celda, crear celda nueva con la correccion
 - El historial de trabajo debe quedar intacto para trazabilidad
 ### 2. PROGRAMACION FUNCIONAL OBLIGATORIA
 - **Funciones puras**: sin efectos secundarios, mismo input -> mismo output
 - **Inmutabilidad**: nunca mutar datos, crear copias transformadas
 - **Composicion**: funciones pequenas que se combinan
 - Preferir: `map`, `filter`, `reduce`, list comprehensions
 - Evitar: loops con mutacion, `global`, modificar argumentos in-place
 ### 3. SIEMPRE usar MCP jupyter para ejecutar codigo Python
 - Las ejecuciones se ven en tiempo real en Jupyter Lab del usuario
 - Compartimos variables y estado del kernel
 - **NUNCA usar bash para ejecutar Python en este analisis**
 ### 4. Verificar Jupyter activo ANTES de ejecutar
 - Si no esta activo: pedir al usuario que ejecute `./run-jupyter-lab.sh`
 ### 5. Gestion de notebooks
 - Notebooks en la carpeta `notebooks/` o subcarpetas
 - Si un notebook tiene >50 celdas, crear uno nuevo
 - Nombrar descriptivamente: `01_exploracion.ipynb`, `02_limpieza.ipynb`
 ### 6. Gestion de Python
 - **SIEMPRE usar `uv`** para gestionar dependencias
 - Anadir paquetes con `uv add nombre_paquete`
 ### 7. Acceso al fn_registry
 - `FN_REGISTRY_ROOT` apunta a la raiz del registry
 - Para importar funciones Python: `sys.path.insert(0, os.path.join(os.environ["FN_REGISTRY_ROOT"], "python", "functions"))`
 - Para consultar registry.db: usar la CLI `sqlite3` (desde la shell) o `import sqlite3` (desde Python), con la ruta `$FN_REGISTRY_ROOT/registry.db`
@@ -0,0 +1,12 @@
 .venv/
 .mcp.json
 .jupyter-port
 .jupyter/
 .jupyter_ystore.db
 .ipython/
 __pycache__/
 *.pyc
 .ipynb_checkpoints/
 bin/
 data/
 .DS_Store
@@ -0,0 +1 @@
 3.13
@@ -0,0 +1,540 @@
 """Extracción de grafo ontológico desde un documento.
 Uso: python extract.py <archivo> [workers]
     python extract.py data/condiciones-generales-bizum.pdf
 Optimizaciones vs extraction_pipeline:
 - 1 sola llamada LLM por chunk (entities + relations + tipos sugeridos)
 - Chunks de 2000 chars (overlap de 200)
 - Paralelizado con ThreadPoolExecutor
 """
 import sys
 import os
 import json
 import subprocess
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "lib"))
 from extract_text_from_file import extract_text_from_file
 from core_functions import preprocess_text, extract_json_from_llm
 from split_text_into_chunks import split_text_into_chunks
 from deduplicate_entities import deduplicate_entities
 from deduplicate_relations import deduplicate_relations
 from entity_candidate import EntityCandidate
 from relation_candidate import RelationCandidate
 from render_sigma_html import render_sigma_html
 # ── Presets ────────────────────────────────────────────────────────────────────
 # OSINT entity-type presets. Every preset is a dict with three keys:
 #   type_ref        - identifier the LLM is asked to put on extracted entities
 #   label           - human-readable type name shown in the prompt
 #   metadata_fields - attribute names listed for that type in the prompt
 # main() concatenates this list with GENERIC_PRESETS and the active custom
 # presets before building the extraction prompt.
 OSINT_PRESETS = [
    {"type_ref": "person", "label": "Person",
     "metadata_fields": ["full_name", "alias", "nationality", "dob", "gender", "risk_score"]},
    {"type_ref": "organization", "label": "Organization",
     "metadata_fields": ["legal_name", "country", "sector", "founded", "risk_score"]},
    {"type_ref": "location", "label": "Location",
     "metadata_fields": ["lat", "lon", "address", "country", "city"]},
    {"type_ref": "event", "label": "Event",
     "metadata_fields": ["event_type", "date", "location", "description", "severity"]},
    {"type_ref": "email", "label": "Email",
     "metadata_fields": ["address", "provider", "verified", "breached"]},
    {"type_ref": "domain", "label": "Domain",
     "metadata_fields": ["fqdn", "registrar", "created_date", "expires_date"]},
    {"type_ref": "ip_address", "label": "IP Address",
     "metadata_fields": ["ip", "asn", "country", "isp", "geolocation"]},
    {"type_ref": "phone", "label": "Phone",
     "metadata_fields": ["number", "country_code", "carrier", "phone_type"]},
    {"type_ref": "social_media", "label": "Social Media Account",
     "metadata_fields": ["platform", "username", "url", "followers", "verified"]},
    {"type_ref": "document", "label": "Document",
     "metadata_fields": ["title", "format", "classification", "source"]},
    {"type_ref": "crypto_wallet", "label": "Crypto Wallet",
     "metadata_fields": ["address", "blockchain", "balance"]},
    {"type_ref": "malware", "label": "Malware",
     "metadata_fields": ["family", "hash_sha256", "threat_level"]},
    {"type_ref": "vulnerability", "label": "Vulnerability",
     "metadata_fields": ["cve_id", "cvss", "affected_product", "exploited"]},
 ]
 # Generic catch-all entity types, in the same dict shape as OSINT_PRESETS
 # (type_ref, label, metadata_fields). Their type_refs are the ones listed in
 # GENERIC_TYPE_REFS, i.e. the entities eligible for later reclassification.
 GENERIC_PRESETS = [
    {"type_ref": "concept", "label": "Concept",
     "metadata_fields": ["name", "category", "definition"]},
    {"type_ref": "url", "label": "URL/Link",
     "metadata_fields": ["url", "domain", "context"]},
    {"type_ref": "date_reference", "label": "Date/Time",
     "metadata_fields": ["date", "precision", "context"]},
    {"type_ref": "quantity", "label": "Quantity/Amount",
     "metadata_fields": ["value", "unit", "context"]},
    {"type_ref": "coordinates", "label": "Coordinates",
     "metadata_fields": ["lat", "lon", "label"]},
    {"type_ref": "text_fragment", "label": "Key Text Fragment",
     "metadata_fields": ["text", "category", "relevance"]},
 ]
 # ── Custom presets (acumulativo, pensado para promoción al registry) ───────────
 # Location of the accumulated custom presets, relative to this script.
 CUSTOM_PRESETS_PATH = os.path.join(os.path.dirname(__file__), "data", "custom_presets.json")
 def load_custom_presets() -> list[dict]:
    """Load the custom presets stored in data/custom_presets.json.
    Returns:
        The list stored under the top-level "presets" key, or an empty list
        when the file does not exist or the key is missing or null.
    """
    if not os.path.exists(CUSTOM_PRESETS_PATH):
        return []
    # The file is written with ensure_ascii=False, so it may contain raw
    # non-ASCII text; read it as UTF-8 instead of the locale default.
    with open(CUSTOM_PRESETS_PATH, encoding="utf-8") as f:
        data = json.load(f)
    return data.get("presets") or []
 def save_custom_presets(presets: list[dict]) -> None:
    """Write the custom presets to data/custom_presets.json (UTF-8).
    The parent directory is created when missing. The layout is meant to ease
    later promotion to the registry:
    {
      "presets": [
        {
          "type_ref": "snake_case_id",
          "label": "Human Label",
          "metadata_fields": ["field1", "field2"],
          "reason": "why this type exists",
          "source_doc": "document where it was first discovered",
          "promoted": false  // true once it is registered in the registry
        }
      ]
    }
    Args:
        presets: full list of preset dicts to persist (replaces the file).
    """
    os.makedirs(os.path.dirname(CUSTOM_PRESETS_PATH), exist_ok=True)
    # Explicit UTF-8 keeps the write independent of the platform locale, which
    # matters because ensure_ascii=False emits accented characters verbatim.
    with open(CUSTOM_PRESETS_PATH, "w", encoding="utf-8") as f:
        json.dump({"presets": presets}, f, ensure_ascii=False, indent=2)
 def merge_suggested_into_custom(suggested: list[dict], source_doc: str) -> list[dict]:
    """Merge LLM-suggested types into the stored custom presets, deduplicating by type_ref.
    Suggestions come from LLM output, so entries that are not dicts or whose
    type_ref is missing, empty or not a string are ignored instead of raising.
    Stored presets lacking a type_ref are tolerated as well.
    Args:
        suggested: candidate type dicts (type_ref, label, metadata_fields, reason).
        source_doc: name of the document the suggestions came from; stored on
            every newly added preset.
    Returns:
        The presets that were actually added. The presets file is rewritten
        only when this list is non-empty.
    """
    existing = load_custom_presets()
    known_refs = {p.get("type_ref") for p in existing if isinstance(p, dict)}
    added = []
    for s in suggested:
        if not isinstance(s, dict):
            continue
        ref = s.get("type_ref", "")
        if not isinstance(ref, str) or not ref or ref in known_refs:
            continue
        # Remember the ref right away so duplicates inside `suggested` are skipped too.
        known_refs.add(ref)
        preset = {
            "type_ref": ref,
            "label": s.get("label", ref),
            "metadata_fields": s.get("metadata_fields", []),
            "reason": s.get("reason", ""),
            "source_doc": source_doc,
            "promoted": False,
        }
        existing.append(preset)
        added.append(preset)
    if added:
        save_custom_presets(existing)
    return added
 # Relation vocabulary passed to build_unified_prompt and rendered as the
 # "RELATION TYPES" line of the extraction prompt. process_chunk falls back to
 # "related_to" when the LLM omits the relation type.
 RELATION_TYPES = [
    "employs", "works_for", "founded", "owns", "controls",
    "member_of", "affiliated_with", "collaborates_with",
    "communicates_with", "sent_to", "received_from",
    "located_in", "headquartered_in", "traveled_to", "operates_in",
    "participated_in", "caused", "occurred_at", "occurred_on",
    "mentions", "references", "describes", "authored", "published",
    "funds", "transacted_with", "invested_in",
    "hosts", "resolves_to", "exploits", "targets",
    "related_to", "part_of", "instance_of", "has_attribute",
 ]
 # ── LLM wrapper ───────────────────────────────────────────────────────────────
 def claude_haiku_json(messages: list[dict]) -> dict:
    """Send chat-style messages to the `claude` CLI (haiku model) and parse its JSON answer.
    Only messages whose role is "system" or "user" are used; each becomes a
    "[SYSTEM]" / "[USER]" section and the sections are joined by blank lines
    into a single prompt argument. Messages with any other role are ignored.
    Args:
        messages: dicts with "role" and "content" keys.
    Returns:
        Whatever extract_json_from_llm produces from the "result" field of the
        JSON envelope printed by the CLI.
    Raises:
        RuntimeError: the CLI exited with a non-zero status.
        subprocess.TimeoutExpired: the CLI did not finish within 120 seconds.
    """
    prompt = "\n\n".join(
        f"[{msg['role'].upper()}]\n{msg['content']}"
        for msg in messages
        if msg["role"] in ("system", "user")
    )
    command = ["claude", "-p", "--model", "haiku", "--output-format", "json", prompt]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if completed.returncode != 0:
        raise RuntimeError(f"claude -p failed: {completed.stderr[:200]}")
    envelope = json.loads(completed.stdout)
    return extract_json_from_llm(envelope.get("result", ""))
 # ── Unified prompt ─────────────────────────────────────────────────────────────
 def build_unified_prompt(presets, rel_types):
    """Build the system prompt for single-pass entity + relation extraction.
    Args:
        presets: entity-type dicts; each needs 'label' and 'type_ref' and may
            carry 'metadata_fields' (list of attribute names).
        rel_types: relation type names the LLM may use.
    Returns:
        The prompt text: role statement, entity-type list, relation
        vocabulary, strict JSON output format and extraction rules, separated
        by blank lines.
    """
    def _type_line(preset):
        fields = ", ".join(preset.get("metadata_fields", []))
        return f"- {preset['label']} (type_ref: {preset['type_ref']}): [{fields}]"
    intro = (
        "You are an entity and relation extraction expert. "
        "Given text, extract ALL entities and relations in a single pass."
    )
    entity_section = "ENTITY TYPES:\n" + "\n".join([_type_line(p) for p in presets])
    relation_section = "RELATION TYPES: " + ", ".join(rel_types)
    output_section = (
        'OUTPUT FORMAT (strict JSON):\n'
        '{\n'
        '  "entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}],\n'
        '  "relations": [{"from_name": "...", "to_name": "...", "relation_type": "...", "confidence": 0.8, "description": "..."}],\n'
        '  "suggested_types": [{"type_ref": "snake_case_id", "label": "Human Label", "metadata_fields": ["f1","f2"], "reason": "..."}]\n'
        '}'
    )
    rules_section = (
        "RULES:\n"
        "- Extract ALL entities explicitly mentioned\n"
        "- Use exact type_ref from schema. Unknown attributes = null\n"
        "- Confidence: 1.0=explicit, 0.7=strongly implied, 0.5=weakly implied\n"
        "- Relations: from_name/to_name MUST match entity names exactly\n"
        "- suggested_types: for important entities that do NOT fit any type, suggest a new type. "
        "Use those suggested type_refs for those entities in the entities array.\n"
        '- If no new types needed: "suggested_types": []\n'
        "- Respond in the same language as the text for descriptions"
    )
    sections = (intro, entity_section, relation_section, output_section, rules_section)
    return "\n\n".join(sections)
 # ── Process one chunk ──────────────────────────────────────────────────────────
 def process_chunk(chunk_idx: int, chunk_text: str, system_prompt: str):
    """Extract entities, relations and suggested types from one chunk with one LLM call.
    LLM output is untrusted: items that are not JSON objects, names that are
    missing or not strings, and confidences that are not numeric are skipped
    or defaulted instead of raising, so a single malformed item cannot abort
    the whole extraction run.
    Args:
        chunk_idx: index of the chunk; recorded on every candidate produced.
        chunk_text: text of the chunk, sent as the user message.
        system_prompt: extraction instructions, sent as the system message.
    Returns:
        Tuple (entities, relations, suggested): a list of EntityCandidate, a
        list of RelationCandidate and the raw suggested-type dicts. All three
        are empty when the LLM call fails or its answer is not a JSON object.
    """
    try:
        resp = claude_haiku_json([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk_text},
        ])
    except Exception as e:
        # Best effort: a failed chunk is reported and skipped, not fatal.
        print(f"  [WARN] chunk {chunk_idx}: {e}")
        return [], [], []
    if not isinstance(resp, dict):
        print(f"  [WARN] chunk {chunk_idx}: unexpected response type {type(resp).__name__}")
        return [], [], []
    suggested = [s for s in _llm_list(resp.get("suggested_types")) if isinstance(s, dict)]
    entities = []
    for ent in _llm_list(resp.get("entities")):
        if not isinstance(ent, dict):
            continue
        name = _llm_text(ent.get("name"))
        if not name:
            continue
        attributes = ent.get("attributes")
        entities.append(EntityCandidate(
            name=name,
            type_ref=ent.get("type_ref") or "concept",
            # Keep attributes a dict so later in-place updates cannot hit None.
            attributes=attributes if isinstance(attributes, dict) else {},
            confidence=_llm_confidence(ent.get("confidence")),
            source_chunk_indices=[chunk_idx],
        ))
    relations = []
    for rel in _llm_list(resp.get("relations")):
        if not isinstance(rel, dict):
            continue
        fn = _llm_text(rel.get("from_name"))
        tn = _llm_text(rel.get("to_name"))
        if not fn or not tn:
            continue
        relations.append(RelationCandidate(
            from_name=fn,
            to_name=tn,
            relation_type=rel.get("relation_type") or "related_to",
            confidence=_llm_confidence(rel.get("confidence")),
            description=rel.get("description") or "",
            source_chunk_index=chunk_idx,
        ))
    return entities, relations, suggested
 def _llm_text(value) -> str:
    """Return value stripped when it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""
 def _llm_list(value) -> list:
    """Return value when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []
 def _llm_confidence(value, default: float = 0.5) -> float:
    """Coerce an LLM-provided confidence to float, using default when it is missing or invalid."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default
 # ── Sigma conversion ───────────────────────────────────────────────────────────
 # Node colour per entity type_ref; to_sigma falls back to grey (#aaaaaa) for
 # any type that is not listed here (e.g. custom or suggested types).
 TYPE_COLORS = {
    "person": "#e74c3c",
    "organization": "#3498db",
    "location": "#2ecc71",
    "event": "#f39c12",
    "email": "#9b59b6",
    "domain": "#1abc9c",
    "ip_address": "#e67e22",
    "phone": "#95a5a6",
    "social_media": "#e91e63",
    "document": "#607d8b",
    "crypto_wallet": "#ff9800",
    "malware": "#f44336",
    "vulnerability": "#ff5722",
    "concept": "#00bcd4",
    "url": "#8bc34a",
    "date_reference": "#cddc39",
    "quantity": "#ffc107",
    "coordinates": "#4caf50",
    "text_fragment": "#78909c",
 }
 # Entity attribute keys that are never copied onto a node:
 # - "type" is reserved by sigma.js for the node render program;
 # - "hidden", "x", "y" were already excluded by the original filter;
 # - "label", "color", "size", "entity_type" are computed by to_sigma and must
 #   not be overwritten by (stringified) entity attributes of the same name.
 _RESERVED_NODE_ATTRS = frozenset(
    {"type", "hidden", "x", "y", "label", "color", "size", "entity_type"}
 )
 def to_sigma(entities, relations, entity_id_map):
    """Convert deduplicated entities and relations into a sigma.js-style graph dict.
    Args:
        entities: objects exposing .name, .type_ref and .attributes (dict or None).
        relations: objects exposing .from_id, .to_id, .from_name, .to_name and
            .relation_type; the *_name value is used when the *_id is falsy.
        entity_id_map: mapping from entity name (exact, or lower-cased and
            stripped) to node key. Names missing from the map use the name
            itself as key.
    Returns:
        {"nodes": [...], "edges": [...]}. There is one node per distinct key,
        sized by relation degree (4 plus 2 per relation, capped at +20) and
        coloured by type. Edges are kept only between existing nodes, without
        self-loops and without duplicate (source, target, relation_type)
        triples.
    """
    name_to_key = {}
    for e in entities:
        name_to_key[e.name] = entity_id_map.get(
            e.name, entity_id_map.get(e.name.lower().strip(), e.name)
        )
    degree = {}
    for r in relations:
        fid = r.from_id or r.from_name
        tid = r.to_id or r.to_name
        degree[fid] = degree.get(fid, 0) + 1
        degree[tid] = degree.get(tid, 0) + 1
    nodes = []
    seen_keys = set()
    for e in entities:
        key = name_to_key.get(e.name, e.name)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        attrs = {
            k: str(v)
            for k, v in (e.attributes or {}).items()
            if v is not None and k not in _RESERVED_NODE_ATTRS
        }
        nodes.append({
            "key": key,
            "attributes": {
                "label": e.name,
                "color": TYPE_COLORS.get(e.type_ref, "#aaaaaa"),
                "size": 4 + min(degree.get(key, 0) * 2, 20),
                "entity_type": e.type_ref,
                **attrs,
            },
        })
    edges = []
    seen_edges = set()
    for i, r in enumerate(relations):
        fid = r.from_id or r.from_name
        tid = r.to_id or r.to_name
        if fid not in seen_keys or tid not in seen_keys or fid == tid:
            continue
        edge_key = (fid, tid, r.relation_type)
        if edge_key in seen_edges:
            continue
        seen_edges.add(edge_key)
        edges.append({
            # The key keeps the relation's position in the input list.
            "key": f"e{i}",
            "source": fid,
            "target": tid,
            "attributes": {"label": r.relation_type},
        })
    return {"nodes": nodes, "edges": edges}
 # ── Reclasificación de entidades genéricas ─────────────────────────────────────
 # type_refs of the generic presets; only entities carrying one of these types
 # are candidates for reclassification.
 GENERIC_TYPE_REFS = {"concept", "text_fragment", "url", "date_reference", "quantity", "coordinates"}
 def reclassify_generic_entities(entities, new_presets, workers=4):
    """Reclassify generic entities in place using newly discovered types.
    Instead of re-processing chunks, the generic entities are sent to the LLM
    in batches of 30 (run in parallel) together with the new presets. An
    answer is applied only when its index belongs to the batch that was sent
    and its type_ref is a non-empty string, so the LLM cannot retype entities
    it was never asked about.
    Args:
        entities: list of entity objects (.name, .type_ref, .attributes);
            matching entries are mutated in place.
        new_presets: preset dicts with 'label', 'type_ref' and optionally
            'metadata_fields'.
        workers: maximum number of concurrent LLM calls.
    Returns:
        Number of distinct entities whose type was changed (0 when there are
        no generic entities or no new presets).
    """
    generic = [(i, e) for i, e in enumerate(entities) if e.type_ref in GENERIC_TYPE_REFS]
    if not generic or not new_presets:
        return 0
    type_lines = []
    for p in new_presets:
        fields = ", ".join(p.get("metadata_fields", []))
        type_lines.append(f"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]")
    system = (
        "You reclassify entities into more specific types. "
        "For each entity, decide if it fits one of the NEW types below better than its current generic type. "
        "If it fits, return the new type_ref and updated attributes. If not, return null.\n\n"
        "NEW TYPES:\n" + "\n".join(type_lines) + "\n\n"
        'OUTPUT: {"reclassified": [{"index": 0, "type_ref": "new_type", "attributes": {...}}, ...]}\n'
        "Only include entities that should change. Omit those that should stay as-is."
    )
    # Batches of 30 entities keep each request within the model context.
    batch_size = 30
    def _reclassify_batch(batch):
        items = [{"index": idx, "name": e.name, "current_type": e.type_ref,
                   "attributes": e.attributes} for idx, e in batch]
        try:
            resp = claude_haiku_json([
                {"role": "system", "content": system},
                {"role": "user", "content": json.dumps(items, ensure_ascii=False)},
            ])
        except Exception as e:
            # Best effort, but report the failure instead of hiding it.
            print(f"  [WARN] reclassify batch: {e}")
            return []
        reclassified = resp.get("reclassified") if isinstance(resp, dict) else None
        return reclassified if isinstance(reclassified, list) else []
    batches = [generic[i:i+batch_size] for i in range(0, len(generic), batch_size)]
    changed = set()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_reclassify_batch, b): b for b in batches}
        # Results are applied here, in the calling thread, one future at a time.
        for future in as_completed(futures):
            allowed = {idx for idx, _ in futures[future]}
            for item in future.result():
                if not isinstance(item, dict):
                    continue
                idx = item.get("index")
                new_ref = item.get("type_ref")
                if not isinstance(idx, int) or idx not in allowed:
                    continue
                if not isinstance(new_ref, str) or not new_ref:
                    continue
                entity = entities[idx]
                entity.type_ref = new_ref
                new_attrs = item.get("attributes")
                if isinstance(new_attrs, dict) and new_attrs:
                    if entity.attributes is None:
                        entity.attributes = {}
                    entity.attributes.update(new_attrs)
                changed.add(idx)
    return len(changed)
 # ── Main ───────────────────────────────────────────────────────────────────────
 def main():
    """Run the extraction pipeline for the document given on the command line.
    Usage: python extract.py <file> [workers]
    Steps: extract and preprocess the text, split it into chunks, extract
    entities and relations from the chunks in parallel, deduplicate, register
    newly suggested entity types (reclassifying generic entities with them),
    render the graph through render_sigma_html and dump the result as JSON.
    Both outputs go to the data/ directory next to this script. Exits with
    status 1 when the command-line arguments are invalid.
    """
    usage = "Uso: python extract.py <archivo> [workers]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)
    file_path = sys.argv[1]
    # Relative paths are resolved against the script directory, not the CWD.
    if not os.path.isabs(file_path):
        file_path = os.path.join(os.path.dirname(__file__), file_path)
    try:
        workers = int(sys.argv[2]) if len(sys.argv) > 2 else 4
    except ValueError:
        print(usage)
        sys.exit(1)
    # ThreadPoolExecutor rejects max_workers <= 0.
    workers = max(1, workers)
    print("=== Ontology Graph Extraction ===")
    print(f"File: {file_path}")
    print(f"Workers: {workers}")
    start = time.monotonic()
    # 1. Extract and preprocess the text
    print("\n[1/5] Extracting text...")
    raw = extract_text_from_file(file_path)
    text = preprocess_text(raw)
    print(f"  {len(text)} chars")
    # 2. Chunking
    print("[2/5] Chunking...")
    chunks = split_text_into_chunks(text, chunk_size=2000, overlap=200)
    print(f"  {len(chunks)} chunks")
    # 3. Parallel extraction
    custom = load_custom_presets()
    # Only non-promoted custom presets are used (promoted ones are expected to
    # already live in the registry).
    active_custom = [p for p in custom if not p.get("promoted", False)]
    all_presets = OSINT_PRESETS + GENERIC_PRESETS + active_custom
    print(f"  Presets: {len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic + {len(active_custom)} custom")
    system_prompt = build_unified_prompt(all_presets, RELATION_TYPES)
    print(f"[3/5] Extracting entities + relations ({workers} workers)...")
    all_entities = []
    all_relations = []
    all_suggested = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(process_chunk, i, chunk, system_prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                ents, rels, sugg = future.result()
            except Exception as e:
                # One chunk failing to parse must not abort the whole run.
                print(f"  [WARN] chunk {idx+1}/{len(chunks)}: {e}")
                continue
            all_entities.extend(ents)
            all_relations.extend(rels)
            all_suggested.extend(sugg)
            print(f"  chunk {idx+1}/{len(chunks)}: {len(ents)} entities, {len(rels)} relations" +
                  (f", {len(sugg)} new types" if sugg else ""))
    # 4. Deduplication
    print("\n[4/5] Deduplicating...")
    print(f"  Raw: {len(all_entities)} entities, {len(all_relations)} relations")
    dedup = deduplicate_entities(all_entities, name_threshold=0.85)
    final_entities = dedup.entities
    entity_id_map = dedup.name_to_id
    final_relations = deduplicate_relations(all_relations, entity_id_map)
    print(f"  Final: {len(final_entities)} entities, {len(final_relations)} relations")
    print(f"  Merged: {dedup.total_before - dedup.total_after} entities, "
          f"{len(all_relations) - len(final_relations)} relations")
    # Register suggested types in custom_presets.json
    unique_suggested = []
    if all_suggested:
        seen = set()
        for s in all_suggested:
            # Suggestions are LLM output: ignore anything that is not an object
            # with a non-empty string type_ref.
            if not isinstance(s, dict):
                continue
            key = s.get("type_ref", "")
            if isinstance(key, str) and key and key not in seen:
                seen.add(key)
                unique_suggested.append(s)
        source_doc = os.path.basename(file_path)
        added = merge_suggested_into_custom(unique_suggested, source_doc)
        total_custom = len(load_custom_presets())
        if added:
            print(f"\n  New types registered ({len(added)}):")
            for p in added:
                print(f"    + {p['label']} ({p['type_ref']}): {p['metadata_fields']}")
                print(f"      Reason: {p['reason']}")
            print(f"  Total custom presets: {total_custom} (in {CUSTOM_PRESETS_PATH})")
            # Reclassify generic entities with the newly discovered types
            n_generic = sum(1 for e in final_entities if e.type_ref in GENERIC_TYPE_REFS)
            if n_generic > 0:
                print(f"\n  Reclassifying {n_generic} generic entities with new types...")
                changed = reclassify_generic_entities(final_entities, added, workers=workers)
                print(f"  Reclassified: {changed}/{n_generic}")
        else:
            print(f"\n  {len(unique_suggested)} suggested types already registered ({total_custom} total custom)")
    # Per-type statistics
    type_counts = {}
    for e in final_entities:
        type_counts[e.type_ref] = type_counts.get(e.type_ref, 0) + 1
    print("\n  Entity types:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"    {t}: {c}")
    rel_counts = {}
    for r in final_relations:
        rel_counts[r.relation_type] = rel_counts.get(r.relation_type, 0) + 1
    print("  Relation types:")
    for t, c in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f"    {t}: {c}")
    # 5. Visualization
    print("\n[5/5] Generating graph...")
    graph = to_sigma(final_entities, final_relations, entity_id_map)
    out_dir = os.path.join(os.path.dirname(__file__), "data")
    # data/ is git-ignored, so it may not exist yet when the input lives elsewhere.
    os.makedirs(out_dir, exist_ok=True)
    html_path = render_sigma_html(graph, os.path.join(out_dir, "ontology_graph.html"), "Ontology Graph")
    print(f"  {len(graph['nodes'])} nodes, {len(graph['edges'])} edges")
    print(f"  HTML: file://{html_path}")
    # Save the intermediate JSON. Explicit UTF-8 because ensure_ascii=False
    # writes accented characters verbatim.
    json_path = os.path.join(out_dir, "extraction_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "entities": [{"name": e.name, "type_ref": e.type_ref,
                          "confidence": e.confidence, "attributes": e.attributes}
                         for e in final_entities],
            "relations": [{"from": r.from_name, "to": r.to_name,
                           "type": r.relation_type, "confidence": r.confidence,
                           "description": r.description}
                          for r in final_relations],
            "suggested_types": [dict(s) for s in unique_suggested],
        }, f, ensure_ascii=False, indent=2)
    print(f"  JSON: {json_path}")
    elapsed = time.monotonic() - start
    print(f"\nDone in {elapsed:.1f}s")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,43 @@
 """Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
 def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the available entity types as a numbered, LLM-readable prompt section.
    Each preset becomes "<n>. <label> (type_ref: <type_ref>)", followed by an
    indented "Attributes: ..." line when it declares metadata fields. Entries
    are separated by one blank line.
    Args:
        entity_presets: preset dicts with 'label', 'type_ref' and optionally
            'metadata_fields'. A missing label is shown as "Unknown" and a
            missing type_ref as an empty string. Example:
            [{"type_ref": "osint_person_go_cybersecurity",
              "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
    Returns:
        The formatted section, or an empty string when there are no presets.
    """
    if not entity_presets:
        return ""
    def _entry(position, preset):
        header = f"{position}. {preset.get('label', 'Unknown')} (type_ref: {preset.get('type_ref', '')})"
        fields = preset.get("metadata_fields", [])
        if not fields:
            return header
        return header + "\n   Attributes: " + ", ".join(fields)
    entries = [_entry(n, preset) for n, preset in enumerate(entity_presets, start=1)]
    return "Entity types available for extraction:\n\n" + "\n\n".join(entries)
@@ -0,0 +1,22 @@
 """Genera la seccion del system prompt con los tipos de relacion permitidos."""
 def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the allowed relation types as an LLM-readable prompt section.
    Args:
        relation_types: relation type names, e.g.
            ["funds", "employs", "communicates_with"].
    Returns:
        "Allowed relation types:" followed, on the next line, by the names
        joined with ", "; an empty string when the list is empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
@@ -0,0 +1,814 @@
 """Core functional programming utilities — pure functions for list/collection operations."""
 import hashlib
 import re
 from functools import reduce as _reduce
 from typing import Any, Callable, Dict, List, Optional, Tuple
 def filter_list(xs: list, pred: Callable) -> list:
    """Return a new list with the elements of xs for which pred(x) is truthy.
    Element order is preserved and xs itself is not mutated.
    """
    return [x for x in xs if pred(x)]
 def map_list(xs: list, fn: Callable) -> list:
    """Return a new list holding fn(x) for every x in xs, in order; xs is left untouched."""
    return list(map(fn, xs))
 def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
    """Fold xs from the left: start with initial and apply acc = fn(acc, x) per element.
    Returns initial unchanged when xs is empty.
    """
    acc = initial
    for item in xs:
        acc = fn(acc, item)
    return acc
 def flat_map(xs: list, fn: Callable) -> list:
    """Apply fn to every element and concatenate the resulting iterables (one level)."""
    return [inner for outer in xs for inner in fn(outer)]
 def flatten(xss: list) -> list:
    """Concatenate the inner iterables of xss into a single list (one level only)."""
    return [item for inner in xss for item in inner]
 def chunk(xs: list, size: int) -> list:
    """Split xs into consecutive slices of at most size elements.
    The last slice may be shorter. A size of zero or less yields an empty list.
    """
    if size <= 0:
        return []
    starts = range(0, len(xs), size)
    return [xs[start:start + size] for start in starts]
 def take(xs: list, n: int) -> list:
    """Return the first n elements of xs as a new sequence (xs[:n]).
    Slice semantics apply: an n larger than len(xs) returns everything, and a
    negative n drops the last |n| elements instead of raising.
    """
    return xs[:n]
 def drop(xs: list, n: int) -> list:
    """Return xs without its first n elements as a new sequence (xs[n:]).
    Slice semantics apply: an n larger than len(xs) returns an empty
    sequence, and a negative n keeps only the last |n| elements.
    """
    return xs[n:]
 def unique(xs: list) -> list:
    """Return the distinct elements of xs, keeping the first occurrence of each in order.
    Elements are compared by hash and equality, so they must be hashable.
    """
    return list(dict.fromkeys(xs))
 def group_by(xs: list, key_fn: Callable) -> Dict:
    """Bucket the elements of xs by key_fn(x).
    Returns a dict mapping each key to the list of its elements; keys appear
    in first-seen order and elements keep their original order.
    """
    buckets: Dict = {}
    for item in xs:
        buckets.setdefault(key_fn(item), []).append(item)
    return buckets
 def partition(xs: list, pred: Callable) -> Tuple[list, list]:
    """Split xs into (elements where pred is truthy, elements where it is falsy).
    pred is called once per element and order is preserved in both lists.
    """
    accepted: list = []
    rejected: list = []
    for item in xs:
        target = accepted if pred(item) else rejected
        target.append(item)
    return (accepted, rejected)
 def find(xs: list, pred: Callable) -> Any:
    """Return the first element of xs satisfying pred, or None when there is none."""
    return next((item for item in xs if pred(item)), None)
 def find_index(xs: list, pred: Callable) -> int:
    """Return the position of the first element satisfying pred, or -1 when there is none."""
    return next((pos for pos, item in enumerate(xs) if pred(item)), -1)
 def zip_with(xs: list, ys: list, fn: Callable) -> list:
    """Combine xs and ys pairwise with fn(x, y); the result is as long as the shorter input."""
    return list(map(fn, xs, ys))
 def all_of(xs: list, pred: Callable) -> bool:
    """Return True when pred holds for every element of xs (True for an empty list).
    Stops at the first element for which pred is falsy.
    """
    for item in xs:
        if not pred(item):
            return False
    return True
 def any_of(xs: list, pred: Callable) -> bool:
    """Return True when pred holds for at least one element of xs (False for an empty list).
    Stops at the first element for which pred is truthy.
    """
    for item in xs:
        if pred(item):
            return True
    return False
 def pipe(value: Any, *fns: Callable) -> Any:
    """Thread value through fns from left to right: pipe(x, f, g) == g(f(x)).
    With no functions the value is returned unchanged.
    """
    return _reduce(lambda acc, fn: fn(acc), fns, value)
 def compose(*fns: Callable) -> Callable:
    """Build a function applying fns from right to left: compose(f, g)(x) == f(g(x)).
    With no functions the returned callable is the identity.
    """
    def composed(x: Any) -> Any:
        return _reduce(lambda acc, fn: fn(acc), reversed(fns), x)
    return composed
 # ── Tree manipulation ────────────────────────────────────────────────────────
 def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a list of independent node copies without children.
    A dict contributes a deep copy of itself minus its 'nodes' key, followed
    by the flattened content of every value whose key contains the substring
    'nodes'. A list is flattened item by item. Anything else contributes
    nothing. The input is not modified.
    Args:
        structure: a node dict, a list of nodes, or any other value.
    Returns:
        Node copies in depth-first order.
    """
    import copy
    if isinstance(structure, dict):
        # Drop 'nodes' on a shallow copy first, so the deep copy covers only
        # this node's own fields. Deep-copying the whole subtree at every
        # level just to discard the children made the function quadratic.
        own_fields = copy.copy(structure)
        own_fields.pop('nodes', None)
        flat = [copy.deepcopy(own_fields)]
        for key, value in structure.items():
            if 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        return [node for item in structure for node in flatten_tree(item)]
    return []
 def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a hierarchical tree in depth-first order.
    The original dict objects are returned (no copies), internal nodes
    included, each still carrying its 'nodes' children. Values that are
    neither dict nor list contribute nothing.
    """
    def _walk(item):
        if isinstance(item, dict):
            yield item
            if 'nodes' in item:
                yield from _walk(item['nodes'])
        elif isinstance(item, list):
            for child in item:
                yield from _walk(child)
    return list(_walk(structure))
 def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect copies of the leaf nodes of a hierarchical tree.
    A dict is a leaf when its 'nodes' entry is missing or empty; the result
    then holds a deep copy of it without the 'nodes' key. Otherwise every
    value whose key contains the substring 'nodes' is searched. Lists are
    searched item by item and any other value yields nothing.
    """
    import copy
    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]
    if not isinstance(structure, dict):
        return []
    if not structure.get('nodes'):
        leaf = copy.deepcopy(structure)
        leaf.pop('nodes', None)
        return [leaf]
    return [
        leaf
        for key in structure
        if 'nodes' in key
        for leaf in get_leaf_nodes(structure[key])
    ]
 def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Number every dict node of a tree in place, in depth-first order.
    Each dict gets data['node_id'] set to the running counter, zero-padded to
    four digits (the default start of 0 gives "0000"). Children are reached
    through every key containing the substring 'nodes'.
    Args:
        data: a node dict, a list of nodes, or any other value (ignored).
        node_id: first number to assign.
    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for entry in data:
            node_id = write_node_ids(entry, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id
    data['node_id'] = str(node_id).zfill(4)
    next_id = node_id + 1
    # Iterate over a snapshot of the keys, as the node was just modified.
    for key in tuple(data):
        if 'nodes' in key:
            next_id = write_node_ids(data[key], next_id)
    return next_id
 def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items by their dotted 'structure' code (e.g. '1.2.3').
    Every item becomes a node with 'title', 'start_index' and 'end_index'. A
    node is attached to the node whose code is its own code minus the last
    dotted component, provided that parent appeared earlier in the list;
    otherwise it becomes a root. Nodes without children carry no 'nodes' key.
    Args:
        data: item dicts; 'structure', 'title', 'start_index' and 'end_index'
            are read with .get, so missing values become None.
    Returns:
        The list of root nodes.
    """
    by_code = {}
    roots = []
    created = []
    for item in data:
        code = item.get('structure')
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': []
        }
        by_code[code] = node
        created.append(node)
        # Everything before the last dot; empty when there is no parent.
        parent_code = str(code).rpartition('.')[0] if code else None
        if parent_code and parent_code in by_code:
            by_code[parent_code]['nodes'].append(node)
        else:
            roots.append(node)
    # Remove the empty child lists once the whole tree is wired up.
    for node in created:
        if not node['nodes']:
            del node['nodes']
    return roots
 def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the given keys removed at every level.
    Args:
        data: Arbitrarily nested dicts and lists; other values are returned as is.
        fields: Keys to drop from every dict. Defaults to ['text'].
    Returns:
        A new structure of the same shape without the listed keys.
    """
    dropped = ['text'] if fields is None else fields
    if isinstance(data, list):
        return [remove_tree_fields(element, dropped) for element in data]
    if not isinstance(data, dict):
        return data
    return {
        key: remove_tree_fields(value, dropped)
        for key, value in data.items()
        if key not in dropped
    }
 def format_tree_structure(structure: Any, order: Optional[List[str]] = None) -> Any:
    """Return a tree whose node dicts contain the keys of `order`, in that order.
    Keys that are not listed in `order` are dropped from the result, and an
    empty or missing 'nodes' entry is omitted. The input tree is never
    modified: every node is rebuilt as a new dict.
    Args:
        structure: A node dict, a list of nodes, or any other value.
        order: Desired key order. When empty or None the input is returned
            unchanged (same object).
    Returns:
        The reordered tree, or `structure` itself when there is nothing to do.
    """
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    if not isinstance(structure, dict):
        return structure
    # Work on a shallow copy so the caller's node keeps its 'nodes' entry.
    node = dict(structure)
    if 'nodes' in node:
        node['nodes'] = format_tree_structure(node['nodes'], order)
    if not node.get('nodes'):
        node.pop('nodes', None)
    return {key: node[key] for key in order if key in node}
 def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index every node of a tree by its 'node_id'.
    Nodes without a truthy 'node_id' are skipped (their children are still
    visited). Values are the original node dicts, not copies.
    Args:
        tree: List of root nodes; children live under the 'nodes' key.
    Returns:
        Dict mapping node_id to node, filled in depth-first order.
    """
    def _walk(nodes):
        # Pre-order traversal: a node is yielded before its children.
        for node in nodes:
            yield node
            children = node.get('nodes')
            if children:
                yield from _walk(children)
    return {node['node_id']: node for node in _walk(tree) if node.get('node_id')}
 # ── Text / JSON extraction ───────────────────────────────────────────────────
 def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response, best effort.
    Handles an optional ```json fenced block (also when the closing fence is
    missing, e.g. a truncated answer), Python-style None (rewritten to null)
    and trailing commas before ']' or '}'.
    Args:
        content: Raw LLM response text.
    Returns:
        The parsed JSON value, or {} when the input is not a string or
        nothing parseable is found.
    """
    import json
    import re
    if not isinstance(content, str):
        return {}
    opening = "```json"
    fence_start = content.find(opening)
    if fence_start != -1:
        body_start = fence_start + len(opening)
        fence_end = content.rfind("```")
        # Without a closing fence, rfind lands on the opening fence itself;
        # take everything up to the end of the text instead of an empty slice.
        if fence_end < body_start:
            fence_end = len(content)
        json_content = content[body_start:fence_end]
    else:
        json_content = content
    # NOTE: this textual replacement also rewrites 'None' inside string values.
    json_content = json_content.replace('None', 'null')
    # Collapse all whitespace runs (newlines included) into single spaces.
    json_content = ' '.join(json_content.split())
    try:
        return json.loads(json_content)
    except (ValueError, RecursionError):
        pass
    # Second attempt: drop trailing commas, including ", ]" and ", }" with
    # whitespace left between the comma and the closing bracket.
    repaired = re.sub(r',\s*([\]}])', r'\1', json_content)
    try:
        return json.loads(repaired)
    except (ValueError, RecursionError):
        return {}
 def parse_page_range(pages: str) -> List[int]:
    """Expand a page specification such as '5-7', '3,8' or '12' into page numbers.
    Args:
        pages: Comma-separated page numbers and/or 'start-end' ranges.
    Returns:
        Sorted list of unique page numbers.
    Raises:
        ValueError: If a range has start > end or a part is not an integer.
    """
    collected = set()
    for chunk in pages.split(','):
        chunk = chunk.strip()
        if '-' not in chunk:
            collected.add(int(chunk))
            continue
        low_text, _, high_text = chunk.partition('-')
        low = int(low_text.strip())
        high = int(high_text.strip())
        if low > high:
            raise ValueError(f"Invalid range '{chunk}': start must be <= end")
        collected.update(range(low, high + 1))
    return sorted(collected)
 # ── Markdown parsing ─────────────────────────────────────────────────────────
 def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect the ATX headers (# to ######) of a markdown document.
    Lines inside ``` fenced blocks are ignored. Matching is done on the
    stripped line, so indented headers are recognised too.
    Args:
        markdown_content: Full markdown text.
    Returns:
        Tuple (headers, lines): headers is a list of dicts with 'title',
        'level' (1-6) and 1-based 'line_num'; lines is the text split on '\\n'.
    """
    import re
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^```')
    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False
    for number, raw_line in enumerate(lines, 1):
        text = raw_line.strip()
        if fence_re.match(text):
            # A fence line toggles the state and is never a header itself.
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = heading_re.match(text)
        if found is None:
            continue
        hashes, title = found.groups()
        headers.append({'title': title.strip(), 'level': len(hashes), 'line_num': number})
    return headers, lines
 def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers by level (h1 > h2 > h3 ...).
    Each header becomes a node with 'title', a zero-padded sequential
    'node_id' starting at '0001', 'line_num' and, when it has children,
    'nodes'. A header is attached to the closest preceding header with a
    strictly smaller level.
    Args:
        node_list: Dicts with 'title', 'level' and 'line_num', in document order.
    Returns:
        List of root nodes (empty for empty input).
    """
    if not node_list:
        return []
    roots = []
    ancestors = []  # stack of (level, node) for the currently open branch
    for sequence, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(sequence).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }
        # Close every open header that is not a strict ancestor of this one.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()
        if ancestors:
            ancestors[-1][1]['nodes'].append(entry)
        else:
            roots.append(entry)
        ancestors.append((level, entry))
    # Remove the placeholder 'nodes' list from headers that stayed childless.
    pending = list(roots)
    while pending:
        node = pending.pop()
        if node['nodes']:
            pending.extend(node['nodes'])
        else:
            del node['nodes']
    return roots
 # ── Pagination / chunking ────────────────────────────────────────────────────
 def page_list_to_groups(
    page_contents: List[str],
    token_lengths: List[int],
    max_tokens: int = 20000,
    overlap_pages: int = 1,
 ) -> List[str]:
    """Group consecutive pages into text chunks of bounded token size.
    When everything fits into max_tokens a single chunk is returned.
    Otherwise pages are accumulated until adding the next one would exceed a
    threshold halfway between the even split size and max_tokens; each new
    chunk starts with the last overlap_pages pages of the previous one.
    A single page larger than the threshold still forms its own chunk, and
    no empty chunk is ever emitted.
    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to page_contents.
        max_tokens: Token budget used to size the chunks.
        overlap_pages: Number of preceding pages repeated at the start of
            every chunk after the first.
    Returns:
        List of chunk strings (pages concatenated without separator).
    """
    import math
    num_tokens = sum(token_lengths)
    if num_tokens <= max_tokens:
        return ["".join(page_contents)]
    expected_parts = math.ceil(num_tokens / max_tokens)
    # Aim between a perfectly even split and the hard limit.
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
    subsets: List[str] = []
    current_subset: List[str] = []
    current_token_count = 0
    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Only close a chunk that actually holds pages; otherwise an oversized
        # first page would produce an empty leading chunk.
        if current_subset and current_token_count + page_tokens > avg_tokens:
            subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])
        current_subset.append(page_content)
        current_token_count += page_tokens
    if current_subset:
        subsets.append(''.join(current_subset))
    return subsets
 def calculate_page_offset(pairs: List[Dict]) -> int:
    """Find the most common offset between physical indices and logical page numbers.
    Args:
        pairs: Reference dicts with 'physical_index' and 'page'. Entries that
            lack a key or hold non-subtractable values are ignored.
    Returns:
        The most frequent value of physical_index - page (the first one seen
        wins a tie), or 0 when no pair is usable.
    """
    def _offsets():
        for entry in pairs:
            try:
                yield entry['physical_index'] - entry['page']
            except (KeyError, TypeError):
                continue
    frequency: Dict[int, int] = {}
    for offset in _offsets():
        frequency[offset] = frequency.get(offset, 0) + 1
    if not frequency:
        return 0
    # max() keeps the first maximal key, i.e. the earliest-seen offset on ties.
    return max(frequency, key=frequency.get)
 # ── Text preprocessing ───────────────────────────────────────────────────────
 def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.
    Args:
        text: Raw text to normalize.
    Returns:
        Normalized text with consistent newlines, stripped lines, and at most
        one blank line between paragraphs.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip whitespace from each line first, so that whitespace-only lines
    # become truly empty and are seen by the blank-line collapse below.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to at most 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip globally
    return text.strip()
 def get_text_stats(text: str) -> dict:
    """Count characters, lines and words of a text.
    Args:
        text: Input text to analyze.
    Returns:
        Dict with keys total_chars, total_lines (number of '\\n' plus one, so
        the empty string counts as one line) and total_words
        (whitespace-separated tokens).
    """
    line_total = len(text.split('\n'))
    word_total = len(text.split())
    return dict(total_chars=len(text), total_lines=line_total, total_words=word_total)
 # ── Git URL parsing ──────────────────────────────────────────────────────────
 _DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
 def _sanitize_git_segment(segment: str) -> str:
    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
    if segment.endswith(".git"):
        segment = segment[:-4]
    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
 def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Extract the 'org/repo' part of a code-hosting URL.
    Accepted forms are http://, https://, git://, ssh:// URLs and the SSH
    shorthand git@host:path. Only the first two path segments are used, each
    one passed through _sanitize_git_segment.
    Args:
        url: Repository URL in any supported format.
        known_hosts: Accepted hostnames. Defaults to _DEFAULT_GIT_HOSTS.
    Returns:
        'org/repo', or None when the scheme is unsupported, the host is not
        accepted, fewer than two path segments exist, or a segment is empty
        after sanitizing.
    """
    from urllib.parse import urlparse
    hosts = _DEFAULT_GIT_HOSTS if known_hosts is None else known_hosts
    url = url.strip()
    if url.startswith("git@"):
        # SSH shorthand: git@github.com:org/repo.git
        host, colon, raw_path = url[len("git@"):].partition(":")
        if not colon:
            return None
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        parsed = urlparse(url)
        host = parsed.hostname or ""
        raw_path = parsed.path
    else:
        return None
    if host not in hosts:
        return None
    segments = [piece for piece in raw_path.split("/") if piece]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    if org and repo:
        return f"{org}/{repo}"
    return None
 def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Tell whether a URL points at a clonable git repository on an accepted host.
    git@host:path, ssh:// and git:// URLs are accepted on host match alone.
    http(s) URLs must have a path of exactly org/repo or org/repo/tree/<ref>;
    anything deeper (issues, blobs, PRs, ...) is rejected.
    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to _DEFAULT_GIT_HOSTS.
    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse
    hosts = _DEFAULT_GIT_HOSTS if known_hosts is None else known_hosts
    url = url.strip()
    if url.startswith("git@"):
        # SSH shorthand: repo-level whenever the host matches.
        host, colon, _ = url[len("git@"):].partition(":")
        return bool(colon) and host in hosts
    if url.startswith(("ssh://", "git://")):
        return (urlparse(url).hostname or "") in hosts
    if not url.startswith(("http://", "https://")):
        return False
    parsed = urlparse(url)
    if (parsed.hostname or "") not in hosts:
        return False
    segments = [piece for piece in parsed.path.split("/") if piece]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
 def validate_git_ssh_uri(url: str) -> None:
    """Check that a string has the git SSH shorthand shape git@host:path.
    Only the 'git@' prefix, the presence of ':' and a non-empty path are
    verified; the host part itself is not inspected.
    Args:
        url: URI string to validate.
    Raises:
        ValueError: If the URI does not conform to git SSH format.
    """
    prefix = "git@"
    if not url.startswith(prefix):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
    _host, colon, path = url[len(prefix):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
 # ---------------------------------------------------------------------------
 # Markdown parsing utilities
 # ---------------------------------------------------------------------------
 def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.
    Both LF and CRLF line endings are accepted, and the closing '---' may be
    the very last line of the input (no trailing newline required).
    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.
    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter is found, or when the
        YAML parses to something other than a mapping.
    """
    pattern = re.compile(r'^---\r?\n(.*?)\r?\n---(?:\r?\n|\Z)', re.DOTALL)
    match = pattern.match(content)
    if not match:
        return content, None
    raw = match.group(1)
    remaining = content[match.end():]
    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberately broad: covers a missing yaml package as well as any
        # parse error. Fallback is a flat 'key: value' reader in which every
        # value stays a string.
        data = {}
        for line in raw.splitlines():
            if ':' in line:
                key, _, value = line.partition(':')
                data[key.strip()] = value.strip()
    return remaining, data
 def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (# to ######), excluding those inside code blocks
    and HTML comments.
    A heading must start at the beginning of a line and have its title on
    that same line. Because of that anchoring, indented lines and escaped
    markers (\\#) can never match and need no separate filtering.
    Args:
        content: Markdown text to search.
    Returns:
        List of (start_pos, end_pos, title, level) for each heading found.
    """
    # Character spans in which a '#' line must not count as a heading:
    # closed triple-backtick blocks and HTML comments.
    excluded: List[Tuple[int, int]] = [
        m.span()
        for pattern in (r'```.*?```', r'<!--.*?-->')
        for m in re.finditer(pattern, content, re.DOTALL)
    ]
    def is_excluded(pos: int) -> bool:
        return any(start <= pos < end for start, end in excluded)
    results: List[Tuple[int, int, str, int]] = []
    # [^\S\n]+ is "whitespace except newline": with a plain \s+ a lone '#'
    # line would swallow the line break and take the next line as its title.
    for m in re.finditer(r'^(#{1,6})[^\S\n]+(\S.*)$', content, re.MULTILINE):
        if is_excluded(m.start()):
            continue
        level = len(m.group(1))
        title = m.group(2).strip()
        results.append((m.start(), m.end(), title, level))
    return results
 def estimate_token_count(content: str) -> int:
    """Roughly estimate the number of tokens in a text without a tokenizer.
    Each CJK character (ranges U+4E00-9FFF, U+3040-30FF, U+AC00-D7AF) is
    weighted 0.7, every other non-whitespace character 0.3; the sum is
    truncated to an int.
    Args:
        content: Text to estimate.
    Returns:
        Estimated integer token count.
    """
    # subn removes the CJK characters and reports how many there were.
    remainder, cjk_total = re.subn(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', '', content)
    other_total = len(re.findall(r'\S', remainder))
    return int(cjk_total * 0.7 + other_total * 0.3)
 def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
 ) -> List[str]:
    """Split large content into parts that respect token and character limits.
    Paragraphs (separated by a blank line) are packed greedily into parts and
    rejoined with a blank line. A paragraph that exceeds a limit on its own
    is emitted as fixed-size slices of max_chars characters.
    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part (paragraph separators are not
            counted).
    Returns:
        List of string parts; [content] when no part was produced.
    """
    chunks: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0
    for block in content.split('\n\n'):
        block_tokens = estimate_token_count(block)
        block_chars = len(block)
        oversized = block_tokens > max_tokens or block_chars > max_chars
        overflow = (
            pending_tokens + block_tokens > max_tokens
            or pending_chars + block_chars > max_chars
        )
        if oversized or overflow:
            # Close the part collected so far before handling this paragraph.
            if pending:
                chunks.append('\n\n'.join(pending))
            pending = []
            pending_tokens = 0
            pending_chars = 0
        if oversized:
            # Too big even on its own: hard-cut by character count.
            chunks.extend(
                block[start:start + max_chars]
                for start in range(0, block_chars, max_chars)
            )
            continue
        pending.append(block)
        pending_tokens += block_tokens
        pending_chars += block_chars
    if pending:
        chunks.append('\n\n'.join(pending))
    return chunks if chunks else [content]
 def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.
    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
    with underscores. When the result exceeds max_length it is truncated and
    suffixed with '_' plus the first 8 hex digits of the sha256 of the
    original text, so the returned string never exceeds max_length.
    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string. Below 9 there is
            no room for the '_<hash>' suffix, so only a prefix of the hash
            is returned.
    Returns:
        Safe path-friendly string; 'section' when nothing usable remains.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')
    if not cleaned:
        return 'section'
    if len(cleaned) <= max_length:
        return cleaned
    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
    room = max_length - len(suffix)
    if room < 0:
        # A negative slice bound would keep almost the whole string and
        # overshoot max_length; fall back to as much of the hash as fits.
        return suffix[1:][:max(max_length, 0)]
    return cleaned[:room] + suffix
@@ -0,0 +1,283 @@
 """Deduplica entidades candidatas usando fuzzy matching de nombres."""
 from __future__ import annotations
 import sys
 import os
 import uuid
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 from entity_candidate import EntityCandidate
 from deduplication_result import DeduplicationResult
 from normalize_entity_name import normalize_entity_name
 from merge_entity_attributes import merge_entity_attributes
 # ── Similitud helpers ──────────────────────────────────────────────────────────
 def _levenshtein(a: str, b: str) -> int:
    """Distancia de edicion Levenshtein entre dos strings."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]
 def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
    """Similitud de Jaccard entre dos conjuntos de tokens."""
    set_a = set(tokens_a)
    set_b = set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    inter = len(set_a & set_b)
    union = len(set_a | set_b)
    return inter / union if union else 0.0
 def _name_similarity(a: str, b: str) -> float:
    """Similarity score in [0, 1] between two normalized names.
    The base score is the larger of the length-normalized Levenshtein
    similarity and the Jaccard similarity over whitespace tokens. A bonus of
    0.3 (capped at 1.0) is added when one name contains the other, and
    another when one name is the acronym of the other's tokens
    (e.g. "FBI" ~ "Federal Bureau of Investigation").
    """
    if a == b:
        return 1.0
    longest = max(len(a), len(b))
    edit_score = 1.0 - (_levenshtein(a, b) / longest) if longest else 1.0
    words_a = a.split()
    words_b = b.split()
    token_score = _jaccard(words_a, words_b)
    score = max(edit_score, token_score)
    contained = a in b or b in a
    acronym = _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a)
    for bonus_applies in (contained, acronym):
        if bonus_applies:
            score = min(1.0, score + 0.3)
    return score
 def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
    """Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
    if not candidate or not tokens:
        return False
    initials = "".join(t[0] for t in tokens if t).upper()
    return candidate.upper() == initials
 _EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
 def _is_exact_type(entity_type: str) -> bool:
    """Tipos tecnicos donde solo se acepta matching exacto."""
    return entity_type.lower() in _EXACT_TYPES
 # ── Union-Find ─────────────────────────────────────────────────────────────────
 class _UnionFind:
    def __init__(self, n: int) -> None:
        self._parent = list(range(n))
        self._rank = [0] * n
    def find(self, x: int) -> int:
        while self._parent[x] != x:
            self._parent[x] = self._parent[self._parent[x]]
            x = self._parent[x]
        return x
    def union(self, x: int, y: int) -> None:
        rx, ry = self.find(x), self.find(y)
        if rx == ry:
            return
        if self._rank[rx] < self._rank[ry]:
            rx, ry = ry, rx
        self._parent[ry] = rx
        if self._rank[rx] == self._rank[ry]:
            self._rank[rx] += 1
 # ── Implementacion principal ────────────────────────────────────────────────────
 def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
 ) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.
    Uses fuzzy name matching (Levenshtein + Jaccard) and Union-Find to detect
    transitive clusters. For every cluster one canonical entity is produced
    by merging the attributes of all its members.
    For technical types (ip, email, domain, crypto_wallet, phone) only an
    exact match of the normalized names is accepted and the name threshold
    is ignored.
    Args:
        candidates: list of EntityCandidate to deduplicate.
        name_threshold: minimum score for two names to be considered equal (0-1).
        same_type_only: if True, only entities with the same type_ref are compared.
    Returns:
        DeduplicationResult with the deduplicated entities, the resolution
        maps and the merge history.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )
    n = len(candidates)
    # Step 1: normalize names
    normalized: list[str] = []
    for c in candidates:
        norm = normalize_entity_name(c.name, c.type_ref)
        normalized.append(norm)
    # Step 2: Union-Find over all candidate indices
    uf = _UnionFind(n)
    # Step 3: pairwise comparison (restricted to equal types if same_type_only)
    merge_pairs: list[tuple[int, int, float]] = []
    for i in range(n):
        for j in range(i + 1, n):
            if same_type_only and candidates[i].type_ref != candidates[j].type_ref:
                continue
            ni, nj = normalized[i], normalized[j]
            # NOTE(review): only candidate i's type decides exact vs. fuzzy
            # matching; with same_type_only=False candidate j may be of a
            # different type - confirm this is intended.
            et = candidates[i].type_ref.lower()
            if _is_exact_type(et):
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue
            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))
    # Step 4: group indices by their Union-Find root
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        root = uf.find(i)
        clusters.setdefault(root, []).append(i)
    # Step 5: merge each cluster
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []
    # Matched pairs grouped by final root, used to build the merge log
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        root = uf.find(i)
        merged_pairs_by_root.setdefault(root, []).append((i, j, score))
    for root, indices in clusters.items():
        cluster_candidates = [candidates[idx] for idx in indices]
        if len(cluster_candidates) == 1:
            # Singleton cluster: carry the candidate over unchanged
            c = cluster_candidates[0]
            canonical_name = c.name
            canonical_norm = normalized[indices[0]]
            merged_attrs = c.attributes
            merged_confidence = c.confidence
            merged_chunks = list(c.source_chunk_indices)
            merged_from = list(c.merged_from) if c.merged_from else [c.name]
        else:
            # The candidate with the highest confidence becomes the canonical one
            best = max(cluster_candidates, key=lambda c: c.confidence)
            canonical_name = best.name
            canonical_norm = normalize_entity_name(best.name, best.type_ref)
            merged_attrs = merge_entity_attributes(
                [c.attributes for c in cluster_candidates]
            )
            merged_confidence = max(c.confidence for c in cluster_candidates)
            # Union of chunk indices, first-seen order, without duplicates
            merged_chunks: list[int] = []
            seen_chunks: set[int] = set()
            for c in cluster_candidates:
                for idx in c.source_chunk_indices:
                    if idx not in seen_chunks:
                        merged_chunks.append(idx)
                        seen_chunks.add(idx)
            # Union of original names, first-seen order, without duplicates
            merged_from: list[str] = []
            seen_names: set[str] = set()
            for c in cluster_candidates:
                names_to_add = c.merged_from if c.merged_from else [c.name]
                for nm in names_to_add:
                    if nm not in seen_names:
                        merged_from.append(nm)
                        seen_names.add(nm)
            # Merge log entry; "reason" is always "fuzzy_name", also for
            # clusters formed through exact-type matching
            other_names = [c.name for c in cluster_candidates if c is not best]
            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": other_names,
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )
        # The generated id is only recorded in the two maps below; it is not
        # passed to the EntityCandidate constructor.
        ent_id = str(uuid.uuid4())
        # NOTE(review): type_ref/type_label come from the first cluster
        # member, not from `best`; they can differ when same_type_only=False.
        entity = EntityCandidate(
            name=canonical_name,
            name_normalized=canonical_norm,
            type_ref=cluster_candidates[0].type_ref,
            type_label=cluster_candidates[0].type_label,
            attributes=merged_attrs,
            confidence=merged_confidence,
            source_chunk_indices=merged_chunks,
            merged_from=merged_from,
        )
        merged_entities.append(entity)
        # Fill the resolution maps. Keys are plain names, so a later cluster
        # with the same name overwrites an earlier entry.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id
    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
@@ -0,0 +1,189 @@
 """Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
 import logging
 import os
 import sys
 logger = logging.getLogger(__name__)
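 # --- levenshtein_distance: prefer the implementation from cybersecurity ---
 # The import attempted further down supports two execution contexts:
 #   1. Run from python/functions/datascience/ (local pytest)
 #   2. Run from the registry root (fn run)
 # The pure-Python fallback defined next is used when that import fails.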
 def _levenshtein_distance(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between ``a`` and ``b``.
    Classic two-row dynamic programming: only the previous and the current
    row of the edit matrix are kept in memory.
    Args:
        a: first string.
        b: second string.
    Returns:
        Minimum number of single-character insertions, deletions and
        substitutions needed to turn one string into the other.
    """
    # The distance is symmetric, so always put the shorter string on the
    # column axis; this keeps each row as small as possible.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    if not shorter:
        return len(longer)
    previous = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        current = [row_no]
        for col_no, ch_short in enumerate(shorter, start=1):
            insertion = current[col_no - 1] + 1
            deletion = previous[col_no] + 1
            substitution = previous[col_no - 1] + (0 if ch_long == ch_short else 1)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
 # Try the shared implementation first: put the sibling "cybersecurity"
 # directory (relative to this file) on sys.path and import from it.
 # NOTE(review): this adds the cybersecurity directory itself to sys.path and
 # then imports a module named `cybersecurity` — confirm that layout resolves.
 try:
    _here = os.path.dirname(os.path.abspath(__file__))
    _cyber_path = os.path.join(_here, "..", "cybersecurity")
    if _cyber_path not in sys.path:
        sys.path.insert(0, _cyber_path)
    from cybersecurity import levenshtein_distance as _lev
 except ImportError:
    # Shared implementation not importable: use the local fallback instead.
    _lev = None  # type: ignore
 # Name used by the rest of this module: the imported function when available,
 # otherwise the pure-Python _levenshtein_distance defined above.
 levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
 def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
    """Resolve ``name`` to an entity id by fuzzy-matching it against the map keys.
    Every key of ``entity_id_map`` is compared with ``name`` using the
    module-level ``levenshtein_distance``; the id of the closest key is
    returned when its distance is within ``threshold``. On ties the first key
    in iteration order wins.
    A candidate is ignored when its distance is not smaller than the length of
    the longer of the two strings: the strings then have nothing in common,
    even though the absolute distance may be small (e.g. "bob" vs "amy" is
    distance 3). For the same reason an empty ``name`` never matches.
    Args:
        name: name to resolve (already lower-cased and stripped).
        entity_id_map: mapping of normalized name -> entity id.
        threshold: maximum edit distance accepted as a match (default 3).
    Returns:
        The entity id of the best match, or '' when there is no acceptable match.
    """
    if not name:
        # Every key of length <= threshold would otherwise "match" an empty name.
        return ""
    best_id = ""
    best_dist = threshold + 1
    for key, entity_id in entity_id_map.items():
        dist = levenshtein_distance(name, key)
        # A distance equal to the longer length is a total mismatch, not a typo.
        if dist >= max(len(name), len(key)):
            continue
        if dist < best_dist:
            best_dist = dist
            best_id = entity_id
    return best_id if best_dist <= threshold else ""
 def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
 ) -> list:
    """Deduplicate candidate relations, resolving endpoint names to final entity ids.
    Algorithm:
    1. For each RelationCandidate, resolve from_name and to_name to an entity
       id through entity_id_map. The lookup is case-insensitive on BOTH sides:
       the name is lower-cased/stripped and compared against the map keys
       normalized the same way. If there is no exact match, a fuzzy match via
       _fuzzy_resolve is attempted. If that also fails, the relation is
       dropped with a warning.
    2. Drop self-loops (from_id == to_id).
    3. Deduplicate by (from_id, to_id, relation_type):
       - description: unique descriptions joined with '; ' (first-seen order)
       - confidence: max of the group
       - names and source_chunk_index: taken from the first relation of the group
    4. Return the clean list of RelationCandidate with from_id and to_id set.
    Args:
        relations: list of RelationCandidate with the original from_name/to_name.
        entity_id_map: mapping name -> entity_id (output of
            deduplicate_entities). Keys may be normalized or in their original
            casing; both are handled.
    Returns:
        Deduplicated list of RelationCandidate with from_id and to_id resolved.
    Raises:
        ImportError: if the relation_candidate module cannot be imported.
    """
    # RelationCandidate lives under python/types/datascience; make that
    # directory importable relative to this file before importing it. (The
    # previous except-branch repeated the very same import, so it was dropped.)
    _types_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "..", "..", "python", "types", "datascience",
    )
    if _types_path not in sys.path:
        sys.path.insert(0, _types_path)
    from relation_candidate import RelationCandidate
    # Normalize the keys once. Without this, a key stored with its original
    # casing ("ACME Corp") could never be hit by the lower-cased query and the
    # case difference was charged as edit distance in the fuzzy step.
    normalized_map: dict[str, str] = {}
    for raw_key, mapped_id in entity_id_map.items():
        normalized_map.setdefault(raw_key.lower().strip(), mapped_id)
    # Fuzzy matching scans every key, so remember the answer per distinct name.
    fuzzy_cache: dict[str, str] = {}
    def _resolve(raw_name: str) -> str:
        """Return the entity id for ``raw_name`` or '' when it cannot be resolved."""
        key = raw_name.lower().strip()
        # A key that is already normalized keeps priority over case variants.
        exact = entity_id_map.get(key) or normalized_map.get(key)
        if exact:
            return exact
        if key not in fuzzy_cache:
            fuzzy_cache[key] = _fuzzy_resolve(key, normalized_map)
        return fuzzy_cache[key]
    resolved: list = []
    for rel in relations:
        from_id = _resolve(rel.from_name)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue
        to_id = _resolve(rel.to_name)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue
        # Both endpoints collapsed onto the same entity: not a real relation.
        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue
        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )
    # Group by (from_id, to_id, relation_type); dict order keeps first-seen order.
    groups: dict[tuple, list] = {}
    for rel in resolved:
        groups.setdefault((rel.from_id, rel.to_id, rel.relation_type), []).append(rel)
    result: list = []
    for (from_id, to_id, rel_type), group in groups.items():
        if len(group) == 1:
            result.append(group[0])
            continue
        # Merge: max confidence + unique non-empty descriptions in first-seen order.
        descriptions = list(dict.fromkeys(r.description for r in group if r.description))
        result.append(
            RelationCandidate(
                from_name=group[0].from_name,
                to_name=group[0].to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(descriptions),
                confidence=max(r.confidence for r in group),
                source_chunk_index=group[0].source_chunk_index,
            )
        )
    return result
@@ -0,0 +1,22 @@
 """DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
 from dataclasses import dataclass, field
 from entity_candidate import EntityCandidate
@dataclass
 class DeduplicationResult:
    """Result of the entity deduplication process.
    `name_to_id` maps ALL original names (including the merged ones) to their
    final ID, which makes it possible to resolve relations that use any
    variant of a name.
    """
    # Deduplicated entities (one per merged cluster).
    entities: list[EntityCandidate]
    # Normalized canonical name -> entity ID.
    entity_id_map: dict[str, str]
    # Any known name variant (original or canonical-normalized) -> entity ID.
    name_to_id: dict[str, str]
    # One record per merge; deduplicate_entities writes dicts with the keys
    # "canonical", "merged", "score" and "reason".
    merge_log: list[dict] = field(default_factory=list)
    # Entity counts before and after deduplication.
    total_before: int = 0
    total_after: int = 0
@@ -0,0 +1,34 @@
 """EntityCandidate — candidato de entidad extraido por el LLM."""
 from dataclasses import dataclass, field
@dataclass
class EntityCandidate:
    """Entity candidate extracted by the LLM.
    It can come from a single chunk or be the result of merging several
    extractions. `merged_from` tracks the original names for debugging.
    """
    # Name as extracted (or the canonical name after a merge).
    name: str
    # Normalized form of the name; empty until a caller fills it in.
    name_normalized: str = ""
    # Schema type reference and its human-readable label.
    type_ref: str = ""
    type_label: str = ""
    # Free-form attributes reported for the entity.
    attributes: dict = field(default_factory=dict)
    # Extraction confidence.
    confidence: float = 0.0
    # Indices of the chunks in which the entity was found.
    source_chunk_indices: list[int] = field(default_factory=list)
    # Original names folded into this candidate.
    merged_from: list[str] = field(default_factory=list)
    def to_dict(self) -> dict:
        """Serialize the candidate into a plain dictionary.
        The mutable containers (attributes, source_chunk_indices, merged_from)
        are shallow-copied so that editing the returned dict never alters the
        candidate itself.
        Returns:
            Dict with one key per dataclass field.
        """
        return {
            "name": self.name,
            "name_normalized": self.name_normalized,
            "type_ref": self.type_ref,
            "type_label": self.type_label,
            "attributes": dict(self.attributes),
            "confidence": self.confidence,
            "source_chunk_indices": list(self.source_chunk_indices),
            "merged_from": list(self.merged_from),
        }
@@ -0,0 +1,145 @@
 """Extrae entidades de un chunk de texto usando un LLM inyectado."""
 import sys
 import os
 import warnings
 from typing import Callable
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 from entity_candidate import EntityCandidate
 def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
    """Assemble the system prompt used for entity extraction.
    The prompt has three parts: a fixed introduction, one bullet per schema
    entry (plus an indented ``fields:`` line when the entry declares
    metadata_fields) and a fixed block with the output format and the rules,
    ending with ``language_instruction``.
    Args:
        entity_schema: list of dicts with 'label', 'type_ref' and optionally
            'metadata_fields'.
        language_instruction: sentence appended as the last rule.
    Returns:
        The prompt as a single newline-joined string.
    """
    intro = [
        "You are an entity extraction expert. Given text, extract all entities",
        "matching these types. For each entity, provide: name, type_ref,",
        "attributes (matching the metadata_fields for that type), and a",
        "confidence score (0.0-1.0).",
        "",
        "Entity types:",
    ]
    type_bullets: list[str] = []
    for entry in entity_schema:
        type_bullets.append(
            f"- {entry.get('label', 'Unknown')} (type_ref: {entry.get('type_ref', '')})"
        )
        field_names = entry.get("metadata_fields", [])
        if field_names:
            type_bullets.append(f"  fields: {', '.join(field_names)}")
    closing = [
        "",
        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
        "",
        "Rules:",
        "- Only extract entities explicitly mentioned in the text",
        "- Use the exact type_ref from the schema",
        "- Leave unknown attributes as null",
        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
        f"- {language_instruction}",
    ]
    return "\n".join(intro + type_bullets + closing)
 def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
 ) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.
    Builds a system prompt from the entity-type schema, calls the LLM and
    validates the answer, returning a list of EntityCandidate. A malformed
    answer never raises: a failing call, a payload that is not a dict or an
    'entities' value that is not a list emits a warning and yields an empty
    list. Individual items are skipped when they are not dicts, have no
    non-blank string name, or carry a type_ref that is not in the schema.
    Args:
        text: Text chunk to analyse.
        entity_schema: List of types with their metadata fields. Each entry is
            a dict with the keys 'type_ref', 'label' and optionally
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: Function that receives a list of OpenAI-style messages
            and returns a dict with the LLM's JSON answer. Interface:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: Language instruction for the LLM. Defaults to
            "Respond in English."
    Returns:
        List of extracted EntityCandidate. Empty when the LLM does not return
        a usable answer or no entities are found.
    Raises:
        ValueError: If entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")
    valid_type_refs = {entry.get("type_ref", "") for entry in entity_schema}
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }
    system_prompt = _build_system_prompt(entity_schema, language_instruction)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []
    # A top-level JSON array, string or null has no .get(); handle it exactly
    # like a payload whose 'entities' value is not a list.
    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []
    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue
        name = item.get("name", "")
        # Downstream code calls str methods on the name, so require real text.
        if not isinstance(name, str) or not name.strip():
            continue
        type_ref = item.get("type_ref", "")
        # The isinstance guard also protects the set lookup from unhashable values.
        if not isinstance(type_ref, str) or type_ref not in valid_type_refs:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue
        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop the attributes the LLM left as null.
        attributes = {k: v for k, v in attributes.items() if v is not None}
        confidence = item.get("confidence", 0.0)
        if not isinstance(confidence, (int, float)):
            confidence = 0.0
        # Clamp into [0.0, 1.0].
        confidence = float(max(0.0, min(1.0, confidence)))
        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )
    return candidates
@@ -0,0 +1,141 @@
 """extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
 import logging
 import sys
 import os
 from typing import Callable
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
 from entity_candidate import EntityCandidate
 from relation_candidate import RelationCandidate
 logger = logging.getLogger(__name__)
 def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
 ) -> list[RelationCandidate]:
    """Extract relations between entities of a text chunk using an LLM.
    Given the original text and the entities already extracted, asks the LLM
    to identify relations between pairs of entities. Relations whose from_name
    or to_name do not match an existing entity name exactly are dropped.
    Relation types that are not allowed are replaced by "related_to".
    A malformed answer never raises: a failing call, a payload that is not a
    dict or a 'relations' value that is not a list logs a warning and yields
    an empty list. Descriptions are always returned as strings (null becomes
    '', other non-string values are converted with str()).
    Args:
        text: text chunk (the same one used to extract the entities).
        entities: entities already extracted from the chunk.
        relation_types: allowed relation types, e.g. ["funds", "employs",
            "communicates_with", "owns", "related_to"].
        llm_chat_json: injected function that receives a list of messages
            (dicts with "role" and "content") and returns a dict with the
            LLM's JSON answer.
        language_instruction: language instruction for the LLM.
    Returns:
        List of validated RelationCandidate. Empty when there are fewer than 2
        entities or the LLM finds no relations.
    """
    if len(entities) < 2:
        return []
    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)
    # Entity list shown to the LLM in the prompt.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )
    # Allowed relation types shown to the LLM in the prompt.
    relation_types_str = ", ".join(relation_types)
    system_prompt = f"""\
 You are a relation extraction expert. Given text and a list of entities already \
 extracted, identify relationships between them.
 Entities found in this text:
 {entity_lines}
 Allowed relation types: {relation_types_str}
 Output JSON: {{"relations": [
  {{"from_name": "Entity A", "to_name": "Entity B",
   "relation_type": "employs", "description": "...", "confidence": 0.8}}
 ]}}
 Rules:
 - Only extract relations explicitly stated or strongly implied in the text
 - from_name and to_name must match entity names exactly as listed above
 - relation_type must be one of the allowed types
 - Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
 - Do not invent entities not in the list above
 - {language_instruction}"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []
    # A top-level JSON array, string or null has no .get(); handle it exactly
    # like a payload whose 'relations' value is not a list.
    raw_relations = response.get("relations", []) if isinstance(response, dict) else None
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []
    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue
        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")
        # Both names must belong to known entities. The isinstance guards keep
        # unhashable values (lists, dicts) away from the set lookups.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue
        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"
        confidence = item.get("confidence", 0.0)
        if not isinstance(confidence, (int, float)):
            confidence = 0.0
        # Clamp into [0.0, 1.0].
        confidence = float(max(0.0, min(1.0, confidence)))
        # Descriptions are later joined with '; ', so they must be strings.
        description = item.get("description", "")
        if description is None:
            description = ""
        elif not isinstance(description, str):
            description = str(description)
        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=confidence,
            )
        )
    return results
@@ -0,0 +1,92 @@
 """Extract plain text from PDF, Markdown, or TXT files."""
 SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
 def _detect_encoding(data: bytes) -> str:
    """Guess the text encoding of ``data``, trying detectors in a fixed order.
    Order: strict UTF-8 decoding, then charset_normalizer, then chardet. The
    two libraries are optional; a missing one is simply skipped. When nothing
    produces an answer the function returns "utf-8".
    Args:
        data: raw bytes read from the file.
    Returns:
        Name of the detected encoding.
    """
    # 1) Bytes that decode cleanly as UTF-8 need no further guessing.
    try:
        data.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        return "utf-8"
    # 2) charset_normalizer, when installed.
    try:
        from charset_normalizer import from_bytes
        best_guess = from_bytes(data).best()
        guessed_name = None if best_guess is None else best_guess.encoding
        if guessed_name:
            return guessed_name
    except ImportError:
        pass
    # 3) chardet, when installed.
    try:
        import chardet
        report = chardet.detect(data)
        reported_name = report.get("encoding") if report else None
        if reported_name:
            return reported_name
    except ImportError:
        pass
    # 4) Nothing worked: fall back to UTF-8.
    return "utf-8"
 def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.
    For PDF files uses PyMuPDF (fitz) to extract text from each page,
    joining them with double newlines; the document is always closed again,
    even when extraction fails. For text-based files (.md, .markdown, .txt)
    reads the raw bytes and decodes them with automatic encoding detection,
    falling back to UTF-8 with replacement characters.
    Args:
        file_path: Absolute or relative path to the file.
    Returns:
        str: Extracted plain text content.
    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    # The extension check is case-insensitive.
    _, ext = os.path.splitext(file_path.lower())
    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e
        doc = fitz.open(file_path)
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            # Release the underlying file handle; it used to leak on every call.
            doc.close()
    elif ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()
        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or an encoding name Python does not know.
            return raw.decode("utf-8", errors="replace")
    else:
        raise ValueError(
            f"Unsupported file extension: '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )
@@ -0,0 +1,208 @@
 """Pipeline de extraccion de entidades y relaciones desde un documento."""
 from __future__ import annotations
 import sys
 import os
 import time
 import warnings
 from typing import Callable
 # Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
 from extract_text_from_file import extract_text_from_file
 from core_functions import preprocess_text
 from split_text_into_chunks import split_text_into_chunks
 from build_entity_schema_prompt import build_entity_schema_prompt
 from build_relation_schema_prompt import build_relation_schema_prompt
 from extract_entities_llm import extract_entities_llm
 from extract_relations_llm import extract_relations_llm
 from deduplicate_entities import deduplicate_entities
 from deduplicate_relations import deduplicate_relations
 from entity_candidate import EntityCandidate
 from extraction_result import ExtractionResult
 from extraction_stats import ExtractionStats
 def extraction_pipeline(
    file_path: str,
    entity_presets: list[dict],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    confidence_threshold: float = 0.5,
    dedup_threshold: float = 0.85,
    on_progress: Callable[[str, float], None] | None = None,
 ) -> ExtractionResult:
    """Full pipeline that extracts entities and relations from a document.
    Orchestrates extract_text_from_file -> preprocess_text -> split_text_into_chunks
    -> extract_entities_llm per chunk -> deduplicate_entities ->
    extract_relations_llm per chunk -> deduplicate_relations.
    Failures of the text extraction and of the per-chunk LLM steps are turned
    into warnings and treated as empty results; they do not abort the run.
    Args:
        file_path: path to the file to process (PDF, Markdown, TXT).
        entity_presets: list of dicts with type_ref, label and metadata_fields.
            Example: [{"type_ref": "osint_person_go_cybersecurity",
                        "label": "Person",
                        "metadata_fields": ["full_name", "nationality"]}]
        relation_types: relation types allowed for extraction.
            Example: ["funds", "employs", "communicates_with", "owns"]
        llm_chat_json: injected function that receives OpenAI-style messages and
            returns a dict with the already parsed JSON answer. No coupling to
            any provider.
        chunk_size: number of characters per chunk (default 500).
        chunk_overlap: overlap between consecutive chunks (default 50).
        confidence_threshold: minimum confidence for candidate entities to be
            kept before deduplication (default 0.5).
        dedup_threshold: minimum similarity score to merge entities (default 0.85).
        on_progress: optional progress callback (message: str, pct: float 0-1).
            0-40%: entity extraction, 40-80%: relation extraction,
            80-100%: deduplication. Exceptions raised by the callback are ignored.
    Returns:
        ExtractionResult with the deduplicated entities and relations plus the
        statistics of the run.
    Raises:
        FileNotFoundError: if file_path does not exist.
        ValueError: if entity_presets is empty.
    """
    if not entity_presets:
        raise ValueError("entity_presets no puede estar vacio")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
    # Progress reporting wrapper: a failing callback never interrupts the run.
    def _progress(msg: str, pct: float) -> None:
        if on_progress is not None:
            try:
                on_progress(msg, pct)
            except Exception:
                pass
    start_time = time.monotonic()
    stats = ExtractionStats()
    # ── Step 1: Extract text ───────────────────────────────────────────────────
    _progress("Extracting text from file...", 0.0)
    try:
        raw_text = extract_text_from_file(file_path)
    except Exception as exc:
        # NOTE(review): any extraction error (unsupported extension, missing
        # PyMuPDF, ...) becomes a warning and an empty text.
        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
        raw_text = ""
    # ── Step 2: Preprocess ─────────────────────────────────────────────────────
    clean_text = preprocess_text(raw_text)
    stats.total_chars = len(clean_text)
    # ── Step 3: Split into chunks ──────────────────────────────────────────────
    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
    n = len(chunks)
    stats.total_chunks = n
    # Nothing to analyse: return an empty result with the stats gathered so far.
    if n == 0:
        stats.processing_time_seconds = time.monotonic() - start_time
        return ExtractionResult(entities=[], relations=[], stats=stats)
    # ── Step 4: Extract entities per chunk ─────────────────────────────────────
    all_raw_entities: list[EntityCandidate] = []
    for i, chunk in enumerate(chunks):
        _progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
        try:
            candidates = extract_entities_llm(
                text=chunk,
                entity_schema=entity_presets,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
            )
            candidates = []
        for candidate in candidates:
            # Record the chunk the candidate came from
            if i not in candidate.source_chunk_indices:
                candidate.source_chunk_indices.append(i)
            all_raw_entities.append(candidate)
    # ── Step 5: Filter by confidence ───────────────────────────────────────────
    filtered_entities = [
        e for e in all_raw_entities if e.confidence >= confidence_threshold
    ]
    # Note: the "raw" count is taken AFTER the confidence filter.
    stats.raw_entities_count = len(filtered_entities)
    # Update the per-type counters
    for ent in filtered_entities:
        stats.entity_types_found[ent.type_ref] = (
            stats.entity_types_found.get(ent.type_ref, 0) + 1
        )
    # ── Step 6: Deduplicate entities ───────────────────────────────────────────
    _progress("Deduplicating entities...", 0.4)
    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
    stats.final_entities_count = dedup_result.total_after
    stats.entities_merged = dedup_result.total_before - dedup_result.total_after
    final_entities = dedup_result.entities
    entity_id_map = dedup_result.name_to_id  # original_name -> entity_id
    # ── Step 7: Extract relations per chunk ────────────────────────────────────
    all_raw_relations = []
    for i, chunk in enumerate(chunks):
        _progress(f"Extracting relations...", 0.4 + (i / n) * 0.4)
        # Entities that were found in this chunk
        chunk_entities = [
            e for e in final_entities if i in e.source_chunk_indices
        ]
        # If this specific chunk has no entities, use all of them
        if not chunk_entities:
            chunk_entities = final_entities
        # A relation needs two endpoints; skip chunks with fewer entities.
        if len(chunk_entities) < 2:
            continue
        try:
            chunk_relations = extract_relations_llm(
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
            )
        except Exception as exc:
            warnings.warn(
                f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
            )
            chunk_relations = []
        # Record the chunk each relation came from
        for rel in chunk_relations:
            rel.source_chunk_index = i
        all_raw_relations.extend(chunk_relations)
    stats.raw_relations_count = len(all_raw_relations)
    # Update the per-relation-type counters
    for rel in all_raw_relations:
        stats.relation_types_found[rel.relation_type] = (
            stats.relation_types_found.get(rel.relation_type, 0) + 1
        )
    # ── Step 8: Deduplicate relations ──────────────────────────────────────────
    _progress("Deduplicating relations...", 0.8)
    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
    stats.final_relations_count = len(final_relations)
    # NOTE(review): this difference also includes relations that
    # deduplicate_relations discarded (unresolved names, self-loops), and
    # stats.relations_discarded is never assigned in this function.
    stats.relations_merged = stats.raw_relations_count - len(final_relations)
    stats.processing_time_seconds = time.monotonic() - start_time
    _progress("Done", 1.0)
    return ExtractionResult(
        entities=final_entities,
        relations=final_relations,
        stats=stats,
    )
@@ -0,0 +1,20 @@
 """ExtractionResult — resultado final del pipeline de extraccion."""
 from dataclasses import dataclass, field
 from entity_candidate import EntityCandidate
 from extraction_stats import ExtractionStats
 from relation_candidate import RelationCandidate
@dataclass
 class ExtractionResult:
    """Final result of the entity and relation extraction pipeline.
    Holds the deduplicated lists of entities and relations together with
    the statistics of the whole process.
    """
    # Deduplicated entities.
    entities: list[EntityCandidate]
    # Deduplicated relations.
    relations: list[RelationCandidate]
    # Counters and timing of the run; a fresh ExtractionStats by default.
    stats: ExtractionStats = field(default_factory=ExtractionStats)
@@ -0,0 +1,25 @@
 """ExtractionStats — estadisticas del proceso de extraccion."""
 from dataclasses import dataclass, field
@dataclass
 class ExtractionStats:
    """Statistics of the extraction process.
    Useful for reporting and debugging. Records counts before and after
    deduplication, processing time and the distribution of the types found.
    """
    # Number of chunks the text was split into, and length of the cleaned text.
    total_chunks: int = 0
    total_chars: int = 0
    # Entity counts before and after deduplication, and how many were merged away.
    raw_entities_count: int = 0
    final_entities_count: int = 0
    entities_merged: int = 0
    # Relation counts before and after deduplication, and the difference.
    raw_relations_count: int = 0
    final_relations_count: int = 0
    relations_merged: int = 0
    # NOTE(review): presumably the number of relations dropped during
    # deduplication — confirm that some caller actually fills it in.
    relations_discarded: int = 0
    # Occurrences per entity type_ref / per relation type.
    entity_types_found: dict[str, int] = field(default_factory=dict)
    relation_types_found: dict[str, int] = field(default_factory=dict)
    # Wall-clock duration of the run, in seconds.
    processing_time_seconds: float = 0.0
@@ -0,0 +1,78 @@
 """Combina atributos de multiples candidatos de la misma entidad."""
 from __future__ import annotations
 # Field-name sets that select the conflict-resolution rule for an attribute.
 _NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
 _DATE_MIN_FIELDS = {"first_seen", "created_date"}
 _DATE_MAX_FIELDS = {"last_seen", "expires_date"}
 _BOOL_FIELDS = {"verified", "exploited"}
 def _union_preserving_order(values: list) -> list:
    """Return the de-duplicated union of *values* in order of first appearance.

    List values contribute their items; any other value counts as one item.
    """
    union: list = []
    for value in values:
        items = value if isinstance(value, list) else [value]
        for item in items:
            # List membership (not a set) because items may be unhashable.
            if item not in union:
                union.append(item)
    return union
 def merge_entity_attributes(attr_list: list[dict]) -> dict:
    """Merge the attributes of several candidates of the same entity.

    For every key present in any candidate, the non-None values are collected
    and a conflict between differing values is resolved by field kind:

    - Numeric (risk_score, balance, cvss): max
    - Earliest date (first_seen, created_date): min
    - Latest date (last_seen, expires_date): max
    - Boolean (verified, exploited): logical OR
    - List (at least one value is a list): union without duplicates, in order
      of first appearance; non-list values count as single items
    - Anything else: the longest ``str()`` representation

    Keys appear in the result in the order they are first seen across
    ``attr_list``, so the output is stable between runs.

    Args:
        attr_list: One attribute dict per candidate.

    Returns:
        Dict with the merged attributes. A key whose values are all None maps
        to None.

    Raises:
        TypeError: If the values of a numeric or date field cannot be compared
            with each other (for example a str mixed with a float).
    """
    if not attr_list:
        return {}
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set here would make the key order of the result vary between runs.
    all_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)
    merged: dict = {}
    for key in all_keys:
        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]
        if not values:
            merged[key] = None
            continue
        first = values[0]
        # A single value or unanimous values need no conflict resolution.
        if len(values) == 1 or all(v == first for v in values):
            merged[key] = first
            continue
        if key in _NUMERIC_FIELDS or key in _DATE_MAX_FIELDS:
            merged[key] = max(values)
        elif key in _DATE_MIN_FIELDS:
            merged[key] = min(values)
        elif key in _BOOL_FIELDS:
            merged[key] = any(values)
        elif any(isinstance(v, list) for v in values):
            # Checking every value (not only the first) keeps the result
            # independent of candidate order.
            merged[key] = _union_preserving_order(values)
        else:
            # Strings and anything else: keep the longest representation.
            merged[key] = max((str(v) for v in values), key=len)
    return merged
@@ -0,0 +1,81 @@
 """Normaliza el nombre de una entidad para comparacion y deduplicacion."""
 import re
 # Honorific at the very start of a person name, with optional trailing dot.
 _TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
 )
 # Legal-form or generic corporate word at the very end of an organization name.
 _LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
 )
 _MULTI_SPACE = re.compile(r"\s+")
 def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    The rule applied depends on the entity type:

    - ip / email: lower-case
    - domain: lower-case, drop a trailing dot and a leading ``www.``
    - crypto_wallet: strip only (addresses are case-sensitive)
    - phone: keep only digits and ``+``
    - person: drop a leading title, turn "Last, First" into "First Last",
      collapse whitespace, title-case
    - organization: drop one trailing legal suffix and any separator left
      dangling before it ("Acme, Inc." -> "Acme"), collapse whitespace,
      title-case; a name that consists only of a suffix word is kept as is
      instead of becoming empty
    - anything else: lower-case, strip, collapse whitespace

    Args:
        name: Entity name to normalize.
        entity_type: Entity type (ip, email, domain, crypto_wallet, phone,
            person, organization); matched case-insensitively. Empty or
            unknown selects the default rule.

    Returns:
        The normalized name.
    """
    name = name.strip()
    et = entity_type.lower().strip()
    if et in ("ip", "email"):
        return name.lower()
    if et == "domain":
        result = name.lower().rstrip(".")
        if result.startswith("www."):
            result = result[4:]
        return result
    if et == "crypto_wallet":
        # Bitcoin addresses are case-sensitive, so only the strip applies.
        return name
    if et == "phone":
        # Keep only digits and the plus sign.
        return re.sub(r"[^\d+]", "", name)
    if et == "person":
        result = _TITLES.sub("", name).strip()
        # "Last, First" format: swap around the first comma.
        if "," in result:
            last, first = (part.strip() for part in result.split(",", 1))
            result = f"{first} {last}"
        return _MULTI_SPACE.sub(" ", result).strip().title()
    if et == "organization":
        # Removing "Inc." from "Acme, Inc." leaves "Acme, "; drop the dangling
        # separator so it normalizes the same as "Acme Inc".
        result = _LEGAL_SUFFIXES.sub("", name).rstrip(" \t,;")
        if not result:
            # The whole name was a suffix word (e.g. "Holdings"); an empty
            # result would make unrelated organizations compare equal.
            result = name
        # Title case for consistency.
        return _MULTI_SPACE.sub(" ", result).strip().title()
    # Default: lower-case, strip, collapse whitespace.
    return _MULTI_SPACE.sub(" ", name.lower()).strip()
@@ -0,0 +1,35 @@
 """RelationCandidate — candidato de relacion extraido por el LLM."""
 from dataclasses import dataclass
@dataclass
 class RelationCandidate:
    """Candidato de relacion entre dos entidades extraido por el LLM.
    `from_name` y `to_name` contienen los nombres crudos del texto. `from_id`
    y `to_id` se llenan durante la fase de deduplicacion cuando se resuelven
    contra los EntityCandidate finales.
    """
    from_name: str
    to_name: str
    from_id: str = ""
    to_id: str = ""
    relation_type: str = ""
    description: str = ""
    confidence: float = 0.0
    source_chunk_index: int = -1
    def to_dict(self) -> dict:
        """Serializa el candidato a un diccionario."""
        return {
            "from_name": self.from_name,
            "to_name": self.to_name,
            "from_id": self.from_id,
            "to_id": self.to_id,
            "relation_type": self.relation_type,
            "description": self.description,
            "confidence": self.confidence,
            "source_chunk_index": self.source_chunk_index,
        }
@@ -0,0 +1,234 @@
 """Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
 import json
 import os
 _HTML_TEMPLATE = """\
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>{title}</title>
    <script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
    <style>
        * {{ box-sizing: border-box; margin: 0; padding: 0; }}
        body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
        #container {{ width: 100vw; height: 100vh; }}
        #panel {{
            position: absolute; top: 12px; right: 12px;
            background: rgba(10, 10, 30, 0.88);
            border: 1px solid rgba(255,255,255,0.12);
            padding: 16px; border-radius: 10px;
            z-index: 10; min-width: 200px; max-width: 260px;
            backdrop-filter: blur(6px);
        }}
        #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
        #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
        #filters {{ display: flex; flex-direction: column; gap: 6px; }}
        .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
        .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
        .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
        #tooltip {{
            position: absolute; display: none;
            background: rgba(5, 5, 20, 0.95);
            border: 1px solid rgba(255,255,255,0.15);
            padding: 10px 14px; border-radius: 8px;
            pointer-events: none; z-index: 20;
            max-width: 300px; font-size: 12px; line-height: 1.6;
        }}
        #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
        #tooltip .tt-row {{ display: flex; gap: 6px; }}
        #tooltip .tt-key {{ color: #888; min-width: 80px; }}
        #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
    </style>
 </head>
 <body>
    <div id="container"></div>
    <div id="panel">
        <h3>{title}</h3>
        <div id="stats"></div>
        <div id="filters"></div>
    </div>
    <div id="tooltip"></div>
    <script>
    (function () {{
        const graphData = {json_data};
        // ── Build graphology graph ──────────────────────────────────────────────
        const Graph = graphology.Graph || graphology;
        const g = new Graph({{ multi: true, type: 'directed' }});
        // Assign random initial positions
        graphData.nodes.forEach(function (n) {{
            g.addNode(n.key, Object.assign({{
                x: (Math.random() - 0.5) * 10,
                y: (Math.random() - 0.5) * 10,
            }}, n.attributes));
        }});
        graphData.edges.forEach(function (e) {{
            try {{
                g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
            }} catch (err) {{
                // skip duplicate edge keys gracefully
            }}
        }});
        // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
        const FA2 = graphologyLibrary.layoutForceAtlas2;
        FA2.assign(g, {{
            iterations: 500,
            settings: {{
                gravity: 1,
                scalingRatio: 2,
                slowDown: 5,
                barnesHutOptimize: g.order > 300,
            }},
        }});
        // ── Sigma renderer ──────────────────────────────────────────────────────
        const renderer = new Sigma(g, document.getElementById('container'), {{
            renderEdgeLabels: false,
            defaultEdgeColor: '#444',
            defaultNodeColor: '#95a5a6',
            labelColor: {{ color: '#ccc' }},
            labelSize: 11,
            edgeReducer: function (edge, data) {{
                return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
            }},
        }});
        // ── Stats panel ─────────────────────────────────────────────────────────
        document.getElementById('stats').textContent =
            graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';
        // ── Filter panel by node type ───────────────────────────────────────────
        const typeColors = {{}};
        graphData.nodes.forEach(function (n) {{
            const t = n.attributes.entity_type || 'unknown';
            typeColors[t] = n.attributes.color || '#95a5a6';
        }});
        const hiddenTypes = new Set();
        const filtersDiv = document.getElementById('filters');
        Object.keys(typeColors).sort().forEach(function (type) {{
            const color = typeColors[type];
            const label = document.createElement('label');
            label.className = 'filter-item';
            const cb = document.createElement('input');
            cb.type = 'checkbox';
            cb.checked = true;
            cb.addEventListener('change', function () {{
                if (cb.checked) hiddenTypes.delete(type);
                else hiddenTypes.add(type);
                renderer.refresh();
            }});
            const dot = document.createElement('span');
            dot.className = 'color-dot';
            dot.style.background = color;
            label.appendChild(cb);
            label.appendChild(dot);
            label.appendChild(document.createTextNode(type));
            filtersDiv.appendChild(label);
        }});
        // Node reducer applies type filter
        renderer.setSetting('nodeReducer', function (node, data) {{
            if (hiddenTypes.has(data.entity_type)) return Object.assign({{}}, data, {{ hidden: true }});
            return data;
        }});
        // ── Tooltip on hover ────────────────────────────────────────────────────
        const tooltip = document.getElementById('tooltip');
        renderer.on('enterNode', function (ref) {{
            const nodeAttrs = g.getNodeAttributes(ref.node);
            const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);
            let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
            html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
            html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
            html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';
            Object.keys(nodeAttrs).sort().forEach(function (k) {{
                if (!reserved.has(k) && !['status', 'domain', 'type', 'label'].includes(k)) {{
                    html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
                }}
            }});
            tooltip.innerHTML = html;
            tooltip.style.display = 'block';
        }});
        renderer.on('leaveNode', function () {{
            tooltip.style.display = 'none';
        }});
        document.getElementById('container').addEventListener('mousemove', function (e) {{
            tooltip.style.left = (e.clientX + 16) + 'px';
            tooltip.style.top = (e.clientY + 16) + 'px';
        }});
        function escHtml(str) {{
            return String(str)
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;')
                .replace(/"/g, '&quot;');
        }}
    }})();
    </script>
 </body>
 </html>
 """
 def render_sigma_html(
    graph_data: dict,
    output_path: str,
    title: str = "OSINT Graph",
 ) -> str:
    """Write a standalone sigma.js HTML page that visualizes the graph.

    The graph dict is serialized to JSON and embedded in the page template,
    which lays the graph out with ForceAtlas2 (500 synchronous iterations) and
    renders it with sigma.js v2.4 in a dark theme, with a per-node-type filter
    panel and a hover tooltip showing node metadata.

    Both inputs are made safe for their HTML context before interpolation:
    the title is HTML-escaped, and the JSON has ``<``, ``>``, ``&`` and the
    U+2028/U+2029 line separators written as ``\\uXXXX`` escapes so that a
    string containing ``</script>`` cannot terminate the script element. The
    escapes decode to the same characters when the browser parses the JSON.

    Args:
        graph_data: Dict with 'nodes' and 'edges' keys in graphology/sigma
            format.
        output_path: Path of the HTML file to write; missing parent
            directories are created.
        title: Graph title shown in the side panel and the browser tab.

    Returns:
        Absolute path of the written HTML file.

    Raises:
        Exception: If the directory or the file cannot be written.
    """
    # Local import: the module only imports json and os at file level.
    from html import escape

    json_data = (
        json.dumps(graph_data, ensure_ascii=False)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )
    page = _HTML_TEMPLATE.format(
        title=escape(title),
        json_data=json_data,
    )
    abs_path = os.path.abspath(output_path)
    try:
        # Inside the try so a directory-creation failure is wrapped as well.
        os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(page)
    except OSError as exc:
        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc
    return abs_path
@@ -0,0 +1,66 @@
 """Split text into overlapping chunks with sentence-boundary awareness."""
 def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
 ) -> list[str]:
    """Split text into overlapping chunks, cutting at sentence boundaries.

    Each chunk spans at most ``chunk_size`` characters. When a sentence
    separator ends inside the window, past the first 30% of ``chunk_size``,
    the chunk is cut right after the latest such separator; otherwise it is
    cut at ``chunk_size``. The next chunk starts ``overlap`` characters before
    the cut. If that would not move the start forward (``overlap`` is at least
    as long as the chunk just produced), the next chunk starts at the cut with
    no overlap, so the loop always terminates and no text is skipped.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk size in characters; must be positive.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        List of whitespace-stripped, non-empty chunks. Empty if the text is
        empty or only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive and ``text`` is
            non-empty.
    """
    if not text:
        return []
    if chunk_size <= 0:
        # A non-positive window can never advance through the text.
        raise ValueError("chunk_size must be a positive integer")
    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []
    # Sentence separators; the latest cut inside the window wins, regardless
    # of the position of the separator in this tuple.
    separators = ("。", "！", "？", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? ")
    chunks: list[str] = []
    start = 0
    text_len = len(text)
    min_offset = int(chunk_size * 0.30)
    while start < text_len:
        end = start + chunk_size
        if end < text_len:
            # Only accept a sentence cut past the first 30% of the window.
            min_pos = start + min_offset
            best_end = None
            for sep in separators:
                # Bounded rfind: last occurrence fully inside text[start:end].
                pos = text.rfind(sep, start, end)
                if pos == -1:
                    continue
                cut = pos + len(sep)
                if cut > min_pos and (best_end is None or cut > best_end):
                    best_end = cut
            if best_end is not None:
                end = best_end
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= text_len:
            # The text is exhausted; stepping back by ``overlap`` would only
            # re-emit a tail already contained in this chunk.
            break
        next_start = end - overlap
        if next_start <= start:
            # Overlap would stall or move backwards: continue at the cut.
            next_start = end
        start = next_start
    return chunks
@@ -0,0 +1,6 @@
 def main():
    """Print the project greeting to standard output."""
    greeting = "Hello from ontology-graph!"
    print(greeting)
 if __name__ == "__main__":
    # Run the greeting only when executed as a script, not on import.
    main()
@@ -0,0 +1,935 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ontology Graph Extraction\n",
    "\n",
    "Extrae entidades y relaciones de cualquier documento usando funciones del registry.\n",
    "- LLM: `claude -p --model haiku`\n",
    "- Tipos: OSINT del registry + genéricos (concept, url, date_reference, quantity, coordinates, text_fragment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m      3\u001b[39m ROOT = \u001b[33m'/home/lucas/fn_registry'\u001b[39m\n\u001b[32m      4\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      5\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      6\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m      8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m      9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     10\u001b[39m \n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "ROOT = '/home/lucas/fn_registry'\n",
    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
    "sys.path.insert(0, ROOT)\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('Registry root:', ROOT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'FN_REGISTRY_ROOT'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mKeyError\u001b[39m                                  Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m sys, os, json, subprocess\n\u001b[32m      2\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m ROOT = os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m]\n\u001b[32m      4\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      5\u001b[39m \n\u001b[32m      6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n",
      "\u001b[36mFile \u001b[39m\u001b[32m<frozen os>:717\u001b[39m, in \u001b[36m_Environ.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n",
      "\u001b[31mKeyError\u001b[39m: 'FN_REGISTRY_ROOT'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "ROOT = os.environ['FN_REGISTRY_ROOT']\n",
    "sys.path.insert(0, ROOT)\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('Registry root:', ROOT)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LLM wrapper: claude -p + haiku"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
    "    \"\"\"Wrapper que convierte messages OpenAI-style a claude -p --model haiku.\"\"\"\n",
    "    # Construir prompt desde messages\n",
    "    parts = []\n",
    "    for msg in messages:\n",
    "        role = msg['role']\n",
    "        content = msg['content']\n",
    "        if role == 'system':\n",
    "            parts.append(f\"[SYSTEM]\\n{content}\")\n",
    "        elif role == 'user':\n",
    "            parts.append(f\"[USER]\\n{content}\")\n",
    "    prompt = \"\\n\\n\".join(parts)\n",
    "    \n",
    "    result = subprocess.run(\n",
    "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
    "        capture_output=True, text=True, timeout=120\n",
    "    )\n",
    "    \n",
    "    if result.returncode != 0:\n",
    "        raise RuntimeError(f\"claude -p failed: {result.stderr}\")\n",
    "    \n",
    "    # Extraer el campo 'result' del JSON envelope de claude\n",
    "    envelope = json.loads(result.stdout)\n",
    "    raw_text = envelope.get('result', '')\n",
    "    \n",
    "    # Parsear JSON del LLM (maneja codeblocks, trailing commas, etc.)\n",
    "    return extract_json_from_llm(raw_text)\n",
    "\n",
    "# Test rapido\n",
    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
    "print('LLM wrapper OK:', test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Entity presets: OSINT + genéricos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Presets OSINT (del registry) ---\n",
    "OSINT_PRESETS = [\n",
    "    {\"type_ref\": \"osint_person_go_cybersecurity\", \"label\": \"Person\",\n",
    "     \"metadata_fields\": [\"full_name\", \"alias\", \"nationality\", \"dob\", \"gender\", \"risk_score\"]},\n",
    "    {\"type_ref\": \"osint_organization_go_cybersecurity\", \"label\": \"Organization\",\n",
    "     \"metadata_fields\": [\"legal_name\", \"country\", \"sector\", \"founded\", \"risk_score\"]},\n",
    "    {\"type_ref\": \"osint_location_go_cybersecurity\", \"label\": \"Location\",\n",
    "     \"metadata_fields\": [\"lat\", \"lon\", \"address\", \"country\", \"city\"]},\n",
    "    {\"type_ref\": \"osint_event_go_cybersecurity\", \"label\": \"Event\",\n",
    "     \"metadata_fields\": [\"event_type\", \"date\", \"location\", \"description\", \"severity\"]},\n",
    "    {\"type_ref\": \"osint_email_go_cybersecurity\", \"label\": \"Email\",\n",
    "     \"metadata_fields\": [\"address\", \"provider\", \"verified\", \"breached\"]},\n",
    "    {\"type_ref\": \"osint_domain_go_cybersecurity\", \"label\": \"Domain\",\n",
    "     \"metadata_fields\": [\"fqdn\", \"registrar\", \"created_date\", \"expires_date\"]},\n",
    "    {\"type_ref\": \"osint_ip_address_go_cybersecurity\", \"label\": \"IP Address\",\n",
    "     \"metadata_fields\": [\"ip\", \"asn\", \"country\", \"isp\", \"geolocation\"]},\n",
    "    {\"type_ref\": \"osint_phone_go_cybersecurity\", \"label\": \"Phone\",\n",
    "     \"metadata_fields\": [\"number\", \"country_code\", \"carrier\", \"phone_type\"]},\n",
    "    {\"type_ref\": \"osint_social_media_go_cybersecurity\", \"label\": \"Social Media Account\",\n",
    "     \"metadata_fields\": [\"platform\", \"username\", \"url\", \"followers\", \"verified\"]},\n",
    "    {\"type_ref\": \"osint_document_go_cybersecurity\", \"label\": \"Document\",\n",
    "     \"metadata_fields\": [\"title\", \"format\", \"classification\", \"source\"]},\n",
    "    {\"type_ref\": \"osint_crypto_wallet_go_cybersecurity\", \"label\": \"Crypto Wallet\",\n",
    "     \"metadata_fields\": [\"address\", \"blockchain\", \"balance\"]},\n",
    "    {\"type_ref\": \"osint_malware_go_cybersecurity\", \"label\": \"Malware\",\n",
    "     \"metadata_fields\": [\"family\", \"hash_sha256\", \"threat_level\"]},\n",
    "    {\"type_ref\": \"osint_vulnerability_go_cybersecurity\", \"label\": \"Vulnerability\",\n",
    "     \"metadata_fields\": [\"cve_id\", \"cvss\", \"affected_product\", \"exploited\"]},\n",
    "]\n",
    "\n",
    "# --- Presets genéricos (sin tipo Go, inline) ---\n",
    "GENERIC_PRESETS = [\n",
    "    {\"type_ref\": \"concept\", \"label\": \"Concept\",\n",
    "     \"metadata_fields\": [\"name\", \"category\", \"definition\"]},\n",
    "    {\"type_ref\": \"url\", \"label\": \"URL/Link\",\n",
    "     \"metadata_fields\": [\"url\", \"domain\", \"context\"]},\n",
    "    {\"type_ref\": \"date_reference\", \"label\": \"Date/Time\",\n",
    "     \"metadata_fields\": [\"date\", \"precision\", \"context\"]},\n",
    "    {\"type_ref\": \"quantity\", \"label\": \"Quantity/Amount\",\n",
    "     \"metadata_fields\": [\"value\", \"unit\", \"context\"]},\n",
    "    {\"type_ref\": \"coordinates\", \"label\": \"Coordinates\",\n",
    "     \"metadata_fields\": [\"lat\", \"lon\", \"label\"]},\n",
    "    {\"type_ref\": \"text_fragment\", \"label\": \"Key Text Fragment\",\n",
    "     \"metadata_fields\": [\"text\", \"category\", \"relevance\"]},\n",
    "]\n",
    "\n",
    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
    "print(f'{len(ALL_PRESETS)} entity presets loaded ({len(OSINT_PRESETS)} OSINT + {len(GENERIC_PRESETS)} generic)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Relation types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RELATION_TYPES = [\n",
    "    # Personas / orgs\n",
    "    \"employs\", \"works_for\", \"founded\", \"owns\", \"controls\",\n",
    "    \"member_of\", \"affiliated_with\", \"collaborates_with\",\n",
    "    # Comunicacion\n",
    "    \"communicates_with\", \"sent_to\", \"received_from\",\n",
    "    # Ubicacion\n",
    "    \"located_in\", \"headquartered_in\", \"traveled_to\", \"operates_in\",\n",
    "    # Eventos\n",
    "    \"participated_in\", \"caused\", \"occurred_at\", \"occurred_on\",\n",
    "    # Documentos / conceptos\n",
    "    \"mentions\", \"references\", \"describes\", \"authored\", \"published\",\n",
    "    # Financiero\n",
    "    \"funds\", \"transacted_with\", \"invested_in\",\n",
    "    # Tecnico\n",
    "    \"hosts\", \"resolves_to\", \"exploits\", \"targets\",\n",
    "    # Generico\n",
    "    \"related_to\", \"part_of\", \"instance_of\", \"has_attribute\",\n",
    "]\n",
    "\n",
    "print(f'{len(RELATION_TYPES)} relation types')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extraer documento\n",
    "\n",
    "Pon tu documento en `data/` y cambia el path."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DOC_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'document.pdf')  # <-- cambiar\n",
    "\n",
    "# Progreso visible\n",
    "def on_progress(msg, pct):\n",
    "    print(f'  [{pct*100:5.1f}%] {msg}')\n",
    "\n",
    "result = extraction_pipeline(\n",
    "    file_path=DOC_PATH,\n",
    "    entity_presets=ALL_PRESETS,\n",
    "    relation_types=RELATION_TYPES,\n",
    "    llm_chat_json=claude_haiku_json,\n",
    "    chunk_size=800,\n",
    "    chunk_overlap=100,\n",
    "    confidence_threshold=0.5,\n",
    "    dedup_threshold=0.85,\n",
    "    on_progress=on_progress,\n",
    ")\n",
    "\n",
    "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
    "print(f'Relations: {result.stats.final_relations_count}')\n",
    "print(f'Chunks: {result.stats.total_chunks}')\n",
    "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
    "print(f'Entity types: {result.stats.entity_types_found}')\n",
    "print(f'Relation types: {result.stats.relation_types_found}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explorar resultados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Entities\n",
    "ent_rows = []\n",
    "for e in result.entities:\n",
    "    ent_rows.append({\n",
    "        'id': e.id,\n",
    "        'name': e.name,\n",
    "        'type': e.type_ref,\n",
    "        'confidence': e.confidence,\n",
    "        'attributes': e.attributes,\n",
    "    })\n",
    "df_entities = pd.DataFrame(ent_rows)\n",
    "print(f'=== Entities ({len(df_entities)}) ===')\n",
    "df_entities.sort_values('type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relations\n",
    "rel_rows = []\n",
    "for r in result.relations:\n",
    "    rel_rows.append({\n",
    "        'from_name': r.from_name,\n",
    "        'relation': r.relation_type,\n",
    "        'to_name': r.to_name,\n",
    "        'confidence': r.confidence,\n",
    "        'description': r.description,\n",
    "    })\n",
    "df_relations = pd.DataFrame(rel_rows)\n",
    "print(f'=== Relations ({len(df_relations)}) ===')\n",
    "df_relations.sort_values('relation')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizar grafo con sigma.js"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Colores por tipo de entidad\n",
    "TYPE_COLORS = {\n",
    "    'osint_person_go_cybersecurity': '#e74c3c',\n",
    "    'osint_organization_go_cybersecurity': '#3498db',\n",
    "    'osint_location_go_cybersecurity': '#2ecc71',\n",
    "    'osint_event_go_cybersecurity': '#f39c12',\n",
    "    'osint_email_go_cybersecurity': '#9b59b6',\n",
    "    'osint_domain_go_cybersecurity': '#1abc9c',\n",
    "    'osint_ip_address_go_cybersecurity': '#e67e22',\n",
    "    'osint_phone_go_cybersecurity': '#95a5a6',\n",
    "    'osint_social_media_go_cybersecurity': '#e91e63',\n",
    "    'osint_document_go_cybersecurity': '#607d8b',\n",
    "    'osint_crypto_wallet_go_cybersecurity': '#ff9800',\n",
    "    'osint_malware_go_cybersecurity': '#f44336',\n",
    "    'osint_vulnerability_go_cybersecurity': '#ff5722',\n",
    "    'concept': '#00bcd4',\n",
    "    'url': '#8bc34a',\n",
    "    'date_reference': '#cddc39',\n",
    "    'quantity': '#ffc107',\n",
    "    'coordinates': '#4caf50',\n",
    "    'text_fragment': '#78909c',\n",
    "}\n",
    "DEFAULT_COLOR = '#aaaaaa'\n",
    "\n",
    "def extraction_to_sigma(result) -> dict:\n",
    "    \"\"\"Convierte ExtractionResult a formato sigma.js/graphology.\"\"\"\n",
    "    # Contar degree para tamaño de nodo\n",
    "    degree = {}\n",
    "    for r in result.relations:\n",
    "        from_id = r.from_id or r.from_name\n",
    "        to_id = r.to_id or r.to_name\n",
    "        degree[from_id] = degree.get(from_id, 0) + 1\n",
    "        degree[to_id] = degree.get(to_id, 0) + 1\n",
    "\n",
    "    nodes = []\n",
    "    for e in result.entities:\n",
    "        eid = e.id or e.name\n",
    "        nodes.append({\n",
    "            'key': eid,\n",
    "            'attributes': {\n",
    "                'label': e.name,\n",
    "                'color': TYPE_COLORS.get(e.type_ref, DEFAULT_COLOR),\n",
    "                'size': 4 + min(degree.get(eid, 0) * 2, 20),\n",
    "                'type': e.type_ref,\n",
    "                **{k: str(v) for k, v in (e.attributes or {}).items() if v is not None},\n",
    "            }\n",
    "        })\n",
    "\n",
    "    edges = []\n",
    "    node_keys = {n['key'] for n in nodes}\n",
    "    for i, r in enumerate(result.relations):\n",
    "        from_id = r.from_id or r.from_name\n",
    "        to_id = r.to_id or r.to_name\n",
    "        if from_id in node_keys and to_id in node_keys:\n",
    "            edges.append({\n",
    "                'key': f'e{i}',\n",
    "                'source': from_id,\n",
    "                'target': to_id,\n",
    "                'attributes': {\n",
    "                    'label': r.relation_type,\n",
    "                    'type': r.relation_type,\n",
    "                }\n",
    "            })\n",
    "\n",
    "    return {'nodes': nodes, 'edges': edges}\n",
    "\n",
    "graph_data = extraction_to_sigma(result)\n",
    "print(f'Graph: {len(graph_data[\"nodes\"])} nodes, {len(graph_data[\"edges\"])} edges')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')\n",
    "html_path = render_sigma_html(\n",
    "    graph_data=graph_data,\n",
    "    output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
    "    title='Ontology Graph',\n",
    ")\n",
    "print(f'Graph saved: {html_path}')\n",
    "print(f'Open in browser: file://{html_path}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Auto-discovery de nuevos tipos\n",
    "\n",
    "Si el documento contiene entidades que no encajan en los presets, el modelo Claude Haiku las detecta y sugiere nuevos presets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def discover_new_types(result, existing_presets: list[dict]) -> list[dict]:\n",
    "    \"\"\"Pide a haiku que sugiera tipos nuevos basandose en entidades de baja confianza o genericas.\"\"\"\n",
    "    # Recopilar entidades clasificadas como concept/text_fragment (genéricos fallback)\n",
    "    generic_entities = [\n",
    "        {'name': e.name, 'type': e.type_ref, 'attributes': e.attributes}\n",
    "        for e in result.entities\n",
    "        if e.type_ref in ('concept', 'text_fragment', 'related_to')\n",
    "    ]\n",
    "    \n",
    "    if not generic_entities:\n",
    "        print('No hay entidades genéricas — los presets cubren todo.')\n",
    "        return []\n",
    "\n",
    "    existing_labels = [p['label'] for p in existing_presets]\n",
    "    \n",
    "    prompt_msg = [\n",
    "        {'role': 'system', 'content': (\n",
    "            'You analyze entities extracted from a document and suggest new entity type presets. '\n",
    "            'Existing types: ' + ', '.join(existing_labels) + '. '\n",
    "            'For entities that dont fit existing types, suggest new type presets. '\n",
    "            'Output JSON: {\"new_presets\": [{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", '\n",
    "            '\"metadata_fields\": [\"field1\", \"field2\", ...]}]}. '\n",
    "            'Only suggest types that are genuinely different from existing ones. '\n",
    "            'Return {\"new_presets\": []} if no new types are needed.'\n",
    "        )},\n",
    "        {'role': 'user', 'content': (\n",
    "            'These entities were classified as generic (concept/text_fragment) '\n",
    "            'because they didnt fit existing types:\\n\\n'\n",
    "            + json.dumps(generic_entities[:30], ensure_ascii=False, indent=2)\n",
    "        )}\n",
    "    ]\n",
    "    \n",
    "    resp = claude_haiku_json(prompt_msg)\n",
    "    new_presets = resp.get('new_presets', [])\n",
    "    \n",
    "    if new_presets:\n",
    "        print(f'Discovered {len(new_presets)} new types:')\n",
    "        for p in new_presets:\n",
    "            print(f\"  - {p['label']} ({p['type_ref']}): {p['metadata_fields']}\")\n",
    "    else:\n",
    "        print('No new types needed.')\n",
    "    \n",
    "    return new_presets\n",
    "\n",
    "new_types = discover_new_types(result, ALL_PRESETS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Si se descubrieron tipos nuevos, re-extraer con presets ampliados\n",
    "if new_types:\n",
    "    EXPANDED_PRESETS = ALL_PRESETS + new_types\n",
    "    print(f'Re-extracting with {len(EXPANDED_PRESETS)} presets...')\n",
    "    \n",
    "    result = extraction_pipeline(\n",
    "        file_path=DOC_PATH,\n",
    "        entity_presets=EXPANDED_PRESETS,\n",
    "        relation_types=RELATION_TYPES,\n",
    "        llm_chat_json=claude_haiku_json,\n",
    "        chunk_size=800,\n",
    "        chunk_overlap=100,\n",
    "        confidence_threshold=0.5,\n",
    "        dedup_threshold=0.85,\n",
    "        on_progress=on_progress,\n",
    "    )\n",
    "    \n",
    "    print(f'\\nEntities: {result.stats.final_entities_count}')\n",
    "    print(f'Relations: {result.stats.final_relations_count}')\n",
    "    \n",
    "    # Re-generar grafo\n",
    "    graph_data = extraction_to_sigma(result)\n",
    "    html_path = render_sigma_html(\n",
    "        graph_data=graph_data,\n",
    "        output_path=os.path.join(output_dir, 'ontology_graph.html'),\n",
    "        title='Ontology Graph (expanded)',\n",
    "    )\n",
    "    print(f'Updated graph: file://{html_path}')\n",
    "else:\n",
    "    print('No re-extraction needed.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'python.functions.core.extract_json_from_llm'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m      5\u001b[39m os.environ[\u001b[33m'FN_REGISTRY_ROOT'\u001b[39m] = ROOT\n\u001b[32m      6\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, ROOT)\n\u001b[32m      7\u001b[39m sys.path.insert(\u001b[32m0\u001b[39m, os.path.join(ROOT, \u001b[33m'python'\u001b[39m, \u001b[33m'functions'\u001b[39m))\n\u001b[32m      8\u001b[39m \n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.core.extract_json_from_llm \u001b[38;5;28;01mimport\u001b[39;00m extract_json_from_llm\n\u001b[32m     10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.extraction_pipeline \u001b[38;5;28;01mimport\u001b[39;00m extraction_pipeline\n\u001b[32m     11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m python.functions.datascience.render_sigma_html \u001b[38;5;28;01mimport\u001b[39;00m render_sigma_html\n\u001b[32m     12\u001b[39m \n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'python.functions.core.extract_json_from_llm'"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "from pathlib import Path\n",
    "\n",
    "ROOT = '/home/lucas/fn_registry'\n",
    "os.environ['FN_REGISTRY_ROOT'] = ROOT\n",
    "sys.path.insert(0, ROOT)\n",
    "sys.path.insert(0, os.path.join(ROOT, 'python', 'functions'))\n",
    "\n",
    "from python.functions.core.extract_json_from_llm import extract_json_from_llm\n",
    "from python.functions.datascience.extraction_pipeline import extraction_pipeline\n",
    "from python.functions.datascience.render_sigma_html import render_sigma_html\n",
    "\n",
    "print('OK: imports loaded')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "imports OK\n"
     ]
    }
   ],
   "source": [
    "import sys, os, json, subprocess\n",
    "\n",
    "# Añadir lib/ al path\n",
    "sys.path.insert(0, '/home/lucas/fn_registry/analysis/ontology_graph/lib')\n",
    "\n",
    "from core_functions import extract_json_from_llm\n",
    "from extraction_pipeline import extraction_pipeline\n",
    "from render_sigma_html import render_sigma_html\n",
    "\n",
    "print('imports OK')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LLM wrapper OK: {'ok': True}\n"
     ]
    }
   ],
   "source": [
    "def claude_haiku_json(messages: list[dict]) -> dict:\n",
    "    \"\"\"Wrapper: messages OpenAI-style -> claude -p --model haiku -> dict.\"\"\"\n",
    "    parts = []\n",
    "    for msg in messages:\n",
    "        role = msg['role']\n",
    "        content = msg['content']\n",
    "        if role == 'system':\n",
    "            parts.append(f'[SYSTEM]\\n{content}')\n",
    "        elif role == 'user':\n",
    "            parts.append(f'[USER]\\n{content}')\n",
    "    prompt = '\\n\\n'.join(parts)\n",
    "    \n",
    "    result = subprocess.run(\n",
    "        ['claude', '-p', '--model', 'haiku', '--output-format', 'json', prompt],\n",
    "        capture_output=True, text=True, timeout=120\n",
    "    )\n",
    "    if result.returncode != 0:\n",
    "        raise RuntimeError(f'claude -p failed: {result.stderr}')\n",
    "    \n",
    "    envelope = json.loads(result.stdout)\n",
    "    raw_text = envelope.get('result', '')\n",
    "    return extract_json_from_llm(raw_text)\n",
    "\n",
    "# Test\n",
    "test = claude_haiku_json([{'role': 'user', 'content': 'Return JSON: {\"ok\": true}'}])\n",
    "print('LLM wrapper OK:', test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19 presets, 35 relation types\n"
     ]
    }
   ],
   "source": [
    "OSINT_PRESETS = [\n",
    "    {'type_ref': 'osint_person_go_cybersecurity', 'label': 'Person',\n",
    "     'metadata_fields': ['full_name', 'alias', 'nationality', 'dob', 'gender', 'risk_score']},\n",
    "    {'type_ref': 'osint_organization_go_cybersecurity', 'label': 'Organization',\n",
    "     'metadata_fields': ['legal_name', 'country', 'sector', 'founded', 'risk_score']},\n",
    "    {'type_ref': 'osint_location_go_cybersecurity', 'label': 'Location',\n",
    "     'metadata_fields': ['lat', 'lon', 'address', 'country', 'city']},\n",
    "    {'type_ref': 'osint_event_go_cybersecurity', 'label': 'Event',\n",
    "     'metadata_fields': ['event_type', 'date', 'location', 'description', 'severity']},\n",
    "    {'type_ref': 'osint_email_go_cybersecurity', 'label': 'Email',\n",
    "     'metadata_fields': ['address', 'provider', 'verified', 'breached']},\n",
    "    {'type_ref': 'osint_domain_go_cybersecurity', 'label': 'Domain',\n",
    "     'metadata_fields': ['fqdn', 'registrar', 'created_date', 'expires_date']},\n",
    "    {'type_ref': 'osint_ip_address_go_cybersecurity', 'label': 'IP Address',\n",
    "     'metadata_fields': ['ip', 'asn', 'country', 'isp', 'geolocation']},\n",
    "    {'type_ref': 'osint_phone_go_cybersecurity', 'label': 'Phone',\n",
    "     'metadata_fields': ['number', 'country_code', 'carrier', 'phone_type']},\n",
    "    {'type_ref': 'osint_social_media_go_cybersecurity', 'label': 'Social Media Account',\n",
    "     'metadata_fields': ['platform', 'username', 'url', 'followers', 'verified']},\n",
    "    {'type_ref': 'osint_document_go_cybersecurity', 'label': 'Document',\n",
    "     'metadata_fields': ['title', 'format', 'classification', 'source']},\n",
    "    {'type_ref': 'osint_crypto_wallet_go_cybersecurity', 'label': 'Crypto Wallet',\n",
    "     'metadata_fields': ['address', 'blockchain', 'balance']},\n",
    "    {'type_ref': 'osint_malware_go_cybersecurity', 'label': 'Malware',\n",
    "     'metadata_fields': ['family', 'hash_sha256', 'threat_level']},\n",
    "    {'type_ref': 'osint_vulnerability_go_cybersecurity', 'label': 'Vulnerability',\n",
    "     'metadata_fields': ['cve_id', 'cvss', 'affected_product', 'exploited']},\n",
    "]\n",
    "\n",
    "GENERIC_PRESETS = [\n",
    "    {'type_ref': 'concept', 'label': 'Concept',\n",
    "     'metadata_fields': ['name', 'category', 'definition']},\n",
    "    {'type_ref': 'url', 'label': 'URL/Link',\n",
    "     'metadata_fields': ['url', 'domain', 'context']},\n",
    "    {'type_ref': 'date_reference', 'label': 'Date/Time',\n",
    "     'metadata_fields': ['date', 'precision', 'context']},\n",
    "    {'type_ref': 'quantity', 'label': 'Quantity/Amount',\n",
    "     'metadata_fields': ['value', 'unit', 'context']},\n",
    "    {'type_ref': 'coordinates', 'label': 'Coordinates',\n",
    "     'metadata_fields': ['lat', 'lon', 'label']},\n",
    "    {'type_ref': 'text_fragment', 'label': 'Key Text Fragment',\n",
    "     'metadata_fields': ['text', 'category', 'relevance']},\n",
    "]\n",
    "\n",
    "ALL_PRESETS = OSINT_PRESETS + GENERIC_PRESETS\n",
    "\n",
    "RELATION_TYPES = [\n",
    "    'employs', 'works_for', 'founded', 'owns', 'controls',\n",
    "    'member_of', 'affiliated_with', 'collaborates_with',\n",
    "    'communicates_with', 'sent_to', 'received_from',\n",
    "    'located_in', 'headquartered_in', 'traveled_to', 'operates_in',\n",
    "    'participated_in', 'caused', 'occurred_at', 'occurred_on',\n",
    "    'mentions', 'references', 'describes', 'authored', 'published',\n",
    "    'funds', 'transacted_with', 'invested_in',\n",
    "    'hosts', 'resolves_to', 'exploits', 'targets',\n",
    "    'related_to', 'part_of', 'instance_of', 'has_attribute',\n",
    "]\n",
    "\n",
    "print(f'{len(ALL_PRESETS)} presets, {len(RELATION_TYPES)} relation types')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.0%] Extracting text from file...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.0%] Extracting entities from chunk 1/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  0.7%] Extracting entities from chunk 2/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  1.5%] Extracting entities from chunk 3/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  2.2%] Extracting entities from chunk 4/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  3.0%] Extracting entities from chunk 5/54\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lucas/fn_registry/analysis/ontology_graph/lib/extraction_pipeline.py:113: UserWarning: extract_entities_llm: type_ref 'osint_service_go_cybersecurity' no esta en el schema, descartando entidad 'Bizum'\n",
      "  candidates = extract_entities_llm(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  3.7%] Extracting entities from chunk 6/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  4.4%] Extracting entities from chunk 7/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  5.2%] Extracting entities from chunk 8/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  5.9%] Extracting entities from chunk 9/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  6.7%] Extracting entities from chunk 10/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  7.4%] Extracting entities from chunk 11/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  8.1%] Extracting entities from chunk 12/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  8.9%] Extracting entities from chunk 13/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [  9.6%] Extracting entities from chunk 14/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [ 10.4%] Extracting entities from chunk 15/54\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  [ 11.1%] Extracting entities from chunk 16/54\n"
     ]
    }
   ],
   "source": [
    "DOC_PATH = '/home/lucas/fn_registry/analysis/ontology_graph/data/condiciones-generales-bizum.pdf'\n",
    "\n",
    "def on_progress(msg, pct):\n",
    "    print(f'  [{pct*100:5.1f}%] {msg}')\n",
    "\n",
    "result = extraction_pipeline(\n",
    "    file_path=DOC_PATH,\n",
    "    entity_presets=ALL_PRESETS,\n",
    "    relation_types=RELATION_TYPES,\n",
    "    llm_chat_json=claude_haiku_json,\n",
    "    chunk_size=800,\n",
    "    chunk_overlap=100,\n",
    "    confidence_threshold=0.5,\n",
    "    dedup_threshold=0.85,\n",
    "    on_progress=on_progress,\n",
    ")\n",
    "\n",
    "print(f'\\nEntities: {result.stats.final_entities_count}')\n",
    "print(f'Relations: {result.stats.final_relations_count}')\n",
    "print(f'Chunks: {result.stats.total_chunks}')\n",
    "print(f'Time: {result.stats.processing_time_seconds:.1f}s')\n",
    "print(f'Entity types: {result.stats.entity_types_found}')\n",
    "print(f'Relation types: {result.stats.relation_types_found}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline optimizado\n",
    "\n",
    "- 1 sola llamada LLM por chunk (entities + relations + tipos nuevos)\n",
    "- Chunks de 2000 chars\n",
    "- Paralelizado con ThreadPoolExecutor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "from extract_text_from_file import extract_text_from_file\n",
    "from core_functions import preprocess_text\n",
    "from split_text_into_chunks import split_text_into_chunks\n",
    "from deduplicate_entities import deduplicate_entities\n",
    "from deduplicate_relations import deduplicate_relations\n",
    "from entity_candidate import EntityCandidate\n",
    "from relation_candidate import RelationCandidate\n",
    "\n",
    "def build_unified_prompt(entity_presets, relation_types):\n",
    "    \"\"\"System prompt que pide entities + relations + tipos nuevos en 1 sola llamada.\"\"\"\n",
    "    type_lines = []\n",
    "    for p in entity_presets:\n",
    "        fields = ', '.join(p.get('metadata_fields', []))\n",
    "        type_lines.append(f\"- {p['label']} (type_ref: {p['type_ref']}): [{fields}]\")\n",
    "\n",
    "    return f'''You are an entity and relation extraction expert. Given text, extract ALL entities and relations in a single pass.\n",
    "\n",
    "ENTITY TYPES:\n",
    "{chr(10).join(type_lines)}\n",
    "\n",
    "RELATION TYPES: {', '.join(relation_types)}\n",
    "\n",
    "OUTPUT FORMAT (strict JSON):\n",
    "{{\n",
    "  \"entities\": [\n",
    "    {{\"name\": \"...\", \"type_ref\": \"...\", \"attributes\": {{...}}, \"confidence\": 0.9}}\n",
    "  ],\n",
    "  \"relations\": [\n",
    "    {{\"from_name\": \"...\", \"to_name\": \"...\", \"relation_type\": \"...\", \"confidence\": 0.8, \"description\": \"...\"}}\n",
    "  ],\n",
    "  \"suggested_types\": [\n",
    "    {{\"type_ref\": \"snake_case_id\", \"label\": \"Human Label\", \"metadata_fields\": [\"field1\", \"field2\"], \"reason\": \"why this type is needed\"}}\n",
    "  ]\n",
    "}}\n",
    "\n",
    "RULES:\n",
    "- Extract ALL entities explicitly mentioned in the text\n",
    "- Use exact type_ref from the schema. Leave unknown attributes as null\n",
    "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied\n",
    "- Relations: from_name and to_name MUST match extracted entity names exactly\n",
    "- suggested_types: if you find important entities that do NOT fit any existing type, suggest a new type with its fields. Use these suggested types for those entities in the entities array.\n",
    "- If no suggested types are needed, return \"suggested_types\": []\n",
    "- Respond in the same language as the text for descriptions'''\n",
    "\n",
    "UNIFIED_PROMPT = build_unified_prompt(ALL_PRESETS, RELATION_TYPES)\n",
    "print(f'Prompt length: {len(UNIFIED_PROMPT)} chars')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
@@ -0,0 +1,15 @@
 [project]
 name = "ontology-graph"
 version = "0.1.0"
 description = "LLM-based entity/relation extraction notebooks with sigma.js graph visualization"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
    "jupyter>=1.1.1",
    "jupyter-collaboration>=4.3.0",
    "jupyter-mcp-server>=0.4.0",
    "jupyterlab>=4.5.6",
    "matplotlib>=3.10.8",
    "numpy>=2.4.4",
    "pandas>=3.0.2",
 ]
@@ -0,0 +1,45 @@
 #!/bin/bash
 # Jupyter Lab — modo colaborativo con autodeteccion de puerto
 # Generado por write_jupyter_launcher (fn_registry)
 # Print the first port in 8888-8899 that is not reported as listening.
 # A port counts as busy when `ss -tln` lists it OR `lsof -i` finds a user.
 # Errors from `ss`/`lsof` are silenced, so a tool that is not installed
 # simply does not veto the port.
 # Output: the chosen port on stdout. When every candidate is busy, 8888 is
 # still printed (the caller captures stdout and expects a port number), and
 # a warning goes to stderr so the fallback is no longer silent.
 find_free_port() {
    # Keep the loop variable out of the caller's scope.
    local port
    for port in {8888..8899}; do
        if ! ss -tln 2>/dev/null | grep -q ":${port} " && \
           ! lsof -i:"$port" >/dev/null 2>&1; then
            echo "$port"
            return 0
        fi
    done
    echo "AVISO: ningun puerto libre en 8888-8899; se usa 8888 por defecto" >&2
    echo 8888
 }
 # Port: first CLI argument if given, otherwise the auto-detected one.
 PORT=${1:-$(find_free_port)}
 # Run from the script's own directory; abort if that fails so the port file
 # and the Jupyter root dir are never created relative to an unrelated cwd.
 cd "$(dirname "$0")" || exit 1
 # Record the chosen port in .jupyter-port next to the script.
 echo "$PORT" > .jupyter-port
 # Activate the local virtualenv when present; a missing .venv is tolerated.
 source .venv/bin/activate 2>/dev/null || true
 # --collaborative needs the jupyter_collaboration package; fail early with
 # an install hint instead of letting Jupyter start without it.
 if ! python -c "import jupyter_collaboration" 2>/dev/null; then
    echo "ERROR: jupyter-collaboration no esta instalado"
    echo "Instala con: uv add jupyter-collaboration"
    exit 1
 fi
 echo "════════════════════════════════════════════════"
 echo "  Jupyter Lab + Colaboracion en puerto $PORT"
 echo "════════════════════════════════════════════════"
 echo ""
 echo "  Abre: http://localhost:$PORT"
 echo "  Ctrl+C para detener"
 echo ""
 # SECURITY NOTE(review): the flags below disable token and password auth,
 # turn off XSRF checks and accept any origin. Anything that can reach this
 # port can execute code as the current user. No --ip/--ServerApp.ip is
 # passed here, so the bind address is whatever the Jupyter config/default
 # provides — confirm it is loopback-only before using on a shared network.
 jupyter lab \
    --port="$PORT" \
    --no-browser \
    --ServerApp.token='' \
    --ServerApp.password='' \
    --ServerApp.disable_check_xsrf=True \
    --ServerApp.allow_origin='*' \
    --ServerApp.root_dir="$(pwd)" \
    --collaborative