merge: issue/0040-hybrid-extraction-pipeline — pipeline hibrido extraccion grafos

2026-04-30 16:53:31 +02:00
parent 0c1c08742b f8c34d4b16
commit 4e1f0c831d
6 changed files with 746 additions and 1 deletions
@@ -45,7 +45,7 @@
 | [0037](completed/0037-ioc-regex-extractor.md) | IoC regex extractor (IP, email, dominio, hash, wallet, CVE, MAC) | completado | alta | feature | — |
 | [0038](completed/0038-gliner-entity-extractor.md) | GLiNER entity extractor (zero-shot NER multilingue) | completado | alta | feature | 0039, 0040 |
 | [0039](completed/0039-glirel-relation-extractor.md) | GLiREL relation extractor (zero-shot triplets) | completado | media | feature | 0040 |
-| [0040](0040-hybrid-extraction-pipeline.md) | Pipeline hibrido extraccion grafos (regex + GLiNER + GLiREL + LLM fallback) | pendiente | media | feature | — |
+| [0040](completed/0040-hybrid-extraction-pipeline.md) | Pipeline hibrido extraccion grafos (regex + GLiNER + GLiREL + LLM fallback) | completado | media | feature | — |
 | [0041](completed/0041-cpp-app-best-practices.md) | C++ app shell estandarizado (PATTERNS.md + AppConfig extendido) | completado | alta | feature | 0043 |
 | [0042](completed/0042-cpp-layout-storage-public.md) | C++ layout_storage publico (extraer de shaders_lab) | completado | alta | feature | 0043 |
 | [0043](completed/0043-cpp-apps-standardize-shell.md) | Estandarizar shell de las 4 apps C++ | completado | alta | refactor | 0046 |
@@ -0,0 +1,192 @@
 ---
 name: extract_graph_hybrid
 kind: pipeline
 lang: py
 domain: pipelines
 version: "1.0.0"
 purity: impure
 signature: "def extract_graph_hybrid(chunks: list[str], entity_schema: list[dict], relation_types: list[str], gliner_model: Any, glirel_model: Any, llm_chat_json: Callable[[list[dict]], dict] | None = None, ioc_types: list[str] | None = None, confidence_threshold: float = 0.6, languages: str = 'Respond in Spanish.', min_entities_per_chunk: int = 2) -> tuple[list[EntityCandidate], list[RelationCandidate]]"
 description: "Pipeline hibrido en cascada que combina extract_iocs (regex, coste 0), GLiNER (zero-shot NER, coste bajo), GLiREL (zero-shot RE) y un LLM fallback opcional para chunks complejos o de baja confianza. Devuelve listas concatenadas listas para deduplicate_entities/deduplicate_relations."
 tags: [pipeline, extraction, entities, relations, gliner, glirel, ioc, regex, llm, nlp, datascience, cybersecurity, hybrid]
 uses_functions:
  - extract_iocs_py_cybersecurity
  - extract_entities_gliner_py_datascience
  - extract_relations_glirel_py_datascience
  - extract_entities_llm_py_datascience
  - extract_relations_llm_py_datascience
 uses_types:
  - entity_candidate_py_datascience
  - relation_candidate_py_datascience
 returns:
  - entity_candidate_py_datascience
  - relation_candidate_py_datascience
 returns_optional: false
 error_type: "error_go_core"
 imports:
  - typing.Any
  - typing.Callable
  - warnings
 params:
  - name: chunks
    desc: "Lista de fragmentos de texto ya cortados (p.ej. via split_text_into_chunks)."
  - name: entity_schema
    desc: "Schema para GLiNER y LLM. Lista de dicts con type_ref, label y opcional metadata_fields."
  - name: relation_types
    desc: "Tipos de relacion permitidos para GLiREL/LLM (ej: ['operates','owns','communicates_with'])."
  - name: gliner_model
    desc: "Instancia GLiNER cargada con gliner_load_model. Inyectada por el caller."
  - name: glirel_model
    desc: "Instancia GLiREL cargada con glirel_load_model. Inyectada por el caller."
  - name: llm_chat_json
    desc: "Cliente LLM inyectado (sin acoplamiento al proveedor). Si None, no hay fallback LLM."
  - name: ioc_types
    desc: "Subset de tipos para extract_iocs (email, ip_address, domain, file_hash, ...). None = todos."
  - name: confidence_threshold
    desc: "Por debajo de este umbral, GLiNER se considera de baja confianza y se invoca el LLM."
  - name: languages
    desc: "Instruccion de idioma passthrough al LLM (ej: 'Respond in Spanish.')."
  - name: min_entities_per_chunk
    desc: "Si un chunk arroja menos entidades que esto, se invoca el LLM como fallback (default 2)."
 output: "Tupla (entities, relations) con candidatas concatenadas (sin deduplicar). El caller debe pasar por deduplicate_entities y deduplicate_relations."
 tested: true
 tests:
  - "corpus OSINT con IoCs y entidades semanticas devuelve mezcla regex+GLiNER"
  - "chunks vacios o con solo whitespace se saltan"
  - "entity_schema vacio lanza ValueError"
  - "chunks no-lista lanza ValueError"
  - "GLiNER produciendo pocas entidades dispara fallback LLM si llm_chat_json esta presente"
  - "sin llm_chat_json no se invoca ningun fallback LLM"
  - "GLiREL sin relaciones dispara fallback LLM relations"
  - "ioc_types acota el set de extractores regex"
  - "errores de extractores se capturan con warnings y no abortan el pipeline"
 test_file_path: "python/functions/pipelines/tests/test_extract_graph_hybrid.py"
 file_path: "python/functions/pipelines/extract_graph_hybrid.py"
 ---
 ## Ejemplo
 ```python
 from python.functions.pipelines.extract_graph_hybrid import extract_graph_hybrid
 from python.functions.datascience.gliner_load_model import gliner_load_model
 from python.functions.datascience.glirel_load_model import glirel_load_model
 from python.functions.datascience.deduplicate_entities import deduplicate_entities
 from python.functions.datascience.deduplicate_relations import deduplicate_relations
 gliner = gliner_load_model("urchade/gliner_multi-v2.1", device="auto")
 glirel = glirel_load_model("jackboyla/glirel-large-v0", device="auto")
 entity_schema = [
    {"type_ref": "osint_person_go_cybersecurity",       "label": "Person"},
    {"type_ref": "osint_organization_go_cybersecurity", "label": "Organization"},
    {"type_ref": "osint_location_go_cybersecurity",     "label": "Location"},
 ]
 relation_types = ["operates", "owns", "communicates_with", "employed_by"]
 chunks = [
    "Alice Johnson works at OpenAI in San Francisco. Contact: alice@openai.com.",
    "The C2 server lives at 192.168.0.1 and resolves to evil-corp.com.",
 ]
 # Sin LLM (coste cero, solo regex + GLiNER + GLiREL)
 entities, relations = extract_graph_hybrid(
    chunks=chunks,
    entity_schema=entity_schema,
    relation_types=relation_types,
    gliner_model=gliner,
    glirel_model=glirel,
    llm_chat_json=None,
 )
 # Con LLM fallback solo en chunks complejos
 def llm_chat_json(messages):
    # llamar a OpenAI/Anthropic/Ollama y devolver el JSON ya parseado
    ...
 entities, relations = extract_graph_hybrid(
    chunks=chunks,
    entity_schema=entity_schema,
    relation_types=relation_types,
    gliner_model=gliner,
    glirel_model=glirel,
    llm_chat_json=llm_chat_json,
    confidence_threshold=0.6,
    min_entities_per_chunk=2,
 )
 # Deduplicar antes de persistir
 dedup = deduplicate_entities(entities, name_threshold=0.85)
 final_relations = deduplicate_relations(relations, dedup.name_to_id)
 ```
 ## Algoritmo
 Por cada chunk:
 1. **Regex (capa tecnica)** — `extract_iocs(chunk, ioc_types)` devuelve dicts
   `{value, start, end, type}` que se mapean a `EntityCandidate` con
   `type_ref` propio (`ioc_email`, `ioc_ip_address`, `ioc_domain`, ...) y
   `confidence=1.0`. Los offsets se anotan en `attributes['start'/'end']`
   para que GLiREL pueda mapearlos a tokens sin fallback `text.find`.
 2. **GLiNER (capa semantica)** — `extract_entities_gliner` con el schema y
   el `confidence_threshold` como filtro de score.
 3. **Merge** — IoCs + GLiNER deduplicados por `(name, type_ref)`. NO se
   colapsa fuzzy aqui; eso lo hace el caller.
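 4. **LLM fallback (opcional)** — solo si `llm_chat_json is not None` **y**
   ademas se cumple alguna de estas condiciones: el chunk tiene menos de
   `min_entities_per_chunk` entidades (IoCs + GLiNER) **o**
   `mean(gliner_confidence) < confidence_threshold` (la media de un chunk
   sin entidades GLiNER es 0.0, asi que siempre cae al fallback). En ese
   caso se invoca `extract_entities_llm` y se mezcla.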
 5. **GLiREL (relaciones zero-shot)** — solo si hay >=2 entidades.
 6. **LLM fallback de relaciones (opcional)** — si GLiREL no devolvio nada
   con >=2 entidades **y** hay `llm_chat_json`, se invoca
   `extract_relations_llm` para ese chunk.
 `source_chunk_indices` y `source_chunk_index` se rellenan para que
 `deduplicate_relations` pueda reconstruir el grafo origen→destino.
 ## Por que cascada y no all-LLM
 | Capa | Coste por 100 KB | Latencia | Calidad |
 |------|------------------|----------|---------|
 | `extract_iocs` (regex) | 0 | <50 ms | Precision 100% en IoCs tecnicos |
 | GLiNER (`gliner_multi-v2.1`) | 0 (modelo local, GPU/CPU) | ~1-3 s/chunk en CPU, <0.5 s en GPU | F1 0.7-0.85 en NER zero-shot |
 | GLiREL (`glirel-large-v0`) | 0 (modelo local) | ~2-4 s/chunk en CPU | F1 0.5-0.75 en RE zero-shot |
 | LLM (GPT-4 / Claude Sonnet) | $0.5-3 por 100 KB | 5-15 s/chunk | F1 0.85-0.95 |
 El pipeline hibrido reserva el LLM (caro y lento) para los chunks que
 GLiNER/GLiREL no resuelven con suficiente confianza. En corpus OSINT
 tipicos el LLM se invoca en <20% de los chunks → coste total 5-10x menor
 que un pipeline 100% LLM con perdida de calidad <5 puntos F1.
 ## Solapamiento IoC ↔ GLiNER
 GLiNER puede detectar `apple.com` como `Organization` mientras que regex
 lo detecta como `domain`. **Decision intencional**: ambos se conservan
 con `type_ref` distinto (`osint_organization_go_cybersecurity` vs
 `ioc_domain`). `deduplicate_entities(..., same_type_only=True)` no las
 mezcla. El caller decide si quiere unificar (por ejemplo, anotando una
 relacion `domain_of` entre las dos).
 ## Recomendaciones operativas
 - **Batch size**: ~100-200 chunks de 500-1000 caracteres por llamada al
  pipeline. Mas chunks → mas paralelismo aprovechable; menos chunks →
  menos overhead de carga del modelo.
 - **Latencia esperada (CPU)**: ~3-5 s/chunk sin LLM, +5-15 s/chunk si
  cae al LLM fallback.
 - **Latencia esperada (GPU)**: ~0.5-1 s/chunk sin LLM.
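 - **Cuando bajar `confidence_threshold`**: en corpus con jerga muy
  especifica donde GLiNER no aprendio bien — a cambio de mas falsos
  positivos. El mismo umbral filtra GLiNER y GLiREL, asi que bajarlo hace
  que menos chunks caigan al fallback LLM; subirlo tiene el efecto
  contrario e incrementa el coste si hay LLM.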
 - **Cuando subir `min_entities_per_chunk`**: si quieres forzar fallback
  LLM en chunks "ricos" para asegurar cobertura completa.
 ## Notas
 - La deduplicacion fuzzy (Levenshtein + Union-Find) la hace
  `deduplicate_entities` — NO replicar aqui.
 - Los errores de cualquier extractor en cualquier chunk se capturan con
  `warnings.warn` y NO abortan el pipeline (robustez sobre completitud).
 - Las funciones LLM aceptan `language_instruction`; aqui se pasa como
  `languages` (default `"Respond in Spanish."`).
 - Pensar en una app `apps/osint_extractor/` que use este pipeline + sigma
  viz como demo. Fuera de scope de este issue.
@@ -0,0 +1,260 @@
 """Pipeline hibrido: extract_iocs + GLiNER + GLiREL + LLM fallback."""
 from __future__ import annotations
 import os
 import sys
 import warnings
 from typing import Any, Callable
 _ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)
 from python.functions.cybersecurity.extract_iocs import extract_iocs
 from python.functions.datascience.extract_entities_gliner import extract_entities_gliner
 from python.functions.datascience.extract_relations_glirel import extract_relations_glirel
 from python.functions.datascience.extract_entities_llm import extract_entities_llm
 from python.functions.datascience.extract_relations_llm import extract_relations_llm
 from python.types.datascience.entity_candidate import EntityCandidate
 from python.types.datascience.relation_candidate import RelationCandidate
 # Display label for each IoC kind; keys are matched against the `type` field
 # of the dicts handed to `_ioc_dict_to_candidate`.
 _IOC_LABEL = dict(
    email="Email",
    ip_address="IPAddress",
    domain="Domain",
    file_hash="FileHash",
    crypto_wallet="CryptoWallet",
    cve_id="CVE",
    mac_address="MACAddress",
    phone_number="PhoneNumber",
 )
 # `type_ref` assigned to each IoC kind: the kind name prefixed with "ioc_".
 _IOC_TYPE_REF = {ioc_kind: f"ioc_{ioc_kind}" for ioc_kind in _IOC_LABEL}
 def _ioc_dict_to_candidate(ioc: dict, chunk_index: int) -> EntityCandidate:
    """Build an ``EntityCandidate`` from one IoC dict.

    The dict is read with ``.get`` for the keys ``value``, ``type``,
    ``start`` and ``end``. Missing offsets default to ``-1``; an unknown
    ``type`` falls back to ``ioc_<type>`` as ``type_ref`` and to the raw
    type string as label. Confidence is always ``1.0``.

    The offsets are stored under ``attributes['start']`` / ``['end']``; the
    original note says this lets ``extract_relations_glirel`` map them to
    tokens without a ``text.find`` fallback.
    """
    kind = ioc.get("type", "")
    value = ioc.get("value", "")
    resolved_type_ref = _IOC_TYPE_REF.get(kind, f"ioc_{kind}")
    resolved_label = _IOC_LABEL.get(kind, kind)
    offsets = {"start": ioc.get("start", -1), "end": ioc.get("end", -1)}
    return EntityCandidate(
        name=value,
        type_ref=resolved_type_ref,
        type_label=resolved_label,
        attributes=offsets,
        confidence=1.0,
        source_chunk_indices=[chunk_index],
    )
 def _mean_confidence(entities: list[EntityCandidate]) -> float:
    """Return the arithmetic mean of ``confidence``, or ``0.0`` when empty."""
    return sum(e.confidence for e in entities) / len(entities) if entities else 0.0
 def _merge_entities_dedup_by_name_type(
    base: list[EntityCandidate],
    extra: list[EntityCandidate],
 ) -> list[EntityCandidate]:
    """Return ``base`` plus the items of ``extra`` not already present.

    Two entities count as the same when both ``name`` and ``type_ref`` are
    equal (exact match, no fuzzy logic). Every element of ``base`` is kept,
    even if ``base`` itself contains repeats; only ``extra`` is filtered,
    against ``base`` and against earlier ``extra`` items. Neither input
    list is modified.
    """
    merged = [*base]
    known_keys = set()
    for existing in base:
        known_keys.add((existing.name, existing.type_ref))
    for candidate in extra:
        identity = (candidate.name, candidate.type_ref)
        if identity not in known_keys:
            known_keys.add(identity)
            merged.append(candidate)
    return merged
 def _call_extractor_safely(
    stage: str,
    chunk_index: int,
    extractor: Callable[..., list],
    *args: Any,
    **kwargs: Any,
 ) -> list:
    """Run one extraction stage, degrading to an empty list on failure.

    Any exception raised by ``extractor`` is reported through
    ``warnings.warn`` (message carries the stage name and chunk index) and
    an empty list is returned, so one failing stage never aborts the run.
    """
    try:
        return extractor(*args, **kwargs)
    except Exception as exc:  # deliberate best-effort boundary
        warnings.warn(
            f"extract_graph_hybrid: {stage} fallo en chunk {chunk_index}: {exc}",
            # 1 = this helper, 2 = extract_graph_hybrid, 3 = its caller.
            stacklevel=3,
        )
        return []
 def _tag_with_chunk(entities: list[EntityCandidate], chunk_index: int) -> None:
    """Add ``chunk_index`` to each entity's ``source_chunk_indices`` if absent."""
    for ent in entities:
        if chunk_index not in ent.source_chunk_indices:
            ent.source_chunk_indices.append(chunk_index)
 def extract_graph_hybrid(
    chunks: list[str],
    entity_schema: list[dict],
    relation_types: list[str],
    gliner_model: Any,
    glirel_model: Any,
    llm_chat_json: Callable[[list[dict]], dict] | None = None,
    ioc_types: list[str] | None = None,
    confidence_threshold: float = 0.6,
    languages: str = "Respond in Spanish.",
    min_entities_per_chunk: int = 2,
 ) -> tuple[list[EntityCandidate], list[RelationCandidate]]:
    """Extract entity and relation candidates with a regex + GLiNER + GLiREL + LLM cascade.

    Per chunk, in order:
      1. ``extract_iocs(chunk, ioc_types)``; each hit becomes an
         ``EntityCandidate`` with confidence 1.0.
      2. ``extract_entities_gliner`` with ``confidence_threshold`` passed as
         its ``threshold``.
      3. IoC and GLiNER entities are merged, dropping exact
         ``(name, type_ref)`` repeats.
      4. If ``llm_chat_json`` is given AND (the merged list is shorter than
         ``min_entities_per_chunk`` OR the mean confidence of the GLiNER
         entities is below ``confidence_threshold``), ``extract_entities_llm``
         runs and its output is merged the same way. An empty GLiNER result
         has mean 0.0, so it always satisfies the second condition.
      5. With at least two entities, ``extract_relations_glirel`` runs, again
         with ``confidence_threshold`` as ``threshold``.
      6. If ``llm_chat_json`` is given, there are at least two entities and
         GLiREL produced nothing (including when it failed),
         ``extract_relations_llm`` runs for that chunk.

    A failure in any extractor is turned into a ``warnings.warn`` call and an
    empty result for that stage; the pipeline keeps going.

    Args:
        chunks: Text fragments to process. Falsy or whitespace-only chunks
            are skipped silently; truthy non-``str`` chunks are skipped with
            a warning.
        entity_schema: Schema forwarded to the GLiNER and LLM entity
            extractors. Must be non-empty.
        relation_types: Relation types forwarded to GLiREL and the LLM
            relation extractor.
        gliner_model: Model object forwarded to ``extract_entities_gliner``.
        glirel_model: Model object forwarded to ``extract_relations_glirel``.
        llm_chat_json: Injected LLM client. ``None`` disables both LLM
            fallbacks.
        ioc_types: Forwarded unchanged to ``extract_iocs``.
        confidence_threshold: Score filter for GLiNER/GLiREL and cut-off for
            the mean-confidence fallback test.
        languages: Forwarded as ``language_instruction`` to the LLM
            extractors.
        min_entities_per_chunk: Minimum merged entity count below which the
            LLM entity fallback is attempted.

    Returns:
        ``(entities, relations)``: all candidates of all chunks concatenated
        in chunk order, without cross-chunk deduplication. Each entity has
        the chunk index in ``source_chunk_indices``; each relation has
        ``source_chunk_index`` set to its chunk index.

    Raises:
        ValueError: If ``chunks`` is not a list or ``entity_schema`` is empty.
    """
    if not isinstance(chunks, list):
        raise ValueError("chunks debe ser una lista")
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")
    has_llm = llm_chat_json is not None
    all_entities: list[EntityCandidate] = []
    all_relations: list[RelationCandidate] = []
    for chunk_index, chunk in enumerate(chunks):
        if not chunk:
            continue
        if not isinstance(chunk, str):
            # A stray non-text item must not abort the run with an
            # AttributeError on .strip(); report it and move on.
            warnings.warn(
                f"extract_graph_hybrid: chunk {chunk_index} no es str "
                f"({type(chunk).__name__}); se omite",
                stacklevel=2,
            )
            continue
        if not chunk.strip():
            continue
        # Layer 1: regex IoCs.
        ioc_dicts = _call_extractor_safely(
            "extract_iocs", chunk_index, extract_iocs, chunk, ioc_types
        )
        ioc_entities = [_ioc_dict_to_candidate(d, chunk_index) for d in ioc_dicts]
        # Layer 2: GLiNER.
        gliner_entities = _call_extractor_safely(
            "extract_entities_gliner",
            chunk_index,
            extract_entities_gliner,
            text=chunk,
            entity_schema=entity_schema,
            model=gliner_model,
            threshold=confidence_threshold,
        )
        _tag_with_chunk(gliner_entities, chunk_index)
        chunk_entities = _merge_entities_dedup_by_name_type(ioc_entities, gliner_entities)
        # Layer 3: optional LLM entity fallback. The mean is taken over the
        # GLiNER entities only, not over the merged list.
        if has_llm and (
            len(chunk_entities) < min_entities_per_chunk
            or _mean_confidence(gliner_entities) < confidence_threshold
        ):
            llm_entities = _call_extractor_safely(
                "extract_entities_llm",
                chunk_index,
                extract_entities_llm,
                text=chunk,
                entity_schema=entity_schema,
                llm_chat_json=llm_chat_json,
                language_instruction=languages,
            )
            _tag_with_chunk(llm_entities, chunk_index)
            chunk_entities = _merge_entities_dedup_by_name_type(chunk_entities, llm_entities)
        all_entities.extend(chunk_entities)
        # Relations need at least two endpoints.
        if len(chunk_entities) < 2:
            continue
        # Layer 4: GLiREL.
        glirel_relations = _call_extractor_safely(
            "extract_relations_glirel",
            chunk_index,
            extract_relations_glirel,
            text=chunk,
            entities=chunk_entities,
            relation_types=relation_types,
            model=glirel_model,
            threshold=confidence_threshold,
        )
        for rel in glirel_relations:
            rel.source_chunk_index = chunk_index
        all_relations.extend(glirel_relations)
        # Layer 5: optional LLM relation fallback, only when GLiREL gave nothing.
        if has_llm and not glirel_relations:
            llm_relations = _call_extractor_safely(
                "extract_relations_llm",
                chunk_index,
                extract_relations_llm,
                text=chunk,
                entities=chunk_entities,
                relation_types=relation_types,
                llm_chat_json=llm_chat_json,
                language_instruction=languages,
            )
            for rel in llm_relations:
                rel.source_chunk_index = chunk_index
            all_relations.extend(llm_relations)
    return all_entities, all_relations
@@ -0,0 +1,293 @@
 """Tests de integracion para extract_graph_hybrid.
 Stubs duck-typed para gliner/glirel/LLM permiten ejercitar la cascada
 sin descargar modelos pesados.
 """
 from __future__ import annotations
 import os
 import sys
 from dataclasses import dataclass, field
 from typing import Any
 import pytest
 _ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
 if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)
 from python.functions.pipelines.extract_graph_hybrid import extract_graph_hybrid
 from python.types.datascience.entity_candidate import EntityCandidate
 from python.types.datascience.relation_candidate import RelationCandidate
 # ── Stubs ──────────────────────────────────────────────────────────────────────
@dataclass
class StubGliner:
    """GLiNER stand-in that replays canned outputs, one per call, in order."""

    # Canned return values; the n-th call returns the n-th item.
    responses: list[list[dict]] = field(default_factory=list)
    # Number of predict_entities calls made so far.
    calls: int = 0

    def predict_entities(self, text, labels, threshold, flat_ner):
        """Return the next canned response, or ``[]`` once they run out."""
        position, self.calls = self.calls, self.calls + 1
        return self.responses[position] if position < len(self.responses) else []
@dataclass
class StubGlirel:
    """GLiREL stand-in that replays canned outputs, one per call, in order."""

    # Canned return values; the n-th call returns the n-th item.
    responses: list[list[dict]] = field(default_factory=list)
    # Number of predict_relations calls made so far.
    calls: int = 0

    def predict_relations(self, tokens, labels, threshold, ner, top_k=1):
        """Return the next canned response, or ``[]`` once they run out."""
        call_no = self.calls
        self.calls = call_no + 1
        if call_no >= len(self.responses):
            return []
        return self.responses[call_no]
@dataclass
class StubLLM:
    """LLM stand-in that routes each call by the first message's content.

    A first message whose content contains "relation extraction expert"
    (case-insensitive) is served from ``relation_responses``; anything else,
    including an empty message list, is served from ``entity_responses``.
    """

    # Canned replies for entity-extraction calls, consumed in order.
    entity_responses: list[dict] = field(default_factory=list)
    # Canned replies for relation-extraction calls, consumed in order.
    relation_responses: list[dict] = field(default_factory=list)
    # Per-route call counters.
    entity_calls: int = 0
    relation_calls: int = 0

    def __call__(self, messages: list[dict]) -> dict:
        """Return the next canned reply for the route, or an empty payload."""
        prompt = messages[0]["content"] if messages else ""
        if "relation extraction expert" in prompt.lower():
            queue, counter, payload_key = self.relation_responses, "relation_calls", "relations"
        else:
            queue, counter, payload_key = self.entity_responses, "entity_calls", "entities"
        position = getattr(self, counter)
        setattr(self, counter, position + 1)
        if position < len(queue):
            return queue[position]
        return {payload_key: []}
 # Relation vocabulary shared by the tests.
 RELATION_TYPES = "operates owns communicates_with employed_by related_to".split()
 # Entity schema shared by the tests: one entry per label.
 SCHEMA = [
    {"type_ref": f"osint_{label.lower()}_go_cybersecurity", "label": label}
    for label in ("Person", "Organization", "Location")
 ]
 # ── Tests ──────────────────────────────────────────────────────────────────────
 def test_corpus_osint_devuelve_mezcla_regex_gliner():
    """An OSINT chunk with an IoC and semantic entities yields regex + GLiNER candidates."""
    gliner_hits = [
        {"start": 0, "end": 13, "text": "Alice Johnson", "label": "Person", "score": 0.92},
        {"start": 23, "end": 29, "text": "OpenAI", "label": "Organization", "score": 0.88},
    ]
    entities, relations = extract_graph_hybrid(
        chunks=["Alice Johnson works at OpenAI. Contact: alice@openai.com"],
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(responses=[gliner_hits]),
        glirel_model=StubGlirel(responses=[[]]),
        llm_chat_json=None,
    )
    # The regex layer contributes the e-mail address.
    email_hits = [
        e for e in entities
        if e.type_ref == "ioc_email" and e.name == "alice@openai.com"
    ]
    assert email_hits
    # GLiNER contributes the person and the organization.
    seen_type_refs = {e.type_ref for e in entities}
    assert "osint_person_go_cybersecurity" in seen_type_refs
    assert "osint_organization_go_cybersecurity" in seen_type_refs
    # Every entity is tagged with the index of the only chunk.
    for entity in entities:
        assert 0 in entity.source_chunk_indices
    assert relations == []
 def test_chunks_vacios_se_saltan():
    """Empty or whitespace-only chunks are skipped without calling the models."""
    blank_chunks = ["", "   ", "\n\t"]
    gliner, glirel = StubGliner(responses=[]), StubGlirel(responses=[])
    entities, relations = extract_graph_hybrid(
        chunks=blank_chunks,
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=gliner,
        glirel_model=glirel,
    )
    assert entities == []
    assert relations == []
    # Neither stub was invoked.
    assert gliner.calls == 0
    assert glirel.calls == 0
 def test_entity_schema_vacio_lanza_value_error():
    """An empty entity_schema raises ValueError."""
    call_kwargs = dict(
        chunks=["text"],
        entity_schema=[],
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(),
        glirel_model=StubGlirel(),
    )
    with pytest.raises(ValueError):
        extract_graph_hybrid(**call_kwargs)
 def test_chunks_no_lista_lanza_value_error():
    """Passing something other than a list as chunks raises ValueError."""
    not_a_list = "no soy lista"
    call_kwargs = dict(
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(),
        glirel_model=StubGlirel(),
    )
    with pytest.raises(ValueError):
        extract_graph_hybrid(chunks=not_a_list, **call_kwargs)  # type: ignore[arg-type]
 def test_gliner_pocas_entidades_dispara_fallback_llm():
    """Too few GLiNER entities triggers the LLM entity fallback."""
    llm_payload = {
        "entities": [
            {
                "name": "Acme Corp",
                "type_ref": "osint_organization_go_cybersecurity",
                "attributes": {},
                "confidence": 0.95,
            },
            {
                "name": "Bob",
                "type_ref": "osint_person_go_cybersecurity",
                "attributes": {},
                "confidence": 0.9,
            },
        ],
    }
    llm = StubLLM(entity_responses=[llm_payload])
    result = extract_graph_hybrid(
        chunks=["Texto complejo sin patrones obvios."],
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(responses=[[]]),  # GLiNER finds nothing
        glirel_model=StubGlirel(responses=[[]]),
        llm_chat_json=llm,
        min_entities_per_chunk=2,
    )
    found_names = {entity.name for entity in result[0]}
    assert "Acme Corp" in found_names
    assert "Bob" in found_names
    assert llm.entity_calls == 1
 def test_sin_llm_no_se_invoca_fallback():
    """With llm_chat_json=None the result is empty when GLiNER and regex find nothing."""
    outcome = extract_graph_hybrid(
        chunks=["chunk dificil"],
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(responses=[[]]),
        glirel_model=StubGlirel(responses=[[]]),
        llm_chat_json=None,
    )
    entities, relations = outcome
    # No LLM, and this chunk holds nothing for the regex layer either.
    assert entities == []
    assert relations == []
 def test_glirel_sin_relaciones_dispara_fallback_llm_relations():
    """An empty GLiREL result triggers the LLM relation fallback."""
    gliner_hits = [
        {"start": 0, "end": 13, "text": "Alice Johnson", "label": "Person", "score": 0.95},
        {"start": 26, "end": 32, "text": "OpenAI", "label": "Organization", "score": 0.9},
    ]
    llm_relation = {
        "from_name": "Alice Johnson",
        "to_name": "OpenAI",
        "relation_type": "employed_by",
        "description": "...",
        "confidence": 0.9,
    }
    llm = StubLLM(relation_responses=[{"relations": [llm_relation]}])
    _, relations = extract_graph_hybrid(
        chunks=["Alice Johnson trabaja para OpenAI."],
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(responses=[gliner_hits]),
        glirel_model=StubGlirel(responses=[[]]),  # GLiREL finds no relations
        llm_chat_json=llm,
        confidence_threshold=0.5,
        min_entities_per_chunk=2,
    )
    assert len(relations) == 1
    only = relations[0]
    assert only.from_name == "Alice Johnson"
    assert only.to_name == "OpenAI"
    assert only.relation_type == "employed_by"
    assert only.source_chunk_index == 0
    assert llm.relation_calls == 1
 def test_ioc_types_acota_extractores():
    """ioc_types restricts which regex extractors contribute entities."""
    text = "Email: x@y.com, IP: 192.168.0.1, MD5: 5d41402abc4b2a76b9719d911017c592."
    entities, _ = extract_graph_hybrid(
        chunks=[text],
        entity_schema=SCHEMA,
        relation_types=RELATION_TYPES,
        gliner_model=StubGliner(responses=[[]]),
        glirel_model=StubGlirel(responses=[[]]),
        llm_chat_json=None,
        ioc_types=["email"],  # e-mails only
    )
    found_type_refs = {entity.type_ref for entity in entities}
    assert "ioc_email" in found_type_refs
    for excluded in ("ioc_ip_address", "ioc_file_hash"):
        assert excluded not in found_type_refs
 def test_errores_se_capturan_con_warning():
    """Extractor failures surface as warnings and do not abort the pipeline."""
    class BoomGliner:
        """Model whose entity prediction always fails."""
        def predict_entities(self, *args, **kwargs):
            raise RuntimeError("boom")
    class BoomGlirel:
        """Model whose relation prediction always fails."""
        def predict_relations(self, *args, **kwargs):
            raise RuntimeError("boom")
    failing_gliner, failing_glirel = BoomGliner(), BoomGlirel()
    with pytest.warns(UserWarning):
        entities, relations = extract_graph_hybrid(
            chunks=["Email: contact@example.com"],
            entity_schema=SCHEMA,
            relation_types=RELATION_TYPES,
            gliner_model=failing_gliner,
            glirel_model=failing_glirel,
            llm_chat_json=None,
        )
    # The regex layer should still have produced the e-mail.
    email_type_refs = [e.type_ref for e in entities if e.type_ref == "ioc_email"]
    assert email_type_refs
    assert relations == []