From b15332686a6bb4c76aacd6b41cbfe7798a52ed9e Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:33:46 +0200
Subject: [PATCH] test(datascience): corpus stub para gliner_load_model +
 extract_entities_gliner

11 tests sin necesidad de descargar el modelo (200 MB):
- StubModel duck-typed que valida el contrato de predict_entities
- Threshold y flat_ner se propagan al modelo
- Schema vacio lanza ValueError; schema sin labels validos warning + []
- Excepcion del modelo se captura
- Label desconocido se descarta
- gliner_load_model: ImportError simulado, cache hit, _resolve_device
  auto cae a cpu si torch no esta presente

Refs #0038

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../functions/datascience/tests/__init__.py   |   0
 .../tests/test_extract_entities_gliner.py     | 198 ++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 python/functions/datascience/tests/__init__.py
 create mode 100644 python/functions/datascience/tests/test_extract_entities_gliner.py

diff --git a/python/functions/datascience/tests/__init__.py b/python/functions/datascience/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/functions/datascience/tests/test_extract_entities_gliner.py b/python/functions/datascience/tests/test_extract_entities_gliner.py
new file mode 100644
index 00000000..aa5d4778
--- /dev/null
+++ b/python/functions/datascience/tests/test_extract_entities_gliner.py
@@ -0,0 +1,198 @@
+"""Tests para extract_entities_gliner y gliner_load_model.
+
+El modelo real (gliner) es opcional. Estos tests usan un stub duck-typed
+para validar el contrato sin descargar 200 MB. Tests que requieran el
+modelo real se marcan con `pytest.importorskip('gliner')`.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from dataclasses import dataclass
+
+import pytest
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+
+from python.functions.datascience.extract_entities_gliner import (
+    extract_entities_gliner,
+)
+from python.functions.datascience.gliner_load_model import (
+    _MODEL_CACHE,
+    _resolve_device,
+    gliner_load_model,
+)
+from python.types.datascience.entity_candidate import EntityCandidate
+
+
+SCHEMA_BASIC = [
+    {
+        "type_ref": "osint_person_go_cybersecurity",
+        "label": "Person",
+        "metadata_fields": ["full_name"],
+    },
+    {
+        "type_ref": "osint_organization_go_cybersecurity",
+        "label": "Organization",
+        "metadata_fields": ["name"],
+    },
+    {
+        "type_ref": "osint_location_go_cybersecurity",
+        "label": "Location",
+        "metadata_fields": ["name"],
+    },
+]
+
+
+@dataclass
+class StubModel:
+    """Modelo stub que devuelve una lista preconfigurada."""
+
+    response: list[dict]
+    raise_exc: Exception | None = None
+    last_kwargs: dict | None = None
+
+    def predict_entities(self, text, labels, threshold, flat_ner):
+        self.last_kwargs = {
+            "text": text,
+            "labels": list(labels),
+            "threshold": threshold,
+            "flat_ner": flat_ner,
+        }
+        if self.raise_exc is not None:
+            raise self.raise_exc
+        return self.response
+
+
+# ---------- extract_entities_gliner ----------
+
+
+def test_schema_basico_y_modelo_stub_retorna_entity_candidate():
+    """Schema basico y modelo stub retorna EntityCandidate con offsets."""
+    text = "Alice Johnson works at OpenAI in San Francisco."
+    model = StubModel(response=[
+        {"start": 0, "end": 13, "text": "Alice Johnson", "label": "Person", "score": 0.92},
+        {"start": 23, "end": 29, "text": "OpenAI", "label": "Organization", "score": 0.87},
+        {"start": 33, "end": 46, "text": "San Francisco", "label": "Location", "score": 0.81},
+    ])
+    out = extract_entities_gliner(text, SCHEMA_BASIC, model, threshold=0.5)
+    assert len(out) == 3
+    assert all(isinstance(e, EntityCandidate) for e in out)
+
+    person = next(e for e in out if e.name == "Alice Johnson")
+    assert person.type_ref == "osint_person_go_cybersecurity"
+    assert person.type_label == "Person"
+    assert person.attributes["start"] == 0
+    assert person.attributes["end"] == 13
+    assert pytest.approx(person.confidence, 0.001) == 0.92
+
+
+def test_threshold_filtra_spans_con_score_bajo():
+    """Threshold filtra spans con score bajo."""
+    # El stub no aplica threshold internamente — el modelo real si. Este
+    # test verifica que el threshold se PASA al modelo (kwargs).
+    model = StubModel(response=[
+        {"start": 0, "end": 5, "text": "Alice", "label": "Person", "score": 0.95},
+    ])
+    extract_entities_gliner("Alice", SCHEMA_BASIC, model, threshold=0.7, flat_ner=False)
+    assert model.last_kwargs["threshold"] == 0.7
+    assert model.last_kwargs["flat_ner"] is False
+
+
+def test_schema_vacio_lanza_value_error():
+    """Schema vacio lanza ValueError."""
+    model = StubModel(response=[])
+    with pytest.raises(ValueError):
+        extract_entities_gliner("text", [], model)
+
+
+def test_schema_sin_labels_validos_retorna_vacio():
+    """Schema sin label+type_ref validos retorna vacio con warning."""
+    bad_schema = [{"label": "", "type_ref": ""}, {"label": "X"}]
+    model = StubModel(response=[])
+    with pytest.warns(UserWarning):
+        out = extract_entities_gliner("text", bad_schema, model)
+    assert out == []
+
+
+def test_excepcion_del_modelo_se_captura():
+    """Excepcion del modelo se captura y retorna vacio."""
+    model = StubModel(response=[], raise_exc=RuntimeError("model exploded"))
+    with pytest.warns(UserWarning):
+        out = extract_entities_gliner("text", SCHEMA_BASIC, model)
+    assert out == []
+
+
+def test_label_desconocido_se_descarta():
+    """Label desconocido se descarta."""
+    model = StubModel(response=[
+        {"start": 0, "end": 5, "text": "Alice", "label": "Person", "score": 0.9},
+        {"start": 6, "end": 10, "text": "blob", "label": "UnknownLabel", "score": 0.9},
+    ])
+    out = extract_entities_gliner("Alice blob", SCHEMA_BASIC, model)
+    names = [e.name for e in out]
+    assert "Alice" in names
+    assert "blob" not in names
+
+
+def test_flat_ner_se_propaga_al_modelo():
+    """flat_ner se propaga al modelo."""
+    model = StubModel(response=[])
+    extract_entities_gliner("text", SCHEMA_BASIC, model, flat_ner=True)
+    assert model.last_kwargs["flat_ner"] is True
+    extract_entities_gliner("text", SCHEMA_BASIC, model, flat_ner=False)
+    assert model.last_kwargs["flat_ner"] is False
+
+
+# ---------- gliner_load_model ----------
+
+
+def test_import_error_si_gliner_no_esta_instalado(monkeypatch):
+    """ImportError si gliner no esta instalado."""
+    _MODEL_CACHE.clear()
+
+    real_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "gliner" or name.startswith("gliner."):
+            raise ImportError("gliner not installed (simulated)")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+
+    with pytest.raises(ImportError, match="gliner no esta instalado"):
+        gliner_load_model(model_name="dummy/model", device="cpu")
+
+
+def test_cache_devuelve_la_misma_instancia(monkeypatch):
+    """Cache devuelve la misma instancia con los mismos parametros."""
+    _MODEL_CACHE.clear()
+    sentinel = object()
+    _MODEL_CACHE[("dummy/model", "cpu")] = sentinel
+
+    out = gliner_load_model(model_name="dummy/model", device="cpu")
+    assert out is sentinel
+
+    # Limpiar al terminar para no contaminar otros tests.
+    _MODEL_CACHE.clear()
+
+
+def test_resolve_device_explicito_se_respeta():
+    """device explicito se respeta tal cual."""
+    assert _resolve_device("cpu") == "cpu"
+    assert _resolve_device("cuda") == "cuda"
+    assert _resolve_device("cuda:0") == "cuda:0"
+
+
+def test_resolve_device_auto_cae_a_cpu_sin_torch(monkeypatch):
+    """device='auto' resuelve a cpu o cuda segun torch.cuda.is_available."""
+    real_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "torch":
+            raise ImportError("torch missing")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+    assert _resolve_device("auto") == "cpu"