# fn_registry/python/functions/pipelines/extraction_pipeline_test.py

"""Tests para extraction_pipeline."""

from __future__ import annotations

import os
import sys
import tempfile

# Put the directory three levels above this file's directory on sys.path
# (once), presumably so the absolute `python.functions...` import below
# resolves regardless of the working directory the tests are started from.
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)

from python.functions.pipelines.extraction_pipeline import extraction_pipeline


# ── LLM stubs ─────────────────────────────────────────────────────────────────

def _llm_with_entities(messages: list[dict]) -> dict:
    """LLM stub que retorna entidades fijas para el primer mensaje de extraccion."""
    system_content = messages[0]["content"] if messages else ""
    if "entity" in system_content.lower() or "entities" in system_content.lower():
        return {
            "entities": [
                {
                    "name": "John Smith",
                    "type_ref": "osint_person_go_cybersecurity",
                    "attributes": {"full_name": "John Smith", "nationality": "US"},
                    "confidence": 0.95,
                },
                {
                    "name": "evil-corp.com",
                    "type_ref": "osint_domain_go_cybersecurity",
                    "attributes": {"fqdn": "evil-corp.com"},
                    "confidence": 0.88,
                },
            ]
        }
    # Llamada de relaciones
    return {
        "relations": [
            {
                "from_name": "John Smith",
                "to_name": "evil-corp.com",
                "relation_type": "operates",
                "description": "John Smith operates evil-corp.com",
                "confidence": 0.8,
            }
        ]
    }


def _llm_empty(messages: list[dict]) -> dict:
    """LLM stub que retorna siempre resultado vacio."""
    system_content = messages[0]["content"] if messages else ""
    if "entit" in system_content.lower():
        return {"entities": []}
    return {"relations": []}


# Entity presets handed to extraction_pipeline by the tests in this module.
# The two type_ref values are the same ones the _llm_with_entities stub
# returns for its canned entities.
ENTITY_PRESETS = [
    {
        "type_ref": "osint_person_go_cybersecurity",
        "label": "Person",
        "metadata_fields": ["full_name", "alias", "nationality"],
    },
    {
        "type_ref": "osint_domain_go_cybersecurity",
        "label": "Domain",
        "metadata_fields": ["fqdn", "registrar"],
    },
]

# Relation types handed to extraction_pipeline by the tests in this module;
# "operates" is the relation_type the _llm_with_entities stub returns.
RELATION_TYPES = ["operates", "owns", "funds", "communicates_with", "related_to"]


# ── Tests ──────────────────────────────────────────────────────────────────────

def test_documento_con_entidades_y_relaciones():
    """A document with entities and relations yields a populated result."""
    document = (
        "John Smith, a US national, operates the domain evil-corp.com. "
        "He was identified as the main administrator of the infrastructure."
    )
    # delete=False keeps the file on disk after the handle is closed so the
    # pipeline can open it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(document)
        doc_path = handle.name

    pipeline_kwargs = dict(
        entity_presets=ENTITY_PRESETS,
        relation_types=RELATION_TYPES,
        llm_chat_json=_llm_with_entities,
        chunk_size=500,
        chunk_overlap=50,
        confidence_threshold=0.5,
        dedup_threshold=0.85,
    )
    try:
        outcome = extraction_pipeline(file_path=doc_path, **pipeline_kwargs)
        assert outcome is not None
        assert len(outcome.entities) >= 1
        assert outcome.stats.total_chunks >= 1
        assert outcome.stats.total_chars > 0
    finally:
        os.unlink(doc_path)


def test_documento_vacio():
    """An empty document yields empty entity/relation lists and zero chunks."""
    # delete=False keeps the (empty) file on disk for the pipeline to open;
    # it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as empty_doc:
        empty_doc.write("")
        empty_path = empty_doc.name

    try:
        outcome = extraction_pipeline(
            file_path=empty_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_empty,
        )
        assert outcome is not None
        assert outcome.entities == []
        assert outcome.relations == []
        assert outcome.stats.total_chunks == 0
    finally:
        os.unlink(empty_path)


def test_documento_sin_entidades_detectables():
    """A document for which the LLM stub finds nothing yields empty lists."""
    filler = "The weather is nice today. The sun shines brightly over the mountains."
    # delete=False keeps the file on disk for the pipeline to open; it is
    # removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(filler)
        doc_path = handle.name

    pipeline_kwargs = dict(
        entity_presets=ENTITY_PRESETS,
        relation_types=RELATION_TYPES,
        llm_chat_json=_llm_empty,
        confidence_threshold=0.5,
    )
    try:
        outcome = extraction_pipeline(file_path=doc_path, **pipeline_kwargs)
        assert outcome is not None
        assert outcome.entities == []
        assert outcome.relations == []
        assert outcome.stats.raw_entities_count == 0
    finally:
        os.unlink(doc_path)


def test_archivo_no_encontrado_lanza_filenotfounderror():
    """A missing input file makes extraction_pipeline raise FileNotFoundError.

    The path is built inside a freshly created temporary directory, so it is
    guaranteed not to exist on any platform. This replaces a hard-coded
    ``/tmp`` location, which is POSIX-specific and could in principle be
    present on the machine running the tests.
    """
    import pytest

    with tempfile.TemporaryDirectory() as tmp_dir:
        # A new, empty directory cannot contain this file.
        missing_path = os.path.join(
            tmp_dir, "no_existe_para_test_extraccion_pipeline.txt"
        )
        with pytest.raises(FileNotFoundError):
            extraction_pipeline(
                file_path=missing_path,
                entity_presets=ENTITY_PRESETS,
                relation_types=RELATION_TYPES,
                llm_chat_json=_llm_empty,
            )


def test_entity_presets_vacio_lanza_valueerror():
    """An empty entity_presets list makes extraction_pipeline raise ValueError."""
    import pytest

    # delete=False keeps the file on disk for the pipeline to open; it is
    # removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write("some text")
        doc_path = handle.name

    pipeline_kwargs = dict(
        entity_presets=[],
        relation_types=RELATION_TYPES,
        llm_chat_json=_llm_empty,
    )
    try:
        with pytest.raises(ValueError):
            extraction_pipeline(file_path=doc_path, **pipeline_kwargs)
    finally:
        os.unlink(doc_path)


def test_progress_callback_se_invoca():
    """The on_progress callback is invoked while the pipeline runs."""
    recorded: list[tuple[str, float]] = []

    def _on_progress(msg: str, pct: float) -> None:
        # Record every progress notification for the assertions below.
        recorded.append((msg, pct))

    # delete=False keeps the file on disk for the pipeline to open; it is
    # removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write("John Smith operates evil-corp.com.")
        doc_path = handle.name

    try:
        extraction_pipeline(
            file_path=doc_path,
            entity_presets=ENTITY_PRESETS,
            relation_types=RELATION_TYPES,
            llm_chat_json=_llm_with_entities,
            on_progress=_on_progress,
        )
        assert len(recorded) > 0
        # At least one message must mention one of the expected stage markers.
        stage_markers = ("Extracting", "Done", "Dedup")
        assert any(
            marker in message
            for message, _ in recorded
            for marker in stage_markers
        )
    finally:
        os.unlink(doc_path)


def test_stats_se_rellenan_correctamente():
    """Stats carry character/chunk counts and a non-negative processing time."""
    sentence = "John Smith, a US national, operates the domain evil-corp.com."
    # delete=False keeps the file on disk for the pipeline to open; it is
    # removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(sentence)
        doc_path = handle.name

    pipeline_kwargs = dict(
        entity_presets=ENTITY_PRESETS,
        relation_types=RELATION_TYPES,
        llm_chat_json=_llm_with_entities,
    )
    try:
        outcome = extraction_pipeline(file_path=doc_path, **pipeline_kwargs)
        assert outcome.stats.total_chars > 0
        assert outcome.stats.total_chunks >= 1
        assert outcome.stats.processing_time_seconds >= 0.0
    finally:
        os.unlink(doc_path)