# fn_registry/python/functions/datascience/extract_entities_llm_test.py

"""Tests para extract_entities_llm."""

import warnings
import sys
import os
import pytest

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from python.functions.datascience.extract_entities_llm import extract_entities_llm
from python.types.datascience.entity_candidate import EntityCandidate

# Entity schema fixture passed as the second argument to extract_entities_llm
# in the tests below. It declares two entity types (Person and Domain); each
# entry carries a "type_ref" identifier, a human-readable "label", and the list
# of "metadata_fields" associated with that type.
SCHEMA = [
    {
        "type_ref": "osint_person_go_cybersecurity",
        "label": "Person",
        "metadata_fields": ["full_name", "alias", "nationality", "dob", "risk_score"],
    },
    {
        "type_ref": "osint_domain_go_cybersecurity",
        "label": "Domain",
        "metadata_fields": ["fqdn", "registrar", "created_date"],
    },
]


def make_llm(response: dict):
    """Build an LLM stub that always answers with ``response``.

    The returned callable accepts a single ``messages`` argument, ignores it,
    and hands back the very same ``response`` object on every call.
    """
    # The parameter keeps the name ``messages`` so keyword calls still work.
    return lambda messages: response


def test_texto_con_entidades_claras_retorna_entity_candidate():
    """Text with clear entities yields one fully populated candidate each.

    The stubbed LLM reports a person and a domain; the test checks that both
    come back with the reported name, type_ref, attributes and confidence,
    and with the type_label taken from the matching SCHEMA entry.
    """
    person_payload = {
        "name": "John Smith",
        "type_ref": "osint_person_go_cybersecurity",
        "attributes": {"full_name": "John Smith", "nationality": "US"},
        "confidence": 0.95,
    }
    domain_payload = {
        "name": "evil-corp.com",
        "type_ref": "osint_domain_go_cybersecurity",
        "attributes": {"fqdn": "evil-corp.com"},
        "confidence": 0.88,
    }
    stub = make_llm({"entities": [person_payload, domain_payload]})

    candidates = extract_entities_llm(
        "John Smith, US citizen, linked to evil-corp.com.", SCHEMA, stub
    )

    def pick(name):
        # First candidate carrying the given name.
        return next(c for c in candidates if c.name == name)

    assert len(candidates) == 2

    found_person = pick("John Smith")
    assert found_person.type_ref == "osint_person_go_cybersecurity"
    assert found_person.type_label == "Person"
    assert found_person.attributes["full_name"] == "John Smith"
    assert found_person.confidence == 0.95

    found_domain = pick("evil-corp.com")
    assert found_domain.type_ref == "osint_domain_go_cybersecurity"
    assert found_domain.type_label == "Domain"
    assert found_domain.attributes["fqdn"] == "evil-corp.com"
    assert found_domain.confidence == 0.88


def test_texto_sin_entidades_retorna_lista_vacia():
    """An LLM reply with an empty entity list yields an empty result list."""
    stub = make_llm({"entities": []})
    text = "The sky is blue and the grass is green."

    extracted = extract_entities_llm(text, SCHEMA, stub)

    assert extracted == []


def test_llm_retorna_json_mal_formado_retorna_lista_vacia_con_warning():
    """A failing LLM call yields an empty list and exactly one warning.

    The stub simulates a malformed-JSON reply by raising ValueError; the
    single recorded warning must mention the LLM call error.
    """
    def raising_llm(messages: list[dict]) -> dict:
        raise ValueError("JSON decode error")

    with warnings.catch_warnings(record=True) as recorded:
        # Record every warning, regardless of previously installed filters.
        warnings.simplefilter("always")
        extracted = extract_entities_llm(
            "Some text with entities.", SCHEMA, raising_llm
        )

    assert extracted == []
    assert len(recorded) == 1
    first_message = str(recorded[0].message)
    assert "error llamando al LLM" in first_message


def test_type_ref_invalido_en_respuesta_se_descarta_con_warning():
    """An entity whose type_ref is not in the schema is dropped with a warning.

    Only the entity with a known type_ref survives, and at least one recorded
    warning mentions the unknown type_ref.
    """
    known = {
        "name": "Valid Person",
        "type_ref": "osint_person_go_cybersecurity",
        "attributes": {},
        "confidence": 0.9,
    }
    unknown = {
        "name": "Unknown Thing",
        "type_ref": "nonexistent_type_ref",
        "attributes": {},
        "confidence": 0.8,
    }
    stub = make_llm({"entities": [known, unknown]})

    with warnings.catch_warnings(record=True) as recorded:
        # Record every warning, regardless of previously installed filters.
        warnings.simplefilter("always")
        extracted = extract_entities_llm("Text with entities.", SCHEMA, stub)

    assert len(extracted) == 1
    survivor = extracted[0]
    assert survivor.name == "Valid Person"
    warning_texts = [str(w.message) for w in recorded]
    assert any("nonexistent_type_ref" in text for text in warning_texts)


def test_confidence_se_propaga_correctamente():
    """Each candidate keeps the confidence value reported by the LLM."""
    # (name, type_ref, confidence) triples, in the order the stub reports them.
    cases = [
        ("Implied Person", "osint_person_go_cybersecurity", 0.7),
        ("Weakly Implied Domain", "osint_domain_go_cybersecurity", 0.5),
        ("Explicit Entity", "osint_person_go_cybersecurity", 1.0),
    ]
    stub = make_llm({
        "entities": [
            {
                "name": name,
                "type_ref": type_ref,
                "attributes": {},
                "confidence": confidence,
            }
            for name, type_ref, confidence in cases
        ]
    })

    extracted = extract_entities_llm("Some text.", SCHEMA, stub)

    assert len(extracted) == 3
    observed = {candidate.name: candidate.confidence for candidate in extracted}
    for name, _type_ref, confidence in cases:
        assert observed[name] == confidence


def test_schema_vacio_lanza_value_error():
    """An empty entity schema is rejected with ValueError."""
    stub = make_llm({"entities": []})
    empty_schema = []
    expected_message = "entity_schema no puede estar vacio"

    with pytest.raises(ValueError, match=expected_message):
        extract_entities_llm("Some text.", empty_schema, stub)