"""Carga (y cachea) un modelo GLiNER2 (NER+RE joint en una sola pasada).

LICENSE: Apache 2.0 — uso comercial permitido.
Modelo por defecto: fastino/gliner2-large-v1
"""

from __future__ import annotations

from typing import Any

# Module-level cache shared by every caller in this process.
# Key: (model_name, resolved device string); value: the loaded GLiNER2 instance.
_MODEL_CACHE: dict[tuple[str, str], Any] = {}


def _resolve_device(device: str) -> str:
    """Resuelve 'auto' a 'cuda' o 'cpu' segun disponibilidad de torch."""
    if device != "auto":
        return device
    try:
        import torch
    except ImportError:
        return "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


def gliner2_load_model(
    model_name: str = "fastino/gliner2-large-v1",
    device: str = "auto",
) -> Any:
    """Load (and cache) a GLiNER2 model.

    GLiNER2 extracts entities AND relations in a single forward pass using
    a joint schema (entities + relation_labels), which avoids running
    separate NER and relation-extraction models over the same text.

    The instance is created with ``GLiNER2.from_pretrained(model_name)`` and,
    when the resolved device is not ``'cpu'`` and the instance has a ``to``
    method, moved to that device. The move is best-effort: if it raises, a
    warning with the traceback is logged and the model is returned (and
    cached) as ``from_pretrained`` left it.

    LICENSE: Apache 2.0 — commercial use OK.

    Args:
        model_name: HuggingFace Hub model ID. Default: fastino/gliner2-large-v1.
        device: 'auto' uses CUDA if available, else CPU. Explicit values such
            as 'cpu', 'cuda' or 'cuda:N' are used as given.

    Returns:
        GLiNER2 instance cached by (model_name, resolved device). Repeated
        calls with the same key return the same object. The instance is
        expected to expose ``.extract()`` and ``.create_schema()``.

    Raises:
        ImportError: if the ``gliner2`` package is not installed.
    """
    resolved = _resolve_device(device)
    key = (model_name, resolved)
    cached = _MODEL_CACHE.get(key)
    if cached is not None:
        return cached

    # Imported lazily so that importing this module does not require gliner2.
    from gliner2 import GLiNER2  # type: ignore[import]

    model = GLiNER2.from_pretrained(model_name)
    if resolved != "cpu" and hasattr(model, "to"):
        try:
            model.to(resolved)
        except Exception:
            # Best-effort by design: keep going with the model wherever
            # from_pretrained() placed it (presumably CPU), but leave a
            # trace so a silently slow "cuda" model can be diagnosed.
            import logging

            logging.getLogger(__name__).warning(
                "Could not move GLiNER2 model %r to device %r; "
                "continuing without moving it.",
                model_name,
                resolved,
                exc_info=True,
            )

    # NOTE: the entry is stored under the requested device key even when the
    # move above failed, so later calls do not retry the transfer.
    _MODEL_CACHE[key] = model
    return model