fn_registry/python/functions/datascience/estimate_image_depth.py

"""
Estimación de profundidad monocular a partir de una sola imagen con Depth-Anything-V2.

Función del registry (grupo de capacidad `img-to-3d`, dominio `datascience`). Promovida desde
la app `img_to_3d_webapp` para que cualquier artefacto pueda estimar un mapa de profundidad sin
reimplementar la carga del modelo HuggingFace ni la normalización del resultado.

Impura: descarga/carga pesos de un modelo de transformers, usa GPU si está disponible y mantiene
una caché de pipelines a nivel de proceso para no recargar en cada llamada.
"""

from __future__ import annotations

import numpy as np
from PIL import Image

# Instantiating a transformers pipeline is expensive (it loads model weights), so built
# pipelines are kept in this module-level dict, keyed by model and device, so that a
# long-running service does not reload them on every request.
# This is mutable PROCESS-wide state: it is documented as an impurity (see the "Gotchas"
# section of the accompanying .md). Callers can bypass it per call with use_cache=False.
_PIPE_CACHE: dict = {}


def _resolve_device(device: str) -> int:
    """Translate a device spec into the integer index used by ``transformers.pipeline``.

    Args:
        device: One of ``"cpu"``, ``"auto"``, a bare index such as ``"0"``, or a
            CUDA string such as ``"cuda"`` or ``"cuda:1"``.

    Returns:
        ``-1`` for CPU, otherwise the GPU ordinal (``0`` = first GPU).
        ``"auto"`` yields ``0`` when CUDA is available and ``-1`` otherwise.
        ``"cuda:N"`` yields ``N``; a bare ``"cuda"`` (or a ``cuda`` string whose
        ordinal cannot be parsed) yields ``0``. Any other unrecognised string
        falls back to ``-1`` (CPU).
    """
    if device == "cpu":
        return -1
    if device == "auto":
        # torch is only needed to probe for CUDA, so import it lazily here.
        import torch

        return 0 if torch.cuda.is_available() else -1

    # Bare numeric index, e.g. "0" or "1".
    try:
        return int(device)
    except ValueError:
        pass

    if not device.startswith("cuda"):
        return -1

    # Honour the ordinal in "cuda:N" instead of collapsing every CUDA string to GPU 0.
    _, _, ordinal = device.partition(":")
    try:
        index = int(ordinal)
    except ValueError:
        # "cuda" with no (or an unparsable) ordinal means the first GPU.
        return 0
    return max(index, 0)


def _build_pipe(model_name: str, device: str):
    """Create a new ``depth-estimation`` pipeline for ``model_name`` on ``device``.

    The device spec is converted to a pipeline device index by ``_resolve_device``.
    Nothing is cached here; every call constructs a fresh pipeline.
    """
    # Imported lazily so that importing this module does not pull in transformers.
    from transformers import pipeline as hf_pipeline

    device_index = _resolve_device(device)
    return hf_pipeline("depth-estimation", model=model_name, device=device_index)


def _get_pipe(model_name: str, device: str, use_cache: bool):
    """Return a depth-estimation pipeline, reusing a cached one when allowed.

    Args:
        model_name: HuggingFace model id passed through to ``_build_pipe``.
        device: Device spec (``"auto"``, ``"cpu"``, ``"0"``, ``"cuda:0"``, ...).
        use_cache: When False, always build a new pipeline and leave the
            process-wide cache untouched. When True, look the pipeline up in
            ``_PIPE_CACHE`` and build and store it only on a miss.

    Returns:
        The pipeline object produced by ``_build_pipe``.
    """
    if not use_cache:
        return _build_pipe(model_name, device)

    # Key on the RESOLVED device index rather than the raw string, so that aliases
    # of the same device ("auto", "0", "cuda", "cuda:0") share a single pipeline
    # instead of each loading its own copy of the model weights.
    key = (model_name, _resolve_device(device))
    pipe = _PIPE_CACHE.get(key)
    if pipe is None:
        pipe = _build_pipe(model_name, device)
        _PIPE_CACHE[key] = pipe
    return pipe


def estimate_image_depth(
    image_path: str,
    model_name: str = "depth-anything/Depth-Anything-V2-Small-hf",
    device: str = "auto",
    use_cache: bool = True,
) -> dict:
    """Estimate a monocular depth map from a single image.

    Args:
        image_path: Path to the input image (any format PIL can open).
        model_name: HuggingFace model id of a depth-estimation model.
        device: ``"auto"`` (GPU if available), ``"cpu"``, or an explicit
            index / CUDA string (``"0"``, ``"cuda:0"``).
        use_cache: If True (default), reuse the process-wide cached pipeline;
            if False, build a new one and leave the cache untouched.

    Returns:
        A dict; this function does not raise for ordinary failures.

        On success: ``{"status": "ok", "depth": <HxW float32 ndarray scaled to
        [0, 1]>, "image": <the input as an RGB PIL.Image>, "height": H,
        "width": W, "model": model_name, "device": device}``. Per the original
        author's note, larger depth values mean closer to the camera.

        On failure: ``{"status": "error", "error": <message>}`` (bad path,
        model unavailable, inference failure, ...).
    """
    try:
        # Close the source file deterministically; convert() returns an
        # independent in-memory image, so it stays valid after the file closes.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        pipe = _get_pipe(model_name, device, use_cache)
        result = pipe(image)
        depth = np.asarray(result["depth"], dtype=np.float32)

        # Min-max normalise to [0, 1]. A constant map (peak == 0) is left as all
        # zeros to avoid dividing by zero.
        normalized = depth - depth.min()
        peak = normalized.max()
        if peak > 0:
            normalized = normalized / peak

        height, width = normalized.shape
        return {
            "status": "ok",
            "depth": normalized,
            "image": image,
            "height": int(height),
            "width": int(width),
            "model": model_name,
            "device": device,
        }
    except Exception as e:  # noqa: BLE001
        # Deliberate catch-all: the contract is to report failures as a dict.
        # Some exceptions have an empty str(); fall back to repr() so the
        # message is never blank.
        return {"status": "error", "error": str(e) or repr(e)}


if __name__ == "__main__":
    # Demo runner for `fn run estimate_image_depth_py_datascience <image_path> [model] [device]`.
    # Prints a JSON-serializable summary only: the ndarray and the PIL.Image in the
    # result cannot be serialized, so they are reduced to a few statistics.
    import json
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print(json.dumps({"status": "error", "error": "uso: <image_path> [model_name] [device]"}))
        sys.exit(1)

    # Positional arguments: image path (required), then optional model id and device.
    path = cli_args[0]
    model = cli_args[1] if len(cli_args) > 1 else "depth-anything/Depth-Anything-V2-Small-hf"
    dev = cli_args[2] if len(cli_args) > 2 else "auto"

    res = estimate_image_depth(path, model_name=model, device=dev)

    # Failure: echo the error dict as-is and exit with a non-zero status.
    if res["status"] != "ok":
        print(json.dumps(res))
        sys.exit(1)

    depth = res["depth"]
    print(
        json.dumps(
            {
                "status": "ok",
                "height": res["height"],
                "width": res["width"],
                "depth_min": float(depth.min()),
                "depth_max": float(depth.max()),
                "depth_mean": round(float(depth.mean()), 4),
                "model": res["model"],
                "device": res["device"],
            }
        )
    )