#!/usr/bin/env bash
#
# Launches the API (main.py) with CUDA-oriented settings, using the GGUF model
# given by MODEL_PATH or, failing that, one found in model_choice/.
#
# Configuration is read from api_cuda.conf next to this script (if present)
# and from the environment; anything left unset falls back to the defaults
# assigned below.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONF_FILE="${SCRIPT_DIR}/api_cuda.conf"

#######################################
# Prints the first regular file (byte-wise sorted, so the pick is stable
# between runs) located directly inside a directory and matching a find(1)
# expression. Prints nothing if the directory is missing or nothing matches.
# Arguments: $1 - directory to search
#            $2.. - find(1) expression; callers must parenthesise any -o
#                   alternatives so that -type f applies to all of them
# Outputs:   the selected path on stdout (may be empty)
# Returns:   always 0
#######################################
first_match() {
  local dir=$1
  shift
  [[ -d "${dir}" ]] || return 0
  # '|| true': with pipefail a SIGPIPE'd producer (head exits early) or a
  # partially unreadable directory must not abort the script; an empty result
  # is handled by the callers.
  find "${dir}" -maxdepth 1 -type f "$@" | LC_ALL=C sort | head -n 1 || true
}

if [[ -f "${CONF_FILE}" ]]; then
  # shellcheck source=/dev/null
  source "${CONF_FILE}"
else
  echo "Archivo de configuracion no encontrado en ${CONF_FILE}. Usando valores por defecto." >&2
fi

# Defaults used when a value comes neither from the .conf nor the environment.
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"
N_CTX="${N_CTX:-4096}"
N_BATCH="${N_BATCH:-512}"
N_THREADS="${N_THREADS:-}"           # empty: --n-threads is not passed at all
N_GPU_LAYERS="${N_GPU_LAYERS:--1}"   # default value is the literal -1
MAIN_GPU="${MAIN_GPU:-0}"
SPLIT_MODE="${SPLIT_MODE:-1}"
ROPE_FREQ_BASE="${ROPE_FREQ_BASE:-10000}"
ROPE_FREQ_SCALE="${ROPE_FREQ_SCALE:-1.0}"
OFFLOAD_KV_CACHE="${OFFLOAD_KV_CACHE:-true}"
KEEP_MODEL_IN_MEMORY="${KEEP_MODEL_IN_MEMORY:-false}"
TRY_MMAP="${TRY_MMAP:-true}"
SEED="${SEED:-0}"
FLASH_ATTN="${FLASH_ATTN:-false}"
MM_PROJ_PATH="${MM_PROJ_PATH:-}"     # empty: autodetected further down

# Sampling defaults (can be overridden per request).
DEFAULT_MAX_TOKENS="${DEFAULT_MAX_TOKENS:-2048}"
DEFAULT_TEMPERATURE="${DEFAULT_TEMPERATURE:-0.7}"
DEFAULT_TOP_K="${DEFAULT_TOP_K:-40}"
DEFAULT_REPEAT_PENALTY="${DEFAULT_REPEAT_PENALTY:-1.1}"
DEFAULT_MIN_P="${DEFAULT_MIN_P:-0.05}"
DEFAULT_TOP_P="${DEFAULT_TOP_P:-0.9}"

# Pick the model: MODEL_PATH wins; otherwise take a .gguf from model_choice/.
# Files with 'mmproj' in their name are skipped here because they are treated
# as multimodal projectors below and must not be loaded as the main model.
MODEL_PATH_VALUE="${MODEL_PATH:-}"
if [[ -z "${MODEL_PATH_VALUE}" ]]; then
  MODEL_PATH_VALUE="$(first_match "${SCRIPT_DIR}/model_choice" \
    -name '*.gguf' ! -name '*mmproj*')"
fi

if [[ -z "${MODEL_PATH_VALUE}" || ! -f "${MODEL_PATH_VALUE}" ]]; then
  echo "No se encontro un modelo .gguf. Define MODEL_PATH en ${CONF_FILE} o agrega un archivo a model_choice/." >&2
  exit 1
fi

# Autodetect the mmproj file when none was configured (accepts *.mmproj or
# .gguf files whose name contains 'mmproj'). Look next to the model first,
# then in model_choice/. The parentheses keep -type f applied to both name
# patterns; without them -o would let the second pattern match non-files.
MMPROJ_EXPR=('(' -name '*.mmproj' -o -name '*mmproj*.gguf' ')')
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MODEL_DIR="$(dirname "${MODEL_PATH_VALUE}")"
  MM_PROJ_PATH="$(first_match "${MODEL_DIR}" "${MMPROJ_EXPR[@]}")"
fi
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MM_PROJ_PATH="$(first_match "${SCRIPT_DIR}/model_choice" "${MMPROJ_EXPR[@]}")"
fi

# Choose the runner: prefer `uv run` so uv.lock is honoured; otherwise fall
# back to plain python.
if command -v uv >/dev/null 2>&1; then
  export UV_CACHE_DIR="${SCRIPT_DIR}/.uv_cache"
  PY_RUNNER=(uv run python)
else
  PY_RUNNER=(python)
fi

# The command is built as an array so values containing spaces stay intact.
CMD=(
  "${PY_RUNNER[@]}" "${SCRIPT_DIR}/main.py"
  --model-path "${MODEL_PATH_VALUE}"
  --host "${HOST}"
  --port "${PORT}"
  --n-ctx "${N_CTX}"
  --n-batch "${N_BATCH}"
  --n-gpu-layers "${N_GPU_LAYERS}"
  --main-gpu "${MAIN_GPU}"
  --split-mode "${SPLIT_MODE}"
  --rope-freq-base "${ROPE_FREQ_BASE}"
  --rope-freq-scale "${ROPE_FREQ_SCALE}"
  --offload-kv-cache "${OFFLOAD_KV_CACHE}"
  --keep-model-in-memory "${KEEP_MODEL_IN_MEMORY}"
  --try-mmap "${TRY_MMAP}"
  --seed "${SEED}"
  --flash-attn "${FLASH_ATTN}"
  --default-max-tokens "${DEFAULT_MAX_TOKENS}"
  --default-temperature "${DEFAULT_TEMPERATURE}"
  --default-top-k "${DEFAULT_TOP_K}"
  --default-repeat-penalty "${DEFAULT_REPEAT_PENALTY}"
  --default-min-p "${DEFAULT_MIN_P}"
  --default-top-p "${DEFAULT_TOP_P}"
)

# Optional flags are only appended when they have a value.
if [[ -n "${N_THREADS}" ]]; then
  CMD+=(--n-threads "${N_THREADS}")
fi
if [[ -n "${MM_PROJ_PATH}" ]]; then
  CMD+=(--mmproj-path "${MM_PROJ_PATH}")
fi

# Startup summary of the effective configuration.
echo "Iniciando API con CUDA"
echo "Modelo: ${MODEL_PATH_VALUE}"
echo "Host/Port: ${HOST}:${PORT}"
echo "n_ctx=${N_CTX}, n_batch=${N_BATCH}, n_gpu_layers=${N_GPU_LAYERS}, main_gpu=${MAIN_GPU}, split_mode=${SPLIT_MODE}, n_threads=${N_THREADS:-auto}"
echo "rope_base=${ROPE_FREQ_BASE}, rope_scale=${ROPE_FREQ_SCALE}, offload_kv=${OFFLOAD_KV_CACHE}, mmap=${TRY_MMAP}, mlock=${KEEP_MODEL_IN_MEMORY}, seed=${SEED}, flash_attn=${FLASH_ATTN}"
echo "mmproj_path=${MM_PROJ_PATH:-}"
echo "defaults -> max_tokens=${DEFAULT_MAX_TOKENS}, temp=${DEFAULT_TEMPERATURE}, top_k=${DEFAULT_TOP_K}, repeat_penalty=${DEFAULT_REPEAT_PENALTY}, min_p=${DEFAULT_MIN_P}, top_p=${DEFAULT_TOP_P}"

# Export the build flag in case llama-cpp gets re-installed with cuBLAS; it
# has no effect if the package is already installed. A CMAKE_ARGS value coming
# from the environment or the .conf is respected instead of being overwritten.
# NOTE(review): newer llama.cpp releases appear to have renamed this option
# (GGML_CUDA) — confirm against the llama-cpp-python version in use.
export CMAKE_ARGS="${CMAKE_ARGS:--DLLAMA_CUBLAS=on}"

# Replace this shell with the server process so signals reach it directly.
exec "${CMD[@]}"