Files

114 lines
4.2 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# Runs the API with CUDA, taking the GGUF model from model_choice.

# Absolute directory that contains this script; every other path is derived from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONF_FILE="${SCRIPT_DIR}/api_cuda.conf"

# The configuration file is optional: when it is absent a notice is printed and
# the defaults assigned further down are used instead.
if [[ ! -f "${CONF_FILE}" ]]; then
  echo "Archivo de configuracion no encontrado en ${CONF_FILE}. Usando valores por defecto."
else
  # shellcheck source=/dev/null
  . "${CONF_FILE}"
fi
# Fallback values, applied only to settings that the .conf file or the
# environment left unset or empty. `: "${VAR:=value}"` assigns in place.

# Network binding.
: "${HOST:=0.0.0.0}"
: "${PORT:=8000}"

# Model loading / runtime settings forwarded to main.py.
: "${N_CTX:=4096}"
: "${N_BATCH:=512}"
# Empty means the --n-threads flag is not passed at all.
: "${N_THREADS:=}"
: "${N_GPU_LAYERS:=-1}"
: "${MAIN_GPU:=0}"
: "${SPLIT_MODE:=1}"
: "${ROPE_FREQ_BASE:=10000}"
: "${ROPE_FREQ_SCALE:=1.0}"
: "${OFFLOAD_KV_CACHE:=true}"
: "${KEEP_MODEL_IN_MEMORY:=false}"
: "${TRY_MMAP:=true}"
: "${SEED:=0}"
: "${FLASH_ATTN:=false}"
# Empty means the projector path is auto-detected later.
: "${MM_PROJ_PATH:=}"

# Sampling defaults (each request may override them).
: "${DEFAULT_MAX_TOKENS:=2048}"
: "${DEFAULT_TEMPERATURE:=0.7}"
: "${DEFAULT_TOP_K:=40}"
: "${DEFAULT_REPEAT_PENALTY:=1.1}"
: "${DEFAULT_MIN_P:=0.05}"
: "${DEFAULT_TOP_P:=0.9}"
# Resolve the model file: MODEL_PATH wins when it is set; otherwise take the
# first .gguf found directly inside model_choice/.
MODEL_PATH_VALUE="${MODEL_PATH:-}"
if [[ -z "${MODEL_PATH_VALUE}" ]]; then
  # Files whose name contains 'mmproj' are projector files (they are what the
  # mmproj auto-detection looks for), so they must never be picked as the main
  # model. Sorting byte-wise makes the choice deterministic instead of
  # depending on directory order. `|| true` keeps `set -e`/`pipefail` from
  # aborting when nothing matches or `head` closes the pipe early.
  MODEL_PATH_VALUE="$(
    find "${SCRIPT_DIR}/model_choice" -maxdepth 1 -type f \
      -name '*.gguf' ! -name '*mmproj*' \
      | LC_ALL=C sort \
      | head -n 1 || true
  )"
fi

# Abort when no usable model file exists; the diagnostic goes to stderr so it
# is not mixed with regular output.
if [[ -z "${MODEL_PATH_VALUE}" || ! -f "${MODEL_PATH_VALUE}" ]]; then
  echo "No se encontro un modelo .gguf. Define MODEL_PATH en ${CONF_FILE} o agrega un archivo a model_choice/." >&2
  exit 1
fi
# Auto-detect the mmproj file when MM_PROJ_PATH was not configured. Accepted
# names are '*.mmproj' and .gguf files whose name contains 'mmproj'.

#######################################
# Print the first projector file located directly inside a directory.
# Arguments: $1 - directory to search (not recursive)
# Outputs:   path of the first match in byte-wise sorted order, or nothing
# Returns:   always 0, so a "no match" never aborts the caller under `set -e`
#######################################
find_mmproj_in() {
  local dir="$1"
  # A missing directory simply means there is nothing to detect.
  [[ -d "${dir}" ]] || return 0
  # The parentheses are required: without them `-type f` binds only to the
  # first -name test, so a directory named like '*mmproj*.gguf' would match.
  find "${dir}" -maxdepth 1 -type f \( -name '*.mmproj' -o -name '*mmproj*.gguf' \) \
    | LC_ALL=C sort \
    | head -n 1 || true
}

# First look next to the selected model ...
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MODEL_DIR="$(dirname "${MODEL_PATH_VALUE}")"
  MM_PROJ_PATH="$(find_mmproj_in "${MODEL_DIR}")"
fi
# ... then fall back to the model_choice directory beside this script.
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MM_PROJ_PATH="$(find_mmproj_in "${SCRIPT_DIR}/model_choice")"
fi
# Choose the launcher: plain `python` by default. When `uv` is on PATH it is
# preferred (`uv run python`, so that uv.lock is respected) and its cache
# directory is pinned to a folder beside this script.
PY_RUNNER=(python)
if command -v uv >/dev/null 2>&1; then
  PY_RUNNER=(uv run python)
  export UV_CACHE_DIR="${SCRIPT_DIR}/.uv_cache"
fi
# Assemble the launch command as an array so that every value is passed to
# main.py as exactly one argument, whatever characters it contains.
CMD=("${PY_RUNNER[@]}" "${SCRIPT_DIR}/main.py")

# Model file and network binding.
CMD+=(--model-path "${MODEL_PATH_VALUE}" --host "${HOST}" --port "${PORT}")

# Model loading / runtime settings.
CMD+=(
  --n-ctx "${N_CTX}"
  --n-batch "${N_BATCH}"
  --n-gpu-layers "${N_GPU_LAYERS}"
  --main-gpu "${MAIN_GPU}"
  --split-mode "${SPLIT_MODE}"
  --rope-freq-base "${ROPE_FREQ_BASE}"
  --rope-freq-scale "${ROPE_FREQ_SCALE}"
  --offload-kv-cache "${OFFLOAD_KV_CACHE}"
  --keep-model-in-memory "${KEEP_MODEL_IN_MEMORY}"
  --try-mmap "${TRY_MMAP}"
  --seed "${SEED}"
  --flash-attn "${FLASH_ATTN}"
)

# Sampling defaults.
CMD+=(
  --default-max-tokens "${DEFAULT_MAX_TOKENS}"
  --default-temperature "${DEFAULT_TEMPERATURE}"
  --default-top-k "${DEFAULT_TOP_K}"
  --default-repeat-penalty "${DEFAULT_REPEAT_PENALTY}"
  --default-min-p "${DEFAULT_MIN_P}"
  --default-top-p "${DEFAULT_TOP_P}"
)

# Optional flags are appended only when a value is configured.
if [[ -n "${N_THREADS}" ]]; then
  CMD+=(--n-threads "${N_THREADS}")
fi
if [[ -n "${MM_PROJ_PATH}" ]]; then
  CMD+=(--mmproj-path "${MM_PROJ_PATH}")
fi
# Print a summary of the effective configuration before handing over control.
printf '%s\n' \
  "Iniciando API con CUDA" \
  "Modelo: ${MODEL_PATH_VALUE}" \
  "Host/Port: ${HOST}:${PORT}" \
  "n_ctx=${N_CTX}, n_batch=${N_BATCH}, n_gpu_layers=${N_GPU_LAYERS}, main_gpu=${MAIN_GPU}, split_mode=${SPLIT_MODE}, n_threads=${N_THREADS:-auto}" \
  "rope_base=${ROPE_FREQ_BASE}, rope_scale=${ROPE_FREQ_SCALE}, offload_kv=${OFFLOAD_KV_CACHE}, mmap=${TRY_MMAP}, mlock=${KEEP_MODEL_IN_MEMORY}, seed=${SEED}, flash_attn=${FLASH_ATTN}" \
  "mmproj_path=${MM_PROJ_PATH:-<none>}" \
  "defaults -> max_tokens=${DEFAULT_MAX_TOKENS}, temp=${DEFAULT_TEMPERATURE}, top_k=${DEFAULT_TOP_K}, repeat_penalty=${DEFAULT_REPEAT_PENALTY}, min_p=${DEFAULT_MIN_P}, top_p=${DEFAULT_TOP_P}"

# Export the build flag in case llama-cpp gets re-installed with cuBLAS; it is
# meant to have no effect when the package is already installed.
# NOTE(review): this overwrites any CMAKE_ARGS coming from the environment or
# the .conf file, and newer llama.cpp releases appear to use a different flag
# name for CUDA builds — confirm against the installed version.
CMAKE_ARGS="-DLLAMA_CUBLAS=on"
export CMAKE_ARGS

# Replace this shell with the API process.
exec "${CMD[@]}"