#!/usr/bin/env bash
#
# Launches the API with CUDA, taking the GGUF model from model_choice/.
#
# Settings are read from api_cuda.conf (next to this script) when that file
# exists; anything it does not define falls back to the script's defaults.

set -euo pipefail

# Absolute directory of this script, so it can be started from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONF_FILE="${SCRIPT_DIR}/api_cuda.conf"

if [[ -f "${CONF_FILE}" ]]; then
  # The file is sourced, i.e. executed as shell code in this process.
  # shellcheck source=/dev/null
  source "${CONF_FILE}"
else
  # Diagnostic rather than program output, so it goes to stderr.
  echo "Archivo de configuracion no encontrado en ${CONF_FILE}. Usando valores por defecto." >&2
fi

# Defaults for every setting that neither the .conf file nor the environment
# provided. An empty value is treated the same as an unset one.
: "${HOST:=0.0.0.0}"
: "${PORT:=8000}"
: "${N_CTX:=4096}"
: "${N_BATCH:=512}"
: "${N_THREADS:=}" # empty -> --n-threads is not passed (logged as "auto")
: "${N_GPU_LAYERS:=-1}"
: "${MAIN_GPU:=0}"
: "${SPLIT_MODE:=1}"
: "${ROPE_FREQ_BASE:=10000}"
: "${ROPE_FREQ_SCALE:=1.0}"
: "${OFFLOAD_KV_CACHE:=true}"
: "${KEEP_MODEL_IN_MEMORY:=false}"
: "${TRY_MMAP:=true}"
: "${SEED:=0}"
: "${FLASH_ATTN:=false}"
: "${MM_PROJ_PATH:=}" # empty -> auto-detected later, or omitted if not found

# Sampling defaults (can be overridden per request).
: "${DEFAULT_MAX_TOKENS:=2048}"
: "${DEFAULT_TEMPERATURE:=0.7}"
: "${DEFAULT_TOP_K:=40}"
: "${DEFAULT_REPEAT_PENALTY:=1.1}"
: "${DEFAULT_MIN_P:=0.05}"
: "${DEFAULT_TOP_P:=0.9}"

# Resolve the model: MODEL_PATH wins when it is set; otherwise the first
# .gguf file found directly inside model_choice/ is used.

#######################################
# Prints the first model file located directly inside a directory.
# Files whose name contains "mmproj" are skipped: this script treats
# *mmproj*.gguf files as multimodal projectors, so they must not be picked
# as the main model. Candidates are sorted byte-wise so the choice does not
# depend on directory order.
# Arguments: $1 - directory to search (not recursive)
# Outputs:   the selected path on stdout; nothing when there is no match
# Returns:   0 always (a missing directory just yields no output)
#######################################
find_first_model() {
  local dir=$1
  [[ -d "${dir}" ]] || return 0
  # "|| true": under pipefail a failing or SIGPIPE'd stage must not abort.
  find "${dir}" -maxdepth 1 -type f -name '*.gguf' ! -name '*mmproj*' \
    | LC_ALL=C sort \
    | head -n 1 \
    || true
}

MODEL_PATH_VALUE="${MODEL_PATH:-}"
if [[ -z "${MODEL_PATH_VALUE}" ]]; then
  MODEL_PATH_VALUE="$(find_first_model "${SCRIPT_DIR}/model_choice")"
fi

# Abort when no model was resolved or the resolved path is not a regular file.
if [[ -z "${MODEL_PATH_VALUE}" || ! -f "${MODEL_PATH_VALUE}" ]]; then
  # Fatal error: report on stderr so it is not mixed with normal output.
  echo "No se encontro un modelo .gguf. Define MODEL_PATH en ${CONF_FILE} o agrega un archivo a model_choice/." >&2
  exit 1
fi

# Auto-detect the mmproj file when none was configured. Accepts *.mmproj
# files as well as .gguf files whose name contains "mmproj".

#######################################
# Prints the first mmproj candidate located directly inside a directory.
# The two name patterns are grouped with \( ... \): without the grouping,
# find's -o binds looser than the implicit AND, so -type f would only apply
# to the first pattern and a directory named like *mmproj*.gguf could match.
# Candidates are sorted byte-wise so the choice is deterministic.
# Arguments: $1 - directory to search (not recursive)
# Outputs:   the selected path on stdout; nothing when there is no match
# Returns:   0 always (a missing directory just yields no output)
#######################################
find_first_mmproj() {
  local dir=$1
  [[ -d "${dir}" ]] || return 0
  # "|| true": under pipefail a failing or SIGPIPE'd stage must not abort.
  find "${dir}" -maxdepth 1 -type f \
    \( -name '*.mmproj' -o -name '*mmproj*.gguf' \) \
    | LC_ALL=C sort \
    | head -n 1 \
    || true
}

# First look next to the model itself...
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MODEL_DIR="$(dirname "${MODEL_PATH_VALUE}")"
  MM_PROJ_PATH="$(find_first_mmproj "${MODEL_DIR}")"
fi
# ...then fall back to model_choice/ (relevant when MODEL_PATH points elsewhere).
if [[ -z "${MM_PROJ_PATH}" ]]; then
  MM_PROJ_PATH="$(find_first_mmproj "${SCRIPT_DIR}/model_choice")"
fi

# Choose the runner: prefer "uv run" so uv.lock is honoured; otherwise call
# the Python interpreter directly.
if command -v uv >/dev/null 2>&1; then
  # Keep uv's cache inside the project directory.
  export UV_CACHE_DIR="${SCRIPT_DIR}/.uv_cache"
  PY_RUNNER=(uv run python)
elif ! command -v python >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then
  # Systems that only ship a "python3" executable.
  PY_RUNNER=(python3)
else
  PY_RUNNER=(python)
fi

# Full command line for main.py, kept as an array so values containing
# spaces (e.g. paths) survive as single arguments.
CMD=(
  "${PY_RUNNER[@]}"
  "${SCRIPT_DIR}/main.py"
  --model-path "${MODEL_PATH_VALUE}"
  --host "${HOST}"
  --port "${PORT}"
  --n-ctx "${N_CTX}"
  --n-batch "${N_BATCH}"
  --n-gpu-layers "${N_GPU_LAYERS}"
  --main-gpu "${MAIN_GPU}"
  --split-mode "${SPLIT_MODE}"
  --rope-freq-base "${ROPE_FREQ_BASE}"
  --rope-freq-scale "${ROPE_FREQ_SCALE}"
  --offload-kv-cache "${OFFLOAD_KV_CACHE}"
  --keep-model-in-memory "${KEEP_MODEL_IN_MEMORY}"
  --try-mmap "${TRY_MMAP}"
  --seed "${SEED}"
  --flash-attn "${FLASH_ATTN}"
  --default-max-tokens "${DEFAULT_MAX_TOKENS}"
  --default-temperature "${DEFAULT_TEMPERATURE}"
  --default-top-k "${DEFAULT_TOP_K}"
  --default-repeat-penalty "${DEFAULT_REPEAT_PENALTY}"
  --default-min-p "${DEFAULT_MIN_P}"
  --default-top-p "${DEFAULT_TOP_P}"
)

# Optional flags are appended only when they carry a value.
if [[ -n "${N_THREADS}" ]]; then
  CMD+=(--n-threads "${N_THREADS}")
fi
if [[ -n "${MM_PROJ_PATH}" ]]; then
  CMD+=(--mmproj-path "${MM_PROJ_PATH}")
fi

#######################################
# Prints the effective configuration before the server starts.
# Globals:   reads the model/host/port, runtime and sampling settings
# Arguments: none
# Outputs:   seven summary lines on stdout
#######################################
print_startup_summary() {
  printf '%s\n' \
    "Iniciando API con CUDA" \
    "Modelo: ${MODEL_PATH_VALUE}" \
    "Host/Port: ${HOST}:${PORT}" \
    "n_ctx=${N_CTX}, n_batch=${N_BATCH}, n_gpu_layers=${N_GPU_LAYERS}, main_gpu=${MAIN_GPU}, split_mode=${SPLIT_MODE}, n_threads=${N_THREADS:-auto}" \
    "rope_base=${ROPE_FREQ_BASE}, rope_scale=${ROPE_FREQ_SCALE}, offload_kv=${OFFLOAD_KV_CACHE}, mmap=${TRY_MMAP}, mlock=${KEEP_MODEL_IN_MEMORY}, seed=${SEED}, flash_attn=${FLASH_ATTN}" \
    "mmproj_path=${MM_PROJ_PATH:-<none>}" \
    "defaults -> max_tokens=${DEFAULT_MAX_TOKENS}, temp=${DEFAULT_TEMPERATURE}, top_k=${DEFAULT_TOP_K}, repeat_penalty=${DEFAULT_REPEAT_PENALTY}, min_p=${DEFAULT_MIN_P}, top_p=${DEFAULT_TOP_P}"
}

print_startup_summary

# Export a build flag in case llama-cpp gets re-installed with cuBLAS; it has
# no effect when the package is already installed. A CMAKE_ARGS value that is
# already set (environment or .conf) is kept instead of being overwritten.
# NOTE(review): newer llama.cpp releases reportedly use -DGGML_CUDA=on in
# place of -DLLAMA_CUBLAS=on; confirm against the pinned llama-cpp version.
export CMAKE_ARGS="${CMAKE_ARGS:--DLLAMA_CUBLAS=on}"

# Replace this shell with the server process so signals reach it directly.
exec "${CMD[@]}"