diff --git a/.env.example b/.env.example index d288d24..e61d812 100644 --- a/.env.example +++ b/.env.example @@ -51,6 +51,14 @@ COHERE= ANTHROPIC= # OpenAI (pay-per-token) OPENAI= +# NVIDIA NIM API (paid — hosted on api.nvidia.com, includes a rate-limited free tier) +NVIDIA= +# Google Gemini API (paid) +GEMINI= +# Local vLLM inference (existing Docker containers: Gemma 4 on :8000, Qwen 3.6 on :8001) +VLLM_LOCAL= +# Local embedding server (Nomic Embed v2 on :8010) +EMBED_LOCAL= # Local CPU image generation — sd.cpp (sd-turbo, sdxl-turbo) SDCPP= # Local CUDA image generation — sd.cpp (FLUX, SDXL Lightning, Juggernaut XI + CPU models) @@ -144,6 +152,21 @@ ANTHROPIC_API_KEY=sk-ant-... # OpenAI — https://platform.openai.com/api-keys OPENAI_API_KEY=sk-... +# NVIDIA NIM — https://build.nvidia.com (default API base shown; set NVIDIA_API_BASE for a custom endpoint) +NVIDIA_API_KEY=nvapi-... +NVIDIA_API_BASE=https://integrate.api.nvidia.com/v1 + +# Google Gemini — https://aistudio.google.com +GEMINI_API_KEY=... + +# Local vLLM — existing Docker containers (Gemma 4 on :8000, Qwen 3.6 on :8001) +LOCAL_VLLM_API_KEY=local-vllm +GEMMA_LOCAL_API_BASE=http://172.17.0.1:8000/v1 +QWEN_LOCAL_API_BASE=http://172.17.0.1:8001/v1 + +# Local embedding server — Nomic Embed v2 on :8010 (authenticates with LOCAL_VLLM_API_KEY above) +EMBED_LOCAL_API_BASE=http://172.17.0.1:8010/v1 + # ── Service config ──────────────────────────────────────────────────────────── # Browser cluster diff --git a/docs/providers.md b/docs/providers.md index ca057d1..32d5fe8 100644 --- a/docs/providers.md +++ b/docs/providers.md @@ -274,6 +274,52 @@ CUDA-accelerated image generation. Same Go wrapper with CUDA backend. Non-blocki --- +## NVIDIA NIM API (paid — `NVIDIA=1`) + +NVIDIA NIM (NVIDIA Inference Microservices) hosts a wide range of open models on `api.nvidia.com`. Requires an API key from [build.nvidia.com](https://build.nvidia.com). Set `NVIDIA_API_BASE` to override the default endpoint. 
+ +| Alias | Underlying model | Notes | +| ----- | ---------------- | ----- | +| `nvidia-kimi-k2` | moonshotai/kimi-k2-thinking | reasoning, thinking mode | +| `nvidia-palmyra-fin-70b` | writer/palmyra-fin-70b-32k | finance-specialized (Writer) | +| `nvidia-llama-3.2-90b` | meta/llama-3.2-90b-vision-instruct | multimodal, vision | +| `nvidia-qwen3-80b` | qwen/qwen3-next-80b-a3b-instruct | MoE, fast, general | +| `nvidia-qwen3-coder` | qwen/qwen3-coder-480b-a35b-instruct | code-specialized MoE | +| `nvidia-deepseek-v3.2` | deepseek-ai/deepseek-v3.2 | reasoning | +| `nvidia-nv-embedqa-e5-v5` | nvidia/nv-embedqa-e5-v5 | embeddings | + +## Google Gemini (paid — `GEMINI=1`) + +Google's Gemini models via the Gemini API. Requires an API key from [aistudio.google.com](https://aistudio.google.com). + +| Alias | Underlying model | Notes | +| ----- | ---------------- | ----- | +| `gemini-2.5-pro` | gemini/gemini-2.5-pro | flagship reasoning | +| `gemini-2.5-flash` | gemini/gemini-2.5-flash | balanced speed/quality | +| `gemini-2.5-flash-lite` | gemini/gemini-2.5-flash-lite | fast, cheap | +| `gemini-3-flash-preview` | gemini/gemini-3-flash-preview | preview, latest gen | +| `gemini-3.1-flash-lite-preview` | gemini/gemini-3.1-flash-lite-preview | preview lite | +| `gemini-embedding-001` | gemini/gemini-embedding-001 | embeddings | + +## Local vLLM (external — `VLLM_LOCAL=1`) + +Existing Docker-based vLLM instances running on the host. Uses `custom_llm_provider: openai` pointing at the local vLLM API servers. Requires `LOCAL_VLLM_API_KEY`, `GEMMA_LOCAL_API_BASE`, and `QWEN_LOCAL_API_BASE` in `.env`. + +| Alias | Model | Port | +| ----- | ----- | ---- | +| `local-vllm-gemma4` | gemma-4-31B-it-4bit-awq | 8000 | +| `local-vllm-qwen3.6` | qwen-3.6-35b-a3b-awq-4bit | 8001 | + +## Local Embedding (external — `EMBED_LOCAL=1`) + +External embedding server (Nomic Embed v2) running on the host at port 8010. Uses `custom_llm_provider: openai` with `mode: embedding`. Requires `EMBED_LOCAL_API_BASE` and `LOCAL_VLLM_API_KEY` in `.env` (the API key is shared with the local vLLM provider). 
+ +| Alias | Model | Notes | +| ----- | ----- | ----- | +| `local-embed-nomic` | nomic-embed-text-v2 | text embeddings | + +--- + ## Fallbacks Every model has its own fallback chain. When a provider fails, is rate-limited, or returns an error, LiteLLM automatically tries the next model in the chain. Free providers are always tried first. diff --git a/litellm/build-config.py b/litellm/build-config.py index 9b906e7..be93516 100644 --- a/litellm/build-config.py +++ b/litellm/build-config.py @@ -96,6 +96,10 @@ def active_providers(env): ("talkies-cuda", lambda e: is_flag(e, "TALKIES_CUDA")), ("sdcpp", lambda e: is_flag(e, "SDCPP")), ("sdcpp-cuda", lambda e: is_flag(e, "SDCPP_CUDA")), + ("nvidia", lambda e: is_flag(e, "NVIDIA")), + ("gemini", lambda e: is_flag(e, "GEMINI")), + ("vllm-local", lambda e: is_flag(e, "VLLM_LOCAL")), + ("embed-local", lambda e: is_flag(e, "EMBED_LOCAL")), ] return [name for name, check in checks if check(env)] diff --git a/litellm/config/providers/embed-local.yaml b/litellm/config/providers/embed-local.yaml new file mode 100644 index 0000000..4a1ef83 --- /dev/null +++ b/litellm/config/providers/embed-local.yaml @@ -0,0 +1,8 @@ +- model_name: local-embed-nomic + litellm_params: + model: nomic-embed-text-v2 + api_base: os.environ/EMBED_LOCAL_API_BASE + api_key: os.environ/LOCAL_VLLM_API_KEY + custom_llm_provider: openai + model_info: + mode: embedding diff --git a/litellm/config/providers/gemini.yaml b/litellm/config/providers/gemini.yaml new file mode 100644 index 0000000..e5b5f95 --- /dev/null +++ b/litellm/config/providers/gemini.yaml @@ -0,0 +1,31 @@ +- model_name: gemini-2.5-pro + litellm_params: + model: gemini/gemini-2.5-pro + api_key: os.environ/GEMINI_API_KEY + +- model_name: gemini-2.5-flash + litellm_params: + model: gemini/gemini-2.5-flash + api_key: os.environ/GEMINI_API_KEY + +- model_name: gemini-2.5-flash-lite + litellm_params: + model: gemini/gemini-2.5-flash-lite + api_key: os.environ/GEMINI_API_KEY + +- model_name: 
gemini-3-flash-preview + litellm_params: + model: gemini/gemini-3-flash-preview + api_key: os.environ/GEMINI_API_KEY + +- model_name: gemini-3.1-flash-lite-preview + litellm_params: + model: gemini/gemini-3.1-flash-lite-preview + api_key: os.environ/GEMINI_API_KEY + +- model_name: gemini-embedding-001 + litellm_params: + model: gemini/gemini-embedding-001 + api_key: os.environ/GEMINI_API_KEY + model_info: + mode: embedding diff --git a/litellm/config/providers/nvidia.yaml b/litellm/config/providers/nvidia.yaml new file mode 100644 index 0000000..8026d45 --- /dev/null +++ b/litellm/config/providers/nvidia.yaml @@ -0,0 +1,50 @@ +- model_name: nvidia-kimi-k2 + litellm_params: + model: moonshotai/kimi-k2-thinking + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-palmyra-fin-70b + litellm_params: + model: writer/palmyra-fin-70b-32k + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-llama-3.2-90b + litellm_params: + model: meta/llama-3.2-90b-vision-instruct + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-qwen3-80b + litellm_params: + model: qwen/qwen3-next-80b-a3b-instruct + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-qwen3-coder + litellm_params: + model: qwen/qwen3-coder-480b-a35b-instruct + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-deepseek-v3.2 + litellm_params: + model: deepseek-ai/deepseek-v3.2 + api_base: os.environ/NVIDIA_API_BASE + api_key: os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + +- model_name: nvidia-nv-embedqa-e5-v5 + litellm_params: + model: nvidia/nv-embedqa-e5-v5 + api_base: os.environ/NVIDIA_API_BASE + api_key: 
os.environ/NVIDIA_API_KEY + custom_llm_provider: openai + model_info: + mode: embedding diff --git a/litellm/config/providers/vllm-local.yaml b/litellm/config/providers/vllm-local.yaml new file mode 100644 index 0000000..06955c4 --- /dev/null +++ b/litellm/config/providers/vllm-local.yaml @@ -0,0 +1,13 @@ +- model_name: local-vllm-gemma4 + litellm_params: + model: gemma-4-31B-it-4bit-awq + api_base: os.environ/GEMMA_LOCAL_API_BASE + api_key: os.environ/LOCAL_VLLM_API_KEY + custom_llm_provider: openai + +- model_name: local-vllm-qwen3.6 + litellm_params: + model: qwen-3.6-35b-a3b-awq-4bit + api_base: os.environ/QWEN_LOCAL_API_BASE + api_key: os.environ/LOCAL_VLLM_API_KEY + custom_llm_provider: openai diff --git a/recommend-limits.sh b/recommend-limits.sh index 65cef0b..a227fda 100644 --- a/recommend-limits.sh +++ b/recommend-limits.sh @@ -39,6 +39,7 @@ flag_talkies=0; flag_talkies_cuda=0 flag_ollama=0; flag_ollama_cuda=0; flag_browser=0 flag_claudebox=0; flag_cbzai=0; flag_hybrids3=0; flag_cloudflared=0 flag_librechat=0; flag_mcp=0; flag_sdcpp=0; flag_sdcpp_cuda=0 +flag_nvidia=0; flag_gemini=0; flag_vllm_local=0; flag_embed_local=0 if [ -f .env ]; then _v() { grep -E "^$1=" .env | cut -d= -f2 | tr -d '[:space:]' || true; } @@ -56,6 +57,10 @@ if [ -f .env ]; then [ "$(_v LIBRECHAT)" = "1" ] && flag_librechat=1 [ "$(_v SDCPP)" = "1" ] && flag_sdcpp=1 [ "$(_v SDCPP_CUDA)" = "1" ] && flag_sdcpp_cuda=1 + [ "$(_v NVIDIA)" = "1" ] && flag_nvidia=1 + [ "$(_v GEMINI)" = "1" ] && flag_gemini=1 + [ "$(_v VLLM_LOCAL)" = "1" ] && flag_vllm_local=1 + [ "$(_v EMBED_LOCAL)" = "1" ] && flag_embed_local=1 # mcp auto-enabled when image/STT/TTS providers active [ "$(_v HUGGINGFACE)" = "1" ] || [ "$(_v OPENAI)" = "1" ] || \ [ "$(_v TALKIES)" = "1" ] || [ "$(_v TALKIES_CUDA)" = "1" ] || \ @@ -86,7 +91,7 @@ echo " RAM: ${total_ram_mb} MB (effective: ${effective_ram_mb} MB — ${os_r echo " Swap: ${total_swap_mb} MB (effective: ${effective_swap_mb} MB at ${maxuse}%)" echo " Cores: 
${total_cores} (effective: ${effective_cores} at ${maxuse}%)" echo " MAXUSE: ${maxuse}%" -echo " Enabled: ollama=${flag_ollama} ollama_cuda=${flag_ollama_cuda} talkies=${flag_talkies} talkies_cuda=${flag_talkies_cuda} sdcpp=${flag_sdcpp} sdcpp_cuda=${flag_sdcpp_cuda} browser=${flag_browser} claudebox=${flag_claudebox} cbzai=${flag_cbzai} hybrids3=${flag_hybrids3} cloudflared=${flag_cloudflared} librechat=${flag_librechat} mcp=${flag_mcp}" +echo " Enabled: ollama=${flag_ollama} ollama_cuda=${flag_ollama_cuda} talkies=${flag_talkies} talkies_cuda=${flag_talkies_cuda} sdcpp=${flag_sdcpp} sdcpp_cuda=${flag_sdcpp_cuda} browser=${flag_browser} claudebox=${flag_claudebox} cbzai=${flag_cbzai} hybrids3=${flag_hybrids3} cloudflared=${flag_cloudflared} librechat=${flag_librechat} mcp=${flag_mcp} nvidia=${flag_nvidia} gemini=${flag_gemini} vllm_local=${flag_vllm_local} embed_local=${flag_embed_local}" echo "" # ── Helpers ─────────────────────────────────────────────────────────────────── diff --git a/tests/test_litellm.sh b/tests/test_litellm.sh index 35a5276..be211db 100755 --- a/tests/test_litellm.sh +++ b/tests/test_litellm.sh @@ -154,6 +154,46 @@ if [ "${TALKIES_CUDA:-}" = "1" ]; then ) fi +# NVIDIA NIM models — only expected when NVIDIA=1 +if [ "${NVIDIA:-}" = "1" ]; then + EXPECTED_MODELS+=( + "nvidia-kimi-k2" + "nvidia-palmyra-fin-70b" + "nvidia-llama-3.2-90b" + "nvidia-qwen3-80b" + "nvidia-qwen3-coder" + "nvidia-deepseek-v3.2" + "nvidia-nv-embedqa-e5-v5" + ) +fi + +# Gemini models — only expected when GEMINI=1 +if [ "${GEMINI:-}" = "1" ]; then + EXPECTED_MODELS+=( + "gemini-2.5-pro" + "gemini-2.5-flash" + "gemini-2.5-flash-lite" + "gemini-3-flash-preview" + "gemini-3.1-flash-lite-preview" + "gemini-embedding-001" + ) +fi + +# Local vLLM models — only expected when VLLM_LOCAL=1 +if [ "${VLLM_LOCAL:-}" = "1" ]; then + EXPECTED_MODELS+=( + "local-vllm-gemma4" + "local-vllm-qwen3.6" + ) +fi + +# Local embed model — only expected when EMBED_LOCAL=1 +if [ "${EMBED_LOCAL:-}" 
= "1" ]; then + EXPECTED_MODELS+=( + "local-embed-nomic" + ) +fi + test_litellm_models_registered() { local models models=$(get "$BASE_URL/models")