mirror of
https://github.com/Alishahryar1/free-claude-code.git
synced 2026-04-28 03:20:01 +00:00
feat: add llama.cpp provider for local anthropic messages API
This commit is contained in:
parent
1aedf4763c
commit
5a36a32836
12 changed files with 576 additions and 61 deletions
|
|
@ -10,11 +10,13 @@ OPENROUTER_API_KEY=""
|
|||
LM_STUDIO_BASE_URL="http://localhost:1234/v1"
|
||||
|
||||
|
||||
# Llama.cpp Config (local provider, no API key required)
|
||||
LLAMACPP_BASE_URL="http://localhost:8080/v1"
|
||||
|
||||
|
||||
# All Claude model requests are mapped to these models, plain model is fallback
|
||||
# Format: provider_type/model/name
|
||||
# Valid providers: "nvidia_nim" | "open_router" | "lmstudio"
|
||||
# Note for LM Studio: You MUST specify the exact model identifier (e.g. "lmstudio/qwen3.5-27b").
|
||||
# If you specify a model that isn't currently loaded, LM Studio will route the request to whatever
|
||||
# Valid providers: "nvidia_nim" | "open_router" | "lmstudio" | "llamacpp"
|
||||
# model that happens to be loaded.
|
||||
MODEL_OPUS="nvidia_nim/z-ai/glm4.7"
|
||||
MODEL_SONNET="open_router/arcee-ai/trinity-large-preview:free"
|
||||
|
|
|
|||
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -8,4 +8,5 @@ __pycache__
|
|||
agent_workspace
|
||||
.env
|
||||
server.log
|
||||
.coverage
|
||||
.coverage
|
||||
llama_cache
|
||||
133
README.md
133
README.md
|
|
@ -12,7 +12,7 @@
|
|||
[](https://github.com/astral-sh/ruff)
|
||||
[](https://github.com/Delgan/loguru)
|
||||
|
||||
A lightweight proxy that routes Claude Code's Anthropic API calls to **NVIDIA NIM** (40 req/min free), **OpenRouter** (hundreds of models), or **LM Studio** (fully local).
|
||||
A lightweight proxy that routes Claude Code's Anthropic API calls to **NVIDIA NIM** (40 req/min free), **OpenRouter** (hundreds of models), **LM Studio** (fully local), or **llama.cpp** (local with Anthropic endpoints).
|
||||
|
||||
[Quick Start](#quick-start) · [Providers](#providers) · [Discord Bot](#discord-bot) · [Configuration](#configuration) · [Development](#development) · [Contributing](#contributing)
|
||||
|
||||
|
|
@ -27,28 +27,29 @@ A lightweight proxy that routes Claude Code's Anthropic API calls to **NVIDIA NI
|
|||
|
||||
## Features
|
||||
|
||||
| Feature | Description |
|
||||
| -------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Zero Cost** | 40 req/min free on NVIDIA NIM. Free models on OpenRouter. Fully local with LM Studio |
|
||||
| **Drop-in Replacement** | Set 2 env vars. No modifications to Claude Code CLI or VSCode extension needed |
|
||||
| **3 Providers** | NVIDIA NIM, OpenRouter (hundreds of models), LM Studio (local & offline) |
|
||||
| **Per-Model Mapping** | Route Opus / Sonnet / Haiku to different models and providers. Mix providers freely |
|
||||
| **Thinking Token Support** | Parses `<think>` tags and `reasoning_content` into native Claude thinking blocks |
|
||||
| **Heuristic Tool Parser** | Models outputting tool calls as text are auto-parsed into structured tool use |
|
||||
| **Request Optimization** | 5 categories of trivial API calls intercepted locally, saving quota and latency |
|
||||
| **Smart Rate Limiting** | Proactive rolling-window throttle + reactive 429 exponential backoff + optional concurrency cap |
|
||||
| **Discord / Telegram Bot** | Remote autonomous coding with tree-based threading, session persistence, and live progress |
|
||||
| **Subagent Control** | Task tool interception forces `run_in_background=False`. No runaway subagents |
|
||||
| **Extensible** | Clean `BaseProvider` and `MessagingPlatform` ABCs. Add new providers or platforms easily |
|
||||
| Feature | Description |
|
||||
| -------------------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| **Zero Cost** | 40 req/min free on NVIDIA NIM. Free models on OpenRouter. Fully local with LM Studio |
|
||||
| **Drop-in Replacement** | Set 2 env vars. No modifications to Claude Code CLI or VSCode extension needed |
|
||||
| **4 Providers** | NVIDIA NIM, OpenRouter (hundreds of models), LM Studio (local), llama.cpp (`llama-server`) |
|
||||
| **Per-Model Mapping** | Route Opus / Sonnet / Haiku to different models and providers. Mix providers freely |
|
||||
| **Thinking Token Support** | Parses `<think>` tags and `reasoning_content` into native Claude thinking blocks |
|
||||
| **Heuristic Tool Parser** | Models outputting tool calls as text are auto-parsed into structured tool use |
|
||||
| **Request Optimization** | 5 categories of trivial API calls intercepted locally, saving quota and latency |
|
||||
| **Smart Rate Limiting** | Proactive rolling-window throttle + reactive 429 exponential backoff + optional concurrency cap |
|
||||
| **Discord / Telegram Bot** | Remote autonomous coding with tree-based threading, session persistence, and live progress |
|
||||
| **Subagent Control** | Task tool interception forces `run_in_background=False`. No runaway subagents |
|
||||
| **Extensible** | Clean `BaseProvider` and `MessagingPlatform` ABCs. Add new providers or platforms easily |
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. Get an API key (or use LM Studio locally):
|
||||
1. Get an API key (or use LM Studio / llama.cpp locally):
|
||||
- **NVIDIA NIM**: [build.nvidia.com/settings/api-keys](https://build.nvidia.com/settings/api-keys)
|
||||
- **OpenRouter**: [openrouter.ai/keys](https://openrouter.ai/keys)
|
||||
- **LM Studio**: No API key needed. Run locally with [LM Studio](https://lmstudio.ai)
|
||||
- **llama.cpp**: No API key needed. Run `llama-server` locally.
|
||||
2. Install [Claude Code](https://github.com/anthropics/claude-code)
|
||||
3. Install [uv](https://github.com/astral-sh/uv) (or `uv self update` if already installed)
|
||||
|
||||
|
|
@ -102,6 +103,20 @@ MODEL="lmstudio/unsloth/GLM-4.7-Flash-GGUF" # fallback
|
|||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>llama.cpp</b> (fully local, no API key)</summary>
|
||||
|
||||
```dotenv
|
||||
LLAMACPP_BASE_URL="http://localhost:8080/v1"
|
||||
|
||||
MODEL_OPUS="llamacpp/local-model"
|
||||
MODEL_SONNET="llamacpp/local-model"
|
||||
MODEL_HAIKU="llamacpp/local-model"
|
||||
MODEL="llamacpp/local-model"
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Mix providers</b></summary>
|
||||
|
||||
|
|
@ -228,6 +243,7 @@ free-claude-code # starts the server
|
|||
| **NVIDIA NIM** | Free | 40 req/min | Daily driver, generous free tier |
|
||||
| **OpenRouter** | Free / Paid | Varies | Model variety, fallback options |
|
||||
| **LM Studio** | Free (local) | Unlimited | Privacy, offline use, no rate limits |
|
||||
| **llama.cpp** | Free (local) | Unlimited | Lightweight local inference engine |
|
||||
|
||||
Models use a prefix format: `provider_prefix/model/name`. An invalid prefix causes an error.
|
||||
|
||||
|
|
@ -236,6 +252,7 @@ Models use a prefix format: `provider_prefix/model/name`. An invalid prefix caus
|
|||
| NVIDIA NIM | `nvidia_nim/...` | `NVIDIA_NIM_API_KEY` | `integrate.api.nvidia.com/v1` |
|
||||
| OpenRouter | `open_router/...` | `OPENROUTER_API_KEY` | `openrouter.ai/api/v1` |
|
||||
| LM Studio | `lmstudio/...` | (none) | `localhost:1234/v1` |
|
||||
| llama.cpp | `llamacpp/...` | (none) | `localhost:8080/v1` |
|
||||
|
||||
<details>
|
||||
<summary><b>NVIDIA NIM models</b></summary>
|
||||
|
|
@ -282,6 +299,16 @@ Browse: [model.lmstudio.ai](https://model.lmstudio.ai)
|
|||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>llama.cpp models</b></summary>
|
||||
|
||||
Run models locally using `llama-server`. Ensure you have a tool-capable GGUF. Set `MODEL` to whatever arbitrary name you'd like (e.g. `llamacpp/my-model`), as `llama-server` ignores the model name when run via `/v1/messages`.
|
||||
|
||||
See the Unsloth docs for detailed instructions and capable models:
|
||||
[https://unsloth.ai/docs/models/qwen3.5#qwen3.5-small-0.8b-2b-4b-9b](https://unsloth.ai/docs/models/qwen3.5#qwen3.5-small-0.8b-2b-4b-9b)
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
## Discord Bot
|
||||
|
|
@ -289,6 +316,7 @@ Browse: [model.lmstudio.ai](https://model.lmstudio.ai)
|
|||
Control Claude Code remotely from Discord (or Telegram). Send tasks, watch live progress, and manage multiple concurrent sessions.
|
||||
|
||||
**Capabilities:**
|
||||
|
||||
- Tree-based message threading: reply to a message to fork the conversation
|
||||
- Session persistence across server restarts
|
||||
- Live streaming of thinking tokens, tool calls, and results
|
||||
|
|
@ -340,10 +368,10 @@ Get a token from [@BotFather](https://t.me/BotFather); find your user ID via [@u
|
|||
|
||||
Send voice messages on Discord or Telegram; they are transcribed and processed as regular prompts.
|
||||
|
||||
| Backend | Description | API Key |
|
||||
| ------- | ----------- | ------- |
|
||||
| **Local Whisper** (default) | [Hugging Face Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) — free, offline, CUDA compatible | not required |
|
||||
| **NVIDIA NIM** | Whisper/Parakeet models via gRPC | `NVIDIA_NIM_API_KEY` |
|
||||
| Backend | Description | API Key |
|
||||
| --------------------------- | ------------------------------------------------------------------------------------------------------------- | -------------------- |
|
||||
| **Local Whisper** (default) | [Hugging Face Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) — free, offline, CUDA compatible | not required |
|
||||
| **NVIDIA NIM** | Whisper/Parakeet models via gRPC | `NVIDIA_NIM_API_KEY` |
|
||||
|
||||
**Install the voice extras:**
|
||||
|
||||
|
|
@ -367,44 +395,45 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE
|
|||
|
||||
### Core
|
||||
|
||||
| Variable | Description | Default |
|
||||
| -------------------- | ------------------------------------------------------------------------ | ------------------------------------------------- |
|
||||
| `MODEL` | Fallback model (`provider/model/name` format; invalid prefix → error) | `nvidia_nim/stepfun-ai/step-3.5-flash` |
|
||||
| `MODEL_OPUS` | Model for Claude Opus requests (falls back to `MODEL`) | `nvidia_nim/z-ai/glm4.7` |
|
||||
| `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` |
|
||||
| `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` |
|
||||
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
|
||||
| `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
|
||||
| `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
|
||||
| Variable | Description | Default |
|
||||
| -------------------- | --------------------------------------------------------------------- | ------------------------------------------------- |
|
||||
| `MODEL` | Fallback model (`provider/model/name` format; invalid prefix → error) | `nvidia_nim/stepfun-ai/step-3.5-flash` |
|
||||
| `MODEL_OPUS` | Model for Claude Opus requests (falls back to `MODEL`) | `nvidia_nim/z-ai/glm4.7` |
|
||||
| `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` |
|
||||
| `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` |
|
||||
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
|
||||
| `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
|
||||
| `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
|
||||
| `LLAMACPP_BASE_URL` | llama.cpp server URL | `http://localhost:8080/v1` |
|
||||
|
||||
### Rate Limiting & Timeouts
|
||||
|
||||
| Variable | Description | Default |
|
||||
| -------------------------- | ------------------------------------------ | ------- |
|
||||
| `PROVIDER_RATE_LIMIT` | LLM API requests per window | `40` |
|
||||
| `PROVIDER_RATE_WINDOW` | Rate limit window (seconds) | `60` |
|
||||
| `PROVIDER_MAX_CONCURRENCY` | Max simultaneous open provider streams | `5` |
|
||||
| `HTTP_READ_TIMEOUT` | Read timeout for provider requests (s) | `120` |
|
||||
| `HTTP_WRITE_TIMEOUT` | Write timeout for provider requests (s) | `10` |
|
||||
| `HTTP_CONNECT_TIMEOUT` | Connect timeout for provider requests (s) | `2` |
|
||||
| Variable | Description | Default |
|
||||
| -------------------------- | ----------------------------------------- | ------- |
|
||||
| `PROVIDER_RATE_LIMIT` | LLM API requests per window | `40` |
|
||||
| `PROVIDER_RATE_WINDOW` | Rate limit window (seconds) | `60` |
|
||||
| `PROVIDER_MAX_CONCURRENCY` | Max simultaneous open provider streams | `5` |
|
||||
| `HTTP_READ_TIMEOUT` | Read timeout for provider requests (s) | `120` |
|
||||
| `HTTP_WRITE_TIMEOUT` | Write timeout for provider requests (s) | `10` |
|
||||
| `HTTP_CONNECT_TIMEOUT` | Connect timeout for provider requests (s) | `2` |
|
||||
|
||||
### Messaging & Voice
|
||||
|
||||
| Variable | Description | Default |
|
||||
| -------------------------- | ------------------------------------------------------------------ | --------- |
|
||||
| `MESSAGING_PLATFORM` | `discord` or `telegram` | `discord` |
|
||||
| `DISCORD_BOT_TOKEN` | Discord bot token | `""` |
|
||||
| `ALLOWED_DISCORD_CHANNELS` | Comma-separated channel IDs (empty = none allowed) | `""` |
|
||||
| `TELEGRAM_BOT_TOKEN` | Telegram bot token | `""` |
|
||||
| `ALLOWED_TELEGRAM_USER_ID` | Allowed Telegram user ID | `""` |
|
||||
| `CLAUDE_WORKSPACE` | Directory where the agent operates | `./agent_workspace` |
|
||||
| `ALLOWED_DIR` | Allowed directories for the agent | `""` |
|
||||
| `MESSAGING_RATE_LIMIT` | Messaging messages per window | `1` |
|
||||
| `MESSAGING_RATE_WINDOW` | Messaging window (seconds) | `1` |
|
||||
| `VOICE_NOTE_ENABLED` | Enable voice note handling | `true` |
|
||||
| `WHISPER_DEVICE` | `cpu` \| `cuda` \| `nvidia_nim` | `cpu` |
|
||||
| `WHISPER_MODEL` | Whisper model (local: `tiny`/`base`/`small`/`medium`/`large-v2`/`large-v3`/`large-v3-turbo`; NIM: `openai/whisper-large-v3`, `nvidia/parakeet-ctc-1.1b-asr`, etc.) | `base` |
|
||||
| `HF_TOKEN` | Hugging Face token for faster downloads (local Whisper, optional) | — |
|
||||
| Variable | Description | Default |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------- |
|
||||
| `MESSAGING_PLATFORM` | `discord` or `telegram` | `discord` |
|
||||
| `DISCORD_BOT_TOKEN` | Discord bot token | `""` |
|
||||
| `ALLOWED_DISCORD_CHANNELS` | Comma-separated channel IDs (empty = none allowed) | `""` |
|
||||
| `TELEGRAM_BOT_TOKEN` | Telegram bot token | `""` |
|
||||
| `ALLOWED_TELEGRAM_USER_ID` | Allowed Telegram user ID | `""` |
|
||||
| `CLAUDE_WORKSPACE` | Directory where the agent operates | `./agent_workspace` |
|
||||
| `ALLOWED_DIR` | Allowed directories for the agent | `""` |
|
||||
| `MESSAGING_RATE_LIMIT` | Messaging messages per window | `1` |
|
||||
| `MESSAGING_RATE_WINDOW` | Messaging window (seconds) | `1` |
|
||||
| `VOICE_NOTE_ENABLED` | Enable voice note handling | `true` |
|
||||
| `WHISPER_DEVICE` | `cpu` \| `cuda` \| `nvidia_nim` | `cpu` |
|
||||
| `WHISPER_MODEL` | Whisper model (local: `tiny`/`base`/`small`/`medium`/`large-v2`/`large-v3`/`large-v3-turbo`; NIM: `openai/whisper-large-v3`, `nvidia/parakeet-ctc-1.1b-asr`, etc.) | `base` |
|
||||
| `HF_TOKEN` | Hugging Face token for faster downloads (local Whisper, optional) | — |
|
||||
|
||||
<details>
|
||||
<summary><b>Advanced: Request optimization flags</b></summary>
|
||||
|
|
@ -433,7 +462,7 @@ See [`.env.example`](.env.example) for all supported parameters.
|
|||
free-claude-code/
|
||||
├── server.py # Entry point
|
||||
├── api/ # FastAPI routes, request detection, optimization handlers
|
||||
├── providers/ # BaseProvider, OpenAICompatibleProvider, NIM, OpenRouter, LM Studio
|
||||
├── providers/ # BaseProvider, OpenAICompatibleProvider, NIM, OpenRouter, LM Studio, llamacpp
|
||||
│ └── common/ # Shared utils (SSE builder, message converter, parsers, error mapping)
|
||||
├── messaging/ # MessagingPlatform ABC + Discord/Telegram bots, session management
|
||||
├── config/ # Settings, NIM config, logging
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from config.settings import get_settings as _get_settings
|
|||
from providers.base import BaseProvider, ProviderConfig
|
||||
from providers.common import get_user_facing_error_message
|
||||
from providers.exceptions import AuthenticationError
|
||||
from providers.llamacpp import LlamaCppProvider
|
||||
from providers.lmstudio import LMStudioProvider
|
||||
from providers.nvidia_nim import NVIDIA_NIM_BASE_URL, NvidiaNimProvider
|
||||
from providers.open_router import OPENROUTER_BASE_URL, OpenRouterProvider
|
||||
|
|
@ -69,13 +70,25 @@ def _create_provider_for_type(provider_type: str, settings: Settings) -> BasePro
|
|||
http_connect_timeout=settings.http_connect_timeout,
|
||||
)
|
||||
return LMStudioProvider(config)
|
||||
if provider_type == "llamacpp":
|
||||
config = ProviderConfig(
|
||||
api_key="llamacpp",
|
||||
base_url=settings.llamacpp_base_url,
|
||||
rate_limit=settings.provider_rate_limit,
|
||||
rate_window=settings.provider_rate_window,
|
||||
max_concurrency=settings.provider_max_concurrency,
|
||||
http_read_timeout=settings.http_read_timeout,
|
||||
http_write_timeout=settings.http_write_timeout,
|
||||
http_connect_timeout=settings.http_connect_timeout,
|
||||
)
|
||||
return LlamaCppProvider(config)
|
||||
logger.error(
|
||||
"Unknown provider_type: '{}'. Supported: 'nvidia_nim', 'open_router', 'lmstudio'",
|
||||
"Unknown provider_type: '{}'. Supported: 'nvidia_nim', 'open_router', 'lmstudio', 'llamacpp'",
|
||||
provider_type,
|
||||
)
|
||||
raise ValueError(
|
||||
f"Unknown provider_type: '{provider_type}'. "
|
||||
f"Supported: 'nvidia_nim', 'open_router', 'lmstudio'"
|
||||
f"Supported: 'nvidia_nim', 'open_router', 'lmstudio', 'llamacpp'"
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
10
claude-pick
10
claude-pick
|
|
@ -11,6 +11,7 @@ PORT="${CLAUDE_PICK_PORT:-8082}"
|
|||
BASE_URL="http://localhost:$PORT"
|
||||
OPENROUTER_MODELS_URL="https://openrouter.ai/api/v1/models"
|
||||
DEFAULT_LM_STUDIO_BASE_URL="http://localhost:1234/v1"
|
||||
DEFAULT_LLAMACPP_BASE_URL="http://localhost:8080/v1"
|
||||
|
||||
if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "Error: python3 is required." >&2
|
||||
|
|
@ -140,9 +141,16 @@ case "$provider" in
|
|||
models="$(get_lmstudio_models)"
|
||||
prompt="Select an LM Studio model> "
|
||||
;;
|
||||
llamacpp|llama.cpp)
|
||||
# llama.cpp doesn't have a standardized /models endpoint that returns all loaded models reliably
|
||||
# in the same way, but it does support Anthropic routing. We can use a stub model or query if available.
|
||||
# For a simple picker, we'll just allow passing a default or typing it in, but to match fzf we offer a stub.
|
||||
models="local-model\nllama-server"
|
||||
prompt="Select a llama.cpp model> "
|
||||
;;
|
||||
*)
|
||||
echo "Error: Unsupported PROVIDER_TYPE='$provider'." >&2
|
||||
echo "Expected one of: nvidia_nim, open_router, lmstudio" >&2
|
||||
echo "Expected one of: nvidia_nim, open_router, lmstudio, llamacpp" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
|
|
@ -42,6 +42,12 @@ class Settings(BaseSettings):
|
|||
validation_alias="LM_STUDIO_BASE_URL",
|
||||
)
|
||||
|
||||
# ==================== Llama.cpp Config ====================
|
||||
llamacpp_base_url: str = Field(
|
||||
default="http://localhost:8080/v1",
|
||||
validation_alias="LLAMACPP_BASE_URL",
|
||||
)
|
||||
|
||||
# ==================== Model ====================
|
||||
# All Claude model requests are mapped to this single model (fallback)
|
||||
# Format: provider_type/model/name
|
||||
|
|
@ -145,7 +151,7 @@ class Settings(BaseSettings):
|
|||
def validate_model_format(cls, v: str | None) -> str | None:
|
||||
if v is None:
|
||||
return None
|
||||
valid_providers = ("nvidia_nim", "open_router", "lmstudio")
|
||||
valid_providers = ("nvidia_nim", "open_router", "lmstudio", "llamacpp")
|
||||
if "/" not in v:
|
||||
raise ValueError(
|
||||
f"Model must be prefixed with provider type. "
|
||||
|
|
@ -156,7 +162,7 @@ class Settings(BaseSettings):
|
|||
if provider not in valid_providers:
|
||||
raise ValueError(
|
||||
f"Invalid provider: '{provider}'. "
|
||||
f"Supported: 'nvidia_nim', 'open_router', 'lmstudio'"
|
||||
f"Supported: 'nvidia_nim', 'open_router', 'lmstudio', 'llamacpp'"
|
||||
)
|
||||
return v
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from .exceptions import (
|
|||
ProviderError,
|
||||
RateLimitError,
|
||||
)
|
||||
from .llamacpp import LlamaCppProvider
|
||||
from .lmstudio import LMStudioProvider
|
||||
from .nvidia_nim import NvidiaNimProvider
|
||||
from .open_router import OpenRouterProvider
|
||||
|
|
@ -19,6 +20,7 @@ __all__ = [
|
|||
"BaseProvider",
|
||||
"InvalidRequestError",
|
||||
"LMStudioProvider",
|
||||
"LlamaCppProvider",
|
||||
"NvidiaNimProvider",
|
||||
"OpenRouterProvider",
|
||||
"OverloadedError",
|
||||
|
|
|
|||
3
providers/llamacpp/__init__.py
Normal file
3
providers/llamacpp/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
from .client import LlamaCppProvider
|
||||
|
||||
__all__ = ["LlamaCppProvider"]
|
||||
147
providers/llamacpp/client.py
Normal file
147
providers/llamacpp/client.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
"""Llama.cpp provider implementation."""
|
||||
|
||||
import json
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from loguru import logger
|
||||
|
||||
from providers.base import BaseProvider, ProviderConfig
|
||||
from providers.common import get_user_facing_error_message, map_error
|
||||
from providers.rate_limit import GlobalRateLimiter
|
||||
|
||||
LLAMACPP_DEFAULT_BASE_URL = "http://localhost:8080/v1"
|
||||
|
||||
|
||||
class LlamaCppProvider(BaseProvider):
    """Llama.cpp provider using the server's native Anthropic Messages API endpoint.

    Unlike the OpenAI-compatible providers, llama.cpp's ``llama-server`` exposes
    ``/v1/messages`` directly, so Anthropic-format requests are passed through
    with minimal translation and the server's SSE stream is relayed verbatim.
    """

    def __init__(self, config: ProviderConfig):
        """Initialize the provider and its async HTTP client.

        Args:
            config: Connection, rate-limit, and timeout settings. ``base_url``
                should include the ``/v1`` suffix (e.g. ``http://localhost:8080/v1``).
        """
        super().__init__(config)
        self._provider_name = "LLAMACPP"
        # Strip a trailing slash so "{base_url}/messages" resolves to
        # ".../v1/messages" regardless of how the user wrote the URL.
        self._base_url = (config.base_url or LLAMACPP_DEFAULT_BASE_URL).rstrip("/")

        self._global_rate_limiter = GlobalRateLimiter.get_instance(
            rate_limit=config.rate_limit,
            rate_window=config.rate_window,
            max_concurrency=config.max_concurrency,
        )
        self._client = httpx.AsyncClient(
            base_url=self._base_url,
            timeout=httpx.Timeout(
                config.http_read_timeout,
                connect=config.http_connect_timeout,
                read=config.http_read_timeout,
                write=config.http_write_timeout,
            ),
        )

    async def cleanup(self) -> None:
        """Release HTTP client resources."""
        await self._client.aclose()

    async def stream_response(
        self,
        request: Any,
        input_tokens: int = 0,
        *,
        request_id: str | None = None,
    ) -> AsyncIterator[str]:
        """Stream response natively via Llama.cpp's Anthropic-compatible endpoint.

        Args:
            request: The incoming Anthropic-format request (pydantic model).
            input_tokens: Pre-counted input tokens (unused here; kept for the
                BaseProvider interface).
            request_id: Optional correlation id included in logs and error text.

        Yields:
            Raw SSE lines (each terminated with a newline) relayed from the
            llama.cpp server, or a single Anthropic-compatible ``error`` event
            if the request fails.
        """
        tag = self._provider_name
        req_tag = f" request_id={request_id}" if request_id else ""

        # Dump the Anthropic Pydantic model directly into a dict
        body = request.model_dump(exclude_none=True)

        # Remove proxy-internal fields llama.cpp does not understand.
        body.pop("extra_body", None)
        body.pop("original_model", None)
        body.pop("resolved_provider_model", None)

        # Translate internal ThinkingConfig to Anthropic API schema.
        # NOTE(review): the Anthropic API normally pairs type="enabled" with a
        # budget_tokens value; llama.cpp appears to accept it without one —
        # confirm against the target llama-server version.
        if "thinking" in body:
            thinking_cfg = body.pop("thinking")
            if isinstance(thinking_cfg, dict) and thinking_cfg.get("enabled"):
                body["thinking"] = {"type": "enabled"}

        # Ensure max_tokens is present (the Messages API requires it).
        if "max_tokens" not in body:
            body["max_tokens"] = 81920

        logger.info(
            "{}_STREAM:{} natively passing Anthropic request to llama.cpp model={} msgs={} tools={}",
            tag,
            req_tag,
            body.get("model"),
            len(body.get("messages", [])),
            len(body.get("tools", [])),
        )

        async with self._global_rate_limiter.concurrency_slot():
            response: httpx.Response | None = None
            try:
                # execute_with_retry wraps only the request setup; the stream
                # itself is consumed below so retries never replay a partial body.

                async def _make_request():
                    request_obj = self._client.build_request(
                        "POST",
                        "/messages",
                        json=body,
                        headers={"Content-Type": "application/json"},
                    )
                    return await self._client.send(request_obj, stream=True)

                response = await self._global_rate_limiter.execute_with_retry(
                    _make_request
                )

                if response.status_code != 200:
                    try:
                        response.raise_for_status()
                    except httpx.HTTPStatusError as e:
                        # Read the error body for the log before re-raising into
                        # the generic handler below.
                        text = await response.aread()
                        logger.error(
                            "{}_ERROR:{} HTTP {}: {}",
                            tag,
                            req_tag,
                            response.status_code,
                            text.decode("utf-8", errors="replace"),
                        )
                        raise e

                # Relay the SSE stream verbatim; blank lines delimit events.
                async for line in response.aiter_lines():
                    if line:
                        yield f"{line}\n"
                    else:
                        yield "\n"

            except Exception as e:
                logger.error("{}_ERROR:{} {}: {}", tag, req_tag, type(e).__name__, e)
                mapped_e = map_error(e)
                error_message = get_user_facing_error_message(
                    mapped_e, read_timeout_s=self._config.http_read_timeout
                )
                if request_id:
                    error_message += f"\nRequest ID: {request_id}"

                logger.info(
                    "{}_STREAM: Emitting native SSE error event for {}{}",
                    tag,
                    type(e).__name__,
                    req_tag,
                )

                # Emit an Anthropic-compatible error event
                error_event = {
                    "type": "error",
                    "error": {"type": "api_error", "message": error_message},
                }
                yield f"event: error\ndata: {json.dumps(error_event)}\n\n"
            finally:
                # Fix: streamed responses obtained via client.send(stream=True)
                # must be closed explicitly, or the connection leaks when the
                # stream errors or the consumer stops early.
                if response is not None:
                    await response.aclose()
||||
|
|
@ -330,6 +330,39 @@ class TestPerModelMapping:
|
|||
s = Settings()
|
||||
assert s.model_opus == "open_router/deepseek/deepseek-r1"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"env_vars,expected_model,expected_haiku",
|
||||
[
|
||||
(
|
||||
{"MODEL": "nvidia_nim/meta/llama3-70b-instruct"},
|
||||
"nvidia_nim/meta/llama3-70b-instruct",
|
||||
None,
|
||||
),
|
||||
(
|
||||
{
|
||||
"MODEL": "open_router/anthropic/claude-3-opus",
|
||||
"MODEL_HAIKU": "open_router/anthropic/claude-3-haiku",
|
||||
},
|
||||
"open_router/anthropic/claude-3-opus",
|
||||
"open_router/anthropic/claude-3-haiku",
|
||||
),
|
||||
({"MODEL": "lmstudio/qwen2.5-7b"}, "lmstudio/qwen2.5-7b", None),
|
||||
({"MODEL": "llamacpp/local-model"}, "llamacpp/local-model", None),
|
||||
],
|
||||
)
|
||||
def test_settings_models_from_env(
|
||||
self, env_vars, expected_model, expected_haiku, monkeypatch
|
||||
):
|
||||
"""Test environment variables override model defaults."""
|
||||
from config.settings import Settings
|
||||
|
||||
for k, v in env_vars.items():
|
||||
monkeypatch.setenv(k, v)
|
||||
|
||||
s = Settings()
|
||||
assert s.model == expected_model
|
||||
assert s.model_haiku == expected_haiku
|
||||
|
||||
def test_model_sonnet_from_env(self, monkeypatch):
|
||||
"""MODEL_SONNET env var is loaded."""
|
||||
from config.settings import Settings
|
||||
|
|
@ -449,6 +482,7 @@ class TestPerModelMapping:
|
|||
assert Settings.parse_provider_type("nvidia_nim/meta/llama") == "nvidia_nim"
|
||||
assert Settings.parse_provider_type("open_router/deepseek/r1") == "open_router"
|
||||
assert Settings.parse_provider_type("lmstudio/qwen") == "lmstudio"
|
||||
assert Settings.parse_provider_type("llamacpp/model") == "llamacpp"
|
||||
|
||||
def test_parse_model_name(self):
|
||||
"""parse_model_name extracts model name from model string."""
|
||||
|
|
@ -456,3 +490,4 @@ class TestPerModelMapping:
|
|||
|
||||
assert Settings.parse_model_name("nvidia_nim/meta/llama") == "meta/llama"
|
||||
assert Settings.parse_model_name("lmstudio/qwen") == "qwen"
|
||||
assert Settings.parse_model_name("llamacpp/model") == "model"
|
||||
|
|
|
|||
|
|
@ -70,6 +70,19 @@ def lmstudio_provider(provider_config):
|
|||
return LMStudioProvider(lmstudio_config)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def llamacpp_provider(provider_config):
|
||||
from providers.llamacpp import LlamaCppProvider
|
||||
|
||||
llamacpp_config = ProviderConfig(
|
||||
api_key="llamacpp",
|
||||
base_url="http://localhost:8080/v1",
|
||||
rate_limit=10,
|
||||
rate_window=60,
|
||||
)
|
||||
return LlamaCppProvider(llamacpp_config)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_cli_session():
|
||||
session = MagicMock(spec=CLISession)
|
||||
|
|
|
|||
256
tests/providers/test_llamacpp.py
Normal file
256
tests/providers/test_llamacpp.py
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
"""Tests for Llama.cpp native Anthropic provider."""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from providers.base import ProviderConfig
|
||||
from providers.llamacpp import LlamaCppProvider
|
||||
|
||||
|
||||
class MockMessage:
    """Minimal stand-in for a chat message: just a role and its content."""

    def __init__(self, role, content):
        self.role, self.content = role, content
|
||||
|
||||
|
||||
class MockRequest:
    """Stand-in for the proxy's Anthropic request model used by these tests."""

    def __init__(self, **kwargs):
        # Baseline attribute values; keyword arguments override any of them,
        # including `thinking`, because they are applied last.
        defaults = {
            "model": "llamacpp-community/qwen2.5-7b-instruct",
            "messages": [MockMessage("user", "Hello")],
            "max_tokens": 100,
            "temperature": 0.5,
            "top_p": 0.9,
            "system": "System prompt",
            "stop_sequences": None,
            "tools": [],
            "extra_body": {},
        }
        for name, value in defaults.items():
            setattr(self, name, value)
        thinking = MagicMock()
        thinking.enabled = True
        self.thinking = thinking
        for name, value in kwargs.items():
            setattr(self, name, value)

    def model_dump(self, exclude_none=True):
        """Mimic pydantic's model_dump (the exclude_none flag is accepted but unused)."""
        serialized_messages = [
            {"role": msg.role, "content": msg.content} for msg in self.messages
        ]
        thinking_field = (
            {"enabled": self.thinking.enabled} if self.thinking else None
        )
        # Deliberately omits top_p / system / stop_sequences / tools, matching
        # what the provider code consumes in these tests.
        return {
            "model": self.model,
            "messages": serialized_messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "extra_body": self.extra_body,
            "thinking": thinking_field,
        }
|
||||
|
||||
|
||||
@pytest.fixture
def llamacpp_config():
    """ProviderConfig pointing at a default local llama-server instance."""
    return ProviderConfig(
        # Placeholder value — llama.cpp requires no real API key.
        api_key="llamacpp",
        base_url="http://localhost:8080/v1",
        rate_limit=10,
        rate_window=60,
    )
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_rate_limiter():
    """Mock the global rate limiter to prevent waiting.

    autouse: every test in this module gets the patched limiter, so no test
    can accidentally hit the real rolling-window throttle.
    """
    with patch("providers.llamacpp.client.GlobalRateLimiter") as mock:
        instance = mock.get_instance.return_value
        # Never report the limiter as blocked.
        instance.wait_if_blocked = AsyncMock(return_value=False)

        # execute_with_retry degenerates to "call the function once" —
        # retries and backoff are out of scope for these tests.
        async def _passthrough(fn, *args, **kwargs):
            return await fn(*args, **kwargs)

        instance.execute_with_retry = AsyncMock(side_effect=_passthrough)
        yield instance
|
||||
|
||||
|
||||
@pytest.fixture
def llamacpp_provider(llamacpp_config):
    """A LlamaCppProvider built from the default local config fixture."""
    return LlamaCppProvider(llamacpp_config)
|
||||
|
||||
|
||||
def test_init(llamacpp_config):
|
||||
"""Test provider initialization."""
|
||||
with patch("httpx.AsyncClient"):
|
||||
provider = LlamaCppProvider(llamacpp_config)
|
||||
assert provider._base_url == "http://localhost:8080/v1"
|
||||
assert provider._provider_name == "LLAMACPP"
|
||||
|
||||
|
||||
def test_init_uses_configurable_timeouts():
|
||||
"""Test that provider passes configurable read/write/connect timeouts to client."""
|
||||
config = ProviderConfig(
|
||||
api_key="llamacpp",
|
||||
base_url="http://localhost:8080/v1",
|
||||
http_read_timeout=600.0,
|
||||
http_write_timeout=15.0,
|
||||
http_connect_timeout=5.0,
|
||||
)
|
||||
with patch("httpx.AsyncClient") as mock_client:
|
||||
LlamaCppProvider(config)
|
||||
call_kwargs = mock_client.call_args[1]
|
||||
timeout = call_kwargs["timeout"]
|
||||
assert timeout.read == 600.0
|
||||
assert timeout.write == 15.0
|
||||
assert timeout.connect == 5.0
|
||||
|
||||
|
||||
def test_init_base_url_strips_trailing_slash():
|
||||
"""Config with base_url trailing slash is stored without it."""
|
||||
config = ProviderConfig(
|
||||
api_key="llamacpp",
|
||||
base_url="http://localhost:8080/v1/",
|
||||
rate_limit=10,
|
||||
rate_window=60,
|
||||
)
|
||||
with patch("httpx.AsyncClient"):
|
||||
provider = LlamaCppProvider(config)
|
||||
assert provider._base_url == "http://localhost:8080/v1"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_response(llamacpp_provider):
|
||||
"""Test streaming native Anthropic response."""
|
||||
req = MockRequest()
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
|
||||
async def mock_aiter_lines():
|
||||
yield "event: message_start"
|
||||
yield 'data: {"type":"message_start","message":{}}'
|
||||
yield ""
|
||||
yield "event: content_block_delta"
|
||||
yield 'data: {"type":"content_block_delta","delta":{"type":"text_delta","text":"Hello World"}}'
|
||||
yield ""
|
||||
yield "event: message_stop"
|
||||
yield 'data: {"type":"message_stop"}'
|
||||
yield ""
|
||||
|
||||
mock_response.aiter_lines = mock_aiter_lines
|
||||
|
||||
with (
|
||||
patch.object(
|
||||
llamacpp_provider._client, "build_request", return_value=MagicMock()
|
||||
) as mock_build,
|
||||
patch.object(
|
||||
llamacpp_provider._client,
|
||||
"send",
|
||||
new_callable=AsyncMock,
|
||||
return_value=mock_response,
|
||||
),
|
||||
):
|
||||
events = [e async for e in llamacpp_provider.stream_response(req)]
|
||||
|
||||
# Verify request construction
|
||||
mock_build.assert_called_once()
|
||||
args, kwargs = mock_build.call_args
|
||||
assert args[0] == "POST"
|
||||
assert args[1] == "/messages"
|
||||
assert kwargs["json"]["model"] == "llamacpp-community/qwen2.5-7b-instruct"
|
||||
# Verify internal fields are popped
|
||||
assert "extra_body" not in kwargs["json"]
|
||||
assert kwargs["json"]["max_tokens"] == 100
|
||||
|
||||
# Verify internal ThinkingConfig is mapped to Anthropic API format
|
||||
assert kwargs["json"]["thinking"] == {"type": "enabled"}
|
||||
|
||||
# Verify events yielded correctly
|
||||
assert len(events) == 9
|
||||
assert events[0] == "event: message_start\n"
|
||||
assert events[1] == 'data: {"type":"message_start","message":{}}\n'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_response_adds_max_tokens_if_missing(llamacpp_provider):
|
||||
"""Fallback max_tokens to 81920 if not present."""
|
||||
req = MockRequest()
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
|
||||
async def empty_aiter():
|
||||
if False:
|
||||
yield ""
|
||||
|
||||
mock_response.aiter_lines = empty_aiter
|
||||
|
||||
with (
|
||||
patch.object(req, "model_dump", return_value={"model": "test"}),
|
||||
patch.object(llamacpp_provider._client, "build_request") as mock_build,
|
||||
patch.object(
|
||||
llamacpp_provider._client,
|
||||
"send",
|
||||
new_callable=AsyncMock,
|
||||
return_value=mock_response,
|
||||
),
|
||||
):
|
||||
# Just run the generator to completion
|
||||
[e async for e in llamacpp_provider.stream_response(req)]
|
||||
|
||||
_, kwargs = mock_build.call_args
|
||||
assert kwargs["json"]["max_tokens"] == 81920
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_error_status_code(llamacpp_provider):
|
||||
"""Non-200 status code raises an error that gets caught and yielded as an SSE API error."""
|
||||
req = MockRequest()
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 500
|
||||
mock_response.aread = AsyncMock(return_value=b"Internal Server Error")
|
||||
mock_response.raise_for_status = MagicMock(
|
||||
side_effect=httpx.HTTPStatusError(
|
||||
"Internal Server Error", request=MagicMock(), response=mock_response
|
||||
)
|
||||
)
|
||||
|
||||
with (
|
||||
patch.object(
|
||||
llamacpp_provider._client, "build_request", return_value=MagicMock()
|
||||
),
|
||||
patch.object(
|
||||
llamacpp_provider._client,
|
||||
"send",
|
||||
new_callable=AsyncMock,
|
||||
return_value=mock_response,
|
||||
),
|
||||
):
|
||||
events = [
|
||||
e
|
||||
async for e in llamacpp_provider.stream_response(req, request_id="TEST_ID")
|
||||
]
|
||||
|
||||
assert len(events) == 1
|
||||
assert events[0].startswith("event: error\ndata: {")
|
||||
assert "Internal Server Error" in events[0]
|
||||
assert "TEST_ID" in events[0]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_network_error(llamacpp_provider):
|
||||
"""Network errors are caught and yielded as SSE API error events."""
|
||||
req = MockRequest()
|
||||
|
||||
with (
|
||||
patch.object(
|
||||
llamacpp_provider._client, "build_request", return_value=MagicMock()
|
||||
),
|
||||
patch.object(
|
||||
llamacpp_provider._client,
|
||||
"send",
|
||||
new_callable=AsyncMock,
|
||||
side_effect=httpx.ConnectError("Connection refused"),
|
||||
),
|
||||
):
|
||||
events = [
|
||||
e
|
||||
async for e in llamacpp_provider.stream_response(req, request_id="TEST_ID2")
|
||||
]
|
||||
|
||||
assert len(events) == 1
|
||||
assert events[0].startswith("event: error\ndata: {")
|
||||
assert "Connection refused" in events[0]
|
||||
assert "TEST_ID2" in events[0]
|
||||