diff --git a/.github/workflows/build-3rd-party.yml b/.github/workflows/build-3rd-party.yml
index 642d97864..82e53dbaf 100644
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@@ -22,9 +22,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-llguidance:
diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml
index 99c3659b9..713ccdc7f 100644
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -27,9 +27,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
android:
diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml
index 3cc384850..d2c99d0d5 100644
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -32,9 +32,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
macos-latest-arm64:
diff --git a/.github/workflows/build-cann.yml b/.github/workflows/build-cann.yml
index b6acbbf9e..6d76ed499 100644
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
diff --git a/.github/workflows/build-hip.yml b/.github/workflows/build-hip.yml
index edafdb939..ff8283ae7 100644
--- a/.github/workflows/build-hip.yml
+++ b/.github/workflows/build-hip.yml
@@ -31,9 +31,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
diff --git a/.github/workflows/build-ibm.yml b/.github/workflows/build-ibm.yml
index 1742894b9..d2e4f3cda 100644
--- a/.github/workflows/build-ibm.yml
+++ b/.github/workflows/build-ibm.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml
index 57cec7c16..8214f2b8d 100644
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -15,9 +15,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
windows-msys2:
diff --git a/.github/workflows/build-opencl.yml b/.github/workflows/build-opencl.yml
index 0f8cbe0fc..fccb06b88 100644
--- a/.github/workflows/build-opencl.yml
+++ b/.github/workflows/build-opencl.yml
@@ -30,9 +30,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml
index fdf96356e..47e04869c 100644
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
diff --git a/.github/workflows/build-riscv.yml b/.github/workflows/build-riscv.yml
index d3c8f8ed3..c12aaa61f 100644
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-cpu-riscv64-native:
diff --git a/.github/workflows/build-rpc.yml b/.github/workflows/build-rpc.yml
index 3c52d7e96..c1ff98770 100644
--- a/.github/workflows/build-rpc.yml
+++ b/.github/workflows/build-rpc.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
diff --git a/.github/workflows/build-sanitize.yml b/.github/workflows/build-sanitize.yml
index 916560b84..29f7a2922 100644
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -22,9 +22,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ctest:
diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index 89ca03f27..d40de48d9 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -50,9 +50,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ggml-ci-nvidia-cuda:
diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml
index f38533f3b..b0697f2f2 100644
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -29,9 +29,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml
index eb6d02680..b44f08c6e 100644
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -31,9 +31,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-vulkan-llvmpipe:
diff --git a/.github/workflows/build-webgpu.yml b/.github/workflows/build-webgpu.yml
index ff6690ffd..c7056358c 100644
--- a/.github/workflows/build-webgpu.yml
+++ b/.github/workflows/build-webgpu.yml
@@ -30,9 +30,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
macos-latest-webgpu:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index fbf64a3f9..69f3947a2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -52,9 +52,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
build-cmake-pkg:
diff --git a/.github/workflows/hip-quality-check.yml b/.github/workflows/hip-quality-check.yml
index d00d30ed6..5d03b1772 100644
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -28,9 +28,9 @@ concurrency:
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-22-hip-quality-check:
diff --git a/.github/workflows/server-sanitize.yml b/.github/workflows/server-sanitize.yml
index 46e474672..c0817cbba 100644
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -26,10 +26,10 @@ on:
]
env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index 91e065394..b9baede58 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -29,10 +29,10 @@ on:
]
env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index dd3ff88f1..b30e33370 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -44,10 +44,10 @@ on:
]
env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml
index 64a4519c6..8a97a8284 100644
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -30,10 +30,10 @@ on:
]
env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/ui.yml b/.github/workflows/ui.yml
index b5e745718..b3712e450 100644
--- a/.github/workflows/ui.yml
+++ b/.github/workflows/ui.yml
@@ -26,10 +26,10 @@ on:
]
env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
+ LLAMA_ARG_LOG_COLORS: 1
+ LLAMA_ARG_LOG_PREFIX: 1
+ LLAMA_ARG_LOG_TIMESTAMPS: 1
+ LLAMA_ARG_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
diff --git a/ci/run.sh b/ci/run.sh
index 341008411..e4a34ff0a 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -701,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
## main
-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+export LLAMA_ARG_LOG_PREFIX=1
+export LLAMA_ARG_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
diff --git a/common/arg.cpp b/common/arg.cpp
index 3df8010a2..bdc2e9eb4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3026,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -3327,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params &, const std::string & value) {
common_log_set_file(common_log_main(), value.c_str());
}
- ).set_env("LLAMA_LOG_FILE"));
+ ).set_env("LLAMA_ARG_LOG_FILE"));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3344,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
}
}
- ).set_env("LLAMA_LOG_COLORS"));
+ ).set_env("LLAMA_ARG_LOG_COLORS"));
add_opt(common_arg(
{"-v", "--verbose", "--log-verbose"},
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3359,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.offline = true;
}
- ).set_env("LLAMA_OFFLINE"));
+ ).set_env("LLAMA_ARG_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3374,7 +3374,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
- ).set_env("LLAMA_LOG_VERBOSITY"));
+ ).set_env("LLAMA_ARG_LOG_VERBOSITY"));
add_opt(common_arg(
{"--log-prefix"},
{"--no-log-prefix"},
diff --git a/docs/autoparser.md b/docs/autoparser.md
index da2f1a3a0..33ede1a22 100644
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
- Usage: `./bin/llama-template-analysis path/to/template.jinja`
-**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
+**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`
- Shows detailed analysis steps, pattern extraction results, and generated parser structure
diff --git a/tools/cli/README.md b/tools/cli/README.md
index add4021e2..04aef0188 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -88,11 +88,11 @@
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) |
| `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)
(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)
(env: LLAMA_ARG_LOG_VERBOSITY) |
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) |
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) |
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
@@ -165,7 +165,7 @@
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) |
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) |
@@ -194,6 +194,7 @@
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)
(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices |
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index edb5da806..e8a1287f3 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -171,11 +171,11 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) |
| `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)
(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)
(env: LLAMA_ARG_LOG_VERBOSITY) |
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) |
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) |
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
diff --git a/tools/server/README.md b/tools/server/README.md
index 0b7f9f994..0d20ced87 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -105,11 +105,11 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) |
| `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) |
-| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) |
+| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) |
+| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)
(env: LLAMA_LOG_VERBOSITY) |
+| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) |
+| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)
(env: LLAMA_ARG_LOG_VERBOSITY) |
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) |
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) |
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
@@ -204,7 +204,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key
(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
+| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)
(env: LLAMA_ARG_CACHE_PROMPT) |
@@ -249,6 +249,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)
(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices |
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |