Merge commit '1c641e6aac' into concedo_experimental

# Conflicts:
#	.devops/cloud-v-pipeline
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-cli-rocm.Dockerfile
#	.devops/llama-cli-vulkan.Dockerfile
#	.devops/llama-cli.Dockerfile
#	.devops/llama-cpp-clblast.srpm.spec
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/llama-cpp.srpm.spec
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.devops/nix/apps.nix
#	.devops/nix/package.nix
#	.devops/tools.sh
#	.dockerignore
#	.github/ISSUE_TEMPLATE/01-bug-low.yml
#	.github/ISSUE_TEMPLATE/02-bug-medium.yml
#	.github/ISSUE_TEMPLATE/03-bug-high.yml
#	.github/ISSUE_TEMPLATE/04-bug-critical.yml
#	.github/workflows/bench.yml
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/server.yml
#	.gitignore
#	Makefile
#	README-sycl.md
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.nix
#	grammars/README.md
#	pocs/vdot/CMakeLists.txt
#	scripts/get-hellaswag.sh
#	scripts/get-wikitext-103.sh
#	scripts/get-wikitext-2.sh
#	scripts/get-winogrande.sh
#	scripts/hf.sh
#	scripts/pod-llama.sh
#	scripts/qnt-all.sh
#	scripts/run-all-ppl.sh
#	scripts/run-with-preset.py
#	scripts/server-llm.sh
#	tests/test-backend-ops.cpp
Commit b53e760557 by Concedo, 2024-06-14 18:41:37 +08:00
94 changed files with 457 additions and 317 deletions

View file

@@ -1,4 +1,4 @@
-set(TARGET server)
+set(TARGET llama-server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

View file

@@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co
 ## Build
-`server` is built alongside everything else from the root of the project
+`llama-server` is built alongside everything else from the root of the project
 - Using `make`:
 ```bash
-make server
+make llama-server
 ```
 - Using `CMake`:
 ```bash
 cmake -B build
-cmake --build build --config Release -t server
+cmake --build build --config Release -t llama-server
 ```
-Binary is at `./build/bin/server`
+Binary is at `./build/bin/llama-server`
 ## Build with SSL
-`server` can also be built with SSL support using OpenSSL 3
+`llama-server` can also be built with SSL support using OpenSSL 3
 - Using `make`:
@@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co
 # NOTE: For non-system openssl, use the following:
 # CXXFLAGS="-I /path/to/openssl/include"
 # LDFLAGS="-L /path/to/openssl/lib"
-make LLAMA_SERVER_SSL=true server
+make LLAMA_SERVER_SSL=true llama-server
 ```
 - Using `CMake`:
 ```bash
 cmake -B build -DLLAMA_SERVER_SSL=ON
-cmake --build build --config Release -t server
+cmake --build build --config Release -t llama-server
 ```
 ## Quick Start
@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.)
 ```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
 ```
 ### Windows
 ```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
 The above command will start a server that by default listens on `127.0.0.1:8080`.
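
For readers following the rename, a minimal Python sketch of querying the server started by the Quick Start command is shown below. It assumes the default `127.0.0.1:8080` address and the server's `/completion` endpoint with `prompt` and `n_predict` fields; adjust it if your build exposes something different.

```python
# Minimal sketch: post a completion request to a running llama-server.
# Assumes the Quick Start command above is listening on the default 127.0.0.1:8080.
import json
import urllib.request

payload = {"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}
req = urllib.request.Request(
    "http://127.0.0.1:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])  # generated text is returned in "content"
```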
@@ -629,11 +629,11 @@ bash chat.sh
 ### OAI-like API
-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
 ### API errors
-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
 Example of an error:

View file

@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
 It aims to be used in the CI, but you can run it manually:
 ```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
     --runner-label local \
     --name local \
     --branch `git rev-parse --abbrev-ref HEAD` \

View file

@@ -245,7 +245,7 @@ def start_server(args):
 def start_server_background(args):
     # Start the server
-    server_path = '../../../build/bin/server'
+    server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_args = [

View file

@@ -44,12 +44,12 @@ http module.
 ### running using examples/server
-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
 ### running using python3's server module
 first run examples/server
-* bin/server -m path/model.gguf
+* ./llama-server -m path/model.gguf
 next run this web front end in examples/server/public_simplechat
 * cd ../examples/server/public_simplechat
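
As a companion to the "python3's server module" route above, here is a minimal sketch of serving the front end from a small script; the directory and port are assumptions, and running `python3 -m http.server` from inside `examples/server/public_simplechat` should have the same effect.

```python
# Minimal sketch: serve the simplechat front end with Python's built-in HTTP server.
# The directory and port below are assumptions; adjust to where the front end lives.
import functools
import http.server
import socketserver

handler = functools.partial(
    http.server.SimpleHTTPRequestHandler,
    directory="examples/server/public_simplechat",  # assumed location of the front end
)
with socketserver.TCPServer(("127.0.0.1", 8001), handler) as httpd:
    print("serving simplechat on http://127.0.0.1:8001")
    httpd.serve_forever()
```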

View file

@@ -148,7 +148,7 @@ struct server_slot {
     int32_t n_prompt_tokens = 0;
     int32_t n_prompt_tokens_processed = 0;
-    std::string prompt;
+    json prompt; // can be either a string, array of strings or array of token ids
     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
@@ -823,8 +823,13 @@ struct server_context {
                    continue;
                }

+               // skip the slot if it does not contains prompt
+               if (!slot.prompt.is_string()) {
+                   continue;
+               }
+
                // current slot's prompt
-               std::string slot_prompt = slot.prompt;
+               std::string slot_prompt = slot.prompt.get<std::string>();

                // length of the current slot's prompt
                int slot_prompt_len = slot_prompt.size();
@@ -958,12 +963,12 @@ struct server_context {
                return false;
            }

-           if (prompt->is_string()) {
-               slot.prompt = prompt->get<std::string>();
-           } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
-               slot.prompt = prompt->at(0).get<std::string>();
+           if ((prompt->is_string()) ||
+               (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+               (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
+               slot.prompt = *prompt;
            } else {
-               send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
+               send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
        }
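
To make the new acceptance rule easier to read, here is a Python paraphrase of the condition above (an illustration only, not code from the repository): a slot prompt is accepted if it is a plain string, a single-element array containing a string, or a non-empty array whose first element is an integer token id; anything else triggers the error response.

```python
# Illustration only: a Python paraphrase of the C++ check above.
# "prompt" stands for the parsed JSON value of the request's "prompt" field.
def prompt_is_accepted(prompt) -> bool:
    if isinstance(prompt, str):
        return True   # plain string prompt
    if isinstance(prompt, list) and len(prompt) == 1 and isinstance(prompt[0], str):
        return True   # array holding a single string
    if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int):
        return True   # array of token ids (only the first element is inspected, as in the C++ code)
    return False      # rejected with "must be a string or an array of integers"

assert prompt_is_accepted("Hello")
assert prompt_is_accepted(["Hello"])
assert prompt_is_accepted([1, 15043, 3186])     # example token ids
assert not prompt_is_accepted(["a", "b"])       # multiple strings are not accepted by this path
```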

View file

@@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
 ```shell
 cd ../../..
-mkdir build
-cd build
-cmake -DLLAMA_CURL=ON ../
-cmake --build . --target server
+cmake -B build -DLLAMA_CURL=ON
+cmake --build build --target llama-server
 ```
 2. Start the test: `./tests.sh`
@@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables:
 | variable | description |
 |--------------------------|------------------------------------------------------------------------------------------------|
 | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
-| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
+| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
 | `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
 | `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
 | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
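
Since these are ordinary environment variables, they can also be set from a small wrapper before launching the test script. The sketch below is an illustration under assumptions: it presumes it is run from the directory containing `./tests.sh` and that the default binary path from the table applies.

```python
# Sketch: run the server test suite with a few overrides from the table above.
# Assumes the current directory is the one containing tests.sh.
import os
import subprocess

env = dict(
    os.environ,
    PORT="8081",                                              # non-default listening port
    DEBUG="ON",                                               # verbose steps and server --verbose
    LLAMA_SERVER_BIN_PATH="../../../build/bin/llama-server",  # explicit server binary path
)
subprocess.run(["./tests.sh"], env=env, check=True)
```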

View file

@@ -1272,9 +1272,9 @@ def context_text(context):
 def start_server_background(context):
     if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/server.exe'
+        context.server_path = '../../../build/bin/Release/llama-server.exe'
     else:
-        context.server_path = '../../../build/bin/server'
+        context.server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_listen_addr = context.server_fqdn