Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-13 02:19:41 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .devops/full-cuda.Dockerfile
#   .devops/nix/devshells.nix
#   .devops/nix/nixpkgs-instances.nix
#   .devops/nix/package.nix
#   .devops/nix/scope.nix
#   README.md
#   docs/docker.md
#   examples/llama-bench/llama-bench.cpp
#   flake.lock
#   flake.nix
#   grammars/README.md
#   src/llama.cpp
Commit 73dca7e5bc: 24 changed files with 2747 additions and 666 deletions.
@@ -66,6 +66,7 @@ extern "C" {
         LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
         LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
     };

     // pre-tokenization types
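
The hunk above adds the RWKV entry to enum llama_vocab_type in the llama.h API header. A minimal sketch of how a caller might branch on the reported tokenizer family, assuming a model already loaded via llama_load_model_from_file() and the llama_vocab_type() accessor from the same header; the helper name print_vocab_type is hypothetical:

#include <stdio.h>
#include "llama.h"

// Hypothetical helper: report which tokenizer family a loaded model uses,
// including the newly added RWKV greedy tokenizer.
static void print_vocab_type(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_BPE:  printf("byte-level BPE (GPT-2 style)\n"); break;
        case LLAMA_VOCAB_TYPE_WPM:  printf("WordPiece (BERT style)\n");       break;
        case LLAMA_VOCAB_TYPE_UGM:  printf("Unigram (T5 style)\n");           break;
        case LLAMA_VOCAB_TYPE_RWKV: printf("greedy RWKV tokenizer\n");        break;
        default:                    printf("other vocab type\n");             break;
    }
}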
@@ -267,9 +268,9 @@ extern "C" {
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs

         // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
+        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_MODE_LAYER: ignored
         int32_t main_gpu;

         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
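
This hunk only renames the constants referenced in the comments above int32_t main_gpu; the fields themselves are unchanged and are set through the public model parameters. A minimal usage sketch, assuming the post-merge LLAMA_SPLIT_MODE_* names; "model.gguf" is a placeholder path:

#include "llama.h"

int main(void) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // do not split across GPUs
    mparams.main_gpu     = 0;  // with LLAMA_SPLIT_MODE_NONE: the GPU used for the entire model
    mparams.n_gpu_layers = 99; // request offload of all layers

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    int ok = model != NULL;
    if (ok) {
        llama_free_model(model);
    }

    llama_backend_free();
    return ok ? 0 : 1;
}

With LLAMA_SPLIT_MODE_LAYER, main_gpu is ignored and the per-GPU proportions come from the tensor_split array mentioned in the last context line.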