Mirror of https://github.com/LostRuins/koboldcpp.git

commit 6bb44391bd

Merge commit '5c86c9ed3e' into concedo_experimental

# Conflicts:
#	tools/imatrix/imatrix.cpp
#	tools/mtmd/README.md
#	tools/run/README.md
#	tools/run/run.cpp

7 changed files with 47 additions and 21 deletions
```diff
@@ -2628,6 +2628,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"-pps"},
         string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
```
```diff
@@ -405,6 +405,7 @@ struct common_params {
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
```
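The two hunks above add a new imatrix flag and its backing field. The sketch below is a minimal, self-contained illustration of the builder-style registration pattern they use; `params_t` and `arg_def` are simplified stand-ins, not the real `common_params`/`common_arg` types from llama.cpp.

```cpp
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct params_t {
    bool parse_special = false; // mirrors the new common_params field
};

struct arg_def {
    std::vector<std::string>       flags;
    std::string                    help;
    std::function<void(params_t&)> handler;
};

int main() {
    params_t params;
    std::vector<arg_def> options;

    // register a boolean flag whose help text reports the current default,
    // mirroring string_format(..., params.parse_special ? "true" : "false")
    options.push_back({
        {"--parse-special"},
        std::string("parse special tokens (chat, tool, etc) (default: ") +
            (params.parse_special ? "true" : "false") + ")",
        [](params_t & p) { p.parse_special = true; },
    });

    // simulate encountering the flag on the command line
    const std::string seen = "--parse-special";
    for (const auto & opt : options) {
        for (const auto & flag : opt.flags) {
            if (flag == seen) {
                opt.handler(params);
            }
        }
    }
    std::printf("parse_special = %s\n", params.parse_special ? "true" : "false");
    return 0;
}
```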
```diff
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;
 
     if (i00 >= ne00) {
         return;
```
```diff
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;
 
     if (i00 >= ne00) {
         return;
```
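Both kernels fold the (i11, i12) batch pair into a single grid dimension and recover it with division and modulo. A minimal, self-contained sketch of that decoding, with example dims:

```cpp
#include <cstdio>

int main() {
    const int ne11 = 3, ne12 = 4; // example batch dims
    // blockIdx.z enumerates all ne11*ne12 (i11, i12) pairs
    for (int z = 0; z < ne11 * ne12; ++z) {
        const int i11 = z / ne12; // outer batch index
        const int i12 = z % ne12; // inner batch index
        std::printf("blockIdx.z = %2d -> i11 = %d, i12 = %d\n", z, i11, i12);
    }
    return 0;
}
```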
```diff
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);
 
     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);
```
```diff
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);
 
     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);
```
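These two launch-side hunks are the counterpart of the kernel swap: the row count `ne10` moves from grid y (capped at 65535) to grid x (up to 2^31 - 1), and the per-row column-block count takes its place on y. A self-contained sketch of the arithmetic, with example sizes (the limits quoted are the CUDA grid-dimension maxima for compute capability 3.0+):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00 = 8192;    // row width in elements (example value)
    const int64_t ne10 = 200000;  // rows to gather (example value)
    const int64_t bs   = 256;     // stand-in for CUDA_GET_ROWS_BLOCK_SIZE

    const int64_t block_num = (ne00 + bs - 1) / bs; // column blocks per row

    // old launch: block_nums = (block_num, ne10, ...); ne10 sits on grid y,
    // which is capped at 65535, so 200000 rows would fail to launch
    std::printf("grid y = %lld -> %s\n", (long long) ne10,
                ne10 <= 65535 ? "ok" : "exceeds 65535");

    // new launch: block_nums = (ne10, block_num, ...); ne10 sits on grid x,
    // which allows up to 2^31 - 1 blocks
    std::printf("grid x = %lld -> %s\n", (long long) ne10,
                ne10 <= 2147483647LL ? "ok" : "exceeds 2^31 - 1");

    std::printf("column blocks (now on y) = %lld\n", (long long) block_num);
    return 0;
}
```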
```diff
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
```
```diff
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        // https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
```
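The only rendering difference between v7 and v7-tekken is the space after each opening tag. A self-contained sketch that ports the loop above to plain C++ (`render` and the pair-based chat layout are simplifications for illustration, not llama.cpp API):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

static std::string render(const std::vector<std::pair<std::string, std::string>> & chat,
                          bool tekken) {
    // v7 keeps a space after the opening tag; v7-tekken drops it
    const char * trailing_space = tekken ? "" : " ";
    std::ostringstream ss;
    for (const auto & [role, content] : chat) {
        if (role == "system") {
            ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
        } else if (role == "user") {
            ss << "[INST]" << trailing_space << content << "[/INST]";
        } else { // assistant
            ss << trailing_space << content << "</s>";
        }
    }
    return ss.str();
}

int main() {
    const std::vector<std::pair<std::string, std::string>> chat = {
        {"system", "You are helpful."}, {"user", "Hi"}, {"assistant", "Hello!"},
    };
    std::cout << "v7:        " << render(chat, false) << "\n";
    std::cout << "v7-tekken: " << render(chat, true)  << "\n";
    return 0;
}
```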
```diff
@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
```
```diff
@@ -13490,6 +13490,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }
 
```
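A minimal sketch of the fallback pattern this hunk introduces: when no chat template is embedded in the GGUF metadata, match stable model fingerprints (pre-tokenizer type, layer count) and return a known-good template name. The enum and struct below are stand-ins, not real llama.cpp types:

```cpp
#include <cstdio>

// stand-ins for llama_vocab_pre_type values; not the real enum
enum vocab_pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_TEKKEN };

struct model_info {
    vocab_pre_type pre_type;
    int            n_layers;
    bool           has_builtin_template;
};

const char * chat_template_for(const model_info & m) {
    if (m.has_builtin_template) {
        return "<gguf-embedded template>"; // real code returns the stored template
    }
    // one-off fix: Mistral-Small-2503 ships a TEKKEN pre-tokenizer and 40 layers
    if (m.pre_type == PRE_TYPE_TEKKEN && m.n_layers == 40) {
        return "mistral-v7-tekken";
    }
    return nullptr;
}

int main() {
    const model_info small_2503 = { PRE_TYPE_TEKKEN, 40, false };
    const model_info unknown    = { PRE_TYPE_DEFAULT, 32, false };
    std::printf("small-2503 -> %s\n", chat_template_for(small_2503));
    const char * t = chat_template_for(unknown);
    std::printf("unknown    -> %s\n", t ? t : "(none)");
    return 0;
}
```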
```diff
@@ -554,14 +554,19 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.resize(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * n_pos_per_embd);
         if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             // offset 2 will give us dst: 34...34...34...34...
             for (int i = 0; i < n_pos_per_embd; i++) {
-                auto src = pos.begin() + i * batch.n_tokens + offset;
-                pos_view.insert(pos_view.end(), src, src + n_tokens);
+                // assume n_tokens is less than or equal to batch.n_tokens
+                // batch.n_tokens is number of **total** tokens
+                // n_tokens is number of viewed token
+                size_t src_idx = i * batch.n_tokens + offset;
+                pos_view.insert(pos_view.end(),
+                        pos.data() + src_idx,
+                        pos.data() + src_idx + n_tokens);
             }
             pos_ptr = pos_view.data();
         } else {
```
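A self-contained sketch of the M-RoPE slicing above: positions live in `n_pos_per_embd` consecutive sections of `batch.n_tokens` entries each, and the view copies `[offset, offset + n_tokens)` out of every section, reproducing the `1234... -> 34...` example in the comments (plain `int` vectors stand in for `llama_pos`):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_pos_per_embd = 4;
    const int batch_n_tokens = 4;       // total tokens (batch.n_tokens)
    const int offset = 2, n_tokens = 2; // viewed window

    // src layout: 1234 1234 1234 1234
    std::vector<int> pos;
    for (int i = 0; i < n_pos_per_embd; ++i)
        for (int t = 1; t <= batch_n_tokens; ++t)
            pos.push_back(t);

    std::vector<int> pos_view;
    pos_view.reserve((size_t) n_tokens * n_pos_per_embd);
    for (int i = 0; i < n_pos_per_embd; ++i) {
        // copy the viewed window out of each section
        const size_t src_idx = (size_t) i * batch_n_tokens + offset;
        pos_view.insert(pos_view.end(),
                pos.begin() + src_idx,
                pos.begin() + src_idx + n_tokens);
    }

    for (int v : pos_view) std::printf("%d", v); // prints 34343434
    std::printf("\n");
    return 0;
}
```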