Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.gitignore
#	CMakeLists.txt
#	flake.lock
#	llama.cpp
commit 8b29d5f848

4 changed files with 35 additions and 13 deletions
@@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
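This hunk extends the backend auto-selection in create_backend() (presumably the RPC server example, given the ggml-rpc.h include above) with a SYCL branch that mirrors the existing CUDA and Metal ones. For orientation, the surrounding function tries whichever GPU backend was compiled in and otherwise falls back to CPU; a condensed sketch of that pattern, showing only the SYCL branch and the fallback (the real function also handles CUDA and Metal), could look like this:

#include <cstdio>
#include "ggml-backend.h"
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif

static ggml_backend_t create_backend() {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_SYCL
    fprintf(stderr, "%s: using SYCL backend\n", __func__);
    backend = ggml_backend_sycl_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
    }
#endif
    if (!backend) {
        // if there aren't GPU backends, fall back to the CPU backend
        fprintf(stderr, "%s: using CPU backend\n", __func__);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}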
@@ -377,7 +377,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
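This hunk (in ggml_gallocr_new_n, ggml's graph allocator) fixes an over-allocation rather than a crash: calloc(count, size) already multiplies its two arguments and zero-initializes the result, so passing sizeof(ggml_backend_buffer_t) * n_bufs as the element size requested n_bufs * n_bufs slots instead of n_bufs. A standalone illustration with a hypothetical count of 8 (not taken from the repository):

#include <cstdlib>

int main() {
    // calloc multiplies count by element size itself, so the second argument
    // should be the size of ONE element, not count * element size.
    void **right = static_cast<void **>(calloc(8, sizeof(void *)));     // room for 8 pointers
    void **wrong = static_cast<void **>(calloc(8, sizeof(void *) * 8)); // room for 64 pointers (over-allocation)
    free(right);
    free(wrong);
    return 0;
}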
@@ -22,6 +22,7 @@
 #include "shaderop_mul_mat_q4_1.h"
 #include "shaderop_mul_mat_q6_k.h"
 #include "shaderop_mul_mat_mat_f32.h"
+#include "shaderop_getrows_f32.h"
 #include "shaderop_getrows_f16.h"
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
@@ -1146,6 +1147,14 @@ static void ggml_vk_get_rows(
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
+template <typename... Args>
+static void ggml_vk_get_rows_f32(Args&&... args) {
+    const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
+        kp::shader_data::op_getrows_f32_comp_spv_len);
+
+    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+}
+
 template <typename... Args>
 static void ggml_vk_get_rows_f16(Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
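The new ggml_vk_get_rows_f32 follows the same idiom as the existing f16 and q4 wrappers in the Kompute backend: a thin variadic template that binds the type-specific SPIR-V blob and element size, then perfect-forwards the remaining arguments to the shared ggml_vk_get_rows. A stripped-down sketch of that idiom with placeholder names (Spirv, dispatch_get_rows and dispatch_get_rows_f32 are illustrative, not repository symbols):

#include <cstdio>
#include <cstdint>
#include <utility>
#include <vector>

using Spirv = std::vector<uint32_t>; // stand-in for a compiled shader blob

// Shared implementation: takes the shader, a type label and the element size
// up front, then whatever per-dispatch arguments the caller supplies.
template <typename... Args>
static void dispatch_get_rows(const Spirv &spirv, const char *type, size_t element_size, Args &&...args) {
    std::printf("get_rows<%s>: elem size %zu, %zu forwarded args\n", type, element_size, sizeof...(args));
    (void)spirv;
}

// Per-dtype wrapper in the same shape as ggml_vk_get_rows_f32: bind the f32
// shader and element size once, perfect-forward everything else.
template <typename... Args>
static void dispatch_get_rows_f32(Args &&...args) {
    static const Spirv spirv = {}; // would be the op_getrows_f32 SPIR-V in the real code
    dispatch_get_rows(spirv, "f32", sizeof(float), std::forward<Args>(args)...);
}

int main() {
    dispatch_get_rows_f32(1, 2, 3); // placeholders for sequence, tensor ids, offsets, ...
    return 0;
}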
@@ -1371,6 +1380,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             return op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
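Listing GGML_TYPE_F32 here lets ggml_vk_supports_op report that GET_ROWS on f32 tensors can be offloaded; the actual routing to the new shader happens in the compute dispatch below.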
@@ -1661,7 +1671,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_GET_ROWS:
                 {
-                    if (src0t == GGML_TYPE_F16) {
+                    if (src0t == GGML_TYPE_F32) {
+                        ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                    } else if (src0t == GGML_TYPE_F16) {
                         ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                     } else if (src0t == GGML_TYPE_Q4_0) {
                         ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
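Together with the supports_op change above, this completes f32 support for GGML_OP_GET_ROWS in the Kompute backend. For readers unfamiliar with the op: it gathers whole rows of src0 using the integer indices in src1, which is how embedding lookups are expressed in ggml. A scalar reference of the f32 case, assuming contiguous rows and using illustrative names:

#include <cstdint>
#include <cstring>

// dst[i, :] = src0[rows[i], :] for i in [0, n_rows_out)
static void get_rows_f32_ref(const float *src0, const int32_t *rows, float *dst,
                             int64_t n_rows_out, int64_t row_size) {
    for (int64_t i = 0; i < n_rows_out; ++i) {
        std::memcpy(dst + i * row_size, src0 + rows[i] * row_size,
                    sizeof(float) * static_cast<size_t>(row_size));
    }
}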
llama.cpp (20 changed lines)
@@ -2190,8 +2190,7 @@ struct llama_vocab {
     std::vector<token_data> id_to_token;
 
     std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
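The two piece caches collapse into one: every token's piece is now cached once, rendered with special = true, and the special = false behaviour is reconstructed at lookup time (see the llama_token_to_piece hunk below) instead of being stored in a second, largely duplicated vector.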
@@ -4908,23 +4907,19 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
 
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
 
-        std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
 
         std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
@@ -18638,9 +18633,14 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
         return llama_token_to_piece_old(model, token, buf, length);
     }
 
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
+
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
             const auto & res = cache.at(token);
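With this change, a control token renders as an empty piece when special == false, and the single cache is consulted unconditionally otherwise. A hedged usage sketch of the public call, assuming the usual llama.cpp convention that a negative return value reports the required buffer size (the helper name and initial buffer size are illustrative):

#include <string>
#include "llama.h"

// Hypothetical helper: render one token as text, growing the buffer if needed.
static std::string piece_of(const llama_model *model, llama_token tok, bool special) {
    std::string buf(8, '\0');
    int32_t n = llama_token_to_piece(model, tok, buf.data(), (int32_t)buf.size(), special);
    if (n < 0) {               // assumed convention: -n is the size actually required
        buf.resize((size_t)-n);
        n = llama_token_to_piece(model, tok, buf.data(), (int32_t)buf.size(), special);
    }
    buf.resize(n > 0 ? (size_t)n : 0); // control tokens come back empty when special == false
    return buf;
}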