Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-15 19:39:42 +00:00)
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	Package.swift
#	flake.lock
#	tests/test-chat-template.cpp

commit 02892c5cbd
4 changed files with 2762 additions and 2793 deletions
File diff suppressed because it is too large; the partially loaded hunks are shown below (the file name was not captured in this view).
@@ -51,26 +51,6 @@
     margin-bottom: 0.5em;
 }
-
-button, input, textarea, .button, a.button, select {
-    color: #666;
-    border: 1px solid #ddd;
-    border-radius: 4px;
-    line-height: 1.5em;
-    padding: 0.25em 0.25em;
-    text-decoration: none;
-    font-size: 1.1rem;
-}
-
-button {
-    border: 1px solid #2a8aad;
-    background: #3584e4;
-    font-weight: normal;
-    color: #fff;
-}
-button:disabled {
-    background: #9cbce5;
-}
 
 #write form {
     margin: 1em 0 0 0;
     display: flex;
@@ -587,7 +567,7 @@
             runCompletion();
         }
         return html`
-            <div class="right">
+            <div>
                 <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
                 <button onclick=${stop} disabled=${!generating.value}>Stop</button>
                 <button onclick=${reset}>Reset</button>
ggml-sycl.cpp (102 changed lines)
@@ -15996,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-#if 0
-    ggml_sycl_mul_mat_id_sycl(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+                "mul_mat_id does not support split buffers");
+    const ggml_tensor *ids = dst->src[2];
 
     const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
 
-    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
-        // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  =  dst->nb[1];
 
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra  = (const ggml_tensor_extra_gpu *)  dst->extra;
+    const int32_t id = ((int32_t *)dst->op_params)[0];
+    const int32_t n_as = src0->ne[2];
 
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    const char *ids_dev = (const char *)ids->data;
+
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
+
+    const ggml_tensor_extra_gpu *src0_extra =
+        (const ggml_tensor_extra_gpu *)src0->extra;
+    const ggml_tensor_extra_gpu *src1_extra =
+        (const ggml_tensor_extra_gpu *)src1->extra;
+    const ggml_tensor_extra_gpu *dst_extra =
+        (const ggml_tensor_extra_gpu *)dst->extra;
+
+    ggml_tensor_extra_gpu src0_row_extra;
     ggml_tensor_extra_gpu src1_row_extra;
     ggml_tensor_extra_gpu dst_row_extra;
 
+    ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
     src1_row.backend = GGML_BACKEND_TYPE_GPU;
     dst_row.backend = GGML_BACKEND_TYPE_GPU;
 
+    src0_row.extra = &src0_row_extra;
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src0->data
+                              : (char *)src0_extra->data_device[g_main_device];
+    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src1->data
+                              : (char *)src1_extra->data_device[g_main_device];
+    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+                             ? (char *)dst->data
+                             : (char *)dst_extra->data_device[g_main_device];
+
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];
 
     if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
-
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
-            //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            const int32_t row_id =
+                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+                                   id * ids->nb[0]);
 
             GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+            src1_row_extra.data_device[g_main_device] =
+                src1_original + i01 * src1->nb[1];
+            dst_row_extra.data_device[g_main_device] =
+                dst_original + i01 * dst->nb[1];
 
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
         }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16072,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
             int64_t num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                 const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16086,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11).wait()));
+                                   src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
             }
 
@@ -16094,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                 continue;
             }
 
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+
             src1_row.ne[1] = num_src1_rows;
             dst_row.ne[1] = num_src1_rows;
 
@@ -16105,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
 
             num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16119,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
                 SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1).wait()));
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
             }
         }
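The net effect of the ggml-sycl.cpp rewrite: expert weights are no longer fetched as separate tensors from dst->src[row_id + 2]; instead src0 holds all experts stacked along dimension 2, and a reusable 2-D view (src0_row) is rebased onto the selected expert's slice using the byte stride src0->nb[2]. Below is a minimal standalone sketch of that indexing idea; the Tensor struct and all names here are hypothetical stand-ins for ggml's ne[]/nb[] fields, not the ggml API.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for ggml_tensor: per-dimension element counts (ne)
// and byte strides (nb), plus a raw data pointer.
struct Tensor {
    int64_t ne[3];   // ne[0] = cols, ne[1] = rows, ne[2] = experts
    size_t  nb[3];   // byte stride of each dimension
    char   *data;
};

int main() {
    const int64_t cols = 4, rows = 3, experts = 2;
    std::vector<float> buf(cols * rows * experts);
    for (size_t i = 0; i < buf.size(); ++i) buf[i] = (float)i;

    Tensor src0 = {
        {cols, rows, experts},
        {sizeof(float), cols * sizeof(float), (size_t)(cols * rows) * sizeof(float)},
        (char *)buf.data(),
    };

    // As in the diff: one reusable 2-D "row view" of src0, rebased per
    // selected expert, instead of one pre-built tensor per expert.
    Tensor src0_row = src0;
    src0_row.ne[2] = 1;

    for (int row_id = 0; row_id < (int)experts; ++row_id) {
        src0_row.data = src0.data + row_id * src0.nb[2]; // expert slice
        const float *first = (const float *)src0_row.data;
        printf("expert %d starts at value %.1f\n", row_id, first[0]);
    }
    return 0;
}

The same rebasing appears in the batched (else) branch of the diff, where src0_row_extra.data_device is pointed at the expert slice once per row_id, so each expert gets a single matrix multiply over its gathered rows.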
llama.cpp (22 changed lines)
@@ -13378,6 +13378,11 @@ struct llama_beam_search_data {
         }
         llama_logit_info logit_info(ctx);
         std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+
+        // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
+        // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         size_t i=0;
         if (next_beams.size() < n_beams) {
             for (; next_beams.size() < n_beams ; ++i) {
@@ -15778,6 +15783,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
                 GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
                 ctx->output_ids[id] = i;
             }
+
+            ctx->n_outputs = n_outputs;
         }
     }
 
@@ -16946,6 +16953,21 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
+    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+        // CohereForAI/c4ai-command-r-plus
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "user") {
+                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "assistant") {
+                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+        }
     } else {
         // template not supported
         return -1;
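For reference, the new command-r branch renders a chat as one flat string of turn tokens. A self-contained sketch of that formatting follows; the Message struct and the local trim() are hypothetical stand-ins for llama.cpp's llama_chat_message and its internal helper.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct Message { std::string role, content; };

// Stand-in for llama.cpp's internal trim(): strip leading/trailing whitespace.
static std::string trim(const std::string &s) {
    const auto b = s.find_first_not_of(" \t\n");
    if (b == std::string::npos) return "";
    const auto e = s.find_last_not_of(" \t\n");
    return s.substr(b, e - b + 1);
}

int main() {
    std::vector<Message> chat = {
        {"system", "You are a helpful assistant."},
        {"user", "Hello!"},
    };
    const bool add_ass = true; // append the assistant generation prompt

    // Same wrapping logic as the command-r branch added in the diff above.
    std::ostringstream ss;
    for (const auto &m : chat) {
        if (m.role == "system") {
            ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(m.content) << "<|END_OF_TURN_TOKEN|>";
        } else if (m.role == "user") {
            ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(m.content) << "<|END_OF_TURN_TOKEN|>";
        } else if (m.role == "assistant") {
            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(m.content) << "<|END_OF_TURN_TOKEN|>";
        }
    }
    if (add_ass) {
        ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
    }
    std::cout << ss.str() << "\n";
    return 0;
}

For this two-message chat with add_ass enabled, the output ends in an opening <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>, leaving the model to generate the assistant turn.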