Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/bench.yml
#	.github/workflows/build.yml
#	.github/workflows/python-check-requirements.yml
#	README.md
#	docs/backend/SYCL.md
#	flake.lock
#	ggml/CMakeLists.txt
#	ggml/src/kompute-shaders/op_rope_f16.comp
#	ggml/src/kompute-shaders/op_rope_f32.comp
#	ggml/src/kompute-shaders/rope_common.comp
This commit is contained in:
Concedo 2024-08-14 22:25:43 +08:00
commit e8de0af3ec
18 changed files with 1326 additions and 101 deletions

View file

@ -3594,13 +3594,8 @@ namespace GGUFMeta {
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
// TODO: update when needed or think of some clever automatic way to do this
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
// return 32768;
//}
return 8192;
// Returns a node limit for `model`: five times the number of entries in
// model.tensors_by_name, but never less than 8192.
// NOTE(review): presumably this bounds the size of the compute graph built for
// the model - confirm against the call sites, which are not visible here.
static size_t llama_model_max_nodes(const llama_model & model) {
// The explicit <size_t> converts both arguments to size_t, so the int literal
// 8192 can be compared with the size-derived value.
return std::max<size_t>(8192, model.tensors_by_name.size()*5);
}
struct llama_model_loader {
@ -14800,12 +14795,15 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
res = nullptr; // do not extract logits for embedding case
embd = gf->nodes[gf->n_nodes - 1];
if (strcmp(embd->name, "result_embd_pooled") != 0) {
embd = gf->nodes[gf->n_nodes - 2];
res = nullptr; // do not extract logits for embedding case
embd = nullptr;
for (int i = gf->n_nodes - 1; i >= 0; --i) {
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
embd = gf->nodes[i];
break;
}
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");