diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 01a8a50a2..2c6d5d95b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1487,7 +1487,7 @@ class StableLMModel(Model):
                 raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html
new file mode 100644
index 000000000..c3fd19a0f
--- /dev/null
+++ b/examples/server/public/loading.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="refresh" content="5">
+</head>
+<body>
+  <div id="loading">
+    The model is loading. Please wait.<br/>
+    The user interface will appear soon.
+  </div>
+</body>
+</html>
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6d1b09491..8730f3ed2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,7 @@
 #include "system-prompts.js.hpp"
 #include "prompt-formats.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
+#include "loading.html.hpp"
 
 #include <atomic>
 #include <condition_variable>
@@ -2593,10 +2594,16 @@ int main(int argc, char ** argv) {
         return false;
     };
 
-    auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
+    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
-            res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            auto tmp = string_split(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast<const char *>(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else {
+                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            }
             return false;
         }
         return true;
@@ -2987,6 +2994,8 @@ int main(int argc, char ** argv) {
             }, [&](json error_data) {
                 server_sent_event(sink, "error", error_data);
             });
+            static const std::string ev_done = "data: [DONE]\n\n";
+            sink.write(ev_done.data(), ev_done.size());
             sink.done();
             return true;
         };
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 11587dd64..0f4249b13 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1020,6 +1020,8 @@ async def oai_chat_completions(user_prompt,
                         event_data = line.split(': ', 1)
                         assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
                         chunk_raw = event_data[1]
+                        if chunk_raw == '[DONE]':
+                            break
 
                         chunk = json.loads(chunk_raw)
                         assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bb852a6cc..3cf689bd8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3419,7 +3419,7 @@ double ggml_type_sizef(enum ggml_type type) {
 }
 
 GGML_CALL const char * ggml_type_name(enum ggml_type type) {
-    return type_traits[type].type_name;
+    return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
 }
 
 GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 894c0d0a9..d5f2ff530 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15903,7 +15903,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
         // clear unused states
         for (int i = 0; i < n_kv; ++i) {
-            uint32_t        cell_id = i + kv_self.head;
+            const uint32_t  cell_id = i + kv_self.head;
             llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
 
             data[i] = (float) (kv_cell.src >= 0);