From 7e72aa74fd676a093eb9970e761085ec22734c71 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 31 Jul 2024 00:57:03 +1000 Subject: [PATCH 1/6] py: add_array() will not add to kv store if value is an empty array (#8774) * gguf_writer.py: add_array() should not add to kv store if empty * Apply suggestions from code review I was wondering if there was a specific reason for `if val` but good to hear we can safely use `len(val) == 0` Co-authored-by: compilade --------- Co-authored-by: compilade --- gguf-py/gguf/gguf_writer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ba6f53cda..2e0b335ee 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -312,6 +312,8 @@ class GGUFWriter: self.add_key_value(key, val, GGUFValueType.STRING) def add_array(self, key: str, val: Sequence[Any]) -> None: + if len(val) == 0: + return self.add_key_value(key, val, GGUFValueType.ARRAY) @staticmethod @@ -845,7 +847,14 @@ class GGUFWriter: encoded_val = val.encode("utf-8") if isinstance(val, str) else val kv_data += self._pack("Q", len(encoded_val)) kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: + elif vtype == GGUFValueType.ARRAY: + + if not isinstance(val, Sequence): + raise ValueError("Invalid GGUF metadata array, expecting sequence") + + if len(val) == 0: + raise ValueError("Invalid GGUF metadata array. Empty array") + if isinstance(val, bytes): ltype = GGUFValueType.UINT8 else: From 268c5660062270a2c19a36fc655168aa287aaec2 Mon Sep 17 00:00:00 2001 From: Someone Date: Tue, 30 Jul 2024 23:35:30 +0300 Subject: [PATCH 2/6] nix: cuda: rely on propagatedBuildInputs (#8772) Listing individual outputs is no longer necessary to reduce the runtime closure size after https://github.com/NixOS/nixpkgs/pull/323056. 
--- .devops/nix/package.nix | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 911c42ecb..a87423c71 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -126,16 +126,9 @@ let ++ optionals useMetalKit [ MetalKit ]; cudaBuildInputs = with cudaPackages; [ - cuda_cccl.dev # - - # A temporary hack for reducing the closure size, remove once cudaPackages - # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 - cuda_cudart.dev - cuda_cudart.lib - cuda_cudart.static - libcublas.dev - libcublas.lib - libcublas.static + cuda_cudart + cuda_cccl # + libcublas ]; rocmBuildInputs = with rocmPackages; [ From 44d28ddd5caaa5e9de573bdaaa5b5b2448a29ace Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Wed, 31 Jul 2024 16:40:08 +0300 Subject: [PATCH 3/6] cmake : fix use of external ggml (#8787) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 793709122..a31320635 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # -get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS) +get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) +get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) From 398ede5efeb07b9adf9fbda7ea63f630d476a792 Mon Sep 17 00:00:00 2001 From: pculliton Date: Wed, 31 Jul 2024 11:12:10 -0400 Subject: [PATCH 4/6] Adding Gemma 2 2B configs (#8784) * Adding Gemma 2 2B configs Updates to Q scaling and Gemma 2 model sizes to match v2 2B model. 
* Update src/llama.cpp Co-authored-by: slaren --------- Co-authored-by: slaren --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index a207451f5..e6f303d31 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4969,6 +4969,7 @@ static void llm_load_hparams( hparams.attn_soft_cap = true; switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; case 42: model.type = e_model::MODEL_9B; break; case 46: model.type = e_model::MODEL_27B; break; default: model.type = e_model::MODEL_UNKNOWN; @@ -11736,6 +11737,7 @@ struct llm_build_context { // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e switch (model.type) { + case e_model::MODEL_2B: case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; default: GGML_ABORT("fatal error"); From ed9d2854c9de4ae1f448334294e61167b04bec2a Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Wed, 31 Jul 2024 15:51:06 -0400 Subject: [PATCH 5/6] Build: Fix potential race condition (#8781) * Fix potential race condition as pointed out by @fairydreaming in #8776 * Reference the .o rather than rebuilding every time. * Adding in CXXFLAGS and LDFLAGS * Removing unnecessary linker flags. --- Makefile | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index c82f4268a..f4ce4f1fb 100644 --- a/Makefile +++ b/Makefile @@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Mark legacy binary targets as .PHONY so that they are always checked. 
.PHONY: main quantize perplexity embedding server +# Define the object file target +examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +main: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +server: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.cpp +quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." 
@echo "#########" endif -perplexity: examples/deprecation-warning/deprecation-warning.cpp +perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif -embedding: examples/deprecation-warning/deprecation-warning.cpp +embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." 
From afbbcf3c04e3c6420cad3d72571478cd62ac176c Mon Sep 17 00:00:00 2001 From: Igor Okulist Date: Wed, 31 Jul 2024 18:59:09 -0500 Subject: [PATCH 6/6] server : update llama-server embedding flag documentation (#8779) Fixes #8763 --- common/common.cpp | 2 +- examples/server/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 60c7eac75..521f849e2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? 
"enabled" : "disabled" }); options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); diff --git a/examples/server/README.md b/examples/server/README.md index 33a2b95cc..de83ee7d0 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -247,7 +247,7 @@ server: --host HOST ip address to listen (default: 127.0.0.1) --port PORT port to listen (default: 8080) --path PATH path to serve static files from (default: ) - --embedding(s) enable embedding endpoint (default: disabled) + --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) --api-key KEY API key to use for authentication (default: none) --api-key-file FNAME path to file containing API keys (default: none) --ssl-key-file FNAME path to file a PEM-encoded SSL private key