From 496957e1cbcb522abc63aa18521036e40efce985 Mon Sep 17 00:00:00 2001
From: Diner Burger
Date: Wed, 16 Jul 2025 15:17:25 -0400
Subject: [PATCH 1/6] llama : fix parameter order for hybrid memory
 initialization (#14725)

---
 src/llama-memory-hybrid.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index eedfaec53..d8e2086c8 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -38,9 +38,9 @@ llama_memory_hybrid::llama_memory_hybrid(
                 type_v,
                 v_trans,
                 offload,
+                1,
                 kv_size,
                 n_seq_max,
-                1,
                 n_pad,
                 n_swa,
                 swa_type
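The fix above is easy to miss in review: `llama_memory_hybrid` forwards a long run of positional arguments to the unified KV cache constructor, and the literal `1` sat in the wrong slot, shifting the arguments around it. A run of same-typed positional parameters gives the compiler nothing to check. A minimal sketch of one way to guard against this bug class (the struct and field names below are illustrative assumptions, not the actual llama.cpp signature):

```cpp
// Hypothetical config struct: four adjacent uint32_t parameters give the
// compiler nothing to check, so a value placed one slot early or late
// silently shifts every later argument.
#include <cstdint>

struct kv_cache_params {
    uint32_t n_stream;   // stands in for whatever the literal `1` binds to
    uint32_t kv_size;
    uint32_t n_seq_max;
    uint32_t n_pad;
};

// C++20 designated initializers make each binding explicit, so a misplaced
// value becomes a compile error instead of a silent reshuffle:
kv_cache_params make_params() {
    return kv_cache_params{
        .n_stream  = 1,
        .kv_size   = 4096,
        .n_seq_max = 8,
        .n_pad     = 32,
    };
}
```

Designated initializers trade a little verbosity for call sites that survive parameter reordering.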
Error: {e}") - continue # Skip to the next model if the tokenizer can't be loaded + except Exception as e: + raise OSError(f"Error loading tokenizer for model {name}.") from e chktok = tokenizer.encode(CHK_TXT) chkhsh = sha256(str(chktok).encode()).hexdigest() From 1ba45d49822c39ca9a552c7b75efe0495ff400c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 17 Jul 2025 01:52:08 +0200 Subject: [PATCH 3/6] ci : disable failing vulkan crossbuilds (#14723) --- .github/workflows/build-linux-cross.yml | 238 ++++++++++++------------ 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index 7cfc82ba4..04ad187d3 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -48,98 +48,98 @@ jobs: cmake --build build --config Release -j $(nproc) - ubuntu-24-riscv64-vulkan-cross: - runs-on: ubuntu-24.04 + # ubuntu-24-riscv64-vulkan-cross: + # runs-on: ubuntu-24.04 - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 + # steps: + # - uses: actions/checkout@v4 + # - name: Setup Riscv + # run: | + # sudo dpkg --add-architecture riscv64 - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF + # # Add arch-specific repositories for non-amd64 architectures + # cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list + # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe + # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe + # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe + # deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe + # EOF - sudo apt-get update || true ;# Prevent failure due to missing URLs. + # sudo apt-get update || true ;# Prevent failure due to missing URLs. 
From 1ba45d49822c39ca9a552c7b75efe0495ff400c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 17 Jul 2025 01:52:08 +0200
Subject: [PATCH 3/6] ci : disable failing vulkan crossbuilds (#14723)

---
 .github/workflows/build-linux-cross.yml | 238 ++++++++++++------------
 1 file changed, 119 insertions(+), 119 deletions(-)

diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml
index 7cfc82ba4..04ad187d3 100644
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -48,98 +48,98 @@ jobs:
 
           cmake --build build --config Release -j $(nproc)
 
-  ubuntu-24-riscv64-vulkan-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-riscv64-vulkan-cross:
+  #   runs-on: ubuntu-24.04
 
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo dpkg --add-architecture riscv64
 
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF
 
-          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true ;# Prevent failure due to missing URLs.
 
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  libvulkan-dev:riscv64
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 glslc \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu \
+  #                 libvulkan-dev:riscv64
 
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_VULKAN=ON \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
-          cmake --build build --config Release -j $(nproc)
+  #         cmake --build build --config Release -j $(nproc)
 
-  ubuntu-24-arm64-vulkan-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-arm64-vulkan-cross:
+  #   runs-on: ubuntu-24.04
 
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Arm64
-        run: |
-          sudo dpkg --add-architecture arm64
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Arm64
+  #       run: |
+  #         sudo dpkg --add-architecture arm64
 
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
+  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF
 
-          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true ;# Prevent failure due to missing URLs.
 
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  crossbuild-essential-arm64 \
-                  libvulkan-dev:arm64
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 glslc \
+  #                 crossbuild-essential-arm64 \
+  #                 libvulkan-dev:arm64
 
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-                         -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-                         -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_VULKAN=ON \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+  #                        -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
+  #                        -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
-          cmake --build build --config Release -j $(nproc)
+  #         cmake --build build --config Release -j $(nproc)
 
   ubuntu-24-ppc64el-cpu-cross:
     runs-on: ubuntu-24.04
@@ -185,52 +185,52 @@ jobs:
 
           cmake --build build --config Release -j $(nproc)
 
-  ubuntu-24-ppc64el-vulkan-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-ppc64el-vulkan-cross:
+  #   runs-on: ubuntu-24.04
 
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup PowerPC64le
-        run: |
-          sudo dpkg --add-architecture ppc64el
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup PowerPC64le
+  #       run: |
+  #         sudo dpkg --add-architecture ppc64el
 
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF
 
-          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true ;# Prevent failure due to missing URLs.
 
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  gcc-14-powerpc64le-linux-gnu \
-                  g++-14-powerpc64le-linux-gnu \
-                  libvulkan-dev:ppc64el
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 glslc \
+  #                 gcc-14-powerpc64le-linux-gnu \
+  #                 g++-14-powerpc64le-linux-gnu \
+  #                 libvulkan-dev:ppc64el
 
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_VULKAN=ON \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+  #                        -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
 
-          cmake --build build --config Release -j $(nproc)
+  #         cmake --build build --config Release -j $(nproc)
 
   debian-13-loongarch64-cpu-cross:
     runs-on: ubuntu-24.04
From ad57d3edd2f48cf6dc41a98fd9b303435ecb4fb0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 17 Jul 2025 09:45:54 +0300
Subject: [PATCH 4/6] batch : fix uninitialized has_cpl flag (#14733)

ggml-ci
---
 src/llama-batch.cpp | 2 ++
 src/llama-batch.h   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index f8227777f..eb15d2c41 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -157,6 +157,8 @@ bool llama_batch_allocr::init(
         n_outputs += batch.logits[i] != 0;
     }
 
+    has_cpl = false;
+
     // determine coupled sequences
     // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
diff --git a/src/llama-batch.h b/src/llama-batch.h
index 1a24440ba..c811aef43 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -117,7 +117,7 @@ private:
     using seq_cpl_t = std::vector<bool>;
 
     // helper flag to quickly determine if there are any coupled sequences in the batch
-    bool has_cpl;
+    bool has_cpl = false;
 
     std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
     std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
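The two-line fix above closes a genuine undefined-behavior hole: `has_cpl` was only assigned when coupled sequences were detected, so other code paths read an indeterminate `bool`. The patch applies both guards: an in-class initializer for freshly constructed objects, and an explicit reset at the top of `init()` for reused ones. A minimal sketch of the pitfall (hypothetical type, not the real `llama_batch_allocr`):

```cpp
// A conditionally assigned bool member is indeterminate until something
// writes to it, and reading it before that is undefined behavior.
struct batch_allocr {
    bool has_cpl = false;  // in-class initializer: well-defined from construction

    void init(bool found_coupled) {
        has_cpl = false;     // explicit per-batch reset, as the patch does, so a
        if (found_coupled) { // reused object cannot carry a stale flag over from
            has_cpl = true;  // the previous batch
        }
    }
};
```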
From d9b691081c04ec5fb0daa9d2b979f915c142963d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 17 Jul 2025 09:49:15 +0300
Subject: [PATCH 5/6] kv-cache : opt mask set input (#14600)

ggml-ci
---
 src/llama-kv-cache-unified.cpp | 49 +++++++++++++---------------------
 1 file changed, 19 insertions(+), 30 deletions(-)

diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp
index 7e92e6b4d..baaa1d32d 100644
--- a/src/llama-kv-cache-unified.cpp
+++ b/src/llama-kv-cache-unified.cpp
@@ -1283,6 +1283,8 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
     const int64_t n_tps     = n_tokens/n_stream;
     const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
 
+    std::fill(data, data + ggml_nelements(dst), -INFINITY);
+
     // Use only the previous KV cells of the correct sequence for each token of the ubatch.
     // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
     // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
@@ -1306,44 +1308,31 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
 
             const llama_pos p1 = ubatch->pos[i];
 
+            const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+
             for (uint32_t j = 0; j < n_kv; ++j) {
-                float f = 0.0f;
-
-                bool masked = false;
-
                 if (cells.is_empty(j)) {
-                    masked = true;
-                } else {
-                    const llama_pos p0 = cells.pos_get(j);
-
-                    // mask the token if not the same sequence
-                    masked = masked || (!cells.seq_has(j, seq_id));
-
-                    // mask future tokens
-                    masked = masked || (causal_attn && p0 > p1);
-
-                    // apply SWA if any
-                    masked = masked || (is_masked_swa(p0, p1));
-
-                    if (!masked && hparams.use_alibi) {
-                        f = -std::abs(p0 - p1);
-                    }
+                    continue;
                 }
 
-                if (masked) {
-                    f = -INFINITY;
+                // mask the token if not the same sequence
+                if (!cells.seq_has(j, seq_id)) {
+                    continue;
                 }
 
-                data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = f;
-            }
+                const llama_pos p0 = cells.pos_get(j);
 
-            // mask padded tokens
-            if (data) {
-                for (uint32_t ii = n_tps; ii < n_tps_pad; ++ii) {
-                    for (uint32_t j = 0; j < n_kv; ++j) {
-                        data[h*n_stream*n_tps_pad*n_kv + s*n_tps_pad*n_kv + ii*n_kv + j] = -INFINITY;
-                    }
+                // mask future tokens
+                if (causal_attn && p0 > p1) {
+                    continue;
                 }
+
+                // apply SWA if any
+                if (is_masked_swa(p0, p1)) {
+                    continue;
+                }
+
+                data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
             }
         }
     }
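The rewrite above flips the masking logic: instead of computing a value per cell and writing every slot (plus a second loop just to mask the padded rows), it pre-fills the entire mask with `-INFINITY` once and then writes a finite value only for cells that remain visible, with the flat row offset `idst` hoisted out of the inner loop. A simplified, self-contained sketch of that fill-then-skip pattern (shapes and names reduced; this is not the actual llama.cpp helper):

```cpp
// One row of a KQ-style mask; empty cells are encoded here as pos < 0
// (an illustrative convention, not the real cell bookkeeping).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <vector>

void set_mask_row(std::vector<float> & data, uint64_t row, uint32_t n_kv,
                  const std::vector<int32_t> & cell_pos,  // position per KV cell, -1 if empty
                  int32_t p1, bool causal_attn, bool use_alibi) {
    const uint64_t idst = row * n_kv;  // hoisted flat offset, like `idst` above

    // mask everything up front; every `continue` below means "stays masked"
    std::fill(data.begin() + idst, data.begin() + idst + n_kv, -INFINITY);

    for (uint32_t j = 0; j < n_kv; ++j) {
        const int32_t p0 = cell_pos[j];
        if (p0 < 0) {
            continue;  // empty cell
        }
        if (causal_attn && p0 > p1) {
            continue;  // future token
        }
        data[idst + j] = use_alibi ? -std::abs(p0 - p1) : 0.0f;
    }
}
```

In the patch the fill covers the whole destination tensor once, which is also what makes the old dedicated padded-row loop unnecessary.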
From 086cf81e88fb75287b71ff19c08a206b7bc2e02f Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Thu, 17 Jul 2025 09:22:11 +0200
Subject: [PATCH 6/6] llama : fix parallel processing for lfm2 (#14705)

---
 src/llama-model.cpp | 56 +++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d8a686e0..cdf1e4242 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -16554,7 +16554,19 @@ struct llm_build_lfm2 : public llm_graph_context {
             ggml_tensor * cur,
             llm_graph_input_rs * inp_recr,
             int il) {
-        const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+        const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+
+        const uint32_t kv_head      = mctx_cur->get_head();
+        const int64_t  n_seq_tokens = ubatch.n_seq_tokens;
+        const int64_t  n_seqs       = ubatch.n_seqs;
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+        GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
+        const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
+
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
 
         auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
         cb(bcx, "model.layers.{}.conv.in_proj", il);
@@ -16562,38 +16574,48 @@ struct llm_build_lfm2 : public llm_graph_context {
         constexpr auto n_chunks = 3;
         GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
         auto const chunk_size = bcx->ne[0] / n_chunks;
-        auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
-        auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
-        auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
+        auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
+        auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
+        auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
 
         auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
 
-        // read conv state directly, with build_rs generation is slower
-        ggml_tensor * conv_state = mctx_cur->get_r_l(il);
-        const int64_t n_seqs = ubatch.n_seqs;
-        ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
-        conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
+        // read conv state
+        auto * conv_state = mctx_cur->get_r_l(il);
+        auto * conv_rs    = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
+        auto * conv       = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
 
         bx = ggml_concat(ctx0, conv, bx, 0);
         GGML_ASSERT(bx->ne[0] > conv->ne[0]);
 
-        auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+        // last d_conv columns is a new conv state
+        auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
         GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
 
-        // write conv state
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
+        // write new conv state
+        ggml_build_forward_expand(
+            gf,
+            ggml_cpy(
+                ctx0,
+                new_conv,
+                ggml_view_1d(
+                    ctx0,
+                    conv_state,
+                    ggml_nelements(new_conv),
+                    kv_head*d_conv*n_embd*ggml_element_size(new_conv)
+                )
+            )
+        );
 
         auto * conv_kernel = model.layers[il].shortconv.conv;
-        GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
-
-        // construct ssm_conv op
-        ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+        auto * conv_out    = ggml_ssm_conv(ctx0, bx, conv_kernel);
         cb(conv_out, "model.layers.{}.conv.conv", il);
 
         auto * y = ggml_mul(ctx0, c, conv_out);
-
         y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
         cb(y, "model.layers.{}.conv.out_proj", il);
 
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
+
         return y;
     }
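The lfm2 fix makes the short-convolution block safe for parallel sequences: activations are reshaped from `{n_embd, n_tokens}` to `{n_embd, n_seq_tokens, n_seqs}`, the 2-D chunk views become 3-D views, and the updated conv state is written back through a view offset by `kv_head`, so sequences processed in one ubatch no longer clobber each other's state. A small standalone sketch of just the index arithmetic involved (illustrative values; no ggml dependency):

```cpp
// How a flat token axis is viewed as (n_seq_tokens, n_seqs), and where one
// ubatch's conv state lands in the per-layer state buffer.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd = 4, n_seq_tokens = 3, n_seqs = 2;

    // flat token index -> (sequence, position within that sequence)
    for (int64_t t = 0; t < n_seq_tokens * n_seqs; ++t) {
        const int64_t s = t / n_seq_tokens;
        const int64_t i = t % n_seq_tokens;
        std::printf("token %lld -> seq %lld, pos %lld\n",
                    (long long) t, (long long) s, (long long) i);
    }

    // per-sequence conv state is d_conv columns of n_embd floats; writing it
    // back at element offset kv_head*d_conv*n_embd mirrors the ggml_view_1d
    // destination in the patch (values here are made up for illustration)
    const int64_t d_conv = 2, kv_head = 5;
    std::printf("state write offset: %lld elements\n",
                (long long) (kv_head * d_conv * n_embd));
    return 0;
}
```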