Merge commit 'acd38efee3' into concedo_experimental

# Conflicts:
#	.devops/cpu.Dockerfile
#	.devops/vulkan.Dockerfile
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	CMakeLists.txt
#	README.md
#	cmake/llama-config.cmake.in
#	examples/simple-cmake-pkg/.gitignore
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-hip/CMakeLists.txt
commit c5d4e07664
Author: Concedo
Date:   2025-01-28 18:16:44 +08:00

11 changed files with 395 additions and 61 deletions

View file

@@ -73,6 +73,8 @@ add_compile_definitions(GGML_USE_CPU_AARCH64)
 if (MSVC)
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
     add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()
 
 file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu")

View file

@@ -1427,16 +1427,16 @@ struct server_queue {
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         GGML_ASSERT(task.id != -1);
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
         QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
         if (front) {
             queue_tasks.push_front(std::move(task));
         } else {
             queue_tasks.push_back(std::move(task));
         }
-        // if this is cancel task make sure to clean up pending tasks
-        if (task.type == SERVER_TASK_TYPE_CANCEL) {
-            cleanup_pending_task(task.id_target);
-        }
         condition_tasks.notify_one();
         return task.id;
     }
@@ -1448,16 +1448,16 @@ struct server_queue {
             if (task.id == -1) {
                 task.id = id++;
             }
+            // if this is cancel task make sure to clean up pending tasks
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
            if (front) {
                queue_tasks.push_front(std::move(task));
            } else {
                queue_tasks.push_back(std::move(task));
            }
-            // if this is cancel task make sure to clean up pending tasks
-            if (task.type == SERVER_TASK_TYPE_CANCEL) {
-                cleanup_pending_task(task.id_target);
-            }
        }
        condition_tasks.notify_one();
        return 0;
@@ -1554,10 +1554,10 @@ struct server_queue {
     }
 
 private:
-    void cleanup_pending_task(int id_task) {
+    void cleanup_pending_task(int id_target) {
         // no need lock because this is called exclusively by post()
-        auto rm_func = [id_task](const server_task & task) {
-            return task.id_target == id_task;
+        auto rm_func = [id_target](const server_task & task) {
+            return task.id_target == id_target;
         };
         queue_tasks.erase(
             std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
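
The relocated cleanup block is the functional change here: cleanup_pending_task() erases every queued task whose id_target matches, and a cancel task's own id_target matches by definition, so running the cleanup after the push would erase the freshly queued cancel task. A minimal sketch of the ordering, with simplified stand-in types rather than the server's actual ones:

```cpp
#include <algorithm>
#include <cassert>
#include <deque>

enum task_type { TASK_NORMAL, TASK_CANCEL };

struct task {
    int       id;
    int       id_target; // -1 when the task targets nothing
    task_type type;
};

static std::deque<task> queue_tasks;

// mirrors cleanup_pending_task: drop queued tasks aimed at id_target
static void cleanup_pending(int id_target) {
    auto rm_func = [id_target](const task & t) { return t.id_target == id_target; };
    queue_tasks.erase(std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
                      queue_tasks.end());
}

static int post(task t) {
    // must happen before the push: the cancel task's own id_target matches
    // rm_func, so cleaning up afterwards would erase the cancel task itself
    if (t.type == TASK_CANCEL) {
        cleanup_pending(t.id_target);
    }
    queue_tasks.push_back(std::move(t));
    return queue_tasks.back().id;
}

int main() {
    post({/*id=*/1, /*id_target=*/-1, TASK_NORMAL});
    post({/*id=*/2, /*id_target=*/ 1, TASK_CANCEL});
    assert(queue_tasks.size() == 2); // the cancel task survived its own cleanup
}
```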

View file

@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.12)
+project(llama-simple-cmake-pkg)
+
+set(TARGET llama-simple-cmake-pkg)
+
+find_package(Llama REQUIRED)
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

View file

@@ -0,0 +1,34 @@
+# llama.cpp/example/simple-cmake-pkg
+
+This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+
+## Building
+
+Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
+
+### Considerations
+
+When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
+
+### Build llama.cpp and install to llama.cpp/inst
+
+```sh
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+cmake -S . -B build
+cmake --build build
+cmake --install build --prefix inst
+```
+
+### Build simple-cmake-pkg
+
+```sh
+cd examples/simple-cmake-pkg
+cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
+cmake --build build
+```
+
+### Run simple-cmake-pkg
+
+```sh
+./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+```

View file

@@ -0,0 +1,147 @@
+@GGML_VARIABLES_EXPANDED@
+
+@PACKAGE_INIT@
+
+set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
+set_and_check(GGML_LIB_DIR     "@PACKAGE_GGML_LIB_INSTALL_DIR@")
+set_and_check(GGML_BIN_DIR     "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+
+find_package(Threads REQUIRED)
+
+find_library(GGML_LIBRARY ggml
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_LIBRARY}")
+
+find_library(GGML_BASE_LIBRARY ggml-base
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml-base UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml-base
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+
+if (NOT GGML_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
+    endif()
+
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
+    endif()
+
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()
+
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+
+        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
+            ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
+
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
+    endif()
+
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
+endif()
+
+set(_ggml_all_targets "")
+foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+    string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
+
+    find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+        REQUIRED
+        HINTS ${GGML_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH)
+
+    message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
+
+    add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+    set_target_properties(ggml::${_ggml_backend}
+        PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+            IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+            IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+            INTERFACE_COMPILE_FEATURES c_std_90
+            POSITION_INDEPENDENT_CODE ON)
+
+    string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+    if(is_cpu_variant)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+
+        if(GGML_CPU_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
+        endif()
+    else()
+        list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+        if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+        endif()
+    endif()
+
+    list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+endforeach()
+
+add_library(ggml::all INTERFACE IMPORTED)
+set_target_properties(ggml::all
+    PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
+
+check_required_components(ggml)
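
The foreach loop above creates one ggml::<backend> imported target for every entry in GGML_AVAILABLE_BACKENDS and aggregates them into ggml::all. The same backend set is visible at runtime through the ggml device registry; a small sketch, assuming only the registry calls that appear elsewhere in this merge:

```cpp
#include <cstdio>

#include "ggml-backend.h"

// walk the ggml device registry and print each device with its parent backend;
// uses only registry functions that appear in the diffs in this merge
int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        std::printf("device %zu: %s (backend: %s)\n",
                    i, ggml_backend_dev_name(dev), ggml_backend_reg_name(reg));
    }
    return 0;
}
```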

View file

@@ -131,6 +131,8 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
+#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+#define GGML_USE_VMM
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

View file

@@ -155,7 +155,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -301,7 +301,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
@@ -3256,7 +3256,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "FORCE_CUBLAS", "1" });
 #endif
 
-#ifdef GGML_CUDA_NO_VMM
+#ifndef GGML_USE_VMM
     features.push_back({ "NO_VMM", "1" });
 #endif

View file

@@ -19,7 +19,10 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#define UNUSED(x) (void)(x)
+// create residency sets only on macOS >= 15.0
+#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
 
 // globals
@@ -39,6 +42,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
+    bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
@@ -48,6 +52,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device_ref_count =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm =*/ false,
+    /*.has_residency_sets =*/ false,
     /*.has_bfloat =*/ false,
     /*.use_bfloat =*/ false,
     /*.name =*/ "",
@@ -59,12 +64,18 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
+    }
 
+    if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
         ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
@@ -90,8 +101,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     ctx->mtl_device_ref_count--;
 
     if (ctx->mtl_device_ref_count == 0) {
-        [ctx->mtl_device release];
-        ctx->mtl_device = nil;
+        if (ctx->mtl_device) {
+            [ctx->mtl_device release];
+            ctx->mtl_device = nil;
+        }
     }
 }
@@ -483,6 +496,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -649,6 +667,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1054,70 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
 };
 
+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -4176,6 +4257,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
@@ -4198,19 +4281,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4303,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     }
     return false;
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4329,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4270,8 +4353,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     }
 #endif
 #endif
 
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -4307,7 +4391,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
@@ -4318,7 +4409,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4419,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
     return max_size;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4357,7 +4448,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal_Mapped";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4400,7 +4491,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4545,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4461,7 +4560,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4766,6 +4865,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4779,7 +4885,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
 
-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
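
The residency-set support above follows one pattern throughout: compile the feature only when the SDK is new enough (GGML_METAL_HAS_RESIDENCY_SETS), allow a runtime opt-out via the GGML_METAL_NO_RESIDENCY environment variable, and treat "feature unavailable" as success so only real allocation failures propagate. A C++ rendering of that pattern (the original is Objective-C; rset_init here is a hypothetical simplification of ggml_backend_metal_buffer_rset_init):

```cpp
#include <cstdlib>

#if defined(GGML_METAL_HAS_RESIDENCY_SETS) // requires __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
static bool residency_sets_enabled() {
    // enabled by default; users can opt out without rebuilding
    return std::getenv("GGML_METAL_NO_RESIDENCY") == nullptr;
}
#else
static bool residency_sets_enabled() {
    return false; // SDK too old: the feature is compiled out entirely
}
#endif

// returns true both on "nothing to do" and on success, so callers only treat
// a genuine residency-set allocation failure as an error
static bool rset_init() {
    if (!residency_sets_enabled()) {
        return true;
    }
    // ... create the set, add each buffer allocation, commit, request residency ...
    return true;
}

int main() { return rset_init() ? 0 : 1; }
```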

View file

@@ -89,6 +89,10 @@ struct vk_pipeline_struct {
     uint32_t parameter_count;
     std::array<uint32_t, 3> wg_denoms;
     uint32_t align;
+    // set to true to request the pipeline is compiled after the dryrun
+    bool needed {};
+    // set to true when the shader has been compiled
+    bool compiled {};
 };
 
 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -190,8 +194,11 @@ struct vk_device_struct {
     bool mul_mat_id_m;
     bool mul_mat_id_s;
 
-    vk_matmul_pipeline pipeline_matmul_f32;
-    vk_matmul_pipeline pipeline_matmul_f32_f16;
+    // set to true to indicate that some shaders need to be compiled after the dryrun
+    bool need_compiles {};
+
+    vk_matmul_pipeline pipeline_matmul_f32 {};
+    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
     vk_matmul_pipeline2 pipeline_matmul_f16;
     vk_matmul_pipeline2 pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -199,7 +206,7 @@ struct vk_device_struct {
     vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
     vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
 
-    vk_matmul_pipeline pipeline_matmul_id_f32;
+    vk_matmul_pipeline pipeline_matmul_id_f32 {};
     vk_matmul_pipeline2 pipeline_matmul_id_f16;
     vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
@@ -780,13 +787,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    pipeline = std::make_shared<vk_pipeline_struct>();
-    pipeline->name = name;
-    pipeline->parameter_count = parameter_count;
-    pipeline->push_constant_size = push_constant_size;
-    pipeline->wg_denoms = wg_denoms;
-    pipeline->align = align;
-
     vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
     pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
@@ -869,6 +869,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
     }
 
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+    pipeline->compiled = true;
 
     {
         std::lock_guard<std::mutex> guard(device->mutex);
@@ -879,12 +880,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
-
-        // "Progress bar" for shader compiles
-        static uint32_t total_compile_count = 0;
-        if ((total_compile_count++ % 10) == 0) {
-            std::cerr << ".";
-        }
     }
     compile_count_cond.notify_all();
 }
@@ -910,6 +905,10 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
     VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
     device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+    if (!pipeline->compiled) {
+        pipeline->needed = true;
+        device->need_compiles = true;
+    }
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
@@ -1392,8 +1391,6 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
-    std::cerr << "ggml_vulkan: Compiling shaders";
-
     // some shaders have a minimum subgroup size
     const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
     const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
@@ -1531,15 +1528,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
         }
     }
 
-    device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-
-    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    if (!device->pipeline_matmul_f32) {
+        device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_f32_f16) {
+        device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_id_f32) {
+        device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
 
     std::vector<std::future<void>> compiles;
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
+        if (!pipeline) {
+            pipeline = std::make_shared<vk_pipeline_struct>();
+            pipeline->name = name;
+            pipeline->parameter_count = parameter_count;
+            pipeline->push_constant_size = push_constant_size;
+            pipeline->wg_denoms = wg_denoms;
+            pipeline->align = align;
+        }
+
+        if (!pipeline->needed || pipeline->compiled) {
+            return;
+        }
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -2054,7 +2069,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
-    std::cerr << "Done!" << std::endl;
+    device->need_compiles = false;
 }
 
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
@@ -7664,6 +7679,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
+    if (ctx->device->need_compiles) {
+        ggml_vk_load_shaders(ctx->device);
+    }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
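
Taken together, the Vulkan changes replace eager shader compilation with a dryrun-driven scheme: each pipeline carries needed/compiled flags, the dryrun marks what the current graph actually requires, and ggml_vk_load_shaders() is re-run only when need_compiles is set, compiling just the needed, not-yet-compiled pipelines. A simplified model of the bookkeeping, using stand-in types rather than the real vk_* structs:

```cpp
#include <memory>
#include <string>
#include <vector>

struct pipeline {
    std::string name;
    bool needed   {}; // requested during the dryrun
    bool compiled {}; // shader module already built
};

struct device {
    bool need_compiles {};
    std::vector<std::shared_ptr<pipeline>> pipelines;
};

// mirrors ggml_pipeline_request_descriptor_sets: called per node in the dryrun
void request_pipeline(device & dev, const std::shared_ptr<pipeline> & p) {
    if (!p->compiled) {
        p->needed         = true;
        dev.need_compiles = true;
    }
}

// mirrors the need_compiles check added to ggml_backend_vk_graph_compute
void compile_if_needed(device & dev) {
    if (!dev.need_compiles) {
        return;
    }
    for (auto & p : dev.pipelines) {
        if (p->needed && !p->compiled) {
            p->compiled = true; // stands in for the real createComputePipeline call
        }
    }
    dev.need_compiles = false;
}
```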

View file

@@ -1314,10 +1314,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
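
The new debug logs expose the layer-to-device assignment that get_layer_buft_list computes: a layer's normalized position is looked up in the cumulative split fractions with std::upper_bound, and the first device whose share exceeds it wins. A self-contained sketch with illustrative split values (the numbers are not from this diff):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // cumulative split fractions for three hypothetical GPUs (illustrative only)
    const std::vector<float> splits = {0.25f, 0.75f, 1.0f};
    const int i_gpu_start    = 0;
    const int act_gpu_layers = 32;

    for (int il : {0, 10, 24, 31}) {
        // same expression as get_layer_buft_list: first device whose cumulative
        // share exceeds the layer's normalized position
        const int layer_gpu = std::upper_bound(splits.begin(), splits.end(),
                                  float(il - i_gpu_start) / act_gpu_layers)
                              - splits.begin();
        std::printf("layer %3d -> device %d\n", il, layer_gpu);
    }
    return 0;
}
```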

View file

@@ -9446,6 +9446,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
         // use all available devices
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9456,10 +9457,19 @@ static struct llama_model * llama_model_load_from_file_impl(
                     break;
 
                 case GGML_BACKEND_DEVICE_TYPE_GPU:
-                    model->devices.push_back(dev);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
                     break;
             }
         }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
     }
 
     // if using single GPU mode, remove all except the main GPU
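
The else branch now partitions GPU-type devices by backend registry name, collecting RPC devices separately and splicing them onto the front of model->devices, which the layer-split logic consumes in order. A condensed sketch of that collection step, using the registry calls shown above (the real code also handles the CPU and ACCEL device types):

```cpp
#include <string>
#include <vector>

#include "ggml-backend.h"

// collect GPU devices, ordering RPC-backed devices first (mirrors the diff;
// CPU/ACCEL handling from the real switch statement is omitted here)
std::vector<ggml_backend_dev_t> collect_gpu_devices() {
    std::vector<ggml_backend_dev_t> devices;
    std::vector<ggml_backend_dev_t> rpc_servers;

    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
            continue;
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
            rpc_servers.push_back(dev);
        } else {
            devices.push_back(dev);
        }
    }

    // add RPC servers at the front of the list
    if (!rpc_servers.empty()) {
        devices.insert(devices.begin(), rpc_servers.begin(), rpc_servers.end());
    }
    return devices;
}
```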