mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 16:31:59 +00:00
Merge commit '9f102a1407' into concedo_experimental
# Conflicts:
#	.devops/intel.Dockerfile
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/011-bug-results.yml
#	.github/pull_request_template.md
#	CODEOWNERS
#	README.md
#	common/CMakeLists.txt
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/binary-ops.c
#	ggml/src/ggml-hexagon/htp/hex-dma.c
#	ggml/src/ggml-hexagon/htp/hex-dma.h
#	ggml/src/ggml-hexagon/htp/hex-dump.h
#	ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
#	ggml/src/ggml-hexagon/htp/hvx-utils.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/htp/ssm-conv.c
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cvt.cl
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	scripts/snapdragon/adb/run-bench.sh
#	scripts/sync_vendor.py
#	tests/test-backend-ops.cpp
#	tools/llama-bench/llama-bench.cpp
This commit is contained in:
commit
c00fe0af5a
32 changed files with 1302 additions and 447 deletions
|
|
@ -350,14 +350,6 @@ llama_context::llama_context(
|
|||
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
||||
|
||||
if (!graph_reuse_disable) {
|
||||
// TODO: figure out a way to make graph reuse work with pipeline parallelism
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
|
||||
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
|
||||
|
||||
graph_reuse_disable = true;
|
||||
}
|
||||
}
|
||||
|
||||
sched_reserve();
|
||||
|
|
@ -1199,6 +1191,13 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
|||
if (!graph_reuse_disable && res->can_reuse(gparams)) {
|
||||
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
||||
|
||||
// with pipeline parallelism, the previous graph_compute_async may still be running
|
||||
// on the GPU. we must synchronize before set_inputs to avoid overwriting input tensors
|
||||
// that the previous compute is still reading.
|
||||
if (cparams.pipeline_parallel) {
|
||||
ggml_backend_sched_synchronize(sched.get());
|
||||
}
|
||||
|
||||
n_reused++;
|
||||
} else {
|
||||
res->reset();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue