mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # docs/build.md # examples/batched/batched.cpp # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp # examples/deprecation-warning/deprecation-warning.cpp # examples/eval-callback/eval-callback.cpp # examples/gen-docs/gen-docs.cpp # examples/gguf-hash/gguf-hash.cpp # examples/gguf/gguf.cpp # examples/lookahead/lookahead.cpp # examples/lookup/lookup-create.cpp # examples/lookup/lookup-merge.cpp # examples/lookup/lookup-stats.cpp # examples/lookup/lookup.cpp # examples/parallel/parallel.cpp # examples/passkey/passkey.cpp # examples/retrieval/retrieval.cpp # examples/save-load-state/save-load-state.cpp # examples/simple-chat/simple-chat.cpp # examples/simple/simple.cpp # examples/speculative-simple/speculative-simple.cpp # examples/speculative/speculative.cpp # examples/sycl/ls-sycl-device.cpp # examples/training/finetune.cpp # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cpu/amx/common.h # ggml/src/ggml-cpu/kleidiai/kernels.cpp # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl # ggml/src/ggml-opencl/kernels/transpose.cl # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl # scripts/get-wikitext-2.sh # tests/test-backend-ops.cpp # tools/batched-bench/batched-bench.cpp # tools/cvector-generator/cvector-generator.cpp # tools/export-lora/export-lora.cpp # tools/imatrix/imatrix.cpp # tools/llama-bench/llama-bench.cpp # tools/perplexity/perplexity.cpp # tools/rpc/rpc-server.cpp # tools/tokenize/tokenize.cpp
This commit is contained in:
commit
d20e60ddd5
13 changed files with 96 additions and 22 deletions
|
|
@ -7,6 +7,7 @@
|
|||
#include <limits.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <clocale>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
|
|
@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <clocale>
|
||||
#include <ctime>
|
||||
#include <algorithm>
|
||||
|
||||
|
|
@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb,
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
|
||||
|
|
|
|||
|
|
@ -7612,6 +7612,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
|
|||
return false;
|
||||
}
|
||||
|
||||
if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
|
||||
// Intel Windows proprietary driver tuning
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
switch (src0_type) {
|
||||
// From tests on A770 Linux, may need more tuning
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
|
|
|||
|
|
@ -7,6 +7,13 @@ struct Params {
|
|||
offset_src0: u32,
|
||||
offset_src1: u32,
|
||||
offset_dst: u32,
|
||||
offset_merged_src0: u32,
|
||||
offset_merged_src1: u32,
|
||||
|
||||
stride_src0_0: u32,
|
||||
stride_src0_1: u32,
|
||||
stride_src0_2: u32,
|
||||
stride_src0_3: u32,
|
||||
|
||||
stride_src1_0: u32,
|
||||
stride_src1_1: u32,
|
||||
|
|
@ -23,6 +30,21 @@ struct Params {
|
|||
b_ne3: u32,
|
||||
};
|
||||
|
||||
fn src0_index(_i: u32) -> u32 {
|
||||
var i = _i;
|
||||
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
|
||||
i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
|
||||
let a_i2 = i / (params.a_ne1 * params.a_ne0);
|
||||
i = i % (params.a_ne1 * params.a_ne0);
|
||||
let a_i1 = i / params.a_ne0;
|
||||
let a_i0 = i % params.a_ne0;
|
||||
|
||||
return a_i0 * params.stride_src0_0 +
|
||||
a_i1 * params.stride_src0_1 +
|
||||
a_i2 * params.stride_src0_2 +
|
||||
a_i3 * params.stride_src0_3;
|
||||
}
|
||||
|
||||
fn src1_index(_i: u32) -> u32 {
|
||||
var i = _i;
|
||||
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
|
||||
|
|
@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 {
|
|||
#define DataType f16
|
||||
#endif
|
||||
|
||||
#ifdef SRC_OVERLAP
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> merged_src: array<DataType>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> dst: array<DataType>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
#else
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<DataType>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> src1 : array<DataType>;
|
||||
|
||||
#ifdef INPLACE
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#elif defined(OVERLAP)
|
||||
#if defined(INPLACE) || defined(OVERLAP)
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
|
|
@ -74,6 +101,7 @@ var<storage, read_write> dst: array<DataType>;
|
|||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
fn op(a: DataType, b: DataType) -> DataType {
|
||||
#ifdef OP_ADD
|
||||
|
|
@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType {
|
|||
#endif
|
||||
}
|
||||
|
||||
fn update(dst_i: u32, src0_i: u32, src1_i: u32){
|
||||
fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
|
||||
#ifdef SRC_OVERLAP
|
||||
let result = op(merged_src[src0_i], merged_src[src1_i]);
|
||||
#else
|
||||
let result = op(src0[src0_i], src1[src1_i]);
|
||||
#endif
|
||||
|
||||
#ifdef INPLACE
|
||||
src0[dst_i] = result;
|
||||
src0[src0_i] = result;
|
||||
#elif defined(OVERLAP)
|
||||
src1[dst_i] = result;
|
||||
src1[src1_i] = result;
|
||||
#else
|
||||
dst[dst_i] = result;
|
||||
#endif
|
||||
|
|
@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){
|
|||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
|
||||
let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x);
|
||||
let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x);
|
||||
update(params.offset_dst + gid.x, src0_i, src1_i);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1426,16 +1426,14 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
|||
}
|
||||
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
if (tensor->ne[i] != 1) {
|
||||
if (i > n) {
|
||||
if (tensor->nb[i] != next_nb) {
|
||||
return false;
|
||||
}
|
||||
next_nb *= tensor->ne[i];
|
||||
} else {
|
||||
// this dimension does not need to be contiguous
|
||||
next_nb = tensor->ne[i]*tensor->nb[i];
|
||||
if (i > n) {
|
||||
if (tensor->ne[i] != 1 && tensor->nb[i] != next_nb) {
|
||||
return false;
|
||||
}
|
||||
next_nb *= tensor->ne[i];
|
||||
} else {
|
||||
// this dimension does not need to be contiguous
|
||||
next_nb = tensor->ne[i]*tensor->nb[i];
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {
|
|||
|
||||
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
|
||||
char buf[256];
|
||||
snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
|
||||
snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
|
||||
for (size_t i = 1; i < ne.size(); i++) {
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "chat.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
|
|
@ -85,6 +86,8 @@ static void sigint_handler(int signo) {
|
|||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
|
|
@ -377,7 +380,7 @@ int main(int argc, char ** argv) {
|
|||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
if (session_tokens.size() > n_match) {
|
||||
if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
|
||||
LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__);
|
||||
LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
|
||||
llama_memory_clear(mem, true);
|
||||
session_tokens.clear();
|
||||
n_match = 0;
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
|
|
@ -568,6 +569,8 @@ static void gguf_merge(const split_params & split_params) {
|
|||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
split_params params;
|
||||
split_params_parse(argc, argv, params);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
#include <clocale>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
std::string filename = "main";
|
||||
if (argc >= 1) {
|
||||
filename = argv[0];
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
#include <vector>
|
||||
#include <limits.h>
|
||||
#include <cinttypes>
|
||||
#include <clocale>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
|
|
@ -274,6 +275,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,10 @@
|
|||
#include "llama.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <clocale>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
|
@ -486,6 +490,8 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include "log.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <clocale>
|
||||
#include <exception>
|
||||
#include <signal.h>
|
||||
#include <thread> // for std::thread::hardware_concurrency
|
||||
|
|
@ -67,6 +68,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
// own arguments required by this example
|
||||
common_params params;
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <clocale>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
|
|
@ -536,6 +537,8 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
params.out_file = "output.wav";
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue