mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 11:39:08 +00:00

commit ebd09fc83c: Merge branch 'dev'

12 changed files with 1140 additions and 546 deletions
.gitignore (vendored): 5 changes

@@ -67,6 +67,7 @@ autogen-*.md
 /main
 /server
+/profile-tool

 # CI

@@ -136,3 +137,7 @@ poetry.toml
+
+# Video
+*.mp4
+
 # fio
 fio_test*
 *.fio
Makefile: 7 changes

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli
+BUILD_TARGETS = llama-cli profile-tool
 # BUILD_TARGETS = \
 #     libllava.a \
 #     llama-baby-llama \

@@ -1528,6 +1528,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

+profile-tool: tools/profile_tool.cpp \
+    $(OBJ_ALL)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
     (cd examples/batched.swift; make build)
README.md: 18 changes

@@ -98,7 +98,7 @@ Here are the models we have tested so far. You can also try more on Hugging Face
 - **DeepSeek R1-8B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
 - **DeepSeek R1-14B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
 - **DeepSeek R1-32B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
-- **DeepSeek R1-70B (Q4K, Q6K, Q80):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
+- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)

 ## ⚙️ How to Use?

@@ -287,7 +287,15 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe
 > Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU.

-**2. How to run in chat mode like in llama.cpp?**
+**2. How to manually profile my device?**
+
+If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device.
+
+```shell
+./profile-tool -m download/qwq-32b-q4_k_m.gguf
+```
+
+**3. How to run in chat mode like in llama.cpp?**

 To enable chat (conversation) mode, simply add the `-cnv` flag on the head device:

@@ -298,7 +306,7 @@ To enable chat (conversation) mode, simply add the `-cnv` flag on the head devic
 To quit the chat mode, input `quit` or `exit`.

-**3. How to force prefetching after computing?**
+**4. How to force prefetching after computing?**

 By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device:

@@ -309,11 +317,11 @@ By default, prima.cpp only advises the OS to prefetch upcoming layer weights. Th
 This enables more aggressive overlap but also introduces extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior.

-**4. Does it support Windows?**
+**5. Does it support Windows?**

 Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster.

-**5. Does it support Vulkan or AMD GPUs?**
+**6. Does it support Vulkan or AMD GPUs?**

 Not yet. For now, prima.cpp supports only CUDA-based GPUs. Vulkan is on our roadmap, and AMD GPUs will be supported once we have such a device to test on.
@@ -903,11 +903,17 @@ static bool assign_layers_to_device(
     float t_calc_cpu = (
         master.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
         master.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
         master.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
-        master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
         master.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
         master.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
-        master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS)) * 1000; // in ms
+        master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+        master.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+        master.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

     float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
     // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@@ -923,22 +929,34 @@ static bool assign_layers_to_device(
         t_calc_gpu = (
             master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
             master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
             master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
-            master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
             master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
             master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
-            master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

         t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
         // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
     } else {
         t_calc_gpu = (
             master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
             master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
             master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
-            master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
             master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
             master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
-            master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

         t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
         // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
     }
@@ -1115,12 +1133,16 @@ static bool assign_layers_to_device(
     kappa = (
         dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
         dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+        dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
         dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
-        dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
         dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
         dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
-        dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS)) * 1000; // in ms
+        dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+        dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+        dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+        dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+        dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+        dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
     // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms

     kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
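The three hunks above repeat one pattern per quantization type: a layer's profiled FLOPs divided by the device's measured throughput for that type, with `EPS` guarding against unprofiled (zero) speeds, and the sum converted from seconds to milliseconds. A minimal sketch of that per-layer cost model, with a hypothetical `quant_term` input type that is not part of the patch:

```cpp
#include <cstdint>

static constexpr double EPS = 1e-9; // same role as in the patch: avoid dividing by zero

struct quant_term {
    int64_t flops;  // profiled FLOPs of this quant type in one layer
    float   gflops; // measured device throughput for this type, in GFLOPS
};

// per-layer compute latency in milliseconds
static double layer_latency_ms(const quant_term * terms, int n) {
    double t = 0.0;
    for (int i = 0; i < n; ++i) {
        t += terms[i].flops / (terms[i].gflops * 1e9 + EPS);
    }
    return t * 1000.0; // seconds -> ms
}
```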
@@ -1505,6 +1527,12 @@ static bool assign_layers_to_device(
 //

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+
+#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
+    // reset n_gpu_layers to 0 if GPU is not used
+    params.n_gpu_layers = 0;
+#endif
+
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
@@ -1554,19 +1582,13 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t my_rank = params.rank;
     bool auto_schedule = params.n_layer_window[0] == 0;

-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);

     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);
         // assign all layers to this device
         params.n_layer_window[0]  = n_layers;
         cparams.n_layer_window[0] = n_layers;
         mparams.n_layer_window[0] = n_layers;
@@ -1577,16 +1599,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // initialize sockets
     llama_init_sockets(lctx, n_world, my_rank);

-    // synchronize device profile to the master node
-    struct device_info * dev_info_set = nullptr;
+    // broadcast startup args
+    struct startup_args args;
+    if (my_rank == 0) {
+        args.should_profile = auto_schedule;
+    }
+    llama_bcast_startup_args(lctx, my_rank, &args);
+
+    auto_schedule = args.should_profile;
+    // if n_world > 1 and auto schedule is needed, then profile
+    if (auto_schedule) {
+        // get device profile
+        LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+        dev_info.rank = params.rank;
+        if (n_world > 1) {
+            llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+        }
+    }
+
+    // synchronize device profile to the master node
     if (my_rank == 0) {
+        if (auto_schedule) {
+            struct device_info * dev_info_set = nullptr;
             dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
             dev_info_set[0] = dev_info;

             llama_gather_device_info(lctx, dev_info_set);
             device_print_props(dev_info_set, n_world, model, cparams);

-        if (auto_schedule) {
             // automatically determine n_layer_window and n_gpu_layers
             if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                 LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@@ -1601,7 +1641,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
         }
     } else {
+        if (auto_schedule) {
             llama_send_device_info(lctx, &dev_info);
+        }
         llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
@@ -1766,33 +1808,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     return mparams;
 }

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
-}
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16, // added BF16 data type support
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}

 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
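The rewrite above replaces a hand-written if-chain with a table keyed by `ggml_type_name()`, so the accepted strings cannot drift from ggml's canonical names, and adding a type (here `GGML_TYPE_BF16`) becomes a one-line change. One subtlety worth noting: `ggml_type_name(type) == s` compares string contents, not pointers, because `s` is a `std::string`. A self-contained illustration of that comparison rule:

```cpp
#include <cassert>
#include <string>

int main() {
    const char * name = "q8_0"; // stands in for ggml_type_name(GGML_TYPE_Q8_0)
    std::string  s    = "q8_0"; // stands in for the user-supplied string
    assert(name == s);          // operator==(const char *, const std::string &) compares contents
    return 0;
}
```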
@@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);

+    if (n_embd < ggml_blck_size(src0t)) {
+        n_embd = 2 * ggml_blck_size(src0t);
+    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
@@ -208,10 +211,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ctx_cgraph = ggml_init(params0);

     gf = ggml_new_graph(ctx_cgraph);

     cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
+    for (int i = 0; i < n_repeat - 1; i++) {
+        cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+    }

     ggml_build_forward_expand(gf, cur);
 }
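The added loop chains `n_repeat - 1` extra multiplications of the same `n_embd x n_embd` matrix into a single graph, so one timed graph execution amortizes launch and scheduling overhead across many matmuls. Assuming the usual 2n^3 operation count for an n x n matmul, the throughput estimate falls out as in this hypothetical post-processing sketch (names are illustrative, not from the patch):

```cpp
#include <cstdint>

// each n x n matmul costs ~2*n^3 floating-point operations,
// and the benchmark graph chains n_repeat of them
static double estimate_gflops(int64_t n_embd, int64_t n_repeat, double elapsed_sec) {
    const double flops_per_matmul = 2.0 * (double)n_embd * (double)n_embd * (double)n_embd;
    return flops_per_matmul * (double)n_repeat / elapsed_sec / 1e9; // GFLOPS
}
```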
@@ -345,7 +350,6 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
         return 0.0f;
     }

-    size_t QK_K = 0;
     switch (src0t) {
         case GGML_TYPE_F32: {
             matrix_B = malloc(embd_size * sizeof(float));
@@ -364,14 +368,18 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
             ggml_fp32_to_fp16_row(temp_f32.data(), static_cast<ggml_fp16_t *>(matrix_B), embd_size);
             break;
         }
+        case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q8_0:
-            QK_K = ggml_blck_size(src0t);
-            matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t));
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ1_M:
+            matrix_B = malloc((embd_size / ggml_blck_size(src0t)) * ggml_type_size(src0t)); // quantization block sizes differ across quantization methods
             break;
         default:
             LOG_INF("Unsupported type: %d\n", src0t);
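The new `malloc` sizes the quantized buffer directly from ggml's block geometry: a row of `embd_size` elements occupies `embd_size / ggml_blck_size(t) * ggml_type_size(t)` bytes, since each block of `ggml_blck_size(t)` elements is stored in `ggml_type_size(t)` bytes. This also explains the guard added in the earlier hunk that bumps `n_embd` up to at least one block. ggml ships the same arithmetic as `ggml_row_size`; a small equivalence check (assumes `ggml.h` is available):

```cpp
#include <cassert>
#include <cstdint>
#include "ggml.h"

int main() {
    const enum ggml_type t = GGML_TYPE_Q4_K;
    const int64_t n = 4096; // must be a multiple of ggml_blck_size(t)
    const size_t by_hand = (size_t)(n / ggml_blck_size(t)) * ggml_type_size(t);
    assert(by_hand == ggml_row_size(t, n)); // ggml_row_size performs the same math
    return 0;
}
```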
@@ -1349,31 +1357,45 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c

     gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.cuda_flops_f32_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.cuda_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.cuda_flops_q2k_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.cuda_flops_q4k_f32    + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.cuda_flops_q50_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.cuda_flops_q5k_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.cuda_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.cuda_flops_q50_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.cuda_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.cuda_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.cuda_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.cuda_flops_iq1m_f32   + EPS) / 1e9;
 #elif GGML_USE_METAL
     struct gpu_props gpu = dev_info.gpu_props;

     gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.metal_flops_f32_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.metal_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.metal_flops_q2k_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.metal_flops_q4k_f32    + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.metal_flops_q50_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.metal_flops_q5k_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.metal_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.metal_flops_q50_f32    + EPS) / 1e9;
     gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.metal_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.metal_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.metal_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.metal_flops_iq1m_f32   + EPS) / 1e9;
 #endif

     cpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
     cpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
     cpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
     cpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
     cpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
     cpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;

     double total_latency = 0.0f;

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
@@ -1387,11 +1409,16 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c

     total_latency += (double)n_flops.output_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
     total_latency += (double)n_flops.output_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
     total_latency += (double)n_flops.output_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
     total_latency += (double)n_flops.output_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
     total_latency += (double)n_flops.output_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
     total_latency += (double)n_flops.output_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;

     total_latency *= 1000; // convert to ms
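Taken together, `device_compute_delay` prices each layer once on the GPU path (CUDA or Metal, selected at compile time) and once on the CPU, then adds the one-off output head, which the code above always charges to the CPU. A hedged sketch of how those pieces plausibly combine; the exact split in prima.cpp may differ:

```cpp
// cpu_layer_s / gpu_layer_s: per-layer latencies in seconds, as summed above
static double compute_delay_ms(double cpu_layer_s, double gpu_layer_s,
                               int n_layers, int n_gpu_layers, double output_s) {
    double total = gpu_layer_s * n_gpu_layers              // layers offloaded to the GPU
                 + cpu_layer_s * (n_layers - n_gpu_layers) // remaining layers on the CPU
                 + output_s;                               // output head, on the CPU per the code above
    return total * 1000.0;                                 // seconds -> ms
}
```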
@@ -1696,15 +1723,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

-    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q2K x F32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q4K x F32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
     }
     LOG_INF("\n");
@@ -1720,12 +1747,42 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

+    LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q50 x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CPU flops (Q80 x F32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
     }
     LOG_INF("\n");

+    LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ1M x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GiB) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -1882,15 +1939,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

-    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q2KxF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| Metal flops (Q50xF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q4KxF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
     }
     LOG_INF("\n");
@@ -1906,12 +1963,42 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

+    LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q50xF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Metal flops (Q80xF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
     }
     LOG_INF("\n");

+    LOG_INF("| Metal flops (IQ1SxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ1MxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CUDA VRAM Read BW (GB/s) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw);
@@ -1936,15 +2023,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

-    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)  ");
+    LOG_INF("| CUDA flops (Q2KxF32, GFLOPS)  ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32);
     }
     LOG_INF("\n");

-    LOG_INF("| CUDA flops (Q50xF32, GFLOPS)  ");
+    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)  ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
     }
     LOG_INF("\n");
@@ -1960,12 +2047,42 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");

+    LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q50xF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CUDA flops (Q80xF32, GFLOPS)  ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
     }
     LOG_INF("\n");

+    LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (output F32xF32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
     LOG_INF("\n");
@@ -1974,12 +2091,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
     LOG_INF("\n");

-    LOG_INF("| Model flops (output Q4KxF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
+    LOG_INF("| Model flops (output Q2KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32);
     LOG_INF("\n");

-    LOG_INF("| Model flops (output Q50xF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
+    LOG_INF("| Model flops (output Q4KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
     LOG_INF("\n");

     LOG_INF("| Model flops (output Q5KxF32) ");
@@ -1990,10 +2107,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
     LOG_INF("\n");

+    LOG_INF("| Model flops (output IQ2XXSxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q50xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (output Q80xF32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
     LOG_INF("\n");

+    LOG_INF("| Model flops (output IQ1SxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ4NLxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ1MxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32);
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (layer F32xF32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
     LOG_INF("\n");
@@ -2002,12 +2139,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
    LOG_INF("\n");

-    LOG_INF("| Model flops (layer Q4KxF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
+    LOG_INF("| Model flops (layer Q2KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32);
     LOG_INF("\n");

-    LOG_INF("| Model flops (layer Q50xF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
+    LOG_INF("| Model flops (layer Q4KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
     LOG_INF("\n");

     LOG_INF("| Model flops (layer Q5KxF32) ");
@@ -2018,10 +2155,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
     LOG_INF("\n");

+    LOG_INF("| Model flops (layer IQ2XXSxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q50xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (layer Q80xF32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
     LOG_INF("\n");

+    LOG_INF("| Model flops (layer IQ1SxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ4NLxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ1MxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (input F32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
     LOG_INF("\n");
@@ -2030,12 +2187,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16);
     LOG_INF("\n");

-    LOG_INF("| Model params (input Q4K) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
+    LOG_INF("| Model params (input Q2K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k);
     LOG_INF("\n");

-    LOG_INF("| Model params (input Q50) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
+    LOG_INF("| Model params (input Q4K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
     LOG_INF("\n");

     LOG_INF("| Model params (input Q5K) ");
@@ -2046,10 +2203,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k);
     LOG_INF("\n");

+    LOG_INF("| Model params (input IQ2XXS) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q50) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (input Q80) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80);
     LOG_INF("\n");

+    LOG_INF("| Model params (input IQ1S) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ4NL) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ1M) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (layer F32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32);
     LOG_INF("\n");
@@ -2058,12 +2235,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16);
     LOG_INF("\n");

-    LOG_INF("| Model params (layer Q4K) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
+    LOG_INF("| Model params (layer Q2K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k);
     LOG_INF("\n");

-    LOG_INF("| Model params (layer Q50) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
+    LOG_INF("| Model params (layer Q4K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
     LOG_INF("\n");

     LOG_INF("| Model params (layer Q5K) ");
@@ -2074,10 +2251,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k);
     LOG_INF("\n");

+    LOG_INF("| Model params (layer IQ2XXS) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q50) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (layer Q80) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80);
     LOG_INF("\n");

+    LOG_INF("| Model params (layer IQ1S) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ4NL) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ1M) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (output F32) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32);
     LOG_INF("\n");
@@ -2086,12 +2283,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16);
     LOG_INF("\n");

-    LOG_INF("| Model params (output Q4K) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
+    LOG_INF("| Model params (output Q2K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k);
     LOG_INF("\n");

-    LOG_INF("| Model params (output Q50) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
+    LOG_INF("| Model params (output Q4K) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
     LOG_INF("\n");

     LOG_INF("| Model params (output Q5K) ");
@@ -2102,10 +2299,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k);
     LOG_INF("\n");

+    LOG_INF("| Model params (output IQ2XXS) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q50) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
+    LOG_INF("\n");
+
     LOG_INF("| Model params (output Q80) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
     LOG_INF("\n");

+    LOG_INF("| Model params (output IQ1S) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ4NL) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ1M) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m);
+    LOG_INF("\n");
+
     LOG_INF("| Model bytes (input) ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
     LOG_INF("\n");
@@ -2155,17 +2372,44 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + gpu_description_len
         + sizeof(struct disk_props)
         + sizeof(uint32_t)   // cpu_props.cores
-        + sizeof(float) * 7  // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+        + sizeof(float) * 12 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32
+                             // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32
+                             // - cpu_props.flops_iq2xxs_f32
+                             // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32
+                             // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32
+                             // - cpu_props.flops_iq1m_f32
         + sizeof(struct memory_info)
         + sizeof(struct gpu_support)
-        + sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
-                              // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
-                              // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
-                              // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
+        + sizeof(float) * 30; // GPU attributes
+                              // memory:
+                              //   - memory_free, memory_total
+                              //   - metal_read_vram_bw, cuda_read_vram_bw
+                              // Metal floating-point performance:
+                              //   - metal_flops_f32_f32, metal_flops_f16_f32
+                              //   - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32
+                              //   - metal_flops_iq2xxs_f32
+                              //   - metal_flops_q50_f32, metal_flops_q80_f32
+                              //   - metal_flops_iq1s_f32, metal_flops_iq4nl_f32
+                              //   - metal_flops_iq1m_f32
+                              // CUDA floating-point performance:
+                              //   - cuda_flops_f32_f32, cuda_flops_f16_f32
+                              //   - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32
+                              //   - cuda_flops_iq2xxs_f32
+                              //   - cuda_flops_q50_f32, cuda_flops_q80_f32
+                              //   - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32
+                              //   - cuda_flops_iq1m_f32
+                              // delay:
+                              //   - metal_mem_cpy_delay, cuda_mem_cpy_delay

     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;

+    if (*buffer == NULL) {
+        LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n",
+                __func__, total_size);
+        return 0;
+    }
+
     // rank
     memcpy(ptr, &dev_info->rank, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
@@ -2214,10 +2458,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
@@ -2226,9 +2470,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2250,10 +2509,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
@@ -2262,9 +2521,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float));
     ptr += sizeof(float);
@@ -2277,10 +2551,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);

     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
@@ -2289,9 +2563,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);

+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float));

     // no need to synchronize model flops and model params
@@ -2366,10 +2655,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
@@ -2378,9 +2667,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2402,10 +2706,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
@@ -2414,9 +2718,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float));
     ptr += sizeof(float);
@@ -2429,10 +2748,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
@@ -2441,9 +2760,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);

+    memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float));

     // no need to synchronize model flops and model params
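Note how every field added to `serialize` is mirrored, in the same order and with the same sizes, in `deserialize`, and counted in the enlarged `sizeof(float) * 12` and `sizeof(float) * 30` budgets above; the wire format is nothing more than the memcpy order. A hypothetical helper (not in the patch) that would keep the two sides in lockstep by construction:

```cpp
#include <cstddef>
#include <cstring>

struct cursor {
    char * p;      // current position in the buffer
    bool  writing; // true: serialize, false: deserialize
};

static void xfer(cursor & c, void * field, size_t size) {
    if (c.writing) {
        memcpy(c.p, field, size); // struct -> buffer
    } else {
        memcpy(field, c.p, size); // buffer -> struct
    }
    c.p += size;
}

// Both directions then share one field list, e.g.:
//   xfer(c, &dev_info->cpu_props.flops_q2k_f32, sizeof(float));
// so the layout can never diverge between serialize and deserialize.
```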
@@ -17,23 +17,34 @@ struct cpu_props {
     uint32_t cores;
     float flops_f32_f32;    // in GFLOPS
     float flops_f16_f32;    // in GFLOPS
+    float flops_q2k_f32;    // in GFLOPS
     float flops_q4k_f32;    // in GFLOPS
-    float flops_q50_f32;    // in GFLOPS
     float flops_q5k_f32;    // in GFLOPS
     float flops_q6k_f32;    // in GFLOPS
+    float flops_iq2xxs_f32; // in GFLOPS
+    float flops_q50_f32;    // in GFLOPS
     float flops_q80_f32;    // in GFLOPS
+    float flops_iq1s_f32;   // in GFLOPS
+    float flops_iq4nl_f32;  // in GFLOPS
+    float flops_iq1m_f32;   // in GFLOPS

-    cpu_props() :
-        name(""),
+    cpu_props()
+        : name          (""),
         description     (""),
         cores           (0),
         flops_f32_f32   (0.0f),
         flops_f16_f32   (0.0f),
+        flops_q2k_f32   (0.0f),
         flops_q4k_f32   (0.0f),
-        flops_q50_f32(0.0f),
         flops_q5k_f32   (0.0f),
         flops_q6k_f32   (0.0f),
-        flops_q80_f32(0.0f) {}
+        flops_iq2xxs_f32(0.0f),
+        flops_q50_f32   (0.0f),
+        flops_q80_f32   (0.0f),
+        flops_iq1s_f32  (0.0f),
+        flops_iq4nl_f32 (0.0f),
+        flops_iq1m_f32  (0.0f)
+    {}
 };

 struct memory_info {
@@ -82,20 +93,30 @@ struct gpu_props {
     float metal_read_vram_bw;     // in GB/s
     float metal_flops_f32_f32;    // in GFLOPS
     float metal_flops_f16_f32;    // in GFLOPS
+    float metal_flops_q2k_f32;    // in GFLOPS
     float metal_flops_q4k_f32;    // in GFLOPS
-    float metal_flops_q50_f32;    // in GFLOPS
     float metal_flops_q5k_f32;    // in GFLOPS
     float metal_flops_q6k_f32;    // in GFLOPS
+    float metal_flops_iq2xxs_f32; // in GFLOPS
+    float metal_flops_q50_f32;    // in GFLOPS
     float metal_flops_q80_f32;    // in GFLOPS
+    float metal_flops_iq1s_f32;   // in GFLOPS
+    float metal_flops_iq4nl_f32;  // in GFLOPS
+    float metal_flops_iq1m_f32;   // in GFLOPS
     float metal_mem_cpy_delay;    // in ms
     float cuda_read_vram_bw;      // in GB/s
     float cuda_flops_f32_f32;     // in GFLOPS
     float cuda_flops_f16_f32;     // in GFLOPS
+    float cuda_flops_q2k_f32;     // in GFLOPS
     float cuda_flops_q4k_f32;     // in GFLOPS
-    float cuda_flops_q50_f32;     // in GFLOPS
     float cuda_flops_q5k_f32;     // in GFLOPS
     float cuda_flops_q6k_f32;     // in GFLOPS
+    float cuda_flops_iq2xxs_f32;  // in GFLOPS
+    float cuda_flops_q50_f32;     // in GFLOPS
     float cuda_flops_q80_f32;     // in GFLOPS
+    float cuda_flops_iq1s_f32;    // in GFLOPS
+    float cuda_flops_iq4nl_f32;   // in GFLOPS
+    float cuda_flops_iq1m_f32;    // in GFLOPS
     float cuda_mem_cpy_delay;     // in ms

     gpu_props() :
@ -106,20 +127,30 @@ struct gpu_props {
|
|||
metal_read_vram_bw (0.0f),
|
||||
metal_flops_f32_f32 (0.0f),
|
||||
metal_flops_f16_f32 (0.0f),
|
||||
metal_flops_q2k_f32 (0.0f),
|
||||
metal_flops_q4k_f32 (0.0f),
|
||||
metal_flops_q50_f32(0.0f),
|
||||
metal_flops_q5k_f32 (0.0f),
|
||||
metal_flops_q6k_f32 (0.0f),
|
||||
metal_flops_iq2xxs_f32 (0.0f),
|
||||
metal_flops_q50_f32 (0.0f),
|
||||
metal_flops_q80_f32 (0.0f),
|
||||
metal_flops_iq1s_f32 (0.0f),
|
||||
metal_flops_iq4nl_f32 (0.0f),
|
||||
metal_flops_iq1m_f32 (0.0f),
|
||||
metal_mem_cpy_delay (0.0f),
|
||||
cuda_read_vram_bw (0.0f),
|
||||
cuda_flops_f32_f32 (0.0f),
|
||||
cuda_flops_f16_f32 (0.0f),
|
||||
cuda_flops_q2k_f32 (0.0f),
|
||||
cuda_flops_q4k_f32 (0.0f),
|
||||
cuda_flops_q50_f32 (0.0f),
|
||||
cuda_flops_q5k_f32 (0.0f),
|
||||
cuda_flops_q6k_f32 (0.0f),
|
||||
cuda_flops_iq2xxs_f32 (0.0f),
|
||||
cuda_flops_q50_f32 (0.0f),
|
||||
cuda_flops_q80_f32 (0.0f),
|
||||
cuda_flops_iq1s_f32 (0.0f),
|
||||
cuda_flops_iq4nl_f32 (0.0f),
|
||||
cuda_flops_iq1m_f32 (0.0f),
|
||||
cuda_mem_cpy_delay (0.0f) {}
|
||||
};
|
||||
|
||||
|
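Each `*_flops_*` field defaults to `0.0f` and is presumably overwritten only when the matching dtype is actually benchmarked, so a zero can be read as "not profiled". A hedged sketch of the timing-to-GFLOPS conversion such a probe would perform (hypothetical helper, not the project's profiler code):

```cpp
// Hypothetical sketch: convert a timed benchmark into the GFLOPS numbers
// stored in cpu_props / gpu_props. n_ops is the number of floating-point
// operations the benchmark graph performs; elapsed_ms is wall-clock time.
#include <cstdint>

static float measured_gflops(int64_t n_ops, double elapsed_ms) {
    if (elapsed_ms <= 0.0) return 0.0f;   // keep the 0.0f "unprofiled" default
    return (float)((double)n_ops / (elapsed_ms * 1e-3) / 1e9);
}
```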
@@ -127,82 +158,134 @@ struct model_flops {
     float   inp_embd_ms;
     int64_t output_f32_f32;
     int64_t output_f16_f32;
     int64_t output_q2k_f32;
     int64_t output_q4k_f32;
-    int64_t output_q50_f32;
     int64_t output_q5k_f32;
     int64_t output_q6k_f32;
+    int64_t output_iq2xxs_f32;
+    int64_t output_q50_f32;
     int64_t output_q80_f32;
+    int64_t output_iq1s_f32;
+    int64_t output_iq4nl_f32;
+    int64_t output_iq1m_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
     int64_t layer_q2k_f32;
     int64_t layer_q4k_f32;
-    int64_t layer_q50_f32;
     int64_t layer_q5k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_iq2xxs_f32;
+    int64_t layer_q50_f32;
     int64_t layer_q80_f32;
+    int64_t layer_iq1s_f32;
+    int64_t layer_iq4nl_f32;
+    int64_t layer_iq1m_f32;

     model_flops() :
         inp_embd_ms (0.0f),
         output_f32_f32 (0),
         output_f16_f32 (0),
         output_q2k_f32 (0),
         output_q4k_f32 (0),
-        output_q50_f32(0),
         output_q5k_f32 (0),
         output_q6k_f32 (0),
+        output_iq2xxs_f32 (0),
+        output_q50_f32 (0),
         output_q80_f32 (0),
+        output_iq1s_f32 (0),
+        output_iq4nl_f32 (0),
+        output_iq1m_f32 (0),
         layer_f32_f32 (0),
         layer_f16_f32 (0),
         layer_q2k_f32 (0),
         layer_q4k_f32 (0),
-        layer_q50_f32 (0),
         layer_q5k_f32 (0),
         layer_q6k_f32 (0),
-        layer_q80_f32 (0) {}
+        layer_iq2xxs_f32 (0),
+        layer_q50_f32 (0),
+        layer_q80_f32 (0),
+        layer_iq1s_f32 (0),
+        layer_iq4nl_f32 (0),
+        layer_iq1m_f32 (0)
+        {}
 };

 struct model_params {
     int64_t input_f32;
     int64_t input_f16;
     int64_t input_q2k;
     int64_t input_q4k;
-    int64_t input_q50;
     int64_t input_q5k;
     int64_t input_q6k;
+    int64_t input_iq2xxs;
+    int64_t input_q50;
     int64_t input_q80;
+    int64_t input_iq1s;
+    int64_t input_iq4nl;
+    int64_t input_iq1m;
     int64_t output_f32;
     int64_t output_f16;
     int64_t output_q2k;
     int64_t output_q4k;
-    int64_t output_q50;
     int64_t output_q5k;
     int64_t output_q6k;
+    int64_t output_iq2xxs;
+    int64_t output_q50;
     int64_t output_q80;
+    int64_t output_iq1s;
+    int64_t output_iq4nl;
+    int64_t output_iq1m;
     int64_t layer_f32;
     int64_t layer_f16;
     int64_t layer_q2k;
     int64_t layer_q4k;
-    int64_t layer_q50;
     int64_t layer_q5k;
     int64_t layer_q6k;
+    int64_t layer_iq2xxs;
+    int64_t layer_q50;
     int64_t layer_q80;
+    int64_t layer_iq1s;
+    int64_t layer_iq4nl;
+    int64_t layer_iq1m;

     model_params() :
         input_f32 (0),
         input_f16 (0),
         input_q2k (0),
         input_q4k (0),
-        input_q50 (0),
         input_q5k (0),
         input_q6k (0),
+        input_iq2xxs (0),
+        input_q50 (0),
         input_q80 (0),
+        input_iq1s (0),
+        input_iq4nl (0),
+        input_iq1m (0),
         output_f32 (0),
         output_f16 (0),
         output_q2k (0),
         output_q4k (0),
-        output_q50(0),
         output_q5k (0),
         output_q6k (0),
+        output_iq2xxs (0),
+        output_q50 (0),
         output_q80 (0),
+        output_iq1s (0),
+        output_iq4nl (0),
+        output_iq1m (0),
         layer_f32 (0),
         layer_f16 (0),
         layer_q2k (0),
         layer_q4k (0),
-        layer_q50 (0),
         layer_q5k (0),
         layer_q6k (0),
-        layer_q80 (0) {}
+        layer_iq2xxs (0),
+        layer_q50 (0),
+        layer_q80 (0),
+        layer_iq1s (0),
+        layer_iq4nl (0),
+        layer_iq1m (0)
+        {}
 };

 struct model_bytes {
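The per-dtype counters in `model_flops` and `model_params` let the scheduler price the same model differently on each device: each FLOP count is weighted by that device's measured speed for the matching quantization type. A rough sketch of how the counters and the profiled GFLOPS could combine into a per-layer latency estimate (hypothetical helper, reduced to three dtypes):

```cpp
// Hypothetical sketch: estimate per-layer compute time on one device by
// weighting each dtype's FLOP count with that device's measured speed.
#include <cstdint>

struct layer_flops  { int64_t q4k, q6k, q80; };  // FLOPs per layer, by dtype
struct device_speed { float   q4k, q6k, q80; };  // profiled GFLOPS, by dtype

static double estimate_layer_ms(const layer_flops & f, const device_speed & s) {
    double ms = 0.0;
    if (s.q4k > 0.0f) ms += (double)f.q4k / ((double)s.q4k * 1e9) * 1e3;
    if (s.q6k > 0.0f) ms += (double)f.q6k / ((double)s.q6k * 1e9) * 1e3;
    if (s.q80 > 0.0f) ms += (double)f.q80 / ((double)s.q80 * 1e9) * 1e3;
    return ms;  // milliseconds per transformer layer on this device
}
```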
@@ -229,6 +312,10 @@ struct disk_props {
         write_rnd_bw(0.0f) {}
 };

+struct startup_args {
+    bool should_profile;
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;
@@ -385,12 +385,12 @@ extern "C" {
         GGML_TYPE_F64   = 28,
         GGML_TYPE_IQ1_M = 29,
         GGML_TYPE_BF16  = 30,
-        GGML_TYPE_Q4_0_4_4 = 31,
-        GGML_TYPE_Q4_0_4_8 = 32,
-        GGML_TYPE_Q4_0_8_8 = 33,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0 = 34,
         GGML_TYPE_TQ2_0 = 35,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision

@@ -431,9 +431,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22,   // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23,   // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16   = 24,   // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };

     // available tensor operations:
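Commenting members out of a C enum shifts any later value that relies on implicit numbering, which is why `GGML_TYPE_COUNT` is pinned to an explicit `39` above rather than left auto-assigned. A minimal illustration of the pitfall (hypothetical enums, not from the codebase):

```cpp
// Hypothetical illustration: an implicit trailing counter shrinks when
// members are removed, silently breaking arrays sized by the enum.
enum kind_implicit { KI_A, KI_B, /* KI_C, */ KI_COUNT };                  // KI_COUNT is now 2, not 3
enum kind_pinned   { KP_A = 0, KP_B = 1, /* KP_C = 2, */ KP_COUNT = 3 };  // stays 3
```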
@@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-            } break;
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-            } break;
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:

@@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
     },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name = "q4_0_4x4",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 4,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_ref = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-        .nrows = 1,
-        .ncols = 4,
-        .gemv = ggml_gemv_q4_0_4x4_q8_0,
-        .gemm = ggml_gemm_q4_0_4x4_q8_0,
-    },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name = "q4_0_4x8",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_ref = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-        .nrows = 1,
-        .ncols = 4,
-        .gemv = ggml_gemv_q4_0_4x8_q8_0,
-        .gemm = ggml_gemm_q4_0_4x8_q8_0,
-    },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name = "q4_0_8x8",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float = NULL,
-        .from_float_ref = NULL,
-        .vec_dot = NULL,
-        .vec_dot_type = GGML_TYPE_Q8_0,
-        .nrows = 1,
-        .ncols = 8,
-        .gemv = ggml_gemv_q4_0_8x8_q8_0,
-        .gemm = ggml_gemm_q4_0_8x8_q8_0,
-    },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
         .blck_size = QK_K,

@@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
         case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
         case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = GGML_TYPE_Q4_0_8_8; break;
         case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT;    break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT;    break;
     }

@@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;

@@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add1_q_f32(params, dst);
             } break;

@@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");

@@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_out_prod_q_f32(params, dst);
             } break;

@@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");

@@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_get_rows_q(params, dst);
             } break;

@@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:

@@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -165,18 +165,18 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S   = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XS   = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS  = 23, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors, 1 bit quantization
         LLAMA_FTYPE_MOSTLY_IQ4_NL   = 25, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_S    = 26, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_M    = 27, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_S    = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M    = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS   = 30, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors, 1 bit quantization
         LLAMA_FTYPE_MOSTLY_BF16     = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors

@@ -453,6 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets      (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info  (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
241
src/llama.cpp
@@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
         case GGML_TYPE_F32:
         case GGML_TYPE_F16:
             return true;
         case GGML_TYPE_Q2_K:
             return n_params->layer_q2k > 0 || n_params->output_q2k > 0;
         case GGML_TYPE_Q4_K:
             return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
-        case GGML_TYPE_Q5_0:
-            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
         case GGML_TYPE_Q5_K:
             return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
         case GGML_TYPE_Q6_K:
             return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+        case GGML_TYPE_IQ2_XXS:
+            return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
+        case GGML_TYPE_Q5_0:
+            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
         case GGML_TYPE_Q8_0:
             return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+        case GGML_TYPE_IQ1_S:
+            return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0;
+        case GGML_TYPE_IQ4_NL:
+            return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
+        case GGML_TYPE_IQ1_M:
+            return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0;
         default:
             throw std::runtime_error("Unrecognized data type\n");
     }
@@ -3650,18 +3660,18 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
     }

     if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
         dev_info->cpu_props.flops_q2k_f32       = device_cpu_flops  (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q2k_f32  = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
     }

     if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
         dev_info->cpu_props.flops_q4k_f32       = device_cpu_flops  (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     }

-    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
-        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
-        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-    }
-
     if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
         dev_info->cpu_props.flops_q5k_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);

@@ -3674,11 +3684,42 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
+        dev_info->cpu_props.flops_iq2xxs_f32       = device_cpu_flops  (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq2xxs_f32 = device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq2xxs_f32  = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
         dev_info->cpu_props.flops_q80_f32       = device_cpu_flops  (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
+        dev_info->cpu_props.flops_iq1s_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1s_f32 = device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1s_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
+        dev_info->cpu_props.flops_iq4nl_f32       = device_cpu_flops  (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq4nl_f32  = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
+        dev_info->cpu_props.flops_iq1m_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1m_f32 = device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1m_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -4844,9 +4885,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ4_NL:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;   break;
             case GGML_TYPE_IQ4_XS:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;   break;
             case GGML_TYPE_IQ3_S:    ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;    break;
-            case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));

@@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

        default: return "unknown, may not work";
    }

@@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
        new_type = GGML_TYPE_IQ3_S;
    }
-   else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-            new_type == GGML_TYPE_Q4_0_8_8) {
-       new_type = GGML_TYPE_Q4_0;
-   }
    else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
        new_type = GGML_TYPE_Q4_K;
    }

@@ -19323,9 +19355,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:   default_type = GGML_TYPE_IQ4_XS; break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:    default_type = GGML_TYPE_IQ3_S;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S;  break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-       case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

@@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            f32_data = (float *) f32_conv_buf.data();
        }

-       int chunk_size_multiplier = 1;
-       if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-           if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-           else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-           if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-           else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-       }
-
        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
        fflush(stdout);

@@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        const int64_t nrows = tensor->ne[1];

        static const int64_t min_chunk_size = 32 * 512;
-       const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                  chunk_size_multiplier;
+       const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

        const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
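With the repacked Q4_0 variants gone, the chunk size needs no multiplier: the formula simply rounds the work unit up to at least `min_chunk_size` elements while keeping it a whole multiple of the row length. A worked example with assumed numbers:

```cpp
// Worked example of the chunk_size formula above (assumed values):
// min_chunk_size = 32 * 512 = 16384 elements.
// For a tensor with n_per_row = 4096 (< 16384):
//   (16384 + 4096 - 1) / 4096 = 4   // integer division computes ceil(16384/4096)
//   chunk_size = 4096 * 4 = 16384   // four whole rows per work unit
// so every quantization thread always receives an integer number of rows.
```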
@@ -20242,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }

+int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank == 0) {
+        // send
+        try {
+            std::vector<zmq::message_t> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t & e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    } else {
+        // receive
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if ((int)rank != (int)n_world - 1) {
+            // send
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t & e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
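`llama_bcast_startup_args` shows the broadcast convention used by these socket helpers: rank 0 pushes one multipart ZeroMQ message to its successor, and each intermediate rank re-sends what it received, so a logical broadcast costs `n_world - 1` point-to-point hops along the device chain. A stripped-down sketch of that relay step with cppzmq (hypothetical sockets, not the project's actual context object):

```cpp
// Hypothetical relay sketch using cppzmq (zmq.hpp / zmq_addon.hpp): a
// non-head rank receives a multipart message from its predecessor and
// forwards it unchanged to its successor, forming a pipeline broadcast.
#include <zmq.hpp>
#include <zmq_addon.hpp>
#include <vector>
#include <iterator>

static int relay_once(zmq::socket_t & from_prev, zmq::socket_t & to_next, bool is_last) {
    std::vector<zmq::message_t> parts;
    if (!zmq::recv_multipart(from_prev, std::back_inserter(parts))) return -1;
    if (!is_last) zmq::send_multipart(to_next, parts);  // the tail rank ends the chain
    return 0;
}
```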
@@ -21054,21 +21114,36 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_F16:
                     n_flops->output_f16_f32 += n;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_flops->output_q2k_f32 += n;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_flops->output_q4k_f32 += n;
                     break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
-                    break;
                 case GGML_TYPE_Q5_K:
                     n_flops->output_q5k_f32 += n;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_flops->output_q6k_f32 += n;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32 += n;
+                    break;
                 case GGML_TYPE_Q8_0:
                     n_flops->output_q80_f32 += n;
                     break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->output_iq1s_f32 += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->output_iq4nl_f32 += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }

@@ -21082,21 +21157,36 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_F16:
                     n_flops->layer_f16_f32 += n;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_flops->layer_q2k_f32 += n;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_flops->layer_q4k_f32 += n;
                     break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
-                    break;
                 case GGML_TYPE_Q5_K:
                     n_flops->layer_q5k_f32 += n;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_flops->layer_q6k_f32 += n;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->layer_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->layer_q50_f32 += n;
+                    break;
                 case GGML_TYPE_Q8_0:
                     n_flops->layer_q80_f32 += n;
                     break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->layer_iq1s_f32 += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->layer_iq4nl_f32 += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->layer_iq1m_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
             }

@@ -21118,21 +21208,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_F16:
                     n_params->input_f16 += n_i64t;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_params->input_q2k += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_params->input_q4k += n_i64t;
                     break;
-                case GGML_TYPE_Q5_0:
-                    n_params->input_q50 += n_i64t;
-                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->input_q5k += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_params->input_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->input_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->input_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q8_0:
                     n_params->input_q80 += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->input_iq1s += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->input_iq4nl += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->input_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }

@@ -21146,21 +21251,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_F16:
                     n_params->output_f16 += n_i64t;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_params->output_q2k += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_params->output_q4k += n_i64t;
                     break;
-                case GGML_TYPE_Q5_0:
-                    n_params->output_q50 += n_i64t;
-                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->output_q5k += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_params->output_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->output_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->output_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q8_0:
                     n_params->output_q80 += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->output_iq1s += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->output_iq4nl += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->output_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }

@@ -21174,21 +21294,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_F16:
                     n_params->layer_f16 += n_i64t;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_params->layer_q2k += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_params->layer_q4k += n_i64t;
                     break;
-                case GGML_TYPE_Q5_0:
-                    n_params->layer_q50 += n_i64t;
-                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->layer_q5k += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_params->layer_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q8_0:
                     n_params->layer_q80 += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->layer_iq1s += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->layer_iq4nl += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
             }
@@ -21479,19 +21614,29 @@ void llama_model_n_flops(
     // use average values instead of total values
     n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
     n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
     n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
     n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
-    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
     n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
     n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
     n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);

     n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
     n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
     n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
     n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
-    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
     n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
+    n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
+    n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);

     n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
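Dividing by `n_layer` turns the totals into per-layer averages, so evaluating a candidate layer split needs only a multiplication instead of another walk over the model graph. A small worked example with assumed numbers:

```cpp
// Hypothetical arithmetic showing why per-layer averages are stored:
// total layer FLOPs across an 80-layer model (Q4K weights): 5.6e13
// per-layer average: 5.6e13 / 80 = 7.0e11 FLOPs
// a device assigned 16 layers (e.g. one entry of -lw) is then charged
// 16 * 7.0e11 = 1.12e13 FLOPs without re-profiling or re-counting.
```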
62
tools/profile_tool.cpp
Normal file
@@ -0,0 +1,62 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "log.h"
+#include "llama.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+        return 1;
+    }
+
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    if (params.rope_freq_base != 0.0) {
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+    }
+
+    // load the model and apply lora adapter, if any
+    auto mparams = llama_model_params_from_gpt_params(params);
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    struct llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
+
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return -1;
+    }
+
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    device_print_props(&dev_info, 1, model, cparams);
+
+    llama_free_model(model);
+    return 0;
+}