Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	pyproject.toml
Commit ffe23f0e93
23 changed files with 158 additions and 160 deletions
@@ -92,6 +92,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -157,6 +158,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,

@@ -229,11 +229,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         params.logit_bias.data()));
 
     if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
                 case COMMON_SAMPLER_TYPE_DRY:
@@ -253,6 +248,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_TOP_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
                     break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    break;
                 case COMMON_SAMPLER_TYPE_MIN_P:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
                     break;
@@ -275,7 +273,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     GGML_ASSERT(false && "unknown sampler type");
             }
         }
-        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
@@ -475,6 +472,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@@ -490,6 +488,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@@ -504,6 +503,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -517,6 +517,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -552,6 +553,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },

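Note: with these hunks, top-n-sigma stops being a special case that replaces the whole chain and instead becomes a regular member of the sampler chain. A minimal sketch of how a caller might enable it through the common sampling parameters; the include paths, helper function and concrete values below are illustrative assumptions, not part of this commit:

    // Sketch only: assumes the usual llama.cpp common headers are on the include path.
    #include "common.h"
    #include "sampling.h"

    static struct common_sampler * make_sampler(const struct llama_model * model) {
        common_params_sampling sparams;   // default samplers now include COMMON_SAMPLER_TYPE_TOP_N_SIGMA
        sparams.top_n_sigma = 1.5f;       // illustrative value; a negative value leaves the filter disabled
        sparams.temp        = 0.8f;

        // common_sampler_init() walks sparams.samplers and, for COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        // adds llama_sampler_init_top_n_sigma() to the chain (see the -253/+248 hunk above).
        return common_sampler_init(model, sparams);
    }
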
@@ -2761,6 +2761,13 @@ class Qwen2MoeModel(TextModel):
         if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
             self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
             logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
 
     _experts: list[dict[str, Tensor]] | None = None
 

@@ -38,7 +38,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_buffer_init_tensor    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t           ggml_backend_buffer_get_alignment  (ggml_backend_buffer_t buffer);
     GGML_API size_t           ggml_backend_buffer_get_max_size   (ggml_backend_buffer_t buffer);
-    GGML_API size_t           ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t           ggml_backend_buffer_get_alloc_size (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
     GGML_API void             ggml_backend_buffer_clear          (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool             ggml_backend_buffer_is_host        (ggml_backend_buffer_t buffer);
     GGML_API void             ggml_backend_buffer_set_usage      (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

@@ -686,11 +686,15 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
     GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
+    GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
+
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
 

@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
     return SIZE_MAX;
 }
 
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
 }
 
-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
 

@@ -719,6 +719,7 @@ void launch_fattn(
     size_t nb23 = V->nb[3];
 
     if (need_f16_K && K->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(K));
         K_f16.alloc(ggml_nelements(K));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
         to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -733,6 +734,7 @@ void launch_fattn(
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(V));
         V_f16.alloc(ggml_nelements(V));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
         to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);

@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
 
     if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+        const size_t original_size = ggml_nbytes(tensor);
+        const size_t padded_size   = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
@@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
+            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }
@@ -801,6 +802,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
 
 static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
 
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
@@ -852,6 +854,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
 
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
 
@@ -890,6 +893,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
 
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
 
@@ -971,6 +975,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
 
 static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
 
     size_t total_size = 0;
 
@@ -1532,6 +1537,8 @@ static void ggml_cuda_op_mul_mat(
 
             // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
             if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+                GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+                GGML_ASSERT(!src0->view_src);
                 const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
                 const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
                 CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
@@ -2065,6 +2072,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
             ggml_tensor src0_slice = *src0;
             src0_slice.ne[2]    = 1;
             src0_slice.nb[3]    = src0_slice.nb[2];
+            src0_slice.op       = GGML_OP_VIEW;
+            src0_slice.view_src = dst->src[0]; // non-const pointer to src0
             src0_slice.data     = (char *) src0->data + i02*nb02;
 
             ggml_tensor src1_slice;

@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
     const float * src1_d = (const float *) src1->data;
     float * dst_d = (float *) dst->data;
 
+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+        GGML_ASSERT(!src0->view_src);
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const int64_t s01 = src0->nb[1] / ts_src0;
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
 
         const mmq_args args = {
             src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
-            ne00, ne01, ne1, s01, s1,
+            ne00, ne01, ne1, s01, ne11, s1,
             ne02, ne12, s02, s12, s2,
             ne03, ne13, s03, s13, s3,
             use_stream_k};
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
     const mmq_args args = {
         src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
-        ne00, ne01, ne_get_rows, s01, s1,
+        ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
         ne02, ne02, s02, s12, s2,
         ne03, ne13, s03, s13, s3,
         use_stream_k};
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
         ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
-        ne00, row_diff, src1_ncols, stride01, nrows_dst,
+        ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
         1, 1, 0, 0, 0,
         1, 1, 0, 0, 0,
         use_stream_k};

@@ -2523,7 +2523,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
 static __device__ __forceinline__ void mul_mat_q_process_tile(
     const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
     const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
-    const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
+    const int nrows_x, const int stride_row_x, const int ncols_y, const int stride_col_dst,
     const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
 
     constexpr int qk = ggml_cuda_type_traits<type>::qk;
@@ -2607,7 +2607,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 static __global__ void mul_mat_q(
     const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
     const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
+    const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
     const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
     const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
 
@@ -2620,7 +2620,7 @@ static __global__ void mul_mat_q(
     constexpr int qk = ggml_cuda_type_traits<type>::qk;
     constexpr int mmq_y = get_mmq_y_device();
 
-    const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
+    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
     const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
 
     // Initialize the ids for writing back data with just the index.
@@ -2649,8 +2649,8 @@ static __global__ void mul_mat_q(
 
         // Defaults for regular matrix multiplication:
         int col_low = 0;
-        int col_high = ncols_y;
-        int col_diff = ncols_y;
+        int col_high = ncols_dst;
+        int col_diff = ncols_dst;
         int offset_y = wt*stride_sample_y + zt*stride_channel_y;
         int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
 
@@ -2690,7 +2690,7 @@ static __global__ void mul_mat_q(
 
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
              tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }
@@ -2721,8 +2721,8 @@ static __global__ void mul_mat_q(
 
         // Defaults for regular matrix multiplication:
         int col_low = 0;
-        int col_high = ncols_y;
-        int col_diff = ncols_y;
+        int col_high = ncols_dst;
+        int col_diff = ncols_dst;
         int offset_y = wt*stride_sample_y + zt*stride_channel_y;
         int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
 
@@ -2768,7 +2768,7 @@ static __global__ void mul_mat_q(
 
         constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
              tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
 
         kbc += blocks_per_ne00;
@@ -2793,8 +2793,8 @@ static __global__ void mul_mat_q(
 
     // Defaults for regular matrix multiplication:
     int col_low = 0;
-    int col_high = ncols_y;
-    int col_diff = ncols_y;
+    int col_high = ncols_dst;
+    int col_diff = ncols_dst;
     int offset_y = wt*stride_sample_y + zt*stride_channel_y;
     int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
 
@@ -2835,7 +2835,7 @@ static __global__ void mul_mat_q(
 
     constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
     mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-        (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+        (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, stride_row_x, ncols_y, stride_col_dst,
          tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
 }
 
@@ -2843,7 +2843,7 @@ static __global__ void mul_mat_q(
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 static __global__ void mul_mat_q_stream_k_fixup(
     const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
+    const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
     const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
     constexpr int mmq_y = get_mmq_y_device();
     constexpr int qk = ggml_cuda_type_traits<type>::qk;
@@ -2852,7 +2852,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
 
     float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
 
-    const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
+    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
     const int nty = (nrows_x + mmq_y - 1) / mmq_y;
 
     const int bidx0 = blockIdx.x;
@@ -2927,7 +2927,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
     dst += offset_dst;
 
     const int i_max = nrows_x - it*mmq_y - 1;
-    const int j_max = ncols_y - jt*mmq_x - 1;
+    const int j_max = ncols_dst - jt*mmq_x - 1;
 
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -2990,7 +2990,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
 
 struct mmq_args {
     const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
-    int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
+    int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
     int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
     int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
     bool use_stream_k;
@@ -3027,7 +3027,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 
     const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
-    const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
+    const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
     const int ntzw = args.nchannels_y * args.nsamples_y;
     const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
 
@@ -3041,14 +3041,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         constexpr bool need_check = false;
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
     } else {
         constexpr bool need_check = true;
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
     }
@@ -3069,7 +3069,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
 
@@ -3078,14 +3078,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         }
 
         mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
             args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
     } else {
         constexpr bool need_check = true;
 
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
             channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
             sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
 
@@ -3094,7 +3094,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         }
 
         mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
             args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
     }
 }

@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
     const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
     float * dst_d = (float *) dst->data;
 
+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+        GGML_ASSERT(!src0->view_src);
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
     ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
     {

@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
     const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
     const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
     const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+    GGML_ASSERT(ne00 % 4 == 0);
     GGML_ASSERT(ne0 % (4*QK8_1) == 0);
 
     const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);

@@ -1312,6 +1312,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
     return ggml_is_contiguous_n(tensor, 2);
 }
 
+bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+}
+
 bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 

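The difference from ggml_is_contiguous() is that a permuted view still counts as contiguously allocated as long as its bytes form one gap-free block. A minimal standalone sketch (not part of this commit; assumes a ggml build that already includes the new predicate):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { 16*1024*1024, nullptr, false };
        struct ggml_context * ctx = ggml_init(ip);

        // 4x3 F32 tensor, then a view of it with dims 0 and 1 permuted
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
        struct ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3);

        // a: contiguous and contiguously allocated
        // p: not contiguous (strides are permuted), but still backed by one gap-free block
        printf("%d %d\n", ggml_is_contiguous(a), ggml_is_contiguously_allocated(a)); // expected: 1 1
        printf("%d %d\n", ggml_is_contiguous(p), ggml_is_contiguously_allocated(p)); // expected: 0 1

        ggml_free(ctx);
        return 0;
    }

This is the property the new CUDA-side asserts above rely on before converting K/V to F16 or clearing padding with a single memset over the whole allocation.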
@@ -1,7 +0,0 @@
-# pyright: reportUnusedImport=false
-
-from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
-from .gguf_dump import main as gguf_dump_entrypoint
-from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
-from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
-from .gguf_editor_gui import main as gguf_editor_gui_entrypoint

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.16.2"
+version = "0.16.3"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -36,8 +36,8 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
-gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
-gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
-gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
-gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
+gguf-convert-endian = "gguf.scripts.gguf_convert_endian:main"
+gguf-dump = "gguf.scripts.gguf_dump:main"
+gguf-set-metadata = "gguf.scripts.gguf_set_metadata:main"
+gguf-new-metadata = "gguf.scripts.gguf_new_metadata:main"
+gguf-editor-gui = "gguf.scripts.gguf_editor_gui:main"

@@ -782,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
     }
 
-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
     }

@@ -1750,6 +1750,10 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
+    if (ctx->n < 0.0f) {
+        return;
+    }
+
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;

@@ -1,63 +0,0 @@
-#include <stdio.h>
-#include <assert.h>
-
-#include "mtmd.h"
-
-int main(void) {
-    printf("\n\nTesting libmtmd C API...\n");
-    printf("--------\n\n");
-
-    struct mtmd_context_params params = mtmd_context_params_default();
-    printf("Default image marker: %s\n", params.image_marker);
-
-    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();
-
-    if (!chunks) {
-        fprintf(stderr, "Failed to create input chunks\n");
-        return 1;
-    }
-
-    size_t n_chunks = mtmd_input_chunks_size(chunks);
-    printf("Number of chunks: %zu\n", n_chunks);
-    assert(n_chunks > 0);
-
-    for (size_t i = 0; i < n_chunks; i++) {
-        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
-        assert(chunk != NULL);
-        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
-        printf("Chunk %zu type: %d\n", i, type);
-
-        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            size_t n_tokens;
-            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
-            printf(" Text chunk with %zu tokens\n", n_tokens);
-            assert(tokens != NULL);
-            assert(n_tokens > 0);
-            for (size_t j = 0; j < n_tokens; j++) {
-                assert(tokens[j] >= 0);
-                printf(" > Token %zu: %d\n", j, tokens[j]);
-            }
-
-        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-            size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
-            size_t nx = mtmd_image_tokens_get_nx(image_tokens);
-            size_t ny = mtmd_image_tokens_get_ny(image_tokens);
-            const char * id = mtmd_image_tokens_get_id(image_tokens);
-            assert(n_tokens > 0);
-            assert(nx > 0);
-            assert(ny > 0);
-            assert(id != NULL);
-            printf(" Image chunk with %zu tokens\n", n_tokens);
-            printf(" Image size: %zu x %zu\n", nx, ny);
-            printf(" Image ID: %s\n", id);
-        }
-    }
-
-    // Free the chunks
-    mtmd_input_chunks_free(chunks);
-
-    printf("\n\nDONE: test libmtmd C API...\n");
-
-    return 0;
-}

Binary file not shown.
@@ -146,6 +146,7 @@ struct slot_params {
             {"top_k",           sampling.top_k},
             {"top_p",           sampling.top_p},
             {"min_p",           sampling.min_p},
+            {"top_n_sigma",     sampling.top_n_sigma},
             {"xtc_probability", sampling.xtc_probability},
             {"xtc_threshold",   sampling.xtc_threshold},
             {"typical_p",       sampling.typ_p},
@@ -248,6 +249,7 @@ struct server_task {
         params.sampling.top_k           = json_value(data, "top_k",           defaults.sampling.top_k);
         params.sampling.top_p           = json_value(data, "top_p",           defaults.sampling.top_p);
         params.sampling.min_p           = json_value(data, "min_p",           defaults.sampling.min_p);
+        params.sampling.top_n_sigma     = json_value(data, "top_n_sigma",     defaults.sampling.top_n_sigma);
         params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
         params.sampling.xtc_threshold   = json_value(data, "xtc_threshold",   defaults.sampling.xtc_threshold);
         params.sampling.typ_p           = json_value(data, "typical_p",       defaults.sampling.typ_p);

@@ -157,6 +157,9 @@ export default function ChatScreen() {
     clearExtraContext();
   };
 
+  // for vscode context
+  textarea.refOnSubmit.current = sendNewMessage;
+
   const handleEditMessage = async (msg: Message, content: string) => {
     if (!viewingChat) return;
     setCurrNodeId(msg.id);

@@ -37,6 +37,7 @@ export interface ChatTextareaApi {
   setValue: (value: string) => void;
   focus: () => void;
   ref: React.RefObject<HTMLTextAreaElement>;
+  refOnSubmit: React.MutableRefObject<(() => void) | null>; // Submit handler
   onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
 }
 
@@ -46,6 +47,7 @@ export interface ChatTextareaApi {
 export function useChatTextarea(initValue: string): ChatTextareaApi {
   const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
   const textareaRef = useRef<HTMLTextAreaElement>(null);
+  const onSubmitRef = useRef<(() => void) | null>(null);
 
   // Effect to set initial value and height on mount or when initValue changes
   useEffect(() => {
@@ -91,6 +93,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
       }
     },
     ref: textareaRef,
+    refOnSubmit: onSubmitRef,
     onInput: handleInput,
   };
 }

@@ -33,6 +33,9 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
         });
       }
       textarea.focus();
+      setTimeout(() => {
+        textarea.refOnSubmit.current?.();
+      }, 10); // wait for setExtraContext to finish
     }
   };
 