Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 17:14:36 +00:00
Merge commit 'f7625019c5' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/bug.md
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	tests/test-backend-ops.cpp
#	tests/test-opt.cpp
#	tests/test-quantize-fns.cpp
Commit 3ccaf8e09a
37 changed files with 3514 additions and 698 deletions
llama.cpp (114 changed lines shown below)
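The llama.cpp hunks below largely track an upstream rename of public enum values: LLAMA_ROPE_SCALING_* becomes LLAMA_ROPE_SCALING_TYPE_*, LLAMA_POOLING_* becomes LLAMA_POOLING_TYPE_*, LLAMA_SPLIT_* becomes LLAMA_SPLIT_MODE_*, and LLAMA_KV_OVERRIDE_* becomes LLAMA_KV_OVERRIDE_TYPE_*; the remaining hunks wire up the new IQ3_S/IQ3_M quantization types. As a caller-side illustration only (this sketch is not part of the commit, and assumes the renamed constants are exported from llama.h as the diff implies):

// Hypothetical downstream code updated to the renamed constants (illustration only).
#include "llama.h"

static void configure_params(struct llama_model_params & mparams, struct llama_context_params & cparams) {
    mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;              // was LLAMA_SPLIT_LAYER

    cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; // was LLAMA_ROPE_SCALING_YARN
}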
@@ -874,9 +874,9 @@ struct LLM_TN {
 //
 
 static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
+    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
 };
 
 static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -886,7 +886,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
         }
     }
 
-    return LLAMA_ROPE_SCALING_UNSPECIFIED;
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }
 
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
@@ -1608,7 +1608,7 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -2377,9 +2377,9 @@ namespace GGUFMeta {
 
     static const char * override_type_to_str(const llama_model_kv_override_type ty) {
         switch (ty) {
-            case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
-            case LLAMA_KV_OVERRIDE_INT:   return "int";
-            case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
+            case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
+            case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
         }
         return "unknown";
     }
@@ -2390,13 +2390,13 @@ namespace GGUFMeta {
             LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
                 __func__, override_type_to_str(override->tag), override->key);
             switch (override->tag) {
-                case LLAMA_KV_OVERRIDE_BOOL: {
+                case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
                     LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
                 } break;
-                case LLAMA_KV_OVERRIDE_INT: {
+                case LLAMA_KV_OVERRIDE_TYPE_INT: {
                     LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
                 } break;
-                case LLAMA_KV_OVERRIDE_FLOAT: {
+                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
                     LLAMA_LOG_INFO("%.6f\n", override->float_value);
                 } break;
                 default:
@@ -2415,7 +2415,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) {
                 target = override->bool_value;
                 return true;
             }
@@ -2425,7 +2425,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) {
                 target = override->int_value;
                 return true;
             }
@@ -2435,7 +2435,7 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override *override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) {
                 target = override->float_value;
                 return true;
             }
@@ -2578,6 +2578,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                 case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
                 case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
+                case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2936,6 +2937,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
 
         default: return "unknown, may not work";
     }
@@ -3042,7 +3045,7 @@ static void llm_load_hparams(
     std::string rope_scaling("linear");
     ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
@@ -3712,7 +3715,7 @@ static bool llm_load_tensors(
             model.buft_layer[i] = llama_default_buffer_type_cpu(true);
         }
 
-        if (split_mode == LLAMA_SPLIT_LAYER) {
+        if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
             // calculate the split points
             int device_count = llama_get_device_count();
             bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3751,10 +3754,10 @@ static bool llm_load_tensors(
             }
         } else {
             ggml_backend_buffer_type_t split_buft;
-            if (split_mode == LLAMA_SPLIT_ROW) {
+            if (split_mode == LLAMA_SPLIT_MODE_ROW) {
                 split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
             } else {
-                // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+                // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
                 split_buft = llama_default_buffer_type_offload(main_gpu);
             }
             // assign the repeating layers
@@ -5139,7 +5142,7 @@ struct llm_build_context {
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
-        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -6119,12 +6122,12 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (pooling_type == LLAMA_POOLING_MEAN) {
+        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
             cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_CLS) {
+        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
             cur = ggml_get_rows(ctx0, cur, inp_cls);
         } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
 
@@ -7823,7 +7826,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7851,7 +7854,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -10867,6 +10870,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
@@ -10898,13 +10907,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
-            new_type = GGML_TYPE_Q2_K;
+            new_type = GGML_TYPE_IQ3_XXS;
         }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10915,6 +10928,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                       : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                       : GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -10946,37 +10963,41 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                   ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                   ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                   ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
-               if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+               if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+               else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
+               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+               else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+           new_type = GGML_TYPE_Q4_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
-           new_type = GGML_TYPE_Q2_K;
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+           new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
-           new_type = GGML_TYPE_Q2_K;
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+           new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }
@@ -10996,7 +11017,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-       new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+       new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@@ -11011,6 +11032,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
+           case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
@@ -11042,7 +11064,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;  break;
-       case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
+       case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_IQ3_S; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;  break;
@@ -11056,6 +11078,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
+       case LLAMA_FTYPE_MOSTLY_IQ3_S:   quantized_type = GGML_TYPE_IQ3_S;   break;
+       case LLAMA_FTYPE_MOSTLY_IQ3_M:   quantized_type = GGML_TYPE_IQ3_S;   break;
 
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
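Taken together, the two quantization hunks above change what LLAMA_FTYPE_MOSTLY_Q3_K_XS produces (its base type is now GGML_TYPE_IQ3_S rather than falling through to GGML_TYPE_Q3_K) and add the new IQ3_S/IQ3_M output file types. A minimal caller-side sketch, assuming the new LLAMA_FTYPE_MOSTLY_IQ3_S and LLAMA_FTYPE_MOSTLY_IQ3_M entries are declared in llama.h (this diff only shows the llama.cpp side) and using placeholder file paths:

// Hypothetical quantization call targeting the new IQ3_S file type (illustration only).
#include "llama.h"

static int quantize_to_iq3_s(const char * fname_in, const char * fname_out) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ3_S; // or LLAMA_FTYPE_MOSTLY_IQ3_M
    qparams.nthread = 8;                        // arbitrary thread count for this sketch
    return llama_model_quantize(fname_in, fname_out, &qparams); // returns 0 on success
}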
@@ -11650,7 +11674,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers =*/ 0,
-        /*.split_mode   =*/ LLAMA_SPLIT_LAYER,
+        /*.split_mode   =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu     =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
@@ -11676,7 +11700,7 @@ struct llama_context_params llama_context_default_params() {
        /*.n_batch           =*/ 512,
        /*.n_threads         =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch   =*/ GGML_DEFAULT_N_THREADS,
-       /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
+       /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        /*.rope_freq_base    =*/ 0.0f,
        /*.rope_freq_scale   =*/ 0.0f,
        /*.yarn_ext_factor   =*/ -1.0f,
@@ -11854,16 +11878,16 @@ struct llama_context * llama_new_context_with_model(
    cparams.cb_eval_user_data = params.cb_eval_user_data;
 
    auto rope_scaling_type = params.rope_scaling_type;
-   if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
+   if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
        rope_scaling_type = hparams.rope_scaling_type_train;
    }
 
-   if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
+   if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
    }
 
    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-       cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
+       cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }
 
    if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11897,8 +11921,8 @@ struct llama_context * llama_new_context_with_model(
        }
 #elif defined(GGML_USE_CUBLAS)
        if (model->n_gpu_layers > 0) {
-           // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-           if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
+           // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+           if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
                ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11907,7 +11931,7 @@ struct llama_context * llama_new_context_with_model(
                }
                ctx->backends.push_back(backend);
            } else {
-               // LLAMA_SPLIT_LAYER requires a backend for each GPU
+               // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
                for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                    ggml_backend_t backend = ggml_backend_cuda_init(device);
                    if (backend == nullptr) {