mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	CMakeLists.txt
#	tests/test-backend-ops.cpp
commit 74de4cbf44
5 changed files with 25 additions and 10 deletions
@@ -283,12 +283,8 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 }
 
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * KQV = dst;
     const ggml_tensor * Q = dst->src[0];
 
-    const int32_t precision = KQV->op_params[2];
-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
-
     if (Q->ne[1] <= 16) {
         constexpr int cols_per_block = 16;
         constexpr int parallel_blocks = 4;
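Note on the hunk above: the f32 tile kernel no longer asserts that the requested precision is GGML_PREC_DEFAULT. A hedged sketch of the graph-side interaction, assuming the usual ggml flash-attention API of this era (ctx, Q, K, V, mask, scale, max_bias are assumed to exist; the call site is illustrative, not from this commit):

    // Sketch: a higher-precision request is stored in the op's params.
    struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, Q, K, V, mask, scale, max_bias);
    ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32); // written into kqv->op_params
    // Before this hunk, routing such a graph through the f32 tile kernel hit
    // GGML_ASSERT(precision == GGML_PREC_DEFAULT), although the f32 kernel is
    // presumably a suitable path precisely when f32 precision is requested.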
@@ -283,9 +283,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
-    if (is_neox) {
-        pos = (const int32_t *) src1_d;
-
+    pos = (const int32_t *) src1_d;
+
+    if (is_neox) {
         if (src2 != nullptr) {
             freq_factors = (const float *) src2->data;
         }
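The reorder above makes the position tensor unconditional while keeping freq_factors gated on NeoX mode. For orientation, a sketch of the operand roles this implies (not a verbatim excerpt; the types follow the asserts in the ggml.c hunks below):

    const struct ggml_tensor * src0 = dst->src[0]; // activations to rotate
    const struct ggml_tensor * src1 = dst->src[1]; // I32 positions ("pos"), now read for every mode
    const struct ggml_tensor * src2 = dst->src[2]; // optional F32 freq_factors, NeoX mode only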
ggml.c (20 lines changed)
@@ -6248,6 +6248,8 @@ static struct ggml_tensor * ggml_rope_impl(
         float                 xpos_base,
         bool                  xpos_down,
         bool                  inplace) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -14416,7 +14418,7 @@ static void ggml_compute_forward_rope_f32(
             freq_factors = (const float *) src2->data;
         }
     } else {
-        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for mode 1");
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
     }
 
     // backward process uses inverse rotation by cos and sin.
@@ -14532,6 +14534,7 @@ static void ggml_compute_forward_rope_f32(
     }
 }
 
+// TODO: deduplicate f16/f32 code
 static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst,
@@ -14539,6 +14542,7 @@ static void ggml_compute_forward_rope_f16(
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -14591,6 +14595,17 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    const float * freq_factors = NULL;
+    if (is_neox) {
+        if (src2 != NULL) {
+            GGML_ASSERT(src2->type == GGML_TYPE_F32);
+            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     // backward process uses inverse rotation by cos and sin.
     // cos and sin build a rotation matrix, where the inverse is the transpose.
     // this essentially just switches the sign of sin.
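The shape assert in the new block admits one f32 factor per rotated dimension pair. A minimal construction sketch (ctx is an assumed ggml context; n_dims and the factor values are illustrative):

    // One scaling factor per dimension pair: ne[0] must be >= n_dims/2.
    const int n_dims = 8; // illustrative
    struct ggml_tensor * freq_factors = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
    float * f = (float *) freq_factors->data;
    f[0] = 1.0f; f[1] = 1.0f; f[2] = 4.0f; f[3] = 8.0f; // factors > 1 slow the rotation of those dims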
@@ -14663,10 +14678,11 @@ static void ggml_compute_forward_rope_f16(
 
                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         float cur_rot = inv_ndims * ic - ib;
+                        float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
 
                         float cos_theta, sin_theta;
                         rope_yarn(
-                            theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                            theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
                         sin_theta *= sin_sign;
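Pulling the two changed lines together, the per-pair angle now works out as below. A standalone sketch with the YaRN correction (rope_yarn) omitted and the function name invented for illustration:

    #include <math.h>

    // theta for the dimension pair at even index ic, scaled by an optional
    // per-pair frequency factor (freq_factors may be NULL, as in the diff):
    static float rope_theta_scaled(int pos, int ic, int n_dims, float freq_base,
                                   const float * freq_factors) {
        const float theta_base  = pos * powf(freq_base, -(float) ic / n_dims);
        const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
        return theta_base / freq_factor; // mirrors theta_base/freq_factor above
    }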
ggml.h (2 lines changed)
@@ -1467,7 +1467,7 @@ extern "C" {
             struct ggml_tensor * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
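The comment block documents mode as a bitfield; a sketch of how those bits decode, mirroring the is_neox/is_glm checks elsewhere in this commit:

    const int mode = 2;            // example: GPT-NeoX style
    GGML_ASSERT((mode & 1) == 0);  // bit 0: no longer supported (the new assert in ggml_rope_impl)
    const bool is_neox = mode & 2; // bit 1: GPT-NeoX style
    const bool is_glm  = mode & 4; // bit 2: ChatGLM style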
@@ -3815,14 +3815,17 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_17M:  return "17M";
         case MODEL_22M:  return "22M";
         case MODEL_33M:  return "33M";
         case MODEL_109M: return "109M";
         case MODEL_137M: return "137M";
+        case MODEL_335M: return "335M";
         case MODEL_0_5B: return "0.5B";
         case MODEL_1B:   return "1B";
         case MODEL_2B:   return "2B";
         case MODEL_3B:   return "3B";
+        case MODEL_4B:   return "4B";
         case MODEL_7B:   return "7B";
         case MODEL_8B:   return "8B";
         case MODEL_12B:  return "12B";
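The three added cases only extend the size-label table. A usage sketch (the call site is assumed; llama_model_type_name is file-static, so this only applies inside llama.cpp):

    // Inside llama.cpp (illustrative):
    printf("model type: %s\n", llama_model_type_name(MODEL_4B)); // prints "4B"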