Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	tests/test-backend-ops.cpp
Concedo 2024-05-22 22:12:40 +08:00
commit 74de4cbf44
5 changed files with 25 additions and 10 deletions

ggml-cuda/fattn-tile-f32.cu

@@ -283,12 +283,8 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
 }
 
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * KQV = dst;
     const ggml_tensor * Q = dst->src[0];
 
-    const int32_t precision = KQV->op_params[2];
-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
-
     if (Q->ne[1] <= 16) {
         constexpr int cols_per_block = 16;
         constexpr int parallel_blocks = 4;
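For readers skimming the hunk, the surviving context lines show the launcher picking its tile configuration from the number of query columns, Q->ne[1]. The following standalone C sketch mirrors only that visible dispatch; the values in the larger-batch branch are assumptions, not taken from the real launcher.

    #include <stdio.h>

    /* Illustrative only: a small query batch (<= 16 columns) selects
     * 16 columns per block and 4 parallel blocks, as in the hunk above.
     * The fallback values are assumed. */
    static void pick_tile_config(int n_q_cols, int * cols_per_block, int * parallel_blocks) {
        if (n_q_cols <= 16) {
            *cols_per_block  = 16;
            *parallel_blocks = 4;
        } else {
            *cols_per_block  = 32; /* assumed fallback for larger batches */
            *parallel_blocks = 4;
        }
    }

    int main(void) {
        int cpb, pb;
        pick_tile_config(8, &cpb, &pb);
        printf("cols_per_block=%d parallel_blocks=%d\n", cpb, pb);
        return 0;
    }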

ggml-cuda/rope.cu

@@ -283,9 +283,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
-    if (is_neox) {
-        pos = (const int32_t *) src1_d;
+    pos = (const int32_t *) src1_d;
 
+    if (is_neox) {
         if (src2 != nullptr) {
             freq_factors = (const float *) src2->data;
         }

ggml.c

@@ -6248,6 +6248,8 @@ static struct ggml_tensor * ggml_rope_impl(
         float                 xpos_base,
         bool                  xpos_down,
         bool                  inplace) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
@@ -14416,7 +14418,7 @@ static void ggml_compute_forward_rope_f32(
             freq_factors = (const float *) src2->data;
         }
     } else {
-        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for mode 1");
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
     }
 
     // backward process uses inverse rotation by cos and sin.
@@ -14532,6 +14534,7 @@ static void ggml_compute_forward_rope_f32(
     }
 }
 
+// TODO: deduplicate f16/f32 code
 static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst,
@@ -14539,6 +14542,7 @@ static void ggml_compute_forward_rope_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src2 = dst->src[2];
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -14591,6 +14595,17 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    const float * freq_factors = NULL;
+    if (is_neox) {
+        if (src2 != NULL) {
+            GGML_ASSERT(src2->type == GGML_TYPE_F32);
+            GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     // backward process uses inverse rotation by cos and sin.
     // cos and sin build a rotation matrix, where the inverse is the transpose.
     // this essentially just switches the sign of sin.
@@ -14663,10 +14678,11 @@ static void ggml_compute_forward_rope_f16(
                 // simplified from `(ib * n_dims + ic) * inv_ndims`
                 float cur_rot = inv_ndims * ic - ib;
+                float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
 
                 float cos_theta, sin_theta;
                 rope_yarn(
-                    theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+                    theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                     &cos_theta, &sin_theta
                 );
 
                 sin_theta *= sin_sign;
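The effect of the new freq_factor is easiest to see in isolation: the base rotation angle of a dimension pair is divided by its per-pair factor before cos/sin are taken. A minimal standalone C sketch follows; freq_base, the pair index, and the factor value are example assumptions, and the real code routes the scaled angle through rope_yarn() rather than taking the cosine directly.

    #include <math.h>
    #include <stdio.h>

    /* Standalone sketch (not ggml code): a per-pair frequency factor rescales
     * the RoPE angle of one dimension pair before cos/sin are computed. */
    int main(void) {
        const float freq_base = 10000.0f;
        const int   n_dims    = 128;
        const int   pos       = 42;   /* token position       */
        const int   ic        = 8;    /* even dimension index */

        const float theta_base  = pos * powf(freq_base, -(float) ic / n_dims);
        const float freq_factor = 4.0f; /* hypothetical entry of a freq_factors tensor */
        const float theta       = theta_base / freq_factor;

        printf("without factor: theta=%f cos=%f\n", theta_base, cosf(theta_base));
        printf("with    factor: theta=%f cos=%f\n", theta,      cosf(theta));
        return 0;
    }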

ggml.h

@@ -1467,7 +1467,7 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
     // if mode & 2 == 1, GPT-NeoX style
     // if mode & 4 == 1, ChatGLM style
     //
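The comment change pairs with the new assert in ggml_rope_impl: bit 0 of mode is now rejected rather than merely deprecated. A minimal C sketch of how the remaining bits are interpreted, mirroring the mode & 1 / mode & 2 / mode & 4 checks visible in the ggml.c hunks; this is not part of the public API, just the bit layout.

    #include <assert.h>
    #include <stdio.h>

    /* Standalone illustration of the RoPE mode bits documented above. */
    int main(void) {
        const int mode = 2;  /* e.g. GPT-NeoX style RoPE */

        assert((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
        const int is_neox = (mode & 2) != 0;
        const int is_glm  = (mode & 4) != 0;

        printf("is_neox=%d is_glm=%d\n", is_neox, is_glm);
        return 0;
    }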

llama.cpp

@@ -3815,14 +3815,17 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_17M:   return "17M";
         case MODEL_22M:   return "22M";
         case MODEL_33M:   return "33M";
         case MODEL_109M:  return "109M";
         case MODEL_137M:  return "137M";
+        case MODEL_335M:  return "335M";
         case MODEL_0_5B:  return "0.5B";
         case MODEL_1B:    return "1B";
         case MODEL_2B:    return "2B";
         case MODEL_3B:    return "3B";
+        case MODEL_4B:    return "4B";
         case MODEL_7B:    return "7B";
         case MODEL_8B:    return "8B";
         case MODEL_12B:   return "12B";