Merge branch 'master' into concedo

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	CMakeLists.txt
#	Makefile
#	README.md
Concedo 2023-04-19 16:34:51 +08:00
commit f662a9a230
10 changed files with 863 additions and 95 deletions

llama.cpp

@@ -485,6 +485,7 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
+                case GGML_TYPE_Q4_2:
                     break;
                 default: {
                     throw format("unrecognized tensor type %u\n", shard.type);
@@ -557,6 +558,7 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q4_2:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -845,6 +847,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
+        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         default: return "unknown, may not work";
     }
 }
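
With this hunk, llama_ftype_name() can report the new file type in load-time logging. A small usage sketch (the log line itself is illustrative, not taken from the commit):

    // Illustrative only: prints "model ftype = mostly Q4_2"
    fprintf(stderr, "model ftype = %s\n", llama_ftype_name(LLAMA_FTYPE_MOSTLY_Q4_2));
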
@@ -1578,6 +1581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
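
This switch turns the user-requested file type into the concrete tensor type used during quantization. Sketched as a standalone mapping (hypothetical helper name; the real code assigns quantized_type inline, and the throw uses llama.cpp's internal format() helper):

    // Hypothetical helper mirroring the ftype -> ggml_type mapping above.
    static enum ggml_type llama_ftype_to_ggml_type(enum llama_ftype ftype) {
        switch (ftype) {
            case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
            case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
            case LLAMA_FTYPE_MOSTLY_Q4_2: return GGML_TYPE_Q4_2; // new in this commit
            default: throw format("invalid output file type %d\n", ftype);
        }
    }
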
@@ -1651,6 +1655,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     {
                         new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
+                case GGML_TYPE_Q4_2:
+                    {
+                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                    } break;
                 default:
                     LLAMA_ASSERT(false);
             }
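
The quantization dispatch gains a Q4_2 branch that mirrors the existing Q4_0/Q4_1 calls. Assuming ggml_quantize_q4_2 shares the signature of its siblings in the ggml.h of this era (size_t(const float * src, void * dst, int n, int k, int64_t * hist)), the call site looks like this in isolation, with variable names following llama_model_quantize_internal:

    // Sketch of the per-tensor quantization call added above.
    std::vector<int64_t> hist_cur(1 << 4, 0);  // 16 buckets, one per 4-bit value
    size_t new_size = ggml_quantize_q4_2(
        f32_data,                 // source weights, already converted to f32
        new_data,                 // destination buffer for the Q4_2 blocks
        nelements,                // total element count of the tensor
        (int) tensor.ne.at(0),    // row length k; quantization runs row by row
        hist_cur.data());         // histogram of emitted quantized values
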
@@ -1962,7 +1970,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             base_t = dest_t;
         }
-        if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
+        if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
             if (!warned) {
                 fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                 "use a f16 or f32 base model with --lora-base\n", __func__);