Merge branch 'master' into concedo

# Conflicts:
#	.github/workflows/build.yml
#	README.md
Concedo 2023-04-25 20:44:22 +08:00
commit 235daf4016
11 changed files with 821 additions and 656 deletions


@@ -128,9 +128,9 @@ ifdef LLAMA_CUBLAS
 LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 OBJS += ggml-cuda.o
 NVCC = nvcc
-NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 CFLAGS += -pg


@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()


@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
         } else if (arg == "--color") {


@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos


@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
 For an interactive experience, try this command:
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
 ## Common Options
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 ## Context Management
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 ### RNG Seed
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 Example usage: `--temp 0.8`
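The temperature behaviour described in the README text above amounts to dividing the logits by the temperature before applying a softmax. A rough, self-contained sketch (illustrative only; not part of this commit and not how llama.cpp's sampler is actually written, all names are made up):

```cpp
// Illustrative sketch of temperature scaling for next-token probabilities.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Divide logits by the temperature, then apply a numerically stable softmax.
// As temp -> 0 the distribution collapses onto the argmax token (greedy decoding);
// a larger temp flattens the distribution and makes sampling more random.
static std::vector<float> softmax_with_temperature(std::vector<float> logits, float temp) {
    for (float & l : logits) {
        l /= std::max(temp, 1e-6f); // guard against division by zero
    }
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - max_logit);
        sum += probs[i];
    }
    for (float & p : probs) p /= sum;
    return probs;
}

int main() {
    const std::vector<float> logits = {2.0f, 1.0f, 0.1f};
    const float temps[] = {1.5f, 0.8f, 0.1f};
    for (float t : temps) {
        const auto p = softmax_with_temperature(logits, t);
        std::printf("temp %.1f -> %.2f %.2f %.2f\n", t, p[0], p[1], p[2]);
    }
    return 0;
}
```

At a temperature of 0.1 nearly all of the probability mass sits on the highest logit, which is why a temperature close to 0 gives repeatable, greedy-like output.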


@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
            " - Press Return to return control to LLaMa.\n"
            " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
     bool is_antiprompt = false;


@@ -0,0 +1,4 @@
set(TARGET save-load-state)
add_executable(${TARGET} save-load-state.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,128 @@
#include <vector>
#include <cstdio>
#include <chrono>

#include "common.h"
#include "llama.h"
#include "llama.cpp"

using namespace std;

int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
    params.seed = 42;
    params.n_threads = 4;
    params.repeat_last_n = 64;
    params.prompt = "The quick brown fox";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    auto lparams = llama_context_default_params();

    lparams.n_ctx = params.n_ctx;
    lparams.n_parts = params.n_parts;
    lparams.seed = params.seed;
    lparams.f16_kv = params.memory_f16;
    lparams.use_mmap = params.use_mmap;
    lparams.use_mlock = params.use_mlock;

    auto n_past = 0;
    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);

    // init
    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
    auto tokens = vector<llama_token>(params.n_ctx);
    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);

    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        return 1;
    }

    // evaluate prompt
    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);

    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
    n_past += n_prompt_tokens;

    // Save state (rng, logits, embedding and kv_cache) to file
    FILE *fp_write = fopen("dump_state.bin", "wb");
    auto state_size = llama_get_state_size(ctx);
    auto state_mem = new uint8_t[state_size];
    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
    fwrite(state_mem, 1, state_size, fp_write);
    fclose(fp_write);

    // save state (last tokens)
    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
    auto n_past_saved = n_past;

    // first run
    printf("\n%s", params.prompt.c_str());
    for (auto i = 0; i < params.n_predict; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            &last_n_tokens_data.back() - params.repeat_last_n,
            params.repeat_last_n,
            40,
            1.0,
            1.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str);
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            return 1;
        }
        n_past += 1;
    }
    printf("\n\n");

    // free old model
    llama_free(ctx);

    // load new model
    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);

    // Load state (rng, logits, embedding and kv_cache) from file
    FILE *fp_read = fopen("dump_state.bin", "rb");
    auto state_size2 = llama_get_state_size(ctx2);
    if (state_size != state_size2) {
        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
    }
    fread(state_mem, 1, state_size, fp_read);
    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
    fclose(fp_read);

    // restore state (last tokens)
    last_n_tokens_data = last_n_tokens_data_saved;
    n_past = n_past_saved;

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx2,
            &last_n_tokens_data.back() - params.repeat_last_n,
            params.repeat_last_n,
            40,
            1.0,
            1.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str);
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            return 1;
        }
        n_past += 1;
    }
    printf("\n\n");

    return 0;
}
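The comments in this new example note that the state buffer "could also copy directly to memory mapped file" instead of going through a heap buffer and `fwrite`/`fread`. A minimal sketch of that variant for the save side, assuming a POSIX system (illustrative only, not part of this commit; `save_state_mmap` is a made-up helper around the `llama_get_state_size`/`llama_copy_state_data` calls used above):

```cpp
// Hypothetical helper: write the llama state straight into a memory-mapped file.
#include <cstdint>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include "llama.h"

static bool save_state_mmap(llama_context * ctx, const char * path) {
    const size_t size = llama_get_state_size(ctx);

    const int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) return false;
    if (ftruncate(fd, (off_t) size) != 0) { close(fd); return false; }

    void * mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mem == MAP_FAILED) { close(fd); return false; }

    // The state (rng, logits, embedding and kv_cache) is copied directly into the mapping.
    llama_copy_state_data(ctx, (uint8_t *) mem);

    munmap(mem, size);
    close(fd);
    return true;
}
```

Loading would mirror this with a read-only mapping and `llama_set_state_data`.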

ggml.c

@@ -438,7 +438,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 {
     // Load 8 bytes from memory
-    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
     // Expand bytes into uint16_t values
     __m128i bytes = _mm_cvtepu8_epi16( tmp );
@@ -6781,15 +6781,20 @@ static void ggml_compute_forward_sum_f32(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
+    ggml_float sum = 0;
+    float row_sum = 0;
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
-                        (float *) (dst->data),
+                        &row_sum,
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+                sum += row_sum;
             }
         }
     }
+    ((float *) dst->data)[0] = sum;
 }
 static void ggml_compute_forward_sum(
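The `ggml_compute_forward_sum_f32` hunk above stops writing each row's sum straight into `dst->data` and instead folds the per-row partial sums into one wider accumulator that is stored once at the end. A standalone sketch of the same accumulate-then-store pattern (illustrative only, not ggml code):

```cpp
// Sum a small 2-D array row by row: keep a per-row partial sum and fold it
// into a single accumulator, writing the result out only once at the end.
#include <cstdio>

int main() {
    const int rows = 3, cols = 4;
    const float data[rows][cols] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};

    double sum = 0.0;              // wider accumulator, as in the patched code
    for (int r = 0; r < rows; ++r) {
        float row_sum = 0.0f;      // partial sum for this row
        for (int c = 0; c < cols; ++c) {
            row_sum += data[r][c];
        }
        sum += row_sum;            // accumulate instead of overwriting the output
    }

    std::printf("total = %f\n", sum); // 78.000000
    return 0;
}
```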

ggml.h

@@ -169,14 +169,27 @@
 //
 //
-#ifdef __cplusplus
-extern "C" {
+#ifdef GGML_SHARED
+# if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef GGML_BUILD
+# define GGML_API __declspec(dllexport)
+# else
+# define GGML_API __declspec(dllimport)
+# endif
+# else
+# define GGML_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define GGML_API
 #endif
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
+#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 16
@@ -184,6 +197,10 @@ extern "C" {
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
+#ifdef __cplusplus
+extern "C" {
+#endif
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
@@ -192,14 +209,13 @@
 #endif
 // convert FP16 <-> FP32
-float ggml_fp16_to_fp32(ggml_fp16_t x);
-ggml_fp16_t ggml_fp32_to_fp16(float x);
+GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
+GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 struct ggml_object;
 struct ggml_context;
 enum ggml_type {
-    // explicitly numbered values are used in llama.cpp files
     GGML_TYPE_F32 = 0,
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
@@ -341,60 +357,64 @@ struct ggml_init_params {
     bool no_alloc; // don't allocate memory for the tensor data
 };
-void ggml_time_init(void); // call this once at the beginning of the program
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
-int64_t ggml_cycles(void);
-int64_t ggml_cycles_per_ms(void);
-void ggml_print_object (const struct ggml_object * obj);
-void ggml_print_objects(const struct ggml_context * ctx);
-int64_t ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes (const struct ggml_tensor * tensor);
-int ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
-const char * ggml_type_name(enum ggml_type type);
-size_t ggml_element_size(const struct ggml_tensor * tensor);
-bool ggml_is_quantized(enum ggml_type type);
-struct ggml_context * ggml_init(struct ggml_init_params params);
-void ggml_free(struct ggml_context * ctx);
-size_t ggml_used_mem(const struct ggml_context * ctx);
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-struct ggml_tensor * ggml_new_tensor(
+// misc
+GGML_API void ggml_time_init(void); // call this once at the beginning of the program
+GGML_API int64_t ggml_time_ms(void);
+GGML_API int64_t ggml_time_us(void);
+GGML_API int64_t ggml_cycles(void);
+GGML_API int64_t ggml_cycles_per_ms(void);
+GGML_API void ggml_print_object (const struct ggml_object * obj);
+GGML_API void ggml_print_objects(const struct ggml_context * ctx);
+GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+GGML_API int ggml_blck_size (enum ggml_type type);
+GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+GGML_API const char * ggml_type_name(enum ggml_type type);
+GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_quantized(enum ggml_type type);
+// main
+GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+GGML_API void ggml_free(struct ggml_context * ctx);
+GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
+GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+GGML_API struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
         const int64_t *ne);
-struct ggml_tensor * ggml_new_tensor_1d(
+GGML_API struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0);
-struct ggml_tensor * ggml_new_tensor_2d(
+GGML_API struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
         int64_t ne1);
-struct ggml_tensor * ggml_new_tensor_3d(
+GGML_API struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
         int64_t ne1,
         int64_t ne2);
-struct ggml_tensor * ggml_new_tensor_4d(
+GGML_API struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
@@ -402,128 +422,127 @@ struct ggml_tensor * ggml_new_tensor_4d(
         int64_t ne2,
         int64_t ne3);
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-void * ggml_get_data (const struct ggml_tensor * tensor);
+GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
-float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 //
 // operations on tensors with backpropagation
 //
-struct ggml_tensor * ggml_dup(
+GGML_API struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
-struct ggml_tensor * ggml_add(
+GGML_API struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
-struct ggml_tensor * ggml_add_inplace(
+GGML_API struct ggml_tensor * ggml_add_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
-struct ggml_tensor * ggml_sub(
+GGML_API struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_mul(
+GGML_API struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_div(
+GGML_API struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_sqr(
+GGML_API struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_sqrt(
+GGML_API struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // return scalar
 // TODO: compute sum along rows
-struct ggml_tensor * ggml_sum(
+GGML_API struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // mean along rows
-struct ggml_tensor * ggml_mean(
+GGML_API struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // if a is the same shape as b, and a is not parameter, return a
 // otherwise, return a new tensor: repeat(a) to fit in b
-struct ggml_tensor * ggml_repeat(
+GGML_API struct ggml_tensor * ggml_repeat(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_abs(
+GGML_API struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_sgn(
+GGML_API struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_neg(
+GGML_API struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_step(
+GGML_API struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_relu(
+GGML_API struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // TODO: double-check this computation is correct
-struct ggml_tensor * ggml_gelu(
+GGML_API struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_silu(
+GGML_API struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
-struct ggml_tensor * ggml_norm(
+GGML_API struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_rms_norm(
+GGML_API struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
-struct ggml_tensor * ggml_mul_mat(
+GGML_API struct ggml_tensor * ggml_mul_mat(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
@@ -533,32 +552,32 @@ struct ggml_tensor * ggml_mul_mat(
 //
 // in-place, returns view(a)
-struct ggml_tensor * ggml_scale(
+GGML_API struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // a -> b, return view(b)
-struct ggml_tensor * ggml_cpy(
+GGML_API struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // make contiguous
-struct ggml_tensor * ggml_cont(
+GGML_API struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape(
+GGML_API struct ggml_tensor * ggml_reshape(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_2d(
+GGML_API struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -566,7 +585,7 @@ struct ggml_tensor * ggml_reshape_2d(
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_3d(
+GGML_API struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -574,13 +593,13 @@ struct ggml_tensor * ggml_reshape_3d(
        int64_t ne2);
 // offset in bytes
-struct ggml_tensor * ggml_view_1d(
+GGML_API struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
        size_t offset);
-struct ggml_tensor * ggml_view_2d(
+GGML_API struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -588,7 +607,7 @@ struct ggml_tensor * ggml_view_2d(
        size_t nb1, // row stride in bytes
        size_t offset);
-struct ggml_tensor * ggml_view_3d(
+GGML_API struct ggml_tensor * ggml_view_3d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -598,7 +617,7 @@ struct ggml_tensor * ggml_view_3d(
        size_t nb2, // slice stride in bytes
        size_t offset);
-struct ggml_tensor * ggml_permute(
+GGML_API struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int axis0,
@@ -607,24 +626,24 @@ struct ggml_tensor * ggml_permute(
        int axis3);
 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-struct ggml_tensor * ggml_transpose(
+GGML_API struct ggml_tensor * ggml_transpose(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_get_rows(
+GGML_API struct ggml_tensor * ggml_get_rows(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // set elements above the diagonal to -INF
 // in-place, returns view(a)
-struct ggml_tensor * ggml_diag_mask_inf(
+GGML_API struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past);
 // in-place, returns view(a)
-struct ggml_tensor * ggml_soft_max(
+GGML_API struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
@@ -633,7 +652,7 @@ struct ggml_tensor * ggml_soft_max(
 // if mode & 1 == 1, skip n_past elements
 // if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
-struct ggml_tensor * ggml_rope(
+GGML_API struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past,
@@ -644,24 +663,24 @@ struct ggml_tensor * ggml_rope(
 // TODO: we don't support extra parameters for now
 // that's why we are hard-coding the stride, padding, and dilation
 // not great ..
-struct ggml_tensor * ggml_conv_1d_1s(
+GGML_API struct ggml_tensor * ggml_conv_1d_1s(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_conv_1d_2s(
+GGML_API struct ggml_tensor * ggml_conv_1d_2s(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_flash_attn(
+GGML_API struct ggml_tensor * ggml_flash_attn(
        struct ggml_context * ctx,
        struct ggml_tensor * q,
        struct ggml_tensor * k,
        struct ggml_tensor * v,
        bool masked);
-struct ggml_tensor * ggml_flash_ff(
+GGML_API struct ggml_tensor * ggml_flash_ff(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b0,
@@ -670,15 +689,15 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor * c1);
 // Mapping operations
-typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-struct ggml_tensor * ggml_map_unary_f32(
+GGML_API struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        const ggml_unary_op_f32_t fun);
-struct ggml_tensor * ggml_map_binary_f32(
+GGML_API struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -688,23 +707,23 @@ struct ggml_tensor * ggml_map_binary_f32(
 // automatic differentiation
 //
-void ggml_set_param(
+GGML_API void ggml_set_param(
        struct ggml_context * ctx,
        struct ggml_tensor * tensor);
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-void ggml_graph_reset (struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 // print info and performance information for the graph
-void ggml_graph_print(const struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 // dump the graph into a file using the dot format
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 //
 // optimization
@@ -797,10 +816,10 @@ struct ggml_opt_params {
     } lbfgs;
 };
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 // optimize the function defined by the tensor f
-enum ggml_opt_result ggml_opt(
+GGML_API enum ggml_opt_result ggml_opt(
        struct ggml_context * ctx,
        struct ggml_opt_params params,
        struct ggml_tensor * f);
@@ -809,32 +828,32 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 //
 // system info
 //
-int ggml_cpu_has_avx(void);
+GGML_API int ggml_cpu_has_avx (void);
-int ggml_cpu_has_avx2(void);
+GGML_API int ggml_cpu_has_avx2 (void);
-int ggml_cpu_has_avx512(void);
+GGML_API int ggml_cpu_has_avx512 (void);
-int ggml_cpu_has_avx512_vbmi(void);
+GGML_API int ggml_cpu_has_avx512_vbmi(void);
-int ggml_cpu_has_avx512_vnni(void);
+GGML_API int ggml_cpu_has_avx512_vnni(void);
-int ggml_cpu_has_fma(void);
+GGML_API int ggml_cpu_has_fma (void);
-int ggml_cpu_has_neon(void);
+GGML_API int ggml_cpu_has_neon (void);
-int ggml_cpu_has_arm_fma(void);
+GGML_API int ggml_cpu_has_arm_fma (void);
-int ggml_cpu_has_f16c(void);
+GGML_API int ggml_cpu_has_f16c (void);
-int ggml_cpu_has_fp16_va(void);
+GGML_API int ggml_cpu_has_fp16_va (void);
-int ggml_cpu_has_wasm_simd(void);
+GGML_API int ggml_cpu_has_wasm_simd (void);
-int ggml_cpu_has_blas(void);
+GGML_API int ggml_cpu_has_blas (void);
-int ggml_cpu_has_cublas(void);
+GGML_API int ggml_cpu_has_cublas (void);
-int ggml_cpu_has_sse3(void);
+GGML_API int ggml_cpu_has_sse3 (void);
-int ggml_cpu_has_vsx(void);
+GGML_API int ggml_cpu_has_vsx (void);
 //
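None of the declarations above change signature; the hunks only add the `GGML_API` qualifier (defined near the top of the header) so the symbols can be exported from a `GGML_SHARED`/`GGML_BUILD` shared-library build. For orientation, a minimal caller of this public API could look like the following sketch (arena size and tensor shape are arbitrary, error handling is omitted, the `ggml_init_params` field order is assumed from this revision of the header, and none of this is part of the commit):

```cpp
// Minimal sketch of a program using the public ggml API declared above.
#include <cstdio>

#include "ggml.h"

int main() {
    // Arena-backed context; 16 MB is an arbitrary size for this toy example.
    struct ggml_init_params params = {
        /* mem_size   */ 16 * 1024 * 1024,
        /* mem_buffer */ nullptr,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 2.0f);                          // fill all 8 elements with 2.0
    struct ggml_tensor * s = ggml_sum(ctx, a);      // scalar sum of all elements

    struct ggml_cgraph gf = ggml_build_forward(s);  // build and run the compute graph
    ggml_graph_compute(ctx, &gf);

    std::printf("sum = %f, used mem = %zu bytes\n",
                ggml_get_f32_1d(s, 0), ggml_used_mem(ctx));

    ggml_free(ctx);
    return 0;
}
```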


@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
 }