Merge branch 'master' into concedo

# Conflicts:
#	.github/workflows/build.yml
#	README.md
Concedo 2023-04-25 20:44:22 +08:00
commit 235daf4016
11 changed files with 821 additions and 656 deletions


@@ -128,9 +128,9 @@ ifdef LLAMA_CUBLAS
 LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 OBJS += ggml-cuda.o
 NVCC = nvcc
-NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 CFLAGS += -pg


@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()


@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
         } else if (arg == "--color") {


@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos


@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
 For an interactive experience, try this command:
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
 ## Common Options
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 ## Context Management
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 ### RNG Seed
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 Example usage: `--temp 0.8`
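The temperature behaviour described in the README text above amounts to dividing the logits by the temperature before applying a softmax. A rough, self-contained sketch (illustrative only; not part of this commit and not how llama.cpp's sampler is actually written, all names are made up):

```cpp
// Illustrative sketch of temperature scaling for next-token probabilities.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Divide logits by the temperature, then apply a numerically stable softmax.
// As temp -> 0 the distribution collapses onto the argmax token (greedy decoding);
// a larger temp flattens the distribution and makes sampling more random.
static std::vector<float> softmax_with_temperature(std::vector<float> logits, float temp) {
    for (float & l : logits) {
        l /= std::max(temp, 1e-6f); // guard against division by zero
    }
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - max_logit);
        sum += probs[i];
    }
    for (float & p : probs) p /= sum;
    return probs;
}

int main() {
    const std::vector<float> logits = {2.0f, 1.0f, 0.1f};
    const float temps[] = {1.5f, 0.8f, 0.1f};
    for (float t : temps) {
        const auto p = softmax_with_temperature(logits, t);
        std::printf("temp %.1f -> %.2f %.2f %.2f\n", t, p[0], p[1], p[2]);
    }
    return 0;
}
```

At a temperature of 0.1 nearly all of the probability mass sits on the highest logit, which is why a temperature close to 0 gives repeatable, greedy-like output.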


@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
            " - Press Return to return control to LLaMa.\n"
            " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
     bool is_antiprompt = false;


@@ -0,0 +1,4 @@
set(TARGET save-load-state)
add_executable(${TARGET} save-load-state.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,128 @@
#include <vector>
#include <cstdio>
#include <chrono>

#include "common.h"
#include "llama.h"
#include "llama.cpp"

using namespace std;

int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
    params.seed = 42;
    params.n_threads = 4;
    params.repeat_last_n = 64;
    params.prompt = "The quick brown fox";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    auto lparams = llama_context_default_params();

    lparams.n_ctx = params.n_ctx;
    lparams.n_parts = params.n_parts;
    lparams.seed = params.seed;
    lparams.f16_kv = params.memory_f16;
    lparams.use_mmap = params.use_mmap;
    lparams.use_mlock = params.use_mlock;

    auto n_past = 0;
    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);

    // init
    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
    auto tokens = vector<llama_token>(params.n_ctx);
    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);

    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        return 1;
    }

    // evaluate prompt
    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);

    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
    n_past += n_prompt_tokens;

    // Save state (rng, logits, embedding and kv_cache) to file
    FILE *fp_write = fopen("dump_state.bin", "wb");
    auto state_size = llama_get_state_size(ctx);
    auto state_mem = new uint8_t[state_size];
    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
    fwrite(state_mem, 1, state_size, fp_write);
    fclose(fp_write);

    // save state (last tokens)
    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
    auto n_past_saved = n_past;

    // first run
    printf("\n%s", params.prompt.c_str());
    for (auto i = 0; i < params.n_predict; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            &last_n_tokens_data.back() - params.repeat_last_n,
            params.repeat_last_n,
            40,
            1.0,
            1.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str);
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            return 1;
        }
        n_past += 1;
    }
    printf("\n\n");

    // free old model
    llama_free(ctx);

    // load new model
    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);

    // Load state (rng, logits, embedding and kv_cache) from file
    FILE *fp_read = fopen("dump_state.bin", "rb");
    auto state_size2 = llama_get_state_size(ctx2);
    if (state_size != state_size2) {
        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
    }
    fread(state_mem, 1, state_size, fp_read);
    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
    fclose(fp_read);

    // restore state (last tokens)
    last_n_tokens_data = last_n_tokens_data_saved;
    n_past = n_past_saved;

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx2,
            &last_n_tokens_data.back() - params.repeat_last_n,
            params.repeat_last_n,
            40,
            1.0,
            1.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str);
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            return 1;
        }
        n_past += 1;
    }
    printf("\n\n");

    return 0;
}
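The comments in this new example note that the state buffer "could also copy directly to memory mapped file" instead of going through a heap buffer and `fwrite`/`fread`. A minimal sketch of that variant for the save side, assuming a POSIX system (illustrative only, not part of this commit; `save_state_mmap` is a made-up helper around the `llama_get_state_size`/`llama_copy_state_data` calls used above):

```cpp
// Hypothetical helper: write the llama state straight into a memory-mapped file.
#include <cstdint>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include "llama.h"

static bool save_state_mmap(llama_context * ctx, const char * path) {
    const size_t size = llama_get_state_size(ctx);

    const int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) return false;
    if (ftruncate(fd, (off_t) size) != 0) { close(fd); return false; }

    void * mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mem == MAP_FAILED) { close(fd); return false; }

    // The state (rng, logits, embedding and kv_cache) is copied directly into the mapping.
    llama_copy_state_data(ctx, (uint8_t *) mem);

    munmap(mem, size);
    close(fd);
    return true;
}
```

Loading would mirror this with a read-only mapping and `llama_set_state_data`.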

ggml.c

@@ -438,7 +438,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 {
     // Load 8 bytes from memory
-    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
     // Expand bytes into uint16_t values
     __m128i bytes = _mm_cvtepu8_epi16( tmp );
@@ -6781,15 +6781,20 @@ static void ggml_compute_forward_sum_f32(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
+    ggml_float sum = 0;
+    float row_sum = 0;
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
-                        (float *) (dst->data),
+                        &row_sum,
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+                sum += row_sum;
             }
         }
     }
+    ((float *) dst->data)[0] = sum;
 }
 static void ggml_compute_forward_sum(
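The `ggml_compute_forward_sum_f32` hunk above stops writing each row's sum straight into `dst->data` and instead folds the per-row partial sums into one wider accumulator that is stored once at the end. A standalone sketch of the same accumulate-then-store pattern (illustrative only, not ggml code):

```cpp
// Sum a small 2-D array row by row: keep a per-row partial sum and fold it
// into a single accumulator, writing the result out only once at the end.
#include <cstdio>

int main() {
    const int rows = 3, cols = 4;
    const float data[rows][cols] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};

    double sum = 0.0;              // wider accumulator, as in the patched code
    for (int r = 0; r < rows; ++r) {
        float row_sum = 0.0f;      // partial sum for this row
        for (int c = 0; c < cols; ++c) {
            row_sum += data[r][c];
        }
        sum += row_sum;            // accumulate instead of overwriting the output
    }

    std::printf("total = %f\n", sum); // 78.000000
    return 0;
}
```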

ggml.h

@@ -169,14 +169,27 @@
 //
 //
-#ifdef __cplusplus
-extern "C" {
+#ifdef GGML_SHARED
+# if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef GGML_BUILD
+# define GGML_API __declspec(dllexport)
+# else
+# define GGML_API __declspec(dllimport)
+# endif
+# else
+# define GGML_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define GGML_API
 #endif
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
+#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 16
@@ -184,6 +197,10 @@ extern "C" {
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4
+#ifdef __cplusplus
+extern "C" {
+#endif
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
@@ -192,14 +209,13 @@
 #endif
 // convert FP16 <-> FP32
-float ggml_fp16_to_fp32(ggml_fp16_t x);
-ggml_fp16_t ggml_fp32_to_fp16(float x);
+GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
+GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 struct ggml_object;
 struct ggml_context;
 enum ggml_type {
-    // explicitly numbered values are used in llama.cpp files
     GGML_TYPE_F32 = 0,
     GGML_TYPE_F16 = 1,
     GGML_TYPE_Q4_0 = 2,
@@ -341,60 +357,64 @@ struct ggml_init_params {
     bool no_alloc; // don't allocate memory for the tensor data
 };
-void ggml_time_init(void); // call this once at the beginning of the program
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
-int64_t ggml_cycles(void);
-int64_t ggml_cycles_per_ms(void);
-void ggml_print_object (const struct ggml_object * obj);
-void ggml_print_objects(const struct ggml_context * ctx);
-int64_t ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes (const struct ggml_tensor * tensor);
-int ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
-const char * ggml_type_name(enum ggml_type type);
-size_t ggml_element_size(const struct ggml_tensor * tensor);
-bool ggml_is_quantized(enum ggml_type type);
-struct ggml_context * ggml_init(struct ggml_init_params params);
-void ggml_free(struct ggml_context * ctx);
-size_t ggml_used_mem(const struct ggml_context * ctx);
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-struct ggml_tensor * ggml_new_tensor(
+// misc
+GGML_API void ggml_time_init(void); // call this once at the beginning of the program
+GGML_API int64_t ggml_time_ms(void);
+GGML_API int64_t ggml_time_us(void);
+GGML_API int64_t ggml_cycles(void);
+GGML_API int64_t ggml_cycles_per_ms(void);
+GGML_API void ggml_print_object (const struct ggml_object * obj);
+GGML_API void ggml_print_objects(const struct ggml_context * ctx);
+GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+GGML_API int ggml_blck_size (enum ggml_type type);
+GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+GGML_API const char * ggml_type_name(enum ggml_type type);
+GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
+GGML_API bool ggml_is_quantized(enum ggml_type type);
+// main
+GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+GGML_API void ggml_free(struct ggml_context * ctx);
+GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
+GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+GGML_API struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
         const int64_t *ne);
-struct ggml_tensor * ggml_new_tensor_1d(
+GGML_API struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0);
-struct ggml_tensor * ggml_new_tensor_2d(
+GGML_API struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
         int64_t ne1);
-struct ggml_tensor * ggml_new_tensor_3d(
+GGML_API struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
         int64_t ne1,
         int64_t ne2);
-struct ggml_tensor * ggml_new_tensor_4d(
+GGML_API struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum ggml_type type,
         int64_t ne0,
@@ -402,128 +422,127 @@ struct ggml_tensor * ggml_new_tensor_4d(
         int64_t ne2,
         int64_t ne3);
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-void * ggml_get_data (const struct ggml_tensor * tensor);
+GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
-float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 //
 // operations on tensors with backpropagation
 //
-struct ggml_tensor * ggml_dup(
+GGML_API struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
-struct ggml_tensor * ggml_add(
+GGML_API struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
-struct ggml_tensor * ggml_add_inplace(
+GGML_API struct ggml_tensor * ggml_add_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
-struct ggml_tensor * ggml_sub(
+GGML_API struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_mul(
+GGML_API struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_div(
+GGML_API struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_sqr(
+GGML_API struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_sqrt(
+GGML_API struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // return scalar
 // TODO: compute sum along rows
-struct ggml_tensor * ggml_sum(
+GGML_API struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // mean along rows
-struct ggml_tensor * ggml_mean(
+GGML_API struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // if a is the same shape as b, and a is not parameter, return a
 // otherwise, return a new tensor: repeat(a) to fit in b
-struct ggml_tensor * ggml_repeat(
+GGML_API struct ggml_tensor * ggml_repeat(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_abs(
+GGML_API struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_sgn(
+GGML_API struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_neg(
+GGML_API struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_step(
+GGML_API struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_relu(
+GGML_API struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // TODO: double-check this computation is correct
-struct ggml_tensor * ggml_gelu(
+GGML_API struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_silu(
+GGML_API struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
-struct ggml_tensor * ggml_norm(
+GGML_API struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_rms_norm(
+GGML_API struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
-struct ggml_tensor * ggml_mul_mat(
+GGML_API struct ggml_tensor * ggml_mul_mat(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
@@ -533,32 +552,32 @@ struct ggml_tensor * ggml_mul_mat(
 //
 // in-place, returns view(a)
-struct ggml_tensor * ggml_scale(
+GGML_API struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // a -> b, return view(b)
-struct ggml_tensor * ggml_cpy(
+GGML_API struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // make contiguous
-struct ggml_tensor * ggml_cont(
+GGML_API struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape(
+GGML_API struct ggml_tensor * ggml_reshape(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_2d(
+GGML_API struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -566,7 +585,7 @@ struct ggml_tensor * ggml_reshape_2d(
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_3d(
+GGML_API struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -574,13 +593,13 @@ struct ggml_tensor * ggml_reshape_3d(
        int64_t ne2);
 // offset in bytes
-struct ggml_tensor * ggml_view_1d(
+GGML_API struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
        size_t offset);
-struct ggml_tensor * ggml_view_2d(
+GGML_API struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -588,7 +607,7 @@ struct ggml_tensor * ggml_view_2d(
        size_t nb1, // row stride in bytes
        size_t offset);
-struct ggml_tensor * ggml_view_3d(
+GGML_API struct ggml_tensor * ggml_view_3d(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int64_t ne0,
@@ -598,7 +617,7 @@ struct ggml_tensor * ggml_view_3d(
        size_t nb2, // slice stride in bytes
        size_t offset);
-struct ggml_tensor * ggml_permute(
+GGML_API struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int axis0,
@@ -607,24 +626,24 @@ struct ggml_tensor * ggml_permute(
        int axis3);
 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-struct ggml_tensor * ggml_transpose(
+GGML_API struct ggml_tensor * ggml_transpose(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
-struct ggml_tensor * ggml_get_rows(
+GGML_API struct ggml_tensor * ggml_get_rows(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
 // set elements above the diagonal to -INF
 // in-place, returns view(a)
-struct ggml_tensor * ggml_diag_mask_inf(
+GGML_API struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past);
 // in-place, returns view(a)
-struct ggml_tensor * ggml_soft_max(
+GGML_API struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor * a);
@@ -633,7 +652,7 @@ struct ggml_tensor * ggml_soft_max(
 // if mode & 1 == 1, skip n_past elements
 // if mode & 2 == 1, GPT-NeoX style
 // TODO: avoid creating a new tensor every time
-struct ggml_tensor * ggml_rope(
+GGML_API struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        int n_past,
@@ -644,24 +663,24 @@ struct ggml_tensor * ggml_rope(
 // TODO: we don't support extra parameters for now
 // that's why we are hard-coding the stride, padding, and dilation
 // not great ..
-struct ggml_tensor * ggml_conv_1d_1s(
+GGML_API struct ggml_tensor * ggml_conv_1d_1s(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_conv_1d_2s(
+GGML_API struct ggml_tensor * ggml_conv_1d_2s(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b);
-struct ggml_tensor * ggml_flash_attn(
+GGML_API struct ggml_tensor * ggml_flash_attn(
        struct ggml_context * ctx,
        struct ggml_tensor * q,
        struct ggml_tensor * k,
        struct ggml_tensor * v,
        bool masked);
-struct ggml_tensor * ggml_flash_ff(
+GGML_API struct ggml_tensor * ggml_flash_ff(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b0,
@@ -670,15 +689,15 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor * c1);
 // Mapping operations
-typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-struct ggml_tensor * ggml_map_unary_f32(
+GGML_API struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        const ggml_unary_op_f32_t fun);
-struct ggml_tensor * ggml_map_binary_f32(
+GGML_API struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
@@ -688,23 +707,23 @@ struct ggml_tensor * ggml_map_binary_f32(
 // automatic differentiation
 //
-void ggml_set_param(
+GGML_API void ggml_set_param(
        struct ggml_context * ctx,
        struct ggml_tensor * tensor);
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-void ggml_graph_reset (struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 // print info and performance information for the graph
-void ggml_graph_print(const struct ggml_cgraph * cgraph);
+GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 // dump the graph into a file using the dot format
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 //
 // optimization
@@ -797,10 +816,10 @@ struct ggml_opt_params {
     } lbfgs;
 };
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 // optimize the function defined by the tensor f
-enum ggml_opt_result ggml_opt(
+GGML_API enum ggml_opt_result ggml_opt(
        struct ggml_context * ctx,
        struct ggml_opt_params params,
        struct ggml_tensor * f);
@@ -809,32 +828,32 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 //
 // system info
 //
-int ggml_cpu_has_avx(void);
+GGML_API int ggml_cpu_has_avx (void);
-int ggml_cpu_has_avx2(void);
+GGML_API int ggml_cpu_has_avx2 (void);
-int ggml_cpu_has_avx512(void);
+GGML_API int ggml_cpu_has_avx512 (void);
-int ggml_cpu_has_avx512_vbmi(void);
+GGML_API int ggml_cpu_has_avx512_vbmi(void);
-int ggml_cpu_has_avx512_vnni(void);
+GGML_API int ggml_cpu_has_avx512_vnni(void);
-int ggml_cpu_has_fma(void);
+GGML_API int ggml_cpu_has_fma (void);
-int ggml_cpu_has_neon(void);
+GGML_API int ggml_cpu_has_neon (void);
-int ggml_cpu_has_arm_fma(void);
+GGML_API int ggml_cpu_has_arm_fma (void);
-int ggml_cpu_has_f16c(void);
+GGML_API int ggml_cpu_has_f16c (void);
-int ggml_cpu_has_fp16_va(void);
+GGML_API int ggml_cpu_has_fp16_va (void);
-int ggml_cpu_has_wasm_simd(void);
+GGML_API int ggml_cpu_has_wasm_simd (void);
-int ggml_cpu_has_blas(void);
+GGML_API int ggml_cpu_has_blas (void);
-int ggml_cpu_has_cublas(void);
+GGML_API int ggml_cpu_has_cublas (void);
-int ggml_cpu_has_sse3(void);
+GGML_API int ggml_cpu_has_sse3 (void);
-int ggml_cpu_has_vsx(void);
+GGML_API int ggml_cpu_has_vsx (void);
 //
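None of the declarations above change signature; the hunks only add the `GGML_API` qualifier (defined near the top of the header) so the symbols can be exported from a `GGML_SHARED`/`GGML_BUILD` shared-library build. For orientation, a minimal caller of this public API could look like the following sketch (arena size and tensor shape are arbitrary, error handling is omitted, the `ggml_init_params` field order is assumed from this revision of the header, and none of this is part of the commit):

```cpp
// Minimal sketch of a program using the public ggml API declared above.
#include <cstdio>

#include "ggml.h"

int main() {
    // Arena-backed context; 16 MB is an arbitrary size for this toy example.
    struct ggml_init_params params = {
        /* mem_size   */ 16 * 1024 * 1024,
        /* mem_buffer */ nullptr,
        /* no_alloc   */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 2.0f);                          // fill all 8 elements with 2.0
    struct ggml_tensor * s = ggml_sum(ctx, a);      // scalar sum of all elements

    struct ggml_cgraph gf = ggml_build_forward(s);  // build and run the compute graph
    ggml_graph_compute(ctx, &gf);

    std::printf("sum = %f, used mem = %zu bytes\n",
                ggml_get_f32_1d(s, 0), ggml_used_mem(ctx));

    ggml_free(ctx);
    return 0;
}
```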


@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
 }