#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

#define GGML_RWKV_MAX_DIMS     4
#define GGML_RWKV_MAX_NODES    4096
#define GGML_RWKV_MAX_PARAMS   16
#define GGML_RWKV_MAX_CONTEXTS 64
#define GGML_RWKV_MAX_OPT      4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_rwkv_fp16_t;
#else
typedef uint16_t ggml_rwkv_fp16_t;
#endif

// convert FP16 <-> FP32
float            ggml_rwkv_fp16_to_fp32(ggml_rwkv_fp16_t x);
ggml_rwkv_fp16_t ggml_rwkv_fp32_to_fp16(float x);

struct ggml_rwkv_object;
struct ggml_rwkv_context;

enum ggml_rwkv_type {
    GGML_RWKV_TYPE_Q4_0,
    // Stores min and delta per block, does quantized matmul.
    GGML_RWKV_TYPE_Q4_1,
    // Same as Q4_1, but stores outliers separately, and matmul is done in FP32.
    // An outlier is the single absmax element in the quantized block.
    GGML_RWKV_TYPE_Q4_1_O,
    GGML_RWKV_TYPE_I8,
    GGML_RWKV_TYPE_I16,
    GGML_RWKV_TYPE_I32,
    GGML_RWKV_TYPE_F16,
    GGML_RWKV_TYPE_F32,
    GGML_RWKV_TYPE_COUNT,
};

// available tensor operations:
enum ggml_rwkv_op {
    GGML_RWKV_OP_NONE = 0,

    GGML_RWKV_OP_DUP,
    GGML_RWKV_OP_ADD,
    GGML_RWKV_OP_SUB,
    GGML_RWKV_OP_MUL,
    GGML_RWKV_OP_DIV,
    GGML_RWKV_OP_SQR,
    GGML_RWKV_OP_SQRT,
    GGML_RWKV_OP_SUM,
    GGML_RWKV_OP_MEAN,
    GGML_RWKV_OP_REPEAT,
    GGML_RWKV_OP_ABS,
    GGML_RWKV_OP_SGN,
    GGML_RWKV_OP_NEG,
    // Element-wise exponential function `e^x`.
    // Same as `torch.exp(x)` from PyTorch.
    GGML_RWKV_OP_EXP,
    // Element-wise `1 - x`.
    GGML_RWKV_OP_1_MINUS_X,
    // Element-wise maximum of 2 values. Argument shapes must match.
    // Same as `torch.maximum(x, y)` from PyTorch.
    GGML_RWKV_OP_MAX,
    GGML_RWKV_OP_STEP,
    GGML_RWKV_OP_RELU,
    GGML_RWKV_OP_GELU,
    // Element-wise sigmoid activation `1 / (1 + e^-x)`, also called logistic function.
    // Same as `torch.sigmoid(x)` from PyTorch.
    GGML_RWKV_OP_SIGMOID,
    GGML_RWKV_OP_SILU,
    GGML_RWKV_OP_NORM, // normalize
    GGML_RWKV_OP_RMS_NORM,

    GGML_RWKV_OP_MUL_MAT,

    GGML_RWKV_OP_SCALE,
    GGML_RWKV_OP_CPY,
    GGML_RWKV_OP_RESHAPE,
    GGML_RWKV_OP_VIEW,
    GGML_RWKV_OP_PERMUTE,
    GGML_RWKV_OP_TRANSPOSE,
    GGML_RWKV_OP_GET_ROWS,
    GGML_RWKV_OP_DIAG_MASK_INF,
    GGML_RWKV_OP_SOFT_MAX,
    GGML_RWKV_OP_ROPE,
    GGML_RWKV_OP_CONV_1D_1S,
    GGML_RWKV_OP_CONV_1D_2S,

    GGML_RWKV_OP_FLASH_ATTN,
    GGML_RWKV_OP_FLASH_FF,

    GGML_RWKV_OP_COUNT,
};

// n-dimensional tensor
struct ggml_rwkv_tensor {
    enum ggml_rwkv_type type;

    int    n_dims;
    int    ne[GGML_RWKV_MAX_DIMS]; // number of elements
    size_t nb[GGML_RWKV_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = sizeof(type)
                                   // nb[1] = nb[0]   * ne[0] + padding
                                   // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_rwkv_op op;

    bool is_param;

    struct ggml_rwkv_tensor * grad;
    struct ggml_rwkv_tensor * src0;
    struct ggml_rwkv_tensor * src1;
    struct ggml_rwkv_tensor * opt[GGML_RWKV_MAX_OPT];

    // thread scheduling
    int n_tasks;

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;

    void * data;
    char padding[8];
};

// computation graph
struct ggml_rwkv_cgraph {
    int n_nodes;
    int n_leafs;
    int n_threads;

    size_t work_size;
    struct ggml_rwkv_tensor * work;

    struct ggml_rwkv_tensor * nodes[GGML_RWKV_MAX_NODES];
    struct ggml_rwkv_tensor * grads[GGML_RWKV_MAX_NODES];
    struct ggml_rwkv_tensor * leafs[GGML_RWKV_MAX_NODES];

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;
};

// scratch buffer
struct ggml_rwkv_scratch {
    size_t offs;
    size_t size;
    void * data;
};

struct ggml_rwkv_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
};
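
// Minimal usage sketch for the context lifecycle (illustrative comment only,
// not part of the API; the 16 MB pool size is an arbitrary assumption):
//
//     struct ggml_rwkv_init_params params = {
//         .mem_size   = 16 * 1024 * 1024, // bytes reserved for the pool
//         .mem_buffer = NULL,             // NULL -> allocated internally
//     };
//     struct ggml_rwkv_context * ctx = ggml_rwkv_init(params);
//     // ... create tensors and build graphs here ...
//     size_t used = ggml_rwkv_used_mem(ctx);
//     ggml_rwkv_free(ctx);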

void    ggml_rwkv_time_init(void); // call this once at the beginning of the program
int64_t ggml_rwkv_time_ms(void);
int64_t ggml_rwkv_time_us(void);
int64_t ggml_rwkv_cycles(void);
int64_t ggml_rwkv_cycles_per_ms(void);

void ggml_rwkv_print_object (const struct ggml_rwkv_object * obj);
void ggml_rwkv_print_objects(const struct ggml_rwkv_context * ctx);

int    ggml_rwkv_nelements(const struct ggml_rwkv_tensor * tensor);
size_t ggml_rwkv_nbytes   (const struct ggml_rwkv_tensor * tensor);

int    ggml_rwkv_blck_size (enum ggml_rwkv_type type);
size_t ggml_rwkv_type_size (enum ggml_rwkv_type type); // size in bytes for all elements in a block
float  ggml_rwkv_type_sizef(enum ggml_rwkv_type type); // ggml_rwkv_type_size()/ggml_rwkv_blck_size() as float

size_t ggml_rwkv_element_size(const struct ggml_rwkv_tensor * tensor);

struct ggml_rwkv_context * ggml_rwkv_init(struct ggml_rwkv_init_params params);
void ggml_rwkv_free(struct ggml_rwkv_context * ctx);

size_t ggml_rwkv_used_mem(const struct ggml_rwkv_context * ctx);

size_t ggml_rwkv_set_scratch(struct ggml_rwkv_context * ctx, struct ggml_rwkv_scratch scratch);

bool ggml_rwkv_mlock_supported(void);
bool ggml_rwkv_mlock(struct ggml_rwkv_context * ctx, char ** err_p);

struct ggml_rwkv_tensor * ggml_rwkv_new_tensor(
        struct ggml_rwkv_context * ctx,
        enum   ggml_rwkv_type type,
        int    n_dims,
        const int * ne);

struct ggml_rwkv_tensor * ggml_rwkv_new_tensor_1d(
        struct ggml_rwkv_context * ctx,
        enum   ggml_rwkv_type type,
        int    ne0);

struct ggml_rwkv_tensor * ggml_rwkv_new_tensor_2d(
        struct ggml_rwkv_context * ctx,
        enum   ggml_rwkv_type type,
        int    ne0,
        int    ne1);

struct ggml_rwkv_tensor * ggml_rwkv_new_tensor_3d(
        struct ggml_rwkv_context * ctx,
        enum   ggml_rwkv_type type,
        int    ne0,
        int    ne1,
        int    ne2);

struct ggml_rwkv_tensor * ggml_rwkv_new_tensor_4d(
        struct ggml_rwkv_context * ctx,
        enum   ggml_rwkv_type type,
        int    ne0,
        int    ne1,
        int    ne2,
        int    ne3);

struct ggml_rwkv_tensor * ggml_rwkv_new_i32(struct ggml_rwkv_context * ctx, int32_t value);
struct ggml_rwkv_tensor * ggml_rwkv_new_f32(struct ggml_rwkv_context * ctx, float value);

struct ggml_rwkv_tensor * ggml_rwkv_dup_tensor (struct ggml_rwkv_context * ctx, const struct ggml_rwkv_tensor * src);
struct ggml_rwkv_tensor * ggml_rwkv_view_tensor(struct ggml_rwkv_context * ctx, const struct ggml_rwkv_tensor * src);

struct ggml_rwkv_tensor * ggml_rwkv_set_zero(struct ggml_rwkv_tensor * tensor);
struct ggml_rwkv_tensor * ggml_rwkv_set_i32 (struct ggml_rwkv_tensor * tensor, int32_t value);
struct ggml_rwkv_tensor * ggml_rwkv_set_f32 (struct ggml_rwkv_tensor * tensor, float value);

int32_t ggml_rwkv_get_i32_1d(const struct ggml_rwkv_tensor * tensor, int i);
void    ggml_rwkv_set_i32_1d(const struct ggml_rwkv_tensor * tensor, int i, int32_t value);

float ggml_rwkv_get_f32_1d(const struct ggml_rwkv_tensor * tensor, int i);
void  ggml_rwkv_set_f32_1d(const struct ggml_rwkv_tensor * tensor, int i, float value);

void *  ggml_rwkv_get_data    (const struct ggml_rwkv_tensor * tensor);
float * ggml_rwkv_get_data_f32(const struct ggml_rwkv_tensor * tensor);
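
// Usage sketch for tensor creation and element access (illustrative comment
// only; assumes a valid `ctx` obtained from ggml_rwkv_init):
//
//     struct ggml_rwkv_tensor * t = ggml_rwkv_new_tensor_1d(ctx, GGML_RWKV_TYPE_F32, 4);
//     ggml_rwkv_set_f32(t, 1.0f);               // fill every element with 1.0
//     ggml_rwkv_set_f32_1d(t, 2, 5.0f);         // t[2] = 5.0
//     float v     = ggml_rwkv_get_f32_1d(t, 2); // v == 5.0
//     float * raw = ggml_rwkv_get_data_f32(t);  // direct pointer to the 4 floats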

//
// operations on tensors with backpropagation
//

struct ggml_rwkv_tensor * ggml_rwkv_dup(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_add(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);
struct ggml_rwkv_tensor * ggml_rwkv_sub(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);
struct ggml_rwkv_tensor * ggml_rwkv_mul(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);
struct ggml_rwkv_tensor * ggml_rwkv_div(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

struct ggml_rwkv_tensor * ggml_rwkv_sqr (struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_sqrt(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// return scalar
// TODO: compute sum along rows
struct ggml_rwkv_tensor * ggml_rwkv_sum(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// mean along rows
struct ggml_rwkv_tensor * ggml_rwkv_mean(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// if a is the same shape as b, and a is not a parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
struct ggml_rwkv_tensor * ggml_rwkv_repeat(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

struct ggml_rwkv_tensor * ggml_rwkv_abs(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_sgn(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_neg(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_exp(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_1_minus_x(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_max(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

struct ggml_rwkv_tensor * ggml_rwkv_step(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_relu(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// TODO: double-check this computation is correct
struct ggml_rwkv_tensor * ggml_rwkv_gelu(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_sigmoid(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);
struct ggml_rwkv_tensor * ggml_rwkv_silu   (struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
struct ggml_rwkv_tensor * ggml_rwkv_norm(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_rms_norm(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
struct ggml_rwkv_tensor * ggml_rwkv_mul_mat(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);
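
// Shape sketch for ggml_rwkv_mul_mat (illustrative comment only; assumes ne[0]
// is the row length, i.e. the number of columns, per the stride layout above):
//
//     // a: 2 rows, 3 columns -> ne = { 3, 2 }
//     struct ggml_rwkv_tensor * a = ggml_rwkv_new_tensor_2d(ctx, GGML_RWKV_TYPE_F32, 3, 2);
//     // b: 4 rows, 3 columns -> ne = { 3, 4 }; shares the inner dimension with a
//     struct ggml_rwkv_tensor * b = ggml_rwkv_new_tensor_2d(ctx, GGML_RWKV_TYPE_F32, 3, 4);
//     // c: ne = { 2, 4 }, i.e. 2 columns and 4 rows, matching the comment above
//     struct ggml_rwkv_tensor * c = ggml_rwkv_mul_mat(ctx, a, b);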

//
// operations on tensors without backpropagation
//

// in-place, returns view(a)
struct ggml_rwkv_tensor * ggml_rwkv_scale(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

// a -> b, return view(b)
struct ggml_rwkv_tensor * ggml_rwkv_cpy(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_rwkv_tensor * ggml_rwkv_reshape(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_rwkv_tensor * ggml_rwkv_reshape_2d(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int ne0,
        int ne1);

// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_rwkv_tensor * ggml_rwkv_reshape_3d(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int ne0,
        int ne1,
        int ne2);

// offset in bytes
struct ggml_rwkv_tensor * ggml_rwkv_view_1d(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int ne0,
        size_t offset);

struct ggml_rwkv_tensor * ggml_rwkv_view_2d(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int ne0,
        int ne1,
        size_t nb1, // row stride in bytes
        size_t offset);

struct ggml_rwkv_tensor * ggml_rwkv_permute(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int axis0,
        int axis1,
        int axis2,
        int axis3);

// alias for ggml_rwkv_permute(ctx, a, 1, 0, 2, 3)
struct ggml_rwkv_tensor * ggml_rwkv_transpose(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

struct ggml_rwkv_tensor * ggml_rwkv_get_rows(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

// set elements above the diagonal to -INF
// in-place, returns view(a)
struct ggml_rwkv_tensor * ggml_rwkv_diag_mask_inf(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, int n_past);

// in-place, returns view(a)
struct ggml_rwkv_tensor * ggml_rwkv_soft_max(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a);

// rotary position embedding
// in-place, returns view(a)
// if mode == 1, skip n_past elements
// TODO: avoid creating a new tensor every time
struct ggml_rwkv_tensor * ggml_rwkv_rope(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        int n_past,
        int n_dims,
        int mode);
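
// Usage sketch for shape manipulation (illustrative comment only; assumes `t`
// is a contiguous F32 tensor holding 12 elements; both calls return views over
// the same data, not copies):
//
//     // interpret t as a matrix with 4 columns and 3 rows (ne = { 4, 3 })
//     struct ggml_rwkv_tensor * m  = ggml_rwkv_reshape_2d(ctx, t, 4, 3);
//     // swap the first two axes; shorthand for ggml_rwkv_permute(ctx, m, 1, 0, 2, 3)
//     struct ggml_rwkv_tensor * mt = ggml_rwkv_transpose(ctx, m);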

// padding = 1
// TODO: we don't support extra parameters for now
//       that's why we are hard-coding the stride, padding, and dilation
//       not great ..
struct ggml_rwkv_tensor * ggml_rwkv_conv_1d_1s(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);
struct ggml_rwkv_tensor * ggml_rwkv_conv_1d_2s(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * a, struct ggml_rwkv_tensor * b);

struct ggml_rwkv_tensor * ggml_rwkv_flash_attn(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * q,
        struct ggml_rwkv_tensor * k,
        struct ggml_rwkv_tensor * v,
        bool masked);

struct ggml_rwkv_tensor * ggml_rwkv_flash_ff(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_tensor * a,
        struct ggml_rwkv_tensor * b0,
        struct ggml_rwkv_tensor * b1,
        struct ggml_rwkv_tensor * c0,
        struct ggml_rwkv_tensor * c1);

//
// automatic differentiation
//

void ggml_rwkv_set_param(struct ggml_rwkv_context * ctx, struct ggml_rwkv_tensor * tensor);

void ggml_rwkv_build_forward_expand(struct ggml_rwkv_cgraph * cgraph, struct ggml_rwkv_tensor * tensor);

struct ggml_rwkv_cgraph ggml_rwkv_build_forward (struct ggml_rwkv_tensor * tensor);
struct ggml_rwkv_cgraph ggml_rwkv_build_backward(struct ggml_rwkv_context * ctx, struct ggml_rwkv_cgraph * gf, bool keep);

void ggml_rwkv_graph_compute(struct ggml_rwkv_context * ctx, struct ggml_rwkv_cgraph * cgraph);
void ggml_rwkv_graph_reset  (struct ggml_rwkv_cgraph * cgraph);

// print info and performance information for the graph
void ggml_rwkv_graph_print(const struct ggml_rwkv_cgraph * cgraph);

// dump the graph into a file using the dot format
void ggml_rwkv_graph_dump_dot(const struct ggml_rwkv_cgraph * gb, const struct ggml_rwkv_cgraph * gf, const char * filename);
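
// Usage sketch for building and running a forward graph (illustrative comment
// only; assumes `ctx` plus two F32 tensors `x` and `y` of the same shape, and
// the thread count of 4 is an arbitrary choice):
//
//     struct ggml_rwkv_tensor * z = ggml_rwkv_add(ctx, x, y);
//
//     struct ggml_rwkv_cgraph graph = ggml_rwkv_build_forward(z);
//     graph.n_threads = 4;
//     ggml_rwkv_graph_compute(ctx, &graph);
//
//     // the result is now in z; read it back with ggml_rwkv_get_data_f32(z)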

//
// optimization
//

// optimization methods
enum ggml_rwkv_opt_type {
    GGML_RWKV_OPT_ADAM,
    GGML_RWKV_OPT_LBFGS,
};

// linesearch methods
enum ggml_rwkv_linesearch {
    GGML_RWKV_LINESEARCH_DEFAULT = 1,

    GGML_RWKV_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
    GGML_RWKV_LINESEARCH_BACKTRACKING_WOLFE        = 1,
    GGML_RWKV_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
};

// optimization return values
enum ggml_rwkv_opt_result {
    GGML_RWKV_OPT_OK = 0,
    GGML_RWKV_OPT_DID_NOT_CONVERGE,
    GGML_RWKV_OPT_NO_CONTEXT,
    GGML_RWKV_OPT_INVALID_WOLFE,
    GGML_RWKV_OPT_FAIL,

    GGML_RWKV_LINESEARCH_FAIL = -128,
    GGML_RWKV_LINESEARCH_MINIMUM_STEP,
    GGML_RWKV_LINESEARCH_MAXIMUM_STEP,
    GGML_RWKV_LINESEARCH_MAXIMUM_ITERATIONS,
    GGML_RWKV_LINESEARCH_INVALID_PARAMETERS,
};

// optimization parameters
//
// see ggml.c (ggml_rwkv_opt_default_params) for default values
//
struct ggml_rwkv_opt_params {
    enum ggml_rwkv_opt_type type;

    int n_threads;

    // delta-based convergence test
    //
    // if past == 0 - disabled
    // if past > 0:
    //   stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
    //
    int past;
    float delta;

    // maximum number of iterations without improvement
    //
    // if 0 - disabled
    // if > 0:
    //   assume convergence if no cost improvement in this number of iterations
    //
    int max_no_improvement;

    bool print_forward_graph;
    bool print_backward_graph;

    // ADAM parameters
    struct {
        int n_iter;

        float alpha; // learning rate
        float beta1;
        float beta2;
        float eps;   // epsilon for numerical stability
        float eps_f; // epsilon for convergence test
        float eps_g; // epsilon for convergence test
    } adam;

    // LBFGS parameters
    struct {
        int m; // number of corrections to approximate the inv. Hessian
        int n_iter;
        int max_linesearch;

        float eps;  // convergence tolerance
        float ftol; // line search tolerance
        float wolfe;
        float min_step;
        float max_step;

        enum ggml_rwkv_linesearch linesearch;
    } lbfgs;
};

struct ggml_rwkv_opt_params ggml_rwkv_opt_default_params(enum ggml_rwkv_opt_type type);

// optimize the function defined by the tensor f
enum ggml_rwkv_opt_result ggml_rwkv_opt(
        struct ggml_rwkv_context * ctx,
        struct ggml_rwkv_opt_params params,
        struct ggml_rwkv_tensor * f);

//
// quantization
//

size_t ggml_rwkv_quantize_q4_0  (const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_rwkv_quantize_q4_1  (const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_rwkv_quantize_q4_1_o(const float * src, void * dst, int n, int k, int64_t * hist);

//
// system info
//

int ggml_rwkv_cpu_has_avx(void);
int ggml_rwkv_cpu_has_avx2(void);
int ggml_rwkv_cpu_has_avx512(void);
int ggml_rwkv_cpu_has_fma(void);
int ggml_rwkv_cpu_has_neon(void);
int ggml_rwkv_cpu_has_arm_fma(void);
int ggml_rwkv_cpu_has_f16c(void);
int ggml_rwkv_cpu_has_fp16_va(void);
int ggml_rwkv_cpu_has_wasm_simd(void);
int ggml_rwkv_cpu_has_blas(void);
int ggml_rwkv_cpu_has_sse3(void);
int ggml_rwkv_cpu_has_vsx(void);

// Run test suite for ggml.
// Exits normally if all tests pass.
// Aborts the execution if any test did not pass.
void ggml_rwkv_run_test_suite(void);

#ifdef __cplusplus
}
#endif