mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-31 05:03:44 +00:00
* spec: support MTP
* fix batch size
* rename files
* cont : simplify (#7)
* MTP: clean-up (#9)
* MTP: clean-up
* review: use llama_context_type instead of llama_graph_type
* review: remove llama_model_has_mtp
* review: fix convert issues
* convert: fix pycheck
* review: formatting
* use `mtp-` for identifying mtp models
* convert: fix mtp conversion
* mtp -> draft-mtp
* remove unused llama_arch
* add need_embd in speculative
* llama: allow partial seq_rm for GDN models for speculative decoding
Currently, speculative checkpointing needs to restart from a checkpoint
when some draft tokens are not accepted, which wastes work by
running the target again. This PR adds the ability to roll back up to
`draft_max` by storing the GDN intermediates.
* fix pending state
* vulkan: add GDN partial rollback
* meta: extend check to axis 1
* metal: add GDN partial rollback
Extend the gated delta net kernel to store intermediate states for
partial rollback support on the Metal backend.
- Add K (snapshot slot count) as a function constant
- Read input state from slot 0 of the 3D state tensor
- Write intermediate states to different slots during token loop
- For K=1, maintain backward-compatible single-slot behavior
Ref: 8c05923630
Assisted-by: llama.cpp:local pi
* delta_net_base: use ggml_pad instead of new_tensor
* review: add need_rs_seq
* review: rename part_bounded to n_rs
* review: deslop comments
* review: rename, add asserts
* server : adjust checkpoint logic (#11)
* server : adjust checkpoint logic
* cont : rm asserts
* server-context: fix early exit
* spec : fix compatibility with n-gram and add TODOs (#13)
* metal : cleanup
* llama : fix faulty bitwise check in recurrent memory
* server : disable RS-based MTP in combination with other spec types
* spec : add TODOs
* cont : fix comment
* cont : update comment
* common : fix logic for ngram + mtp compat
* llama-memory: enable checkpointing with partial rollback
* cont: add test-case for loading into a dirty ctx
* llama-memory-recurrent: clear rs_idx in clear
* download: fix mtp path
* llama-arch: fix enorm op
* docs: update docs
* conversion: fix type annotations
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
207 lines
7.5 KiB
C++
207 lines
7.5 KiB
C++
#pragma once
|
|
|
|
#include "llama.h"
|
|
|
|
#include "llama-impl.h"
|
|
#include "llama-arch.h"
|
|
#include "llama-hparams.h"
|
|
#include "llama-mmap.h"
|
|
|
|
#include "ggml-cpp.h"
|
|
|
|
#include <cstddef>
|
|
#include <cstring>
|
|
#include <map>
|
|
#include <stdexcept>
|
|
#include <unordered_map>
|
|
|
|
// maps a uint32_t key to a backend buffer handle
// NOTE(review): the key is presumably a file/context index - confirm against the users of this map
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
|
|
|
// lists of buffer types used for each layer
|
|
// each entry pairs a backend device with one buffer type
// NOTE(review): entry order presumably encodes preference - confirm against the code that fills these lists
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
|
|
|
// GGUF file format versions known to the loader.
// The numeric values are spelled out explicitly so that each enumerator
// keeps its value even if the list is reordered or extended.
enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};
|
|
|
|
// NOTE(review): presumably returns a printable name for `version`; the definition is not in this header - confirm
const char * llama_file_version_name(llama_fver version);
|
|
|
|
struct llama_model_loader {
|
|
// Holds information on a model weight
|
|
struct llama_tensor_weight {
|
|
uint16_t idx; // source file index
|
|
size_t offs; // tensor data offset in the original file
|
|
|
|
ggml_tensor * tensor;
|
|
|
|
llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
|
const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
|
|
if (tensor_idx < 0) {
|
|
throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
|
|
}
|
|
|
|
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
|
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
|
|
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
|
|
}
|
|
}
|
|
};
|
|
|
|
// custom comparator to sort weights more nicely by layer
|
|
struct weight_name_comparer {
|
|
bool operator()(const std::string & a, const std::string & b) const {
|
|
int a_layer = -1;
|
|
int b_layer = -1;
|
|
sscanf(a.c_str(), "blk.%d.", &a_layer);
|
|
sscanf(b.c_str(), "blk.%d.", &b_layer);
|
|
if (a_layer != b_layer) {
|
|
return a_layer < b_layer;
|
|
}
|
|
return a < b;
|
|
}
|
|
};
|
|
|
|
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
|
static const int TENSOR_DUPLICATED = 1 << 1;
|
|
static const int TENSOR_SKIP = 1 << 2;
|
|
static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
|
|
|
|
int n_kv = 0;
|
|
int n_tensors = 0;
|
|
int n_created = 0;
|
|
|
|
uint64_t n_elements = 0;
|
|
size_t n_bytes = 0;
|
|
|
|
bool use_mmap = false;
|
|
bool use_direct_io = false;
|
|
bool check_tensors;
|
|
bool no_alloc;
|
|
|
|
llama_files files;
|
|
llama_ftype ftype;
|
|
llama_fver fver;
|
|
|
|
llama_mmaps mappings;
|
|
|
|
std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
|
|
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
|
|
const llama_model_tensor_buft_override * tensor_buft_overrides;
|
|
|
|
gguf_context_ptr metadata_ptr;
|
|
struct gguf_context * metadata; // either metadata_ptr.get() or externally set
|
|
llama_model_set_tensor_data_t set_tensor_data;
|
|
void * set_tensor_data_ud;
|
|
std::vector<ggml_context_ptr> contexts;
|
|
|
|
std::string arch_name;
|
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
|
|
|
size_t size_done = 0;
|
|
size_t size_data = 0;
|
|
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
|
|
|
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
|
struct ggml_backend_buft_comparator {
|
|
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
|
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
|
}
|
|
};
|
|
|
|
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
|
|
|
// track tensors that had to be moved for debugging:
|
|
size_t n_tensors_moved = 0;
|
|
std::string first_tensor_moved_name;
|
|
std::string first_tensor_moved_type_name;
|
|
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
|
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
|
|
|
llama_model_loader(
|
|
struct gguf_context * metadata,
|
|
llama_model_set_tensor_data_t set_tensor_data,
|
|
void * set_tensor_data_ud,
|
|
const std::string & fname,
|
|
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
|
FILE * file,
|
|
bool use_mmap,
|
|
bool use_direct_io,
|
|
bool check_tensors,
|
|
bool no_alloc,
|
|
const llama_model_kv_override * param_overrides_p,
|
|
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
|
|
|
template<typename T>
|
|
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
|
get_arr_n(const std::string & key, T & result, bool required = true);
|
|
|
|
template<typename T>
|
|
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
|
get_arr_n(enum llm_kv kid, T & result, bool required = true);
|
|
|
|
template<typename T>
|
|
bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);
|
|
|
|
template<typename T, size_t N_MAX>
|
|
bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);
|
|
|
|
template<typename T>
|
|
bool get_arr(enum llm_kv kid, T & result, bool required = true);
|
|
|
|
template<typename T>
|
|
bool get_key(const std::string & key, T & result, bool required = true);
|
|
|
|
template<typename T>
|
|
bool get_key(enum llm_kv kid, T & result, bool required = true);
|
|
|
|
template<typename T, size_t N_MAX>
|
|
bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);
|
|
|
|
template<typename T>
|
|
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
|
|
|
|
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
|
|
|
|
std::string get_arch_name() const;
|
|
|
|
enum llm_arch get_arch() const;
|
|
|
|
const llama_tensor_weight * get_weight(const char * name) const;
|
|
|
|
const llama_tensor_weight & require_weight(const char * name) const;
|
|
|
|
struct ggml_tensor * get_tensor_meta(const char * name) const;
|
|
|
|
struct ggml_tensor * require_tensor_meta(const std::string & name) const;
|
|
|
|
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
|
|
|
|
struct ggml_tensor * create_tensor(
|
|
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
|
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
|
|
|
|
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
|
|
|
|
void done_getting_tensors(bool partial = false) const;
|
|
|
|
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
|
|
|
|
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
|
|
|
|
// for backwards compatibility, does not support ggml-backend
|
|
void load_data_for(struct ggml_tensor * cur) const;
|
|
|
|
// Returns false if cancelled by progress_callback
|
|
bool load_all_data(
|
|
struct ggml_context * ctx,
|
|
llama_buf_map & bufs,
|
|
llama_mlocks * lmlocks,
|
|
llama_progress_callback progress_callback,
|
|
void * progress_callback_user_data);
|
|
|
|
std::string ftype_name() const;
|
|
|
|
void print_info() const;
|
|
};
|