Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)

Commit bdfe8526b8: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.gitignore
#	CONTRIBUTING.md
#	Makefile
#	examples/llava/CMakeLists.txt
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	src/llama-vocab.cpp

44 changed files with 2241 additions and 439 deletions
@@ -685,14 +685,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1655,6 +1665,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
                         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});

 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1767,6 +1778,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
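string_replace_all() replaces every occurrence of search in place and advances past the inserted text, so a replacement that itself contains the search string cannot loop forever; the empty-search guard covers the remaining degenerate case. A small illustrative sketch of the resulting behaviour (the literals are made up, not from the patch):

```cpp
#include <cassert>
#include <string>

// declared in common.h (see the header hunks below)
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

int main() {
    std::string s = "a-b-c";
    string_replace_all(s, "-", "--"); // advancing by replace.length() skips the '-' just inserted
    assert(s == "a--b--c");
    string_replace_all(s, "", "x");   // empty search returns early instead of looping forever
    assert(s == "a--b--c");
}
```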
@@ -2092,17 +2114,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }

-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

     if (params.ignore_eos) {
@@ -2141,6 +2168,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }

+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
+}
+
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();

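Since llama_lora_adapters_apply() clears the context and then re-sets every adapter whose scale is non-zero, a caller can toggle or re-weight adapters that are already loaded by editing the container list and calling it again. A minimal sketch of that pattern, assuming iparams was returned by llama_init_from_gpt_params() as in the hunk above (the scale values are illustrative):

```cpp
// iparams.lora_adapters holds the adapters loaded during llama_init_from_gpt_params().
std::vector<llama_lora_adapter_container> & adapters = iparams.lora_adapters;
if (!adapters.empty()) {
    adapters[0].scale = 0.0f; // a zero scale is skipped by llama_lora_adapters_apply(), effectively disabling it
}
for (size_t i = 1; i < adapters.size(); ++i) {
    adapters[i].scale = 0.5f; // illustrative re-weighting
}
llama_lora_adapters_apply(iparams.context, adapters); // clear, then re-apply the non-zero entries
```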
@@ -3163,19 +3199,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }

     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-        fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
@@ -33,6 +33,15 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info

 struct llama_control_vector_load_info;
@@ -148,8 +157,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -299,6 +308,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -333,6 +344,7 @@ std::string fs_get_cache_file(const std::string & filename);
 struct llama_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
 };

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
@@ -343,6 +355,9 @@ struct llama_context_params llama_context_params_from_gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils

 void llama_batch_clear(struct llama_batch & batch);
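Taken together, the header changes let a program fill gpt_params::lora_adapters directly instead of the old path/scale tuples, postpone application with lora_init_without_apply, and apply later through the new helper. A minimal sketch under those assumptions (the file names are placeholders):

```cpp
gpt_params params;
params.model = "model.gguf";                              // placeholder model path
params.lora_adapters.push_back({ "adapter.gguf", 1.0f }); // llama_lora_adapter_info{ path, scale }
params.lora_init_without_apply = true;                    // load into memory only, do not apply yet

llama_init_result iparams = llama_init_from_gpt_params(params);
// ... later, when the adapter should actually take effect:
llama_lora_adapters_apply(iparams.context, iparams.lora_adapters);
```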
@@ -251,12 +251,7 @@ class Model:

        return [(self.map_tensor_name(name), data_torch)]

-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
-
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused

         return False
@@ -285,55 +280,47 @@ class Model:
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
                 n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
-
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32

+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.uint16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
-
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
-                        data_qtype = gguf.GGMLQuantizationType.F16
-
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                        )
+                    )
+                    or not name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)

                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

                 # reverse shape to make it similar to the internal ggml dimension order
@@ -1765,7 +1752,7 @@ class DbrxModel(Model):

         return [(new_name, data_torch)]

-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused

         return n_dims > 1
@@ -2786,18 +2773,22 @@ class MambaModel(Model):

         return [(new_name, data_torch)]

-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del n_dims  # unused
-
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        if bid is not None and new_name in (
+            self.format_tensor_name(
+                n, bid, ".weight" if name.endswith(".weight") else ""
+            )
+            for n in [
                 gguf.MODEL_TENSOR.SSM_CONV1D,
                 gguf.MODEL_TENSOR.SSM_X,
                 gguf.MODEL_TENSOR.SSM_DT,
                 gguf.MODEL_TENSOR.SSM_A,
                 gguf.MODEL_TENSOR.SSM_D,
             ]
-        )
+        ):
+            return gguf.GGMLQuantizationType.F32
+
+        return super().tensor_force_quant(name, new_name, bid, n_dims)


 @Model.register("CohereForCausalLM")
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
 ```

 ### Windows:

 ```powershell
-llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
@@ -50,11 +50,11 @@ The above command will output space-separated float values.
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```

 ### Windows:

 ```powershell
-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
@@ -50,20 +50,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
     return ctx_gguf;
 }

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 struct file_input {
     struct ggml_context * ctx_meta = nullptr;
     struct gguf_context * ctx_gguf = nullptr;
@@ -135,7 +121,7 @@ struct lora_merge_ctx {

     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::vector<llama_lora_adapter_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -144,9 +130,9 @@ struct lora_merge_ctx {
             throw std::runtime_error("split model is not yet supported");
         }

-        for (auto lora_inp : lora_files) {
-            auto fname = std::get<0>(lora_inp);
-            auto scale = std::get<1>(lora_inp);
+        for (auto & lora_inp : lora_files) {
+            auto fname = lora_inp.path;
+            auto scale = lora_inp.scale;
             std::unique_ptr<file_input> adapter(new file_input(fname, scale));
             check_metadata_lora(adapter.get());
             adapters.push_back(std::move(adapter));
@@ -407,7 +393,7 @@ int main(int argc, char ** argv) {

     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
@@ -28,6 +28,14 @@
 #include "ggml-cann.h"
 #endif

+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 // utils
 static uint64_t get_time_ns() {
     using clock = std::chrono::high_resolution_clock;
@@ -97,6 +105,27 @@ static std::string get_cpu_info() {
            }
        }
        fclose(f);
    }
+#elif defined(_WIN32)
+    HKEY hKey;
+    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                    TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                    0,
+                    KEY_READ,
+                    &hKey) != ERROR_SUCCESS) {
+        // fail to open registry key
+        return "";
+    }
+    char cpu_brand[256];
+    DWORD cpu_brand_size = sizeof(cpu_brand);
+    if (RegQueryValueExA(hKey,
+                        TEXT("ProcessorNameString"),
+                        NULL,
+                        NULL,
+                        (LPBYTE)cpu_brand,
+                        &cpu_brand_size) == ERROR_SUCCESS) {
+        id.assign(cpu_brand, cpu_brand_size);
+    }
+    RegCloseKey(hKey);
 #endif
     // TODO: other platforms
     return id;
examples/llava/README-minicpmv2.5.md (new file, 99 lines)
@@ -0,0 +1,99 @@
+## MiniCPM-Llama3-V 2.5
+
+### Prepare models and code
+
+Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+### Usage
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
+
+```bash
+python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that build on Android device would bring better runtime performance, so we recommend to build on device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+```
@@ -80,6 +80,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
@@ -127,12 +128,20 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"

+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -140,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };

@@ -200,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
     }
-    s = std::move(result);
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -492,12 +499,33 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_mlp_2_b;
     struct ggml_tensor * mm_model_peg_0_w;
     struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };

 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -522,9 +550,11 @@ struct clip_ctx {

     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size * load_image_size;
 };

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -534,19 +564,32 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const auto & hparams = model.hparams;

     const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
-    const int n_layer = hparams.n_layer;
+    int n_layer = hparams.n_layer;
     const float eps = hparams.eps;

     const int batch_size = imgs->size;

-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }

@@ -559,7 +602,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);

@@ -572,9 +615,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }

-    // concat class_embeddings and patch_embeddings
     struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * pos_embed = nullptr;
+
+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
     if (ctx->has_class_embedding) {
         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
         ggml_set_name(embeddings, "embeddings");
@@ -584,7 +629,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_acc(ctx0, embeddings, inp,
             embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
     }
+    }

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -593,6 +638,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
+
     // pre-layernorm
     if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -602,6 +655,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // loop over layers
+    if (ctx->has_minicpmv_projector) {
+        n_layer += 1;
+    }
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

@@ -691,7 +747,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // llava projector
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -872,6 +928,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ABORT("fatal error");
         }
     }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+
+            { // attention
+                const int hidden_size = 4096;
+                const int d_head = 128;
+                const int n_head = hidden_size/d_head;
+                const int num_query = 96;
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
+        else {
+            GGML_ASSERT(false);
+        }
+    }

     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -1029,7 +1144,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
         }

-        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
+        if (idx != -1) {
+            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);

|
||||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
|
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
@@ -1281,6 +1403,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
         vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
     }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+        vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
+        vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
+        vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
+        vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
+        vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
+        vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
+        vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
+        vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
+        vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
+        vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
+        vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
+        vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
+        vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
+        vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
+        vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
+        vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
+        vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
+        vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+    }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1319,7 +1462,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
         LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
@@ -1328,6 +1471,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }

+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = load_image_size;
+}
+
+struct clip_image_size * clip_image_size_init() {
+    struct clip_image_size * load_image_size = new struct clip_image_size();
+    load_image_size->width = 448;
+    load_image_size->height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
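clip_image_size_init() defaults to 448x448 and clip_add_load_image_size() stores the size that the minicpmv branch of clip_image_build_graph() later reads back. A short sketch of how a caller might register the real input size before encoding (the ctx_clip variable and the 672x448 size are illustrative):

```cpp
struct clip_image_size * load_image_size = clip_image_size_init(); // 448x448 by default
load_image_size->width  = 672;  // illustrative slice width
load_image_size->height = 448;
clip_add_load_image_size(ctx_clip, load_image_size); // read by clip_image_build_graph() when has_minicpmv_projector is set
```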
@@ -1598,9 +1752,184 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
     return patches;
 }
 
+static int ensure_divide(int length, int patch_size) {
+    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+}
+
+static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width = original_size.first;
+    int height = original_size.second;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height = static_cast<int>(scale_resolution / std::sqrt(r));
+        width = static_cast<int>(height * r);
+    }
+    int best_width = ensure_divide(width, patch_size);
+    int best_height = ensure_divide(height, patch_size);
+    return std::make_pair(best_width, best_height);
+}
+
+static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width, height;
+    std::tie(width, height) = original_size;
+    int grid_x, grid_y;
+    std::tie(grid_x, grid_y) = grid;
+
+    int refine_width = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    int grid_width = refine_width / grid_x;
+    int grid_height = refine_height / grid_y;
+
+    // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
+    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
+    int best_grid_width, best_grid_height;
+    std::tie(best_grid_width, best_grid_height) = best_grid_size;
+
+    // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
+    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
+    return refine_size;
+}
+
+inline int clip(int x, int lower, int upper) {
+    return std::max(lower, std::min(x, upper));
+}
+
+static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<std::pair<int, int>> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.emplace_back(m, split_grids_nums / m);
+            }
+            ++m;
+        }
+    }
+
+    std::pair<int, int> best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
+
+// inspired from LLaVA-UHD:
+//    -> https://arxiv.org/pdf/2403.11703
+//    -> https://github.com/thunlp/LLaVA-UHD
+//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+    const std::pair<int, int> original_size={img->nx,img->ny};
+    const int original_width = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
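To make the slice-grid selection above concrete: uhd_best_grid() scores every factorization m x (n/m) of the candidate slice counts {multiple-1, multiple, multiple+1} by how far log(m/(n/m)) is from the image's log aspect ratio. The following standalone sketch re-derives that arithmetic for a hypothetical 1344x896 input (the resolution is an arbitrary illustration, not a value from the patch):

    // Minimal sketch of the grid selection above, assuming a 1344x896 input
    // and the defaults used in uhd_slice_image() (scale_resolution = 448,
    // max_slice_nums = 9). Not part of the patch.
    #include <cmath>
    #include <cstdio>
    #include <utility>

    int main() {
        const int w = 1344, h = 896, scale_resolution = 448, max_slice_nums = 9;
        const float ratio     = 1.0f * w * h / (scale_resolution * scale_resolution);      // 6.0
        const int   multiple  = (int) std::fmin(std::ceil(ratio), (float) max_slice_nums); // 6
        const float log_ratio = std::log(1.0 * w / h);                                     // log(1.5) ~ 0.405

        const int candidates[] = { multiple - 1, multiple, multiple + 1 };                 // 5, 6, 7
        std::pair<int, int> best(1, 1);
        float best_err = INFINITY;
        for (int n : candidates) {
            if (n == 1 || n > max_slice_nums) continue;
            for (int m = 1; m <= n; ++m) {
                if (n % m != 0) continue;
                const float err = std::fabs(log_ratio - std::log(1.0 * m / (n / m)));
                if (err < best_err) { best_err = err; best = std::make_pair(m, n / m); }
            }
        }
        // prints "best grid: 3 x 2"
        printf("best grid: %d x %d\n", best.first, best.second);
        return 0;
    }

With these inputs the refined image is split into a 3x2 grid, so uhd_slice_image() returns the downscaled overview plus six slices, and clip_uhd_num_image_embeds_col() reports three embeds per row.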
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+    if (clip_is_minicpmv(ctx)) {
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1816,11 +2145,99 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        n_patches = 96;
     }
 
     return n_patches;
 }
 
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
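The three helpers above build a fixed 2D sinusoidal position table: each spatial axis receives half of the embedding width, and within an axis the first half of its channels carries sines and the second half cosines. Restating the loops with D = embed_dim and (h, w) the patch-grid coordinates:

    \omega_i = 10000^{-i/(D/4)}, \qquad i = 0, \dots, \tfrac{D}{4} - 1
    \mathrm{PE}(h, w) = \big[\, \sin(h\,\omega_i) \;\|\; \cos(h\,\omega_i) \;\|\; \sin(w\,\omega_i) \;\|\; \cos(w\,\omega_i) \,\big]_{i=0}^{D/4-1}

Note that get_2d_sincos_pos_embed() flattens the H x W table column-major (index w * H + h), which is the layout copied into the pos_embed tensor in clip_image_batch_encode() further down.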
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1843,9 +2260,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
@@ -1853,8 +2273,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & hparams = model.hparams;
 
     const int image_size = hparams.image_size;
+    int image_size_width = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
 
     {
@@ -1864,7 +2290,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
+            if (!ctx->has_minicpmv_projector) {
                 GGML_ASSERT(nx == image_size && ny == image_size);
+            }
 
             const int n = nx * ny;
 
@@ -1881,7 +2309,44 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = std::floor(70.0*i/num_positions);
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            if(ctx->load_image_size==nullptr){
+                ctx->load_image_size= clip_image_size_init();
+            }
+            int pos_w = ctx->load_image_size->width/patch_size;
+            int pos_h = ctx->load_image_size->height/patch_size;
+            int embed_dim = 4096;
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
+        }
+    } else {
     {
         if (ctx->has_class_embedding) {
             struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@@ -1913,6 +2378,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
         free(patches_data);
     }
+    }
 
     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
@@ -2081,7 +2547,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        return 4096;
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
+
+bool clip_is_minicpmv(const struct clip_ctx * ctx) {
+    return ctx->has_minicpmv_projector;
+}
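As a sanity check on the sizes involved in the MiniCPM-V branch above (a sketch assuming a single 448x448 slice; the 96-token and 4096-dimension figures come from clip_n_patches() and clip_n_mmproj_embd() for PROJECTOR_TYPE_RESAMPLER):

    // Minimal sketch, assuming one 448x448 slice with patch_size 14. Not part of the patch.
    #include <cstdio>

    int main() {
        const int slice_w = 448, slice_h = 448, patch = 14, embed_dim = 4096;
        const int pos_w = slice_w / patch;        // 32
        const int pos_h = slice_h / patch;        // 32
        const int num_positions = pos_w * pos_h;  // 1024 patch positions per slice

        // "positions" input: one int per position, bucketed into [0, 70) by
        // positions_data[i] = floor(70.0 * i / num_positions)
        printf("positions buffer: %zu bytes\n", sizeof(int) * num_positions);
        // "pos_embed" input: num_positions x embed_dim floats, about 16 MiB per slice
        printf("pos_embed buffer: %zu bytes\n", sizeof(float) * num_positions * embed_dim);
        // after the resampler, each slice contributes 96 tokens of 4096 floats to the LLM
        printf("slice embedding:  %zu bytes\n", sizeof(float) * 96 * embed_dim);
        return 0;
    }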
@@ -18,14 +18,17 @@
 #    define CLIP_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct clip_ctx;
 
+struct clip_image_size {
+    int width;
+    int height;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+
+CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
@@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
+CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
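Taken together, the new declarations give callers a small surface for the MiniCPM-V path. A hypothetical caller sketch (it mirrors what llava.cpp and minicpmv-cli.cpp below do, rather than defining any new behaviour; the function name is illustrative only):

    // Hypothetical usage sketch of the new clip.h entry points; error handling omitted.
    #include "clip.h"

    void configure_minicpmv_image(struct clip_ctx * ctx_clip, int img_w, int img_h) {
        if (!clip_is_minicpmv(ctx_clip)) {
            return; // the calls below only matter for the MiniCPM-V resampler path
        }
        // tell the encoder how large the original image was ...
        struct clip_image_size * load_image_size = clip_image_size_init(); // defaults to 448x448
        load_image_size->width  = img_w;
        load_image_size->height = img_h;
        clip_add_load_image_size(ctx_clip, load_image_size);

        // ... so that the slice grid and per-row embed count can be derived from it
        const int embeds_per_row = clip_uhd_num_image_embeds_col(ctx_clip);
        const int n_patches      = clip_n_patches(ctx_clip);      // 96 for the resampler
        const int n_embd         = clip_n_mmproj_embd(ctx_clip);  // 4096 for the resampler
        (void) embeds_per_row; (void) n_patches; (void) n_embd;
    }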
@@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}
+
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+    if (clip_is_minicpmv(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            if (!encoded) {
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
@@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
             return false;
         }
-    } else {
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
         LOG_TEE("Unable to allocate memory for image embeddings\n");
         return false;
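For the v2.5 path above, only_v2_5_reshape_by_patch() re-tiles a slice so that all of its patches sit side by side in a single patch-high row; clip_image_batch_encode() then reads the width and height from the image itself rather than from hparams. A small check of that arithmetic, assuming a 448x448 slice with the patch_size of 14 used above:

    // Minimal sketch of the reshape arithmetic, assuming a 448x448 slice. Not part of the patch.
    #include <cassert>

    int main() {
        const int nx = 448, ny = 448, patch_size = 14;
        const int num_patches = (ny / patch_size) * (nx / patch_size);  // 32 * 32 = 1024
        const int out_nx = patch_size * num_patches;                    // 14336
        const int out_ny = patch_size;                                  // 14
        assert(out_nx * out_ny == nx * ny);  // same pixel count, just re-tiled into one row of patches
        return 0;
    }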
@@ -17,12 +17,11 @@
 #    define LLAVA_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
examples/llava/minicpmv-cli.cpp (new file, 309 lines)
@@ -0,0 +1,309 @@
#include "ggml.h"
#include "log.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
    struct llama_context * ctx_llama = NULL;
    struct llama_model * model = NULL;
};

static void show_additional_info(int /*argc*/, char ** argv) {
    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    LOG_TEE("%s", text);
}

static struct llama_model * llava_init(gpt_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

    llama_model_params model_params = llama_model_params_from_gpt_params(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
}

static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
        ctx_params.n_ctx = 2048;
    } else {
        ctx_params.n_ctx = params->n_ctx;
    }

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->model = model;
    return ctx_llava;
}

static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }

    llama_free(ctx_llava->ctx_llama);
    llama_free_model(ctx_llava->model);
    llama_backend_free();
}

static struct clip_ctx * clip_init_context(gpt_params * params) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    return ctx_clip;
}

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
}

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}

static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));

    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
    slice_embed->embed = image_embed;
    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
    llava_image_embed_free(slice_embed);
}

static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
    std::string system_prompt;
    int idx = 0;
    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
    system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
    if (num_image_embeds > 1) {
        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
            for (size_t j = 0; j < num_image_embeds_col; ++j) {
                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
                if (j == num_image_embeds_col - 1) {
                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
                }
            }
        }
        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
    }
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
}

static const char * sample(struct llama_sampling_context * ctx_sampling,
                           struct llama_context * ctx_llama,
                           int * n_past) {
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
}

static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
    auto ctx_clip = clip_init_context(params);
    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
    if (!embeds) {
        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        return NULL;
    }

    // process the prompt
    if (params->prompt.empty() && params->interactive == false) {
        LOG_TEE("prompt should be given or interactive mode should be on");
        return NULL;
    }

    auto model = llava_init(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
        return NULL;
    }
    const int64_t t_llava_init_start_us = ggml_time_us();
    auto ctx_llava = llava_init_context(params, model);
    ctx_llava->ctx_clip = ctx_clip;
    const int64_t t_llava_init_end_us = ggml_time_us();
    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

    const int64_t t_process_image_start_us = ggml_time_us();
    process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

    llava_image_embed_free(embeds);
    return ctx_llava;
}

static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;

    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
    eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
    // generate the response

    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    return ctx_sampling;
}

static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){

    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
    return tmp;
}

int main(int argc, char ** argv) {
    ggml_time_init();

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("llava", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
    llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty())) {
        gpt_params_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }

    for (auto & image : params.image) {
        int n_past = 0;
        auto ctx_llava = minicpmv_init(&params, image, n_past);

        if (!params.prompt.empty()) {
            LOG_TEE("<user>%s\n", params.prompt.c_str());
            LOG_TEE("<assistant>");
            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
                    if(!have_tmp)continue;
                    else break;
                }
                if (strstr(tmp, "###")) break; // Yi-VL behavior
                have_tmp = true;
                printf("%s", tmp);
                if (strstr(response.c_str(), "<user>")) break; // minicpm-v

                fflush(stdout);
            }
            llama_sampling_free(ctx_sampling);
        } else {
            while (true) {
                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
                LOG_TEE("<assistant>");
                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
                    printf("%s", tmp); // mistral llava-1.6
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
                llama_sampling_free(ctx_sampling);
            }
        }
        printf("\n");
        llama_print_timings(ctx_llava->ctx_llama);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
    }

    return 0;
}
382
examples/llava/minicpmv-convert-image-encoder-to-gguf.py
Normal file
382
examples/llava/minicpmv-convert-image-encoder-to-gguf.py
Normal file
|
@ -0,0 +1,382 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from gguf import *
|
||||||
|
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
||||||
|
|
||||||
|
TEXT = "clip.text"
|
||||||
|
VISION = "clip.vision"
|
||||||
|
|
||||||
|
|
||||||
|
def add_key_str(raw_key: str, arch: str) -> str:
|
||||||
|
return raw_key.format(arch=arch)
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
|
||||||
|
if name in (
|
||||||
|
"logit_scale",
|
||||||
|
"text_model.embeddings.position_ids",
|
||||||
|
"vision_model.embeddings.position_ids",
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
|
||||||
|
if has_minicpmv and name in ["visual_projection.weight"]:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("v") and not has_vision:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("t") and not has_text:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_name(name: str) -> str:
|
||||||
|
if "projection" in name:
|
||||||
|
return name
|
||||||
|
if "mm_projector" in name:
|
||||||
|
name = name.replace("model.mm_projector", "mm")
|
||||||
|
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
|
||||||
|
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
|
||||||
|
return name
|
||||||
|
|
||||||
|
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||||
|
|
||||||
|
|
||||||
|
def bytes_to_unicode():
|
||||||
|
"""
|
||||||
|
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||||
|
The reversible bpe codes work on unicode strings.
|
||||||
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
|
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||||
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
|
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||||
|
"""
|
||||||
|
bs = (
|
||||||
|
list(range(ord("!"), ord("~") + 1))
|
||||||
|
+ list(range(ord("¡"), ord("¬") + 1))
|
||||||
|
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||||
|
)
|
||||||
|
cs = bs[:]
|
||||||
|
n = 0
|
||||||
|
for b in range(2**8):
|
||||||
|
if b not in bs:
|
||||||
|
bs.append(b)
|
||||||
|
cs.append(2**8 + n)
|
||||||
|
n += 1
|
||||||
|
cs = [chr(n) for n in cs]
|
||||||
|
return dict(zip(bs, cs))
|
||||||
|
|
||||||
|
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||||
|
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||||
|
ap.add_argument("--text-only", action="store_true", required=False,
|
||||||
|
help="Save a text-only model. It can't be used to encode images")
|
||||||
|
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||||
|
help="Save a vision-only model. It can't be used to encode texts")
|
||||||
|
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
|
||||||
|
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||||
|
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||||
|
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||||
|
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
|
||||||
|
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
|
||||||
|
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||||
|
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||||
|
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
|
||||||
|
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||||
|
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||||
|
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||||
|
|
||||||
|
# with proper
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if args.text_only and args.vision_only:
|
||||||
|
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
if args.use_f32:
|
||||||
|
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||||
|
|
||||||
|
# output in the same directory as the model if output_dir is None
|
||||||
|
dir_model = args.model_dir
|
||||||
|
|
||||||
|
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
|
||||||
|
vocab = None
|
||||||
|
tokens = None
|
||||||
|
else:
|
||||||
|
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||||
|
vocab = json.load(f)
|
||||||
|
tokens = [key for key in vocab]
|
||||||
|
|
||||||
|
# possible data types
|
||||||
|
# ftype == 0 -> float32
|
||||||
|
# ftype == 1 -> float16
|
||||||
|
#
|
||||||
|
# map from ftype to string
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
|
ftype = 1
|
||||||
|
if args.use_f32:
|
||||||
|
ftype = 0
|
||||||
|
|
||||||
|
# if args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||||
|
# model = CLIPVisionModel.from_pretrained(dir_model)
|
||||||
|
# processor = None
|
||||||
|
# else:
|
||||||
|
# model = CLIPModel.from_pretrained(dir_model)
|
||||||
|
# processor = CLIPProcessor.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
default_vision_config = {
|
||||||
|
"hidden_size": 1152,
|
||||||
|
"image_size": 980,
|
||||||
|
"intermediate_size": 4304,
|
||||||
|
"model_type": "idefics2",
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_hidden_layers": 27,
|
||||||
|
"patch_size": 14,
|
||||||
|
}
|
||||||
|
vision_config = Idefics2VisionConfig(**default_vision_config)
|
||||||
|
model = Idefics2VisionTransformer(vision_config)
|
||||||
|
|
||||||
|
processor = None
|
||||||
|
# if model.attn_pool is not None:
|
||||||
|
# model.attn_pool = torch.nn.Identity()
|
||||||
|
|
||||||
|
# model.blocks = model.blocks[:-1]
|
||||||
|
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
|
||||||
|
|
||||||
|
fname_middle = None
|
||||||
|
has_text_encoder = True
|
||||||
|
has_vision_encoder = True
|
||||||
|
has_minicpmv_projector = False
|
||||||
|
if args.text_only:
|
||||||
|
fname_middle = "text-"
|
||||||
|
has_vision_encoder = False
|
||||||
|
elif args.minicpmv_projector is not None:
|
||||||
|
fname_middle = "mmproj-"
|
||||||
|
has_text_encoder = False
|
||||||
|
has_minicpmv_projector = True
|
||||||
|
elif args.vision_only:
|
||||||
|
fname_middle = "vision-"
|
||||||
|
has_text_encoder = False
|
||||||
|
else:
|
||||||
|
fname_middle = ""
|
||||||
|
|
||||||
|
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||||
|
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||||
|
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||||
|
|
||||||
|
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||||
|
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||||
|
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
|
||||||
|
fout.add_file_type(ftype)
|
||||||
|
if args.text_only:
|
||||||
|
fout.add_description("text-only CLIP model")
|
||||||
|
elif args.vision_only and not has_minicpmv_projector:
|
||||||
|
fout.add_description("vision-only CLIP model")
|
||||||
|
elif has_minicpmv_projector:
|
||||||
|
fout.add_description("image encoder for MiniCPM-V")
|
||||||
|
# add projector type
|
||||||
|
fout.add_string("clip.projector_type", "resampler")
|
||||||
|
else:
|
||||||
|
fout.add_description("two-tower CLIP model")
|
||||||
|
|
||||||
|
if has_vision_encoder:
|
||||||
|
# vision_model hparams
|
||||||
|
fout.add_uint32("clip.vision.image_size", 448)
|
||||||
|
fout.add_uint32("clip.vision.patch_size", 14)
|
||||||
|
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
|
||||||
|
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
|
||||||
|
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||||
|
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
|
||||||
|
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||||
|
block_count = 26
|
||||||
|
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
||||||
|
|
||||||
|
if processor is not None:
|
||||||
|
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||||
|
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||||
|
else:
|
||||||
|
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||||
|
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||||
|
fout.add_array("clip.vision.image_mean", image_mean)
|
||||||
|
fout.add_array("clip.vision.image_std", image_std)
|
||||||
|
|
||||||
|
use_gelu = True
|
||||||
|
fout.add_bool("clip.use_gelu", use_gelu)
|
||||||
|
|
||||||
|
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed

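As a quick sanity check of the helpers above (an editor's illustration, not part of the diff), the 2D table for the 70x70 grid and 4096-dim embedding used for `pos_embed_k` further down should come out with one sin/cos row per grid position:

```python
import numpy as np

# expected shape: (70*70, 4096) -- one row per grid position
pos_embed = get_2d_sincos_pos_embed(4096, (70, 70))
assert pos_embed.shape == (70 * 70, 4096)
```
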
def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            if name[-7:] == ".weight" and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")

def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}

state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
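A minimal way to inspect the file the script writes (an editor's sketch, not part of the diff; it assumes the `gguf` Python package is installed and uses a placeholder output name):

```python
from gguf import GGUFReader  # from the gguf-py package

reader = GGUFReader("minicpmv-encoder.gguf")   # placeholder: use your fname_out
print(list(reader.fields.keys())[:10])         # a few of the clip.* keys written above
print(len(reader.tensors), "tensors")
```
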
47 examples/llava/minicpmv-surgery.py Normal file

@@ -0,0 +1,47 @@
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model")
args = ap.parse_args()

# find the model part that includes the the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/minicpmv.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/minicpmv.clip")

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
        with open(f"{args.model}/added_tokens.json", "w") as f:
            f.write("{}\n")

config = model.llm.config
config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")
# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
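Once the surgery script has run, the saved projector can be checked before conversion; a minimal sketch (an editor's illustration, not part of the diff; the model directory is a placeholder):

```python
import torch

# the surgery script saves the resampler tensors next to the model
projector = torch.load("MiniCPM-Llama3-V-2_5/minicpmv.projector")
print(len(projector), "resampler tensors")
for name, tensor in list(projector.items())[:5]:
    print(name, tuple(tensor.shape))
```
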
@@ -2,3 +2,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.2.1
+torchvision==0.17.1
@@ -92,7 +92,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
    }
}

// usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
@@ -1,5 +1,9 @@
## Overview

+> [!IMPORTANT]
+> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
+> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
+
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
@@ -16,7 +16,7 @@
#include <stdio.h>

struct rpc_server_params {
-    std::string host        = "0.0.0.0";
+    std::string host        = "127.0.0.1";
    int         port        = 50052;
    size_t      backend_mem = 0;
};
@@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
        fprintf(stderr, "Invalid parameters\n");
        return 1;
    }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
+        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
    ggml_backend_t backend = create_backend();
    if (!backend) {
        fprintf(stderr, "Failed to create backend\n");
@@ -207,41 +207,6 @@ model:
  -hff, --hf-file FILE              Hugging Face model file (default: unused)
  -hft, --hf-token TOKEN            Hugging Face access token (default: value from HF_TOKEN environment variable)

-retrieval:
-
-  --context-file FNAME              file to load context from (repeat to specify multiple files)
-  --chunk-size N                    minimum length of embedded text chunks (default: 64)
-  --chunk-separator STRING          separator between chunks (default: '
-                                    ')
-
-passkey:
-
-  --junk N                          number of times to repeat the junk text (default: 250)
-  --pos N                           position of the passkey in the junk text (default: -1)
-
-imatrix:
-
-  -o, --output FNAME                output file (default: 'imatrix.dat')
-  --output-frequency N              output the imatrix every N iterations (default: 10)
-  --save-frequency N                save an imatrix copy every N iterations (default: 0)
-  --process-output                  collect data for the output tensor (default: false)
-  --no-ppl                          do not compute perplexity (default: true)
-  --chunk N                         start processing the input from chunk N (default: 0)
-
-bench:
-
-  -pps                              is the prompt shared across parallel sequences (default: false)
-  -npp n0,n1,...                    number of prompt tokens
-  -ntg n0,n1,...                    number of text generation tokens
-  -npl n0,n1,...                    number of parallel prompts
-
-embedding:
-
-  --embd-normalize                  normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
-  --embd-output-format              empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-  --embd-separator                  separator of embendings (default \n) for example "<#sep#>"
-
server:

  --host HOST                       ip address to listen (default: 127.0.0.1)
@@ -267,7 +232,8 @@ server:
                                    https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  -sps, --slot-prompt-similarity SIMILARITY
                                    how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
+  --lora-init-without-apply        load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)

logging:
@@ -279,15 +245,6 @@ logging:
  --log-file FNAME                  Specify a log filename (without extension)
  --log-new                         Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
  --log-append                      Don't truncate the old log file.
-
-cvector:
-
-  -o, --output FNAME                output file (default: 'control_vector.gguf')
-  --positive-file FNAME             positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
-  --negative-file FNAME             negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
-  --pca-batch N                     batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
-  --pca-iter N                      number of iterations used for PCA (default: 1000)
-  --method {pca,mean}               dimensionality reduction method to be used (default: pca)
```
@@ -411,7 +368,8 @@ node index.js

## API Endpoints

-- **GET** `/health`: Returns the current state of the server:
+### GET `/health`: Returns the current state of the server
+
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.

@@ -420,7 +378,7 @@ node index.js

If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.

-- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
+### POST `/completion`: Given a `prompt`, it returns the predicted completion.

*Options:*

@@ -498,7 +456,7 @@ node index.js

`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

-### Result JSON
+**Response format**

- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.

@@ -537,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

-- **POST** `/tokenize`: Tokenize a given text.
+### POST `/tokenize`: Tokenize a given text

*Options:*

@@ -545,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`.

`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`

-- **POST** `/detokenize`: Convert tokens to text.
+### POST `/detokenize`: Convert tokens to text

*Options:*

`tokens`: Set the tokens to detokenize.

-- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+### POST `/embedding`: Generate embedding of a given text
+
+The same as [the embedding example](../embedding) does.

*Options:*

@@ -559,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`.

`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+### POST `/infill`: For code infilling.
+
+Takes a prefix and a suffix and returns the predicted completion as stream.

*Options:*

@@ -571,7 +533,7 @@ Notice that each `probs` is an array of length `n_probs`.

- **GET** `/props`: Return current server settings.

-### Result JSON
+**Response format**

```json
{
@@ -589,7 +551,9 @@ Notice that each `probs` is an array of length `n_probs`.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template

-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
+
+Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

*Options:*

@@ -641,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```

-- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
+### POST `/v1/embeddings`: OpenAI-compatible embeddings API

*Options:*

@@ -675,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```

-- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.

-### Result JSON
+**Response format**

```json
[

@@ -738,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`.
]
```

-- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
+### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled:

Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.

@@ -750,13 +714,13 @@ Available metrics:
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.

-- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

*Options:*

`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.

-### Result JSON
+**Response format**

```json
{

@@ -770,13 +734,13 @@ Available metrics:
}
```

-- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.

*Options:*

`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.

-### Result JSON
+**Response format**

```json
{
@@ -790,9 +754,9 @@ Available metrics:
}
```

-- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.

-### Result JSON
+**Response format**

```json
{

@@ -801,6 +765,42 @@ Available metrics:
}
```

+### GET `/lora-adapters`: Get list of all LoRA adapters
+
+If an adapter is disabled, the scale will be set to 0.
+
+**Response format**
+
+```json
+[
+  {
+    "id": 0,
+    "path": "my_adapter_1.gguf",
+    "scale": 0.0
+  },
+  {
+    "id": 1,
+    "path": "my_adapter_2.gguf",
+    "scale": 0.0
+  }
+]
+```
+
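The GET endpoint above and the POST endpoint documented next can be exercised with a few lines of Python (an editor's sketch, not part of the diff; it assumes a server started with at least one `--lora` adapter and listening on the default `127.0.0.1:8080`):

```python
import requests

base = "http://127.0.0.1:8080"

# GET /lora-adapters: list adapters and their current scales
for adapter in requests.get(f"{base}/lora-adapters").json():
    print(adapter["id"], adapter["path"], adapter["scale"])

# POST /lora-adapters: enable adapter 0 at full strength
# (adapters not listed in the request are reset to scale 0)
print(requests.post(f"{base}/lora-adapters", json=[{"id": 0, "scale": 1.0}]).json())
```
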
+### POST `/lora-adapters`: Set list of LoRA adapters
+
+To disable an adapter, either remove it from the list below, or set scale to 0.
+
+**Request format**
+
+To know the `id` of the adapter, use GET `/lora-adapters`
+
+```json
+[
+  {"id": 0, "scale": 0.2},
+  {"id": 1, "scale": 0.8}
+]
+```
+
## More examples

### Change system prompt on runtime
|
@@ -79,6 +79,7 @@ enum server_task_type {
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
};

struct server_task {
@@ -623,6 +624,7 @@ struct server_response {
struct server_context {
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;

    gpt_params params;
@@ -682,6 +684,7 @@ struct server_context {

        model = llama_init.model;
        ctx   = llama_init.context;
+        lora_adapters = llama_init.lora_adapters;
        params.n_parallel -= 1; // but be sneaky about it
        if (model == nullptr) {
            LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -973,6 +976,8 @@ struct server_context {
            (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
            (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
            slot.prompt = *prompt;
+        } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
+            slot.prompt = prompt->at(0);
        } else {
            send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
            return false;
@@ -1851,6 +1856,14 @@ struct server_context {
                    };
                    queue_results.send(result);
                } break;
+            case SERVER_TASK_TYPE_SET_LORA:
+                {
+                    llama_lora_adapters_apply(ctx, lora_adapters);
+                    server_task_result result;
+                    result.id = task.id;
+                    result.data = json{{ "success", true }};
+                    queue_results.send(result);
+                } break;
        }
    }

@@ -3329,6 +3342,55 @@ int main(int argc, char ** argv) {
        return res.set_content(root.dump(), "application/json; charset=utf-8");
    };

+    const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json result = json::array();
+        for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
+            auto & la = ctx_server.lora_adapters[i];
+            result.push_back({
+                {"id", i},
+                {"path", la.path},
+                {"scale", la.scale},
+            });
+        }
+        res.set_content(result.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        const std::vector<json> body = json::parse(req.body);
+        int max_idx = ctx_server.lora_adapters.size();
+
+        // clear existing value
+        for (auto & la : ctx_server.lora_adapters) {
+            la.scale = 0.0f;
+        }
+
+        // set value
+        for (auto entry : body) {
+            int id = entry.at("id");
+            float scale = entry.at("scale");
+            if (0 <= id && id < max_idx) {
+                ctx_server.lora_adapters[id].scale = scale;
+            } else {
+                throw std::runtime_error("invalid adapter id");
+            }
+        }
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SET_LORA;
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
@@ -3367,7 +3429,6 @@ int main(int argc, char ** argv) {

    // register API routes
    svr->Get ("/health", handle_health);
-    svr->Get ("/slots", handle_slots);
    svr->Get ("/metrics", handle_metrics);
    svr->Get ("/props", handle_props);
    svr->Get ("/v1/models", handle_models);
@@ -3382,6 +3443,11 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings", handle_embeddings);
    svr->Post("/tokenize", handle_tokenize);
    svr->Post("/detokenize", handle_detokenize);
+    // LoRA adapters hotswap
+    svr->Get ("/lora-adapters", handle_lora_adapters_list);
+    svr->Post("/lora-adapters", handle_lora_adapters_apply);
+    // Save & load slots
+    svr->Get ("/slots", handle_slots);
    if (!params.slot_save_path.empty()) {
        // only enable slot endpoints if slot_save_path is set
        svr->Post("/slots/:id_slot", handle_slots_action);
36 examples/server/tests/features/lora.feature Normal file

@@ -0,0 +1,36 @@
@llama.cpp
@lora
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And   a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
    And   a model file stories15M_MOE-F16.gguf
    And   a model alias stories15M_MOE
    And   a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
    And   42 as server seed
    And   1024 as batch size
    And   1024 as ubatch size
    And   2048 KV cache size
    And   64 max tokens to predict
    And   0.0 temperature
    Then  the server is starting
    Then  the server is healthy

  Scenario: Completion LoRA disabled
    Given switch off lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And   a completion request with no api error
    Then  64 tokens are predicted matching little|girl|three|years|old

  Scenario: Completion LoRA enabled
    Given switch on lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And   a completion request with no api error
    Then  64 tokens are predicted matching eye|love|glass|sun
@@ -7,6 +7,7 @@ import subprocess
import sys
import threading
import time
+import requests
from collections.abc import Sequence
from contextlib import closing
from re import RegexFlag
@@ -70,6 +71,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.user_api_key = None
    context.response_format = None
    context.temperature = None
+    context.lora_file = None

    context.tasks_result = []
    context.concurrent_tasks = []
@@ -82,6 +84,12 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str):
    context.model_hf_file = hf_file
    context.model_file = os.path.basename(hf_file)

+@step('a lora adapter file from {lora_file_url}')
+def step_download_lora_file(context, lora_file_url: str):
+    file_name = lora_file_url.split('/').pop()
+    context.lora_file = f'../../../{file_name}'
+    with open(context.lora_file, 'wb') as f:
+        f.write(requests.get(lora_file_url).content)

@step('a model file {model_file}')
def step_model_file(context, model_file: str):
@@ -849,6 +857,17 @@ async def step_erase_slot(context, slot_id):
    context.response = response


+@step('switch {on_or_off} lora adapter {lora_id:d}')
+@async_run_until_complete
+async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/lora-adapters',
+                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+            print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}])
+
+
@step('the server responds with status code {status_code:d}')
def step_server_responds_with_status_code(context, status_code):
    assert context.response.status == status_code
@@ -1326,6 +1345,8 @@ def start_server_background(context):
        server_args.extend(['--grp-attn-w', context.n_ga_w])
    if context.debug:
        server_args.append('--verbose')
+    if context.lora_file:
+        server_args.extend(['--lora', context.lora_file])
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
@@ -4,3 +4,4 @@ huggingface_hub~=0.20.3
numpy~=1.26.4
openai~=1.30.3
prometheus-client~=0.20.0
+requests~=2.32.3
@@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.

```bash
-./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"

...
@@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

List all SYCL devices with ID, compute capability, max work group size, ect.

-1. Build the llama.cpp for SYCL for all targets.
+1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

-2. Enable oneAPI running environment
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*

```
source /opt/intel/oneapi/setvars.sh

@@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh
Check the ID in startup log, like:

```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 2 SYCL devices:
+| | | | |Max | |Max |Global | |
+| | | | |compute|Max work|sub |mem | |
+|ID| Device Type| Name|Version|units |group |group|size | Driver version|
+|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
+| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
+| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
@@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

+GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
@@ -16,6 +16,8 @@

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define UNUSED GGML_UNUSED
@@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
    }

    // an async copy would normally happen after all the queued operations on both backends are completed
-    // sync src, set_async dst
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
-    } else {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_copy(src, dst);
-        ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    ggml_backend_synchronize(backend_src);
+    ggml_backend_synchronize(backend_dst);
+    ggml_backend_tensor_copy(src, dst);
}

// events
@@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                } else {
                    ggml_backend_synchronize(split_backend);
                }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
            }
        }
@@ -1501,7 +1501,7 @@ static void ggml_cuda_op_mul_mat(
    }

    // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
-    if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+    if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
        const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
        const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
        CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@@ -2362,33 +2362,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
}

GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
        return false;
    }

-    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
        return false;
    }

-    // device -> device
+    // device -> device copy
    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

-    if (backend_src != backend_dst) {
-        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

-        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
-        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }

+    if (backend_src != backend_dst) {
        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
        } else {
#ifdef GGML_CUDA_NO_PEER_COPY
            return false;
@@ -2397,7 +2399,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_
#endif
        }

-        // record event on src stream
+        // record event on src stream after the copy
        if (!cuda_ctx_src->copy_event) {
            ggml_cuda_set_device(cuda_ctx_src->device);
            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));

@@ -2409,7 +2411,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_
        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
    } else {
        // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
    }
    return true;
}
@@ -2746,11 +2748,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_MUL_MAT_ID:
            {
                struct ggml_tensor * a = op->src[0];
-                if (op->op == GGML_OP_MUL_MAT) {
-                    struct ggml_tensor * b = op->src[1];
-                    if (a->ne[3] != b->ne[3]) {
-                        return false;
-                    }
+                struct ggml_tensor * b = op->src[1];
+                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                    return false;
+                }
+                if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
+                    return false;
                }
                switch (a->type) {
                    case GGML_TYPE_F32:
@@ -2881,7 +2884,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+            return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
#else
            if (op->src[0]->ne[0] == 128) {
                return true;
@@ -210,7 +210,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_COUNT
};

-struct ggml_metal_context {
+struct ggml_backend_metal_context {
    int n_cb;

    id<MTLDevice> device;

@@ -224,6 +224,10 @@ struct ggml_metal_context {
    bool support_simdgroup_mm;

    bool should_capture_next_compute;
+
+    // abort ggml_metal_graph_compute if callback returns true
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
};

// MSL code

@@ -289,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) {
    return data;
}

-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

#if TARGET_OS_OSX && !GGML_METAL_NDEBUG

@@ -306,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    // Configure context
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context));
    ctx->device = device;
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->queue  = [ctx->device newCommandQueue];

@@ -668,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    return ctx;
}

-static void ggml_metal_free(struct ggml_metal_context * ctx) {
+static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {

@@ -734,7 +738,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
    return nil;
}

-static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
    for (size_t i = 0, n = 3; i < n; ++i) {
        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
            return false;

@@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}

static enum ggml_status ggml_metal_graph_compute(
-        struct ggml_metal_context * ctx,
+        struct ggml_backend_metal_context * ctx,
        struct ggml_cgraph * gf) {

    @autoreleasepool {

@@ -878,9 +882,12 @@ static enum ggml_status ggml_metal_graph_compute(
        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
        command_buffer_builder[cb_idx] = command_buffer;

-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
    }

    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;

@@ -2827,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute(

        [encoder endEncoding];

+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
            [command_buffer commit];
+        }
    });

    // Wait for completion and check status of each command buffer

@@ -2847,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute(

            return GGML_STATUS_FAILED;
        }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
    }

    if (should_capture) {

@@ -3150,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
}

GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
    ggml_metal_free(ctx);
    free(backend);
}

@@ -3162,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_supports_op(metal_ctx, op);
|
return ggml_metal_supports_op(metal_ctx, op);
|
||||||
}
|
}
|
||||||
|
@ -3213,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_metal_init(void) {
|
ggml_backend_t ggml_backend_metal_init(void) {
|
||||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3237,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||||
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
|
||||||
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
|
ctx->abort_callback = abort_callback;
|
||||||
|
ctx->abort_callback_data = user_data;
|
||||||
|
}
|
||||||
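The Metal hunks above add an abort hook to graph compute: when a callback is registered, only the first two command buffers are enqueued and committed eagerly, and the wait loop consults the callback before committing each remaining buffer, returning GGML_STATUS_ABORTED if it fires. A minimal caller-side sketch in C++ (the deadline struct and its field names are illustrative; only ggml_backend_metal_init and ggml_backend_metal_set_abort_callback come from this patch):

#include <chrono>
#include "ggml-metal.h"

// Hypothetical caller state: abort graph compute once a wall-clock deadline passes.
struct compute_deadline {
    std::chrono::steady_clock::time_point t_end;
};

static bool deadline_abort_cb(void * user_data) {
    const auto * dl = static_cast<const compute_deadline *>(user_data);
    // returning true tells ggml_metal_graph_compute to stop committing further command buffers
    return std::chrono::steady_clock::now() > dl->t_end;
}

// during setup:
//   ggml_backend_t backend = ggml_backend_metal_init();
//   static compute_deadline dl = { std::chrono::steady_clock::now() + std::chrono::seconds(30) };
//   ggml_backend_metal_set_abort_callback(backend, deadline_abort_cb, &dl);
//   // a later graph compute on this backend may now return GGML_STATUS_ABORTED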
|
|
||||||
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||||
}
|
}
|
||||||
|
@ -3253,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
ctx->should_capture_next_compute = true;
|
ctx->should_capture_next_compute = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
if (inet_addr(host) == INADDR_NONE) {
|
||||||
|
fprintf(stderr, "Invalid host address: %s\n", host);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
struct sockaddr_in serv_addr;
|
struct sockaddr_in serv_addr;
|
||||||
serv_addr.sin_family = AF_INET;
|
serv_addr.sin_family = AF_INET;
|
||||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||||
|
@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
|
||||||
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// require that the tensor data does not go beyond the buffer end
|
||||||
|
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
|
||||||
|
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
|
||||||
|
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
|
||||||
|
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
|
||||||
|
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
|
||||||
|
|
||||||
result->op = (ggml_op) tensor->op;
|
result->op = (ggml_op) tensor->op;
|
||||||
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
||||||
result->op_params[i] = tensor->op_params[i];
|
result->op_params[i] = tensor->op_params[i];
|
||||||
|
@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||||
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
||||||
|
|
||||||
struct ggml_init_params params {
|
struct ggml_init_params params {
|
||||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||||
|
@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||||
|
|
||||||
|
// sanitize tensor->data
|
||||||
|
{
|
||||||
|
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||||
|
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||||
|
|
||||||
|
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||||
|
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
||||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
|
@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||||
|
|
||||||
|
// sanitize tensor->data
|
||||||
|
{
|
||||||
|
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||||
|
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||||
|
|
||||||
|
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||||
|
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// output serialization format: | data (size bytes) |
|
// output serialization format: | data (size bytes) |
|
||||||
output.resize(size, 0);
|
output.resize(size, 0);
|
||||||
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
||||||
|
|
|
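The RPC server hunks above share one theme: validate what arrives over the wire before touching memory. The host string is checked with inet_addr, deserialize_tensor asserts that the tensor's data range stays inside its backing buffer, and set_tensor/get_tensor abort on an out-of-range data + offset/size. A stand-alone C++ sketch of that range check, with illustrative names and the same unsigned arithmetic as the patch:

#include <cstdint>

// true if [data + offset, data + offset + size) lies inside [base, base + buf_size)
// mirrors the sanitization added to rpc_server::set_tensor / get_tensor; the names here are illustrative
static bool range_in_buffer(uint64_t data, uint64_t offset, uint64_t size,
                            uint64_t base, uint64_t buf_size) {
    if (data + offset < data) {
        return false;                             // overflow in data + offset
    }
    const uint64_t start = data + offset;
    if (start < base || start >= base + buf_size) {
        return false;                             // start falls outside the buffer
    }
    return size <= (base + buf_size) - start;     // end stays inside the buffer
}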
@ -874,7 +874,7 @@ namespace dpct
|
||||||
inline std::string get_preferred_gpu_platform_name() {
|
inline std::string get_preferred_gpu_platform_name() {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
|
||||||
std::string filter = "level-zero";
|
std::string filter = "";
|
||||||
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
||||||
if (env) {
|
if (env) {
|
||||||
if (std::strstr(env, "level_zero")) {
|
if (std::strstr(env, "level_zero")) {
|
||||||
|
@ -892,11 +892,24 @@ namespace dpct
|
||||||
else {
|
else {
|
||||||
throw std::runtime_error("invalid device filter: " + std::string(env));
|
throw std::runtime_error("invalid device filter: " + std::string(env));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
auto default_device = sycl::device(sycl::default_selector_v);
|
||||||
|
auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
|
||||||
|
|
||||||
|
if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
|
||||||
|
filter = "level-zero";
|
||||||
|
}
|
||||||
|
else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
|
||||||
|
filter = "cuda";
|
||||||
|
}
|
||||||
|
else if (std::strstr(default_platform_name.c_str(), "HIP")) {
|
||||||
|
filter = "hip";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto plaform_list = sycl::platform::get_platforms();
|
auto platform_list = sycl::platform::get_platforms();
|
||||||
|
|
||||||
for (const auto& platform : plaform_list) {
|
for (const auto& platform : platform_list) {
|
||||||
auto devices = platform.get_devices();
|
auto devices = platform.get_devices();
|
||||||
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
||||||
return d.is_gpu();
|
return d.is_gpu();
|
||||||
|
|
|
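Where the SYCL helper used to hard-code a "level-zero" platform filter, the hunk above starts with an empty filter and, if ONEAPI_DEVICE_SELECTOR is not set, infers one from the platform of the SYCL default device. A compressed sketch of that fallback, using only the SYCL calls that appear in the patch:

#include <sycl/sycl.hpp>
#include <cstring>
#include <string>

// derive a platform-name filter from the default SYCL device when no explicit selector is given
static std::string default_platform_filter() {
    sycl::device dev(sycl::default_selector_v);
    const std::string name = dev.get_platform().get_info<sycl::info::platform::name>();
    if (std::strstr(name.c_str(), "Level-Zero") || dev.is_cpu()) return "level-zero";
    if (std::strstr(name.c_str(), "CUDA"))                       return "cuda";
    if (std::strstr(name.c_str(), "HIP"))                        return "hip";
    return "";   // no preference: the platform loop shown above then falls back to the first platform with a GPU
}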
@ -2108,9 +2108,9 @@ void ggml_vk_instance_init() {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||||
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
|
||||||
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
||||||
ggml_vk_instance_init();
|
ggml_vk_instance_init();
|
||||||
|
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
||||||
|
|
||||||
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0;
|
||||||
// disable POSIX deprecation warnings
|
// disable POSIX deprecation warnings
|
||||||
// these functions are never going away, anyway
|
// these functions are never going away, anyway
|
||||||
#pragma warning(disable: 4996)
|
#pragma warning(disable: 4996)
|
||||||
|
|
||||||
|
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||||
|
#pragma warning(disable: 4702)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
|
@ -3745,7 +3748,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
struct ggml_tensor * view_src,
|
struct ggml_tensor * view_src,
|
||||||
size_t view_offs) {
|
size_t view_offs) {
|
||||||
|
|
||||||
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
|
||||||
|
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||||
|
|
||||||
// find the base tensor and absolute offset
|
// find the base tensor and absolute offset
|
||||||
if (view_src != NULL && view_src->view_src != NULL) {
|
if (view_src != NULL && view_src->view_src != NULL) {
|
||||||
|
|
|
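The ggml_new_tensor_impl hunk above upgrades a plain assert to GGML_ASSERT and adds a range check on the requested type, so an out-of-range ggml_type (for example from corrupted model metadata) now fails loudly even in release builds, where assert would compile away. A minimal stand-in for that kind of always-on check, assuming nothing beyond the C++ standard library (the real GGML_ASSERT also routes through ggml's own abort handling):

#include <cstdio>
#include <cstdlib>

// Always-on assertion: unlike assert(), this does not disappear when NDEBUG is defined.
#define ALWAYS_ASSERT(x)                                             \
    do {                                                             \
        if (!(x)) {                                                  \
            std::fprintf(stderr, "%s:%d: assertion failed: %s\n",    \
                         __FILE__, __LINE__, #x);                    \
            std::abort();                                            \
        }                                                            \
    } while (0)

// usage, mirroring the new checks:
//   ALWAYS_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
//   ALWAYS_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);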
@ -1,5 +1,7 @@
|
||||||
|
find_package (Threads REQUIRED)
|
||||||
|
|
||||||
set(TARGET vulkan-shaders-gen)
|
set(TARGET vulkan-shaders-gen)
|
||||||
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <direct.h> // For _mkdir on Windows
|
#include <direct.h> // For _mkdir on Windows
|
||||||
|
#include <algorithm> // For std::replace on w64devkit
|
||||||
#else
|
#else
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
|
|
@ -1146,6 +1146,9 @@ class GGMLQuantizationType(IntEnum):
|
||||||
F64 = 28
|
F64 = 28
|
||||||
IQ1_M = 29
|
IQ1_M = 29
|
||||||
BF16 = 30
|
BF16 = 30
|
||||||
|
Q4_0_4_4 = 31
|
||||||
|
Q4_0_4_8 = 32
|
||||||
|
Q4_0_8_8 = 33
|
||||||
|
|
||||||
|
|
||||||
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
||||||
|
@ -1158,7 +1161,7 @@ class LlamaFileType(IntEnum):
|
||||||
MOSTLY_F16 = 1 # except 1d tensors
|
MOSTLY_F16 = 1 # except 1d tensors
|
||||||
MOSTLY_Q4_0 = 2 # except 1d tensors
|
MOSTLY_Q4_0 = 2 # except 1d tensors
|
||||||
MOSTLY_Q4_1 = 3 # except 1d tensors
|
MOSTLY_Q4_1 = 3 # except 1d tensors
|
||||||
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
||||||
# MOSTLY_Q4_2 = 5 # support has been removed
|
# MOSTLY_Q4_2 = 5 # support has been removed
|
||||||
# MOSTLY_Q4_3 = 6 # support has been removed
|
# MOSTLY_Q4_3 = 6 # support has been removed
|
||||||
MOSTLY_Q8_0 = 7 # except 1d tensors
|
MOSTLY_Q8_0 = 7 # except 1d tensors
|
||||||
|
@ -1187,6 +1190,9 @@ class LlamaFileType(IntEnum):
|
||||||
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
||||||
MOSTLY_IQ1_M = 31 # except 1d tensors
|
MOSTLY_IQ1_M = 31 # except 1d tensors
|
||||||
MOSTLY_BF16 = 32 # except 1d tensors
|
MOSTLY_BF16 = 32 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
|
||||||
|
|
||||||
GUESSED = 1024 # not specified in the model file
|
GUESSED = 1024 # not specified in the model file
|
||||||
|
|
||||||
|
@ -1260,6 +1266,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||||
GGMLQuantizationType.F64: (1, 8),
|
GGMLQuantizationType.F64: (1, 8),
|
||||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||||
GGMLQuantizationType.BF16: (1, 2),
|
GGMLQuantizationType.BF16: (1, 2),
|
||||||
|
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
|
||||||
|
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
|
||||||
|
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||||
class LazyNumpyTensor(LazyBase):
|
class LazyNumpyTensor(LazyBase):
|
||||||
_tensor_type = np.ndarray
|
_tensor_type = np.ndarray
|
||||||
|
|
||||||
|
shape: tuple[int, ...] # Makes the type checker happy in quants.py
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
||||||
# The initial idea was to use np.nan as the fill value,
|
# The initial idea was to use np.nan as the fill value,
|
||||||
|
|
|
@ -174,7 +174,7 @@ class Metadata:
|
||||||
org_component, model_full_name_component = None, model_id
|
org_component, model_full_name_component = None, model_id
|
||||||
|
|
||||||
# Check if we erroneously matched against './' or '../' etc...
|
# Check if we erroneously matched against './' or '../' etc...
|
||||||
if org_component is not None and org_component[0] == '.':
|
if org_component is not None and len(org_component) > 0 and org_component[0] == '.':
|
||||||
org_component = None
|
org_component = None
|
||||||
|
|
||||||
name_parts: list[str] = model_full_name_component.split('-')
|
name_parts: list[str] = model_full_name_component.split('-')
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Callable, Sequence
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any, Callable, Sequence
|
||||||
|
|
||||||
from numpy.typing import DTypeLike
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
|
@ -9,32 +10,22 @@ from .lazy import LazyNumpyTensor
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
if shape[-1] % block_size != 0:
|
if shape[-1] % block_size != 0:
|
||||||
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||||
return (*shape[:-1], shape[-1] // block_size * type_size)
|
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
if shape[-1] % type_size != 0:
|
if shape[-1] % type_size != 0:
|
||||||
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||||
return (*shape[:-1], shape[-1] // type_size * block_size)
|
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||||
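Both helpers above now declare a tuple[int, ...] return type; they convert between element-count and byte-count shapes using the per-type (block_size, type_size) pair from GGML_QUANT_SIZES. The same arithmetic as a small C++ sketch, with Q8_0 as a worked example (32 elements pack into 34 bytes: a 16-bit scale plus 32 int8 values):

#include <cassert>
#include <cstdint>

// bytes needed for one quantized row of n_elems elements
static int64_t row_bytes(int64_t n_elems, int64_t block_size, int64_t type_size) {
    assert(n_elems % block_size == 0);   // the Python helpers raise ValueError in this case
    return n_elems / block_size * type_size;
}

// e.g. Q8_0: block_size = 32, type_size = 34, so a 4096-wide row takes 4096 / 32 * 34 = 4352 bytes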
|
|
||||||
|
|
||||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
|
||||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
|
||||||
n = n.astype(np.float32, copy=False).view(np.uint32)
|
|
||||||
# force nan to quiet
|
|
||||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
|
||||||
# round to nearest even
|
|
||||||
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
|
||||||
return n.astype(np.uint16)
|
|
||||||
|
|
||||||
|
|
||||||
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||||
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||||
rows = arr.reshape((-1, arr.shape[-1]))
|
rows = arr.reshape((-1, arr.shape[-1]))
|
||||||
osize = 1
|
osize = 1
|
||||||
for dim in oshape:
|
for dim in oshape:
|
||||||
|
@ -46,27 +37,6 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
|
||||||
return out.reshape(oshape)
|
return out.reshape(oshape)
|
||||||
|
|
||||||
|
|
||||||
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
|
|
||||||
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
|
|
||||||
|
|
||||||
|
|
||||||
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
|
|
||||||
|
|
||||||
|
|
||||||
def quantize_bf16(n: np.ndarray):
|
|
||||||
if type(n) is LazyNumpyTensor:
|
|
||||||
return __quantize_bf16_lazy(n)
|
|
||||||
else:
|
|
||||||
return __quantize_bf16_array(n)
|
|
||||||
|
|
||||||
|
|
||||||
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
|
|
||||||
|
|
||||||
|
|
||||||
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
|
|
||||||
return n.shape[-1] % __q8_block_size == 0
|
|
||||||
|
|
||||||
|
|
||||||
# round away from zero
|
# round away from zero
|
||||||
# ref: https://stackoverflow.com/a/59143326/22827863
|
# ref: https://stackoverflow.com/a/59143326/22827863
|
||||||
def np_roundf(n: np.ndarray) -> np.ndarray:
|
def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||||
|
@ -76,18 +46,151 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||||
return np.sign(n) * b
|
return np.sign(n) * b
|
||||||
|
|
||||||
|
|
||||||
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
|
class QuantError(Exception): ...
|
||||||
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
|
|
||||||
|
|
||||||
|
|
||||||
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
|
||||||
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
|
||||||
shape = n.shape
|
|
||||||
assert shape[-1] % __q8_block_size == 0
|
|
||||||
|
|
||||||
n_blocks = n.size // __q8_block_size
|
|
||||||
|
|
||||||
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
|
def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
if qtype == GGMLQuantizationType.F32:
|
||||||
|
return data.astype(np.float32, copy=False)
|
||||||
|
elif qtype == GGMLQuantizationType.F16:
|
||||||
|
return data.astype(np.float16, copy=False)
|
||||||
|
elif (q := _type_traits.get(qtype)) is not None:
|
||||||
|
return q.quantize(data)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
|
||||||
|
|
||||||
|
|
||||||
|
def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
|
||||||
|
return data.astype(np.float32, copy=False)
|
||||||
|
elif (q := _type_traits.get(qtype)) is not None:
|
||||||
|
return q.dequantize(data)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
|
||||||
|
|
||||||
|
|
||||||
|
class __Quant(ABC):
|
||||||
|
qtype: GGMLQuantizationType
|
||||||
|
block_size: int
|
||||||
|
type_size: int
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
return TypeError("Quant conversion classes can't have instances")
|
||||||
|
|
||||||
|
def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
|
||||||
|
cls.qtype = qtype
|
||||||
|
cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
|
||||||
|
cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
|
||||||
|
cls.__quantize_array,
|
||||||
|
meta_noop=(np.uint8, cls.__shape_to_bytes)
|
||||||
|
)
|
||||||
|
cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
|
||||||
|
cls.__dequantize_array,
|
||||||
|
meta_noop=(np.float32, cls.__shape_from_bytes)
|
||||||
|
)
|
||||||
|
assert qtype not in _type_traits
|
||||||
|
_type_traits[qtype] = cls
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
|
||||||
|
rows = rows.astype(np.float32, copy=False)
|
||||||
|
shape = rows.shape
|
||||||
|
n_blocks = rows.size // cls.block_size
|
||||||
|
blocks = rows.reshape((n_blocks, cls.block_size))
|
||||||
|
blocks = cls.quantize_blocks(blocks)
|
||||||
|
assert blocks.dtype == np.uint8
|
||||||
|
assert blocks.shape[-1] == cls.type_size
|
||||||
|
return blocks.reshape(cls.__shape_to_bytes(shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
|
||||||
|
rows = rows.view(np.uint8)
|
||||||
|
shape = rows.shape
|
||||||
|
n_blocks = rows.size // cls.type_size
|
||||||
|
blocks = rows.reshape((n_blocks, cls.type_size))
|
||||||
|
blocks = cls.dequantize_blocks(blocks)
|
||||||
|
assert blocks.dtype == np.float32
|
||||||
|
assert blocks.shape[-1] == cls.block_size
|
||||||
|
return blocks.reshape(cls.__shape_from_bytes(shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __shape_to_bytes(cls, shape: Sequence[int]):
|
||||||
|
return quant_shape_to_byte_shape(shape, cls.qtype)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __shape_from_bytes(cls, shape: Sequence[int]):
|
||||||
|
return quant_shape_from_byte_shape(shape, cls.qtype)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
|
||||||
|
return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
|
||||||
|
return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
|
||||||
|
return tensor.shape[-1] % cls.block_size == 0
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
|
||||||
|
if not cls.can_quantize(tensor):
|
||||||
|
raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
|
||||||
|
if isinstance(tensor, LazyNumpyTensor):
|
||||||
|
return cls.__quantize_lazy(tensor)
|
||||||
|
else:
|
||||||
|
return cls.__quantize_array(tensor)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
|
||||||
|
if isinstance(tensor, LazyNumpyTensor):
|
||||||
|
return cls.__dequantize_lazy(tensor)
|
||||||
|
else:
|
||||||
|
return cls.__dequantize_array(tensor)
|
||||||
|
|
||||||
|
|
||||||
|
class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
|
||||||
|
@classmethod
|
||||||
|
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
n = blocks.view(np.uint32)
|
||||||
|
# force nan to quiet
|
||||||
|
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
||||||
|
# round to nearest even
|
||||||
|
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||||
|
return n.astype(np.uint16).view(np.uint8)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
|
||||||
|
|
||||||
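The BF16 class above mirrors ggml_compute_fp32_to_bf16 in NumPy: force NaNs to a quiet NaN, then round to nearest even by adding 0x7fff plus the carry bit before shifting the high half down. The same bit manipulation written out per value in C++, as a sketch rather than a copy of the ggml source:

#include <cstdint>
#include <cstring>

// float32 -> bfloat16 with round-to-nearest-even; NaNs are forced to a quiet NaN
static uint16_t fp32_to_bf16(float x) {
    uint32_t n;
    std::memcpy(&n, &x, sizeof(n));
    if ((n & 0x7fffffffu) > 0x7f800000u) {
        return (uint16_t) ((n >> 16) | 64u);                       // quiet the NaN, keep the sign
    }
    return (uint16_t) ((n + (0x7fffu + ((n >> 16) & 1u))) >> 16);  // round to nearest even
}

// bfloat16 -> float32 is just the 16 stored bits moved back into the high half
static float bf16_to_fp32(uint16_t h) {
    const uint32_t n = (uint32_t) h << 16;
    float x;
    std::memcpy(&x, &n, sizeof(x));
    return x;
}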
|
|
||||||
|
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
|
||||||
|
@classmethod
|
||||||
|
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
|
||||||
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
||||||
with np.errstate(divide="ignore"):
|
with np.errstate(divide="ignore"):
|
||||||
|
@ -99,23 +202,12 @@ def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
||||||
# (n_blocks, block_size)
|
# (n_blocks, block_size)
|
||||||
qs = qs.astype(np.int8).view(np.uint8)
|
qs = qs.astype(np.int8).view(np.uint8)
|
||||||
|
|
||||||
assert d.shape[1] + qs.shape[1] == __q8_type_size
|
return np.concatenate([d, qs], axis=1)
|
||||||
|
|
||||||
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
|
@classmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
d, x = np.split(blocks, [2], axis=1)
|
||||||
|
d = d.view(np.float16).astype(np.float32)
|
||||||
|
x = x.view(np.int8).astype(np.float32)
|
||||||
|
|
||||||
|
return (x * d)
|
||||||
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
|
|
||||||
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
|
|
||||||
|
|
||||||
|
|
||||||
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
|
|
||||||
__quantize_q8_0_array,
|
|
||||||
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def quantize_q8_0(data: np.ndarray):
|
|
||||||
if type(data) is LazyNumpyTensor:
|
|
||||||
return __quantize_q8_0_lazy(data)
|
|
||||||
else:
|
|
||||||
return __quantize_q8_0_array(data)
|
|
||||||
|
|
|
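For reference, the Q8_0 scheme that the new Q8_0 class implements block-wise with NumPy (d = max|x| / 127, values stored as int8 beside an fp16 scale, rounding half away from zero via np_roundf) looks like this per block in C++. This is a sketch of the format, not a copy of ggml-quants.c, and it keeps the scale as a float instead of the fp16 stored on disk:

#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int QK8_0 = 32;              // elements per Q8_0 block

struct block_q8_0_sketch {             // illustrative layout; the real block stores d as fp16
    float  d;                          // per-block scale
    int8_t qs[QK8_0];                  // quantized values
};

static block_q8_0_sketch quantize_block_q8_0(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }

    block_q8_0_sketch out{};
    out.d = amax / 127.0f;
    const float id = out.d != 0.0f ? 1.0f / out.d : 0.0f;   // avoid the division by zero that NumPy suppresses
    for (int i = 0; i < QK8_0; ++i) {
        out.qs[i] = (int8_t) std::round(x[i] * id);          // round halves away from zero, like np_roundf
    }
    return out;
}

// dequantization is x[i] ≈ d * qs[i], which is what Q8_0.dequantize_blocks does with the stored bytes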
@ -345,7 +345,7 @@ extern "C" {
|
||||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
enum ggml_type output_tensor_type; // output tensor type
|
enum ggml_type output_tensor_type; // output tensor type
|
||||||
enum ggml_type token_embedding_type; // itoken embeddings tensor type
|
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
|
|
|
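The llama.h hunk above only corrects the token_embedding_type comment, but the surrounding struct is the knob set for quantization. A usage sketch, assuming the long-standing llama_model_quantize_default_params()/llama_model_quantize() entry points (they are not part of this diff) and the fields shown above:

#include "llama.h"

// quantize a GGUF to Q8_0 while keeping the output tensor and token embeddings in f16
static int quantize_example(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                = LLAMA_FTYPE_MOSTLY_Q8_0;
    params.output_tensor_type   = GGML_TYPE_F16;
    params.token_embedding_type = GGML_TYPE_F16;
    params.nthread              = 0;   // <= 0: let the library pick std::thread::hardware_concurrency()
    return llama_model_quantize(fname_inp, fname_out, &params) == 0 ? 0 : 1;
}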
@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
||||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
|
||||||
|
//
|
||||||
|
// helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||||
|
if (search.empty()) {
|
||||||
|
return; // Avoid infinite loop if 'search' is an empty string
|
||||||
|
}
|
||||||
|
size_t pos = 0;
|
||||||
|
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||||
|
s.replace(pos, search.length(), replace);
|
||||||
|
pos += replace.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -17,17 +17,6 @@
|
||||||
// helpers
|
// helpers
|
||||||
//
|
//
|
||||||
|
|
||||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
|
||||||
if (search.empty()) {
|
|
||||||
return; // Avoid infinite loop if 'search' is an empty string
|
|
||||||
}
|
|
||||||
size_t pos = 0;
|
|
||||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
|
||||||
s.replace(pos, search.length(), replace);
|
|
||||||
pos += replace.length();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||||
static std::string format(const char * fmt, ...) {
|
static std::string format(const char * fmt, ...) {
|
||||||
va_list ap;
|
va_list ap;
|
||||||
|
|
|
@ -13245,13 +13245,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
@ -13285,7 +13285,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13359,13 +13359,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
||||||
|
@ -13412,7 +13412,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13429,13 +13429,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// cross-attention
|
// cross-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
@ -13464,7 +13464,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13521,7 +13521,7 @@ struct llm_build_context {
|
||||||
cb(cur, "result_norm", -1);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
// lm_head
|
// lm_head
|
||||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||||
cb(cur, "result_output", -1);
|
cb(cur, "result_output", -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
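The T5 graph hunks above replace every plain ggml_mul_mat projection with llm_build_lora_mm, so runtime LoRA adapters also affect the encoder, the decoder cross-attention, and the lm_head. Conceptually the helper computes W·x plus a scaled low-rank correction per active adapter; a rough sketch in terms of stock ggml ops (the adapter container and field names below are assumptions, not the patch's exact code):

#include <vector>
#include "ggml.h"

// hypothetical per-adapter pair for one weight matrix: y = W x + scale * B (A x)
struct lora_weight_sketch {
    struct ggml_tensor * a;   // down projection (n_embd -> r)
    struct ggml_tensor * b;   // up projection   (r -> n_out)
    float                scale;
};

static struct ggml_tensor * mul_mat_with_lora(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,
        struct ggml_tensor  * cur,
        const std::vector<lora_weight_sketch> & adapters) {
    struct ggml_tensor * res = ggml_mul_mat(ctx, w, cur);
    for (const auto & lw : adapters) {
        struct ggml_tensor * ab = ggml_mul_mat(ctx, lw.b, ggml_mul_mat(ctx, lw.a, cur));
        res = ggml_add(ctx, res, ggml_scale(ctx, ab, lw.scale));
    }
    return res;
}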
@ -15371,7 +15371,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||||
if (n_expert > 1) {
|
if (n_expert > 1) {
|
||||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
||||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||||
// tensor name.
|
// tensor name.
|
||||||
|
@ -17396,6 +17396,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
||||||
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
||||||
struct llama_data_write {
|
struct llama_data_write {
|
||||||
virtual void write(const void * src, size_t size) = 0;
|
virtual void write(const void * src, size_t size) = 0;
|
||||||
|
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
|
||||||
virtual size_t get_size_written() = 0;
|
virtual size_t get_size_written() = 0;
|
||||||
virtual ~llama_data_write() = default;
|
virtual ~llama_data_write() = default;
|
||||||
|
|
||||||
|
@ -17518,9 +17519,8 @@ struct llama_data_write {
|
||||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
tmp_buf.resize(range_size * k_size_row);
|
const size_t buf_size = range_size * k_size_row;
|
||||||
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17539,9 +17539,8 @@ struct llama_data_write {
|
||||||
// Read each range of cells of v_size length each into tmp_buf and write out
|
// Read each range of cells of v_size length each into tmp_buf and write out
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
tmp_buf.resize(range_size * v_size_row);
|
const size_t buf_size = range_size * v_size_row;
|
||||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -17567,9 +17566,8 @@ struct llama_data_write {
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||||
tmp_buf.resize(range_size * v_size_el);
|
const size_t buf_size = range_size * v_size_el;
|
||||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17928,12 +17926,14 @@ struct llama_data_write_dummy : llama_data_write {
|
||||||
|
|
||||||
llama_data_write_dummy() {}
|
llama_data_write_dummy() {}
|
||||||
|
|
||||||
// TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
|
|
||||||
|
|
||||||
void write(const void * /* src */, size_t size) override {
|
void write(const void * /* src */, size_t size) override {
|
||||||
size_written += size;
|
size_written += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
|
||||||
|
size_written += size;
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
@ -17956,6 +17956,16 @@ struct llama_data_write_buffer : llama_data_write {
|
||||||
buf_size -= size;
|
buf_size -= size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||||
|
if (size > buf_size) {
|
||||||
|
throw std::runtime_error("unexpectedly reached end of buffer");
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_get(tensor, ptr, offset, size);
|
||||||
|
ptr += size;
|
||||||
|
size_written += size;
|
||||||
|
buf_size -= size;
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
@ -17991,6 +18001,7 @@ struct llama_data_read_buffer : llama_data_read {
|
||||||
struct llama_data_write_file : llama_data_write {
|
struct llama_data_write_file : llama_data_write {
|
||||||
llama_file * file;
|
llama_file * file;
|
||||||
size_t size_written = 0;
|
size_t size_written = 0;
|
||||||
|
std::vector<uint8_t> temp_buffer;
|
||||||
|
|
||||||
llama_data_write_file(llama_file * f) : file(f) {}
|
llama_data_write_file(llama_file * f) : file(f) {}
|
||||||
|
|
||||||
|
@ -17999,6 +18010,12 @@ struct llama_data_write_file : llama_data_write {
|
||||||
size_written += size;
|
size_written += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||||
|
temp_buffer.resize(size);
|
||||||
|
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
|
||||||
|
write(temp_buffer.data(), temp_buffer.size());
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
|
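The state-save refactor above introduces a write_tensor_data(tensor, offset, size) virtual, so each writer decides how KV-cache ranges leave the backend: the dummy just counts bytes, the buffer writer reads straight into its destination pointer, and the file writer reuses a temp_buffer. A hedged sketch of one more writer built on the same interface, as it might sit next to the ones above inside llama.cpp (the checksumming subclass is hypothetical; the three overridden methods are the ones the patch shows):

// hypothetical writer that sums the session payload instead of storing it
struct llama_data_write_checksum : llama_data_write {
    size_t   size_written = 0;
    uint64_t byte_sum     = 0;
    std::vector<uint8_t> temp_buffer;

    void write(const void * src, size_t size) override {
        const uint8_t * p = static_cast<const uint8_t *>(src);
        for (size_t i = 0; i < size; ++i) {
            byte_sum += p[i];
        }
        size_written += size;
    }

    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
        temp_buffer.resize(size);
        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);  // pull the range from the backend
        write(temp_buffer.data(), temp_buffer.size());
    }

    size_t get_size_written() override {
        return size_written;
    }
};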