mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
add e5m2 support for use in Kobo, also made a separate contribution PR https://github.com/leejet/stable-diffusion.cpp/pull/460
This commit is contained in:
parent 3813f6c517
commit dd95f88c19

2 changed files with 67 additions and 1 deletion
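Background, for readers of the diff below: FP8 E5M2 packs one sign bit, five exponent bits and two mantissa bits into a byte, so it shares FP16's exponent range and bias of 15 but keeps only two fraction bits. Every E5M2 value therefore widens to FP16 exactly by copying the exponent and shifting the two mantissa bits into the top of FP16's ten-bit mantissa, which is what the new converter does. The following standalone reference decoder is only an illustration of the bit layout, not code from this commit:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative decoder for FP8 E5M2 (1 sign, 5 exponent, 2 mantissa bits).
float f8_e5m2_to_f32_ref(uint8_t fp8) {
    int sign     = (fp8 >> 7) & 0x1;
    int exponent = (fp8 >> 2) & 0x1F;
    int mantissa = fp8 & 0x3;
    float s = sign ? -1.0f : 1.0f;
    if (exponent == 0x1F) {                 // all-ones exponent: Inf or NaN
        return mantissa ? NAN : s * INFINITY;
    }
    if (exponent == 0) {                    // zero / subnormal: (+/-) 2^-14 * (m / 4)
        return s * std::ldexp(mantissa / 4.0f, -14);
    }
    return s * std::ldexp(1.0f + mantissa / 4.0f, exponent - 15);  // normal value
}

int main() {
    const uint8_t samples[] = {0x3C, 0x7B, 0x80};  // 1.0, 57344 (max finite), -0.0
    for (uint8_t v : samples) {
        std::printf("0x%02X -> %g\n", v, f8_e5m2_to_f32_ref(v));
    }
    return 0;
}
```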
@ -615,6 +615,48 @@ uint16_t f8_e4m3_to_f16(uint8_t f8) {
     return ggml_fp32_to_fp16(*reinterpret_cast<const float*>(&result));
 }
 
+uint16_t f8_e5m2_to_f16(uint8_t fp8) {
+    uint8_t sign = (fp8 >> 7) & 0x1;
+    uint8_t exponent = (fp8 >> 2) & 0x1F;
+    uint8_t mantissa = fp8 & 0x3;
+
+    uint16_t fp16_sign = sign << 15;
+    uint16_t fp16_exponent;
+    uint16_t fp16_mantissa;
+
+    if (exponent == 0 && mantissa == 0) { //zero
+        return fp16_sign;
+    }
+
+    if (exponent == 0x1F) { //NAN and INF
+        fp16_exponent = 0x1F;
+        fp16_mantissa = mantissa ? (mantissa << 8) : 0;
+        return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
+    }
+
+    if (exponent == 0) { //subnormal numbers
+        fp16_exponent = 0;
+        fp16_mantissa = (mantissa << 8);
+        return fp16_sign | fp16_mantissa;
+    }
+
+    //normal numbers
+    int16_t true_exponent = (int16_t)exponent - 15 + 15;
+    if (true_exponent <= 0) {
+        fp16_exponent = 0;
+        fp16_mantissa = (mantissa << 8);
+    } else if (true_exponent >= 0x1F) {
+        fp16_exponent = 0x1F;
+        fp16_mantissa = 0;
+    } else {
+        fp16_exponent = (uint16_t)true_exponent;
+        fp16_mantissa = mantissa << 8;
+    }
+
+    return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
+}
+
 void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
@ -628,6 +670,12 @@ void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
         dst[i] = f8_e4m3_to_f16(src[i]);
     }
 }
 
+void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
+    // support inplace op
+    for (int64_t i = n - 1; i >= 0; i--) {
+        dst[i] = f8_e5m2_to_f16(src[i]);
+    }
+}
+
 void convert_tensor(void* src,
                     ggml_type src_type,
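The vec helper added above mirrors the existing bf16 and e4m3 helpers: each source element is one byte but each destination element is two, so iterating from the end allows src and dst to alias the same buffer without overwriting fp8 bytes that have not been read yet. A rough usage sketch, assuming the functions from this diff are linked in; the sample values and buffer handling are made up for illustration:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Declaration of the helper added in this commit (assumed available at link time).
void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n);

int main() {
    const int64_t n = 4;
    // One buffer sized for the output: n fp8 bytes packed at the front,
    // widened in place to n fp16 halves (2 bytes each).
    std::vector<uint8_t> buf(n * sizeof(uint16_t));
    const uint8_t fp8_vals[] = {0x3C, 0x40, 0xBC, 0x00};  // 1.0, 2.0, -1.0, 0.0 in e5m2
    for (int64_t i = 0; i < n; i++) {
        buf[i] = fp8_vals[i];
    }

    // src and dst alias the same memory; the backwards loop keeps this safe.
    f8_e5m2_to_f16_vec(buf.data(), (uint16_t*)buf.data(), n);

    for (int64_t i = 0; i < n; i++) {
        std::printf("f16 bits: 0x%04X\n", ((const uint16_t*)buf.data())[i]);
    }
    return 0;
}
```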
@ -867,6 +915,8 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
         ttype = GGML_TYPE_F32;
     } else if (dtype == "F8_E4M3") {
         ttype = GGML_TYPE_F16;
+    } else if (dtype == "F8_E5M2") {
+        ttype = GGML_TYPE_F16;
     }
     return ttype;
 }
@ -980,6 +1030,10 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
             tensor_storage.is_f8_e4m3 = true;
             // f8 -> f16
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        } else if (dtype == "F8_E5M2") {
+            tensor_storage.is_f8_e5m2 = true;
+            // f8 -> f16
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
         } else {
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
         }
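The "F8_E5M2" string matched here, and in str_to_ggml_type above, is the dtype tag from the safetensors header. A hypothetical header entry for such a tensor might look like the following (tensor name, shape and offsets are invented; only the dtype field is what the loader keys on):

```json
{
  "model.diffusion_model.input_blocks.0.0.weight": {
    "dtype": "F8_E5M2",
    "shape": [320, 4, 3, 3],
    "data_offsets": [0, 11520]
  }
}
```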
@ -1646,6 +1700,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                } else if (tensor_storage.is_f8_e4m3) {
                    // inplace op
                    f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                } else if (tensor_storage.is_f8_e5m2) {
+                    // inplace op
+                    f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
                }
            } else {
                read_buffer.resize(tensor_storage.nbytes());
@ -1657,6 +1714,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                } else if (tensor_storage.is_f8_e4m3) {
                    // inplace op
                    f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f8_e5m2) {
+                    // inplace op
+                    f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
                }
 
                convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
@ -1672,6 +1732,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                } else if (tensor_storage.is_f8_e4m3) {
                    // inplace op
                    f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f8_e5m2) {
+                    // inplace op
+                    f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
                }
 
                if (tensor_storage.type == dst_tensor->type) {
@ -35,6 +35,7 @@ struct TensorStorage {
     ggml_type type = GGML_TYPE_F32;
     bool is_bf16 = false;
     bool is_f8_e4m3 = false;
+    bool is_f8_e5m2 = false;
     int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
     int n_dims = 0;
 
@ -64,7 +65,7 @@ struct TensorStorage {
     }
 
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3) {
+        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
         } else {
             return nbytes();
@ -114,6 +115,8 @@ struct TensorStorage {
             type_name = "bf16";
         } else if (is_f8_e4m3) {
             type_name = "f8_e4m3";
+        } else if (is_f8_e5m2) {
+            type_name = "f8_e5m2";
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";