Add fp8 linear kernel;
Add empty cache to fit in 16G VRAM; by 'wkGCaSS - Zhihu https://zhuanlan.zhihu.com/p/25491611225'

Azure 2025-02-22 13:05:08 +00:00
parent b4fb633991
commit 7b7c6a657d
5 changed files with 331 additions and 2 deletions
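The "16G VRAM" part of the commit message refers to releasing PyTorch's cached allocator blocks between large allocations so the peak footprint stays within the card. A minimal sketch of that pattern, assuming a simple per-file loading loop (the loop and file names are hypothetical, not taken from this commit):

import torch

def load_layer_weights(layer_files, device="cuda"):
    layers = []
    for path in layer_files:                  # hypothetical loading loop
        weights = torch.load(path, map_location=device)
        layers.append(weights)
        torch.cuda.empty_cache()              # drop cached blocks so peak VRAM stays lower
    return layers

torch.cuda.empty_cache() only releases memory the caching allocator holds without live tensor references; it does not shrink tensors that are still in use, so it mainly helps during load/convert phases like the one the commit message describes.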


@@ -127,6 +127,7 @@ GGML_BLOCK_SIZES = {
     "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
     "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
     "IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
+    "FP8": 1,
 }
 GGML_ELEMENTS_PER_BLOCK = {
@@ -142,6 +143,7 @@ GGML_ELEMENTS_PER_BLOCK = {
     "Q5_K": 256,
     "Q6_K": 256,
     "IQ4_XS": 256,
+    "FP8": 1,
 }
 DATA_TYPES = {
@@ -158,6 +160,7 @@ DATA_TYPES = {
     "uint64": 10,
     "int64": 11,
     "float64": 12,
+    "FP8": 13,
 }
 class GGUFLoader:
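The three new "FP8" entries describe FP8 as an unblocked format: one element per block and one byte per block, i.e. one byte per element, unlike the K-quants, which pack 256 elements plus scales into each block. A minimal sketch of how the two size tables are typically combined to compute a tensor's byte size (the helper function is illustrative, not part of this commit):

GGML_BLOCK_SIZES = {"FP8": 1, "Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2}
GGML_ELEMENTS_PER_BLOCK = {"FP8": 1, "Q6_K": 256}

def tensor_nbytes(ggml_type: str, n_elements: int) -> int:
    # assumes n_elements is a multiple of the type's elements-per-block
    return n_elements // GGML_ELEMENTS_PER_BLOCK[ggml_type] * GGML_BLOCK_SIZES[ggml_type]

print(tensor_nbytes("FP8", 4096))   # 4096 bytes: FP8 is one byte per element
print(tensor_nbytes("Q6_K", 4096))  # 3360 bytes for the same element count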
@@ -393,6 +396,9 @@ def read_value(f, data_type):
         elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
         return [read_value(f, elem_type) for _ in range(count)]
+    elif data_type == DATA_TYPES["FP8"]:
+        return struct.unpack("<B", f.read(1))[0]
     else:
         raise NotImplementedError(f"Data type {data_type} not implemented")
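read_value now hands back the raw FP8 byte unchanged. Assuming the e4m3 encoding commonly used for FP8 weights (the exact format is not stated in this hunk), the byte can later be reinterpreted as a float, for example via torch's float8 view:

import struct
import torch

raw = struct.unpack("<B", b"\x40")[0]            # one FP8 byte, read as in read_value above
as_fp8 = torch.tensor([raw], dtype=torch.uint8).view(torch.float8_e4m3fn)
print(as_fp8.to(torch.float32))                  # tensor([2.]) -- 0x40 encodes 2.0 in e4m3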