Add fp8 linear kernel; add empty cache to fit in 16G VRAM. By 'wkGCaSS - Zhihu https://zhuanlan.zhihu.com/p/25491611225'

Azure 2025-02-22 13:05:08 +00:00
parent b4fb633991
commit 7b7c6a657d
5 changed files with 331 additions and 2 deletions
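The commit message names two changes, but only the empty-cache hunk is reproduced below; the fp8 linear kernel lands in the other changed files. For context, here is a minimal, hypothetical sketch of the trade-off an fp8 linear layer targets: weights stored in float8_e4m3fn (1 byte per element) with a per-tensor scale, dequantized on each forward. This is not the kernel this commit adds, only an illustration of the memory/compute trade.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Fp8DequantLinear(nn.Module):
    # Hypothetical module, not the commit's kernel. Stores weights in
    # float8_e4m3fn (halving fp16 weight memory) with a per-tensor absmax
    # scale, and dequantizes on each forward before a standard matmul.
    def __init__(self, weight: torch.Tensor):
        super().__init__()
        # e4m3's largest representable value is 448; scale so the largest
        # weight magnitude maps onto that range.
        scale = weight.abs().max().clamp(min=1e-12) / 448.0
        self.register_buffer("scale", scale.to(torch.float32))
        self.register_buffer("weight_fp8", (weight / scale).to(torch.float8_e4m3fn))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # fp8 tensors do not support arithmetic directly: upcast, rescale,
        # then match the activation dtype for F.linear.
        w = (self.weight_fp8.to(torch.float32) * self.scale).to(x.dtype)
        return F.linear(x, w)

# Usage: y = Fp8DequantLinear(torch.randn(1024, 4096))(torch.randn(2, 4096))
```

A dedicated fp8 kernel avoids the per-forward dequantize above by doing the scaled matmul directly; the sketch keeps only the storage-format idea.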

@@ -70,7 +70,8 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
 target_dtype = torch.get_default_dtype()
 device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
 print(f"loading {translated_key} to {device}")
 # device = "cpu" if "embd" in translated_key else "cuda"
+torch.cuda.empty_cache() # To fit in 16G VRAM. By "wkGCaSS - Zhihu https://zhuanlan.zhihu.com/p/25491611225"
 weights = gguf_loader.load_gguf_tensor(translated_key, device = device).to(dtype = target_dtype)
 set_param(module, name, weights)
 del weights
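The hunk inserts torch.cuda.empty_cache() before each per-tensor load. A standalone sketch of that pattern follows; the helper name and loop are assumptions, while load_gguf_tensor is the loader method from the hunk above.

```python
import torch

def load_weights_low_vram(gguf_loader, keys, device="cuda", dtype=torch.bfloat16):
    # Hypothetical helper mirroring the hunk: empty_cache() returns
    # cached-but-unused allocator blocks to the driver before each large
    # allocation, so reserved VRAM tracks allocated VRAM instead of
    # creeping past a 16 GB card's capacity during sequential loads.
    weights = {}
    for key in keys:
        torch.cuda.empty_cache()
        weights[key] = gguf_loader.load_gguf_tensor(key, device=device).to(dtype=dtype)
    return weights
```

Each empty_cache() call adds synchronization overhead, which is acceptable here because weight loading is a one-time cost at startup.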