use marlin for lm_head; lm_head only computes the last token during prefill; extend the context window to 19K for DeepSeek-V3/R1 within 24GB VRAM
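The middle clause of that commit message is the key idea: during prefill only the final position's logits are needed to pick the next token, so the vocabulary projection can skip every other position. A minimal sketch of that idea in plain PyTorch, with toy sizes; this is an illustration, not the actual ktransformers/Marlin kernel:

import torch

# Toy dimensions; DeepSeek-V3's real hidden size and vocabulary are far larger.
hidden_states = torch.randn(1, 4096, 1024)          # (batch, prompt_len, hidden)
lm_head = torch.nn.Linear(1024, 32000, bias=False)  # hypothetical vocab projection

# Naive prefill projects every position: (1, 4096, 32000) logits.
# Last-token-only prefill projects a single position: (1, 1, 32000) logits,
# cutting lm_head compute and activation memory by roughly prompt_len x.
last_logits = lm_head(hidden_states[:, -1:, :])
assert last_logits.shape == (1, 1, 32000)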
import sys

# Make the local ktransformers checkout importable.
sys.path.insert(0, "/home/zbx/ktransformers")

import torch

from ktransformers.util.custom_gguf import GGUFLoader

# Load the same model in two formats: q4_k_m quantized and a bf16 reference.
gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")

torch.set_default_dtype(torch.bfloat16)

# Dequantize the same attention tensor from both files onto the GPU.
tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")

# Spot-check: the tail of the first row should roughly agree across formats.
print(tensor_1[0, -64:])
print(tensor_2[0, -64:])
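A possible follow-up for this script, assuming load_gguf_tensor returns same-shape tensors for both files: quantify the q4_k_m-vs-bf16 gap instead of eyeballing the printed tails.

# Hypothetical extension: summarize the dequantization error numerically.
diff = (tensor_1.float() - tensor_2.float()).abs()
print("max abs diff :", diff.max().item())
print("mean abs diff:", diff.mean().item())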