# Source: kvcache-ai-ktransformers/ktransformers/tests/dequant_gpu_t.py
# (40 lines, no EOL, 1.2 KiB, Python)

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
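# Append "../.." (resolved against the current working directory) to the
# import path so the `ktransformers` package imports below can be found.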
import sys
sys.path.append("../..")
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
AutoConfig,
)
# Compare the CPU and GPU Q4_K dequantization helpers on a single tensor read
# from a GGUF checkpoint. One boolean is printed per compared component, in
# the order: factors, offsets, qs1, qs2.
model_name = "/data/Qwen2-57B-A14B-Instruct"
gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
key, target = "blk.0.", "ffn_up_exps.weight"
data = gguf_config.get_mmap_tensor(f"{key}{target}")

# Reference run: the first returned element is ignored; the other four are
# wrapped with torch.from_numpy so they can be compared against the second run.
_, factors, offsets, qs1, qs2 = dequantize_q4_k(data)
factors_cpu, offsets_cpu, qs1_cpu, qs2_cpu = map(
    torch.from_numpy, (factors, offsets, qs1, qs2)
)

# Second run on the same raw data; this rebinds factors/offsets/qs1/qs2 to the
# results of the *_gpu helper, which are moved with .cpu() before comparing.
_, factors, offsets, qs1, qs2 = dequantize_q4_k_gpu(data)

for gpu_part, cpu_part in (
    (factors, factors_cpu),
    (offsets, offsets_cpu),
    (qs1, qs1_cpu),
    (qs2, qs2_cpu),
):
    print(torch.allclose(gpu_part.cpu(), cpu_part))