[ADD] support multi-gpu qlen>1 q5_k

This commit is contained in:
chenxl 2024-08-12 11:17:29 +00:00
parent f293803156
commit f5f79f5c0e
63 changed files with 3271 additions and 1285 deletions

View file

@ -1,12 +1,9 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
# add path
import sys
current_path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(current_path+"/../..")
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
# from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin
# from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
@ -18,36 +15,23 @@ import time
from transformers import (
AutoConfig,
)
import os
# CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"]="1"
gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"
key = "blk.0."
target = "ffn_down_exps.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)
# Q4k
key = "blk.1."
target = "ffn_up_shexp.weight"
target = "attn_q.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda")
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
@ -55,3 +39,20 @@ print(f"Q4k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)
# Q6k
key = "blk.0."
target = "ffn_down_exps.weight"
t1 = time.time()
q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
t2 = time.time()
q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
t3 = time.time()
print()
allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
print(f"Q6k {key+target}")
print("load gguf tensor from cpu cost: ", t2-t1)
print("load gguf tensor from gpu cost: ", t3-t2)
print("allclose: ", allclose)