import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)

from torchviz import make_dot
from torch import nn
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import unittest
from torch.autograd import gradcheck

from ktransformers.operators.linear import KLinearTorch, KTransformersLinear
from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.operators.experts import KExpertsTorch
from ktransformers.util.utils import load_weights

# Machine-specific paths: GGUF weights and the HF config for DeepSeek-V2-Lite-Chat.
gguf_loader = GGUFLoader(gguf_path="/home/yj/ktransformers/GGUF-DeepSeek-V2-Lite-Chat")
config = AutoConfig.from_pretrained("/home/yj/ktransformers/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)

class TestKExpertsTorch(unittest.TestCase):
    def setUp(self):
        # Make cuDNN deterministic so CPU/GPU gradient comparisons are reproducible.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.base_device = "cpu"
        self.num_experts = 8
        # model = KExpertsTorch(
        #     key="blk.1",
        #     gguf_loader=gguf_loader,
        #     config=config,
        #     n_routed_experts=self.num_experts,
        #     device=self.base_device
        # )
        # model.load()

    def _run_single_device_test(self, device, seed=42):
        """Run a forward/backward pass on the given device and return the gradients."""
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed_all(seed)

        model = KExpertsTorch(
            key="blk.1",
            gguf_loader=gguf_loader,
            config=config,
            n_routed_experts=self.num_experts,
            device=device
        )
        model.load(device=device)

        # Fork the RNG state so the generated inputs are identical across devices.
        with torch.random.fork_rng():
            torch.manual_seed(seed)
            batch_size = 2
            hidden_size = model.config.hidden_size
            input_tensor = torch.randn(batch_size, hidden_size, device=device, requires_grad=True)
            expert_ids = torch.randint(0, self.num_experts,
                                       (batch_size, model.config.num_experts_per_tok),
                                       device=device)
            weights = torch.randn(batch_size, model.config.num_experts_per_tok, device=device)
            weights = torch.softmax(weights, dim=-1)

        print(f"input_tensor.device: {input_tensor.device}")
        print(f"torch.device(device): {torch.device(device)}")
        # assert input_tensor.device == torch.device(device)
        for p in model.parameters():
            print(f"p.device: {p.device}")
        for name, param in model.named_parameters():
            print(name, param.size())

        model.to(device)
        # Disable autocast so both devices compute in the same dtype.
        with torch.autocast(device_type=device, enabled=False):
            output = model(input_tensor, expert_ids, weights)

        loss = output.sum()

        # dot = make_dot(output, params=dict(model.named_parameters()))
        # dot.render(f"origin_moe_{torch.device(device)}_graph", format="svg")

        loss.backward()

        gradients = {
            "input": input_tensor.grad.clone().cpu(),
            "loss": loss.detach().clone().cpu(),
            "model": [p.grad.clone().cpu() for p in model.parameters() if p.grad is not None]
        }
        return gradients

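    # A minimal sketch (not part of the original test) of how the imported but
    # unused gradcheck could numerically verify the expert backward pass.
    # gradcheck requires float64, so this assumes KExpertsTorch tolerates
    # nn.Module.double() after loading; treat it as illustrative, not a
    # drop-in test (hence the leading underscore, so unittest skips it).
    def _gradcheck_sketch(self, device="cpu", seed=42):
        torch.manual_seed(seed)
        model = KExpertsTorch(
            key="blk.1",
            gguf_loader=gguf_loader,
            config=config,
            n_routed_experts=self.num_experts,
            device=device
        )
        model.load(device=device)
        model.double()  # assumption: the loaded expert weights support float64
        x = torch.randn(1, model.config.hidden_size, dtype=torch.float64,
                        device=device, requires_grad=True)
        expert_ids = torch.randint(0, self.num_experts,
                                   (1, model.config.num_experts_per_tok), device=device)
        weights = torch.softmax(
            torch.randn(1, model.config.num_experts_per_tok,
                        dtype=torch.float64, device=device), dim=-1)
        # Only x requires grad, so gradcheck perturbs the input alone.
        self.assertTrue(gradcheck(lambda inp: model(inp, expert_ids, weights), (x,)))
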
    def test_forward_gradient(self):
        # for param in model.parameters():
        #     self.assertEqual(param.dtype, config.torch_dtype)

        cpu_gradients = self._run_single_device_test("cpu")
        print(f"cpu_gradients: {cpu_gradients}")

        self.assertIsNotNone(cpu_gradients["input"])
        self.assertTrue(all(g is not None for g in cpu_gradients["model"]))

        if torch.cuda.is_available():
            gpu_gradients = self._run_single_device_test("cuda")
            print(f"gpu_gradients: {gpu_gradients}")

            max_diff = (cpu_gradients["input"] - gpu_gradients["input"].cpu()).abs().max()
            print(f"Max input-gradient difference: {max_diff.item()}")

            self.assertTrue(torch.allclose(cpu_gradients["input"], gpu_gradients["input"], atol=1e-4, rtol=1e-3),
                            f"Input gradient difference exceeds tolerance; max difference: {max_diff.item()}")

            for i, (cpu_g, gpu_g) in enumerate(zip(cpu_gradients["model"], gpu_gradients["model"])):
                diff = (cpu_g - gpu_g.cpu()).abs().max()
                print(f"Max difference for parameter gradient {i}: {diff.item()}")
                self.assertTrue(torch.allclose(cpu_g, gpu_g, atol=1e-4, rtol=1e-3),
                                f"Parameter gradient {i} difference exceeds tolerance; max difference: {diff.item()}")
        else:
            # Fail loudly rather than silently passing when CUDA is unavailable.
            self.fail("CUDA is not available, but this test requires a GPU.")

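    # Hedged refactor sketch (not in the original test): the repeated allclose
    # comparisons in test_forward_gradient could be factored into one helper.
    # The name and signature here are illustrative; both gradient dicts already
    # hold CPU tensors, as returned by _run_single_device_test.
    def _assert_grads_close(self, ref, other, atol=1e-4, rtol=1e-3):
        self.assertTrue(torch.allclose(ref["input"], other["input"], atol=atol, rtol=rtol),
                        "input gradients differ beyond tolerance")
        for i, (a, b) in enumerate(zip(ref["model"], other["model"])):
            self.assertTrue(torch.allclose(a, b, atol=atol, rtol=rtol),
                            f"parameter gradient {i} differs beyond tolerance")
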
    # def test_detach_effect(self):
    #     input_tensor = torch.randn(1, model.config.hidden_size, device="cpu", requires_grad=True)
    #     expert_ids = torch.tensor([[0, 1]], device="cpu")
    #     weights = torch.tensor([[0.5, 0.5]], device="cpu")

    #     output = model(input_tensor, expert_ids, weights)

    #     # dot = make_dot(output, params=dict(model.named_parameters()))
    #     # dot.render("origin_moe_cpu_graph", format="svg")

    #     loss = output.sum()
    #     loss.backward()

    #     self.assertIsNotNone(input_tensor.grad)
    #     self.assertTrue(all(p.grad is not None for p in model.parameters()))


if __name__ == '__main__':
    unittest.main()