kvcache-ai-ktransformers/kt-sft/ktransformers/moe_test_module_old.py
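"""Gradient-consistency test for KExpertsTorch.

Loads the routed-experts module from a local DeepSeek-V2-Lite-Chat GGUF dump,
runs a seeded forward/backward pass on CPU and, when CUDA is available, on
GPU, and asserts that the input and parameter gradients agree within
tolerance.
"""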
import os
import platform
import sys

project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)

from torchviz import make_dot
from torch import nn
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
import unittest
from torch.autograd import gradcheck
from ktransformers.operators.linear import KLinearTorch, KTransformersLinear
from ktransformers.sft.peft_utils.lora_layer import KTransformersLinearLora
from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.operators.experts import KExpertsTorch
from ktransformers.util.utils import load_weights

# Hardcoded local paths to the DeepSeek-V2-Lite-Chat GGUF weights and HF config.
gguf_loader = GGUFLoader(gguf_path="/home/yj/ktransformers/GGUF-DeepSeek-V2-Lite-Chat")
config = AutoConfig.from_pretrained("/home/yj/ktransformers/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
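
# For orientation: a minimal sketch of the computation KExpertsTorch is
# expected to perform, assuming each expert is a callable mapping a hidden
# vector to a hidden vector (the names below are illustrative, not the real
# KExpertsTorch internals). Each token's output is the weighted sum of the
# outputs of its top-k routed experts.
def _reference_moe_forward(experts, x, expert_ids, weights):
    # x: (batch, hidden); expert_ids, weights: (batch, top_k)
    out = torch.zeros_like(x)
    for b in range(x.shape[0]):
        for k in range(expert_ids.shape[1]):
            out[b] += weights[b, k] * experts[expert_ids[b, k]](x[b])
    return out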


class TestKExpertsTorch(unittest.TestCase):
    def setUp(self):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.base_device = "cpu"
        self.num_experts = 8
        # model = KExpertsTorch(
        #     key="blk.1",
        #     gguf_loader=gguf_loader,
        #     config=config,
        #     n_routed_experts=self.num_experts,
        #     device=self.base_device,
        # )
        # model.load()

    def _run_single_device_test(self, device, seed=42):
        """Run a forward/backward pass on the given device and return the gradients."""
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed_all(seed)
        model = KExpertsTorch(
            key="blk.1",
            gguf_loader=gguf_loader,
            config=config,
            n_routed_experts=self.num_experts,
            device=device,
        )
        model.load(device=device)
        # Fork the RNG state and reseed inside the forked scope so every device
        # sees identical random inputs.
        with torch.random.fork_rng():
            torch.manual_seed(seed)
            batch_size = 2
            hidden_size = model.config.hidden_size
            input_tensor = torch.randn(batch_size, hidden_size, device=device, requires_grad=True)
            expert_ids = torch.randint(
                0, self.num_experts,
                (batch_size, model.config.num_experts_per_tok),
                device=device,
            )
            weights = torch.randn(batch_size, model.config.num_experts_per_tok, device=device)
            weights = torch.softmax(weights, dim=-1)
        print(f"input_tensor.device: {input_tensor.device}")
        print(f"torch.device(device): {torch.device(device)}")
        # assert input_tensor.device == torch.device(device)
        for p in model.parameters():
            print(f"p.device: {p.device}")
        for name, param in model.named_parameters():
            print(name, param.size())
        model.to(device)
        # Disable autocast so both devices run the forward pass in the same dtype.
        with torch.autocast(device_type=device, enabled=False):
            output = model(input_tensor, expert_ids, weights)
            loss = output.sum()
            # dot = make_dot(output, params=dict(model.named_parameters()))
            # dot.render(f"origin_moe_{torch.device(device)}_graph", format="svg")
            loss.backward()
        gradients = {
            "input": input_tensor.grad.clone().cpu(),
            "loss": loss.clone().cpu(),
            "model": [p.grad.clone().cpu() for p in model.parameters() if p.grad is not None],
        }
        return gradients
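
    # Note on tolerances: torch.allclose passes when
    # |cpu - gpu| <= atol + rtol * |gpu| holds elementwise, so the atol=1e-4,
    # rtol=1e-3 bounds below allow for the usual CPU/CUDA floating-point
    # divergence; the printed max-abs differences aid debugging on failure.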
    def test_forward_gradient(self):
        # for param in model.parameters():
        #     self.assertEqual(param.dtype, config.torch_dtype)
        cpu_gradients = self._run_single_device_test("cpu")
        print(f"cpu_gradients: {cpu_gradients}")
        self.assertIsNotNone(cpu_gradients["input"])
        self.assertTrue(all(g is not None for g in cpu_gradients["model"]))
        if torch.cuda.is_available():
            gpu_gradients = self._run_single_device_test("cuda")
            print(f"gpu_gradients: {gpu_gradients}")
            max_diff = (cpu_gradients["input"] - gpu_gradients["input"].cpu()).abs().max()
            print(f"Max input-gradient difference: {max_diff.item()}")
            self.assertTrue(
                torch.allclose(cpu_gradients["input"], gpu_gradients["input"], atol=1e-4, rtol=1e-3),
                f"Input gradient difference exceeds tolerance; max difference: {max_diff.item()}",
            )
            for i, (cpu_g, gpu_g) in enumerate(zip(cpu_gradients["model"], gpu_gradients["model"])):
                diff = (cpu_g - gpu_g.cpu()).abs().max()
                print(f"Max difference for parameter gradient {i}: {diff.item()}")
                self.assertTrue(
                    torch.allclose(cpu_g, gpu_g, atol=1e-4, rtol=1e-3),
                    f"Parameter gradient {i} difference exceeds tolerance; max difference: {diff.item()}",
                )
        else:
            self.fail("CUDA is not available, so the CPU/GPU gradient comparison cannot run!")

    # def test_detach_effect(self):
    #     input_tensor = torch.randn(1, model.config.hidden_size, device="cpu", requires_grad=True)
    #     expert_ids = torch.tensor([[0, 1]], device="cpu")
    #     weights = torch.tensor([[0.5, 0.5]], device="cpu")
    #     output = model(input_tensor, expert_ids, weights)
    #     # dot = make_dot(output, params=dict(model.named_parameters()))
    #     # dot.render("origin_moe_cpu_graph", format="svg")
    #     loss = output.sum()
    #     loss.backward()
    #     self.assertIsNotNone(input_tensor.grad)
    #     self.assertTrue(all(p.grad is not None for p in model.parameters()))


if __name__ == '__main__':
    unittest.main()
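
# A hypothetical invocation, assuming the hardcoded model paths above exist
# locally (unittest.main() accepts a Class.method selector on the CLI):
#   python moe_test_module_old.py TestKExpertsTorch.test_forward_gradient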