kvcache-ai-ktransformers/kt-sft/test_adapter/time_test_lora_train.py
Peilin Li 171578a7ec
Some checks failed
Book-CI / test (push) Has been cancelled
Book-CI / test-1 (push) Has been cancelled
Book-CI / test-2 (push) Has been cancelled
Deploy / deploy (macos-latest) (push) Has been cancelled
Deploy / deploy (ubuntu-latest) (push) Has been cancelled
Deploy / deploy (windows-latest) (push) Has been cancelled
[refactor]: Change named 'KT-SFT' to 'kt-sft' (#1626)
* Change named 'KT-SFT' to 'kt-sft'

* [docs]: update kt-sft name

---------

Co-authored-by: ZiWei Yuan <yzwliam@126.com>
2025-11-17 11:48:42 +08:00

113 lines
No EOL
6.8 KiB
Python

import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

# --- Example 1: CPU-only profiling of a ResNet-18 forward pass. ---
# record_shapes=True records operator input shapes so results can later be
# grouped per shape with key_averages(group_by_input_shape=True).
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    # record_function labels this region so it shows up as one named
    # entry ("model_inference") in the profiler output.
    with record_function("model_inference"):
        model(inputs)

# Aggregate stats per operator, sorted by total CPU time (self + children).
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
# Sample output:
# --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
# Name Self CPU % Self CPU CPU total % CPU total CPU time avg CPU Mem Self CPU Mem # of Calls
# --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
# aten::mkldnn_convolution 73.87% 37.241ms 74.04% 37.326ms 7.465ms 9.25 Mb 0 b 5
# aten::addmm 12.98% 6.545ms 13.11% 6.609ms 2.203ms 179.53 Kb 179.53 Kb 3
# aten::max_pool2d_with_indices 6.63% 3.343ms 6.63% 3.343ms 1.114ms 5.05 Mb 5.05 Mb 3
# aten::clamp_min 2.12% 1.071ms 2.12% 1.071ms 153.000us 0 b 0 b 7
# aten::bernoulli_ 1.20% 607.000us 1.23% 622.000us 311.000us 0 b -260.00 Kb 2
# --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
# Self CPU time total: 50.416ms

# Same stats, but split per distinct input shape (requires record_shapes=True).
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
# Sample output:
# --------------------------------- ------------ -------------------------------------------
# Name CPU total Input Shapes
# --------------------------------- ------------ -------------------------------------------
# model_inference 57.503ms []
# aten::conv2d 8.008ms [5,64,56,56], [64,64,3,3], [], ..., []]
# aten::convolution 7.956ms [[5,64,56,56], [64,64,3,3], [], ..., []] # convolution stats
# aten::_convolution 7.909ms [[5,64,56,56], [64,64,3,3], [], ..., []]
# aten::mkldnn_convolution 7.834ms [[5,64,56,56], [64,64,3,3], [], ..., []]
# aten::conv2d 6.332ms [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::convolution 6.303ms [[5,512,7,7], [512,512,3,3], [], ..., []] # convolution stats
# aten::_convolution 6.273ms [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::mkldnn_convolution 6.233ms [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::conv2d 4.751ms [[5,256,14,14], [256,256,3,3], [], ..., []]
# --------------------------------- ------------ -------------------------------------------
# Self CPU time total: 57.549ms
# --- Example 2: joint CPU + CUDA profiling of the same model on GPU. ---
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[
        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

# Sort by total CUDA (GPU kernel) time instead of CPU time.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
# Sample output:
# ------------------------------------------------------- ------------ ------------
# Name Self CUDA CUDA total
# ------------------------------------------------------- ------------ ------------
# model_inference 0.000us 11.666ms
# aten::conv2d 0.000us 10.484ms
# aten::convolution 0.000us 10.484ms
# aten::_convolution 0.000us 10.484ms
# aten::_convolution_nogroup 0.000us 10.484ms
# aten::thnn_conv2d 0.000us 10.484ms
# aten::thnn_conv2d_forward 10.484ms 10.484ms
# void at::native::im2col_kernel<float>(long, float co... 3.844ms 3.844ms
# sgemm_32x32x32_NN 3.206ms 3.206ms
# sgemm_32x32x32_NN_vec 3.093ms 3.093ms
# ------------------------------------------------------- ------------ ------------
# Self CPU time total: 23.015ms
# Self CUDA time total: 11.666ms
# --- Example 3: memory profiling (CPU) — track tensor allocations per operator. ---
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU],
             profile_memory=True, record_shapes=True) as prof:
    model(inputs)

# "self" memory: total memory allocated by the operator itself, excluding child operators.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
# Total memory including allocations made by child operators.
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
# --- Example 4: export a timeline trace viewable in chrome://tracing or Perfetto. ---
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

prof.export_chrome_trace("trace.json")
# --- Example 5: source attribution — record Python call stacks per operator. ---
# Reuses the CUDA `model`/`inputs` defined above.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,
) as prof:
    model(inputs)

# Print aggregated stats, grouping by the top 5 stack frames.
# NOTE: enabling stack tracing adds extra profiling overhead.
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
# Sample output:
# ------------------------- -----------------------------------------------------------
# Name Source Location
# ------------------------- -----------------------------------------------------------
# aten::thnn_conv2d_forward .../torch/nn/modules/conv.py(439): _conv_forward
# .../torch/nn/modules/conv.py(443): forward
# .../torch/nn/modules/module.py(1051): _call_impl
# .../site-packages/torchvision/models/resnet.py(63): forward
# .../torch/nn/modules/module.py(1051): _call_impl
# aten::thnn_conv2d_forward .../torch/nn/modules/conv.py(439): _conv_forward
# .../torch/nn/modules/conv.py(443): forward
# .../torch/nn/modules/module.py(1051): _call_impl
# .../site-packages/torchvision/models/resnet.py(59): forward
# .../torch/nn/modules/module.py(1051): _call_impl
# ------------------------- -----------------------------------------------------------
# Self CPU time total: 34.016ms
# Self CUDA time total: 11.659ms