# PyTorch profiler tutorial example (113 lines, 6.8 KiB, Python).
# Mirrored from https://github.com/kvcache-ai/ktransformers.git
# (synced 2026-05-02 13:41:15 +00:00).
import torch

import torchvision.models as models

from torch.profiler import profile, record_function, ProfilerActivity

# --- CPU profiling ------------------------------------------------------------
# Profile one forward pass of ResNet-18 on CPU. ``record_shapes=True`` makes the
# profiler remember every operator's input shapes so the results can later be
# grouped by shape.
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

# ``record_function`` labels the wrapped region so it appears as a single named
# entry ("model_inference") in the profiler tables.
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof, \
        record_function("model_inference"):
    model(inputs)

# Per-operator averages, sorted by total CPU time (self time + children).
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

# Sample output:
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
#                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
#          aten::mkldnn_convolution        73.87%      37.241ms        74.04%      37.326ms       7.465ms       9.25 Mb           0 b             5
#                       aten::addmm        12.98%       6.545ms        13.11%       6.609ms       2.203ms     179.53 Kb     179.53 Kb             3
#     aten::max_pool2d_with_indices         6.63%       3.343ms         6.63%       3.343ms       1.114ms       5.05 Mb       5.05 Mb             3
#                   aten::clamp_min         2.12%       1.071ms         2.12%       1.071ms     153.000us           0 b           0 b             7
#                  aten::bernoulli_         1.20%     607.000us         1.23%     622.000us     311.000us           0 b    -260.00 Kb             2
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
# Self CPU time total: 50.416ms

# Group the same averages by operator input shape (enabled by record_shapes).
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

# Sample output:
# ---------------------------------  ------------  -------------------------------------------
#                              Name     CPU total                                 Input Shapes
# ---------------------------------  ------------  -------------------------------------------
#                   model_inference      57.503ms                                           []
#                      aten::conv2d       8.008ms      [5,64,56,56], [64,64,3,3], [], ..., []]
#                 aten::convolution       7.956ms     [[5,64,56,56], [64,64,3,3], [], ..., []]  # convolution stats
#                aten::_convolution       7.909ms     [[5,64,56,56], [64,64,3,3], [], ..., []]
#          aten::mkldnn_convolution       7.834ms     [[5,64,56,56], [64,64,3,3], [], ..., []]
#                      aten::conv2d       6.332ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#                 aten::convolution       6.303ms    [[5,512,7,7], [512,512,3,3], [], ..., []]  # convolution stats
#                aten::_convolution       6.273ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#          aten::mkldnn_convolution       6.233ms    [[5,512,7,7], [512,512,3,3], [], ..., []]
#                      aten::conv2d       4.751ms  [[5,256,14,14], [256,256,3,3], [], ..., []]
# ---------------------------------  ------------  -------------------------------------------
# Self CPU time total: 57.549ms
# --- GPU profiling ------------------------------------------------------------
# Same experiment on CUDA: both the model and the input live on the GPU, and
# the profiler records CUDA kernel activity in addition to CPU-side ops.
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    with record_function("model_inference"):
        model(inputs)

# Sort by total CUDA time to surface the kernels that dominate GPU work.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Sample output:
# -------------------------------------------------------  ------------  ------------
#                                                    Name     Self CUDA    CUDA total
# -------------------------------------------------------  ------------  ------------
#                                         model_inference       0.000us      11.666ms
#                                            aten::conv2d       0.000us      10.484ms
#                                       aten::convolution       0.000us      10.484ms
#                                      aten::_convolution       0.000us      10.484ms
#                              aten::_convolution_nogroup       0.000us      10.484ms
#                                       aten::thnn_conv2d       0.000us      10.484ms
#                               aten::thnn_conv2d_forward      10.484ms      10.484ms
#    void at::native::im2col_kernel<float>(long, float co...     3.844ms       3.844ms
#                                        sgemm_32x32x32_NN      3.206ms       3.206ms
#                                    sgemm_32x32x32_NN_vec      3.093ms       3.093ms
# -------------------------------------------------------  ------------  ------------
# Self CPU time total: 23.015ms
# Self CUDA time total: 11.666ms
# --- Memory profiling ---------------------------------------------------------
# ``profile_memory=True`` tracks the tensor memory allocated and released by
# each operator during the forward pass.
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(
    activities=[ProfilerActivity.CPU],
    profile_memory=True,
    record_shapes=True,
) as prof:
    model(inputs)

# "self" memory: allocated by the operator itself, excluding its child ops.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

# Total memory: includes allocations made by children.
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
# --- Chrome trace export ------------------------------------------------------
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)

# Write a Chrome-trace timeline; load trace.json in chrome://tracing (or
# Perfetto) to inspect it visually.
prof.export_chrome_trace("trace.json")
# --- Stack tracing ------------------------------------------------------------
# ``with_stack=True`` records the Python source location of each operator call.
# NOTE: enabling stack tracing adds extra profiling overhead.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,
) as prof:
    model(inputs)

# Print aggregated stats, grouping entries by their top 5 stack frames.
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))

# Sample output:
# -------------------------  -----------------------------------------------------------
#                      Name  Source Location
# -------------------------  -----------------------------------------------------------
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(63): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(59): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# -------------------------  -----------------------------------------------------------
# Self CPU time total: 34.016ms
# Self CUDA time total: 11.659ms