# kvcache-ai-ktransformers/KT-SFT/test_adapter/time_test_lora_train.py
# Snapshot metadata: 2025-11-04 04:24:30 +00:00 — 113 lines, 6.8 KiB, Python, no EOL.
# NOTE(review): the lines above were raw paste metadata (not valid Python);
# converted to comments so the script can be parsed.
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
# --- Demo 1: CPU-only profiling of a single ResNet-18 forward pass. ---
# record_shapes=True stores each operator's input shapes so the second
# report below can group statistics per shape.
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    # record_function gives the enclosed region a label ("model_inference")
    # that shows up as its own row in the report.
    with record_function("model_inference"):
        model(inputs)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
# Sample output:
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
# Name                               Self CPU %    Self CPU      CPU total %   CPU total     CPU time avg  CPU Mem       Self CPU Mem  # of Calls
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
# aten::mkldnn_convolution           73.87%        37.241ms      74.04%        37.326ms      7.465ms       9.25 Mb       0 b           5
# aten::addmm                        12.98%        6.545ms       13.11%        6.609ms       2.203ms       179.53 Kb     179.53 Kb     3
# aten::max_pool2d_with_indices      6.63%         3.343ms       6.63%         3.343ms       1.114ms       5.05 Mb       5.05 Mb       3
# aten::clamp_min                    2.12%         1.071ms       2.12%         1.071ms       153.000us     0 b           0 b           7
# aten::bernoulli_                   1.20%         607.000us     1.23%         622.000us     311.000us     0 b           -260.00 Kb    2
# ---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
# Self CPU time total: 50.416ms

# Same data, but grouped by operator input shapes (needs record_shapes=True).
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
# Sample output:
# ---------------------------------  ------------  -------------------------------------------
# Name                               CPU total     Input Shapes
# ---------------------------------  ------------  -------------------------------------------
# model_inference                    57.503ms      []
# aten::conv2d                       8.008ms       [5,64,56,56], [64,64,3,3], [], ..., []]
# aten::convolution                  7.956ms       [[5,64,56,56], [64,64,3,3], [], ..., []]   # convolution stats
# aten::_convolution                 7.909ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
# aten::mkldnn_convolution           7.834ms       [[5,64,56,56], [64,64,3,3], [], ..., []]
# aten::conv2d                       6.332ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::convolution                  6.303ms       [[5,512,7,7], [512,512,3,3], [], ..., []]   # convolution stats
# aten::_convolution                 6.273ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::mkldnn_convolution           6.233ms       [[5,512,7,7], [512,512,3,3], [], ..., []]
# aten::conv2d                       4.751ms       [[5,256,14,14], [256,256,3,3], [], ..., []]
# ---------------------------------  ------------  -------------------------------------------
# Self CPU time total: 57.549ms
# --- Demo 2: joint CPU + CUDA profiling of the same forward pass on GPU. ---
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

# Sort by total CUDA time to surface the most expensive GPU kernels.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
# Sample output:
# -------------------------------------------------------  ------------  ------------
# Name                                                     Self CUDA     CUDA total
# -------------------------------------------------------  ------------  ------------
# model_inference                                          0.000us       11.666ms
# aten::conv2d                                             0.000us       10.484ms
# aten::convolution                                        0.000us       10.484ms
# aten::_convolution                                       0.000us       10.484ms
# aten::_convolution_nogroup                               0.000us       10.484ms
# aten::thnn_conv2d                                        0.000us       10.484ms
# aten::thnn_conv2d_forward                                10.484ms      10.484ms
# void at::native::im2col_kernel<float>(long, float co...  3.844ms       3.844ms
# sgemm_32x32x32_NN                                        3.206ms       3.206ms
# sgemm_32x32x32_NN_vec                                    3.093ms       3.093ms
# -------------------------------------------------------  ------------  ------------
# Self CPU time total: 23.015ms
# Self CUDA time total: 11.666ms
# --- Demo 3: memory profiling on CPU. ---
# profile_memory=True records how much memory each operator allocates/frees.
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)

with profile(activities=[ProfilerActivity.CPU],
             profile_memory=True, record_shapes=True) as prof:
    model(inputs)

# self_cpu_memory_usage: memory allocated by the operator itself,
# excluding its child operators.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
# cpu_memory_usage: total memory including child operators.
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
# --- Demo 4: export a Chrome trace for timeline visualization. ---
# The resulting trace.json can be opened in chrome://tracing or Perfetto.
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    model(inputs)
prof.export_chrome_trace("trace.json")
# --- Demo 5: attribute GPU time to Python source locations. ---
# Reuses `model`/`inputs` from the previous (CUDA) section.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    with_stack=True,  # NOTE: enabling stack tracing adds extra profiling overhead
) as prof:
    model(inputs)

# Print aggregated stats, grouping by the top 5 Python stack frames.
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
# Sample output:
# -------------------------  -----------------------------------------------------------
# Name                       Source Location
# -------------------------  -----------------------------------------------------------
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(63): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# aten::thnn_conv2d_forward  .../torch/nn/modules/conv.py(439): _conv_forward
#                            .../torch/nn/modules/conv.py(443): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
#                            .../site-packages/torchvision/models/resnet.py(59): forward
#                            .../torch/nn/modules/module.py(1051): _call_impl
# -------------------------  -----------------------------------------------------------
# Self CPU time total: 34.016ms
# Self CUDA time total: 11.659ms