The benchmarking script below warms up the GPU first, then wraps each call in `torch.cuda.synchronize` so the measured interval covers the actual kernel execution:

```python
import time
import numpy as np
import torch

# c = a + b (shape: [n * n])
n = 1024
a = torch.rand((n, n), device="cuda:0")
b = torch.rand((n, n), device="cuda:0")
cuda_c = torch.rand((n, n), device="cuda:0")

ntest = 10

# func is a callable here, much like passing a function pointer in C++
def show_time(func):
    times = list()
    res = None
    # warm up the GPU so first-run overhead doesn't skew the timings
    for _ in range(10):
        res = func()
    for _ in range(ntest):
        # synchronize so the measured interval reflects the actual CUDA run time
        torch.cuda.synchronize(device="cuda:0")
        # time is the imported module; time.time() returns the current timestamp
        start_time = time.time()
        func()
        torch.cuda.synchronize(device="cuda:0")
        end_time = time.time()
        times.append((end_time - start_time) * 1e6)  # seconds -> microseconds
    return times, res

def run_cuda():
    # cuda_module is the JIT-loaded extension (see the load() call later in this section)
    cuda_module.torch_launch_add2(cuda_c, a, b, n)
    return cuda_c

def run_torch():
    c = a + b
    return c.contiguous()  # make sure the returned tensor is contiguous in memory

# run both implementations and report the mean time (matches the log output below)
print("Running cuda...")
cuda_time, cuda_res = show_time(run_cuda)
print("Cuda time: {:.3f}us".format(np.mean(cuda_time)))

print("Running torch...")
torch_time, torch_res = show_time(run_torch)
print("Torch time: {:.3f}us".format(np.mean(torch_time)))

# assert on the comparison; a bare allclose call would discard its result
assert torch.allclose(cuda_res, torch_res)
print("Kernel test passed.")
```
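Wall-clock timing around `synchronize` works, but CUDA events are an alternative that records timestamps on the GPU stream itself, avoiding host-side jitter. A minimal sketch, not part of the original script (`show_time_events` is a hypothetical helper):

```python
def show_time_events(func, warmup=10, ntest=10):
    # CUDA events are recorded on the current stream and timed on-device
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    times = []
    for _ in range(warmup):  # same GPU warm-up idea as show_time above
        func()
    for _ in range(ntest):
        start.record()
        func()
        end.record()
        torch.cuda.synchronize()  # wait until both events have completed
        times.append(start.elapsed_time(end) * 1e3)  # elapsed_time is in ms; convert to us
    return times
```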
Running the following command is all it takes:

```shell
python run_time.py
```

which prints:

```
Using /home/xxx/.cache/torch_extensions as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/xxx/.cache/torch_extensions/add2/build.ninja...
Building extension module add2...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module add2...
Running cuda...
Cuda time: 53.453us
Running torch...
Torch time: 59.795us
Kernel test passed.
```
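The build log above notes that ninja picks a default worker count which the `MAX_JOBS` environment variable can override; if the one-off JIT compilation feels slow, raising it should help (the value 8 is illustrative):

```shell
MAX_JOBS=8 python run_time.py  # let ninja compile the extension with 8 parallel jobs
```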
For the setuptools build, `run_time.py` simply imports the installed `add2` module and calls it directly:

```python
# run_time.py
import time
import numpy as np
import torch
import add2

# c = a + b (shape: [n * n])
n = 1024
a = torch.rand((n, n), device="cuda:0")
b = torch.rand((n, n), device="cuda:0")
cuda_c = torch.rand((n, n), device="cuda:0")

ntest = 10

def show_time(func):
    times = list()
    res = None
    for _ in range(10):
        res = func()
    for _ in range(ntest):
        torch.cuda.synchronize(device="cuda:0")
        start_time = time.time()
        func()
        torch.cuda.synchronize(device="cuda:0")
        end_time = time.time()
        times.append((end_time - start_time) * 1e6)
    return times, res

def run_cuda():
    add2.torch_launch_add2(cuda_c, a, b, n)
    return cuda_c
```
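This variant assumes `add2` was already built and installed. For reference, a minimal `setup.py` that could produce such a module might look like the sketch below; the file layout (`include/`, `kernel/`) is taken from the JIT `load()` call later in this section and is an assumption:

```python
# setup.py -- a minimal sketch; the paths mirror the JIT sources and are assumptions
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="add2",
    include_dirs=["include"],
    ext_modules=[
        CUDAExtension(
            name="add2",
            sources=["kernel/add2_ops.cpp", "kernel/add2_kernel.cu"],
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)
```

After `python setup.py install`, `import add2` picks up the compiled extension.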
Finally, both build paths can be merged into a single `run_time.py` that picks the build via an `--compiler` flag. The `argparse` setup shown here is a minimal version implied by the branches:

```python
import time
import argparse
import numpy as np
import torch

# c = a + b (shape: [n * n])
n = 1024
a = torch.rand((n, n), device="cuda:0")
b = torch.rand((n, n), device="cuda:0")
cuda_c = torch.rand((n, n), device="cuda:0")

ntest = 10

def show_time(func):
    times = list()
    res = None
    for _ in range(10):
        res = func()
    for _ in range(ntest):
        torch.cuda.synchronize(device="cuda:0")
        start_time = time.time()
        func()
        torch.cuda.synchronize(device="cuda:0")
        end_time = time.time()
        times.append((end_time - start_time) * 1e6)
    return times, res

def run_cuda():
    if args.compiler == 'jit':
        cuda_module.torch_launch_add2(cuda_c, a, b, n)
    elif args.compiler == 'setup':
        add2.torch_launch_add2(cuda_c, a, b, n)
    else:
        raise Exception("Type of cuda compiler must be one of jit/setup.")
    return cuda_c

# minimal flag parsing implied by the branches above
parser = argparse.ArgumentParser()
parser.add_argument('--compiler', type=str, choices=['jit', 'setup'], default='jit')
args = parser.parse_args()

if args.compiler == 'jit':
    from torch.utils.cpp_extension import load
    cuda_module = load(name="add2",
                       extra_include_paths=["include"],
                       sources=["kernel/add2_ops.cpp", "kernel/add2_kernel.cu"],
                       verbose=True)
elif args.compiler == 'setup':
    import add2
else:
    raise Exception("Type of cuda compiler must be one of jit/setup.")
```
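With the flag in place, both builds are benchmarked from the same script; for example:

```shell
python run_time.py --compiler jit    # JIT-compile with torch.utils.cpp_extension.load
python run_time.py --compiler setup  # use the add2 module installed via setuptools
```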