You need to enable JavaScript to run this app.
最新活动
大模型
产品
解决方案
定价
生态与合作
支持与服务
开发者
了解我们

PyTorch/LibTorch中间隔执行GPU模型推理时耗时显著增加的原因探究

PyTorch/LibTorch中间隔执行GPU模型推理时耗时显著增加的原因探究

最近我碰到了一个挺费解的问题——在PyTorch和LibTorch环境下,让GPU模型隔一段时间再跑推理,耗时会比连续执行时高很多,甚至能差几十上百倍!我做了好几组测试,下面跟大家唠唠具体情况和背后的原因。


一、Python下的测试现象

我写了一个最小可复现的Python测试代码,用来对比连续推理和间隔推理的耗时差异:

import torch
import torch.nn as nn
import time

class Net(nn.Module):
    """Small convolutional network with three linear output heads.

    A shared trunk (two 3x3 convolutions, each followed by ReLU, then
    adaptive average pooling to 1x1) yields 64 features per sample. Those
    features feed three independent linear heads.
    """

    def __init__(self):
        super().__init__()

        # Shared feature extractor: 4 -> 32 -> 64 channels, pooled to 1x1.
        trunk = [
            nn.Conv2d(4, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        ]
        self.shared_layers = nn.Sequential(*trunk)

        # Output heads, all fed by the same 64-value feature vector.
        self.policy_head = nn.Linear(64, 400)
        self.value_head = nn.Linear(64, 1)
        self.action_value_head = nn.Linear(64, 400)

    def forward(self, x):
        """Return the ``(policy, value, action_values)`` tuple for ``x``."""
        pooled = self.shared_layers(x)
        # Collapse everything after the leading dimension into one axis.
        features = pooled.view(pooled.size(0), -1)
        return (
            self.policy_head(features),
            self.value_head(features),
            self.action_value_head(features),
        )

def time_forward(model, states, device):
    """Time a single forward pass of ``model`` on ``states``.

    Args:
        model: Callable module to run.
        states: Input tensor, already placed on ``device``.
        device: Device the model and input live on (``torch.device`` or str).

    Returns:
        ``(gpu_seconds, cpu_seconds)``. ``gpu_seconds`` is the elapsed time
        between two CUDA events recorded around the forward pass, or
        ``None`` when ``device`` is not a CUDA device. ``cpu_seconds`` is the
        host wall-clock time, taken after synchronizing on CUDA.
    """
    on_cuda = torch.device(device).type == "cuda"

    # perf_counter is monotonic and high resolution, unlike time.time().
    start_time = time.perf_counter()
    if on_cuda:
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()

    # Inference only: skip autograd bookkeeping so it is not part of the timing.
    with torch.no_grad():
        outputs = model(states)  # kept alive until timing has finished

    gpu_seconds = None
    if on_cuda:
        end_event.record()
        # Kernels run asynchronously; wait before reading either clock.
        torch.cuda.synchronize()
    cpu_seconds = time.perf_counter() - start_time
    if on_cuda:
        gpu_seconds = start_event.elapsed_time(end_event) / 1000  # ms -> s
    del outputs
    return gpu_seconds, cpu_seconds


def run_benchmark(model, device, sleep_seconds=0, iterations=10,
                  input_shape=(128, 4, 20, 20)):
    """Run ``iterations`` timed forward passes, idling before each one.

    Args:
        model: Callable module to benchmark.
        device: Device to create the random input on.
        sleep_seconds: Idle time before each forward pass; ``0`` disables it.
        iterations: Number of timed forward passes.
        input_shape: Shape of the random input tensor.

    Returns:
        List of ``(gpu_seconds, cpu_seconds)`` tuples, one per iteration.
    """
    timings = []
    for _ in range(iterations):
        states = torch.rand(*input_shape, device=device)
        if sleep_seconds > 0:
            time.sleep(sleep_seconds)
        gpu_seconds, cpu_seconds = time_forward(model, states, device)
        if gpu_seconds is not None:
            print(f"time cost counted by gpu: {gpu_seconds:.4f} second")
        print(f"time cost counted by cpu:{cpu_seconds:.4f} second")
        timings.append((gpu_seconds, cpu_seconds))
    return timings


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("using device: ", device)
    model = Net().to(device)
    model.eval()

    print("\nnow no sleep:")
    run_benchmark(model, device)

    # Same measurement with progressively longer idle gaps before each pass.
    for pause in (1, 10, 60):
        print(f"\nnow sleep {pause}:")
        run_benchmark(model, device, sleep_seconds=pause)

运行后的结果很直观:

using device:  cuda

now no sleep:
time cost counted by gpu: 0.2766 second
time cost counted by cpu:0.2768 second
time cost counted by gpu: 0.0012 second
time cost counted by cpu:0.0013 second
time cost counted by gpu: 0.0004 second
time cost counted by cpu:0.0004 second
...(后续稳定在0.0003-0.0004s)

now sleep 1:
time cost counted by gpu: 0.0005 second
time cost counted by cpu:0.0007 second
...(稳定在0.0006-0.0009s)

now sleep 10:
time cost counted by gpu: 0.0007 second
time cost counted by cpu:0.0009 second
...(稳定在0.0007-0.0016s)

now sleep 60:
time cost counted by gpu: 0.0014 second
time cost counted by cpu:0.0101 second
...(稳定在0.0011-0.0035s,CPU计时甚至到0.02s)

能明显看到:

  • 连续推理时,除了第一次的"暖身"耗时,后续稳定在极低水平;
  • 间隔时间越长,推理耗时越高:按CPU计时,sleep 60s后的耗时(约0.01s)是连续推理(约0.0004s)的25倍以上。

二、C++ LibTorch下的放大现象

在C++用LibTorch写的程序里,这个差异更夸张。核心代码片段如下:

#include <chrono>
#include <iostream>
#include <thread>

#include <torch/script.h>
#include <torch/torch.h>

std::shared_ptr<c10::IValue> results_tensor = std::make_shared<c10::IValue>();
std::shared_ptr<torch::jit::script::Module> model;

int main() {
    // 初始化模型(假设已提前加载到GPU)
    model = torch::jit::load("model.pt").to(torch::kCUDA);
    model->eval();

    // 构造输入张量
    torch::Tensor input = torch::rand({128, 4, 20, 20}).to(torch::kCUDA);

    std::cout << "连续循环推理耗时:" << std::endl;
    for(int i=0; i<10; i++) {
        auto start = std::chrono::high_resolution_clock::now();
        *results_tensor = model->forward({input});
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end - start;
        std::cout << "Inference time: " << duration.count() << "s" << std::endl;
    }

    std::cout << "\n间隔1秒推理耗时:" << std::endl;
    for(int i=0; i<10; i++) {
        std::this_thread::sleep_for(std::chrono::seconds(1));
        auto start = std::chrono::high_resolution_clock::now();
        *results_tensor = model->forward({input});
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end - start;
        std::cout << "Inference time: " << duration.count() << "s" << std::endl;
    }

    return 0;
}

运行结果:

连续循环推理耗时:
Inference time: 0.00012s
Inference time: 0.00011s
...(稳定在0.0001s左右)

间隔1秒推理耗时:
Inference time: 0.0103s
Inference time: 0.0099s
...(稳定在0.01s左右)

连续推理和间隔推理的耗时差了整整100倍!


三、背后的核心原因:GPU的动态功耗管理

这个现象的本质是GPU的动态功耗管理(DPM)机制。为了节能和控温,现代GPU在闲置时会自动降低核心/显存频率,甚至切换到低功耗待机状态。当突然有计算任务到来时,GPU需要从低功耗状态"唤醒":调整电压、提升频率到工作状态,这个过程需要额外的硬件响应时间,导致推理耗时骤增。

而连续执行推理时,GPU一直处于活跃状态,会维持在最高工作频率,所以每次推理的耗时都稳定在极低水平。

测试结果也与这个逻辑吻合:闲置时间越长(sleep越久),GPU降频的幅度越大,恢复到高频需要的时间越多,所以推理耗时增加得越明显。


四、验证与解决思路

如果想确认这个结论,可以尝试强制GPU保持最高性能模式:比如NVIDIA显卡可以在NVIDIA控制面板的「3D设置」中,把「电源管理模式」设为「最高性能优先」,或者用nvidia-smi命令行锁频(例如 `nvidia-smi -lgc <最低频率>,<最高频率>` 锁定核心频率,测试完成后用 `nvidia-smi -rgc` 恢复默认)。设置后再测试,间隔推理的耗时会和连续推理几乎无差异。

如果业务场景需要间隔执行GPU推理,又想避免耗时波动,还可以定期发送小的"暖身"任务(比如空推理或者小张量推理),让GPU保持活跃状态,避免进入低功耗模式。


备注:内容来源于Stack Exchange,提问作者HelpMePlease

火山引擎 最新活动