PyTorch/LibTorch中间隔执行GPU模型推理时耗时显著增加的原因探究

阿华AIGC实验室

2026-4-14

最近我碰到了一个挺费解的问题——在PyTorch和LibTorch环境下，让GPU模型隔一段时间再跑推理，耗时会比连续执行时高很多，甚至能差几十上百倍！我做了好几组测试，下面跟大家唠唠具体情况和背后的原因。

一、Python下的测试现象

我写了一个最小可复现的Python测试代码，用来对比连续推理和间隔推理的耗时差异：

import torch
import torch.nn as nn
import time

class Net(nn.Module):
    """Small convolutional network with a shared trunk and three linear heads.

    The trunk is two 3x3 convolutions (4 -> 32 -> 64 channels), each followed
    by ReLU, and an adaptive average pool down to 1x1. The pooled 64-dim
    feature vector feeds a 400-way policy head, a scalar value head and a
    400-way action-value head.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as they are applied.
        first_conv = nn.Conv2d(4, 32, kernel_size=3, padding=1)
        second_conv = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.shared_layers = nn.Sequential(
            first_conv,
            nn.ReLU(),
            second_conv,
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )

        self.policy_head = nn.Linear(64, 400)
        self.value_head = nn.Linear(64, 1)
        self.action_value_head = nn.Linear(64, 400)

    def forward(self, x):
        """Return the ``(policy, value, action_values)`` tuple for batch ``x``."""
        pooled = self.shared_layers(x)
        # Collapse everything after the batch dimension into one feature axis.
        features = pooled.view(pooled.size(0), -1)
        heads = (self.policy_head, self.value_head, self.action_value_head)
        return tuple(head(features) for head in heads)

def time_forward(model, states, device):
    """Time a single forward pass of ``model`` on ``states``.

    Args:
        model: Callable module to run.
        states: Input tensor that already lives on ``device``.
        device: ``torch.device`` on which the model and input reside.

    Returns:
        A ``(gpu_seconds, wall_seconds)`` tuple. ``gpu_seconds`` is the time
        between two CUDA events recorded around the forward pass, or ``None``
        when ``device`` is not a CUDA device. ``wall_seconds`` is the host
        wall-clock time, including the final device synchronization.
    """
    use_cuda = device.type == "cuda"
    if use_cuda:
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

    # perf_counter is monotonic, so it is safe for measuring short intervals.
    start_time = time.perf_counter()
    if use_cuda:
        start_event.record()

    # Inference only: skip autograd bookkeeping so it is not part of the timing.
    with torch.no_grad():
        model(states)

    if use_cuda:
        end_event.record()
        # Kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize()
    wall_seconds = time.perf_counter() - start_time

    gpu_seconds = None
    if use_cuda:
        gpu_seconds = start_event.elapsed_time(end_event) / 1000  # ms -> s
    return gpu_seconds, wall_seconds


def run_benchmark(model, device, sleep_seconds, iterations=10,
                  input_shape=(128, 4, 20, 20)):
    """Run ``iterations`` timed forward passes, idling before each one.

    Args:
        model: Callable module to benchmark.
        device: ``torch.device`` to create the inputs on.
        sleep_seconds: Idle time inserted before every forward pass; no sleep
            is performed when this is not positive.
        iterations: Number of forward passes to time.
        input_shape: Shape of the random input generated for each pass.

    Returns:
        A list with one ``(gpu_seconds, wall_seconds)`` tuple per iteration.
    """
    use_cuda = device.type == "cuda"
    results = []
    for _ in range(iterations):
        states = torch.rand(*input_shape, device=device)
        if use_cuda:
            # Finish generating the input before idling, so the random-number
            # kernel is not billed to the forward pass and the idle gap stays
            # free of GPU work.
            torch.cuda.synchronize()
        if sleep_seconds > 0:
            time.sleep(sleep_seconds)

        gpu_seconds, wall_seconds = time_forward(model, states, device)
        if gpu_seconds is not None:
            print(f"time cost counted by gpu: {gpu_seconds:.4f} second")
        print(f"time cost counted by cpu：{wall_seconds:.4f} second")
        results.append((gpu_seconds, wall_seconds))
    return results


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("using device: ", device)
    model = Net().to(device)
    model.eval()

    # Same measurement with increasingly long idle gaps between inferences.
    for pause in (0, 1, 10, 60):
        print("\nnow no sleep:" if pause == 0 else f"\nnow sleep {pause}:")
        run_benchmark(model, device, pause)

运行后的结果很直观：

using device:  cuda

now no sleep:
time cost counted by gpu: 0.2766 second
time cost counted by cpu：0.2768 second
time cost counted by gpu: 0.0012 second
time cost counted by cpu：0.0013 second
time cost counted by gpu: 0.0004 second
time cost counted by cpu：0.0004 second
...（后续稳定在0.0003-0.0004s）

now sleep 1:
time cost counted by gpu: 0.0005 second
time cost counted by cpu：0.0007 second
...（稳定在0.0006-0.0009s）

now sleep 10:
time cost counted by gpu: 0.0007 second
time cost counted by cpu：0.0009 second
...（稳定在0.0007-0.0016s）

now sleep 60:
time cost counted by gpu: 0.0014 second
time cost counted by cpu：0.0101 second
...（稳定在0.0011-0.0035s，CPU计时甚至到0.02s）

能明显看到：

连续推理时，除了第一次的"预热"（warm-up）耗时，后续稳定在极低水平；
间隔时间越长，推理耗时越高：sleep 60s后的CPU计时（约0.01s）是连续推理（约0.0004s）的25倍以上，GPU计时（0.0011-0.0035s）也升高到数倍。

二、C++ LibTorch下的放大现象

在C++用LibTorch写的程序里，这个差异更夸张。核心代码片段如下：

#include <chrono>
#include <iostream>
#include <memory>
#include <thread>

#include <torch/script.h>
#include <torch/torch.h>

std::shared_ptr<c10::IValue> results_tensor = std::make_shared<c10::IValue>();
std::shared_ptr<torch::jit::script::Module> model;

int main() {
    // 初始化模型（假设已提前加载到GPU）
    model = torch::jit::load("model.pt").to(torch::kCUDA);
    model->eval();

    // 构造输入张量
    torch::Tensor input = torch::rand({128, 4, 20, 20}).to(torch::kCUDA);

    std::cout << "连续循环推理耗时：" << std::endl;
    for(int i=0; i<10; i++) {
        auto start = std::chrono::high_resolution_clock::now();
        *results_tensor = model->forward({input});
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end - start;
        std::cout << "Inference time: " << duration.count() << "s" << std::endl;
    }

    std::cout << "\n间隔1秒推理耗时：" << std::endl;
    for(int i=0; i<10; i++) {
        std::this_thread::sleep_for(std::chrono::seconds(1));
        auto start = std::chrono::high_resolution_clock::now();
        *results_tensor = model->forward({input});
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end - start;
        std::cout << "Inference time: " << duration.count() << "s" << std::endl;
    }

    return 0;
}

运行结果：

连续循环推理耗时：
Inference time: 0.00012s
Inference time: 0.00011s
...（稳定在0.0001s左右）

间隔1秒推理耗时：
Inference time: 0.0103s
Inference time: 0.0099s
...（稳定在0.01s左右）

连续推理和间隔推理的耗时差了整整100倍！