Flux2 inference on 2 GPUs only runs on one card: question and request for a solution

Problem description

I'm trying to run FLUX.2-dev image-generation inference on 2 A100 GPUs. During loading, device_map="balanced" does place the model weights on both GPUs (clearly visible from the memory footprint), but while the inference steps actually execute, watch nvidia-smi shows compute load on only one of the GPUs while the other sits almost idle. How can I get both GPUs to genuinely participate in the Flux2 inference?
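
Besides watch nvidia-smi, I also polled per-GPU utilization from inside Python while the pipeline was running; here is the minimal monitor I used (a sketch; torch.cuda.utilization queries NVML under the hood, so it needs the pynvml package installed):

import threading
import time

import torch

def monitor_gpu_utilization(stop_event, interval=1.0):
    # Print per-GPU compute utilization once per interval until stopped.
    # torch.cuda.utilization() reports the NVML "GPU busy" percentage.
    while not stop_event.is_set():
        usage = [torch.cuda.utilization(i) for i in range(torch.cuda.device_count())]
        print(" | ".join(f"GPU {i}: {u:3d}%" for i, u in enumerate(usage)))
        time.sleep(interval)

# Start the monitor in a background thread around the pipe(...) call:
stop = threading.Event()
threading.Thread(target=monitor_gpu_utilization, args=(stop,), daemon=True).start()
# ... run the generation here; during the denoising loop I consistently see
# one GPU near 100% while the other stays near 0%.
stop.set()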

My implementation code

import torch
from diffusers import Flux2Pipeline
from accelerate import PartialState  # note: imported but currently unused (see the sketch at the end of this post)
import argparse
from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description='Generate images using FLUX.2-dev with multi-GPU support')
    parser.add_argument('--prompt', type=str, default="Futuristic city", help='Text prompt for image generation')
    parser.add_argument('--output', type=str, default='flux2_output2.png', help='Output image filename')
    parser.add_argument('--steps', type=int, default=28, help='Number of inference steps (default: 28, max recommended: 50)')
    parser.add_argument('--guidance-scale', type=float, default=4.0, help='Guidance scale for generation (default: 4.0)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility')
    parser.add_argument('--height', type=int, default=1024, help='Output image height')
    parser.add_argument('--width', type=int, default=1024, help='Output image width')
    args = parser.parse_args()

    print("=" * 80)
    print("FLUX.2-dev Image Generation")
    print("=" * 80)
    print(f"\nPrompt: {args.prompt}")
    print(f"Output: {args.output}")
    print(f"Steps: {args.steps}")
    print(f"Guidance Scale: {args.guidance_scale}")
    print(f"Seed: {args.seed}")
    print(f"Size: {args.width}x{args.height}")
    print("\n" + "=" * 80)

    # Model repository
    model_id = "black-forest-labs/FLUX.2-dev"
    print("\nLoading FLUX.2-dev model...")
    print("This will distribute the model across your 2 A100 GPUs automatically...")

    # Load the pipeline with device_map="balanced" to distribute across GPUs
    # Using bfloat16 for A100s (optimal precision)
    pipe = Flux2Pipeline.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="balanced"  # Distributes model across available GPUs
    )

    # Slice the attention computation to reduce peak memory usage
    pipe.enable_attention_slicing()

    print("\n Model loaded successfully!")
    print(f" Model distributed across GPUs: {torch.cuda.device_count()} GPUs detected")

    # Print GPU memory allocation
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**3
        reserved = torch.cuda.memory_reserved(i) / 1024**3
        print(f" GPU {i}: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

    print("\nGenerating image...")

    # Set up generator for reproducibility
    # Note: For multi-GPU, we set the generator on cuda:0
    generator = torch.Generator(device="cuda:0").manual_seed(args.seed)

    # Generate image
    output = pipe(
        prompt=args.prompt,
        height=args.height,
        width=args.width,
        num_inference_steps=args.steps,
        guidance_scale=args.guidance_scale,
        generator=generator,
    )

    image = output.images[0]

    # Save the image
    output_path = Path(args.output)
    image.save(output_path)

    print(f"\nImage generated successfully!")
    print(f"Saved to: {output_path.absolute()}")

    # Print final GPU memory usage
    print("\nFinal GPU Memory Usage:")
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**3
        reserved = torch.cuda.memory_reserved(i) / 1024**3
        print(f" GPU {i}: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

    print("\n" + "=" * 80)

if __name__ == "__main__":
    main()

Dependencies (requirements.txt)

torch>=2.0.0
diffusers>=0.32.0
transformers>=4.40.0
accelerate>=0.26.0
bitsandbytes>=0.43.0
sentencepiece>=0.1.99
protobuf>=3.20.0
Pillow>=10.0.0
huggingface_hub>=0.20.0

What I've tried

  • Loaded the model with device_map="balanced" and confirmed the weights are distributed across both GPUs (see the placement-check sketch after this list)
  • Enabled pipe.enable_attention_slicing() to optimize memory usage
  • Verified that PyTorch, accelerate, and the other dependencies meet the version requirements, and that the CUDA environment runs correctly
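
This is the helper I used to confirm the placement; a small sketch that relies on the pipeline exposing hf_device_map when it is loaded with a device_map, and on walking pipe.components to read each module's device:

def print_component_placement(pipe):
    # hf_device_map maps pipeline components to GPU indices when the
    # pipeline was loaded with device_map (e.g. {'transformer': 0, ...}).
    print("hf_device_map:", getattr(pipe, "hf_device_map", None))
    for name, component in pipe.components.items():
        # Only nn.Module components (transformer, text_encoder, vae, ...)
        # carry a .device attribute; tokenizers and schedulers do not.
        if hasattr(component, "device"):
            print(f"  {name}: {component.device}")

Its output confirms the weights are spread over cuda:0 and cuda:1, yet only one card shows compute load during the denoising loop.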

Help wanted

Is there a way to make Flux2's inference actually use the compute of both GPUs? For example, do I need to adjust specific model-parallelism settings, or should I load the pipeline in a different way, such as the data-parallel pattern sketched below?
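
For instance, I've been wondering whether a data-parallel pattern like the following is the intended approach; a rough, untested sketch using accelerate's PartialState (imported but unused in my script above), meant to be launched with accelerate launch --num_processes 2. Note it assumes each A100 can hold a full copy of the pipeline in bfloat16, which I'm not sure holds for FLUX.2-dev:

import torch
from accelerate import PartialState
from diffusers import Flux2Pipeline

# One process per GPU; each loads its own full copy of the pipeline
# onto its own device instead of splitting one copy across both cards.
state = PartialState()
pipe = Flux2Pipeline.from_pretrained(
    "black-forest-labs/FLUX.2-dev",
    torch_dtype=torch.bfloat16,
).to(state.device)

prompts = ["Futuristic city", "A quiet mountain lake at dawn"]

# Each rank receives its own slice of the prompt list, so both GPUs run
# the full denoising loop concurrently on different prompts.
with state.split_between_processes(prompts) as subset:
    for i, prompt in enumerate(subset):
        image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0]
        image.save(f"flux2_rank{state.process_index}_{i}.png")

Would that be the right direction, or is there a way to get genuine tensor/model parallelism inside a single generation?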
