如何在Julia中高效读取XYZ格式文本文件？

阿华AIGC实验室

2026-5-9

Optimizing XYZ File Parsing in Julia for Large Trajectories

I totally get the frustration with slow parsing and excessive GC when dealing with huge XYZ files—those multi-million frame trajectories can eat up memory and time faster than you’d expect. Let’s break down the issues in your current code and fix them step by step.

First: Fix a Critical Bug in Your Current Code

Your enumerate loop has a big problem: reassigning the loop variable i inside the loop body (with i += 1) has no effect on the iteration, because enumerate simply overwrites i with the next index on the following pass. This means your code is reprocessing lines over and over, which is a major contributor to the slowdown and excessive allocations. That’s why even your pre-allocation approach was slower—this bug was undermining everything.

Key Optimizations for Speed and Memory Efficiency

Here are the changes that will make a huge difference:

1. Stream the File Instead of Loading It All at Once

readlines loads every line into memory upfront, which is unnecessary (and memory-heavy) for large files. Use eachline or paired open/readline calls to process lines one at a time, keeping your memory footprint low.

2. Pre-Allocate Arrays (Properly)

Instead of starting with empty arrays and push!ing (which causes repeated resizing), first scan the file to count the number of frames and per-frame atom counts. Then pre-allocate all your output arrays to the exact size you need. This eliminates most garbage collection from dynamic resizing.

3. Avoid Unnecessary Intermediate Objects

Skip string concatenation for headers: store the atom count and comment line as a tuple instead of merging them into one string (saves memory and avoids redundant string operations).
If your trajectory has identical atom labels across all frames (super common in MD simulations), only store the labels once instead of per frame—this cuts memory usage drastically.
Minimize temporary arrays from split: process split results directly without storing them unnecessarily.

4. Speed Up Coordinate Parsing

Use vectorized parsing where possible, and avoid repeated parse calls for individual values by parsing entire coordinate chunks at once.

Optimized Code

First, let's handle the common case where all frames have the same number of atoms and atom labels:

"""
    read_xyz_fixed(ifile::String)

Read a multi-frame XYZ trajectory in which every frame has the same number of atoms.

The file is streamed twice: the first pass counts the frames and captures the atom
count and atom labels of the first frame; the second pass fills pre-allocated headers
and coordinate matrices. Blank lines and lines between frames that do not parse as an
integer are ignored.

# Returns
A tuple `(headers, atom_labels, geoms)`:
- `headers::Vector{Tuple{Int,String}}`: per-frame `(atom_count, comment)`.
- `atom_labels::Vector{Vector{String}}`: one entry per frame. Every entry is the *same*
  vector object (the labels of the first frame), so mutating one entry mutates all.
- `geoms::Vector{Matrix{Float64}}`: per-frame `3 × atom_count` matrix, one column per
  atom, taken from fields 2 to 4 of each atom line.

# Throws
- `ArgumentError` if a later frame declares a different atom count than the first
  frame (such a file must be read with a variable-count reader instead), or if a
  coordinate field is not a valid `Float64`.
- `BoundsError` if an atom line has fewer than four whitespace-separated fields.
"""
function read_xyz_fixed(ifile::String)
    # Both passes run in helper functions that receive the IO handle. Counters that
    # are reassigned inside a `do` closure would be boxed and make the loops slow.
    n_frames, atom_count, base_labels = open(_scan_xyz_fixed, ifile, "r")

    headers = Vector{Tuple{Int, String}}(undef, n_frames)
    # Share one label vector across all frames instead of copying it per frame.
    atom_labels = fill(base_labels, n_frames)
    geoms = Matrix{Float64}[zeros(Float64, 3, atom_count) for _ in 1:n_frames]

    open(ifile, "r") do io
        _fill_xyz_fixed!(headers, geoms, io, atom_count)
    end

    return headers, atom_labels, geoms
end

# First pass: count frames, take atom count and labels from the first frame, and check
# that every later frame declares the same atom count.
function _scan_xyz_fixed(io::IO)
    n_frames = 0
    atom_count = 0
    base_labels = String[]
    while !eof(io)
        line = strip(readline(io))
        isempty(line) && continue
        natoms = tryparse(Int, line)
        natoms === nothing && continue  # not a frame header; ignore
        n_frames += 1
        if n_frames == 1
            atom_count = natoms
            readline(io)  # comment line is not needed in this pass
            for _ in 1:atom_count
                # `split` without a delimiter already discards surrounding whitespace.
                fields = split(readline(io))
                push!(base_labels, fields[1])
            end
        else
            # Skipping `atom_count` lines is only valid if this frame has that many.
            natoms == atom_count || throw(ArgumentError(
                "frame $n_frames declares $natoms atoms, expected $atom_count"))
            readline(io)
            for _ in 1:atom_count
                readline(io)
            end
        end
    end
    return n_frames, atom_count, base_labels
end

# Second pass: store each frame's header and parse its coordinates into `geoms`.
function _fill_xyz_fixed!(headers, geoms, io::IO, atom_count::Int)
    frame_idx = 0
    while !eof(io)
        line = strip(readline(io))
        isempty(line) && continue
        natoms = tryparse(Int, line)
        natoms === nothing && continue
        frame_idx += 1
        headers[frame_idx] = (natoms, String(strip(readline(io))))
        geom = geoms[frame_idx]
        for j in 1:atom_count
            fields = split(readline(io))
            # Parse straight into the matrix; no temporary slice or result vector.
            for k in 1:3
                geom[k, j] = parse(Float64, fields[k + 1])
            end
        end
    end
    return nothing
end

If you need to handle variable atom counts per frame (less common but possible), here's a modified version:

"""
    read_xyz_variable(ifile::String)

Read a multi-frame XYZ trajectory whose frames may have different numbers of atoms.

The file is streamed twice: the first pass records the atom count declared by each
frame; the second pass fills pre-allocated headers, label vectors and coordinate
matrices. Blank lines and lines between frames that do not parse as an integer are
ignored.

# Returns
A tuple `(headers, atom_labels, geoms)`:
- `headers::Vector{Tuple{Int,String}}`: per-frame `(atom_count, comment)`.
- `atom_labels::Vector{Vector{String}}`: per-frame atom labels (first field of each
  atom line).
- `geoms::Vector{Matrix{Float64}}`: per-frame `3 × atom_count` matrix, one column per
  atom, taken from fields 2 to 4 of each atom line.

# Throws
- `ArgumentError` if a coordinate field is not a valid `Float64`.
- `BoundsError` if an atom line has fewer than four whitespace-separated fields.
"""
function read_xyz_variable(ifile::String)
    # Both passes run in helper functions that receive the IO handle. Counters that
    # are reassigned inside a `do` closure would be boxed and make the loops slow.
    frame_atom_counts = open(_scan_xyz_counts, ifile, "r")
    n_frames = length(frame_atom_counts)

    headers = Vector{Tuple{Int, String}}(undef, n_frames)
    atom_labels = Vector{String}[Vector{String}(undef, n) for n in frame_atom_counts]
    geoms = Matrix{Float64}[zeros(Float64, 3, n) for n in frame_atom_counts]

    open(ifile, "r") do io
        _fill_xyz_variable!(headers, atom_labels, geoms, io)
    end

    return headers, atom_labels, geoms
end

# First pass: collect the atom count declared by each frame, skipping frame bodies.
function _scan_xyz_counts(io::IO)
    frame_atom_counts = Int[]
    while !eof(io)
        line = strip(readline(io))
        isempty(line) && continue
        natoms = tryparse(Int, line)
        natoms === nothing && continue  # not a frame header; ignore
        push!(frame_atom_counts, natoms)
        readline(io)  # comment line
        for _ in 1:natoms
            readline(io)
        end
    end
    return frame_atom_counts
end

# Second pass: store each frame's header, labels and coordinates in the output arrays.
function _fill_xyz_variable!(headers, atom_labels, geoms, io::IO)
    frame_idx = 0
    while !eof(io)
        line = strip(readline(io))
        isempty(line) && continue
        natoms = tryparse(Int, line)
        natoms === nothing && continue
        frame_idx += 1
        headers[frame_idx] = (natoms, String(strip(readline(io))))
        labels = atom_labels[frame_idx]
        geom = geoms[frame_idx]
        for j in 1:natoms
            # `split` without a delimiter already discards surrounding whitespace.
            fields = split(readline(io))
            labels[j] = fields[1]
            # Parse straight into the matrix; no temporary slice or result vector.
            for k in 1:3
                geom[k, j] = parse(Float64, fields[k + 1])
            end
        end
    end
    return nothing
end

Why This Works Better

Eliminated repeated line processing: The fixed loop logic ensures we only read each line twice (once for counting, once for parsing) instead of reprocessing lines due to the original bug.
Minimal allocations: Pre-allocated arrays eliminate the overhead of push! and dynamic resizing, which was a huge source of GC.
Lower memory usage: Reusing atom labels (when possible) cuts down on redundant data storage by tens of gigabytes for large trajectories.
Streamed reading: Using open and readline avoids loading the entire 650MB file into memory at once.