如何用Python自动提取Word文档中以增强型图元文件（Enhanced Metafile）形式粘贴的Excel图表？

阿华AIGC实验室

2026-4-23

我完全懂你碰到的麻烦——之前用docx2txt能顺利提取普通PNG图片，但换成粘贴成增强型图元文件（EMF）的Excel图表就彻底失效，转成PDF再提取也没结果对吧？这是因为EMF作为Windows原生的矢量图格式，存储逻辑和普通位图不一样，常规工具识别不了。下面给你两个靠谱的解决方案，分别适配Windows和跨平台场景：

方案一：Windows环境下直接调用Word COM对象（最省心）

这个方法直接借助Word自身的API来导出EMF图表，兼容性拉满，毕竟是原生操作。

首先安装依赖：

pip install pywin32

然后用这段代码：

import win32com.client as win32
import os

def extract_emf_charts_from_docx(doc_path, output_folder):
    """Export every picture-type shape of a Word document as a PNG file.

    The function drives Word through COM automation (the ``win32`` module
    imported at the top of the script), opens the document, walks
    ``doc.Shapes`` and exports each shape whose ``Type`` equals 13.

    Args:
        doc_path: Path to the .docx file; made absolute before it is opened.
        output_folder: Directory that receives the exported images. It is
            created when it does not exist yet.

    Errors raised while opening the document or exporting a shape are caught
    and printed; they are not propagated to the caller.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Start Word in the background without showing a window.
    word = win32.gencache.EnsureDispatch('Word.Application')
    word.Visible = False

    # Bound before the try block so the cleanup below can tell whether the
    # document was ever opened; otherwise a failed Open() would surface as a
    # NameError in `finally` and Word would never be shut down.
    doc = None
    try:
        doc = word.Documents.Open(os.path.abspath(doc_path))

        # NOTE(review): only doc.Shapes is scanned here; pictures stored in
        # doc.InlineShapes are not visited — confirm this matches how the
        # charts were pasted.
        for idx, shape in enumerate(doc.Shapes):
            # 13 is used as the msoPicture shape type; EMF pictures are
            # expected to report this type.
            if shape.Type == 13:
                output_path = os.path.join(output_folder, f"excel_chart_{idx+1}.png")
                # NOTE(review): the format code 2 is treated as "PNG" here;
                # verify Export and this constant against the installed
                # Word object model.
                shape.Export(output_path, 2)
                print(f"已成功导出图表：{output_path}")

        print("所有EMF格式的Excel图表提取完成！")
    except Exception as e:
        print(f"提取过程中出错了：{str(e)}")
    finally:
        # Always shut Word down, even if closing the document fails, so no
        # hidden WINWORD process is left behind.
        try:
            if doc is not None:
                doc.Close(SaveChanges=False)
        finally:
            word.Quit()

# Point these at your own document and output directory, then run the export.
doc_path = "./Report.docx"
output_folder = "./Extracted_Charts"
extract_emf_charts_from_docx(doc_path=doc_path, output_folder=output_folder)

原理很简单：Word的COM对象能直接访问文档里的形状对象，包括以EMF格式粘贴的图表，再通过导出方法把这些图元文件转成常用的位图格式，从而避开常规工具识别不了EMF的问题。需要注意的是，上面的代码只遍历了doc.Shapes（浮动形状）；如果图表是以“嵌入型”版式粘贴的，它会出现在doc.InlineShapes集合里，需要另行遍历。

方案二：跨平台通用方法（适配Linux/macOS）

如果你的环境不是Windows，可以借助ODT开放格式和图片转换工具来实现，步骤稍微多一点，但胜在跨平台。

前置准备

安装LibreOffice（确保系统里有soffice命令）
安装依赖库：

pip install wand

（注：wand依赖ImageMagick，需要提前安装对应系统的ImageMagick包）

完整代码

import os
import subprocess
import zipfile
import shutil
from wand.image import Image

def docx_to_odt(doc_path, odt_output):
    """Convert a .docx file to ODT with headless LibreOffice.

    The converted file ends up at ``odt_output``. LibreOffice only lets the
    caller choose the output *directory* and names the file after the source
    document, so the produced file is moved to ``odt_output`` afterwards.

    Args:
        doc_path: Path to the source .docx file.
        odt_output: Path where the resulting .odt file must be placed.

    Raises:
        subprocess.CalledProcessError: ``soffice`` exited with a non-zero status.
        FileNotFoundError: ``soffice`` is not on PATH, or it finished without
            producing the expected .odt file.
    """
    source = os.path.abspath(doc_path)
    target = os.path.abspath(odt_output)
    out_dir = os.path.dirname(target)
    os.makedirs(out_dir, exist_ok=True)

    subprocess.run([
        "soffice",
        "--headless",  # run without a GUI
        "--convert-to", "odt",
        source,
        "--outdir", out_dir
    ], check=True)

    # soffice writes "<source stem>.odt" into out_dir, regardless of the file
    # name the caller asked for.
    stem = os.path.splitext(os.path.basename(source))[0]
    produced = os.path.join(out_dir, stem + ".odt")
    if not os.path.isfile(produced):
        raise FileNotFoundError(
            f"LibreOffice did not produce the expected file: {produced}"
        )
    if produced != target:
        os.replace(produced, target)

def extract_emf_from_odt(odt_path, temp_emf_dir):
    """Copy the EMF images stored under ``Pictures/`` in an ODT archive.

    Each matching archive member is written directly into ``temp_emf_dir``
    under its base name. The ``Pictures/`` prefix is dropped on purpose:
    ``emf_to_png`` lists its input directory non-recursively, so files left in
    a ``Pictures`` sub-folder would never be converted.

    Args:
        odt_path: Path to the .odt file (a zip archive).
        temp_emf_dir: Directory that receives the EMF files; created when
            missing.
    """
    os.makedirs(temp_emf_dir, exist_ok=True)
    with zipfile.ZipFile(odt_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            member = file_info.filename
            if not member.startswith("Pictures/"):
                continue
            # Match the extension case-insensitively so ".EMF" is picked up too.
            if not member.lower().endswith(".emf"):
                continue
            flat_path = os.path.join(temp_emf_dir, os.path.basename(member))
            with open(flat_path, "wb") as dst:
                dst.write(zip_ref.read(file_info))

def emf_to_png(emf_dir, output_dir):
    """Convert each ``.emf`` file found directly in ``emf_dir`` to PNG.

    Args:
        emf_dir: Directory whose top-level entries are scanned for names
            ending in ".emf".
        output_dir: Directory that receives the PNG files (same base name,
            ".png" extension); created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    emf_names = [entry for entry in os.listdir(emf_dir) if entry.endswith(".emf")]
    for entry in emf_names:
        stem, _ext = os.path.splitext(entry)
        source = os.path.join(emf_dir, entry)
        target = os.path.join(output_dir, stem + ".png")
        # The output format is chosen by the ".png" extension of the target.
        with Image(filename=source) as picture:
            picture.save(filename=target)
        print(f"已转换EMF为PNG：{target}")

# Paths used by the pipeline
source_doc = "./Report.docx"
temp_odt = "./temp_report.odt"
temp_emf_folder = "./temp_emf_files"
final_output_folder = "./Extracted_Charts"

# Run the full pipeline. The intermediates are removed in `finally` so a
# failure in any step does not leave temporary files behind.
try:
    docx_to_odt(source_doc, temp_odt)
    extract_emf_from_odt(temp_odt, temp_emf_folder)
    emf_to_png(temp_emf_folder, final_output_folder)
finally:
    # Optional cleanup — remove these lines to keep the intermediates.
    # Both removals tolerate a missing target, because an early failure may
    # mean the folder or the ODT file was never created.
    shutil.rmtree(temp_emf_folder, ignore_errors=True)
    if os.path.exists(temp_odt):
        os.remove(temp_odt)