如何使用Python免费库提取.docx自定义列表样式中的编号（如F23、R10）

阿华AIGC实验室

2026-4-14

我太懂你这种头疼了——好不容易找到能提取文本的库，结果关键的自定义编号（Fxx、Rxx）全丢了，付费库又用不起，免费库的表层用法还搞不定。下面给你分享两个亲测有效的免费Python方案，都是基于开源工具，完全符合业务场景需求：

方案一：用python-docx深入解析列表编号定义

你之前可能只用了python-docx的表层para.text提取文本，但其实可以通过它的XML底层API拿到自定义编号的规则。先安装依赖：

pip install python-docx

然后用这段代码就能把编号和文本一起捞出来：

from docx import Document

def _lookup_level_format(numbering_root, p_element):
    """Return the ``w:lvlText`` template (e.g. ``"F%1"``) for a paragraph.

    Args:
        numbering_root: Root element of ``numbering.xml`` or ``None`` when the
            document has no numbering part.
        p_element: The paragraph's underlying ``w:p`` element.

    Returns:
        The level-text template string, or ``None`` when the paragraph carries
        no direct ``w:numPr`` or the definition cannot be resolved.
    """
    if numbering_root is None:
        return None

    num_ids = p_element.xpath('./w:pPr/w:numPr/w:numId/@w:val')
    if not num_ids or not num_ids[0].isdigit():
        return None
    # w:ilvl is optional inside w:numPr; fall back to the first level.
    levels = p_element.xpath('./w:pPr/w:numPr/w:ilvl/@w:val')
    level = levels[0] if levels and levels[0].isdigit() else "0"

    # A w:num only points at a w:abstractNum; the level definitions (and
    # therefore w:lvlText) live on the abstract definition.
    abstract_ids = numbering_root.xpath(
        f'./w:num[@w:numId="{num_ids[0]}"]/w:abstractNumId/@w:val'
    )
    if not abstract_ids or not abstract_ids[0].isdigit():
        return None
    formats = numbering_root.xpath(
        f'./w:abstractNum[@w:abstractNumId="{abstract_ids[0]}"]'
        f'/w:lvl[@w:ilvl="{level}"]/w:lvlText/@w:val'
    )
    return formats[0] if formats else None


def extract_custom_list_items(doc_path):
    """Extract "Finding" / "Recommendation" paragraphs with their list numbers.

    Each matching paragraph is numbered by its 1-based position among the
    paragraphs of the same style. When the paragraph has its own ``w:numPr``
    and the numbering definition can be resolved, the ``%1`` placeholder of
    the level text is replaced by that position; otherwise the first letter
    of the style name is used as the prefix.

    Args:
        doc_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        A list of strings of the form ``"<number>. <paragraph text>"`` in
        document order.
    """
    target_styles = ("Finding", "Recommendation")
    doc = Document(doc_path)
    extracted_content = []

    # doc.paragraphs creates fresh Paragraph proxies on every access and they
    # compare by identity, so list.index() across two accesses never matches.
    # Running per-style counters give the same position in a single pass.
    counters = dict.fromkeys(target_styles, 0)

    # NOTE(review): python-docx appears unable to create a numbering part for
    # documents that lack one; treat that case as "no definitions available".
    try:
        numbering_root = doc.part.numbering_part.element
    except (KeyError, NotImplementedError):
        numbering_root = None

    for para in doc.paragraphs:
        style_name = para.style.name
        if style_name not in counters:
            continue

        counters[style_name] += 1
        idx = counters[style_name]

        level_format = _lookup_level_format(numbering_root, para._p)
        if level_format is None:
            # No usable numbering definition: fall back to "<initial><n>".
            extracted_content.append(f"{style_name[0]}{idx}. {para.text}")
            continue

        # Only the first-level placeholder is substituted, as before.
        number_text = level_format.replace("%1", str(idx))
        extracted_content.append(f"{number_text}. {para.text}")

    return extracted_content

# Usage example: print every extracted item on its own line
numbered_items = extract_custom_list_items("your_target_doc.docx")
for line in numbered_items:
    print(line)

这个代码的核心是：先把目标样式的段落分组，然后通过python-docx的内部XML解析，拿到自定义编号的格式模板（比如F%1），再结合段落在同样式列表中的位置，替换占位符生成真实编号，最后和段落文本拼接。

方案二：直接解析docx的底层XML（最可控）

如果python-docx的封装还是满足不了你的特殊情况，那就直接操作docx的本质——压缩包里的XML文件。这个方案完全不依赖高层库的限制，灵活性拉满。先安装依赖：

pip install lxml

代码示例如下：

import zipfile
from lxml import etree

def extract_custom_numbers_via_raw_xml(doc_path):
    """Extract numbered "Finding" / "Recommendation" paragraphs from raw XML.

    The numbering rules are read from ``word/numbering.xml``: every ``w:num``
    references a ``w:abstractNum`` whose ``w:lvl`` children hold the level
    text (``w:lvlText``) and start value (``w:start``). A
    ``w:lvlOverride/w:startOverride`` on the ``w:num`` replaces the start
    value. Paragraphs in ``word/document.xml`` are then numbered by their
    position among all paragraphs sharing the same ``(numId, ilvl)`` pair.

    Paragraphs without a direct ``w:numPr`` (for example when the numbering
    is only attached through the style definition) or whose numbering rule
    is unknown are skipped.

    Args:
        doc_path: Path to the ``.docx`` file.

    Returns:
        A list of strings of the form ``"<number>. <paragraph text>"`` in
        document order.
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    target_styles = ("Finding", "Recommendation")
    extracted = []
    numbering_rules = {}

    with zipfile.ZipFile(doc_path, 'r') as zf:
        # Step 1: collect (numId, ilvl) -> (level text, start value).
        # numbering.xml is optional, so a missing part just means "no rules".
        if 'word/numbering.xml' in zf.namelist():
            num_tree = etree.fromstring(zf.read('word/numbering.xml'))

            abstract_levels = {}
            for abstract in num_tree.xpath('./w:abstractNum', namespaces=ns):
                abstract_id = abstract.xpath('./@w:abstractNumId', namespaces=ns)
                if not abstract_id:
                    continue
                levels = {}
                for lvl in abstract.xpath('./w:lvl', namespaces=ns):
                    lvl_attr = lvl.xpath('./@w:ilvl', namespaces=ns)
                    if not lvl_attr:
                        continue
                    lvl_text = lvl.xpath('./w:lvlText/@w:val', namespaces=ns)
                    start = lvl.xpath('./w:start/@w:val', namespaces=ns)
                    # NOTE(review): ECMA-376 is believed to default an
                    # omitted w:start to 0 - confirm against the spec.
                    levels[lvl_attr[0]] = (
                        lvl_text[0] if lvl_text else "",
                        int(start[0]) if start else 0,
                    )
                abstract_levels[abstract_id[0]] = levels

            for num in num_tree.xpath('./w:num', namespaces=ns):
                num_attr = num.xpath('./@w:numId', namespaces=ns)
                abstract_ref = num.xpath('./w:abstractNumId/@w:val', namespaces=ns)
                if not num_attr or not abstract_ref:
                    continue
                levels = abstract_levels.get(abstract_ref[0], {})
                for lvl_id, (lvl_text, start_num) in levels.items():
                    # A restarted list stores its new start value here.
                    override = num.xpath(
                        './w:lvlOverride[@w:ilvl=$lvl]/w:startOverride/@w:val',
                        namespaces=ns,
                        lvl=lvl_id,
                    )
                    if override:
                        start_num = int(override[0])
                    numbering_rules[(num_attr[0], lvl_id)] = (lvl_text, start_num)

        # Step 2: walk the document body and number the target paragraphs.
        doc_tree = etree.fromstring(zf.read('word/document.xml'))

    # Running count of paragraphs seen per (numId, ilvl); replaces a
    # whole-document XPath query per paragraph.
    seen_per_rule = {}
    for para in doc_tree.xpath('//w:p', namespaces=ns):
        num_pr = para.xpath('./w:pPr/w:numPr', namespaces=ns)
        if not num_pr:
            continue
        num_id = num_pr[0].xpath('./w:numId/@w:val', namespaces=ns)
        if not num_id:
            continue
        lvl_id = num_pr[0].xpath('./w:ilvl/@w:val', namespaces=ns)
        # w:ilvl is optional; an absent value means the first level.
        rule_key = (num_id[0], lvl_id[0] if lvl_id else "0")

        # Count every paragraph of the list, whatever its style, so the
        # position matches the list's own sequence.
        position = seen_per_rule.get(rule_key, 0)
        seen_per_rule[rule_key] = position + 1

        p_style = para.xpath('./w:pPr/w:pStyle/@w:val', namespaces=ns)
        if not p_style or p_style[0] not in target_styles:
            continue

        rule = numbering_rules.get(rule_key)
        if rule is None:
            # e.g. numId "0" (numbering removed) or a dangling reference.
            continue
        lvl_format, start_num = rule

        number_text = lvl_format.replace("%1", str(start_num + position))
        para_text = ''.join(para.xpath('.//w:t/text()', namespaces=ns))
        extracted.append(f"{number_text}. {para_text}")

    return extracted

# Usage example: print every extracted item on its own line
numbered_contents = extract_custom_numbers_via_raw_xml("your_target_doc.docx")
for line in numbered_contents:
    print(line)

这个方案的优势是完全不依赖高层库的封装，直接读取docx的核心XML文件，能处理各种复杂的自定义编号场景，包括手动调整过的非连续编号。