feat: 完善论文排版脚本

重构 format_thesis_docx.py，增强排版功能；新增 markdown_to_docx.py 转换工具；移除旧版排版文档。 Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode). Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-02-28 13:20:04 +08:00
parent 4dd3fac720
commit 38741f80dd
3 changed files with 283 additions and 42 deletions
--- a/example/format_thesis_docx.py
+++ b/example/format_thesis_docx.py
@@ -1,9 +1,9 @@
 from docx import Document
-from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 from docx.shared import Cm, Pt, RGBColor
-
+import re

 SRC = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx"
 DST = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx"
@@ -63,25 +63,6 @@ def is_numbered_paragraph(paragraph) -> bool:
    return ppr.numPr is not None


-def set_table_all_borders_black(table):
-    for row in table.rows:
-        for cell in row.cells:
-            tc = cell._tc
-            tc_pr = tc.get_or_add_tcPr()
-            tc_borders = tc_pr.find(qn("w:tcBorders"))
-            if tc_borders is None:
-                tc_borders = OxmlElement("w:tcBorders")
-                tc_pr.append(tc_borders)
-            for edge in ("top", "left", "bottom", "right", "insideH", "insideV"):
-                edge_tag = qn(f"w:{edge}")
-                elem = tc_borders.find(edge_tag)
-                if elem is None:
-                    elem = OxmlElement(f"w:{edge}")
-                    tc_borders.append(elem)
-                elem.set(qn("w:val"), "single")
-                elem.set(qn("w:sz"), "4")
-                elem.set(qn("w:color"), "000000")
-                elem.set(qn("w:space"), "0")
 def iter_table_paragraphs(table):
    for row in table.rows:
        for cell in row.cells:
@@ -90,6 +71,162 @@ def iter_table_paragraphs(table):
            for t in cell.tables:
                yield from iter_table_paragraphs(t)

+
+def format_table_paragraph(p, bold: bool = False):
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    fmt = p.paragraph_format
+    fmt.line_spacing = 1.0
+    fmt.space_before = Pt(0)
+    fmt.space_after = Pt(0)
+    fmt.first_line_indent = Pt(0)
+    set_runs_font(p, "宋体", 10.5, bold=bold)
+    set_runs_common(p, italic=False, color_black=True)
+
+
+def set_table_style_like_template(table):
+    tbl = table._tbl
+    tbl_pr = tbl.tblPr
+    if tbl_pr is None:
+        tbl_pr = OxmlElement("w:tblPr")
+        tbl.insert(0, tbl_pr)
+
+    tbl_style = tbl_pr.find(qn("w:tblStyle"))
+    if tbl_style is None:
+        tbl_style = OxmlElement("w:tblStyle")
+        tbl_pr.append(tbl_style)
+    tbl_style.set(qn("w:val"), "Table Grid")
+
+    tbl_w = tbl_pr.find(qn("w:tblW"))
+    if tbl_w is None:
+        tbl_w = OxmlElement("w:tblW")
+        tbl_pr.append(tbl_w)
+    tbl_w.set(qn("w:type"), "pct")
+    tbl_w.set(qn("w:w"), "4997")
+
+    tbl_jc = tbl_pr.find(qn("w:jc"))
+    if tbl_jc is None:
+        tbl_jc = OxmlElement("w:jc")
+        tbl_pr.append(tbl_jc)
+    tbl_jc.set(qn("w:val"), "center")
+
+    tbl_cell_mar = tbl_pr.find(qn("w:tblCellMar"))
+    if tbl_cell_mar is None:
+        tbl_cell_mar = OxmlElement("w:tblCellMar")
+        tbl_pr.append(tbl_cell_mar)
+    for edge, width in (("top", "120"), ("bottom", "120"), ("left", "140"), ("right", "140")):
+        elem = tbl_cell_mar.find(qn(f"w:{edge}"))
+        if elem is None:
+            elem = OxmlElement(f"w:{edge}")
+            tbl_cell_mar.append(elem)
+        elem.set(qn("w:w"), width)
+        elem.set(qn("w:type"), "dxa")
+
+    tbl_borders = tbl_pr.find(qn("w:tblBorders"))
+    if tbl_borders is None:
+        tbl_borders = OxmlElement("w:tblBorders")
+        tbl_pr.append(tbl_borders)
+    for edge in ("top", "left", "bottom", "right", "insideH", "insideV"):
+        elem = tbl_borders.find(qn(f"w:{edge}"))
+        if elem is None:
+            elem = OxmlElement(f"w:{edge}")
+            tbl_borders.append(elem)
+        elem.set(qn("w:val"), "single")
+        elem.set(qn("w:sz"), "4")
+        elem.set(qn("w:color"), "auto")
+        elem.set(qn("w:space"), "0")
+
+    for row in table.rows:
+        tr_pr = row._tr.get_or_add_trPr()
+        tr_height = tr_pr.find(qn("w:trHeight"))
+        if tr_height is None:
+            tr_height = OxmlElement("w:trHeight")
+            tr_pr.append(tr_height)
+        tr_height.set(qn("w:val"), "620")
+        tr_height.set(qn("w:hRule"), "atLeast")
+
+        for cell in row.cells:
+            tc_pr = cell._tc.get_or_add_tcPr()
+            v_align = tc_pr.find(qn("w:vAlign"))
+            if v_align is None:
+                v_align = OxmlElement("w:vAlign")
+                tc_pr.append(v_align)
+            v_align.set(qn("w:val"), "center")
+
+            tc_borders = tc_pr.find(qn("w:tcBorders"))
+            if tc_borders is None:
+                tc_borders = OxmlElement("w:tcBorders")
+                tc_pr.append(tc_borders)
+            for edge in ("top", "left", "bottom", "right"):
+                elem = tc_borders.find(qn(f"w:{edge}"))
+                if elem is None:
+                    elem = OxmlElement(f"w:{edge}")
+                    tc_borders.append(elem)
+                elem.set(qn("w:val"), "single")
+                elem.set(qn("w:sz"), "4")
+                elem.set(qn("w:color"), "auto")
+                elem.set(qn("w:space"), "0")
+
+
+def set_table_header_gray(table):
+    if not table.rows:
+        return
+    for cell in table.rows[0].cells:
+        tc_pr = cell._tc.get_or_add_tcPr()
+        shd = tc_pr.find(qn("w:shd"))
+        if shd is None:
+            shd = OxmlElement("w:shd")
+            tc_pr.append(shd)
+        shd.set(qn("w:val"), "clear")
+        shd.set(qn("w:color"), "auto")
+        shd.set(qn("w:fill"), "D9D9D9")
+
+
+def cleanup_paragraph_spaces(paragraph):
+    runs = paragraph.runs
+    if not runs:
+        return
+    for run in runs:
+        if run.text:
+            run.text = re.sub(r"[ \t]{2,}", " ", run.text)
+    runs[0].text = runs[0].text.lstrip(" \t\u3000")
+    runs[-1].text = runs[-1].text.rstrip(" \t\u3000")
+
+
+def remove_redundant_blank_paragraphs(doc):
+    prev_blank = False
+    for p in list(doc.paragraphs):
+        text = p.text.replace("\u3000", " ").strip()
+        is_blank = text == ""
+        if is_blank and prev_blank:
+            p._element.getparent().remove(p._element)
+            continue
+        prev_blank = is_blank
+
+
+def add_page_break_between_chapters(doc):
+    chapter_pattern = re.compile(r"^第\s*\d+\s*章")
+    chapter_paragraphs = []
+    for p in list(doc.paragraphs):
+        text = p.text.replace("\u3000", " ").strip()
+        if not text or not chapter_pattern.match(text):
+            continue
+        chapter_paragraphs.append(p)
+
+    for index, p in enumerate(chapter_paragraphs):
+        if index == 0:
+            continue
+        prev = p._element.getprevious()
+        has_page_break = False
+        if prev is not None:
+            for br in prev.findall('.//w:br', {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
+                if br.attrib.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type') == 'page':
+                    has_page_break = True
+                    break
+        if not has_page_break:
+            break_paragraph = p.insert_paragraph_before("")
+            break_paragraph.add_run().add_break(WD_BREAK.PAGE)
+
+
 def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
    ppr = paragraph._p.get_or_add_pPr()
    ind = ppr.find(qn("w:ind"))
@@ -99,6 +236,19 @@ def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
    ind.set(qn("w:firstLine"), str(twips))
    ind.set(qn("w:firstLineChars"), str(chars))

+
+def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
+    fmt = paragraph.paragraph_format
+    fmt.line_spacing = line_spacing
+    fmt.space_before = Pt(0)
+    fmt.space_after = Pt(0)
+    if first_line_pt is not None:
+        fmt.first_line_indent = Pt(first_line_pt)
+        set_first_line_two_chars(paragraph)
+    if align is not None:
+        paragraph.alignment = align
+
+
 def format_paragraph(p):
    style_name = p.style.name if p.style is not None else ""
    if style_name == "Heading 1":
@@ -109,10 +259,10 @@ def format_paragraph(p):
        set_runs_font(p, "黑体", 16, True)
    elif style_name == "Heading 3":
        apply_para_format(p, 1.5, 28)
-        set_runs_font(p, "宋体", 14, True)
+        set_runs_font(p, "黑体", 14, True)
    elif style_name == "Heading 4":
        apply_para_format(p, 1.5, 24)
-        set_runs_font(p, "宋体", 12, True)
+        set_runs_font(p, "黑体", 14, True)
        set_runs_common(p, italic=False, color_black=True)
    elif is_numbered_paragraph(p) or style_name.startswith("List Number"):
        p.paragraph_format.line_spacing = 1.5
@@ -124,17 +274,6 @@ def format_paragraph(p):
        set_runs_common(p, color_black=True)


-def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
-    fmt = paragraph.paragraph_format
-    fmt.line_spacing = line_spacing
-    fmt.space_before = Pt(0)
-    fmt.space_after = Pt(0)
-    if first_line_pt is not None:
-        fmt.first_line_indent = Pt(first_line_pt)
-        set_first_line_two_chars(paragraph)
-
-    if align is not None:
-        paragraph.alignment = align
 def set_page_layout(doc):
    for section in doc.sections:
        section.page_width = Cm(21.0)
@@ -146,6 +285,7 @@ def set_page_layout(doc):
        section.header_distance = Cm(1.5)
        section.footer_distance = Cm(1.75)

+
 def main():
    doc = Document(SRC)

@@ -155,40 +295,44 @@ def main():
    h3 = doc.styles["Heading 3"]
    h4 = doc.styles["Heading 4"]

-    # 正文：宋体小四，首行缩进2字符（约24pt），1.5倍行距
    set_style_font(normal, "宋体", 10.5)
    normal.paragraph_format.line_spacing = 1.5
    normal.paragraph_format.first_line_indent = Pt(21)

-    # 标题1：黑体二号，加粗，居中，1.5倍行距
    set_style_font(h1, "黑体", 22, True)
    h1.paragraph_format.line_spacing = 1.5
    h1.paragraph_format.first_line_indent = Pt(0)

-    # 标题2：黑体三号，加粗，首行缩进2字符，1.5倍行距
    set_style_font(h2, "黑体", 16, True)
    h2.paragraph_format.line_spacing = 1.5
    h2.paragraph_format.first_line_indent = Pt(32)

-    # 标题3：宋体四号，加粗，首行缩进2字符，1.5倍行距
    set_style_font(h3, "黑体", 14, True)
    h3.paragraph_format.line_spacing = 1.5
    h3.paragraph_format.first_line_indent = Pt(28)

-    # 标题4：加粗，取消斜体，黑色，1.5倍行距
    set_style_font(h4, "黑体", 14, True)
    h4.font.italic = False
    h4.paragraph_format.line_spacing = 1.5
    h4.paragraph_format.first_line_indent = Pt(24)
+
    set_page_layout(doc)
+
    for p in doc.paragraphs:
        format_paragraph(p)
+        cleanup_paragraph_spaces(p)

    for t in doc.tables:
-        set_table_all_borders_black(t)
-        for p in iter_table_paragraphs(t):
-            format_paragraph(p)
+        set_table_style_like_template(t)
+        set_table_header_gray(t)
+        for row_index, row in enumerate(t.rows):
+            for cell in row.cells:
+                for p in cell.paragraphs:
+                    format_table_paragraph(p, bold=(row_index == 0))
+                    cleanup_paragraph_spaces(p)

+    remove_redundant_blank_paragraphs(doc)
+    add_page_break_between_chapters(doc)
    doc.save(DST)
    print(DST)

--- a/example/markdown_to_docx.py
+++ b/example/markdown_to_docx.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+from pathlib import Path
+import re
+from docx import Document
+
+
+INPUT_MD = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.md")
+OUTPUT_DOCX = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx")
+
+
+def is_table_separator(line: str) -> bool:
+    stripped = line.strip()
+    if not stripped.startswith("|"):
+        return False
+    core = stripped.strip("|").replace(" ", "")
+    return bool(core) and all(ch in "-:|" for ch in core)
+
+
+def split_table_row(line: str) -> list[str]:
+    raw = line.strip().strip("|")
+    return [cell.strip() for cell in raw.split("|")]
+
+
+def convert_markdown_to_docx(md_text: str, doc: Document) -> None:
+    lines = md_text.splitlines()
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+
+        if not stripped:
+            doc.add_paragraph("")
+            i += 1
+            continue
+
+        # Table block
+        if stripped.startswith("|") and i + 1 < len(lines) and is_table_separator(lines[i + 1]):
+            headers = split_table_row(lines[i])
+            i += 2
+            rows: list[list[str]] = []
+            while i < len(lines):
+                row_line = lines[i].strip()
+                if not row_line.startswith("|"):
+                    break
+                rows.append(split_table_row(lines[i]))
+                i += 1
+
+            cols = max(1, len(headers))
+            table = doc.add_table(rows=1, cols=cols)
+            for c in range(cols):
+                table.cell(0, c).text = headers[c] if c < len(headers) else ""
+
+            for row in rows:
+                cells = table.add_row().cells
+                for c in range(cols):
+                    cells[c].text = row[c] if c < len(row) else ""
+            continue
+
+        # Heading
+        heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped)
+        if heading_match:
+            level = min(4, len(heading_match.group(1)))
+            text = heading_match.group(2).strip()
+            doc.add_heading(text, level=level)
+            i += 1
+            continue
+
+        # Ordered list
+        if re.match(r"^\d+\.\s+", stripped):
+            text = re.sub(r"^\d+\.\s+", "", stripped)
+            doc.add_paragraph(text, style="List Number")
+            i += 1
+            continue
+
+        # Unordered list
+        if stripped.startswith("- "):
+            doc.add_paragraph(stripped[2:].strip(), style="List Bullet")
+            i += 1
+            continue
+
+        # Plain paragraph
+        doc.add_paragraph(stripped)
+        i += 1
+
+
+def main() -> None:
+    md_text = INPUT_MD.read_text(encoding="utf-8")
+    doc = Document()
+    convert_markdown_to_docx(md_text, doc)
+    doc.save(OUTPUT_DOCX)
+    print(OUTPUT_DOCX)
+
+
+if __name__ == "__main__":
+    main()
--- a/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx
+++ b/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx