feat: 完善论文排版脚本

重构 format_thesis_docx.py 增强排版功能;新增 markdown_to_docx.py 转换工具;移除旧版排版文档

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
2026-02-28 13:20:04 +08:00
parent 4dd3fac720
commit 38741f80dd
3 changed files with 283 additions and 42 deletions

View File

@@ -1,9 +1,9 @@
from docx import Document from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.oxml import OxmlElement from docx.oxml import OxmlElement
from docx.oxml.ns import qn from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor from docx.shared import Cm, Pt, RGBColor
import re
SRC = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx" SRC = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx"
DST = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx" DST = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx"
@@ -63,25 +63,6 @@ def is_numbered_paragraph(paragraph) -> bool:
return ppr.numPr is not None return ppr.numPr is not None
def set_table_all_borders_black(table):
    """Set a single black border on every edge of every cell in *table*.

    For each cell the ``w:tcBorders`` element is created if missing, and each
    of its six edge children is created if missing and then given
    ``val=single``, ``sz=4``, ``color=000000`` and ``space=0``.
    """
    border_attributes = (
        ("w:val", "single"),
        ("w:sz", "4"),
        ("w:color", "000000"),
        ("w:space", "0"),
    )
    edge_names = ("top", "left", "bottom", "right", "insideH", "insideV")

    for table_row in table.rows:
        for table_cell in table_row.cells:
            cell_props = table_cell._tc.get_or_add_tcPr()
            borders = cell_props.find(qn("w:tcBorders"))
            if borders is None:
                borders = OxmlElement("w:tcBorders")
                cell_props.append(borders)

            for edge_name in edge_names:
                prefixed = f"w:{edge_name}"
                border = borders.find(qn(prefixed))
                if border is None:
                    border = OxmlElement(prefixed)
                    borders.append(border)
                # Existing edges are overwritten too, not only new ones.
                for attr_name, attr_value in border_attributes:
                    border.set(qn(attr_name), attr_value)
def iter_table_paragraphs(table): def iter_table_paragraphs(table):
for row in table.rows: for row in table.rows:
for cell in row.cells: for cell in row.cells:
@@ -90,6 +71,162 @@ def iter_table_paragraphs(table):
for t in cell.tables: for t in cell.tables:
yield from iter_table_paragraphs(t) yield from iter_table_paragraphs(t)
def format_table_paragraph(p, bold: bool = False):
    """Apply the table-cell paragraph look to *p*.

    The paragraph is centred, set to single line spacing with no space
    before/after and no first-line indent, and its runs are set to 宋体 10.5
    (bold when *bold* is true), non-italic and black via the file's run helpers.
    """
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    layout = p.paragraph_format
    layout.line_spacing = 1.0
    # Same assignment order as before: space_before, space_after, first_line_indent.
    for zeroed_attr in ("space_before", "space_after", "first_line_indent"):
        setattr(layout, zeroed_attr, Pt(0))

    set_runs_font(p, "宋体", 10.5, bold=bold)
    set_runs_common(p, italic=False, color_black=True)
# Child order the WordprocessingML schema requires inside each container that
# set_table_style_like_template() edits. Only relative order matters here.
_TBL_PR_SEQUENCE = (
    "w:tblStyle", "w:tblpPr", "w:tblOverlap", "w:bidiVisual",
    "w:tblStyleRowBandSize", "w:tblStyleColBandSize", "w:tblW", "w:jc",
    "w:tblCellSpacing", "w:tblInd", "w:tblBorders", "w:shd", "w:tblLayout",
    "w:tblCellMar", "w:tblLook", "w:tblCaption", "w:tblDescription",
    "w:tblPrChange",
)
_TC_PR_SEQUENCE = (
    "w:cnfStyle", "w:tcW", "w:gridSpan", "w:hMerge", "w:vMerge",
    "w:tcBorders", "w:shd", "w:noWrap", "w:tcMar", "w:textDirection",
    "w:tcFitText", "w:vAlign", "w:hideMark", "w:headers", "w:cellIns",
    "w:cellDel", "w:cellMerge", "w:tcPrChange",
)
# Row properties may appear in any order, but must precede the revision marks.
_TR_PR_SEQUENCE = ("w:trHeight", "w:ins", "w:del", "w:trPrChange")
_EDGE_SEQUENCE = (
    "w:top", "w:left", "w:start", "w:bottom", "w:right", "w:end",
    "w:insideH", "w:insideV", "w:tl2br", "w:tr2bl",
)


def _ensure_child_in_order(parent, tag, sequence):
    """Return *parent*'s ``tag`` child, creating it at its schema position.

    *sequence* lists the prefixed child tags of *parent* in required order.
    A new child is inserted before the first existing sibling that must follow
    it, or appended when no such sibling exists.
    """
    child = parent.find(qn(tag))
    if child is not None:
        return child
    child = OxmlElement(tag)
    followers = {qn(name) for name in sequence[sequence.index(tag) + 1:]}
    for sibling in parent:
        if sibling.tag in followers:
            sibling.addprevious(child)
            return child
    parent.append(child)
    return child


def _set_single_auto_border(border):
    """Write the single / size 4 / auto colour / no spacing border attributes."""
    border.set(qn("w:val"), "single")
    border.set(qn("w:sz"), "4")
    border.set(qn("w:color"), "auto")
    border.set(qn("w:space"), "0")


def set_table_style_like_template(table):
    """Rewrite *table*'s XML properties to the template's table look.

    Table level: style reference, width (``pct`` / ``4997``), centred
    alignment, cell margins (120 top/bottom, 140 left/right, ``dxa``) and
    single borders on all six edges.
    Row level: height ``620`` with rule ``atLeast``.
    Cell level: vertical centring and single borders on the four outer edges.

    Missing elements are created at the position the schema requires instead
    of being appended, so the resulting property blocks stay schema-ordered.

    Args:
        table: python-docx table whose underlying XML is modified in place.
    """
    tbl = table._tbl
    tbl_pr = tbl.tblPr
    if tbl_pr is None:
        tbl_pr = OxmlElement("w:tblPr")
        tbl.insert(0, tbl_pr)

    tbl_style = _ensure_child_in_order(tbl_pr, "w:tblStyle", _TBL_PR_SEQUENCE)
    # NOTE(review): w:tblStyle/@w:val is normally a style *ID* (often
    # "TableGrid"), while "Table Grid" looks like a style name. Confirm this
    # value resolves in the source document; borders are set explicitly below.
    tbl_style.set(qn("w:val"), "Table Grid")

    tbl_w = _ensure_child_in_order(tbl_pr, "w:tblW", _TBL_PR_SEQUENCE)
    tbl_w.set(qn("w:type"), "pct")
    tbl_w.set(qn("w:w"), "4997")

    tbl_jc = _ensure_child_in_order(tbl_pr, "w:jc", _TBL_PR_SEQUENCE)
    tbl_jc.set(qn("w:val"), "center")

    tbl_cell_mar = _ensure_child_in_order(tbl_pr, "w:tblCellMar", _TBL_PR_SEQUENCE)
    for edge, width in (("top", "120"), ("left", "140"), ("bottom", "120"), ("right", "140")):
        margin = _ensure_child_in_order(tbl_cell_mar, f"w:{edge}", _EDGE_SEQUENCE)
        margin.set(qn("w:w"), width)
        margin.set(qn("w:type"), "dxa")

    tbl_borders = _ensure_child_in_order(tbl_pr, "w:tblBorders", _TBL_PR_SEQUENCE)
    for edge in ("top", "left", "bottom", "right", "insideH", "insideV"):
        _set_single_auto_border(
            _ensure_child_in_order(tbl_borders, f"w:{edge}", _EDGE_SEQUENCE)
        )

    for row in table.rows:
        tr_pr = row._tr.get_or_add_trPr()
        tr_height = _ensure_child_in_order(tr_pr, "w:trHeight", _TR_PR_SEQUENCE)
        tr_height.set(qn("w:val"), "620")
        tr_height.set(qn("w:hRule"), "atLeast")

        for cell in row.cells:
            tc_pr = cell._tc.get_or_add_tcPr()

            v_align = _ensure_child_in_order(tc_pr, "w:vAlign", _TC_PR_SEQUENCE)
            v_align.set(qn("w:val"), "center")

            tc_borders = _ensure_child_in_order(tc_pr, "w:tcBorders", _TC_PR_SEQUENCE)
            for edge in ("top", "left", "bottom", "right"):
                _set_single_auto_border(
                    _ensure_child_in_order(tc_borders, f"w:{edge}", _EDGE_SEQUENCE)
                )
def set_table_header_gray(table):
    """Shade every cell of *table*'s first row with fill ``D9D9D9``.

    Does nothing when the table has no rows. An existing ``w:shd`` element is
    reused; a new one is inserted before any ``w:tcPr`` sibling that the schema
    requires to follow it (for example ``w:vAlign``) rather than appended.

    Args:
        table: python-docx table whose first-row cells are modified in place.
    """
    if not table.rows:
        return

    # Children of w:tcPr that must come after w:shd.
    must_follow_shd = {
        qn(f"w:{name}")
        for name in (
            "noWrap", "tcMar", "textDirection", "tcFitText", "vAlign",
            "hideMark", "headers", "cellIns", "cellDel", "cellMerge",
            "tcPrChange",
        )
    }

    for cell in table.rows[0].cells:
        tc_pr = cell._tc.get_or_add_tcPr()
        shd = tc_pr.find(qn("w:shd"))
        if shd is None:
            shd = OxmlElement("w:shd")
            anchor = next(
                (child for child in tc_pr if child.tag in must_follow_shd), None
            )
            if anchor is None:
                tc_pr.append(shd)
            else:
                anchor.addprevious(shd)
        shd.set(qn("w:val"), "clear")
        shd.set(qn("w:color"), "auto")
        shd.set(qn("w:fill"), "D9D9D9")
def cleanup_paragraph_spaces(paragraph):
    """Normalise spacing inside *paragraph*'s runs, in place.

    In every run with text, each stretch of two or more spaces/tabs becomes a
    single space. Leading spaces, tabs and full-width spaces (U+3000) are
    stripped from the first run, and trailing ones from the last run.

    A run's ``text`` is assigned only when the cleaned value differs from the
    current one. Runs whose text is empty or already clean are left untouched,
    so a run that carries non-text content (for example a picture or a break)
    is not rewritten by a no-op assignment.

    Args:
        paragraph: Object exposing ``runs``, a sequence of objects with a
            readable and writable ``text`` attribute.
    """
    runs = paragraph.runs
    if not runs:
        return

    last_index = len(runs) - 1
    for index, run in enumerate(runs):
        original = run.text
        if not original:
            continue
        cleaned = re.sub(r"[ \t]{2,}", " ", original)
        if index == 0:
            cleaned = cleaned.lstrip(" \t\u3000")
        if index == last_index:
            cleaned = cleaned.rstrip(" \t\u3000")
        if cleaned != original:
            run.text = cleaned
def _paragraph_has_non_text_content(paragraph):
    """Return True if *paragraph*'s XML holds content its ``text`` omits.

    Covers drawings, legacy pictures, embedded objects, Office Math, section
    properties, and page/column breaks.
    """
    element = paragraph._element
    non_text_tags = (
        qn("w:drawing"),
        qn("w:pict"),
        qn("w:object"),
        qn("w:sectPr"),
        "{http://schemas.openxmlformats.org/officeDocument/2006/math}oMath",
    )
    if next(element.iter(*non_text_tags), None) is not None:
        return True
    return any(
        br.get(qn("w:type")) in ("page", "column")
        for br in element.iter(qn("w:br"))
    )


def remove_redundant_blank_paragraphs(doc):
    """Collapse runs of consecutive blank paragraphs in *doc* to one.

    A paragraph is blank when its text is empty after treating full-width
    spaces (U+3000) as spaces and stripping, and it carries no non-text
    content. Paragraphs that only hold a picture, object, equation, page or
    column break, or section properties have empty text too; they are kept and
    they end a run of blanks.

    Args:
        doc: python-docx document; paragraphs are removed from its XML in place.
    """
    prev_blank = False
    # Iterate over a snapshot because paragraphs are removed while looping.
    for p in list(doc.paragraphs):
        has_text = bool(p.text.replace("\u3000", " ").strip())
        is_blank = not has_text and not _paragraph_has_non_text_content(p)
        if is_blank and prev_blank:
            p._element.getparent().remove(p._element)
            continue
        prev_blank = is_blank
def add_page_break_between_chapters(doc):
    """Make every chapter after the first start on a new page.

    A chapter paragraph is one whose text (full-width spaces treated as
    spaces, then stripped) starts with ``第<digits>章``. For each such paragraph
    except the first, a new paragraph holding a page break is inserted before
    it, unless a break is already in place: either the paragraph itself has
    ``page_break_before`` set, or the preceding sibling element already
    contains a page break.

    Args:
        doc: python-docx document, modified in place.
    """
    chapter_pattern = re.compile(r"^第\s*\d+\s*章")
    chapter_paragraphs = [
        p
        for p in doc.paragraphs
        if chapter_pattern.match(p.text.replace("\u3000", " ").strip())
    ]

    for p in chapter_paragraphs[1:]:
        # An explicit "page break before" already starts the chapter on a new
        # page; adding a break paragraph as well would leave an empty page.
        if p.paragraph_format.page_break_before:
            continue

        prev = p._element.getprevious()
        has_page_break = prev is not None and any(
            br.get(qn("w:type")) == "page" for br in prev.iter(qn("w:br"))
        )
        if not has_page_break:
            break_paragraph = p.insert_paragraph_before("")
            break_paragraph.add_run().add_break(WD_BREAK.PAGE)
def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200): def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
ppr = paragraph._p.get_or_add_pPr() ppr = paragraph._p.get_or_add_pPr()
ind = ppr.find(qn("w:ind")) ind = ppr.find(qn("w:ind"))
@@ -99,6 +236,19 @@ def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
ind.set(qn("w:firstLine"), str(twips)) ind.set(qn("w:firstLine"), str(twips))
ind.set(qn("w:firstLineChars"), str(chars)) ind.set(qn("w:firstLineChars"), str(chars))
def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
    """Apply line spacing, zero paragraph spacing, and optional indent/alignment.

    Args:
        paragraph: Paragraph to modify in place; it must expose
            ``paragraph_format`` and ``alignment``.
        line_spacing: Value assigned to ``paragraph_format.line_spacing``.
        first_line_pt: First-line indent in points; ``None`` leaves the
            indent untouched.
        align: Value assigned to ``paragraph.alignment``; ``None`` leaves the
            alignment untouched.
    """
    fmt = paragraph.paragraph_format
    fmt.line_spacing = line_spacing
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)
    if first_line_pt is not None:
        fmt.first_line_indent = Pt(first_line_pt)
        # NOTE(review): the listing this was recovered from had lost its
        # indentation; this call is assumed to sit inside the ``if`` — confirm.
        # It is called with its defaults, so it appears to override the point
        # value set on the previous line.
        set_first_line_two_chars(paragraph)
    if align is not None:
        paragraph.alignment = align
def format_paragraph(p): def format_paragraph(p):
style_name = p.style.name if p.style is not None else "" style_name = p.style.name if p.style is not None else ""
if style_name == "Heading 1": if style_name == "Heading 1":
@@ -109,10 +259,10 @@ def format_paragraph(p):
set_runs_font(p, "黑体", 16, True) set_runs_font(p, "黑体", 16, True)
elif style_name == "Heading 3": elif style_name == "Heading 3":
apply_para_format(p, 1.5, 28) apply_para_format(p, 1.5, 28)
set_runs_font(p, "", 14, True) set_runs_font(p, "", 14, True)
elif style_name == "Heading 4": elif style_name == "Heading 4":
apply_para_format(p, 1.5, 24) apply_para_format(p, 1.5, 24)
set_runs_font(p, "", 12, True) set_runs_font(p, "", 14, True)
set_runs_common(p, italic=False, color_black=True) set_runs_common(p, italic=False, color_black=True)
elif is_numbered_paragraph(p) or style_name.startswith("List Number"): elif is_numbered_paragraph(p) or style_name.startswith("List Number"):
p.paragraph_format.line_spacing = 1.5 p.paragraph_format.line_spacing = 1.5
@@ -124,17 +274,6 @@ def format_paragraph(p):
set_runs_common(p, color_black=True) set_runs_common(p, color_black=True)
def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
    """Apply line spacing, zero paragraph spacing, and optional indent/alignment.

    Args:
        paragraph: Paragraph to modify in place; it must expose
            ``paragraph_format`` and ``alignment``.
        line_spacing: Value assigned to ``paragraph_format.line_spacing``.
        first_line_pt: First-line indent in points; ``None`` leaves the
            indent untouched.
        align: Value assigned to ``paragraph.alignment``; ``None`` leaves the
            alignment untouched.
    """
    fmt = paragraph.paragraph_format
    fmt.line_spacing = line_spacing
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)
    if first_line_pt is not None:
        fmt.first_line_indent = Pt(first_line_pt)
        # NOTE(review): the listing this was recovered from had lost its
        # indentation; this call is assumed to sit inside the ``if`` — confirm.
        # It is called with its defaults, so it appears to override the point
        # value set on the previous line.
        set_first_line_two_chars(paragraph)
    if align is not None:
        paragraph.alignment = align
def set_page_layout(doc): def set_page_layout(doc):
for section in doc.sections: for section in doc.sections:
section.page_width = Cm(21.0) section.page_width = Cm(21.0)
@@ -146,6 +285,7 @@ def set_page_layout(doc):
section.header_distance = Cm(1.5) section.header_distance = Cm(1.5)
section.footer_distance = Cm(1.75) section.footer_distance = Cm(1.75)
def main(): def main():
doc = Document(SRC) doc = Document(SRC)
@@ -155,40 +295,44 @@ def main():
h3 = doc.styles["Heading 3"] h3 = doc.styles["Heading 3"]
h4 = doc.styles["Heading 4"] h4 = doc.styles["Heading 4"]
# 正文宋体小四首行缩进2字符约24pt1.5倍行距
set_style_font(normal, "宋体", 10.5) set_style_font(normal, "宋体", 10.5)
normal.paragraph_format.line_spacing = 1.5 normal.paragraph_format.line_spacing = 1.5
normal.paragraph_format.first_line_indent = Pt(21) normal.paragraph_format.first_line_indent = Pt(21)
# 标题1黑体二号加粗居中1.5倍行距
set_style_font(h1, "黑体", 22, True) set_style_font(h1, "黑体", 22, True)
h1.paragraph_format.line_spacing = 1.5 h1.paragraph_format.line_spacing = 1.5
h1.paragraph_format.first_line_indent = Pt(0) h1.paragraph_format.first_line_indent = Pt(0)
# 标题2黑体三号加粗首行缩进2字符1.5倍行距
set_style_font(h2, "黑体", 16, True) set_style_font(h2, "黑体", 16, True)
h2.paragraph_format.line_spacing = 1.5 h2.paragraph_format.line_spacing = 1.5
h2.paragraph_format.first_line_indent = Pt(32) h2.paragraph_format.first_line_indent = Pt(32)
# 标题3宋体四号加粗首行缩进2字符1.5倍行距
set_style_font(h3, "黑体", 14, True) set_style_font(h3, "黑体", 14, True)
h3.paragraph_format.line_spacing = 1.5 h3.paragraph_format.line_spacing = 1.5
h3.paragraph_format.first_line_indent = Pt(28) h3.paragraph_format.first_line_indent = Pt(28)
# 标题4加粗取消斜体黑色1.5倍行距
set_style_font(h4, "黑体", 14, True) set_style_font(h4, "黑体", 14, True)
h4.font.italic = False h4.font.italic = False
h4.paragraph_format.line_spacing = 1.5 h4.paragraph_format.line_spacing = 1.5
h4.paragraph_format.first_line_indent = Pt(24) h4.paragraph_format.first_line_indent = Pt(24)
set_page_layout(doc) set_page_layout(doc)
for p in doc.paragraphs: for p in doc.paragraphs:
format_paragraph(p) format_paragraph(p)
cleanup_paragraph_spaces(p)
for t in doc.tables: for t in doc.tables:
set_table_all_borders_black(t) set_table_style_like_template(t)
for p in iter_table_paragraphs(t): set_table_header_gray(t)
format_paragraph(p) for row_index, row in enumerate(t.rows):
for cell in row.cells:
for p in cell.paragraphs:
format_table_paragraph(p, bold=(row_index == 0))
cleanup_paragraph_spaces(p)
remove_redundant_blank_paragraphs(doc)
add_page_break_between_chapters(doc)
doc.save(DST) doc.save(DST)
print(DST) print(DST)

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
from pathlib import Path
import re
from docx import Document
# Markdown source of the thesis draft; read as UTF-8 by main().
INPUT_MD = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.md")
# Destination Word document; written by main().
OUTPUT_DOCX = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx")
def is_table_separator(line: str) -> bool:
    """Return True if *line* is a Markdown table delimiter row.

    The line must start with ``|`` (after stripping) and every cell between
    the pipes must consist of one or more hyphens, optionally with a leading
    and/or trailing colon. Whitespace inside a cell is ignored. Rows with an
    empty cell or a cell without any hyphen (for example ``| : | : |``) are
    rejected, so ordinary rows made of punctuation are not mistaken for a
    delimiter.
    """
    stripped = line.strip()
    if not stripped.startswith("|"):
        return False
    cells = stripped.strip("|").split("|")
    return all(
        re.fullmatch(r":?-+:?", "".join(cell.split())) is not None
        for cell in cells
    )
def split_table_row(line: str) -> list[str]:
    """Split one Markdown table row into its stripped cell texts.

    Only a single leading and a single trailing pipe are treated as the row's
    outer border, so an empty first or last cell (``|| a |``) is kept and the
    remaining cells stay in their columns. A pipe escaped as ``\\|`` does not
    split a cell and is returned as a literal ``|``.
    """
    raw = line.strip()
    if raw.startswith("|"):
        raw = raw[1:]
    # A trailing escaped pipe belongs to the last cell's text, not the border.
    if raw.endswith("|") and not raw.endswith("\\|"):
        raw = raw[:-1]
    cells = re.split(r"(?<!\\)\|", raw)
    return [cell.replace("\\|", "|").strip() for cell in cells]
def convert_markdown_to_docx(md_text: str, doc: Document) -> None:
    """Append the content of *md_text* to *doc*, one Markdown line at a time.

    Recognised constructs, checked in this order for each line:

    * blank line: an empty paragraph;
    * a ``|`` line followed by a delimiter row: a table whose column count
      comes from the header row (short rows are padded with empty cells,
      extra cells are dropped);
    * ``#`` to ``######`` heading: a heading, with the level capped at 4;
    * ``N. text``: a "List Number" paragraph;
    * ``- text``: a "List Bullet" paragraph;
    * anything else: a plain paragraph holding the stripped line.
    """
    heading_re = re.compile(r"^(#{1,6})\s+(.*)$")
    ordered_re = re.compile(r"^\d+\.\s+")

    source = md_text.splitlines()
    total = len(source)
    pos = 0

    while pos < total:
        content = source[pos].strip()

        if content == "":
            doc.add_paragraph("")
            pos += 1
            continue

        starts_table = (
            content.startswith("|")
            and pos + 1 < total
            and is_table_separator(source[pos + 1])
        )
        if starts_table:
            header_cells = split_table_row(source[pos])
            pos += 2  # skip the header row and the delimiter row

            body_rows: list[list[str]] = []
            while pos < total and source[pos].strip().startswith("|"):
                body_rows.append(split_table_row(source[pos]))
                pos += 1

            width = max(1, len(header_cells))
            table = doc.add_table(rows=1, cols=width)
            for column in range(width):
                header_text = header_cells[column] if column < len(header_cells) else ""
                table.cell(0, column).text = header_text
            for values in body_rows:
                new_cells = table.add_row().cells
                for column in range(width):
                    new_cells[column].text = values[column] if column < len(values) else ""
            continue

        heading = heading_re.match(content)
        if heading is not None:
            hashes, title = heading.groups()
            doc.add_heading(title.strip(), level=min(4, len(hashes)))
            pos += 1
            continue

        numbered = ordered_re.match(content)
        if numbered is not None:
            # Drop the "N. " marker; the list style supplies the numbering.
            doc.add_paragraph(content[numbered.end():], style="List Number")
            pos += 1
            continue

        if content.startswith("- "):
            doc.add_paragraph(content[2:].strip(), style="List Bullet")
            pos += 1
            continue

        doc.add_paragraph(content)
        pos += 1
def main() -> None:
    """Convert INPUT_MD into a new Word document saved at OUTPUT_DOCX.

    Reads the Markdown file as UTF-8, fills a fresh document from it, saves the
    result, and prints the output path.
    """
    markdown_source = INPUT_MD.read_text(encoding="utf-8")
    document = Document()
    convert_markdown_to_docx(markdown_source, document)
    document.save(OUTPUT_DOCX)
    print(OUTPUT_DOCX)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()