feat: 完善论文排版脚本
- 重构 format_thesis_docx.py,增强排版功能
- 新增 markdown_to_docx.py 转换工具
- 移除旧版排版文档

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.shared import Cm, Pt, RGBColor
|
||||
|
||||
import re
|
||||
|
||||
SRC = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx"
|
||||
DST = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx"
|
||||
@@ -63,25 +63,6 @@ def is_numbered_paragraph(paragraph) -> bool:
|
||||
return ppr.numPr is not None
|
||||
|
||||
|
||||
def set_table_all_borders_black(table):
    """Give every cell of *table* single, black borders on all six edges.

    For each cell the ``w:tcBorders`` element is created when missing, and
    its ``top``/``left``/``bottom``/``right``/``insideH``/``insideV``
    children are created or overwritten with ``val=single``, ``sz=4``,
    ``color=000000`` and ``space=0``.

    Missing elements are inserted at the position the WordprocessingML
    schema assigns them rather than appended, because ``w:tcPr`` and
    ``w:tcBorders`` define their children as an ordered sequence.

    Args:
        table: python-docx ``Table``; modified in place.
    """
    tc_pr_order = (
        "w:cnfStyle", "w:tcW", "w:gridSpan", "w:hMerge", "w:vMerge",
        "w:tcBorders", "w:shd", "w:noWrap", "w:tcMar", "w:textDirection",
        "w:tcFitText", "w:vAlign", "w:hideMark", "w:headers", "w:cellIns",
        "w:cellDel", "w:cellMerge", "w:tcPrChange",
    )
    edge_order = (
        "w:top", "w:start", "w:left", "w:bottom", "w:end", "w:right",
        "w:insideH", "w:insideV", "w:tl2br", "w:tr2bl",
    )

    def ensure_child(parent, tag, order):
        """Return parent's *tag* child, creating it in schema order if absent."""
        child = parent.find(qn(tag))
        if child is None:
            child = OxmlElement(tag)
            for later in order[order.index(tag) + 1:]:
                successor = parent.find(qn(later))
                if successor is not None:
                    # Insert ahead of the first sibling that must follow it.
                    successor.addprevious(child)
                    break
            else:
                parent.append(child)
        return child

    for row in table.rows:
        for cell in row.cells:
            tc_pr = cell._tc.get_or_add_tcPr()
            tc_borders = ensure_child(tc_pr, "w:tcBorders", tc_pr_order)
            for edge in ("top", "left", "bottom", "right", "insideH", "insideV"):
                elem = ensure_child(tc_borders, f"w:{edge}", edge_order)
                elem.set(qn("w:val"), "single")
                elem.set(qn("w:sz"), "4")
                elem.set(qn("w:color"), "000000")
                elem.set(qn("w:space"), "0")
|
||||
def iter_table_paragraphs(table):
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
@@ -90,6 +71,162 @@ def iter_table_paragraphs(table):
|
||||
for t in cell.tables:
|
||||
yield from iter_table_paragraphs(t)
|
||||
|
||||
|
||||
def format_table_paragraph(p, bold: bool = False):
    """Format one table-cell paragraph: centred, single-spaced, no indent.

    Sets centre alignment, line spacing 1.0, zero space before/after and a
    zero first-line indent, then applies 宋体 10.5 pt (optionally bold),
    non-italic, black text through ``set_runs_font`` / ``set_runs_common``.

    Args:
        p: python-docx ``Paragraph`` located in a table cell.
        bold: Whether the runs are made bold (used for the header row).
    """
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    fmt = p.paragraph_format
    fmt.line_spacing = 1.0
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)
    fmt.first_line_indent = Pt(0)

    # python-docx only rewrites w:firstLine / w:hanging. A character-based
    # indent left on the paragraph takes precedence over those values, so
    # it has to be removed explicitly for the zero indent to take effect.
    ind = p._p.get_or_add_pPr().find(qn("w:ind"))
    if ind is not None:
        ind.attrib.pop(qn("w:firstLineChars"), None)
        ind.attrib.pop(qn("w:hangingChars"), None)

    set_runs_font(p, "宋体", 10.5, bold=bold)
    set_runs_common(p, italic=False, color_black=True)
|
||||
|
||||
|
||||
def set_table_style_like_template(table):
    """Apply the template's table look to *table*, modifying it in place.

    Table level: reference the "Table Grid" style, set the width to
    ``pct`` / 4997 and the alignment to ``center``, set default cell margins
    (120 top/bottom, 140 left/right, ``dxa``) and single ``sz=4``
    ``auto``-coloured borders on all six edges.
    Row level: height 620 with the ``atLeast`` rule.
    Cell level: vertical centring and the same single border on the four
    outer edges.

    Two things differ from a plain "append the element" approach:

    * The style is assigned through ``table.style`` so that ``w:tblStyle``
      holds the style *id* that the document defines for "Table Grid";
      writing the display name into ``w:val`` references no style at all.
    * Missing property elements are inserted where the WordprocessingML
      schema places them, because ``w:tblPr``, ``w:tcPr`` and the border /
      margin containers define their children as an ordered sequence.

    Args:
        table: python-docx ``Table`` to restyle.
    """
    tbl_pr_order = (
        "w:tblStyle", "w:tblpPr", "w:tblOverlap", "w:bidiVisual",
        "w:tblStyleRowBandSize", "w:tblStyleColBandSize", "w:tblW", "w:jc",
        "w:tblCellSpacing", "w:tblInd", "w:tblBorders", "w:shd",
        "w:tblLayout", "w:tblCellMar", "w:tblLook", "w:tblCaption",
        "w:tblDescription", "w:tblPrChange",
    )
    tc_pr_order = (
        "w:cnfStyle", "w:tcW", "w:gridSpan", "w:hMerge", "w:vMerge",
        "w:tcBorders", "w:shd", "w:noWrap", "w:tcMar", "w:textDirection",
        "w:tcFitText", "w:vAlign", "w:hideMark", "w:headers", "w:cellIns",
        "w:cellDel", "w:cellMerge", "w:tcPrChange",
    )
    # Only the revision-tracking children are required to follow w:trHeight.
    tr_pr_order = ("w:trHeight", "w:ins", "w:del", "w:trPrChange")
    edge_order = (
        "w:top", "w:start", "w:left", "w:bottom", "w:end", "w:right",
        "w:insideH", "w:insideV", "w:tl2br", "w:tr2bl",
    )

    def ensure_child(parent, tag, order):
        """Return parent's *tag* child, creating it in schema order if absent."""
        child = parent.find(qn(tag))
        if child is None:
            child = OxmlElement(tag)
            for later in order[order.index(tag) + 1:]:
                successor = parent.find(qn(later))
                if successor is not None:
                    # Insert ahead of the first sibling that must follow it.
                    successor.addprevious(child)
                    break
            else:
                parent.append(child)
        return child

    def set_single_borders(container, edges):
        """Create/overwrite the given border edges as single, auto-coloured lines."""
        for edge in edges:
            elem = ensure_child(container, f"w:{edge}", edge_order)
            elem.set(qn("w:val"), "single")
            elem.set(qn("w:sz"), "4")
            elem.set(qn("w:color"), "auto")
            elem.set(qn("w:space"), "0")

    tbl = table._tbl
    tbl_pr = tbl.find(qn("w:tblPr"))
    if tbl_pr is None:
        tbl_pr = OxmlElement("w:tblPr")
        tbl.insert(0, tbl_pr)

    try:
        table.style = "Table Grid"
    except (KeyError, ValueError):
        # The document defines no table style named "Table Grid". Keep the
        # current style reference; the explicit borders set below still
        # produce the grid look.
        pass

    tbl_w = ensure_child(tbl_pr, "w:tblW", tbl_pr_order)
    tbl_w.set(qn("w:type"), "pct")
    tbl_w.set(qn("w:w"), "4997")

    tbl_jc = ensure_child(tbl_pr, "w:jc", tbl_pr_order)
    tbl_jc.set(qn("w:val"), "center")

    tbl_cell_mar = ensure_child(tbl_pr, "w:tblCellMar", tbl_pr_order)
    for edge, width in (("top", "120"), ("bottom", "120"), ("left", "140"), ("right", "140")):
        elem = ensure_child(tbl_cell_mar, f"w:{edge}", edge_order)
        elem.set(qn("w:w"), width)
        elem.set(qn("w:type"), "dxa")

    tbl_borders = ensure_child(tbl_pr, "w:tblBorders", tbl_pr_order)
    set_single_borders(tbl_borders, ("top", "left", "bottom", "right", "insideH", "insideV"))

    for row in table.rows:
        tr_pr = row._tr.get_or_add_trPr()
        tr_height = ensure_child(tr_pr, "w:trHeight", tr_pr_order)
        tr_height.set(qn("w:val"), "620")
        tr_height.set(qn("w:hRule"), "atLeast")

        for cell in row.cells:
            tc_pr = cell._tc.get_or_add_tcPr()
            v_align = ensure_child(tc_pr, "w:vAlign", tc_pr_order)
            v_align.set(qn("w:val"), "center")

            tc_borders = ensure_child(tc_pr, "w:tcBorders", tc_pr_order)
            set_single_borders(tc_borders, ("top", "left", "bottom", "right"))
|
||||
|
||||
|
||||
def set_table_header_gray(table):
    """Shade every cell of the first row of *table* light gray (``D9D9D9``).

    Each first-row cell gets a ``w:shd`` element with ``val=clear``,
    ``color=auto`` and ``fill=D9D9D9``; an existing ``w:shd`` is overwritten.
    A table without rows is left untouched.

    Args:
        table: python-docx ``Table``; modified in place.
    """
    if not table.rows:
        return

    # Children of w:tcPr that the schema orders after w:shd. A new w:shd is
    # inserted in front of the first one present (for example w:vAlign)
    # instead of being appended behind it.
    later_tags = (
        "w:noWrap", "w:tcMar", "w:textDirection", "w:tcFitText", "w:vAlign",
        "w:hideMark", "w:headers", "w:cellIns", "w:cellDel", "w:cellMerge",
        "w:tcPrChange",
    )

    for cell in table.rows[0].cells:
        tc_pr = cell._tc.get_or_add_tcPr()
        shd = tc_pr.find(qn("w:shd"))
        if shd is None:
            shd = OxmlElement("w:shd")
            for tag in later_tags:
                successor = tc_pr.find(qn(tag))
                if successor is not None:
                    successor.addprevious(shd)
                    break
            else:
                tc_pr.append(shd)
        shd.set(qn("w:val"), "clear")
        shd.set(qn("w:color"), "auto")
        shd.set(qn("w:fill"), "D9D9D9")
|
||||
|
||||
|
||||
def cleanup_paragraph_spaces(paragraph):
    """Normalise whitespace inside *paragraph*, run by run.

    Within every run, each sequence of two or more spaces/tabs is collapsed
    to a single space. Leading spaces, tabs and full-width spaces (U+3000)
    are then stripped from the first run, and trailing ones from the last
    run. A paragraph without runs is left untouched.

    A run's text is only written back when it actually changes. Assigning
    ``run.text`` replaces the run's content, so an unconditional write would
    discard non-text content (such as an inline picture or a page break)
    even from runs whose text needs no cleanup.

    Args:
        paragraph: Object exposing ``runs``, each with a read/write ``text``.
    """
    runs = paragraph.runs
    if not runs:
        return

    def write_if_changed(run, new_text):
        if new_text != run.text:
            run.text = new_text

    for run in runs:
        current = run.text
        if current:
            write_if_changed(run, re.sub(r"[ \t]{2,}", " ", current))

    first, last = runs[0], runs[-1]
    write_if_changed(first, first.text.lstrip(" \t\u3000"))
    write_if_changed(last, last.text.rstrip(" \t\u3000"))
|
||||
|
||||
|
||||
def remove_redundant_blank_paragraphs(doc):
    """Collapse runs of directly adjacent blank paragraphs to a single one.

    A paragraph is blank when its text is empty after replacing U+3000 with
    a space and stripping, and it holds no non-text content. A blank
    paragraph is removed only when the element immediately before it is a
    blank paragraph that was kept.

    Two cases are deliberately not treated as redundant:

    * Paragraphs with empty text that carry a picture, embedded object,
      field, page/column break or section properties. Their ``text`` is
      empty, but deleting them deletes that content.
    * A blank paragraph whose previous sibling is not a paragraph (for
      example a table). ``doc.paragraphs`` skips such elements, so two
      blank paragraphs can be neighbours in that list while a table sits
      between them; removing the second would leave the following content
      directly behind the table.

    Args:
        doc: python-docx ``Document``; modified in place.
    """
    paragraph_tag = qn("w:p")
    break_tag = qn("w:br")
    break_type = qn("w:type")
    content_tags = tuple(
        qn(tag)
        for tag in ("w:drawing", "w:pict", "w:object", "w:fldChar", "w:fldSimple", "w:sectPr")
    )

    def carries_content(element):
        """Return True when the paragraph holds something besides text."""
        if next(element.iter(*content_tags), None) is not None:
            return True
        return any(
            br.get(break_type) in ("page", "column") for br in element.iter(break_tag)
        )

    prev_blank = False
    for p in list(doc.paragraphs):
        element = p._element
        text = p.text.replace("\u3000", " ").strip()
        is_blank = text == "" and not carries_content(element)

        previous = element.getprevious()
        directly_after_paragraph = previous is not None and previous.tag == paragraph_tag

        if is_blank and prev_blank and directly_after_paragraph:
            element.getparent().remove(element)
            continue
        prev_blank = is_blank
|
||||
|
||||
|
||||
def add_page_break_between_chapters(doc):
    """Make every chapter title except the first start on a new page.

    A chapter title is any body paragraph whose text (U+3000 replaced by a
    space, then stripped) starts with ``第<digits>章``. Matching is by text
    only, so any paragraph beginning that way counts. For each such
    paragraph after the first, a new paragraph holding a page break is
    inserted in front of it unless a break is already in effect:

    * the element directly before it contains a ``w:br`` of type ``page``, or
    * the title paragraph, or the style applied to it, has "page break
      before" switched on (only the directly applied style is checked, not
      the styles it is based on).

    Args:
        doc: python-docx ``Document``; modified in place.
    """
    chapter_pattern = re.compile(r"^第\s*\d+\s*章")
    break_tag = qn("w:br")
    break_type = qn("w:type")

    def is_chapter_title(paragraph):
        text = paragraph.text.replace("\u3000", " ").strip()
        return bool(text) and chapter_pattern.match(text) is not None

    def already_starts_new_page(paragraph):
        if paragraph.paragraph_format.page_break_before:
            return True
        style = paragraph.style
        if style is not None and style.paragraph_format.page_break_before:
            return True
        previous = paragraph._element.getprevious()
        if previous is None:
            return False
        return any(br.get(break_type) == "page" for br in previous.iter(break_tag))

    chapter_paragraphs = [p for p in list(doc.paragraphs) if is_chapter_title(p)]

    # The first chapter keeps its position; only later chapters are pushed
    # onto a fresh page.
    for p in chapter_paragraphs[1:]:
        if already_starts_new_page(p):
            continue
        break_paragraph = p.insert_paragraph_before("")
        break_paragraph.add_run().add_break(WD_BREAK.PAGE)
|
||||
|
||||
|
||||
def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
|
||||
ppr = paragraph._p.get_or_add_pPr()
|
||||
ind = ppr.find(qn("w:ind"))
|
||||
@@ -99,6 +236,19 @@ def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200):
|
||||
ind.set(qn("w:firstLine"), str(twips))
|
||||
ind.set(qn("w:firstLineChars"), str(chars))
|
||||
|
||||
|
||||
def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
    """Apply line spacing, zero paragraph spacing, indent and alignment.

    Args:
        paragraph: python-docx ``Paragraph``; modified in place.
        line_spacing: Value assigned to ``paragraph_format.line_spacing``.
        first_line_pt: First-line indent in points. ``None`` leaves the
            indent untouched. A positive value is written as the absolute
            indent and, through ``set_first_line_two_chars``, as a
            two-character indent. Zero or a negative value only sets the
            absolute indent and removes any character-based indent.
        align: Optional alignment assigned to ``paragraph.alignment``;
            ``None`` leaves the alignment untouched.
    """
    fmt = paragraph.paragraph_format
    fmt.line_spacing = line_spacing
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)

    if first_line_pt is not None:
        fmt.first_line_indent = Pt(first_line_pt)
        if first_line_pt > 0:
            # Pass the requested size on (1 pt = 20 twips). With the helper's
            # default of 420 twips the value written just above would be
            # overwritten for every caller.
            set_first_line_two_chars(paragraph, twips=round(first_line_pt * 20))
        else:
            # No first-line indent requested: a leftover character-based
            # indent would take precedence over the absolute value.
            ind = paragraph._p.get_or_add_pPr().find(qn("w:ind"))
            if ind is not None:
                ind.attrib.pop(qn("w:firstLineChars"), None)

    if align is not None:
        paragraph.alignment = align
|
||||
|
||||
|
||||
def format_paragraph(p):
|
||||
style_name = p.style.name if p.style is not None else ""
|
||||
if style_name == "Heading 1":
|
||||
@@ -109,10 +259,10 @@ def format_paragraph(p):
|
||||
set_runs_font(p, "黑体", 16, True)
|
||||
elif style_name == "Heading 3":
|
||||
apply_para_format(p, 1.5, 28)
|
||||
set_runs_font(p, "宋体", 14, True)
|
||||
set_runs_font(p, "黑体", 14, True)
|
||||
elif style_name == "Heading 4":
|
||||
apply_para_format(p, 1.5, 24)
|
||||
set_runs_font(p, "宋体", 12, True)
|
||||
set_runs_font(p, "黑体", 14, True)
|
||||
set_runs_common(p, italic=False, color_black=True)
|
||||
elif is_numbered_paragraph(p) or style_name.startswith("List Number"):
|
||||
p.paragraph_format.line_spacing = 1.5
|
||||
@@ -124,17 +274,6 @@ def format_paragraph(p):
|
||||
set_runs_common(p, color_black=True)
|
||||
|
||||
|
||||
def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None):
    """Apply line spacing, zero paragraph spacing, indent and alignment.

    Args:
        paragraph: python-docx ``Paragraph``; modified in place.
        line_spacing: Value assigned to ``paragraph_format.line_spacing``.
        first_line_pt: First-line indent in points. ``None`` leaves the
            indent untouched. A positive value is written as the absolute
            indent and, through ``set_first_line_two_chars``, as a
            two-character indent. Zero or a negative value only sets the
            absolute indent and removes any character-based indent.
        align: Optional alignment assigned to ``paragraph.alignment``;
            ``None`` leaves the alignment untouched.
    """
    fmt = paragraph.paragraph_format
    fmt.line_spacing = line_spacing
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)

    if first_line_pt is not None:
        fmt.first_line_indent = Pt(first_line_pt)
        if first_line_pt > 0:
            # Pass the requested size on (1 pt = 20 twips). With the helper's
            # default of 420 twips the value written just above would be
            # overwritten for every caller.
            set_first_line_two_chars(paragraph, twips=round(first_line_pt * 20))
        else:
            # No first-line indent requested: a leftover character-based
            # indent would take precedence over the absolute value.
            ind = paragraph._p.get_or_add_pPr().find(qn("w:ind"))
            if ind is not None:
                ind.attrib.pop(qn("w:firstLineChars"), None)

    if align is not None:
        paragraph.alignment = align
|
||||
def set_page_layout(doc):
|
||||
for section in doc.sections:
|
||||
section.page_width = Cm(21.0)
|
||||
@@ -146,6 +285,7 @@ def set_page_layout(doc):
|
||||
section.header_distance = Cm(1.5)
|
||||
section.footer_distance = Cm(1.75)
|
||||
|
||||
|
||||
def main():
|
||||
doc = Document(SRC)
|
||||
|
||||
@@ -155,40 +295,44 @@ def main():
|
||||
h3 = doc.styles["Heading 3"]
|
||||
h4 = doc.styles["Heading 4"]
|
||||
|
||||
# 正文:宋体五号(10.5pt),首行缩进2字符(约21pt),1.5倍行距
|
||||
set_style_font(normal, "宋体", 10.5)
|
||||
normal.paragraph_format.line_spacing = 1.5
|
||||
normal.paragraph_format.first_line_indent = Pt(21)
|
||||
|
||||
# 标题1:黑体二号,加粗,居中,1.5倍行距
|
||||
set_style_font(h1, "黑体", 22, True)
|
||||
h1.paragraph_format.line_spacing = 1.5
|
||||
h1.paragraph_format.first_line_indent = Pt(0)
|
||||
|
||||
# 标题2:黑体三号,加粗,首行缩进2字符,1.5倍行距
|
||||
set_style_font(h2, "黑体", 16, True)
|
||||
h2.paragraph_format.line_spacing = 1.5
|
||||
h2.paragraph_format.first_line_indent = Pt(32)
|
||||
|
||||
# 标题3:黑体四号,加粗,首行缩进2字符,1.5倍行距
|
||||
set_style_font(h3, "黑体", 14, True)
|
||||
h3.paragraph_format.line_spacing = 1.5
|
||||
h3.paragraph_format.first_line_indent = Pt(28)
|
||||
|
||||
# 标题4:加粗,取消斜体,黑色,1.5倍行距
|
||||
set_style_font(h4, "黑体", 14, True)
|
||||
h4.font.italic = False
|
||||
h4.paragraph_format.line_spacing = 1.5
|
||||
h4.paragraph_format.first_line_indent = Pt(24)
|
||||
|
||||
set_page_layout(doc)
|
||||
|
||||
for p in doc.paragraphs:
|
||||
format_paragraph(p)
|
||||
cleanup_paragraph_spaces(p)
|
||||
|
||||
for t in doc.tables:
|
||||
set_table_all_borders_black(t)
|
||||
for p in iter_table_paragraphs(t):
|
||||
format_paragraph(p)
|
||||
set_table_style_like_template(t)
|
||||
set_table_header_gray(t)
|
||||
for row_index, row in enumerate(t.rows):
|
||||
for cell in row.cells:
|
||||
for p in cell.paragraphs:
|
||||
format_table_paragraph(p, bold=(row_index == 0))
|
||||
cleanup_paragraph_spaces(p)
|
||||
|
||||
remove_redundant_blank_paragraphs(doc)
|
||||
add_page_break_between_chapters(doc)
|
||||
doc.save(DST)
|
||||
print(DST)
|
||||
|
||||
|
||||
97
example/markdown_to_docx.py
Normal file
97
example/markdown_to_docx.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
from docx import Document
|
||||
|
||||
|
||||
# Resolve both files relative to this script instead of one machine's
# absolute path, so the tool also runs from another checkout location.
# NOTE(review): assumes the Markdown source sits next to this script, as the
# previous absolute path (".../example/...") suggests - confirm.
_SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_MD = _SCRIPT_DIR / "萌贝母婴商城毕业论文初稿-2026版.md"
OUTPUT_DOCX = _SCRIPT_DIR / "萌贝母婴商城毕业论文初稿-2026版.docx"
|
||||
|
||||
|
||||
def is_table_separator(line: str) -> bool:
    """Return True when *line* is a Markdown table delimiter row.

    The line must start with ``|`` (after stripping surrounding whitespace);
    a closing ``|`` is optional. Every cell between the pipes must consist
    of one or more ``-`` with an optional ``:`` on either side, ignoring
    surrounding whitespace, e.g. ``| --- | :-: | ---: |``.

    Requiring a dash in every cell keeps empty data rows such as ``| | |``
    and colon-only rows from being mistaken for a delimiter row.
    """
    stripped = line.strip()
    if not stripped.startswith("|"):
        return False

    inner = stripped[1:]
    if inner.endswith("|"):
        inner = inner[:-1]

    return all(
        re.fullmatch(r":?-+:?", cell.strip()) is not None
        for cell in inner.split("|")
    )
|
||||
|
||||
|
||||
def split_table_row(line: str) -> list[str]:
    """Split one Markdown table row into its stripped cell texts.

    At most one leading and one trailing ``|`` are treated as the row's
    outer borders. Removing every outer pipe would drop empty first or last
    cells (``||a|`` has an empty first cell) and shift the remaining cells
    into the wrong columns.
    """
    raw = line.strip()
    if raw.startswith("|"):
        raw = raw[1:]
    if raw.endswith("|"):
        raw = raw[:-1]
    return [cell.strip() for cell in raw.split("|")]
|
||||
|
||||
|
||||
def convert_markdown_to_docx(md_text: str, doc: Document) -> None:
    """Append the content of *md_text* to *doc*, one Markdown line at a time.

    Supported constructs, checked in this order for every line:

    * blank line -> an empty paragraph;
    * table -> a line starting with ``|`` that is followed by a delimiter
      row, plus all directly following lines starting with ``|``;
    * ``#`` .. ``######`` heading -> ``doc.add_heading`` with the level
      capped at 4;
    * ``1. text`` -> paragraph with style "List Number";
    * ``- text`` -> paragraph with style "List Bullet";
    * anything else -> a plain paragraph holding the stripped line.

    A table gets as many columns as its widest row, so body rows with more
    cells than the header are not silently truncated; shorter rows are
    padded with empty cells.

    Args:
        md_text: Markdown source text.
        doc: python-docx document that receives the content.
    """
    # Compiled once instead of on every line.
    heading_re = re.compile(r"^(#{1,6})\s+(.*)$")
    ordered_re = re.compile(r"^\d+\.\s+")

    lines = md_text.splitlines()
    total = len(lines)
    i = 0

    while i < total:
        stripped = lines[i].strip()

        if not stripped:
            doc.add_paragraph("")
            i += 1
            continue

        # Table block: header row, delimiter row, then the body rows.
        if stripped.startswith("|") and i + 1 < total and is_table_separator(lines[i + 1]):
            grid = [split_table_row(lines[i])]
            i += 2
            while i < total and lines[i].strip().startswith("|"):
                grid.append(split_table_row(lines[i]))
                i += 1

            cols = max(1, max(len(cells) for cells in grid))
            table = doc.add_table(rows=1, cols=cols)
            for row_index, values in enumerate(grid):
                row_cells = table.rows[0].cells if row_index == 0 else table.add_row().cells
                padded = values + [""] * (cols - len(values))
                for cell, value in zip(row_cells, padded):
                    cell.text = value
            continue

        heading_match = heading_re.match(stripped)
        if heading_match:
            level = min(4, len(heading_match.group(1)))
            doc.add_heading(heading_match.group(2).strip(), level=level)
            i += 1
            continue

        ordered_match = ordered_re.match(stripped)
        if ordered_match:
            # Drop the "N. " marker; the list style supplies the numbering.
            doc.add_paragraph(stripped[ordered_match.end():], style="List Number")
            i += 1
            continue

        if stripped.startswith("- "):
            doc.add_paragraph(stripped[2:].strip(), style="List Bullet")
            i += 1
            continue

        doc.add_paragraph(stripped)
        i += 1
|
||||
|
||||
|
||||
def main() -> None:
    """Convert the Markdown file at INPUT_MD into a .docx saved at OUTPUT_DOCX.

    The source is read as UTF-8, converted into a fresh document, saved, and
    the output path is printed.
    """
    source_text = INPUT_MD.read_text(encoding="utf-8")
    document = Document()
    convert_markdown_to_docx(source_text, document)
    document.save(OUTPUT_DOCX)
    print(OUTPUT_DOCX)


if __name__ == "__main__":
    main()
|
||||
Binary file not shown.
Reference in New Issue
Block a user