diff --git a/example/format_thesis_docx.py b/example/format_thesis_docx.py index 14ec1d9..66caf4d 100644 --- a/example/format_thesis_docx.py +++ b/example/format_thesis_docx.py @@ -1,9 +1,9 @@ from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm, Pt, RGBColor - +import re SRC = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx" DST = r"/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx" @@ -63,25 +63,6 @@ def is_numbered_paragraph(paragraph) -> bool: return ppr.numPr is not None -def set_table_all_borders_black(table): - for row in table.rows: - for cell in row.cells: - tc = cell._tc - tc_pr = tc.get_or_add_tcPr() - tc_borders = tc_pr.find(qn("w:tcBorders")) - if tc_borders is None: - tc_borders = OxmlElement("w:tcBorders") - tc_pr.append(tc_borders) - for edge in ("top", "left", "bottom", "right", "insideH", "insideV"): - edge_tag = qn(f"w:{edge}") - elem = tc_borders.find(edge_tag) - if elem is None: - elem = OxmlElement(f"w:{edge}") - tc_borders.append(elem) - elem.set(qn("w:val"), "single") - elem.set(qn("w:sz"), "4") - elem.set(qn("w:color"), "000000") - elem.set(qn("w:space"), "0") def iter_table_paragraphs(table): for row in table.rows: for cell in row.cells: @@ -90,6 +71,162 @@ def iter_table_paragraphs(table): for t in cell.tables: yield from iter_table_paragraphs(t) + +def format_table_paragraph(p, bold: bool = False): + p.alignment = WD_ALIGN_PARAGRAPH.CENTER + fmt = p.paragraph_format + fmt.line_spacing = 1.0 + fmt.space_before = Pt(0) + fmt.space_after = Pt(0) + fmt.first_line_indent = Pt(0) + set_runs_font(p, "宋体", 10.5, bold=bold) + set_runs_common(p, italic=False, color_black=True) + + +def set_table_style_like_template(table): + tbl = table._tbl + tbl_pr = tbl.tblPr + if tbl_pr is None: + tbl_pr = OxmlElement("w:tblPr") + tbl.insert(0, tbl_pr) + + tbl_style = tbl_pr.find(qn("w:tblStyle")) + if tbl_style is None: + tbl_style = OxmlElement("w:tblStyle") + tbl_pr.append(tbl_style) + tbl_style.set(qn("w:val"), "Table Grid") + + tbl_w = tbl_pr.find(qn("w:tblW")) + if tbl_w is None: + tbl_w = OxmlElement("w:tblW") + tbl_pr.append(tbl_w) + tbl_w.set(qn("w:type"), "pct") + tbl_w.set(qn("w:w"), "4997") + + tbl_jc = tbl_pr.find(qn("w:jc")) + if tbl_jc is None: + tbl_jc = OxmlElement("w:jc") + tbl_pr.append(tbl_jc) + tbl_jc.set(qn("w:val"), "center") + + tbl_cell_mar = tbl_pr.find(qn("w:tblCellMar")) + if tbl_cell_mar is None: + tbl_cell_mar = OxmlElement("w:tblCellMar") + tbl_pr.append(tbl_cell_mar) + for edge, width in (("top", "120"), ("bottom", "120"), ("left", "140"), ("right", "140")): + elem = tbl_cell_mar.find(qn(f"w:{edge}")) + if elem is None: + elem = OxmlElement(f"w:{edge}") + tbl_cell_mar.append(elem) + elem.set(qn("w:w"), width) + elem.set(qn("w:type"), "dxa") + + tbl_borders = tbl_pr.find(qn("w:tblBorders")) + if tbl_borders is None: + tbl_borders = OxmlElement("w:tblBorders") + tbl_pr.append(tbl_borders) + for edge in ("top", "left", "bottom", "right", "insideH", "insideV"): + elem = tbl_borders.find(qn(f"w:{edge}")) + if elem is None: + elem = OxmlElement(f"w:{edge}") + tbl_borders.append(elem) + elem.set(qn("w:val"), "single") + elem.set(qn("w:sz"), "4") + elem.set(qn("w:color"), "auto") + elem.set(qn("w:space"), "0") + + for row in table.rows: + tr_pr = row._tr.get_or_add_trPr() + tr_height = tr_pr.find(qn("w:trHeight")) + if tr_height is None: + tr_height = OxmlElement("w:trHeight") + tr_pr.append(tr_height) + tr_height.set(qn("w:val"), "620") + tr_height.set(qn("w:hRule"), "atLeast") + + for cell in row.cells: + tc_pr = cell._tc.get_or_add_tcPr() + v_align = tc_pr.find(qn("w:vAlign")) + if v_align is None: + v_align = OxmlElement("w:vAlign") + tc_pr.append(v_align) + v_align.set(qn("w:val"), "center") + + tc_borders = tc_pr.find(qn("w:tcBorders")) + if tc_borders is None: + tc_borders = OxmlElement("w:tcBorders") + tc_pr.append(tc_borders) + for edge in ("top", "left", "bottom", "right"): + elem = tc_borders.find(qn(f"w:{edge}")) + if elem is None: + elem = OxmlElement(f"w:{edge}") + tc_borders.append(elem) + elem.set(qn("w:val"), "single") + elem.set(qn("w:sz"), "4") + elem.set(qn("w:color"), "auto") + elem.set(qn("w:space"), "0") + + +def set_table_header_gray(table): + if not table.rows: + return + for cell in table.rows[0].cells: + tc_pr = cell._tc.get_or_add_tcPr() + shd = tc_pr.find(qn("w:shd")) + if shd is None: + shd = OxmlElement("w:shd") + tc_pr.append(shd) + shd.set(qn("w:val"), "clear") + shd.set(qn("w:color"), "auto") + shd.set(qn("w:fill"), "D9D9D9") + + +def cleanup_paragraph_spaces(paragraph): + runs = paragraph.runs + if not runs: + return + for run in runs: + if run.text: + run.text = re.sub(r"[ \t]{2,}", " ", run.text) + runs[0].text = runs[0].text.lstrip(" \t\u3000") + runs[-1].text = runs[-1].text.rstrip(" \t\u3000") + + +def remove_redundant_blank_paragraphs(doc): + prev_blank = False + for p in list(doc.paragraphs): + text = p.text.replace("\u3000", " ").strip() + is_blank = text == "" + if is_blank and prev_blank: + p._element.getparent().remove(p._element) + continue + prev_blank = is_blank + + +def add_page_break_between_chapters(doc): + chapter_pattern = re.compile(r"^第\s*\d+\s*章") + chapter_paragraphs = [] + for p in list(doc.paragraphs): + text = p.text.replace("\u3000", " ").strip() + if not text or not chapter_pattern.match(text): + continue + chapter_paragraphs.append(p) + + for index, p in enumerate(chapter_paragraphs): + if index == 0: + continue + prev = p._element.getprevious() + has_page_break = False + if prev is not None: + for br in prev.findall('.//w:br', {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}): + if br.attrib.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type') == 'page': + has_page_break = True + break + if not has_page_break: + break_paragraph = p.insert_paragraph_before("") + break_paragraph.add_run().add_break(WD_BREAK.PAGE) + + def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200): ppr = paragraph._p.get_or_add_pPr() ind = ppr.find(qn("w:ind")) @@ -99,6 +236,19 @@ def set_first_line_two_chars(paragraph, twips: int = 420, chars: int = 200): ind.set(qn("w:firstLine"), str(twips)) ind.set(qn("w:firstLineChars"), str(chars)) + +def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None): + fmt = paragraph.paragraph_format + fmt.line_spacing = line_spacing + fmt.space_before = Pt(0) + fmt.space_after = Pt(0) + if first_line_pt is not None: + fmt.first_line_indent = Pt(first_line_pt) + set_first_line_two_chars(paragraph) + if align is not None: + paragraph.alignment = align + + def format_paragraph(p): style_name = p.style.name if p.style is not None else "" if style_name == "Heading 1": @@ -109,10 +259,10 @@ def format_paragraph(p): set_runs_font(p, "黑体", 16, True) elif style_name == "Heading 3": apply_para_format(p, 1.5, 28) - set_runs_font(p, "宋体", 14, True) + set_runs_font(p, "黑体", 14, True) elif style_name == "Heading 4": apply_para_format(p, 1.5, 24) - set_runs_font(p, "宋体", 12, True) + set_runs_font(p, "黑体", 14, True) set_runs_common(p, italic=False, color_black=True) elif is_numbered_paragraph(p) or style_name.startswith("List Number"): p.paragraph_format.line_spacing = 1.5 @@ -124,17 +274,6 @@ def format_paragraph(p): set_runs_common(p, color_black=True) -def apply_para_format(paragraph, line_spacing: float, first_line_pt: float | None = None, align=None): - fmt = paragraph.paragraph_format - fmt.line_spacing = line_spacing - fmt.space_before = Pt(0) - fmt.space_after = Pt(0) - if first_line_pt is not None: - fmt.first_line_indent = Pt(first_line_pt) - set_first_line_two_chars(paragraph) - - if align is not None: - paragraph.alignment = align def set_page_layout(doc): for section in doc.sections: section.page_width = Cm(21.0) @@ -146,6 +285,7 @@ def set_page_layout(doc): section.header_distance = Cm(1.5) section.footer_distance = Cm(1.75) + def main(): doc = Document(SRC) @@ -155,40 +295,44 @@ def main(): h3 = doc.styles["Heading 3"] h4 = doc.styles["Heading 4"] - # 正文:宋体小四,首行缩进2字符(约24pt),1.5倍行距 set_style_font(normal, "宋体", 10.5) normal.paragraph_format.line_spacing = 1.5 normal.paragraph_format.first_line_indent = Pt(21) - # 标题1:黑体二号,加粗,居中,1.5倍行距 set_style_font(h1, "黑体", 22, True) h1.paragraph_format.line_spacing = 1.5 h1.paragraph_format.first_line_indent = Pt(0) - # 标题2:黑体三号,加粗,首行缩进2字符,1.5倍行距 set_style_font(h2, "黑体", 16, True) h2.paragraph_format.line_spacing = 1.5 h2.paragraph_format.first_line_indent = Pt(32) - # 标题3:宋体四号,加粗,首行缩进2字符,1.5倍行距 set_style_font(h3, "黑体", 14, True) h3.paragraph_format.line_spacing = 1.5 h3.paragraph_format.first_line_indent = Pt(28) - # 标题4:加粗,取消斜体,黑色,1.5倍行距 set_style_font(h4, "黑体", 14, True) h4.font.italic = False h4.paragraph_format.line_spacing = 1.5 h4.paragraph_format.first_line_indent = Pt(24) + set_page_layout(doc) + for p in doc.paragraphs: format_paragraph(p) + cleanup_paragraph_spaces(p) for t in doc.tables: - set_table_all_borders_black(t) - for p in iter_table_paragraphs(t): - format_paragraph(p) + set_table_style_like_template(t) + set_table_header_gray(t) + for row_index, row in enumerate(t.rows): + for cell in row.cells: + for p in cell.paragraphs: + format_table_paragraph(p, bold=(row_index == 0)) + cleanup_paragraph_spaces(p) + remove_redundant_blank_paragraphs(doc) + add_page_break_between_chapters(doc) doc.save(DST) print(DST) diff --git a/example/markdown_to_docx.py b/example/markdown_to_docx.py new file mode 100644 index 0000000..94ea066 --- /dev/null +++ b/example/markdown_to_docx.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from pathlib import Path +import re +from docx import Document + + +INPUT_MD = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.md") +OUTPUT_DOCX = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx") + + +def is_table_separator(line: str) -> bool: + stripped = line.strip() + if not stripped.startswith("|"): + return False + core = stripped.strip("|").replace(" ", "") + return bool(core) and all(ch in "-:|" for ch in core) + + +def split_table_row(line: str) -> list[str]: + raw = line.strip().strip("|") + return [cell.strip() for cell in raw.split("|")] + + +def convert_markdown_to_docx(md_text: str, doc: Document) -> None: + lines = md_text.splitlines() + i = 0 + + while i < len(lines): + line = lines[i] + stripped = line.strip() + + if not stripped: + doc.add_paragraph("") + i += 1 + continue + + # Table block + if stripped.startswith("|") and i + 1 < len(lines) and is_table_separator(lines[i + 1]): + headers = split_table_row(lines[i]) + i += 2 + rows: list[list[str]] = [] + while i < len(lines): + row_line = lines[i].strip() + if not row_line.startswith("|"): + break + rows.append(split_table_row(lines[i])) + i += 1 + + cols = max(1, len(headers)) + table = doc.add_table(rows=1, cols=cols) + for c in range(cols): + table.cell(0, c).text = headers[c] if c < len(headers) else "" + + for row in rows: + cells = table.add_row().cells + for c in range(cols): + cells[c].text = row[c] if c < len(row) else "" + continue + + # Heading + heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped) + if heading_match: + level = min(4, len(heading_match.group(1))) + text = heading_match.group(2).strip() + doc.add_heading(text, level=level) + i += 1 + continue + + # Ordered list + if re.match(r"^\d+\.\s+", stripped): + text = re.sub(r"^\d+\.\s+", "", stripped) + doc.add_paragraph(text, style="List Number") + i += 1 + continue + + # Unordered list + if stripped.startswith("- "): + doc.add_paragraph(stripped[2:].strip(), style="List Bullet") + i += 1 + continue + + # Plain paragraph + doc.add_paragraph(stripped) + i += 1 + + +def main() -> None: + md_text = INPUT_MD.read_text(encoding="utf-8") + doc = Document() + convert_markdown_to_docx(md_text, doc) + doc.save(OUTPUT_DOCX) + print(OUTPUT_DOCX) + + +if __name__ == "__main__": + main() diff --git a/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx b/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx deleted file mode 100644 index 35970cc..0000000 Binary files a/example/萌贝母婴商城毕业论文初稿-2026版-排版.docx and /dev/null differ