# cuimengxue/example/markdown_to_docx.py

from __future__ import annotations

from pathlib import Path
import re
from docx import Document


# Markdown file that main() reads (as UTF-8) and converts.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
INPUT_MD = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.md")
# Destination path that main() passes to Document.save() and then prints.
OUTPUT_DOCX = Path("/Users/apple/code/bs/mying/example/萌贝母婴商城毕业论文初稿-2026版.docx")


def is_table_separator(line: str) -> bool:
    """Return True when *line* looks like the delimiter row of a pipe table.

    Ignoring surrounding whitespace, the line must begin with ``|``. After its
    outer pipes and every space character are removed, what remains must be
    non-empty and made up solely of ``-``, ``:`` and ``|`` characters.
    """
    candidate = line.strip()
    if candidate[:1] != "|":
        return False

    remainder = candidate.strip("|").replace(" ", "")
    if not remainder:
        # Nothing but pipes and spaces: not a delimiter row.
        return False
    return set(remainder) <= {"-", ":", "|"}


def split_table_row(line: str) -> list[str]:
    """Split one pipe-table row into its trimmed cell texts.

    Surrounding whitespace is ignored. At most one leading and one trailing
    ``|`` are treated as the row's outer borders, so empty first/last cells
    written without spaces (``||a|``) are kept rather than swallowed. A pipe
    escaped as ``\\|`` is not a cell delimiter and is returned as a literal
    ``|`` inside the cell text.

    Args:
        line: Raw text of a single table row.

    Returns:
        The cell contents, each stripped of surrounding whitespace. Always
        contains at least one element (possibly an empty string).
    """
    raw = line.strip()
    # Remove only the single outer border pipe on each side; str.strip("|")
    # would also eat the delimiters of adjacent empty cells.
    if raw.startswith("|"):
        raw = raw[1:]
    if raw.endswith("|") and not raw.endswith("\\|"):
        raw = raw[:-1]
    # Split on pipes that are not backslash-escaped, then unescape the rest.
    return [cell.replace("\\|", "|").strip() for cell in re.split(r"(?<!\\)\|", raw)]


# Opening code fence: three or more backticks (with no further backtick on the
# line, which would make it inline code) or three or more tildes.
_FENCE_RE = re.compile(r"^(`{3,}(?!.*`)|~{3,})")
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
_ORDERED_ITEM_RE = re.compile(r"^\d+\.\s+")
_BULLET_ITEM_RE = re.compile(r"^[-*+]\s+")


def convert_markdown_to_docx(md_text: str, doc: Document) -> None:
    """Append a line-by-line rendering of *md_text* to *doc*.

    Each line is classified, in this order, as:

    * blank line -> an empty paragraph;
    * fenced code block (```` ``` ```` or ``~~~``) -> one plain paragraph per
      enclosed line, indentation preserved, fence markers dropped; the content
      is never interpreted as headings, lists or tables;
    * pipe table (a ``|`` row followed by a delimiter row) -> a table whose
      column count is taken from the header row; shorter rows are padded with
      empty cells and extra cells are ignored;
    * ATX heading (``#`` .. ``######``) -> a heading, level capped at 4;
    * ``1. `` item -> paragraph with the "List Number" style;
    * ``- ``, ``* `` or ``+ `` item -> paragraph with the "List Bullet" style;
    * anything else -> a plain paragraph of the stripped line.

    Inline markup (bold, links, ...) is not interpreted.

    Args:
        md_text: Markdown source text.
        doc: Target document; it is mutated in place through its
            ``add_paragraph``, ``add_heading`` and ``add_table`` methods.
    """
    lines = md_text.splitlines()
    total = len(lines)
    i = 0

    while i < total:
        line = lines[i]
        stripped = line.strip()

        if not stripped:
            doc.add_paragraph("")
            i += 1
            continue

        # Fenced code block: emit the body verbatim so that code such as
        # "# comment" or "- flag" is not mistaken for Markdown structure.
        fence = _FENCE_RE.match(stripped)
        if fence:
            opener = fence.group(1)
            i += 1
            while i < total:
                body_line = lines[i]
                marker = body_line.strip()
                # A closing fence uses the same character, is at least as long
                # as the opener and carries nothing else on the line.
                if marker and set(marker) == {opener[0]} and len(marker) >= len(opener):
                    i += 1
                    break
                doc.add_paragraph(body_line.rstrip())
                i += 1
            continue

        # Table block: header row immediately followed by a delimiter row.
        if stripped.startswith("|") and i + 1 < total and is_table_separator(lines[i + 1]):
            headers = split_table_row(line)
            i += 2  # skip the header and the delimiter row
            rows: list[list[str]] = []
            while i < total and lines[i].strip().startswith("|"):
                rows.append(split_table_row(lines[i]))
                i += 1

            cols = max(1, len(headers))
            table = doc.add_table(rows=1, cols=cols)
            for c in range(cols):
                table.cell(0, c).text = headers[c] if c < len(headers) else ""

            for row in rows:
                cells = table.add_row().cells
                for c in range(cols):
                    # Pad short rows; cells beyond the header width are dropped.
                    cells[c].text = row[c] if c < len(row) else ""
            continue

        # Heading
        heading_match = _HEADING_RE.match(stripped)
        if heading_match:
            # Levels 5 and 6 are folded into level 4.
            level = min(4, len(heading_match.group(1)))
            doc.add_heading(heading_match.group(2).strip(), level=level)
            i += 1
            continue

        # Ordered list
        ordered_match = _ORDERED_ITEM_RE.match(stripped)
        if ordered_match:
            doc.add_paragraph(stripped[ordered_match.end():], style="List Number")
            i += 1
            continue

        # Unordered list ("-", "*" or "+" marker)
        bullet_match = _BULLET_ITEM_RE.match(stripped)
        if bullet_match:
            doc.add_paragraph(stripped[bullet_match.end():].strip(), style="List Bullet")
            i += 1
            continue

        # Plain paragraph
        doc.add_paragraph(stripped)
        i += 1


def main() -> None:
    """Convert the Markdown file at INPUT_MD into a Word file at OUTPUT_DOCX.

    The input is read as UTF-8, rendered into a freshly created ``Document``
    by ``convert_markdown_to_docx``, saved, and the output path is printed.
    """
    source_text = INPUT_MD.read_text(encoding="utf-8")
    document = Document()
    convert_markdown_to_docx(source_text, document)
    document.save(OUTPUT_DOCX)
    print(OUTPUT_DOCX)


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()