nursing-home/thesis/generate_docx.py

#!/usr/bin/env python3
"""
Generate formatted DOCX thesis from markdown files.
Matches the style of the reference document (2106090117-佟欣鑫-论文.docx).

Formatting spec (from reference analysis):
- Page: A4 (11906x16838 twips), margins 2.5cm all sides
- Normal text: 宋体/Times New Roman, 小四(12pt/sz=24), line spacing 1.5x(360twips), first-line indent 2chars
- Heading 1 (章): 黑体/Times New Roman, 二号(22pt/sz=44), bold, centered, spacing before/after
- Heading 2 (节): 黑体/Arial, 小三(15pt/sz=30 half-pt), bold, left
- Heading 3 (小节): 黑体, 四号(14pt/sz=28), bold, left
- Title (摘要/Abstract): 黑体, 小三(15pt/sz=30), bold, centered
- Caption: 黑体, 五号(10.5pt/sz=21)
- Header: 大连科技学院2026届本科毕业设计（论文）
- Footer: page numbers
- TOC styles: toc1=黑体14pt, toc2=宋体14pt indent
"""

import os
import re
import sys
from docx import Document
from docx.shared import Pt, Cm, Inches, Twips, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.section import WD_ORIENT
from docx.oxml.ns import qn, nsdecls
from docx.oxml import parse_xml
from lxml import etree

# Absolute path of the directory that contains this script. The markdown
# sources are read from it and the generated DOCX is written into it.
THESIS_DIR = os.path.dirname(os.path.abspath(__file__))
# 'diagrams' sub-directory of THESIS_DIR (not referenced by the functions
# visible in this file).
DIAGRAMS_DIR = os.path.join(THESIS_DIR, 'diagrams')

# ─── Helper functions ───

def set_run_font(run, cn_font='宋体', en_font='Times New Roman', size=Pt(12), bold=False, italic=False):
    """Apply size, weight, slant and typefaces to a run.

    ``en_font`` is assigned through ``run.font.name`` and written to the
    ``w:ascii`` / ``w:hAnsi`` attributes of the run's ``w:rFonts`` element;
    ``cn_font`` is written to its ``w:eastAsia`` attribute.
    """
    font = run.font
    font.size = size
    font.bold = bold
    font.italic = italic
    font.name = en_font

    run_elm = run._element
    run_props = run_elm.find(qn('w:rPr'))
    if run_props is None:
        run_props = parse_xml(f'<w:rPr {nsdecls("w")}></w:rPr>')
        run_elm.insert(0, run_props)

    fonts_elm = run_props.find(qn('w:rFonts'))
    if fonts_elm is None:
        fonts_elm = parse_xml(f'<w:rFonts {nsdecls("w")}/>')
        run_props.insert(0, fonts_elm)

    typeface_by_attr = (
        ('w:eastAsia', cn_font),
        ('w:ascii', en_font),
        ('w:hAnsi', en_font),
    )
    for attr_name, typeface in typeface_by_attr:
        fonts_elm.set(qn(attr_name), typeface)


def set_paragraph_spacing(paragraph, line_spacing=360, before=0, after=0, first_line_chars=None, first_line=None):
    """Set line spacing, space before/after and first-line indent on a paragraph.

    The paragraph's ``w:pPr`` element is edited in place; nothing is returned.

    Args:
        paragraph: python-docx paragraph to modify.
        line_spacing: value for ``w:spacing/@w:line``, written together with
            ``w:lineRule="auto"`` (under that rule 240 is single spacing, so
            the default 360 is 1.5 lines). A falsy value leaves both
            attributes untouched.
        before: value for ``w:spacing/@w:before``; skipped when falsy.
        after: value for ``w:spacing/@w:after``; skipped when falsy.
        first_line_chars: value for ``w:ind/@w:firstLineChars``; skipped when
            falsy.
        first_line: value for ``w:ind/@w:firstLine``; skipped when falsy.
    """
    # Use python-docx's get_or_add_* accessors instead of a plain append():
    # they insert each child at its schema position inside w:pPr, so
    # w:spacing and w:ind end up before an existing w:jc (callers usually set
    # the alignment first) rather than after it.
    pPr = paragraph._element.get_or_add_pPr()

    # Spacing
    spacing = pPr.get_or_add_spacing()
    if line_spacing:
        spacing.set(qn('w:line'), str(line_spacing))
        spacing.set(qn('w:lineRule'), 'auto')
    if before:
        spacing.set(qn('w:before'), str(before))
    if after:
        spacing.set(qn('w:after'), str(after))

    # Indentation: only create w:ind when an indent was actually requested.
    if first_line_chars or first_line:
        ind = pPr.get_or_add_ind()
        if first_line_chars:
            ind.set(qn('w:firstLineChars'), str(first_line_chars))
        if first_line:
            ind.set(qn('w:firstLine'), str(first_line))


def add_body_paragraph(doc, text, cn_font='宋体', en_font='Times New Roman', size=Pt(12),
                       bold=False, alignment=None, first_line_indent=True, line_spacing=360):
    """Append a body-text paragraph to ``doc`` and return it.

    Args:
        doc: document (or other container with ``add_paragraph``) to append to.
        text: paragraph text, placed in a single run.
        cn_font: East-Asian typeface passed to ``set_run_font``.
        en_font: Latin typeface passed to ``set_run_font``.
        size: font size passed to ``set_run_font``.
        bold: whether the run is bold.
        alignment: paragraph alignment; ``None`` leaves the alignment unset.
        first_line_indent: when true, a first-line indent of
            ``firstLineChars=200`` / ``firstLine=480`` is applied.
        line_spacing: forwarded to ``set_paragraph_spacing``.

    Returns:
        The newly added paragraph.
    """
    p = doc.add_paragraph()
    # Compare against None rather than relying on truthiness:
    # WD_ALIGN_PARAGRAPH.LEFT has the integer value 0 and would otherwise be
    # silently ignored.
    if alignment is not None:
        p.alignment = alignment
    run = p.add_run(text)
    set_run_font(run, cn_font, en_font, size, bold)
    if first_line_indent:
        set_paragraph_spacing(p, line_spacing=line_spacing, first_line_chars=200, first_line=480)
    else:
        set_paragraph_spacing(p, line_spacing=line_spacing)
    return p


def add_heading_chapter(doc, text):
    """Append a chapter-level heading paragraph and return it.

    Rendered centred in 黑体 (SimHei) / Times New Roman, 22 pt bold, with
    spacing values line=360, before=312, after=312.
    """
    chapter_par = doc.add_paragraph()
    chapter_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    set_run_font(
        chapter_par.add_run(text),
        cn_font='黑体',
        en_font='Times New Roman',
        size=Pt(22),
        bold=True,
    )
    set_paragraph_spacing(chapter_par, 360, 312, 312)
    return chapter_par


def add_heading_section(doc, text):
    """Append a section-level (X.X) heading paragraph and return it.

    Rendered left-aligned in 黑体 (SimHei) / Times New Roman, 15 pt bold, with
    spacing values line=360, before=156, after=156.
    """
    section_par = doc.add_paragraph()
    section_par.alignment = WD_ALIGN_PARAGRAPH.LEFT
    title_run = section_par.add_run(text)
    set_run_font(title_run, cn_font='黑体', en_font='Times New Roman', size=Pt(15), bold=True)
    spacing_args = {'line_spacing': 360, 'before': 156, 'after': 156}
    set_paragraph_spacing(section_par, **spacing_args)
    return section_par


def add_heading_subsection(doc, text):
    """Append a subsection-level (X.X.X) heading paragraph and return it.

    Rendered left-aligned in 黑体 (SimHei) / Times New Roman, 14 pt bold, with
    spacing values line=360, before=78, after=78.
    """
    sub_par = doc.add_paragraph()
    sub_par.alignment = WD_ALIGN_PARAGRAPH.LEFT
    font_args = ('黑体', 'Times New Roman', Pt(14))
    set_run_font(sub_par.add_run(text), *font_args, bold=True)
    set_paragraph_spacing(sub_par, line_spacing=360, before=78, after=78)
    return sub_par


def add_title(doc, text):
    """Append a front-matter title paragraph (e.g. abstract title) and return it.

    Rendered centred in 黑体 (SimHei) / Times New Roman, 15 pt bold, with
    spacing values line=360, before=240, after=60.
    """
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title_par.add_run(text)
    set_run_font(title_run, '黑体', 'Times New Roman', size=Pt(15), bold=True)
    set_paragraph_spacing(title_par, 360, before=240, after=60)
    return title_par


def add_caption(doc, text):
    """Append a figure/table caption paragraph and return it.

    Rendered centred in 黑体 (SimHei) / Times New Roman, 10.5 pt, not bold,
    with spacing values line=360, before=60, after=60.
    """
    caption_par = doc.add_paragraph()
    caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    caption_run = caption_par.add_run(text)
    set_run_font(caption_run, cn_font='黑体', en_font='Times New Roman', size=Pt(10.5), bold=False)
    set_paragraph_spacing(caption_par, line_spacing=360, before=60, after=60)
    return caption_par


def add_image(doc, image_path, width=None):
    """Append a centred paragraph containing the image at ``image_path``.

    Args:
        doc: document to append to.
        image_path: path of the image file.
        width: explicit picture width. When omitted, the width is derived
            from the image's pixel width at 96 pixels per inch and capped at
            14 cm; if Pillow is not installed a fixed 14 cm width is used.

    Returns:
        The new paragraph. When ``image_path`` does not exist the paragraph
        holds a "[图片缺失: ...]" placeholder text instead of a picture.
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run()
    if not os.path.exists(image_path):
        run.add_text(f'[图片缺失: {image_path}]')
    elif width:
        run.add_picture(image_path, width=width)
    else:
        max_width_cm = 14
        try:
            # The import must live inside the try block, otherwise a missing
            # Pillow raises before the ImportError fallback can take effect.
            from PIL import Image
        except ImportError:
            run.add_picture(image_path, width=Cm(max_width_cm))
        else:
            # Only the pixel width is needed; close the file handle right away.
            with Image.open(image_path) as img:
                pixel_width = img.size[0]
            natural_cm = pixel_width * 2.54 / 96
            run.add_picture(image_path, width=Cm(min(natural_cm, max_width_cm)))
    set_paragraph_spacing(p, line_spacing=360)
    return p


def _fill_table_cell(cell, text, cn_font, bold, fill=None):
    """Write ``text`` into ``cell`` centred both ways, optionally shaded with ``fill``."""
    cell.text = ''
    tcPr = cell._element.find(qn('w:tcPr'))
    if tcPr is None:
        tcPr = parse_xml(f'<w:tcPr {nsdecls("w")}></w:tcPr>')
        cell._element.insert(0, tcPr)

    # Background shading: reuse an existing w:shd instead of adding a second one.
    if fill is not None:
        shading = tcPr.find(qn('w:shd'))
        if shading is None:
            shading = parse_xml(f'<w:shd {nsdecls("w")} w:fill="{fill}" w:val="clear"/>')
            tcPr.append(shading)
        else:
            shading.set(qn('w:fill'), fill)

    # Vertical centring
    if tcPr.find(qn('w:vAlign')) is None:
        tcPr.append(parse_xml(f'<w:vAlign {nsdecls("w")} w:val="center"/>'))

    p = cell.paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text.strip())
    set_run_font(run, cn_font, 'Times New Roman', Pt(10.5), bold=bold)
    set_paragraph_spacing(p, line_spacing=300)


def add_table_from_md(doc, headers, rows):
    """Append a bordered, centred table built from markdown table data.

    Args:
        doc: document to append to.
        headers: list of header cell strings; its length fixes the column
            count.
        rows: list of rows, each a list of cell strings. Cells beyond the
            header count are ignored; missing trailing cells are left empty.

    Returns:
        The created table. The header row is bold 黑体 (SimHei) on a D9D9D9
        background; data rows use 宋体 (SimSun). All cells are 10.5 pt and
        centred horizontally and vertically.
    """
    table = doc.add_table(rows=1 + len(rows), cols=len(headers))
    table.alignment = WD_TABLE_ALIGNMENT.CENTER

    # Single-line black borders on every edge and between all cells.
    tbl = table._tbl
    tblPr = tbl.find(qn('w:tblPr'))
    if tblPr is None:
        tblPr = parse_xml(f'<w:tblPr {nsdecls("w")}></w:tblPr>')
        tbl.insert(0, tblPr)
    borders = parse_xml(
        f'<w:tblBorders {nsdecls("w")}>'
        '<w:top w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '<w:left w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '<w:bottom w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '<w:right w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '<w:insideH w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '<w:insideV w:val="single" w:sz="4" w:space="0" w:color="000000"/>'
        '</w:tblBorders>'
    )
    # In the tblPr schema sequence, tblBorders precedes shd, tblLayout,
    # tblCellMar and tblLook; insert before the first of those that is present
    # instead of appending after them.
    for successor_tag in ('w:shd', 'w:tblLayout', 'w:tblCellMar', 'w:tblLook'):
        successor = tblPr.find(qn(successor_tag))
        if successor is not None:
            successor.addprevious(borders)
            break
    else:
        tblPr.append(borders)

    # Header row - gray background, bold text
    for col, header_text in enumerate(headers):
        _fill_table_cell(table.cell(0, col), header_text, '黑体', bold=True, fill='D9D9D9')

    # Data rows - extra cells beyond the header count are dropped
    column_count = len(headers)
    for row_number, row in enumerate(rows, start=1):
        for col, cell_text in enumerate(row[:column_count]):
            _fill_table_cell(table.cell(row_number, col), cell_text, '宋体', bold=False)

    return table


def setup_page(doc):
    """Configure the first section's page geometry, header and footer.

    Sets the page to 11906 x 16838 twips with 2.5 cm margins and 1.27 cm
    header/footer distances, writes a centred 9 pt header line with a bottom
    border, and puts a centred PAGE field in the footer.
    """
    sec = doc.sections[0]
    sec.page_width, sec.page_height = Twips(11906), Twips(16838)
    for side in ('top', 'bottom', 'left', 'right'):
        setattr(sec, f'{side}_margin', Cm(2.5))
    sec.header_distance = sec.footer_distance = Cm(1.27)

    # Header text
    page_header = sec.header
    page_header.is_linked_to_previous = False
    header_par = page_header.paragraphs[0]
    header_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    header_run = header_par.add_run('大连科技学院2026届本科毕业设计（论文）')
    set_run_font(header_run, '宋体', 'Times New Roman', Pt(9), bold=False)

    # Rule under the header: a bottom paragraph border
    header_props = header_par._element.find(qn('w:pPr'))
    if header_props is None:
        header_props = parse_xml(f'<w:pPr {nsdecls("w")}></w:pPr>')
        header_par._element.insert(0, header_props)
    header_props.append(parse_xml(
        f'<w:pBdr {nsdecls("w")}>'
        '<w:bottom w:val="single" w:sz="6" w:space="1" w:color="000000"/>'
        '</w:pBdr>'
    ))

    # Footer: a PAGE field spread over three runs (begin / instruction / end)
    page_footer = sec.footer
    page_footer.is_linked_to_previous = False
    footer_par = page_footer.paragraphs[0]
    footer_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    field_fragments = (
        f'<w:fldChar {nsdecls("w")} w:fldCharType="begin"/>',
        f'<w:instrText {nsdecls("w")} xml:space="preserve"> PAGE </w:instrText>',
        f'<w:fldChar {nsdecls("w")} w:fldCharType="end"/>',
    )
    field_runs = []
    for fragment in field_fragments:
        field_run = footer_par.add_run()
        field_run._element.append(parse_xml(fragment))
        field_runs.append(field_run)
    for field_run in field_runs:
        set_run_font(field_run, '宋体', 'Times New Roman', Pt(9))


def add_cover_page(doc):
    """Append the cover page to ``doc``, ending with a page break.

    Layout, top to bottom: three blank lines, university name, thesis type,
    two blank lines, thesis title, four blank lines, five centred
    "label + underlined value" lines.
    """
    def blank_lines(count):
        # Empty paragraphs used purely as vertical spacing.
        for _ in range(count):
            set_paragraph_spacing(doc.add_paragraph(), line_spacing=360)

    def centered_bold_line(text, size, **spacing):
        # One centred, bold 黑体 (SimHei) line with the given spacing values.
        par = doc.add_paragraph()
        par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        set_run_font(par.add_run(text), '黑体', 'Times New Roman', size, bold=True)
        set_paragraph_spacing(par, **spacing)

    blank_lines(3)
    centered_bold_line('大连科技学院', Pt(26), line_spacing=360)
    centered_bold_line('毕业设计（论文）', Pt(26), line_spacing=360, after=600)

    blank_lines(2)
    centered_bold_line('论文题目：基于Spring Boot的养老院管理系统的设计与实现', Pt(16), line_spacing=480)

    blank_lines(4)

    # Label/value pairs; blank values are runs of spaces that only show the underline.
    cover_fields = (
        ('学    院：', '网络与通信学院'),
        ('专    业：', '网络工程'),
        ('学    号：', '          '),
        ('学生姓名：', '          '),
        ('指导教师：', '          '),
    )
    for field_label, field_value in cover_fields:
        field_par = doc.add_paragraph()
        field_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        label_run = field_par.add_run(field_label)
        set_run_font(label_run, '宋体', 'Times New Roman', Pt(14), bold=False)
        value_run = field_par.add_run(field_value)
        set_run_font(value_run, '宋体', 'Times New Roman', Pt(14), bold=False)
        value_run.font.underline = True
        set_paragraph_spacing(field_par, line_spacing=480)

    doc.add_page_break()


def parse_markdown_files():
    """Read the four thesis markdown files and return their combined text.

    The files are read from THESIS_DIR as UTF-8, in a fixed order, and joined
    with a blank line between consecutive files.
    """
    file_names = ('论文.md', 'chapter3.md', 'chapter4.md', 'chapter5_6_7.md')
    md_paths = [os.path.join(THESIS_DIR, name) for name in file_names]

    chunks = []
    for md_path in md_paths:
        with open(md_path, 'r', encoding='utf-8') as handle:
            chunks.append(handle.read())
    return '\n\n'.join(chunks)


def _split_table_row(line):
    """Split one markdown table line into a list of stripped cell strings."""
    return [cell.strip() for cell in line.strip().strip('|').split('|')]


def process_markdown(doc, md_text):
    """Convert markdown text into formatted paragraphs appended to ``doc``.

    The text is handled line by line; blank lines are ignored. Recognised
    constructs, in the order they are tested:

    - ``## 目录``: the table of contents is skipped up to the next line that
      starts with ``# `` or ``## 摘要``.
    - ``# 基于...`` / ``# 第...``: emitted as a chapter heading only when the
      text contains both ``第`` and ``章``; otherwise dropped.
    - ``## 摘要`` / ``## Abstract``: abstract titles (the English one is
      preceded by a page break).
    - Keyword lines (``关键词：``, ``Keywords:``, ``Key words:``); a page break
      follows the English keywords.
    - ``## `` section headings (``参考文献`` and ``致谢`` start a new page and
      use chapter styling) and ``### `` subsection headings.
    - ``![alt](path)`` images, with ``path`` resolved against THESIS_DIR.
    - Captions beginning with ``图N.N`` or ``表N.N``.
    - Pipe tables: a header line, an optional separator line, then all
      directly following lines that start with ``|``.
    - ``[n] ...`` reference entries.
    - Anything else becomes an indented body paragraph.

    Args:
        doc: document to append to.
        md_text: the markdown source as a single string.
    """
    lines = md_text.split('\n')
    total = len(lines)
    i = 0
    skip_toc = False

    while i < total:
        line = lines[i].rstrip()
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Skip the TOC section
        if stripped == '## 目录':
            skip_toc = True
            i += 1
            continue
        if skip_toc:
            if line.startswith('# ') or line.startswith('## 摘要'):
                skip_toc = False
            else:
                i += 1
                continue

        # Top-level headings: chapters are emitted, the main title is
        # skipped (it is already on the cover page).
        if line.startswith(('# 基于', '# 第')):
            text = line.lstrip('# ').strip()
            if '第' in text and '章' in text:
                add_heading_chapter(doc, text)
            i += 1
            continue

        # 摘要 / Abstract title
        if stripped == '## 摘要':
            add_title(doc, '摘  要')
            i += 1
            continue
        if stripped == '## Abstract':
            doc.add_page_break()
            add_title(doc, 'Abstract')
            i += 1
            continue

        # Chinese keywords line
        if line.startswith(('关键词：', '关键词:')):
            p = doc.add_paragraph()
            run = p.add_run('关键词：')
            set_run_font(run, '黑体', 'Times New Roman', Pt(12), bold=True)
            # Both accepted prefixes have the same length, so slice the prefix
            # off instead of splitting on whichever colon appears first.
            run2 = p.add_run(line[len('关键词：'):])
            set_run_font(run2, '宋体', 'Times New Roman', Pt(12))
            set_paragraph_spacing(p, line_spacing=360)
            i += 1
            continue

        # English keywords line
        if line.startswith(('Keywords:', 'Key words:')):
            p = doc.add_paragraph()
            run = p.add_run('Key words: ')
            set_run_font(run, 'Times New Roman', 'Times New Roman', Pt(12), bold=True)
            kw_text = line.partition(':')[2].strip()
            run2 = p.add_run(kw_text)
            set_run_font(run2, 'Times New Roman', 'Times New Roman', Pt(12))
            set_paragraph_spacing(p, line_spacing=360)
            # Page break after English abstract keywords
            doc.add_page_break()
            i += 1
            continue

        # Section headings
        if line.startswith('## '):
            text = line[3:].strip()
            if text == '参考文献':
                doc.add_page_break()
                add_heading_chapter(doc, text)
            elif text == '致谢':
                doc.add_page_break()
                add_heading_chapter(doc, '致  谢')
            else:
                add_heading_section(doc, text)
            i += 1
            continue

        # Subsection headings
        if line.startswith('### '):
            add_heading_subsection(doc, line[4:].strip())
            i += 1
            continue

        # Image; the alt text may be empty and is not used.
        img_match = re.match(r'!\[(.*?)\]\((.+?)\)', line)
        if img_match:
            add_image(doc, os.path.join(THESIS_DIR, img_match.group(2)))
            i += 1
            continue

        # Figure/table caption (line like "图4.1 xxx" or "表4.1 xxx")
        if re.match(r'^(图|表)\d+\.\d+', stripped):
            add_caption(doc, stripped)
            i += 1
            continue

        # Table: consume the header, the optional separator and every
        # directly following row in one go, so a table without data rows is
        # still emitted and cannot leak state into a later table.
        if stripped.startswith('|'):
            table_headers = _split_table_row(stripped)
            i += 1
            if i < total and '---' in lines[i]:
                i += 1
            table_rows = []
            while i < total and lines[i].strip().startswith('|'):
                table_rows.append(_split_table_row(lines[i]))
                i += 1
            add_table_from_md(doc, table_headers, table_rows)
            continue

        # Reference items [1], [2], etc.
        if re.match(r'^\[(\d+)\]\s*(.+)', stripped):
            p = doc.add_paragraph()
            run = p.add_run(stripped)
            set_run_font(run, '宋体', 'Times New Roman', Pt(10.5))
            set_paragraph_spacing(p, line_spacing=360)
            i += 1
            continue

        # Normal body text; numbered items such as （1） get the same formatting.
        add_body_paragraph(doc, stripped, first_line_indent=True)
        i += 1


def main():
    """Build the thesis document and save it as a DOCX file in THESIS_DIR.

    Prints the output path and the file size in KB once the file is written.
    """
    document = Document()

    # Page setup first, then the cover, then the markdown-derived body.
    setup_page(document)
    add_cover_page(document)
    process_markdown(document, parse_markdown_files())

    target = os.path.join(THESIS_DIR, '基于Spring Boot的养老院管理系统的设计与实现.docx')
    document.save(target)
    print(f'Thesis saved to: {target}')
    size_kb = os.path.getsize(target) / 1024
    print(f'File size: {size_kb:.1f} KB')


if __name__ == '__main__':
    main()