| |
| import re, pathlib |
| import docx |
|
|
# Directory layout, derived from this file's resolved location:
#   BASE - the directory two levels above this file
#   RAW  - BASE/raw_docs   (scanned for input documents by the script entry point)
#   OUT  - BASE/converted  (receives the generated .md files; created on import)
_SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
BASE = _SCRIPT_DIR.parent
RAW = BASE.joinpath("raw_docs")
OUT = BASE.joinpath("converted")
OUT.mkdir(exist_ok=True)
|
|
def table_to_markdown(table) -> str:
    """Render a python-docx table as a Markdown pipe table.

    The first row becomes the header row. Whitespace inside each cell
    (including line breaks) is collapsed to single spaces, literal ``|``
    characters are escaped so they cannot split a cell, empty cells are
    rendered as a single space, and short rows are padded with blank cells
    to the width of the widest row.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text`` (the shape of ``docx.table.Table``).

    Returns:
        The Markdown table without a trailing newline, or ``""`` when the
        table has no rows or no cells.
    """

    def _cell_markdown(raw: str) -> str:
        # str.split() with no argument splits on any whitespace run, so this
        # both trims the text and flattens embedded newlines.
        collapsed = " ".join(raw.split())
        if not collapsed:
            # Apply the placeholder AFTER collapsing; doing it before would
            # let the collapse step wipe it out again.
            return " "
        # An unescaped pipe would be read as a column delimiter.
        return collapsed.replace("|", "\\|")

    grid = [[_cell_markdown(cell.text) for cell in row.cells] for row in table.rows]
    if not grid:
        return ""

    width = max(len(cells) for cells in grid)
    if width == 0:
        return ""

    # Pad ragged rows so every row has the same number of columns.
    grid = [cells + [" "] * (width - len(cells)) for cells in grid]

    def _row_markdown(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    rendered = [_row_markdown(grid[0]), _row_markdown(["---"] * width)]
    rendered.extend(_row_markdown(cells) for cells in grid[1:])
    return "\n".join(rendered)
|
|
def get_paragraph_formatting(paragraph):
    """Summarise a paragraph's text and emphasis flags.

    Args:
        paragraph: Object exposing ``text`` and ``runs``; each run exposes
            ``text``, ``bold`` and ``italic``.

    Returns:
        ``None`` when the paragraph text is empty after stripping; otherwise
        a dict with keys ``'text'`` (stripped text), ``'bold'`` and
        ``'italic'``. A flag is ``True`` as soon as any run with
        non-whitespace text has the corresponding attribute set to a truthy
        value.
    """
    stripped = paragraph.text.strip()
    if stripped:
        # Whitespace-only runs carry no visible text, so their formatting
        # is not allowed to influence the flags.
        visible_runs = [run for run in paragraph.runs if run.text.strip()]
        return {
            'text': stripped,
            'bold': any(run.bold for run in visible_runs),
            'italic': any(run.italic for run in visible_runs),
        }
    return None
|
|
def format_paragraph(para_info):
    """Wrap paragraph text in Markdown emphasis markers.

    Args:
        para_info: Dict with keys ``'text'``, ``'bold'`` and ``'italic'``,
            or a falsy value.

    Returns:
        ``""`` for a falsy ``para_info``; otherwise the text wrapped as
        ``__*text*__`` (bold and italic), ``__text__`` (bold only),
        ``*text*`` (italic only) or returned unchanged (neither).
    """
    if not para_info:
        return ""

    body = para_info['text']
    bold = para_info['bold']
    italic = para_info['italic']

    if bold:
        return f"__*{body}*__" if italic else f"__{body}__"
    if italic:
        return f"*{body}*"
    return body
|
|
# Upper-case Latin and Vietnamese letters, written as the body of a regex
# character class so it can be reused in several patterns.
_VI_UPPER = "A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ"

# Structural markers. "__" and "*" are the bold/italic wrappers that
# format_paragraph() puts around a paragraph.
_CHAPTER_RE = re.compile(r"^__CHƯƠNG\s+[IVXLC]+")
_ARTICLE_RE = re.compile(r"^__Điều\s+\d+")
_STARTS_UPPER_RE = re.compile("^[" + _VI_UPPER + "]")

# "N. text" numbering used for clause (Khoản) headings.
_CLAUSE_PARTS_RE = re.compile(r"^(\d+)\.\s+(.*)")
_BOLD_CLAUSE_RE = re.compile(r"^__\d+\.\s+.*__$")
_PLAIN_CLAUSE_RE = re.compile(r"^\d+\.\s+[" + _VI_UPPER + "]")

# Leading markers that identify a point (Điểm) heading; the first variant
# additionally accepts a leading "*" bullet.
_POINT_OR_STAR_RE = re.compile(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+")
_POINT_RE = re.compile(r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+")

# Whole-line emphasis detectors and the patterns used to strip the wrappers.
_BOLD_ITALIC_LINE_RE = re.compile(r"^__\*.*\*__$")
_ITALIC_LINE_RE = re.compile(r"^\*.*\*$")
_BOLD_SUBPOINT_RE = re.compile(r"^__\d+\.\d+\.\s+.*__$")
_ITALIC_SUBPOINT_RE = re.compile(r"^\*\d+\.\d+\.\s+.*\*$")
_BOLD_ITALIC_STAR_RE = re.compile(r"^__\*\*.*\*__$")
_BOLD_ITALIC_LETTER_RE = re.compile(r"^__\*[a-z]\)\s+.*\*__$")
_UNWRAP_BOLD_RE = re.compile(r"^__(.*)__$")
_UNWRAP_ITALIC_RE = re.compile(r"^\*(.*)\*$")
_UNWRAP_BOLD_ITALIC_RE = re.compile(r"^__\*(.*)\*__$")
_UNWRAP_BOLD_ITALIC_STAR_RE = re.compile(r"^__\*\*(.*)\*__$")

# Emphasised prefix followed by optional trailing text on the same line.
_BOLD_ITALIC_PREFIX_RE = re.compile(r"^__\*(.*?)\*__(.*)$")
_BOLD_PREFIX_RE = re.compile(r"^(__.*?__)\s*(.*)$")
_ITALIC_PREFIX_RE = re.compile(r"^(\*.*?\*)\s*(.*)$")


def _append_point(out, text, trailing=""):
    """Append a '#### Điểm' heading for *text*, then any body lines.

    When *text* contains a colon, everything up to and including the first
    colon becomes the heading and the remainder (if non-empty) becomes a
    body line. *trailing*, if non-empty, is appended last.
    """
    if ':' in text:
        head, _, rest = text.partition(':')
        out.append("#### Điểm " + head.strip() + ':')
        rest = rest.strip()
        if rest:
            out.append(rest)
    else:
        out.append("#### Điểm " + text)
    if trailing:
        out.append(trailing)


def _clause_heading(numbered):
    """Return the '### Khoản N. text' heading for a clause match."""
    return f"### Khoản {numbered.group(1)}. {numbered.group(2)}"


def _is_standalone(lines, index):
    """Tell whether the raw line before *index* allows a heading here.

    True for the first line, after a blank line, after an article or
    '##'-style heading, after a line starting with '-', '+' or '*', or
    after a line starting with an upper-case letter.
    """
    if index == 0:
        return True
    previous = lines[index - 1].strip()
    if not previous:
        return True
    return bool(
        _ARTICLE_RE.match(previous)
        or previous.startswith('##')
        or previous.startswith(('-', '+', '*'))
        or _STARTS_UPPER_RE.match(previous)
    )


def _follows_article(lines, index):
    """Tell whether the nearest non-blank raw line above is an article heading."""
    cursor = index - 1
    while cursor >= 0 and not lines[cursor].strip():
        cursor -= 1
    if cursor < 0:
        return False
    candidate = lines[cursor].strip()
    return bool(_ARTICLE_RE.match(candidate) or candidate.startswith('## Điều'))


def clean_and_normalize(text: str) -> str:
    """Normalize Vietnamese legal document structure with proper hierarchy.

    Each input line is tested against an ordered list of rules; the first
    rule that fires decides the output for that line:

    * ``__CHƯƠNG <roman>`` (chapter)  -> ``# ...`` with every ``__`` removed
    * ``__Điều <n>`` (article)        -> ``## ...`` with every ``__`` removed
    * bold ``N. text``                -> ``### Khoản N. text``
    * short plain ``N. Text`` (at most 8 words) directly below an article
      heading                         -> ``### Khoản N. text``
    * emphasised text starting with a point marker (``N.``, ``N.N.``,
      ``N.N.N.``, ``a)`` and, for some rules, ``*``)
                                      -> ``#### Điểm ...``; text after the
                                         first colon or after the emphasised
                                         prefix moves to its own line(s)

    Lines that match no rule are emitted stripped of surrounding whitespace;
    blank lines are passed through untouched. The order of the rules matters
    because several patterns overlap.

    Args:
        text: Markdown-ish text, one paragraph per line, using ``__``/``*``
            emphasis wrappers.

    Returns:
        The normalized text with leading and trailing whitespace removed.
    """
    lines = text.split('\n')
    out = []

    for index, raw in enumerate(lines):
        line = raw.strip()
        if not line:
            # Keep blank lines exactly as they came in.
            out.append(raw)
            continue

        # Chapter and article headings.
        if _CHAPTER_RE.match(line):
            out.append("# " + line.replace("__", ""))
            continue
        if _ARTICLE_RE.match(line):
            out.append("## " + line.replace("__", ""))
            continue

        # Bold-italic prefix that starts with a point marker, optionally
        # followed by plain text on the same line.
        prefixed = _BOLD_ITALIC_PREFIX_RE.match(line)
        if prefixed:
            head = prefixed.group(1).strip()
            if _POINT_OR_STAR_RE.match(head):
                _append_point(out, head, prefixed.group(2).strip())
                continue

        standalone = _is_standalone(lines, index)

        # Fully bold "N. text" -> clause heading.
        if standalone and _BOLD_CLAUSE_RE.match(line):
            inner = _UNWRAP_BOLD_RE.sub(r"\1", line)
            numbered = _CLAUSE_PARTS_RE.match(inner)
            out.append(_clause_heading(numbered) if numbered else "### Khoản " + inner)
            continue

        # Short unformatted "N. Text" directly under an article heading.
        if (standalone and len(line.split()) <= 8
                and _PLAIN_CLAUSE_RE.match(line)
                and _follows_article(lines, index)):
            numbered = _CLAUSE_PARTS_RE.match(line)
            if numbered:
                out.append(_clause_heading(numbered))
                continue

        # Whole line bold-italic, inner text starts with a point marker.
        if standalone and _BOLD_ITALIC_LINE_RE.match(line):
            inner = _UNWRAP_BOLD_ITALIC_RE.sub(r"\1", line)
            if _POINT_OR_STAR_RE.match(inner):
                _append_point(out, inner)
                continue

        # Whole line italic, inner text starts with a point marker.
        if standalone and _ITALIC_LINE_RE.match(line):
            inner = _UNWRAP_ITALIC_RE.sub(r"\1", line)
            if _POINT_OR_STAR_RE.match(inner):
                _append_point(out, inner)
                continue

        # "N.N. text" sub-points, bold or italic, regardless of context.
        if _BOLD_SUBPOINT_RE.match(line):
            out.append("#### Điểm " + _UNWRAP_BOLD_RE.sub(r"\1", line))
            continue
        if _ITALIC_SUBPOINT_RE.match(line):
            out.append("#### Điểm " + _UNWRAP_ITALIC_RE.sub(r"\1", line))
            continue

        # Bold-italic text whose content itself starts with "*": one star
        # belongs to the wrapper, the other is kept in the heading.
        if standalone and _BOLD_ITALIC_STAR_RE.match(line):
            out.append("#### Điểm *" + _UNWRAP_BOLD_ITALIC_STAR_RE.sub(r"\1", line))
            continue

        # Bold-italic "a) text" points, regardless of context.
        if _BOLD_ITALIC_LETTER_RE.match(line):
            _append_point(out, _UNWRAP_BOLD_ITALIC_RE.sub(r"\1", line))
            continue

        # Bold prefix followed by plain text: clause first, then point.
        bold = _BOLD_PREFIX_RE.match(line)
        if bold:
            head = _UNWRAP_BOLD_RE.sub(r"\1", bold.group(1))
            tail = bold.group(2).strip()
            numbered = _CLAUSE_PARTS_RE.match(head)
            if numbered:
                out.append(_clause_heading(numbered))
                if tail:
                    out.append(tail)
                continue
            if _POINT_RE.match(head):
                out.append("#### Điểm " + head)
                if tail:
                    out.append(tail)
                continue

        # Italic prefix followed by plain text.
        italic = _ITALIC_PREFIX_RE.match(line)
        if italic:
            head = _UNWRAP_ITALIC_RE.sub(r"\1", italic.group(1))
            tail = italic.group(2).strip()
            if _POINT_RE.match(head):
                out.append("#### Điểm " + head)
                if tail:
                    out.append(tail)
                continue

        # No structural rule applied.
        out.append(line)

    return '\n'.join(out).strip()
|
|
def convert_doc_to_md(doc_path, md_path):
    """Convert one Word document to a normalized Markdown file.

    Walks the children of the document body in order. Elements whose tag
    ends in ``tbl`` are rendered with ``table_to_markdown`` and surrounded
    by blank lines; elements whose tag ends in ``p`` are rendered with
    ``format_paragraph`` when they contain text. The joined result is passed
    through ``clean_and_normalize`` and written as UTF-8.

    Args:
        doc_path: Document location, handed to ``docx.Document``.
        md_path: Destination; must provide ``write_text`` (e.g. a
            ``pathlib.Path``).

    Returns:
        ``md_path``, unchanged.
    """
    document = docx.Document(doc_path)
    chunks = []

    for node in document.element.body:
        tag = node.tag
        if tag.endswith('tbl'):
            rendered = table_to_markdown(docx.table.Table(node, document))
            # Separate the table from preceding text with one blank line,
            # unless the previous entry is already blank.
            if chunks and chunks[-1].strip():
                chunks.append("")
            chunks.extend([rendered, ""])
        elif tag.endswith('p'):
            info = get_paragraph_formatting(
                docx.text.paragraph.Paragraph(node, document)
            )
            if info and info['text']:
                chunks.append(format_paragraph(info))

    normalized = clean_and_normalize('\n'.join(chunks))
    md_path.write_text(normalized, encoding="utf-8")
    return md_path
|
|
if __name__ == "__main__":
    # Batch-convert every Word document found directly inside RAW into a
    # Markdown file of the same stem inside OUT.
    for doc in RAW.iterdir():
        suffix_ok = doc.suffix.lower() in (".doc", ".docx")
        # "~$name.docx" is presumably a Word lock file left next to an open
        # document; it has a matching suffix but is not a readable document.
        if not suffix_ok or not doc.is_file() or doc.name.startswith("~$"):
            print("Skipping:", doc)
            continue

        out = OUT / (doc.stem + ".md")
        try:
            convert_doc_to_md(doc, out)
        except Exception as exc:
            # Script-level boundary: one unreadable input (for example a
            # legacy binary .doc, which python-docx is not expected to open)
            # must not abort the rest of the batch. Report it and move on.
            print("Failed:", doc, "-", exc)
            continue
        print("Converted:", out)
|
|