Spaces:

xuanbao01
/

snote

Runtime error

App Files Files Community

snote / scripts /document_parser.py

xuanbao01

Upload folder using huggingface_hub

44c5827 verified 9 months ago

raw

history blame contribute delete

14.5 kB

	#!/usr/bin/env python3
	import re, pathlib
	import docx # from python-docx

	# Project layout, anchored two levels above this script file:
	# source documents are read from raw_docs/, Markdown goes to converted/.
	BASE = pathlib.Path(__file__).resolve().parent.parent
	RAW = BASE.joinpath("raw_docs")
	OUT = BASE.joinpath("converted")
	# Runs at import time; an already existing directory is not an error.
	OUT.mkdir(exist_ok=True)

	def table_to_markdown(table) -> str:
	    """Render a python-docx table as a pipe-delimited Markdown table.

	    The first row becomes the header row and is followed by a ``---``
	    separator row; all remaining rows form the body.

	    Args:
	        table: Object exposing ``rows``; each row exposes ``cells`` and
	            each cell exposes ``text`` (the python-docx table interface).

	    Returns:
	        The Markdown table as a single string without a trailing newline,
	        or ``""`` when the table has no rows or no cells.
	    """
	    rows = list(table.rows)
	    if not rows:
	        return ""

	    def cell_markdown(cell) -> str:
	        # Collapse every whitespace run (newlines included) to one space so
	        # the cell stays on a single table line.
	        text = " ".join(cell.text.split())
	        # An unescaped pipe inside a cell would be read as a column break.
	        text = text.replace("|", "\\|")
	        # Keep a placeholder so empty cells look like padded cells.
	        return text or " "

	    def render(cells) -> str:
	        return "| " + " | ".join(cells) + " |"

	    data = [[cell_markdown(cell) for cell in row.cells] for row in rows]

	    width = max(len(row) for row in data)
	    if width == 0:
	        return ""

	    # Pad short rows so every row has the same number of columns.
	    for row in data:
	        row.extend([" "] * (width - len(row)))

	    lines = [render(data[0]), render(["---"] * width)]
	    lines.extend(render(row) for row in data[1:])
	    return "\n".join(lines)

	def get_paragraph_formatting(paragraph):
	    """Summarize a paragraph as its stripped text plus bold/italic flags.

	    A flag is set when at least one run with non-whitespace text has the
	    corresponding attribute set to a truthy value.

	    Returns:
	        ``None`` for a paragraph whose text is empty after stripping,
	        otherwise a dict with the keys ``'text'``, ``'bold'``, ``'italic'``.
	    """
	    content = paragraph.text.strip()
	    if not content:
	        return None

	    def any_run_has(flag):
	        # Whitespace-only runs are skipped so they cannot set a flag.
	        return any(
	            getattr(run, flag) for run in paragraph.runs if run.text.strip()
	        )

	    return {
	        'text': content,
	        'bold': any_run_has('bold'),
	        'italic': any_run_has('italic'),
	    }

	def format_paragraph(para_info):
	    """Wrap paragraph text in Markdown emphasis markers.

	    The three styles get distinct markers because ``clean_and_normalize``
	    decides the heading level from them:

	    * bold + italic -> ``***text***``
	    * bold          -> ``**text**``
	    * italic        -> ``*text*``

	    Args:
	        para_info: Dict with the keys ``'text'``, ``'bold'`` and
	            ``'italic'``, or a falsy value.

	    Returns:
	        The marked-up text, the plain text when no style is set, or ``""``
	        when ``para_info`` is falsy.
	    """
	    if not para_info:
	        return ""

	    text = para_info['text']
	    if para_info['bold'] and para_info['italic']:
	        return f"***{text}***"
	    elif para_info['bold']:
	        return f"**{text}**"
	    elif para_info['italic']:
	        return f"*{text}*"
	    return text


	# Uppercase Latin and Vietnamese letters, used as a regex character class.
	_UPPER = (
	    "A-ZÀÁẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÈÉẺẼẸÊẾỀỂỄỆÌÍỈĨỊÒÓỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÙÚỦŨỤƯỨỪỬỮỰỲÝỶỸỴ"
	)
	_UPPER_START_RE = re.compile("^[" + _UPPER + "]")

	# Structural headings emitted in bold by format_paragraph.
	_CHAPTER_RE = re.compile(r"^\*\*CHƯƠNG\s+[IVXLC]+")
	_ARTICLE_RE = re.compile(r"^\*\*Điều\s+\d+")

	# Item markers: "1.", "1.1.", "1.1.1.", "a)" and (first pattern only) "*".
	_POINT_MARKER_RE = re.compile(
	    r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\)|\*)\s+"
	)
	_POINT_MARKER_NO_STAR_RE = re.compile(
	    r"^(\d+\.|\d+\.\d+\.|\d+\.\d+\.\d+\.|[a-z]\))\s+"
	)

	# "N. rest" split used to build Khoản headings.
	_CLAUSE_RE = re.compile(r"^(\d+)\.\s+(.*)")
	_CLAUSE_START_RE = re.compile(r"^\d+\.\s+")
	_PLAIN_CLAUSE_RE = re.compile(r"^\d+\.\s+[" + _UPPER + "]")

	# Whole-line emphasis wrappers; group 1 is the wrapped text.
	_BOLD_WRAP_RE = re.compile(r"^\*\*(.*)\*\*$")
	_BOLD_ITALIC_WRAP_RE = re.compile(r"^\*\*\*(.*)\*\*\*$")
	_ITALIC_WRAP_RE = re.compile(r"^\*(.*)\*$")
	_STAR_BOLD_ITALIC_RE = re.compile(r"^\*\*\*\*(.*)\*\*\*$")

	# Whole-line shape tests.
	_BOLD_CLAUSE_RE = re.compile(r"^\*\*\d+\.\s+.*\*\*$")
	_BOLD_ITALIC_LINE_RE = re.compile(r"^\*\*\*.*\*\*\*$")
	_ITALIC_LINE_RE = re.compile(r"^\*.*\*$")
	_BOLD_SUBITEM_RE = re.compile(r"^\*\*\d+\.\d+\.\s+.*\*\*$")
	_ITALIC_SUBITEM_RE = re.compile(r"^\*\d+\.\d+\.\s+.*\*$")
	_LETTER_BOLD_ITALIC_RE = re.compile(r"^\*\*\*[a-z]\)\s+.*\*\*\*$")

	# Emphasized header followed by plain text on the same line.
	_MIXED_BOLD_ITALIC_RE = re.compile(r"^\*\*\*(.*?)\*\*\*(.*)$")
	_BOLD_HEAD_RE = re.compile(r"^(\*\*.*?\*\*)\s*(.*)$")
	_ITALIC_HEAD_RE = re.compile(r"^(\*.*?\*)\s*(.*)$")


	def _clause_heading(text):
	    """Return the ``### Khoản N. ...`` heading for text shaped like ``N. ...``."""
	    found = _CLAUSE_RE.match(text)
	    if found:
	        return f"### Khoản {found.group(1)}. {found.group(2)}"
	    return "### Khoản " + text


	def _append_point(out, label, trailing=""):
	    """Append a ``#### Điểm`` heading for ``label`` to ``out``.

	    When ``label`` contains a colon, the heading stops at the first colon
	    and the remainder becomes its own line. A non-empty ``trailing`` is
	    appended as a further line.
	    """
	    head, colon, rest = label.partition(':')
	    if colon:
	        out.append("#### Điểm " + head.strip() + ':')
	        rest = rest.strip()
	        if rest:
	            out.append(rest)
	    else:
	        out.append("#### Điểm " + label)
	    if trailing:
	        out.append(trailing)


	def clean_and_normalize(text: str) -> str:
	    """Turn emphasis-marked legal text into a Markdown heading hierarchy.

	    Works line by line on the markers produced by ``format_paragraph``:

	    * bold ``CHƯƠNG <roman>`` (chapter)        -> ``# ...``
	    * bold ``Điều <n>`` (article)              -> ``## ...``
	    * bold ``<n>. ...`` (clause)               -> ``### Khoản <n>. ...``
	    * bold+italic / italic items starting with
	      ``1.``, ``1.1.``, ``a)`` or ``*`` (point) -> ``#### Điểm ...``

	    Lines that match no rule are kept (stripped); blank lines are kept
	    as they were.

	    Args:
	        text: Newline-separated text, one paragraph or table row per line.

	    Returns:
	        The normalized Markdown with leading/trailing whitespace removed.
	    """
	    lines = text.split('\n')
	    processed_lines = []

	    for i, raw_line in enumerate(lines):
	        line = raw_line.strip()
	        if not line:
	            processed_lines.append(raw_line)
	            continue

	        # CHƯƠNG -> #
	        if _CHAPTER_RE.match(line):
	            processed_lines.append("# " + line.replace("**", ""))
	            continue

	        # Điều -> ##
	        if _ARTICLE_RE.match(line):
	            processed_lines.append("## " + line.replace("**", ""))
	            continue

	        # Bold+italic header, optionally followed by plain text:
	        # ***header*** content. Checked before the other rules so the
	        # plain tail is not swallowed into the heading.
	        mixed = _MIXED_BOLD_ITALIC_RE.match(line)
	        if mixed:
	            header_text = mixed.group(1).strip()
	            content_text = mixed.group(2).strip()
	            if _POINT_MARKER_RE.match(header_text):
	                _append_point(processed_lines, header_text, content_text)
	                continue

	        # A line is "standalone" when the previous input line is blank,
	        # a heading, a bullet/emphasized line, or starts with an uppercase
	        # letter. (A bold "**Điều" line is covered by the '*' prefix.)
	        prev = lines[i - 1].strip() if i > 0 else ""
	        is_standalone = (
	            not prev
	            or prev.startswith(('##', '-', '+', '*'))
	            or _UPPER_START_RE.match(prev) is not None
	        )

	        # Khoản: whole line bold, standalone, starts with "N. "
	        if is_standalone and _BOLD_CLAUSE_RE.match(line):
	            processed_lines.append(
	                _clause_heading(_BOLD_WRAP_RE.sub(r"\1", line))
	            )
	            continue

	        # Short plain "N. Title" line directly under an Điều heading is
	        # treated as a Khoản heading as well.
	        if (
	            is_standalone
	            and _PLAIN_CLAUSE_RE.match(line)
	            and len(line.split()) <= 8
	        ):
	            j = i - 1
	            while j >= 0 and not lines[j].strip():
	                j -= 1
	            if j >= 0:
	                anchor = lines[j].strip()
	                if _ARTICLE_RE.match(anchor) or anchor.startswith('## Điều'):
	                    processed_lines.append(_clause_heading(line))
	                    continue

	        # Other plain numbered items stay regular list text.

	        # Điểm: whole line bold+italic, standalone, starts with a marker
	        if is_standalone and _BOLD_ITALIC_LINE_RE.match(line):
	            inner = _BOLD_ITALIC_WRAP_RE.sub(r"\1", line)
	            if _POINT_MARKER_RE.match(inner):
	                _append_point(processed_lines, inner)
	                continue

	        # Điểm: whole line italic, standalone, starts with a marker
	        if is_standalone and _ITALIC_LINE_RE.match(line):
	            inner = _ITALIC_WRAP_RE.sub(r"\1", line)
	            if _POINT_MARKER_RE.match(inner):
	                _append_point(processed_lines, inner)
	                continue

	        # Numbered sub-items "1.1. ..." in bold
	        if _BOLD_SUBITEM_RE.match(line):
	            processed_lines.append(
	                "#### Điểm " + _BOLD_WRAP_RE.sub(r"\1", line)
	            )
	            continue

	        # Numbered sub-items "1.1. ..." in italic
	        if _ITALIC_SUBITEM_RE.match(line):
	            processed_lines.append(
	                "#### Điểm " + _ITALIC_WRAP_RE.sub(r"\1", line)
	            )
	            continue

	        # Bold+italic text that itself begins with an asterisk:
	        # "***" + "* text" + "***"
	        starred = _STAR_BOLD_ITALIC_RE.match(line)
	        if starred and is_standalone:
	            processed_lines.append("#### Điểm *" + starred.group(1))
	            continue

	        # Lettered items "a) ..." in bold+italic
	        if _LETTER_BOLD_ITALIC_RE.match(line):
	            _append_point(
	                processed_lines, _BOLD_ITALIC_WRAP_RE.sub(r"\1", line)
	            )
	            continue

	        # Bold header followed by plain text: **header** content
	        bold_head = _BOLD_HEAD_RE.match(line)
	        if bold_head:
	            header = _BOLD_WRAP_RE.sub(r"\1", bold_head.group(1))
	            content = bold_head.group(2).strip()
	            if _CLAUSE_START_RE.match(header):
	                processed_lines.append(_clause_heading(header))
	                if content:
	                    processed_lines.append(content)
	                continue
	            elif _POINT_MARKER_NO_STAR_RE.match(header):
	                processed_lines.append("#### Điểm " + header)
	                if content:
	                    processed_lines.append(content)
	                continue

	        # Italic header followed by plain text: *header* content
	        italic_head = _ITALIC_HEAD_RE.match(line)
	        if italic_head:
	            header = _ITALIC_WRAP_RE.sub(r"\1", italic_head.group(1))
	            content = italic_head.group(2).strip()
	            if _POINT_MARKER_NO_STAR_RE.match(header):
	                processed_lines.append("#### Điểm " + header)
	                if content:
	                    processed_lines.append(content)
	                continue

	        # No rule applied: keep the stripped line as ordinary content.
	        processed_lines.append(line)

	    return '\n'.join(processed_lines).strip()

	def convert_doc_to_md(doc_path, md_path):
	    """Write a normalized Markdown rendering of a Word document.

	    Body elements are visited in document order: tables are rendered with
	    ``table_to_markdown`` and set off by blank lines, non-empty paragraphs
	    with ``format_paragraph``. The joined text is passed through
	    ``clean_and_normalize`` before it is written as UTF-8.

	    Args:
	        doc_path: Document location handed to ``docx.Document``.
	        md_path: Output path; must provide ``write_text``.

	    Returns:
	        ``md_path``, unchanged.
	    """
	    document = docx.Document(doc_path)
	    chunks = []

	    for node in document.element.body:
	        tag = node.tag

	        if tag.endswith('tbl'):  # Table
	            rendered = table_to_markdown(docx.table.Table(node, document))
	            # Separate the table from preceding text with one blank line.
	            if chunks and chunks[-1].strip():
	                chunks.append("")
	            chunks.extend([rendered, ""])
	            continue

	        if not tag.endswith('p'):  # Neither table nor paragraph
	            continue

	        info = get_paragraph_formatting(
	            docx.text.paragraph.Paragraph(node, document)
	        )
	        if info and info['text']:
	            chunks.append(format_paragraph(info))

	    # Join + normalize
	    normalized = clean_and_normalize('\n'.join(chunks))
	    md_path.write_text(normalized, encoding="utf-8")
	    return md_path

	if __name__ == "__main__":
	    # Convert every .docx file in RAW to a same-named .md file in OUT.
	    for doc in RAW.iterdir():
	        suffix = doc.suffix.lower()
	        if suffix == ".doc":
	            # python-docx reads only the .docx (Office Open XML) format;
	            # passing a legacy binary .doc would abort the whole batch.
	            print("Skipping (legacy .doc is not supported, save as .docx):", doc)
	            continue
	        # "~$name.docx" is the lock file Word keeps beside an open document.
	        if suffix != ".docx" or doc.name.startswith("~$") or not doc.is_file():
	            print("Skipping:", doc)
	            continue
	        out = OUT / (doc.stem + ".md")
	        convert_doc_to_md(doc, out)
	        print("Converted:", out)