Spaces:

liampond
/

CantusSVS-hf

Sleeping

App Files Files Community

CantusSVS-hf / app.py

liampond

Update app.py

1c1f57b verified 8 months ago

raw

history blame contribute delete

17.9 kB

	import streamlit as st
	st.set_page_config(page_title="CantusSVS", layout="wide")

	import os
	import yaml
	import shutil
	import traceback
	import json
	import requests
	import zipfile
	import streamlit.components.v1 as components
	from pathlib import Path
	from webapp.services.defaults.default_splitter import split_syllable


	def patch_config_yaml_files(root="/tmp/cantussvs_v1"):
	    """Rewrite relative asset paths in every ``config.yaml`` below ``<root>/checkpoints``.

	    Top-level string values that start with ``checkpoints/`` or ``data/`` are
	    replaced by the same path re-rooted under ``<root>/checkpoints`` or
	    ``<root>/data``. Files whose top-level YAML node is not a mapping, or that
	    contain no such values, are left untouched. Values that were already
	    rewritten no longer start with either prefix, so repeated calls are no-ops.

	    Args:
	        root: Directory that holds the extracted ``checkpoints`` and ``data``
	            folders. Defaults to the location used by the rest of this app.

	    Returns:
	        None. Progress and failures are reported with ``print``; a failure on
	        one file never prevents the remaining files from being processed.
	    """
	    checkpoints_root = os.path.join(root, "checkpoints")
	    data_root = os.path.join(root, "data")
	    # (prefix to look for, directory the remainder is re-rooted under)
	    prefix_targets = (
	        ("checkpoints/", checkpoints_root),
	        ("data/", data_root),
	    )

	    for dirpath, _, filenames in os.walk(checkpoints_root):
	        if "config.yaml" not in filenames:
	            continue

	        full_path = os.path.join(dirpath, "config.yaml")
	        try:
	            with open(full_path, "r", encoding="utf-8") as f:
	                config = yaml.safe_load(f)

	            if not isinstance(config, dict):
	                continue

	            patched = {}
	            for key, value in config.items():
	                if not isinstance(value, str):
	                    continue
	                for prefix, target in prefix_targets:
	                    if value.startswith(prefix):
	                        patched[key] = os.path.join(target, value[len(prefix):])
	                        break

	            if not patched:
	                continue

	            config.update(patched)
	            # Serialize BEFORE opening for writing: opening with "w" truncates
	            # the file, so a failing dump must not be able to wipe the config.
	            serialized = yaml.dump(config)
	            with open(full_path, "w", encoding="utf-8") as f:
	                f.write(serialized)
	            print(f"✅ Patched paths in {full_path}")
	        except Exception as e:
	            # Deliberately best-effort: report and move on to the next file.
	            print(f"❌ Failed to patch {full_path}: {e}")

	# Switch the Streamlit file watcher off via its environment setting
	os.environ["STREAMLIT_SERVER_FILE_WATCHER_TYPE"] = "none"

	# Put the directory containing this script first on the import path so the
	# project packages imported below can be resolved
	import sys
	PROJECT_ROOT = Path(__file__).resolve().parent
	sys.path.insert(0, os.fspath(PROJECT_ROOT))

	from webapp.services.parsing.mei_parser import parse_mei_for_editor
	from webapp.services.parsing.ds_builder import build_ds_from_notes
	from webapp.services.parsing.ds_validator import validate_ds
	from webapp.services.phonemes.phoneme_dict import PHONEMES as permitted_phonemes
	from inference.pipeline import run_inference

	def safe_symlink(src, dst):
	    """Create the symlink ``dst`` pointing at ``src`` without clobbering anything.

	    Nothing is changed when ``dst`` is already a symlink (whether or not it
	    points at ``src``) or when some other file or directory occupies ``dst``.
	    Every outcome, including any error raised along the way, is reported with
	    ``print``; the function never raises and always returns ``None``.
	    """
	    try:
	        dst_is_link = os.path.islink(dst)

	        if dst_is_link and os.readlink(dst) == src:
	            print(f"✅ Symlink already correct: {dst} → {src}")
	            return

	        if dst_is_link:
	            print(f"⚠️ Symlink exists but points elsewhere. Skipping: {dst}")
	            return

	        if os.path.exists(dst):
	            print(f"❗ Cannot create symlink, path exists and is not a symlink: {dst}")
	            return

	        os.symlink(src, dst)
	        print(f"✅ Created symlink: {dst} → {src}")
	    except Exception as e:
	        print(f"❗ Failed to create symlink {dst} -> {src}: {e}")

	# Locations of the checkpoints and data folders under /tmp/cantussvs_v1
	HF_CHECKPOINTS_DIR = "/tmp/cantussvs_v1/checkpoints"
	HF_DATA_DIR = "/tmp/cantussvs_v1/data"

	# Working folders under <project root>/webapp
	DEMO_FILES = PROJECT_ROOT / "webapp" / "demo_files"
	UPLOAD_MEI_DIR = PROJECT_ROOT / "webapp" / "uploaded_mei"
	UPLOAD_DS_DIR = PROJECT_ROOT / "webapp" / "uploaded_ds"
	TMP_DS_DIR = PROJECT_ROOT / "webapp" / "tmp_ds"
	OUTPUT_DIR = PROJECT_ROOT / "webapp" / "output"

	# Create any working folder that does not exist yet (parents included)
	for d in (DEMO_FILES, UPLOAD_MEI_DIR, UPLOAD_DS_DIR, TMP_DS_DIR, OUTPUT_DIR):
	    d.mkdir(parents=True, exist_ok=True)

	@st.cache_resource
	def download_and_extract_from_hf():
	    """Fetch and unpack the CantusSVS asset bundle, then link it into the working dir.

	    When ``/tmp/cantussvs_v1`` does not exist, the zip archive is downloaded,
	    extracted, and its ``config.yaml`` files are patched. In every case the
	    ``checkpoints`` and ``data`` symlinks in the current working directory are
	    (re)checked via ``safe_symlink``.

	    Returns:
	        str: The directory holding the extracted assets.

	    Raises:
	        requests.RequestException: The download failed, timed out, or the
	            server answered with an HTTP error status.
	        zipfile.BadZipFile: The downloaded file is not a valid zip archive.
	        OSError: Writing or moving files under ``/tmp`` failed.
	    """
	    url = "https://huggingface.co/datasets/liampond/CantusSVS/resolve/main/cantussvs_v1.zip"
	    zip_path = "/tmp/cantussvs_v1.zip"
	    extract_dir = "/tmp/cantussvs_v1"

	    if not os.path.exists(extract_dir):
	        st.write("📦 Downloading data + model from Hugging Face...")
	        # The timeout stops a stalled connection from hanging the app, and
	        # raise_for_status() keeps an HTTP error page from being saved as the zip.
	        with requests.get(url, stream=True, timeout=60) as r:
	            r.raise_for_status()
	            with open(zip_path, "wb") as f:
	                for chunk in r.iter_content(chunk_size=8192):
	                    f.write(chunk)

	        st.write("📂 Extracting contents...")
	        # Extract into a staging directory and rename it only once extraction
	        # has succeeded. Otherwise a failure halfway through would leave a
	        # partial extract_dir that the existence check above treats as complete.
	        staging_dir = extract_dir + ".partial"
	        shutil.rmtree(staging_dir, ignore_errors=True)
	        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
	            zip_ref.extractall(staging_dir)
	        os.rename(staging_dir, extract_dir)

	        # ✅ Only do this once, right after unzip
	        patch_config_yaml_files()

	    # safe_symlink leaves existing links alone, so this is safe to repeat.
	    safe_symlink(os.path.join(extract_dir, "checkpoints"), "checkpoints")
	    safe_symlink(os.path.join(extract_dir, "data"), "data")

	    return extract_dir

	# Call it once and use it globally
	base_path = download_and_extract_from_hf()
	# NOTE(review): download_and_extract_from_hf() already calls
	# patch_config_yaml_files() right after unzipping; this second call sits at
	# module level and therefore runs on every execution of the script —
	# confirm whether it is still needed.
	patch_config_yaml_files()
	st.write("✅ Loaded assets to:", base_path)

	# CSS styling
	# st.markdown("""
	# <style>
	# html, body, [class*="css"] { font-size: 18px !important; }
	# div[data-testid="stSelectbox"] label,
	# div[data-testid="stNumberInput"] label,
	# div[data-testid="stTextInput"] label { font-size: 13px; padding-bottom: 0px; }
	# div[data-testid="stSlider"] label { font-size: 0px; }
	# div.stButton > button:first-child {
	# background-color: black; color: white;
	# font-size: 14px; padding: 4px 10px;
	# border-radius: 8px;
	# }
	# section[data-testid="stFileUploaderDropzone"] { padding: 2rem; }

	# .tooltip {
	# position: relative;
	# display: inline-block;
	# border-bottom: 1px dotted white;
	# cursor: help;
	# }

	# .tooltip .tooltiptext {
	# visibility: hidden;
	# width: 250px;
	# background-color: black;
	# color: #fff;
	# text-align: center;
	# border-radius: 6px;
	# padding: 6px;
	# position: absolute;
	# z-index: 1;
	# bottom: 125%; /* Position above */
	# left: 50%;
	# margin-left: -125px;
	# opacity: 0;
	# transition: opacity 0.3s;
	# }

	# .tooltip:hover .tooltiptext {
	# visibility: visible;
	# opacity: 1;
	# }
	# </style>
	# """, unsafe_allow_html=True)

	# Internal phoneme codes that are shown to the user under a friendlier label
	phoneme_display_map = {"ap": "Pause", "br": "Breath"}
	# Reverse lookup: label shown in the UI -> internal phoneme code
	display_to_phoneme = dict(zip(phoneme_display_map.values(), phoneme_display_map.keys()))
	# Dropdown choices: every permitted phoneme, with its label substituted where one exists
	full_phoneme_list_display = [phoneme_display_map.get(code, code) for code in permitted_phonemes]

	# Selectable pitches, D4 up to D5
	allowed_pitches = [
	    "D4", "D#4", "E4", "F4", "F#4", "G4", "G#4",
	    "A4", "A#4", "B4", "C5", "C#5", "D5",
	]

	# Page title
	st.title("CantusSVS: Latin Singing Voice Synthesis")

	st.markdown("""
	# About CantusSVS

	<p>CantusSVS is a web-based Singing Voice Synthesis (SVS) system designed for composers and musicians to synthesize Latin chant audio from a custom musical score.
	Built on top of the DiffSinger AI model, CantusSVS enables detailed, precise control over melody, rhythm, phonemes, and timing without any programming knowledge required.</p>

	<p>Designed by Liam Pond as the final project for MUS6329X: Projet en informatique musicale (Prof. Dominic Thibault) at the Université de Montréal. For more information, you can view the README.md under the 'Files' tab of this Space.</p>

	You can find DiffSinger in the following paper:
	DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism

	Liu, Jinglin, Chengxi Li, Yi Ren, Feiyang Chen, and Zhou Zhao. 2022. "Diffsinger: Singing Voice Synthesis via Shallow Diffusion Mechanism." In Proceedings of the AAAI Conference on Artificial Intelligence 36 10: 11020–11028. [https://arxiv.org/abs/2105.02446](http://dx.doi.org/10.1609/aaai.v36i10.21350).

	Model training was done using Cedar, a cluster provided by the Digital Research Alliance of Canada. To train your own model locally, follow [this tutorial](https://youtu.be/Sxt11TAflV0?feature=shared) by [tigermeat](https://www.youtube.com/@spicytigermeat).

	For general help training and creating a dataset, [this tutorial](https://docs.google.com/document/d/1uMsepxbdUW65PfIWL1pt2OM6ZKa5ybTTJOpZ733Ht6s/view) by [PixPrucer](https://bsky.app/profile/pixprucer.bsky.social) is an excellent guide. For help, join the [DiffSinger Discord server](https://discord.gg/DZ6fhEUfnb).

	The dataset used for this project was built using [Adventus: Dominica prima adventus Domini](https://youtu.be/ThnPySybDJs?feature=shared), the first track from [Psallentes](https://psallentes.com/)' album Salzinnes Saints. Psallentes is a Belgian women's chorus that specializes in Late Medieval and Renaissance music. Salzinnes Saints is an album of music from the [Salzinnes Antiphonal](https://www.smu.ca/academics/archives/the-salzinnes-antiphonal.html), a mid-sixteenth century choirbook with the music and text for the Liturgy of the Hours.

	---

	# How to Use CantusSVS

	## 1. Compose Your Music

	Compose the chant you want to synthesize using the notation software of your choice. [MuseScore 4](https://musescore.org/en/download) is recommended.
	The chant must adhere to the following conditions:

	- Monophonic only (one note at a time, no harmonies or chords)
	- Pitch range of <span class="tooltip">D4 to D5<span class="tooltiptext">Because training data was limited outside this range, synthesis outside these pitches is very poor.</span></span>
	- Lyrics (Latin) under each note, separated by syllable

	## 2. Export Your Score to MEI

	When your score is complete, export it to MEI.

	In MuseScore:
	- Go to File → Export
	- Choose the `.mei` file format
	- Save it to your computer

	## 3. Upload Your Score to CantusSVS

	In the CantusSVS web app:

	- Select MEI mode
	- Adjust the tempo if necessary using the provided slider
	- Upload your `.mei` file
	- Your score will be displayed using Verovio
	- You may use the demo `.mei` file if you wish

	## 4. Edit Phonemes, Durations, and Pitches

	CantusSVS automatically suggests phoneme splits for each syllable.
	However, you will have the opportunity to review phonemes, durations, and pitches.

	## 5. Synthesize the Audio

	When you're done:

	- Click Confirm
	- CantusSVS will create a `.ds` file, which is processed through pretrained DiffSinger models
	- The synthesized chant will be generated

	This can take a few minutes depending on input length

	## 6. Listen and Download

	After synthesis you can either listen to your chant directly in the app or download a `.wav` file to your computer.

	---
	""", unsafe_allow_html=True)

	# Inject a script that toggles the visibility/opacity of a span's first child
	# element on mouseover and mouseout.
	# NOTE(review): the selector looks for a span whose inline style contains
	# "border-bottom: 1px dotted black", while the tooltip span in the help text
	# of this file only carries class="tooltip" — confirm this still matches
	# anything, and that Streamlit actually executes <script> tags passed to
	# st.markdown.
	st.markdown("""
	<script>
	const tooltipSpan = window.parent.document.querySelector('span[style*="border-bottom: 1px dotted black"]');
	if (tooltipSpan) {
	tooltipSpan.addEventListener('mouseover', () => {
	tooltipSpan.children[0].style.visibility = 'visible';
	tooltipSpan.children[0].style.opacity = 1;
	});
	tooltipSpan.addEventListener('mouseout', () => {
	tooltipSpan.children[0].style.visibility = 'hidden';
	tooltipSpan.children[0].style.opacity = 0;
	});
	}
	</script>
	""", unsafe_allow_html=True)

	# Input mode chosen by the user; selects which branch of the UI below runs
	filetype = st.selectbox("Select file type:", ["MEI", "DS"])


	def handle_exception(context_message):
	    """Report the exception currently being handled and halt this script run.

	    Shows ``context_message`` as an error in the UI, prints a banner-delimited
	    traceback of the active exception to the console, and then calls
	    ``st.stop()``. Intended to be called from inside an ``except`` block.
	    """
	    banner = "=" * 30
	    st.error(f"{context_message}. See console.")
	    print("\n" + banner)
	    print(f"Exception during {context_message}")
	    traceback.print_exc()
	    print(banner + "\n")
	    st.stop()

	if filetype == "MEI":
	    # ------------------------------------------------------------------
	    # MEI workflow: pick a score, preview it, edit the phoneme split,
	    # then build a .ds file and run inference on it.
	    # ------------------------------------------------------------------
	    st.header("1. Select MEI Source")
	    use_demo = st.checkbox("Use demo MEI file", value=False)
	    tempo = st.slider("Tempo (BPM)", 1, 300, 60)

	    if use_demo:
	        mei_path = DEMO_FILES / "Demo1.mei"
	        if not mei_path.exists():
	            st.error("Demo MEI file missing.")
	            st.stop()
	        mei_file_bytes = mei_path.read_bytes()
	    else:
	        mei_file = st.file_uploader("Upload your MEI file", type="mei")
	        if not mei_file:
	            st.stop()
	        # Keep only the final path component of the client-supplied name so
	        # the upload is always written inside UPLOAD_MEI_DIR.
	        mei_path = UPLOAD_MEI_DIR / Path(mei_file.name).name
	        mei_file_bytes = mei_file.getvalue()
	        mei_path.write_bytes(mei_file_bytes)

	    try:
	        mei_text = mei_file_bytes.decode("utf-8")
	    except UnicodeDecodeError:
	        st.error("The MEI file is not valid UTF-8 text.")
	        st.stop()

	    try:
	        raw_notes = parse_mei_for_editor(mei_path, tempo)
	    except Exception:
	        handle_exception("MEI parsing")

	    # Always update session state
	    st.session_state.original_raw_notes = raw_notes

	    # Default phoneme split for every note of the score at the current tempo
	    syllable_groups = [
	        {
	            "syllable": note["lyric"],
	            "phonemes": split_syllable(
	                syllable=note["lyric"],
	                note_duration=note["duration"],
	                tempo=tempo,
	                pitch=note["pitch"],
	            ),
	        }
	        for note in raw_notes
	    ]

	    # Reset the editor whenever a different score is loaded. Without this the
	    # edits made for the previous file would be shown (and synthesized) for
	    # the new one, and the per-note indexing below could run out of range.
	    source_id = (mei_path.name, mei_file_bytes)
	    is_new_source = st.session_state.get("mei_source_id") != source_id
	    if is_new_source or "edited_syllables" not in st.session_state:
	        st.session_state.edited_syllables = syllable_groups
	        st.session_state.previous_tempo = tempo
	        st.session_state.mei_source_id = source_id
	        # Drop remembered values of the editor widgets so they start from the
	        # new score's defaults; the widgets are only created further down.
	        for state_key in list(st.session_state.keys()):
	            if isinstance(state_key, str) and state_key.startswith(
	                ("phoneme_", "duration_num_", "pitch_")
	            ):
	                del st.session_state[state_key]

	    st.subheader("Score Preview")
	    # Hand the score to the page as a JSON string literal instead of pasting
	    # it into a JS template literal: backticks, "${" or "</script>" inside the
	    # file can then neither break the page nor inject script. "<" is emitted
	    # as \u003c so no markup sequence can appear inside the <script> element.
	    mei_js_literal = json.dumps(mei_text).replace("<", "\\u003c")
	    components.html(
	        '<div id="app" style="border: 1px solid lightgray; min-height: 400px;"></div>'
	        '<script type="module">'
	        "import 'https://editor.verovio.org/javascript/app/verovio-app.js';"
	        'const app=new Verovio.App(document.getElementById("app"),'
	        "{defaultView:'document',documentZoom:4});"
	        f"app.loadData({mei_js_literal});"
	        "</script>",
	        height=500,
	    )

	    st.header("2. Edit Phonemes, Durations, and Pitches")
	    updated_syllables = []

	    if "previous_tempo" not in st.session_state:
	        st.session_state.previous_tempo = tempo

	    if tempo != st.session_state.previous_tempo:
	        # Re-split every note at the new tempo. The user's phoneme and pitch
	        # choices are carried over position by position; everything else
	        # comes from the fresh split. zip() stops at the shorter sequence, so
	        # phonemes without an earlier counterpart keep their fresh values.
	        for note, group in zip(raw_notes, st.session_state.edited_syllables):
	            updated = split_syllable(
	                syllable=note["lyric"],
	                note_duration=note["duration"],
	                tempo=tempo,
	                pitch=note["pitch"],
	            )
	            for ph, existing in zip(updated, group["phonemes"]):
	                ph["phoneme"] = existing["phoneme"]
	                ph["pitch"] = existing["pitch"]
	            group["phonemes"] = updated
	        st.session_state.previous_tempo = tempo

	    # Newer Streamlit releases provide st.rerun and no longer ship
	    # st.experimental_rerun; use whichever the installed version has.
	    rerun = getattr(st, "rerun", None) or st.experimental_rerun

	    for idx, group in enumerate(st.session_state.edited_syllables):
	        st.markdown(f"#### {group['syllable'].capitalize()}")
	        new_phonemes = []
	        for j, ph in enumerate(group["phonemes"]):
	            # Phoneme | duration | pitch | delete button
	            col1, col2, col3, col4 = st.columns([3, 3, 3, 1])
	            with col1:
	                shown = phoneme_display_map.get(ph["phoneme"], ph["phoneme"])
	                # Fall back to the first entry for a phoneme that is not in
	                # the permitted list, mirroring the pitch selector below.
	                phoneme_display = st.selectbox(
	                    "Phoneme",
	                    full_phoneme_list_display,
	                    index=(
	                        full_phoneme_list_display.index(shown)
	                        if shown in full_phoneme_list_display
	                        else 0
	                    ),
	                    key=f"phoneme_{idx}_{j}",
	                )
	                phoneme_internal = display_to_phoneme.get(phoneme_display, phoneme_display)
	            with col2:
	                # Clamp into the widget's range: a value outside
	                # [min_value, max_value] makes st.number_input raise, which
	                # long notes at slow tempi would otherwise trigger.
	                duration = st.number_input(
	                    "Duration (seconds)",
	                    min_value=0.0, max_value=5.0,
	                    value=min(max(float(ph["duration"]), 0.0), 5.0),
	                    step=0.01, format="%.2f",
	                    key=f"duration_num_{idx}_{j}",
	                )
	            with col3:
	                pitch = st.selectbox(
	                    "Pitch",
	                    allowed_pitches,
	                    index=allowed_pitches.index(ph["pitch"]) if ph["pitch"] in allowed_pitches else 0,
	                    key=f"pitch_{idx}_{j}",
	                )
	            with col4:
	                if st.button("❌", key=f"remove_{idx}_{j}"):
	                    # Mutates the list held in session state, then restarts
	                    # the script so the editor is redrawn without this row.
	                    group["phonemes"].pop(j)
	                    rerun()

	            new_phonemes.append({"phoneme": phoneme_internal, "duration": duration, "pitch": pitch})

	        if st.button("➕ Add Phoneme", key=f"add_phoneme_{idx}"):
	            group["phonemes"].append({"phoneme": "a", "duration": 0.2, "pitch": "D4"})
	            rerun()

	        updated_syllables.append({"syllable": group["syllable"], "phonemes": new_phonemes})
	        st.divider()

	    # Persist what the widgets currently show
	    st.session_state.edited_syllables = updated_syllables

	    st.header("3. Synthesize")
	    confirm_clicked = st.button("✅ Synthesize", key="confirm_button_mei")

	    if confirm_clicked:
	        ds_path = TMP_DS_DIR / f"{mei_path.stem}.ds"
	        try:
	            # Flatten the per-syllable groups into one phoneme sequence
	            all_phonemes = [ph for syllable in st.session_state.edited_syllables for ph in syllable["phonemes"]]
	            build_ds_from_notes(all_phonemes, ds_path)
	            with open(ds_path, "r", encoding="utf-8") as f:
	                ds_data = json.load(f)
	            validate_ds(ds_data)
	            st.success(f"DS file created: {ds_path.name}")
	        except Exception:
	            handle_exception("DS generation or validation")

	        with st.spinner("Running DiffSinger inference…"):
	            try:
	                wav_path = run_inference(ds_path, OUTPUT_DIR, mei_path.stem)
	            except Exception:
	                handle_exception("inference")

	        st.success("Synthesis complete!")
	        st.audio(str(wav_path))
	        # Read the audio into memory so no file handle is left open.
	        with open(wav_path, "rb") as wav_file:
	            wav_bytes = wav_file.read()
	        st.download_button("Download WAV", data=wav_bytes, file_name=wav_path.name)

	elif filetype == "DS":
	st.header("1. Upload DS File")
	ds_file = st.file_uploader("Upload your .ds file", type=["ds", "json"])

	st.header("2. Synthesize")
	synth_clicked = st.button("✅ Synthesize", key="synthesize_button_ds")

	if synth_clicked:
	if not ds_file:
	st.error("Please upload a .ds file.")
	st.stop()
	ds_path = UPLOAD_DS_DIR / ds_file.name
	with open(ds_path, "wb") as f:
	f.write(ds_file.getbuffer())
	with open(ds_path, "r", encoding="utf-8") as f:
	ds_data = json.load(f)

	try:
	validate_ds(ds_data)
	except Exception as e:
	st.error(f"Invalid DS file: {e}")
	st.stop()

	with st.spinner("Running DiffSinger inference…"):
	try:
	wav_path = run_inference(ds_path, OUTPUT_DIR, ds_path.stem)
	except Exception:
	handle_exception("inference")

	st.success("Synthesis complete!")
	st.audio(str(wav_path))
	st.download_button("Download WAV", data=open(wav_path, "rb"), file_name=wav_path.name)