Spaces:
Sleeping
Sleeping
| import os | |
| # Activer le serveur MCP | |
| os.environ['GRADIO_MCP_SERVER'] = 'True' | |
| import gradio as gr | |
| import torchaudio | |
| import torch | |
| from pydub import AudioSegment, effects | |
| import uuid | |
| import subprocess | |
| import time | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| from pathlib import Path | |
| import sys | |
| from pydub.silence import split_on_silence | |
| import re | |
| from unicodedata import normalize | |
| import numpy as np | |
| import spaces | |
| from huggingface_hub import snapshot_download | |
| import threading | |
| import requests | |
| import tempfile | |
# Download the NLTK resources ("punkt" and "punkt_tab") at import time,
# without console output.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)
# Characters and tokens flagged as potentially problematic for speech
# synthesis, keyed by language code. The 'global' list is combined with the
# list of the requested language by analyze_text.
# Typographic (curly) quotes are written as Unicode escapes so they cannot be
# silently flattened into straight quotes by an editor or a copy/paste.
PROBLEMATIC_CHARS = {
    'global': [
        '&', '%', '@', '#', '$', '*', '+', '=',
        '()', '[]', '{}', '<>', '|', '/', '\\',
        '"',
        '…',
        '«', '»',
        '\u201c',  # left double quotation mark
        '\u201d',  # right double quotation mark
        '\u2018',  # left single quotation mark
        '\u2019',  # right single quotation mark / typographic apostrophe
    ],
    'fr': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    'en': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    # Add specific characters for each language as needed
}
# Replacement rules applied by preprocess_text.
# Structure: {scope: {character: {language_code: replacement}}}. For each
# character the rule of the requested language is used when present,
# otherwise the 'default' entry.
# Typographic (curly) quotes are written as Unicode escapes so that each one
# stays a distinct key instead of collapsing into the straight-quote keys.
REPLACEMENT_RULES = {
    'global': {
        '&': {'fr': ' et ', 'en': ' and ', 'es': ' y ', 'de': ' und ', 'it': ' e ', 'pt': ' e ', 'default': ' and '},
        '%': {'fr': ' pourcent ', 'en': ' percent ', 'de': ' prozent ', 'default': ' percent '},
        '@': {'fr': ' arobase ', 'en': ' at ', 'default': ' at '},
        '#': {'fr': ' hashtag ', 'en': ' hashtag ', 'default': ' hashtag '},
        '...': {'default': ', '},
        '…': {'default': ', '},
        '"': {'default': ''},
        "'": {'default': ''},
        '«': {'default': ''},
        '»': {'default': ''},
        '\u201c': {'default': ''},  # left double quotation mark
        '\u201d': {'default': ''},  # right double quotation mark
        '\u2018': {'default': ''},  # left single quotation mark
        '\u2019': {'default': ''},  # right single quotation mark
    },
    # You can add language-specific rules
}
def analyze_text(text, language_code):
    """Analyze text to detect potential pronunciation issues for voice synthesis.

    The text is scanned for emojis, URLs, e-mail addresses, quote characters,
    special characters, long numbers, Roman numerals, abbreviations, repeated
    punctuation, punctuation without surrounding spaces and a few
    language-specific patterns (French ordinals, English numeric dates).

    Args:
        text: The text to analyze for speech synthesis compatibility
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)

    Returns:
        Dictionary with three keys:
        - 'issues': list of dicts, each with 'type', 'description',
          'instances' and 'suggestion'
        - 'has_issues': True when at least one issue was found
        - 'normalized_text': the NFC-normalized input text
    """
    issues = []

    def add_issue(issue_type, description, instances, suggestion):
        """Record an issue, but only when at least one instance was found."""
        if instances:
            issues.append({
                'type': issue_type,
                'description': description,
                'instances': instances,
                'suggestion': suggestion,
            })

    # Basic unicode normalization (returned to the caller; detection itself
    # runs on the text as given).
    normalized_text = normalize('NFC', text)

    # Emoji detection. The ranges are deliberately narrow: a single wide range
    # such as U+24C2..U+1F251 would also cover CJK ideographs, kana and Hangul
    # and report ordinary Chinese/Japanese/Korean text as emojis.
    emoji_pattern = re.compile(
        "["
        "\U000024C2"             # circled latin capital letter M
        "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
        "\U00002B00-\U00002BFF"  # misc symbols and arrows
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # geometric shapes extended
        "\U0001F800-\U0001F8FF"  # supplemental arrows-C
        "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
        "]+",
        flags=re.UNICODE,
    )
    add_issue(
        'emojis',
        'Emojis that will be removed during preprocessing',
        emoji_pattern.findall(text),
        'Emojis are replaced with spaces for better pronunciation',
    )

    # URL detection
    add_issue(
        'url',
        'Detected URLs that may be mispronounced',
        re.findall(r'https?://\S+|www\.\S+', text),
        'Replace URLs with textual descriptions',
    )

    # Email detection
    add_issue(
        'email',
        'Detected email addresses that may be mispronounced',
        re.findall(r'\S+@\S+\.\S+', text),
        'Replace emails with descriptive text',
    )

    # Quotes and citation characters. The straight apostrophe is never part of
    # this list; curly quotes are spelled as escapes so they stay distinct
    # from the straight ones.
    quote_chars = ['"', '«', '»', '\u201c', '\u201d', '\u2018', '\u2019']
    # Quote characters are not reported at all for English.
    found_quotes = [] if language_code == 'en' else [c for c in quote_chars if c in text]
    add_issue(
        'quotes',
        'Quotes and citation characters that may affect pronunciation',
        found_quotes,
        'Remove quotes and citation characters for better pronunciation',
    )

    # Problematic characters: global + language-specific, minus the straight
    # apostrophe and minus the quote characters handled above.
    global_chars = [c for c in PROBLEMATIC_CHARS.get('global', []) if c != "'"]
    lang_specific_chars = PROBLEMATIC_CHARS.get(language_code, [])
    candidates = set(global_chars + lang_specific_chars) - set(quote_chars)
    # Sorted so the reported order is stable from one run to the next.
    add_issue(
        'special_chars',
        'Special characters that may cause pronunciation problems',
        sorted(c for c in candidates if c in text),
        'Replace special characters with their textual equivalent',
    )

    # Long numbers (4 digits or more)
    numbers_suggestion = "Write numbers in full"
    if language_code == 'fr':
        numbers_suggestion += " or add spaces between thousands (e.g., 10 000)"
    elif language_code == 'en':
        numbers_suggestion += " or use commas for thousands (e.g., 10,000)"
    add_issue(
        'numbers',
        'Long numbers that may be mispronounced',
        re.findall(r'\b\d{4,}\b', text),
        numbers_suggestion,
    )

    # Roman numerals. In English a lone "I" is the personal pronoun, so it is
    # excluded there.
    roman_pattern = r'\b(?!I\b)[IVXLCDM]+\b' if language_code == 'en' else r'\b[IVXLCDM]+\b'
    add_issue(
        'roman_numerals',
        'Roman numerals that may be mispronounced',
        re.findall(roman_pattern, text),
        'Replace Roman numerals with Arabic numbers',
    )

    # Abbreviations by language (generic patterns for languages without a list)
    abbreviation_patterns = {
        'fr': [r'\bM\.\s', r'\bMme\.\s', r'\bMlle\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\bex\.\s'],
        'en': [r'\bMr\.\s', r'\bMrs\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\be\.g\.\s', r'\bi\.e\.\s'],
        'es': [r'\bSr\.\s', r'\bSra\.\s', r'\bDr\.\s', r'\betc\.\s'],
        'default': [r'\b[A-Z]\.\s', r'\b[A-Z][a-z]+\.\s'],
    }
    found_abbrevs = []
    for pattern in abbreviation_patterns.get(language_code, abbreviation_patterns['default']):
        found_abbrevs.extend(re.findall(pattern, text))
    add_issue(
        'abbreviations',
        'Detected abbreviations that may be mispronounced',
        found_abbrevs,
        'Write abbreviations in full',
    )

    # Repeated punctuation
    add_issue(
        'repeated_punct',
        'Repeated punctuation that may cause incorrect pauses',
        re.findall(r'([!?.,;:]{2,})', text),
        'Simplify punctuation (use only one character)',
    )

    # Punctuation glued between two letters (digits are not matched, so
    # decimal numbers are ignored). This check is skipped for English.
    missing_spaces = []
    if language_code != 'en':
        missing_spaces = re.findall(r'[a-zA-ZÀ-ÿ][,.;:!?][a-zA-ZÀ-ÿ]', text)
    add_issue(
        'missing_spaces',
        'Punctuation without spaces that may affect pronunciation',
        missing_spaces,
        'Add appropriate spaces around punctuation (except for decimal numbers)',
    )

    # Language-specific checks
    if language_code == 'fr':
        # Non-capturing group so the whole token ("2ème") is reported rather
        # than only its suffix ("ème").
        add_issue(
            'fr_ordinals',
            'Ordinal numbers that may be mispronounced',
            re.findall(r'\b\d+(?:eme|ème|er|ere|ère)\b', text),
            'Write ordinals in letters (premier, deuxième, etc.)',
        )
    elif language_code == 'en':
        add_issue(
            'en_dates',
            'Dates in numeric format that may be misinterpreted',
            re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', text),
            'Write dates in full (e.g., January 1st, 2022)',
        )

    return {
        'issues': issues,
        'has_issues': len(issues) > 0,
        'normalized_text': normalized_text,
    }
| # Add a function to convert numbers to text | |
def number_to_text_fr(number_str):
    """
    Converts a number (integer or decimal) to French text.

    The decimal separator may be a comma or a point. A decimal part of one or
    two digits is read as a number ("2,95" -> "deux virgule quatre-vingt-quinze"),
    keeping a leading zero ("2,05" -> "deux virgule zéro cinq"); longer decimal
    parts are read digit by digit.

    Args:
        number_str (str): The number to convert to text format

    Returns:
        str: The number written out in words

    Raises:
        ValueError: If the integer or decimal part is not made of digits
    """
    units = ('', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf')
    teens = ('dix', 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize',
             'dix-sept', 'dix-huit', 'dix-neuf')
    tens = ('', 'dix', 'vingt', 'trente', 'quarante', 'cinquante', 'soixante')

    def below_hundred(n):
        """Spell an integer in the range 1..99."""
        if n < 10:
            return units[n]
        if n < 20:
            return teens[n - 10]
        ten, unit = divmod(n, 10)
        if ten == 7:
            # 70..79 are built on "soixante" + 10..19 (71 takes "et").
            return 'soixante-et-onze' if unit == 1 else 'soixante-' + teens[unit]
        if ten == 8:
            # "quatre-vingts" takes its plural "s" only when nothing follows.
            return 'quatre-vingt-' + units[unit] if unit else 'quatre-vingts'
        if ten == 9:
            # 90..99 are built on "quatre-vingt" + 10..19.
            return 'quatre-vingt-' + teens[unit]
        if unit == 0:
            return tens[ten]
        if unit == 1:
            return tens[ten] + '-et-un'
        return tens[ten] + '-' + units[unit]

    def below_thousand(n):
        """Spell an integer in the range 1..999."""
        hundreds, rest = divmod(n, 100)
        if hundreds == 0:
            return below_hundred(rest)
        head = 'cent' if hundreds == 1 else units[hundreds] + ' cent'
        if rest == 0:
            # "cents" is plural only for exact multiples (deux cents).
            return head + 's' if hundreds > 1 else head
        return head + ' ' + below_hundred(rest)

    def int_to_text(digits):
        """Spell a non-negative integer given as a digit string or an int."""
        n = int(digits) if digits != '' else 0
        if n == 0:
            return 'zéro'
        words = []
        for scale, name in ((10 ** 9, 'milliard'), (10 ** 6, 'million')):
            count, n = divmod(n, scale)
            if count:
                words.append(int_to_text(count) + ' ' + name + ('s' if count > 1 else ''))
        thousands, n = divmod(n, 1000)
        if thousands == 1:
            words.append('mille')
        elif thousands:
            prefix = below_thousand(thousands)
            # "cent" and "vingt" lose their plural "s" in front of "mille".
            if prefix.endswith(('cents', 'vingts')):
                prefix = prefix[:-1]
            words.append(prefix + ' mille')
        if n:
            words.append(below_thousand(n))
        return ' '.join(words)

    cleaned = number_str.strip()
    negative = cleaned.startswith('-')
    if negative:
        cleaned = cleaned[1:]
    parts = cleaned.replace(',', '.').split('.')

    result = int_to_text(parts[0])

    # If there's a decimal part
    if len(parts) > 1 and parts[1]:
        decimal_part = parts[1]
        if len(decimal_part) <= 2:
            if len(decimal_part) == 2 and decimal_part[0] == '0' and decimal_part[1] != '0':
                # Keep the leading zero: "05" must not be read like "5".
                decimal_text = 'zéro ' + int_to_text(decimal_part[1])
            else:
                decimal_text = int_to_text(decimal_part)
        else:
            # For more than 2 digits, we pronounce each digit
            decimal_text = ' '.join(int_to_text(d) for d in decimal_part)
        result = f"{result} virgule {decimal_text}"

    return 'moins ' + result if negative else result
def preprocess_text(text, language_code, apply_replacements=True):
    """Preprocess and clean text for optimal voice synthesis results.

    Steps, in order: Unicode NFC normalization, URL / e-mail replacement,
    optional emoji removal and global character replacements, quote removal,
    Roman numeral conversion (fr, en, es, it, pt), language-specific number
    and punctuation handling, and whitespace collapsing.

    Args:
        text: The text to preprocess for voice synthesis
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)
        apply_replacements: If True, removes emojis and applies the global
            REPLACEMENT_RULES character replacements

    Returns:
        The preprocessed text ready for high-quality voice synthesis
    """
    # Unicode normalization
    text = normalize('NFC', text)

    # URLs and e-mail addresses are replaced first: the character replacements
    # below rewrite '@', '#' and '&', after which these patterns could no
    # longer recognise them.
    text = re.sub(r'https?://\S+|www\.\S+', ' URL link ', text)
    text = re.sub(r'\S+@\S+\.\S+', ' email address ', text)

    if apply_replacements:
        # Emoji removal. The ranges are deliberately narrow: a single wide
        # range such as U+24C2..U+1F251 would also cover CJK ideographs, kana
        # and Hangul and erase Chinese/Japanese/Korean text.
        emoji_pattern = re.compile(
            "["
            "\U000024C2"             # circled latin capital letter M
            "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
            "\U00002B00-\U00002BFF"  # misc symbols and arrows
            "\U0000FE0F"             # variation selector-16 (emoji presentation)
            "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F700-\U0001F77F"  # alchemical symbols
            "\U0001F780-\U0001F7FF"  # geometric shapes extended
            "\U0001F800-\U0001F8FF"  # supplemental arrows-C
            "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
            "\U0001FA00-\U0001FA6F"  # chess symbols
            "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
            "]+",
            flags=re.UNICODE,
        )
        # Emojis are replaced with a space
        text = emoji_pattern.sub(' ', text)

        # Apply global replacement rules
        for char, replacements in REPLACEMENT_RULES.get('global', {}).items():
            if char in text:
                # Use language-specific rule if available, otherwise default rule
                replacement = replacements.get(language_code, replacements.get('default', char))
                text = text.replace(char, replacement)

    # Remove quotes: straight " and ', French « », and the curly quotes
    # U+201C, U+201D, U+2018, U+2019 (written as escapes so they cannot be
    # flattened into straight quotes).
    text = text.translate(str.maketrans('', '', '"\'\u00ab\u00bb\u201c\u201d\u2018\u2019'))

    # Replace Roman numerals with their equivalent (if needed)
    if language_code in ['fr', 'en', 'es', 'it', 'pt']:
        roman_numerals = {
            'I': '1', 'II': '2', 'III': '3', 'IV': '4', 'V': '5',
            'VI': '6', 'VII': '7', 'VIII': '8', 'IX': '9', 'X': '10',
            'XI': '11', 'XII': '12', 'XIII': '13', 'XIV': '14', 'XV': '15',
            'XVI': '16', 'XVII': '17', 'XVIII': '18', 'XIX': '19', 'XX': '20'
        }
        for roman, arabic in roman_numerals.items():
            if language_code == 'en' and roman == 'I':
                # In English a lone "I" is the personal pronoun; it is only
                # converted when directly followed by a period (I. => 1.).
                text = re.sub(r'\b(I)\.', arabic + '.', text)
            else:
                text = re.sub(fr'\b{roman}\b', arabic, text)

    # In English and in languages without dedicated rules: no space before
    # punctuation, one space after. A '.', ',' or ':' sitting directly between
    # two digits is left alone so that "3.14", "10,000" and "10:30" survive.
    spaced_punctuation = r'\s*([;!?]|[.,:](?!\d)|(?<!\d)[.,:](?=\d))\s*'

    # Language-specific processing for French
    if language_code == 'fr':
        # Replace common ordinals
        text = re.sub(r'\b1er\b', 'premier', text)
        text = re.sub(r'\b1ère\b', 'première', text)
        text = re.sub(r'\b(\d+)(ème)\b', r'\1 ième', text)
        # Decimal numbers followed by % (with or without space), e.g. "2,95 %"
        text = re.sub(
            r'(\d+,\d+)\s*%',
            lambda m: number_to_text_fr(m.group(1)) + " pour cent",
            text,
        )
        # Decimal numbers (with comma) without percentage
        text = re.sub(r'\b\d+,\d+\b', lambda m: number_to_text_fr(m.group(0)), text)
        # Simple percentages
        text = re.sub(r'(\d+)\s*%', lambda m: number_to_text_fr(m.group(1)) + " pour cent", text)
        # Apply French typographical rules for punctuation:
        # - No space before, space after: . , )
        # - Space before and after: : ; ! ?
        # First, normalize by removing all spaces around punctuation
        text = re.sub(r'\s*([.,;:!?\[\]\(\)\{\}])\s*', r'\1', text)
        # Then, add spaces according to French rules
        text = re.sub(r'([.,)])', r'\1 ', text)
        text = re.sub(r'([;:!?])', r' \1 ', text)
        # Special case for French quotes
        text = re.sub(r'«', r'« ', text)
        text = re.sub(r'»', r' »', text)
    # Language-specific processing for English
    elif language_code == 'en':
        # Replace ordinals
        text = re.sub(r'\b1st\b', 'first', text)
        text = re.sub(r'\b2nd\b', 'second', text)
        text = re.sub(r'\b3rd\b', 'third', text)
        # Process percentages in English (decimals with point)
        text = re.sub(r'(\d+\.\d+)%', r'\1 percent', text)
        text = re.sub(r'(\d+)%', r'\1 percent', text)
        text = re.sub(spaced_punctuation, r'\1 ', text)
    else:
        text = re.sub(spaced_punctuation, r'\1 ', text)

    # Clean up multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def format_issues_for_display(analysis_result, language_code, tokenizer_analysis=None):
    """
    Builds the human-readable report shown in the interface.

    Args:
        analysis_result (dict): Text analysis result; its 'has_issues' and
            'issues' keys are read
        language_code (str): Language code (not used by the formatting itself)
        tokenizer_analysis (dict): Optional tokenizer analysis result with the
            same keys, plus an optional 'cleaned_text'

    Returns:
        str: A single "no issues" line when neither analysis reports issues,
        otherwise the formatted list of issues followed by a preprocessing hint
    """
    def render(issue_list):
        """Format each issue as a description / detected / suggestion triplet."""
        rendered = []
        for entry in issue_list:
            rendered.append(f"- {entry['description']}:\n")
            rendered.append(f" • Detected: {', '.join(repr(i) for i in entry['instances'])}\n")
            rendered.append(f" • Suggestion: {entry['suggestion']}\n\n")
        return ''.join(rendered)

    # Nothing to report from either analysis.
    if not (
        analysis_result['has_issues']
        or (tokenizer_analysis is not None and tokenizer_analysis['has_issues'])
    ):
        return "✅ No issues detected in the text."

    sections = ["⚠️ Potential issues detected:\n\n"]

    # Standard text analysis section
    if analysis_result['has_issues']:
        sections.append("📊 Text analysis results:\n")
        sections.append(render(analysis_result['issues']))

    # Tokenizer analysis section (if available)
    if tokenizer_analysis and tokenizer_analysis['has_issues']:
        sections.append("\n🔍 Tokenizer analysis results:\n")
        sections.append(render(tokenizer_analysis['issues']))
        if 'cleaned_text' in tokenizer_analysis:
            sections.append("\n📝 Cleaned text by XTTS tokenizer:\n")
            sections.append(f"{tokenizer_analysis['cleaned_text']}\n\n")

    sections.append("\nEnable text preprocessing to automatically fix some of these issues.")
    return ''.join(sections)
# Local directory (relative to the current working directory) that holds the
# XTTS-v2 model files.
repo_id = "XTTS-v2"

# Download the model only when it is not already present: both the directory
# and its config.json must exist.
if os.path.exists(repo_id) and os.path.exists(os.path.join(repo_id, "config.json")):
    print("Modèle XTTS-v2 déjà présent.")
else:
    try:
        print("Téléchargement du modèle XTTS-v2...")
        snapshot_download(
            repo_id="coqui/XTTS-v2",
            local_dir=repo_id,
            allow_patterns=["*.safetensors", "*.wav", "*.json", "*.pth"]
        )
        print("Modèle téléchargé avec succès!")
    except Exception as e:
        # The hub download failed: fall back to a plain git clone.
        print(f"Erreur lors du téléchargement: {e}")
        print("Essai avec git clone...")
        _manual_hint = (
            "Veuillez télécharger le modèle manuellement avec: "
            "git clone https://huggingface.co/coqui/XTTS-v2"
        )
        try:
            result = subprocess.run(
                ["git", "clone", "https://huggingface.co/coqui/XTTS-v2", repo_id],
                capture_output=True,
                text=True
            )
        except OSError as git_error:
            # Raised when the git executable cannot be started.
            print(f"Erreur git clone: {git_error}")
            raise RuntimeError(_manual_hint) from git_error
        if result.returncode != 0:
            print(f"Erreur git clone: {result.stderr}")
            # Chain to the original download error so both causes are visible.
            raise RuntimeError(_manual_hint) from e
        print("Modèle téléchargé avec git clone!")
# Filesystem layout. Everything except the model directory is anchored on the
# folder containing this file; the model directory is the download target
# named by repo_id.
BASE_DIR = Path(os.path.abspath(__file__)).parent
MODELS_DIR = repo_id  # BASE_DIR / "XTTS-v2"
REF_AUDIO_DIR = BASE_DIR / "ref_audio_files"
OUTPUT_DIR = BASE_DIR / "outputs"
TEMP_DIR = OUTPUT_DIR / "temp"

# Create the working folders (OUTPUT_DIR before TEMP_DIR, which lives inside it).
for _directory in (REF_AUDIO_DIR, OUTPUT_DIR, TEMP_DIR):
    _directory.mkdir(exist_ok=True)

# Display name -> language code for the languages offered to the user.
SUPPORTED_LANGUAGES = {
    "English": "en", "French": "fr", "Spanish": "es", "German": "de",
    "Italian": "it", "Portuguese": "pt", "Polish": "pl", "Turkish": "tr",
    "Russian": "ru", "Dutch": "nl", "Czech": "cs", "Arabic": "ar",
    "Chinese": "zh-cn", "Japanese": "ja", "Korean": "ko", "Hungarian": "hu",
    "Hindi": "hi",
}

print(f"Initializing model from: {MODELS_DIR}")
| # Clean temporary files | |
def cleanup_temp_files(temp_dir=None):
    """Delete the regular files located directly inside the temporary folder.

    Sub-directories are left untouched. A file that cannot be removed is
    reported on stdout and the cleanup continues with the remaining files.

    Args:
        temp_dir: Folder to clean; defaults to the module-level TEMP_DIR

    Returns:
        None
    """
    folder = Path(TEMP_DIR if temp_dir is None else temp_dir)
    for entry in folder.glob("*"):
        try:
            if entry.is_file():
                entry.unlink()
        except OSError as e:
            print(f"Error while cleaning temporary files: {e}")
| # Clean old generated MP3 files (optional) | |
def cleanup_old_outputs(max_age_days=7, output_dir=None):
    """Delete MP3 files older than max_age_days from the output folder.

    Age is measured from the file's modification time. A file that cannot be
    inspected or removed is reported on stdout and the cleanup continues with
    the remaining files.

    Args:
        max_age_days: Maximum age, in days, of the files to keep
        output_dir: Folder to clean; defaults to the module-level OUTPUT_DIR

    Returns:
        None
    """
    folder = Path(OUTPUT_DIR if output_dir is None else output_dir)
    # Files last modified before this timestamp are removed.
    cutoff = time.time() - (max_age_days * 86400)
    for entry in folder.glob("*.mp3"):
        try:
            if entry.is_file() and entry.stat().st_mtime < cutoff:
                entry.unlink()
        except OSError as e:
            print(f"Error while cleaning old outputs: {e}")
# Import the XTTS configuration and model classes. If the import fails, the
# error and an installation hint are printed and the process exits with
# status 1.
try:
    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import Xtts
except ImportError as e:
    print(f"TTS import error: {e}")
    print("Please install dependencies with: pip install coqui-tts")
    sys.exit(1)
| # Install language-specific dependencies | |
def install_language_dependencies():
    """Check and install the packages required for Chinese, Japanese and Korean.

    Each required module is imported; when the import fails, the matching
    packages are installed with pip through the current interpreter. A failure
    for one language is reported on stdout and does not prevent the others
    from being processed.

    Returns:
        bool: True when every module is importable or was installed without
        error, False when at least one check or installation failed
    """
    import importlib

    # (importable module, pip packages installed when that module is missing)
    requirements = (
        # Chinese (zh-cn)
        ("pypinyin", ["pypinyin"]),
        # Japanese (ja): a missing cutlet installs the whole set, so the
        # fugashi check that follows only installs when cutlet was present.
        ("cutlet", ["cutlet", "fugashi", "mecab-python3", "unidic-lite"]),
        ("fugashi", ["fugashi", "mecab-python3", "unidic-lite"]),
        # Korean (ko)
        ("hangul_romanize", ["hangul-romanize"]),
    )

    all_ok = True
    for module_name, pip_packages in requirements:
        try:
            importlib.import_module(module_name)
            continue
        except ImportError:
            pass  # Missing: install it below.
        except Exception as e:
            # The module exists but fails while importing; installing again
            # would not help, so report it and move on.
            print(f"Error while importing {module_name}: {e}")
            all_ok = False
            continue

        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_packages])
            # Let the import system notice the freshly installed packages.
            importlib.invalidate_caches()
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Could not install {', '.join(pip_packages)}: {e}")
            all_ok = False
    return all_ok
# Model initialization and configuration. Any failure is reported and ends the
# process with status 1.
try:
    # Try to install language dependencies (its result is not checked here)
    install_language_dependencies()
    config = XttsConfig()
    # Read the configuration from the same directory the checkpoint is loaded
    # from, instead of repeating the directory name as a literal.
    config.load_json(os.path.join(str(MODELS_DIR), "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir=str(MODELS_DIR), eval=True)
    if torch.cuda.is_available():
        model.cuda()
        print("Model loaded on GPU")
    else:
        print("GPU not available, using CPU")
except Exception as e:
    print(f"Error loading model: {e}")
    print(f"Make sure the XTTS-v2 model is present in: {MODELS_DIR}")
    sys.exit(1)
def remove_silence(
    audio_segment,
    silence_thresh=-45,
    min_silence_len=300,
    keep_silence=100
):
    """
    Split audio_segment on its silences and rebuild it from the non-silent
    pieces.

    Inputs shorter than 1000 (by len()) are returned unchanged. If the first
    split yields fewer than two pieces, a second split is attempted with a
    200 minimum silence length and a threshold raised by 5. If the rebuilt
    audio is shorter than 70% of the input, the split is redone with doubled
    min_silence_len / keep_silence and a threshold lowered by 5. Whenever a
    split yields nothing, the original audio_segment is returned.
    """
    def split(min_len, thresh, keep):
        """Run split_on_silence on the input with the given settings."""
        return split_on_silence(
            audio_segment,
            min_silence_len=min_len,
            silence_thresh=thresh,
            keep_silence=keep
        )

    def join(pieces):
        """Concatenate the pieces, in order, into one AudioSegment."""
        combined = AudioSegment.empty()
        for piece in pieces:
            combined += piece
        return combined

    # Too short to be worth processing.
    if len(audio_segment) < 1000:
        return audio_segment

    # First attempt with the caller's parameters.
    pieces = split(min_silence_len, silence_thresh, keep_silence)

    # Nothing (or a single piece) detected: retry with looser parameters.
    if not pieces or len(pieces) < 2:
        pieces = split(200, silence_thresh + 5, keep_silence)

    # The whole input was considered silence: keep the original.
    if not pieces:
        return audio_segment

    rebuilt = join(pieces)
    if len(rebuilt) / len(audio_segment) >= 0.7:
        return rebuilt

    # More than 30% was removed: redo the split less aggressively.
    pieces = split(min_silence_len * 2, silence_thresh - 5, keep_silence * 2)
    if not pieces:
        return audio_segment
    return join(pieces)
def chunk_sentence_by_words(sentence, max_length=200):
    """
    Split a sentence into sub-chunks of at most max_length characters,
    breaking only between words whenever possible.

    A sentence that already fits is returned as a single chunk. A single word
    longer than max_length cannot fit in any chunk, so it is cut into slices
    of max_length characters (this is what happens to long text written
    without spaces).

    Args:
        sentence (str): The sentence to split
        max_length (int): Maximum number of characters per chunk

    Returns:
        list[str]: The chunks, in order
    """
    # Already short enough: return it untouched.
    if len(sentence) <= max_length:
        return [sentence]

    sub_chunks = []
    current_words = []
    current_length = 0  # length of " ".join(current_words)

    for word in sentence.split():
        if len(word) > max_length:
            # Flush what was accumulated, then slice the oversized word.
            if current_words:
                sub_chunks.append(" ".join(current_words))
                current_words = []
                current_length = 0
            if max_length > 0:
                sub_chunks.extend(
                    word[i:i + max_length] for i in range(0, len(word), max_length)
                )
            else:
                # No positive slice size available: keep the word whole.
                sub_chunks.append(word)
            continue

        # +1 for the joining space when the chunk already holds a word.
        needed = len(word) + (1 if current_words else 0)
        if current_length + needed > max_length:
            # The word does not fit: close the chunk and start a new one.
            # The new chunk's length is the word alone, without a space.
            sub_chunks.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length += needed

    # Add the last chunk if anything is left.
    if current_words:
        sub_chunks.append(" ".join(current_words))
    return sub_chunks
def split_text(text, max_length=150):
    """
    Split text into chunks suitable for synthesis.

    - The text is divided into sentences with sent_tokenize; if that call
      fails, the text is split after '.', '!' or '?' followed by whitespace,
      keeping each sentence's own punctuation.
    - A sentence longer than max_length is divided further with
      chunk_sentence_by_words.

    Args:
        text (str): The text to split
        max_length (int): Maximum number of characters per chunk

    Returns:
        list[str]: The chunks, in order; an empty list for empty or
        whitespace-only text
    """
    # Nothing to split.
    if not text or not text.strip():
        return []

    # Sentence splitting, with a regex fallback when the tokenizer fails.
    try:
        raw_sentences = sent_tokenize(text)
    except Exception:
        raw_sentences = [
            part for part in re.split(r'(?<=[.!?])\s+', text.strip()) if part
        ]
    if not raw_sentences:
        raw_sentences = [text]

    final_chunks = []
    for sentence in raw_sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_length:
            # Short sentence: keep it as is.
            final_chunks.append(sentence)
        else:
            # Long sentence: divide it into sub-chunks.
            final_chunks.extend(chunk_sentence_by_words(sentence, max_length))

    # Last resort: fixed-size slices of the raw text.
    if not final_chunks:
        for i in range(0, len(text), max_length):
            chunk = text[i:i + max_length]
            if chunk.strip():  # Skip empty segments
                final_chunks.append(chunk)
    return final_chunks
def check_language_dependencies(language):
    """
    Check that the Python modules needed for a given language can be imported.

    Only Chinese ("zh-cn"), Japanese ("ja") and Korean ("ko") have extra
    requirements; every other language code passes immediately.

    Args:
        language (str): Language code to check

    Returns:
        tuple: (None, None) when everything is available, or
        (None, error_message) when at least one module is missing. The message
        names the missing modules and gives the pip command to run.
    """
    import importlib

    # language code -> (display name, modules to import, pip packages to install)
    requirements = {
        "zh-cn": ("Chinese", ("pypinyin",), ("pypinyin",)),
        "ja": (
            "Japanese",
            ("cutlet", "fugashi", "unidic_lite"),
            ("cutlet", "fugashi", "mecab-python3", "unidic-lite"),
        ),
        "ko": ("Korean", ("hangul_romanize",), ("hangul-romanize",)),
    }

    if language not in requirements:
        return None, None

    language_name, modules, pip_packages = requirements[language]

    missing = []
    for module_name in modules:
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing.append(module_name)

    if not missing:
        return None, None

    # pip expects the package names separated by spaces.
    install_command = "pip install " + " ".join(pip_packages)
    plural = len(pip_packages) > 1
    error_message = (
        f"\nError: Missing {'dependencies' if plural else 'dependency'} "
        f"for {language_name} language ({', '.join(missing)}).\n"
        f"Please run the following command to install the required "
        f"{'packages' if plural else 'package'}:\n"
        f"{install_command}\n"
        f"Then restart the application.\n"
    )
    return None, error_message
def _describe_synthesis_error(exc):
    """Log a synthesis failure and return the message to show to the user.

    Known missing-dependency failures are mapped to installation hints;
    anything else is reported with the raw exception text.
    """
    error_str = str(exc)
    error_message = f"Error during audio generation: {error_str}"
    print(error_message)
    if "Chinese requires: pypinyin" in error_str:
        return "Error: Missing pypinyin package for Chinese language support.\n\nPlease run: pip install pypinyin"
    if "No module named 'cutlet'" in error_str:
        return "Error: Missing cutlet package for Japanese language support.\n\nPlease run: pip install cutlet"
    if "Japanese requires: fugashi" in error_str:
        return "Error: Missing fugashi package for Japanese language support.\n\nPlease run: pip install fugashi mecab-python3 unidic-lite"
    if "Japanese requires: unidic-lite" in error_str:
        return "Error: Missing unidic-lite package for Japanese language support.\n\nPlease run: pip install unidic-lite"
    if "Failed initializing MeCab" in error_str or "no such file or directory: /usr/local/etc/mecabrc" in error_str:
        return (
            "Error: MeCab initialization failed for Japanese language support.\n"
            "Please run: pip install fugashi mecab-python3 unidic-lite\n"
            "If the error persists, you may need to install MeCab dictionaries:\n"
            "- For Ubuntu/Debian: sudo apt-get install mecab mecab-ipadic\n"
            "- For macOS with Homebrew: brew install mecab mecab-ipadic\n"
        )
    if "Korean requires: hangul_romanize" in error_str:
        return "Error: Missing hangul-romanize package for Korean language support.\n\nPlease run: pip install hangul-romanize"
    return error_message


def synthesize_speech(
    text,
    language,
    temperature,
    speed,
    reference_audio,
    do_sample=True,
    repetition_penalty=1.0,
    length_penalty=1.0,
    gpt_cond_len=30,
    top_k=50,
    top_p=0.85,
    remove_silence_enabled=True,
    silence_threshold=-45,
    min_silence_len=300,
    keep_silence=100,
    text_splitting_method="Native XTTS splitting",
    max_chars_per_segment=250,
    enable_preprocessing=True
):
    """Generate speech from text by orchestrating preprocessing, synthesis, and post-processing.

    This is the main TTS pipeline: it validates the input, checks language
    dependencies, optionally preprocesses the text, generates a waveform with
    the XTTS model, then normalizes the volume, optionally removes silences and
    exports an MP3 file. Errors are reported through the return value rather
    than raised.

    Args:
        text (str): The text to convert to speech.
        language (str): Language code for synthesis (e.g., 'en', 'fr').
        temperature (float): Controls randomness in generation (0.1-1.5, recommended: 0.75).
            Values <= 0 are replaced by 0.75.
        speed (float): Speech speed multiplier (0.5-2.0, 1.0 = normal speed).
        reference_audio (str): Path to the reference audio used for voice cloning;
            it is passed unchanged to the model as ``speaker_wav``.
        do_sample (bool): Enable sampling for more natural speech variation.
        repetition_penalty (float): Penalty for repetitive speech (1.0-5.0, recommended: 5.0).
            Values <= 0 are replaced by 5.0.
        length_penalty (float): Penalty affecting speech length (1.0-2.0, recommended: 1.0).
            Values <= 0 are replaced by 1.0.
        gpt_cond_len (int): Conditioning length for GPT model (10-50, recommended: 30).
        top_k (int): Top-K sampling parameter (0-50, 0 = disabled).
        top_p (float): Top-P sampling parameter (0.0-1.0, 0 = disabled).
        remove_silence_enabled (bool): Remove silent parts from generated audio.
        silence_threshold (int): dB threshold for silence detection (-60 to -20).
        min_silence_len (int): Minimum silence length in ms to detect (300-1000).
        keep_silence (int): Amount of silence to keep in ms (100-500).
        text_splitting_method (str): "Native XTTS splitting", "Custom splitting"
            or "No splitting".
        max_chars_per_segment (int): Maximum characters per segment for custom splitting.
        enable_preprocessing (bool): Automatically preprocess text for better pronunciation.

    Returns:
        tuple: (audio_file_path, error_message, preprocessed_text)
            - audio_file_path (str): Path to the generated MP3 audio file, or None on error.
            - error_message (str): A description of the error if one occurred, otherwise None.
            - preprocessed_text (str): The text after preprocessing has been applied.
    """
    # Part 1: Validation and Parameter Setup
    if not text or not text.strip():
        return None, "Error: Text cannot be empty", text

    _, error_message = check_language_dependencies(language)
    if error_message:
        return None, error_message, text

    # 0 means "disabled" for both sampling filters.
    if top_k == 0:
        top_k = None
    if top_p == 0:
        top_p = None
    # Fall back to safe values for non-positive settings.
    if temperature <= 0:
        temperature = 0.75
    if repetition_penalty <= 0:
        repetition_penalty = 5.0
    if length_penalty <= 0:
        length_penalty = 1.0

    # Part 2: Text Preprocessing
    preprocessed_text = text
    if enable_preprocessing:
        preprocessed_text = preprocess_text(text, language)
        print(f"Preprocessed text: {preprocessed_text}")

    # Keyword arguments shared by every call to the model.
    synthesis_kwargs = dict(
        speaker_wav=reference_audio, language=language,
        temperature=temperature, do_sample=do_sample, speed=speed,
        repetition_penalty=repetition_penalty, length_penalty=length_penalty,
        gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p,
    )

    # Part 3: Waveform Generation (Core Synthesis)
    try:
        if text_splitting_method == "Custom splitting":
            text_chunks = split_text(preprocessed_text, max_length=max_chars_per_segment)
            print(f"Text split into {len(text_chunks)} segments (max {max_chars_per_segment} characters per segment)")
            if not text_chunks:
                return None, "Error: The text could not be split into segments", preprocessed_text

            outputs_wav_list = []
            for i, chunk in enumerate(text_chunks):
                print(f"Processing segment {i+1}/{len(text_chunks)}: {chunk}")
                chunk_output = model.synthesize(
                    chunk, config, enable_text_splitting=True, **synthesis_kwargs
                )
                outputs_wav_list.append(chunk_output["wav"])

            if not outputs_wav_list:
                return None, "Error: No audio segment could be generated", preprocessed_text
            outputs_wav = np.concatenate(outputs_wav_list)
        else:
            # Native XTTS splitting stays on unless the user explicitly disables it.
            use_native_splitting = text_splitting_method != "No splitting"
            if not use_native_splitting:
                print("Native XTTS splitting disabled by user request")
            elif len(preprocessed_text) > 150:
                print("Long text detected: native XTTS splitting is enabled")

            print(f"Generating with parameters: temperature={temperature}, do_sample={do_sample}, repetition_penalty={repetition_penalty}, length_penalty={length_penalty}, top_k={top_k}, top_p={top_p}, enable_text_splitting={use_native_splitting}")
            outputs = model.synthesize(
                preprocessed_text, config,
                enable_text_splitting=use_native_splitting, **synthesis_kwargs
            )
            outputs_wav = outputs["wav"]
    except Exception as e:
        return None, _describe_synthesis_error(e), preprocessed_text

    # Part 4: Audio Post-Processing
    temp_audio_path = None
    try:
        temp_audio_path = str(TEMP_DIR / f"temp_chunk_audio_{uuid.uuid4()}.wav")
        torchaudio.save(temp_audio_path, torch.tensor(outputs_wav).unsqueeze(0), 24000)
        audio_segment = AudioSegment.from_wav(temp_audio_path)

        # Gentle volume normalization towards -18 dBFS; very quiet audio
        # (below -50 dBFS) is boosted by at most 20 dB.
        target_dbfs = -18.0
        current_dbfs = audio_segment.dBFS
        delta_db = target_dbfs - current_dbfs
        if current_dbfs < -50:
            delta_db = min(delta_db, 20.0)
        combined_audio = audio_segment.apply_gain(delta_db)

        # Optional silence removal
        if remove_silence_enabled:
            padding = AudioSegment.silent(duration=500, frame_rate=combined_audio.frame_rate)
            padded_audio = padding + combined_audio + padding
            processed_audio = remove_silence(
                padded_audio,
                silence_thresh=silence_threshold,
                min_silence_len=min_silence_len,
                keep_silence=keep_silence
            )
            # If most of the added padding survived, trim it back off both ends.
            if len(processed_audio) > len(combined_audio) + 900:
                trim_length = min(500, len(processed_audio) // 10)
                combined_audio = processed_audio[trim_length:-trim_length]
            else:
                combined_audio = processed_audio

        timestamp = time.strftime("%Y%m%d-%H%M%S")
        final_output_path = str(TEMP_DIR / f"temp_output_{timestamp}_{uuid.uuid4()}.mp3")
        combined_audio.export(final_output_path, format="mp3", bitrate="192k")
        return final_output_path, None, preprocessed_text
    except Exception as e:
        error_message = f"Error during audio processing: {str(e)}"
        print(error_message)
        return None, error_message, preprocessed_text
    finally:
        # Always drop the intermediate WAV, including when post-processing failed.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
def download_audio_from_url(url):
    """Download an audio file from a URL into a temporary file.

    Args:
        url (str): An http:// or https:// URL pointing to the audio file.

    Returns:
        str | None: Path of the temporary file holding the downloaded bytes
            (the caller is responsible for deleting it), or None when the URL
            is invalid or the download fails.
    """
    # Reject anything that is not an http(s) URL before touching the network.
    if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        print(f"Failed to download audio from {url}: URL must start with http:// or https://")
        return None

    temp_path = None
    completed = False
    try:
        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True, timeout=20) as response:  # 20 seconds timeout
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
                temp_path = temp_audio.name
                for chunk in response.iter_content(chunk_size=8192):
                    temp_audio.write(chunk)
        print(f"Audio downloaded from {url} to {temp_path}")
        completed = True
        return temp_path
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Failed to download audio from {url}: {e}")
        return None
    finally:
        # delete=False means a partially written file would otherwise linger.
        if not completed and temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
def voice_clone_synthesis(
    text: str,
    reference_audio_url: str = None,
    example_audio_name: str = None,
    language: str = "English",
    temperature: float = 0.75,
    speed: float = 1.0,
    do_sample: bool = True,
    repetition_penalty: float = 5.0,
    length_penalty: float = 1.0,
    gpt_cond_len: int = 30,
    top_k: int = 50,
    top_p: float = 0.85,
    remove_silence_enabled: bool = True,
    silence_threshold: int = -45,
    min_silence_len: int = 300,
    keep_silence: int = 100,
    text_splitting_method: str = "Native XTTS splitting",
    max_chars_per_segment: int = 250,
    enable_preprocessing: bool = False
):
    """
    🎤 Generates speech by cloning a voice from a reference audio.

    This tool takes text and a reference audio (a public URL or the name of a
    bundled example file), and synthesizes the text in the voice from the
    reference audio. It supports 17 languages and offers advanced control over
    the generation process.

    Args:
        text (str): The text to be synthesized. Required.
        reference_audio_url (str, optional): A public http(s) URL pointing to a reference audio file (WAV or MP3).
            Provide this OR example_audio_name, but not both.
        example_audio_name (str, optional): The file name of a pre-defined example audio.
            Valid choices are the WAV/MP3 file names available in the application's
            reference-audio directory; an invalid name raises an error listing them.
            Provide this OR reference_audio_url, but not both.
        language (str): The language of the text. Defaults to "English".
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
        temperature (float): Controls the randomness of the output. Higher values make it more random.
            Range: 0.1-1.5. Default: 0.75. Recommended: 0.75 for balanced output.
        speed (float): The speed of the generated speech.
            Range: 0.5-2.0. Default: 1.0. Example: 0.8 = slower, 1.2 = faster.
        do_sample (bool): Whether to use sampling for generation. Recommended: True. Default: True.
        repetition_penalty (float): Penalty for repeating words or phrases. IMPORTANT: Must be >= 1.0.
            Range: 1.0-5.0. Default: 5.0. Higher values reduce repetition.
        length_penalty (float): Penalty for sentence length. IMPORTANT: Must be >= 1.0.
            Range: 1.0-2.0. Default: 1.0. Higher values encourage shorter sentences.
        gpt_cond_len (int): Conditioning length for the GPT model.
            Range: 10-50. Default: 30. Higher values use more context.
        top_k (int): Top-K sampling parameter. 0 to disable top-k.
            Range: 0-50. Default: 50. Lower values make output more focused.
        top_p (float): Top-P (nucleus) sampling parameter. 0.0 to disable top-p.
            Range: 0.0-1.0. Default: 0.85. Lower values make output more focused.
        remove_silence_enabled (bool): Enable/disable automatic silence removal. Default: True.
        silence_threshold (int): Silence threshold in dB for silence detection.
            Range: -60 to -20. Default: -45.
        min_silence_len (int): Minimum length of silence to be removed in milliseconds.
            Range: 300-1000. Default: 300.
        keep_silence (int): Amount of silence to keep at the beginning/end in milliseconds.
            Range: 100-500. Default: 100.
        text_splitting_method (str): Method for splitting text.
            Valid choices: 'Native XTTS splitting', 'Custom splitting', 'No splitting'.
            Default: 'Native XTTS splitting'. Recommended for most use cases.
        max_chars_per_segment (int): Max characters per segment when using 'Custom splitting'.
            Range: 50-400. Default: 250. Only relevant when text_splitting_method = 'Custom splitting'.
        enable_preprocessing (bool): Enable automatic text preprocessing to clean problematic characters.
            Default: False. Recommended: True for better pronunciation.

    Returns:
        str: The path of the generated MP3 audio file, as produced by synthesize_speech.

    Examples:
        Basic usage with example audio:
            voice_clone_synthesis(
                text="Hello world!",
                example_audio_name="audio_1.wav",
                language="English"
            )
        Advanced usage with custom parameters:
            voice_clone_synthesis(
                text="Bonjour le monde!",
                example_audio_name="audio_2.wav",
                language="French",
                temperature=0.8,
                speed=1.1,
                repetition_penalty=2.0,
                length_penalty=1.2,
                enable_preprocessing=True
            )

    Raises:
        gr.Error: If parameters are not numeric, out of range, or invalid combinations are used,
            if the reference audio cannot be obtained or validated, or if synthesis fails.
    """
    # Convert parameter types early so that bad input is reported as gr.Error
    # instead of a raw TypeError/ValueError.
    try:
        temperature = float(temperature)
        speed = float(speed)
        repetition_penalty = float(repetition_penalty)
        length_penalty = float(length_penalty)
        gpt_cond_len = int(gpt_cond_len)
        top_k = int(top_k)
        top_p = float(top_p)
        silence_threshold = int(silence_threshold)
        min_silence_len = int(min_silence_len)
        keep_silence = int(keep_silence)
        max_chars_per_segment = int(max_chars_per_segment)
    except (TypeError, ValueError) as exc:
        raise gr.Error(f"Numeric parameters must be valid numbers: {exc}") from exc

    # (label, value, lower bound, upper bound, unit suffix) — bounds are inclusive.
    range_checks = [
        ("Temperature", temperature, 0.1, 1.5, ""),
        ("Speed", speed, 0.5, 2.0, ""),
        ("Repetition penalty", repetition_penalty, 1.0, 5.0, ""),
        ("Length penalty", length_penalty, 1.0, 2.0, ""),
        ("GPT conditioning length", gpt_cond_len, 10, 50, ""),
        ("Top-K", top_k, 0, 50, ""),
        ("Top-P", top_p, 0.0, 1.0, ""),
        ("Silence threshold", silence_threshold, -60, -20, " dB"),
        ("Minimum silence length", min_silence_len, 300, 1000, " ms"),
        ("Keep silence", keep_silence, 100, 500, " ms"),
        ("Max characters per segment", max_chars_per_segment, 50, 400, ""),
    ]
    for label, value, low, high, unit in range_checks:
        if not (low <= value <= high):
            raise gr.Error(f"{label} must be between {low} and {high}{unit}, got {value}")

    valid_splitting_methods = ["Native XTTS splitting", "Custom splitting", "No splitting"]
    if text_splitting_method not in valid_splitting_methods:
        raise gr.Error(f"Text splitting method must be one of {valid_splitting_methods}, got '{text_splitting_method}'")

    # Ensure exactly one reference type is provided
    if reference_audio_url and example_audio_name:
        raise gr.Error("Please provide either 'reference_audio_url' or 'example_audio_name', but not both.")
    if not reference_audio_url and not example_audio_name:
        raise gr.Error("You must provide either 'reference_audio_url' or 'example_audio_name'.")

    downloaded_path = None  # Tracks a downloaded file so it can be cleaned up
    try:
        if example_audio_name:
            # The files actually present on disk are the only source of truth
            # for valid example names.
            if example_audio_name not in file_path_mapping:
                available_files = ", ".join(files_display)
                raise gr.Error(f"Invalid example audio name. Available files are: {available_files}")
            reference_audio_path = file_path_mapping[example_audio_name]
            print(f"Using example audio: {reference_audio_path}")
        else:
            print(f"Downloading reference audio from: {reference_audio_url}")
            downloaded_path = download_audio_from_url(reference_audio_url)
            if not downloaded_path:
                raise gr.Error("Failed to download or process the reference audio from the provided URL.")
            reference_audio_path = downloaded_path

        # Validate the selected audio file
        is_valid, error_message = validate_audio_file(reference_audio_path)
        if not is_valid:
            raise gr.Error(error_message)

        language_code = SUPPORTED_LANGUAGES.get(language)
        if not language_code:
            raise gr.Error(f"Language '{language}' is not supported.")

        audio_path, error, _ = synthesize_speech(
            text=text, language=language_code, temperature=temperature, speed=speed,
            reference_audio=reference_audio_path, do_sample=do_sample,
            repetition_penalty=repetition_penalty, length_penalty=length_penalty,
            gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p,
            remove_silence_enabled=remove_silence_enabled,
            silence_threshold=silence_threshold, min_silence_len=min_silence_len,
            keep_silence=keep_silence, text_splitting_method=text_splitting_method,
            max_chars_per_segment=max_chars_per_segment,
            enable_preprocessing=enable_preprocessing
        )
    finally:
        # Remove the downloaded reference on every exit path, including errors.
        if downloaded_path and os.path.exists(downloaded_path):
            try:
                os.remove(downloaded_path)
            except OSError:
                pass

    if error:
        raise gr.Error(error)
    return audio_path
def analyze_text_for_speech(text: str, language: str):
    """
    📊 Analyzes text for potential pronunciation and synthesis issues.

    Runs the standard text analysis for the given language and also returns
    the preprocessed version of the text, bundled in a single report.

    Args:
        text (str): The text to analyze. Required.
        language (str): The language of the text. Required.
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
            Note: Use exact language names (case-sensitive).

    Returns:
        dict: A dictionary containing the analysis results with these keys:
            - standard_analysis_issues: The 'issues' entry of the standard analysis (empty list if absent)
            - has_issues: The 'has_issues' entry of the standard analysis (False if absent)
            - xtts_cleaned_text: The text after preprocessing for the language

    Example:
        analyze_text_for_speech(
            text="Hello! This costs $15.99 & includes free shipping.",
            language="English"
        )

    Raises:
        gr.Error: If the language is not supported.
    """
    language_code = SUPPORTED_LANGUAGES.get(language)
    if language_code:
        # Only the standard analysis is used; the tokenizer-based analysis
        # is intentionally not part of this report.
        report = analyze_text(text, language_code)
        detected_issues = report.get('issues', [])
        flagged = report.get('has_issues', False)
        cleaned = preprocess_text(text, language_code)
        return {
            "standard_analysis_issues": detected_issues,
            "has_issues": flagged,
            "xtts_cleaned_text": cleaned,
        }
    raise gr.Error(f"Language '{language}' is not supported.")
def preprocess_text_for_speech(text: str, language: str):
    """
    🔧 Preprocesses and cleans text for optimal speech synthesis.

    Resolves the language name to its language code and delegates to
    preprocess_text with replacements enabled.

    Args:
        text (str): The text to preprocess. Required.
        language (str): The language of the text. Required.
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
            Note: Use exact language names (case-sensitive).

    Returns:
        str: The cleaned and preprocessed text ready for speech synthesis.

    Example:
        preprocess_text_for_speech(
            text="Visit https://example.com & pay $25.50!",
            language="English"
        )

    Raises:
        gr.Error: If the language is not supported.
    """
    language_code = SUPPORTED_LANGUAGES.get(language)
    if language_code:
        return preprocess_text(text, language_code, apply_replacements=True)
    raise gr.Error(f"Language '{language}' is not supported.")
# Sample sentence for each language, keyed by language code
# (the same codes that are passed to the synthesis functions, e.g. "fr", "zh-cn").
EXAMPLE_TEXTS = {
    "fr": "Bonjour, je suis une voix générée par intelligence artificielle. Comment puis-je vous aider aujourd'hui?",
    "en": "Hello, I am a voice generated by artificial intelligence. How may I assist you today?",
    "es": "Hola, soy una voz generada por inteligencia artificial. ¿Cómo puedo ayudarte hoy?",
    "de": "Hallo, ich bin eine von künstlicher Intelligenz generierte Stimme. Wie kann ich Ihnen heute helfen?",
    "it": "Ciao, sono una voce generata dall'intelligenza artificiale. Come posso aiutarti oggi?",
    "pt": "Olá, sou uma voz gerada por inteligência artificial. Como posso ajudá-lo hoje?",
    "ar": "مرحبا، أنا صوت تم إنشاؤه بواسطة الذكاء الاصطناعي. كيف يمكنني مساعدتك اليوم؟",
    "zh-cn": "你好,我是由人工智能生成的声音。今天我能为您提供什么帮助?",
    "ja": "こんにちは、私は人工知能によって生成された音声です。今日はどのようにお手伝いできますか?",
    "ko": "안녕하세요, 저는 인공지능으로 생성된 목소리입니다. 오늘 어떻게 도와드릴까요?",
    "ru": "Здравствуйте, я голос, сгенерированный искусственным интеллектом. Чем я могу вам помочь сегодня?",
    "nl": "Hallo, ik ben een stem gegenereerd door kunstmatige intelligentie. Hoe kan ik u vandaag helpen?",
    "cs": "Dobrý den, jsem hlas vytvořený umělou inteligencí. Jak vám mohu dnes pomoci?",
    "pl": "Dzień dobry, jestem głosem wygenerowanym przez sztuczną inteligencję. Jak mogę ci dziś pomóc?",
    "tr": "Merhaba, ben yapay zeka tarafından oluşturulan bir sesim. Bugün size nasıl yardımcı olabilirim?",
    "hu": "Üdvözlöm, én egy mesterséges intelligencia által generált hang vagyok. Hogyan segíthetek ma?",
    "hi": "नमस्ते, मैं कृत्रिम बुद्धिमत्ता द्वारा उत्पन्न एक आवाज हूं। मैं आज आपकी कैसे मदद कर सकता हूं?"
}
# Function to analyze text with the XTTS tokenizer
def analyze_with_tokenizer(text, language_code):
    """
    Analyzes text using the XTTS model's text cleaners to detect
    parts that may be problematic for pronunciation.

    The function never raises: any failure (including a missing or
    incompatible TTS installation) is reported as an 'analysis_error' issue.

    Args:
        text (str): The text to analyze
        language_code (str): Language code (fr, en, etc.)

    Returns:
        dict: A dictionary with the keys:
            - 'issues': list of issue dicts ('type', 'description', 'instances', 'suggestion')
            - 'has_issues': True when at least one issue was recorded
            - 'cleaned_text': the text after the XTTS cleaners (the original text on error)
    """
    import difflib

    issues = []
    original_text = text
    try:
        # Imported inside the try so an import failure is reported like any
        # other analysis error instead of crashing the caller.
        from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners

        # 1. Run the same preprocessing as the XTTS model uses internally
        print(f"Using XTTS cleaners for language: {language_code}")
        if callable(multilingual_cleaners):
            # Coqui TTS exposes multilingual_cleaners as a function taking
            # (text, lang); testing membership on it would raise TypeError.
            cleaned_text = multilingual_cleaners(text, language_code)
        elif language_code in multilingual_cleaners:
            # Fallback for a mapping of language code -> cleaner function.
            cleaned_text = multilingual_cleaners[language_code](text)
        else:
            print(f"No specific cleaner for language {language_code}, using original text for analysis.")
            cleaned_text = text

        # 2. Compare the original and cleaned text to detect changed words
        # NOTE(review): if the cleaners lowercase the text, capitalized words
        # will be reported here as modified — confirm this is desired.
        if original_text != cleaned_text:
            diff = difflib.Differ().compare(original_text.split(), cleaned_text.split())
            # Lines starting with '- ' are words present only in the original.
            modified_words = [
                d[2:] for d in diff
                if d.startswith('- ') and len(d[2:]) > 1  # Ignore individual characters
            ]
            if modified_words:
                issues.append({
                    'type': 'tokenizer_changes',
                    'description': 'Words that might be mispronounced',
                    'instances': modified_words,
                    'suggestion': 'Consider reformulating these parts or using automatic preprocessing'
                })

        # 3. Out-of-vocabulary detection is not implemented: it would require
        # access to the tokenizer's vocabulary.

        # 4. Check for very long words that might be mispronounced
        long_words = [w for w in text.split() if len(w) > 12]
        if long_words:
            issues.append({
                'type': 'long_words',
                'description': 'Extremely long words that might be mispronounced',
                'instances': long_words,
                'suggestion': 'Check if these words are pronounced correctly, try splitting them or reformulating'
            })

        # 5. Check for special characters that are preserved after cleaning
        special_chars = re.findall(r'[^a-zA-Z0-9\s.,;:!?\'"-]', cleaned_text)
        if special_chars:
            # Sorted so the report is deterministic between runs.
            unique_special_chars = sorted(set(special_chars))
            issues.append({
                'type': 'special_chars_preserved',
                'description': 'Special characters preserved by the tokenizer',
                'instances': unique_special_chars,
                'suggestion': 'These characters might cause pronunciation issues'
            })

        return {
            'issues': issues,
            'has_issues': len(issues) > 0,
            'cleaned_text': cleaned_text
        }
    except Exception as e:
        print(f"Error in tokenizer analysis: {e}")
        return {
            'issues': [{
                'type': 'analysis_error',
                'description': 'Error during analysis with the tokenizer',
                'instances': [str(e)],
                'suggestion': 'Technical error, please try again'
            }],
            'has_issues': True,
            'cleaned_text': text
        }
# Merge the standard analysis with the tokenizer-based analysis
def combined_analysis(text, language):
    """Run both text analyses and return a display report plus cleaned text.

    The standard analysis and the XTTS tokenizer analysis are both executed
    and handed to format_issues_for_display to build the report.

    Args:
        text: The text to analyze for speech synthesis compatibility
        language: Language name (English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Hungarian, Korean, Japanese, Hindi)

    Returns:
        A tuple (display_text, cleaned_text). cleaned_text is the tokenizer's
        cleaned text when it is non-empty and differs from the input;
        otherwise it is the output of preprocess_text, or "" for empty input.
    """
    language_code = SUPPORTED_LANGUAGES[language]

    rule_based = analyze_text(text, language_code)
    tokenizer_based = analyze_with_tokenizer(text, language_code)
    report = format_issues_for_display(rule_based, language_code, tokenizer_based)

    # Prefer the tokenizer's output when it actually changed something.
    tokenizer_text = tokenizer_based.get('cleaned_text', "")
    if tokenizer_text and tokenizer_text != text:
        return report, tokenizer_text
    if text:
        return report, preprocess_text(text, language_code)
    return report, ""
def _remove_stale_files(directory, pattern, max_age_seconds, now):
    """Delete files in *directory* matching *pattern* that are older than *max_age_seconds*; return how many were removed."""
    removed = 0
    try:
        candidates = list(Path(directory).glob(pattern))
    except OSError as exc:
        print(f"Cleanup: could not list {directory}: {exc}")
        return 0
    for candidate in candidates:
        # Handle each file on its own so one failure does not abort the sweep.
        try:
            if candidate.is_file() and now - os.path.getmtime(candidate) > max_age_seconds:
                os.remove(candidate)
                removed += 1
        except OSError as exc:
            print(f"Cleanup: could not remove {candidate}: {exc}")
    return removed


def cleanup_old_files(max_age_minutes=60, *, temp_dir=None, output_dir=None, output_max_age_days=7):
    """
    Delete stale temporary and output files.

    This function can be called regularly to prevent accumulation of files.

    Args:
        max_age_minutes: Files of any type in the temporary directory older
            than this many minutes are deleted.
        temp_dir: Temporary directory to clean; defaults to the module-level TEMP_DIR.
        output_dir: Output directory whose ``*.mp3`` files are cleaned;
            defaults to the module-level OUTPUT_DIR.
        output_max_age_days: MP3 files in the output directory older than
            this many days are deleted (default: one week).

    Returns:
        int: The number of files actually removed. Files that could not be
            removed are logged and skipped.
    """
    temp_dir = TEMP_DIR if temp_dir is None else temp_dir
    output_dir = OUTPUT_DIR if output_dir is None else output_dir
    now = time.time()

    # Temporary files: everything older than max_age_minutes
    count_removed = _remove_stale_files(temp_dir, "*", max_age_minutes * 60, now)
    # Output files: only MP3s, kept for output_max_age_days
    count_removed += _remove_stale_files(output_dir, "*.mp3", output_max_age_days * 24 * 60 * 60, now)
    return count_removed
| # Create interface with Gradio Blocks | |
# The whole UI is built inside this Blocks context and bound to `interface`.
# The inline CSS caps the container width at 1280px, centers it, and turns the
# element with id "header" into a centered flex row.
| with gr.Blocks(theme=gr.themes.Ocean(), css=""" | |
| .gradio-container { | |
| max-width: 1280px !important; | |
| margin: auto !important; | |
| } | |
| #header { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| padding: 10px 0; | |
| } | |
| """) as interface: | |
# Header row (elem_id="header", styled by the CSS above): application title and
# tagline written as inline HTML inside a Markdown component.
| with gr.Row(elem_id="header"): | |
| gr.Markdown( | |
| """ | |
| <div style="text-align: center;"> | |
| <h1 style="margin: 0; font-size: 1.8rem;">🎙️ Voice Cloning Studio</h1> | |
| <p style="margin: 0; font-size: 1rem;">Bring any voice to life from a 3-second audio sample.</p> | |
| </div> | |
| """ | |
| ) | |
# Get all reference audio files and simplify their display.
# The list is sorted because Path.iterdir() yields entries in arbitrary order;
# without sorting, the dropdown order and its default (the first entry) could
# change from one start-up to the next.
try:
    files_paths = sorted(
        str(f)
        for f in REF_AUDIO_DIR.iterdir()
        if f.is_file() and f.suffix.lower() in ('.wav', '.mp3')
    )
except Exception as e:
    # Best-effort: the app still starts (with no bundled examples) when the
    # reference directory cannot be read, but the reason is now reported.
    print(f"Could not list reference audio files: {e}")
    files_paths = []
# Basenames are what the dropdown shows; the mapping resolves them back to paths.
files_display = [os.path.basename(f) for f in files_paths]
file_path_mapping = dict(zip(files_display, files_paths))
# Main row of the page; it holds the input/settings column built below.
| with gr.Row(equal_height=False): | |
| # LEFT COLUMN: Inputs & Settings | |
| with gr.Column(scale=2): | |
| with gr.Tabs(): | |
# Tab 1: reference voice, either a bundled example or an uploaded clip.
| with gr.TabItem("1. Voice"): | |
| gr.Markdown("### Select a Reference Voice") | |
| gr.Markdown("Choose a pre-defined example or upload your own 3-10 second audio clip. For best results, use a clear, high-quality recording with no background noise.") | |
# Choices are the basenames in files_display; the first one is preselected
# when any exist.
| example_audio_dropdown = gr.Dropdown( | |
| choices=files_display, | |
| label="Reference Audio (from examples)", | |
| value=files_display[0] if files_display else None, | |
| interactive=True | |
| ) | |
# type="filepath": callbacks receive the upload as a path on disk.
| reference_audio_input = gr.Audio( | |
| label="Reference Audio (upload your own)", | |
| type="filepath" | |
| ) | |
# Tab 2: text to synthesize and its language.
| with gr.TabItem("2. Text & Language"): | |
| gr.Markdown("### Enter Text and Select Language") | |
# Choices are the language names (keys of SUPPORTED_LANGUAGES).
| lang_dropdown = gr.Dropdown( | |
| choices=list(SUPPORTED_LANGUAGES.keys()), | |
| value="English", | |
| label="Language" | |
| ) | |
| text_input = gr.Textbox( | |
| label="Text to Synthesize", | |
| placeholder="Enter text here...", | |
| lines=5, | |
| value="Hello, I am a voice generated by artificial intelligence. How may I assist you today?" | |
| ) | |
# One button per listed language code that has an entry in EXAMPLE_TEXTS;
# codes without an example text get no button.
| with gr.Row(): | |
| example_buttons = [] | |
| example_langs_to_show = ["en", "fr", "es", "de", "zh-cn"] | |
| for lang in example_langs_to_show: | |
| if lang in EXAMPLE_TEXTS: | |
| example_buttons.append(gr.Button(f"Example ({lang.upper()})")) | |
# Analysis report and preprocessed-text preview.
| with gr.Accordion("Text Analysis & Preprocessing", open=True): | |
| with gr.Row(): | |
| analyze_button = gr.Button("Analyze Text") | |
# NOTE(review): preprocessing defaults to off (value=False), while the Quick
# Start guide text in this file advises keeping it enabled; confirm the
# intended default.
| enable_preprocessing = gr.Checkbox( | |
| value=False, | |
| label="Preprocess text automatically" | |
| ) | |
| text_analysis_output = gr.Textbox( | |
| label="Text Analysis", | |
| value="Click 'Analyze Text' to see results here.", | |
| lines=6 | |
| ) | |
| preprocessed_text_output = gr.Textbox( | |
| label="Preprocessed Text", | |
| value="The processed text will appear here after analysis or generation.", | |
| lines=3, | |
| visible=True | |
| ) | |
# Tab 3: generation, text-splitting and silence-removal parameters.
| with gr.TabItem("3. Settings"): | |
| gr.Markdown("### Fine-Tune Your Audio") | |
| gr.Markdown("Adjust these settings to control the style and quality of the generated speech.") | |
| with gr.Accordion("Generation Settings", open=True): | |
| with gr.Row(): | |
| with gr.Column(): | |
| temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, step=0.05, value=0.75, label="Temperature") | |
| speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.05, value=1.0, label="Speed") | |
| do_sample = gr.Checkbox(value=True, label="Enable Sampling (do_sample)") | |
| with gr.Column(): | |
| repetition_penalty = gr.Slider(minimum=1.0, maximum=5.0, step=0.1, value=5.0, label="Repetition Penalty") | |
| length_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty") | |
| gpt_cond_len = gr.Slider(minimum=10, maximum=50, step=1, value=30, label="GPT Conditioning Length") | |
| top_k = gr.Slider(minimum=0, maximum=50, step=1, value=50, label="Top-K") | |
| top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.85, label="Top-P") | |
| with gr.Accordion("Text Splitting", open=False): | |
| text_splitting_method = gr.Radio( | |
| choices=["Native XTTS splitting", "Custom splitting", "No splitting"], | |
| value="Native XTTS splitting", | |
| label="Text Splitting Method" | |
| ) | |
# The next two controls start hidden (visible=False); the radio's change
# handler later in this file updates the checkbox value and the slider
# visibility.
| enable_text_splitting = gr.Checkbox( | |
| value=True, | |
| label="enable_text_splitting (XTTS parameter)", | |
| visible=False | |
| ) | |
| max_chars_per_segment = gr.Slider( | |
| minimum=50, | |
| maximum=400, | |
| step=10, | |
| value=250, | |
| label="Max characters per segment", | |
| visible=False | |
| ) | |
# Silence-removal parameters: threshold in dB, lengths in milliseconds (per
# the labels).
| with gr.Accordion("Silence Removal", open=False): | |
| remove_silence_enabled = gr.Checkbox(value=True, label="Remove silences from audio") | |
| silence_threshold = gr.Slider(minimum=-60, maximum=-20, step=5, value=-45, label="Silence threshold (dB)") | |
| min_silence_len = gr.Slider(minimum=300, maximum=1000, step=50, value=300, label="Minimum silence length (ms)") | |
| keep_silence = gr.Slider(minimum=100, maximum=500, step=10, value=100, label="Silence to keep (ms)") | |
| # RIGHT COLUMN: Output | |
# Narrower column (scale=1 versus scale=2 on the left) holding the generate
# button, the audio player for the result, and a status/tips text box.
| with gr.Column(scale=1): | |
| gr.Markdown("### 4. Generate & Listen") | |
| gr.Markdown("Click the button to generate your audio. Results will appear below.") | |
| generate_button = gr.Button("Generate Audio", variant="primary", scale=1) | |
| output_audio = gr.Audio(label="Generated Audio") | |
# Receives either the error message or the success tips from the generate
# callback.
| output_message = gr.Textbox(label="Status & Tips", visible=True, lines=8) | |
# Collapsed-by-default accordion with three static documentation tabs:
# quick start guide, legal disclaimer, and a description of the MCP/API tools.
| with gr.Accordion("User Guide, Disclaimer & API Info", open=False): | |
| with gr.Tabs(): | |
# Static usage instructions (plain Markdown, no interpolation).
| with gr.TabItem("🎯 Quick Start Guide"): | |
| gr.Markdown(""" | |
| ## 🎯 Quick User Guide | |
| 1. **Choose a reference voice**: In the **Voice** tab, select an example from the dropdown or upload your own clear audio file (3-10 seconds). | |
| 2. **Enter your text**: In the **Text & Language** tab, type or paste the text you want to synthesize and select the correct language. | |
| 3. **Generate**: Click the "Generate Audio" button. | |
| 4. **Iterate**: If you're not happy with the result, try regenerating. Small changes to the settings in the **Settings** tab can produce different results. | |
| ### 🔍 Essential Tips | |
| - **Reference Audio Quality**: The quality of the generated audio heavily depends on the reference. Use clean recordings with no background noise. | |
| - **Text Preprocessing**: Keep "Preprocess text automatically" enabled. It improves pronunciation of numbers, symbols, and URLs. Use the "Analyze Text" button to see potential issues. | |
| - **Optimizing Results**: For long texts, "Native XTTS splitting" is recommended. To change the speech style, try regenerating, adjusting the `Temperature`, or changing the `Speed`. | |
| - **Languages**: Ensure the selected language matches the text. | |
| """) | |
# Static legal notice and model/license links.
| with gr.TabItem("⚠️ Disclaimer"): | |
| gr.Markdown(""" | |
| ## ⚠️ Disclaimer and Legal Notice | |
| **By using this voice cloning application, you acknowledge and agree to the following:** | |
| 1. This application is provided "as is" without any warranties of any kind, either express or implied. | |
| 2. The creator(s) of this application accept no responsibility or liability for any misuse of the technology. | |
| 3. You are solely responsible for obtaining proper consent when cloning someone else's voice. | |
| 4. You agree not to use this technology for deceptive, harmful, or illegal purposes. | |
| 5. Voice cloning results may vary in quality and accuracy; no specific results are guaranteed. | |
| 6. You understand that voice cloning technology has ethical implications and agree to use it responsibly. | |
| The technology is intended for legitimate creative, educational, and accessibility purposes only. | |
| --- | |
| ### License & Model Information | |
| By accessing or using any feature within this space, you acknowledge and accept the terms of the following license: [https://coqui.ai/cpml](https://coqui.ai/cpml). | |
| **Model source:** [coqui/XTTS-v2](https://huggingface.co/coqui/XTTS-v2) | |
| """) | |
# This Markdown is an f-string: the list of bundled example files
# (files_display) is interpolated into the tool description when the UI is
# built. The MCP endpoint URL is hard-coded in the text.
| with gr.TabItem("🔧 API Tools"): | |
| gr.Markdown(f""" | |
| ## 🛠️ Model Context Protocol (MCP) Tools | |
| This application exposes MCP tools that you can use with LLMs. | |
| **MCP Endpoint:** `https://hasanbasbunar-voice-cloning-xtts-v2.hf.space/gradio_api/mcp/sse` | |
| --- | |
| ### 🎤 `voice_clone_synthesis` | |
| Generates an audio file by cloning a voice from a reference audio file (provided via URL or a local example). | |
| **Parameters:** | |
| - `text` (string, required): The text to synthesize. | |
| - `reference_audio_url` (string, optional): A public URL for a reference audio file (WAV, MP3). **Provide this OR `example_audio_name`.** | |
| - `example_audio_name` (string, optional): The name of a predefined example audio file. **Provide this OR `reference_audio_url`.** Available files are: {', '.join(files_display)}. | |
| - `language` (string, optional): The language of the text. Default: "English". | |
| - ... (and other advanced parameters, see the function's docstring for a full list). | |
| **Returns:** | |
| - `string`: A URL to the generated MP3 audio file. | |
| --- | |
| ### 📊 `analyze_text_for_speech` | |
| Analyzes text for potential pronunciation issues. | |
| **Parameters:** | |
| - `text` (string, required): The text to analyze. | |
| - `language` (string, required): The language of the text. | |
| **Returns:** | |
| - `object`: A JSON object with the detected issues. | |
| --- | |
| ### 🔧 `preprocess_text_for_speech` | |
| Cleans and preprocesses text for optimal speech synthesis. | |
| **Parameters:** | |
| - `text` (string, required): The text to preprocess. | |
| - `language` (string, required): The language of the text. | |
| **Returns:** | |
| - `string`: The cleaned text. | |
| """) | |
# Wire each example button so it fills in its sample text and matching language.
def _make_example_loader(example_text, language_name):
    """Return a zero-argument callback yielding (example_text, language_name)."""
    def load_example():
        return example_text, language_name
    return load_example


# Buttons exist only for the language codes that have an entry in
# EXAMPLE_TEXTS, so they are paired with that same filtered list. Indexing
# example_buttons by the position in example_langs_to_show would pick the wrong
# button (or raise IndexError) as soon as one listed language has no example.
_example_langs_with_buttons = [
    code for code in example_langs_to_show if code in EXAMPLE_TEXTS
]
for example_button, lang_code in zip(example_buttons, _example_langs_with_buttons):
    # Reverse lookup: language code -> display name used by the dropdown.
    lang_name = next((k for k, v in SUPPORTED_LANGUAGES.items() if v == lang_code), None)
    if lang_name:
        # The values are captured by the closure, so no hidden Textbox
        # components are needed to carry them as event inputs.
        example_button.click(
            _make_example_loader(EXAMPLE_TEXTS[lang_code], lang_name),
            inputs=None,
            outputs=[text_input, lang_dropdown],
            api_name=False,
        )
| # Function to analyze text and display results | |
def analyze_input_text(text, language):
    """Return the standard analysis report and the preprocessed text.

    Used as the callback for text and language changes. Only the standard
    analysis (``analyze_text``) is run here, not the tokenizer analysis.

    Args:
        text: Text currently in the input box (may be empty).
        language: Language name; must be a key of ``SUPPORTED_LANGUAGES``.

    Returns:
        tuple: (report text for display, preprocessed text or "" for empty input)
    """
    code = SUPPORTED_LANGUAGES[language]
    report = format_issues_for_display(analyze_text(text, code), code)
    # Empty input is not sent through the preprocessor.
    cleaned = "" if not text else preprocess_text(text, code)
    return report, cleaned
# Connect event handlers for text analysis.
# Editing the text or switching the language re-runs the standard analysis;
# both components use the same callback, inputs and outputs.
for _analysis_trigger in (text_input, lang_dropdown):
    _analysis_trigger.change(
        fn=analyze_input_text,
        inputs=[text_input, lang_dropdown],
        outputs=[text_analysis_output, preprocessed_text_output],
        api_name=False,
    )

# The button runs the combined (standard + tokenizer) analysis instead.
analyze_button.click(
    fn=combined_analysis,
    inputs=[text_input, lang_dropdown],
    outputs=[text_analysis_output, preprocessed_text_output],
    api_name=False,
)
| # Function to validate audio files | |
def validate_audio_file(file_path, max_size_mb=20, min_duration_sec=1, max_duration_sec=60):
    """Check that a reference audio file is usable before synthesis.

    The checks run from cheapest to most expensive: existence, extension,
    size on disk, then decoding the file to measure duration and loudness.

    Args:
        file_path (str): Path to the audio file. ``None`` or an empty string
            is treated as a missing file instead of raising.
        max_size_mb (int): Maximum file size in MB
        min_duration_sec (float): Minimum duration in seconds
        max_duration_sec (float): Maximum duration in seconds

    Returns:
        tuple: (is_valid, message). ``message`` is an "Error: ..." string when
        ``is_valid`` is False, a "Warning: ..." string when the file is valid
        but very quiet (below -50 dBFS), and ``None`` otherwise.
    """
    # A missing upload arrives as None/""; a directory is not an audio file
    # either, so require a regular file rather than mere existence.
    if not file_path or not os.path.isfile(file_path):
        return False, "Error: File does not exist"

    # Check file extension
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext not in ('.mp3', '.wav'):
        return False, f"Error: Invalid file format {file_ext}. Only MP3 and WAV files are supported."

    # Check file size before decoding, so oversized files are never loaded.
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if file_size_mb > max_size_mb:
        return False, f"Error: File size ({file_size_mb:.1f} MB) exceeds the maximum allowed size ({max_size_mb} MB)"

    try:
        # Decoding doubles as an integrity check: a corrupt file fails here.
        if file_ext == '.mp3':
            audio = AudioSegment.from_mp3(file_path)
        else:
            audio = AudioSegment.from_wav(file_path)

        # len() of the decoded segment is in milliseconds.
        duration_sec = len(audio) / 1000
        if duration_sec < min_duration_sec:
            return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too short (min: {min_duration_sec} sec)"
        if duration_sec > max_duration_sec:
            return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too long (max: {max_duration_sec} sec)"

        # Very quiet audio is still accepted, but flagged to the caller.
        if audio.dBFS < -50:
            return True, "Warning: Audio is very quiet, which may result in poor voice cloning quality"
        return True, None
    except Exception as e:
        # Any decoding failure is reported as a validation error, not raised.
        return False, f"Error: Failed to process audio file - {str(e)}"
def handle_synthesis_request(
    text, language, temperature, speed, reference_audio, example_audio_name,
    do_sample, enable_text_splitting, repetition_penalty, length_penalty,
    gpt_cond_len, top_k, top_p, remove_silence_enabled, silence_threshold,
    min_silence_len, keep_silence, text_splitting_method, max_chars_per_segment,
    enable_preprocessing
):
    """Gradio callback for the "Generate Audio" button.

    Steps:
    1. Pick the reference audio: an uploaded file wins over the selected example.
    2. Validate it with ``validate_audio_file``; an invalid file aborts early.
    3. Call ``synthesize_speech`` with the language code and all settings.
    4. Return the audio path plus a status message; a validation warning
       (e.g. very quiet reference audio) is appended to the success message
       instead of being dropped.

    Args:
        text: Text to synthesize.
        language: Language name; must be a key of ``SUPPORTED_LANGUAGES``.
        reference_audio: Path of the uploaded reference audio, if any.
        example_audio_name: Name of the selected example, resolved through
            ``file_path_mapping`` when nothing was uploaded.
        enable_text_splitting: Accepted so the signature matches the inputs
            list of the click event; it is not forwarded. The splitting
            behaviour is passed on through ``text_splitting_method``.
        temperature, speed, do_sample, repetition_penalty, length_penalty,
        gpt_cond_len, top_k, top_p, remove_silence_enabled, silence_threshold,
        min_silence_len, keep_silence, text_splitting_method,
        max_chars_per_segment, enable_preprocessing: Forwarded unchanged to
            ``synthesize_speech`` (the two penalties are converted to float).

    Returns:
        tuple: (audio path or None, status/error message, preprocessed text)

    Raises:
        KeyError: If ``language`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    language_code = SUPPORTED_LANGUAGES[language]

    # Ensure penalties are float
    repetition_penalty = float(repetition_penalty)
    length_penalty = float(length_penalty)

    # Select reference audio: the upload takes precedence over the example.
    final_reference_audio = reference_audio
    if not final_reference_audio and example_audio_name:
        final_reference_audio = file_path_mapping.get(example_audio_name)

    # Validate reference audio, keeping any non-fatal warning for the user.
    audio_warning = None
    if final_reference_audio:
        is_valid, validation_message = validate_audio_file(final_reference_audio)
        if not is_valid:
            return None, validation_message, ""
        audio_warning = validation_message

    # Call the main synthesis function
    audio_path, error_message, preprocessed_text = synthesize_speech(
        text=text,
        language=language_code,
        temperature=temperature,
        speed=speed,
        reference_audio=final_reference_audio,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        length_penalty=length_penalty,
        gpt_cond_len=gpt_cond_len,
        top_k=top_k,
        top_p=top_p,
        remove_silence_enabled=remove_silence_enabled,
        silence_threshold=silence_threshold,
        min_silence_len=min_silence_len,
        keep_silence=keep_silence,
        text_splitting_method=text_splitting_method,
        max_chars_per_segment=max_chars_per_segment,
        enable_preprocessing=enable_preprocessing
    )
    if error_message:
        return None, error_message, preprocessed_text

    # Control names below match the labels actually used in the interface.
    message_lines = [
        "✅ Audio generation successful!",
        "💾 Use the download button to save the audio.",
        "🔄 If you're not satisfied with the result (e.g., pronunciation, intonation, or pace), feel free to click \"Generate Audio\" again.",
        f"ℹ️ The generation process includes randomness controlled by the temperature parameter ({temperature:.2f}), so each output is unique.",
        '🎤 For different results, try another voice from the "Reference Audio (from examples)" dropdown or upload your own.',
        '⚙️ If the result is still not satisfactory after several attempts, consider adjusting parameters in the "3. Settings" tab.',
    ]
    if audio_warning:
        message_lines.append(f"⚠️ {audio_warning}")
    # Leading and trailing newline kept, as in the previous message layout.
    success_message = "\n" + "\n".join(message_lines) + "\n"
    return audio_path, success_message, preprocessed_text
# The order of this list must match the positional parameters of
# handle_synthesis_request, since the component values are passed positionally.
_generation_inputs = [
    text_input,
    lang_dropdown,
    temperature_slider,
    speed_slider,
    reference_audio_input,
    example_audio_dropdown,
    do_sample,
    enable_text_splitting,
    repetition_penalty,
    length_penalty,
    gpt_cond_len,
    top_k,
    top_p,
    remove_silence_enabled,
    silence_threshold,
    min_silence_len,
    keep_silence,
    text_splitting_method,
    max_chars_per_segment,
    enable_preprocessing,
]
# The three outputs receive (audio path, status message, preprocessed text).
_generation_outputs = [output_audio, output_message, preprocessed_text_output]

generate_button.click(
    fn=handle_synthesis_request,
    inputs=_generation_inputs,
    outputs=_generation_outputs,
    api_name=False,
)
| # Function to update visibility and value of fields based on the splitting method | |
def update_text_splitting_options(method):
    """Translate the chosen splitting method into two component updates.

    Args:
        method: Selected value of the "Text Splitting Method" radio.

    Returns:
        tuple: (update for the enable_text_splitting checkbox, update for the
        max_chars_per_segment slider). The checkbox value is True only for
        "Native XTTS splitting"; the slider is visible only for
        "Custom splitting".
    """
    return (
        gr.update(value=(method == "Native XTTS splitting")),
        gr.update(visible=(method == "Custom splitting")),
    )
# Keep the hidden checkbox and the max-chars slider in sync with the radio:
# every change of the splitting method recomputes both updates.
text_splitting_method.change(
    fn=update_text_splitting_options,
    inputs=[text_splitting_method],
    outputs=[enable_text_splitting, max_chars_per_segment],
    api_name=False,
)
| # Section for API endpoints (hidden from UI) | |
# These components only exist to serve as inputs/outputs for the named API
# endpoints registered further down; the tab itself is created with
# visible=False.
| with gr.Tab("API Endpoints", visible=False): | |
| # API: voice_clone_synthesis | |
# Ranges and defaults repeat those of the visible controls in the Settings tab
# (without the step arguments).
| with gr.Row(): | |
| api_synth_text = gr.Textbox(label="Text") | |
| api_synth_ref_url = gr.Textbox(label="Reference Audio URL") | |
| api_synth_example_name = gr.Dropdown(files_display, label="Example Audio Name") | |
| api_synth_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
| api_synth_temp = gr.Slider(minimum=0.1, maximum=1.5, value=0.75, label="Temperature") | |
| api_synth_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed") | |
| api_synth_do_sample = gr.Checkbox(value=True, label="Do Sample") | |
| api_synth_rep_penalty = gr.Slider(minimum=1.0, maximum=5.0, value=5.0, label="Repetition Penalty") | |
| api_synth_len_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.0, label="Length Penalty") | |
| api_synth_gpt_cond_len = gr.Slider(minimum=10, maximum=50, value=30, label="GPT Cond Length") | |
| api_synth_top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K") | |
| api_synth_top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.85, label="Top P") | |
| api_synth_remove_silence = gr.Checkbox(value=True, label="Remove Silence") | |
| api_synth_silence_thresh = gr.Slider(minimum=-60, maximum=-20, value=-45, label="Silence Threshold") | |
| api_synth_min_silence_len = gr.Slider(minimum=300, maximum=1000, value=300, label="Min Silence Length") | |
| api_synth_keep_silence = gr.Slider(minimum=100, maximum=500, value=100, label="Keep Silence") | |
| api_synth_split_method = gr.Radio(choices=["Native XTTS splitting", "Custom splitting", "No splitting"], value="Native XTTS splitting", label="Splitting Method") | |
| api_synth_max_chars = gr.Slider(minimum=50, maximum=400, value=250, label="Max Chars") | |
| api_synth_preprocess = gr.Checkbox(value=False, label="Enable Preprocessing") | |
# Output component and the button whose click event carries the endpoint.
| api_synth_output_audio = gr.Audio(label="Generated Audio") | |
| api_synth_trigger = gr.Button("Synthesize_API") | |
| # API: analyze_text_for_speech | |
# The analysis result is returned through a JSON component.
| with gr.Row(): | |
| api_analyze_text = gr.Textbox(label="Text") | |
| api_analyze_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
| api_analyze_output = gr.JSON(label="Analysis Result") | |
| api_analyze_trigger = gr.Button("Analyze_API") | |
| # API: preprocess_text_for_speech | |
# The cleaned text is returned through a plain Textbox.
| with gr.Row(): | |
| api_preprocess_text = gr.Textbox(label="Text") | |
| api_preprocess_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
| api_preprocess_output = gr.Textbox(label="Preprocessed Text") | |
| api_preprocess_trigger = gr.Button("Preprocess_API") | |
# Hook API names to the triggers.
# Each row is (trigger button, function, inputs, outputs, api_name); the click
# events are registered in the listed order. The inputs are passed to the
# function positionally, so their order must match its parameters.
_api_routes = (
    (
        api_synth_trigger,
        voice_clone_synthesis,
        [
            api_synth_text,
            api_synth_ref_url,
            api_synth_example_name,
            api_synth_lang,
            api_synth_temp,
            api_synth_speed,
            api_synth_do_sample,
            api_synth_rep_penalty,
            api_synth_len_penalty,
            api_synth_gpt_cond_len,
            api_synth_top_k,
            api_synth_top_p,
            api_synth_remove_silence,
            api_synth_silence_thresh,
            api_synth_min_silence_len,
            api_synth_keep_silence,
            api_synth_split_method,
            api_synth_max_chars,
            api_synth_preprocess,
        ],
        [api_synth_output_audio],
        "voice_clone_synthesis",
    ),
    (
        api_analyze_trigger,
        analyze_text_for_speech,
        [api_analyze_text, api_analyze_lang],
        [api_analyze_output],
        "analyze_text_for_speech",
    ),
    (
        api_preprocess_trigger,
        preprocess_text_for_speech,
        [api_preprocess_text, api_preprocess_lang],
        [api_preprocess_output],
        "preprocess_text_for_speech",
    ),
)
for _api_trigger, _api_fn, _api_inputs, _api_outputs, _api_name in _api_routes:
    _api_trigger.click(
        fn=_api_fn,
        inputs=_api_inputs,
        outputs=_api_outputs,
        api_name=_api_name,
    )
if __name__ == "__main__":
    def periodic_cleanup():
        """Background loop: wait one hour, sweep old files, repeat forever.

        The wait comes first, so the first sweep happens one hour after
        start-up. Any exception is printed and the loop keeps running.
        """
        one_hour = 60 * 60
        while True:
            try:
                time.sleep(one_hour)
                # The removed-file count is not used by this loop.
                files_removed = cleanup_old_files(max_age_minutes=60)
            except Exception as e:
                print(f"Error in background cleanup task: {e}")

    # Daemon thread, so it never keeps the process alive on shutdown.
    cleanup_thread = threading.Thread(target=periodic_cleanup, daemon=True)
    cleanup_thread.start()

    # The MCP server is switched on through the GRADIO_MCP_SERVER environment
    # variable set at the top of this file, not through a launch() argument.
    interface.queue()
    # allowed_paths lets Gradio serve files from the reference audio directory.
    interface.launch(share=False, allowed_paths=[str(REF_AUDIO_DIR)])