# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List

try:
    import ipadic
    import MeCab

    HAVE_MECAB = True
    HAVE_IPADIC = True
except (ImportError, ModuleNotFoundError):
    HAVE_MECAB = False
    HAVE_IPADIC = False
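
# NOTE: MeCab and ipadic are optional dependencies; the flags above let
# JaMecabProcessor fail with a clear ImportError when they are missing.
# As an assumption (this file does not pin its dependencies), they are
# typically installed from PyPI, e.g.:
#   pip install mecab-python3 ipadic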


class EnJaProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English.

    Args:
        lang_id: One of ['en', 'ja'].
    """

    def __init__(self, lang_id: str):
        from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

        self.lang_id = lang_id
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(
            lang=lang_id, pre_replace_unicode_punct=True, post_remove_control_chars=True
        )

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens.

        Args:
            tokens: list of strings as tokens

        Returns:
            detokenized Japanese or English string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str) -> str:
        """
        Tokenizes text using Moses. Returns a string of space-separated tokens.
        """
        tokens = self.moses_tokenizer.tokenize(text)
        return ' '.join(tokens)

    def normalize(self, text: str) -> str:
        # Moses punctuation normalization mishandles the Japanese full stop:
        # '。' becomes '.', so normalization is applied to English only.
        if self.lang_id == 'en':
            return self.normalizer.normalize(text)
        else:
            return text
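
# A minimal usage sketch for EnJaProcessor, assuming `sacremoses` is installed
# (illustrative only, not part of the class API):
#
#     processor = EnJaProcessor(lang_id='en')
#     normalized = processor.normalize('Hello, world!')
#     tokenized = processor.tokenize(normalized)          # 'Hello , world !'
#     restored = processor.detokenize(tokenized.split())  # 'Hello, world!'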


class JaMecabProcessor:
    """
    MeCab-based Tokenizer, Detokenizer and Normalizer utilities for Japanese.
    """

    def __init__(self):
        if not HAVE_MECAB or not HAVE_IPADIC:
            raise ImportError("Please ensure that you have installed `MeCab` and `ipadic` to use JaMecabProcessor")
        self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")

    def detokenize(self, text: List[str]) -> str:
        from pangu import spacing

        # Collapse the whitespace between adjacent fullwidth/CJK characters
        # (CJK ideographs, kana, fullwidth forms, and curly quotes), then let
        # pangu re-insert spacing between CJK and Latin text.
        RE_WS_IN_FW = re.compile(
            r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
        )
        return spacing(RE_WS_IN_FW.sub(r'\1', ' '.join(text))).strip()

    def tokenize(self, text: str) -> str:
        """
        Tokenizes text using MeCab. Returns a string of space-separated tokens
        (wakati-gaki output).
        """
        return self.mecab_tokenizer.parse(text).strip()

    def normalize(self, text: str) -> str:
        # Normalization is a no-op for Japanese; see EnJaProcessor.normalize.
        return text
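

# A minimal, self-contained demo of JaMecabProcessor. This guard block is an
# illustration added for clarity, not part of the NeMo API; it requires the
# optional `pangu` package, and the expected outputs in the comments are
# assumptions about typical MeCab/ipadic behavior.
if __name__ == "__main__":
    if HAVE_MECAB and HAVE_IPADIC:
        processor = JaMecabProcessor()
        tokenized = processor.tokenize("私は学生です。")  # e.g. '私 は 学生 です 。'
        print(tokenized)
        # Round-trip: detokenize removes the spaces inserted between CJK tokens.
        print(processor.detokenize(tokenized.split()))  # '私は学生です。'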