# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List

try:
    import ipadic
    import MeCab

    HAVE_MECAB = True
    HAVE_IPADIC = True
except (ImportError, ModuleNotFoundError):
    HAVE_MECAB = False
    HAVE_IPADIC = False


class EnJaProcessor:
    """
    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English.

    Args:
        lang_id: One of ['en', 'ja'].
    """

    def __init__(self, lang_id: str):
        from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

        self.lang_id = lang_id
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(
            lang=lang_id, pre_replace_unicode_punct=True, post_remove_control_chars=True
        )

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizes a list of tokens.

        Args:
            tokens: list of strings as tokens
        Returns:
            detokenized Japanese or English string
        """
        return self.moses_detokenizer.detokenize(tokens)

    def tokenize(self, text: str) -> str:
        """
        Tokenizes text using Moses. Returns a string of space-separated tokens.
        """
        tokens = self.moses_tokenizer.tokenize(text)
        return ' '.join(tokens)

    def normalize(self, text: str) -> str:
        # Moses normalization mishandles the Japanese period:
        # '。' becomes '.', so only normalize English text.
        if self.lang_id == 'en':
            return self.normalizer.normalize(text)
        else:
            return text
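
# Illustrative usage sketch for EnJaProcessor (the sample strings and the
# expected outputs in the comments are assumptions for demonstration, not
# part of the original module):
#
#     en = EnJaProcessor(lang_id='en')
#     en.normalize('Hello ,world !')                # punctuation spacing normalized
#     en.tokenize('Hello, world!')                  # e.g. 'Hello , world !'
#     en.detokenize(['Hello', ',', 'world', '!'])   # e.g. 'Hello, world!'
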
class JaMecabProcessor:
    """
    MeCab-based Tokenizer, Detokenizer and Normalizer utilities for Japanese.
    """

    def __init__(self):
        if not HAVE_MECAB or not HAVE_IPADIC:
            raise ImportError("Please ensure that you have installed `MeCab` and `ipadic` to use JaMecabProcessor")
        self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")

    def detokenize(self, text: List[str]) -> str:
        """
        Detokenizes a list of tokens into a Japanese string.
        """
        from pangu import spacing

        # Collapse the whitespace that tokenization introduced between
        # consecutive fullwidth/CJK characters (including CJK punctuation
        # and curly quotes); `pangu.spacing` then re-inserts spaces only
        # at CJK/Latin boundaries.
        RE_WS_IN_FW = re.compile(
            r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
        )
        return spacing(RE_WS_IN_FW.sub(r'\1', ' '.join(text))).strip()

    def tokenize(self, text: str) -> str:
        """
        Tokenizes text using MeCab. Returns a string of space-separated tokens.
        """
        return self.mecab_tokenizer.parse(text).strip()

    def normalize(self, text: str) -> str:
        # Normalization is a no-op for Japanese text.
        return text
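

if __name__ == "__main__":
    # Minimal smoke-test sketch (an assumption, not part of the original
    # module): requires `mecab-python3`, `ipadic`, and `pangu` to be
    # installed; the sample sentence is illustrative only.
    ja = JaMecabProcessor()
    tokenized = ja.tokenize("私は学生です。")  # e.g. '私 は 学生 です 。'
    print(tokenized)
    print(ja.detokenize(tokenized.split()))  # whitespace between CJK characters removed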