Spaces:
Runtime error
Runtime error
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| from megatron.core.datasets.megatron_tokenizer import MegatronLegacyTokenizer as MegatronTokenizer | |
class NullTokenizer(MegatronTokenizer):
    """
    Synthetic tokenizer for performance benchmarking and debugging.

    Text is treated as a sequence of whitespace-separated decimal integers and
    every integer is used directly as its own token id, so no vocabulary files
    are needed. One extra id, equal to ``vocab_size`` as passed to the
    constructor, is reserved for end-of-document.

    Args:
        vocab_size: vocabulary size for embedding
    """

    # NOTE(review): the accessor methods below (vocab_size, vocab, inv_vocab, cls,
    # sep, mask, eod, additional_special_tokens_ids) carry no decorators in this
    # copy of the file. If callers read them as attributes, a `@property` line may
    # have been lost -- confirm against the upstream file before relying on either form.

    def __init__(self, vocab_size):
        super().__init__(None, vocab_size=vocab_size)
        # Coerce once so the arithmetic in vocab_size()/eod() works on an int.
        self._vocab_size_without_eod = int(vocab_size)
        # The end-of-document id is the first id past the regular range.
        self._eod_id = self._vocab_size_without_eod

    def tokenize(self, text: str) -> list[int]:
        """Parse whitespace-separated integers into a list of token ids.

        Args:
            text: integers separated by whitespace, e.g. ``'3 14 15'``.

        Returns:
            The parsed ids, in order; an empty list when ``text`` is empty or
            contains only whitespace.

        Raises:
            ValueError: if a field is not a valid integer literal.
        """
        # split() without a separator drops empty fields. split(' ') returned ['']
        # for the empty string (exactly what detokenize([]) produces) and for
        # doubled/trailing spaces, which then crashed in int('').
        return [int(field) for field in text.split()]

    def detokenize(self, ids) -> str:
        """Render token ids as their decimal strings joined by single spaces.

        Args:
            ids: iterable of token ids.

        Returns:
            The space-separated string; ``''`` when ``ids`` is empty.
        """
        return ' '.join(str(token_id) for token_id in ids)

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Compute the start offset of each id within the detokenized string.

        Args:
            ids: token ids, in order.
            text: unused by this implementation; the offsets are derived from
                the printed width of each id alone.

        Returns:
            One character offset per id, assuming ids are rendered in decimal
            and separated by exactly one space (the format ``detokenize`` emits).
        """
        starts, cursor = [], 0
        for id_ in ids:
            starts.append(cursor)
            # Advance past this id's digits plus the single separating space.
            cursor += 1 + len(str(id_))
        return starts

    def vocab_size(self):
        """Return the vocabulary size including the extra end-of-document id."""
        return self._vocab_size_without_eod + 1

    def vocab(self):
        """Not available: this tokenizer has no token-to-id table.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def inv_vocab(self):
        """Not available: this tokenizer has no id-to-token table.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def cls(self):
        """Return -1; no classification token id is defined."""
        return -1

    def sep(self):
        """Return -1; no separator token id is defined."""
        return -1

    def mask(self):
        """Return -1; no mask token id is defined."""
        return -1

    def eod(self):
        """Return the end-of-document id (the ``vocab_size`` given at construction)."""
        return self._eod_id

    def additional_special_tokens_ids(self):
        """Return None; there are no additional special tokens."""
        return None