| | import os |
| | from typing import Optional, Dict, Any, Tuple |
| |
|
| | import sentencepiece |
| | from torch import TensorType |
| | from transformers import PreTrainedTokenizer |
| | from transformers.models.deberta_v2.tokenization_deberta_v2 import SPMTokenizer |
| | from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, TruncationStrategy |
| | from transformers.utils import PaddingStrategy |
| |
|
| |
|
class QiDeBERTaTokenizer(PreTrainedTokenizer):
    """Slow (pure-Python) tokenizer that delegates to a DeBERTa-v2 ``SPMTokenizer``.

    The SentencePiece model is loaded from ``spm_model_file``. Sequences are
    wrapped as ``[CLS] X [SEP]`` (single) or ``[CLS] A [SEP] B [SEP]`` (pair),
    see :meth:`build_inputs_with_special_tokens`.
    """

    vocab_files_names = {"spm_model_file": "spm.model"}

    def __init__(
        self,
        spm_model_file: str,
        bos_token: str = '<s>',
        eos_token: str = '</s>',
        unk_token: str = '<unk>',
        sep_token: str = '</s>',
        pad_token: str = '<pad>',
        cls_token: str = '<s>',
        mask_token: str = '<mask>',
        do_lower_case=False,
        split_by_punct=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the special tokens.

        Args:
            spm_model_file: Path to the SentencePiece model file.
            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token,
                mask_token: Special token strings forwarded to the base class.
            do_lower_case: When true, text is lower-cased in ``_tokenize``.
            split_by_punct: Forwarded to ``SPMTokenizer``.
            sp_model_kwargs: Extra keyword arguments forwarded to
                ``SPMTokenizer``; ``None`` is treated as an empty dict.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.

        Raises:
            ValueError: If ``spm_model_file`` is not an existing file.
        """
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        if not os.path.isfile(spm_model_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{spm_model_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.spm_model_file = spm_model_file
        # The backing tokenizer is created before super().__init__ so that the
        # vocab accessors defined on this class are usable during base init.
        self._tokenizer = SPMTokenizer(
            spm_model_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
        )

        super().__init__(
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )
        self._tokenizer.special_tokens = self.all_special_tokens
        self.space_token_id = self._tokenizer.spm.PieceToId('▁')

    @property
    def vocab_size(self):
        """Number of entries in the backing tokenizer's vocabulary."""
        return len(self.vocab)

    @property
    def vocab(self):
        """Vocabulary mapping exposed by the backing ``SPMTokenizer``."""
        return self._tokenizer.vocab

    def __call__(
        self,
        text: TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput],
        text_pair: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        text_target: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        text_pair_target: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: bool|str|PaddingStrategy = False,
        truncation: Optional[bool|str|TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: str|TensorType = 'pt',
        return_token_type_ids: bool = False,
        return_attention_mask: bool = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """Encode text by forwarding every argument to the base ``__call__``.

        This override only pins this class's own defaults: ``return_tensors``
        is ``'pt'``, ``return_token_type_ids`` is ``False`` and
        ``return_attention_mask`` is ``True``.
        """
        return super().__call__(
            text=text,
            text_pair=text_pair,
            text_target=text_target,
            text_pair_target=text_pair_target,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def get_vocab(self):
        """Return a copy of the base vocabulary merged with the added tokens."""
        vocab = self.vocab.copy()
        vocab.update(self.get_added_vocab())
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        if self.do_lower_case:
            text = text.lower()
        return self._tokenizer.tokenize(text)

    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
        """Tokenize ``text`` and drop a leading bare ``'▁'`` piece, if any.

        Args:
            text: The text to tokenize.
            **kwargs: Forwarded to the base ``tokenize``.

        Returns:
            The token list; empty when the base tokenizer produces no tokens.
        """
        result = super().tokenize(text, **kwargs)
        # Guard against an empty result (e.g. empty input text) before
        # peeking at the first token; indexing [] would raise IndexError.
        if result and result[0] == '▁':
            return result[1:]
        return result

    def _convert_token_to_id(self, token: str):
        """Converts a token (str) in an id using the vocab."""
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Ids at or beyond the vocabulary size map to the unknown token.
        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0: list[int]|str, token_ids_1: Optional[list[int]|str] = None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]` or `str`):
                List of IDs to which the special tokens will be added. A raw string is encoded with the
                SentencePiece model first.
            token_ids_1 (`List[int]` or `str`, *optional*):
                Optional second list of IDs (or raw string) for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]

        if isinstance(token_ids_0, str):
            # The first encoded id is discarded; presumably it is the leading
            # '▁' piece SentencePiece emits -- TODO confirm.
            token_ids_0 = self._tokenizer.spm.encode_as_ids(token_ids_0)[1:]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep

        if isinstance(token_ids_1, str):
            token_ids_1 = self._tokenizer.spm.encode_as_ids(token_ids_1)[1:]

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token list is already formatted with the model's special tokens.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # First segment covers [CLS] + A + [SEP]; second covers B + [SEP].
        first_segment = [0] * (len(token_ids_0) + 2)
        if token_ids_1 is None:
            return first_segment
        return first_segment + [1] * (len(token_ids_1) + 1)

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """Optionally prepend a space to ``text`` before tokenization.

        A space is prepended when ``is_split_into_words`` is true or when the
        ``add_prefix_space`` keyword (removed from ``kwargs``) is true.

        Returns:
            A ``(text, kwargs)`` tuple with the remaining keyword arguments.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the vocabulary by delegating to the backing tokenizer's ``save_pretrained``."""
        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)

    def _get_bos_piece(self) -> str:
        """Return the piece string for the SentencePiece model's BOS id."""
        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.bos_id())

    def _get_eos_piece(self) -> str:
        """Return the piece string for the SentencePiece model's EOS id."""
        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.eos_id())

    def processor(self) -> sentencepiece.SentencePieceProcessor:
        """Return the underlying ``SentencePieceProcessor`` of the backing tokenizer."""
        return self._tokenizer.spm
| |
|