更新模型版本。

6509208 9 months ago

10 kB

	import os
	from typing import Optional, Dict, Any, Tuple

	import sentencepiece
	from torch import TensorType
	from transformers import PreTrainedTokenizer
	from transformers.models.deberta_v2.tokenization_deberta_v2 import SPMTokenizer
	from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, TruncationStrategy
	from transformers.utils import PaddingStrategy


	class QiDeBERTaTokenizer(PreTrainedTokenizer):
	    """SentencePiece-backed tokenizer exposed through the `PreTrainedTokenizer` API.

	    The actual segmentation is delegated to the DeBERTa-v2 `SPMTokenizer`
	    (held in `self._tokenizer`). Sequences are framed as `[CLS] X [SEP]` and
	    `[CLS] A [SEP] B [SEP]`; with the default arguments the CLS/SEP tokens are
	    `<s>` and `</s>`.
	    """

	    # Maps the constructor argument name to the file name of the vocabulary file.
	    vocab_files_names = {"spm_model_file": "spm.model"}

	    def __init__(
	        self,
	        spm_model_file: str,
	        bos_token: str = '<s>',
	        eos_token: str = '</s>',
	        unk_token: str = '<unk>',
	        sep_token: str = '</s>',
	        pad_token: str = '<pad>',
	        cls_token: str = '<s>',
	        mask_token: str = '<mask>',
	        do_lower_case=False,
	        split_by_punct=False,
	        sp_model_kwargs: Optional[Dict[str, Any]] = None,
	        **kwargs,
	    ):
	        """Load the SentencePiece model and register the special tokens.

	        Args:
	            spm_model_file: Path to the SentencePiece model file.
	            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token,
	            mask_token: Special tokens, forwarded to the parent constructor.
	            do_lower_case: When true, `_tokenize` lower-cases text first.
	            split_by_punct: Forwarded to `SPMTokenizer`.
	            sp_model_kwargs: Extra keyword arguments forwarded to
	                `SPMTokenizer`; `None` is treated as an empty dict.
	            **kwargs: Forwarded unchanged to the parent constructor.

	        Raises:
	            ValueError: If `spm_model_file` is not an existing file.
	        """
	        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

	        if not os.path.isfile(spm_model_file):
	            raise ValueError(
	                f"Can't find a vocabulary file at path '{spm_model_file}'. To load the vocabulary from a Google pretrained"
	                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
	            )
	        self.do_lower_case = do_lower_case
	        self.split_by_punct = split_by_punct
	        self.spm_model_file = spm_model_file
	        # NOTE(review): the backend is built before super().__init__() —
	        # presumably because the parent constructor already queries the
	        # vocabulary; keep this ordering.
	        self._tokenizer = SPMTokenizer(
	            spm_model_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
	        )

	        super().__init__(
	            do_lower_case=do_lower_case,
	            bos_token=bos_token,
	            eos_token=eos_token,
	            unk_token=unk_token,
	            sep_token=sep_token,
	            pad_token=pad_token,
	            cls_token=cls_token,
	            mask_token=mask_token,
	            split_by_punct=split_by_punct,
	            sp_model_kwargs=self.sp_model_kwargs,
	            **kwargs,
	        )
	        self._tokenizer.special_tokens = self.all_special_tokens
	        # Id of the bare word-boundary piece; used to strip a leading dummy
	        # prefix when raw strings are encoded directly.
	        self.space_token_id = self._tokenizer.spm.PieceToId('▁')

	    @property
	    def vocab_size(self):
	        """Number of entries in the backend vocabulary (added tokens excluded)."""
	        return len(self.vocab)

	    @property
	    def vocab(self):
	        """The backend tokenizer's vocabulary mapping."""
	        return self._tokenizer.vocab

	    def __call__(
	        self,
	        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
	        text_pair: Optional[TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput]] = None,
	        text_target: Optional[TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput]] = None,
	        text_pair_target: Optional[TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput]] = None,
	        add_special_tokens: bool = True,
	        padding: bool | str | PaddingStrategy = False,
	        truncation: Optional[bool | str | TruncationStrategy] = None,
	        max_length: Optional[int] = None,
	        stride: int = 0,
	        is_split_into_words: bool = False,
	        pad_to_multiple_of: Optional[int] = None,
	        padding_side: Optional[str] = None,
	        return_tensors: str | TensorType = 'pt',
	        return_token_type_ids: bool = False,
	        return_attention_mask: bool = True,
	        return_overflowing_tokens: bool = False,
	        return_special_tokens_mask: bool = False,
	        return_offsets_mapping: bool = False,
	        return_length: bool = False,
	        verbose: bool = True,
	        **kwargs,
	    ):
	        """Encode text by forwarding every argument to the parent `__call__`.

	        The only thing this override contributes is its own defaults:
	        `return_tensors='pt'`, `return_token_type_ids=False` and
	        `return_attention_mask=True`.

	        Returns:
	            Whatever the parent `__call__` returns for these arguments.
	        """
	        return super().__call__(
	            text=text,
	            text_pair=text_pair,
	            text_target=text_target,
	            text_pair_target=text_pair_target,
	            add_special_tokens=add_special_tokens,
	            padding=padding,
	            truncation=truncation,
	            max_length=max_length,
	            stride=stride,
	            is_split_into_words=is_split_into_words,
	            pad_to_multiple_of=pad_to_multiple_of,
	            padding_side=padding_side,
	            return_tensors=return_tensors,
	            return_token_type_ids=return_token_type_ids,
	            return_attention_mask=return_attention_mask,
	            return_overflowing_tokens=return_overflowing_tokens,
	            return_special_tokens_mask=return_special_tokens_mask,
	            return_offsets_mapping=return_offsets_mapping,
	            return_length=return_length,
	            verbose=verbose,
	            **kwargs,
	        )

	    def get_vocab(self):
	        """Return a copy of the backend vocabulary merged with the added tokens."""
	        vocab = self.vocab.copy()
	        vocab.update(self.get_added_vocab())
	        return vocab

	    def _tokenize(self, text: str) -> list[str]:
	        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
	        if self.do_lower_case:
	            text = text.lower()
	        return self._tokenizer.tokenize(text)

	    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
	        """Tokenize `text`, dropping a leading bare `'▁'` piece if present.

	        An empty token list is returned unchanged instead of raising
	        `IndexError`.
	        """
	        tokens = super().tokenize(text, **kwargs)
	        if tokens and tokens[0] == '▁':
	            return tokens[1:]
	        return tokens

	    def _convert_token_to_id(self, token: str):
	        """Converts a token (str) in an id using the vocab."""
	        return self._tokenizer.spm.PieceToId(token)

	    def _convert_id_to_token(self, index: int):
	        """Converts an index (integer) in a token (str) using the vocab."""
	        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token

	    def convert_tokens_to_string(self, tokens):
	        """Converts a sequence of tokens (string) in a single string."""
	        return self._tokenizer.decode(tokens)

	    def _encode_without_prefix(self, text: str) -> list[int]:
	        """Encode `text` to ids, dropping a leading bare `'▁'` id if present.

	        Only the standalone word-boundary piece is removed, mirroring
	        `tokenize`; a first piece that carries real content (e.g. `'▁hello'`)
	        is kept.
	        """
	        ids = self._tokenizer.spm.encode_as_ids(text)
	        if ids and ids[0] == self.space_token_id:
	            return ids[1:]
	        return ids

	    def build_inputs_with_special_tokens(self, token_ids_0: list[int] | str, token_ids_1: Optional[list[int] | str] = None):
	        """
	        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
	        adding special tokens. A DeBERTa sequence has the following format:

	        - single sequence: [CLS] X [SEP]
	        - pair of sequences: [CLS] A [SEP] B [SEP]

	        Either argument may also be a raw string, in which case it is first
	        encoded to ids with the SentencePiece model.

	        Args:
	            token_ids_0 (`List[int]` or `str`):
	                List of IDs to which the special tokens will be added.
	            token_ids_1 (`List[int]` or `str`, optional):
	                Optional second list of IDs for sequence pairs.

	        Returns:
	            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
	        """
	        cls = [self.cls_token_id]
	        sep = [self.sep_token_id]

	        if isinstance(token_ids_0, str):
	            token_ids_0 = self._encode_without_prefix(token_ids_0)

	        if token_ids_1 is None:
	            return cls + token_ids_0 + sep

	        if isinstance(token_ids_1, str):
	            token_ids_1 = self._encode_without_prefix(token_ids_1)

	        return cls + token_ids_0 + sep + token_ids_1 + sep

	    def get_special_tokens_mask(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens=False):
	        """
	        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
	        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

	        Args:
	            token_ids_0 (`List[int]`):
	                List of IDs.
	            token_ids_1 (`List[int]`, optional):
	                Optional second list of IDs for sequence pairs.
	            already_has_special_tokens (`bool`, optional, defaults to `False`):
	                Whether the token list is already formatted with the model's bos/eos special tokens.

	        Returns:
	            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	        """

	        if already_has_special_tokens:
	            return super().get_special_tokens_mask(
	                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
	            )

	        if token_ids_1 is not None:
	            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
	        return [1] + ([0] * len(token_ids_0)) + [1]

	    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None):
	        """
	        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
	        sequence pair mask has the following format:

	        ```
	        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
	        | first sequence    | second sequence |
	        ```

	        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

	        Args:
	            token_ids_0 (`List[int]`):
	                List of IDs.
	            token_ids_1 (`List[int]`, optional):
	                Optional second list of IDs for sequence pairs.

	        Returns:
	            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
	        """
	        sep = [self.sep_token_id]
	        cls = [self.cls_token_id]
	        if token_ids_1 is None:
	            return len(cls + token_ids_0 + sep) * [0]
	        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

	    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
	        """Optionally prepend a space to `text` before tokenization.

	        A space is added when `is_split_into_words` is true or when the
	        `add_prefix_space` keyword (consumed here) is true.

	        Returns:
	            A `(text, kwargs)` tuple, where `kwargs` no longer contains
	            `add_prefix_space`.
	        """
	        add_prefix_space = kwargs.pop("add_prefix_space", False)
	        if is_split_into_words or add_prefix_space:
	            text = " " + text
	        return (text, kwargs)

	    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	        """Save the vocabulary by delegating to the backend's `save_pretrained`.

	        Args:
	            save_directory: Directory passed to the backend as the target.
	            filename_prefix: Optional prefix forwarded to the backend.

	        Returns:
	            The value returned by the backend's `save_pretrained`.
	        """
	        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)

	    def _get_bos_piece(self) -> str:
	        """Return the piece string for the SentencePiece model's BOS id."""
	        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.bos_id())

	    def _get_eos_piece(self) -> str:
	        """Return the piece string for the SentencePiece model's EOS id."""
	        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.eos_id())

	    def processor(self) -> sentencepiece.SentencePieceProcessor:
	        """Return the underlying SentencePiece processor."""
	        return self._tokenizer.spm