
"""Tokenization classes for RemBERT model."""

import os
from shutil import copyfile
from typing import Optional

from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_rembert import RemBertTokenizer
else:
    RemBertTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model", "tokenizer_file": "tokenizer.json"}

SPIECE_UNDERLINE = "▁"


class RemBertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
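
    Example (a minimal usage sketch; `google/rembert` is the public RemBERT checkpoint on the Hugging Face Hub,
    and the resulting token IDs depend on the downloaded vocabulary):

    ```python
    >>> from transformers import RemBertTokenizerFast

    >>> tokenizer = RemBertTokenizerFast.from_pretrained("google/rembert")  # doctest: +SKIP
    >>> encoded = tokenizer("Hello world")  # doctest: +SKIP
    >>> encoded["input_ids"]  # a list of IDs wrapped as [CLS] ... [SEP]  # doctest: +SKIP
    ```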

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = RemBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A RemBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`
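
        For example, with illustrative IDs (the real values of `cls_token_id` and `sep_token_id` come from the
        loaded vocabulary):

        ```python
        >>> # assuming cls_token_id == 101 and sep_token_id == 102 (hypothetical values)
        >>> tokenizer.build_inputs_with_special_tokens([7, 8])  # doctest: +SKIP
        [101, 7, 8, 102]
        >>> tokenizer.build_inputs_with_special_tokens([7, 8], [9])  # doctest: +SKIP
        [101, 7, 8, 102, 9, 102]
        ```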

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set to `True` if the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
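
        Example (illustrative; the output depends only on sequence lengths, not on the vocabulary):

        ```python
        >>> tokenizer.get_special_tokens_mask([7, 8], [9, 10])  # doctest: +SKIP
        [1, 0, 0, 1, 0, 0, 1]
        >>> tokenizer.get_special_tokens_mask([7, 8])  # doctest: +SKIP
        [1, 0, 0, 1]
        ```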
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.r
   r   )
ValueErrorr*   r+   len)r!   r&   r'   r/   xs        r$   get_special_tokens_maskz,RemBertTokenizerFast.get_special_tokens_mask   s    & && R  VaaPQt00$2C2CDDA!Kaa"31#K 001QC7A3[AQ;QRVWUXXXsqcC,,-33	 bs   %B save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  rt        | j                  |       |fS )NzVocabulary path (z) should be a directory- r   )
ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r!   r5   r6   out_vocab_files       r$   save_vocabularyz$RemBertTokenizerFast.save_vocabulary   s    ww}}^,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r%   )NNTTF[CLS][SEP]z<unk>rE   z<pad>rD   z[MASK])N)NF)__name__
__module____qualname____doc__r@   vocab_files_namesr   slow_tokenizer_classr    listintr   r.   boolr4   r   tuplerC   __classcell__)r#   s   @r$   r   r   &   s    ,\ *+ &%R JN;9;3;DI3F;	c;4 sx4943;DI3F4ko4	c4>!c !HSM !]bcf]g !r%   r   )rI   r:   shutilr   typingr   tokenization_utilsr   tokenization_utils_fastr   utilsr   r	   tokenization_rembertr   
get_loggerrF   r=   r@   SPIECE_UNDERLINEr   __all__ r%   r$   <module>r[      sm    . 	   , > 8 6			H	%#8L\]   ]!2 ]!@ "
"r%   