
"""Tokenization classes for ALBERT model."""

import os
from shutil import copyfile
from typing import Optional

from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_albert import AlbertTokenizer
else:
    AlbertTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

SPIECE_UNDERLINE = "▁"


class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = AlbertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it,
        # so it is wrapped in an AddedToken with lstrip=True when passed as a plain string.
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. An ALBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy the original SentencePiece model next to the fast tokenizer files so the
        # slow tokenizer can be reloaded from the same directory.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["AlbertTokenizerFast"]
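

if __name__ == "__main__":
    # Usage sketch (illustrative only, not part of the library module). It shows how the
    # tokenizer defined above is typically loaded and how sequence pairs get the
    # `[CLS] A [SEP] B [SEP]` layout produced by build_inputs_with_special_tokens().
    # Assumptions: the installed `transformers` package exposes this class, the public
    # "albert-base-v2" checkpoint can be downloaded, and "./albert-tokenizer-example"
    # is just a hypothetical output directory.
    from transformers import AlbertTokenizerFast as _AlbertTokenizerFast

    tokenizer = _AlbertTokenizerFast.from_pretrained("albert-base-v2")

    # Tokenize each sentence without special tokens, then add them explicitly.
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("my dog is cute"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("his fur is soft"))

    encoded = tokenizer("my dog is cute", "his fur is soft")
    print(encoded["input_ids"])
    print(tokenizer.build_inputs_with_special_tokens(ids_a, ids_b))  # expected to match the line above

    # save_pretrained() writes the fast tokenizer files and typically calls
    # save_vocabulary(), which copies spiece.model into the target directory.
    tokenizer.save_pretrained("./albert-tokenizer-example")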