"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Optional

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_xglm import XGLMTokenizer
else:
    XGLMTokenizer = None


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}


class XGLMTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
    and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<madeupword0>", ..., "<madeupword6>"]`):
            Additional special tokens used by the tokenizer. The seven `<madeupword{i}>` placeholder tokens from the
            original fairseq vocabulary are appended by default.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = XGLMTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        **kwargs,
    ):
        # Compatibility with the original (slow) tokenizer: the fairseq vocabulary reserves seven
        # "<madeupword{i}>" placeholder slots, which are exposed here as additional special tokens.
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )

        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
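
        Example (a minimal sketch, assuming the `facebook/xglm-564M` checkpoint is available; the token ids below
        are arbitrary placeholders):

        ```python
        >>> from transformers import XGLMTokenizerFast

        >>> tokenizer = XGLMTokenizerFast.from_pretrained("facebook/xglm-564M")
        >>> sep = tokenizer.sep_token_id
        >>> tokenizer.build_inputs_with_special_tokens([10, 11]) == [sep, 10, 11]
        True
        >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12]) == [sep, 10, 11, sep, sep, 12]
        True
        ```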
        """
        # XGLM follows the original fairseq convention: the separator (`</s>`) is prepended
        # and no trailing special token is added.
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0

        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
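
        Example (a minimal sketch, assuming the `facebook/xglm-564M` checkpoint is available; the token ids are
        arbitrary placeholders):

        ```python
        >>> from transformers import XGLMTokenizerFast

        >>> tokenizer = XGLMTokenizerFast.from_pretrained("facebook/xglm-564M")
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11])
        [0, 0, 0]
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11], [12])
        [0, 0, 0, 0, 0, 0]
        ```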

        """
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        # Saving a slow-tokenizer-compatible vocabulary requires the original sentencepiece model file.
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Only copy the sentencepiece model if the destination differs from the source file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["XGLMTokenizerFast"]