
"""Tokenization class for model ByT5."""

import warnings
from typing import Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class ByT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Number of extra ids added to the end of the vocabulary, for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary towards the beginning ("<extra_id_0>" is the last token in the
            vocabulary), matching ByT5 preprocessing; see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117).
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        # We force left and right stripping for backward compatibility. The byt5 tests depend on this.
        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
        # unk token needs to be in the vocab with correct index
        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size
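
    # Id layout implied by the constants above:
    #   ids 0, 1, 2   -> "<pad>", "</s>", "<unk>" (so self.offset == 3)
    #   ids 3 .. 258  -> the 256 raw byte values (byte value b maps to id b + 3)
    #   the 125 "<extra_id_*>" sentinels are registered on top of this range as added tokens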
vocab_sizezByT5Tokenizer.vocab_sizee   s    ###r   c                     t        | j                  | j                  z         D ci c]  }| j                  |      | }}|j	                  | j
                         |S c c}w r3   )r"   r4   r)   convert_ids_to_tokensupdateadded_tokens_encoder)r-   r/   vocabs      r   	get_vocabzByT5Tokenizer.get_vocabi   sW    ;@SWS^S^A^;_`a++A.1``T../ as   Atoken_ids_0token_ids_1already_has_special_tokensc                     |rt         |   ||d      S |dgt        |      z  dgz   S dgt        |      z  dgz   dgt        |      z  z   dgz   S )a  

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
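
    # For example, with eos_token_id == 1:
    #   build_inputs_with_special_tokens([68, 69])        -> [68, 69, 1]
    #   build_inputs_with_special_tokens([68, 69], [70])  -> [68, 69, 1, 70, 1]
    # create_token_type_ids_from_sequences returns an all-zero list of the same length in each case.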

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        string = bstring.decode("utf-8", errors="ignore")
        return string

    # ByT5Tokenizer has no vocab file
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        return ()


__all__ = ["ByT5Tokenizer"]
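

# Minimal usage sketch (illustrative only): byte-level round trip with the defaults above.
# Run via `python -m transformers.models.byt5.tokenization_byt5` so the relative imports resolve.
if __name__ == "__main__":
    tokenizer = ByT5Tokenizer()
    encoded = tokenizer("Hello, world!")
    # Each UTF-8 byte becomes one id (byte value + 3); </s> (id 1) is appended at the end.
    print(encoded["input_ids"])
    print(tokenizer.decode(encoded["input_ids"], skip_special_tokens=True))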