
import os
from typing import Optional, Union

import tensorflow as tf
from tensorflow_text import pad_model_inputs

from ...modeling_tf_utils import keras
from ...utils.import_utils import is_keras_nlp_available, requires
from .tokenization_gpt2 import GPT2Tokenizer


if is_keras_nlp_available():
    from keras_nlp.tokenizers import BytePairTokenizer


@requires(backends=("keras_nlp",))
class TFGPT2Tokenizer(keras.layers.Layer):
    """
    This is an in-graph tokenizer for GPT2. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs (see the example below).

    Args:
        vocab (dict[str, int]): Vocabulary dict for Byte Pair Tokenizer
        merges (list[str]): Merges list for Byte Pair Tokenizer
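
    Example (a minimal usage sketch; the checkpoint name follows the method
    examples below, and the sample sentence is arbitrary):

    ```python
    import tensorflow as tf

    from transformers import TFGPT2Tokenizer

    tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")
    # The layer runs in-graph and returns a dict with "input_ids" and "attention_mask"
    outputs = tf_tokenizer(tf.constant(["Hello world!"]))
    ```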
    vocabmerges
max_lengthpad_token_idc                     t         |           || _        || _        || _        || _        t        |||      | _        y )N)sequence_length)super__init__r   r   r   r   r   tf_tokenizer)selfr   r   r   r   	__class__s        /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.pyr   zTFGPT2Tokenizer.__init__!   s>     	($
-eVZX    	tokenizerc                     |j                   D cg c]  }dj                  |       }}|j                         } | ||g|i |S c c}w )ag  Creates TFGPT2Tokenizer from GPT2Tokenizer

        Args:
            tokenizer (GPT2Tokenizer): An existing `GPT2Tokenizer` whose vocabulary and merges will be copied.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFGPT2Tokenizer

        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer)
        ```
        """
        merges = [" ".join(m) for m in tokenizer.bpe_ranks]
        vocab = tokenizer.get_vocab()
        return cls(vocab, merges, *args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        """Creates TFGPT2Tokenizer from pretrained GPT2Tokenizer

        Args:
            pretrained_model_name_or_path (Union[str, os.PathLike]): Path to pretrained model

        Examples:

        ```python
        from transformers import TFGPT2Tokenizer

        tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")
        ```
        """
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        return cls.from_tokenizer(tokenizer, *init_inputs, **kwargs)

    @classmethod
    def from_config(cls, config):
        """Creates TFGPT2Tokenizer from configurations

        Args:
            config (dict): Dictionary with the keys returned by `get_config`.
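
        Example (a sketch of the `get_config()` / `from_config()` round trip,
        assuming `tf_tokenizer` is an existing `TFGPT2Tokenizer`):

        ```python
        config = tf_tokenizer.get_config()
        restored = TFGPT2Tokenizer.from_config(config)
        ```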
        """
        return cls(**config)

    def get_config(self):
        return {
            "vocab": self.vocab,
            "merges": self.merges,
            "max_length": self.max_length,
            "pad_token_id": self.pad_token_id,
        }

    def call(self, x, max_length: Optional[int] = None):
        input_ids = self.tf_tokenizer(x)
        attention_mask = tf.ones_like(input_ids)

        if self.pad_token_id is not None:
            # pad the tokens up to max length
            max_length = max_length if max_length is not None else self.max_length

            if max_length is not None:
                input_ids, attention_mask = pad_model_inputs(
                    input_ids, max_seq_length=max_length, pad_value=self.pad_token_id
                )

        return {"attention_mask": attention_mask, "input_ids": input_ids}


__all__ = ["TFGPT2Tokenizer"]