"""Fast Tokenization classes for OpenAI GPT."""

from typing import Optional

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_openai import OpenAIGPTTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = OpenAIGPTTokenizer

    def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
        super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)


__all__ = ["OpenAIGPTTokenizerFast"]
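

# Usage sketch (illustrative, not part of the upstream module): a minimal example
# of loading and using this fast tokenizer. The "openai-gpt" checkpoint name is an
# assumption here; any compatible checkpoint, or a local directory containing the
# vocab.json / merges.txt / tokenizer.json files named above, would also work.
#
#     from transformers import OpenAIGPTTokenizerFast
#
#     tokenizer = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")
#
#     # Inputs are lower-cased internally (do_lower_case is always True).
#     enc = tokenizer("Hello World")
#     print(enc["input_ids"])        # token ids
#     print(enc["attention_mask"])   # attention mask
#
#     # save_vocabulary delegates to the backing `tokenizers` model and returns
#     # a tuple of the file paths it wrote.
#     files = tokenizer.save_vocabulary("./saved_tokenizer")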