
    rh                         d Z ddlZddlmZ ddlmZ ddlmZ  ej                  e	      Z
ddiZd	 Z G d
 de      ZdgZy)zTokenization classes for ESM.    N)Optional   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                     t        | d      5 }|j                         j                         }|D cg c]  }|j                          c}cd d d        S c c}w # 1 sw Y   y xY w)Nr)openread
splitlinesstrip)r   flinesls       {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/esm/tokenization_esm.pyload_vocab_filer      sS    	j#	 *!##%#()a	)* *)* *s   #AAAAA!c            
            e Zd ZdZeZddgZ	 	 	 	 	 d fd	Zdede	fdZ
de	defd	Zd
 Zd Zde	defdZdede	fdZ	 ddee   deee      dee   fdZ	 ddedee   dedee   fdZd Zedefd       Z xZS )EsmTokenizerz&
    Constructs an ESM tokenizer.
    	input_idsattention_maskc           	      V   t        |      | _        t        t        | j                              | _        t        | j                        D 	ci c]  \  }}	|	|
 c}	}| _        t        
|   d|||||d| | j                  | _        | j                  | j                         y c c}	}w )N)	unk_token	cls_token	pad_token
mask_token	eos_token )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r   r   r   r   kwargsindtok	__class__s             r   r%   zEsmTokenizer.__init__+   s     **5 4??!;<6?6PQ(#sS#XQ 	
!	
 	
 '+oo#$556 Rs   B%indexreturnc                 N    | j                   j                  || j                        S Nr"   getr   r(   r-   s     r   _convert_id_to_tokenz!EsmTokenizer._convert_id_to_tokenG         $$UDNN;;    tokenc                     | j                   j                  || j                   j                  | j                              S r0   r#   r2   r   r(   r7   s     r   _convert_token_to_idz!EsmTokenizer._convert_token_to_idJ   0      $$UD,=,=,A,A$..,QRRr6   c                 "    |j                         S r0   )split)r(   textr)   s      r   	_tokenizezEsmTokenizer._tokenizeM   s    zz|r6   c                 p    | j                   j                         }|j                  | j                         |S r0   )r#   copyupdateadded_tokens_encoder)r(   
base_vocabs     r   	get_vocabzEsmTokenizer.get_vocabP   s0    &&++-
$334r6   c                     | j                   j                  || j                   j                  | j                              S r0   r9   r:   s     r   token_to_idzEsmTokenizer.token_to_idU   r<   r6   c                 N    | j                   j                  || j                        S r0   r1   r3   s     r   id_to_tokenzEsmTokenizer.id_to_tokenX   r5   r6   token_ids_0token_ids_1c                     | j                   g}| j                  g}|| j                  ||z   S ||z   |z   S | j                  t        d      ||z   |z   |z   |z   S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r(   rK   rL   clsseps        r    build_inputs_with_special_tokensz-EsmTokenizer.build_inputs_with_special_tokens[   s       !  !  ([(([(3..&\]][ 3&4s::r6   already_has_special_tokensc                     |r-|t        d      |D cg c]  }|| j                  v rdnd c}S dgdgt        |      z  z   dgz   }||dgt        |      z  dgz   z  }|S c c}w )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.   r   )rP   all_special_idslen)r(   rK   rL   rT   r7   masks         r   get_special_tokens_maskz$EsmTokenizer.get_special_tokens_maski   s    $ && R 
 LWW%$"6"66AA=WWsqcC,,-3"QC#k**aS00D	 Xs   A!c                     t         j                  j                  ||r|dz   nddz         }t        |d      5 }|j	                  dj                  | j
                               d d d        |fS # 1 sw Y   |fS xY w)N- r   w
)ospathjoinr   writer   )r(   save_directoryfilename_prefixr   r   s        r   save_vocabularyzEsmTokenizer.save_vocabulary   sk    WW\\.O?S3Hacgr2rs
*c" 	0aGGDIIdoo./	0}	0}s   +A--A8c                 ,    t        | j                        S r0   )rX   r   )r(   s    r   
vocab_sizezEsmTokenizer.vocab_size   s    4??##r6   )z<unk>z<cls>z<pad>z<mask>z<eos>r0   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr%   intstrr4   r;   r@   rF   rH   rJ   listr   rS   boolrZ   rf   propertyrh   __classcell__)r,   s   @r   r   r   #   s    *$&67
 78<# <# <S# S# S
S S S< < < JN;9;3;DI3F;	c; in.6tnae	c> $C $ $r6   r   )rl   r`   typingr   tokenization_utilsr   utilsr   
get_loggerri   loggerrm   r   r   __all__r   r6   r   <module>r|      sT    $ 	  5  
		H	%!;/ *m$& m$` 
r6   