
    rh                     p    d Z ddlmZ ddlmZmZ ddlmZ  ej                  e	      Z
 G d de      ZdgZy)	zTokenization class for Dia.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingc            	            e Zd ZdZddgZ	 	 	 	 ddee   dee   dee   def fdZe	d	        Z
d
 Zdedee   fdZd Zd Zdee   defdZddedee   dee   fdZ xZS )DiaTokenizera  
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
    	input_idsattention_mask	pad_token	unk_token
max_lengthoffsetc                     t        |t              rt        |      n|}t        |t              rt        |      n|}d| _        |t        d      t        d      d| _        || _        t        |   d|||d| y )N   z[S1]z[S2])r         )r   r   r    )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr   r   r   r   kwargs	__class__s         {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.pyr   zDiaTokenizer.__init__/   s}     .8	3-GJy)Y	-7	3-GJy)Y	#)2z&7IjY_N`%a" 	
!	
 		
    c                     | j                   S N)r   )r   s    r   
vocab_sizezDiaTokenizer.vocab_sizeE   s    ###r   c                     t        | j                  | j                  z         D ci c]  }| j                  |      | }}|j	                  | j
                         |S c c}w r!   )ranger"   r   convert_ids_to_tokensupdateadded_tokens_encoder)r   ivocabs      r   	get_vocabzDiaTokenizer.get_vocabI   sW    ;@SWS^S^A^;_`a++A.1``T../ as   Atextreturnc                 ^    |j                  d      D cg c]  }t        |       }}|S c c}w )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsutf-8)encodechr)r   r+   r(   tokenss       r   	_tokenizezDiaTokenizer._tokenizeN   s,    "&++g"67Q#a&77 8s   *c                 Z    t        |      dk7  rd}|S t        |      | j                  z   }|S )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r   tokentoken_ids      r   _convert_token_to_idz!DiaTokenizer._convert_token_to_idS   s4     u:?H  5zDKK/Hr   c                 6    t        || j                  z
        }|S )z=Converts an index (integer) in a token (str) using the vocab.)r0   r   )r   indexr6   s      r   _convert_id_to_tokenz!DiaTokenizer._convert_id_to_token]   s    EDKK'(r   r1   c                    d}|D ]p  }|| j                   v r*| j                   |   }t        |      j                  d      }n1|| j                  v r|j                  d      }n|j                  d      }||z  }r |j	                  dd      }|S )z:Converts a sequence of tokens (string) in a single string.r   r.   ignore)errors)added_tokens_decoderr   r/   r'   decode)r   r1   bstringr6   added_token_obj
tok_stringstrings          r   convert_tokens_to_stringz%DiaTokenizer.convert_tokens_to_stringb   s     	"E111"&";";E"B 188A
$333"\\'2
"\\'2
z!G	" 9r   save_directoryfilename_prefixc                      y)Nr   r   )r   rF   rG   s      r   save_vocabularyzDiaTokenizer.save_vocabularyr   s    r   )<pad>rJ   i   r   r!   )__name__
__module____qualname____doc__model_input_namesr   r   intr   propertyr"   r*   listr2   r8   r;   rE   tuplerI   __classcell__)r   s   @r   r	   r	      s    $ %&67 $+#*$(
C=
 C=
 SM	

 
, $ $
c d3i 

tCy S  c HSM ]bcf]g r   r	   N)rN   typingr   tokenization_utilsr   r   utilsr   
get_loggerrK   loggerr	   __all__r   r   r   <module>r[      sA    "  A  
		H	%Y& Yx 
r   