"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import Optional

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            # Greedy longest-match: find the longest prefix of chars[start:] present in the vocab.
            end = len(chars)
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                # No prefix matched: emit the unknown token for this character and move on.
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens


class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer, based on jieba word segmentation followed by greedy longest-match WordPiece lookup.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
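
    Example (a minimal usage sketch; assumes the `openbmb/cpm-ant-10b` checkpoint and that `jieba` is installed):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
    >>> tokens = tokenizer.tokenize("今天天气真好")  # jieba segmentation, then greedy vocab matching
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```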
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # The vocabulary file stores explicit "</_>" and "</n>" entries; remap them to the
        # literal space and newline characters so that plain text hits them directly.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        # Undo the space/newline remapping from __init__ before writing the vocabulary back out.
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `list[int]`: The model input with special tokens.
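
        Example (an illustrative sketch; assumes `tokenizer` is an instantiated `CpmAntTokenizer`):

        ```python
        >>> ids = tokenizer.build_inputs_with_special_tokens([10, 11])
        >>> ids == [tokenizer.bos_token_id, 10, 11]
        True
        ```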
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`): List of IDs.
            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
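
        Example (an illustrative sketch; assumes `tokenizer` is an instantiated `CpmAntTokenizer`):

        ```python
        >>> tokenizer.get_special_tokens_mask([10, 11])  # only the prepended [BOS] position is special
        [1, 0, 0]
        ```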
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))


__all__ = ["CpmAntTokenizer"]