
    rh                     d    d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ  G d d      ZdefdZy	)
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                   J     e Zd ZdZ	 	 	 	 d fd	ZdefdZd ZdefdZ	 xZ
S )	MistralConverterz'
    A general tiktoken converter.
    c                 V    t        |   |  || _        || _        || _        || _        y )N)super__init__vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   argskwargs	__class__s          t/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/integrations/mistral.pyr   zMistralConverter.__init__   s0     	$
 0)B&    r   c                 &   |t               fd}g }i }t        j                               D ]  \  }\  }}|| j                  vr|| ||      <   t	        |      dk(  r1g }t        dt	        |            D ]2  }|d | ||d  }
}	|	v s|
v s|	|
z   v s|j                  |	|
|f       4 t        |fdd      }|j                  |       |||<    t        |d d      }|D cg c]  } ||d          ||d         f }}||fS c c}w )Nc           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w )N zlatin-1)joindecodeord)bcharbyte_encoders     r   token_bytes_to_stringzOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string    s2    77@STLT3TUUTs   <   c                 $    | d      | d      fS )Nr   r$    )x	bpe_rankss    r   <lambda>zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>/   s    Yqt_iPQRSPTo4V r   F)keyreversec                     | d   S )N   r&   )vals    r   r)   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>3   s
    A r   r   )	r
   	enumerateitemsr   lenrangeappendsortedextend)r   r   r#   mergesidxtokenranklocalindexpiece_lpiece_rr.   r(   r"   s               @@r   extract_vocab_merges_from_modelz0MistralConverter.extract_vocab_merges_from_model   sK   	')	V "+IOO,="> 	#C%D:::69+E23u:?"1c%j1 ?E',Ve}eEFmWG)+90D'T[J[`iIigw%=>? u*V`efe$"e	# $6F\bcUX(Q02GA2OPccf} ds   +Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S )NF)fuse_unkignore_mergesT)r>   r   r   r   hasattrmodelrA   )r   vocab_scoresr6   	tokenizers       r   rE   zMistralConverter.tokenizer7   sN    #CCDJJOfc,GH	9??O4,0IOO)r   returnc                    | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                         t        j                  d      |_        |S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rE   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rE   s     r   	convertedzMistralConverter.converted>   s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$T%C%CD#-#7#7U#K	 r   )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)__name__
__module____qualname____doc__r   strr>   rE   r   rT   __classcell__)r   s   @r   r   r      s<      K"&CS 69 r   r   tokenizer_filec                    ddl m} |j                  |       }|j                  j                  j
                  }|j                  j                  j                  D cg c]  }t        |d      r|j                  n| }}|D ci c]  }||j                  |       }}|j                  |       |}t        t        ||      j                               }|j                  d|i       |S c c}w c c}w )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizervalue)r   r   )tokenizer_objectr   )(mistral_common.tokens.tokenizers.mistralr]   	from_fileinstruct_tokenizerrE   _tekken_token2id_nospecial_all_special_tokensrB   r^   r;   updater	   r   rT   rR   )r[   r]   mistral_tokenizerr   r8   all_specialspecials_tokensrE   s           r   convert_tekken_tokenizerri   N   s     J )22>B 00::UUE '99CCWW ug.E9K  EPP5uk//66POP5!E #)Q\]ggiI
   "={!KL! Qs   !C)C.N)
tokenizersr   r   r   r   r   tokenizers.modelsr   transformersr	   #transformers.convert_slow_tokenizerr
   r   rY   ri   r&   r   r   <module>rn      s-    M M ! + @C CLS r   