
    rh                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ  ej2                  e      Zdd
ZdedefdZd Z  G d d      Z! G d de!      Z"dedefdZ# G d d      Z$ G d de$      Z% G d de$      Z& G d de$      Z' G d de$      Z( G d d e$      Z) G d! d"e$      Z* G d# d$e$      Z+ G d% d&e$      Z, G d' d(e$      Z- G d) d*e$      Z. G d+ d,e$      Z/ G d- d.e$      Z0 G d/ d0e0      Z1 G d1 d2e0      Z2 G d3 d4e0      Z3 G d5 d6e0      Z4 G d7 d8e0      Z5 G d9 d:e0      Z6 G d; d<e0      Z7 G d= d>e0      Z8 G d? d@e0      Z9 G dA dBe0      Z: G dC dDe0      Z; G dE dFe0      Z< G dG dHe0      Z= G dI dJe0      Z> G dK dLe0      Z? G dM dNe0      Z@ G dO dPe$      ZA G dQ dRe0      ZB G dS dTe$      ZC G dU dVe$      ZD G dW dXe$      ZE G dY dZe0      ZF G d[ d\e0      ZG G d] d^e0      ZH G d_ d`e$      ZI G da dbe0      ZJ G dc dde0      ZKde ZL G df dg      ZMi dhe1die-dje2dke%dleBdmeEdne3doeCdpe*dqe%dre/dse4dte%due%dve%dwe%dxe%i dye1dze'd{e*d|e+d}e%d~e%de-de9de-de-de%deIde5de6de(de%de-i de7de)de>de,de%de;de<de%de-de.de8de%de?de@deAde9de:e&eFeHeHeGeHdZNdde	fdZOy)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                    t               rddlm} |S t               rSdd l}t        j                  |j                  j                        t        j                  d      k  rddl	m} |S ddl	m
} |S t        t        j                  |             )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      v/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr#   #   sl    !#9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 4    | rd}t        |dd      sd}|S d}|S )NalwayslegacyTfirstnever)getattr)r%   original_tokenizerprepend_schemes      r"   _get_prepend_schemer/   4   s1    !)8T:$N  !r$   c                     |d u}|rt        |      n }g }|j                         D ]j  \  }}g }t        dt        |            D ]*  }|d | ||d  }	}| v s|	 v s|j	                  ||	|f       , t        | fd      }|j                  |       l t        |d |      }|D 
cg c]  }
|
d   |
d   f }}
|S c c}
w )Nr   c                 $    | d      | d      fS Nr   r    )xvocabs    r"   <lambda>z!generate_merges.<locals>.<lambda>I   s    U1Q4[%!+,F r$   keyc                 B    | d   t        | d         t        | d         fS )N   r   r   )lenvals    r"   r6   z!generate_merges.<locals>.<lambda>L   s!    SVSQ[#c!f+,N r$   r8   reverser   )dictitemsranger;   appendsortedextend)r5   vocab_scoresr?   mergesmergepiece_scorelocalindexpiece_lpiece_rr=   s   `          r"   generate_mergesrN   >   s    $&G)04%eLF*002 {1c%j) 	>E$Ve}eEFmWG%Gu$4gw<=	> u"FGe F NX_`F*013s1vs1v1F1M 2s   'B<c                   D    e Zd ZdZdefdZddeeeef   e	e   f   fdZ
y)SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 v    t        | d       ddlm}  |       | _        | j                  j	                  |       y )Nr   r   )SentencePieceProcessor)r   r   rS   spLoad)selfrQ   rS   s      r"   __init__zSentencePieceExtractor.__init__V   s)    $08(*Ur$   Nr&   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}t	        ||      }||fS c c}w )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        )rT   rB   GetPieceSizeid_to_piecerN   rV   rF   rT   rK   r5   rG   s         r"   extractzSentencePieceExtractor.extract]   sV    
 WW;@AR;ST%&-TT 5f}	 Us   AN)__name__
__module____qualname____doc__strrW   tupler@   intlistr]   r3   r$   r"   rP   rP   Q   s5    c 
E$sCx.$u+2M,N 
r$   rP   c                   4    e Zd Zddeeeef   ee   f   fdZy)GemmaSentencePieceExtractorNr&   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}d|vr|j	                  d      |d<   t        ||      }||fS c c}w )rY   	<0x09>)rT   rB   rZ   r[   getrN   r\   s         r"   r]   z#GemmaSentencePieceExtractor.extractk   sr    
 WW;@AR;ST%&-TT u))H-E$K 5f} Us   A+r^   )	r_   r`   ra   rd   r@   rc   re   rf   r]   r3   r$   r"   rh   rh   j   s$    E$sCx.$u+2M,N r$   rh   piecec                 ^    t        |       dk  xs | d   dk7  xs | d   j                          S )Nr:   ,)r;   isdigit)rm   s    r"   check_number_commars   {   s3    u:>HU2Y#-HU2Y5F5F5H1HHr$   c                       e Zd Zd ZdefdZy)	Converterc                     || _         y r^   )r-   )rV   r-   s     r"   rW   zConverter.__init__   s
    "4r$   r&   c                     t               r^   )NotImplementedErrorrV   s    r"   	convertedzConverter.converted   s    !##r$   N)r_   r`   ra   rW   r   rz   r3   r$   r"   ru   ru      s    5$9 $r$   ru   c                       e Zd ZdefdZy)BertConverterr&   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr-   r5   r   r   rc   r   hasattrr   tokenize_chinese_charsr   do_lower_caser	   BertNormalizer
normalizerr
   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr   decoder
rV   r5   	tokenizerr   r   r   clssepr   r   s
             r"   rz   zBertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r$   Nr_   r`   ra   r   rz   r3   r$   r"   r|   r|          #9 #r$   r|   c                       e Zd ZdefdZy)SplinterConverterr&   c           
         | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }t	        | j                   j&                        }d}	| j                   j(                  }
| j                   j*                  }| j                   j,                  }| j                   j/                  d      }| j                   j0                  dk(  r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3        j4                  | d| d|||
f||f||f|	|fg      |_        t9        j                  d      |_        |S )Nr~   Fr   Tr   .rightr    r   r   r   r   r   r   )r-   r5   r   r   rc   r   r   r   r   r   r   r	   r   r   r
   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r   r   )rV   r5   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r"   rz   zSplinterConverter.converted   s"   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*l#l#,-l#		$
	  %..d;	r$   Nr   r3   r$   r"   r   r      s    .9 .r$   r   c                       e Zd ZdefdZy)FunnelConverterr&   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr~   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r"   rz   zFunnelConverter.converted   r   r$   Nr   r3   r$   r"   r   r      r   r$   r   c                       e Zd ZdefdZy)MPNetConverterr&   c                 r   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	| d
||f||	fg      |_        t1        j                  d      |_        |S )Nr~   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r"   rz   zMPNetConverter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=l#l#$
	  %..d;	r$   Nr   r3   r$   r"   r   r     r   r$   r   c                       e Zd ZdefdZy)OpenAIGPTConverterr&   c           
         | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d t        |      dd            }|j                  t        |            |j                  t        |      g       t        j                  d      |_        t        j                         |_        t#        j$                  d      |_        |S )N</w>F)r5   rG   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r-   encoderrf   	bpe_rankskeysr   r   r   rc   token_to_idadd_special_tokensr	   r   r   r
   r   r   r   
BPEDecoderr   rV   r5   rG   r   r   s        r"   rz   zOpenAIGPTConverter.converted/  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r$   Nr   r3   r$   r"   r   r   .  s    9 r$   r   c            	       J    e Zd Z	 ddeeeef      deeeeef         de	fdZ
y)GPT2ConverterNr5   rG   r&   c           
      N   |s| j                   j                  }|st        | j                   j                        }t	        t        ||d ddd            }t        | j                   dd      }t        j                  |      |_	        t        j                         |_        t        | j                   dd      rT| j                   j                  }| j                   j                  }t        j                  | d| d||fg	      |_        |S t        j                  d
      |_        |S )N Fr5   rG   r   continuing_subword_prefixr   r   r%   r%   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r-   r   rf   r   r   r   r,   r
   	ByteLevelr   r   r   	bos_tokenbos_token_idr   r   r   )rV   r5   rG   r   r%   bosr   s          r"   rz   zGPT2Converter.convertedJ  s    ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUC))33C22??L'1'D'DguL),' (I$  (2';';'OI$r$   NNr_   r`   ra   r   r@   rc   re   rf   rd   r   rz   r3   r$   r"   r   r   I  sD    `d$d38n-$>FtERUWZRZOG\>]$	$r$   r   c                       e Zd ZdefdZy)HerbertConverterr&   c           	         d}d}| j                   j                  }t        | j                   j                  j	                               }||d   d   v r|dd  }t        t        ||d | j                   j                  |            }t        j                  dd      |_
        t        j                         |_        t        j                  |      |_        t#        j$                  | j                   j&                  | j                   j(                  f| j                   j*                  | j                   j,                  f	      |_        |S )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r-   r   rf   r   r   r   r   r   r	   r   r   r
   r   r   r   r   r   r   BertProcessingr   r   r   r   r   )rV   tokenizer_info_strtoken_suffixr5   rG   r   s         r"   rz   zHerbertConverter.convertedr  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r$   Nr   r3   r$   r"   r   r   q      9 r$   r   c            	       J    e Zd Z	 ddeeeef      deeeeef         de	fdZ
y)Qwen2ConverterNr5   rG   r&   c                 0   |s| j                   j                  }|s-t        | j                   j                  j	                               }t        t        ||d d dddd            }t        j                         |_	        t        j                  t        j                  t        d      dd      t        j                  t        | j                   dd      d      g      |_        t#        j                         |_        t'        j                  d	      |_        |S )
Nr   F)r5   rG   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr%   r%   	use_regexr   )r-   r   rf   r   r   r   r   r	   NFCr   r
   SequenceSplitr   r   r,   r   r   r   r   r   )rV   r5   rG   r   s       r"   rz   zQwen2Converter.converted  s    ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r$   r   r   r3   r$   r"   r   r     sD    `d*d38n-*>FtERUWZRZOG\>]*	*r$   r   c                       e Zd ZdefdZy)RobertaConverterr&   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  |j                  |j                   f|j"                  |j$                  f|j                  d      |_        |S )Nr   Fr   r   Tr   r   r%   r   )r-   r   rf   r   r   r   r   r
   r   r%   r   r   r   r   RobertaProcessingr   r   r   r   r   rV   otr5   rG   r   s        r"   rz   zRobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r$   Nr   r3   r$   r"   r   r         9 r$   r   c                       e Zd ZdefdZy)RoFormerConverterr&   c           	      V   ddl m} | j                  j                  }t	        t        |t        | j                  j                                    }d}d}t        | j                  d      r@| j                  j                  j                  }| j                  j                  j                  }t        j                  dd||      |_        t        j                   j#                   ||            |_        t        | j                  j&                        }t        | j                  j(                        }| j                  j*                  }| j                  j,                  }	t/        j0                  | d| d	| d| d
| d||f||	fg      |_        t5        j
                  d      |_        |S )Nr   )JiebaPreTokenizerr~   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r-   r5   r   r   rc   r   r   r   r   r   r	   r   r   r
   PreTokenizercustomr   r   r   r   r   r   r   r   r   r   )
rV   r   r5   r   r   r   r   r   r   r   s
             r"   rz   zRoFormerConverter.converted  sy   I''--iT=T=T=^=^9_`a	4**,=> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r$   Nr   r3   r$   r"   r   r     r   r$   r   c                       e Zd ZdefdZy)DebertaConverterr&   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  ddd| j                   j                  d      fd| j                   j                  d      fg	      |_        |S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r-   r   rf   r   r   r   r   r
   r   r%   r   r   r   r   r   r   r   r   s        r"   rz   zDebertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r$   Nr   r3   r$   r"   r   r     r   r$   r   c                   `     e Zd ZdZeZi Z fdZd Zd Z	d Z
d Zd Zd Zd	 Zd
efdZ xZS )SpmConverterFc                    t        | d       t        |   |  t               }|j	                         }t        | j                  j                  d      5 }|j                  |j                                d d d        || _
        | j                  j                  j                  r#| j                  st        j                  d       y y y # 1 sw Y   TxY w)Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrW   r#   
ModelProtoopenr-   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rV   args	model_pb2mf	__class__s        r"   rW   zSpmConverter.__init__&  s    $
+$ $%	  "$))44d; 	(qaffh'	(
::""009R9RMMe :S0		( 	(s    CCc                 l    |j                   D cg c]  }|j                  |j                  f c}S c c}w r^   piecesrm   scorerV   r  rm   s      r"   r5   zSpmConverter.vocab;  s'    8=Euekk*EEEs   1c                 .    |j                   j                  S r^   )r  unk_idrV   r  s     r"   r  zSpmConverter.unk_id>  s    !!(((r$   c                 ~   |j                   j                  }| j                  |      }|dk(  r1t        t	        || j                  |      | j                              }n|dk(  r| j                  | j                  j                        j                  |      \  }}t        |      D 	ci c]  \  }\  }}	|| }
}}}	t        t        |
||j                   j                  d| j                  d             }nt        d      t        |j                        D cg c]I  \  }}|j                   dv r6||j"                  |j                   dk(  xs |j"                  | j$                  v fK }}}|j'                  t)        |d	 
      D cg c]  \  }}}t+        |d|       c}}}       |S c c}	}}w c c}}w c c}}}w )Nr   r  r   r:   T)r   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithm      r#  c                     | d   S Nr   r3   r4   s    r"   r6   z(SpmConverter.tokenizer.<locals>.<lambda>m      QRSTQU r$   r7   F
normalizedspecial)r  
model_typer5   r   r   r  r  SpmExtractorr-   r  r]   	enumerater   	unk_piece	Exceptionr  typerm   r   
add_tokensrD   r   )rV   r  r,  rF   r   _rG   iwordr  	bpe_vocabidpspm_added_tokenstokenr+  s                   r"   r   zSpmConverter.tokenizerA  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIAv9B<9PQQ%5QuqQIQ!#00::!"&";"; 	I o  #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGD	
 C R*
s   )F+AF2F8c                     |j                   j                  }t        j                  dd      t        j                  t        d      d      g}|st        j                  |      S t        j                  t        j                  |      g|z         S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr	   StripReplacer   r   PrecompiledrV   r  r@  _normalizerss       r"   r   zSpmConverter.normalizers  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr$   c                 \    t        || j                        }t        j                  ||      S Nreplacementr.   )r/   r-   r
   	MetaspacerV   rI  r%   r.   s       r"   r   zSpmConverter.pre_tokenizer~  s)    ,-=t?V?VW''KP^__r$   c                      y r^   r3   ry   s    r"   r   zSpmConverter.post_processor  s    r$   c                 \    t        || j                        }t        j                  ||      S rG  )r/   r-   r   rJ  rK  s       r"   r   zSpmConverter.decoder  s(    ,-=t?V?VW!!k.YYr$   r&   c                 z   | j                  | j                        }| j                  | j                        }|||_        d}d}t        | j                  d      r| j                  j
                  }| j                  ||      }|||_        | j                  ||      |_        | j                         }|r||_        |S )Nr>  Tr%   )	r   r  r   r   r-   r%   r   r   r   )rV   r   r   rI  r%   r   r   s          r"   rz   zSpmConverter.converted  s    NN4::.	 __TZZ0
!#-I 4**,>?#66GG**;8HI$&3I# LL6FG	,,.'5I$r$   )r_   r`   ra   r  rP   r-  r   rW   r5   r  r   r   r   r   r   r   rz   __classcell__r  s   @r"   r  r  !  sL     )LN*F)0d	h`Z9 r$   r  c                       e Zd Zd Zd Zd Zy)AlbertConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w Nd   r  rs   rm   r  r  s      r"   r5   zAlbertConverter.vocab  ^     
 +=U[[*IU[[%++&PUP[P[]b]h]hkn]nOoo
 	
 
   AA!c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S Nz``"z''r=  r   r	   rB  r-   keep_accentsrC   NFKDStripAccentsr   	Lowercaser?  r@  rC  r   r   rV   r  list_normalizersr@  s       r"   r   zAlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nr   r  r  r  r   r   r   r-   r   ry   s    r"   r   zAlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
r$   Nr_   r`   ra   r5   r   r   r3   r$   r"   rR  rR        
6&
r$   rR  c                       e Zd Zd Zd Zy)BarthezConverterc                 
    d}|S Nr#  r3   rV   r  r  s      r"   r  zBarthezConverter.unk_id      r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rf  ry   s    r"   r   zBarthezConverter.post_processor  R    ,, +//EEeLM00FFvNO
 	
r$   N)r_   r`   ra   r  r   r3   r$   r"   rk  rk    s    
r$   rk  c                       e Zd Zd Zd Zd Zy)CamembertConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )N))z
<s>NOTUSED        <pad>rx  )z</s>NOTUSEDrx  z<unk>rx  )z<unk>NOTUSEDir   z<mask>rx  r  rV   r  r5   rm   s       r"   r5   zCamembertConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" L   Ac                      yrm  r3   r  s     r"   r  zCamembertConverter.unk_id  s    r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rq  rf  ry   s    r"   r   z!CamembertConverter.post_processor  rt  r$   Nr_   r`   ra   r5   r  r   r3   r$   r"   rv  rv    s    
r$   rv  c                       e Zd Zd Zd Zd Zy)DebertaV2Converterc                    g }| j                   j                  r%|j                  t        j                  d             t        || j                         }|j                  t        j                  ||             t        j                  |      S )Nr   )r   rH  )r-   split_by_punctrC   r
   Punctuationr/   rJ  r   )rV   rI  r%   list_pretokenizersr.   s        r"   r   z DebertaV2Converter.pre_tokenizer  sq    ""11%%n&@&@*&UV,-=t?V?VW!!.":":{cq"rs&&'9::r$   c                    g }| j                   j                  r#|j                  t        j                                |j                  t        j
                                |j                  j                  }|r$|j                  t        j                  |             |j                  t        j                  t        d      d             t        j                  |      S )Nr=  r   )r-   r   rC   r	   r`  rA  r?  r@  rC  rB  r   r   ra  s       r"   r   zDebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S re  rf  ry   s    r"   r   z!DebertaV2Converter.post_processor
  rg  r$   N)r_   r`   ra   r   r   r   r3   r$   r"   r  r    s    ;6
r$   r  c                       e Zd Zd Zd Zd Zy)MBartConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nrr  rx  ry  rs  rx  r{  r#  )ar_ARrx  cs_CZrx  de_DErx  en_XXrx  es_XXrx  et_EErx  fi_FIrx  fr_XXrx  gu_INrx  hi_INrx  it_ITrx  ja_XXrx  kk_KZrx  ko_KRrx  lt_LTrx  lv_LVrx  my_MMrx  ne_NPrx  nl_XXrx  ro_ROrx  ru_RUrx  si_LKrx  tr_TRrx  vi_VNrx  zh_CNrx  r|  r  r}  s       r"   r5   zMBartConverter.vocab  sa    
 	%,,qr:JK5;;,KK 
 	
6 	/""; L   A
c                      yrm  r3   r  s     r"   r  zMBartConverter.unk_id<      r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A </s> en_XXz$A $B </s> en_XXr  rs  r   rf  ry   s    r"   r   zMBartConverter.post_processor?  R    ,,"#$11GGPQ00FFvNO
 	
r$   Nr  r3   r$   r"   r  r    s    $L
r$   r  c                       e Zd Zd Zd Zd Zy)MBart50Converterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nr  r#  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArx  )az_AZrx  )bn_INrx  )fa_IRrx  )he_ILrx  )hr_HRrx  )id_IDrx  )ka_GErx  )km_KHrx  )mk_MKrx  )ml_INrx  )mn_MNrx  )mr_INrx  )pl_PLrx  )ps_AFrx  )pt_XXrx  )sv_SErx  )sw_KErx  )ta_INrx  )te_INrx  )th_THrx  )tl_XXrx  )uk_UArx  )ur_PKrx  )xh_ZArx  )gl_ESrx  )sl_SIrx  r|  r  r}  s       r"   r5   zMBart50Converter.vocabK  sa    
 	%,,qr:JK5;;,KK  R  	R/"" Lr  c                      yrm  r3   r  s     r"   r  zMBart50Converter.unk_idW  r  r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzen_XX $A </s>zen_XX $A $B </s>r  rs  r   rf  ry   s    r"   r   zMBart50Converter.post_processorZ  r  r$   Nr  r3   r$   r"   r  r  J  s    

r$   r  c                       e Zd Zd Zd Zd Zy)NllbConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr  r#  r  r}  s       r"   r5   zNllbConverter.vocabf  C    
 	%,,qr:JK5;;,KK L   =c                      yrm  r3   r  s     r"   r  zNllbConverter.unk_idp  r  r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnrs  r   rf  ry   s    r"   r   zNllbConverter.post_processors  sR    ,,%&T44JJ:VW00FFvNO
 	
r$   Nr  r3   r$   r"   r  r  e  s    
r$   r  c                       e Zd Zd Zd Zd Zy)SeamlessM4TConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )N)ry  r{  r  r  r#  r  r}  s       r"   r5   zSeamlessM4TConverter.vocab  r  r  c                 .    | j                   j                  S r^   )r-   unk_token_idr  s     r"   r  zSeamlessM4TConverter.unk_id  s    &&333r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__rs  r   rf  ry   s    r"   r   z#SeamlessM4TConverter.post_processor  sR    ,,$%D33II)TU00FFvNO
 	
r$   Nr  r3   r$   r"   r  r  ~  s    4
r$   r  c                       e Zd Zd Zd Zd Zy)XLMRobertaConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )Nr  r#  r|  r  r}  s       r"   r5   zXLMRobertaConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" Lr~  c                 
    d}|S rm  r3   rn  s      r"   r  zXLMRobertaConverter.unk_id  ro  r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rq  rf  ry   s    r"   r   z"XLMRobertaConverter.post_processor  rt  r$   Nr  r3   r$   r"   r  r        	
r$   r  c                       e Zd Zd Zd Zd Zy)XLNetConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w rT  rV  r  s      r"   r5   zXLNetConverter.vocab  rW  rX  c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S rZ  r\  ra  s       r"   r   zXLNetConverter.normalizer  rc  r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rf  ry   s    r"   r   zXLNetConverter.post_processor  rg  r$   Nrh  r3   r$   r"   r  r    ri  r$   r  c                       e Zd Zy)ReformerConverterNr_   r`   ra   r3   r$   r"   r  r        r$   r  c                       e Zd Zd Zd Zy)RemBertConverterc                 b   t        j                  dd      t        j                  dd      t        j                  t        d      d      g}| j                  j                  sF|j                  t        j                                |j                  t        j                                | j                  j                  r#|j                  t        j                                |j                  j                  }|r$|j                  t        j                  |             t        j                  |      S rZ  )r	   rB  r   r-   r]  rC   r^  r_  r   r`  r?  r@  rC  r   ra  s       r"   r   zRemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S re  rf  ry   s    r"   r   zRemBertConverter.post_processor  rg  r$   N)r_   r`   ra   r   r   r3   r$   r"   r  r    s    6&
r$   r  c                       e Zd Zy)BertGenerationConverterNr  r3   r$   r"   r
  r
    r  r$   r
  c                   $    e Zd Zd Zd Zd Zd Zy)PegasusConverterc                 v   | j                   j                  df| j                   j                  dfg}| j                   j                  || j                   j                  dfgz  }| j                   j                  I| j                   j
                  | j                   j                  k  r|| j                   j                  dfgz  }|t        d| j                   j                        D cg c]
  }d| ddf c}z  }||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w c c}w )Nrx  r:   z<unk_>g      Y)r-   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrB   r  rm   r  )rV   r  r5   r4  rm   s        r"   r5   zPegasusConverter.vocab   s%   $$..4$$..4

 ""22>t..>>DEEE ##..:''558O8O8V8VVt..993?@@E%4;R;R;Y;Y2Z[QU1#Q<([[%,,qr:JK5;;,KK \Ks   %D1D6c                 \    |j                   j                  | j                  j                  z   S r^   )r  r  r-   r  r  s     r"   r  zPegasusConverter.unk_id  s%    !!((4+B+B+I+IIIr$   c                     t        || j                        }t        j                  t        j                         t        j
                  ||      g      S rG  )r/   r-   r
   r   WhitespaceSplitrJ  rK  s       r"   r   zPegasusConverter.pre_tokenizer  sJ    ,-=t?V?VW&&..0(([Q_`
 	
r$   c                     | j                   j                  }|| j                   j                  fg}t        j                  d|gdd|g|      S )N$A$Br   )r-   r  eos_token_idr   r   )rV   eosr   s      r"   r   zPegasusConverter.post_processor  sR    %%//$))667
 ,,T3KtTSVFWhvwwr$   N)r_   r`   ra   r5   r  r   r   r3   r$   r"   r  r    s    &J
xr$   r  c                       e Zd Zd Zd Zy)T5Converterc                     | j                   j                  }|j                  D cg c]  }|j                  |j                  f }}|t        |dz
  dd      D cg c]
  }d| ddf c}z  }|S c c}w c c}w )Nr   ro   z
<extra_id_r  rx  )r-   
_extra_idsr  rm   r  rB   )rV   r  num_extra_idsrm   r5   r4  s         r"   r5   zT5Converter.vocab(  sw    //::9>F%++u{{+FFE-!:KRQS4TUqZs!$c*UU GUs   A/A4c                 r    t        j                  ddgg dd| j                  j                  d      fg      S Nr  rs  )r  rs  r  rs  r   rf  ry   s    r"   r   zT5Converter.post_processor.  =    ,,&>-00FFvNO
 	
r$   N)r_   r`   ra   r5   r   r3   r$   r"   r  r  '  s    
r$   r  c                       e Zd Zd Zy)UdopConverterc                 r    t        j                  ddgg dd| j                  j                  d      fg      S r#  rf  ry   s    r"   r   zUdopConverter.post_processor9  r$  r$   Nr_   r`   ra   r   r3   r$   r"   r&  r&  8  s    
r$   r&  c                       e Zd ZdefdZy)WhisperConverterr&   c           
         | j                   j                  }t        | j                   j                  j	                               }t        t        ||d ddd            }t        j                  | j                   j                        |_
        t        j                         |_        | j                   j                  }| j                   j                  |      }| j                   j                  }| j                   j                   }dj#                  |D cg c]  }| d	 c}      }	t%        j&                  |	 d| d|	 d| d	||fgt)        ||      
      |_        |S c c}w )Nr   Fr   r   r   r   z $A:0 z $A:0 $B:1 r   r   )r-   r   rf   r   r   r   r   r
   r   r%   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr   r   zipr   )
rV   r5   rG   r   prefix_token_idsprefixesr  r  r:  prefix_templates
             r"   rz   zWhisperConverter.convertedD  sR   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GUugRL#GH#-#@#@%&fSE4#$KuB7l#X/0$
	   $Hs   ENr   r3   r$   r"   r*  r*  C  s     9  r$   r*  c                       e Zd Zd Zy)BigBirdConverterc           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S re  rf  ry   s    r"   r   zBigBirdConverter.post_processorh  rg  r$   Nr(  r3   r$   r"   r4  r4  g  s    
r$   r4  c                       e Zd ZdefdZy)CLIPConverterr&   c                 p   | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d dddt        |                  }t        j                  t        j                         t        j                  t        d      d      t        j                         g      |_        t!        j                  t!        j"                  t        d      dd	
      t!        j$                  d      g      |_        t)        j$                         |_        t-        j.                  | j                   j0                  | j                   j2                  f| j                   j4                  | j                   j6                  fdd      |_        |S )Nr   r   Fr5   rG   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r-   r   rf   r   r   r   r   r   rc   r	   r   r   rB  r   r`  r   r
   r   r   r   r   r   r   r   r  r  r   r   r   r   s        r"   rz   zCLIPConverter.convertedt  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r$   Nr   r3   r$   r"   r7  r7  s  s    '9 'r$   r7  c                       e Zd ZdefdZy)LayoutLMv2Converterr&   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr~   FTr   r   r   r   r   r   r   r   r   r   r   s
             r"   rz   zLayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r$   Nr   r3   r$   r"   r<  r<    r   r$   r<  c                       e Zd ZdefdZy)BlenderbotConverterr&   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  d|j                   d|j                  |j                   fg      |_        |S )Nr   Fr   r   z$A:0 r   )r   r   )r-   r   rf   r   r   r   r   r
   r   r%   r   r   r   r   r   r  r  r   r   s        r"   rz   zBlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r$   Nr   r3   r$   r"   r?  r?    r   r$   r?  c                       e Zd Zd Zd Zd Zy)XGLMConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|S c c}w )Nr  r#  ))z<madeupword0>rx  )z<madeupword1>rx  )z<madeupword2>rx  )z<madeupword3>rx  )z<madeupword4>rx  )z<madeupword5>rx  )z<madeupword6>rx  r  r}  s       r"   r5   zXGLMConverter.vocab  sT    
 	%,,qr:JK5;;,KK  z  	z Ls   Ac                 
    d}|S rm  r3   rn  s      r"   r  zXGLMConverter.unk_id  ro  r$   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz</s> $Az</s> $A </s> </s> $Brr  rs  r   rf  ry   s    r"   r   zXGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r$   Nr  r3   r$   r"   rB  rB    r  r$   rB  c                   <    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zy	)
GemmaConverterTz<start_of_turn>z<end_of_turn>c                 .    t        j                  dd      S Nr   r>  )r	   rB  r  s     r"   r   zGemmaConverter.normalizer  s    ""3..r$   c                 t   | j                   j                  df| j                   j                  df| j                   j                  dfg}||j                  dd  D cg c]  }|j
                  |j                  f c}z  }t        d |D              s#t        d t        |      D        d       }|d||<   |S c c}w )Nrx  r#  c              3   ,   K   | ]  }|d    dk(    yw)r   rj   Nr3   ).0r4   s     r"   	<genexpr>z'GemmaConverter.vocab.<locals>.<genexpr>  s     /A1Q44</s   c              3   8   K   | ]  \  }}|d    dk(  s|  yw)r   rk   Nr3   )rL  r4  r4   s      r"   rM  z'GemmaConverter.vocab.<locals>.<genexpr>  s     "VAQqTXEU1"Vs   )rj   rx  )
r-   r  r  r   r  rm   r  anynextr.  )rV   r  r5   rm   override_indexs        r"   r5   zGemmaConverter.vocab  s    $$..4$$..4$$..4

 	%,,qr:JK5;;,KK ///!"V51A"VX\]N)(3n% Ls   B5c                 .    t        j                  dd      S )Nr   merged_with_previous)r
   r   rV   rI  r%   s      r"   r   zGemmaConverter.pre_tokenizer   s    ##C)?@@r$   c                 
    d}|S rm  r3   rn  s      r"   r  zGemmaConverter.unk_id#  ro  r$   c                     t        j                  t        j                  dd      t        j                         t        j                         g      S )Nr>  r   )r   r   rB  ByteFallbackFuserT  s      r"   r   zGemmaConverter.decoder'  s?        ,%%'
 	
r$   N)r_   r`   ra   r  rh   r-  r   r   r5   r   r  r   r3   r$   r"   rG  rG    s6    .L'9N/ A
r$   rG  c                   4    e Zd ZdZd Zd Zd Zd Zd Zd Z	y)	LlamaConverterTc                 (   | j                   j                  d      df| j                   j                  d      df| j                   j                  d      dfg}||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr   rx  r   r:   r#  )r-   r-  r  rm   r  r}  s       r"   r5   zLlamaConverter.vocab4  s    $$::1=sC$$::1=sC$$::1=sC

 	%,,qr:JK5;;,KK Ls   )Bc                 
    d}|S r&  r3   rn  s      r"   r  zLlamaConverter.unk_id=  ro  r$   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S Nr>  r   r   )contentr<  r   rB  rW  rX  rA  r   rV   rI  r%   sequences       r"   r   zLlamaConverter.decoderA  \    UC(!!#MMO

 !<==H  **r$   c                     t        | j                  dd      rcg }t        | j                  dd      r|t        j                  d      gz  }|t        j                  dd      gz  }t        j
                  |      S y )Nr)   Tr%   r>  )prependr   )patternr_  )r,   r-   r	   PrependrB  r   )rV   r  rb  s      r"   r   zLlamaConverter.normalizerK  sr    4**Hd;Ht..0BDI[00?@@,,S%HIIH''11r$   c                     t        | j                  dd      s.t        || j                        }t        j                  ||d      S y )Nr)   TFrI  r.   split)r,   r-   r/   r
   rJ  rK  s       r"   r   zLlamaConverter.pre_tokenizerT  sA    t..$?01A4CZCZ[N!++Tbjoppr$   c                      y r^   r3   ry   s    r"   r   zLlamaConverter.post_processorZ  s    r$   N)
r_   r`   ra   r  r5   r  r   r   r   r   r3   r$   r"   rZ  rZ  1  s&    +r$   rZ  c                       e Zd ZdefdZy)MarkupLMConverterr&   c                    | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd| j                   j                              }t        j                  |j                        |_        t        j                         |_        t        | j                   j                        }t        | j                   j                         }| j                   j"                  }| j                   j$                  }t'        j(                  | d| | d| d| ||f||fg      |_        |S )Nr   Fr9  r   z $A z $B r   )r-   r   rf   r   r   r   r   r   r
   r   r%   r   r   r   rc   r   r   r   r   r   r   r   )	rV   r   r5   rG   r   r   r   r   r   s	            r"   rz   zMarkupLMConverter.converted`  s,   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+l#l#$
	  r$   Nr   r3   r$   r"   rm  rm  _  s    "9 "r$   rm  c                   *    e Zd ZdZddZd Zd Zd Zy)MoshiConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wNr   r  	r   ru   rW   r#   r	  r
  r  r  r  )rV   r  model_max_lengthkwargsr  r  r  s          r"   rW   zMoshiConverter.__init__  sr    $
+4, $%	  "*d# 	(qaffh'	(
	(
   	 A99B	c                     |j                   j                  }t        j                  dd      g}|st        j                  |      S t        j                  t        j
                  |      g|z         S rI  )r?  r@  r	   rB  r   rC  rD  s       r"   r   zMoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr$   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S r^  r`  ra  s       r"   r   zMoshiConverter.decoder  rc  r$   c                 6    d}t        j                  ||d      S )Nr*   Fri  )r
   rJ  rK  s       r"   r   zMoshiConverter.pre_tokenizer  s     ''KP^fkllr$   r^   )r_   r`   ra   r  rW   r   r   r   r3   r$   r"   rp  rp    s    h+mr$   rp  c                   B    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 Zy)HeliumConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wrr  rs  )rV   r  r  r  r  r  s         r"   rW   zHeliumConverter.__init__  sp    $
+4,#%	  "*d# 	(qaffh'	(
	(
rv  c                 V   | j                  |      }t        t        || j                  |      | j                              }t        |j                        D cg c]I  \  }}|j                  dv r6||j                  |j                  dk(  xs |j                  | j                  v fK }}}|j                  t        |d       D cg c]  \  }}}t        |d|d       c}}}       |j                  t        d	dd
      g       |j                  dd       |S c c}}w c c}}}w )Nr!  r"  r#  c                     | d   S r&  r3   r'  s    r"   r6   z+HeliumConverter.tokenizer.<locals>.<lambda>  r(  r$   r7   FT)r*  r+  single_word
r)  rz  )r  pad_id)r5   r   r   r  r  r.  r  r1  rm   r   r2  rD   r   enable_padding)	rV   r  rF   r   r7  r8  r9  r:  r+  s	            r"   r   zHeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGQUV	
 	j%OPQ  71 =
s   ADD$c                     g }|j                   D ]@  }|j                  dk(  r|d|j                  fgz  }%||j                  |j                  fgz  }B |S )Nz<0x0A>r  r  r}  s       r"   r5   zHeliumConverter.vocab  s]    \\ 	6E{{h&4-..5;;455		6
 r$   c                 
    d}|S r&  r3   rn  s      r"   r  zHeliumConverter.unk_id  ro  r$   c                     t        j                  dd      t        j                         t        j                         g}|t        j                  dd      gz  }t        j
                  |      S r^  r`  ra  s       r"   r   zHeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **r$   c                 ~    t        j                  t        j                  d      t        j                  dd      g      S rI  )r	   r   rg  rB  r  s     r"   r   zHeliumConverter.normalizer  s2    ##[%8%8%={?R?RSWY^?_$`aar$   c                 V    t        j                  t        j                  dd      g      S )Nr  
contiguous)r
   r   r   rT  s      r"   r   zHeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRr$   c                 <    t        j                  ddgg ddg      S )Nrr  r  )rr  r  rr  r  )rr  r   r   )r   r   ry   s    r"   r   zHeliumConverter.post_processor  s/    ,, 
 	
r$   r^   )r_   r`   ra   r  rW   r   r5   r  r   r   r   r   r3   r$   r"   r{  r{    s2    
8+bS
r$   r{  c            	         t        t        t        d      t        d      dz               t        t        t        d      t        d      dz               z   t        t        t        d      t        d      dz               z   } | dd }d	}t        d
      D ]1  }|| vs| j                  |       |j                  d
|z          |dz  }3 |D cg c]  }t	        |       }}t        t        | |            S c c}w )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      )rf   rB   ordrC   chrr@   r/  )bscsnbs       r"   bytes_to_unicoder    s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[ B;IIaLIIdQhFA	
 	Q#a&	B	B 
s   C4c                   J     e Zd ZdZ	 	 	 	 d fd	ZdefdZd ZdefdZ	 xZ
S )	TikTokenConverterz'
    A general tiktoken converter.
    c                     t        |   |  || _        || _        || _        t        |t              r|j                         | _        y || _        y r^   )	r  rW   r  rf  r%   
isinstancer@   r   additional_special_tokens)rV   r  rf  r%   r  r  ru  r  s          r"   rW   zTikTokenConverter.__init__%  sX     	$$ 0 3T: &**, 	& + 	&r$   tiktoken_urlc                 0   	 ddl m}  ||      t	               fd}g }i }j                         D ]  \  }}|| ||      <   t        |      dk(  r g }t        dt        |            D ]2  }	|d |	 ||	d  }}
|
v s|v s|
|z   v s|j                  |
||f       4 t        |fdd      }|j                  |        t        |d	 d      }|D cg c]  } ||d          ||d         f }}||fS # t        $ r t        d      w xY wc c}w )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w )Nr   zlatin-1)r.  decoder  )r  charbyte_encoders     r"   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringC  s2    77@STLT3TUUTs   <r   c                 $    | d      | d      fS r2   r3   )r4   r   s    r"   r6   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Q  s    1Q4)AaD/0R r$   Fr>   c                     | d   S )Nr:   r3   r<   s    r"   r6   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>S  s
    A r$   )tiktoken.loadr  r0  
ValueErrorr  rA   r;   rB   rC   rD   rE   )rV   r  r  r  rG   r5   r:  rankrJ   rK   rL   rM   r=   r   r  s                @@r"   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model8  sY   	7 &l3	')	V $??, 
	!KE426E'./5zQEq#e*- ;#(%=%-i'Gy,@gPWFW\eEeLL'7D!9:; 5&R\abEMM% 
	! $6F\bcUX(Q02GA2OPccf}5  	k 	2 ds   C; D;Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S )NF)r   ignore_mergesT)r  r  r   r   r   rQ   r  )rV   rF   rG   r   s       r"   r   zTikTokenConverter.tokenizerW  sN    #CCDOOTfc,GH	9??O4,0IOO)r$   r&   c           
         | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                  D cg c]  }t        |dd       c}       t        j                  d      |_        |S c c}w )Nr   Fr   r   Tr)  r   )r   r
   r   r   r   rf  r   r%   r   r   r   r   r  r   r   r   )rV   r   r:  s      r"   rz   zTikTokenConverter.converted^  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$LPLjLjk5Z%>k	
 $.#7#7U#K	  ls   'C )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)r_   r`   ra   rb   rW   rc   r  r   r   rz   rO  rP  s   @r"   r  r     s;      K"&
&C >9 r$   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 v   | j                   j                  }|t        v r!|st        |   } ||       j                         S 	 t        j                  d       t        | j                  | j                        j                         S # t        $ r* t        dt        t        j                                      w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r  r_   SLOW_TO_FAST_CONVERTERSrz   loggerinfor  r  r  r0  r  rf   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r"   convert_slow_tokenizerr    s      1::CC66}12FG45??AA	KK23$0;;*?*Y*Y ik  	>>BCZC_C_Ca>b=ce 	s   AB 3B8)r   )F)Prb   r  typingr   	packagingr   
tokenizersr   r   r   r   r	   r
   r   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr_   r  r#   boolrc   r/   rN   rP   rh   rs   ru   r|   r   r   r   r   r   r   r   r   r   r   r  rR  rk  rv  r  r  r  r  r  r  r  r  r  r
  r  r  r&  r*  r4  r7  r<  r?  rB  rG  rZ  rm  rp  r{  r  r  r  r  r3   r$   r"   <module>r     sj      f f f 5 5 ` ` 5 
		H	%G"$ s & 2"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6%I %Py >+Y +\y :$	 $Ny >~9 ~B"
l "
J
| 
 
 
:
 
B2
\ 2
j
| 
6
L 
2
< 
2
, 
6"
\ "
J	 	
| 
@	l 	%x| %xP
, 
"
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
t0N Nb::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #:$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E:F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng:h +"$($#s: z!) !r$   