
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Optional, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace `tokenizers` library) can be saved in a single file.
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers keep their added tokens in an extra file.
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
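            A minimal illustration (the `tokenizer.json` path below is only a placeholder) of how either
            argument can be used to build the same backend:

            ```python
            >>> from tokenizers import Tokenizer
            >>> from transformers import PreTrainedTokenizerFast

            >>> backend = Tokenizer.from_file("tokenizer.json")
            >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend)
            >>> # or, equivalently, let the wrapper load the serialized file itself:
            >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
            ```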
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
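
    Example (an illustrative sketch; `"bert-base-uncased"` is only a placeholder for any checkpoint that ships a
    `tokenizer.json`):

    ```python
    >>> from transformers import AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    >>> tokenizer.is_fast
    True
    >>> # offset mappings are only available on fast (Rust-backed) tokenizers
    >>> encoding = tokenizer("Hello world", return_offsets_mapping=True)
    ```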
    Nslow_tokenizer_classc           	      >   |j                  dd       }|j                  dd       }|j                  dd       }|j                  dd       }|j                  dd      }|j                  di       }|j                  dd      | _        |r|| j                  t	        d	      |t        j                  |      }	n||st        j                  |      }	n|rt        |      }	n|lt        |j                  d
            }
|
d   d   }|
d   }|
d   }t        ||      \  }	}|j                  |       t        |      dkD  r|j                  |       nx| j                  "|dur | j                  |i |}t        |      }	nJ|s=|j                  d
      | _        |j                  dg       | _        t        | d      }	d }nt	        d      |	| _        ||j                  |j"                         d| _        | j                   j&                  }|q | j                   j(                  d$i | |j+                  d|d          |j+                  d|d          |j+                  d|d          |j+                  d|d          n| j                   j-                          | j                   j.                  }| | j                   j0                  d$i | |j+                  d|d          |j+                  d|d          |j+                  d|d          |j+                  d|d          |j+                  d|d          t3        | h  d$i | | j6                  | j                   _        | j:                  D ch c]  }t=        t?        |             }}tA        |jC                         d  !      D cg c]  \  }}t=        t?        |            |vr| }}}tE        | jF                  jI                               |D cg c]  }tK        |       c}z   }|| jL                  D cg c]  }||vs||vs| c}z  }t        |      dkD  rg }| jN                  }|D ]p  }tQ        |tR              r|jT                  xs tK        |      |v ntK        |      |v }tQ        |tJ              rtS        ||"      }n||_*        |jW                  |       r |r| jY                  |       	 t[        j\                  | j^                  j`                  jc                               }|j                  d| j                        | j                  k7  rFte        tf        |j                  d#            }| j                  |d<    |d$i || j^                  _0        y y c c}w c c}}w c c}w c c}w # th        $ r Y y w xY w)%Ntokenizer_object__slow_tokenizer	gguf_filer%   	from_slowFadded_tokens_decoderadd_prefix_spacezCannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.r&   config
model_type	tokenizertokenizer_configr   additional_special_tokensT)from_tiktokena9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.
max_lengthtruncation_side	directionstridetruncation_strategystrategy	pad_tokenpad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofc                     | d   S Nr    )xs    w/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/tokenization_utils_fast.py<lambda>z2PreTrainedTokenizerFast.__init__.<locals>.<lambda>   s    STUVSW     key)specialtyperE   )5popgetr0   r)   
ValueErrorcopydeepcopyTokenizerFast	from_filer   r   r   updatelenr&   r5   
_tokenizerinit_kwargs_decode_use_source_tokenizer
truncationenable_truncation
setdefaultno_truncationpaddingenable_paddingsuper__init__split_special_tokensencode_special_tokensr/   hashreprsorteditemslistadded_tokens_encoderkeysstrall_special_tokens_extendedall_special_tokens
isinstancer   rL   append
add_tokensjsonloadsbackend_tokenizerpre_tokenizer__getstate__getattrpre_tokenizers_fast	Exception)selfargskwargsr+   slow_tokenizerr-   fast_tokenizer_filer.   r/   fast_tokenizer
gguf_paramarchitecturetokenizer_dictr4   additional_kwargs_truncation_paddingtokenadded_tokens_decoder_hashindextokens_to_addencodertokensspecial_tokens
is_specialpre_tok_statepre_tok_class	__class__s                              rG   ra   z PreTrainedTokenizerFast.__init__b   sv   !::&8$?$6=JJ{D1	$jj)94@JJ{E2	%zz*@"E &

+=u E/D4M4M4U0 
 '!]]+;<N ,Y*445HIN3NCN"-fjj.FGJ%h/=L'4N)*<=0F|Uc0d-N-MM*+$%)/0&&2~U7R6T66GGN3NCN$jj6DO-3ZZ8SUW-XD*3DMN!Nr  )%MM.445,1)oo00"-DOO--<<lK,EF/[1IJhH(=>3[5LMOO))+??***DOO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS 	"6"040I0I-DHD]D]$^5T$u+%6$^!$^ !'';'A'A'C X
uDK (AA 
 

 t005578Ta;b5CJ;bb#??
5PWCW\aiv\vE
 	
 }!F!44N& 
% "%4 ]]Bc%jN&BU~5 
 eS)&ujAE$.EMe$
% '
	 JJt'='='K'K'X'X'Z[M  !3T5J5JKtOdOdd '(;]=N=Nv=V W484I4I017D7U}7U&&4 e? %_

 <c
6  	 		s7   *U;&"V 1V	V!V&VB%V 	VVreturnc                      y)NTrE   ry   s    rG   is_fastzPreTrainedTokenizerFast.is_fast   s    rI   c                     d| j                   v r`| j                   d   j                  d      rBt        | d      r5| j                  r)t        j
                  j                  | j                        S yy)z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r&   z.modelFT)vocab_files_namesendswithhasattrr&   ospathisfiler   s    rG   can_save_slow_tokenizerz/PreTrainedTokenizerFast.can_save_slow_tokenizer   sX     4111d6L6L\6Z6c6cdl6mt\*tww~~doo66rI   c                 :    | j                   j                  d      S )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensrW   get_vocab_sizer   s    rG   
vocab_sizez"PreTrainedTokenizerFast.vocab_size   s    
 ---FFrI   c                 :    | j                   j                  d      S )NTr   )rW   	get_vocabr   s    rG   r   z!PreTrainedTokenizerFast.get_vocab   s    ((4(@@rI   c                 "    | j                         S N)r   r   s    rG   vocabzPreTrainedTokenizerFast.vocab   s    ~~rI   c                     t        | j                  j                         d       D ci c]  \  }}|j                  | c}}S c c}}w )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                     | d   S rD   rE   items    rG   rH   z>PreTrainedTokenizerFast.added_tokens_encoder.<locals>.<lambda>      dhijdk rI   rJ   rf   r/   rg   contentry   vks      rG   ri   z,PreTrainedTokenizerFast.added_tokens_encoder   s;     *00I0I0O0O0QWk)lmA		1mmm   Ac                 6    | j                   j                         S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )rW   get_added_tokens_decoderr   s    rG   r/   z,PreTrainedTokenizerFast.added_tokens_decoder  s     7799rI   c                     t        | j                  j                         d       D ci c]  \  }}|j                  | c}}S c c}}w )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                     | d   S rD   rE   r   s    rG   rH   z9PreTrainedTokenizerFast.get_added_vocab.<locals>.<lambda>  r   rI   rJ   r   r   s      rG   get_added_vocabz'PreTrainedTokenizerFast.get_added_vocab  s;     *00I0I0O0O0QWk)lmA		1mmmr   c                      y)zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        TrE   r   s    rG   __bool__z PreTrainedTokenizerFast.__bool__  s     rI   c                 :    | j                   j                  d      S )zD
        Size of the full vocabulary with the added tokens.
        Tr   r   r   s    rG   __len__zPreTrainedTokenizerFast.__len__  s     ---EErI   c                     | j                   S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )rW   r   s    rG   rs   z)PreTrainedTokenizerFast.backend_tokenizer%  s    
 rI   c                 .    | j                   j                  S )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )rW   decoderr   s    rG   r   zPreTrainedTokenizerFast.decoder,  s    
 &&&rI   FTencodingreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 J   |d| j                   v }|d| j                   v }|r|j                  |g|j                  z   }	n|g}	t        t              }
|	D ]  }|
d   j	                  |j
                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |s|
d   j	                  t        |j
                                |
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        token_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingrA   )model_input_namesoverflowingr   rh   ro   idstype_idsr   r   offsetsrV   )ry   r   r   r   r   r   r   r   r   	encodingsencoding_dictes               rG   _convert_encodingz)PreTrainedTokenizerFast._convert_encoding3  s$   ( !($48N8N$N! ($48N8N$N!$)=)=)I!
X%9%99I!
I#D) 	;A+&--aee4$./66qzzB$./66q7G7GH)34;;A<Q<QR%./66qyyAh'..s155z:	; i''rI   r   c                     t        |t              r| j                  |      S |D cg c]  }| j                  |       c}S c c}w )aX  
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `list[int]`: The token id or list of token ids.
        )rn   rk   #_convert_token_to_id_with_added_voc)ry   r   r   s      rG   convert_tokens_to_idsz-PreTrainedTokenizerFast.convert_tokens_to_idsb  s>     fc";;FCCMSTE88?TTTs   Ar   c                 X    | j                   j                  |      }|| j                  S |S r   )rW   token_to_idunk_token_id)ry   r   r   s      rG   r   z;PreTrainedTokenizerFast._convert_token_to_id_with_added_vocr  s,    ++E2=$$$rI   r   c                 J    | j                   j                  t        |            S r   )rW   id_to_tokenint)ry   r   s     rG   _convert_id_to_tokenz,PreTrainedTokenizerFast._convert_id_to_tokenx  s    **3u:66rI   
new_tokensc                 r    |r| j                   j                  |      S | j                   j                  |      S r   )rW   add_special_tokensrp   )ry   r   r   s      rG   _add_tokensz#PreTrainedTokenizerFast._add_tokens{  s/    ??55jAA))*55rI   pairc                 8    | j                   j                  |      S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )rW   num_special_tokens_to_add)ry   r   s     rG   r   z1PreTrainedTokenizerFast.num_special_tokens_to_add  s    & 88>>rI   r   skip_special_tokensc                 $   t        |t              r| j                  j                  |      S g }|rt	        | j
                        n	t	               }|D ]<  }t        |      }||v r|j                  | j                  j                  |             > |S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )rn   r   rW   r   setall_special_idsro   )ry   r   r   r   ids_to_skipr   s         rG   convert_ids_to_tokensz-PreTrainedTokenizerFast.convert_ids_to_tokens  s      c3??..s333Fc$../CE 	>EJE#MM$//55e<=		>
 rI   textr   c                 J     | j                   d|||d|j                         S )N)r   	text_pairr   rE   )encode_plusr   )ry   r   r   r   r{   s        rG   tokenizez PreTrainedTokenizerFast.tokenize  s,    tkTTN`kdjkrrttrI   padding_strategyr;   r7   r:   rB   r@   c                    | j                   j                  }| j                   j                  }|t        j                  k(  r||| j                   j                          na|||j                  | j                  d}	|d}
n |	D ci c]  }||j                  |d       }
}|
|	k7  r | j                   j                  di |	 |t        j                  k(  r|| j                   j                          yy|t        j                  k(  r|nd}|||n| j                  | j                  | j                   | j"                  |d}	||	k7  r | j                   j$                  di |	 yyc c}w )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r7   r:   r<   r9   )rA   r9   pad_idr=   r?   rB   rE   )rW   rZ   r^   r   DO_NOT_TRUNCATEr]   valuer8   rO   r[   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr@   pad_token_idr=   r>   r_   )ry   r   r;   r7   r:   rB   r@   r   r   targetcurrentr   rA   s                rG   set_truncation_and_paddingz2PreTrainedTokenizerFast.set_truncation_and_padding  sZ   B oo00??**"4"D"DD&--/ ) /55!11	F "@FG11kooa66GG& 111;F;999#**, $ $47Q7Q#QZW[F -9-E\4K\K\++!^^#55&8F 6!...88 "% Hs   Er   batch_text_or_text_pairsis_split_into_wordsreturn_tensorsrb   c                    t        |t        t        f      st        dt	        |       d      | j                  ||||||	       | j                  j                  |k7  r|| j                  _        | j                  j                  |||      }|D cg c]  }| j                  ||||||||       }}i }|d   d   D ]'  }|D cg c]  \  }}||   D ]  }|  }}}}|||<   ) |D cg c]  \  }}|D ]  }|  }}}}|r2g }t        |      D ]  \  }\  }}||gt        |d         z  z  } ||d<   |d   D ]  } | j                  | ||        t        |||
	      S c c}w c c}}}w c c}}}w )
Nz:batch_text_or_text_pairs has to be a list or a tuple (got ))r   r;   r7   r:   rB   r@   )r   is_pretokenized)r   r   r   r   r   r   r   r   r   r   overflow_to_sample_mapping)tensor_type)rn   tuplerh   	TypeErrorrM   r   rW   rc   encode_batchr   	enumeraterV   &_eventual_warn_about_too_long_sequencer   )!ry   r   r   r   r;   r7   r:   r   rB   r@   r   r   r   r   r   r   r   r   rb   r   r   tokens_and_encodingssanitized_tokensrK   r   _r   stacksanitized_encodingsr  itoksr   s!                                    rG   _batch_encode_plusz*PreTrainedTokenizerFast._batch_encode_plus   s   . 2UDMBLTRjMkLllmn 
 	''- 3!1% 	( 	
 ??004HH4HDOO1OO00$1/ 1 
	. & 
  ""!&;&;*C+E'=+ # 	 
  
( '*1- 	*C&:NN74DINqQNQNEN$)S!	* 1ESSWQdSqSqSS %)+& )*> ? K9D!*qcC[8I4J.JJ*K=W9:)+6 	XI77	:wW	X-/BP^__I 
, OSs    E$E)
.E0r   c                    |r||fgn|g} | j                   |fi d|d|d|d|d|d|d|	d|
d	|d
|d|d|d|d|d|d|d||}|`|s^t        |j                         D ci c].  \  }}|t        |      dkD  rt	        |d   t
              r|d   n|0 c}}|j                        }| j                  |d   ||       |S c c}}w )Nr   r   r   r;   r7   r:   rB   r@   r   r   r   r   r   r   r   r   rb   r   r   )r  r   rg   rV   rn   rh   r   r	  )ry   r   r   r   r   r;   r7   r:   r   rB   r@   r   r   r   r   r   r   r   r   rb   r{   batched_inputbatched_outputrK   r   s                            rG   _encode_plusz$PreTrainedTokenizerFast._encode_plus[  sg   . 09$	*+tf000
 3
  2
 .	

 !4
 "
 
  2
 &
 *
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ "6'
0 !*C* '5&:&:&<"U c%j1nE!Hd9S%(Y^^ ((N 	33N;4OQ[]des   -3C
c                     | j                   j                  %| j                   j                  j                  |      S dj                  |      S )N )rs   r   decodejoin)ry   r   s     rG   convert_tokens_to_stringz0PreTrainedTokenizerFast.convert_tokens_to_string  sJ     %%--9 ""**11&9	
 &!	
rI   	token_idsclean_up_tokenization_spacesc                     |j                  dd      | _        t        |t              r|g}| j                  j                  ||      }||n| j                  }|r| j                  |      }|S |S )Nuse_source_tokenizerF)r   )rN   rY   rn   r   rW   r  r  clean_up_tokenization)ry   r  r   r  r{   r   
clean_texts          rG   _decodezPreTrainedTokenizerFast._decode  s~     -3JJ7Mu,U)i%"I%%iEX%Y ,7 )22 	%
 (33D9JKrI   save_directory
file_nameslegacy_formatfilename_prefixc                    t        |      }| j                  |du rt        d      |du xs |du xr | j                  duxr | j                  }|du xs |du }|rt        j
                  j                  ||r|dz   ndt        z         }| j                  j                         D 	ci c]  \  }}	|	| j                  k\  s||	 }
}}	|
rDt        |dd	      5 }t        j                  |
d
dd      dz   }|j                  |       ddd       | j                  ||      }||z   |fz   }|rOt        j
                  j                  ||r|dz   ndt         z         }| j"                  j%                  |       ||fz   }|S c c}	}w # 1 sw Y   ~xY w)z
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        NTzYour tokenizer does not have a legacy version defined and therefore cannot register this version. You might consider leaving the legacy_format at `None` or setting it to `False`.F- wzutf-8)r      )indent	sort_keysensure_ascii
)r%  )rk   r)   rP   r   r   r   r  ADDED_TOKENS_FILEri   rg   r   openrq   dumpswritesave_vocabularyTOKENIZER_FILErs   save)ry   r"  r#  r$  r%  	save_slow	save_fastadded_tokens_filetokr   added_vocabfout_strvocab_filesr%   s                  rG   _save_pretrainedz(PreTrainedTokenizerFast._save_pretrained  s    ^,$$,$1F`  d";mt&; -))5-,, 	
 "T)C]e-C	 "/3!6rUf f! 9=8Q8Q8W8W8Yv*#u]bfjfufu]u3:vKv+S7C %q"jjQ$]bcfjjGGGG$% ..~._K#k15F4HHJWW\\/3!6rUc cN ""''7#~&77J! w% %s   &E;>E;.FF
c           	      	   t        j                  | j                  j                               }|j	                  d      }|j	                  d      }	d}
|d   d   dk(  ri |d   d<   g |d   d<   np|d   d   d	k(  r=|d   d
   ]|d   d
   }|d   d   |   d   }
|	|
|v r||
   }
d|d   d
<   |
dgg|d   d<   n(|d   d   dv r	i |d   d<   nt        d|d   d    d      |"d|d   v r|d   d   |v r||d   d      |d   d<   t        j                  t        j                  |            }g }|D ]b  }|j	                  dd      }|j	                  dd      }|d   d   d	k7  r|s5||d   |v r||d      |d<   |j                  t        d'i |       d ||j                  |       |d   d   dk(  rd|vr|d   d   |d   d   |d<   |d   d   dk(  rd|vr|d   d   |d   d   |d<   |d   d   d	k(  r|
|
|d<   |d   V|d   d   dk(  s*|d   d   dk(  r@d|d   v r9t        d |d   d   D              r!t        j                  j                         |d<   t         |d   d      } |d'||d|}|j#                  |||       |	&t        j                  |j                               }d|	v r|	d   D ]  }|	d   |   d   }||D cg c]  }|j%                  ||       }}||	d   |   d<   |D ]   }|j'                  |      }|t        d        |D cg c]  }|j'                  |       c}|	d   |   d!<    d"D ]?  }||	v s|	|   \  }}|	||v r||   }|j'                  |      }|t        d       ||g|	|<   A |	|d<   t        j                  t        j                  |            }| j(                  j+                         }t,        j.                  j+                         }|j1                  d#       |D ]  }t3        | |      t3        | |      }|	||v r||   }| j4                  j%                  |d      }t7        |t              r=t        ||j8                  |j:                  |j<                  |j>                  d$%      ||<   |||<    | j@                  }||j                  |       tC        |      dkD  r||d#<    | jD                  d'd&|i|S c c}w c c}w )(uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokenspost_processorNmodelrM   r!   r   mergesr"   unk_idr   g        )r#   r$   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.	unk_tokenrL   idr   continuing_subword_prefixend_of_word_suffixrt   	ByteLevelSequencepretokenizersc              3   ,   K   | ]  }|d    dk(    yw)rM   rI  NrE   ).0pretokenizers     rG   	<genexpr>zBPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<genexpr>P  s"      $ !(K7s   initial_alphabet)r   r   )rA   trainerr   r   zQAttempted to set a token in the post processor that does not exist in the mappingr   )clssepr5   T)single_wordlstriprstrip
normalizedrL   r+   rE   )#rq   rr   rW   to_strrN   rP   rS   from_strr1  ro   r   extendanyrw   rI  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorrO   r   rX   rQ   r   SPECIAL_TOKENS_ATTRIBUTESremoverv   _special_tokens_maprn   rT  rU  rV  rW  r5   rV   r   )ry   text_iteratorr   rA   new_special_tokensspecial_tokens_mapr{   tokenizer_jsonr@  rA  rE  rD  r3   r   added_tokenrL   r  trainer_classrQ  trained_tokenizer_jsonrK   r   r   token_idspecial_tokenspecial_tokens_listspecial_token_fullr5   s                               rG   train_new_from_iteratorz/PreTrainedTokenizerFast.train_new_from_iterator  s   D DOO$:$:$<=%)).9'++,<=	'"6*e3/1N7#G,02N7#H-G$V,	9g&x0<'0:*73G<VDQG	%1iCU6U 29 =I45w'15>4D3Ew'0G$V,0JJ/1N7#G,Mn]dNeflNmMn o> >  *~g66w'48JJ3EnU\F]^iFj3kN7#K0!**4::n+EF	 ' 	=K!ooi6Gd+Ag&v.);G!-+i2HL^2^);K	<R)SI&!!*";{";<	= )!!"45 7#F+u4+69w'(CDP2@2IJe2fF./7#F+u4$F2w'(<=I+9'+BCW+XF'('"6*i7I<Q"+F;/*6/7;F!/26:jH#~o'FF (6(G(X 
 .A-J-J-S-S-U)*01H1PQ_:n_X^_%%mFG%T%%)ZZ	0@0@0B%C">1)*:; vC+,<=cB8LF)5TZ![5"4"8"8"F![![FLN#34S9(C!' #,#8#8#?#+", s#  ouCuejIDYDYZ_D`CuN#34S9%@v "0 
F N2-m<HE1)5%CU:U 25 9(44U;H'(o  6;H4EN=1
F 8F"#34%..tzz:P/QRI!!&&(0JJOOQ""#>?( 	2EtU#/ 'e 4%1mGY6Y$6}$EM%)%=%=%A%A%%N"0*=$.%$6$B$B188188#5#@#@ $%F5M %2F5M%	2( %)$B$B!)%,,-?@()A-2KF./t~~CyCFCCq "\ Dvs   SS)NNFFFFT)F)NF)FN)NN)NNN)@__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   r)   r   __annotations__ra   propertyboolr   r   r   r   dictrk   r   r   ri   r   r/   r   r   r   rS   rs   DecoderFastr   EncodingFastr   r  r   rh   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r!  r   PathLiker>  rm  __classcell__)r   s   @rG   r(   r(   Q   s   
 *04-4zx       GC G GA4S> A  tCH~     nd38n n n :d3
?&; : :nc3h n$ F F =   ' ' ' 1504*/+0',#-(-(  (~-(  (~	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-(^UE#x}2D,E U%PSUYZ]U^P^J_ U   7# 7(3- 76d5j+A&B 6]` 6?d ?s ?, GLd3i(?C	sDI~	8uS u uRV umqrumv uI9)I9 0I9 	I9
 I9 %SMI9 smI9` $(,;,F,F2D2T2T$($),0&*(,0404*/+0',#%*+Y`"'OT-0$7H2I4PeKff#
Y`
 !Y` *Y` 0Y` SMY` Y` "Y` %SMY` smY` !Y`  (~Y`  (~Y`  $(!Y`" %)#Y`$ !%%Y`& 'Y`( )Y`* #+Y`, 
-Y`| DH#',;,F,F2D2T2T$($),0&*)-0404*/+0',#%*);I001; E)->">?@; !	;
 *; 0; SM; ; "; %SM; sm; !;  (~;  (~; $(;  %)!;" !%#;$ %;& ';( #);, 
-;z
tCy 
S 
 %*7;	d3i( " '/tn	 
8 )-)-/c2;;.// #J/  ~	/
 "#/ 
s/j rDrI   r(   )=rq  rQ   rq   r   collectionsr   collections.abcr   typingr   r   r   tokenizers.pre_tokenizerspre_tokenizersrw   
tokenizersr   rx  r	   rS   tokenizers.decodersr
   rw  tokenizers.trainersr   r   r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r   r   utilsr   r   r    
get_loggerrn  loggerr4  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEr/  r]  rr  r(   rE   rI   rG   <module>r     s   
   	 # $ ' ' 7 / 1 6 ^ ^ : 5 = 3   @ ? 
		H	% "3 / '  (      !!	  (6EXY  ,-HD5 HD .HDrI   