import os
from shutil import copyfile
from typing import Optional

from tokenizers import processors

from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_mbart import MBartTokenizer
else:
    MBartTokenizer = None


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX",
    "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN",
    "zh_CN",
]


class MBartTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizerFast

    >>> tokenizer = MBartTokenizerFast.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = MBartTokenizer

    prefix_tokens: list[int] = []
    suffix_tokens: list[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        src_lang=None,
        tgt_lang=None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self.lang_code_to_id = {
            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
        }

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. The special tokens depend on the language set via `set_src_lang_special_tokens`
        or `set_tgt_lang_special_tokens`.

        An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
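    # Illustrative layout (assumed symbolic ids; the concrete integers depend on the checkpoint's
    # vocabulary): with src_lang="en_XX", prefix_tokens == [] and suffix_tokens == [eos_token_id, en_XX_id], so
    #   build_inputs_with_special_tokens([x1, x2])       -> [x1, x2, eos_token_id, en_XX_id]
    #   build_inputs_with_special_tokens([x1], [y1, y2]) -> [x1, y1, y2, eos_token_id, en_XX_id]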

    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.

        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def prepare_seq2seq_batch(
        self,
        src_texts: list[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[list[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["MBartTokenizerFast"]
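# Usage sketch (mirrors the example in the class docstring; exact token ids depend on the checkpoint):
#
#     tok = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
#     batch = tok("UN Chief Says ...", text_target="Seful ONU declara ...", return_tensors="pt")
#     # batch["input_ids"] ends with [</s>, en_XX]  (source side, via set_src_lang_special_tokens)
#     # batch["labels"] ends with [</s>, ro_RO]     (target side, via set_tgt_lang_special_tokens)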