
    rh1                         d dl mZmZ d dlZddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ  e       rd	d
lmZ  G d ded      Z G d de
d      Z G d ded      Z G d de      ZdgZy)    )OptionalUnionN   )BatchFeature)
ImageInput)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)is_vision_available   )smart_resizec                       e Zd ZU eed<   y)Emu3TextKwargsreturn_for_image_generationN)__name__
__module____qualname__bool__annotations__     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/emu3/processing_emu3.pyr   r       s    !%%r   r   F)totalc                   "    e Zd ZU eed<   eed<   y)Emu3ImagesKwargsratio
image_areaN)r   r   r   strr   intr   r   r   r    r    $   s    JOr   r    c                   8    e Zd ZU eed<   eed<   dddddddZy	)
Emu3ProcessorKwargstext_kwargsimages_kwargsF)r   return_mm_token_type_idsz1:1i  )r!   r"   )r'   r(   N)r   r   r   r   r   r    	_defaultsr   r   r   r&   r&   )   s/    ## ,1(-

  
	Ir   r&   c                        e Zd ZdZddgZdZdZ	 d fd	Z	 	 	 	 ddee	   dee
eeee   ee   f      d	ee   d
efdZddZd Zde	fdZd Zd Zed        Z xZS )Emu3Processora  
    Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single
    processor.

    [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`].
    See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information.

    Args:
        image_processor ([`Emu3ImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`Emu3TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizer)GPT2TokenizerGPT2TokenizerFastEmu3ImageProcessorc                 &   |j                   | _         |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        d| _
        t        | 1  |||       y )N   )chat_template)image_tokenimage_token_id	boi_tokenimage_start_token	eoi_tokenimage_end_tokenimage_wrapper_tokenfake_token_around_image	eof_token	bos_tokendownsample_ratiosuper__init__)selfr-   r.   r4   kwargs	__class__s        r   rA   zEmu3Processor.__init__M   s     %00'66!*!4!4(22'0'D'D$",,",, !)=Qr   imagestextrC   returnc                 |   t        |t              r|g}n.t        |t              st        |d   t              st        d       | j                  t
        fd| j                  j                  i|}|d   j                  dd      }|d   j                  dd	      }|d   j                  d
d	      }	|r|t        d      |s||t        d      i }
| j                   }| j                   | j                   }|s| | j                  |fi |d   }
t        |
j                        }g }|D ]  }| j                   |v rt#        |      }|\  }}|| j$                  z  }|| j$                  z  }||dz   z  }| | d| | j&                   d|z   | }|j)                  | j                   |d      }| j*                   | }| j                   |v r|j-                  |        |D cg c]  }|j)                  d| j                           }}nj|rh| j/                  ||	| j$                        \  }}| | d| | j&                   }|D cg c]  }| j*                   | |  }}||ggt1        |      z  |
d<   |d   j                  dd	      }|d   j                  dd      } | j                  |fi |d   dd	i}| j3                  ||dg       |rUt5        j6                  |d         }t5        j8                  |d         }d||| j:                  k(  <   |j=                         |d<   t?        i ||
|      S c c}w c c}w )a  
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r   zAInvalid input text. Please provide a string, or a list of stringstokenizer_init_kwargsr'   r   Fr(   r!   Nr"   zGYou should not provide `images` when `return_for_image_generation=True`zOYou must provide either text or images when `return_for_image_generation=False`r   *z<placeholder>image_sizesreturn_tensorsr)   image)
modalities	input_idsmm_token_type_ids)datatensor_type) 
isinstancer#   list	TypeError_merge_kwargsr&   r.   init_kwargspop
ValueErrorr8   r=   r:   r-   iterrK   r5   nextr?   r<   replacer>   appendcalculate_generate_sizelen_check_special_mm_tokensnparray
zeros_liker6   tolistr   )rB   rE   rF   audiovideosrC   output_kwargsr   r!   r"   image_featuresimage_start_tokensimage_end_tokensrK   prompt_stringssample
image_sizeheightwidthimage_seq_lengthimage_placeholderimage_promptrL   r)   text_inputs	array_idsrP   s                              r   __call__zEmu3Processor.__call__^   s   T dC 6DD$'
47C0H_``***
"&.."<"<
 

 '4M&B&F&FGdfk&l#o.227DA"?377dK
&6+=fgg*t|noo $ 6 67"nn-d.B.B-CD +v/A1T11&[M/<Z[N~99:KN .&&&0!%k!2J$.MFE#t'<'<<F!T%:%::E'-';$+=*>vhawtOkOkNlm|  @P  nP  mQ  Rb  Qc  )d%#^^D,<,<>OQRSF $/x8F &&&0 %%f-. Ucc&FNN?D4D4DEcDc ) 88
DLaLabMFE01&5'$B^B^A_`LLPQ&t~~&vh|n=QDQ.4e_,=D	,IN=) '}599:JDQ#0#?#C#CD^`e#f $dnnT_]=-I_Z^_%%dKWI%N#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,!BK!B>!BP^__+ d Rs   ##L4L9c                    i }|g }|D ]  \  }}t        ||| j                  j                  | j                  j                  | j                  j                        \  }}|| j
                  z  }|| j
                  z  }||dz   z  }|j                  |        dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        r   )num_image_tokensnum_image_patchesr   )
r   r-   spatial_factor
min_pixels
max_pixelsr?   r]   r_   updater	   )	rB   rK   rC   vision_datarw   rn   ro   rp   rx   s	            r   _get_num_multimodal_tokensz(Emu3Processor._get_num_multimodal_tokens   s     "!!, : ,((77((33((33!  4#8#88!6!66#)UQY#7  ''(89: "#c+&6 64D[lmn,,,r   c                     t        t        |j                  d            \  }}||z  }||z  dz  }t        t        ||z  |z              }t        t        ||z  |z              }	||	fS )N:g      ?)mapr$   splitround)
rB   r!   r"   ry   ro   rn   current_areatarget_ratiotoken_heighttoken_widths
             r   r^   z%Emu3Processor.calculate_generate_size   sp    CS!12vv~"\1c95,!6!GHI% 4~ EFG[((r   c                 <     | j                   j                  |fi |S N)r-   postprocess)rB   rE   rC   s      r   r   zEmu3Processor.postprocess   s     /t##//A&AAr   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to Emu3TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r.   batch_decoderB   argsrC   s      r   r   zEmu3Processor.batch_decode   s     
 +t~~**D;F;;r   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to Emu3TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r.   decoder   s      r   r   zEmu3Processor.decode   s     
 %t~~$$d5f55r   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z               S r   )r.   model_input_namesr-   rT   dictfromkeys)rB   tokenizer_input_namesimage_processor_input_namess      r   r   zEmu3Processor.model_input_names  s?     $ @ @&*&:&:&L&L#DMM"7:U"UVWWr   r   )NNNN)r   r   r   __doc__
attributestokenizer_classimage_processor_classrA   r   r   r   r   r   rT   r   r&   r   ru   r~   r^   r   r   r   propertyr   __classcell__)rD   s   @r   r,   r,   8   s      $[1J<O0 	R& (,hli`$i` uY(94	?DQbLccdei` ,-i` 
i`V -D)B* B<6 X Xr   r,   )typingr   r   numpyra   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   r   r   tokenization_utils_baser   r   utilsr   image_processing_emu3r   r   r    r&   r,   __all__r   r   r   <module>r      sp   " #  2 % r r C ( 3&Zu &|5 
*% QXN QXh 
r   