
from typing import Optional, Union

import numpy as np

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType
from ..auto import AutoTokenizer


class AriaProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": False,
        },
        "images_kwargs": {
            "max_image_size": 980,
            "split_image": False,
        },
        "return_tensors": TensorType.PYTORCH,
    }


class AriaProcessor(ProcessorMixin):
    """
    AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer.

    Args:
        image_processor (`AriaImageProcessor`, *optional*):
            The AriaImageProcessor to use for image preprocessing.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
        size_conversion (`Dict`, *optional*):
            A dictionary indicating size conversions for images.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AriaImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer: Union[AutoTokenizer, str] = None,
        chat_template: Optional[str] = None,
        size_conversion: Optional[dict[Union[float, int], int]] = None,
    ):
        if size_conversion is None:
            size_conversion = {490: 128, 980: 256}
        self.size_conversion = {int(k): v for k, v in size_conversion.items()}

        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.unk_token

        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
        images: Optional[ImageInput] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[AriaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s).

        Args:
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.


        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
            `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            AriaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise TypeError("Invalid input text. Please provide a string, or a list of strings")

        if images is not None:
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            # Each image placeholder in the text is expanded to `num_crops * tokens_per_image`
            # copies, so the text stream reserves one position per visual token.
            tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]]
            num_crops = image_inputs.pop("num_crops") * tokens_per_image
            prompt_strings = []
            for sample in text:
                sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops)
                prompt_strings.append(sample)
        else:
            image_inputs = {}
            prompt_strings = text

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])

        if return_mm_token_type_ids:
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(array_ids)
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = AriaProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)
            max_size = images_kwargs.get("max_image_size", None) or self.image_processor.max_image_size

            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [self.size_conversion[max_size] * num_patches for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
        return MultiModalData(**vision_data)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names

        # `num_crops` is consumed inside `__call__` and is not a model input, so drop it.
        # Filtering into a new list avoids mutating `self.image_processor.model_input_names`.
        image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


__all__ = ["AriaProcessor"]
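# ---------------------------------------------------------------------------
# Minimal usage sketch. The checkpoint id "rhymes-ai/Aria", the image file
# name, and the "<|img|>" placeholder below are assumptions for illustration
# only; adapt them to your own setup.
#
#     from PIL import Image
#     from transformers import AutoProcessor
#
#     processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
#     image = Image.open("photo.jpg")
#     inputs = processor(text="<|img|>\nDescribe this image.", images=image)
#     # `inputs` is a BatchFeature with input_ids, attention_mask,
#     # pixel_values and pixel_mask; every image placeholder has been
#     # expanded to num_crops * tokens_per_image copies by __call__ above.
# ---------------------------------------------------------------------------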