
    rh&Q                     D   d dl mZmZ ddlmZ ddlmZmZmZ ddl	m
Z
mZmZmZ ddlmZmZmZ ddlmZ  e       rd dlZ G d	 d
ed      ZdZ ed      D  cg c]	  } d| dd c}  ed      D  cg c]	  } d| dd c} z   Zd Z G d de      ZdgZyc c} w c c} w )    )OptionalUnion   )BatchFeature)
ImageInputis_valid_imagemake_flat_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)is_torch_availableNc                   &    e Zd ZddidddddidZy	)
ColPaliProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/colpali/processing_colpali.pyr   r   $   s,     y
 ,"
 +D1	Ir#   r   F)totalz<image>i   z<locz0>4>   z<segz0>3c                      ||z  |z   | |  dS )aZ  
    Builds a string from the input prompt and image tokens.
    For example, for the call:
    build_string_from_input(
        prompt="Prefix str"
        bos_token="<s>",
        image_seq_len=3,
        image_token="<im>",
    )
    The output will be:
    "<im><im><im><s>Initial str"
    Args:
        prompt (`list[Union[str, ImageInput]]`): The input prompt.
        bos_token (`str`): The beginning of sentence token.
        image_seq_len (`int`): The length of the image sequence.
        image_token (`str`): The image token.
        num_images (`int`): Number of images in the prompt.
    
r"   prompt	bos_tokenimage_seq_lenimage_token
num_imagess        r$   build_string_from_inputr0   5   s$    & M)J67	{6("MMr#   c                   d    e Zd ZdZddgZdZdZ	 	 	 	 	 ddedef fdZ	 	 	 	 dd	e	d
e
eeee   ee   f   dee   defdZd dZd Zd Zed        Zedefd       Z	 d d	e	dee   defdZd
e
eee   f   dee   defdZ	 	 	 d!de
ded   f   de
ded   f   deded   de
def   ddfdZ xZS )"ColPaliProcessora  
    Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`]
    for more information.

    Args:
        image_processor ([`SiglipImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Question: "`):
            A prefix to be used for the query.
    image_processor	tokenizer)SiglipImageProcessorSiglipImageProcessorFast)GemmaTokenizerGemmaTokenizerFastvisual_prompt_prefixquery_prefixc                    t         |   |||       |t        d      |t        d      t        |d      st        d      |j                  | _        t        |d      sNt        t        dd	      }d
|gi}|j                  |       |j                  t              | _	        t        | _
        n"|j                  | _	        |j                  | _
        |j                  t               d|_        d|_        || _        || _        y )N)chat_templatez)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_seq_lengthz;Image processor is missing an `image_seq_length` attribute.r.   FT)
normalizedspecialadditional_special_tokens)super__init__
ValueErrorhasattrr=   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr.   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokenr9   r:   )	selfr3   r4   r<   r9   r:   r.   tokens_to_add	__class__s	           r$   rB   zColPaliProcessor.__init__d   s     	)=Q"HIIABB(:;Z[[ / @ @y-0$[UDQK8;-HM((7"+"A"A+"ND*D"+":":D(44D\*"'	"'	$8!(r#   imagestextkwargsreturnc                      | j                   t        fd| j                  j                  i|}|d   j	                  dd      }|du}||t        d      ||t        d      |t        |      r|g}n^t        |t              rt        |d         rn?t        |t              r$t        |d   t              rt        |d   d         st        d      | j                  gt        |      z  }	|D 
cg c]  }
|
j                  d	       }}
t        |	|      D cg c]R  \  }}t        || j                  j                  | j                  t         t        |t              rt        |      nd
      T }}}t#        |      } | j$                  |fi |d   d   }|d   j'                  dd      |d   dxx   | j                  z  cc<    | j                  |fddi|d   }i |d|i}|r.|d   j)                  |d   dk(  d      }|j+                  d|i       t-        |      S |t        |t.              r|g}n.t        |t              rt        |d   t.              st        d      || j0                  dz  }g }|D ]?  }| j                  j                  | j2                  z   |z   |z   dz   }|j5                  |       A |d   j'                  dd      |d   d<    | j                  |fddi|d   }|S yc c}
w c c}}w )a	  
        Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
        wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
        both text and images at the same time.

        When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
        [`~LlamaTokenizerFast.__call__`].
        When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
        [`~SiglipImageProcessor.__call__`].
        Please refer to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesRGB   r*   r   pixel_values
max_lengthreturn_token_type_idsF	input_idstoken_type_idsilabels)dataz*Text must be a string or a list of strings
   r)   2   )_merge_kwargsr   r4   init_kwargspoprC   r   
isinstancelistr9   lenconvertzipr0   r,   r=   rE   r	   r3   getmasked_fillupdater   strquery_augmentation_tokenr:   append)rM   rP   rQ   audiovideosrR   output_kwargsrV   r[   	texts_docimager+   
image_listinput_stringsrY   inputsreturn_datar^   texts_queryquerybatch_querys                        r$   __call__zColPaliProcessor.__call__   sh   Z +**"
"&.."<"<
 

 }-11(DA &d 2<FNEFF 2TUUf% FD)nVAY.G .:fQi3NSabhijbklmbnSo !dee223c&kAI8>?uemmE*?F? +.i*@	 'FJ (!"nn66"&"7"7 +2<Z2Ns:TU	M 	 .f5F/4//Y-:XYZhiL ]+//dCOm,\:d>S>SS:#T^^&+  .F CVB^\BK$,88@P9QUV9VX\]""Hf#56[11$$v t,DGS1I !MNN~66;%'K *0043D3DDuLvUX\\""5)* :G}9U9Y9YZfhj9kM-(6($..&+  .K - C @	s   2K5AK:c                     i }|<| j                   gt        |      z  }dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (list[list[str]], *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        rX   )num_image_tokensnum_image_patchesr"   )r=   rg   rl   r
   )rM   image_sizesrR   vision_datar~   r   s         r$   _get_num_multimodal_tokensz+ColPaliProcessor._get_num_multimodal_tokens  s]     " $ 5 56[9II!"c+&6 64D[lmn,,,r#   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r4   batch_decoderM   argsrR   s      r$   r   zColPaliProcessor.batch_decode  s     
 +t~~**D;F;;r#   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r4   decoder   s      r$   r   zColPaliProcessor.decode  s     
 %t~~$$d5f55r#   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z               S N)r4   model_input_namesr3   rf   dictfromkeys)rM   tokenizer_input_namesimage_processor_input_namess      r$   r   z"ColPaliProcessor.model_input_names#  s?     $ @ @&*&:&:&L&L#DMM"7:U"UVWWr#   c                 .    | j                   j                  S )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r4   	pad_token)rM   s    r$   rn   z)ColPaliProcessor.query_augmentation_token)  s     ~~'''r#   c                 *     | j                   dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        rP   r"   r|   )rM   rP   rR   s      r$   process_imageszColPaliProcessor.process_images2  s    B t}}5F5f55r#   c                 *     | j                   dd|i|S )a  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        rQ   r"   r   )rM   rQ   rR   s      r$   process_queriesz ColPaliProcessor.process_queriesU  s    @ t}}1$1&11r#   query_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         t        |      dk(  rt        d      t        |      dk(  rt        d      |d   j                  |d   j                  k7  rt        d      |d   j                  |d   j                  k7  rt        d      ||d   j                  }g }t	        dt        |      |      D ]%  }g }t
        j                  j                  j                  j                  ||||z    dd      }	t	        dt        |      |      D ]  }
t
        j                  j                  j                  j                  ||
|
|z    dd      }|j                  t        j                  d|	|      j                  d	
      d   j                  d
              |j                  t        j                  |d
      j                  |      j                  |             ( t        j                  |d
      S )aZ  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dim   rX   )rg   rC   devicedtyperangetorchnnutilsrnnpad_sequencero   einsummaxsumcatto)rM   r   r   r   r   r   scoresibatch_scoresbatch_queriesjbatch_passagess               r$   score_retrievalz ColPaliProcessor.score_retrievalw  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./< 	]A/1L!HHNN..;; Q^4$VW < M 1c"45zB !&!3!3!@!@&q1z>:\] "A " ##LL-PTTYZT[\]^bbghbi	 MM%))La8;;LILL][\	] yyQ''r#   )NNNzDescribe the image.z
Question: )NNNNr   )r'   Ncpu)r   r   r    __doc__
attributesimage_processor_classtokenizer_classrm   rB   r   r   r   r   rf   r   r   r   r|   r   r   r   propertyr   rn   r   r   intr   r   __classcell__)rO   s   @r$   r2   r2   K   s   ( $[1JP>O $9( )
 " )  )H "^b{{ I0$y/4HYCZZ[{ /0{ 
{z-$<6 X X
 (# ( ( "!6!6 /0!6 
	!6F 2ItI./ 2 /0 2 
	 2L 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>(r#   r2   )typingr   r   feature_extraction_utilsr   image_utilsr   r   r	   processing_utilsr
   r   r   r   tokenization_utils_baser   r   r   r   r   r   r   rE   r   rJ   r0   r2   __all__)r   s   0r$   <module>r      s   . # 4 O O X X O O ' 
-U 
 ).t5A$qgQ5RWX[R\8]Q4#wa8]]N,j(~ j(Z 
M 68]s   B3B