
from dataclasses import dataclass
from typing import Optional, Union

from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliPreTrainedModel
from transformers.models.colpali.processing_colpali import ColPaliProcessor

from ...cache_utils import Cache
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image
from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
from .configuration_colqwen2 import ColQwen2Config


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": "longest",
        },
        "images_kwargs": {
            "data_format": "channels_first",
            "do_convert_rgb": True,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class ColQwen2Processor(ColPaliProcessor):
    r"""
    Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and adds special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`]
    for more information.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*): A prefix to be used for the query.
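
    Example (a minimal sketch; `vidore/colqwen2-v1.0-hf` is used only as an illustrative checkpoint name,
    substitute the ColQwen2 checkpoint you actually use):

    ```python
    import torch
    from PIL import Image

    from transformers import ColQwen2ForRetrieval, ColQwen2Processor

    model_name = "vidore/colqwen2-v1.0-hf"  # illustrative checkpoint name
    processor = ColQwen2Processor.from_pretrained(model_name)
    model = ColQwen2ForRetrieval.from_pretrained(model_name)

    # Images and queries are processed in two separate calls (the processor refuses to mix them).
    image_batch = processor(images=Image.new("RGB", (448, 448)))
    query_batch = processor(text="What does the chart on page 2 show?")

    with torch.no_grad():
        image_embeddings = model(**image_batch).embeddings
        query_embeddings = model(**query_batch).embeddings

    # Late-interaction (MaxSim) scores between each query and each document image.
    scores = processor.score_retrieval(query_embeddings, image_embeddings)
    ```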
    """

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        visual_prompt_prefix: Optional[str] = None,
        query_prefix: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token

        if visual_prompt_prefix is None:
            visual_prompt_prefix = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"
        self.visual_prompt_prefix = visual_prompt_prefix

        if query_prefix is None:
            query_prefix = "Query: "
        self.query_prefix = query_prefix

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[ColQwen2ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
        wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process
        both text and images at the same time.

        When preparing the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's
        [`~Qwen2TokenizerFast.__call__`].
        When preparing the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's
        [`~Qwen2VLImageProcessor.__call__`].
        Please refer to the docstrings of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            ColQwen2ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        suffix = output_kwargs["text_kwargs"].pop("suffix", None)

        return_token_type_ids = suffix is not None

        if text is None and images is None:
            raise ValueError("Either text or images must be provided")
        if text is not None and images is not None:
            raise ValueError("Only one of text or images can be processed at a time")

        if images is not None:
            if is_valid_image(images):
                images = [images]
            elif isinstance(images, list) and is_valid_image(images[0]):
                pass
            elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
                raise ValueError("images must be an image, list of images or list of list of images")

            texts_doc = [self.visual_prompt_prefix] * len(images)

            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]

            if image_grid_thw is not None:
                # Expand each image token into one placeholder per merged visual patch, then restore the token.
                merge_length = self.image_processor.merge_size**2
                index = 0
                for i in range(len(texts_doc)):
                    while self.image_token in texts_doc[i]:
                        texts_doc[i] = texts_doc[i].replace(
                            self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
                        )
                        index += 1
                    texts_doc[i] = texts_doc[i].replace("<|placeholder|>", self.image_token)

            text_inputs = self.tokenizer(
                texts_doc,
                return_token_type_ids=False,
                **output_kwargs["text_kwargs"],
            )

            return_data = BatchFeature(data={**text_inputs, **image_inputs})

            # Pad the flattened patch sequences to the same length so they can be collated into a single
            # tensor; `ColQwen2ForRetrieval.forward` removes this padding again using `image_grid_thw`.
            offsets = return_data["image_grid_thw"][:, 1] * return_data["image_grid_thw"][:, 2]
            pixel_values = list(torch.split(return_data["pixel_values"], offsets.tolist()))
            return_data["pixel_values"] = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True)

            if return_token_type_ids:
                labels = return_data["input_ids"].masked_fill(return_data["token_type_ids"] == 0, -100)
                return_data.update({"labels": labels})

            return return_data

        elif text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, list) and isinstance(text[0], str)):
                raise ValueError("Text must be a string or a list of strings")

            if suffix is None:
                suffix = self.query_augmentation_token * 10

            texts_query: list[str] = []

            for query in text:
                augmented_query = self.query_prefix + query + suffix
                texts_query.append(augmented_query)

            batch_query = self.tokenizer(
                texts_query,
                return_token_type_ids=False,
                **output_kwargs["text_kwargs"],
            )

            return batch_query

    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) for each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)

            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        return MultiModalData(**vision_data)


class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):
    pass


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for ColQwen2 embeddings output.
    """
)
class ColQwen2ForRetrievalOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    """

    loss: Optional[torch.FloatTensor] = None
    embeddings: Optional[torch.Tensor] = None
    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@auto_docstring(
    custom_intro="""
    Following the ColPali approach, ColQwen2 leverages VLMs to construct efficient multi-vector embeddings directly
    from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity
    between these document embeddings and the corresponding query embeddings, using the late interaction method
    introduced in ColBERT.
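
    As a rough illustration of this late-interaction (MaxSim) scoring, and assuming purely illustrative tensor
    shapes, the score between every query and every document can be sketched as follows (in practice, the
    processor's `score_retrieval` helper performs this computation):

    ```python
    import torch

    query_embeddings = torch.randn(2, 20, 128)  # (num_queries, num_query_tokens, dim), illustrative shapes
    doc_embeddings = torch.randn(3, 700, 128)  # (num_docs, num_doc_tokens, dim)

    # MaxSim: for every query token, take its best-matching document token, then sum over query tokens.
    token_scores = torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings)
    scores = token_scores.max(dim=3).values.sum(dim=2)  # (num_queries, num_docs)
    ```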

    Using ColQwen2 removes the need for potentially complex and brittle layout recognition and OCR pipelines with
    a single model that can take into account both the textual and visual content (layout, charts, ...) of a document.

    ColQwen2 is part of the ColVision model family, which was introduced with ColPali in the following paper:
    [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449).
    """
)
class ColQwen2ForRetrieval(ColPaliForRetrieval):
    _checkpoint_conversion_mapping = {}

    def __init__(self, config: ColQwen2Config):
        super().__init__(config)
        self._tied_weights_keys = [f"vlm.{k}" for k in (self.vlm._tied_weights_keys or [])]

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> ColQwen2ForRetrievalOutput:
        r"""
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        """
        if pixel_values is not None:
            pixel_values = pixel_values.to(dtype=self.dtype)

        # Remove the per-image padding added by `ColQwen2Processor` so the visual tower only sees real patches.
        if attention_mask is not None and pixel_values is not None:
            offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2]  # (num_patches_h * num_patches_w) per image
            pixel_values = torch.cat(
                [pixel_sequence[:offset] for pixel_sequence, offset in zip(pixel_values, offsets)],
                dim=0,
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        position_ids, rope_deltas = self.vlm.model.get_rope_index(
            input_ids=input_ids,
            image_grid_thw=image_grid_thw,
            video_grid_thw=None,
            attention_mask=attention_mask,
        )

        if inputs_embeds is None:
            inputs_embeds = self.vlm.model.language_model.embed_tokens(input_ids)

            if pixel_values is not None:
                pixel_values = pixel_values.type(self.vlm.visual.get_dtype())
                image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw)
                # Scatter the visual embeddings into the positions of the image placeholder tokens.
                image_mask = (
                    (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        vlm_output = self.vlm.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None

        last_hidden_states = vlm_output[0]  # (batch_size, sequence_length, hidden_size)
        embeddings = self.embedding_proj_layer(last_hidden_states)  # (batch_size, sequence_length, dim)

        # L2 normalization
        embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)  # (batch_size, sequence_length, dim)

        if attention_mask is not None:
            embeddings = embeddings * attention_mask.unsqueeze(-1)  # (batch_size, sequence_length, dim)

        return ColQwen2ForRetrievalOutput(
            embeddings=embeddings,
            past_key_values=vlm_output.past_key_values,
            hidden_states=vlm_hidden_states,
            attentions=vlm_output.attentions,
        )


__all__ = [
    "ColQwen2ForRetrieval",
    "ColQwen2PreTrainedModel",
    "ColQwen2Processor",
]