
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
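    >>> # Illustrative only: derived sizes implied by the default configuration values above
    >>> # (24 patches per side for a 336x336 image with 14x14 patches, and a 104-dim attention head).
    >>> configuration.image_size // configuration.patch_size
    24
    >>> configuration.hidden_size // configuration.num_attention_heads
    104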
    ```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1664,
        intermediate_size=8192,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_groups=1,
        num_channels=3,
        image_size=336,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_groups = num_key_value_groups
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Position ids for each patch along the height and width axes of the grid
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device)
            .unsqueeze(1)
            .expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device)
            .unsqueeze(0)
            .expand(num_patches_height, -1)
        )
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Frequency table for the largest grid dimension, gathered per (height, width) position pair
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)

        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(CLIPVisionEmbeddings):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # Positions are encoded with rotary embeddings in the attention layers instead of a learned table
        del self.position_embedding

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


class MLCDAttention(CLIPAttention):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Project to queries, keys and values: [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        key_states = self.k_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))
        value_states = self.v_proj(hidden_states).reshape((batch_size, seq_length, self.num_heads, self.head_dim))

        # Apply the rotary position embeddings to queries and keys
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Move the head dimension forward: [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # Back to [batch_size, seq_length, embed_dim] before the output projection
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        attn_output = attn_output.view(batch_size, seq_length, -1)
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MLCDEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states=hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # Rotary embeddings cover half of the per-head dimension (hidden_size // num_attention_heads // 2)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build the rotary position embeddings from the patch grid of the current input
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config: MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class MLCDVisionModel(CLIPVisionModel):
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
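        >>> # Illustrative only: for a 448x448 input with 14x14 patches the sequence length is
        >>> # 1 class token + (448 // 14) ** 2 = 1025, so `features` has shape (1, 1025, config.hidden_size).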
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]