
    rhj                        d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ  G d dej6                        Z G d dej6                        Z G d dej6                        Z	 d4dej6                  dej>                  dej>                  dej>                  deej>                     de de dee   fdZ!d Z"dej>                  de#dej>                  fd Z$d!ej>                  d"ej>                  d#ej>                  d$ej>                  de%ej>                  ej>                  f   f
d%Z& G d& d'ej6                        Z' G d( d)e      Z( G d* d+ej6                        Z) G d, d-ej6                        Z*e G d. d/e             Z+ ed01       G d2 d3e+             Z,d/d3gZ-y)5    )CallableOptionalUnionN   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int   )MLCDVisionConfigc                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MLCDMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)super__init__configr   
hidden_actactivation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr   	__class__s     y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mlcd/modeling_mlcd.pyr   zMLCDMLP.__init__%   sd    #F$5$5699V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r!   r   r"   )r$   r(   s     r&   forwardzMLCDMLP.forward,   s4    /**=9/r'   )__name__
__module____qualname__r   torchTensorr+   __classcell__r%   s   @r&   r   r   $   s$    KU\\ ell r'   r   c                   V     e Zd Zd	dededdf fdZdededej                  fdZ xZ	S )
MLCDRotaryEmbeddingdimthetar)   Nc                     t         |           d|t        j                  d|dt        j                        |z  z  z  }| j                  d|d       y )N      ?r      dtypeinv_freqF
persistent)r   r   r/   arangefloatregister_buffer)r$   r5   r6   r<   r%   s       r&   r   zMLCDRotaryEmbedding.__init__4   sK    %ELLC%++$NQT$TUVZeDr'   num_patches_heightnum_patches_widthc                    t        j                  || j                  j                        j	                  d      j                  d|      }t        j                  || j                  j                        j	                  d      j                  |d      }t        j                  |j                         |j                         gd      }t        ||      }t        j                  || j                  j                  | j                  j                        }t        j                  || j                        }||   j                  d      }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer   r   r5   )rE   r;   )r/   r?   r<   rE   	unsqueezeexpandstackflattenmaxr;   outer)
r$   rB   rC   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r&   r+   zMLCDRotaryEmbedding.forward9   s	    LL+DMM4H4HISSTUV]]^`bst 	 LL*4==3G3GHRRSTU\\]oqst 	
 ++x//183C3C3EFBO .0ABll=1E1ET]]M`M`a#kk#t}}= -W5==a@r'   )g     @)
r,   r-   r.   intr@   r   r/   r0   r+   r1   r2   s   @r&   r4   r4   3   s?    EC E ED E
# # %,, r'   r4   c                        e Zd Zdef fdZdej                  dededej                  fdZdej                  dej                  fd	Z
 xZS )
MLCDVisionEmbeddingsr   c                 |   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        | j#                  dt        j$                  | j                         j'                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr9   r   position_ids)r   rF   r=   )r   r   r   r   	embed_dim
image_size
patch_sizer   	Parameterr/   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsrA   r?   rI   r#   s     r&   r   zMLCDVisionEmbeddings.__init__[   s    ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1^U\\$:L:L-M-T-TU\-]jopr'   
embeddingsheightwidthr)   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrF   g      ?r   r9   bicubicF)sizemodealign_cornersrG   )shapeposition_embeddingweightrH   r/   jit
is_tracingr^   ra   r   reshapepermuter   
functionalinterpolateviewcat)r$   rj   rk   rl   rh   rs   ri   class_pos_embedpatch_pos_embedr5   
new_height	new_widthsqrt_num_positionss                r&   interpolate_pos_encodingz-MLCDVisionEmbeddings.interpolate_pos_encodingp   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr'   pixel_valuesc                 T   |j                   d   }| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }|S )Nr   r:   r9   r   rF   rG   )rr   rg   rt   r;   torK   	transposerd   rI   r/   r|   )r$   r   
batch_sizetarget_dtypepatch_embedsclass_embedsrj   s          r&   r+   zMLCDVisionEmbeddings.forward   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
r'   )r,   r-   r.   r   r   r/   r0   rU   r   FloatTensorr+   r1   r2   s   @r&   rW   rW   Z   sb    q/ q*'D5<< 'D 'DUX 'D]b]i]i 'DR
E$5$5 
%,, 
r'   rW   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr9   r   rF   )r5   r;   )ptrainingr   )	repeat_kvnum_key_value_groupsr/   matmulr   rr   r   ry   softmaxfloat32r   r;   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r&   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r'   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrF   r9   rG   )rr   r/   r|   )xx1x2s      r&   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r'   r(   n_repr)   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rr   rI   rw   )r(   r   batchnum_key_value_headsslenhead_dims         r&   r   r      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr'   qkcossinc                    | j                   }|j                   }| j                         |j                         }} |j                  d      j                         |j                  d      j                         }}| |z  t        |       |z  z   }||z  t        |      |z  z   }|j	                  |      }|j	                  |      }||fS )Nr   )r;   r@   rH   r   r   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r&   apply_rotary_pos_emb_visionr      s     77L77L779aggiqA}}R &&(#--*;*A*A*CC3w;q>C/0G3w;q>C/0Gjj&Gjj&GGr'   c                        e Zd ZdZdef fdZ	 d
dej                  deej                  ej                  f   de	ej                     de
e   deej                  e	ej                     f   f
d	Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    r   c                 :   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        |j&                  | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   r   r   r_   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   r#   s     r&   r   zMLCDAttention.__init__   s(   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A$*$?$?!r'   r(   position_embeddingsr   r   r)   c                    |j                   dd \  }}| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }	|d   j                  d      j                         }
|d   j                  d      j                         }t        |||
|      \  }}|j                  dddd      j                         }|j                  dddd      j                         }|	j                  dddd      j                         }	t        }| j                  j                  dk7  rt        | j                  j                     } || |||	|f| j                   sdn| j"                  | j$                  | j&                  d	|\  }}|j                  dddd      j                         }|j)                  ||d      }| j+                  |      }|j                  ddd      j                         }||fS )
z#Input shape: Batch x Time x ChannelNrF   r   r   r9   r   eager        )r   r   r   )rr   r   rw   r   r   r   r   rH   r@   r   rx   r   r   r   _attn_implementationr   r   r   r   r   r{   r   )r$   r(   r   r   r   r   
seq_lengthquery_statesr   r   r   r   attention_interfacer   r   s                  r&   r+   zMLCDAttention.forward   sE    "/!4!4Sb!9
J {{=199:zSWSaSacgcpcp:qr[[/77ZQUQ_Q_aeanan8op
{{=199:zSWSaSacgcpcp:qr "!$..q1779!!$..q1779#>|ZY\^a#b j $++Aq!Q7BBD''1a3>>@
#++Aq!Q7BBD(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$,,JJnn
%
 
%
!\ "))!Q15@@B!&&z:rBmmK0!))!Q2==?L((r'   r   )r,   r-   r.   __doc__r   r   r/   r0   tupler   r   r   r+   r1   r2   s   @r&   r   r      s    @/ @2 26	-)||-) #5<<#=>-) !.	-)
 -.-) 
u||Xell33	4-)r'   r   c                        e Zd Zdef fdZ	 	 d	dej                  deej                  ej                  f   deej                     dee	   deej                     f
dZ xZS )
MLCDEncoderLayerr   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y )Neps)r   r   r   r_   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r#   s     r&   r   zMLCDEncoderLayer.__init__/  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr'   r(   r   r   output_attentionsr)   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r(   r   r   r   )r   r   r   r   )r$   r(   r   r   r   residualr   outputss           r&   r+   zMLCDEncoderLayer.forward7  s    * !((7&*nn' 3)/	 '5 '
#| !=0 ((7/ =0 "&Gr'   )NF)r,   r-   r.   r   r   r/   r0   r   r   boolr   r+   r1   r2   s   @r&   r   r   .  s}    S/ S 26,1*||* #5<<#=>* !.	*
 $D>* 
u  	!*r'   r   c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  deej                  ej                  f   de
ej                     de
e   de
e   d	e
e   d
eeef   fdZ xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    r   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        yc c}w )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.FN)
r   r   r   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r$   r   _r%   s      r&   r   zMLCDEncoder.__init__m  sO    mmuVMeMeGf$g!%5f%=$gh&+# %hs   A#inputs_embedsr   r   r   output_hidden_statesreturn_dictr)   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|||      }|d   }	|s"||d   fz   }, |r||	fz   }|st        d |	||fD              S t        |	||      S )aj  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        N r   r   r   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r&   	<genexpr>z&MLCDEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   )last_hidden_stater(   
attentions)r   r   use_return_dictr   	enumerater   r   r
   )r$   r   r   r   r   r   r   encoder_statesall_attentionsr(   idxencoder_layerlayer_outputss                r&   r+   zMLCDEncoder.forwardt  s   D %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq30d%"+DKK"8 	FC#!/=2B!B)+$7-"3	M *!,M !/=3C2E!E	F  +}.>>Ne]NN$Seee+(%
 	
r'   NNNN)r,   r-   r.   r   r   r   r/   r   r   r0   r   r   r   r
   r+   r1   r2   s   @r&   r   r   d  s    ,/ , 26,0/3&*C
((C
 #5<<#=>C
 !.	C

 $D>C
 'tnC
 d^C
 
uo%	&C
r'   r   c                        e Zd Zdef fdZe	 	 	 	 d	deej                     dee	   dee	   dee	   de
eef   f
d       Z xZS )
MLCDVisionTransformerr   c                    t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        t        |j                  |j                  z  dz        | _        t        j                   t#        j$                  d|j                  |j                  z  dz              | _        y )Nr   r9   r   )r   r   r   r   rW   rj   r   r   r   pre_layrnormr   encoderpost_layernormr4   r   vision_rotary_embeddingrb   r/   rc   class_pos_emb)r$   r   r_   r%   s      r&   r   zMLCDVisionTransformer.__init__  s    &&	.v6LL8M8MN"6* ll9&:O:OP':6;M;MQWQkQk;kop;p'q$\\%++a9K9KvOiOi9imn9n*opr'   r   r   r   r   r)   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j
                  d   | j                   j                  z  }|j
                  d   | j                   j                  z  }| j                  ||      }|j                  | j                  j                        }t        j                  | j                  |gd      }t        j                  ||fd      }|j                         |j                         f}	| j                  |      }
| j!                  |
      }
| j#                  |
|	|||      }|d   }|d d dd d f   }| j%                  |      }|s
||f|dd  z   S t'        |||j(                  |j*                        S )	Nz You have to specify pixel_valuesr   rF   r   rG   )r   r   r   r   r   r   )r   pooler_outputr(   r   )r   r   r   r   r   rr   ra   r  r   r  rE   r/   r|   r   r   rj   r  r  r  r   r(   r   )r$   r   r   r   r   rB   rC   rT   embr   r(   encoder_outputsr   pooled_outputs                 r&   r+   zMLCDVisionTransformer.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq?@@)//3t{{7M7MM(..r2dkk6L6LL556HJ[\'**4+=+=+D+DED$6$6#GQOii8bA"wwy#'')45))-8,,' 3/!5# ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r'   r   )r,   r-   r.   r   r   r   r   r/   r   r   r   r   r   r+   r1   r2   s   @r&   r   r     s    
q/ 
q  59,0/3&*/
u001/
 $D>/
 'tn	/

 d^/
 
u00	1/
 /
r'   r   c                   .    e Zd ZU eed<   dZdZdZdZd Z	y)MLCDPreTrainedModelr   mlcdTc                 L   | j                   j                  }t        |t              r| j                   j                  }t        j
                  j                  |j                  d|j                  dz  |z         t        j
                  j                  |j                  j                  |j                   j                  |z         yt        |t              r,| j                   j                  }|j                  dz  d|j                   j                  z  dz  z  |z  }|j                  dz  |z  }t        j
                  j                  |j                  j                  |       t        j
                  j                  |j                  j                  |       t        j
                  j                  |j                   j                  |       t        j
                  j                  |j"                  j                  |       yt        |t$              r| j                   j                  }|j                   j&                  dz  d|j                   j                  z  dz  z  |z  }d|j                   j&                  z  dz  |z  }t        j
                  j                  |j(                  j                  |       t        j
                  j                  |j*                  j                  |       yt        |t,              ry| j                   j                  }|j                   j&                  |j                   j.                  z  dz  dz  |z  }t        j
                  j                  |j0                  d|       yt        |t        j2                        rJ|j4                  j6                  j9                          |j                  j6                  j;                  d       yt        |t        j<                        r2|j4                  %|j4                  j6                  j9                          yyy)zInitialize the weightsr   r   )meanstd)r  r9   r8   N)r   initializer_factor
isinstancerW   r   initnormal_rd   r_   rg   rt   initializer_ranger   r   r   r   r   r   r   r   r!   r"   r   r   r  r   r]   datazero_fill_r   )r$   r   factorin_proj_stdout_proj_stdfc_stdpos_emb_stds          r&   _init_weightsz!MLCDPreTrainedModel._init_weights  s   //f23[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOh.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? 56[[33F!==448Y8YY]^^cggjppKGGOOF00sOL-KK""$MM$$S)		*v{{/FKK""$ 0G*r'   N)
r,   r-   r.   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpar  r   r'   r&   r  r    s$    &*#N%r'   r  zN
    The vision model from M_L_C_D without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZdgZdef fdZdej                  fdZ
e	 	 	 	 ddeej                     dee   dee   d	ee   deeef   f
d
       Z xZS )MLCDVisionModelr   r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r   vision_model	post_initr#   s     r&   r   zMLCDVisionModel.__init__,  s'     1&9r'   r)   c                 B    | j                   j                  j                  S r   )r(  rj   rg   )r$   s    r&   get_input_embeddingsz$MLCDVisionModel.get_input_embeddings2  s      ++;;;r'   r   r   r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||      S )a  
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```)r   r   r   r   )r   r   r   r   r(  )r$   r   r   r   r   s        r&   r+   zMLCDVisionModel.forward5  su    > %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq  %/!5#	 ! 
 	
r'   r   )r,   r-   r.   r   r  main_input_name_no_split_modulesr   r   Moduler+  r   r   r/   r   r   r   r   r   r+   r1   r2   s   @r&   r&  r&  "  s     $O+,/ <bii <  59,0/3&*(
u001(
 $D>(
 'tn	(

 d^(
 
u00	1(
 (
r'   r&  )r   ).typingr   r   r   r/   torch.nnr   activationsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_mlcdr   r/  r   r4   rW   r0   r@   r   r   rU   r   r   r   r   r   r   r   r  r&  __all__r   r'   r&   <module>r;     s  * - ,   ! B 9 K F & B B 0bii $")) $NI299 If %II%<<% 
% <<	%
 U\\*% % % '(%4(	UU\\ 	U# 	U%,, 	U||+0<<>Cll
5<<%&J)BII J)Z31 3lS
")) S
l=
BII =
@ $%/ $% $%N 
7
) 7

7
t !"3
4r'   