import math
from dataclasses import dataclass
from typing import Any, Callable, Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig


@dataclass
@auto_docstring
class Aimv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r   r    N)getattrto_tuple).0kselfs     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>z'Aimv2Output.to_tuple.<locals>.<genexpr>H   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr(   s   `r)   r%   zAimv2Output.to_tupleG   s#     
YY[
 
 	
    )__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r   r    r+   r   r%    r.   r)   r   r   )   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r.   r   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Aimv2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Aimv2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr3   onesweightvariance_epsilon)r(   hidden_sizeeps	__class__s      r)   r<   zAimv2RMSNorm.__init__P   s1     	ll5::k#:; #r.   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor3   float32powmeanrsqrtr@   r?   )r(   hidden_statesinput_dtypevariances       r)   forwardzAimv2RMSNorm.forwardX   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r.   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r+   r?   shaper@   r-   s    r)   
extra_reprzAimv2RMSNorm.extra_repr_   s*    ))*+6$2G2G1HIIr.   )gư>)r/   r0   r1   r<   rQ   rT   __classcell__rC   s   @r)   r9   r9   N   s    $;Jr.   r9   c                   $     e Zd Z fdZd Z xZS )Aimv2MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nbias)r;   r<   configrA   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr(   r\   rC   s     r)   r<   zAimv2MLP.__init__d   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r.   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)rb   rd   r`   ra   )r(   xrb   s      r)   rQ   zAimv2MLP.forwardn   s6    NN4;;t~~a/@#ADLLQRO#ST	r.   )r/   r0   r1   r<   rQ   rU   rV   s   @r)   rX   rX   c   s    0r.   rX   c                        e Zd Zdef fdZedddej                  fdej                  fd       Z	dej                  dej                  fd	Z
 xZS )


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)

        out_h = grid_h.flatten()[..., None] @ omega[None, :]
        out_w = grid_w.flatten()[..., None] @ omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(nn.Module):
    def __init__(self, config: Aimv2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size
        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
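

# `eager_attention_forward` is the reference pure-PyTorch attention path; SDPA/flash kernels
# are swapped in through ALL_ATTENTION_FUNCTIONS in Aimv2Attention below. Shape contract, as
# a sketch with hypothetical sizes (batch=2, heads=4, seq=16, head_dim=32):
#
#     query/key/value: (2, 4, 16, 32) -> attn_weights: (2, 4, 16, 16)
#     attn_output: (2, 16, 4, 32) after the final transpose, ready for the caller to
#     reshape back to (batch, seq, hidden).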
S )Aimv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 x   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      FrZ   )r;   r<   r\   rA   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   r^   qkv_biask_projv_projq_projout_projre   s     r)   r<   zAimv2Attention.__init__   s2   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr.   rN   r   r!   c           
      :   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }||fS )z#Input shape: Batch x Time x Channelr   rE   eager        )r   r   r   )rS   r   r   r   viewr   r   r   r   r\   _attn_implementationr   r   r   r   r   reshaper   r   )r(   rN   r   r   
batch_sizer   r   queriesr,   valuesattention_interfacer   r   s                r)   rQ   zAimv2Attention.forward   sa    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r.   rg   )r/   r0   r1   r2   r<   r3   r   r   r+   rQ   rU   rV   s   @r)   r   r      sV    GX, 26$)||$) !.$)
 
u||Xell33	4$)r.   r   c                        e Zd Zdef fdZ	 	 ddej                  deej                     dee   de	ej                  ej                  f   fdZ
 xZS )	Aimv2EncoderLayerr\   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y rg   )r;   r<   r   	attentionrX   ffnr9   rA   rv   	rms_norm1	rms_norm2re   s     r)   r<   zAimv2EncoderLayer.__init__$  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr.   rN   r   output_attentionsr!   c                     | j                  |      }| j                  ||      \  }}||z   }| j                  |      }| j                  |      }||z   }|r||fS |d fS )N)rN   r   )r   r   r   r   )r(   rN   r   r   norm_hidden_statesr   r   
mlp_outputs           r)   rQ   zAimv2EncoderLayer.forward+  sv     "^^M:$(NNASdrN$s!\%3!^^M:XX01
%
20A|,\W[G\\r.   NF)r/   r0   r1   r   r<   r3   r   r   boolr+   rQ   rU   rV   s   @r)   r   r   #  sm    O0 O 26,1	]||] !.] $D>	]
 
u||U\\)	*]r.   r   c            
       x     e Zd ZdZdef fdZe	 	 	 d	deej                     dee
   dee
   defd       Z xZS )
Aimv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    """

    def __init__(self, config: Aimv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, -1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)

        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output
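

# Aimv2AttentionPoolingHead condenses the patch sequence into one vector by cross-attending a
# learned `cls_token` query against the patch keys/values (no self-attention among patches).
# Roughly, as a sketch:
#
#     hidden_states: (batch, num_patches, hidden) -> query: (batch, 1, hidden)
#     pooled = softmax(q @ k.T / sqrt(head_dim)) @ v   # (batch, 1, hidden), then mean -> (batch, hidden)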


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)
        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
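

# Standalone vision usage, as a sketch (mirrors the docstring above; the referenced
# checkpoint and network access are assumed):
#
#     processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")
#     model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
#     pooled = model(**processor(images=image, return_tensors="pt")).pooler_output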


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
                position_ids=position_ids,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state at the position of the first EOS token in each sequence
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
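

# Pooling note for Aimv2TextModel above: as in CLIP, the sentence embedding is read from the
# position of the first EOS token, located with
#
#     (input_ids == eos_token_id).int().argmax(dim=-1)
#
# so padding placed after the first EOS does not affect the pooled output.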


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class Aimv2Model(Aimv2PreTrainedModel):
    config: Aimv2Config
    _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"]

    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2TextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> tokenizer = AutoTokenizer.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2VisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits; the log-scale is clamped before exponentiation for stability
        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"]
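

# End-to-end sketch of the contrastive interface above (illustrative only; assumes the
# checkpoint from the `forward` docstring is available):
#
#     from transformers import AutoProcessor
#
#     processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
#     model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
#     inputs = processor(text=["a cat", "a dog"], images=image, return_tensors="pt", padding=True)
#     probs = model(**inputs).logits_per_image.softmax(dim=1)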