"""Pytorch implementation of AIMv2 Model"""

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput


class Aimv2VisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation used for initializing all weight matrices.
        use_head (`bool`, *optional*, defaults to `True`):
            Whether to use an attention pooling head on top of the encoder or not.
        is_native (`bool`, *optional*, defaults to `False`):
            Whether to use a checkpoint trained for native image resolution or not.
    Example:

    ```python
    >>> from transformers import Aimv2VisionConfig, Aimv2VisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
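
    >>> # Editorial note (not part of the original example): with the defaults above
    >>> # (image_size=224, patch_size=14) the vision tower sees a 16x16 = 256-token patch grid.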
    ```"""

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        use_head: bool = True,
        is_native: bool = False,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            attention_dropout=attention_dropout,
            hidden_act=hidden_act,
            **kwargs,
        )

        self.use_head = use_head
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.is_native = is_native

        del self.layer_norm_eps


class Aimv2TextConfig(SiglipTextConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token in the vocabulary.
        bos_token_id (`int`, *optional*, defaults to 49406):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation used for initializing all weight matrices.
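
    Example:

    ```python
    >>> # Editorial usage sketch (not part of the original docstring); it mirrors the examples of the
    >>> # other AIMv2 config classes in this file and only uses names defined here.
    >>> from transformers import Aimv2TextConfig, Aimv2TextModel

    >>> # Initializing an Aimv2TextConfig with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2TextConfig()

    >>> # Initializing an Aimv2TextModel (with random weights) from that configuration
    >>> model = Aimv2TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```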
    """

    def __init__(
        self,
        vocab_size: int = 49408,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 49407,
        max_position_embeddings: int = 77,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps

        del self.bos_token_id
        del self.pad_token_id
        del self.projection_size
        del self.layer_norm_eps


class Aimv2Config(SiglipConfig):
    r"""
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
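
    >>> # Editorial note (not part of the original example): both towers are projected to a shared
    >>> # `projection_dim`-sized space (512 by default) before the contrastive logits are computed.
    >>> config.projection_dim
    512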
    ```"""

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        super().__init__(text_config, vision_config, **kwargs)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.max_logit_scale = 100.0

        del self.initializer_factor


class Aimv2Output(SiglipOutput):
    pass


class Aimv2RMSNorm(LlamaRMSNorm):
    pass


class Aimv2MLP(LlamaMLP):
    pass


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        # Fixed (non-learned) sin/cos embedding over the patch grid, used by checkpoints trained
        # at native image resolution instead of the learned `position_embedding` table.
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="xy")
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)

        out_h = grid_h.flatten()[..., None] * omega[None, :]
        out_w = grid_w.flatten()[..., None] * omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(CLIPTextEmbeddings):
    pass


class Aimv2Attention(SiglipAttention):
    def __init__(self, config):
        super().__init__(config)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Pre-norm block: RMSNorm -> attention -> residual, then RMSNorm -> MLP -> residual.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Aimv2Encoder(SiglipEncoder):
    pass


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A single learned query (the CLS token) attends over all patch tokens to pool them.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)

        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
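
        >>> # Editorial note (not part of the original example): with patch size 14, a 224x224 input
        >>> # would give a (224 // 14) ** 2 = 256-token patch grid; the native-resolution checkpoint
        >>> # used above instead keeps the grid implied by the actual image size.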
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)
        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            position_ids=position_ids,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=None,
        )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool on the hidden state of the EOS token (the first occurrence of `eos_token_id` per sequence).
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Aimv2Model(CLIPModel):
    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
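
        >>> # Editorial note (not part of the original example): `logits_per_image` has shape
        >>> # (num_images, num_texts), here (1, 2), so `probs` gives one probability per candidate caption.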
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits, with the learned temperature clamped to [0, log(max_logit_scale)]
        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "Aimv2Config",
    "Aimv2VisionConfig",
    "Aimv2TextConfig",
    "Aimv2VisionModel",
    "Aimv2Model",
    "Aimv2PreTrainedModel",
    "Aimv2TextModel",
]