
"""PyTorch AltCLIP model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndProjection,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
@auto_docstring
class AltCLIPOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
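
# Illustrative sketch (not part of the original module API): how the symmetric contrastive
# objective above is used. `clip_loss` expects the text-to-image similarity matrix
# `logits_per_text` of shape (text_batch_size, image_batch_size) and averages a
# caption-retrieval and an image-retrieval cross-entropy, with matching pairs on the diagonal.
# The embedding sizes and the logit scale value below are assumptions for the demo only.
def _example_symmetric_contrastive_loss() -> torch.Tensor:
    text_embeds = nn.functional.normalize(torch.randn(4, 8), dim=-1)
    image_embeds = nn.functional.normalize(torch.randn(4, 8), dim=-1)
    logit_scale = torch.tensor(2.6592).exp()  # assumed `logit_scale_init_value`
    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
    return clip_loss(logits_per_text)
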
class AltRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


class AltRobertaSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the AltRobertaModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class AltRobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


ALT_ROBERTA_SELF_ATTENTION_CLASSES = {
    "eager": AltRobertaSelfAttention,
}


class AltRobertaAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = AltRobertaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class AltRobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class AltRobertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class AltRobertaLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = AltRobertaAttention(config)
        self.intermediate = AltRobertaIntermediate(config)
        self.output = AltRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class AltRobertaEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([AltRobertaLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            layer_outputs = layer_module(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class AltRobertaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class AltCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # The text model uses both `causal_attention_mask` and `attention_mask`; when the
        # flash-attention kernel is used, causality is signalled through `is_causal` instead.
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class AltCLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class AltCLIPEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = AltCLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = AltCLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class AltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class AltCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


@auto_docstring
class AltCLIPPreTrainedModel(PreTrainedModel):
    config: AltCLIPConfig
    base_model_prefix = "altclip"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, AltCLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.text_projection._is_hf_initialized = True
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.visual_projection._is_hf_initialized = True
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class AltCLIPVisionTransformer(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = AltCLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = AltCLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + tuple(
                v for v in [encoder_outputs.hidden_states, encoder_outputs.attentions] if v is not None
            )

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPVisionModel(AltCLIPPreTrainedModel):
    config: AltCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)
        self.vision_model = AltCLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """
)
class AltRobertaModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = AltRobertaEmbeddings(config)
        self.encoder = AltRobertaEncoder(config)

        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.roberta.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.roberta.embeddings.word_embeddings = value

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        return super().resize_token_embeddings(new_num_tokens)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndProjection]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # last module outputs
        sequence_output = outputs[0]

        # project every module
        sequence_output = self.pre_LN(sequence_output)

        # pooler
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        if not return_dict:
            return (projection_state, pooler_output) + outputs[2:4]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)

        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        if not isinstance(config.text_config, AltCLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, AltCLIPOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.T

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor

    Returns: torch.Tensor
    """
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"]
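
# Illustrative sketch (not exported, not part of the original module): an end-to-end
# zero-shot classification run with the classes defined above. It mirrors the doctest
# examples in the docstrings; downloading the "BAAI/AltCLIP" checkpoint and network
# access are assumed.
def _example_zero_shot_classification():
    import requests
    from PIL import Image

    from transformers import AltCLIPModel, AutoProcessor

    model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
    processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
    )

    with torch.no_grad():
        outputs = model(**inputs)
    # image-text similarity scores turned into probabilities over the candidate texts
    probs = outputs.logits_per_image.softmax(dim=1)
    return probs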