
"""PyTorch CLIP model."""

from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor
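

# Illustration only (kept as a comment so nothing runs at import time): the helpers above implement
# the symmetric InfoNCE objective used by CLIP. The variable names below are hypothetical; the real
# entry point is `CLIPModel.forward(..., return_loss=True)` further down in this file.
#
#     logits_per_text = torch.randn(4, 4)                 # similarity of 4 captions vs. 4 images
#     caption_loss = contrastive_loss(logits_per_text)    # captions -> images direction
#     image_loss = contrastive_loss(logits_per_text.t())  # images -> captions direction
#     loss = clip_loss(logits_per_text)                   # == (caption_loss + image_loss) / 2.0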


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class CLIPVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class CLIPTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring
class CLIPOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class CLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model supports dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    output_attentions: bool = True,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights


class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[CLIPTextConfig, CLIPVisionConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)

        # CLIP's text model uses both a causal mask and a padding mask; when the flash-attention kernel
        # is used, `is_causal` is inferred from `causal_attention_mask` instead of merging the masks.
        if self.config._attn_implementation == "flash_attention_2":
            self.is_causal = causal_attention_mask is not None
        else:
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            output_attentions=output_attentions,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class CLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Union[CLIPTextConfig, CLIPVisionConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class CLIPPreTrainedModel(PreTrainedModel):
    config: CLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPVisionModelWithProjection):
            nn.init.normal_(
                module.visual_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPTextModelWithProjection):
            nn.init.normal_(
                module.text_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: Union[CLIPTextConfig, CLIPVisionConfig]):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class CLIPTextTransformer(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

        # The attention-mask handling differs between `flash_attention_2` and the other implementations
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # CLIP's text model uses a causal mask; prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )

        # expand attention_mask: [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None and not self._use_flash_attention_2:
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # Legacy behaviour: take features from the end-of-text embedding, assumed to be the token with
            # the highest id in each sequence (configs created before the `eos_token_id` fix).
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # Pool at the first occurrence of the configured `eos_token_id`, which also works when extra
            # tokens have been added to the vocabulary.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from CLIP without any head or projection on top.
    """
)
class CLIPTextModel(CLIPPreTrainedModel):
    config: CLIPTextConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class CLIPVisionTransformer(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The vision model from CLIP without any head or projection on top.
    """
)
class CLIPVisionModel(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> BaseModelOutputWithPooling:
        r"""
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )


@auto_docstring
class CLIPModel(CLIPPreTrainedModel):
    config: CLIPConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        text_model = CLIPTextModel._from_config(text_config)
        self.text_model = text_model.text_model

        vision_model = CLIPVisionModel._from_config(vision_config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> CLIPOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
        logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device)

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        return CLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring
class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    config: CLIPTextConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)

        text_model = CLIPTextModel._from_config(config)
        self.text_model = text_model.text_model

        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> CLIPTextModelOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output
        text_embeds = self.text_projection(pooled_output)

        return CLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@auto_docstring
class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)

        vision_model = CLIPVisionModel._from_config(config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> CLIPVisionModelOutput:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        pooled_output = vision_outputs.pooler_output
        image_embeds = self.visual_projection(pooled_output)

        return CLIPVisionModelOutput(
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class CLIPForImageClassification(CLIPPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        vision_model = CLIPVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens (all positions except the CLS token)
        sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "CLIPModel",
    "CLIPPreTrainedModel",
    "CLIPTextModelWithProjection",
    "CLIPTextModel",
    "CLIPVisionModel",
    "CLIPVisionModelWithProjection",
    "CLIPForImageClassification",
]