
    rh                     6   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZmZmZ ddlmZmZmZm Z  ddl!m"Z"m#Z#m$Z$  e jJ                  e&      Z'e ed       G d de                    Z(e ed       G d de                    Z)ee G d de                    Z*de	jV                  de	jV                  fdZ,de	jV                  de	jV                  fdZ-de$de.fdZ/dYd ee.e0f   d!e1fd"Z2 G d# d$ejf                        Z4 G d% d&ejj                        Z6 G d' d(ejf                        Z7 G d) d*ejf                        Z8 G d+ d,ejf                        Z9 G d- d.ejf                        Z: G d/ d0ejf                        Z; G d1 d2ejf                        Z< G d3 d4ejf                        Z=	 	 dZd5ejf                  d6e	jV                  d7e	jV                  d8e	jV                  d9ee	jV                     d:e>d;e>d<ee	jV                     fd=Z? G d> d?ejf                        Z@ G d@ dAejf                        ZA G dB dCejf                        ZB G dD dEejf                        ZC G dF dGejf                        ZD G dH dIe      ZE G dJ dKejf                        ZF G dL dMejf                        ZGe G dN dOe             ZH edP       G dQ dReH             ZI edS       G dT dUeH             ZJe G dV dWeH             ZKg dXZLy)[zPyTorch ALIGN model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplelogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   y)AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r    r   torchFloatTensor__annotations__r!   r"   tuple     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/align/modeling_align.pyr   r   *   sN    
 15L(5,,-459x 1 1298<M8E%"3"345<r,   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr!   r"   
attentions)r#   r$   r%   r&   r0   r   r'   r(   r)   r!   r"   r*   r1   r+   r,   r-   r/   r/   ;   sh    
 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r,   r/   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr0   r    text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r7   r8   N)getattrto_tuple).0kselfs     r-   	<genexpr>z'AlignOutput.to_tuple.<locals>.<genexpr>l   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)r*   keysr@   s   `r-   r=   zAlignOutput.to_tuplek   s#     
YY[
 
 	
r,   )r#   r$   r%   r&   r4   r   r'   r(   r)   r5   r6   r0   r    r7   r   r8   r   r*   r   r=   r+   r,   r-   r3   r3   M   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-44818DHAH
%* 
r,   r3   logitsr9   c                     t         j                  j                  | t        j                  t        |       | j                        d      S )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr'   arangelenrG   )rD   s    r-   contrastive_lossrM   t   s5    ==&&vu||CKPVP]P]/^ps&ttr,   
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)rM   t)rN   caption_loss
image_losss      r-   
align_lossrS   x   s,    #J/L!*,,.1J:%,,r,   confignum_channelsc                     | j                   }|| j                  z  }t        |t        ||dz  z         |z  |z        }|d|z  k  r||z  }t        |      S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rT   rU   divisornew_dims       r-   round_filtersr^      sf     ""GF,,,L'3|gk9:gEOPG |##7w<r,   kernel_sizeadjustc                     t        | t              r| | f} | d   dz  | d   dz  f}|r|d   dz
  |d   |d   dz
  |d   fS |d   |d   |d   |d   fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rW   r   )
isinstancer[   )r_   r`   corrects      r-   correct_padrd      s}     +s#"K01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r,   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rT   c                    t         |           t        |d      | _        t	        j
                  d      | _        t	        j                  |j                  | j                  dddd      | _	        t	        j                  | j                  |j                  |j                  	      | _        t        |j                     | _        y )
N    )r   r   r   r   paddingr	   rW   validFr_   striderj   bias)epsmomentum)super__init__r^   out_dimr   	ZeroPad2drj   Conv2drU   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationr@   rT   	__class__s     r-   rr   zAlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r,   pixel_valuesr9   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)rj   rv   rz   r|   )r@   r   featuress      r-   forwardzAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r,   )
r#   r$   r%   r&   r   rr   r'   Tensorr   __classcell__r~   s   @r-   rf   rf      s0    	40 	4ELL U\\ r,   rf   c                   .     e Zd Z	 	 	 	 	 	 	 d fd	Z xZS )AlignVisionDepthwiseConv2dc	                 @    ||z  }	t         
|   ||	|||||||	       y )N)	in_channelsout_channelsr_   rm   rj   dilationgroupsrn   padding_mode)rq   rr   )r@   r   depth_multiplierr_   rm   rj   r   rn   r   r   r~   s             r-   rr   z#AlignVisionDepthwiseConv2d.__init__   s=     #%55#%#% 	 
	
r,   )r   r	   r   r   r   Tzeros)r#   r$   r%   rr   r   r   s   @r-   r   r      s$     
 
r,   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z
 xZS )
AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rT   in_dimrs   rm   c                     t         |           t        j                  ||ddd      | _        t        j
                  ||j                        | _        t        |j                     | _
        y )Nr   sameFr   r   r_   rj   rn   )num_featuresro   )rq   rr   r   ru   expand_convrw   rx   	expand_bnr
   r{   
expand_act)r@   rT   r   rs   rm   r~   s        r-   rr   z"AlignVisionExpansionLayer.__init__   sZ    99 
 W&BWBWX !2!23r,   r"   r9   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r@   r"   s     r-   r   z!AlignVisionExpansionLayer.forward   s4    ((7}56r,   )r#   r$   r%   r&   r   r[   rr   r'   r(   r   r   r   r   s   @r-   r   r      sH    
40 
4# 
4 
4UX 
4U%6%6 5<< r,   r   c            
       p     e Zd ZdZdededededef
 fdZdej                  d	ej                  fd
Z xZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rT   r   rm   r_   adjust_paddingc                 b   t         |           || _        | j                  dk(  rdnd}t        ||      }t	        j
                  |      | _        t        ||||d      | _        t	        j                  ||j                  |j                        | _        t        |j                     | _        y )	NrW   rk   r   )r`   ri   Frl   r   ro   rp   )rq   rr   rm   rd   r   rt   depthwise_conv_padr   depthwise_convrw   rx   ry   depthwise_normr
   r{   depthwise_act)	r@   rT   r   rm   r_   r   conv_padrj   r~   s	           r-   rr   z"AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7Fk.A"$,,w"?8FHSX
 !nnV%:%:VE_E_
 $F$5$56r,   r"   r9   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S )NrW   )rm   r   r   r   r   r   s     r-   r   z!AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r,   r#   r$   r%   r&   r   r[   boolrr   r'   r(   r   r   r   r   s   @r-   r   r      sZ    7!7 7 	7
 7 7,	U%6%6 	5<< 	r,   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    rT   r   
expand_dimexpandc                    t         |           |r|n|| _        t        dt	        ||j
                  z              | _        t        j                  d      | _	        t        j                  | j                  | j                  dd      | _        t        j                  | j                  | j                  dd      | _        t        |j                     | _        t        j                          | _        y )Nr   )output_sizer   )r   r   r_   rj   )rq   rr   dimrZ   r[   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezeru   reducer   r
   r{   
act_reduceSigmoid
act_expand)r@   rT   r   r   r   r~   s        r-   rr   z&AlignVisionSqueezeExciteLayer.__init__!  s    !':V!S&*H*H!HIJ++:ii	
 ii	
 !!2!23**,r,   r"   r9   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }t        j                  ||      }|S r   )r   r   r   r   r   r'   mul)r@   r"   inputss      r-   r   z%AlignVisionSqueezeExciteLayer.forward6  sc    ]3M26M26		&-8r,   )Fr   r   s   @r-   r   r     sH    '0 '# '3 'X\ '*
U%6%6 
5<< 
r,   r   c                        e Zd ZdZdedededededef fdZd	e	j                  d
e	j                  de	j                  fdZ xZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rT   r   rs   rm   	drop_rateid_skipc                    t         |           |dk(  xr | | _        t        j                  ||ddd      | _        t        j                  ||j                  |j                        | _	        t        j                  |      | _        y )Nr   r   Fr   r   )p)rq   rr   apply_dropoutr   ru   project_convrw   rx   ry   
project_bnDropoutdropout)r@   rT   r   rs   rm   r   r   r~   s          r-   rr   z#AlignVisionFinalBlockLayer.__init__H  sz     	#q[8[II 
 .. f&;&;fF`F`
 zzI.r,   
embeddingsr"   r9   c                     | j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S r   )r   r   r   r   )r@   r   r"   s      r-   r   z"AlignVisionFinalBlockLayer.forwardY  sG    ))-86 LL7M)J6Mr,   r#   r$   r%   r&   r   r[   floatr   rr   r'   r(   r   r   r   r   s   @r-   r   r   C  sj    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf r,   r   c                        e Zd ZdZdededededededed	ed
ef fdZde	j                  de	j                  fdZ xZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rT   r   rs   rm   expand_ratior_   r   r   r   c
                 l   t         |           || _        | j                  dk7  | _        ||z  }
| j                  rt	        |||
|      | _        t        || j                  r|
n||||	      | _        t        |||
| j                        | _	        t        || j                  r|
n|||||      | _        y )Nr   )rT   r   rs   rm   )rT   r   rm   r_   r   )rT   r   r   r   )rT   r   rs   rm   r   r   )rq   rr   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r@   rT   r   rs   rm   r   r_   r   r   r   expand_in_dimr~   s              r-   rr   zAlignVisionBlock.__init__  s     	(''1,-;;6fmFDN 8$(KK=V#)
 <&]4;;
 5$(KK=V
r,   r"   r9   c                     |}| j                   dk7  r| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }|S Nr   )r   r   r   r   r   )r@   r"   r   s      r-   r   zAlignVisionBlock.forward  s[    "
! NN=9M++M: ++M:
MBr,   r   r   s   @r-   r   r   d  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
r,   r   c            	       f     e Zd ZdZdef fdZ	 	 d	dej                  dee	   dee	   de
fdZ xZS )
AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rT   c                     t                    |j                   _         fdt        |j                        }t        fd|j                  D              }d}g }t        |      D ]  }t        ||j                  |         }t        ||j                  |         }|j                  |   }	|j                  |   }
|j                  |   }t         |j                  |               D ]c  }|dk(  }|dkD  rdn|	}	|dkD  r|n|}||j                  v}|j                  |z  |z  }t        ||||	|
||||	      }|j!                  |       |dz  }e  t#        j$                  |       _        y )Nc                 Z    t        t        j                  j                  | z              S r   )r[   mathceildepth_coefficient)repeatsr@   s    r-   round_repeatsz2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr,   c              3   .   K   | ]  } |        y wr   r+   )r>   nr   s     r-   rA   z.AlignVisionEncoder.__init__.<locals>.<genexpr>  s     Laq)Ls   r   r   )	rT   r   rs   rm   r_   r   r   r   r   )rq   rr   r   rL   r   sumnum_block_repeatsranger^   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r@   rT   num_base_blocks
num_blockscurr_block_numr   ir   rs   rm   r_   r   jr   r   r   blockr   r~   s   `                @r-   rr   zAlignVisionEncoder.__init__  s   !'!9!9	D f001L63K3KLL
' 	$A"66+=+=a+@AF#FF,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEF $q&!e$%Ev!/v7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#'$	$8 mmF+r,   r"   output_hidden_statesreturn_dictr9   c                     |r|fnd }| j                   D ]  } ||      }|s||fz  } |st        d ||fD              S t        ||      S )Nc              3   &   K   | ]	  }||  y wr   r+   )r>   vs     r-   rA   z-AlignVisionEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )r!   r"   )r   r*   r   )r@   r"   r   r   all_hidden_statesr   s         r-   r   zAlignVisionEncoder.forward  so     1E],$[[ 	6E!-0M#!m%55!	6
 X]4E$FXXX-++
 	
r,   )FT)r#   r$   r%   r&   r   rr   r'   r(   r   r   r   r   r   r   s   @r-   r   r     sW    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
r,   r   c                        e Zd ZdZ fdZ	 	 	 	 d	deej                     deej                     deej                     deej                     dej                  f
dZ
 xZS )
AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       y )N)padding_idxro   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_ids)dtype)rq   rr   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r<   r   register_bufferr'   rK   r   r   r   sizelongr}   s     r-   rr   zAlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r,   	input_idsr  r   inputs_embedsr9   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr   r   r  r   r  rG   r   )r  r   hasattrr  r   r'   r   r  rG   r  r  r   r	  r  r   )r@   r  r  r   r  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  r   r	  s               r-   r   zAlignTextEmbeddings.forward  s?     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r,   )NNNN)r#   r$   r%   r&   rr   r   r'   
LongTensorr(   r   r   r   r   s   @r-   r   r     s~    Q
* 15593759&E,,-& !!1!12& u//0	&
   1 12& 
&r,   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                 .   t        j                  ||j                  dd            |z  }	|#|d d d d d d d |j                  d   f   }
|	|
z   }	t        j
                  j                  |	dt         j                        j                  |j                        }	t        j
                  j                  |	|| j                        }	||	|j                  dddd      z  }	t        j                  |	|      }|j                  dd      j                         }||	fS )NrW   r	   r   )r   r  )r   trainingr   )r'   matmul	transposeshaper   rI   softmaxfloat32tor  r   r%  view
contiguous)r  r  r  r  r   r!  r   r"  kwargsattn_weightscausal_maskattn_outputs               r-   eager_attention_forwardr2  >  s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#innQAq&AA,,|U3K''1-88:K$$r,   c                        e Zd Z fdZ	 	 	 ddej
                  deej                     deej                     dee   de	ej
                     f
dZ
 xZS )	AlignTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rq   rr   r  num_attention_headsr  
ValueErrorrT   r[   attention_head_sizeall_head_sizer   Linearr  r  r  r   attention_probs_dropout_probr   attention_dropoutr!  r}   s     r-   rr   zAlignTextSelfAttention.__init__Z  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r,   r"   r   r"  output_attentionsr9   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|f| j                  sdn| j                  | j                  |d|\  }} |j                  g |d j                         }|r||f}|S |f}|S )Nr   r   rW   eager        )r   r!  r"  )r(  r:  r  r,  r'  r  r  r2  rT   _attn_implementationr   r%  r>  r!  reshaper-  )r@   r"   r   r"  r?  r.  r  hidden_shapequery_states
key_statesvalue_statesattention_interfacer1  r/  outputss                  r-   r   zAlignTextSelfAttention.forwardo  sa    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
!\ *k));;;;FFH1B;- JUr,   NNF)r#   r$   r%   rr   r'   r   r   r(   r   r*   r   r   r   s   @r-   r4  r4  Y  so    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	!r,   r4  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )rq   rr   r   r<  r  denser  r  r   r  r   r}   s     r-   rr   zAlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r,   r"   input_tensorr9   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rP  r   r  r@   r"   rQ  s      r-   r   zAlignTextSelfOutput.forward  7    

=1]3}|'CDr,   r#   r$   r%   rr   r'   r   r   r   r   s   @r-   rM  rM    1    >U\\  RWR^R^ r,   rM  c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
AlignTextAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rq   rr   r4  r@   rM  outputsetpruned_headsr}   s     r-   rr   zAlignTextAttention.__init__  s0    *62	)&1Er,   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   )r   )rL   r   r@   r8  r:  r]  r   r  r  r  r[  rP  r;  union)r@   headsindexs      r-   prune_headszAlignTextAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r,   r"   r   r"  r?  r9   c                 p     | j                   |f|||d|}| j                  |d   |      }|f|dd  z   }|S N)r   r"  r?  r   r   )r@   r[  )	r@   r"   r   r"  r?  r.  self_outputsattention_outputrJ  s	            r-   r   zAlignTextAttention.forward  s_     !tyy
)/	

 
  ;;|AF#%QR(88r,   rK  )r#   r$   r%   rr   rb  r'   r   r   r(   r   r*   r   r   r   s   @r-   rY  rY    st    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	r,   rY  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rq   rr   r   r<  r  intermediate_sizerP  rb   r{   strr
   intermediate_act_fnr}   s     r-   rr   zAlignTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r,   r"   r9   c                 J    | j                  |      }| j                  |      }|S r   )rP  rl  r   s     r-   r   zAlignTextIntermediate.forward  s&    

=100?r,   rV  r   s   @r-   rh  rh    s#    9U\\ ell r,   rh  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y rO  )rq   rr   r   r<  rj  r  rP  r  r  r   r  r   r}   s     r-   rr   zAlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r,   r"   rQ  r9   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rS  rT  s      r-   r   zAlignTextOutput.forward  rU  r,   rV  r   s   @r-   ro  ro    rW  r,   ro  c                        e Zd Z fdZ	 	 	 d	dej
                  deej                     deej                     dee   de	ej
                     f
dZ
d Z xZS )
AlignTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y r   )
rq   rr   chunk_size_feed_forwardseq_len_dimrY  	attentionrh  intermediatero  r[  r}   s     r-   rr   zAlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r,   r"   r   r"  r?  r9   c                      | j                   |f|||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }	|	f|z   }|S rd  )rw  r   feed_forward_chunkru  rv  )
r@   r"   r   r"  r?  r.  self_attention_outputsrf  rJ  layer_outputs
             r-   r   zAlignTextLayer.forward  s     "0"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r,   c                 L    | j                  |      }| j                  ||      }|S r   )rx  r[  )r@   rf  intermediate_outputr|  s       r-   rz  z!AlignTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr,   rK  )r#   r$   r%   rr   r'   r   r   r(   r   r*   r   rz  r   r   s   @r-   rs  rs    st    . 7;15,1|| !!2!23 E--.	
 $D> 
u||	2r,   rs  c                        e Zd Z fdZe	 	 	 	 	 d
dej                  deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )AlignTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rq   rr   rT   r   r   r   num_hidden_layersrs  layergradient_checkpointing)r@   rT   r   r~   s      r-   rr   zAlignTextEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#r"   r   r"  r?  r   r   r9   c           	          |rdnd }|rdnd }	t        | j                        D ]4  \  }
}|r||fz   }|||
   nd } |d||||d|}|d   }|s,|	|d   fz   }	6 |r||fz   }t        |||	      S )Nr+   )r"   r   r"  r?  r   r   )r!   r"   r1   )	enumerater  r   )r@   r"   r   r"  r?  r   r   r.  r   all_self_attentionsr   layer_modulelayer_head_masklayer_outputss                 r-   r   zAlignTextEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO( +-)"3	
 M *!,M &9]1=M<O&O#!	P$   1]4D D++*
 	
r,   )NNFFT)r#   r$   r%   rr   r   r'   r   r   r(   r   r   r*   r   r   r   r   s   @r-   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r,   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rq   rr   r   r<  r  rP  Tanhr|   r}   s     r-   rr   zAlignTextPooler.__init__I  s9    YYv1163E3EF
'')r,   r"   r9   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )rP  r|   )r@   r"   first_token_tensorpooled_outputs       r-   r   zAlignTextPooler.forwardN  s6     +1a40

#566r,   rV  r   s   @r-   r  r  H  s#    $
U\\ ell r,   r  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)AlignPreTrainedModelrT   alignTr  c                 r   | j                   j                  }t        |t        j                  t        j
                  f      rZ|j                  j                  j                  d|       |j                  O|j                  j                  j                          n)t        |t              rt        j                  j                  |j                  j                         |j                  j                  j                  j                          |j                  j                  j!                  | j                   j"                         n~t        |t        j$                        rd|j                  j                  j                  d|       |j&                  1|j                  j                  |j&                     j                          t        |t        j(                  t        j*                  f      rJ|j                  j                  j                          |j                  j                  j!                  d       yy)zInitialize the weightsrB  )meanstdNg      ?)rT   initializer_rangerb   r   r<  ru   weightdatanormal_rn   zero_
AlignModelinitxavier_uniform_text_projectiontemperaturefill_temperature_init_valuer  r   r  rw   )r@   r  r  s      r-   _init_weightsz"AlignPreTrainedModel._init_weights]  sq   kk++fryy"))45MM&&CS&9{{&  &&(
+GG##F$:$:$A$AB""'',,224##))$++*L*LM-MM&&CS&9!!-""6#5#56<<>fr||R^^<=KK""$MM$$S) >r,   N)
r#   r$   r%   r   r)   base_model_prefixsupports_gradient_checkpointingr   Moduler  r+   r,   r-   r  r  W  s$    &*#*BII *r,   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   X    e Zd ZU eed<   dgZddedef fdZd Zd Z	e
e	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deeef   fd              Z xZS )AlignTextModelrT   r   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rq   rr   rT   r   r   r  encoderr  pooler	post_init)r@   rT   r  r~   s      r-   rr   zAlignTextModel.__init__z  sM    
 	 -f5'/1Bof- 	r,   c                 .    | j                   j                  S r   r   r  rC   s    r-   get_input_embeddingsz#AlignTextModel.get_input_embeddings  s    ...r,   c                 &    || j                   _        y r   r  )r@   r  s     r-   set_input_embeddingsz#AlignTextModel.set_input_embeddings  s    */'r,   r  r   r  r   r"  r  r?  r   r   r9   c
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }n!||j                         dd }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  ddd|f   }|j                  ||      }|}n&t        j                  |t        j                  |      }| j!                  ||      }| j#                  || j                   j$                        }| j                  ||||      } | j&                  |f||||d	d
|
}|d   }| j(                  | j)                  |      nd}t+        |||j,                  |j.                        S )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrF   r  r  )r  r   r  r  T)r   r"  r?  r   r   r   )r!   pooler_outputr"   r1   )rT   r?  r   use_return_dictr9  %warn_if_padding_and_no_attention_maskr  rG   r'   onesr  r   r  r   r   r  get_extended_attention_maskget_head_maskr  r  r  r   r"   r1   )r@   r  r   r  r   r"  r  r?  r   r   r.  r  
batch_sizer  rG   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                         r-   r   zAlignTextModel.forward  s%   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 '$,,
2/!5
 
 *!,8<8OO4UY)-')77&11	
 	
r,   T	NNNNNNNNN)r#   r$   r%   r   r)   _no_split_modulesr   rr   r  r  r   r   r   r'   r   r(   r   r*   r   r   r   r   s   @r-   r  r  q  s'    ./ 4  /0  -11515/31504,0/3&*\
ELL)\
 !.\
 !.	\

 u||,\
 E--.\
  -\
 $D>\
 'tn\
 d^\
 
u00	1\
  \
r,   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 	 d
deej                     dee   dee   deeef   fd	              Z xZS )AlignVisionModelrT   r   Fc                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  r't        j                  |j                  d      | _        nN|j                  dk(  r't        j                  |j                  d      | _        nt        d|j                         | j                          y )Nr  T)	ceil_moderZ   z2config.pooling must be one of ['mean', 'max'] got )rq   rr   rT   rf   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr9  poolingr  r}   s     r-   rr   zAlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r,   r9   c                 B    | j                   j                  j                  S r   )vision_modelr   rv   rC   s    r-   r  z%AlignVisionModel.get_input_embeddings  s      ++777r,   r   r   c                 f   ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  ||d      }|d   }| j                  |      }|j                  |j                  dd       }t        |||j                        S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesT)r   r   r   rW   )r!   r  r"   )rT   r   r  r9  r   r  r  rD  r(  r   r"   )r@   r   r   r   r  r  r!   r  s           r-   r   zAlignVisionModel.forward  s    : %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5 ' 
 ,A.$56%--m.A.A"1.EF7/')77
 	
r,   NNN)r#   r$   r%   r   r)   main_input_namer  rr   r   r  r  r   r   r   r'   r(   r   r   r*   r   r   r   r   s   @r-   r  r    s     $O&+#0 "8bii 8  59/3&*	2
u0012
 'tn2
 d^	2

 
u>>	?2
  2
r,   r  c                       e Zd ZU eed<   def fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	ee
   d
ee
   dee
   dej                  fd       Ze	 	 	 ddeej                     d
ee
   dee
   dej                  fd       Zee	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     dee
   d	ee
   d
ee
   dee
   deeef   fd              Z xZS )r  rT   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        t        |      | _        t        |      | _        t!        j"                  | j                  | j                        | _        t!        j&                  t)        j*                  | j,                  j.                              | _        | j3                          y )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rq   rr   rb   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  r  r   r<  r  	Parameterr'   tensorrT   r  r  r  )r@   rT   r  r  r~   s       r-   rr   zAlignModel.__init__J  s#    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r,   r  r   r  r   r"  r  r?  r   r   r9   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||||||		      }
|
d   dddddf   }| j                  |      }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```N	r  r   r  r   r"  r  r?  r   r   r   )rT   r?  r   r  r  r  )r@   r  r   r  r   r"  r  r?  r   r   text_outputsr!   text_featuress                r-   get_text_featureszAlignModel.get_text_featuresh  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%'/!5# ' 

 )OAq!G4,,->?r,   r   c                     ||n| j                   j                  }||n| j                   j                  }| j                  |||      }|d   }|S )a9  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```r   r   r   r   )rT   r   r  r  )r@   r   r   r   vision_outputsimage_featuress         r-   get_image_featureszAlignModel.get_image_features  sf    > %9$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 (*r,   return_lossc                 d   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }| j	                  ||
d      }| j                  |||||||	|
d	      }|d   }|d   dddddf   }| j                  |      }||j                  ddd	      z  }||j                  ddd	      z  }t        j                  ||j                               | j                  z  }|j                         }d}|rt        |      }t        |||||||
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NTr  r  r   r   rW   r   )r   r   keepdim)r4   r5   r6   r0   r    r7   r8   )rT   r?  r   r  r  r  r  normr'   r&  rP   r  rS   r3   )r@   r  r   r   r  r   r"  r  r  r?  r   r   r  r  r    r0   r6   r5   r4   s                      r-   r   zAlignModel.forward  sr   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5 + 
 ))%'/!5 ' 

 &a("1oaAg.**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4DEHXHXX*,,.o.D-+#%* .
 	
r,   r  r  )NNNNNNNNNNN)r#   r$   r%   r   r)   rr   r   r   r'   r   r   r(   r  r  r   r  r   r*   r3   r   r   r   s   @r-   r  r  F  sp   { <  -11515/3,004,0/3&*2ELL)2 !.2 !.	2
 u||,2 ELL)2  -2 $D>2 'tn2 d^2 
		2 2h  59/3&*	*u001* 'tn* d^	*
 
		* *X  15481515/3,004&*,0/3&*X
E,,-X
 u001X
 !.	X

 !.X
 u||,X
 ELL)X
  -X
 d^X
 $D>X
 'tnX
 d^X
 
uk!	"X
  X
r,   r  )r  r  r  r  r  )rB  N)Mr&   r   dataclassesr   typingr   r   r   r   r'   torch.utils.checkpointr   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_alignr   r   r   
get_loggerr#   loggerr   r/   r3   r   rM   rS   r[   r^   r*   r   rd   r  rf   ru   r   r   r   r   r   r   r   r   r   r2  r4  rM  rY  rh  ro  rs  r  r  r  r  r  r  __all__r+   r,   r-   <module>r     sr     ! 1 1    ! 9  G l l K K P P 
		H	% 
=[ = = 
	:; 	: 	:  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
T<")) <L (,%II%<<% 
% <<	%
 U\\*% % % %%67RYY 7v")) * *\BII  bii %/ %P.
ryy .
dbii  *? * *2 
x
) x

x
v 
M
+ M

M
` ]
% ]
 ]
@ Wr,   