
"""PyTorch ViLT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_vilt import ViltConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`ViltForImagesAndTextClassification`].
    """
)
class ViltForImagesAndTextClassificationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
        the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
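    attentions (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        List of tuples of `torch.FloatTensor` (one for each image-text pair) containing the attention weights of
        each layer, of shape `(batch_size, num_heads, sequence_length, sequence_length)`.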
    Nlosslogitshidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   listtupler        y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/vilt/modeling_vilt.pyr   r   .   sq    	 )-D(5$$
%,*.FHU&&'.>BM8Du'8'8!9:;B;?JeE$5$5678?r*   r   c                   4     e Zd ZdZ fdZddZ	 ddZ xZS )ViltEmbeddingsz
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
    """

    def __init__(self, config):
        super().__init__()

        # text embeddings
        self.text_embeddings = TextEmbeddings(config)
        # patch embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = ViltPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # modality type (text/patch) embeddings
        self.token_type_embeddings = nn.Embedding(config.modality_type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        _, _, ph, pw = self.patch_embeddings.projection.weight.shape

        x = self.patch_embeddings(pixel_values)
        x_mask = pixel_mask[:, None, :, :].float()
        x_mask = nn.functional.interpolate(x_mask, size=(x.shape[2], x.shape[3])).long()
        x_h = x_mask[:, 0].sum(dim=1)[:, 0]
        x_w = x_mask[:, 0].sum(dim=2)[:, 0]

        batch_size, num_channels, height, width = x.shape
        patch_dim = self.config.image_size // self.config.patch_size
        spatial_pos = self.position_embeddings[:, 1:, :].transpose(1, 2).view(1, num_channels, patch_dim, patch_dim)
        pos_embed = torch.cat(
            [
                nn.functional.pad(
                    nn.functional.interpolate(
                        spatial_pos,
                        size=(h, w),
                        mode="bilinear",
                        align_corners=True,
                    ),
                    (0, width - w, 0, height - h),
                )
                for h, w in zip(x_h, x_w)
            ],
            dim=0,
        )

        pos_embed = pos_embed.flatten(2).transpose(1, 2)
        x = x.flatten(2).transpose(1, 2)
        # set the device here, otherwise `patch_index` stays on CPU and indexing below fails for newer torch versions
        patch_index = torch.stack(
            meshgrid(torch.arange(x_mask.shape[-2]), torch.arange(x_mask.shape[-1]), indexing="ij"), dim=-1
        ).to(device=x_mask.device)
        patch_index = patch_index[None, None, :, :, :]
        patch_index = patch_index.expand(x_mask.shape[0], x_mask.shape[1], -1, -1, -1)
        patch_index = patch_index.flatten(1, 3)
        x_mask = x_mask.flatten(1)

        if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
            # fall back to the maximum effective resolution of the batch
            effective_resolution = x_h * x_w
            max_image_length = effective_resolution.max()
        else:
            effective_resolution = x_h * x_w
            max_image_length = min(effective_resolution.max(), max_image_length)

        valid_idx = x_mask.nonzero(as_tuple=False)
        non_valid_idx = (1 - x_mask).nonzero(as_tuple=False)
        unique_rows = valid_idx[:, 0].unique()
        valid_row_idx = [valid_idx[valid_idx[:, 0] == u] for u in unique_rows]
        non_valid_row_idx = [non_valid_idx[non_valid_idx[:, 0] == u] for u in unique_rows]

        valid_nums = [v.size(0) for v in valid_row_idx]
        non_valid_nums = [v.size(0) for v in non_valid_row_idx]
        pad_nums = [max_image_length - v for v in valid_nums]

        select = []
        for i, (v, nv, p) in enumerate(zip(valid_nums, non_valid_nums, pad_nums)):
            if p <= 0:
                # more valid patches than needed: subsample
                valid_choice = torch.multinomial(torch.ones(v).float(), max_image_length)
                select.append(valid_row_idx[i][valid_choice])
            else:
                # not enough valid patches: pad with (repeated) non-valid positions
                pad_choice = torch.multinomial(torch.ones(nv).float(), p, replacement=True)
                select.append(torch.cat([valid_row_idx[i], non_valid_row_idx[i][pad_choice]], dim=0))

        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        pos_embed = torch.cat(
            (self.position_embeddings[:, 0, :][None, None, :].expand(batch_size, -1, -1), pos_embed), dim=1
        )
        x = x + pos_embed
        x = self.dropout(x)

        x_mask = torch.cat([torch.ones(x_mask.shape[0], 1).to(x_mask), x_mask], dim=1)

        return x, x_mask, (patch_index, (height, width))

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        pixel_values,
        pixel_mask,
        inputs_embeds,
        image_embeds,
        image_token_type_idx=1,
    ):
        # PART 1: text embeddings
        text_embeds = self.text_embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # PART 2: patch embeddings (with interpolated position encodings)
        if image_embeds is None:
            image_embeds, image_masks, patch_index = self.visual_embed(
                pixel_values, pixel_mask, max_image_length=self.config.max_image_length
            )
        else:
            image_masks = pixel_mask.flatten(1)

        # PART 3: add modality type embeddings
        # 0 indicates text, 1 indicates image, 2 is optionally used when a second image is provided (NLVR2)
        if image_token_type_idx is None:
            image_token_type_idx = 1
        text_embeds = text_embeds + self.token_type_embeddings(
            torch.zeros_like(attention_mask, dtype=torch.long, device=text_embeds.device)
        )
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx, dtype=torch.long, device=text_embeds.device)
        )

        # PART 4: concatenate
        embeddings = torch.cat([text_embeds, image_embeds], dim=1)
        masks = torch.cat([attention_mask, image_masks], dim=1)

        return embeddings, masks


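# Shape walk-through (illustrative example, with made-up sizes): for a hypothetical batch of 2 captions of
# 11 tokens and 2 images whose patch grid is subsampled to `max_image_length=199` positions,
# `ViltEmbeddings.visual_embed` returns image embeddings of shape (2, 200, hidden_size) (a CLS token is
# prepended), and `ViltEmbeddings.forward` then adds the modality-type embeddings and concatenates the two
# streams along the sequence axis, yielding `embeddings` of shape (2, 211, hidden_size) and `masks` of
# shape (2, 211).

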
class TextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class ViltPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
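    Pixel values of shape `(batch_size, num_channels, height, width)` are projected by a convolution whose kernel
    size and stride equal `patch_size`, giving a `(batch_size, hidden_size, height // patch_size, width //
    patch_size)` feature map that `ViltEmbeddings.visual_embed` flattens into a sequence of patch embeddings.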
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        target_dtype = self.projection.weight.dtype
        x = self.projection(pixel_values.to(dtype=target_dtype))
        return x


class ViltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads "
                f"{config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in ViltModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViltSelfOutput(nn.Module):
    """
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rB   returnNc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	r0   r1   r   r   r6   denser?   r@   rA   r   s     r+   r1   zViltSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r*   r   input_tensorc                 J    | j                  |      }| j                  |      }|S r   r   rA   rC   r   r   s      r+   r   zViltSelfOutput.forward  s$    

=1]3r*   )
r    r!   r"   r#   r   r1   r$   Tensorr   r   r   s   @r+   r   r   {  sD    
>z >d >
U\\  RWR^R^ r*   r   c                   ,     e Zd Z fdZd ZddZ xZS )ViltAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )r0   r1   r   	attentionr   outputsetpruned_headsr   s     r+   r1   zViltAttention.__init__  s0    *62$V,Er*   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rH   )lenr   r  r   r   r  r   r   r   r   r  r   r   union)rC   headsindexs      r+   prune_headszViltAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r*   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r  r  )rC   r   r   r   r   self_outputsattention_outputr   s           r+   r   zViltAttention.forward  sE    ~~m^YPab;;|AF#%QR(88r*   r   )r    r!   r"   r1   r  r   r   r   s   @r+   r  r    s    ";$r*   r  c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )ViltIntermediaterB   r   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r0   r1   r   r   r6   intermediate_sizer   ri   
hidden_actstrr	   intermediate_act_fnr   s     r+   r1   zViltIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r*   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r  rC   r   s     r+   r   zViltIntermediate.forward  s&    

=100?r*   	r    r!   r"   r   r1   r$   r   r   r   r   s   @r+   r  r    s1    9z 9d 9U\\ ell r*   r  c                   x     e Zd Zdeddf fdZdej                  dej                  dej                  fdZ xZS )
ViltOutputrB   r   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r0   r1   r   r   r  r6   r   r?   r@   rA   r   s     r+   r1   zViltOutput.__init__  sB    YYv779K9KL
zz&"<"<=r*   r   r   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r+   r   zViltOutput.forward  s.    

=1]3%4r*   r  r   s   @r+   r  r    s?    >z >d >
U\\  RWR^R^ r*   r  c                   *     e Zd ZdZ fdZddZ xZS )	ViltLayerz?This corresponds to the Block class in the timm implementation.c                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   r   )r0   r1   chunk_size_feed_forwardseq_len_dimr  r  r  intermediater  r  r   r   r6   r   layernorm_beforelayernorm_afterr   s     r+   r1   zViltLayer.__init__  s    '-'E'E$&v.,V4 ( "V-?-?VEZEZ [!||F,>,>FDYDYZr*   c                    | j                  | j                  |      |||      }|d   }|dd  }||j                  |j                        z   }| j	                  |      }| j                  |      }| j                  ||      }|f|z   }|S )N)r   r   r   )r  r%  rg   rR   r&  r$  r  )	rC   r   r   r   r   self_attention_outputsr  r   layer_outputs	            r+   r   zViltLayer.forward  s    !%!!-0/	 "0 "
 2!4(, )=+;+;<L<S<S+TT ++M:((6 {{<?/G+r*   r   r   r   s   @r+   r   r     s    I[r*   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )ViltEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r0   r1   rB   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rC   rB   rv   rD   s      r+   r1   zViltEncoder.__init__  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c                    |rdnd }|rdnd }t        | j                        D ]2  \  }	}
|r||fz   }|||	   nd } |
||||      }|d   }|s*||d   fz   }4 |r||fz   }|st        d |||fD              S t        |||      S )Nr)   r   r   c              3   &   K   | ]	  }||  y wr   r)   ).0r   s     r+   	<genexpr>z&ViltEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater   r   )ro   r0  r(   r   )rC   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputss                r+   r   zViltEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO(YjkM)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm++*
 	
r*   )NNFFTr   r   s   @r+   r+  r+    s    , ""
r*   r+  c                   .    e Zd ZU eed<   dZdZddgZd Zy)ViltPreTrainedModelrB   viltTr-   r   c                 "   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)ri   r   r   r   rV   datanormal_rB   initializer_ranger   zero_r<   r   r   fill_)rC   modules     r+   _init_weightsz!ViltPreTrainedModel._init_weights)  s   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r*   N)	r    r!   r"   r   r&   base_model_prefixsupports_gradient_checkpointing_no_split_modulesrJ  r)   r*   r+   r?  r?  "  s%    &*#)+>?*r*   r?  c                       e Zd Zd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   dee   dee   deeee	j                     f   fd       Z xZS )	ViltModelc                    t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r0   r1   rB   r-   r   r+  encoderr   r   r6   r   	layernorm
ViltPoolerpooler	post_init)rC   rB   add_pooling_layerrD   s      r+   r1   zViltModel.__init__<  sk    
 	 (0"6*f&8&8f>S>ST,=j(4 	r*   c                 B    | j                   j                  j                  S r   r   r3   r   rC   s    r+   get_input_embeddingszViltModel.get_input_embeddingsM  s    ..>>>r*   c                 :    || j                   j                  _        y r   rX  )rC   r   s     r+   set_input_embeddingszViltModel.set_input_embeddingsP  s    :?''7r*   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrQ  r0  r  r  )rC   heads_to_pruner0  r  s       r+   _prune_headszViltModel._prune_headsS  sE    
 +002 	CLE5LLu%//;;EB	Cr*   r   r   r   rs   rt   r   r   r   r   r   r7  r8  r   c           
      ~   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }n!||j                         dd }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }||t	        d      ||t	        d      ||j                  d   n|j                  d   }||k7  rt	        d	      |Bt        j                  || j                   j                  | j                   j                  f|      }| j                  || j                   j                        }| j                  ||||||||	
      \  }}| j                  ||      }| j!                  ||||
||      }|d   }| j#                  |      }| j$                  | j%                  |      nd}|s
||f|dd z   S t'        |||j(                  |j*                        S )ak  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
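        >>> # the returned sequence concatenates the text tokens with the (CLS-prefixed) image patches,
        >>> # so `last_hidden_states` has shape (batch_size, text_len + num_image_patches + 1, hidden_size)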
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViLT Model with a language modeling head on top as done during pretraining.
    """
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings
        self.mlm_score.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import requests
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
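        >>> # note: the loop above is greedy, each pass fills whichever remaining [MASK] (token id 103)
        >>> # the model is most confident about, using only the text part of the logits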
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable model parallelism
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    """
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]
            # see https://github.com/jnhwkim/ban-vqa/blob/master/train.py#L19

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    """
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[ViltForImagesAndTextClassificationOutput, tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import requests
        >>> from PIL import Image

        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image1, image2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image through the model
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 2,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ViltForImageAndTextRetrieval",
    "ViltForImagesAndTextClassification",
    "ViltForImagesAndTextClassificationOutput",
    "ViltForMaskedLM",
    "ViltForQuestionAnswering",
    "ViltForTokenClassification",
    "ViltLayer",
    "ViltModel",
    "ViltPreTrainedModel",
]